summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJakub Jelinek <jakub@redhat.com>2005-01-06 14:57:16 +0000
committerJakub Jelinek <jakub@redhat.com>2005-01-06 14:57:16 +0000
commit0ecfa2580d1aedb744deb5af1b60f92c69b9e9e0 (patch)
tree1ef0d0dc09dba23037800d5f3794a77d9b45554f
parente4f5d077e9190f57abd49684bd7afcf4325bd348 (diff)
Updated to fedora-glibc-20050106T1443
-rw-r--r--ChangeLog374
-rwxr-xr-xconfigure55
-rw-r--r--configure.in33
-rw-r--r--csu/Makefile4
-rw-r--r--elf/Makefile4
-rw-r--r--elf/rtld.c6
-rw-r--r--fedora/branch.mk4
-rw-r--r--iconv/Makefile12
-rw-r--r--include/features.h2
-rw-r--r--include/signal.h3
-rw-r--r--libio/fmemopen.c24
-rw-r--r--libio/iofopncook.c21
-rw-r--r--linuxthreads/ChangeLog7
-rw-r--r--linuxthreads/Makefile2
-rw-r--r--linuxthreads/sysdeps/i386/Makefile1
-rw-r--r--linuxthreads/tst-align.c71
-rw-r--r--localedata/ChangeLog4
-rw-r--r--localedata/gen-unicode-ctype.c2
-rw-r--r--misc/efgcvt_r.c25
-rw-r--r--misc/qefgcvt_r.c12
-rw-r--r--misc/tst-efgcvt.c7
-rw-r--r--nptl/ChangeLog13
-rw-r--r--nptl/Makefile2
-rw-r--r--nptl/init.c2
-rw-r--r--nptl/sysdeps/i386/Makefile2
-rw-r--r--nptl/sysdeps/i386/tls.h7
-rw-r--r--nptl/tst-align2.c87
-rw-r--r--po/es.po1327
-rw-r--r--posix/regex_internal.h3
-rw-r--r--posix/regexec.c94
-rw-r--r--stdio-common/Makefile4
-rw-r--r--stdio-common/tst-fmemopen2.c67
-rw-r--r--stdlib/Makefile23
-rw-r--r--stdlib/tst-putenv.c18
-rw-r--r--stdlib/tst-putenvmod.c17
-rw-r--r--sysdeps/generic/dl-tls.c2
-rw-r--r--sysdeps/generic/libc-start.c4
-rwxr-xr-xsysdeps/i386/configure54
-rw-r--r--sysdeps/i386/configure.in35
-rw-r--r--sysdeps/i386/tst-stack-align.h42
-rw-r--r--sysdeps/ia64/fpu/Makefile35
-rw-r--r--sysdeps/ia64/fpu/README50
-rw-r--r--sysdeps/ia64/fpu/e_acos.S1500
-rw-r--r--sysdeps/ia64/fpu/e_acosf.S79
-rw-r--r--sysdeps/ia64/fpu/e_acosh.S1200
-rw-r--r--sysdeps/ia64/fpu/e_acoshf.S1029
-rw-r--r--sysdeps/ia64/fpu/e_acoshl.S1713
-rw-r--r--sysdeps/ia64/fpu/e_acosl.S2916
-rw-r--r--sysdeps/ia64/fpu/e_asin.S1466
-rw-r--r--sysdeps/ia64/fpu/e_asinf.S73
-rw-r--r--sysdeps/ia64/fpu/e_asinl.S2833
-rw-r--r--sysdeps/ia64/fpu/e_atan2.S736
-rw-r--r--sysdeps/ia64/fpu/e_atan2f.S87
-rw-r--r--sysdeps/ia64/fpu/e_atanh.S1069
-rw-r--r--sysdeps/ia64/fpu/e_atanhf.S844
-rw-r--r--sysdeps/ia64/fpu/e_atanhl.S1155
-rw-r--r--sysdeps/ia64/fpu/e_cosh.S1477
-rw-r--r--sysdeps/ia64/fpu/e_coshf.S1447
-rw-r--r--sysdeps/ia64/fpu/e_coshl.S1661
-rw-r--r--sysdeps/ia64/fpu/e_exp.S887
-rw-r--r--sysdeps/ia64/fpu/e_exp10.S602
-rw-r--r--sysdeps/ia64/fpu/e_exp10f.S561
-rw-r--r--sysdeps/ia64/fpu/e_exp10l.S805
-rw-r--r--sysdeps/ia64/fpu/e_exp2.S563
-rw-r--r--sysdeps/ia64/fpu/e_exp2f.S538
-rw-r--r--sysdeps/ia64/fpu/e_exp2l.S806
-rw-r--r--sysdeps/ia64/fpu/e_expf.S949
-rw-r--r--sysdeps/ia64/fpu/e_fmod.S219
-rw-r--r--sysdeps/ia64/fpu/e_fmodf.S226
-rw-r--r--sysdeps/ia64/fpu/e_fmodl.S221
-rw-r--r--sysdeps/ia64/fpu/e_gamma_r.c1
-rw-r--r--sysdeps/ia64/fpu/e_gammaf_r.c1
-rw-r--r--sysdeps/ia64/fpu/e_gammal_r.c1
-rw-r--r--sysdeps/ia64/fpu/e_hypot.S73
-rw-r--r--sysdeps/ia64/fpu/e_hypotf.S74
-rw-r--r--sysdeps/ia64/fpu/e_hypotl.S71
-rw-r--r--sysdeps/ia64/fpu/e_lgamma_r.c71
-rw-r--r--sysdeps/ia64/fpu/e_lgammaf_r.c71
-rw-r--r--sysdeps/ia64/fpu/e_lgammal_r.c70
-rw-r--r--sysdeps/ia64/fpu/e_log.S2454
-rw-r--r--sysdeps/ia64/fpu/e_log2.S710
-rw-r--r--sysdeps/ia64/fpu/e_log2f.S550
-rw-r--r--sysdeps/ia64/fpu/e_log2l.S816
-rw-r--r--sysdeps/ia64/fpu/e_logf.S1787
-rw-r--r--sysdeps/ia64/fpu/e_logl.S1198
-rw-r--r--sysdeps/ia64/fpu/e_pow.S1633
-rw-r--r--sysdeps/ia64/fpu/e_powf.S1573
-rw-r--r--sysdeps/ia64/fpu/e_powl.S4076
-rw-r--r--sysdeps/ia64/fpu/e_remainder.S114
-rw-r--r--sysdeps/ia64/fpu/e_remainderf.S114
-rw-r--r--sysdeps/ia64/fpu/e_remainderl.S116
-rw-r--r--sysdeps/ia64/fpu/e_scalb.S69
-rw-r--r--sysdeps/ia64/fpu/e_scalbf.S69
-rw-r--r--sysdeps/ia64/fpu/e_scalbl.S69
-rw-r--r--sysdeps/ia64/fpu/e_sinh.S1652
-rw-r--r--sysdeps/ia64/fpu/e_sinhf.S1614
-rw-r--r--sysdeps/ia64/fpu/e_sinhl.S1778
-rw-r--r--sysdeps/ia64/fpu/e_sqrt.S69
-rw-r--r--sysdeps/ia64/fpu/e_sqrtf.S70
-rw-r--r--sysdeps/ia64/fpu/e_sqrtl.S68
-rw-r--r--sysdeps/ia64/fpu/gen_import_file_list80
-rw-r--r--sysdeps/ia64/fpu/import_check81
-rw-r--r--sysdeps/ia64/fpu/import_diffs7
-rw-r--r--sysdeps/ia64/fpu/import_file.awk148
-rw-r--r--sysdeps/ia64/fpu/import_intel_libm42
-rw-r--r--sysdeps/ia64/fpu/libm-symbols.h64
-rw-r--r--sysdeps/ia64/fpu/libm_atan2_reg.S1234
-rw-r--r--sysdeps/ia64/fpu/libm_error.c1789
-rw-r--r--sysdeps/ia64/fpu/libm_frexp.S209
-rw-r--r--sysdeps/ia64/fpu/libm_frexpf.S209
-rw-r--r--sysdeps/ia64/fpu/libm_frexpl.S209
-rw-r--r--sysdeps/ia64/fpu/libm_lgamma.S3594
-rw-r--r--sysdeps/ia64/fpu/libm_lgammaf.S2189
-rw-r--r--sysdeps/ia64/fpu/libm_lgammal.S7676
-rw-r--r--sysdeps/ia64/fpu/libm_reduce.S1492
-rw-r--r--sysdeps/ia64/fpu/libm_scalblnf.S (renamed from sysdeps/ia64/fpu/s_scalbnf.S)81
-rw-r--r--sysdeps/ia64/fpu/libm_sincos.S782
-rw-r--r--sysdeps/ia64/fpu/libm_sincos_large.S2754
-rw-r--r--sysdeps/ia64/fpu/libm_sincosf.S744
-rw-r--r--sysdeps/ia64/fpu/libm_sincosl.S2527
-rw-r--r--sysdeps/ia64/fpu/libm_support.h570
-rw-r--r--sysdeps/ia64/fpu/s_asinh.S1136
-rw-r--r--sysdeps/ia64/fpu/s_asinhf.S937
-rw-r--r--sysdeps/ia64/fpu/s_asinhl.S1346
-rw-r--r--sysdeps/ia64/fpu/s_atan.S1193
-rw-r--r--sysdeps/ia64/fpu/s_atanf.S75
-rw-r--r--sysdeps/ia64/fpu/s_atanl.S2157
-rw-r--r--sysdeps/ia64/fpu/s_cbrt.S1224
-rw-r--r--sysdeps/ia64/fpu/s_cbrtf.S1226
-rw-r--r--sysdeps/ia64/fpu/s_cbrtl.S64
-rw-r--r--sysdeps/ia64/fpu/s_ceil.S274
-rw-r--r--sysdeps/ia64/fpu/s_ceilf.S274
-rw-r--r--sysdeps/ia64/fpu/s_ceill.S276
-rw-r--r--sysdeps/ia64/fpu/s_copysign.S6
-rw-r--r--sysdeps/ia64/fpu/s_cos.S3482
-rw-r--r--sysdeps/ia64/fpu/s_cosf.S1181
-rw-r--r--sysdeps/ia64/fpu/s_cosl.S2756
-rw-r--r--sysdeps/ia64/fpu/s_erf.S924
-rw-r--r--sysdeps/ia64/fpu/s_erfc.S1197
-rw-r--r--sysdeps/ia64/fpu/s_erfcf.S981
-rw-r--r--sysdeps/ia64/fpu/s_erfcl.S2064
-rw-r--r--sysdeps/ia64/fpu/s_erff.S557
-rw-r--r--sysdeps/ia64/fpu/s_erfl.S1239
-rw-r--r--sysdeps/ia64/fpu/s_expm1.S2142
-rw-r--r--sysdeps/ia64/fpu/s_expm1f.S2062
-rw-r--r--sysdeps/ia64/fpu/s_expm1l.S1950
-rw-r--r--sysdeps/ia64/fpu/s_fabs.S116
-rw-r--r--sysdeps/ia64/fpu/s_fabsf.S83
-rw-r--r--sysdeps/ia64/fpu/s_fabsl.S83
-rw-r--r--sysdeps/ia64/fpu/s_fdim.S227
-rw-r--r--sysdeps/ia64/fpu/s_fdimf.S227
-rw-r--r--sysdeps/ia64/fpu/s_fdiml.S227
-rw-r--r--sysdeps/ia64/fpu/s_floor.S252
-rw-r--r--sysdeps/ia64/fpu/s_floorf.S250
-rw-r--r--sysdeps/ia64/fpu/s_floorl.S250
-rw-r--r--sysdeps/ia64/fpu/s_fma.S71
-rw-r--r--sysdeps/ia64/fpu/s_fmaf.S71
-rw-r--r--sysdeps/ia64/fpu/s_fmal.S71
-rw-r--r--sysdeps/ia64/fpu/s_fmax.S114
-rw-r--r--sysdeps/ia64/fpu/s_fmaxf.S114
-rw-r--r--sysdeps/ia64/fpu/s_fmaxl.S114
-rw-r--r--sysdeps/ia64/fpu/s_frexp.c33
-rw-r--r--sysdeps/ia64/fpu/s_frexpf.c33
-rw-r--r--sysdeps/ia64/fpu/s_frexpl.c33
-rw-r--r--sysdeps/ia64/fpu/s_ilogb.S306
-rw-r--r--sysdeps/ia64/fpu/s_ilogbf.S306
-rw-r--r--sysdeps/ia64/fpu/s_ilogbl.S306
-rw-r--r--sysdeps/ia64/fpu/s_ldexp.S380
-rw-r--r--sysdeps/ia64/fpu/s_ldexp.c62
-rw-r--r--sysdeps/ia64/fpu/s_ldexpf.c62
-rw-r--r--sysdeps/ia64/fpu/s_ldexpl.c62
-rw-r--r--sysdeps/ia64/fpu/s_libm_ldexp.S382
-rw-r--r--sysdeps/ia64/fpu/s_libm_ldexpf.S (renamed from sysdeps/ia64/fpu/s_ldexpf.S)91
-rw-r--r--sysdeps/ia64/fpu/s_libm_ldexpl.S (renamed from sysdeps/ia64/fpu/s_ldexpl.S)100
-rw-r--r--sysdeps/ia64/fpu/s_libm_scalbn.S (renamed from sysdeps/ia64/fpu/s_scalbn.S)90
-rw-r--r--sysdeps/ia64/fpu/s_libm_scalbnf.S381
-rw-r--r--sysdeps/ia64/fpu/s_libm_scalbnl.S (renamed from sysdeps/ia64/fpu/s_scalbnl.S)96
-rw-r--r--sysdeps/ia64/fpu/s_log1p.S2312
-rw-r--r--sysdeps/ia64/fpu/s_log1pf.S2028
-rw-r--r--sysdeps/ia64/fpu/s_log1pl.S2067
-rw-r--r--sysdeps/ia64/fpu/s_logb.S315
-rw-r--r--sysdeps/ia64/fpu/s_logbf.S334
-rw-r--r--sysdeps/ia64/fpu/s_logbl.S311
-rw-r--r--sysdeps/ia64/fpu/s_modf.S47
-rw-r--r--sysdeps/ia64/fpu/s_modff.S47
-rw-r--r--sysdeps/ia64/fpu/s_modfl.S49
-rw-r--r--sysdeps/ia64/fpu/s_nearbyint.S46
-rw-r--r--sysdeps/ia64/fpu/s_nearbyintf.S46
-rw-r--r--sysdeps/ia64/fpu/s_nearbyintl.S46
-rw-r--r--sysdeps/ia64/fpu/s_nextafter.S495
-rw-r--r--sysdeps/ia64/fpu/s_nextafterf.S502
-rw-r--r--sysdeps/ia64/fpu/s_nextafterl.S501
-rw-r--r--sysdeps/ia64/fpu/s_nextafterl.c1
-rw-r--r--sysdeps/ia64/fpu/s_nexttoward.S488
-rw-r--r--sysdeps/ia64/fpu/s_nexttoward.c1
-rw-r--r--sysdeps/ia64/fpu/s_nexttowardf.S494
-rw-r--r--sysdeps/ia64/fpu/s_nexttowardf.c1
-rw-r--r--sysdeps/ia64/fpu/s_nexttowardl.S492
-rw-r--r--sysdeps/ia64/fpu/s_rint.S287
-rw-r--r--sysdeps/ia64/fpu/s_rintf.S289
-rw-r--r--sysdeps/ia64/fpu/s_rintl.S289
-rw-r--r--sysdeps/ia64/fpu/s_round.S316
-rw-r--r--sysdeps/ia64/fpu/s_roundf.S316
-rw-r--r--sysdeps/ia64/fpu/s_roundl.S316
-rw-r--r--sysdeps/ia64/fpu/s_scalblnf.c62
-rw-r--r--sysdeps/ia64/fpu/s_scalbn.c62
-rw-r--r--sysdeps/ia64/fpu/s_scalbnf.c62
-rw-r--r--sysdeps/ia64/fpu/s_scalbnl.c62
-rw-r--r--sysdeps/ia64/fpu/s_significand.S61
-rw-r--r--sysdeps/ia64/fpu/s_significandf.S61
-rw-r--r--sysdeps/ia64/fpu/s_significandl.S61
-rw-r--r--sysdeps/ia64/fpu/s_sincos.c10
-rw-r--r--sysdeps/ia64/fpu/s_sincosf.c10
-rw-r--r--sysdeps/ia64/fpu/s_sincosl.c10
-rw-r--r--sysdeps/ia64/fpu/s_tan.S554
-rw-r--r--sysdeps/ia64/fpu/s_tanf.S1003
-rw-r--r--sysdeps/ia64/fpu/s_tanh.S987
-rw-r--r--sysdeps/ia64/fpu/s_tanhf.S581
-rw-r--r--sysdeps/ia64/fpu/s_tanhl.S1347
-rw-r--r--sysdeps/ia64/fpu/s_tanl.S3028
-rw-r--r--sysdeps/ia64/fpu/s_trunc.S207
-rw-r--r--sysdeps/ia64/fpu/s_truncf.S207
-rw-r--r--sysdeps/ia64/fpu/s_truncl.S209
-rw-r--r--sysdeps/ia64/fpu/t_exp.c1
-rw-r--r--sysdeps/ia64/fpu/w_acosh.c1
-rw-r--r--sysdeps/ia64/fpu/w_acoshf.c1
-rw-r--r--sysdeps/ia64/fpu/w_acoshl.c1
-rw-r--r--sysdeps/ia64/fpu/w_atanh.c1
-rw-r--r--sysdeps/ia64/fpu/w_atanhf.c1
-rw-r--r--sysdeps/ia64/fpu/w_atanhl.c1
-rw-r--r--sysdeps/ia64/fpu/w_exp10.c1
-rw-r--r--sysdeps/ia64/fpu/w_exp10f.c1
-rw-r--r--sysdeps/ia64/fpu/w_exp10l.c1
-rw-r--r--sysdeps/ia64/fpu/w_exp2.c1
-rw-r--r--sysdeps/ia64/fpu/w_exp2f.c1
-rw-r--r--sysdeps/ia64/fpu/w_exp2l.c1
-rw-r--r--sysdeps/ia64/fpu/w_expl.c1
-rw-r--r--sysdeps/ia64/fpu/w_lgamma.c80
-rw-r--r--sysdeps/ia64/fpu/w_lgamma_r.c1
-rw-r--r--sysdeps/ia64/fpu/w_lgammaf.c80
-rw-r--r--sysdeps/ia64/fpu/w_lgammaf_r.c1
-rw-r--r--sysdeps/ia64/fpu/w_lgammal.c79
-rw-r--r--sysdeps/ia64/fpu/w_lgammal_r.c1
-rw-r--r--sysdeps/ia64/fpu/w_log2.c1
-rw-r--r--sysdeps/ia64/fpu/w_log2f.c1
-rw-r--r--sysdeps/ia64/fpu/w_log2l.c1
-rw-r--r--sysdeps/ia64/fpu/w_sinh.c1
-rw-r--r--sysdeps/ia64/fpu/w_sinhf.c1
-rw-r--r--sysdeps/ia64/fpu/w_sinhl.c1
-rw-r--r--sysdeps/ia64/fpu/w_tgamma.S1835
-rw-r--r--sysdeps/ia64/fpu/w_tgammaf.S1328
-rw-r--r--sysdeps/ia64/fpu/w_tgammal.S4485
-rw-r--r--sysdeps/mips/Makefile4
-rw-r--r--sysdeps/unix/alarm.c7
-rw-r--r--sysdeps/unix/mips/rt-sysdep.S1
-rw-r--r--sysdeps/unix/sysv/linux/bits/waitflags.h4
-rw-r--r--sysdeps/unix/sysv/linux/i386/clone.S2
-rw-r--r--sysdeps/unix/sysv/linux/init-first.c40
-rw-r--r--version.h4
259 files changed, 99430 insertions, 43014 deletions
diff --git a/ChangeLog b/ChangeLog
index 8bd99f21f1..68beedf901 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,12 +1,384 @@
+2004-12-29 Jakub Jelinek <jakub@redhat.com>
+
+ * sysdeps/ia64/fpu/libm_support.h (__libm_error_support): Use
+ libc_hidden_proto instead of HIDDEN_PROTO.
+ * sysdeps/ia64/fpu/libm-symbols.h (HIDDEN_PROTO): Remove.
+ (__libm_error_support): If ASSEMBLER and in libc, define to
+ HIDDEN_JUMPTARGET(__libm_error_support).
+
+2004-12-28 David Mosberger <davidm@hpl.hp.com>
+
+ * sysdeps/ia64/fpu/Makefile (duplicated-routines): New macro.
+ (sysdep_routines): Replace libm_ldexp{,f,l} and libm_scalbn{,f,l}
+ with $(duplicated-routines).
+ (libm-sysdep_routines): Likewise, but substitute "s_" prefix for
+ "m_" prefix.
+
+2004-12-27 David Mosberger <davidm@hpl.hp.com>
+
+ * sysdeps/ia64/fpu/libm-symbols.h: Add include of <sysdep.h> and
+ undefine "ret" macro. Add __libm_error_support hidden definitions.
+
+ * sysdeps/ia64/fpu/e_lgamma_r.c: Remove CVS-id comment. Add
+ missing portion of copyright statement.
+ * sysdeps/ia64/fpu/e_lgammaf_r.c: Likewise.
+ * sysdeps/ia64/fpu/e_lgammal_r.c: Likewise.
+
+ * sysdeps/ia64/fpu/w_lgamma.c: Remove CVS-id comment. Add
+ missing portion of copyright statement.
+ (__ieee754_lgamma): Rename from lgamma(). Make lgamma() a weak alias.
+ (__ieee754_gamma): Likewise.
+ * sysdeps/ia64/fpu/w_lgammaf.c: Likewise.
+ * sysdeps/ia64/fpu/w_lgammal.c: Likewise.
+
+2004-12-09 H. J. Lu <hjl@lucon.org>
+
+ * sysdeps/ia64/fpu/s_nextafterl.c: Remove.
+ * sysdeps/ia64/fpu/s_nexttoward.c: Likewise.
+ * sysdeps/ia64/fpu/s_nexttowardf.c: Likewise.
+ * sysdeps/ia64/fpu/e_atan2l.S: Remove (duplicate of e_atan2l.c).
+ * sysdeps/ia64/fpu/e_expl.S: Likewise.
+ * sysdeps/ia64/fpu/e_logl.c: Remove (conflicts with e_logl.S).
+
+2004-11-18 David Mosberger <davidm@hpl.hp.com>
+
+ * sysdeps/ia64/fpu/README: New file.
+ * sysdeps/ia64/fpu/gen_import_file_list: New file.
+ * sysdeps/ia64/fpu/import_check: Likewise.
+ * sysdeps/ia64/fpu/import_diffs: Likewise.
+ * sysdeps/ia64/fpu/import_file.awk: Likewise.
+ * sysdeps/ia64/fpu/import_intel_libm: Likewise.
+ * sysdeps/ia64/fpu/libm-symbols.h: Likewise.
+
+ * sysdeps/ia64/fpu/e_acos.S: Update from Intel libm v2.1+.
+ * sysdeps/ia64/fpu/e_acosf.S: Likewise.
+ * sysdeps/ia64/fpu/e_acosl.S: Likewise.
+ * sysdeps/ia64/fpu/e_asin.S: Likewise.
+ * sysdeps/ia64/fpu/e_asinf.S: Likewise.
+ * sysdeps/ia64/fpu/e_asinl.S: Likewise.
+ * sysdeps/ia64/fpu/e_atan2.S: Likewise.
+ * sysdeps/ia64/fpu/e_atan2f.S: Likewise.
+ * sysdeps/ia64/fpu/e_cosh.S: Likewise.
+ * sysdeps/ia64/fpu/e_coshf.S: Likewise.
+ * sysdeps/ia64/fpu/e_coshl.S: Likewise.
+ * sysdeps/ia64/fpu/e_exp.S: Likewise.
+ * sysdeps/ia64/fpu/e_expf.S: Likewise.
+ * sysdeps/ia64/fpu/e_fmod.S: Likewise.
+ * sysdeps/ia64/fpu/e_fmodf.S: Likewise.
+ * sysdeps/ia64/fpu/e_fmodl.S: Likewise.
+ * sysdeps/ia64/fpu/e_hypot.S: Likewise.
+ * sysdeps/ia64/fpu/e_hypotf.S: Likewise.
+ * sysdeps/ia64/fpu/e_hypotl.S: Likewise.
+ * sysdeps/ia64/fpu/e_log.S: Likewise.
+ * sysdeps/ia64/fpu/e_log2.S: Likewise.
+ * sysdeps/ia64/fpu/e_log2f.S: Likewise.
+ * sysdeps/ia64/fpu/e_log2l.S: Likewise.
+ * sysdeps/ia64/fpu/e_logf.S: Likewise.
+ * sysdeps/ia64/fpu/e_pow.S: Likewise.
+ * sysdeps/ia64/fpu/e_powf.S: Likewise.
+ * sysdeps/ia64/fpu/e_powl.S: Likewise.
+ * sysdeps/ia64/fpu/e_remainder.S: Likewise.
+ * sysdeps/ia64/fpu/e_remainderf.S: Likewise.
+ * sysdeps/ia64/fpu/e_remainderl.S: Likewise.
+ * sysdeps/ia64/fpu/e_scalb.S: Likewise.
+ * sysdeps/ia64/fpu/e_scalbf.S: Likewise.
+ * sysdeps/ia64/fpu/e_scalbl.S: Likewise.
+ * sysdeps/ia64/fpu/e_sinh.S: Likewise.
+ * sysdeps/ia64/fpu/e_sinhf.S: Likewise.
+ * sysdeps/ia64/fpu/e_sinhl.S: Likewise.
+ * sysdeps/ia64/fpu/e_sqrt.S: Likewise.
+ * sysdeps/ia64/fpu/e_sqrtf.S: Likewise.
+ * sysdeps/ia64/fpu/e_sqrtl.S: Likewise.
+ * sysdeps/ia64/fpu/libm_error.c: Likewise.
+ * sysdeps/ia64/fpu/libm_reduce.S: Likewise.
+ * sysdeps/ia64/fpu/libm_support.h: Likewise.
+ * sysdeps/ia64/fpu/s_atan.S: Likewise.
+ * sysdeps/ia64/fpu/s_atanf.S: Likewise.
+ * sysdeps/ia64/fpu/s_atanl.S: Likewise.
+ * sysdeps/ia64/fpu/s_cbrt.S: Likewise.
+ * sysdeps/ia64/fpu/s_cbrtf.S: Likewise.
+ * sysdeps/ia64/fpu/s_cbrtl.S: Likewise.
+ * sysdeps/ia64/fpu/s_ceil.S: Likewise.
+ * sysdeps/ia64/fpu/s_ceilf.S: Likewise.
+ * sysdeps/ia64/fpu/s_ceill.S: Likewise.
+ * sysdeps/ia64/fpu/s_cos.S: Likewise.
+ * sysdeps/ia64/fpu/s_cosf.S: Likewise.
+ * sysdeps/ia64/fpu/s_cosl.S: Likewise.
+ * sysdeps/ia64/fpu/s_expm1.S: Likewise.
+ * sysdeps/ia64/fpu/s_expm1f.S: Likewise.
+ * sysdeps/ia64/fpu/s_expm1l.S: Likewise.
+ * sysdeps/ia64/fpu/s_fabs.S: Likewise.
+ * sysdeps/ia64/fpu/s_fabsf.S: Likewise.
+ * sysdeps/ia64/fpu/s_fabsl.S: Likewise.
+ * sysdeps/ia64/fpu/s_floor.S: Likewise.
+ * sysdeps/ia64/fpu/s_floorf.S: Likewise.
+ * sysdeps/ia64/fpu/s_floorl.S: Likewise.
+ * sysdeps/ia64/fpu/s_frexp.c: Likewise.
+ * sysdeps/ia64/fpu/s_frexpf.c: Likewise.
+ * sysdeps/ia64/fpu/s_frexpl.c: Likewise.
+ * sysdeps/ia64/fpu/s_ilogb.S: Likewise.
+ * sysdeps/ia64/fpu/s_ilogbf.S: Likewise.
+ * sysdeps/ia64/fpu/s_ilogbl.S: Likewise.
+ * sysdeps/ia64/fpu/s_log1p.S: Likewise.
+ * sysdeps/ia64/fpu/s_log1pf.S: Likewise.
+ * sysdeps/ia64/fpu/s_log1pl.S: Likewise.
+ * sysdeps/ia64/fpu/s_logb.S: Likewise.
+ * sysdeps/ia64/fpu/s_logbf.S: Likewise.
+ * sysdeps/ia64/fpu/s_logbl.S: Likewise.
+ * sysdeps/ia64/fpu/s_modf.S: Likewise.
+ * sysdeps/ia64/fpu/s_modff.S: Likewise.
+ * sysdeps/ia64/fpu/s_modfl.S: Likewise.
+ * sysdeps/ia64/fpu/s_nearbyint.S: Likewise.
+ * sysdeps/ia64/fpu/s_nearbyintf.S: Likewise.
+ * sysdeps/ia64/fpu/s_nearbyintl.S: Likewise.
+ * sysdeps/ia64/fpu/s_rint.S: Likewise.
+ * sysdeps/ia64/fpu/s_rintf.S: Likewise.
+ * sysdeps/ia64/fpu/s_rintl.S: Likewise.
+ * sysdeps/ia64/fpu/s_round.S: Likewise.
+ * sysdeps/ia64/fpu/s_roundf.S: Likewise.
+ * sysdeps/ia64/fpu/s_roundl.S: Likewise.
+ * sysdeps/ia64/fpu/s_significand.S: Likewise.
+ * sysdeps/ia64/fpu/s_significandf.S: Likewise.
+ * sysdeps/ia64/fpu/s_significandl.S: Likewise.
+ * sysdeps/ia64/fpu/s_tan.S: Likewise.
+ * sysdeps/ia64/fpu/s_tanf.S: Likewise.
+ * sysdeps/ia64/fpu/s_tanl.S: Likewise.
+ * sysdeps/ia64/fpu/s_trunc.S: Likewise.
+ * sysdeps/ia64/fpu/s_truncf.S: Likewise.
+ * sysdeps/ia64/fpu/s_truncl.S: Likewise.
+
+ * sysdeps/ia64/fpu/e_acosh.S: New file from Intel libm v2.1+.
+ * sysdeps/ia64/fpu/e_acoshf.S: Likewise.
+ * sysdeps/ia64/fpu/e_acoshl.S: Likewise.
+ * sysdeps/ia64/fpu/e_atanh.S: Likewise.
+ * sysdeps/ia64/fpu/e_atanhf.S: Likewise.
+ * sysdeps/ia64/fpu/e_atanhl.S: Likewise.
+ * sysdeps/ia64/fpu/e_exp10.S: Likewise.
+ * sysdeps/ia64/fpu/e_exp10f.S: Likewise.
+ * sysdeps/ia64/fpu/e_exp10l.S: Likewise.
+ * sysdeps/ia64/fpu/e_exp2.S: Likewise.
+ * sysdeps/ia64/fpu/e_exp2f.S: Likewise.
+ * sysdeps/ia64/fpu/e_exp2l.S: Likewise.
+ * sysdeps/ia64/fpu/e_lgamma_r.S: Likewise.
+ * sysdeps/ia64/fpu/e_lgammaf_r.S: Likewise.
+ * sysdeps/ia64/fpu/e_lgammal_r.S: Likewise.
+ * sysdeps/ia64/fpu/e_logl.S: Likewise.
+ * sysdeps/ia64/fpu/libm_frexp.S: Likewise.
+ * sysdeps/ia64/fpu/libm_frexpf.S: Likewise.
+ * sysdeps/ia64/fpu/libm_frexpl.S: Likewise.
+ * sysdeps/ia64/fpu/s_libm_ldexp.S: Likewise.
+ * sysdeps/ia64/fpu/s_libm_ldexpf.S: Likewise.
+ * sysdeps/ia64/fpu/s_libm_ldexpl.S: Likewise.
+ * sysdeps/ia64/fpu/s_libm_scalbn.S: Likewise.
+ * sysdeps/ia64/fpu/s_libm_scalbnf.S: Likewise.
+ * sysdeps/ia64/fpu/s_libm_scalbnl.S: Likewise.
+ * sysdeps/ia64/fpu/libm_lgamma.S: Likewise.
+ * sysdeps/ia64/fpu/libm_lgammaf.S: Likewise.
+ * sysdeps/ia64/fpu/libm_lgammal.S: Likewise.
+ * sysdeps/ia64/fpu/libm_sincos.S: Likewise.
+ * sysdeps/ia64/fpu/libm_sincos_large.S: Likewise.
+ * sysdeps/ia64/fpu/libm_sincosf.S: Likewise.
+ * sysdeps/ia64/fpu/libm_sincosl.S: Likewise.
+ * sysdeps/ia64/fpu/libm_scalblnf.S: Likewise.
+ * sysdeps/ia64/fpu/s_asinh.S: Likewise.
+ * sysdeps/ia64/fpu/s_asinhf.S: Likewise.
+ * sysdeps/ia64/fpu/s_asinhl.S: Likewise.
+ * sysdeps/ia64/fpu/s_erf.S: Likewise.
+ * sysdeps/ia64/fpu/s_erfc.S: Likewise.
+ * sysdeps/ia64/fpu/s_erfcf.S: Likewise.
+ * sysdeps/ia64/fpu/s_erfcl.S: Likewise.
+ * sysdeps/ia64/fpu/s_erff.S: Likewise.
+ * sysdeps/ia64/fpu/s_erfl.S: Likewise.
+ * sysdeps/ia64/fpu/s_fdim.S: Likewise.
+ * sysdeps/ia64/fpu/s_fdimf.S: Likewise.
+ * sysdeps/ia64/fpu/s_fdiml.S: Likewise.
+ * sysdeps/ia64/fpu/s_fma.S: Likewise.
+ * sysdeps/ia64/fpu/s_fmaf.S: Likewise.
+ * sysdeps/ia64/fpu/s_fmal.S: Likewise.
+ * sysdeps/ia64/fpu/s_fmax.S: Likewise.
+ * sysdeps/ia64/fpu/s_fmaxf.S: Likewise.
+ * sysdeps/ia64/fpu/s_fmaxl.S: Likewise.
+ * sysdeps/ia64/fpu/s_ldexp.c: Likewise.
+ * sysdeps/ia64/fpu/s_ldexpf.c: Likewise.
+ * sysdeps/ia64/fpu/s_ldexpl.c: Likewise.
+ * sysdeps/ia64/fpu/s_nextafter.S: Likewise.
+ * sysdeps/ia64/fpu/s_nextafterf.S: Likewise.
+ * sysdeps/ia64/fpu/s_nextafterl.S: Likewise.
+ * sysdeps/ia64/fpu/s_nexttoward.S: Likewise.
+ * sysdeps/ia64/fpu/s_nexttowardf.S: Likewise.
+ * sysdeps/ia64/fpu/s_nexttowardl.S: Likewise.
+ * sysdeps/ia64/fpu/s_tanh.S: Likewise.
+ * sysdeps/ia64/fpu/s_tanhf.S: Likewise.
+ * sysdeps/ia64/fpu/s_tanhl.S: Likewise.
+ * sysdeps/ia64/fpu/s_scalblnf.c: Likewise.
+ * sysdeps/ia64/fpu/w_lgamma.c: Likewise.
+ * sysdeps/ia64/fpu/w_lgammaf.c: Likewise.
+ * sysdeps/ia64/fpu/w_lgammal.c: Likewise.
+ * sysdeps/ia64/fpu/w_tgamma.S: Likewise.
+ * sysdeps/ia64/fpu/w_tgammaf.S: Likewise.
+ * sysdeps/ia64/fpu/w_tgammal.S: Likewise.
+
+ * sysdeps/ia64/fpu/e_gamma_r.c: New empty dummy-file.
+ * sysdeps/ia64/fpu/e_gammaf_r.c: Likewise.
+ * sysdeps/ia64/fpu/e_gammal_r.c: Likewise.
+ * sysdeps/ia64/fpu/w_acosh.c: Likewise.
+ * sysdeps/ia64/fpu/w_acoshf.c: Likewise.
+ * sysdeps/ia64/fpu/w_acoshl.c: Likewise.
+ * sysdeps/ia64/fpu/w_atanh.c: Likewise.
+ * sysdeps/ia64/fpu/w_atanhf.c: Likewise.
+ * sysdeps/ia64/fpu/w_atanhl.c: Likewise.
+ * sysdeps/ia64/fpu/w_exp10.c: Likewise.
+ * sysdeps/ia64/fpu/w_exp10f.c: Likewise.
+ * sysdeps/ia64/fpu/w_exp10l.c: Likewise.
+ * sysdeps/ia64/fpu/w_exp2.c: Likewise.
+ * sysdeps/ia64/fpu/w_exp2f.c: Likewise.
+ * sysdeps/ia64/fpu/w_exp2l.c: Likewise.
+ * sysdeps/ia64/fpu/w_expl.c: Likewise.
+ * sysdeps/ia64/fpu/e_expl.S: Likewise.
+ * sysdeps/ia64/fpu/w_lgamma_r.c: Likewise.
+ * sysdeps/ia64/fpu/w_lgammaf_r.c: Likewise.
+ * sysdeps/ia64/fpu/w_lgammal_r.c: Likewise.
+ * sysdeps/ia64/fpu/w_log2.c: Likewise.
+ * sysdeps/ia64/fpu/w_log2f.c: Likewise.
+ * sysdeps/ia64/fpu/w_log2l.c: Likewise.
+ * sysdeps/ia64/fpu/w_sinh.c: Likewise.
+ * sysdeps/ia64/fpu/w_sinhf.c: Likewise.
+ * sysdeps/ia64/fpu/w_sinhl.c: Likewise.
+
+ * sysdeps/ia64/fpu/libm_atan2_reg.S: Remove.
+ * sysdeps/ia64/fpu/s_ldexp.S: Likewise.
+ * sysdeps/ia64/fpu/s_ldexpf.S: Likewise.
+ * sysdeps/ia64/fpu/s_ldexpl.S: Likewise.
+ * sysdeps/ia64/fpu/s_scalbn.S: Likewise.
+ * sysdeps/ia64/fpu/s_scalbnf.S: Likewise.
+ * sysdeps/ia64/fpu/s_scalbnl.S: Likewise.
+
+ * sysdeps/ia64/fpu/s_sincos.c: Make it an empty dummy-file.
+ * sysdeps/ia64/fpu/s_sincosf.c: Likewise.
+ * sysdeps/ia64/fpu/s_sincosl.c: Likewise.
+
+ * sysdeps/ia64/fpu/e_atan2l.S: Add "Not needed" comment.
+
+ * sysdeps/ia64/fpu/s_copysign.S: Add __libm_copysign{,f,l}
+ alias for use by libm_error.c
+
+ * sysdeps/ia64/fpu/Makefile (libm-sysdep_routines): Remove
+ libm_atan2_reg, libm_tan, libm_frexp4{f,l}.
+ Mention s_erfc{,f,l}, libm_frexp{,f,l}, libm_ldexp{,f,l},
+ libm_sincos{,f,l}, libm_sincos_large, libm_lgamma{,f,l},
+ libm_scalbn{,f,l}, libm_scalblnf.
+ (sysdep_routines): Remove libm_frexp4{,f,l}.
+ Mention libm_frexp{,f,l}, libm_ldexp{,f,l}, and libm_scalbn{,f,l}.
+ (sysdep-CPPFLAGS): Add -include libm-symbols.h, -D__POSIX__,
+ -D_LIB_VERSIONIMF=_LIB_VERSION, -DSIZE_LONG_INT_64, and
+ -DSIZE_LONG_LONG_INT_64.
+
+2005-01-05 Steven Munroe <sjmunroe@us.ibm.com>
+
+ * elf/rtld.c (dl_main) [NEED_DL_SYSINFO_DSO]: Ensure l_map_end and
+ l_text_end are set for a VDSO with a single PT_LOAD entry.
+
+2005-01-05 Ulrich Drepper <drepper@redhat.com>
+
+ * libio/fmemopen.c (fmemopen_seek): SEEK_END should count from
+ maximum used address, not maximum buffer position.
+
+ * libio/iofopncook.c (_IO_cookie_seekoff): Define. Mark offset as
+ invalid to disable optimizations in fileops which won't work here.
+ (_IO_cookie_jumps): Use it.
+ (_IO_old_cookie_jumps): Likewise.
+ * libio/fmemopen.c (fmemopen_seek): Result must be returned in *P,
+ not the return value.
+ * stdio-common/Makefile (tests): Add tst-fmemopen2.
+ * stdio-common/tst-fmemopen2.c: New file.
+
+ * sysdeps/unix/sysv/linux/bits/waitflags.h: Define __WNOTHREAD.
+
+2005-01-05 Roland McGrath <roland@redhat.com>
+
+ * configure.in (libc_cv_cpp_asm_debuginfo): Check moved ...
+ * sysdeps/i386/configure.in: ... here. New file.
+
+2005-01-03 Ulrich Drepper <drepper@redhat.com>
+
+ * sysdeps/generic/libc-start.c [SHARED] (__libc_start_main): Don't
+ initialize __environ again.
+ * stdlib/Makefile: Add rules to build and run tst-putenv.
+ * stdlib/tst-putenv.c: New file.
+ * stdlib/tst-putenvmod.c: New file.
+
+ * sysdeps/unix/sysv/linux/init-first.c: Clean file up. Don't
+ define unnecessary wrappers or aliases of static functions.
+
+2005-01-03 Andreas Jaeger <aj@suse.de>
+
+ * csu/Makefile (generated): Add start.os and start.ob.
+
+ * elf/Makefile (tests): Change rule for tst-pie1 so that make
+ clean works.
+
+2004-12-29 Roland McGrath <roland@redhat.com>
+
+ [BZ #626]
+ * sysdeps/unix/alarm.c (alarm): Round return value to nearest rather
+ than always up; when nearest is zero, round up to one.
+
+2004-12-28 Ulrich Drepper <drepper@redhat.com>
+
+ * po/es.po: Update from translation team.
+
+ * sysdeps/generic/dl-tls.c (__tls_get_addr): Fix typo.
+
+2004-12-27 Ulrich Drepper <drepper@redhat.com>
+
+ * include/signal.h: Define __sigemptyset.
+
+2004-04-27 Paolo Bonzini <bonzini@gnu.org>
+
+ * posix/regex_internal.h (struct re_dfastate_t): Make
+ word_trtable a pointer to the 512-item transition table.
+ * posix/regexec.c (build_trtable): Fill in either state->trtable
+ or state->word_trtable. Return a boolean indicating success.
+ (transit_state): Expect state->trtable to be a 256-item
+ transition table. Reorganize code to have less tests in
+ the common case, and to save an indentation level.
+
+2004-12-21 Jakub Jelinek <jakub@redhat.com>
+
+ * sysdeps/unix/sysv/linux/i386/clone.S (__clone): Make sure %esp when
+ calling fn is 16 byte aligned.
+ * sysdeps/i386/tst-stack-align.h: New file.
+
+ * misc/efgcvt_r.c (FLOAT_MIN_10_EXP, FLOAT_MIN_10_NORM): Define.
+ (ecvt_r): Special case denormals.
+ * misc/qefgcvt_r.c (FLOAT_MIN_10_EXP, FLOAT_MIN_10_NORM): Define.
+ * misc/tst-efgcvt.c: Include float.h.
+ (ecvt_tests): Add 2 new tests.
+
+2004-12-20 Roland McGrath <roland@frob.com>
+
+ * version.h (RELEASE, VERSION): development, 2.3.90
+ * include/features.h (__GLIBC_MINOR__): Now 4.
+
+2004-12-20 Jakub Jelinek <jakub@redhat.com>,
+ Jim Gifford <giffordj@linkline.com>
+
+ [BZ #562]
+ * sysdeps/mips/Makefile (librt-sysdep_routines): Add.
+ * sysdeps/unix/mips/rt-sysdep.S: New file.
+
2004-12-19 Roland McGrath <roland@redhat.com>
+ * iconv/Makefile (test-iconvconfig): New target.
+ [$(cross-compiling) != yes] (xtests): Depend on it.
+
* iconv/iconvconfig.c (nostdlib, output_file, output_file_len):
New variables.
(options, parse_opt, main): Take new options --nostdlib and
--output/-o to set them. Under --nostdlib, skip GCONV_PATH dirs.
(write_output): If output_file is set, write the output there.
-2004-12-19 Andreas Jaeger <aj@suse.de>NULL
+2004-12-19 Andreas Jaeger <aj@suse.de>
[BZ #560]
* inet/netinet/in.h: Use __interface_addr instead of __interface.
diff --git a/configure b/configure
index 93fff2a64d..e10f4b9836 100755
--- a/configure
+++ b/configure
@@ -313,7 +313,7 @@ ac_includes_default="\
# include <unistd.h>
#endif"
-ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS with_fp with_cvs enable_check_abi oldest_abi bindnow force_install all_warnings build build_cpu build_vendor build_os host host_cpu host_vendor host_os subdirs add_ons base_machine sysnames INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA LN_S CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC OBJEXT BUILD_CC cross_compiling CPP CXX CXXFLAGS ac_ct_CXX AR OBJDUMP RANLIB ac_ct_RANLIB MIG AS LD PWD_P MAKE MSGFMT MAKEINFO SED AUTOCONF SYSINCLUDES libc_cv_gcc_static_libgcc BASH libc_cv_have_bash2 KSH libc_cv_have_ksh AWK PERL INSTALL_INFO BISON VERSIONING libc_cv_asm_protected_directive libc_cv_initfinit_array libc_cv_cc_with_libunwind libc_cv_z_nodelete libc_cv_z_nodlopen libc_cv_z_initfirst libc_cv_z_relro libc_cv_Bgroup libc_cv_libgcc_s_suffix libc_cv_as_needed ASFLAGS_config libc_cv_z_combreloc libc_cv_z_execstack libc_cv_fpie fno_unit_at_a_time libc_cv_have_initfini libc_cv_cpp_asm_debuginfo no_whole_archive exceptions LIBGD have_selinux EGREP sizeof_long_double libc_cv_gcc_unwind_find_fde uname_sysname uname_release uname_version old_glibc_headers libc_cv_slibdir libc_cv_localedir libc_cv_sysconfdir libc_cv_rootsbindir libc_cv_forced_unwind use_ldconfig ldd_rewrite_script gnu_ld gnu_as elf xcoff static shared pic_default profile omitfp bounded static_nss nopic_initfini DEFINES linux_doors mach_interface_list VERSION RELEASE LIBOBJS LTLIBOBJS'
+ac_subst_vars='SHELL PATH_SEPARATOR PACKAGE_NAME PACKAGE_TARNAME PACKAGE_VERSION PACKAGE_STRING PACKAGE_BUGREPORT exec_prefix prefix program_transform_name bindir sbindir libexecdir datadir sysconfdir sharedstatedir localstatedir libdir includedir oldincludedir infodir mandir build_alias host_alias target_alias DEFS ECHO_C ECHO_N ECHO_T LIBS with_fp with_cvs enable_check_abi oldest_abi bindnow force_install all_warnings build build_cpu build_vendor build_os host host_cpu host_vendor host_os subdirs add_ons base_machine sysnames INSTALL_PROGRAM INSTALL_SCRIPT INSTALL_DATA LN_S CC CFLAGS LDFLAGS CPPFLAGS ac_ct_CC OBJEXT BUILD_CC cross_compiling CPP CXX CXXFLAGS ac_ct_CXX AR OBJDUMP RANLIB ac_ct_RANLIB MIG AS LD PWD_P MAKE MSGFMT MAKEINFO SED AUTOCONF SYSINCLUDES libc_cv_gcc_static_libgcc BASH libc_cv_have_bash2 KSH libc_cv_have_ksh AWK PERL INSTALL_INFO BISON VERSIONING libc_cv_asm_protected_directive libc_cv_initfinit_array libc_cv_cc_with_libunwind libc_cv_z_nodelete libc_cv_z_nodlopen libc_cv_z_initfirst libc_cv_z_relro libc_cv_Bgroup libc_cv_libgcc_s_suffix libc_cv_as_needed ASFLAGS_config libc_cv_z_combreloc libc_cv_z_execstack libc_cv_fpie fno_unit_at_a_time libc_cv_have_initfini no_whole_archive exceptions LIBGD have_selinux EGREP sizeof_long_double libc_cv_gcc_unwind_find_fde uname_sysname uname_release uname_version old_glibc_headers libc_cv_slibdir libc_cv_localedir libc_cv_sysconfdir libc_cv_rootsbindir libc_cv_forced_unwind use_ldconfig ldd_rewrite_script gnu_ld gnu_as elf xcoff static shared pic_default profile omitfp bounded static_nss nopic_initfini DEFINES linux_doors mach_interface_list VERSION RELEASE LIBOBJS LTLIBOBJS'
ac_subst_files=''
# Initialize some variables set by options.
@@ -6085,58 +6085,6 @@ _ACEOF
fi
-echo "$as_me:$LINENO: checking if -g produces usable source locations for assembler-with-cpp" >&5
-echo $ECHO_N "checking if -g produces usable source locations for assembler-with-cpp... $ECHO_C" >&6
-if test "${libc_cv_cpp_asm_debuginfo+set}" = set; then
- echo $ECHO_N "(cached) $ECHO_C" >&6
-else
- cat > conftest.S <<EOF
-#include "confdefs.h"
-
-/* comment on
- two lines */
- ${libc_cv_dot_text}
- ${libc_cv_asm_global_directive} foo
-foo:
- /* Unfortunately this test only works for a real instruction,
- not for any of the machine-independent pseudo-ops.
- So we just have to assume everybody has a "nop". */
- nop
- /* comment */
- nop
- /* comment */
- nop
-EOF
-if { ac_try='${CC-cc} $CPPFLAGS $ASFLAGS -g -c conftest.S 1>&5'
- { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
- (eval $ac_try) 2>&5
- ac_status=$?
- echo "$as_me:$LINENO: \$? = $ac_status" >&5
- (exit $ac_status); }; } && {
- ac_pattern='conftest\.S'
- { ac_try='readelf --debug-dump=line conftest.o |
- grep $ac_pattern 1>&5'
- { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
- (eval $ac_try) 2>&5
- ac_status=$?
- echo "$as_me:$LINENO: \$? = $ac_status" >&5
- (exit $ac_status); }; }
- }; then
- libc_cv_cpp_asm_debuginfo=yes
-else
- libc_cv_cpp_asm_debuginfo=no
-fi
-rm -f conftest*
-fi
-echo "$as_me:$LINENO: result: $libc_cv_cpp_asm_debuginfo" >&5
-echo "${ECHO_T}$libc_cv_cpp_asm_debuginfo" >&6
-if test $libc_cv_cpp_asm_debuginfo = yes; then
- cat >>confdefs.h <<\_ACEOF
-#define HAVE_CPP_ASM_DEBUGINFO 1
-_ACEOF
-
-fi
-
echo "$as_me:$LINENO: checking for ld --no-whole-archive" >&5
echo $ECHO_N "checking for ld --no-whole-archive... $ECHO_C" >&6
if test "${libc_cv_ld_no_whole_archive+set}" = set; then
@@ -8390,7 +8338,6 @@ s,@libc_cv_z_execstack@,$libc_cv_z_execstack,;t t
s,@libc_cv_fpie@,$libc_cv_fpie,;t t
s,@fno_unit_at_a_time@,$fno_unit_at_a_time,;t t
s,@libc_cv_have_initfini@,$libc_cv_have_initfini,;t t
-s,@libc_cv_cpp_asm_debuginfo@,$libc_cv_cpp_asm_debuginfo,;t t
s,@no_whole_archive@,$no_whole_archive,;t t
s,@exceptions@,$exceptions,;t t
s,@LIBGD@,$LIBGD,;t t
diff --git a/configure.in b/configure.in
index 559bee0173..4ad54cfefd 100644
--- a/configure.in
+++ b/configure.in
@@ -1641,39 +1641,6 @@ if test $libc_cv_asm_cfi_directives = yes; then
AC_DEFINE(HAVE_ASM_CFI_DIRECTIVES)
fi
-AC_CACHE_CHECK(if -g produces usable source locations for assembler-with-cpp,
- libc_cv_cpp_asm_debuginfo, [dnl
-cat > conftest.S <<EOF
-#include "confdefs.h"
-
-/* comment on
- two lines */
- ${libc_cv_dot_text}
- ${libc_cv_asm_global_directive} foo
-foo:
- /* Unfortunately this test only works for a real instruction,
- not for any of the machine-independent pseudo-ops.
- So we just have to assume everybody has a "nop". */
- nop
- /* comment */
- nop
- /* comment */
- nop
-EOF
-if AC_TRY_COMMAND([${CC-cc} $CPPFLAGS $ASFLAGS -g -c conftest.S 1>&AS_MESSAGE_LOG_FD]) && {
- ac_pattern='conftest\.S'
- AC_TRY_COMMAND([readelf --debug-dump=line conftest.o |
- grep $ac_pattern 1>&AS_MESSAGE_LOG_FD])
- }; then
- libc_cv_cpp_asm_debuginfo=yes
-else
- libc_cv_cpp_asm_debuginfo=no
-fi
-rm -f conftest*])AC_SUBST(libc_cv_cpp_asm_debuginfo)
-if test $libc_cv_cpp_asm_debuginfo = yes; then
- AC_DEFINE(HAVE_CPP_ASM_DEBUGINFO)
-fi
-
AC_CACHE_CHECK(for ld --no-whole-archive, libc_cv_ld_no_whole_archive, [dnl
cat > conftest.c <<\EOF
_start () {}
diff --git a/csu/Makefile b/csu/Makefile
index fbbfe0050a..20709c3c8d 100644
--- a/csu/Makefile
+++ b/csu/Makefile
@@ -1,5 +1,5 @@
# Makefile for csu code for GNU C library.
-# Copyright (C) 1995-2003, 2004 Free Software Foundation, Inc.
+# Copyright (C) 1995-2004, 2005 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
# The GNU C Library is free software; you can redistribute it and/or
@@ -54,11 +54,13 @@ include ../Makeconfig
ifeq (yes,$(build-shared))
extra-objs += S$(start-installed-name)
install-lib += S$(start-installed-name)
+generated += start.os
endif
ifeq (yes,$(build-bounded))
extra-objs += b$(start-installed-name)
install-lib += b$(start-installed-name)
+generated += start.ob
endif
ifneq ($(start-installed-name),$(static-start-installed-name))
diff --git a/elf/Makefile b/elf/Makefile
index 028be25b2d..87172d367d 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -1,4 +1,4 @@
-# Copyright (C) 1995-2002, 2003, 2004 Free Software Foundation, Inc.
+# Copyright (C) 1995-2004, 2005 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
# The GNU C Library is free software; you can redistribute it and/or
@@ -163,7 +163,7 @@ tests-nodlopen-yes = nodlopen nodlopen2
tests-execstack-yes = tst-execstack tst-execstack-needed tst-execstack-prog
endif
ifeq (yesyes,$(have-fpie)$(build-shared))
-tests: $(objpfx)tst-pie1.out
+tests += tst-pie1
endif
modules-names = testobj1 testobj2 testobj3 testobj4 testobj5 testobj6 \
testobj1_1 failobj constload2 constload3 unloadmod \
diff --git a/elf/rtld.c b/elf/rtld.c
index ee7291477f..3835a207ec 100644
--- a/elf/rtld.c
+++ b/elf/rtld.c
@@ -1,5 +1,5 @@
/* Run time dynamic linker.
- Copyright (C) 1995-2002, 2003, 2004 Free Software Foundation, Inc.
+ Copyright (C) 1995-2002, 2003, 2004, 2005 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -1412,9 +1412,9 @@ ERROR: ld.so: object '%s' from %s cannot be preloaded: ignored.\n",
{
if (! l->l_addr)
l->l_addr = ph->p_vaddr;
- else if (ph->p_vaddr + ph->p_memsz >= l->l_map_end)
+ if (ph->p_vaddr + ph->p_memsz >= l->l_map_end)
l->l_map_end = ph->p_vaddr + ph->p_memsz;
- else if ((ph->p_flags & PF_X)
+ if ((ph->p_flags & PF_X)
&& ph->p_vaddr + ph->p_memsz >= l->l_text_end)
l->l_text_end = ph->p_vaddr + ph->p_memsz;
}
diff --git a/fedora/branch.mk b/fedora/branch.mk
index 20986645ab..da12880aec 100644
--- a/fedora/branch.mk
+++ b/fedora/branch.mk
@@ -1,5 +1,5 @@
# This file is updated automatically by Makefile.
glibc-branch := fedora
glibc-base := HEAD
-fedora-sync-date := 2004-12-19 23:31 UTC
-fedora-sync-tag := fedora-glibc-20041219T2331
+fedora-sync-date := 2005-01-06 14:43 UTC
+fedora-sync-tag := fedora-glibc-20050106T1443
diff --git a/iconv/Makefile b/iconv/Makefile
index fe0c453e7b..40c7cbcdd3 100644
--- a/iconv/Makefile
+++ b/iconv/Makefile
@@ -78,3 +78,15 @@ $(inst_bindir)/iconv: $(objpfx)iconv_prog $(+force)
$(objpfx)iconv_prog: $(iconv_prog-modules:%=$(objpfx)%.o)
$(objpfx)iconvconfig: $(iconvconfig-modules:%=$(objpfx)%.o)
+
+ifneq ($(cross-compiling),yes)
+xtests: test-iconvconfig
+endif
+
+.PHONY: test-iconvconfig
+test-iconvconfig: /dev/null $(objpfx)iconvconfig
+ tmp=$(objpfx)gconv-modules.cache.$$$$; \
+ rm -f $$tmp; \
+ $(make-test-out) --output=$$tmp --nostdlib $(inst_gconvdir) && \
+ cmp $$tmp $(inst_gconvdir)/gconv-modules.cache && \
+ rm -f $$tmp
diff --git a/include/features.h b/include/features.h
index ff1de8f472..5e6cca5c86 100644
--- a/include/features.h
+++ b/include/features.h
@@ -295,7 +295,7 @@
/* Major and minor version number of the GNU C library package. Use
these macros to test for features in specific releases. */
#define __GLIBC__ 2
-#define __GLIBC_MINOR__ 3
+#define __GLIBC_MINOR__ 4
#define __GLIBC_PREREQ(maj, min) \
((__GLIBC__ << 16) + __GLIBC_MINOR__ >= ((maj) << 16) + (min))
diff --git a/include/signal.h b/include/signal.h
index 104ea8f83a..dc1e0a12e5 100644
--- a/include/signal.h
+++ b/include/signal.h
@@ -48,6 +48,9 @@ extern int __sigpause (int sig_or_mask, int is_sig);
extern int __default_sigpause (int mask);
extern int __xpg_sigpause (int sig);
+/* Simplified sigemptyset() implementation without the parameter checking. */
+#undef __sigemptyset
+#define __sigemptyset(ss) (memset (ss, '\0', sizeof (sigset_t)), 0)
/* Allocate real-time signal with highest/lowest available priority. */
diff --git a/libio/fmemopen.c b/libio/fmemopen.c
index ab6ffdd678..51e849e846 100644
--- a/libio/fmemopen.c
+++ b/libio/fmemopen.c
@@ -1,5 +1,5 @@
/* Fmemopen implementation.
- Copyright (C) 2000, 2002 Free Software Foundation, Inc.
+ Copyright (C) 2000, 2002, 2005 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Hanno Mueller, kontakt@hanno.de, 2000.
@@ -27,8 +27,6 @@
* but couldn't find it in libio. The following snippet of code is an
* attempt to implement what glibc's documentation describes.
*
- * No, it isn't really tested yet. :-)
- *
*
*
* I already see some potential problems:
@@ -166,7 +164,7 @@ fmemopen_seek (void *cookie, _IO_off64_t *p, int w)
break;
case SEEK_END:
- np = c->size - *p;
+ np = c->maxpos - *p;
break;
default:
@@ -176,9 +174,9 @@ fmemopen_seek (void *cookie, _IO_off64_t *p, int w)
if (np < 0 || (size_t) np > c->size)
return -1;
- c->pos = np;
+ *p = c->pos = np;
- return np;
+ return 0;
}
@@ -203,6 +201,13 @@ fmemopen (void *buf, size_t len, const char *mode)
cookie_io_functions_t iof;
fmemopen_cookie_t *c;
+ if (len == 0)
+ {
+ einval:
+ __set_errno (EINVAL);
+ return NULL;
+ }
+
c = (fmemopen_cookie_t *) malloc (sizeof (fmemopen_cookie_t));
if (c == NULL)
return NULL;
@@ -220,7 +225,12 @@ fmemopen (void *buf, size_t len, const char *mode)
c->buffer[0] = '\0';
}
else
- c->buffer = buf;
+ {
+ if ((uintptr_t) len > -(uintptr_t) buf)
+ goto einval;
+
+ c->buffer = buf;
+ }
c->size = len;
diff --git a/libio/iofopncook.c b/libio/iofopncook.c
index 321eb67b8d..eba3d435dc 100644
--- a/libio/iofopncook.c
+++ b/libio/iofopncook.c
@@ -1,4 +1,5 @@
-/* Copyright (C) 1993,95,97,99,2000,2002,2004 Free Software Foundation, Inc.
+/* Copyright (C) 1993,95,97,99,2000,2002,2004, 2005
+ Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -94,6 +95,20 @@ _IO_cookie_close (fp)
}
+static _IO_off64_t
+_IO_cookie_seekoff (fp, offset, dir, mode)
+ _IO_FILE *fp;
+ _IO_off64_t offset;
+ int dir;
+ int mode;
+{
+ /* We must force the fileops code to always use seek to determine
+ the position. */
+ fp->_offset = _IO_pos_BAD;
+ return INTUSE(_IO_file_seekoff) (fp, offset, dir, mode);
+}
+
+
static const struct _IO_jump_t _IO_cookie_jumps = {
JUMP_INIT_DUMMY,
JUMP_INIT(finish, INTUSE(_IO_file_finish)),
@@ -103,7 +118,7 @@ static const struct _IO_jump_t _IO_cookie_jumps = {
JUMP_INIT(pbackfail, INTUSE(_IO_default_pbackfail)),
JUMP_INIT(xsputn, INTUSE(_IO_file_xsputn)),
JUMP_INIT(xsgetn, INTUSE(_IO_default_xsgetn)),
- JUMP_INIT(seekoff, INTUSE(_IO_file_seekoff)),
+ JUMP_INIT(seekoff, _IO_cookie_seekoff),
JUMP_INIT(seekpos, _IO_default_seekpos),
JUMP_INIT(setbuf, INTUSE(_IO_file_setbuf)),
JUMP_INIT(sync, INTUSE(_IO_file_sync)),
@@ -223,7 +238,7 @@ static const struct _IO_jump_t _IO_old_cookie_jumps = {
JUMP_INIT(pbackfail, INTUSE(_IO_default_pbackfail)),
JUMP_INIT(xsputn, INTUSE(_IO_file_xsputn)),
JUMP_INIT(xsgetn, INTUSE(_IO_default_xsgetn)),
- JUMP_INIT(seekoff, INTUSE(_IO_file_seekoff)),
+ JUMP_INIT(seekoff, _IO_cookie_seekoff),
JUMP_INIT(seekpos, _IO_default_seekpos),
JUMP_INIT(setbuf, INTUSE(_IO_file_setbuf)),
JUMP_INIT(sync, INTUSE(_IO_file_sync)),
diff --git a/linuxthreads/ChangeLog b/linuxthreads/ChangeLog
index 9577130e10..ff1b7fea93 100644
--- a/linuxthreads/ChangeLog
+++ b/linuxthreads/ChangeLog
@@ -1,3 +1,10 @@
+2004-12-21 Jakub Jelinek <jakub@redhat.com>
+
+ * Makefile (tests): Add tst-align.
+ * tst-align.c: New test.
+ * sysdeps/i386/Makefile (CFLAGS-tst-align.c): Add
+ -mpreferred-stack-boundary=4.
+
2004-12-12 Ulrich Drepper <drepper@redhat.com>
* internals.h: Include <stdbool.h> to match includes used in nptl.
diff --git a/linuxthreads/Makefile b/linuxthreads/Makefile
index f4c9f2a916..d9bf476c7e 100644
--- a/linuxthreads/Makefile
+++ b/linuxthreads/Makefile
@@ -111,7 +111,7 @@ tests = ex1 ex2 ex3 ex4 ex5 ex6 ex7 ex8 ex9 $(librt-tests) ex12 ex13 joinrace \
ex17 ex18 tst-cancel tst-context bug-sleep \
tst-cancel1 tst-cancel2 tst-cancel3 tst-cancel4 tst-cancel5 \
tst-cancel6 tst-cancel7 tst-cancel8 tst-popen tst-popen2 tst-attr1 \
- tst-stack1
+ tst-stack1 tst-align
test-srcs = tst-signal
# These tests are linked with libc before libpthread
tests-reverse += tst-cancel5
diff --git a/linuxthreads/sysdeps/i386/Makefile b/linuxthreads/sysdeps/i386/Makefile
index 45183d1cd3..418fa5c6ef 100644
--- a/linuxthreads/sysdeps/i386/Makefile
+++ b/linuxthreads/sysdeps/i386/Makefile
@@ -15,6 +15,7 @@ CFLAGS-pthread.c += -fno-omit-frame-pointer -mpreferred-stack-boundary=4
CFLAGS-ptlongjmp.c += -fno-omit-frame-pointer
CFLAGS-semaphore.c += -fno-omit-frame-pointer
CFLAGS-sighandler.c += -fno-omit-frame-pointer -mpreferred-stack-boundary=4
+CFLAGS-tst-align.c += -mpreferred-stack-boundary=4
endif
ifeq ($(subdir),csu)
diff --git a/linuxthreads/tst-align.c b/linuxthreads/tst-align.c
new file mode 100644
index 0000000000..2de9d7a107
--- /dev/null
+++ b/linuxthreads/tst-align.c
@@ -0,0 +1,71 @@
+/* Copyright (C) 2003 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@redhat.com>, 2003.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <tst-stack-align.h>
+
+static void *
+tf (void *arg)
+{
+ bool ok = true;
+
+ puts ("in thread");
+
+ if (TEST_STACK_ALIGN ())
+ ok = false;
+
+ return ok ? NULL : (void *) -1l;
+}
+
+static int
+do_test (void)
+{
+ bool ok = true;
+
+ puts ("in main");
+
+ if (TEST_STACK_ALIGN ())
+ ok = false;
+
+ pthread_t th;
+ if (pthread_create (&th, NULL, tf, NULL) != 0)
+ {
+ puts ("create failed");
+ return 1;
+ }
+
+ void *res;
+ if (pthread_join (th, &res) != 0)
+ {
+ puts ("join failed");
+ return 1;
+ }
+
+ if (res != NULL)
+ ok = false;
+
+ return ok ? 0 : 1;
+}
+
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
diff --git a/localedata/ChangeLog b/localedata/ChangeLog
index 42ce0637ae..8c8616e7cf 100644
--- a/localedata/ChangeLog
+++ b/localedata/ChangeLog
@@ -1,3 +1,7 @@
+2004-12-19 Roland McGrath <roland@frob.com>
+
+ * gen-unicode-ctype.c (output_tables): Fix email address in output.
+
2004-10-02 Petter Reinholdtsen <pere@hungry.com>
[BZ #82]
diff --git a/localedata/gen-unicode-ctype.c b/localedata/gen-unicode-ctype.c
index a9c51b3f48..849f272ed5 100644
--- a/localedata/gen-unicode-ctype.c
+++ b/localedata/gen-unicode-ctype.c
@@ -638,7 +638,7 @@ output_tables (const char *filename, const char *version)
fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
fprintf (stream, "address \"\"\n");
fprintf (stream, "contact \"\"\n");
- fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
+ fprintf (stream, "email \"bug-glibc-locales@gnu.org\"\n");
fprintf (stream, "tel \"\"\n");
fprintf (stream, "fax \"\"\n");
fprintf (stream, "language \"\"\n");
diff --git a/misc/efgcvt_r.c b/misc/efgcvt_r.c
index ac2a5c45bf..69cca9038f 100644
--- a/misc/efgcvt_r.c
+++ b/misc/efgcvt_r.c
@@ -1,5 +1,5 @@
/* Compatibility functions for floating point formatting, reentrant versions.
- Copyright (C) 1995,96,97,98,99,2000,01,02 Free Software Foundation, Inc.
+ Copyright (C) 1995,96,97,98,99,2000,01,02,04 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -31,6 +31,7 @@
# define FUNC_PREFIX
# define FLOAT_FMT_FLAG
# define FLOAT_NAME_EXT
+# define FLOAT_MIN_10_EXP DBL_MIN_10_EXP
# if DBL_MANT_DIG == 53
# define NDIGIT_MAX 17
# elif DBL_MANT_DIG == 24
@@ -43,6 +44,17 @@
# error "NDIGIT_MAX must be precomputed"
# define NDIGIT_MAX (lrint (ceil (M_LN2 / M_LN10 * DBL_MANT_DIG + 1.0)))
# endif
+# if DBL_MIN_10_EXP == -37
+# define FLOAT_MIN_10_NORM 1.0e-37
+# elif DBL_MIN_10_EXP == -307
+# define FLOAT_MIN_10_NORM 1.0e-307
+# elif DBL_MIN_10_EXP == -4931
+# define FLOAT_MIN_10_NORM 1.0e-4931
+# else
+/* libc can't depend on libm. */
+# error "FLOAT_MIN_10_NORM must be precomputed"
+# define FLOAT_MIN_10_NORM exp10 (DBL_MIN_10_EXP)
+# endif
#endif
#define APPEND(a, b) APPEND2 (a, b)
@@ -171,6 +183,17 @@ APPEND (FUNC_PREFIX, ecvt_r) (value, ndigit, decpt, sign, buf, len)
d = -value;
else
d = value;
+ /* For denormalized numbers the d < 1.0 case below won't work,
+ as f can overflow to +Inf. */
+ if (d < FLOAT_MIN_10_NORM)
+ {
+ value /= FLOAT_MIN_10_NORM;
+ if (value < 0.0)
+ d = -value;
+ else
+ d = value;
+ exponent += FLOAT_MIN_10_EXP;
+ }
if (d < 1.0)
{
do
diff --git a/misc/qefgcvt_r.c b/misc/qefgcvt_r.c
index 66cc049ec8..d5b2a799b3 100644
--- a/misc/qefgcvt_r.c
+++ b/misc/qefgcvt_r.c
@@ -24,6 +24,7 @@
#define FUNC_PREFIX q
#define FLOAT_FMT_FLAG "L"
#define FLOAT_NAME_EXT l
+#define FLOAT_MIN_10_EXP LDBL_MIN_10_EXP
#if LDBL_MANT_DIG == 64
# define NDIGIT_MAX 21
#elif LDBL_MANT_DIG == 53
@@ -40,5 +41,16 @@
# error "NDIGIT_MAX must be precomputed"
# define NDIGIT_MAX (lrint (ceil (M_LN2 / M_LN10 * LDBL_MANT_DIG + 1.0)))
#endif
+#if LDBL_MIN_10_EXP == -37
+# define FLOAT_MIN_10_NORM 1.0e-37L
+#elif LDBL_MIN_10_EXP == -307
+# define FLOAT_MIN_10_NORM 1.0e-307L
+#elif LDBL_MIN_10_EXP == -4931
+# define FLOAT_MIN_10_NORM 1.0e-4931L
+#else
+/* libc can't depend on libm. */
+# error "FLOAT_MIN_10_NORM must be precomputed"
+# define FLOAT_MIN_10_NORM exp10l (LDBL_MIN_10_EXP)
+#endif
#include "efgcvt_r.c"
diff --git a/misc/tst-efgcvt.c b/misc/tst-efgcvt.c
index 91e5cf929e..30ab0f17a0 100644
--- a/misc/tst-efgcvt.c
+++ b/misc/tst-efgcvt.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+/* Copyright (C) 1998, 1999, 2000, 2004 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -20,6 +20,7 @@
# define _GNU_SOURCE 1
#endif
+#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
@@ -59,6 +60,10 @@ static testcase ecvt_tests[] =
{ 123.01, -4, 3, "" },
{ 126.71, -4, 3, "" },
{ 0.0, 4, 1, "0000" },
+#if DBL_MANT_DIG == 53
+ { 0x1p-1074, 3, -323, "494" },
+ { -0x1p-1074, 3, -323, "494" },
+#endif
/* -1.0 is end marker. */
{ -1.0, 0, 0, "" }
};
diff --git a/nptl/ChangeLog b/nptl/ChangeLog
index de90f2a66d..ec4d10b7ad 100644
--- a/nptl/ChangeLog
+++ b/nptl/ChangeLog
@@ -1,3 +1,16 @@
+2004-12-27 Ulrich Drepper <drepper@redhat.com>
+
+ * init.c (__pthread_initialize_minimal_internal): Use __sigemptyset.
+
+2004-12-21 Jakub Jelinek <jakub@redhat.com>
+
+ * sysdeps/i386/tls.h (CALL_THREAD_FCT): Maintain 16 byte alignment of
+ %esp.
+ * Makefile (tests): Add tst-align2.
+ * tst-align2.c: New test.
+ * sysdeps/i386/Makefile (CFLAGS-tst-align{,2}.c): Add
+ -mpreferred-stack-boundary=4.
+
2004-12-18 Roland McGrath <roland@redhat.com>
* sysdeps/unix/sysv/linux/powerpc/powerpc64/bits/local_lim.h:
diff --git a/nptl/Makefile b/nptl/Makefile
index d42f356131..8d18946e6f 100644
--- a/nptl/Makefile
+++ b/nptl/Makefile
@@ -205,7 +205,7 @@ tests = tst-attr1 tst-attr2 tst-attr3 \
tst-sem1 tst-sem2 tst-sem3 tst-sem4 tst-sem5 tst-sem6 tst-sem7 \
tst-sem8 tst-sem9 \
tst-barrier1 tst-barrier2 tst-barrier3 tst-barrier4 \
- tst-align \
+ tst-align tst-align2 \
tst-basic1 tst-basic2 tst-basic3 tst-basic4 tst-basic5 tst-basic6 \
tst-kill1 tst-kill2 tst-kill3 tst-kill4 tst-kill5 tst-kill6 \
tst-raise1 \
diff --git a/nptl/init.c b/nptl/init.c
index 3751e6be77..86745af8d1 100644
--- a/nptl/init.c
+++ b/nptl/init.c
@@ -262,7 +262,7 @@ __pthread_initialize_minimal_internal (void)
struct sigaction sa;
sa.sa_sigaction = sigcancel_handler;
sa.sa_flags = SA_SIGINFO;
- sigemptyset (&sa.sa_mask);
+ __sigemptyset (&sa.sa_mask);
(void) __libc_sigaction (SIGCANCEL, &sa, NULL);
diff --git a/nptl/sysdeps/i386/Makefile b/nptl/sysdeps/i386/Makefile
index 693fb0569f..2f0d88f303 100644
--- a/nptl/sysdeps/i386/Makefile
+++ b/nptl/sysdeps/i386/Makefile
@@ -22,4 +22,6 @@ endif
ifeq ($(subdir),nptl)
CFLAGS-pthread_create.c += -mpreferred-stack-boundary=4
+CFLAGS-tst-align.c += -mpreferred-stack-boundary=4
+CFLAGS-tst-align2.c += -mpreferred-stack-boundary=4
endif
diff --git a/nptl/sysdeps/i386/tls.h b/nptl/sysdeps/i386/tls.h
index 18b038f93e..945a4c71d6 100644
--- a/nptl/sysdeps/i386/tls.h
+++ b/nptl/sysdeps/i386/tls.h
@@ -397,9 +397,12 @@ union user_desc_init
#define CALL_THREAD_FCT(descr) \
({ void *__res; \
int __ignore1, __ignore2; \
- asm volatile ("pushl %%gs:%P4\n\t" \
+ asm volatile ("pushl %%eax\n\t" \
+ "pushl %%eax\n\t" \
+ "pushl %%eax\n\t" \
+ "pushl %%gs:%P4\n\t" \
"call *%%gs:%P3\n\t" \
- "addl $4, %%esp" \
+ "addl $16, %%esp" \
: "=a" (__res), "=c" (__ignore1), "=d" (__ignore2) \
: "i" (offsetof (struct pthread, start_routine)), \
"i" (offsetof (struct pthread, arg))); \
diff --git a/nptl/tst-align2.c b/nptl/tst-align2.c
new file mode 100644
index 0000000000..ec85f435b6
--- /dev/null
+++ b/nptl/tst-align2.c
@@ -0,0 +1,87 @@
+/* Copyright (C) 2004 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sched.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <tst-stack-align.h>
+
+static int
+f (void *arg)
+{
+ bool ok = true;
+
+ if (TEST_STACK_ALIGN ())
+ ok = false;
+
+ return ok ? 0 : 1;
+}
+
+static int
+do_test (void)
+{
+ bool ok = true;
+
+ puts ("in main");
+
+ if (TEST_STACK_ALIGN ())
+ ok = false;
+
+#ifdef __ia64__
+ extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
+ size_t __child_stack_size, int __flags,
+ void *__arg, ...);
+ char st[256 * 1024];
+ pid_t p = __clone2 (f, st, sizeof (st), 0, 0);
+#else
+ char st[128 * 1024];
+ pid_t p = clone (f, st + sizeof (st), 0, 0);
+#endif
+ if (p == -1)
+ {
+ printf("clone failed: %m\n");
+ return 1;
+ }
+
+ int e;
+ if (waitpid (p, &e, __WCLONE) != p)
+ {
+ puts ("waitpid failed");
+ kill (p, SIGKILL);
+ return 1;
+ }
+ if (!WIFEXITED (e))
+ {
+ if (WIFSIGNALED (e))
+ printf ("died from signal %s\n", strsignal (WTERMSIG (e)));
+ else
+ puts ("did not terminate correctly");
+ return 1;
+ }
+ if (WEXITSTATUS (e) != 0)
+ ok = false;
+
+ return ok ? 0 : 1;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
diff --git a/po/es.po b/po/es.po
index eb79bd3d02..ba6a1fbd83 100644
--- a/po/es.po
+++ b/po/es.po
@@ -1,13 +1,14 @@
# Mensajes en español para GNU libc.
-# Copyright (C) 1996, 1997, 1998, 2001, 2002, 2003 Free Software Foundation, Inc.
+# Copyright (C) 1996, 1997, 1998, 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
# Enrique Melero Gómez <melero@eurolands.com>, 1996, 1997.
-# Santiago Vila Doncel <sanvila@unex.es>, 1997, 1998, 2001, 2002, 2003.
+# Santiago Vila Doncel <sanvila@unex.es>, 1997, 1998, 2001, 2002, 2003, 2004.
#
msgid ""
msgstr ""
-"Project-Id-Version: GNU libc 2.3.2\n"
-"POT-Creation-Date: 2003-02-22 15:34-0800\n"
-"PO-Revision-Date: 2003-03-03 17:20+0100\n"
+"Project-Id-Version: GNU libc 2.3.3\n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2004-08-05 09:16+0200\n"
+"PO-Revision-Date: 2004-12-28 19:46+0100\n"
"Last-Translator: Santiago Vila Doncel <sanvila@unex.es>\n"
"Language-Team: Spanish <es@li.org>\n"
"MIME-Version: 1.0\n"
@@ -219,12 +220,12 @@ msgstr "no se puede abrir `%s'"
msgid "cannot read header from `%s'"
msgstr "no se puede leer la cabecera de `%s'"
-#: iconv/iconv_charmap.c:159 iconv/iconv_prog.c:293 catgets/gencat.c:288
+#: iconv/iconv_charmap.c:159 iconv/iconv_prog.c:295 catgets/gencat.c:288
#, c-format
msgid "cannot open input file `%s'"
msgstr "no se puede abrir el fichero de entrada `%s'"
-#: iconv/iconv_charmap.c:177 iconv/iconv_prog.c:311
+#: iconv/iconv_charmap.c:177 iconv/iconv_prog.c:313
#, c-format
msgid "error while closing input `%s'"
msgstr "error al cerrar la entrada `%s'"
@@ -234,16 +235,16 @@ msgstr "error al cerrar la entrada `%s'"
msgid "illegal input sequence at position %Zd"
msgstr "secuencia de entrada ilegal en la posición %Zd"
-#: iconv/iconv_charmap.c:462 iconv/iconv_prog.c:503
+#: iconv/iconv_charmap.c:462 iconv/iconv_prog.c:506
msgid "incomplete character or shift sequence at end of buffer"
msgstr "carácter o secuencia de desplazamiento incompleta al final del búfer"
-#: iconv/iconv_charmap.c:507 iconv/iconv_charmap.c:543 iconv/iconv_prog.c:546
-#: iconv/iconv_prog.c:582
+#: iconv/iconv_charmap.c:507 iconv/iconv_charmap.c:543 iconv/iconv_prog.c:549
+#: iconv/iconv_prog.c:585
msgid "error while reading the input"
msgstr "error al leer la entrada"
-#: iconv/iconv_charmap.c:525 iconv/iconv_prog.c:564
+#: iconv/iconv_charmap.c:525 iconv/iconv_prog.c:567
msgid "unable to allocate buffer for input"
msgstr "no se puede asignar espacio para el búfer de entrada"
@@ -295,47 +296,47 @@ msgstr "Convierte la codificación de los ficheros dados de una codificación a ot
msgid "[FILE...]"
msgstr "[FICHERO...]"
-#: iconv/iconv_prog.c:199
+#: iconv/iconv_prog.c:201
msgid "cannot open output file"
msgstr "no se puede abrir el fichero de salida"
-#: iconv/iconv_prog.c:241
+#: iconv/iconv_prog.c:243
#, c-format
msgid "conversion from `%s' and to `%s' are not supported"
msgstr "no se admite la conversión de `%s' a `%s'"
-#: iconv/iconv_prog.c:246
+#: iconv/iconv_prog.c:248
#, c-format
msgid "conversion from `%s' is not supported"
msgstr "no se admite la conversión de `%s'"
-#: iconv/iconv_prog.c:253
+#: iconv/iconv_prog.c:255
#, c-format
msgid "conversion to `%s' is not supported"
msgstr "no se admite la conversión a `%s'"
-#: iconv/iconv_prog.c:257
+#: iconv/iconv_prog.c:259
#, c-format
msgid "conversion from `%s' to `%s' is not supported"
msgstr "no se admite la conversión de `%s' a `%s'"
-#: iconv/iconv_prog.c:263
+#: iconv/iconv_prog.c:265
msgid "failed to start conversion processing"
msgstr "fallo al comenzar el proceso de conversión"
-#: iconv/iconv_prog.c:358
+#: iconv/iconv_prog.c:360
msgid "error while closing output file"
msgstr "error al cerrar el fichero de salida"
-#: iconv/iconv_prog.c:407 iconv/iconvconfig.c:357 locale/programs/locale.c:274
+#: iconv/iconv_prog.c:409 iconv/iconvconfig.c:357 locale/programs/locale.c:279
#: locale/programs/localedef.c:372 catgets/gencat.c:233
#: malloc/memusagestat.c:602 debug/pcprofiledump.c:199
msgid "Report bugs using the `glibcbug' script to <bugs@gnu.org>.\n"
msgstr "Comunicar bichos usando el programa `glibcbug' a <bugs@gnu.org>.\n"
-#: iconv/iconv_prog.c:421 iconv/iconvconfig.c:371 locale/programs/locale.c:287
-#: locale/programs/localedef.c:386 catgets/gencat.c:246 posix/getconf.c:910
-#: nss/getent.c:74 nscd/nscd.c:330 nscd/nscd_nischeck.c:90 elf/ldconfig.c:271
+#: iconv/iconv_prog.c:423 iconv/iconvconfig.c:371 locale/programs/locale.c:292
+#: locale/programs/localedef.c:386 catgets/gencat.c:246 posix/getconf.c:913
+#: nss/getent.c:74 nscd/nscd.c:355 nscd/nscd_nischeck.c:90 elf/ldconfig.c:274
#: elf/sprof.c:349
#, c-format
msgid ""
@@ -348,34 +349,34 @@ msgstr ""
"No hay NINGUNA garantía; ni siquiera de COMERCIABILIDAD o IDONEIDAD PARA UN\n"
"FIN DETERMINADO.\n"
-#: iconv/iconv_prog.c:426 iconv/iconvconfig.c:376 locale/programs/locale.c:292
-#: locale/programs/localedef.c:391 catgets/gencat.c:251 posix/getconf.c:915
-#: nss/getent.c:79 nscd/nscd.c:335 nscd/nscd_nischeck.c:95 elf/ldconfig.c:276
+#: iconv/iconv_prog.c:428 iconv/iconvconfig.c:376 locale/programs/locale.c:297
+#: locale/programs/localedef.c:391 catgets/gencat.c:251 posix/getconf.c:918
+#: nss/getent.c:79 nscd/nscd.c:360 nscd/nscd_nischeck.c:95 elf/ldconfig.c:279
#: elf/sprof.c:355
#, c-format
msgid "Written by %s.\n"
msgstr "Escrito por %s.\n"
-#: iconv/iconv_prog.c:456 iconv/iconv_prog.c:482
+#: iconv/iconv_prog.c:458 iconv/iconv_prog.c:484
msgid "conversion stopped due to problem in writing the output"
msgstr "la conversión se ha detenido debido a un problema al escribir el resultado"
-#: iconv/iconv_prog.c:499
+#: iconv/iconv_prog.c:502
#, c-format
msgid "illegal input sequence at position %ld"
msgstr "secuencia de entrada ilegal en la posición %ld"
-#: iconv/iconv_prog.c:507
+#: iconv/iconv_prog.c:510
msgid "internal error (illegal descriptor)"
msgstr "error interno (descriptor ilegal)"
-#: iconv/iconv_prog.c:510
+#: iconv/iconv_prog.c:513
#, c-format
msgid "unknown iconv() error %d"
msgstr "error de iconv() desconocido %d"
# FIXME: Espacio en blanco final.
-#: iconv/iconv_prog.c:753
+#: iconv/iconv_prog.c:756
msgid ""
"The following list contain all the coded character sets known. This does\n"
"not necessarily mean that all combinations of these names can be used for\n"
@@ -426,169 +427,169 @@ msgstr "no se puede generar el fichero de salida"
msgid "cannot read character map directory `%s'"
msgstr "no se puede leer el directorio de tablas de caracteres `%s'"
-#: locale/programs/charmap.c:135
+#: locale/programs/charmap.c:136
#, c-format
msgid "character map file `%s' not found"
msgstr "el fichero de tabla de caracteres `%s' no se encontró"
-#: locale/programs/charmap.c:193
+#: locale/programs/charmap.c:194
#, c-format
msgid "default character map file `%s' not found"
msgstr "no se encontró el fichero de tabla de caracteres predeterminado `%s'"
-#: locale/programs/charmap.c:255
+#: locale/programs/charmap.c:257
#, c-format
msgid "character map `%s' is not ASCII compatible, locale not ISO C compliant\n"
msgstr ""
"la tabla de caracteres `%s' no es compatible con ASCII, el local no cumple\n"
"con ISO C\n"
-#: locale/programs/charmap.c:332
+#: locale/programs/charmap.c:336
#, c-format
msgid "%s: <mb_cur_max> must be greater than <mb_cur_min>\n"
-msgstr "%s: <mb_cur_max> debe ser más grande que <mb_cur_min>\n"
+msgstr "%s: <mb_cur_max> debe ser mayor que <mb_cur_min>\n"
-#: locale/programs/charmap.c:352 locale/programs/charmap.c:369
+#: locale/programs/charmap.c:356 locale/programs/charmap.c:373
#: locale/programs/repertoire.c:175
#, c-format
msgid "syntax error in prolog: %s"
msgstr "error de sintaxis en el prólogo: %s"
-#: locale/programs/charmap.c:353
+#: locale/programs/charmap.c:357
msgid "invalid definition"
msgstr "definición inválida"
-#: locale/programs/charmap.c:370 locale/programs/locfile.c:126
+#: locale/programs/charmap.c:374 locale/programs/locfile.c:126
#: locale/programs/locfile.c:153 locale/programs/repertoire.c:176
msgid "bad argument"
-msgstr "Argumento erróneo"
+msgstr "argumento erróneo"
-#: locale/programs/charmap.c:398
+#: locale/programs/charmap.c:402
#, c-format
msgid "duplicate definition of <%s>"
msgstr "definición duplicada de <%s>"
-#: locale/programs/charmap.c:405
+#: locale/programs/charmap.c:409
#, c-format
msgid "value for <%s> must be 1 or greater"
msgstr "el valor para <%s> debe ser 1 o mayor"
# Milagro, por una vez es más corto en español :-) sv
-#: locale/programs/charmap.c:417
+#: locale/programs/charmap.c:421
#, c-format
msgid "value of <%s> must be greater or equal than the value of <%s>"
msgstr "el valor de <%s> debe ser mayor o igual que el valor de <%s>"
-#: locale/programs/charmap.c:440 locale/programs/repertoire.c:184
+#: locale/programs/charmap.c:444 locale/programs/repertoire.c:184
#, c-format
msgid "argument to <%s> must be a single character"
msgstr "el argumento para <%s> debe ser un único carácter"
-#: locale/programs/charmap.c:466
+#: locale/programs/charmap.c:470
msgid "character sets with locking states are not supported"
msgstr "los conjuntos de caracteres con estados de bloqueo no están soportados"
-#: locale/programs/charmap.c:493 locale/programs/charmap.c:547
-#: locale/programs/charmap.c:579 locale/programs/charmap.c:673
-#: locale/programs/charmap.c:728 locale/programs/charmap.c:769
-#: locale/programs/charmap.c:810
+#: locale/programs/charmap.c:497 locale/programs/charmap.c:551
+#: locale/programs/charmap.c:583 locale/programs/charmap.c:677
+#: locale/programs/charmap.c:732 locale/programs/charmap.c:773
+#: locale/programs/charmap.c:814
#, c-format
msgid "syntax error in %s definition: %s"
msgstr "error de sintaxis en la definición de %s: %s"
-#: locale/programs/charmap.c:494 locale/programs/charmap.c:674
-#: locale/programs/charmap.c:770 locale/programs/repertoire.c:231
+#: locale/programs/charmap.c:498 locale/programs/charmap.c:678
+#: locale/programs/charmap.c:774 locale/programs/repertoire.c:231
msgid "no symbolic name given"
msgstr "no se ha especificado ningún nombre simbólico"
-#: locale/programs/charmap.c:548
+#: locale/programs/charmap.c:552
msgid "invalid encoding given"
msgstr "especificada una codificación inválida"
-#: locale/programs/charmap.c:557
+#: locale/programs/charmap.c:561
msgid "too few bytes in character encoding"
msgstr "insuficiente número de bytes en la codificación del carácter"
-#: locale/programs/charmap.c:559
+#: locale/programs/charmap.c:563
msgid "too many bytes in character encoding"
msgstr "demasiados bytes en la codificación del carácter"
-#: locale/programs/charmap.c:581 locale/programs/charmap.c:729
-#: locale/programs/charmap.c:812 locale/programs/repertoire.c:297
+#: locale/programs/charmap.c:585 locale/programs/charmap.c:733
+#: locale/programs/charmap.c:816 locale/programs/repertoire.c:297
msgid "no symbolic name given for end of range"
msgstr "no se ha especificado ningún nombre simbólico para el final del rango"
-#: locale/programs/charmap.c:605 locale/programs/locfile.h:96
+#: locale/programs/charmap.c:609 locale/programs/locfile.c:818
#: locale/programs/repertoire.c:314
#, c-format
msgid "`%1$s' definition does not end with `END %1$s'"
msgstr "La definición `%1$s' no termina con `END %1$s'"
-#: locale/programs/charmap.c:638
+#: locale/programs/charmap.c:642
msgid "only WIDTH definitions are allowed to follow the CHARMAP definition"
msgstr "solamente se permiten definiciones WIDTH después de la definición CHARMAP"
-#: locale/programs/charmap.c:646 locale/programs/charmap.c:709
+#: locale/programs/charmap.c:650 locale/programs/charmap.c:713
#, c-format
msgid "value for %s must be an integer"
msgstr "el valor para %s debe ser un número entero"
# Para entender este mensaje, pensar en Turing.
-#: locale/programs/charmap.c:837
+#: locale/programs/charmap.c:841
#, c-format
msgid "%s: error in state machine"
msgstr "%s: error en la máquina de estados"
-#: locale/programs/charmap.c:845 locale/programs/ld-address.c:605
-#: locale/programs/ld-collate.c:2635 locale/programs/ld-collate.c:3793
-#: locale/programs/ld-ctype.c:2216 locale/programs/ld-ctype.c:2977
+#: locale/programs/charmap.c:849 locale/programs/ld-address.c:605
+#: locale/programs/ld-collate.c:2650 locale/programs/ld-collate.c:3818
+#: locale/programs/ld-ctype.c:2225 locale/programs/ld-ctype.c:2994
#: locale/programs/ld-identification.c:469
#: locale/programs/ld-measurement.c:255 locale/programs/ld-messages.c:349
-#: locale/programs/ld-monetary.c:952 locale/programs/ld-name.c:324
+#: locale/programs/ld-monetary.c:958 locale/programs/ld-name.c:324
#: locale/programs/ld-numeric.c:392 locale/programs/ld-paper.c:258
-#: locale/programs/ld-telephone.c:330 locale/programs/ld-time.c:1217
-#: locale/programs/locfile.h:103 locale/programs/repertoire.c:325
+#: locale/programs/ld-telephone.c:330 locale/programs/ld-time.c:1219
+#: locale/programs/locfile.c:825 locale/programs/repertoire.c:325
#, c-format
msgid "%s: premature end of file"
msgstr "%s: fin de fichero no esperado"
-#: locale/programs/charmap.c:864 locale/programs/charmap.c:875
+#: locale/programs/charmap.c:868 locale/programs/charmap.c:879
#, c-format
msgid "unknown character `%s'"
msgstr "carácter desconocido `%s'"
-#: locale/programs/charmap.c:883
+#: locale/programs/charmap.c:887
#, c-format
msgid "number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"
msgstr ""
"el número de bytes para la sucesión de bytes de comienzo y final del rango\n"
"no es el mismo: %d vs %d"
-#: locale/programs/charmap.c:987 locale/programs/ld-collate.c:2915
+#: locale/programs/charmap.c:991 locale/programs/ld-collate.c:2930
#: locale/programs/repertoire.c:420
msgid "invalid names for character range"
msgstr "nombres inválidos para el rango de caracteres"
-#: locale/programs/charmap.c:999 locale/programs/repertoire.c:432
+#: locale/programs/charmap.c:1003 locale/programs/repertoire.c:432
msgid "hexadecimal range format should use only capital characters"
-msgstr "el formato de rango hexadecimal debe usar solamente caracteres en mayúscula"
+msgstr "el formato de rango hexadecimal debe usar solamente caracteres en mayúsculas"
-#: locale/programs/charmap.c:1017
+#: locale/programs/charmap.c:1021
#, c-format
msgid "<%s> and <%s> are illegal names for range"
msgstr "<%s> y <%s> son nombres no permitidos para el rango de caracteres"
# FIXME: then -> than
-#: locale/programs/charmap.c:1023
+#: locale/programs/charmap.c:1027
msgid "upper limit in range is not higher then lower limit"
msgstr "el límite superior del rango no es mayor que el límite inferior"
-#: locale/programs/charmap.c:1081
+#: locale/programs/charmap.c:1085
msgid "resulting bytes for range not representable."
msgstr "los bytes resultantes para el rango no son representables."
-#: locale/programs/ld-address.c:134 locale/programs/ld-collate.c:1519
-#: locale/programs/ld-ctype.c:416 locale/programs/ld-identification.c:134
+#: locale/programs/ld-address.c:134 locale/programs/ld-collate.c:1534
+#: locale/programs/ld-ctype.c:421 locale/programs/ld-identification.c:134
#: locale/programs/ld-measurement.c:95 locale/programs/ld-messages.c:98
#: locale/programs/ld-monetary.c:194 locale/programs/ld-name.c:95
#: locale/programs/ld-numeric.c:99 locale/programs/ld-paper.c:92
@@ -603,8 +604,8 @@ msgstr "No se encontró ninguna definición para la categoría %s"
#: locale/programs/ld-address.c:290 locale/programs/ld-address.c:309
#: locale/programs/ld-address.c:322 locale/programs/ld-identification.c:147
#: locale/programs/ld-measurement.c:106 locale/programs/ld-monetary.c:206
-#: locale/programs/ld-monetary.c:244 locale/programs/ld-monetary.c:260
-#: locale/programs/ld-monetary.c:272 locale/programs/ld-name.c:106
+#: locale/programs/ld-monetary.c:250 locale/programs/ld-monetary.c:266
+#: locale/programs/ld-monetary.c:278 locale/programs/ld-name.c:106
#: locale/programs/ld-name.c:143 locale/programs/ld-numeric.c:113
#: locale/programs/ld-numeric.c:127 locale/programs/ld-paper.c:103
#: locale/programs/ld-paper.c:112 locale/programs/ld-telephone.c:106
@@ -648,503 +649,503 @@ msgid "%s: numeric country code `%d' not valid"
msgstr "%s: el código numérico de país `%d' no es válido"
#: locale/programs/ld-address.c:497 locale/programs/ld-address.c:534
-#: locale/programs/ld-address.c:572 locale/programs/ld-ctype.c:2592
+#: locale/programs/ld-address.c:572 locale/programs/ld-ctype.c:2601
#: locale/programs/ld-identification.c:365
#: locale/programs/ld-measurement.c:222 locale/programs/ld-messages.c:302
-#: locale/programs/ld-monetary.c:694 locale/programs/ld-monetary.c:729
-#: locale/programs/ld-monetary.c:770 locale/programs/ld-name.c:281
+#: locale/programs/ld-monetary.c:700 locale/programs/ld-monetary.c:735
+#: locale/programs/ld-monetary.c:776 locale/programs/ld-name.c:281
#: locale/programs/ld-numeric.c:264 locale/programs/ld-paper.c:225
-#: locale/programs/ld-telephone.c:289 locale/programs/ld-time.c:1106
-#: locale/programs/ld-time.c:1148
+#: locale/programs/ld-telephone.c:289 locale/programs/ld-time.c:1108
+#: locale/programs/ld-time.c:1150
#, c-format
msgid "%s: field `%s' declared more than once"
msgstr "%s: el campo `%s' ha sido declarado más de una vez"
#: locale/programs/ld-address.c:501 locale/programs/ld-address.c:539
#: locale/programs/ld-identification.c:369 locale/programs/ld-messages.c:312
-#: locale/programs/ld-monetary.c:698 locale/programs/ld-monetary.c:733
+#: locale/programs/ld-monetary.c:704 locale/programs/ld-monetary.c:739
#: locale/programs/ld-name.c:285 locale/programs/ld-numeric.c:268
-#: locale/programs/ld-telephone.c:293 locale/programs/ld-time.c:1000
-#: locale/programs/ld-time.c:1069 locale/programs/ld-time.c:1111
+#: locale/programs/ld-telephone.c:293 locale/programs/ld-time.c:1002
+#: locale/programs/ld-time.c:1071 locale/programs/ld-time.c:1113
#, c-format
msgid "%s: unknown character in field `%s'"
msgstr "%s: carácter desconocido en el campo `%s'"
-#: locale/programs/ld-address.c:586 locale/programs/ld-collate.c:3775
-#: locale/programs/ld-ctype.c:2957 locale/programs/ld-identification.c:450
+#: locale/programs/ld-address.c:586 locale/programs/ld-collate.c:3800
+#: locale/programs/ld-ctype.c:2974 locale/programs/ld-identification.c:450
#: locale/programs/ld-measurement.c:236 locale/programs/ld-messages.c:331
-#: locale/programs/ld-monetary.c:934 locale/programs/ld-name.c:306
+#: locale/programs/ld-monetary.c:940 locale/programs/ld-name.c:306
#: locale/programs/ld-numeric.c:374 locale/programs/ld-paper.c:240
-#: locale/programs/ld-telephone.c:312 locale/programs/ld-time.c:1199
+#: locale/programs/ld-telephone.c:312 locale/programs/ld-time.c:1201
#, c-format
msgid "%s: incomplete `END' line"
msgstr "%s: línea `END' incompleta"
-#: locale/programs/ld-address.c:589 locale/programs/ld-collate.c:2638
-#: locale/programs/ld-collate.c:3777 locale/programs/ld-ctype.c:2219
-#: locale/programs/ld-ctype.c:2960 locale/programs/ld-identification.c:453
+#: locale/programs/ld-address.c:589 locale/programs/ld-collate.c:2653
+#: locale/programs/ld-collate.c:3802 locale/programs/ld-ctype.c:2228
+#: locale/programs/ld-ctype.c:2977 locale/programs/ld-identification.c:453
#: locale/programs/ld-measurement.c:239 locale/programs/ld-messages.c:333
-#: locale/programs/ld-monetary.c:936 locale/programs/ld-name.c:308
+#: locale/programs/ld-monetary.c:942 locale/programs/ld-name.c:308
#: locale/programs/ld-numeric.c:376 locale/programs/ld-paper.c:242
-#: locale/programs/ld-telephone.c:314 locale/programs/ld-time.c:1201
+#: locale/programs/ld-telephone.c:314 locale/programs/ld-time.c:1203
#, c-format
msgid "%1$s: definition does not end with `END %1$s'"
msgstr "%1$s: la definición no termina con `END %1$s'"
-#: locale/programs/ld-address.c:596 locale/programs/ld-collate.c:520
-#: locale/programs/ld-collate.c:572 locale/programs/ld-collate.c:869
-#: locale/programs/ld-collate.c:882 locale/programs/ld-collate.c:2625
-#: locale/programs/ld-collate.c:3784 locale/programs/ld-ctype.c:1947
-#: locale/programs/ld-ctype.c:2206 locale/programs/ld-ctype.c:2782
-#: locale/programs/ld-ctype.c:2968 locale/programs/ld-identification.c:460
+#: locale/programs/ld-address.c:596 locale/programs/ld-collate.c:523
+#: locale/programs/ld-collate.c:575 locale/programs/ld-collate.c:871
+#: locale/programs/ld-collate.c:884 locale/programs/ld-collate.c:2640
+#: locale/programs/ld-collate.c:3809 locale/programs/ld-ctype.c:1956
+#: locale/programs/ld-ctype.c:2215 locale/programs/ld-ctype.c:2799
+#: locale/programs/ld-ctype.c:2985 locale/programs/ld-identification.c:460
#: locale/programs/ld-measurement.c:246 locale/programs/ld-messages.c:340
-#: locale/programs/ld-monetary.c:943 locale/programs/ld-name.c:315
+#: locale/programs/ld-monetary.c:949 locale/programs/ld-name.c:315
#: locale/programs/ld-numeric.c:383 locale/programs/ld-paper.c:249
-#: locale/programs/ld-telephone.c:321 locale/programs/ld-time.c:1208
+#: locale/programs/ld-telephone.c:321 locale/programs/ld-time.c:1210
#, c-format
msgid "%s: syntax error"
msgstr "%s: error de sintaxis"
-#: locale/programs/ld-collate.c:395
+#: locale/programs/ld-collate.c:398
#, c-format
msgid "`%.*s' already defined in charmap"
msgstr "`%.*s' ya está definido en la tabla de caracteres"
-#: locale/programs/ld-collate.c:404
+#: locale/programs/ld-collate.c:407
#, c-format
msgid "`%.*s' already defined in repertoire"
msgstr "`%.*s' ya está definido en el repertorio"
-#: locale/programs/ld-collate.c:411
+#: locale/programs/ld-collate.c:414
#, c-format
msgid "`%.*s' already defined as collating symbol"
msgstr "`%.*s' ya está definido como símbolo de ordenación"
-#: locale/programs/ld-collate.c:418
+#: locale/programs/ld-collate.c:421
#, c-format
msgid "`%.*s' already defined as collating element"
msgstr "`%.*s' ya está definido como elemento de ordenación"
-#: locale/programs/ld-collate.c:449 locale/programs/ld-collate.c:475
+#: locale/programs/ld-collate.c:452 locale/programs/ld-collate.c:478
#, c-format
msgid "%s: `forward' and `backward' are mutually excluding each other"
msgstr "%s: `forward' y `backward' se excluyen mutuamente"
-#: locale/programs/ld-collate.c:459 locale/programs/ld-collate.c:485
-#: locale/programs/ld-collate.c:501
+#: locale/programs/ld-collate.c:462 locale/programs/ld-collate.c:488
+#: locale/programs/ld-collate.c:504
#, c-format
msgid "%s: `%s' mentioned more than once in definition of weight %d"
msgstr "%s: `%s' mencionado más de una vez en la definición del peso %d"
-#: locale/programs/ld-collate.c:557
+#: locale/programs/ld-collate.c:560
#, c-format
msgid "%s: too many rules; first entry only had %d"
msgstr "%s: demasiadas reglas; la primera entrada solamente tenía %d"
-#: locale/programs/ld-collate.c:593
+#: locale/programs/ld-collate.c:596
#, c-format
msgid "%s: not enough sorting rules"
msgstr "%s: no hay suficientes reglas de ordenación"
-#: locale/programs/ld-collate.c:759
+#: locale/programs/ld-collate.c:761
#, c-format
msgid "%s: empty weight string not allowed"
msgstr "%s: no se permite una cadena de peso vacía"
-#: locale/programs/ld-collate.c:854
+#: locale/programs/ld-collate.c:856
#, c-format
msgid "%s: weights must use the same ellipsis symbol as the name"
msgstr "%s: los pesos deben usar el mismo símbolo de elipsis que el nombre"
-#: locale/programs/ld-collate.c:910
+#: locale/programs/ld-collate.c:912
#, c-format
msgid "%s: too many values"
msgstr "%s: demasiados valores"
-#: locale/programs/ld-collate.c:1023 locale/programs/ld-collate.c:1194
+#: locale/programs/ld-collate.c:1031 locale/programs/ld-collate.c:1206
#, c-format
msgid "order for `%.*s' already defined at %s:%Zu"
msgstr "el orden para `%.*s' ya está definido en %s:%Zu"
-#: locale/programs/ld-collate.c:1073
+#: locale/programs/ld-collate.c:1081
#, c-format
msgid "%s: the start and the end symbol of a range must stand for characters"
msgstr "%s: los símbolos de comienzo y de final de un rango deben representar caracteres"
-#: locale/programs/ld-collate.c:1100
+#: locale/programs/ld-collate.c:1108
#, c-format
msgid "%s: byte sequences of first and last character must have the same length"
msgstr ""
"%s: los órdenes de byte de los caracteres primero y último deben tener\n"
"la misma longitud"
-#: locale/programs/ld-collate.c:1142
+#: locale/programs/ld-collate.c:1150
#, c-format
msgid "%s: byte sequence of first character of sequence is not lower than that of the last character"
msgstr ""
"%s: el orden de byte del primer carácter de la sucesión no es menor que\n"
"el del último carácter"
-#: locale/programs/ld-collate.c:1263
+#: locale/programs/ld-collate.c:1275
#, c-format
msgid "%s: symbolic range ellipsis must not directly follow `order_start'"
msgstr "%s: el rango simbólico de la elipsis no debe seguir directamente a `order_start'"
-#: locale/programs/ld-collate.c:1267
+#: locale/programs/ld-collate.c:1279
#, c-format
msgid "%s: symbolic range ellipsis must not be directly followed by `order_end'"
msgstr "%s: el rango simbólico de la elipsis no debe estar directamente seguido por `order_end'"
-#: locale/programs/ld-collate.c:1287 locale/programs/ld-ctype.c:1467
+#: locale/programs/ld-collate.c:1299 locale/programs/ld-ctype.c:1476
#, c-format
msgid "`%s' and `%.*s' are no valid names for symbolic range"
msgstr "`%s' y `%.*s' no son nombres válidos para el rango simbólico"
-#: locale/programs/ld-collate.c:1333 locale/programs/ld-collate.c:3712
+#: locale/programs/ld-collate.c:1348 locale/programs/ld-collate.c:3737
#, c-format
msgid "%s: order for `%.*s' already defined at %s:%Zu"
msgstr "%s: el orden para `%.*s' ya está definido en %s:%Zu"
-#: locale/programs/ld-collate.c:1342
+#: locale/programs/ld-collate.c:1357
#, c-format
msgid "%s: `%s' must be a character"
msgstr "%s: `%s' debe ser un carácter"
-#: locale/programs/ld-collate.c:1535
+#: locale/programs/ld-collate.c:1550
#, c-format
msgid "%s: `position' must be used for a specific level in all sections or none"
msgstr "%s: `position' debe utilizarse para un nivel específico en todas las secciones o en ninguna"
-#: locale/programs/ld-collate.c:1560
+#: locale/programs/ld-collate.c:1575
#, c-format
msgid "symbol `%s' not defined"
msgstr "el símbolo `%s' no está definido"
-#: locale/programs/ld-collate.c:1636 locale/programs/ld-collate.c:1742
+#: locale/programs/ld-collate.c:1651 locale/programs/ld-collate.c:1757
#, c-format
msgid "symbol `%s' has the same encoding as"
msgstr "el símbolo `%s' tiene la misma codificación que"
-#: locale/programs/ld-collate.c:1640 locale/programs/ld-collate.c:1746
+#: locale/programs/ld-collate.c:1655 locale/programs/ld-collate.c:1761
#, c-format
msgid "symbol `%s'"
msgstr "el símbolo `%s'"
-#: locale/programs/ld-collate.c:1788
+#: locale/programs/ld-collate.c:1803
msgid "no definition of `UNDEFINED'"
msgstr "no hay definición para `UNDEFINED'"
-#: locale/programs/ld-collate.c:1817
+#: locale/programs/ld-collate.c:1832
msgid "too many errors; giving up"
msgstr "demasiados errores; abandono"
-#: locale/programs/ld-collate.c:2720
+#: locale/programs/ld-collate.c:2735
#, c-format
msgid "%s: duplicate definition of `%s'"
msgstr "%s: definición duplicada de `%s'"
-#: locale/programs/ld-collate.c:2756
+#: locale/programs/ld-collate.c:2771
#, c-format
msgid "%s: duplicate declaration of section `%s'"
msgstr "%s: declaración duplicada de la sección `%s'"
-#: locale/programs/ld-collate.c:2895
+#: locale/programs/ld-collate.c:2910
#, c-format
msgid "%s: unknown character in collating symbol name"
msgstr "%s: carácter desconocido en el nombre de un símbolo de ordenación"
-#: locale/programs/ld-collate.c:3027
+#: locale/programs/ld-collate.c:3042
#, c-format
msgid "%s: unknown character in equivalent definition name"
msgstr "%s: carácter desconocido en el nombre de definición equivalente"
-#: locale/programs/ld-collate.c:3040
+#: locale/programs/ld-collate.c:3055
#, c-format
msgid "%s: unknown character in equivalent definition value"
msgstr "%s: carácter desconocido en el valor de definición equivalente"
-#: locale/programs/ld-collate.c:3050
+#: locale/programs/ld-collate.c:3065
#, c-format
msgid "%s: unknown symbol `%s' in equivalent definition"
msgstr "%s: símbolo desconocido `%s' en la definición equivalente"
-#: locale/programs/ld-collate.c:3059
+#: locale/programs/ld-collate.c:3074
msgid "error while adding equivalent collating symbol"
msgstr "error al añadir símbolo de ordenación equivalente"
-#: locale/programs/ld-collate.c:3089
+#: locale/programs/ld-collate.c:3104
#, c-format
msgid "duplicate definition of script `%s'"
msgstr "definición duplicada de `script' `%s'"
-#: locale/programs/ld-collate.c:3137
+#: locale/programs/ld-collate.c:3152
#, c-format
msgid "%s: unknown section name `%s'"
msgstr "%s: nombre de sección desconocido `%s'"
-#: locale/programs/ld-collate.c:3165
+#: locale/programs/ld-collate.c:3180
#, c-format
msgid "%s: multiple order definitions for section `%s'"
msgstr "%s: hay varias definiciones de orden para la sección `%s'"
-#: locale/programs/ld-collate.c:3190
+#: locale/programs/ld-collate.c:3205
#, c-format
msgid "%s: invalid number of sorting rules"
msgstr "%s: número inválido de reglas de ordenación"
-#: locale/programs/ld-collate.c:3217
+#: locale/programs/ld-collate.c:3232
#, c-format
msgid "%s: multiple order definitions for unnamed section"
msgstr "%s: varias definiciones de orden para la sección sin nombre"
-#: locale/programs/ld-collate.c:3271 locale/programs/ld-collate.c:3394
-#: locale/programs/ld-collate.c:3753
+#: locale/programs/ld-collate.c:3286 locale/programs/ld-collate.c:3414
+#: locale/programs/ld-collate.c:3778
#, c-format
msgid "%s: missing `order_end' keyword"
msgstr "%s: falta la palabra clave `order_end'"
-#: locale/programs/ld-collate.c:3329
+#: locale/programs/ld-collate.c:3347
#, c-format
msgid "%s: order for collating symbol %.*s not yet defined"
msgstr "%s: el orden para el símbolo de ordenación %.*s todavía no está definido"
# FIXME: ¿Por qué este y el siguiente no son iguales?
-#: locale/programs/ld-collate.c:3345
+#: locale/programs/ld-collate.c:3365
#, c-format
msgid "%s: order for collating element %.*s not yet defined"
msgstr "%s: el orden para el elemento de ordenación %.*s todavía no está definido"
-#: locale/programs/ld-collate.c:3356
+#: locale/programs/ld-collate.c:3376
#, c-format
msgid "%s: cannot reorder after %.*s: symbol not known"
msgstr "%s: no se puede reordenar después de %.*s: símbolo desconocido"
-#: locale/programs/ld-collate.c:3408 locale/programs/ld-collate.c:3765
+#: locale/programs/ld-collate.c:3428 locale/programs/ld-collate.c:3790
#, c-format
msgid "%s: missing `reorder-end' keyword"
msgstr "%s: falta la palabra clave `reorder-end'"
-#: locale/programs/ld-collate.c:3442 locale/programs/ld-collate.c:3637
+#: locale/programs/ld-collate.c:3462 locale/programs/ld-collate.c:3662
#, c-format
msgid "%s: section `%.*s' not known"
msgstr "%s: la sección `%.*s' es desconocida"
-#: locale/programs/ld-collate.c:3507
+#: locale/programs/ld-collate.c:3527
#, c-format
msgid "%s: bad symbol <%.*s>"
msgstr "%s: símbolo erróneo <%.*s>"
-#: locale/programs/ld-collate.c:3700
+#: locale/programs/ld-collate.c:3725
#, c-format
msgid "%s: cannot have `%s' as end of ellipsis range"
msgstr "%s: no puede tener `%s' como final de un rango de elipsis"
-#: locale/programs/ld-collate.c:3749
+#: locale/programs/ld-collate.c:3774
#, c-format
msgid "%s: empty category description not allowed"
msgstr "%s: no se permite una descripción de categoría vacía"
-#: locale/programs/ld-collate.c:3768
+#: locale/programs/ld-collate.c:3793
#, c-format
msgid "%s: missing `reorder-sections-end' keyword"
msgstr "%s: falta la palabra clave `reorder-sections-end'"
-#: locale/programs/ld-ctype.c:435
+#: locale/programs/ld-ctype.c:440
msgid "No character set name specified in charmap"
msgstr ""
"No se ha especificado ningún nombre de conjunto de caracteres en la tabla\n"
"de caracteres"
-#: locale/programs/ld-ctype.c:464
+#: locale/programs/ld-ctype.c:469
#, c-format
msgid "character L'\\u%0*x' in class `%s' must be in class `%s'"
msgstr "el carácter L'\\u%0*x' en la clase `%s' debe estar en la clase `%s'"
-#: locale/programs/ld-ctype.c:479
+#: locale/programs/ld-ctype.c:484
#, c-format
msgid "character L'\\u%0*x' in class `%s' must not be in class `%s'"
msgstr "el carácter L'\\u%0*x' en la clase `%s' no debe estar en la clase `%s'"
-#: locale/programs/ld-ctype.c:493 locale/programs/ld-ctype.c:551
+#: locale/programs/ld-ctype.c:498 locale/programs/ld-ctype.c:556
#, c-format
msgid "internal error in %s, line %u"
msgstr "error interno en %s, línea %u"
-#: locale/programs/ld-ctype.c:522
+#: locale/programs/ld-ctype.c:527
#, c-format
msgid "character '%s' in class `%s' must be in class `%s'"
msgstr "el carácter '%s' en la clase `%s' debe estar en la clase `%s'"
-#: locale/programs/ld-ctype.c:538
+#: locale/programs/ld-ctype.c:543
#, c-format
msgid "character '%s' in class `%s' must not be in class `%s'"
msgstr "el carácter '%s' en la clase `%s' no debe estar en la clase `%s'"
-#: locale/programs/ld-ctype.c:568 locale/programs/ld-ctype.c:606
+#: locale/programs/ld-ctype.c:573 locale/programs/ld-ctype.c:611
#, c-format
msgid "<SP> character not in class `%s'"
msgstr "El carácter <SP> no está en la clase `%s'"
-#: locale/programs/ld-ctype.c:580 locale/programs/ld-ctype.c:617
+#: locale/programs/ld-ctype.c:585 locale/programs/ld-ctype.c:622
#, c-format
msgid "<SP> character must not be in class `%s'"
msgstr "El carácter <SP> no debe estar en la clase `%s'"
-#: locale/programs/ld-ctype.c:595
+#: locale/programs/ld-ctype.c:600
msgid "character <SP> not defined in character map"
msgstr "el carácter <SP> no está definido en la tabla de caracteres"
-#: locale/programs/ld-ctype.c:709
+#: locale/programs/ld-ctype.c:714
msgid "`digit' category has not entries in groups of ten"
msgstr "la categoría `digit' no tiene entradas en grupos de diez"
# FIXME: El original no se entiende. ¿Es gramaticalmente correcto? sv
-#: locale/programs/ld-ctype.c:758
+#: locale/programs/ld-ctype.c:763
msgid "no input digits defined and none of the standard names in the charmap"
msgstr ""
"no hay ningún dígito de entrada definido y ninguno de los nombres estándar\n"
"en el conjunto de caracteres"
-#: locale/programs/ld-ctype.c:823
+#: locale/programs/ld-ctype.c:828
msgid "not all characters used in `outdigit' are available in the charmap"
msgstr ""
"no todos los caracteres usados en `outdigit' están disponibles en la tabla\n"
"de caracteres"
-#: locale/programs/ld-ctype.c:840
+#: locale/programs/ld-ctype.c:845
msgid "not all characters used in `outdigit' are available in the repertoire"
msgstr "no todos los caracteres usados en `outdigit' están disponibles en el repertorio"
-#: locale/programs/ld-ctype.c:1235
+#: locale/programs/ld-ctype.c:1244
#, c-format
msgid "character class `%s' already defined"
msgstr "la clase de carácter `%s' ya fue definida"
-#: locale/programs/ld-ctype.c:1241
+#: locale/programs/ld-ctype.c:1250
#, c-format
msgid "implementation limit: no more than %Zd character classes allowed"
msgstr "límite de la implementación: no se permiten más de %Zd clases de caracteres"
-#: locale/programs/ld-ctype.c:1267
+#: locale/programs/ld-ctype.c:1276
#, c-format
msgid "character map `%s' already defined"
msgstr "la tabla de caracteres `%s' ya está definida"
-#: locale/programs/ld-ctype.c:1273
+#: locale/programs/ld-ctype.c:1282
#, c-format
msgid "implementation limit: no more than %d character maps allowed"
msgstr "límite de la implementación: no se permiten más de %d tablas de caracteres"
-#: locale/programs/ld-ctype.c:1538 locale/programs/ld-ctype.c:1663
-#: locale/programs/ld-ctype.c:1769 locale/programs/ld-ctype.c:2455
-#: locale/programs/ld-ctype.c:3443
+#: locale/programs/ld-ctype.c:1547 locale/programs/ld-ctype.c:1672
+#: locale/programs/ld-ctype.c:1778 locale/programs/ld-ctype.c:2464
+#: locale/programs/ld-ctype.c:3460
#, c-format
msgid "%s: field `%s' does not contain exactly ten entries"
msgstr "%s: el campo `%s' no contiene exactamente diez entradas"
-#: locale/programs/ld-ctype.c:1566 locale/programs/ld-ctype.c:2137
+#: locale/programs/ld-ctype.c:1575 locale/programs/ld-ctype.c:2146
#, c-format
msgid "to-value <U%0*X> of range is smaller than from-value <U%0*X>"
msgstr "el valor `to' del rango <U%0*X> es más pequeño que el valor `from' <U%0*X>"
-#: locale/programs/ld-ctype.c:1693
+#: locale/programs/ld-ctype.c:1702
msgid "start and end character sequence of range must have the same length"
msgstr "los caracteres de comienzo y final del rango deben tener la misma longitud"
-#: locale/programs/ld-ctype.c:1700
+#: locale/programs/ld-ctype.c:1709
msgid "to-value character sequence is smaller than from-value sequence"
msgstr "el valor `to' de la sucesión de caracteres es más pequeño que el valor `from'"
-#: locale/programs/ld-ctype.c:2057 locale/programs/ld-ctype.c:2108
+#: locale/programs/ld-ctype.c:2066 locale/programs/ld-ctype.c:2117
msgid "premature end of `translit_ignore' definition"
msgstr "Fin no esperado de la definición `translit_ignore'"
-#: locale/programs/ld-ctype.c:2063 locale/programs/ld-ctype.c:2114
-#: locale/programs/ld-ctype.c:2156
+#: locale/programs/ld-ctype.c:2072 locale/programs/ld-ctype.c:2123
+#: locale/programs/ld-ctype.c:2165
msgid "syntax error"
msgstr "error de sintaxis"
-#: locale/programs/ld-ctype.c:2287
+#: locale/programs/ld-ctype.c:2296
#, c-format
msgid "%s: syntax error in definition of new character class"
msgstr "%s: error de sintaxis en la definición de una nueva clase de caracteres"
-#: locale/programs/ld-ctype.c:2302
+#: locale/programs/ld-ctype.c:2311
#, c-format
msgid "%s: syntax error in definition of new character map"
msgstr "%s: error de sintaxis en la definición de una nueva tabla de caracteres"
-#: locale/programs/ld-ctype.c:2477
+#: locale/programs/ld-ctype.c:2486
msgid "ellipsis range must be marked by two operands of same type"
msgstr "el rango de la elipsis debe estar marcado mediante dos operandos del mismo tipo"
-#: locale/programs/ld-ctype.c:2486
+#: locale/programs/ld-ctype.c:2495
msgid "with symbolic name range values the absolute ellipsis `...' must not be used"
msgstr "con valores de rango nombre simbólico la elipsis absoluta `...' no debe usarse"
-#: locale/programs/ld-ctype.c:2501
+#: locale/programs/ld-ctype.c:2510
msgid "with UCS range values one must use the hexadecimal symbolic ellipsis `..'"
msgstr "con valores de rango UCS se debe utilizar la elipsis simbólica hexadecimal `..'"
-#: locale/programs/ld-ctype.c:2515
+#: locale/programs/ld-ctype.c:2524
msgid "with character code range values one must use the absolute ellipsis `...'"
msgstr "con valores de rango código de caracteres se debe utilizar la elipsis absoluta `...'"
-#: locale/programs/ld-ctype.c:2666
+#: locale/programs/ld-ctype.c:2675
#, c-format
msgid "duplicated definition for mapping `%s'"
msgstr "definición duplicada para la asignación `%s'"
-#: locale/programs/ld-ctype.c:2744 locale/programs/ld-ctype.c:2888
+#: locale/programs/ld-ctype.c:2761 locale/programs/ld-ctype.c:2905
#, c-format
msgid "%s: `translit_start' section does not end with `translit_end'"
msgstr "%s: la sección `translit_start' no termina con `translit_end'"
-#: locale/programs/ld-ctype.c:2839
+#: locale/programs/ld-ctype.c:2856
#, c-format
msgid "%s: duplicate `default_missing' definition"
msgstr "%s: definición `default_missing' duplicada"
-#: locale/programs/ld-ctype.c:2844
+#: locale/programs/ld-ctype.c:2861
msgid "previous definition was here"
-msgstr "la definición anterior estaba aquí"
+msgstr "aquí estaba la definición anterior"
-#: locale/programs/ld-ctype.c:2866
+#: locale/programs/ld-ctype.c:2883
#, c-format
msgid "%s: no representable `default_missing' definition found"
msgstr "%s: no se ha encontrado ninguna definición de `default_missing' representable"
-#: locale/programs/ld-ctype.c:3019
+#: locale/programs/ld-ctype.c:3036
#, c-format
msgid "%s: character `%s' not defined in charmap while needed as default value"
msgstr ""
"%s: el carácter `%s' no está definido en el conjunto de caracteres\n"
"cuando se necesitó como valor predeterminado"
-#: locale/programs/ld-ctype.c:3024 locale/programs/ld-ctype.c:3108
-#: locale/programs/ld-ctype.c:3128 locale/programs/ld-ctype.c:3149
-#: locale/programs/ld-ctype.c:3170 locale/programs/ld-ctype.c:3191
-#: locale/programs/ld-ctype.c:3212 locale/programs/ld-ctype.c:3252
-#: locale/programs/ld-ctype.c:3273 locale/programs/ld-ctype.c:3340
+#: locale/programs/ld-ctype.c:3041 locale/programs/ld-ctype.c:3125
+#: locale/programs/ld-ctype.c:3145 locale/programs/ld-ctype.c:3166
+#: locale/programs/ld-ctype.c:3187 locale/programs/ld-ctype.c:3208
+#: locale/programs/ld-ctype.c:3229 locale/programs/ld-ctype.c:3269
+#: locale/programs/ld-ctype.c:3290 locale/programs/ld-ctype.c:3357
#, c-format
msgid "%s: character `%s' in charmap not representable with one byte"
msgstr "%s: el carácter `%s' en la tabla de caracteres no es representable con un byte"
-#: locale/programs/ld-ctype.c:3103 locale/programs/ld-ctype.c:3123
-#: locale/programs/ld-ctype.c:3165 locale/programs/ld-ctype.c:3186
-#: locale/programs/ld-ctype.c:3207 locale/programs/ld-ctype.c:3247
-#: locale/programs/ld-ctype.c:3268 locale/programs/ld-ctype.c:3335
-#: locale/programs/ld-ctype.c:3377 locale/programs/ld-ctype.c:3402
+#: locale/programs/ld-ctype.c:3120 locale/programs/ld-ctype.c:3140
+#: locale/programs/ld-ctype.c:3182 locale/programs/ld-ctype.c:3203
+#: locale/programs/ld-ctype.c:3224 locale/programs/ld-ctype.c:3264
+#: locale/programs/ld-ctype.c:3285 locale/programs/ld-ctype.c:3352
+#: locale/programs/ld-ctype.c:3394 locale/programs/ld-ctype.c:3419
#, c-format
msgid "%s: character `%s' not defined while needed as default value"
msgstr "%s: el carácter `%s' no está definido cuando se necesitó como valor predeterminado"
-#: locale/programs/ld-ctype.c:3144
+#: locale/programs/ld-ctype.c:3161
#, c-format
msgid "character `%s' not defined while needed as default value"
msgstr "el carácter `%s' no está definido cuando se necesitó como valor predeterminado"
-#: locale/programs/ld-ctype.c:3384 locale/programs/ld-ctype.c:3409
+#: locale/programs/ld-ctype.c:3401 locale/programs/ld-ctype.c:3426
#, c-format
msgid "%s: character `%s' needed as default value not representable with one byte"
msgstr ""
@@ -1152,29 +1153,29 @@ msgstr ""
"con un byte"
# FIXME: Lo mismo de antes.
-#: locale/programs/ld-ctype.c:3464
+#: locale/programs/ld-ctype.c:3481
msgid "no output digits defined and none of the standard names in the charmap"
msgstr ""
"no hay ningún dígito de salida definido y ninguno de los nombres estándar\n"
"en el conjunto de caracteres"
# Pregunta: ¿De verdad existe transliteración en español? sv
-#: locale/programs/ld-ctype.c:3755
+#: locale/programs/ld-ctype.c:3772
#, c-format
msgid "%s: transliteration data from locale `%s' not available"
msgstr "%s: los datos de transliteración del local `%s' no están disponibles"
-#: locale/programs/ld-ctype.c:3851
+#: locale/programs/ld-ctype.c:3868
#, c-format
msgid "%s: table for class \"%s\": %lu bytes\n"
msgstr "%s: tabla para la clase \"%s\": %lu bytes\n"
-#: locale/programs/ld-ctype.c:3920
+#: locale/programs/ld-ctype.c:3937
#, c-format
msgid "%s: table for map \"%s\": %lu bytes\n"
msgstr "%s: tabla para la asignación \"%s\": %lu bytes\n"
-#: locale/programs/ld-ctype.c:4053
+#: locale/programs/ld-ctype.c:4070
#, c-format
msgid "%s: table for width: %lu bytes\n"
msgstr "%s: tabla para el ancho: %lu bytes\n"
@@ -1218,39 +1219,39 @@ msgstr "%s: la expresión regular para el campo `%s' no es correcta: %s"
msgid "%s: value of field `int_curr_symbol' has wrong length"
msgstr "%s: el valor del campo `int_curr_symbol' tiene una longitud errónea"
-#: locale/programs/ld-monetary.c:232
+#: locale/programs/ld-monetary.c:237
#, c-format
msgid "%s: value of field `int_curr_symbol' does not correspond to a valid name in ISO 4217"
msgstr ""
"%s: el valor del campo `int_curr_symbol' no se corresponde con un nombre\n"
"válido en ISO 4217"
-#: locale/programs/ld-monetary.c:250 locale/programs/ld-numeric.c:119
+#: locale/programs/ld-monetary.c:256 locale/programs/ld-numeric.c:119
#, c-format
msgid "%s: value for field `%s' must not be the empty string"
msgstr "%s: el valor para el campo `%s' no debe estar vacío"
-#: locale/programs/ld-monetary.c:278 locale/programs/ld-monetary.c:308
+#: locale/programs/ld-monetary.c:284 locale/programs/ld-monetary.c:314
#, c-format
msgid "%s: value for field `%s' must be in range %d...%d"
msgstr "%s: el valor para el campo `%s' debe estar en el rango %d...%d"
-#: locale/programs/ld-monetary.c:740 locale/programs/ld-numeric.c:275
+#: locale/programs/ld-monetary.c:746 locale/programs/ld-numeric.c:275
#, c-format
msgid "%s: value for field `%s' must be a single character"
msgstr "%s: el valor para el campo `%s' debe ser un único carácter"
-#: locale/programs/ld-monetary.c:837 locale/programs/ld-numeric.c:319
+#: locale/programs/ld-monetary.c:843 locale/programs/ld-numeric.c:319
#, c-format
msgid "%s: `-1' must be last entry in `%s' field"
msgstr "%s: `-1' debe ser la última entrada del campo `%s'"
-#: locale/programs/ld-monetary.c:859 locale/programs/ld-numeric.c:340
+#: locale/programs/ld-monetary.c:865 locale/programs/ld-numeric.c:340
#, c-format
msgid "%s: values for field `%s' must be smaller than 127"
msgstr "%s: los valores para el campo `%s' deben ser menores que 127"
-#: locale/programs/ld-monetary.c:902
+#: locale/programs/ld-monetary.c:908
msgid "conversion rate value cannot be zero"
msgstr "el valor de la tasa de conversión no puede ser cero"
@@ -1342,62 +1343,62 @@ msgstr "%s: los valores para el campo `%s' no deben ser mayores que %d"
msgid "%s: values for field `%s' must not be larger than %d"
msgstr "%s: los valores para el campo `%s' no deben ser mayores que %d"
-#: locale/programs/ld-time.c:984
+#: locale/programs/ld-time.c:986
#, c-format
msgid "%s: too few values for field `%s'"
msgstr "%s: insuficiente número de valores para el campo `%s'"
-#: locale/programs/ld-time.c:1029
+#: locale/programs/ld-time.c:1031
msgid "extra trailing semicolon"
msgstr "sobra un punto y coma al final"
-#: locale/programs/ld-time.c:1032
+#: locale/programs/ld-time.c:1034
#, c-format
msgid "%s: too many values for field `%s'"
msgstr "%s: demasiados valores para el campo `%s'"
-#: locale/programs/linereader.c:275
+#: locale/programs/linereader.c:130
+msgid "trailing garbage at end of line"
+msgstr "hay inconsistencias al final de la línea"
+
+#: locale/programs/linereader.c:304
msgid "garbage at end of number"
msgstr "inconsistencias al final del número"
-#: locale/programs/linereader.c:387
+#: locale/programs/linereader.c:416
msgid "garbage at end of character code specification"
msgstr "inconsistencia al final de la especificación del código de caracteres"
-#: locale/programs/linereader.c:473
+#: locale/programs/linereader.c:502
msgid "unterminated symbolic name"
msgstr "nombre simbólico sin terminar"
-#: locale/programs/linereader.c:537 catgets/gencat.c:1195
+#: locale/programs/linereader.c:566 catgets/gencat.c:1195
msgid "invalid escape sequence"
msgstr "secuencia de escape inválida"
-#: locale/programs/linereader.c:600
+#: locale/programs/linereader.c:629
msgid "illegal escape sequence at end of string"
msgstr "secuencia de escape ilegal al final de la cadena de caracteres"
-#: locale/programs/linereader.c:604 locale/programs/linereader.c:832
+#: locale/programs/linereader.c:633 locale/programs/linereader.c:861
msgid "unterminated string"
msgstr "cadena de caracteres sin terminar"
-#: locale/programs/linereader.c:646
+#: locale/programs/linereader.c:675
msgid "non-symbolic character value should not be used"
msgstr "los valores de caracteres no simbólicos no deben utilizarse"
-#: locale/programs/linereader.c:793
+#: locale/programs/linereader.c:822
#, c-format
msgid "symbol `%.*s' not in charmap"
msgstr "el símbolo `%.*s' no está en la tabla de caracteres"
-#: locale/programs/linereader.c:814
+#: locale/programs/linereader.c:843
#, c-format
msgid "symbol `%.*s' not in repertoire map"
msgstr "el símbolo `%.*s' no está en el repertorio"
-#: locale/programs/linereader.h:162
-msgid "trailing garbage at end of line"
-msgstr "hay inconsistencias al final de la línea"
-
#: locale/programs/locale.c:75
msgid "System information:"
msgstr "Información del sistema:"
@@ -1428,7 +1429,7 @@ msgstr "Muestra más información"
#: locale/programs/locale.c:88
msgid "Get locale-specific information."
-msgstr "Obtiene la información específica del locale."
+msgstr "Obtiene la información específica del local."
#: locale/programs/locale.c:91
msgid ""
@@ -1438,7 +1439,23 @@ msgstr ""
"NOMBRE\n"
"[-a|-m]"
-#: locale/programs/locale.c:512
+#: locale/programs/locale.c:195
+msgid "Cannot set LC_CTYPE to default locale"
+msgstr "No se puede establecer LC_CTYPE al local predeterminado"
+
+#: locale/programs/locale.c:197
+msgid "Cannot set LC_MESSAGES to default locale"
+msgstr "No se puede establecer LC_MESSAGES al local predeterminado"
+
+#: locale/programs/locale.c:210
+msgid "Cannot set LC_COLLATE to default locale"
+msgstr "No se puede establecer LC_COLLATE al local predeterminado"
+
+#: locale/programs/locale.c:226
+msgid "Cannot set LC_ALL to default locale"
+msgstr "No se puede establecer LC_ALL al local predeterminado"
+
+#: locale/programs/locale.c:517
msgid "while preparing output"
msgstr "al preparar la salida"
@@ -1660,7 +1677,7 @@ msgstr "falló la llamada a `stat' sobre \"%s\": %s: descartado"
#: locale/programs/locarchive.c:1138
#, c-format
msgid "\"%s\" is no directory; ignored"
-msgstr "\"%s\" no es un directorio; descarrtado"
+msgstr "\"%s\" no es un directorio; descartado"
#: locale/programs/locarchive.c:1145
#, c-format
@@ -1706,17 +1723,17 @@ msgstr "error al escribir los datos para la categoría `%s'"
msgid "cannot create output file `%s' for category `%s'"
msgstr "no se puede crear el fichero de salida `%s' para la categoría `%s'"
-#: locale/programs/locfile.h:59
+#: locale/programs/locfile.c:781
msgid "expect string argument for `copy'"
msgstr "se espera un argumento de cadena de caracteres para `copy'"
-#: locale/programs/locfile.h:63
+#: locale/programs/locfile.c:785
msgid "locale name should consist only of portable characters"
msgstr "el nombre del local debe estar formado por caracteres portables únicamente"
-#: locale/programs/locfile.h:82
+#: locale/programs/locfile.c:804
msgid "no other keyword shall be specified when `copy' is used"
-msgstr "ninguna otra palabra clave debe ser especificada al usar `copy'"
+msgstr "cuando se utiliza `copy' no debe especificarse ninguna otra palabra clave"
#: locale/programs/repertoire.c:230 locale/programs/repertoire.c:271
#: locale/programs/repertoire.c:296
@@ -1750,7 +1767,7 @@ msgid "upper limit in range is not smaller then lower limit"
msgstr "el límite superior del rango no es menor que el límite inferior"
#: locale/programs/xmalloc.c:70 malloc/obstack.c:505 malloc/obstack.c:508
-#: posix/getconf.c:1002
+#: posix/getconf.c:1007
msgid "memory exhausted"
msgstr "memoria agotada"
@@ -1793,7 +1810,7 @@ msgstr "Primera cadena para hacer pruebas."
msgid "Another string for testing."
msgstr "Otra cadena para hacer pruebas."
-#: catgets/gencat.c:111 catgets/gencat.c:115 nscd/nscd.c:84
+#: catgets/gencat.c:111 catgets/gencat.c:115 nscd/nscd.c:88
msgid "NAME"
msgstr "NOMBRE"
@@ -1902,6 +1919,7 @@ msgid "cannot determine escape character"
msgstr "no se puede determinar el carácter de escape"
#: stdlib/../sysdeps/unix/sysv/linux/ia64/makecontext.c:63
+#, c-format
msgid "makecontext: does not know how to handle more than 8 arguments\n"
msgstr "makecontext: no sabe cómo manejar más de 8 argumentos\n"
@@ -1909,8 +1927,8 @@ msgstr "makecontext: no sabe cómo manejar más de 8 argumentos\n"
# me gustaría que hubiera otra palabra mejor. SV
# Siempre me han gustado F&C ;-)
# A mí también :-) sv
-#: stdio-common/../sysdeps/gnu/errlist.c:12 posix/regcomp.c:133
-#: nis/nis_error.c:29 nis/ypclnt.c:787 nis/ypclnt.c:861
+#: stdio-common/../sysdeps/gnu/errlist.c:12 posix/regcomp.c:147
+#: nis/nis_error.c:29 nis/ypclnt.c:778 nis/ypclnt.c:852
msgid "Success"
msgstr "Conseguido"
@@ -1939,7 +1957,7 @@ msgstr "No existe el fichero o el directorio"
#: stdio-common/../sysdeps/gnu/errlist.c:37
#: stdio-common/../sysdeps/unix/sysv/sysv4/solaris2/sparc/errlist.c:33
msgid "No such process"
-msgstr "No existe tal proceso"
+msgstr "No existe el proceso"
#. TRANS Interrupted function call; an asynchronous signal occurred and prevented
#. TRANS completion of the call. When this happens, you should try the call
@@ -1966,7 +1984,7 @@ msgstr "Error de entrada/salida"
#: stdio-common/../sysdeps/gnu/errlist.c:74
#: stdio-common/../sysdeps/unix/sysv/sysv4/solaris2/sparc/errlist.c:36
msgid "No such device or address"
-msgstr "No existe tal dispositivo o dirección"
+msgstr "No existe el dispositivo o la dirección"
#. TRANS Argument list too long; used when the arguments passed to a new program
#. TRANS being executed with one of the @code{exec} functions (@pxref{Executing a
@@ -2026,7 +2044,7 @@ msgstr "No se pudo asignar memoria"
#. TRANS Permission denied; the file permissions do not allow the attempted operation.
#: stdio-common/../sysdeps/gnu/errlist.c:149
#: stdio-common/../sysdeps/unix/sysv/sysv4/solaris2/sparc/errlist.c:43
-#: nis/nis_error.c:39 nis/ypclnt.c:817
+#: nis/nis_error.c:39 nis/ypclnt.c:808
msgid "Permission denied"
msgstr "Permiso denegado"
@@ -2057,7 +2075,7 @@ msgstr "Dispositivo o recurso ocupado"
#: stdio-common/../sysdeps/gnu/errlist.c:191
#: stdio-common/../sysdeps/unix/sysv/sysv4/solaris2/sparc/errlist.c:47
msgid "File exists"
-msgstr "El fichero existe"
+msgstr "El fichero ya existe"
# ??? ver esto.
#. TRANS An attempt to make an improper link across file systems was detected.
@@ -2072,7 +2090,7 @@ msgstr "Enlace cruzado entre dispositivos no permitido"
#: stdio-common/../sysdeps/gnu/errlist.c:212
#: stdio-common/../sysdeps/unix/sysv/sysv4/solaris2/sparc/errlist.c:49
msgid "No such device"
-msgstr "No existe tal dispositivo"
+msgstr "No existe el dispositivo"
#. TRANS A file that isn't a directory was specified when a directory is required.
#: stdio-common/../sysdeps/gnu/errlist.c:221
@@ -2171,9 +2189,6 @@ msgstr "Sistema de ficheros de sólo lectura"
msgid "Too many links"
msgstr "Demasiados enlaces"
-# fuentes
-# Sugerencia: Argumento numérico. sv
-#
#. TRANS Domain error; used by mathematical functions when an argument value does
#. TRANS not fall into the domain over which the function is defined.
#: stdio-common/../sysdeps/gnu/errlist.c:361
@@ -2887,10 +2902,11 @@ msgid "Invalid request code"
msgstr "Código de petición incorrecto"
# ¿Ranura no válida?, creo que no hay traducción para slot :) em+
+# Antes: `slot' incorrecto
#: stdio-common/../sysdeps/gnu/errlist.c:1205
#: stdio-common/../sysdeps/unix/sysv/sysv4/solaris2/sparc/errlist.c:85
msgid "Invalid slot"
-msgstr "`slot' incorrecto"
+msgstr "Ranura inválida"
# FUZZY em+
#: stdio-common/../sysdeps/gnu/errlist.c:1213
@@ -3316,6 +3332,14 @@ msgstr "No se puede enviar después de la destrucción del `socket'"
msgid "%s%sUnknown signal %d\n"
msgstr "%s%sSeñal desconocida %d\n"
+#: dlfcn/dlinfo.c:51
+msgid "RTLD_SELF used in code not dynamically loaded"
+msgstr "Se ha usado RTLD_SELF en una parte del código que no se cargó dinámicamente"
+
+#: dlfcn/dlinfo.c:61
+msgid "unsupported dlinfo request"
+msgstr "Petición dlinfo no admitida"
+
#: malloc/mcheck.c:346
msgid "memory is consistent, library is buggy\n"
msgstr "la memoria es consistente, la biblioteca tiene un bicho\n"
@@ -3386,114 +3410,114 @@ msgstr "Señal de tiempo real %d"
msgid "Unknown signal %d"
msgstr "Señal desconocida %d"
-#: timezone/zdump.c:175
+#: timezone/zdump.c:176
#, c-format
-msgid "%s: usage is %s [ -v ] [ -c cutoff ] zonename ...\n"
-msgstr "%s: el modo de empleo es %s [ -v ] [ -c cutoff ] nombrezona ...\n"
+msgid "%s: usage is %s [ --version ] [ -v ] [ -c cutoff ] zonename ...\n"
+msgstr "%s: el modo de empleo es %s [ --version ] [ -v ] [ -c cutoff ] nombrezona ...\n"
-#: timezone/zdump.c:268
+#: timezone/zdump.c:269
msgid "Error writing standard output"
msgstr "Error al escribir en la salida estándar"
-#: timezone/zic.c:365
+#: timezone/zic.c:361
#, c-format
msgid "%s: Memory exhausted: %s\n"
msgstr "%s: Memoria agotada: %s\n"
-#: timezone/zic.c:390 misc/error.c:127 misc/error.c:155
+#: timezone/zic.c:386 misc/error.c:129 misc/error.c:157
msgid "Unknown system error"
msgstr "Error del sistema desconocido"
-#: timezone/zic.c:424
+#: timezone/zic.c:420
#, c-format
msgid "\"%s\", line %d: %s"
msgstr "\"%s\", línea %d: %s"
-#: timezone/zic.c:427
+#: timezone/zic.c:423
#, c-format
msgid " (rule from \"%s\", line %d)"
msgstr " (regla desde \"%s\", línea %d)"
-#: timezone/zic.c:439
+#: timezone/zic.c:435
msgid "warning: "
msgstr "atención: "
# FIXME: Decir al autor que no use tabs. sv
-#: timezone/zic.c:449
+#: timezone/zic.c:445
#, c-format
msgid ""
-"%s: usage is %s [ -s ] [ -v ] [ -l localtime ] [ -p posixrules ] \\\n"
+"%s: usage is %s [ --version ] [ -s ] [ -v ] [ -l localtime ] [ -p posixrules ] \\\n"
"\t[ -d directory ] [ -L leapseconds ] [ -y yearistype ] [ filename ... ]\n"
msgstr ""
-"%s: el modo de empleo es %s [ -s ] [ -v ] [ -l hora_local ] [ -p reglasposix ] \\\n"
+"%s: el modo de empleo es %s [ --version ] [ -s ] [ -v ] [ -l hora_local ] [ -p reglasposix ] \\\n"
" [ -d directorio ] [ -L segundos_intercalares ] [ -y tipoaño ] [ fichero ... ]\n"
-#: timezone/zic.c:491
+#: timezone/zic.c:492
#, c-format
msgid "%s: More than one -d option specified\n"
msgstr "%s: La opción -d se ha especificado más de una vez\n"
-#: timezone/zic.c:501
+#: timezone/zic.c:502
#, c-format
msgid "%s: More than one -l option specified\n"
msgstr "%s: La opción -l se ha especificado más de una vez\n"
-#: timezone/zic.c:511
+#: timezone/zic.c:512
#, c-format
msgid "%s: More than one -p option specified\n"
msgstr "%s: La opción -p se ha especificado más de una vez\n"
-#: timezone/zic.c:521
+#: timezone/zic.c:522
#, c-format
msgid "%s: More than one -y option specified\n"
msgstr "%s: La opción -y se ha especificado más de una vez\n"
-#: timezone/zic.c:531
+#: timezone/zic.c:532
#, c-format
msgid "%s: More than one -L option specified\n"
msgstr "%s: La opción -L se ha especificado más de una vez\n"
-#: timezone/zic.c:638
+#: timezone/zic.c:639
#, c-format
msgid "%s: Can't unlink %s: %s\n"
msgstr "%s: No se puede borrar %s: %s\n"
-#: timezone/zic.c:645
+#: timezone/zic.c:646
msgid "hard link failed, symbolic link used"
msgstr "el enlace duro falló, se usará un enlace simbólico"
-#: timezone/zic.c:653
+#: timezone/zic.c:654
#, c-format
msgid "%s: Can't link from %s to %s: %s\n"
msgstr "%s: No se pudo crear un enlace de %s a %s: %s\n"
-#: timezone/zic.c:751 timezone/zic.c:753
+#: timezone/zic.c:752 timezone/zic.c:754
msgid "same rule name in multiple files"
msgstr "mismo nombre de regla en varios ficheros"
-#: timezone/zic.c:794
+#: timezone/zic.c:795
msgid "unruly zone"
msgstr "zona sin reglas"
-#: timezone/zic.c:801
+#: timezone/zic.c:802
#, c-format
msgid "%s in ruleless zone"
msgstr "%s en una zona sin reglas"
-#: timezone/zic.c:822
+#: timezone/zic.c:823
msgid "standard input"
msgstr "entrada estándar"
-#: timezone/zic.c:827
+#: timezone/zic.c:828
#, c-format
msgid "%s: Can't open %s: %s\n"
msgstr "%s: No se puede abrir %s: %s\n"
-#: timezone/zic.c:838
+#: timezone/zic.c:839
msgid "line too long"
msgstr "línea demasiado larga"
-#: timezone/zic.c:858
+#: timezone/zic.c:859
msgid "input line of unknown type"
msgstr "línea de entrada de tipo desconocido"
@@ -3528,7 +3552,7 @@ msgstr "línea de entrada de tipo desconocido"
# Segundo, según he visto en la documentación, sólo existe un fichero
# de leap lines, por eso pongo 'el'... em+
#
-#: timezone/zic.c:874
+#: timezone/zic.c:875
#, c-format
msgid "%s: Leap line in non leap seconds file %s\n"
msgstr ""
@@ -3536,70 +3560,70 @@ msgstr ""
"ajuste de años bisiestos %s\n"
# Ídem. 1984.
-#: timezone/zic.c:881 timezone/zic.c:1295 timezone/zic.c:1320
+#: timezone/zic.c:882 timezone/zic.c:1297 timezone/zic.c:1322
#, c-format
msgid "%s: panic: Invalid l_value %d\n"
msgstr "%s: grave: valor_l %d inválido\n"
-#: timezone/zic.c:889
+#: timezone/zic.c:890
#, c-format
msgid "%s: Error reading %s\n"
msgstr "%s: Error al leer %s\n"
-#: timezone/zic.c:896
+#: timezone/zic.c:897
#, c-format
msgid "%s: Error closing %s: %s\n"
msgstr "%s: Error al cerrar %s: %s\n"
-#: timezone/zic.c:901
+#: timezone/zic.c:902
msgid "expected continuation line not found"
msgstr "la línea de continuación esperada no se encuentra"
-#: timezone/zic.c:957
+#: timezone/zic.c:958
msgid "wrong number of fields on Rule line"
msgstr "número incorrecto de argumentos en la línea de regla (Rule)"
-#: timezone/zic.c:961
+#: timezone/zic.c:962
msgid "nameless rule"
msgstr "regla sin nombre"
-#: timezone/zic.c:966
+#: timezone/zic.c:967
msgid "invalid saved time"
msgstr "la hora almacenada no es válida"
-#: timezone/zic.c:985
+#: timezone/zic.c:986
msgid "wrong number of fields on Zone line"
msgstr "número de campos incorrecto en la línea de zona (Zone)"
-#: timezone/zic.c:991
+#: timezone/zic.c:992
#, c-format
msgid "\"Zone %s\" line and -l option are mutually exclusive"
msgstr "la línea \"Zone %s\" y la opción -l son mutuamente excluyentes"
-#: timezone/zic.c:999
+#: timezone/zic.c:1000
#, c-format
msgid "\"Zone %s\" line and -p option are mutually exclusive"
msgstr "la línea \"Zone %s\" y la opción -p son mutuamente excluyentes"
-#: timezone/zic.c:1011
+#: timezone/zic.c:1012
#, c-format
msgid "duplicate zone name %s (file \"%s\", line %d)"
msgstr "nombre de zona %s duplicado (fichero \"%s\", línea %d)"
-#: timezone/zic.c:1027
+#: timezone/zic.c:1028
msgid "wrong number of fields on Zone continuation line"
msgstr "número de campos incorrecto en la línea de continuación de zona (Zone)"
-#: timezone/zic.c:1067
+#: timezone/zic.c:1068
msgid "invalid UTC offset"
msgstr "desplazamiento UTC inválido"
-#: timezone/zic.c:1070
+#: timezone/zic.c:1071
msgid "invalid abbreviation format"
msgstr "formato de abreviatura incorrecto"
# VER
-#: timezone/zic.c:1096
+#: timezone/zic.c:1097
msgid "Zone continuation line end time is not after end time of previous line"
msgstr ""
"La línea de continuación de la zona no está después del tiempo de final\n"
@@ -3615,154 +3639,170 @@ msgstr ""
# Si es mejor, ponlo en todos los sitios. Y si no, en ninguno.
# Yo creo que es mucho mejor poner "número incorrecto ..."
# Si no, queda como "al revés". sv+
-#: timezone/zic.c:1123
+#: timezone/zic.c:1124
msgid "wrong number of fields on Leap line"
msgstr "número incorrecto de campos en la línea de bisiesto (Leap)"
-#: timezone/zic.c:1132
+#: timezone/zic.c:1133
msgid "invalid leaping year"
msgstr "año bisiesto inválido"
-#: timezone/zic.c:1147 timezone/zic.c:1250
+#: timezone/zic.c:1148 timezone/zic.c:1252
msgid "invalid month name"
msgstr "nombre de mes incorrecto"
-#: timezone/zic.c:1160 timezone/zic.c:1372 timezone/zic.c:1386
+#: timezone/zic.c:1161 timezone/zic.c:1374 timezone/zic.c:1388
msgid "invalid day of month"
msgstr "día del mes inválido"
-#: timezone/zic.c:1165
+#: timezone/zic.c:1166
msgid "time before zero"
msgstr "hora antes de cero"
-# Sugerencia: Desbordamiento de fecha. (?) sv+
-#: timezone/zic.c:1173 timezone/zic.c:2049 timezone/zic.c:2068
-msgid "time overflow"
-msgstr "desbordamiento horario"
+#: timezone/zic.c:1170
+msgid "time too small"
+msgstr "tiempo demasiado pequeño"
+
+#: timezone/zic.c:1174
+msgid "time too large"
+msgstr "tiempo demasiado grande"
-#: timezone/zic.c:1176 timezone/zic.c:1279
+#: timezone/zic.c:1178 timezone/zic.c:1281
msgid "invalid time of day"
msgstr "hora del día inválida"
-#: timezone/zic.c:1195
+#: timezone/zic.c:1197
msgid "illegal CORRECTION field on Leap line"
msgstr "El campo CORRECTION en la línea de año bisiesto es ilegal"
-#: timezone/zic.c:1199
+#: timezone/zic.c:1201
msgid "illegal Rolling/Stationary field on Leap line"
msgstr "Campo Rolling/Stationary ilegal en la línea de año bisiesto"
-#: timezone/zic.c:1214
+#: timezone/zic.c:1216
msgid "wrong number of fields on Link line"
msgstr "número incorrecto de campos en la línea de enlace (Link)"
-#: timezone/zic.c:1218
+#: timezone/zic.c:1220
msgid "blank FROM field on Link line"
msgstr "Campo FROM vacío en la línea `Link'"
-#: timezone/zic.c:1222
+#: timezone/zic.c:1224
msgid "blank TO field on Link line"
msgstr "Campo TO vacío en la línea `Link'"
-#: timezone/zic.c:1299
+#: timezone/zic.c:1301
msgid "invalid starting year"
msgstr "año de comienzo inválido"
-#: timezone/zic.c:1303 timezone/zic.c:1328
+#: timezone/zic.c:1305
msgid "starting year too low to be represented"
msgstr "el año de comienzo es demasiado bajo para ser representado"
-#: timezone/zic.c:1305 timezone/zic.c:1330
+#: timezone/zic.c:1307
msgid "starting year too high to be represented"
msgstr "el año de comienzo es demasiado alto para ser representado"
-#: timezone/zic.c:1324
+#: timezone/zic.c:1326
msgid "invalid ending year"
msgstr "año de final inválido"
-#: timezone/zic.c:1333
+#: timezone/zic.c:1330
+msgid "ending year too low to be represented"
+msgstr "el año de final es demasiado bajo para ser representado"
+
+#: timezone/zic.c:1332
+msgid "ending year too high to be represented"
+msgstr "el año de final es demasiado alto para ser representado"
+
+#: timezone/zic.c:1335
msgid "starting year greater than ending year"
msgstr "año de comienzo mayor que año de final"
-#: timezone/zic.c:1340
+#: timezone/zic.c:1342
msgid "typed single year"
msgstr "tecleado un único año"
-#: timezone/zic.c:1377
+#: timezone/zic.c:1379
msgid "invalid weekday name"
msgstr "nombre del día de la semana incorrecto"
-#: timezone/zic.c:1492
+#: timezone/zic.c:1494
#, c-format
msgid "%s: Can't remove %s: %s\n"
msgstr "%s: No se puede eliminar %s: %s\n"
-#: timezone/zic.c:1502
+#: timezone/zic.c:1504
#, c-format
msgid "%s: Can't create %s: %s\n"
msgstr "%s: No se puede crear %s: %s\n"
-#: timezone/zic.c:1568
+#: timezone/zic.c:1570
#, c-format
msgid "%s: Error writing %s\n"
msgstr "%s: Error al escribir %s\n"
# FUZZY
-#: timezone/zic.c:1758
+#: timezone/zic.c:1760
msgid "can't determine time zone abbreviation to use just after until time"
msgstr ""
"No se puede determinar la abreviatura de zona horaria que se usará justo\n"
"después de la hora `until'"
-#: timezone/zic.c:1801
+#: timezone/zic.c:1803
msgid "too many transitions?!"
msgstr "¡¿demasiadas transiciones?!"
-#: timezone/zic.c:1820
+#: timezone/zic.c:1822
msgid "internal error - addtype called with bad isdst"
msgstr "error interno - se llamó a `addtype' con un `isdst' erróneo"
-#: timezone/zic.c:1824
+#: timezone/zic.c:1826
msgid "internal error - addtype called with bad ttisstd"
msgstr "error interno - se llamó a `addtype' con un `ttisstd' erróneo"
-#: timezone/zic.c:1828
+#: timezone/zic.c:1830
msgid "internal error - addtype called with bad ttisgmt"
msgstr "error interno - se llamó a `addtype' con un `ttisgmt' erróneo"
-#: timezone/zic.c:1847
+#: timezone/zic.c:1849
msgid "too many local time types"
msgstr "demasiados tipos de hora local"
-#: timezone/zic.c:1875
+#: timezone/zic.c:1877
msgid "too many leap seconds"
msgstr "demasiados segundos intercalares"
-#: timezone/zic.c:1881
+#: timezone/zic.c:1883
msgid "repeated leap second moment"
msgstr "segundo intercalar repetido"
# # Otra opción, resultado incongruente al ejecutar la orden em
-#: timezone/zic.c:1933
+#: timezone/zic.c:1935
msgid "Wild result from command execution"
msgstr "Resultado salvaje en la ejecución de la orden"
# FIXME: `%s'
-#: timezone/zic.c:1934
+#: timezone/zic.c:1936
#, c-format
msgid "%s: command was '%s', result was %d\n"
msgstr "%s: la orden fue `%s', el resultado fue %d\n"
-#: timezone/zic.c:2029
+#: timezone/zic.c:2031
msgid "Odd number of quotation marks"
msgstr "Número impar de comillas"
+# Sugerencia: Desbordamiento de fecha. (?) sv+
+#: timezone/zic.c:2051 timezone/zic.c:2070
+msgid "time overflow"
+msgstr "desbordamiento horario"
+
# FIXME: non leap-year -> non-leap year.
# A lo mejor si pones "veintinueve de febrero" o "29 de febrero"
# se entiende mejor. no sé. sv
# Si, estas pensando lo mismo que yo, 29 de febrero puede confundir, porque
# en el fichero pondrá 2/29 em
-#: timezone/zic.c:2115
+#: timezone/zic.c:2117
msgid "use of 2/29 in non leap-year"
msgstr "uso de 2/29 en un año no bisiesto"
@@ -3770,25 +3810,25 @@ msgstr "uso de 2/29 en un año no bisiesto"
# Esto debe tener algo que ver con la función menopausie() em
# No se me había ocurrido... ¿Se te ocurre algo mejor, ahora que ya
# sabemos lo que quiere decir? sv
-#: timezone/zic.c:2149
+#: timezone/zic.c:2151
msgid "no day in month matches rule"
msgstr "ningún día del mes coincide con la regla"
-#: timezone/zic.c:2172
+#: timezone/zic.c:2175
msgid "too many, or too long, time zone abbreviations"
msgstr "demasiadas abreviaturas de zona horaria, o demasiado largas"
-#: timezone/zic.c:2213
+#: timezone/zic.c:2216
#, c-format
msgid "%s: Can't create directory %s: %s\n"
msgstr "%s: No se puede crear el directorio %s: %s\n"
-#: timezone/zic.c:2235
+#: timezone/zic.c:2238
#, c-format
msgid "%s: %d did not sign extend correctly\n"
msgstr "%s: %d no extendió el signo correctamente\n"
-#: posix/../sysdeps/generic/wordexp.c:1801
+#: posix/../sysdeps/generic/wordexp.c:1797
msgid "parameter null or not set"
msgstr "parámetro nulo o no establecido"
@@ -3859,57 +3899,57 @@ msgstr "Realizadas todas las peticiones"
msgid "Interrupted by a signal"
msgstr "Interrumpido por una señal"
-#: posix/getconf.c:889
+#: posix/getconf.c:892
#, c-format
msgid "Usage: %s [-v specification] variable_name [pathname]\n"
msgstr "Modo de empleo: %s [-v especificación] nombre_de_variable [ruta]\n"
-#: posix/getconf.c:947
+#: posix/getconf.c:950
#, c-format
msgid "unknown specification \"%s\""
msgstr "especificación \"%s\" desconocida"
-#: posix/getconf.c:974 posix/getconf.c:990
+#: posix/getconf.c:979 posix/getconf.c:995
msgid "undefined"
msgstr "sin definir"
-#: posix/getconf.c:1012
+#: posix/getconf.c:1017
#, c-format
msgid "Unrecognized variable `%s'"
msgstr "Variable no reconocida `%s'"
-#: posix/getopt.c:692 posix/getopt.c:704
+#: posix/getopt.c:692 posix/getopt.c:711
#, c-format
msgid "%s: option `%s' is ambiguous\n"
msgstr "%s: la opción `%s' es ambigua\n"
-#: posix/getopt.c:737 posix/getopt.c:741
+#: posix/getopt.c:744 posix/getopt.c:748
#, c-format
msgid "%s: option `--%s' doesn't allow an argument\n"
msgstr "%s: la opción `--%s' no admite ningún argumento\n"
-#: posix/getopt.c:750 posix/getopt.c:755
+#: posix/getopt.c:757 posix/getopt.c:762
#, c-format
msgid "%s: option `%c%s' doesn't allow an argument\n"
msgstr "%s: la opción `%c%s' no admite ningún argumento\n"
-#: posix/getopt.c:791 posix/getopt.c:804 posix/getopt.c:1093
-#: posix/getopt.c:1106
+#: posix/getopt.c:807 posix/getopt.c:829 posix/getopt.c:1159
+#: posix/getopt.c:1181
#, c-format
msgid "%s: option `%s' requires an argument\n"
msgstr "%s: la opción `%s' requiere un argumento\n"
-#: posix/getopt.c:842 posix/getopt.c:845
+#: posix/getopt.c:867 posix/getopt.c:870
#, c-format
msgid "%s: unrecognized option `--%s'\n"
msgstr "%s: opción no reconocida `--%s'\n"
-#: posix/getopt.c:853 posix/getopt.c:856
+#: posix/getopt.c:878 posix/getopt.c:881
#, c-format
msgid "%s: unrecognized option `%c%s'\n"
msgstr "%s: opción no reconocida `%c%s'\n"
-#: posix/getopt.c:903 posix/getopt.c:906
+#: posix/getopt.c:936 posix/getopt.c:939
#, c-format
msgid "%s: illegal option -- %c\n"
msgstr "%s: opción ilegal -- %c\n"
@@ -3928,200 +3968,200 @@ msgstr "%s: opción ilegal -- %c\n"
# Después de leer "1984", lo cambio.
# Aquí y en todas partes. sv
#
-#: posix/getopt.c:912 posix/getopt.c:915
+#: posix/getopt.c:945 posix/getopt.c:948
#, c-format
msgid "%s: invalid option -- %c\n"
msgstr "%s: opción inválida -- %c\n"
-#: posix/getopt.c:962 posix/getopt.c:973 posix/getopt.c:1159
-#: posix/getopt.c:1172
+#: posix/getopt.c:1003 posix/getopt.c:1022 posix/getopt.c:1234
+#: posix/getopt.c:1255
#, c-format
msgid "%s: option requires an argument -- %c\n"
msgstr "%s: la opción requiere un argumento -- %c\n"
-#: posix/getopt.c:1025 posix/getopt.c:1036
+#: posix/getopt.c:1074 posix/getopt.c:1093
#, c-format
msgid "%s: option `-W %s' is ambiguous\n"
msgstr "%s: la opción `-W %s' es ambigua\n"
-#: posix/getopt.c:1060 posix/getopt.c:1072
+#: posix/getopt.c:1117 posix/getopt.c:1138
#, c-format
msgid "%s: option `-W %s' doesn't allow an argument\n"
msgstr "%s: la opción `-W %s' no admite ningún argumento\n"
-#: posix/regcomp.c:136
+#: posix/regcomp.c:150
msgid "No match"
msgstr "No hay ninguna coincidencia"
-#: posix/regcomp.c:139
+#: posix/regcomp.c:153
msgid "Invalid regular expression"
msgstr "La expresión regular es errónea"
-#: posix/regcomp.c:142
+#: posix/regcomp.c:156
msgid "Invalid collation character"
msgstr "Carácter de unión inválido"
-#: posix/regcomp.c:145
+#: posix/regcomp.c:159
msgid "Invalid character class name"
msgstr "Nombre de clase de carácter inválido"
-#: posix/regcomp.c:148
+#: posix/regcomp.c:162
msgid "Trailing backslash"
msgstr "Barra invertida extra al final `\\'"
-#: posix/regcomp.c:151
+#: posix/regcomp.c:165
msgid "Invalid back reference"
msgstr "Referencia hacia atrás inválida"
-#: posix/regcomp.c:154
+#: posix/regcomp.c:168
msgid "Unmatched [ or [^"
msgstr "[ ó [^ desemparejados"
-#: posix/regcomp.c:157
+#: posix/regcomp.c:171
msgid "Unmatched ( or \\("
msgstr "( ó \\( desemparejados"
-#: posix/regcomp.c:160
+#: posix/regcomp.c:174
msgid "Unmatched \\{"
msgstr "\\{ desemparejado"
-#: posix/regcomp.c:163
+#: posix/regcomp.c:177
msgid "Invalid content of \\{\\}"
msgstr "Contenido de \\{\\} inválido"
-#: posix/regcomp.c:166
+#: posix/regcomp.c:180
msgid "Invalid range end"
msgstr "Final de rango inválido"
-#: posix/regcomp.c:169
+#: posix/regcomp.c:183
msgid "Memory exhausted"
msgstr "Memoria agotada"
-#: posix/regcomp.c:172
+#: posix/regcomp.c:186
msgid "Invalid preceding regular expression"
msgstr "La expresión regular precedente es inválida"
-#: posix/regcomp.c:175
+#: posix/regcomp.c:189
msgid "Premature end of regular expression"
msgstr "Fin no esperado de la expresión regular"
-#: posix/regcomp.c:178
+#: posix/regcomp.c:192
msgid "Regular expression too big"
msgstr "La expresión regular es demasiado grande"
-#: posix/regcomp.c:181
+#: posix/regcomp.c:195
msgid "Unmatched ) or \\)"
msgstr ") ó \\) desemparejados"
-#: posix/regcomp.c:615
+#: posix/regcomp.c:661
msgid "No previous regular expression"
msgstr "No existe ninguna expresión regular anterior"
-#: argp/argp-help.c:213
+#: argp/argp-help.c:224
#, c-format
msgid "%.*s: ARGP_HELP_FMT parameter requires a value"
msgstr "%.*s: El parámetro ARGP_HELP_FMT requiere un valor"
-#: argp/argp-help.c:222
+#: argp/argp-help.c:233
#, c-format
msgid "%.*s: Unknown ARGP_HELP_FMT parameter"
msgstr "%.*s: Parámetro ARGP_HELP_FMT desconocido"
-#: argp/argp-help.c:234
+#: argp/argp-help.c:245
#, c-format
msgid "Garbage in ARGP_HELP_FMT: %s"
msgstr "Inconsistencias en ARGP_HELP_FMT: %s"
-#: argp/argp-help.c:1189
+#: argp/argp-help.c:1205
msgid "Mandatory or optional arguments to long options are also mandatory or optional for any corresponding short options."
msgstr ""
"Los argumentos obligatorios u opcionales para las opciones largas son\n"
"también obligatorios u opcionales para las opciones cortas correspondientes."
-#: argp/argp-help.c:1572
+#: argp/argp-help.c:1592
msgid "Usage:"
msgstr "Modo de empleo:"
-#: argp/argp-help.c:1576
+#: argp/argp-help.c:1596
msgid " or: "
msgstr " o: "
-#: argp/argp-help.c:1588
+#: argp/argp-help.c:1608
msgid " [OPTION...]"
msgstr " [OPCIÓN...]"
-#: argp/argp-help.c:1615
+#: argp/argp-help.c:1635
#, c-format
msgid "Try `%s --help' or `%s --usage' for more information.\n"
msgstr "Pruebe `%s --help' o `%s --usage' para más información.\n"
-#: argp/argp-help.c:1643
+#: argp/argp-help.c:1663
#, c-format
msgid "Report bugs to %s.\n"
msgstr "Comunicar bichos a %s.\n"
-#: argp/argp-parse.c:100
+#: argp/argp-parse.c:115
msgid "Give this help list"
msgstr "Da esta lista de ayuda"
-#: argp/argp-parse.c:101
+#: argp/argp-parse.c:116
msgid "Give a short usage message"
msgstr "Da un mensaje corto de uso"
-#: argp/argp-parse.c:102
+#: argp/argp-parse.c:117
msgid "Set the program name"
msgstr "Establece el nombre del programa"
-#: argp/argp-parse.c:104
+#: argp/argp-parse.c:119
msgid "Hang for SECS seconds (default 3600)"
msgstr "Cuelga durante SECS segundos (por omisión, 3600)"
-#: argp/argp-parse.c:161
+#: argp/argp-parse.c:180
msgid "Print program version"
msgstr "Muestra la versión del programa"
-#: argp/argp-parse.c:177
+#: argp/argp-parse.c:196
msgid "(PROGRAM ERROR) No version known!?"
msgstr "(ERROR DEL PROGRAMA) ¿¡No se conoce ninguna versión!?"
-#: argp/argp-parse.c:653
+#: argp/argp-parse.c:672
#, c-format
msgid "%s: Too many arguments\n"
msgstr "%s: Demasiados argumentos\n"
-#: argp/argp-parse.c:794
+#: argp/argp-parse.c:813
msgid "(PROGRAM ERROR) Option should have been recognized!?"
msgstr "(ERROR DEL PROGRAMA) ¿¡No se debería haber reconocido la opción!?"
# ??? resolvedor, determinador, investigador, solucionador ?
# Me suena que quizá exista resolvedor. Habría que enterarse. sv
-#: resolv/herror.c:67
+#: resolv/herror.c:68
msgid "Resolver Error 0 (no error)"
msgstr "Error del determinador de nombres 0 (ningún error)"
# En el libro de Infovía traducen host por "anfitrión"
-#: resolv/herror.c:68
+#: resolv/herror.c:69
msgid "Unknown host"
msgstr "`Host' desconocido"
-#: resolv/herror.c:69
+#: resolv/herror.c:70
msgid "Host name lookup failure"
msgstr "Nombre de `host' no encontrado"
-#: resolv/herror.c:70
+#: resolv/herror.c:71
msgid "Unknown server error"
msgstr "Error del servidor desconocido"
-#: resolv/herror.c:71
+#: resolv/herror.c:72
msgid "No address associated with name"
msgstr "No existe ninguna dirección asociada al nombre"
# ??? lo mismo que arriba
-#: resolv/herror.c:107
+#: resolv/herror.c:108
msgid "Resolver internal error"
msgstr "Error interno del determinador de nombres"
-#: resolv/herror.c:110
+#: resolv/herror.c:111
msgid "Unknown resolver error"
msgstr "Error del determinador de nombres desconocido"
@@ -4174,24 +4214,24 @@ msgstr "basededatos [clave ...]"
msgid "Service configuration to be used"
msgstr "Configuración del servicio"
-#: nss/getent.c:136 nss/getent.c:308
+#: nss/getent.c:136 nss/getent.c:375
#, c-format
msgid "Enumeration not supported on %s\n"
msgstr "La enumeración no está soportada sobre %s\n"
-#: nss/getent.c:732
+#: nss/getent.c:800
msgid "getent - get entries from administrative database."
msgstr "getent - obtiene entradas de la base de datos administrativa."
-#: nss/getent.c:733
+#: nss/getent.c:801
msgid "Supported databases:"
msgstr "Bases de datos admitidas:"
-#: nss/getent.c:790 nscd/nscd.c:124 nscd/nscd_nischeck.c:64
+#: nss/getent.c:858 nscd/nscd.c:131 nscd/nscd_nischeck.c:64
msgid "wrong number of arguments"
msgstr "número incorrecto de argumentos"
-#: nss/getent.c:800
+#: nss/getent.c:868
#, c-format
msgid "Unknown database: %s\n"
msgstr "Base de datos desconocida: %s\n"
@@ -4584,10 +4624,12 @@ msgid "illegal nettype :`%s'\n"
msgstr "tipodered ilegal :`%s'\n"
#: sunrpc/rpc_main.c:1104
+#, c-format
msgid "rpcgen: too many defines\n"
msgstr "rpcgen: demasiados defines\n"
#: sunrpc/rpc_main.c:1116
+#, c-format
msgid "rpcgen: arglist coding error\n"
msgstr "rpcgen: error de codificación de la lista de argumentos\n"
@@ -4599,6 +4641,7 @@ msgid "file `%s' already exists and may be overwritten\n"
msgstr "el fichero `%s' ya existe y podría ser sobreescrito\n"
#: sunrpc/rpc_main.c:1194
+#, c-format
msgid "Cannot specify more than one input file!\n"
msgstr "No se puede especificar más de un fichero de entrada\n"
@@ -4608,6 +4651,7 @@ msgid "This implementation doesn't support newstyle or MT-safe code!\n"
msgstr "¡Esta implementación no admite código de nuevo estilo o `MT-safe'!\n"
#: sunrpc/rpc_main.c:1373
+#, c-format
msgid "Cannot use netid flag with inetd flag!\n"
msgstr "No se puede usar la opción netid con la opción inetd\n"
@@ -4620,12 +4664,14 @@ msgid "Cannot use table flags with newstyle!\n"
msgstr "No se pueden usar las opciones de la tabla con el nuevo estilo\n"
#: sunrpc/rpc_main.c:1411
+#, c-format
msgid "\"infile\" is required for template generation flags.\n"
msgstr ""
"se necesita un \"fichero_de_entrada\" para las opciones de generación\n"
"de plantillas\n"
#: sunrpc/rpc_main.c:1416
+#, c-format
msgid "Cannot have more than one file generation flag!\n"
msgstr "No se puede tener más de una opción de generación de fichero\n"
@@ -4840,7 +4886,7 @@ msgstr "svcudp_create: memoria agotada\n"
msgid "svcudp_create: xp_pad is too small for IP_PKTINFO\n"
msgstr "svcudp_create: xp_pad es demasiado pequeño para IP_PKTINFO\n"
-#: sunrpc/svc_udp.c:471
+#: sunrpc/svc_udp.c:493
msgid "enablecache: cache already enabled"
msgstr "enablecache: el caché ya estaba activado"
@@ -4848,27 +4894,27 @@ msgstr "enablecache: el caché ya estaba activado"
# Parece ser indistinto, así que unas veces puede ser "la" y otras "el".
# dependiendo del caso (lo que mejor suene).
#
-#: sunrpc/svc_udp.c:477
+#: sunrpc/svc_udp.c:499
msgid "enablecache: could not allocate cache"
msgstr "enablecache: no se pudo crear espacio para el caché"
-#: sunrpc/svc_udp.c:485
+#: sunrpc/svc_udp.c:507
msgid "enablecache: could not allocate cache data"
msgstr "enablecache: no se pudo crear espacio para los datos del caché"
-#: sunrpc/svc_udp.c:492
+#: sunrpc/svc_udp.c:514
msgid "enablecache: could not allocate cache fifo"
msgstr "enablecache: no se pudo crear espacio para la pila del caché"
-#: sunrpc/svc_udp.c:528
+#: sunrpc/svc_udp.c:550
msgid "cache_set: victim not found"
msgstr "cache_set: no se encontró el objetivo"
-#: sunrpc/svc_udp.c:539
+#: sunrpc/svc_udp.c:561
msgid "cache_set: victim alloc failed"
msgstr "cache_set: falló la asignación de espacio para el objetivo"
-#: sunrpc/svc_udp.c:545
+#: sunrpc/svc_udp.c:567
msgid "cache_set: could not allocate new rpc_buffer"
msgstr "cache_set: no se pudo asignar espacio para un nuevo búfer rpc"
@@ -4892,7 +4938,7 @@ msgstr "svc_unix: makefd_xprt: memoria agotada\n"
msgid "xdr_bytes: out of memory\n"
msgstr "xdr_bytes: memoria agotada\n"
-#: sunrpc/xdr.c:725 sunrpc/xdr.c:728
+#: sunrpc/xdr.c:728 sunrpc/xdr.c:731
msgid "xdr_string: out of memory\n"
msgstr "xdr_string: memoria agotada\n"
@@ -5317,6 +5363,7 @@ msgid "Access Rights : "
msgstr "Derechos de acceso : "
#: nis/nis_print.c:326
+#, c-format
msgid ""
"\n"
"Time to Live : "
@@ -5408,112 +5455,112 @@ msgstr " No hay ningún no-miembro implícito\n"
msgid " No recursive nonmembers\n"
msgstr " No hay ningún no-miembro recursivo\n"
-#: nis/nss_nisplus/nisplus-publickey.c:96
-#: nis/nss_nisplus/nisplus-publickey.c:172
+#: nis/nss_nisplus/nisplus-publickey.c:101
+#: nis/nss_nisplus/nisplus-publickey.c:182
#, c-format
msgid "DES entry for netname %s not unique\n"
msgstr "La entrada DES para el nombre %s no es única\n"
-#: nis/nss_nisplus/nisplus-publickey.c:208
+#: nis/nss_nisplus/nisplus-publickey.c:218
#, c-format
msgid "netname2user: missing group id list in `%s'."
msgstr "netname2user: falta la lista de ids de grupo en `%s'."
-#: nis/nss_nisplus/nisplus-publickey.c:285
-#: nis/nss_nisplus/nisplus-publickey.c:291
-#: nis/nss_nisplus/nisplus-publickey.c:350
-#: nis/nss_nisplus/nisplus-publickey.c:359
+#: nis/nss_nisplus/nisplus-publickey.c:300
+#: nis/nss_nisplus/nisplus-publickey.c:306
+#: nis/nss_nisplus/nisplus-publickey.c:370
+#: nis/nss_nisplus/nisplus-publickey.c:379
#, c-format
msgid "netname2user: (nis+ lookup): %s\n"
msgstr "netname2user: (búsqueda nis+): %s\n"
-#: nis/nss_nisplus/nisplus-publickey.c:304
+#: nis/nss_nisplus/nisplus-publickey.c:319
#, c-format
msgid "netname2user: DES entry for %s in directory %s not unique"
msgstr "netname2user: la entrada DES para %s en el directorio %s no es única"
-#: nis/nss_nisplus/nisplus-publickey.c:322
+#: nis/nss_nisplus/nisplus-publickey.c:337
#, c-format
msgid "netname2user: principal name `%s' too long"
msgstr "netname2user: el nombre principal `%s' es demasiado largo"
-#: nis/nss_nisplus/nisplus-publickey.c:372
+#: nis/nss_nisplus/nisplus-publickey.c:392
#, c-format
msgid "netname2user: LOCAL entry for %s in directory %s not unique"
msgstr "netname2user: la entrada LOCAL para %s en el directorio %s no es única"
-#: nis/nss_nisplus/nisplus-publickey.c:379
+#: nis/nss_nisplus/nisplus-publickey.c:399
msgid "netname2user: should not have uid 0"
msgstr "netname2user: no debería tener uid 0"
-#: nis/ypclnt.c:174
+#: nis/ypclnt.c:171
#, c-format
msgid "YPBINDPROC_DOMAIN: %s\n"
msgstr "YPBINDPROC_DOMAIN: %s\n"
-#: nis/ypclnt.c:789
+#: nis/ypclnt.c:780
msgid "Request arguments bad"
msgstr "Los argumentos de la petición son incorrectos"
-#: nis/ypclnt.c:791
+#: nis/ypclnt.c:782
msgid "RPC failure on NIS operation"
msgstr "Fallo RPC en una operación NIS"
-#: nis/ypclnt.c:793
+#: nis/ypclnt.c:784
msgid "Can't bind to server which serves this domain"
msgstr "Ha fallado la llamada a bind() con el servidor que sirve a este dominio"
-#: nis/ypclnt.c:795
+#: nis/ypclnt.c:786
msgid "No such map in server's domain"
msgstr "No existe esa tabla en el dominio del servidor"
-#: nis/ypclnt.c:797
+#: nis/ypclnt.c:788
msgid "No such key in map"
msgstr "No existe esta clave en la tabla"
-#: nis/ypclnt.c:799
+#: nis/ypclnt.c:790
msgid "Internal NIS error"
msgstr "Error interno de NIS"
-#: nis/ypclnt.c:801
+#: nis/ypclnt.c:792
msgid "Local resource allocation failure"
msgstr "La asignación de recursos locales ha fallado"
-#: nis/ypclnt.c:803
+#: nis/ypclnt.c:794
msgid "No more records in map database"
msgstr "No hay más registros en la base de datos"
-#: nis/ypclnt.c:805
+#: nis/ypclnt.c:796
msgid "Can't communicate with portmapper"
msgstr "No se puede comunicar con el asignador de puertos"
-#: nis/ypclnt.c:807
+#: nis/ypclnt.c:798
msgid "Can't communicate with ypbind"
msgstr "No se puede establecer comunicación con `ypbind'"
-#: nis/ypclnt.c:809
+#: nis/ypclnt.c:800
msgid "Can't communicate with ypserv"
msgstr "No se puede establecer comunicación con `ypserv'"
-#: nis/ypclnt.c:811
+#: nis/ypclnt.c:802
msgid "Local domain name not set"
msgstr "No se ha establecido el nombre del dominio local"
-#: nis/ypclnt.c:813
+#: nis/ypclnt.c:804
msgid "NIS map database is bad"
msgstr "La base de datos de la tabla NIS no es correcta"
-#: nis/ypclnt.c:815
+#: nis/ypclnt.c:806
msgid "NIS client/server version mismatch - can't supply service"
msgstr ""
"Discordancia en las versiones de NIS del cliente y el servidor.\n"
"No se puede suministrar el servicio."
-#: nis/ypclnt.c:819
+#: nis/ypclnt.c:810
msgid "Database is busy"
msgstr "La base de datos está ocupada"
-#: nis/ypclnt.c:821
+#: nis/ypclnt.c:812
msgid "Unknown NIS error code"
msgstr "Error de NIS desconocido"
@@ -5524,116 +5571,117 @@ msgstr "Error de NIS desconocido"
# De acuerdo.
# [ Antes decía ... la llamada a bind para el servicio de páginas amarillas ]
# Un poco demasiado explicativo. sv
-#: nis/ypclnt.c:863
+#: nis/ypclnt.c:854
msgid "Internal ypbind error"
msgstr "Error interno en ypbind"
# FUZZY
-#: nis/ypclnt.c:865
+#: nis/ypclnt.c:856
msgid "Domain not bound"
msgstr "No se pudo conectar con el dominio"
-#: nis/ypclnt.c:867
+#: nis/ypclnt.c:858
msgid "System resource allocation failure"
msgstr "Fallo en la asignación de recursos del sistema"
-#: nis/ypclnt.c:869
+#: nis/ypclnt.c:860
msgid "Unknown ypbind error"
msgstr "Error desconocido en la llamada a `ypbind()'"
-#: nis/ypclnt.c:908
+#: nis/ypclnt.c:899
msgid "yp_update: cannot convert host to netname\n"
msgstr "yp_update: no se puede convertir el nombre del `host' a nombre de red\n"
-#: nis/ypclnt.c:920
+#: nis/ypclnt.c:911
msgid "yp_update: cannot get server address\n"
msgstr "yp_update: no se puede encontrar la dirección del servidor\n"
-#: nscd/cache.c:88
+#: nscd/cache.c:94
msgid "while allocating hash table entry"
msgstr "al asignar espacio para la entrada en la tabla `hash'"
-#: nscd/cache.c:150 nscd/connections.c:187
+#: nscd/cache.c:162 nscd/connections.c:184
#, c-format
msgid "cannot stat() file `%s': %s"
msgstr "no se puede ejecutar stat() sobre el fichero `%s': %s"
-#: nscd/connections.c:146
-msgid "cannot read configuration file; this is fatal"
-msgstr "no se puede leer el fichero de configuración; este error es fatal"
-
-#: nscd/connections.c:153
+#: nscd/connections.c:150
msgid "Cannot run nscd in secure mode as unprivileged user"
msgstr "No se puede ejecutar nscd en modo seguro como usuario no privilegiado"
-#: nscd/connections.c:175
+#: nscd/connections.c:172
#, c-format
msgid "while allocating cache: %s"
msgstr "al asignar espacio para el caché: %s"
-#: nscd/connections.c:200
+#: nscd/connections.c:197
#, c-format
msgid "cannot open socket: %s"
msgstr "no se puede abrir el `socket': %s"
-#: nscd/connections.c:218
+#: nscd/connections.c:215
#, c-format
msgid "cannot enable socket to accept connections: %s"
msgstr "no se puede activar el `socket' para aceptar conexiones: %s"
#: nscd/connections.c:260
#, c-format
-msgid "handle_request: request received (Version = %d)"
-msgstr "handle_request: petición recibida (Versión = %d)"
-
-#: nscd/connections.c:266
-#, c-format
msgid "cannot handle old request version %d; current version is %d"
msgstr ""
"no se pueden manejar peticiones de la versión %d, la versión\n"
"actual es %d"
-#: nscd/connections.c:304 nscd/connections.c:326
+#: nscd/connections.c:298 nscd/connections.c:324
#, c-format
msgid "cannot write result: %s"
msgstr "no se puede escribir el resultado: %s"
-#: nscd/connections.c:405 nscd/connections.c:499
+#: nscd/connections.c:392 nscd/connections.c:514
#, c-format
msgid "error getting callers id: %s"
msgstr "error al obtener el id de los llamantes: %s"
-#: nscd/connections.c:471
+#: nscd/connections.c:485
#, c-format
msgid "while accepting connection: %s"
msgstr "al aceptar la conexión: %s"
-#: nscd/connections.c:482
+#: nscd/connections.c:498
#, c-format
msgid "short read while reading request: %s"
msgstr "lectura insuficiente mientras se leía la petición: %s"
-#: nscd/connections.c:518
+#: nscd/connections.c:542
#, c-format
msgid "key length in request too long: %d"
msgstr "la longitud de la clave en la petición es demasiado larga: %d"
-#: nscd/connections.c:532
+#: nscd/connections.c:556
#, c-format
msgid "short read while reading request key: %s"
msgstr "se acabaron los datos mientras se leía la clave de petición: %s"
-#: nscd/connections.c:591 nscd/connections.c:592 nscd/connections.c:611
-#: nscd/connections.c:624 nscd/connections.c:630 nscd/connections.c:637
+#: nscd/connections.c:566
+#, c-format
+msgid "handle_request: request received (Version = %d) from PID %ld"
+msgstr "handle_request: petición recibida (Versión = %d) del PID %ld"
+
+#: nscd/connections.c:571
+#, c-format
+msgid "handle_request: request received (Version = %d)"
+msgstr "handle_request: petición recibida (Versión = %d)"
+
+#: nscd/connections.c:635 nscd/connections.c:636 nscd/connections.c:655
+#: nscd/connections.c:668 nscd/connections.c:674 nscd/connections.c:681
#, c-format
msgid "Failed to run nscd as user '%s'"
msgstr "Fallo al ejecutar nscd como usuario `%s'"
-#: nscd/connections.c:612
+#: nscd/connections.c:656
msgid "getgrouplist failed"
msgstr "falló `getgrouplist'"
-#: nscd/connections.c:625
+#: nscd/connections.c:669
msgid "setgroups failed"
msgstr "falló `setgroups'"
@@ -5645,121 +5693,134 @@ msgstr "al asignar espacio para la copia de la clave"
msgid "while allocating cache entry"
msgstr "al asignar espacio para la entrada en el caché"
-#: nscd/grpcache.c:196 nscd/hstcache.c:282 nscd/pwdcache.c:192
+#: nscd/grpcache.c:197 nscd/hstcache.c:283 nscd/pwdcache.c:193
#, c-format
msgid "short write in %s: %s"
msgstr "escritura insuficiente en %s: %s"
-#: nscd/grpcache.c:218
+#: nscd/grpcache.c:219
#, c-format
msgid "Haven't found \"%s\" in group cache!"
msgstr "No se ha encontrado \"%s\" en el caché de grupos"
-#: nscd/grpcache.c:284
+#: nscd/grpcache.c:285
#, c-format
msgid "Invalid numeric gid \"%s\"!"
msgstr "¡gid numérico inválido \"%s\"!"
-#: nscd/grpcache.c:291
+#: nscd/grpcache.c:292
#, c-format
msgid "Haven't found \"%d\" in group cache!"
msgstr "No se ha encontrado \"%d\" en el caché de grupo"
-#: nscd/hstcache.c:304 nscd/hstcache.c:370 nscd/hstcache.c:435
-#: nscd/hstcache.c:500
+#: nscd/hstcache.c:305 nscd/hstcache.c:371 nscd/hstcache.c:436
+#: nscd/hstcache.c:501
#, c-format
msgid "Haven't found \"%s\" in hosts cache!"
msgstr "No se ha encontrado \"%s\" en el caché de `hosts'"
-#: nscd/nscd.c:85
+#: nscd/nscd.c:89
msgid "Read configuration data from NAME"
msgstr "Lee datos de configuración de NOMBRE"
-#: nscd/nscd.c:87
+#: nscd/nscd.c:91
msgid "Do not fork and display messages on the current tty"
msgstr "No se divide y muestra los mensajes en la terminal actual"
-#: nscd/nscd.c:88
+#: nscd/nscd.c:92
msgid "NUMBER"
msgstr "NÚMERO"
-#: nscd/nscd.c:88
+#: nscd/nscd.c:92
msgid "Start NUMBER threads"
msgstr "Comienza NÚMERO hilos"
-#: nscd/nscd.c:89
+#: nscd/nscd.c:93
msgid "Shut the server down"
msgstr "Apagar el servidor"
-#: nscd/nscd.c:90
+#: nscd/nscd.c:94
msgid "Print current configuration statistic"
msgstr "Muestra una estadística sobre la configuración actual"
-#: nscd/nscd.c:91
+#: nscd/nscd.c:95
msgid "TABLE"
msgstr "TABLA"
-#: nscd/nscd.c:92
+#: nscd/nscd.c:96
msgid "Invalidate the specified cache"
msgstr "Invalida la caché especificada"
-#: nscd/nscd.c:93
+#: nscd/nscd.c:97
msgid "TABLE,yes"
msgstr "TABLA,sí"
-#: nscd/nscd.c:93
+#: nscd/nscd.c:97
msgid "Use separate cache for each user"
msgstr "Utiliza una caché separada para cada usuario"
-#: nscd/nscd.c:98
+#: nscd/nscd.c:102
msgid "Name Service Cache Daemon."
msgstr "Daemon de Caché del Servicio de Nombres."
-#: nscd/nscd.c:131
+#: nscd/nscd.c:141
+msgid "cannot read configuration file; this is fatal"
+msgstr "no se puede leer el fichero de configuración; este error es fatal"
+
+#: nscd/nscd.c:152
msgid "already running"
msgstr "ya está funcionando"
-#: nscd/nscd.c:243 nscd/nscd.c:263 nscd/nscd.c:269
+#: nscd/nscd.c:270 nscd/nscd.c:294 nscd/nscd_stat.c:132
msgid "Only root is allowed to use this option!"
msgstr "Solamente root puede usar esta opción"
-#: nscd/nscd_conf.c:83
+#: nscd/nscd_conf.c:88
#, c-format
msgid "Parse error: %s"
msgstr "Error de análisis: %s"
-#: nscd/nscd_conf.c:166
+#: nscd/nscd_conf.c:171
#, c-format
msgid "Could not create log file \"%s\""
msgstr "No se pudo crear el fichero de registro \"%s\""
-#: nscd/nscd_conf.c:182
+#: nscd/nscd_conf.c:187
msgid "Must specify user name for server-user option"
msgstr "Debe especificar un nombre de usuario para la opción `server-user'"
-#: nscd/nscd_conf.c:187
+#: nscd/nscd_conf.c:194
+msgid "Must specify user name for stat-user option"
+msgstr "Debe especificar un nombre de usuario para la opción `stat-user'"
+
+#: nscd/nscd_conf.c:205
#, c-format
msgid "Unknown option: %s %s %s"
msgstr "Opción desconocida: %s %s %s"
-#: nscd/nscd_stat.c:87
+#: nscd/nscd_stat.c:103
#, c-format
msgid "cannot write statistics: %s"
msgstr "no se pueden escribir las estadísticas: %s"
-#: nscd/nscd_stat.c:105
+#: nscd/nscd_stat.c:128
+#, c-format
+msgid "Only root or %s is allowed to use this option!"
+msgstr "Solamente root o %s puede usar esta opción"
+
+#: nscd/nscd_stat.c:139
msgid "nscd not running!\n"
msgstr "nscd no está en ejecución\n"
-#: nscd/nscd_stat.c:116
+#: nscd/nscd_stat.c:150
msgid "write incomplete"
msgstr "escritura incompleta"
-#: nscd/nscd_stat.c:128
+#: nscd/nscd_stat.c:162
msgid "cannot read statistics data"
msgstr "no se pueden leer los datos de estadística"
-#: nscd/nscd_stat.c:131
+#: nscd/nscd_stat.c:165
#, c-format
msgid ""
"nscd configuration:\n"
@@ -5770,61 +5831,96 @@ msgstr ""
"\n"
"%15d nivel de depuración del servidor\n"
-#: nscd/nscd_stat.c:146 nscd/nscd_stat.c:148
+#: nscd/nscd_stat.c:189
+#, c-format
+msgid "%3ud %2uh %2um %2lus server runtime\n"
+msgstr "%3ud %2uh %2um %2lus tiempo de funcionamiento del servidor\n"
+
+#: nscd/nscd_stat.c:192
+#, c-format
+msgid " %2uh %2um %2lus server runtime\n"
+msgstr " %2uh %2um %2lus tiempo de funcionamiento del servidor\n"
+
+#: nscd/nscd_stat.c:194
+#, c-format
+msgid " %2um %2lus server runtime\n"
+msgstr " %2um %2lus tiempo de funcionamiento del servidor\n"
+
+#: nscd/nscd_stat.c:196
+#, c-format
+msgid " %2lus server runtime\n"
+msgstr " %2lus tiempo de funcionamiento del servidor\n"
+
+#: nscd/nscd_stat.c:198
+#, c-format
+msgid "%15lu number of times clients had to wait\n"
+msgstr "%15lu número de veces que los clientes tuvieron que esperar\n"
+
+#: nscd/nscd_stat.c:213 nscd/nscd_stat.c:215
msgid " no"
msgstr " no"
-#: nscd/nscd_stat.c:146 nscd/nscd_stat.c:148
+#: nscd/nscd_stat.c:213 nscd/nscd_stat.c:215
msgid " yes"
msgstr " sí"
-#: nscd/nscd_stat.c:154
+#: nscd/nscd_stat.c:221
#, c-format
msgid ""
"\n"
"%s cache:\n"
"\n"
"%15s cache is enabled\n"
-"%15Zd suggested size\n"
-"%15ld seconds time to live for positive entries\n"
-"%15ld seconds time to live for negative entries\n"
-"%15ld cache hits on positive entries\n"
-"%15ld cache hits on negative entries\n"
-"%15ld cache misses on positive entries\n"
-"%15ld cache misses on negative entries\n"
-"%15ld%% cache hit rate\n"
+"%15Zu suggested size\n"
+"%15lu seconds time to live for positive entries\n"
+"%15lu seconds time to live for negative entries\n"
+"%15lu cache hits on positive entries\n"
+"%15lu cache hits on negative entries\n"
+"%15lu cache misses on positive entries\n"
+"%15lu cache misses on negative entries\n"
+"%15lu%% cache hit rate\n"
+"%15lu current number of cached values\n"
+"%15lu maximum number of cached values\n"
+"%15lu maximum chain length searched\n"
+"%15lu number of delays on rdlock\n"
+"%15lu number of delays on wrlock\n"
"%15s check /etc/%s for changes\n"
msgstr ""
"\n"
"%s caché:\n"
"\n"
"%15s el caché está activado\n"
-"%15Zd tamaño sugerido\n"
-"%15ld segundos de vida para las entradas positivas\n"
-"%15ld segundos de vida para las entradas negativas\n"
-"%15ld aciertos de caché en las entradas positivas\n"
-"%15ld aciertos de caché en las entradas negativas\n"
-"%15ld fallos de caché en las entradas positivas\n"
-"%15ld fallos de caché en las entradas negativas\n"
-"%15ld%% tasa de aciertos de caché\n"
+"%15Zu tamaño sugerido\n"
+"%15lu segundos de vida para las entradas positivas\n"
+"%15lu segundos de vida para las entradas negativas\n"
+"%15lu aciertos de caché en las entradas positivas\n"
+"%15lu aciertos de caché en las entradas negativas\n"
+"%15lu fallos de caché en las entradas positivas\n"
+"%15lu fallos de caché en las entradas negativas\n"
+"%15lu%% tasa de aciertos de caché\n"
+"%15lu número actual de valores en caché\n"
+"%15lu número máximo de valores en caché\n"
+"%15lu longitud máxima de la cadena buscada\n"
+"%15lu número de retardos en rdlock\n"
+"%15lu número de retardos en wrlock\n"
"%15s compruebe /etc/%s para cambios\n"
-#: nscd/pwdcache.c:214
+#: nscd/pwdcache.c:215
#, c-format
msgid "Haven't found \"%s\" in password cache!"
msgstr "No se ha encontrado \"%s\" en el caché de contraseñas"
-#: nscd/pwdcache.c:280
+#: nscd/pwdcache.c:281
#, c-format
msgid "Invalid numeric uid \"%s\"!"
msgstr "¡uid numérico inválido \"%s\"!"
-#: nscd/pwdcache.c:287
+#: nscd/pwdcache.c:288
#, c-format
msgid "Haven't found \"%d\" in password cache!"
msgstr "No se ha encontrado \"%d\" en el caché de contraseñas"
-#: elf/../sysdeps/generic/dl-sysdep.c:357
+#: elf/../sysdeps/generic/dl-sysdep.c:422
msgid "cannot create capability list"
msgstr "no se puede crear la lista de capacidades"
@@ -5863,62 +5959,62 @@ msgid "%s is for unknown machine %d.\n"
msgstr "%s es para la máquina desconocida %d.\n"
# FIXME: Falta ver si es niño o niña. sv
-#: elf/cache.c:69
+#: elf/cache.c:70
msgid "unknown"
msgstr "desconocido/a"
-#: elf/cache.c:105
+#: elf/cache.c:111
msgid "Unknown OS"
msgstr "Sistema Operativo desconocido"
-#: elf/cache.c:110
+#: elf/cache.c:116
#, c-format
msgid ", OS ABI: %s %d.%d.%d"
msgstr ", ABI del SO: %s %d.%d.%d"
-#: elf/cache.c:136 elf/ldconfig.c:1045
+#: elf/cache.c:142 elf/ldconfig.c:1078
#, c-format
msgid "Can't open cache file %s\n"
msgstr "No se puede abrir el fichero de caché %s\n"
-#: elf/cache.c:148
+#: elf/cache.c:154
msgid "mmap of cache file failed.\n"
msgstr "falló la operación `mmap' sobre el fichero de caché.\n"
-#: elf/cache.c:152 elf/cache.c:162
+#: elf/cache.c:158 elf/cache.c:168
msgid "File is not a cache file.\n"
msgstr "El fichero no es un fichero de caché.\n"
-#: elf/cache.c:195 elf/cache.c:205
+#: elf/cache.c:201 elf/cache.c:211
#, c-format
msgid "%d libs found in cache `%s'\n"
msgstr "%d bibliotecas se encontraron en la caché `%s'\n"
-#: elf/cache.c:392
+#: elf/cache.c:410
#, c-format
msgid "Can't remove old temporary cache file %s"
msgstr "No se puede borrar el fichero de caché temporal antiguo %s"
-#: elf/cache.c:399
+#: elf/cache.c:417
#, c-format
msgid "Can't create temporary cache file %s"
msgstr "No se puede crear el fichero temporal de caché %s"
-#: elf/cache.c:407 elf/cache.c:416 elf/cache.c:420
+#: elf/cache.c:425 elf/cache.c:434 elf/cache.c:438
msgid "Writing of cache data failed"
msgstr "Falló la escritura de los datos de la caché"
# FIXME. Merge with previous message (?). sv
-#: elf/cache.c:424
+#: elf/cache.c:442
msgid "Writing of cache data failed."
msgstr "Falló la escritura de los datos de la caché"
-#: elf/cache.c:431
+#: elf/cache.c:449
#, c-format
msgid "Changing access rights of %s to %#o failed"
msgstr "El cambio de los derechos de acceso de %s a %#o falló"
-#: elf/cache.c:436
+#: elf/cache.c:454
#, c-format
msgid "Renaming of %s to %s failed"
msgstr "Falló el renombramiento de %s a %s"
@@ -5927,7 +6023,7 @@ msgstr "Falló el renombramiento de %s a %s"
msgid "shared object not open"
msgstr "el objeto compartido no está abierto"
-#: elf/dl-close.c:486 elf/dl-open.c:444
+#: elf/dl-close.c:531 elf/dl-open.c:454
msgid "TLS generation counter wrapped! Please send report with the 'glibcbug' script."
msgstr ""
"¡El contador de generaciones TLS ha vuelto a cero! Por favor envíe un informe\n"
@@ -5972,137 +6068,145 @@ msgstr "¡¡¡HAY UN BICHO EN EL ENLAZADOR DINÁMICO!!!"
msgid "error while loading shared libraries"
msgstr "error al cargar las bibliotecas compartidas"
-#: elf/dl-load.c:339
+#: elf/dl-load.c:347
msgid "cannot allocate name record"
msgstr "no se puede asignar el registro del nombre"
# He intentado mejorarlo un poco ...
#
-#: elf/dl-load.c:441 elf/dl-load.c:520 elf/dl-load.c:612 elf/dl-load.c:707
+#: elf/dl-load.c:449 elf/dl-load.c:528 elf/dl-load.c:648 elf/dl-load.c:743
msgid "cannot create cache for search path"
msgstr "no se puede crear un caché para la ruta de búsqueda"
-#: elf/dl-load.c:543
+#: elf/dl-load.c:551
msgid "cannot create RUNPATH/RPATH copy"
msgstr "no se puede crear una copia RUNPATH/RPATH"
-#: elf/dl-load.c:598
+#: elf/dl-load.c:634
msgid "cannot create search path array"
msgstr "no se puede crear la matriz de la ruta de búsqueda"
-#: elf/dl-load.c:794
+#: elf/dl-load.c:830
msgid "cannot stat shared object"
msgstr "no se puede efectuar `stat' sobre el objeto compartido"
-#: elf/dl-load.c:838
+#: elf/dl-load.c:874
msgid "cannot open zero fill device"
msgstr "no se puede abrir el dispositivo de `zero fill'"
-#: elf/dl-load.c:847 elf/dl-load.c:1902
+#: elf/dl-load.c:883 elf/dl-load.c:1929
msgid "cannot create shared object descriptor"
msgstr "no se puede crear el descriptor del objeto compartido"
-#: elf/dl-load.c:866 elf/dl-load.c:1398 elf/dl-load.c:1481
+#: elf/dl-load.c:902 elf/dl-load.c:1470 elf/dl-load.c:1553
msgid "cannot read file data"
msgstr "no se pueden leer los datos del fichero"
-#: elf/dl-load.c:906
+#: elf/dl-load.c:946
msgid "ELF load command alignment not page-aligned"
msgstr "El alineamiento de la orden de carga ELF no está alineado a la página"
-#: elf/dl-load.c:913
+#: elf/dl-load.c:953
msgid "ELF load command address/offset not properly aligned"
msgstr "La dirección/desplazamiento de la orden de carga ELF no está bien alineada"
-#: elf/dl-load.c:988
+#: elf/dl-load.c:1037
msgid "cannot allocate TLS data structures for initial thread"
msgstr "no se pueden crear las estructuras de datos TLS para el hilo inicial"
-#: elf/dl-load.c:1012
+#: elf/dl-load.c:1061
msgid "cannot handle TLS data"
msgstr "no se pueden manejar los datos de TLS"
-#: elf/dl-load.c:1047
+#: elf/dl-load.c:1075
+msgid "object file has no loadable segments"
+msgstr "el fichero objeto no tiene segmentos cargables"
+
+#: elf/dl-load.c:1110
msgid "failed to map segment from shared object"
msgstr "fallo al asignar un segmento del objeto compartido"
-#: elf/dl-load.c:1071
+#: elf/dl-load.c:1135
msgid "cannot dynamically load executable"
msgstr "no se puede cargar el ejecutable dinámicamente"
-#: elf/dl-load.c:1132
+#: elf/dl-load.c:1191
msgid "cannot change memory protections"
msgstr "no se pueden cambiar las protecciones de memoria"
-#: elf/dl-load.c:1151
+#: elf/dl-load.c:1210
msgid "cannot map zero-fill pages"
msgstr "no se pueden asignar páginas de tipo `zero-fill'"
-#: elf/dl-load.c:1169
+#: elf/dl-load.c:1228
msgid "cannot allocate memory for program header"
msgstr "no se puede asignar memoria para la cabecera del programa"
-#: elf/dl-load.c:1200
+#: elf/dl-load.c:1259
msgid "object file has no dynamic section"
msgstr "el fichero objeto no tiene sección dinámica"
-#: elf/dl-load.c:1240
+#: elf/dl-load.c:1299
msgid "shared object cannot be dlopen()ed"
msgstr "no se puede efectuar dlopen() sobre el objeto compartido"
-#: elf/dl-load.c:1263
+#: elf/dl-load.c:1322
msgid "cannot create searchlist"
msgstr "no se puede crear la lista de búsqueda"
-#: elf/dl-load.c:1398
+#: elf/dl-load.c:1352
+msgid "cannot enable executable stack as shared object requires"
+msgstr "no se puede activar la pila ejecutable tal y como el objeto compartido necesita"
+
+#: elf/dl-load.c:1470
msgid "file too short"
msgstr "fichero demasiado corto"
-#: elf/dl-load.c:1421
+#: elf/dl-load.c:1493
msgid "invalid ELF header"
msgstr "cabecera ELF inválida"
-#: elf/dl-load.c:1430
+#: elf/dl-load.c:1502
msgid "ELF file data encoding not big-endian"
msgstr "La codificación de los datos del fichero ELF no es `big-endian'"
-#: elf/dl-load.c:1432
+#: elf/dl-load.c:1504
msgid "ELF file data encoding not little-endian"
msgstr "La codificación de los datos del fichero ELF no es `little-endian'"
-#: elf/dl-load.c:1436
+#: elf/dl-load.c:1508
msgid "ELF file version ident does not match current one"
msgstr "La identificación de versión del fichero ELF no encaja con la actual"
-#: elf/dl-load.c:1440
+#: elf/dl-load.c:1512
msgid "ELF file OS ABI invalid"
msgstr "ABI del SO del fichero ELF inválida"
-#: elf/dl-load.c:1442
+#: elf/dl-load.c:1514
msgid "ELF file ABI version invalid"
msgstr "Versión de ABI del fichero ELF inválida"
-#: elf/dl-load.c:1445
+#: elf/dl-load.c:1517
msgid "internal error"
msgstr "error interno"
-#: elf/dl-load.c:1452
+#: elf/dl-load.c:1524
msgid "ELF file version does not match current one"
msgstr "La versión del fichero ELF no coincide con la actual"
-#: elf/dl-load.c:1460
+#: elf/dl-load.c:1532
msgid "ELF file's phentsize not the expected size"
msgstr "El `phentsize' del fichero ELF no es el tamaño esperado"
-#: elf/dl-load.c:1466
+#: elf/dl-load.c:1538
msgid "only ET_DYN and ET_EXEC can be loaded"
msgstr "solamente pueden cargarse ET_DYN y ET_EXEC"
-#: elf/dl-load.c:1917
+#: elf/dl-load.c:1944
msgid "cannot open shared object file"
msgstr "no se puede abrir el fichero del objeto compartido"
-#: elf/dl-lookup.c:265 elf/dl-lookup.c:430
+#: elf/dl-lookup.c:265 elf/dl-lookup.c:443
msgid "relocation error"
msgstr "error de relocalización"
@@ -6114,40 +6218,38 @@ msgstr "no se puede extender el ámbito global"
msgid "empty dynamic string token substitution"
msgstr "sustitución dinámica de un elemento por una cadena vacía"
-#: elf/dl-open.c:351 elf/dl-open.c:362
+#: elf/dl-open.c:361 elf/dl-open.c:372
msgid "cannot create scope list"
msgstr "no se puede crear la lista de ámbito"
-#: elf/dl-open.c:424
+#: elf/dl-open.c:434
msgid "cannot create TLS data structures"
msgstr "no se pueden crear las estructuras de datos TLS"
-#: elf/dl-open.c:486
+#: elf/dl-open.c:496
msgid "invalid mode for dlopen()"
msgstr "modo inválido para dlopen()"
-#: elf/dl-reloc.c:58
-msgid "shared object cannot be dlopen()ed: static TLS memory too small"
-msgstr ""
-"no se puede efectuar dlopen() sobre el objeto compartido: memoria estática TLS\n"
-"demasiado pequeña"
+#: elf/dl-reloc.c:57
+msgid "cannot allocate memory in static TLS block"
+msgstr "no se puede asignar memoria en el bloque TLS estático"
-#: elf/dl-reloc.c:118
+#: elf/dl-reloc.c:176
msgid "cannot make segment writable for relocation"
msgstr "no se puede hacer el segmento escribible para su relocalización"
-#: elf/dl-reloc.c:219
+#: elf/dl-reloc.c:277
#, c-format
msgid "%s: profiler found no PLTREL in object %s\n"
msgstr "%s: el `profiler' no encontró ningún PLTREL en el objeto %s\n"
-#: elf/dl-reloc.c:231
+#: elf/dl-reloc.c:289
#, c-format
msgid "%s: profiler out of memory shadowing PLTREL of %s\n"
msgstr "%s: el `profiler' se quedó sin memoria al ocultar el PLTREL de %s\n"
# Se admiten sugerencias. sv
-#: elf/dl-reloc.c:246
+#: elf/dl-reloc.c:304
msgid "cannot restore segment prot after reloc"
msgstr "no se puede restaurar el `prot' del segmento después de la relocalización"
@@ -6155,7 +6257,7 @@ msgstr "no se puede restaurar el `prot' del segmento después de la relocalizació
msgid "RTLD_NEXT used in code not dynamically loaded"
msgstr "Se ha usado RTLD_NEXT en una parte del código que no se cargó dinámicamente"
-#: elf/dl-version.c:302
+#: elf/dl-version.c:303
msgid "cannot allocate version reference table"
msgstr "no se puede asignar espacio para la tabla de versiones de referencia"
@@ -6202,143 +6304,148 @@ msgid "Format to use: new, old or compat (default)"
msgstr "Formato utilizado: new, old o compat (predeterminado)"
# FIXME: Why So Many Uppercase Letters? sv
-#: elf/ldconfig.c:136
+#: elf/ldconfig.c:139
msgid "Configure Dynamic Linker Run Time Bindings."
msgstr "Configura las asociaciones de tiempo de ejecución del enlazador dinámico"
-#: elf/ldconfig.c:294
+#: elf/ldconfig.c:297
#, c-format
msgid "Path `%s' given more than once"
msgstr "Se ha dado la ruta `%s' más de una vez"
-#: elf/ldconfig.c:338
+#: elf/ldconfig.c:341
#, c-format
msgid "%s is not a known library type"
msgstr "%s no es un tipo de biblioteca conocido"
-#: elf/ldconfig.c:356
+#: elf/ldconfig.c:361
#, c-format
msgid "Can't stat %s"
msgstr "No se puede efectuar `stat' sobre %s"
-#: elf/ldconfig.c:426
+#: elf/ldconfig.c:431
#, c-format
msgid "Can't stat %s\n"
msgstr "No se puede efectuar `stat' sobre %s\n"
-#: elf/ldconfig.c:436
+#: elf/ldconfig.c:441
#, c-format
msgid "%s is not a symbolic link\n"
msgstr "%s no es un enlace simbólico\n"
-#: elf/ldconfig.c:455
+#: elf/ldconfig.c:460
#, c-format
msgid "Can't unlink %s"
msgstr "No se puede efectuar `unlink' sobre %s"
-#: elf/ldconfig.c:461
+#: elf/ldconfig.c:466
#, c-format
msgid "Can't link %s to %s"
msgstr "No se puede crear un enlace de %s a %s"
-#: elf/ldconfig.c:467
+#: elf/ldconfig.c:472
msgid " (changed)\n"
msgstr " (cambiado)\n"
-#: elf/ldconfig.c:469
+#: elf/ldconfig.c:474
msgid " (SKIPPED)\n"
msgstr " (SALTADO)\n"
-#: elf/ldconfig.c:524
+#: elf/ldconfig.c:529
#, c-format
msgid "Can't find %s"
msgstr "No se encuentra %s"
-#: elf/ldconfig.c:540
+#: elf/ldconfig.c:545
#, c-format
msgid "Can't lstat %s"
msgstr "No se puede efectuar `lstat' sobre %s"
-#: elf/ldconfig.c:547
+#: elf/ldconfig.c:552
#, c-format
msgid "Ignored file %s since it is not a regular file."
msgstr "Descartado el fichero %s dado que no es un fichero regular."
-#: elf/ldconfig.c:555
+#: elf/ldconfig.c:560
#, c-format
msgid "No link created since soname could not be found for %s"
msgstr "No se creó el enlace ya que no se encontró el soname para %s"
-#: elf/ldconfig.c:646
+#: elf/ldconfig.c:651
#, c-format
msgid "Can't open directory %s"
msgstr "No se puede abrir el directorio %s"
-#: elf/ldconfig.c:701 elf/ldconfig.c:748
+#: elf/ldconfig.c:706 elf/ldconfig.c:753
#, c-format
msgid "Cannot lstat %s"
msgstr "No se puede efectuar `lstat' sobre %s"
-#: elf/ldconfig.c:713
+#: elf/ldconfig.c:718
#, c-format
msgid "Cannot stat %s"
msgstr "No se puede efectuar `stat' sobre %s"
-#: elf/ldconfig.c:770 elf/readlib.c:93
+#: elf/ldconfig.c:775 elf/readlib.c:92
#, c-format
msgid "Input file %s not found.\n"
msgstr "No se encontró el fichero de entrada %s.\n"
-#: elf/ldconfig.c:804
+#: elf/ldconfig.c:826
#, c-format
msgid "libc5 library %s in wrong directory"
msgstr "biblioteca libc5 %s en un directorio equivocado"
-#: elf/ldconfig.c:807
+#: elf/ldconfig.c:829
#, c-format
msgid "libc6 library %s in wrong directory"
msgstr "biblioteca libc6 %s en un directorio equivocado"
-#: elf/ldconfig.c:810
+#: elf/ldconfig.c:832
#, c-format
msgid "libc4 library %s in wrong directory"
msgstr "biblioteca libc4 %s en un directorio equivocado"
-#: elf/ldconfig.c:837
+#: elf/ldconfig.c:859
#, c-format
msgid "libraries %s and %s in directory %s have same soname but different type."
msgstr "las bibliotecas %s y %s en el directorio %s tienen el mismo soname pero distinto tipo."
-#: elf/ldconfig.c:940
+#: elf/ldconfig.c:962
#, c-format
msgid "Can't open configuration file %s"
msgstr "No se puede abrir el fichero de configuración `%s'"
-#: elf/ldconfig.c:1024
+#: elf/ldconfig.c:1033
+#, c-format
+msgid "relative path `%s' used to build cache"
+msgstr "se usa el camino relativo `%s' para construir el caché"
+
+#: elf/ldconfig.c:1057
msgid "Can't chdir to /"
msgstr "No se puede cambiar al directorio /"
-#: elf/ldconfig.c:1066
+#: elf/ldconfig.c:1099
#, c-format
msgid "Can't open cache file directory %s\n"
msgstr "No se puede leer el directorio de ficheros de caché %s\n"
-#: elf/readlib.c:99
+#: elf/readlib.c:98
#, c-format
msgid "Cannot fstat file %s.\n"
msgstr "No se puede efectuar `fstat' sobre el fichero %s.\n"
-#: elf/readlib.c:109
+#: elf/readlib.c:108
#, c-format
msgid "File %s is too small, not checked."
msgstr "El fichero %s es demasiado pequeño, no se comprueba."
-#: elf/readlib.c:118
+#: elf/readlib.c:117
#, c-format
msgid "Cannot mmap file %s.\n"
msgstr "No se puede efectuar `mmap' sobre el fichero %s.\n"
-#: elf/readlib.c:158
+#: elf/readlib.c:155
#, c-format
msgid "%s is not an ELF file - it has the wrong magic bytes at the start.\n"
msgstr "%s no es un fichero ELF - tiene los bytes mágicos equivocados en el comienzo.\n"
@@ -6438,6 +6545,11 @@ msgstr "`%s' no es un fichero de datos para `profile' correcto para `%s'"
msgid "cannot allocate symbol data"
msgstr "no se puede asignar espacio para los datos del símbolo"
+#~ msgid "shared object cannot be dlopen()ed: static TLS memory too small"
+#~ msgstr ""
+#~ "no se puede efectuar dlopen() sobre el objeto compartido: memoria estática TLS\n"
+#~ "demasiado pequeña"
+
# FIXME: Decir al autor que no use tabs. sv
#~ msgid "\t\t\t\t\t\t\t %s: value for field `%s' must be in range %d...%d"
#~ msgstr "\t\t\t\t\t\t\t %s: el valor para el campo `%s' debe estar en el rango %d...%d"
@@ -6535,9 +6647,6 @@ msgstr "no se puede asignar espacio para los datos del símbolo"
#~ msgid "cannot insert collation element `%.*s'"
#~ msgstr "no se puede insertar el elemento de unión `%.*s' "
-#~ msgid "cannot insert into result table"
-#~ msgstr "no se puede insertar el la tabla de resultados"
-
# FUZZY
#~ msgid "cannot insert new collating symbol definition: %s"
#~ msgstr "no se puede insertar la nueva definición para el símbolo de unión: %s"
diff --git a/posix/regex_internal.h b/posix/regex_internal.h
index 0ccd8d3665..23765c970e 100644
--- a/posix/regex_internal.h
+++ b/posix/regex_internal.h
@@ -486,7 +486,7 @@ struct re_dfastate_t
re_node_set non_eps_nodes;
re_node_set inveclosure;
re_node_set *entrance_nodes;
- struct re_dfastate_t **trtable;
+ struct re_dfastate_t **trtable, **word_trtable;
unsigned int context : 4;
unsigned int halt : 1;
/* If this state can accept `multi byte'.
@@ -496,7 +496,6 @@ struct re_dfastate_t
/* If this state has backreference node(s). */
unsigned int has_backref : 1;
unsigned int has_constraint : 1;
- unsigned int word_trtable : 1;
};
typedef struct re_dfastate_t re_dfastate_t;
diff --git a/posix/regexec.c b/posix/regexec.c
index 91b48dd4a2..1b21b699e9 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -175,8 +175,8 @@ static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
re_node_set *cur_nodes, int cur_str,
int subexp_num, int type) internal_function;
-static re_dfastate_t **build_trtable (re_dfa_t *dfa,
- re_dfastate_t *state) internal_function;
+static int build_trtable (re_dfa_t *dfa,
+ re_dfastate_t *state) internal_function;
#ifdef RE_ENABLE_I18N
static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
const re_string_t *input, int idx) internal_function;
@@ -2218,7 +2218,6 @@ transit_state (err, mctx, state)
re_match_context_t *mctx;
re_dfastate_t *state;
{
- re_dfa_t *const dfa = mctx->dfa;
re_dfastate_t **trtable;
unsigned char ch;
@@ -2233,21 +2232,22 @@ transit_state (err, mctx, state)
#endif /* RE_ENABLE_I18N */
/* Then decide the next state with the single byte. */
- if (1)
+#if 0
+ if (0)
+ /* don't use transition table */
+ return transit_state_sb (err, mctx, state);
+#endif
+
+ /* Use transition table */
+ ch = re_string_fetch_byte (&mctx->input);
+ for (;;)
{
- /* Use transition table */
- ch = re_string_fetch_byte (&mctx->input);
trtable = state->trtable;
- if (trtable == NULL)
- {
- trtable = build_trtable (dfa, state);
- if (trtable == NULL)
- {
- *err = REG_ESPACE;
- return NULL;
- }
- }
- if (BE (state->word_trtable, 0))
+ if (BE (trtable != NULL, 1))
+ return trtable[ch];
+
+ trtable = state->word_trtable;
+ if (BE (trtable != NULL, 1))
{
unsigned int context;
context
@@ -2259,14 +2259,15 @@ transit_state (err, mctx, state)
else
return trtable[ch];
}
- else
- return trtable[ch];
+
+ if (!build_trtable (mctx->dfa, state))
+ {
+ *err = REG_ESPACE;
+ return NULL;
+ }
+
+ /* Retry, we now have a transition table. */
}
-#if 0
- else
- /* don't use transition table */
- return transit_state_sb (err, mctx, state);
-#endif
}
/* Update the state_log if we need */
@@ -3273,15 +3274,15 @@ expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
}
/* Build transition table for the state.
- Return the new table if succeeded, otherwise return NULL. */
+ Return 1 if succeeded, otherwise return 0. */
-static re_dfastate_t **
+static int
build_trtable (dfa, state)
re_dfa_t *dfa;
re_dfastate_t *state;
{
reg_errcode_t err;
- int i, j, ch;
+ int i, j, ch, need_word_trtable = 0;
unsigned int elem, mask;
int dests_node_malloced = 0, dest_states_malloced = 0;
int ndests; /* Number of the destination states from `state'. */
@@ -3298,20 +3299,20 @@ build_trtable (dfa, state)
#ifdef _LIBC
if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
dests_node = (re_node_set *)
- alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
+ alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
else
#endif
{
dests_node = (re_node_set *)
- malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
+ malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
if (BE (dests_node == NULL, 0))
- return NULL;
+ return 0;
dests_node_malloced = 1;
}
dests_ch = (bitset *) (dests_node + SBC_MAX);
/* Initialize transiton table. */
- state->word_trtable = 0;
+ state->word_trtable = state->trtable = NULL;
/* At first, group all nodes belonging to `state' into several
destinations. */
@@ -3320,14 +3321,14 @@ build_trtable (dfa, state)
{
if (dests_node_malloced)
free (dests_node);
- /* Return NULL in case of an error, trtable otherwise. */
+ /* Return 0 in case of an error, 1 otherwise. */
if (ndests == 0)
{
state->trtable = (re_dfastate_t **)
- calloc (sizeof (re_dfastate_t *), SBC_MAX);;
- return state->trtable;
+ calloc (sizeof (re_dfastate_t *), SBC_MAX);
+ return 1;
}
- return NULL;
+ return 0;
}
err = re_node_set_alloc (&follows, ndests + 1);
@@ -3338,12 +3339,12 @@ build_trtable (dfa, state)
if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
+ ndests * 3 * sizeof (re_dfastate_t *)))
dest_states = (re_dfastate_t **)
- alloca (ndests * 3 * sizeof (re_dfastate_t *));
+ alloca (ndests * 3 * sizeof (re_dfastate_t *));
else
#endif
{
dest_states = (re_dfastate_t **)
- malloc (ndests * 3 * sizeof (re_dfastate_t *));
+ malloc (ndests * 3 * sizeof (re_dfastate_t *));
if (BE (dest_states == NULL, 0))
{
out_free:
@@ -3354,7 +3355,7 @@ out_free:
re_node_set_free (dests_node + i);
if (dests_node_malloced)
free (dests_node);
- return NULL;
+ return 0;
}
dest_states_malloced = 1;
}
@@ -3390,9 +3391,8 @@ out_free:
if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
goto out_free;
- if (dest_states[i] != dest_states_word[i]
- && dfa->mb_cur_max > 1)
- state->word_trtable = 1;
+ if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
+ need_word_trtable = 1;
dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
CONTEXT_NEWLINE);
@@ -3407,13 +3407,14 @@ out_free:
bitset_merge (acceptable, dests_ch[i]);
}
- if (!BE (state->word_trtable, 0))
+ if (!BE (need_word_trtable, 0))
{
/* We don't care about whether the following character is a word
character, or we are in a single-byte character set so we can
discern by looking at the character code: allocate a
256-entry transition table. */
- trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
+ trtable = state->trtable =
+ (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
if (BE (trtable == NULL, 0))
goto out_free;
@@ -3443,8 +3444,8 @@ out_free:
by looking at the character code: build two 256-entry
transition tables, one starting at trtable[0] and one
starting at trtable[SBC_MAX]. */
- trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *),
- 2 * SBC_MAX);
+ trtable = state->word_trtable =
+ (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
if (BE (trtable == NULL, 0))
goto out_free;
@@ -3475,7 +3476,7 @@ out_free:
{
/* k-th destination accepts newline character. */
trtable[NEWLINE_CHAR] = dest_states_nl[j];
- if (state->word_trtable)
+ if (need_word_trtable)
trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
/* There must be only one destination which accepts
newline. See group_nodes_into_DFAstates. */
@@ -3493,8 +3494,7 @@ out_free:
if (dests_node_malloced)
free (dests_node);
- state->trtable = trtable;
- return trtable;
+ return 1;
}
/* Group all nodes belonging to STATE into several destinations.
diff --git a/stdio-common/Makefile b/stdio-common/Makefile
index 3a66f1d021..2e797e4dfe 100644
--- a/stdio-common/Makefile
+++ b/stdio-common/Makefile
@@ -1,4 +1,4 @@
-# Copyright (C) 1991-2002, 2003, 2004 Free Software Foundation, Inc.
+# Copyright (C) 1991-2002, 2003, 2004, 2005 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
# The GNU C Library is free software; you can redistribute it and/or
@@ -53,7 +53,7 @@ tests := tstscanf test_rdwr test-popen tstgetln test-fseek \
scanf11 scanf12 tst-tmpnam tst-cookie tst-obprintf tst-sscanf \
tst-swprintf tst-fseek tst-fmemopen test-vfprintf tst-gets \
tst-perror tst-sprintf tst-rndseek tst-fdopen tst-fphex bug14 bug15 \
- tst-popen tst-unlockedio
+ tst-popen tst-unlockedio tst-fmemopen2
test-srcs = tst-unbputc tst-printf
diff --git a/stdio-common/tst-fmemopen2.c b/stdio-common/tst-fmemopen2.c
new file mode 100644
index 0000000000..6a0ee836a2
--- /dev/null
+++ b/stdio-common/tst-fmemopen2.c
@@ -0,0 +1,67 @@
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+
+/* Exercise ftello, rewind and fseeko on a write-mode fmemopen stream
+   backed by a local buffer, then compare the buffer with the expected
+   contents once the stream has been closed.
+
+   Returns 0 when every check passes and 1 when any check fails.
+   NOTE(review): a failing fmemopen is reported with a message but still
+   returns 0 -- confirm that treating it as "not a failure" is intended.  */
+static int
+do_test (void)
+{
+  int result = 0;
+  char buf[100];
+  FILE *fp = fmemopen (buf, sizeof (buf), "w");
+  if (fp == NULL)
+    {
+      puts ("fmemopen failed");
+      return 0;
+    }
+  static const char str[] = "hello world";
+#define nstr (sizeof (str) - 1)
+  fputs (str, fp);
+  off_t o = ftello (fp);
+  if (o != nstr)
+    {
+      /* off_t has no printf conversion of its own and need not be a
+         long, so convert explicitly before printing.  */
+      printf ("first ftello returned %lld, expected %zu\n",
+              (long long) o, nstr);
+      result = 1;
+    }
+  rewind (fp);
+  o = ftello (fp);
+  if (o != 0)
+    {
+      /* The expected value is the constant 0; passing an int for a
+         %zu conversion would be undefined, so spell it in the format.  */
+      printf ("second ftello returned %lld, expected 0\n", (long long) o);
+      result = 1;
+    }
+  if (fseeko (fp, 0, SEEK_END) != 0)
+    {
+      puts ("fseeko failed");
+      return 1;
+    }
+  o = ftello (fp);
+  if (o != nstr)
+    {
+      printf ("third ftello returned %lld, expected %zu\n",
+              (long long) o, nstr);
+      result = 1;
+    }
+  rewind (fp);
+  static const char str2[] = "just hello";
+#define nstr2 (sizeof (str2) - 1)
+  /* The second string must be shorter so that the tail of the first
+     one is still visible in the buffer afterwards.  */
+  assert (nstr2 < nstr);
+  fputs (str2, fp);
+  o = ftello (fp);
+  if (o != nstr2)
+    {
+      printf ("fourth ftello returned %lld, expected %zu\n",
+              (long long) o, nstr2);
+      result = 1;
+    }
+  fclose (fp);
+  static const char str3[] = "just hellod";
+  if (strcmp (buf, str3) != 0)
+    {
+      printf ("final string is \"%s\", expected \"%s\"\n",
+              buf, str3);
+      result = 1;
+    }
+  return result;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
diff --git a/stdlib/Makefile b/stdlib/Makefile
index 5f4675033e..b766fb8656 100644
--- a/stdlib/Makefile
+++ b/stdlib/Makefile
@@ -1,4 +1,4 @@
-# Copyright (C) 1991-2002, 2003, 2004 Free Software Foundation, Inc.
+# Copyright (C) 1991-2002, 2003, 2004, 2005 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
# The GNU C Library is free software; you can redistribute it and/or
@@ -65,6 +65,11 @@ tests := tst-strtol tst-strtod testmb testrand testsort testdiv \
tst-rand48 bug-strtod tst-setcontext test-a64l tst-qsort \
tst-system testmb2
+include ../Makeconfig
+
+ifeq ($(build-shared),yes)
+tests += tst-putenv
+endif
# Several mpn functions from GNU MP are used by the strtod function.
mpn-routines := inlines add_n addmul_1 cmp divmod_1 divrem udiv_qrnnd \
@@ -75,9 +80,13 @@ routines := $(strip $(routines) $(mpn-routines)) \
dbl2mpn ldbl2mpn \
mpn2flt mpn2dbl mpn2ldbl
aux += fpioconst mp_clz_tab
-distribute := $(distribute) $(mpn-headers) gen-mpn-copy fpioconst.h
+distribute := $(distribute) $(mpn-headers) gen-mpn-copy fpioconst.h \
+ tst-putenvmod.c
+
+tests-extras += tst-putenvmod
+extra-objs += tst-putenvmod.os
-generated += isomac isomac.out
+generated += isomac isomac.out tst-putenvmod.so
CFLAGS-bsearch.c = $(uses-callbacks)
CFLAGS-msort.c = $(uses-callbacks)
@@ -85,8 +94,6 @@ CFLAGS-qsort.c = $(uses-callbacks)
CFLAGS-system.c = -fexceptions
CFLAGS-fmtmsg.c = -fexceptions
-include ../Makeconfig
-
ifneq (,$(filter %REENTRANT, $(defines)))
CFLAGS-strfmon.c = -D_IO_MTSAFE_IO
CFLAGS-strfmon_l.c = -D_IO_MTSAFE_IO
@@ -124,3 +131,9 @@ $(objpfx)isomac: isomac.c
$(objpfx)tst-fmtmsg.out: tst-fmtmsg.sh $(objpfx)tst-fmtmsg
$(SHELL) -e $< $(common-objpfx) '$(run-program-prefix)' $(common-objpfx)stdlib/
+
+$(objpfx)tst-putenv: $(objpfx)tst-putenvmod.so
+
+$(objpfx)tst-putenvmod.so: $(objpfx)tst-putenvmod.os
+ $(build-module)
+CFLAGS-tst-putenvmod.c = -DNOT_IN_libc=1
diff --git a/stdlib/tst-putenv.c b/stdlib/tst-putenv.c
new file mode 100644
index 0000000000..47513ea42b
--- /dev/null
+++ b/stdlib/tst-putenv.c
@@ -0,0 +1,18 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Succeed only if SOMETHING_NOBODY_USES is present in the environment.
+   Returns 0 when getenv finds the variable; otherwise prints a
+   diagnostic and returns 1.
+   NOTE(review): presumably the variable is installed by the constructor
+   in tst-putenvmod.c -- confirm against the build rules.  */
+static int
+do_test (void)
+{
+  if (getenv ("SOMETHING_NOBODY_USES") != NULL)
+    return 0;
+
+  puts ("envvar not defined");
+  return 1;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
diff --git a/stdlib/tst-putenvmod.c b/stdlib/tst-putenvmod.c
new file mode 100644
index 0000000000..7c0c337d3f
--- /dev/null
+++ b/stdlib/tst-putenvmod.c
@@ -0,0 +1,17 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+/* Constructor: print "init DSO", then add
+   SOMETHING_NOBODY_USES=something_else to the environment with putenv.
+   If putenv reports failure, print a diagnostic and terminate the
+   process with _exit (1).  */
+void
+__attribute ((constructor))
+init (void)
+{
+  /* Static storage, so the array handed to putenv outlives this call.  */
+  static char env_entry[] = "SOMETHING_NOBODY_USES=something_else";
+
+  puts ("init DSO");
+
+  if (putenv (env_entry) == 0)
+    return;
+
+  puts ("putenv failed");
+  _exit (1);
+}
diff --git a/sysdeps/generic/dl-tls.c b/sysdeps/generic/dl-tls.c
index 3382e3493c..2282dda9cc 100644
--- a/sysdeps/generic/dl-tls.c
+++ b/sysdeps/generic/dl-tls.c
@@ -577,7 +577,7 @@ __tls_get_addr (GET_ADDR_ARGS)
{
size_t cnt;
- for (cnt = total = 0 ? 1 : 0; cnt < listp->len; ++cnt)
+ for (cnt = total == 0 ? 1 : 0; cnt < listp->len; ++cnt)
{
size_t gen = listp->slotinfo[cnt].gen;
struct link_map *map;
diff --git a/sysdeps/generic/libc-start.c b/sysdeps/generic/libc-start.c
index fc9df40996..ad5ebe0911 100644
--- a/sysdeps/generic/libc-start.c
+++ b/sysdeps/generic/libc-start.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1998-2003, 2004 Free Software Foundation, Inc.
+/* Copyright (C) 1998-2003, 2004, 2005 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -106,9 +106,9 @@ LIBC_START_MAIN (int (*main) (int, char **, char ** MAIN_AUXVEC_DECL),
__libc_multiple_libcs = &_dl_starting_up && !_dl_starting_up;
+#ifndef SHARED
INIT_ARGV_and_ENVIRON;
-#ifndef SHARED
/* Store the lowest stack address. This is done in ld.so if this is
the code for the DSO. */
__libc_stack_end = stack_end;
diff --git a/sysdeps/i386/configure b/sysdeps/i386/configure
new file mode 100755
index 0000000000..d1d4dc15a7
--- /dev/null
+++ b/sysdeps/i386/configure
@@ -0,0 +1,54 @@
+# This file is generated from configure.in by Autoconf. DO NOT EDIT!
+ # Local configure fragment for sysdeps/i386.
+
+echo "$as_me:$LINENO: checking if -g produces usable source locations for assembler-with-cpp" >&5
+echo $ECHO_N "checking if -g produces usable source locations for assembler-with-cpp... $ECHO_C" >&6
+if test "${libc_cv_cpp_asm_debuginfo+set}" = set; then
+ echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+ cat > conftest.S <<EOF
+#include "confdefs.h"
+
+/* comment on
+ two lines */
+ ${libc_cv_dot_text}
+ ${libc_cv_asm_global_directive} foo
+foo:
+ /* Unfortunately this test only works for a real instruction,
+ not for any of the machine-independent pseudo-ops.
+ So we just have to assume everybody has a "nop". */
+ nop
+ /* comment */
+ nop
+ /* comment */
+ nop
+EOF
+if { ac_try='${CC-cc} $CPPFLAGS $ASFLAGS -g -c conftest.S 1>&5'
+ { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; } && {
+ ac_pattern='conftest\.S'
+ { ac_try='readelf --debug-dump=line conftest.o |
+ grep $ac_pattern 1>&5'
+ { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+ (eval $ac_try) 2>&5
+ ac_status=$?
+ echo "$as_me:$LINENO: \$? = $ac_status" >&5
+ (exit $ac_status); }; }
+ }; then
+ libc_cv_cpp_asm_debuginfo=yes
+else
+ libc_cv_cpp_asm_debuginfo=no
+fi
+rm -f conftest*
+fi
+echo "$as_me:$LINENO: result: $libc_cv_cpp_asm_debuginfo" >&5
+echo "${ECHO_T}$libc_cv_cpp_asm_debuginfo" >&6
+if test $libc_cv_cpp_asm_debuginfo = yes; then
+ cat >>confdefs.h <<\_ACEOF
+#define HAVE_CPP_ASM_DEBUGINFO 1
+_ACEOF
+
+fi
diff --git a/sysdeps/i386/configure.in b/sysdeps/i386/configure.in
new file mode 100644
index 0000000000..028e1ae8e1
--- /dev/null
+++ b/sysdeps/i386/configure.in
@@ -0,0 +1,35 @@
+GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
+# Local configure fragment for sysdeps/i386.
+
+AC_CACHE_CHECK(if -g produces usable source locations for assembler-with-cpp,
+ libc_cv_cpp_asm_debuginfo, [dnl
+cat > conftest.S <<EOF
+#include "confdefs.h"
+
+/* comment on
+ two lines */
+ ${libc_cv_dot_text}
+ ${libc_cv_asm_global_directive} foo
+foo:
+ /* Unfortunately this test only works for a real instruction,
+ not for any of the machine-independent pseudo-ops.
+ So we just have to assume everybody has a "nop". */
+ nop
+ /* comment */
+ nop
+ /* comment */
+ nop
+EOF
+if AC_TRY_COMMAND([${CC-cc} $CPPFLAGS $ASFLAGS -g -c conftest.S 1>&AS_MESSAGE_LOG_FD]) && {
+ ac_pattern='conftest\.S'
+ AC_TRY_COMMAND([readelf --debug-dump=line conftest.o |
+ grep $ac_pattern 1>&AS_MESSAGE_LOG_FD])
+ }; then
+ libc_cv_cpp_asm_debuginfo=yes
+else
+ libc_cv_cpp_asm_debuginfo=no
+fi
+rm -f conftest*])AC_SUBST(libc_cv_cpp_asm_debuginfo)
+if test $libc_cv_cpp_asm_debuginfo = yes; then
+ AC_DEFINE(HAVE_CPP_ASM_DEBUGINFO)
+fi
diff --git a/sysdeps/i386/tst-stack-align.h b/sysdeps/i386/tst-stack-align.h
new file mode 100644
index 0000000000..6297d9faa8
--- /dev/null
+++ b/sysdeps/i386/tst-stack-align.h
@@ -0,0 +1,42 @@
+/* Copyright (C) 2004 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <stdio.h>
+#include <stdint.h>
+
+typedef struct { int i[4]; } int_al16 __attribute__((aligned (16)));
+
+/* Statement-expression macro that checks the alignment of three locals
+   declared inside it: an int_al16, a double and a long double.  For each
+   one it prints the address and the type's __alignof value, and tests
+   whether the address is a multiple of that value.  The expression
+   evaluates to 0 if all three are aligned and to 1 otherwise.  */
+#define TEST_STACK_ALIGN() \
+ ({ \
+ int_al16 _m; \
+ double _d = 12.0; \
+ long double _ld = 15.0; \
+ int _ret = 0; \
+ printf ("int_al16: %p %zu\n", &_m, __alignof (int_al16)); \
+ if ((((uintptr_t) &_m) & (__alignof (int_al16) - 1)) != 0) \
+ _ret = 1; \
+ \
+ printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \
+ if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \
+ _ret = 1; \
+ \
+ printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \
+ if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \
+ _ret = 1; \
+ _ret; \
+ })
diff --git a/sysdeps/ia64/fpu/Makefile b/sysdeps/ia64/fpu/Makefile
index 6d1b0c1717..7ec30c43d3 100644
--- a/sysdeps/ia64/fpu/Makefile
+++ b/sysdeps/ia64/fpu/Makefile
@@ -1,8 +1,33 @@
ifeq ($(subdir),math)
-libm-sysdep_routines += libm_atan2_reg s_matherrf s_matherrl libm_reduce \
- libm_tan libm_error \
- libm_frexp4 libm_frexp4f libm_frexp4l
+#
+# Some files which need to go both into libc and libm have external
+# dependencies which need to be resolved differently for libc
+# vs. libm. For example, inside libc, __libm_error_support needs to
+# resolve to HIDDEN_JUMPTARGET(__libm_error_support) whereas within
+# libm it always resolves to __libm_error_support. Such files need to
+# be compiled twice. Fortunately, math/Makefile already has logic to
+# support this: if a file starts with "s_", make will automatically
+# generate a matching file whose name starts with "m_" which simply
+# includes the corresponding "s_" file.
+#
+duplicated-routines = s_libm_ldexp s_libm_ldexpf s_libm_ldexpl \
+ s_libm_scalbn s_libm_scalbnf s_libm_scalbnl
-sysdep_routines += libm_frexp4 libm_frexp4f libm_frexp4l libc_libm_error
-sysdep-CPPFLAGS += -DSIZE_INT_32
+libm-sysdep_routines += s_erfc s_erfcf s_erfcl \
+ s_matherrf s_matherrl libm_reduce \
+ libm_error \
+ libm_frexp libm_frexpf libm_frexpl \
+ libm_sincos libm_sincosf libm_sincosl \
+ libm_sincos_large \
+ libm_lgamma libm_lgammaf libm_lgammal \
+ libm_scalblnf \
+ $(duplicated-routines:s_%=m_%)
+
+sysdep_routines += libc_libm_error libm_frexp libm_frexpf libm_frexpl \
+ $(duplicated-routines)
+
+sysdep-CPPFLAGS += -include libm-symbols.h \
+ -D__POSIX__ \
+ -D_LIB_VERSIONIMF=_LIB_VERSION \
+ -DSIZE_INT_32 -DSIZE_LONG_INT_64 -DSIZE_LONG_LONG_INT_64
endif
diff --git a/sysdeps/ia64/fpu/README b/sysdeps/ia64/fpu/README
new file mode 100644
index 0000000000..6f4af0678a
--- /dev/null
+++ b/sysdeps/ia64/fpu/README
@@ -0,0 +1,50 @@
+ ----------------------------------------------------------
+ Notes on how to update libm based on Intel's libm releases
+ ----------------------------------------------------------
+
+The source code in this directory is currently based on Intel libm
+v2.1 as available from:
+
+ http://www.intel.com/software/products/opensource/libraries/num.htm
+
+To ease importing, fix some bugs, and simplify integration into libc,
+it is also necessary to apply the patch at:
+
+ ftp://ftp.hpl.hp.com/pub/linux-ia64/intel-libm-041228.diff.gz
+
+The expectation is that Intel will integrate most if not all of these
+changes into future releases of libm, so this patching step can
+hopefully be omitted in the future.
+
+Once the patched libm sources are extracted in a directory $LIBM, they
+can be imported into the libc source tree at $LIBC with the following
+step:
+
+ $ cd $LIBC/src/sysdep/ia64/fpu
+ $ ./import_intel_libm $LIBM
+
+This should produce a number of "Importing..." messages, without
+showing any errors.
+
+At this point, you should be able to build glibc in the usual fashion.
+We assume you do this in directory $OBJ. Once the build has
+completed, run "make check" to verify that all (math) checks succeed.
+If these checks succeed, you should also run the following commands to
+verify that the new libm doesn't pollute the name-space and has proper
+size-info for the data objects:
+
+ $ cd $LIBC/src/sysdep/ia64/fpu
+ $ import_check $OBJ/math/
+
+There should be no (unexpected) errors reported by this script.
+
+As an optional step, you may also want to confirm that the new libm
+exports the exact same global symbols as the old one.
+
+If you want to see the changes introduced by the "import_intel_libm"
+script, you can run the commands:
+
+ $ cd $LIBC/src/sysdep/ia64/fpu
+ $ import_diffs
+
+That's it.
diff --git a/sysdeps/ia64/fpu/e_acos.S b/sysdeps/ia64/fpu/e_acos.S
index 7e83811727..b515f01a1e 100644
--- a/sysdeps/ia64/fpu/e_acos.S
+++ b/sysdeps/ia64/fpu/e_acos.S
@@ -1,10 +1,10 @@
.file "acos.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003 Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,9 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// WARRANTY DISCLAIMER
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -37,838 +35,800 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
// History
//==============================================================
-// 2/02/00 Initial version
-// 8/17/00 New and much faster algorithm.
-// 8/30/00 Avoided bank conflicts on loads, shortened |x|=1 and x=0 paths,
+// 02/02/00 Initial version
+// 08/17/00 New and much faster algorithm.
+// 08/30/00 Avoided bank conflicts on loads, shortened |x|=1 and x=0 paths,
// fixed mfb split issue stalls.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 08/02/02 New and much faster algorithm II
+// 02/06/03 Reordered header: .section, .global, .proc, .align
// Description
//=========================================
-// The acos function computes the principle value of the arc sine of x.
+// The acos function computes the principal value of the arc cosine of x.
+// acos(0) returns Pi/2, acos(1) returns 0, acos(-1) returns Pi.
// A doman error occurs for arguments not in the range [-1,+1].
+//
+// The acos function returns the arc cosine in the range [0, Pi] radians.
+//
+// There are 8 paths:
+// 1. x = +/-0.0
+// Return acos(x) = Pi/2 + x
+//
+// 2. 0.0 < |x| < 0.625
+// Return acos(x) = Pi/2 - x - x^3 *PolA(x^2)
+// where PolA(x^2) = A3 + A5*x^2 + A7*x^4 +...+ A35*x^32
+//
+// 3. 0.625 <=|x| < 1.0
+// Return acos(x) = Pi/2 - asin(x) =
+// = Pi/2 - sign(x) * ( Pi/2 - sqrt(R) * PolB(R))
+// Where R = 1 - |x|,
+// PolB(R) = B0 + B1*R + B2*R^2 +...+B12*R^12
+//
+// sqrt(R) is approximated using the following sequence:
+// y0 = (1 + eps)/sqrt(R) - initial approximation by frsqrta,
+// |eps| < 2^(-8)
+// Then 3 iterations are used to refine the result:
+// H0 = 0.5*y0
+// S0 = R*y0
+//
+// d0 = 0.5 - H0*S0
+// H1 = H0 + d0*H0
+// S1 = S0 + d0*S0
+//
+// d1 = 0.5 - H1*S1
+// H2 = H1 + d1*H1
+// S2 = S1 + d1*S1
+//
+// d2 = 0.5 - H2*S2
+// S3 = S2 + d2*S2
+//
+// S3 approximates sqrt(R) with enough accuracy for this algorithm
+//
+// So, the result should be reconstructed as follows:
+// acos(x) = Pi/2 - sign(x) * (Pi/2 - S3*PolB(R))
+//
+// But for optimization purposes the reconstruction step is slightly
+// changed:
+// acos(x) = Cpi + sign(x)*PolB(R)*S2 - sign(x)*d2*S2*PolB(R)
+// where Cpi = 0 if x > 0 and Cpi = Pi if x < 0
+//
+// 4. |x| = 1.0
+// Return acos(1.0) = 0.0, acos(-1.0) = Pi
+//
+// 5. 1.0 < |x| <= +INF
+// A domain error occurs for arguments not in the range [-1,+1]
+//
+// 6. x = [S,Q]NaN
+// Return acos(x) = QNaN
+//
+// 7. x is denormal
+// Return acos(x) = Pi/2 - x,
+//
+// 8. x is unnormal
+// Normalize input in f8 and return to the very beginning of the function
+//
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input, output
+// f6, f7, f9 -> f15, f32 -> f64
-// The acos function returns the arc cosine in the range [0, +pi] radians.
-// acos(1) returns +0, acos(-1) returns pi, acos(0) returns pi/2.
-// acos(x) returns a Nan and raises the invalid exception for |x| >1
+// General registers used:
+// r3, r21 -> r31, r32 -> r38
-// The acos function is just like asin except that pi/2 is added at the end.
+// Predicate registers used:
+// p0, p6 -> p14
//
// Assembly macros
//=========================================
-
-#include "libm_support.h"
-
-// predicate registers
-//acos_pred_LEsqrt2by2 = p7
-//acos_pred_GTsqrt2by2 = p8
-
-// integer registers
-ASIN_Addr1 = r33
-ASIN_Addr2 = r34
-ASIN_FFFE = r35
-
-GR_SAVE_B0 = r36
-GR_SAVE_PFS = r37
-GR_SAVE_GP = r38
-
-GR_Parameter_X = r39
-GR_Parameter_Y = r40
-GR_Parameter_RESULT = r41
-GR_Parameter_Tag = r42
-
-// floating point registers
-acos_coeff_P1 = f32
-acos_coeff_P2 = f33
-acos_coeff_P3 = f34
-acos_coeff_P4 = f35
-
-acos_coeff_P5 = f36
-acos_coeff_P6 = f37
-acos_coeff_P7 = f38
-acos_coeff_P8 = f39
-acos_coeff_P9 = f40
-
-acos_coeff_P10 = f41
-acos_coeff_P11 = f42
-acos_coeff_P12 = f43
-acos_coeff_P13 = f44
-acos_coeff_P14 = f45
-
-acos_coeff_P15 = f46
-acos_coeff_P16 = f47
-acos_coeff_P17 = f48
-acos_coeff_P18 = f49
-acos_coeff_P19 = f50
-
-acos_coeff_P20 = f51
-acos_coeff_P21 = f52
-acos_const_sqrt2by2 = f53
-acos_const_piby2 = f54
-acos_abs_x = f55
-
-acos_tx = f56
-acos_tx2 = f57
-acos_tx3 = f58
-acos_tx4 = f59
-acos_tx8 = f60
-
-acos_tx11 = f61
-acos_1poly_p8 = f62
-acos_1poly_p19 = f63
-acos_1poly_p4 = f64
-acos_1poly_p15 = f65
-
-acos_1poly_p6 = f66
-acos_1poly_p17 = f67
-acos_1poly_p0 = f68
-acos_1poly_p11 = f69
-acos_1poly_p2 = f70
-
-acos_1poly_p13 = f71
-acos_series_tx = f72
-acos_t = f73
-acos_t2 = f74
-acos_t3 = f75
-
-acos_t4 = f76
-acos_t8 = f77
-acos_t11 = f78
-acos_poly_p8 = f79
-acos_poly_p19 = f80
-
-acos_poly_p4 = f81
-acos_poly_p15 = f82
-acos_poly_p6 = f83
-acos_poly_p17 = f84
-acos_poly_p0 = f85
-
-acos_poly_p11 = f86
-acos_poly_p2 = f87
-acos_poly_p13 = f88
-acos_series_t = f89
-acos_1by2 = f90
-
-acos_3by2 = f91
-acos_5by2 = f92
-acos_11by4 = f93
-acos_35by8 = f94
-acos_63by8 = f95
-
-acos_231by16 = f96
-acos_y0 = f97
-acos_H0 = f98
-acos_S0 = f99
-acos_d = f100
-
-acos_l1 = f101
-acos_d2 = f102
-acos_T0 = f103
-acos_d1 = f104
-acos_e0 = f105
-
-acos_l2 = f106
-acos_d3 = f107
-acos_T3 = f108
-acos_S1 = f109
-acos_e1 = f110
-
-acos_z = f111
-answer2 = f112
-acos_sgn_x = f113
-acos_429by16 = f114
-acos_18by4 = f115
-
-acos_3by4 = f116
-acos_l3 = f117
-acos_T6 = f118
-acos_const_add = f119
+// integer registers used
+// scratch
+rTblAddr = r3
+
+rPiBy2Ptr = r21
+rTmpPtr3 = r22
+rDenoBound = r23
+rOne = r24
+rAbsXBits = r25
+rHalf = r26
+r0625 = r27
+rSign = r28
+rXBits = r29
+rTmpPtr2 = r30
+rTmpPtr1 = r31
+
+// stacked
+GR_SAVE_PFS = r32
+GR_SAVE_B0 = r33
+GR_SAVE_GP = r34
+GR_Parameter_X = r35
+GR_Parameter_Y = r36
+GR_Parameter_RESULT = r37
+GR_Parameter_TAG = r38
+
+// floating point registers used
+FR_X = f10
+FR_Y = f1
+FR_RESULT = f8
+
+
+// scratch
+fXSqr = f6
+fXCube = f7
+fXQuadr = f9
+f1pX = f10
+f1mX = f11
+f1pXRcp = f12
+f1mXRcp = f13
+fH = f14
+fS = f15
+// stacked
+fA3 = f32
+fB1 = f32
+fA5 = f33
+fB2 = f33
+fA7 = f34
+fPiBy2 = f34
+fA9 = f35
+fA11 = f36
+fB10 = f35
+fB11 = f36
+fA13 = f37
+fA15 = f38
+fB4 = f37
+fB5 = f38
+fA17 = f39
+fA19 = f40
+fB6 = f39
+fB7 = f40
+fA21 = f41
+fA23 = f42
+fB3 = f41
+fB8 = f42
+fA25 = f43
+fA27 = f44
+fB9 = f43
+fB12 = f44
+fA29 = f45
+fA31 = f46
+fA33 = f47
+fA35 = f48
+fBaseP = f49
+fB0 = f50
+fSignedS = f51
+fD = f52
+fHalf = f53
+fR = f54
+fCloseTo1Pol = f55
+fSignX = f56
+fDenoBound = f57
+fNormX = f58
+fX8 = f59
+fRSqr = f60
+fRQuadr = f61
+fR8 = f62
+fX16 = f63
+fCpi = f64
// Data tables
//==============================================================
-
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
-
+RODATA
.align 16
-
-acos_coeff_1_table:
-ASM_TYPE_DIRECTIVE(acos_coeff_1_table,@object)
-data8 0xE4E7E0A423A21249 , 0x00003FF8 //P7
-data8 0xC2F7EE0200FCE2A5 , 0x0000C003 //P18
-data8 0xB745D7F6C65C20E0 , 0x00003FF9 //P5
-data8 0xF75E381A323D4D94 , 0x0000C002 //P16
-data8 0x8959C2629C1024C0 , 0x0000C002 //P20
-data8 0xAFF68E7D241292C5 , 0x00003FF8 //P9
-data8 0xB6DB6DB7260AC30D , 0x00003FFA //P3
-data8 0xD0417CE2B41CB7BF , 0x0000C000 //P14
-data8 0x81D570FEA724E3E4 , 0x0000BFFD //P12
-data8 0xAAAAAAAAAAAAC277 , 0x00003FFC //P1
-data8 0xF534912FF3E7B76F , 0x00003FFF //P21
-data8 0xc90fdaa22168c235 , 0x00003fff // pi/2
-data8 0x0000000000000000 , 0x00000000 // pad to avoid bank conflicts
-ASM_SIZE_DIRECTIVE(acos_coeff_1_table)
-
-
-acos_coeff_2_table:
-ASM_TYPE_DIRECTIVE(acos_coeff_2_table,@object)
-data8 0x8E26AF5F29B39A2A , 0x00003FF9 //P6
-data8 0xB4F118A4B1015470 , 0x00004003 //P17
-data8 0xF8E38E10C25990E0 , 0x00003FF9 //P4
-data8 0x80F50489AEF1CAC6 , 0x00004002 //P15
-data8 0x92728015172CFE1C , 0x00004003 //P19
-data8 0xBBC3D831D4595971 , 0x00003FF8 //P8
-data8 0x999999999952A5C3 , 0x00003FFB //P2
-data8 0x855576BE6F0975EC , 0x00003FFF //P13
-data8 0xF12420E778077D89 , 0x00003FFA //P11
-data8 0xB6590FF4D23DE003 , 0x00003FF3 //P10
-data8 0xb504f333f9de6484 , 0x00003ffe // sqrt(2)/2
-ASM_SIZE_DIRECTIVE(acos_coeff_2_table)
-
-
-.align 32
-.global acos
-ASM_TYPE_DIRECTIVE(acos,@function)
+LOCAL_OBJECT_START(acos_base_range_table)
+// Ai: Polynomial coefficients for the acos(x), |x| < .625000
+// Bi: Polynomial coefficients for the acos(x), |x| > .625000
+data8 0xBFDAAB56C01AE468 //A29
+data8 0x3FE1C470B76A5B2B //A31
+data8 0xBFDC5FF82A0C4205 //A33
+data8 0x3FC71FD88BFE93F0 //A35
+data8 0xB504F333F9DE6487, 0x00003FFF //B0
+data8 0xAAAAAAAAAAAAFC18, 0x00003FFC //A3
+data8 0x3F9F1C71BC4A7823 //A9
+data8 0x3F96E8BBAAB216B2 //A11
+data8 0x3F91C4CA1F9F8A98 //A13
+data8 0x3F8C9DDCEDEBE7A6 //A15
+data8 0x3F877784442B1516 //A17
+data8 0x3F859C0491802BA2 //A19
+data8 0x9999999998C88B8F, 0x00003FFB //A5
+data8 0x3F6BD7A9A660BF5E //A21
+data8 0x3F9FC1659340419D //A23
+data8 0xB6DB6DB798149BDF, 0x00003FFA //A7
+data8 0xBFB3EF18964D3ED3 //A25
+data8 0x3FCD285315542CF2 //A27
+data8 0xF15BEEEFF7D2966A, 0x00003FFB //B1
+data8 0x3EF0DDA376D10FB3 //B10
+data8 0xBEB83CAFE05EBAC9 //B11
+data8 0x3F65FFB67B513644 //B4
+data8 0x3F5032FBB86A4501 //B5
+data8 0x3F392162276C7CBA //B6
+data8 0x3F2435949FD98BDF //B7
+data8 0xD93923D7FA08341C, 0x00003FF9 //B2
+data8 0x3F802995B6D90BDB //B3
+data8 0x3F10DF86B341A63F //B8
+data8 0xC90FDAA22168C235, 0x00003FFF // Pi/2
+data8 0x3EFA3EBD6B0ECB9D //B9
+data8 0x3EDE18BA080E9098 //B12
+LOCAL_OBJECT_END(acos_base_range_table)
.section .text
-.proc acos
-.align 32
-
-
-acos:
-
-{ .mfi
- alloc r32 = ar.pfs,1,6,4,0
- fma.s1 acos_tx = f8,f8,f0
- addl ASIN_Addr2 = @ltoff(acos_coeff_2_table),gp
-}
-{ .mfi
- mov ASIN_FFFE = 0xFFFE
- fnma.s1 acos_t = f8,f8,f1
- addl ASIN_Addr1 = @ltoff(acos_coeff_1_table),gp
+GLOBAL_LIBM_ENTRY(acos)
+acos_unnormal_back:
+{ .mfi
+ getf.d rXBits = f8 // grab bits of input value
+ // set p12 = 1 if x is a NaN, denormal, or zero
+ fclass.m p12, p0 = f8, 0xcf
+ adds rSign = 1, r0
+}
+{ .mfi
+ addl rTblAddr = @ltoff(acos_base_range_table),gp
+ // 1 - x = 1 - |x| for positive x
+ fms.s1 f1mX = f1, f1, f8
+ addl rHalf = 0xFFFE, r0 // exponent of 1/2
}
;;
-
-
-{ .mfi
- setf.exp acos_1by2 = ASIN_FFFE
- fmerge.s acos_abs_x = f1,f8
- nop.i 999 ;;
-}
-
-
-{ .mmf
- ld8 ASIN_Addr1 = [ASIN_Addr1]
- ld8 ASIN_Addr2 = [ASIN_Addr2]
- fmerge.s acos_sgn_x = f8,f1
-}
-;;
-
-
-{ .mfi
- nop.m 999
- fcmp.lt.s1 p11,p12 = f8, f0
- nop.i 999 ;;
-}
-
-
-{ .mfi
- ldfe acos_coeff_P7 = [ASIN_Addr1],16
- fma.s1 acos_tx2 = acos_tx,acos_tx,f0
- nop.i 999
-}
-{ .mfi
- ldfe acos_coeff_P6 = [ASIN_Addr2],16
- fma.s1 acos_t2 = acos_t,acos_t,f0
- nop.i 999;;
+{ .mfi
+ addl r0625 = 0x3FE4, r0 // high 16 bits of 0.625
+ // set p8 = 1 if x < 0
+ fcmp.lt.s1 p8, p9 = f8, f0
+ shl rSign = rSign, 63 // sign bit
+}
+{ .mfi
+ // point to the beginning of the table
+ ld8 rTblAddr = [rTblAddr]
+ // 1 + x = 1 - |x| for negative x
+ fma.s1 f1pX = f1, f1, f8
+ adds rOne = 0x3FF, r0
}
-
-
-{ .mmf
- ldfe acos_coeff_P18 = [ASIN_Addr1],16
- ldfe acos_coeff_P17 = [ASIN_Addr2],16
- fclass.m.unc p8,p0 = f8, 0xc3 //@qnan |@snan
-}
;;
-
-
-{ .mmf
- ldfe acos_coeff_P5 = [ASIN_Addr1],16
- ldfe acos_coeff_P4 = [ASIN_Addr2],16
- frsqrta.s1 acos_y0,p0 = acos_t
-}
+{ .mfi
+ andcm rAbsXBits = rXBits, rSign // bits of |x|
+ fmerge.s fSignX = f8, f1 // signum(x)
+ shl r0625 = r0625, 48 // bits of DP representation of 0.625
+}
+{ .mfb
+ setf.exp fHalf = rHalf // load 0.5 to FP reg
+ fma.s1 fXSqr = f8, f8, f0 // x^2
+ // branch on special path if x is a NaN, denormal, or zero
+(p12) br.cond.spnt acos_special
+}
;;
-
-
-{ .mfi
- ldfe acos_coeff_P16 = [ASIN_Addr1],16
- fcmp.gt.s1 p9,p0 = acos_abs_x,f1
- nop.i 999
-}
-{ .mfb
- ldfe acos_coeff_P15 = [ASIN_Addr2],16
-(p8) fma.d f8 = f8,f1,f0
-(p8) br.ret.spnt b0
+{ .mfi
+ adds rPiBy2Ptr = 272, rTblAddr
+ nop.f 0
+ shl rOne = rOne, 52 // bits of 1.0
+}
+{ .mfi
+ adds rTmpPtr1 = 16, rTblAddr
+ nop.f 0
+ // set p6 = 1 if |x| < 0.625
+ cmp.lt p6, p7 = rAbsXBits, r0625
}
;;
-
-
-{ .mmf
- ldfe acos_coeff_P20 = [ASIN_Addr1],16
- ldfe acos_coeff_P19 = [ASIN_Addr2],16
- fclass.m.unc p10,p0 = f8, 0x07 //@zero
-}
+{ .mfi
+ ldfpd fA29, fA31 = [rTblAddr] // A29, fA31
+ // 1 - x = 1 - |x| for positive x
+(p9) fms.s1 fR = f1, f1, f8
+ // point to coefficient of "near 1" polynomial
+(p7) adds rTmpPtr2 = 176, rTblAddr
+}
+{ .mfi
+ ldfpd fA33, fA35 = [rTmpPtr1], 16 // A33, fA35
+ // 1 + x = 1 - |x| for negative x
+(p8) fma.s1 fR = f1, f1, f8
+(p6) adds rTmpPtr2 = 48, rTblAddr
+}
;;
-
-
-{ .mfi
- ldfe acos_coeff_P9 = [ASIN_Addr1],16
- fma.s1 acos_t4 = acos_t2,acos_t2,f0
-(p9) mov GR_Parameter_Tag = 58
-}
-{ .mfi
- ldfe acos_coeff_P8 = [ASIN_Addr2],16
- fma.s1 acos_3by2 = acos_1by2,f1,f1
- nop.i 999;;
+{ .mfi
+ ldfe fB0 = [rTmpPtr1], 16 // B0
+ nop.f 0
+ nop.i 0
}
-
-
-{ .mfi
- ldfe acos_coeff_P2 = [ASIN_Addr2],16
- fma.s1 acos_tx4 = acos_tx2,acos_tx2,f0
- nop.i 999
-}
-{ .mfb
- ldfe acos_coeff_P3 = [ASIN_Addr1],16
- fma.s1 acos_t3 = acos_t,acos_t2,f0
-(p9) br.cond.spnt __libm_error_region
+{ .mib
+ adds rTmpPtr3 = 16, rTmpPtr2
+ // set p10 = 1 if |x| = 1.0
+ cmp.eq p10, p0 = rAbsXBits, rOne
+ // branch on special path for |x| = 1.0
+(p10) br.cond.spnt acos_abs_1
}
;;
-
-
-{ .mfi
- ldfe acos_coeff_P13 = [ASIN_Addr2],16
- fma.s1 acos_H0 = acos_y0,acos_1by2,f0
- nop.i 999
-}
-{ .mfi
- ldfe acos_coeff_P14 = [ASIN_Addr1],16
- fma.s1 acos_S0 = acos_y0,acos_t,f0
- nop.i 999;;
+{ .mfi
+ ldfe fA3 = [rTmpPtr2], 48 // A3 or B1
+ nop.f 0
+ adds rTmpPtr1 = 64, rTmpPtr3
}
-
-
-{ .mfi
- ldfe acos_coeff_P11 = [ASIN_Addr2],16
- fcmp.eq.s1 p6,p0 = acos_abs_x, f1
- nop.i 999
-}
-{ .mfi
- ldfe acos_coeff_P12 = [ASIN_Addr1],16
- fma.s1 acos_tx3 = acos_tx,acos_tx2,f0
- nop.i 999
+{ .mib
+ ldfpd fA9, fA11 = [rTmpPtr3], 16 // A9, A11 or B10, B11
+ // set p11 = 1 if |x| > 1.0
+ cmp.gt p11, p0 = rAbsXBits, rOne
+ // branch on special path for |x| > 1.0
+(p11) br.cond.spnt acos_abs_gt_1
}
;;
-
-
-{ .mfi
- ldfe acos_coeff_P10 = [ASIN_Addr2],16
- fma.s1 acos_1poly_p6 = acos_tx,acos_coeff_P7,acos_coeff_P6
- nop.i 999
-}
-{ .mfi
- ldfe acos_coeff_P1 = [ASIN_Addr1],16
- fma.s1 acos_poly_p6 = acos_t,acos_coeff_P7,acos_coeff_P6
- nop.i 999;;
+{ .mfi
+ ldfpd fA17, fA19 = [rTmpPtr2], 16 // A17, A19 or B6, B7
+ // initial approximation of 1 / sqrt(1 - x)
+ frsqrta.s1 f1mXRcp, p0 = f1mX
+ nop.i 0
}
-
-
-{ .mfi
- ldfe acos_const_sqrt2by2 = [ASIN_Addr2],16
- fma.s1 acos_5by2 = acos_3by2,f1,f1
- nop.i 999
-}
-{ .mfi
- ldfe acos_coeff_P21 = [ASIN_Addr1],16
- fma.s1 acos_11by4 = acos_3by2,acos_3by2,acos_1by2
- nop.i 999;;
+{ .mfi
+ ldfpd fA13, fA15 = [rTmpPtr3] // A13, A15 or B4, B5
+ fma.s1 fXCube = fXSqr, f8, f0 // x^3
+ nop.i 0
}
-
-
-{ .mfi
- ldfe acos_const_piby2 = [ASIN_Addr1],16
- fma.s1 acos_poly_p17 = acos_t,acos_coeff_P18,acos_coeff_P17
- nop.i 999
-}
-{ .mfb
- nop.m 999
- fma.s1 acos_3by4 = acos_3by2,acos_1by2,f0
-(p10) br.cond.spnt L(ACOS_ZERO) // Branch to short path if x=0
+;;
+{ .mfi
+ ldfe fA5 = [rTmpPtr2], 48 // A5 or B2
+ // initial approximation of 1 / sqrt(1 + x)
+ frsqrta.s1 f1pXRcp, p0 = f1pX
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA21, fA23 = [rTmpPtr1], 16 // A21, A23 or B3, B8
+ fma.s1 fXQuadr = fXSqr, fXSqr, f0 // x^4
+ nop.i 0
}
;;
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_poly_p15 = acos_t,acos_coeff_P16,acos_coeff_P15
- nop.i 999
-}
-{ .mfb
- nop.m 999
- fnma.s1 acos_d = acos_S0,acos_H0,acos_1by2
-(p6) br.cond.spnt L(ACOS_ABS_ONE) // Branch to short path if |x|=1
+{ .mfi
+ ldfe fA7 = [rTmpPtr1] // A7 or Pi/2
+ fma.s1 fRSqr = fR, fR, f0 // R^2
+ nop.i 0
+}
+{ .mfb
+ ldfpd fA25, fA27 = [rTmpPtr2] // A25, A27 or B9, B12
+ nop.f 0
+(p6) br.cond.spnt acos_base_range;
}
;;
-
-{ .mfi
- nop.m 999
- fma.s1 acos_poly_p19 = acos_t,acos_coeff_P20,acos_coeff_P19
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_poly_p4 = acos_t,acos_coeff_P5,acos_coeff_P4
- nop.i 999;;
+{ .mfi
+ nop.m 0
+(p9) fma.s1 fH = fHalf, f1mXRcp, f0 // H0 for x > 0
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_1poly_p17 = acos_tx,acos_coeff_P18,acos_coeff_P17
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_poly_p8 = acos_t,acos_coeff_P9,acos_coeff_P8
- nop.i 999;;
+{ .mfi
+ nop.m 0
+(p9) fma.s1 fS = f1mX, f1mXRcp, f0 // S0 for x > 0
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fms.s1 acos_35by8 = acos_5by2,acos_11by4,acos_5by2
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_63by8 = acos_5by2,acos_11by4,f1
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+(p8) fma.s1 fH = fHalf, f1pXRcp, f0 // H0 for x < 0
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_poly_p13 = acos_t,acos_coeff_P14,acos_coeff_P13
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_18by4 = acos_3by2,acos_5by2,acos_3by4
- nop.i 999;;
+{ .mfi
+ nop.m 0
+(p8) fma.s1 fS = f1pX, f1pXRcp, f0 // S0 for x < 0
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_l1 = acos_5by2,acos_d,acos_3by2
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_d2 = acos_d,acos_d,f0
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRQuadr = fRSqr, fRSqr, f0 // R^4
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_poly_p15 = acos_t2,acos_poly_p17,acos_poly_p15
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_T0 = acos_d,acos_S0,f0
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB11 = fB11, fR, fB10
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_poly_p19 = acos_t2,acos_coeff_P21,acos_poly_p19
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_poly_p4 = acos_t2,acos_poly_p6,acos_poly_p4
- nop.i 999;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB1 = fB1, fR, fB0
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_d1 = acos_35by8,acos_d,f0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_231by16 = acos_3by2,acos_35by8,acos_63by8
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB5 = fB5, fR, fB4
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_poly_p2 = acos_t,acos_coeff_P3,acos_coeff_P2
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_poly_p8 = acos_t2,acos_coeff_P10,acos_poly_p8
- nop.i 999;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB7 = fB7, fR, fB6
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_poly_p11 = acos_t,acos_coeff_P12,acos_coeff_P11
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_e0 = acos_d2,acos_l1,acos_d
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB3 = fB3, fR, fB2
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_1poly_p15 = acos_tx,acos_coeff_P16,acos_coeff_P15
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_poly_p0 = acos_t,acos_coeff_P1,f1
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+ fnma.s1 fD = fH, fS, fHalf // d0 = 1/2 - H0*S0
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_1poly_p19 = acos_tx,acos_coeff_P20,acos_coeff_P19
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_1poly_p4 = acos_tx,acos_coeff_P5,acos_coeff_P4
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fR8 = fRQuadr, fRQuadr, f0 // R^8
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_1poly_p8 = acos_tx,acos_coeff_P9,acos_coeff_P8
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_l2 = acos_231by16,acos_d,acos_63by8
- nop.i 999;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB9 = fB9, fR, fB8
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_d3 = acos_d2,acos_d,f0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_T3 = acos_d2,acos_T0,f0
- nop.i 999;;
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fB12 = fB12, fRSqr, fB11
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_429by16 = acos_18by4,acos_11by4,acos_231by16
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_S1 = acos_e0,acos_S0,acos_S0
- nop.i 999;;
+{.mfi
+ nop.m 0
+ fma.s1 fB7 = fB7, fRSqr, fB5
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_poly_p4 = acos_t4,acos_poly_p8,acos_poly_p4
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_poly_p15 = acos_t4,acos_poly_p19,acos_poly_p15
- nop.i 999;;
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fB3 = fB3, fRSqr, fB1
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_poly_p0 = acos_t2,acos_poly_p2,acos_poly_p0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_poly_p11 = acos_t2,acos_poly_p13,acos_poly_p11
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fH = fH, fD, fH // H1 = H0 + H0*d0
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_t8 = acos_t4,acos_t4,f0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_e1 = acos_d2,acos_l2,acos_d1
- nop.i 999;;
+{ .mfi
+ nop.m 0
+ fma.s1 fS = fS, fD, fS // S1 = S0 + S0*d0
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_1poly_p4 = acos_tx2,acos_1poly_p6,acos_1poly_p4
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_1poly_p15 = acos_tx2,acos_1poly_p17,acos_1poly_p15
- nop.i 999;;
+;;
+{.mfi
+ nop.m 0
+(p9) fma.s1 fCpi = f1, f0, f0 // Cpi = 0 if x > 0
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_1poly_p8 = acos_tx2,acos_coeff_P10,acos_1poly_p8
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_1poly_p19 = acos_tx2,acos_coeff_P21,acos_1poly_p19
- nop.i 999;;
+{ .mfi
+ nop.m 0
+(p8) fma.s1 fCpi = fPiBy2, f1, fPiBy2 // Cpi = Pi if x < 0
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_1poly_p2 = acos_tx,acos_coeff_P3,acos_coeff_P2
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_1poly_p13 = acos_tx,acos_coeff_P14,acos_coeff_P13
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB12 = fB12, fRSqr, fB9
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_1poly_p0 = acos_tx,acos_coeff_P1,f1
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_1poly_p11 = acos_tx,acos_coeff_P12,acos_coeff_P11
- nop.i 999;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB7 = fB7, fRQuadr, fB3
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_l3 = acos_429by16,acos_d,f0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_z = acos_e1,acos_T3,acos_S1
- nop.i 999;;
+;;
+{.mfi
+ nop.m 0
+ fnma.s1 fD = fH, fS, fHalf // d1 = 1/2 - H1*S1
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_poly_p11 = acos_t4,acos_poly_p15,acos_poly_p11
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_T6 = acos_T3,acos_d3,f0
- nop.i 999;;
+{ .mfi
+ nop.m 0
+ fnma.s1 fSignedS = fSignX, fS, f0 // -signum(x)*S1
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_t11 = acos_t8,acos_t3,f0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_poly_p0 = acos_t4,acos_poly_p4,acos_poly_p0
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fCloseTo1Pol = fB12, fR8, fB7
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_1poly_p4 = acos_tx4,acos_1poly_p8,acos_1poly_p4
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_1poly_p15 = acos_tx4,acos_1poly_p19,acos_1poly_p15
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fH = fH, fD, fH // H2 = H1 + H1*d1
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_1poly_p0 = acos_tx2,acos_1poly_p2,acos_1poly_p0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_1poly_p11 = acos_tx2,acos_1poly_p13,acos_1poly_p11
- nop.i 999;;
+{ .mfi
+ nop.m 0
+ fma.s1 fS = fS, fD, fS // S2 = S1 + S1*d1
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
-// fcmp.le.s1 acos_pred_LEsqrt2by2,acos_pred_GTsqrt2by2 = acos_abs_x,acos_const_sqrt2by2
- fcmp.le.s1 p7,p8 = acos_abs_x,acos_const_sqrt2by2
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_tx8 = acos_tx4,acos_tx4,f0
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+ // -signum(x)* S2 = -signum(x)*(S1 + S1*d1)
+ fma.s1 fSignedS = fSignedS, fD, fSignedS
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_z = acos_l3,acos_T6,acos_z
- nop.i 999;;
-}
-
-{ .mfi
- nop.m 999
- fma.s1 acos_series_t = acos_t11,acos_poly_p11,acos_poly_p0
- nop.i 999
-}
-{ .mfi
- nop.m 999
-(p11) fma.s1 acos_const_add = acos_const_piby2, f1, acos_const_piby2
- nop.i 999
+;;
+{.mfi
+ nop.m 0
+ fnma.s1 fD = fH, fS, fHalf // d2 = 1/2 - H2*S2
+ nop.i 0
}
;;
-
{ .mfi
- nop.m 999
-(p12) fma.s1 acos_const_add = f1,f0,f0
- nop.i 999
+ nop.m 0
+ // Cpi + signum(x)*PolB*S2
+ fnma.s1 fCpi = fSignedS, fCloseTo1Pol, fCpi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // signum(x)*PolB * S2
+ fnma.s1 fCloseTo1Pol = fSignedS, fCloseTo1Pol, f0
+ nop.i 0
}
;;
-
-{ .mfi
- nop.m 999
- fma.s1 acos_1poly_p0 = acos_tx4,acos_1poly_p4,acos_1poly_p0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 acos_1poly_p11 = acos_tx4,acos_1poly_p15,acos_1poly_p11
- nop.i 999;;
+{ .mfb
+ nop.m 0
+ // final result for 0.625 <= |x| < 1
+ fma.d.s0 f8 = fCloseTo1Pol, fD, fCpi
+ // exit here for 0.625 <= |x| < 1
+ br.ret.sptk b0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 acos_tx11 = acos_tx8,acos_tx3,f0
- nop.i 999;;
-}
-
-{ .mfi
- nop.m 999
-//(acos_pred_GTsqrt2by2) fnma.s1 answer2 = acos_z,acos_series_t,acos_const_piby2
-(p8) fnma.s1 answer2 = acos_z,acos_series_t,f0
- nop.i 999;;
-}
-
-{ .mfi
- nop.m 999
- fma.s1 acos_series_tx = acos_tx11,acos_1poly_p11,acos_1poly_p0
- nop.i 999;;
-}
-
-{ .mfi
- nop.m 999
-//(acos_pred_GTsqrt2by2) fnma.d f8 = acos_sgn_x,answer2,acos_const_piby2
-(p8) fnma.d f8 = acos_sgn_x,answer2,acos_const_add
- nop.i 999;;
-}
-
-{ .mfb
- nop.m 999
-//(acos_pred_LEsqrt2by2) fnma.d f8 = f8,acos_series_tx,acos_const_piby2
-(p7) fnma.d f8 = f8,acos_series_tx,acos_const_piby2
- br.ret.sptk b0 ;;
-}
+;;
-L(ACOS_ZERO):
-// Here if x=0
-{ .mfb
- nop.m 999
- fma.d f8 = acos_const_piby2,f1,f0
- br.ret.sptk b0 ;;
-}
+// here if |x| < 0.625
+.align 32
+acos_base_range:
+{ .mfi
+ ldfe fCpi = [rPiBy2Ptr] // Pi/2
+ fma.s1 fA33 = fA33, fXSqr, fA31
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, fXSqr, fA13
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA29 = fA29, fXSqr, fA27
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA25 = fA25, fXSqr, fA23
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA21 = fA21, fXSqr, fA19
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, fXSqr, fA7
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA5 = fA5, fXSqr, fA3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA35 = fA35, fXQuadr, fA33
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA17 = fA17, fXQuadr, fA15
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fX8 = fXQuadr, fXQuadr, f0 // x^8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA25 = fA25, fXQuadr, fA21
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, fXQuadr, fA5
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fCpi = fCpi, f1, f8 // Pi/2 - x
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA35 = fA35, fXQuadr, fA29
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA17 = fA17, fXSqr, fA11
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fX16 = fX8, fX8, f0 // x^16
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA35 = fA35, fX8, fA25
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA17 = fA17, fX8, fA9
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fBaseP = fA35, fX16, fA17
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ // final result for |x| < 0.625
+ fnma.d.s0 f8 = fBaseP, fXCube, fCpi
+ // exit here for |x| < 0.625 path
+ br.ret.sptk b0
+}
+;;
+// here if |x| = 1
+// acos(1) = 0
+// acos(-1) = Pi
+.align 32
+acos_abs_1:
+{ .mfi
+ ldfe fPiBy2 = [rPiBy2Ptr] // Pi/2
+ nop.f 0
+ nop.i 0
+}
+;;
+.pred.rel "mutex", p8, p9
+{ .mfi
+ nop.m 0
+ // result for x = 1.0
+(p9) fma.d.s0 f8 = f1, f0, f0 // 0.0
+ nop.i 0
+}
+{.mfb
+ nop.m 0
+ // result for x = -1.0
+(p8) fma.d.s0 f8 = fPiBy2, f1, fPiBy2 // Pi
+ // exit here for |x| = 1.0
+ br.ret.sptk b0
+}
+;;
-L(ACOS_ABS_ONE):
-.pred.rel "mutex",p11,p12
-// Here if |x|=1
-{ .mfi
- nop.m 999
-(p11) fma.d f8 = acos_const_piby2,f1,acos_const_piby2 // acos(-1)=pi
- nop.i 999
-}
-{ .mfb
- nop.m 999
-(p12) fma.d f8 = f1,f0,f0 // acos(1)=0
- br.ret.sptk b0 ;;
-}
+// here if x is a NaN, denormal, or zero
+.align 32
+acos_special:
+{ .mfi
+ // point to Pi/2
+ adds rPiBy2Ptr = 272, rTblAddr
+ // set p12 = 1 if x is a NaN
+ fclass.m p12, p0 = f8, 0xc3
+ nop.i 0
+}
+{ .mlx
+ nop.m 0
+ // smallest positive DP normalized number
+ movl rDenoBound = 0x0010000000000000
+}
+;;
+{ .mfi
+ ldfe fPiBy2 = [rPiBy2Ptr] // Pi/2
+ // set p13 = 1 if x = 0.0
+ fclass.m p13, p0 = f8, 0x07
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnorm.s1 fNormX = f8
+ nop.i 0
+}
+;;
+{ .mfb
+ // load smallest normal to FP reg
+ setf.d fDenoBound = rDenoBound
+ // answer if x is a NaN
+(p12) fma.d.s0 f8 = f8,f1,f0
+ // exit here if x is a NaN
+(p12) br.ret.spnt b0
+}
+;;
+{ .mfi
+ nop.m 0
+ // absolute value of normalized x
+ fmerge.s fNormX = f1, fNormX
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ // final result for x = 0
+(p13) fma.d.s0 f8 = fPiBy2, f1, f8
+ // exit here if x = 0.0
+(p13) br.ret.spnt b0
+}
+;;
+// if we are still here then x is denormal or unnormal
+{ .mfi
+ nop.m 0
+ // set p14 = 1 if normalized x is greater than or
+ // equal to the smallest normalized value
+ // So, if p14 is set to 1 it means that we deal with
+ // unnormal rather than with "true" denormal
+ fcmp.ge.s1 p14, p0 = fNormX, fDenoBound
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+(p14) fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag if x unnormal
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ // normalize unnormal input
+(p14) fnorm.s1 f8 = f8
+ // return to the main path
+(p14) br.cond.sptk acos_unnormal_back
+}
+;;
+// if we are still here it means that input is "true" denormal
+{ .mfb
+ nop.m 0
+ // final result if x is denormal
+ fms.d.s0 f8 = fPiBy2, f1, f8 // Pi/2 - x
+ // exit here if x is denormal
+ br.ret.sptk b0
+}
+;;
+// here if |x| > 1.0
+// error handler should be called
+.align 32
+acos_abs_gt_1:
+{ .mfi
+ alloc r32 = ar.pfs, 0, 3, 4, 0 // get some registers
+ fmerge.s FR_X = f8,f8
+ nop.i 0
+}
+{ .mfb
+ mov GR_Parameter_TAG = 58 // error code
+ frcpa.s0 FR_RESULT, p0 = f0,f0
+ // call error handler routine
+ br.cond.sptk __libm_error_region
+}
+;;
+GLOBAL_LIBM_END(acos)
-.endp acos
-ASM_SIZE_DIRECTIVE(acos)
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
- nop.f 999
+ nop.f 0
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
@@ -879,28 +839,29 @@ __libm_error_region:
mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
- stfs [GR_Parameter_Y] = f1,16 // Store Parameter 2 on stack
+ stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
-
.body
- frcpa.s0 f9,p0 = f0,f0
-;;
-
{ .mib
- stfd [GR_Parameter_X] = f8 // Store Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y
- nop.b 0 // Parameter 3 address
+ stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
}
{ .mib
- stfd [GR_Parameter_Y] = f9,-16 // Store Parameter 3 on stack
- adds r32 = 48,sp
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
- ldfd f8 = [r32] // Get return result off stack
+ add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
+};;
+{ .mmi
+ ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
@@ -909,11 +870,8 @@ __libm_error_region:
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
-
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
-
-.type __libm_error_support,@function
-.global __libm_error_support
+LOCAL_LIBM_END(__libm_error_region)
+.type __libm_error_support#,@function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_acosf.S b/sysdeps/ia64/fpu/e_acosf.S
index a3425414cf..417f5b7ddc 100644
--- a/sysdeps/ia64/fpu/e_acosf.S
+++ b/sysdeps/ia64/fpu/e_acosf.S
@@ -1,10 +1,10 @@
.file "acosf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,19 +35,23 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
// History
//==============================================================
-// 2/02/00 Initial revision
-// 6/28/00 Improved speed
-// 6/31/00 Changed register allocation because of some duplicate macros
+// 02/02/00 Initial version
+// 06/28/00 Improved speed
+// 06/31/00 Changed register allocation because of some duplicate macros
// moved nan exit bundle up to gain a cycle.
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
-// 8/17/00 Changed predicate register macro-usage to direct predicate
+// 08/17/00 Changed predicate register macro-usage to direct predicate
// names due to an assembler bug.
// 10/17/00 Improved speed of x=0 and x=1 paths, set D flag if x denormal.
+// 03/13/01 Corrected sign of imm1 value in dep instruction.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/06/03 Reordered header: .section, .global, .proc, .align
+// 04/17/03 Moved mutex after label
// Description
@@ -115,7 +119,6 @@
// answer2 = sign(x) z P(t) if x>0
// = sign(x) z P(t) + pi if x<0
-#include "libm_support.h"
//
// Assembly macros
@@ -222,42 +225,30 @@ acosf_poly_p1a = f90
// Data tables
//==============================================================
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+RODATA
.align 16
-acosf_coeff_1_table:
-ASM_TYPE_DIRECTIVE(acosf_coeff_1_table,@object)
+LOCAL_OBJECT_START(acosf_coeff_1_table)
data8 0x3FC5555607DCF816 // P1
data8 0x3F9CF81AD9BAB2C6 // P4
data8 0x3FC59E0975074DF3 // P7
data8 0xBFA6F4CC2780AA1D // P6
data8 0x3FC2DD45292E93CB // P9
data8 0x3fe6a09e667f3bcd // sqrt(2)/2
-ASM_SIZE_DIRECTIVE(acosf_coeff_1_table)
+LOCAL_OBJECT_END(acosf_coeff_1_table)
-acosf_coeff_2_table:
-ASM_TYPE_DIRECTIVE(acosf_coeff_2_table,@object)
+LOCAL_OBJECT_START(acosf_coeff_2_table)
data8 0x3FA6F108E31EFBA6 // P3
data8 0xBFCA31BF175D82A0 // P8
data8 0x3FA30C0337F6418B // P5
data8 0x3FB332C9266CB1F9 // P2
data8 0x3ff921fb54442d18 // pi_by_2
-ASM_SIZE_DIRECTIVE(acosf_coeff_2_table)
+LOCAL_OBJECT_END(acosf_coeff_2_table)
-.align 32
-.global acosf
-ASM_TYPE_DIRECTIVE(acosf,@function)
.section .text
-.proc acosf
-.align 32
-
-acosf:
+GLOBAL_LIBM_ENTRY(acosf)
// Load the addresses of the two tables.
// Then, load the coefficients and other constants.
@@ -342,7 +333,7 @@ acosf:
}
{ .mfb
nop.m 999
-(p8) fma.s f8 = f8,f1,f0
+(p8) fma.s.s0 f8 = f8,f1,f0
(p8) br.ret.spnt b0 ;; // Exit if x=nan
}
@@ -350,7 +341,7 @@ acosf:
{ .mfb
nop.m 999
fcmp.eq.s1 p6,p0 = acosf_abs_x,f1
-(p10) br.cond.spnt L(ACOSF_ZERO) ;; // Branch if x=0
+(p10) br.cond.spnt ACOSF_ZERO ;; // Branch if x=0
}
{ .mfi
@@ -367,7 +358,7 @@ acosf:
{ .mfb
nop.m 999
fma.s1 acosf_t4 = acosf_t2,acosf_t2,f0
-(p6) br.cond.spnt L(ACOSF_ABS_ONE) ;; // Branch if |x|=1
+(p6) br.cond.spnt ACOSF_ABS_ONE ;; // Branch if |x|=1
}
{ .mfi
@@ -575,42 +566,40 @@ acosf:
.pred.rel "mutex",p8,p7 //acosf_pred_GTsqrt2by2,acosf_pred_LEsqrt2by2
{ .mfi
nop.m 999
-(p8) fma.s f8 = acosf_z,acosf_Pt,acosf_sgn_x_piby2
+(p8) fma.s.s0 f8 = acosf_z,acosf_Pt,acosf_sgn_x_piby2
nop.i 999
}
{ .mfb
nop.m 999
-(p7) fms.s f8 = acosf_const_piby2,f1,acosf_sinf1
+(p7) fms.s.s0 f8 = acosf_const_piby2,f1,acosf_sinf1
br.ret.sptk b0 ;;
}
-L(ACOSF_ZERO):
+ACOSF_ZERO:
// Here if x=0
{ .mfb
nop.m 999
- fma.s f8 = acosf_const_piby2,f1,f0 // acosf(0)=pi/2
+ fma.s.s0 f8 = acosf_const_piby2,f1,f0 // acosf(0)=pi/2
br.ret.sptk b0 ;;
}
-L(ACOSF_ABS_ONE):
+ACOSF_ABS_ONE:
.pred.rel "mutex",p11,p12
// Here if |x|=1
{ .mfi
nop.m 999
-(p11) fma.s f8 = acosf_const_piby2,f1,acosf_const_piby2 // acosf(-1)=pi
+(p11) fma.s.s0 f8 = acosf_const_piby2,f1,acosf_const_piby2 // acosf(-1)=pi
nop.i 999
}
{ .mfb
nop.m 999
-(p12) fma.s f8 = f1,f0,f0 // acosf(1)=0
+(p12) fma.s.s0 f8 = f1,f0,f0 // acosf(1)=0
br.ret.sptk b0 ;;
}
-.endp acosf
-ASM_SIZE_DIRECTIVE(acosf)
-
+GLOBAL_LIBM_END(acosf)
// Stack operations when calling error support.
// (1) (2)
@@ -642,8 +631,7 @@ ASM_SIZE_DIRECTIVE(acosf)
// restore ar.pfs
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
@@ -699,8 +687,7 @@ __libm_error_region:
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_acosh.S b/sysdeps/ia64/fpu/e_acosh.S
new file mode 100644
index 0000000000..675d5fe799
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_acosh.S
@@ -0,0 +1,1200 @@
+.file "acosh.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// ==============================================================
+// History
+// ==============================================================
+// 03/23/01 Initial version
+// 04/19/01 Improved speed of the paths #1,2,3,4,5
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/06/03 Reordered header: .section, .global, .proc, .align
+// 05/14/03 Improved performance, set denormal flag for unorms >= 1.0
+//
+// API
+// ==============================================================
+// double acosh(double)
+//
+// Overview of operation
+// ==============================================================
+//
+// There are 7 paths:
+// 1. x = 1.0
+// Return acosh(x) = 0.0
+// 2. 1.0 < x < 1.000499725341796875(0x3FF0020C00000000)
+// Return acosh(x) = sqrt(x-1) * Pol4(x), where Pol4(x) =
+// (((x*C4 + C3)*(x-1) + C2)*(x-1) + C1)*(x-1) + C0
+
+// 3. 1.000499725341796875(0x3FF0020C00000000) <= x < 2^63
+// Return acosh(x) = log(x + sqrt(x^2 -1.0))
+// To compute x + sqrt(x^2 - 1.0), a modified Newton-Raphson method is used
+// (3 iterations).
+// See below for the description of the log algorithm.
+//
+// 4. 2^63 <= x < +INF
+// Return acosh(x) = log(2*x)
+// Algorithm description for log function see below.
+//
+// 5. x = +INF
+// Return acosh(x) = +INF
+//
+// 6. x = [S,Q]NaN
+// Return acosh(x) = QNaN
+//
+// 7. x < 1.0
+// This is a domain error. The error handler is called with tag = 136.
+//
+//==============================================================
+// Algorithm Description for log(x) function
+// Below we are using the fact that inequality x - 1.0 > 2^(-6) is always
+// true for this acosh implementation
+//
+// Consider x = 2^N 1.f1 f2 f3 f4...f63
+// Log(x) = log(frcpa(x) x/frcpa(x))
+// = log(1/frcpa(x)) + log(frcpa(x) x)
+// = -log(frcpa(x)) + log(frcpa(x) x)
+//
+// frcpa(x) = 2^-N frcpa(1.f1 f2 ... f63)
+//
+// -log(frcpa(x)) = -log(C)
+// = -log(2^-N) - log(frcpa(1.f1 f2 ... f63))
+//
+// -log(frcpa(x)) = -log(C)
+// = +Nlog2 - log(frcpa(1.f1 f2 ... f63))
+//
+// -log(frcpa(x)) = -log(C)
+// = +Nlog2 + log(1/frcpa(1.f1 f2 ... f63))
+//
+// Log(x) = log(1/frcpa(x)) + log(frcpa(x) x)
+//
+// Log(x) = +Nlog2 + log(1./frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x)
+// Log(x) = +Nlog2 - log(frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x)
+// Log(x) = +Nlog2 + T + log(frcpa(x) x)
+//
+// Log(x) = +Nlog2 + T + log(C x)
+//
+// Cx = 1 + r
+//
+// Log(x) = +Nlog2 + T + log(1+r)
+// Log(x) = +Nlog2 + T + Series( r - r^2/2 + r^3/3 - r^4/4 ....)
+//
+// 1.f1 f2 ... f8 has 256 entries.
+// They are 1 + k/2^8, k = 0 ... 255
+// These 256 values are the table entries.
+//
+// Implementation
+//==============================================================
+// C = frcpa(x)
+// r = C * x - 1
+//
+// Form rseries = r + P1*r^2 + P2*r^3 + P3*r^4 + P4*r^5 + P5*r^6
+//
+// x = f * 2^n where f is 1.f_1f_2f_3....f_63
+// Nfloat = float(n) where n is the true unbiased exponent
+// pre_index = f_1f_2....f_8
+// index = pre_index * 16
+// get the double-extended (dxt) table entry at index + offset = T
+//
+// result = (T + Nfloat * log(2)) + rseries
+//
+// The T table is calculated as follows
+// Form x_k = 1 + k/2^8 where k goes from 0... 255
+// y_k = frcpa(x_k)
+// log(1/y_k) in quad and round to double-extended
+//
+
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input
+// f9 -> f15, f32 -> f65
+
+// General registers used:
+// r14 -> r27, r32 -> r39
+
+// Predicate registers used:
+// p6 -> p15
+
+// p6 to filter out case when x = [Q,S]NaN
+// p7,p8 to filter out case when x < 1.0
+// p10 to select path #1
+// p11 to filter out case when x = +INF
+// p12 used in the frcpa
+// p13 to select path #4
+// p14,p15 to select path #2
+
+// Assembly macros: symbolic names for the general registers used by acosh
+//==============================================================
+log_GR_exp_17_ones = r14 // 0x1ffff, mask ANDed with the getf.exp value of the log argument
+log_GR_signexp_f8 = r15 // getf.exp result for log_arg_early
+log_table_address2 = r16 // NR_table_address + 0x40; post-incremented while loading NR1, NR2, C4..C0
+log_GR_exp_16_ones = r17 // 0xffff, the exponent bias
+log_GR_exp_f8 = r18 // log_GR_signexp_f8 masked with log_GR_exp_17_ones
+log_GR_true_exp_f8 = r19 // log_GR_exp_f8 minus the bias
+log_GR_significand_f8 = r20 // getf.sig result for log_arg_early
+log_GR_index = r21 // 8 bits extracted from the significand, used to index log_table_3
+log_GR_comp2 = r22 // 0x1003e, compared with acosh_GR_f8 to select path 4 (x >= 2^63)
+acosh_GR_f8 = r23 // getf.exp result for the input f8
+log_GR_comp = r24 // 0x10020C, upper 21 bits of the significand of 1.0005
+acosh_GR_f8_sig = r25 // getf.sig result for f8, later shifted right by 43
+log_table_address3 = r26 // NR_table_address + 0x70, then + log_GR_index*16; address of T
+NR_table_address = r27 // address of log_table_1, loaded through @ltoff/gp
+
+GR_SAVE_B0 = r33 // NOTE(review): r33-r35 presumably save b0/gp/ar.pfs around the error-support call - confirm
+GR_SAVE_GP = r34
+GR_SAVE_PFS = r35
+
+GR_Parameter_X = r36 // NOTE(review): r36-r38 presumably hold argument/result addresses for the error handler - confirm
+GR_Parameter_Y = r37
+GR_Parameter_RESULT = r38
+acosh_GR_tag = r39 // NOTE(review): presumably the error tag (136 per the header) - confirm
+// Symbolic names for the floating-point registers used by acosh
+//==============================================================
+log_y = f9 // y = x^2 - 1
+NR1 = f10 // 0.5, loaded from log_table_2
+NR2 = f11 // 3.0, loaded from log_table_2
+log_y_rs = f12 // z = 1/sqrt(y) approximation (frsqrta), reused as a Newton-Raphson temporary
+log_y_rs_iter = f13 // Newton-Raphson temporary for 1/sqrt(y)
+log_y_rs_iter1 = f14 // Newton-Raphson temporary for 1/sqrt(y)
+log_NORM_f8 = f15
+acosh_comp = f32 // first entry of log_table_1 (1.0005)
+log_w = f34 // w = x - 1
+log_P5 = f35 // P5..P1: rseries coefficients from log_table_1
+log_P4 = f36
+log_P3 = f37
+log_P2 = f38
+log_P1 = f39
+log_C0 = f40 // C0..C4: Pol4 coefficients from log_table_2 (path 2)
+log_C1 = f41
+log_C2 = f42
+log2 = f43 // log(2), loaded from log_table_1
+acosh_w_rs = f44 // t = 1/sqrt(w) approximation (frsqrta)
+log_C = f45 // C = frcpa of the early log argument
+log_arg = f46 // argument passed to the log computation
+acosh_w_iter1 = f47 // Newton-Raphson temporary for 1/sqrt(w)
+acosh_w_iter2 = f48 // Newton-Raphson temporary for 1/sqrt(w)
+log_int_Nfloat = f49 // unbiased exponent N as an integer (setf.sig)
+log_r = f50 // r = C * log_arg - 1
+log_rsq = f51 // r^2
+log_rp_p4 = f52 // P5*r + P4
+log_rp_p32 = f53 // P3*r + P2
+log_rcube = f54 // r^3
+log_rp_p10 = f55 // P1*r^2 + r
+log_rp_p2 = f56 // (P5*r + P4)*r^2 + P3*r + P2
+log_Nfloat = f57 // N converted to floating point
+log_T = f58 // T, the log_table_3 entry
+log_r2P_r = f59 // log_rp_p2*r^3 + P1*r^2 + r
+log_T_plus_Nlog2 = f60 // N*log2 + T
+acosh_w_sqrt = f61
+acosh_w_1 = f62 // running value of Pol4(x) on path 2
+log_C3 = f63
+log_C4 = f64
+log_arg_early = f65 // early estimate of the log argument, used for frcpa and the table index
+
+
+// Data tables
+//==============================================================
+
+RODATA
+.align 16
+// log_table_1: the 1.0005 threshold, rseries coefficients P5..P1, and log(2)
+LOCAL_OBJECT_START(log_table_1)
+data8 0x3FF0020C49BA5E35 // 1.0005 (loaded into acosh_comp)
+data8 0xBFC5555DA7212371 // P5, coefficient of r^6 in rseries
+data8 0x3FC999A19EEF5826 // P4, coefficient of r^5
+data8 0xBFCFFFFFFFFEF009 // P3, coefficient of r^4
+data8 0x3FD555555554ECB2 // P2, coefficient of r^3
+data8 0xBFE0000000000000 // P1 = -0.5, coefficient of r^2
+// The next entry is a double-extended value stored as two data8 words, loaded with ldfe
+data8 0xb17217f7d1cf79ac, 0x00003ffe // log2
+LOCAL_OBJECT_END(log_table_1)
+// log_table_2: Newton-Raphson constants and the Pol4 coefficients used on path 2
+LOCAL_OBJECT_START(log_table_2)
+data8 0x3FE0000000000000 // 0.5 (loaded into NR1)
+data8 0x4008000000000000 // 3.0 (loaded into NR2)
+// Pol4 coefficients, loaded one at a time with ldfe in the order C4, C3, C2, C1, C0
+data8 0xAFE8F9203939CCF8, 0x00003FF6 // C4 3FF6AFE8F9203939CCF8
+data8 0xAD46EB6AE752D809, 0x0000BFF8 // C3 BFF8AD46EB6AE752D809
+data8 0xD93923D7F53F3627, 0x00003FF9 // C2 3FF9D93923D7F53F3627
+data8 0xF15BEEEFF7D32D36, 0x0000BFFB // C1 BFFBF15BEEEFF7D32D36
+data8 0xB504F333F9DE6484, 0x00003FFF // C0 3FFFB504F333F9DE6484
+LOCAL_OBJECT_END(log_table_2)
+
+// log_table_3: T values log(1/frcpa(1 + k/2^8)) for k = 0..255, 16 bytes per entry
+LOCAL_OBJECT_START(log_table_3)
+data8 0x80200aaeac44ef38 , 0x00003ff6 // log(1/frcpa(1+ 0/2^8))
+//
+data8 0xc09090a2c35aa070 , 0x00003ff7 // log(1/frcpa(1+ 1/2^8))
+data8 0xa0c94fcb41977c75 , 0x00003ff8 // log(1/frcpa(1+ 2/2^8))
+data8 0xe18b9c263af83301 , 0x00003ff8 // log(1/frcpa(1+ 3/2^8))
+data8 0x8d35c8d6399c30ea , 0x00003ff9 // log(1/frcpa(1+ 4/2^8))
+data8 0xadd4d2ecd601cbb8 , 0x00003ff9 // log(1/frcpa(1+ 5/2^8))
+//
+data8 0xce95403a192f9f01 , 0x00003ff9 // log(1/frcpa(1+ 6/2^8))
+data8 0xeb59392cbcc01096 , 0x00003ff9 // log(1/frcpa(1+ 7/2^8))
+data8 0x862c7d0cefd54c5d , 0x00003ffa // log(1/frcpa(1+ 8/2^8))
+data8 0x94aa63c65e70d499 , 0x00003ffa // log(1/frcpa(1+ 9/2^8))
+data8 0xa54a696d4b62b382 , 0x00003ffa // log(1/frcpa(1+ 10/2^8))
+//
+data8 0xb3e4a796a5dac208 , 0x00003ffa // log(1/frcpa(1+ 11/2^8))
+data8 0xc28c45b1878340a9 , 0x00003ffa // log(1/frcpa(1+ 12/2^8))
+data8 0xd35c55f39d7a6235 , 0x00003ffa // log(1/frcpa(1+ 13/2^8))
+data8 0xe220f037b954f1f5 , 0x00003ffa // log(1/frcpa(1+ 14/2^8))
+data8 0xf0f3389b036834f3 , 0x00003ffa // log(1/frcpa(1+ 15/2^8))
+//
+data8 0xffd3488d5c980465 , 0x00003ffa // log(1/frcpa(1+ 16/2^8))
+data8 0x87609ce2ed300490 , 0x00003ffb // log(1/frcpa(1+ 17/2^8))
+data8 0x8ede9321e8c85927 , 0x00003ffb // log(1/frcpa(1+ 18/2^8))
+data8 0x96639427f2f8e2f4 , 0x00003ffb // log(1/frcpa(1+ 19/2^8))
+data8 0x9defad3e8f73217b , 0x00003ffb // log(1/frcpa(1+ 20/2^8))
+//
+data8 0xa582ebd50097029c , 0x00003ffb // log(1/frcpa(1+ 21/2^8))
+data8 0xac06dbe75ab80fee , 0x00003ffb // log(1/frcpa(1+ 22/2^8))
+data8 0xb3a78449b2d3ccca , 0x00003ffb // log(1/frcpa(1+ 23/2^8))
+data8 0xbb4f79635ab46bb2 , 0x00003ffb // log(1/frcpa(1+ 24/2^8))
+data8 0xc2fec93a83523f3f , 0x00003ffb // log(1/frcpa(1+ 25/2^8))
+//
+data8 0xc99af2eaca4c4571 , 0x00003ffb // log(1/frcpa(1+ 26/2^8))
+data8 0xd1581106472fa653 , 0x00003ffb // log(1/frcpa(1+ 27/2^8))
+data8 0xd8002560d4355f2e , 0x00003ffb // log(1/frcpa(1+ 28/2^8))
+data8 0xdfcb43b4fe508632 , 0x00003ffb // log(1/frcpa(1+ 29/2^8))
+data8 0xe67f6dff709d4119 , 0x00003ffb // log(1/frcpa(1+ 30/2^8))
+//
+data8 0xed393b1c22351280 , 0x00003ffb // log(1/frcpa(1+ 31/2^8))
+data8 0xf5192bff087bcc35 , 0x00003ffb // log(1/frcpa(1+ 32/2^8))
+data8 0xfbdf4ff6dfef2fa3 , 0x00003ffb // log(1/frcpa(1+ 33/2^8))
+data8 0x81559a97f92f9cc7 , 0x00003ffc // log(1/frcpa(1+ 34/2^8))
+data8 0x84be72bce90266e8 , 0x00003ffc // log(1/frcpa(1+ 35/2^8))
+//
+data8 0x88bc74113f23def2 , 0x00003ffc // log(1/frcpa(1+ 36/2^8))
+data8 0x8c2ba3edf6799d11 , 0x00003ffc // log(1/frcpa(1+ 37/2^8))
+data8 0x8f9dc92f92ea08b1 , 0x00003ffc // log(1/frcpa(1+ 38/2^8))
+data8 0x9312e8f36efab5a7 , 0x00003ffc // log(1/frcpa(1+ 39/2^8))
+data8 0x968b08643409ceb6 , 0x00003ffc // log(1/frcpa(1+ 40/2^8))
+//
+data8 0x9a062cba08a1708c , 0x00003ffc // log(1/frcpa(1+ 41/2^8))
+data8 0x9d845b3abf95485c , 0x00003ffc // log(1/frcpa(1+ 42/2^8))
+data8 0xa06fd841bc001bb4 , 0x00003ffc // log(1/frcpa(1+ 43/2^8))
+data8 0xa3f3a74652fbe0db , 0x00003ffc // log(1/frcpa(1+ 44/2^8))
+data8 0xa77a8fb2336f20f5 , 0x00003ffc // log(1/frcpa(1+ 45/2^8))
+//
+data8 0xab0497015d28b0a0 , 0x00003ffc // log(1/frcpa(1+ 46/2^8))
+data8 0xae91c2be6ba6a615 , 0x00003ffc // log(1/frcpa(1+ 47/2^8))
+data8 0xb189d1b99aebb20b , 0x00003ffc // log(1/frcpa(1+ 48/2^8))
+data8 0xb51cced5de9c1b2c , 0x00003ffc // log(1/frcpa(1+ 49/2^8))
+data8 0xb819bee9e720d42f , 0x00003ffc // log(1/frcpa(1+ 50/2^8))
+//
+data8 0xbbb2a0947b093a5d , 0x00003ffc // log(1/frcpa(1+ 51/2^8))
+data8 0xbf4ec1505811684a , 0x00003ffc // log(1/frcpa(1+ 52/2^8))
+data8 0xc2535bacfa8975ff , 0x00003ffc // log(1/frcpa(1+ 53/2^8))
+data8 0xc55a3eafad187eb8 , 0x00003ffc // log(1/frcpa(1+ 54/2^8))
+data8 0xc8ff2484b2c0da74 , 0x00003ffc // log(1/frcpa(1+ 55/2^8))
+//
+data8 0xcc0b1a008d53ab76 , 0x00003ffc // log(1/frcpa(1+ 56/2^8))
+data8 0xcfb6203844b3209b , 0x00003ffc // log(1/frcpa(1+ 57/2^8))
+data8 0xd2c73949a47a19f5 , 0x00003ffc // log(1/frcpa(1+ 58/2^8))
+data8 0xd5daae18b49d6695 , 0x00003ffc // log(1/frcpa(1+ 59/2^8))
+data8 0xd8f08248cf7e8019 , 0x00003ffc // log(1/frcpa(1+ 60/2^8))
+//
+data8 0xdca7749f1b3e540e , 0x00003ffc // log(1/frcpa(1+ 61/2^8))
+data8 0xdfc28e033aaaf7c7 , 0x00003ffc // log(1/frcpa(1+ 62/2^8))
+data8 0xe2e012a5f91d2f55 , 0x00003ffc // log(1/frcpa(1+ 63/2^8))
+data8 0xe600064ed9e292a8 , 0x00003ffc // log(1/frcpa(1+ 64/2^8))
+data8 0xe9226cce42b39f60 , 0x00003ffc // log(1/frcpa(1+ 65/2^8))
+//
+data8 0xec4749fd97a28360 , 0x00003ffc // log(1/frcpa(1+ 66/2^8))
+data8 0xef6ea1bf57780495 , 0x00003ffc // log(1/frcpa(1+ 67/2^8))
+data8 0xf29877ff38809091 , 0x00003ffc // log(1/frcpa(1+ 68/2^8))
+data8 0xf5c4d0b245cb89be , 0x00003ffc // log(1/frcpa(1+ 69/2^8))
+data8 0xf8f3afd6fcdef3aa , 0x00003ffc // log(1/frcpa(1+ 70/2^8))
+//
+data8 0xfc2519756be1abc7 , 0x00003ffc // log(1/frcpa(1+ 71/2^8))
+data8 0xff59119f503e6832 , 0x00003ffc // log(1/frcpa(1+ 72/2^8))
+data8 0x8147ce381ae0e146 , 0x00003ffd // log(1/frcpa(1+ 73/2^8))
+data8 0x82e45f06cb1ad0f2 , 0x00003ffd // log(1/frcpa(1+ 74/2^8))
+data8 0x842f5c7c573cbaa2 , 0x00003ffd // log(1/frcpa(1+ 75/2^8))
+//
+data8 0x85ce471968c8893a , 0x00003ffd // log(1/frcpa(1+ 76/2^8))
+data8 0x876e8305bc04066d , 0x00003ffd // log(1/frcpa(1+ 77/2^8))
+data8 0x891012678031fbb3 , 0x00003ffd // log(1/frcpa(1+ 78/2^8))
+data8 0x8a5f1493d766a05f , 0x00003ffd // log(1/frcpa(1+ 79/2^8))
+data8 0x8c030c778c56fa00 , 0x00003ffd // log(1/frcpa(1+ 80/2^8))
+//
+data8 0x8da85df17e31d9ae , 0x00003ffd // log(1/frcpa(1+ 81/2^8))
+data8 0x8efa663e7921687e , 0x00003ffd // log(1/frcpa(1+ 82/2^8))
+data8 0x90a22b6875c6a1f8 , 0x00003ffd // log(1/frcpa(1+ 83/2^8))
+data8 0x91f62cc8f5d24837 , 0x00003ffd // log(1/frcpa(1+ 84/2^8))
+data8 0x93a06cfc3857d980 , 0x00003ffd // log(1/frcpa(1+ 85/2^8))
+//
+data8 0x94f66d5e6fd01ced , 0x00003ffd // log(1/frcpa(1+ 86/2^8))
+data8 0x96a330156e6772f2 , 0x00003ffd // log(1/frcpa(1+ 87/2^8))
+data8 0x97fb3582754ea25b , 0x00003ffd // log(1/frcpa(1+ 88/2^8))
+data8 0x99aa8259aad1bbf2 , 0x00003ffd // log(1/frcpa(1+ 89/2^8))
+data8 0x9b0492f6227ae4a8 , 0x00003ffd // log(1/frcpa(1+ 90/2^8))
+//
+data8 0x9c5f8e199bf3a7a5 , 0x00003ffd // log(1/frcpa(1+ 91/2^8))
+data8 0x9e1293b9998c1daa , 0x00003ffd // log(1/frcpa(1+ 92/2^8))
+data8 0x9f6fa31e0b41f308 , 0x00003ffd // log(1/frcpa(1+ 93/2^8))
+data8 0xa0cda11eaf46390e , 0x00003ffd // log(1/frcpa(1+ 94/2^8))
+data8 0xa22c8f029cfa45aa , 0x00003ffd // log(1/frcpa(1+ 95/2^8))
+//
+data8 0xa3e48badb7856b34 , 0x00003ffd // log(1/frcpa(1+ 96/2^8))
+data8 0xa5459a0aa95849f9 , 0x00003ffd // log(1/frcpa(1+ 97/2^8))
+data8 0xa6a79c84480cfebd , 0x00003ffd // log(1/frcpa(1+ 98/2^8))
+data8 0xa80a946d0fcb3eb2 , 0x00003ffd // log(1/frcpa(1+ 99/2^8))
+data8 0xa96e831a3ea7b314 , 0x00003ffd // log(1/frcpa(1+100/2^8))
+//
+data8 0xaad369e3dc544e3b , 0x00003ffd // log(1/frcpa(1+101/2^8))
+data8 0xac92e9588952c815 , 0x00003ffd // log(1/frcpa(1+102/2^8))
+data8 0xadfa035aa1ed8fdc , 0x00003ffd // log(1/frcpa(1+103/2^8))
+data8 0xaf6219eae1ad6e34 , 0x00003ffd // log(1/frcpa(1+104/2^8))
+data8 0xb0cb2e6d8160f753 , 0x00003ffd // log(1/frcpa(1+105/2^8))
+//
+data8 0xb2354249ad950f72 , 0x00003ffd // log(1/frcpa(1+106/2^8))
+data8 0xb3a056e98ef4a3b4 , 0x00003ffd // log(1/frcpa(1+107/2^8))
+data8 0xb50c6dba52c6292a , 0x00003ffd // log(1/frcpa(1+108/2^8))
+data8 0xb679882c33876165 , 0x00003ffd // log(1/frcpa(1+109/2^8))
+data8 0xb78c07429785cedc , 0x00003ffd // log(1/frcpa(1+110/2^8))
+//
+data8 0xb8faeb8dc4a77d24 , 0x00003ffd // log(1/frcpa(1+111/2^8))
+data8 0xba6ad77eb36ae0d6 , 0x00003ffd // log(1/frcpa(1+112/2^8))
+data8 0xbbdbcc915e9bee50 , 0x00003ffd // log(1/frcpa(1+113/2^8))
+data8 0xbd4dcc44f8cf12ef , 0x00003ffd // log(1/frcpa(1+114/2^8))
+data8 0xbec0d81bf5b531fa , 0x00003ffd // log(1/frcpa(1+115/2^8))
+//
+data8 0xc034f19c139186f4 , 0x00003ffd // log(1/frcpa(1+116/2^8))
+data8 0xc14cb69f7c5e55ab , 0x00003ffd // log(1/frcpa(1+117/2^8))
+data8 0xc2c2abbb6e5fd56f , 0x00003ffd // log(1/frcpa(1+118/2^8))
+data8 0xc439b2c193e6771e , 0x00003ffd // log(1/frcpa(1+119/2^8))
+data8 0xc553acb9d5c67733 , 0x00003ffd // log(1/frcpa(1+120/2^8))
+//
+data8 0xc6cc96e441272441 , 0x00003ffd // log(1/frcpa(1+121/2^8))
+data8 0xc8469753eca88c30 , 0x00003ffd // log(1/frcpa(1+122/2^8))
+data8 0xc962cf3ce072b05c , 0x00003ffd // log(1/frcpa(1+123/2^8))
+data8 0xcadeba8771f694aa , 0x00003ffd // log(1/frcpa(1+124/2^8))
+data8 0xcc5bc08d1f72da94 , 0x00003ffd // log(1/frcpa(1+125/2^8))
+//
+data8 0xcd7a3f99ea035c29 , 0x00003ffd // log(1/frcpa(1+126/2^8))
+data8 0xcef93860c8a53c35 , 0x00003ffd // log(1/frcpa(1+127/2^8))
+data8 0xd0192f68a7ed23df , 0x00003ffd // log(1/frcpa(1+128/2^8))
+data8 0xd19a201127d3c645 , 0x00003ffd // log(1/frcpa(1+129/2^8))
+data8 0xd2bb92f4061c172c , 0x00003ffd // log(1/frcpa(1+130/2^8))
+//
+data8 0xd43e80b2ee8cc8fc , 0x00003ffd // log(1/frcpa(1+131/2^8))
+data8 0xd56173601fc4ade4 , 0x00003ffd // log(1/frcpa(1+132/2^8))
+data8 0xd6e6637efb54086f , 0x00003ffd // log(1/frcpa(1+133/2^8))
+data8 0xd80ad9f58f3c8193 , 0x00003ffd // log(1/frcpa(1+134/2^8))
+data8 0xd991d1d31aca41f8 , 0x00003ffd // log(1/frcpa(1+135/2^8))
+//
+data8 0xdab7d02231484a93 , 0x00003ffd // log(1/frcpa(1+136/2^8))
+data8 0xdc40d532cde49a54 , 0x00003ffd // log(1/frcpa(1+137/2^8))
+data8 0xdd685f79ed8b265e , 0x00003ffd // log(1/frcpa(1+138/2^8))
+data8 0xde9094bbc0e17b1d , 0x00003ffd // log(1/frcpa(1+139/2^8))
+data8 0xe01c91b78440c425 , 0x00003ffd // log(1/frcpa(1+140/2^8))
+//
+data8 0xe14658f26997e729 , 0x00003ffd // log(1/frcpa(1+141/2^8))
+data8 0xe270cdc2391e0d23 , 0x00003ffd // log(1/frcpa(1+142/2^8))
+data8 0xe3ffce3a2aa64922 , 0x00003ffd // log(1/frcpa(1+143/2^8))
+data8 0xe52bdb274ed82887 , 0x00003ffd // log(1/frcpa(1+144/2^8))
+data8 0xe6589852e75d7df6 , 0x00003ffd // log(1/frcpa(1+145/2^8))
+//
+data8 0xe786068c79937a7d , 0x00003ffd // log(1/frcpa(1+146/2^8))
+data8 0xe91903adad100911 , 0x00003ffd // log(1/frcpa(1+147/2^8))
+data8 0xea481236f7d35bb0 , 0x00003ffd // log(1/frcpa(1+148/2^8))
+data8 0xeb77d48c692e6b14 , 0x00003ffd // log(1/frcpa(1+149/2^8))
+data8 0xeca84b83d7297b87 , 0x00003ffd // log(1/frcpa(1+150/2^8))
+//
+data8 0xedd977f4962aa158 , 0x00003ffd // log(1/frcpa(1+151/2^8))
+data8 0xef7179a22f257754 , 0x00003ffd // log(1/frcpa(1+152/2^8))
+data8 0xf0a450d139366ca7 , 0x00003ffd // log(1/frcpa(1+153/2^8))
+data8 0xf1d7e0524ff9ffdb , 0x00003ffd // log(1/frcpa(1+154/2^8))
+data8 0xf30c29036a8b6cae , 0x00003ffd // log(1/frcpa(1+155/2^8))
+//
+data8 0xf4412bc411ea8d92 , 0x00003ffd // log(1/frcpa(1+156/2^8))
+data8 0xf576e97564c8619d , 0x00003ffd // log(1/frcpa(1+157/2^8))
+data8 0xf6ad62fa1b5f172f , 0x00003ffd // log(1/frcpa(1+158/2^8))
+data8 0xf7e499368b55c542 , 0x00003ffd // log(1/frcpa(1+159/2^8))
+data8 0xf91c8d10abaffe22 , 0x00003ffd // log(1/frcpa(1+160/2^8))
+//
+data8 0xfa553f7018c966f3 , 0x00003ffd // log(1/frcpa(1+161/2^8))
+data8 0xfb8eb13e185d802c , 0x00003ffd // log(1/frcpa(1+162/2^8))
+data8 0xfcc8e3659d9bcbed , 0x00003ffd // log(1/frcpa(1+163/2^8))
+data8 0xfe03d6d34d487fd2 , 0x00003ffd // log(1/frcpa(1+164/2^8))
+data8 0xff3f8c7581e9f0ae , 0x00003ffd // log(1/frcpa(1+165/2^8))
+//
+data8 0x803e029e280173ae , 0x00003ffe // log(1/frcpa(1+166/2^8))
+data8 0x80dca10cc52d0757 , 0x00003ffe // log(1/frcpa(1+167/2^8))
+data8 0x817ba200632755a1 , 0x00003ffe // log(1/frcpa(1+168/2^8))
+data8 0x821b05f3b01d6774 , 0x00003ffe // log(1/frcpa(1+169/2^8))
+data8 0x82bacd623ff19d06 , 0x00003ffe // log(1/frcpa(1+170/2^8))
+//
+data8 0x835af8c88e7a8f47 , 0x00003ffe // log(1/frcpa(1+171/2^8))
+data8 0x83c5f8299e2b4091 , 0x00003ffe // log(1/frcpa(1+172/2^8))
+data8 0x8466cb43f3d87300 , 0x00003ffe // log(1/frcpa(1+173/2^8))
+data8 0x850803a67c80ca4b , 0x00003ffe // log(1/frcpa(1+174/2^8))
+data8 0x85a9a1d11a23b461 , 0x00003ffe // log(1/frcpa(1+175/2^8))
+//
+data8 0x864ba644a18e6e05 , 0x00003ffe // log(1/frcpa(1+176/2^8))
+data8 0x86ee1182dcc432f7 , 0x00003ffe // log(1/frcpa(1+177/2^8))
+data8 0x875a925d7e48c316 , 0x00003ffe // log(1/frcpa(1+178/2^8))
+data8 0x87fdaa109d23aef7 , 0x00003ffe // log(1/frcpa(1+179/2^8))
+data8 0x88a129ed4becfaf2 , 0x00003ffe // log(1/frcpa(1+180/2^8))
+//
+data8 0x89451278ecd7f9cf , 0x00003ffe // log(1/frcpa(1+181/2^8))
+data8 0x89b29295f8432617 , 0x00003ffe // log(1/frcpa(1+182/2^8))
+data8 0x8a572ac5a5496882 , 0x00003ffe // log(1/frcpa(1+183/2^8))
+data8 0x8afc2d0ce3b2dadf , 0x00003ffe // log(1/frcpa(1+184/2^8))
+data8 0x8b6a69c608cfd3af , 0x00003ffe // log(1/frcpa(1+185/2^8))
+//
+data8 0x8c101e106e899a83 , 0x00003ffe // log(1/frcpa(1+186/2^8))
+data8 0x8cb63de258f9d626 , 0x00003ffe // log(1/frcpa(1+187/2^8))
+data8 0x8d2539c5bd19e2b1 , 0x00003ffe // log(1/frcpa(1+188/2^8))
+data8 0x8dcc0e064b29e6f1 , 0x00003ffe // log(1/frcpa(1+189/2^8))
+data8 0x8e734f45d88357ae , 0x00003ffe // log(1/frcpa(1+190/2^8))
+//
+data8 0x8ee30cef034a20db , 0x00003ffe // log(1/frcpa(1+191/2^8))
+data8 0x8f8b0515686d1d06 , 0x00003ffe // log(1/frcpa(1+192/2^8))
+data8 0x90336bba039bf32f , 0x00003ffe // log(1/frcpa(1+193/2^8))
+data8 0x90a3edd23d1c9d58 , 0x00003ffe // log(1/frcpa(1+194/2^8))
+data8 0x914d0de2f5d61b32 , 0x00003ffe // log(1/frcpa(1+195/2^8))
+//
+data8 0x91be0c20d28173b5 , 0x00003ffe // log(1/frcpa(1+196/2^8))
+data8 0x9267e737c06cd34a , 0x00003ffe // log(1/frcpa(1+197/2^8))
+data8 0x92d962ae6abb1237 , 0x00003ffe // log(1/frcpa(1+198/2^8))
+data8 0x9383fa6afbe2074c , 0x00003ffe // log(1/frcpa(1+199/2^8))
+data8 0x942f0421651c1c4e , 0x00003ffe // log(1/frcpa(1+200/2^8))
+//
+data8 0x94a14a3845bb985e , 0x00003ffe // log(1/frcpa(1+201/2^8))
+data8 0x954d133857f861e7 , 0x00003ffe // log(1/frcpa(1+202/2^8))
+data8 0x95bfd96468e604c4 , 0x00003ffe // log(1/frcpa(1+203/2^8))
+data8 0x9632d31cafafa858 , 0x00003ffe // log(1/frcpa(1+204/2^8))
+data8 0x96dfaabd86fa1647 , 0x00003ffe // log(1/frcpa(1+205/2^8))
+//
+data8 0x9753261fcbb2a594 , 0x00003ffe // log(1/frcpa(1+206/2^8))
+data8 0x9800c11b426b996d , 0x00003ffe // log(1/frcpa(1+207/2^8))
+data8 0x9874bf4d45ae663c , 0x00003ffe // log(1/frcpa(1+208/2^8))
+data8 0x99231f5ee9a74f79 , 0x00003ffe // log(1/frcpa(1+209/2^8))
+data8 0x9997a18a56bcad28 , 0x00003ffe // log(1/frcpa(1+210/2^8))
+//
+data8 0x9a46c873a3267e79 , 0x00003ffe // log(1/frcpa(1+211/2^8))
+data8 0x9abbcfc621eb6cb6 , 0x00003ffe // log(1/frcpa(1+212/2^8))
+data8 0x9b310cb0d354c990 , 0x00003ffe // log(1/frcpa(1+213/2^8))
+data8 0x9be14cf9e1b3515c , 0x00003ffe // log(1/frcpa(1+214/2^8))
+data8 0x9c5710b8cbb73a43 , 0x00003ffe // log(1/frcpa(1+215/2^8))
+//
+data8 0x9ccd0abd301f399c , 0x00003ffe // log(1/frcpa(1+216/2^8))
+data8 0x9d7e67f3bdce8888 , 0x00003ffe // log(1/frcpa(1+217/2^8))
+data8 0x9df4ea81a99daa01 , 0x00003ffe // log(1/frcpa(1+218/2^8))
+data8 0x9e6ba405a54514ba , 0x00003ffe // log(1/frcpa(1+219/2^8))
+data8 0x9f1e21c8c7bb62b3 , 0x00003ffe // log(1/frcpa(1+220/2^8))
+//
+data8 0x9f956593f6b6355c , 0x00003ffe // log(1/frcpa(1+221/2^8))
+data8 0xa00ce1092e5498c3 , 0x00003ffe // log(1/frcpa(1+222/2^8))
+data8 0xa0c08309c4b912c1 , 0x00003ffe // log(1/frcpa(1+223/2^8))
+data8 0xa1388a8c6faa2afa , 0x00003ffe // log(1/frcpa(1+224/2^8))
+data8 0xa1b0ca7095b5f985 , 0x00003ffe // log(1/frcpa(1+225/2^8))
+//
+data8 0xa22942eb47534a00 , 0x00003ffe // log(1/frcpa(1+226/2^8))
+data8 0xa2de62326449d0a3 , 0x00003ffe // log(1/frcpa(1+227/2^8))
+data8 0xa357690f88bfe345 , 0x00003ffe // log(1/frcpa(1+228/2^8))
+data8 0xa3d0a93f45169a4b , 0x00003ffe // log(1/frcpa(1+229/2^8))
+data8 0xa44a22f7ffe65f30 , 0x00003ffe // log(1/frcpa(1+230/2^8))
+//
+data8 0xa500c5e5b4c1aa36 , 0x00003ffe // log(1/frcpa(1+231/2^8))
+data8 0xa57ad064eb2ebbc2 , 0x00003ffe // log(1/frcpa(1+232/2^8))
+data8 0xa5f5152dedf4384e , 0x00003ffe // log(1/frcpa(1+233/2^8))
+data8 0xa66f9478856233ec , 0x00003ffe // log(1/frcpa(1+234/2^8))
+data8 0xa6ea4e7cca02c32e , 0x00003ffe // log(1/frcpa(1+235/2^8))
+//
+data8 0xa765437325341ccf , 0x00003ffe // log(1/frcpa(1+236/2^8))
+data8 0xa81e21e6c75b4020 , 0x00003ffe // log(1/frcpa(1+237/2^8))
+data8 0xa899ab333fe2b9ca , 0x00003ffe // log(1/frcpa(1+238/2^8))
+data8 0xa9157039c51ebe71 , 0x00003ffe // log(1/frcpa(1+239/2^8))
+data8 0xa991713433c2b999 , 0x00003ffe // log(1/frcpa(1+240/2^8))
+//
+data8 0xaa0dae5cbcc048b3 , 0x00003ffe // log(1/frcpa(1+241/2^8))
+data8 0xaa8a27ede5eb13ad , 0x00003ffe // log(1/frcpa(1+242/2^8))
+data8 0xab06de228a9e3499 , 0x00003ffe // log(1/frcpa(1+243/2^8))
+data8 0xab83d135dc633301 , 0x00003ffe // log(1/frcpa(1+244/2^8))
+data8 0xac3fb076adc7fe7a , 0x00003ffe // log(1/frcpa(1+245/2^8))
+//
+data8 0xacbd3cbbe47988f1 , 0x00003ffe // log(1/frcpa(1+246/2^8))
+data8 0xad3b06b1a5dc57c3 , 0x00003ffe // log(1/frcpa(1+247/2^8))
+data8 0xadb90e94af887717 , 0x00003ffe // log(1/frcpa(1+248/2^8))
+data8 0xae3754a218f7c816 , 0x00003ffe // log(1/frcpa(1+249/2^8))
+data8 0xaeb5d9175437afa2 , 0x00003ffe // log(1/frcpa(1+250/2^8))
+//
+data8 0xaf349c322e9c7cee , 0x00003ffe // log(1/frcpa(1+251/2^8))
+data8 0xafb39e30d1768d1c , 0x00003ffe // log(1/frcpa(1+252/2^8))
+data8 0xb032df51c2c93116 , 0x00003ffe // log(1/frcpa(1+253/2^8))
+data8 0xb0b25fd3e6035ad9 , 0x00003ffe // log(1/frcpa(1+254/2^8))
+data8 0xb1321ff67cba178c , 0x00003ffe // log(1/frcpa(1+255/2^8))
+LOCAL_OBJECT_END(log_table_3)
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(acosh)
+
+{ .mfi
+ getf.exp acosh_GR_f8 = f8
+ fclass.m p6,p0 = f8, 0xc3 // Test for x = NaN
+ mov log_GR_comp2 = 0x1003e
+}
+{ .mfi
+ addl NR_table_address = @ltoff(log_table_1), gp
+ fms.s1 log_y = f8, f8, f1 // y = x^2-1
+ nop.i 0
+}
+;;
+
+{ .mfi
+ getf.sig acosh_GR_f8_sig = f8
+ fclass.m p11,p0 = f8, 0x21 // Test for x=+inf
+ mov log_GR_exp_17_ones = 0x1ffff
+}
+{ .mfi
+ ld8 NR_table_address = [NR_table_address]
+ fms.s1 log_w = f8,f1,f1 // w = x - 1
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fcmp.lt.s1 p7,p8 = f8, f1 // Test for x<1.0
+ addl log_GR_comp = 0x10020C,r0 // Upper 21 bits of signif of 1.0005
+}
+{ .mfb
+ mov log_GR_exp_16_ones = 0xffff //BIAS
+(p6) fma.d.s0 f8 = f8,f1,f0 // quietize nan result if x=nan
+(p6) br.ret.spnt b0 // Exit for x=nan
+}
+;;
+
+{ .mfb
+ //get second table address
+ adds log_table_address2 = 0x40, NR_table_address
+ fcmp.eq.s1 p10,p0 = f8, f1 // Test for x=+1.0
+(p11) br.ret.spnt b0 // Exit for x=+inf
+}
+;;
+
+{ .mfi
+ ldfpd NR1,NR2 = [log_table_address2],16
+ frsqrta.s1 log_y_rs,p0 = log_y // z=1/sqrt(y)
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ fma.s1 log_arg = f8,f1,f8
+(p7) br.cond.spnt ACOSH_LESS_ONE // Branch if path 7, x < 1.0
+}
+;;
+
+{ .mfi
+ ldfe log_C4 = [log_table_address2],16
+(p8) fcmp.eq.s0 p6,p0 = f8, f0 // Dummy op sets denorm flag if unorm>=1.0
+ nop.i 0
+}
+{ .mfb
+(p8) cmp.le.unc p13,p0 = log_GR_comp2,acosh_GR_f8
+ nop.f 0
+(p13) br.cond.spnt LOG_COMMON1 // Branch if path 4, x >= 2^63
+}
+;;
+
+{ .mfi
+ ldfe log_C3 = [log_table_address2],16
+(p10) fmerge.s f8 = f0, f0 // Return 0 if x=1.0
+ shr.u acosh_GR_f8_sig = acosh_GR_f8_sig,43
+}
+{ .mib
+ cmp.eq p14,p0 = log_GR_exp_16_ones,acosh_GR_f8
+ nop.i 0
+(p10) br.ret.spnt b0 // Exit for x=1.0
+}
+;;
+
+{ .mfi
+ ldfe log_C2 = [log_table_address2],16
+ frsqrta.s1 acosh_w_rs,p0 = log_w // t=1/sqrt(w)
+ nop.i 0
+}
+{ .mfb
+(p14) cmp.lt.unc p15,p0 = acosh_GR_f8_sig,log_GR_comp
+ nop.f 0
+(p15) br.cond.spnt ACOSH_NEAR_ONE // Branch if path 2, 1.0 < x < 1.0005
+}
+;;
+
+// Here is main path, 1.0005 <= x < 2^63
+/////////////// The first iteration //////////////////////////////////
+{ .mfi
+ ldfpd acosh_comp,log_P5 = [NR_table_address],16
+ fma.s1 log_y_rs_iter = log_y_rs,log_y,f0 // y*z
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfpd log_P4,log_P3 = [NR_table_address],16
+ fnma.s1 log_y_rs_iter = log_y_rs_iter,log_y_rs,NR2 // 3-(y*z)*z
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs_iter1 = log_y_rs,NR1,f0 // 0.5*z
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfpd log_P2,log_P1 = [NR_table_address],16
+ //(0.5*z)*(3-(y*z)*z)
+ fma.s1 log_y_rs_iter = log_y_rs_iter1,log_y_rs_iter,f0
+ nop.i 0
+}
+;;
+
+/////////////////////////// The second iteration /////////////////////////////
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs = log_y_rs_iter,log_y,f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 log_y_rs = log_y_rs,log_y_rs_iter,NR2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs_iter1 = log_y_rs_iter,NR1,f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ //(0.5*z)*(3-(y*z)*z)
+ fma.s1 log_y_rs_iter = log_y_rs_iter1,log_y_rs,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ //(0.5*z)*(3-(y*z)*z)
+ fma.s1 log_arg_early = log_y_rs_iter1,log_y_rs,f0
+ nop.i 0
+}
+;;
+
+//////////////////////////////////////// The third iteration /////////////////
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs = log_y_rs_iter,log_y,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs_iter1 = log_y_rs_iter,NR1,f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_arg_early = log_arg_early,log_y,f8
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 log_y_rs = log_y_rs,log_y_rs_iter,NR2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs_iter1 = log_y_rs_iter1,log_y,f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ frcpa.s1 log_C,p0 = f1,log_arg_early
+ nop.i 0
+}
+;;
+
+{ .mfi
+ getf.exp log_GR_signexp_f8 = log_arg_early
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ getf.sig log_GR_significand_f8 = log_arg_early
+ fma.s1 log_arg = log_y_rs_iter1,log_y_rs,f8 // (0.5*z)*(3-(y*z)*z)
+ adds log_table_address3 = 0x70, NR_table_address
+}
+;;
+
+///////////////////////////////// The end NR iterations /////////////////////
+{ .mfi
+ ldfe log2 = [NR_table_address],16
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mmi
+ //significant bit destruction
+ and log_GR_exp_f8 = log_GR_signexp_f8, log_GR_exp_17_ones
+;;
+ //BIAS subtraction
+ sub log_GR_true_exp_f8 = log_GR_exp_f8, log_GR_exp_16_ones
+ nop.i 0
+}
+;;
+
+{ .mfi
+ setf.sig log_int_Nfloat = log_GR_true_exp_f8
+ fms.s1 log_r = log_C,log_arg,f1 // C = frcpa(x); r = C * x - 1
+ extr.u log_GR_index = log_GR_significand_f8,55,8 //Extract 8 bits
+}
+;;
+
+{ .mmi
+ //pre-index*16 + index
+ shladd log_table_address3 = log_GR_index,4,log_table_address3
+;;
+ ldfe log_T = [log_table_address3]
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rsq = log_r, log_r, f0 //r^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p4 = log_P5, log_r, log_P4 //P5*r + P4
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p32 = log_P3, log_r, log_P2 //P3*r + P2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ //convert N to the floating-point format log_Nfloat
+ fcvt.xf log_Nfloat = log_int_Nfloat
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rcube = log_rsq, log_r, f0 //r^3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p10 = log_rsq, log_P1, log_r //P1*r^2 + r
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ //(P5*r + P4)*r^2 + P3*r + P2
+ fma.s1 log_rp_p2 = log_rp_p4, log_rsq, log_rp_p32
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_T_plus_Nlog2 = log_Nfloat,log2,log_T //N*log2 + T
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ //((P5*r + P4)*r^2 + P3*r + P2)*r^3 + P1*r^2 + r
+ fma.s1 log_r2P_r = log_rp_p2, log_rcube, log_rp_p10
+ nop.i 0
+}
+;;
+
+{ .mfb
+ nop.m 0
+ // N*log2 + T + ((P5*r + P4)*r^2 + P3*r + P2)*r^3 + P1*r^2 + r
+ fadd.d.s0 f8 = log_T_plus_Nlog2, log_r2P_r
+ br.ret.sptk b0 // Exit main path, path 3: 1.0005 <= x < 2^63
+}
+;;
+
+// Here if path 2, 1.0 < x < 1.0005
+ACOSH_NEAR_ONE:
+// The first NR iteration
+{ .mfi
+ ldfe log_C1 = [log_table_address2],16
+ fma.s1 acosh_w_iter1 = acosh_w_rs,log_w,f0 //t*w
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 acosh_w_1 = f8,log_C4,log_C3 //x*C4 + C3
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe log_C0 = [log_table_address2],16
+ fma.s1 acosh_w_iter2 = acosh_w_rs,NR1,f0 //t*0.5
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 acosh_w_iter1 = acosh_w_iter1,acosh_w_rs,NR2 //3-t*t*w
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ //(3-t*t*w)*t*0.5
+ fma.s1 acosh_w_iter2 = acosh_w_iter2,acosh_w_iter1,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 acosh_w_1 = acosh_w_1,log_w,log_C2 //(x*C4 + C3)*(x-1) + C2
+ nop.i 0
+}
+;;
+
+// The second NR iteration
+{ .mfi
+ nop.m 0
+ fma.s1 acosh_w_rs = acosh_w_iter2,log_w,f0 //t*w
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ //((x*C4 + C3)*(x-1) + C2)*(x-1) + C1
+ fma.s1 acosh_w_1 = acosh_w_1,log_w,log_C1
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 acosh_w_iter1 = acosh_w_iter2,acosh_w_rs,NR2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 acosh_w_iter2 = acosh_w_iter2,NR1,f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 acosh_w_iter2 = acosh_w_iter2,acosh_w_iter1,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ //(((x*C4 + C3)*(x-1) + C2)*(x-1) + C1)*(x-1) + C0
+ fma.s1 acosh_w_1 = acosh_w_1,log_w,log_C0
+ nop.i 0
+}
+;;
+
+//The third NR iteration
+{ .mfi
+ nop.m 0
+ fma.s1 acosh_w_rs = acosh_w_iter2,log_w,f0 //t*w
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 acosh_w_iter1 = acosh_w_iter2,acosh_w_rs,NR2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 acosh_w_iter2 = acosh_w_iter2,NR1,f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 acosh_w_iter2 = acosh_w_iter2,acosh_w_iter1,f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 acosh_w_sqrt = acosh_w_iter2,log_w,f0
+ nop.i 0
+}
+;;
+
+{ .mfb
+ nop.m 0
+ fma.d.s0 f8 = acosh_w_1,acosh_w_sqrt,f0
+ br.ret.sptk b0 // Exit path 2, 1.0 < x < 1.0005
+}
+;;
+
+// Here if path 4, x >= 2^63
+LOG_COMMON1:
+{ .mfi
+ ldfpd acosh_comp,log_P5 = [NR_table_address],16
+ frcpa.s1 log_C,p0 = f1,log_arg
+ nop.i 0
+}
+;;
+
+{ .mmi
+ getf.exp log_GR_signexp_f8 = log_arg
+ ldfpd log_P4,log_P3 = [NR_table_address],16
+ nop.i 0
+}
+;;
+
+{ .mmi
+ getf.sig log_GR_significand_f8 = log_arg
+ ldfpd log_P2,log_P1 = [NR_table_address],16
+ nop.i 0
+}
+;;
+
+{ .mfi
+ adds log_table_address3 = 0x70, NR_table_address
+ nop.f 0
+ //clear the sign bit (top bit of the sign/exponent field), keep the exponent
+ and log_GR_exp_f8 = log_GR_signexp_f8, log_GR_exp_17_ones
+}
+;;
+
+{ .mmf
+ ldfe log2 = [NR_table_address],16
+ //BIAS subtraction
+ sub log_GR_true_exp_f8 = log_GR_exp_f8, log_GR_exp_16_ones
+ fms.s1 log_r = log_C,log_arg,f1 // C = frcpa(x); r = C * x - 1
+}
+;;
+
+{ .mfi
+ setf.sig log_int_Nfloat = log_GR_true_exp_f8
+ nop.f 0
+ extr.u log_GR_index = log_GR_significand_f8,55,8 //Extract 8 bits
+}
+;;
+
+{ .mmi
+ //pre-index*16 + index
+ shladd log_table_address3 = log_GR_index,4,log_table_address3
+;;
+ ldfe log_T = [log_table_address3]
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rsq = log_r, log_r, f0 //r^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p4 = log_P5, log_r, log_P4 //P5*r + P4
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p32 = log_P3, log_r, log_P2 //P3*r + P2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rcube = log_rsq, log_r, f0 //r^3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p10 = log_rsq, log_P1, log_r //P1*r^2 + r
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ //convert N to the floating-point format log_Nfloat
+ fcvt.xf log_Nfloat = log_int_Nfloat
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ //(P5*r + P4)*r^2 + P3*r + P2
+ fma.s1 log_rp_p2 = log_rp_p4, log_rsq, log_rp_p32
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_T_plus_Nlog2 = log_Nfloat,log2,log_T //N*log2 + T
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ //((P5*r + P4)*r^2 + P3*r + P2)*r^3 + P1*r^2 + r
+ fma.s1 log_r2P_r = log_rp_p2, log_rcube, log_rp_p10
+ nop.i 0
+}
+;;
+
+{ .mfb
+ nop.m 0
+ // N*log2 + T + ((P5*r + P4)*r^2 + P3*r + P2)*r^3 + P1*r^2 + r
+ fadd.d.s0 f8 = log_T_plus_Nlog2, log_r2P_r
+ br.ret.sptk b0 // Exit path 4, x >= 2^63
+}
+;;
+
+// Here if path 7, x < 1.0
+ACOSH_LESS_ONE:
+{ .mfi
+ alloc r32 = ar.pfs,1,3,4,0
+ fmerge.s f10 = f8,f8
+ nop.i 0
+}
+;;
+
+{ .mfb
+ mov acosh_GR_tag = 136
+ frcpa.s0 f8,p0 = f0,f0
+ br.cond.sptk __libm_error_region
+}
+;;
+
+GLOBAL_LIBM_END(acosh)
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+.prologue
+// Error path: builds a 64-byte frame holding Parameter 1 (f10) at sp+16, Parameter 2 (f1) at sp+32 and the default result (f8) at sp+48, calls __libm_error_support, then reloads f8 from the result slot and returns.
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 slot: old sp-32 (= new sp+32)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs so it can be restored before returning
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Allocate the 64-byte stack frame
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp across the call
+};;
+
+{ .mmi
+ stfd [GR_Parameter_Y] = f1,16 // Store f1 as Parameter 2, then advance the pointer by 16 to the result slot
+ add GR_Parameter_X = 16,sp // Parameter 1 address: new sp+16
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save return address; br.call below overwrites b0
+};;
+
+.body
+{ .mib
+ stfd [GR_Parameter_X] = f10 // Store Parameter 1 (f10) on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 (result) address: new sp+48
+ nop.b 0
+}
+{ .mib
+ stfd [GR_Parameter_Y] = f8 // Store the default result f8 in the result slot
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Point back at the Parameter 2 slot (new sp+32)
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+
+{ .mmi
+ add GR_Parameter_RESULT = 48,sp // Recompute the result slot address after the call
+ nop.m 0
+ nop.i 0
+};;
+
+{ .mmi
+ ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Release the 64-byte frame
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return to the caller of the math function
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_acoshf.S b/sysdeps/ia64/fpu/e_acoshf.S
new file mode 100644
index 0000000000..4a54c264c1
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_acoshf.S
@@ -0,0 +1,1029 @@
+.file "acoshf.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// ==============================================================
+// History
+// ==============================================================
+// 03/28/01 Initial version
+// 04/19/01 Improved speed of the paths #1,2,3,4,5
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/06/03 Reordered header: .section, .global, .proc, .align
+// 05/14/03 Improved performance, set denormal flag for unorms >= 1.0
+//
+// API
+// ==============================================================
+// float acoshf(float)
+//
+// Overview of operation
+// ==============================================================
+//
+// There are 7 paths:
+// 1. x = 1.0
+// Return acoshf(x) = 0.0
+// 2. 1.0 < x < 1.000499725341796875(0x3FF0020C00000000)
+// Return acoshf(x) = sqrt(x-1) * Pol4(x),
+// where Pol4(x) = (x*C2 + C1)*(x-1) + C0
+//
+// 3. 1.000499725341796875(0x3FF0020C00000000) <= x < 2^51
+// Return acoshf(x) = log(x + sqrt(x^2 -1.0))
+// To compute x + sqrt(x^2 -1.0) a modified Newton-Raphson method is used
+// (2 iterations)
+// Algorithm description for log function see below.
+//
+// 4. 2^51 <= x < +INF
+// Return acoshf(x) = log(2*x)
+// Algorithm description for log function see below.
+//
+// 5. x = +INF
+// Return acoshf(x) = +INF
+//
+// 6. x = [S,Q]NaN
+// Return acoshf(x) = QNaN
+//
+// 7. x < 1.0
+// This is a domain error. The error handler is called with tag = 137
+//
+//==============================================================
+// Algorithm Description for log(x) function
+// Below we are using the fact that inequality x - 1.0 > 2^(-6) is always
+// true for this acosh implementation
+//
+// Consider x = 2^N 1.f1 f2 f3 f4...f63
+// Log(x) = log(frcpa(x) x/frcpa(x))
+// = log(1/frcpa(x)) + log(frcpa(x) x)
+// = -log(frcpa(x)) + log(frcpa(x) x)
+//
+// frcpa(x) = 2^-N frcpa(1.f1 f2 ... f63)
+//
+// -log(frcpa(x)) = -log(C)
+// = -log(2^-N) - log(frcpa(1.f1 f2 ... f63))
+//
+// -log(frcpa(x)) = -log(C)
+// = +Nlog2 - log(frcpa(1.f1 f2 ... f63))
+//
+// -log(frcpa(x)) = -log(C)
+// = +Nlog2 + log(1/frcpa(1.f1 f2 ... f63))
+//
+// Log(x) = log(1/frcpa(x)) + log(frcpa(x) x)
+//
+// Log(x) = +Nlog2 + log(1./frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x)
+// Log(x) = +Nlog2 - log(frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x)
+// Log(x) = +Nlog2 + T + log(frcpa(x) x)
+//
+// Log(x) = +Nlog2 + T + log(C x)
+//
+// Cx = 1 + r
+//
+// Log(x) = +Nlog2 + T + log(1+r)
+// Log(x) = +Nlog2 + T + Series( r - r^2/2 + r^3/3 - r^4/4 ....)
+//
+// 1.f1 f2 ... f8 has 256 entries.
+// They are 1 + k/2^8, k = 0 ... 255
+// These 256 values are the table entries.
+//
+// Implementation
+//==============================================================
+// C = frcpa(x)
+// r = C * x - 1
+//
+// Form rseries = r + P1*r^2 + P2*r^3 + P3*r^4
+//
+// x = f * 2^n where f is 1.f_1f_2f_3....f_63
+// Nfloat = float(n) where n is the true unbiased exponent
+// pre-index = f_1f_2....f_8
+// index = pre_index * 8
+// get the dxt table entry at index + offset = T
+//
+// result = (T + Nfloat * log(2)) + rseries
+//
+// The T table is calculated as follows
+// Form x_k = 1 + k/2^8 where k goes from 0... 255
+// y_k = frcpa(x_k)
+// log(1/y_k) in quad and round to double
+//
+
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input
+// f9 -> f15, f32 -> f62
+//
+// General registers used:
+// r14 -> r27, r32 -> r39
+//
+// Predicate registers used:
+// p6 -> p15
+//
+// p6 to filter out case when x = [Q,S]NaN
+// p7,p8 to filter out case when x < 1.0
+//
+// p10 to select path #1
+// p11 to filter out case when x = +INF
+// p12 used in the frcpa
+// p13 to select path #4
+// p14,p15 to select path #2
+
+// Assembly macros
+//==============================================================
+log_GR_exp_17_ones = r14 // set to 0x1ffff: mask for the 17-bit exponent field
+log_GR_signexp_f8 = r15 // getf.exp of the log argument (sign + exponent)
+log_table_address2 = r16 // NR_table_address + 0x20; post-incremented while loading NR1/NR2 and C2..C0
+log_GR_exp_16_ones = r17 // set to 0xffff: exponent bias
+log_GR_exp_f8 = r18 // biased exponent of the log argument, sign bit masked off
+log_GR_true_exp_f8 = r19 // unbiased exponent N (biased exponent minus 0xffff)
+log_GR_significand_f8 = r20 // getf.sig of the log argument
+log_GR_index = r21 // 8 bits extracted from the significand (extr.u ..,55,8); T table index
+log_GR_comp2 = r22 // set to 0x10032 (= 0xffff + 51): exponent threshold selecting path 4
+acosh_GR_f8 = r23 // getf.exp of the input x
+log_GR_comp = r24 // set to 0x10020C: upper 21 bits of the significand of 1.0005
+acosh_GR_f8_sig = r25 // getf.sig of x, shifted right by 43 before the path 2 compare
+log_table_address3 = r26 // NR_table_address + 0x40 + index*8: address of the T entry
+NR_table_address = r27 // address of log_table_1 (via @ltoff); post-incremented by the ldfpd loads
+// NOTE(review): the registers below are named for the error-handling path, whose code is not in the visible part of this file -- confirm their use there
+GR_SAVE_B0 = r33
+GR_SAVE_GP = r34
+GR_SAVE_PFS = r35
+
+GR_Parameter_X = r36
+GR_Parameter_Y = r37
+GR_Parameter_RESULT = r38
+acosh_GR_tag = r39
+
+//==============================================================
+log_y = f9 // y = x^2 - 1
+NR1 = f10 // first double loaded from log_table_address2; used as the 0.5 factor in the NR steps
+NR2 = f11 // second double loaded from log_table_address2; used as the 3.0 term in the NR steps
+log_y_rs = f12 // z ~ 1/sqrt(y) from frsqrta, then reused as an NR temporary
+log_y_rs_iter = f13 // NR temporary (y*z, then the refined z)
+log_y_rs_iter1 = f14 // NR temporary (0.5*z and later products)
+log_NORM_f8 = f15
+log_w = f32 // w = x - 1
+acosh_comp = f34
+acosh_comp2 = f33
+log_P3 = f35 // polynomial coefficient P3 (r^4 term of rseries)
+log_P2 = f36 // polynomial coefficient P2 (r^3 term of rseries)
+log_P1 = f37 // polynomial coefficient P1 (r^2 term of rseries)
+log2 = f38 // log(2) constant multiplied by N
+log_C0 = f39 // near-one polynomial coefficient C0
+log_C1 = f40 // near-one polynomial coefficient C1
+log_C2 = f41 // near-one polynomial coefficient C2
+acosh_w_rs = f42 // t ~ 1/sqrt(w) from frsqrta, then t*w in later iterations
+log_C = f43 // C = frcpa of the (early) log argument
+log_arg = f44 // argument of log: x + x on path 4, x + refined sqrt term on path 3
+acosh_w_iter1 = f45 // path 2 NR temporary (t*w, then 3 - t*t*w)
+acosh_w_iter2 = f46 // path 2 NR temporary (0.5*t, then the refined t)
+log_int_Nfloat = f47 // exponent N moved into an FP register as an integer (setf.sig)
+log_r = f48 // r = C * log_arg - 1
+log_rsq = f49 // r^2
+log_rp_p4 = f50
+log_rp_p32 = f51 // P3*r + P2
+log_rcube = f52
+log_rp_p10 = f53 // P1*r + 1.0
+log_rp_p2 = f54 // (P3*r + P2)*r^2 + P1*r + 1.0
+log_Nfloat = f55 // N converted to floating point (fcvt.xf)
+log_T = f56 // table value T loaded from log_table_address3
+log_r2P_r = f57
+log_T_plus_Nlog2 = f58 // N*log2 + T
+acosh_w_sqrt = f59 // refined t times w, i.e. the sqrt(x-1) factor of path 2
+acosh_w_1 = f60 // path 2 polynomial accumulator: (x*C2 + C1)*(x-1) + C0
+log_arg_early = f61 // early estimate of the log argument; feeds frcpa, getf.exp and getf.sig
+log_y_rs_iter2 = f62 // 3 - (y*z)*z
+
+
+// Data tables
+//==============================================================
+
+RODATA
+.align 16
+
+LOCAL_OBJECT_START(log_table_1)
+data8 0xbfd0001008f39d59 // p3: r^4 coefficient of rseries (close to -1/4)
+data8 0x3fd5556073e0c45a // p2: r^3 coefficient of rseries (close to +1/3)
+data8 0xbfdffffffffaea15 // p1: r^2 coefficient of rseries (close to -1/2)
+data8 0x3FE62E42FEFA39EF // log2: log(2) as a double, multiplied by the exponent N
+LOCAL_OBJECT_END(log_table_1)
+
+LOCAL_OBJECT_START(log_table_2)
+// Two doubles (Newton-Raphson constants), then three 16-byte entries read with ldfe; the code reaches this table as NR_table_address + 0x20 -- assumes it directly follows log_table_1
+data8 0x3FE0000000000000 // 0.5 (loaded into NR1)
+data8 0x4008000000000000 // 3.0 (loaded into NR2)
+data8 0xD92CBAD213719F11, 0x00003FF9 // C2 3FF9D92CBAD213719F11 (significand, then sign/exponent)
+data8 0x93D38EBF2EC9B073, 0x0000BFFC // C1 BFFC93D38EBF2EC9B073 (negative coefficient)
+data8 0xB504F333F9DA0E32, 0x00003FFF // C0 3FFFB504F333F9DA0E32 (close to sqrt(2))
+LOCAL_OBJECT_END(log_table_2)
+
+LOCAL_OBJECT_START(log_table_3)
+data8 0x3F60040155D5889E //log(1/frcpa(1+ 0/256)
+data8 0x3F78121214586B54 //log(1/frcpa(1+ 1/256)
+data8 0x3F841929F96832F0 //log(1/frcpa(1+ 2/256)
+data8 0x3F8C317384C75F06 //log(1/frcpa(1+ 3/256)
+data8 0x3F91A6B91AC73386 //log(1/frcpa(1+ 4/256)
+data8 0x3F95BA9A5D9AC039 //log(1/frcpa(1+ 5/256)
+data8 0x3F99D2A8074325F4 //log(1/frcpa(1+ 6/256)
+data8 0x3F9D6B2725979802 //log(1/frcpa(1+ 7/256)
+data8 0x3FA0C58FA19DFAAA //log(1/frcpa(1+ 8/256)
+data8 0x3FA2954C78CBCE1B //log(1/frcpa(1+ 9/256)
+data8 0x3FA4A94D2DA96C56 //log(1/frcpa(1+ 10/256)
+data8 0x3FA67C94F2D4BB58 //log(1/frcpa(1+ 11/256)
+data8 0x3FA85188B630F068 //log(1/frcpa(1+ 12/256)
+data8 0x3FAA6B8ABE73AF4C //log(1/frcpa(1+ 13/256)
+data8 0x3FAC441E06F72A9E //log(1/frcpa(1+ 14/256)
+data8 0x3FAE1E6713606D07 //log(1/frcpa(1+ 15/256)
+data8 0x3FAFFA6911AB9301 //log(1/frcpa(1+ 16/256)
+data8 0x3FB0EC139C5DA601 //log(1/frcpa(1+ 17/256)
+data8 0x3FB1DBD2643D190B //log(1/frcpa(1+ 18/256)
+data8 0x3FB2CC7284FE5F1C //log(1/frcpa(1+ 19/256)
+data8 0x3FB3BDF5A7D1EE64 //log(1/frcpa(1+ 20/256)
+data8 0x3FB4B05D7AA012E0 //log(1/frcpa(1+ 21/256)
+data8 0x3FB580DB7CEB5702 //log(1/frcpa(1+ 22/256)
+data8 0x3FB674F089365A7A //log(1/frcpa(1+ 23/256)
+data8 0x3FB769EF2C6B568D //log(1/frcpa(1+ 24/256)
+data8 0x3FB85FD927506A48 //log(1/frcpa(1+ 25/256)
+data8 0x3FB9335E5D594989 //log(1/frcpa(1+ 26/256)
+data8 0x3FBA2B0220C8E5F5 //log(1/frcpa(1+ 27/256)
+data8 0x3FBB0004AC1A86AC //log(1/frcpa(1+ 28/256)
+data8 0x3FBBF968769FCA11 //log(1/frcpa(1+ 29/256)
+data8 0x3FBCCFEDBFEE13A8 //log(1/frcpa(1+ 30/256)
+data8 0x3FBDA727638446A2 //log(1/frcpa(1+ 31/256)
+data8 0x3FBEA3257FE10F7A //log(1/frcpa(1+ 32/256)
+data8 0x3FBF7BE9FEDBFDE6 //log(1/frcpa(1+ 33/256)
+data8 0x3FC02AB352FF25F4 //log(1/frcpa(1+ 34/256)
+data8 0x3FC097CE579D204D //log(1/frcpa(1+ 35/256)
+data8 0x3FC1178E8227E47C //log(1/frcpa(1+ 36/256)
+data8 0x3FC185747DBECF34 //log(1/frcpa(1+ 37/256)
+data8 0x3FC1F3B925F25D41 //log(1/frcpa(1+ 38/256)
+data8 0x3FC2625D1E6DDF57 //log(1/frcpa(1+ 39/256)
+data8 0x3FC2D1610C86813A //log(1/frcpa(1+ 40/256)
+data8 0x3FC340C59741142E //log(1/frcpa(1+ 41/256)
+data8 0x3FC3B08B6757F2A9 //log(1/frcpa(1+ 42/256)
+data8 0x3FC40DFB08378003 //log(1/frcpa(1+ 43/256)
+data8 0x3FC47E74E8CA5F7C //log(1/frcpa(1+ 44/256)
+data8 0x3FC4EF51F6466DE4 //log(1/frcpa(1+ 45/256)
+data8 0x3FC56092E02BA516 //log(1/frcpa(1+ 46/256)
+data8 0x3FC5D23857CD74D5 //log(1/frcpa(1+ 47/256)
+data8 0x3FC6313A37335D76 //log(1/frcpa(1+ 48/256)
+data8 0x3FC6A399DABBD383 //log(1/frcpa(1+ 49/256)
+data8 0x3FC70337DD3CE41B //log(1/frcpa(1+ 50/256)
+data8 0x3FC77654128F6127 //log(1/frcpa(1+ 51/256)
+data8 0x3FC7E9D82A0B022D //log(1/frcpa(1+ 52/256)
+data8 0x3FC84A6B759F512F //log(1/frcpa(1+ 53/256)
+data8 0x3FC8AB47D5F5A310 //log(1/frcpa(1+ 54/256)
+data8 0x3FC91FE49096581B //log(1/frcpa(1+ 55/256)
+data8 0x3FC981634011AA75 //log(1/frcpa(1+ 56/256)
+data8 0x3FC9F6C407089664 //log(1/frcpa(1+ 57/256)
+data8 0x3FCA58E729348F43 //log(1/frcpa(1+ 58/256)
+data8 0x3FCABB55C31693AD //log(1/frcpa(1+ 59/256)
+data8 0x3FCB1E104919EFD0 //log(1/frcpa(1+ 60/256)
+data8 0x3FCB94EE93E367CB //log(1/frcpa(1+ 61/256)
+data8 0x3FCBF851C067555F //log(1/frcpa(1+ 62/256)
+data8 0x3FCC5C0254BF23A6 //log(1/frcpa(1+ 63/256)
+data8 0x3FCCC000C9DB3C52 //log(1/frcpa(1+ 64/256)
+data8 0x3FCD244D99C85674 //log(1/frcpa(1+ 65/256)
+data8 0x3FCD88E93FB2F450 //log(1/frcpa(1+ 66/256)
+data8 0x3FCDEDD437EAEF01 //log(1/frcpa(1+ 67/256)
+data8 0x3FCE530EFFE71012 //log(1/frcpa(1+ 68/256)
+data8 0x3FCEB89A1648B971 //log(1/frcpa(1+ 69/256)
+data8 0x3FCF1E75FADF9BDE //log(1/frcpa(1+ 70/256)
+data8 0x3FCF84A32EAD7C35 //log(1/frcpa(1+ 71/256)
+data8 0x3FCFEB2233EA07CD //log(1/frcpa(1+ 72/256)
+data8 0x3FD028F9C7035C1C //log(1/frcpa(1+ 73/256)
+data8 0x3FD05C8BE0D9635A //log(1/frcpa(1+ 74/256)
+data8 0x3FD085EB8F8AE797 //log(1/frcpa(1+ 75/256)
+data8 0x3FD0B9C8E32D1911 //log(1/frcpa(1+ 76/256)
+data8 0x3FD0EDD060B78081 //log(1/frcpa(1+ 77/256)
+data8 0x3FD122024CF0063F //log(1/frcpa(1+ 78/256)
+data8 0x3FD14BE2927AECD4 //log(1/frcpa(1+ 79/256)
+data8 0x3FD180618EF18ADF //log(1/frcpa(1+ 80/256)
+data8 0x3FD1B50BBE2FC63B //log(1/frcpa(1+ 81/256)
+data8 0x3FD1DF4CC7CF242D //log(1/frcpa(1+ 82/256)
+data8 0x3FD214456D0EB8D4 //log(1/frcpa(1+ 83/256)
+data8 0x3FD23EC5991EBA49 //log(1/frcpa(1+ 84/256)
+data8 0x3FD2740D9F870AFB //log(1/frcpa(1+ 85/256)
+data8 0x3FD29ECDABCDFA04 //log(1/frcpa(1+ 86/256)
+data8 0x3FD2D46602ADCCEE //log(1/frcpa(1+ 87/256)
+data8 0x3FD2FF66B04EA9D4 //log(1/frcpa(1+ 88/256)
+data8 0x3FD335504B355A37 //log(1/frcpa(1+ 89/256)
+data8 0x3FD360925EC44F5D //log(1/frcpa(1+ 90/256)
+data8 0x3FD38BF1C3337E75 //log(1/frcpa(1+ 91/256)
+data8 0x3FD3C25277333184 //log(1/frcpa(1+ 92/256)
+data8 0x3FD3EDF463C1683E //log(1/frcpa(1+ 93/256)
+data8 0x3FD419B423D5E8C7 //log(1/frcpa(1+ 94/256)
+data8 0x3FD44591E0539F49 //log(1/frcpa(1+ 95/256)
+data8 0x3FD47C9175B6F0AD //log(1/frcpa(1+ 96/256)
+data8 0x3FD4A8B341552B09 //log(1/frcpa(1+ 97/256)
+data8 0x3FD4D4F3908901A0 //log(1/frcpa(1+ 98/256)
+data8 0x3FD501528DA1F968 //log(1/frcpa(1+ 99/256)
+data8 0x3FD52DD06347D4F6 //log(1/frcpa(1+ 100/256)
+data8 0x3FD55A6D3C7B8A8A //log(1/frcpa(1+ 101/256)
+data8 0x3FD5925D2B112A59 //log(1/frcpa(1+ 102/256)
+data8 0x3FD5BF406B543DB2 //log(1/frcpa(1+ 103/256)
+data8 0x3FD5EC433D5C35AE //log(1/frcpa(1+ 104/256)
+data8 0x3FD61965CDB02C1F //log(1/frcpa(1+ 105/256)
+data8 0x3FD646A84935B2A2 //log(1/frcpa(1+ 106/256)
+data8 0x3FD6740ADD31DE94 //log(1/frcpa(1+ 107/256)
+data8 0x3FD6A18DB74A58C5 //log(1/frcpa(1+ 108/256)
+data8 0x3FD6CF31058670EC //log(1/frcpa(1+ 109/256)
+data8 0x3FD6F180E852F0BA //log(1/frcpa(1+ 110/256)
+data8 0x3FD71F5D71B894F0 //log(1/frcpa(1+ 111/256)
+data8 0x3FD74D5AEFD66D5C //log(1/frcpa(1+ 112/256)
+data8 0x3FD77B79922BD37E //log(1/frcpa(1+ 113/256)
+data8 0x3FD7A9B9889F19E2 //log(1/frcpa(1+ 114/256)
+data8 0x3FD7D81B037EB6A6 //log(1/frcpa(1+ 115/256)
+data8 0x3FD8069E33827231 //log(1/frcpa(1+ 116/256)
+data8 0x3FD82996D3EF8BCB //log(1/frcpa(1+ 117/256)
+data8 0x3FD85855776DCBFB //log(1/frcpa(1+ 118/256)
+data8 0x3FD8873658327CCF //log(1/frcpa(1+ 119/256)
+data8 0x3FD8AA75973AB8CF //log(1/frcpa(1+ 120/256)
+data8 0x3FD8D992DC8824E5 //log(1/frcpa(1+ 121/256)
+data8 0x3FD908D2EA7D9512 //log(1/frcpa(1+ 122/256)
+data8 0x3FD92C59E79C0E56 //log(1/frcpa(1+ 123/256)
+data8 0x3FD95BD750EE3ED3 //log(1/frcpa(1+ 124/256)
+data8 0x3FD98B7811A3EE5B //log(1/frcpa(1+ 125/256)
+data8 0x3FD9AF47F33D406C //log(1/frcpa(1+ 126/256)
+data8 0x3FD9DF270C1914A8 //log(1/frcpa(1+ 127/256)
+data8 0x3FDA0325ED14FDA4 //log(1/frcpa(1+ 128/256)
+data8 0x3FDA33440224FA79 //log(1/frcpa(1+ 129/256)
+data8 0x3FDA57725E80C383 //log(1/frcpa(1+ 130/256)
+data8 0x3FDA87D0165DD199 //log(1/frcpa(1+ 131/256)
+data8 0x3FDAAC2E6C03F896 //log(1/frcpa(1+ 132/256)
+data8 0x3FDADCCC6FDF6A81 //log(1/frcpa(1+ 133/256)
+data8 0x3FDB015B3EB1E790 //log(1/frcpa(1+ 134/256)
+data8 0x3FDB323A3A635948 //log(1/frcpa(1+ 135/256)
+data8 0x3FDB56FA04462909 //log(1/frcpa(1+ 136/256)
+data8 0x3FDB881AA659BC93 //log(1/frcpa(1+ 137/256)
+data8 0x3FDBAD0BEF3DB165 //log(1/frcpa(1+ 138/256)
+data8 0x3FDBD21297781C2F //log(1/frcpa(1+ 139/256)
+data8 0x3FDC039236F08819 //log(1/frcpa(1+ 140/256)
+data8 0x3FDC28CB1E4D32FD //log(1/frcpa(1+ 141/256)
+data8 0x3FDC4E19B84723C2 //log(1/frcpa(1+ 142/256)
+data8 0x3FDC7FF9C74554C9 //log(1/frcpa(1+ 143/256)
+data8 0x3FDCA57B64E9DB05 //log(1/frcpa(1+ 144/256)
+data8 0x3FDCCB130A5CEBB0 //log(1/frcpa(1+ 145/256)
+data8 0x3FDCF0C0D18F326F //log(1/frcpa(1+ 146/256)
+data8 0x3FDD232075B5A201 //log(1/frcpa(1+ 147/256)
+data8 0x3FDD490246DEFA6B //log(1/frcpa(1+ 148/256)
+data8 0x3FDD6EFA918D25CD //log(1/frcpa(1+ 149/256)
+data8 0x3FDD9509707AE52F //log(1/frcpa(1+ 150/256)
+data8 0x3FDDBB2EFE92C554 //log(1/frcpa(1+ 151/256)
+data8 0x3FDDEE2F3445E4AF //log(1/frcpa(1+ 152/256)
+data8 0x3FDE148A1A2726CE //log(1/frcpa(1+ 153/256)
+data8 0x3FDE3AFC0A49FF40 //log(1/frcpa(1+ 154/256)
+data8 0x3FDE6185206D516E //log(1/frcpa(1+ 155/256)
+data8 0x3FDE882578823D52 //log(1/frcpa(1+ 156/256)
+data8 0x3FDEAEDD2EAC990C //log(1/frcpa(1+ 157/256)
+data8 0x3FDED5AC5F436BE3 //log(1/frcpa(1+ 158/256)
+data8 0x3FDEFC9326D16AB9 //log(1/frcpa(1+ 159/256)
+data8 0x3FDF2391A2157600 //log(1/frcpa(1+ 160/256)
+data8 0x3FDF4AA7EE03192D //log(1/frcpa(1+ 161/256)
+data8 0x3FDF71D627C30BB0 //log(1/frcpa(1+ 162/256)
+data8 0x3FDF991C6CB3B379 //log(1/frcpa(1+ 163/256)
+data8 0x3FDFC07ADA69A910 //log(1/frcpa(1+ 164/256)
+data8 0x3FDFE7F18EB03D3E //log(1/frcpa(1+ 165/256)
+data8 0x3FE007C053C5002E //log(1/frcpa(1+ 166/256)
+data8 0x3FE01B942198A5A1 //log(1/frcpa(1+ 167/256)
+data8 0x3FE02F74400C64EB //log(1/frcpa(1+ 168/256)
+data8 0x3FE04360BE7603AD //log(1/frcpa(1+ 169/256)
+data8 0x3FE05759AC47FE34 //log(1/frcpa(1+ 170/256)
+data8 0x3FE06B5F1911CF52 //log(1/frcpa(1+ 171/256)
+data8 0x3FE078BF0533C568 //log(1/frcpa(1+ 172/256)
+data8 0x3FE08CD9687E7B0E //log(1/frcpa(1+ 173/256)
+data8 0x3FE0A10074CF9019 //log(1/frcpa(1+ 174/256)
+data8 0x3FE0B5343A234477 //log(1/frcpa(1+ 175/256)
+data8 0x3FE0C974C89431CE //log(1/frcpa(1+ 176/256)
+data8 0x3FE0DDC2305B9886 //log(1/frcpa(1+ 177/256)
+data8 0x3FE0EB524BAFC918 //log(1/frcpa(1+ 178/256)
+data8 0x3FE0FFB54213A476 //log(1/frcpa(1+ 179/256)
+data8 0x3FE114253DA97D9F //log(1/frcpa(1+ 180/256)
+data8 0x3FE128A24F1D9AFF //log(1/frcpa(1+ 181/256)
+data8 0x3FE1365252BF0865 //log(1/frcpa(1+ 182/256)
+data8 0x3FE14AE558B4A92D //log(1/frcpa(1+ 183/256)
+data8 0x3FE15F85A19C765B //log(1/frcpa(1+ 184/256)
+data8 0x3FE16D4D38C119FA //log(1/frcpa(1+ 185/256)
+data8 0x3FE18203C20DD133 //log(1/frcpa(1+ 186/256)
+data8 0x3FE196C7BC4B1F3B //log(1/frcpa(1+ 187/256)
+data8 0x3FE1A4A738B7A33C //log(1/frcpa(1+ 188/256)
+data8 0x3FE1B981C0C9653D //log(1/frcpa(1+ 189/256)
+data8 0x3FE1CE69E8BB106B //log(1/frcpa(1+ 190/256)
+data8 0x3FE1DC619DE06944 //log(1/frcpa(1+ 191/256)
+data8 0x3FE1F160A2AD0DA4 //log(1/frcpa(1+ 192/256)
+data8 0x3FE2066D7740737E //log(1/frcpa(1+ 193/256)
+data8 0x3FE2147DBA47A394 //log(1/frcpa(1+ 194/256)
+data8 0x3FE229A1BC5EBAC3 //log(1/frcpa(1+ 195/256)
+data8 0x3FE237C1841A502E //log(1/frcpa(1+ 196/256)
+data8 0x3FE24CFCE6F80D9A //log(1/frcpa(1+ 197/256)
+data8 0x3FE25B2C55CD5762 //log(1/frcpa(1+ 198/256)
+data8 0x3FE2707F4D5F7C41 //log(1/frcpa(1+ 199/256)
+data8 0x3FE285E0842CA384 //log(1/frcpa(1+ 200/256)
+data8 0x3FE294294708B773 //log(1/frcpa(1+ 201/256)
+data8 0x3FE2A9A2670AFF0C //log(1/frcpa(1+ 202/256)
+data8 0x3FE2B7FB2C8D1CC1 //log(1/frcpa(1+ 203/256)
+data8 0x3FE2C65A6395F5F5 //log(1/frcpa(1+ 204/256)
+data8 0x3FE2DBF557B0DF43 //log(1/frcpa(1+ 205/256)
+data8 0x3FE2EA64C3F97655 //log(1/frcpa(1+ 206/256)
+data8 0x3FE3001823684D73 //log(1/frcpa(1+ 207/256)
+data8 0x3FE30E97E9A8B5CD //log(1/frcpa(1+ 208/256)
+data8 0x3FE32463EBDD34EA //log(1/frcpa(1+ 209/256)
+data8 0x3FE332F4314AD796 //log(1/frcpa(1+ 210/256)
+data8 0x3FE348D90E7464D0 //log(1/frcpa(1+ 211/256)
+data8 0x3FE35779F8C43D6E //log(1/frcpa(1+ 212/256)
+data8 0x3FE36621961A6A99 //log(1/frcpa(1+ 213/256)
+data8 0x3FE37C299F3C366A //log(1/frcpa(1+ 214/256)
+data8 0x3FE38AE2171976E7 //log(1/frcpa(1+ 215/256)
+data8 0x3FE399A157A603E7 //log(1/frcpa(1+ 216/256)
+data8 0x3FE3AFCCFE77B9D1 //log(1/frcpa(1+ 217/256)
+data8 0x3FE3BE9D503533B5 //log(1/frcpa(1+ 218/256)
+data8 0x3FE3CD7480B4A8A3 //log(1/frcpa(1+ 219/256)
+data8 0x3FE3E3C43918F76C //log(1/frcpa(1+ 220/256)
+data8 0x3FE3F2ACB27ED6C7 //log(1/frcpa(1+ 221/256)
+data8 0x3FE4019C2125CA93 //log(1/frcpa(1+ 222/256)
+data8 0x3FE4181061389722 //log(1/frcpa(1+ 223/256)
+data8 0x3FE42711518DF545 //log(1/frcpa(1+ 224/256)
+data8 0x3FE436194E12B6BF //log(1/frcpa(1+ 225/256)
+data8 0x3FE445285D68EA69 //log(1/frcpa(1+ 226/256)
+data8 0x3FE45BCC464C893A //log(1/frcpa(1+ 227/256)
+data8 0x3FE46AED21F117FC //log(1/frcpa(1+ 228/256)
+data8 0x3FE47A1527E8A2D3 //log(1/frcpa(1+ 229/256)
+data8 0x3FE489445EFFFCCC //log(1/frcpa(1+ 230/256)
+data8 0x3FE4A018BCB69835 //log(1/frcpa(1+ 231/256)
+data8 0x3FE4AF5A0C9D65D7 //log(1/frcpa(1+ 232/256)
+data8 0x3FE4BEA2A5BDBE87 //log(1/frcpa(1+ 233/256)
+data8 0x3FE4CDF28F10AC46 //log(1/frcpa(1+ 234/256)
+data8 0x3FE4DD49CF994058 //log(1/frcpa(1+ 235/256)
+data8 0x3FE4ECA86E64A684 //log(1/frcpa(1+ 236/256)
+data8 0x3FE503C43CD8EB68 //log(1/frcpa(1+ 237/256)
+data8 0x3FE513356667FC57 //log(1/frcpa(1+ 238/256)
+data8 0x3FE522AE0738A3D8 //log(1/frcpa(1+ 239/256)
+data8 0x3FE5322E26867857 //log(1/frcpa(1+ 240/256)
+data8 0x3FE541B5CB979809 //log(1/frcpa(1+ 241/256)
+data8 0x3FE55144FDBCBD62 //log(1/frcpa(1+ 242/256)
+data8 0x3FE560DBC45153C7 //log(1/frcpa(1+ 243/256)
+data8 0x3FE5707A26BB8C66 //log(1/frcpa(1+ 244/256)
+data8 0x3FE587F60ED5B900 //log(1/frcpa(1+ 245/256)
+data8 0x3FE597A7977C8F31 //log(1/frcpa(1+ 246/256)
+data8 0x3FE5A760D634BB8B //log(1/frcpa(1+ 247/256)
+data8 0x3FE5B721D295F10F //log(1/frcpa(1+ 248/256)
+data8 0x3FE5C6EA94431EF9 //log(1/frcpa(1+ 249/256)
+data8 0x3FE5D6BB22EA86F6 //log(1/frcpa(1+ 250/256)
+data8 0x3FE5E6938645D390 //log(1/frcpa(1+ 251/256)
+data8 0x3FE5F673C61A2ED2 //log(1/frcpa(1+ 252/256)
+data8 0x3FE6065BEA385926 //log(1/frcpa(1+ 253/256)
+data8 0x3FE6164BFA7CC06B //log(1/frcpa(1+ 254/256)
+data8 0x3FE62643FECF9743 //log(1/frcpa(1+ 255/256)
+LOCAL_OBJECT_END(log_table_3)
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(acoshf)
+
+{ .mfi
+ getf.exp acosh_GR_f8 = f8
+ fclass.m p6,p0 = f8, 0xc3 // Test for x = NaN
+ mov log_GR_comp2 = 0x10032
+}
+{ .mfi
+ addl NR_table_address = @ltoff(log_table_1), gp
+ fms.s1 log_y = f8, f8, f1 // y = x^2-1
+ nop.i 0
+}
+;;
+
+{ .mfi
+ getf.sig acosh_GR_f8_sig = f8
+ fclass.m p11,p0 = f8, 0x21 // Test for x=+inf
+ mov log_GR_exp_17_ones = 0x1ffff
+}
+{ .mfi
+ ld8 NR_table_address = [NR_table_address]
+ fms.s1 log_w = f8,f1,f1 // w = x - 1
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fcmp.lt.s1 p7,p8 = f8, f1 // Test for x<1.0
+ addl log_GR_comp = 0x10020C,r0 // Upper 21 bits of signif of 1.0005
+}
+{ .mfb
+ mov log_GR_exp_16_ones = 0xffff //BIAS
+(p6) fma.s.s0 f8 = f8,f1,f0 // quietize nan result if x=nan
+(p6) br.ret.spnt b0 // Exit for x=nan
+}
+;;
+
+{ .mfb
+ //get second table address
+ adds log_table_address2 = 0x20, NR_table_address
+ fcmp.eq.s1 p10,p0 = f8, f1 // Test for x=+1.0
+(p11) br.ret.spnt b0 // Exit for x=+inf
+}
+;;
+
+{ .mfi
+ ldfpd NR1,NR2 = [log_table_address2],16
+ frsqrta.s1 log_y_rs,p0 = log_y // z=1/sqrt(y)
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ fma.s1 log_arg = f8,f1,f8
+(p7) br.cond.spnt ACOSH_LESS_ONE // Branch if path 7, x < 1.0
+}
+;;
+
+{ .mfi
+ ldfe log_C2 = [log_table_address2],16
+(p8) fcmp.eq.s0 p6,p0 = f8, f0 // Dummy op sets denorm flag if unorm>=1.0
+ nop.i 0
+}
+{ .mfb
+(p8) cmp.le.unc p13,p0 = log_GR_comp2,acosh_GR_f8
+ nop.f 0
+(p13) br.cond.spnt LOG_COMMON1 // Branch if path 4, x >= 2^51
+}
+;;
+
+{ .mfi
+ ldfe log_C1 = [log_table_address2],16
+(p10) fmerge.s f8 = f0, f0 // Return 0 if x=1.0
+ shr.u acosh_GR_f8_sig = acosh_GR_f8_sig,43
+}
+{ .mib
+ cmp.eq p14,p0 = log_GR_exp_16_ones,acosh_GR_f8
+ nop.i 0
+(p10) br.ret.spnt b0 // Exit for x=1.0
+}
+;;
+
+{ .mfi
+ ldfe log_C0 = [log_table_address2],16
+ frsqrta.s1 acosh_w_rs,p0 = log_w // t=1/sqrt(w)
+ nop.i 0
+}
+{ .mfb
+(p14) cmp.lt.unc p15,p0 = acosh_GR_f8_sig,log_GR_comp
+ nop.f 0
+(p15) br.cond.spnt ACOSH_NEAR_ONE // Branch if path 2, 1.0 < x < 1.0005
+}
+;;
+
+// Here is main path, 1.0005 <= x < 2^51
+/////////////// The first iteration //////////////////////////////////
+{ .mfi
+ ldfpd log_P3,log_P2 = [NR_table_address],16
+ fma.s1 log_y_rs_iter = log_y_rs,log_y,f0 // y*z
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfpd log_P1,log2 = [NR_table_address],16
+ fnma.s1 log_y_rs_iter2 = log_y_rs_iter,log_y_rs,NR2 // 3-(y*z)*z
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs_iter1 = log_y_rs,NR1,f0 // 0.5*z
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // (0.5*z)*(3-(y*z)*z)
+ fma.s1 log_y_rs_iter = log_y_rs_iter1,log_y_rs_iter2,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // (0.5*z)*(3-(y*z)*z)
+ fma.s1 log_arg_early = log_y_rs_iter1,log_y_rs_iter2,f0
+ nop.i 0
+}
+;;
+
+/////////////////////////// The second iteration /////////////////////////////
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs = log_y_rs_iter,log_y,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs_iter1 = log_y_rs_iter,NR1,f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_arg_early = log_arg_early,log_y,f8
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 log_y_rs = log_y_rs,log_y_rs_iter,NR2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs_iter1 = log_y_rs_iter1,log_y,f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ frcpa.s1 log_C,p0 = f1,log_arg_early
+ nop.i 0
+}
+;;
+
+{ .mfi
+ getf.exp log_GR_signexp_f8 = log_arg_early
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ getf.sig log_GR_significand_f8 = log_arg_early
+ fma.s1 log_arg = log_y_rs_iter1,log_y_rs,f8 // (0.5*z)*(3-(y*z)*z)
+ adds log_table_address3 = 0x40, NR_table_address
+}
+;;
+
+///////////////////////////////// The end NR iterations /////////////////////
+
+{ .mmi
+ //clear the sign bit (top bit of the sign/exponent field), keep the exponent
+ and log_GR_exp_f8 = log_GR_signexp_f8, log_GR_exp_17_ones
+;;
+ //BIAS subtraction
+ sub log_GR_true_exp_f8 = log_GR_exp_f8, log_GR_exp_16_ones
+ nop.i 0
+}
+;;
+
+{ .mfi
+ setf.sig log_int_Nfloat = log_GR_true_exp_f8
+ fms.s1 log_r = log_C,log_arg,f1 // C = frcpa(x); r = C * x - 1
+ extr.u log_GR_index = log_GR_significand_f8,55,8 //Extract 8 bits
+}
+;;
+
+{ .mmi
+ //pre-index*8 + index
+ shladd log_table_address3 = log_GR_index,3,log_table_address3
+;;
+ ldfd log_T = [log_table_address3]
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rsq = log_r, log_r, f0 //r^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p32 = log_P3, log_r, log_P2 //P3*r + P2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p10 = log_P1, log_r, f1 //P1*r + 1.0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ //convert N to the floating-point format log_Nfloat
+ fcvt.xf log_Nfloat = log_int_Nfloat
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ //(P3*r + P2)*r^2 + P1*r + 1.0
+ fma.s1 log_rp_p2 = log_rp_p32, log_rsq, log_rp_p10
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_T_plus_Nlog2 = log_Nfloat,log2,log_T //N*log2 + T
+ nop.i 0
+}
+;;
+
+{ .mfb
+ nop.m 0
+ fma.s.s0 f8 = log_rp_p2,log_r,log_T_plus_Nlog2
+ br.ret.sptk b0 // Exit main path, path 3: 1.0005 <= x < 2^51
+}
+;;
+
+// Here if path 2, 1.0 < x < 1.0005: f8 = ((x*C2 + C1)*w + C0) * sqrt(w), w = log_w
+ACOSH_NEAR_ONE:
+// The first NR iteration: t1 = (t*0.5)*(3 - t*t*w), t = acosh_w_rs (set before this label)
+{ .mfi
+ nop.m 0
+ fma.s1 acosh_w_iter1 = acosh_w_rs,log_w,f0 //t*w
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 acosh_w_1 = f8,log_C2,log_C1 //x*C2 + C1
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 acosh_w_iter2 = acosh_w_rs,NR1,f0 //t*0.5
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 acosh_w_iter1 = acosh_w_iter1,acosh_w_rs,NR2 //3-t*t*w
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ //t1 = (3-t*t*w)*t*0.5
+ fma.s1 acosh_w_iter2 = acosh_w_iter2,acosh_w_iter1,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 acosh_w_1 = acosh_w_1,log_w,log_C0 //(x*C2 + C1)*(x-1) + C0, i.e. log_w is taken as x-1
+ nop.i 0
+}
+;;
+
+// The second NR iteration: t2 = (t1*0.5)*(3 - t1*t1*w)
+{ .mfi
+ nop.m 0
+ fma.s1 acosh_w_rs = acosh_w_iter2,log_w,f0 //t1*w
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 acosh_w_iter1 = acosh_w_iter2,acosh_w_rs,NR2 //3-t1*t1*w (reads t1 before it is overwritten below)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 acosh_w_iter2 = acosh_w_iter2,NR1,f0 //t1*0.5
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 acosh_w_iter2 = acosh_w_iter2,acosh_w_iter1,f0 //t2 = (3-t1*t1*w)*t1*0.5
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 acosh_w_sqrt = acosh_w_iter2,log_w,f0 //sqrt(w) ~= t2*w
+ nop.i 0
+}
+;;
+
+{ .mfb
+ nop.m 0
+ fma.s.s0 f8 = acosh_w_1,acosh_w_sqrt,f0 //polynomial * sqrt(w), single precision result
+ br.ret.sptk b0 // Exit path 2, 1.0 < x < 1.0005
+}
+;;
+
+// Here if path 4, x >= 2^51: f8 = N*log2 + T + r*(1 + P1*r + (P3*r + P2)*r^2) for log_arg
+LOG_COMMON1:
+{ .mfi
+ ldfpd log_P3,log_P2 = [NR_table_address],16 // load P3,P2; table pointer += 16
+ frcpa.s1 log_C,p0 = f1,log_arg // C ~= 1/log_arg
+ nop.i 0
+}
+;;
+
+{ .mmi
+ getf.exp log_GR_signexp_f8 = log_arg // sign and biased exponent of log_arg
+ ldfpd log_P1,log2 = [NR_table_address],16 // load P1,log2; table pointer += 16
+ nop.i 0
+}
+;;
+
+{ .mmi
+ getf.sig log_GR_significand_f8 = log_arg // significand of log_arg
+ nop.m 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ adds log_table_address3 = 0x40, NR_table_address // T table base: 0x40 past the advanced pointer
+ nop.f 0
+ //sign bit removal: keep only the exponent field (mask is 17 ones per its name)
+ and log_GR_exp_f8 = log_GR_signexp_f8, log_GR_exp_17_ones
+}
+;;
+
+{ .mmf
+ nop.m 0
+ //BIAS subtraction: true (unbiased) exponent N
+ sub log_GR_true_exp_f8 = log_GR_exp_f8, log_GR_exp_16_ones
+ fms.s1 log_r = log_C,log_arg,f1 // C = frcpa(x); r = C * x - 1
+}
+;;
+
+{ .mfi
+ setf.sig log_int_Nfloat = log_GR_true_exp_f8 // N as an integer in an FP register
+ nop.f 0
+ extr.u log_GR_index = log_GR_significand_f8,55,8 //Extract 8 bits starting at bit 55
+}
+;;
+
+{ .mmi
+ //table entry address = index*8 + table base
+ shladd log_table_address3 = log_GR_index,3,log_table_address3
+;;
+ ldfd log_T = [log_table_address3] // T = table[index]
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rsq = log_r, log_r, f0 //r^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p32 = log_P3, log_r, log_P2 //P3*r + P2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p10 = log_P1, log_r, f1 //P1*r + 1.0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ //convert N to the floating-point format log_Nfloat
+ fcvt.xf log_Nfloat = log_int_Nfloat
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p2 = log_rp_p32, log_rsq, log_rp_p10 //(P3*r + P2)*r^2 + P1*r + 1.0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_T_plus_Nlog2 = log_Nfloat,log2,log_T //N*log2 + T
+ nop.i 0
+}
+;;
+
+{ .mfb
+ nop.m 0
+ fma.s.s0 f8 = log_rp_p2,log_r,log_T_plus_Nlog2 //r*poly + (N*log2 + T), single precision result
+ br.ret.sptk b0 // Exit path 4, x >= 2^51
+}
+;;
+
+// Here if path 7, x < 1.0: set up the error result and branch to __libm_error_region
+ACOSH_LESS_ONE:
+{ .mfi
+ alloc r32 = ar.pfs,1,3,4,0 // r32 = old ar.pfs; frame of 1 input, 3 locals, 4 outputs
+ fmerge.s f10 = f8,f8 // f10 = copy of the argument (stored as Parameter 1 by the error region)
+ nop.i 0
+}
+;;
+
+{ .mfb
+ mov acosh_GR_tag = 137 // error tag for this domain error
+ frcpa.s0 f8,p0 = f0,f0 // f8 = 0/0, i.e. NaN result; NOTE(review): expected to raise invalid in sf0 - confirm
+ br.cond.sptk __libm_error_region
+}
+;;
+
+GLOBAL_LIBM_END(acoshf)
+
+LOCAL_LIBM_ENTRY(__libm_error_region) // Spill f10 (x), f1 and f8 (result) to a new frame, call __libm_error_support, reload f8
+.prologue
+
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 slot: old sp - 32 (= new sp + 32)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new 64-byte stack frame
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+
+{ .mmi
+ stfs [GR_Parameter_Y] = f1,16 // STORE Parameter 2 (f1) on stack; pointer += 16 -> new sp + 48
+ add GR_Parameter_X = 16,sp // Parameter 1 address (new sp + 16)
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+
+.body
+{ .mib
+ stfs [GR_Parameter_X] = f10 // STORE Parameter 1 (f10) on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address (new sp + 48)
+ nop.b 0
+}
+{ .mib
+ stfs [GR_Parameter_Y] = f8 // STORE Parameter 3 (result f8) on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Back to Parameter 2 address (new sp + 32)
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+
+{ .mmi
+ add GR_Parameter_RESULT = 48,sp // Recompute the result slot address after the call
+ nop.m 0
+ nop.i 0
+};;
+
+{ .mmi
+ ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_acoshl.S b/sysdeps/ia64/fpu/e_acoshl.S
new file mode 100644
index 0000000000..85282d16d0
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_acoshl.S
@@ -0,0 +1,1713 @@
+.file "acoshl.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//*********************************************************************
+//
+// History:
+// 10/01/01 Initial version
+// 10/10/01 Performance improved
+// 12/11/01 Changed huges_logp to not be global
+// 01/02/02 Corrected .restore syntax
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 08/14/02 Changed mli templates to mlx
+// 02/06/03 Reorganized data tables
+//
+//*********************************************************************
+//
+// API
+//==============================================================
+// long double acoshl(long double);
+//
+// Overview of operation
+//==============================================================
+//
+// There are 6 paths:
+// 1. x = 1
+// Return acoshl(x) = 0;
+//
+// 2. x < 1
+// Return acoshl(x) = Nan (Domain error, error handler call with tag 135);
+//
+// 3. x = [S,Q]Nan or +INF
+// Return acoshl(x) = x + x;
+//
+// 4. 'Near 1': 1 < x < 1+1/8
+// Return acoshl(x) = sqrtl(2*y)*(1-P(y)/Q(y)),
+// where y = x - 1, P(y)/Q(y) - rational approximation
+//
+// 5. 'Huges': x > 0.5*2^64
+// Return acoshl(x) = (logl(2*x-1));
+//
+// 6. 'Main path': 1+1/8 < x < 0.5*2^64
+// b_hi + b_lo = x + sqrt(x^2 - 1);
+// acoshl(x) = logl_special(b_hi, b_lo);
+//
+// Algorithm description
+//==============================================================
+//
+// I. Near 1 path algorithm
+// **************************************************************
+// The formula is acoshl(x) = sqrtl(2*y)*(1-P(y)/Q(y)),
+// where y = x - 1, P(y)/Q(y) - rational approximation
+//
+// 1) y = x - 1, y2 = 2 * y
+//
+// 2) Compute in parallel sqrtl(2*y) and P(y)/Q(y)
+// a) sqrtl computation method described below (main path algorithm, item 2))
+// As result we obtain (gg+gl) - multiprecision result
+// as pair of double extended values
+// b) P(y) and Q(y) calculated without any extra precision manipulations
+// c) P/Q division:
+// y = frcpa(Q) initial approximation of 1/Q
+// z = P*y initial approximation of P/Q
+//
+// e = 1 - Q*y
+// e2 = e + e^2
+// e1 = e^2
+// y1 = y + y*e2 = y + y*(e+e^2)
+//
+// e3 = e + e1^2
+// y2 = y + y1*e3 = y + y*(e+e^2+..+e^6)
+//
+// r = P - Q*z
+// e4 = 1 - Q*y2
+// xx = z + r*y2 high part of P/Q
+//
+// y3 = y2 + y2*e4
+// r1 = P - Q*xx
+// xl = r1*y3 low part of P/Q
+//
+// 3) res = sqrt(2*y) - sqrt(2*y)*(P(y)/Q(y)) =
+// = (gg+gl) - (gg + gl)*(xx+xl);
+//
+// a) hh = gg*xx; hl = gg*xl; lh = gl*xx; ll = gl*xl;
+// b) res = ((((gl + ll) + lh) + hl) + hh) + gg;
+// (exactly in this order)
+//
+// II. Main path algorithm
+// ( thanks to Peter Markstein for the idea of sqrt(x^2+1) computation! )
+// **********************************************************************
+//
+// There are 3 parts of x+sqrt(x^2-1) computation:
+//
+// 1) m2 = (m2_hi+m2_lo) = x^2-1 obtaining
+// ------------------------------------
+// m2_hi = x2_hi - 1, where x2_hi = x * x;
+// m2_lo = x2_lo + p1_lo, where
+// x2_lo = FMS(x*x-x2_hi),
+// p1_lo = x2_hi - (1 + m2_hi);
+//
+// 2) g = (g_hi+g_lo) = sqrt(m2) = sqrt(m2_hi+m2_lo)
+// ----------------------------------------------
+// r = invsqrt(m2_hi) (8-bit reciprocal square root approximation);
+// g = m2_hi * r (first 8 bit-approximation of sqrt);
+//
+// h = 0.5 * r;
+// e = 0.5 - g * h;
+// g = g * e + g (second 16 bit-approximation of sqrt);
+//
+// h = h * e + h;
+// e = 0.5 - g * h;
+// g = g * e + g (third 32 bit-approximation of sqrt);
+//
+// h = h * e + h;
+// e = 0.5 - g * h;
+// g_hi = g * e + g (fourth 64 bit-approximation of sqrt);
+//
+// Remainder computation:
+// h = h * e + h;
+// d = (m2_hi - g_hi * g_hi) + m2_lo;
+// g_lo = d * h;
+//
+// 3) b = (b_hi + b_lo) = x + g, where g = (g_hi + g_lo) = sqrt(x^2-1)
+// -------------------------------------------------------------------
+// b_hi = (g_hi + x) + gl;
+// b_lo = (x - b_hi) + g_hi + gl;
+//
+// Now we pass b, represented as the sum b_hi + b_lo, to a special version
+// of the logl function which accepts a pair of arguments as a
+// multiprecision value.
+//
+// Special log algorithm overview
+// ================================
+// Here we use a table lookup method. The basic idea is that in
+// order to compute logl(Arg) for an argument Arg in [1,2),
+// we construct a value G such that G*Arg is close to 1 and that
+// logl(1/G) is obtainable easily from a table of values calculated
+// beforehand. Thus
+//
+// logl(Arg) = logl(1/G) + logl(1 + (G*Arg - 1))
+//
+// Because |G*Arg - 1| is small, the second term on the right hand
+// side can be approximated by a short polynomial. We elaborate
+// this method in four steps.
+//
+// Step 0: Initialization
+//
+// We need to calculate logl( X ), where X = b_hi + b_lo. Obtain N, S_hi such that
+//
+// X = 2^N * ( S_hi + S_lo ) exactly
+//
+// where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
+// that |S_lo| <= ulp(S_hi).
+//
+// For the special version of logl: S_lo = b_lo * 2^(-N)
+// !--------------------------------------------------------!
+//
+// Step 1: Argument Reduction
+//
+// Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
+//
+// G := G_1 * G_2 * G_3
+// r := (G * S_hi - 1) + G * S_lo
+//
+// These G_j's have the property that the product is exactly
+// representable and that |r| < 2^(-12) as a result.
+//
+// Step 2: Approximation
+//
+// logl(1 + r) is approximated by a short polynomial poly(r).
+//
+// Step 3: Reconstruction
+//
+// Finally, logl( X ) is given by
+//
+// logl( X ) = logl( 2^N * (S_hi + S_lo) )
+// ~=~ N*logl(2) + logl(1/G) + logl(1 + r)
+// ~=~ N*logl(2) + logl(1/G) + poly(r).
+//
+// For detailed description see logl or log1pl function, regular path.
+//
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input
+// f32 -> f95 (64 registers)
+
+// General registers used:
+// r32 -> r67 (36 registers)
+
+// Predicate registers used:
+// p7 -> p11
+// p7 for 'NaNs, Inf' path
+// p8 for 'near 1' path
+// p9 for 'huges' path
+// p10 for x = 1
+// p11 for x < 1
+//
+//*********************************************************************
+// IEEE Special Conditions:
+//
+// acoshl(+inf) = +inf
+// acoshl(-inf) = QNaN
+// acoshl(1) = 0
+// acoshl(x<1) = QNaN
+// acoshl(SNaN) = QNaN
+// acoshl(QNaN) = QNaN
+//
+
+// Data tables
+//==============================================================
+
+RODATA
+.align 64
+
+// Near 1 path rational approximation coefficients (16-byte entries, loaded in order into FR_PP5..FR_PP0)
+LOCAL_OBJECT_START(Poly_P)
+data8 0xB0978143F695D40F, 0x3FF1 // .84205539791447100108478906277453574946e-4 (PP5)
+data8 0xB9800D841A8CAD29, 0x3FF6 // .28305085180397409672905983082168721069e-2 (PP4)
+data8 0xC889F455758C1725, 0x3FF9 // .24479844297887530847660233111267222945e-1 (PP3)
+data8 0x9BE1DFF006F45F12, 0x3FFB // .76114415657565879842941751209926938306e-1 (PP2)
+data8 0x9E34AF4D372861E0, 0x3FFB // .77248925727776366270605984806795850504e-1 (PP1)
+data8 0xF3DC502AEE14C4AE, 0x3FA6 // .3077953476682583606615438814166025592e-26 (PP0)
+LOCAL_OBJECT_END(Poly_P)
+
+LOCAL_OBJECT_START(Poly_Q)
+data8 0xF76E3FD3C7680357, 0x3FF1 // .11798413344703621030038719253730708525e-3 (QQ5, loaded first)
+data8 0xD107D2E7273263AE, 0x3FF7 // .63791065024872525660782716786703188820e-2 (QQ4)
+data8 0xB609BE5CDE206AEF, 0x3FFB // .88885771950814004376363335821980079985e-1 (QQ3)
+data8 0xF7DEACAC28067C8A, 0x3FFD // .48412074662702495416825113623936037072302 (QQ2)
+data8 0x8F9BE5890CEC7E38, 0x3FFF // 1.1219450873557867470217771071068369729526 (QQ1)
+data8 0xED4F06F3D2BC92D1, 0x3FFE // .92698710873331639524734537734804056798748 (QQ0, loaded last)
+LOCAL_OBJECT_END(Poly_Q)
+
+// logl constants, 16-byte entries in load order: log2_hi, log2_lo, Q4, Q3, Q2, Q1
+LOCAL_OBJECT_START(Constants_Q)
+data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 // log2_hi
+data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 // log2_lo
+data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000 // Q4
+data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000 // Q3
+data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000 // Q2
+data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000 // Q1
+LOCAL_OBJECT_END(Constants_Q)
+
+// Z1 - 16 bit fixed; 16 entries stored as data4, indexed by GR_Index1 (bits 59-62 of the significand)
+LOCAL_OBJECT_START(Constants_Z_1)
+data4 0x00008000
+data4 0x00007879
+data4 0x000071C8
+data4 0x00006BCB
+data4 0x00006667
+data4 0x00006187
+data4 0x00005D18
+data4 0x0000590C
+data4 0x00005556
+data4 0x000051EC
+data4 0x00004EC5
+data4 0x00004BDB
+data4 0x00004925
+data4 0x0000469F
+data4 0x00004445
+data4 0x00004211
+LOCAL_OBJECT_END(Constants_Z_1)
+
+// G1 and H1 - IEEE single and h1 - IEEE double; 16 entries of 16 bytes, indexed by GR_Index1
+LOCAL_OBJECT_START(Constants_G_H_h1)
+data4 0x3F800000,0x00000000
+data8 0x0000000000000000
+data4 0x3F70F0F0,0x3D785196
+data8 0x3DA163A6617D741C
+data4 0x3F638E38,0x3DF13843
+data8 0x3E2C55E6CBD3D5BB
+data4 0x3F579430,0x3E2FF9A0
+data8 0xBE3EB0BFD86EA5E7
+data4 0x3F4CCCC8,0x3E647FD6
+data8 0x3E2E6A8C86B12760
+data4 0x3F430C30,0x3E8B3AE7
+data8 0x3E47574C5C0739BA
+data4 0x3F3A2E88,0x3EA30C68
+data8 0x3E20E30F13E8AF2F
+data4 0x3F321640,0x3EB9CEC8
+data8 0xBE42885BF2C630BD
+data4 0x3F2AAAA8,0x3ECF9927
+data8 0x3E497F3497E577C6
+data4 0x3F23D708,0x3EE47FC5
+data8 0x3E3E6A6EA6B0A5AB
+data4 0x3F1D89D8,0x3EF8947D
+data8 0xBDF43E3CD328D9BE
+data4 0x3F17B420,0x3F05F3A1
+data8 0x3E4094C30ADB090A
+data4 0x3F124920,0x3F0F4303
+data8 0xBE28FBB2FC1FE510
+data4 0x3F0D3DC8,0x3F183EBF
+data8 0x3E3A789510FDE3FA
+data4 0x3F088888,0x3F20EC80
+data8 0x3E508CE57CC8C98F
+data4 0x3F042108,0x3F29516A
+data8 0xBE534874A223106C
+LOCAL_OBJECT_END(Constants_G_H_h1)
+
+// Z2 - 16 bit fixed; 16 entries stored as data4, indexed by GR_Index2 (bits 6-9 of X_1)
+LOCAL_OBJECT_START(Constants_Z_2)
+data4 0x00008000
+data4 0x00007F81
+data4 0x00007F02
+data4 0x00007E85
+data4 0x00007E08
+data4 0x00007D8D
+data4 0x00007D12
+data4 0x00007C98
+data4 0x00007C20
+data4 0x00007BA8
+data4 0x00007B31
+data4 0x00007ABB
+data4 0x00007A45
+data4 0x000079D1
+data4 0x0000795D
+data4 0x000078EB
+LOCAL_OBJECT_END(Constants_Z_2)
+
+// G2 and H2 - IEEE single and h2 - IEEE double; 16 entries of 16 bytes, indexed by GR_Index2
+LOCAL_OBJECT_START(Constants_G_H_h2)
+data4 0x3F800000,0x00000000
+data8 0x0000000000000000
+data4 0x3F7F00F8,0x3B7F875D
+data8 0x3DB5A11622C42273
+data4 0x3F7E03F8,0x3BFF015B
+data8 0x3DE620CF21F86ED3
+data4 0x3F7D08E0,0x3C3EE393
+data8 0xBDAFA07E484F34ED
+data4 0x3F7C0FC0,0x3C7E0586
+data8 0xBDFE07F03860BCF6
+data4 0x3F7B1880,0x3C9E75D2
+data8 0x3DEA370FA78093D6
+data4 0x3F7A2328,0x3CBDC97A
+data8 0x3DFF579172A753D0
+data4 0x3F792FB0,0x3CDCFE47
+data8 0x3DFEBE6CA7EF896B
+data4 0x3F783E08,0x3CFC15D0
+data8 0x3E0CF156409ECB43
+data4 0x3F774E38,0x3D0D874D
+data8 0xBE0B6F97FFEF71DF
+data4 0x3F766038,0x3D1CF49B
+data8 0xBE0804835D59EEE8
+data4 0x3F757400,0x3D2C531D
+data8 0x3E1F91E9A9192A74
+data4 0x3F748988,0x3D3BA322
+data8 0xBE139A06BF72A8CD
+data4 0x3F73A0D0,0x3D4AE46F
+data8 0x3E1D9202F8FBA6CF
+data4 0x3F72B9D0,0x3D5A1756
+data8 0xBE1DCCC4BA796223
+data4 0x3F71D488,0x3D693B9D
+data8 0xBE049391B6B7C239
+LOCAL_OBJECT_END(Constants_G_H_h2)
+
+// G3 and H3 - IEEE single and h3 - IEEE double; 32 entries of 16 bytes, indexed by GR_Index3 (bits 1-5 of X_2)
+LOCAL_OBJECT_START(Constants_G_H_h3)
+data4 0x3F7FFC00,0x38800100
+data8 0x3D355595562224CD
+data4 0x3F7FF400,0x39400480
+data8 0x3D8200A206136FF6
+data4 0x3F7FEC00,0x39A00640
+data8 0x3DA4D68DE8DE9AF0
+data4 0x3F7FE400,0x39E00C41
+data8 0xBD8B4291B10238DC
+data4 0x3F7FDC00,0x3A100A21
+data8 0xBD89CCB83B1952CA
+data4 0x3F7FD400,0x3A300F22
+data8 0xBDB107071DC46826
+data4 0x3F7FCC08,0x3A4FF51C
+data8 0x3DB6FCB9F43307DB
+data4 0x3F7FC408,0x3A6FFC1D
+data8 0xBD9B7C4762DC7872
+data4 0x3F7FBC10,0x3A87F20B
+data8 0xBDC3725E3F89154A
+data4 0x3F7FB410,0x3A97F68B
+data8 0xBD93519D62B9D392
+data4 0x3F7FAC18,0x3AA7EB86
+data8 0x3DC184410F21BD9D
+data4 0x3F7FA420,0x3AB7E101
+data8 0xBDA64B952245E0A6
+data4 0x3F7F9C20,0x3AC7E701
+data8 0x3DB4B0ECAABB34B8
+data4 0x3F7F9428,0x3AD7DD7B
+data8 0x3D9923376DC40A7E
+data4 0x3F7F8C30,0x3AE7D474
+data8 0x3DC6E17B4F2083D3
+data4 0x3F7F8438,0x3AF7CBED
+data8 0x3DAE314B811D4394
+data4 0x3F7F7C40,0x3B03E1F3
+data8 0xBDD46F21B08F2DB1
+data4 0x3F7F7448,0x3B0BDE2F
+data8 0xBDDC30A46D34522B
+data4 0x3F7F6C50,0x3B13DAAA
+data8 0x3DCB0070B1F473DB
+data4 0x3F7F6458,0x3B1BD766
+data8 0xBDD65DDC6AD282FD
+data4 0x3F7F5C68,0x3B23CC5C
+data8 0xBDCDAB83F153761A
+data4 0x3F7F5470,0x3B2BC997
+data8 0xBDDADA40341D0F8F
+data4 0x3F7F4C78,0x3B33C711
+data8 0x3DCD1BD7EBC394E8
+data4 0x3F7F4488,0x3B3BBCC6
+data8 0xBDC3532B52E3E695
+data4 0x3F7F3C90,0x3B43BAC0
+data8 0xBDA3961EE846B3DE
+data4 0x3F7F34A0,0x3B4BB0F4
+data8 0xBDDADF06785778D4
+data4 0x3F7F2CA8,0x3B53AF6D
+data8 0x3DCC3ED1E55CE212
+data4 0x3F7F24B8,0x3B5BA620
+data8 0xBDBA31039E382C15
+data4 0x3F7F1CC8,0x3B639D12
+data8 0x3D635A0B5C5AF197
+data4 0x3F7F14D8,0x3B6B9444
+data8 0xBDDCCB1971D34EFC
+data4 0x3F7F0CE0,0x3B7393BC
+data8 0x3DC7450252CD7ADA
+data4 0x3F7F04F0,0x3B7B8B6D
+data8 0xBDB68F177D7F2A42
+LOCAL_OBJECT_END(Constants_G_H_h3)
+
+// Assembly macros: symbolic names for the registers used by acoshl
+//==============================================================
+
+// Floating Point Registers
+
+FR_Arg = f8 // input argument x
+FR_Res = f8 // result (same register as the argument)
+
+
+FR_PP0 = f32 // FR_PP0..FR_PP5: coefficients loaded from Poly_P (near 1 path)
+FR_PP1 = f33
+FR_PP2 = f34
+FR_PP3 = f35
+FR_PP4 = f36
+FR_PP5 = f37
+FR_QQ0 = f38 // FR_QQ0..FR_QQ5: coefficients loaded from Poly_Q (near 1 path)
+FR_QQ1 = f39
+FR_QQ2 = f40
+FR_QQ3 = f41
+FR_QQ4 = f42
+FR_QQ5 = f43
+
+FR_Q1 = f44 // FR_Q1..FR_Q4: logl polynomial coefficients loaded through GR_ad_q
+FR_Q2 = f45
+FR_Q3 = f46
+FR_Q4 = f47
+
+FR_Half = f48 // 0.5, built with setf.exp from GR_Half
+FR_Two = f49 // 2.0, built as 1*1 + 1
+
+FR_log2_hi = f50
+FR_log2_lo = f51
+
+
+FR_X2 = f52 // x^2
+FR_M2 = f53 // m2 = x^2 - 1 (high part)
+FR_M2L = f54 // low part of m2
+FR_Rcp = f55 // frsqrta estimate for m2
+FR_GG = f56 // g: square-root iterate
+FR_HH = f57 // NOTE(review): FR_HH is assigned again (f91) further down - confirm which binding is in effect
+FR_EE = f58 // e = 0.5 - g*h
+FR_DD = f59 // d: square-root remainder
+FR_GL = f60 // gl = d*h
+FR_Tmp = f61
+
+
+FR_XM1 = f62 // x - 1
+FR_2XM1 = f63 // 2*(x - 1)
+FR_XM12 = f64
+
+
+
+ // Special logl registers
+FR_XLog_Hi = f65
+FR_XLog_Lo = f66
+
+FR_Y_hi = f67
+FR_Y_lo = f68
+
+FR_S_hi = f69
+FR_S_lo = f70
+
+FR_poly_lo = f71
+FR_poly_hi = f72
+
+FR_G = f73
+FR_H = f74
+FR_h = f75
+
+FR_G2 = f76
+FR_H2 = f77
+FR_h2 = f78
+
+FR_r = f79
+FR_rsq = f80
+FR_rcub = f81
+
+FR_float_N = f82
+
+FR_G3 = f83
+FR_H3 = f84
+FR_h3 = f85
+
+FR_2_to_minus_N = f86
+
+
+ // Near 1 registers (f65-f86 are also assigned to the special logl names above)
+FR_PP = f65
+FR_QQ = f66
+
+
+FR_PV6 = f69
+FR_PV4 = f70
+FR_PV3 = f71
+FR_PV2 = f72
+
+FR_QV6 = f73
+FR_QV4 = f74
+FR_QV3 = f75
+FR_QV2 = f76
+
+FR_Y0 = f77
+FR_Q0 = f78
+FR_E0 = f79
+FR_E2 = f80
+FR_E1 = f81
+FR_Y1 = f82
+FR_E3 = f83
+FR_Y2 = f84
+FR_R0 = f85
+FR_E4 = f86
+FR_Y3 = f87
+FR_R1 = f88
+FR_X_Hi = f89
+FR_X_lo = f90
+
+FR_HH = f91 // NOTE(review): second assignment of FR_HH (f57 above) - confirm this is intended
+FR_LL = f92
+FR_HL = f93
+FR_LH = f94
+
+
+
+ // Error handler registers
+FR_Arg_X = f95
+FR_Arg_Y = f0
+
+
+// General Purpose Registers (r32-r63 are the 32 locals, r64-r67 the 4 outputs of the alloc in acoshl)
+
+ // General prolog registers
+GR_PFS = r32 // ar.pfs as saved by alloc
+GR_OneP125 = r33 // 1.125 as a double bit pattern (near 1 path bound)
+GR_TwoP63 = r34 // huge-argument threshold as a double bit pattern
+GR_Arg = r35 // getf.d image of the argument
+GR_Half = r36 // exponent 0xfffe used to build 0.5
+
+ // Near 1 path registers
+GR_Poly_P = r37
+GR_Poly_Q = r38
+
+ // Special logl registers
+GR_Index1 = r39
+GR_Index2 = r40
+GR_signif = r41
+GR_X_0 = r42
+GR_X_1 = r43
+GR_X_2 = r44
+GR_minus_N = r45
+GR_Z_1 = r46
+GR_Z_2 = r47
+GR_N = r48
+GR_Bias = r49
+GR_M = r50
+GR_Index3 = r51
+GR_exp_2tom80 = r52
+GR_exp_mask = r53
+GR_exp_2tom7 = r54
+GR_ad_ln10 = r55
+GR_ad_tbl_1 = r56
+GR_ad_tbl_2 = r57
+GR_ad_tbl_3 = r58
+GR_ad_q = r59
+GR_ad_z_1 = r60
+GR_ad_z_2 = r61
+GR_ad_z_3 = r62
+
+//
+// Added for unwind support (r32-r34 are shared with GR_PFS, GR_OneP125 and GR_TwoP63)
+//
+GR_SAVE_PFS = r32
+GR_SAVE_B0 = r33
+GR_SAVE_GP = r34
+
+GR_Parameter_X = r64
+GR_Parameter_Y = r65
+GR_Parameter_RESULT = r66
+GR_Parameter_TAG = r67
+
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(acoshl)
+
+{ .mfi
+ alloc GR_PFS = ar.pfs,0,32,4,0 // Local frame allocation
+ fcmp.lt.s1 p11, p0 = FR_Arg, f1 // if arg is less than 1
+ mov GR_Half = 0xfffe // 0.5's exp
+}
+{ .mfi
+ addl GR_Poly_Q = @ltoff(Poly_Q), gp // Address of Q-coeff table
+ fma.s1 FR_X2 = FR_Arg, FR_Arg, f0 // Obtain x^2
+ addl GR_Poly_P = @ltoff(Poly_P), gp // Address of P-coeff table
+};;
+
+{ .mfi
+ getf.d GR_Arg = FR_Arg // get arument as double (int64)
+ fma.s0 FR_Two = f1, f1, f1 // construct 2.0
+ addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp // logl tables
+}
+{ .mlx
+ nop.m 0
+ movl GR_TwoP63 = 0x43E8000000000000 // 0.5*2^63 (huge arguments)
+};;
+
+{ .mfi
+ ld8 GR_Poly_P = [GR_Poly_P] // get actual P-coeff table address
+ fcmp.eq.s1 p10, p0 = FR_Arg, f1 // if arg == 1 (return 0)
+ nop.i 0
+}
+{ .mlx
+ ld8 GR_Poly_Q = [GR_Poly_Q] // get actual Q-coeff table address
+ movl GR_OneP125 = 0x3FF2000000000000 // 1.125 (near 1 path bound)
+};;
+
+{ .mfi
+ ld8 GR_ad_z_1 = [GR_ad_z_1] // Get pointer to Constants_Z_1
+ fclass.m p7,p0 = FR_Arg, 0xe3 // if arg NaN inf
+ cmp.le p9, p0 = GR_TwoP63, GR_Arg // if arg > 0.5*2^63 ('huges')
+}
+{ .mfb
+ cmp.ge p8, p0 = GR_OneP125, GR_Arg // if arg<1.125 -near 1 path
+ fms.s1 FR_XM1 = FR_Arg, f1, f1 // X0 = X-1 (for near 1 path)
+(p11) br.cond.spnt acoshl_lt_pone // error branch (less than 1)
+};;
+
+{ .mmi
+ setf.exp FR_Half = GR_Half // construct 0.5
+(p9) setf.s FR_XLog_Lo = r0 // Low of logl arg=0 (Huges path)
+ mov GR_exp_mask = 0x1FFFF // Create exponent mask
+};;
+
+{ .mmf
+(p8) ldfe FR_PP5 = [GR_Poly_P],16 // Load P5
+(p8) ldfe FR_QQ5 = [GR_Poly_Q],16 // Load Q5
+ fms.s1 FR_M2 = FR_X2, f1, f1 // m2 = x^2 - 1
+};;
+
+{ .mfi
+(p8) ldfe FR_QQ4 = [GR_Poly_Q],16 // Load Q4
+ fms.s1 FR_M2L = FR_Arg, FR_Arg, FR_X2 // low part of
+ // m2 = fma(X*X - m2)
+ add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1
+}
+{ .mfb
+(p8) ldfe FR_PP4 = [GR_Poly_P],16 // Load P4
+(p7) fma.s0 FR_Res = FR_Arg,f1,FR_Arg // r = a + a (Nan, Inf)
+(p7) br.ret.spnt b0 // return (Nan, Inf)
+};;
+
+{ .mfi
+(p8) ldfe FR_PP3 = [GR_Poly_P],16 // Load P3
+ nop.f 0
+ add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_P
+}
+{ .mfb
+(p8) ldfe FR_QQ3 = [GR_Poly_Q],16 // Load Q3
+(p9) fms.s1 FR_XLog_Hi = FR_Two, FR_Arg, f1 // Hi of log arg = 2*X-1
+(p9) br.cond.spnt huges_logl // special version of log
+}
+;;
+
+{ .mfi
+(p8) ldfe FR_PP2 = [GR_Poly_P],16 // Load P2
+(p8) fma.s1 FR_2XM1 = FR_Two, FR_XM1, f0 // 2X0 = 2 * X0
+ add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
+}
+{ .mfb
+(p8) ldfe FR_QQ2 = [GR_Poly_Q],16 // Load Q2
+(p10) fma.s0 FR_Res = f0,f1,f0 // r = 0 (arg = 1)
+(p10) br.ret.spnt b0 // return (arg = 1)
+};;
+
+{ .mmi
+(p8) ldfe FR_PP1 = [GR_Poly_P],16 // Load P1
+(p8) ldfe FR_QQ1 = [GR_Poly_Q],16 // Load Q1
+ add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2
+}
+;;
+
+{ .mfi
+(p8) ldfe FR_PP0 = [GR_Poly_P] // Load P0
+ fma.s1 FR_Tmp = f1, f1, FR_M2 // Tmp = 1 + m2
+ add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3
+}
+{ .mfb
+(p8) ldfe FR_QQ0 = [GR_Poly_Q]
+ nop.f 0
+(p8) br.cond.spnt near_1 // near 1 path
+};;
+{ .mfi
+ ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
+ nop.f 0
+ mov GR_Bias = 0x0FFFF // Create exponent bias
+};;
+{ .mfi
+ nop.m 0
+ frsqrta.s1 FR_Rcp, p0 = FR_M2 // Rcp = 1/m2 reciprocal appr.
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
+ fms.s1 FR_Tmp = FR_X2, f1, FR_Tmp // Tmp = x^2 - Tmp
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_Q4 = [GR_ad_q],16 // Load Q4
+ fma.s1 FR_GG = FR_Rcp, FR_M2, f0 // g = Rcp * m2
+ // 8 bit Newton Raphson iteration
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_HH = FR_Half, FR_Rcp, f0 // h = 0.5 * Rcp
+ nop.i 0
+};;
+{ .mfi
+ ldfe FR_Q3 = [GR_ad_q],16 // Load Q3
+ fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_M2L = FR_Tmp, f1, FR_M2L // low part of m2 = Tmp+m2l
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
+ fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
+ // 16 bit Newton Raphson iteration
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_Q1 = [GR_ad_q] // Load Q1
+ fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
+ // 32 bit Newton Raphson iteration
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
+ // 64 bit Newton Raphson iteration
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_DD = FR_GG, FR_GG, FR_M2 // Remainder d = g * g - p2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_XLog_Hi = FR_Arg, f1, FR_GG // bh = z + gh
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_DD = FR_DD, f1, FR_M2L // add p2l: d = d + p2l
+ nop.i 0
+};;
+
+{ .mfi
+ getf.sig GR_signif = FR_XLog_Hi // Get significand of x+1
+ nop.f 0
+ mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_GL = FR_DD, FR_HH, f0 // gl = d * h
+ extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_XLog_Hi = FR_DD, FR_HH, FR_XLog_Hi // bh = bh + gl
+ nop.i 0
+};;
+
+
+
+{ .mmi
+ shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
+ shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1
+ extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif.
+};;
+
+{ .mmi
+ ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
+ nop.m 0
+ nop.i 0
+};;
+
+{ .mmi
+ ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
+ nop.m 0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_XLog_Lo = FR_Arg, f1, FR_XLog_Hi // bl = x - bh
+ pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1
+};;
+
+// WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
+// "DEAD" ZONE!
+
+{ .mfi
+ nop.m 0
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fmerge.se FR_S_hi = f1,FR_XLog_Hi // Form |x+1|
+ nop.i 0
+};;
+
+
+{ .mmi
+ getf.exp GR_N = FR_XLog_Hi // Get N = exponent of x+1
+ ldfd FR_h = [GR_ad_tbl_1] // Load h_1
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ nop.f 0
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+};;
+
+{ .mfi
+ shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2
+ fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_GG // bl = bl + gg
+ mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80
+}
+{ .mfi
+ shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
+ nop.f 0
+ sub GR_N = GR_N, GR_Bias // sub bias from exp
+};;
+
+{ .mmi
+ ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
+ ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
+ sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N)
+};;
+
+{ .mmi
+ ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
+ nop.m 0
+ nop.i 0
+};;
+
+{ .mmi
+ setf.sig FR_float_N = GR_N // Put integer N into rightmost sign
+ setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N)
+ pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
+};;
+
+// WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
+// BECAUSE OF POSSIBLE 10 CLOCKS STALL!
+// (Just nops added - nothing to do here)
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_GL // bl = bl + gl
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ nop.f 0
+ extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
+};;
+
+{ .mfi
+ shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
+ fcvt.xf FR_float_N = FR_float_N
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S_lo = FR_XLog_Lo, FR_2_to_minus_N, f0 //S_lo=S_lo*2^(-N)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h=N*log2_lo+h
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r = FR_G, FR_S_lo, FR_r // r=G*S_lo+(G*S_hi-1)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo
+ // Y_lo=poly_hi+poly_lo
+ nop.i 0
+};;
+
+{ .mfb
+ nop.m 0
+ fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi
+ br.ret.sptk b0 // Common exit for 2^-7 < x < inf
+};;
+
+
+huges_logl: // Log of FR_XLog_Hi: 3-step table reduction (Z_i, G_i/H_i/h_i), then polynomial in r
+{ .mmi
+ getf.sig GR_signif = FR_XLog_Hi // Get significand of FR_XLog_Hi
+ mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7 (not read again before the br.ret below)
+ nop.i 0
+};;
+
+{ .mfi
+ add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1
+ nop.f 0
+ add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_P (log2_hi/lo, Q4..Q1 are read from here)
+}
+{ .mfi
+ add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
+ nop.f 0
+ add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2
+};;
+
+{ .mfi
+ add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3
+ nop.f 0
+ extr.u GR_Index1 = GR_signif, 59, 4 // Index1 = bits 59-62 of signif
+};;
+
+{ .mfi
+ shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1 (base + 4*Index1)
+ nop.f 0
+ extr.u GR_X_0 = GR_signif, 49, 15 // X_0 = high 15 bits of signif (bits 49-63)
+};;
+
+{ .mfi
+ ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
+ nop.f 0
+ mov GR_exp_mask = 0x1FFFF // Create exponent mask (not read again in this path)
+}
+{ .mfi
+ shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1 (base + 16*Index1)
+ nop.f 0
+ mov GR_Bias = 0x0FFFF // Create exponent bias
+};;
+
+{ .mfi
+ ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
+ fmerge.se FR_S_hi = f1,FR_XLog_Hi // S_hi = sign/exponent of 1.0 with significand of FR_XLog_Hi
+ nop.i 0
+};;
+
+{ .mmi
+ getf.exp GR_N = FR_XLog_Hi // N = sign/exponent field of FR_XLog_Hi (still biased)
+ ldfd FR_h = [GR_ad_tbl_1] // Load h_1
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
+ nop.f 0
+ pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1
+};;
+
+{ .mmi
+ ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
+ sub GR_N = GR_N, GR_Bias // Remove exponent bias from N
+ mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80 (not read again in this path)
+};;
+
+{ .mfi
+ ldfe FR_Q4 = [GR_ad_q],16 // Load Q4
+ nop.f 0
+ sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N)
+};;
+
+{ .mmf
+ ldfe FR_Q3 = [GR_ad_q],16 // Load Q3
+ setf.sig FR_float_N = GR_N // Put integer N into rightmost significand
+ nop.f 0
+};;
+
+{ .mmi
+ ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
+ nop.m 0
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+};;
+
+{ .mmi
+ ldfe FR_Q1 = [GR_ad_q] // Load Q1
+ shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2 (base + 4*Index2)
+ nop.i 0
+};;
+
+{ .mmi
+ ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
+ shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2 (base + 16*Index2)
+ nop.i 0
+};;
+
+{ .mmi
+ ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
+ nop.m 0
+ nop.i 0
+};;
+
+{ .mmf
+ ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
+ setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N) (not consumed later in this path)
+ nop.f 0
+};;
+
+{ .mfi
+ nop.m 0
+ nop.f 0
+ pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1*Z_2
+};;
+
+// WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
+// BECAUSE OF POSSIBLE 10 CLOCKS STALL!
+// (Just nops added - nothing to do here)
+
+{ .mfi
+ nop.m 0
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ nop.f 0
+ extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
+};;
+
+{ .mfi
+ shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3 (base + 16*Index3)
+ fcvt.xf FR_float_N = FR_float_N // Convert integer N to floating point
+ nop.i 0
+};;
+
+{ .mfi
+ ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
+ fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
+ nop.i 0
+};;
+
+{ .mmf
+ nop.m 0
+ nop.m 0
+ fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
+};;
+
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2)*G_3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2)+H_3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h = N*log2_lo+h
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo=poly_hi+poly_lo
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+ fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi
+ br.ret.sptk b0 // Common exit
+};;
+
+
+// NEAR ONE INTERVAL: result = (g+gl)*(1 - (x_hi+x_lo)), g+gl ~ sqrt(FR_2XM1), x_hi+x_lo ~ pp/qq
+near_1: // Tags: &SQRT& = sqrt(FR_2XM1) iteration, $POLY$ = pp,qq polynomials in xm1, #DIV# = pp/qq
+{ .mfi
+ nop.m 0
+ frsqrta.s1 FR_Rcp, p0 = FR_2XM1 // Rcp ~ 1/sqrt(x) approx., x = FR_2XM1 &SQRT&
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_PV6 = FR_PP5, FR_XM1, FR_PP4 // pv6 = P5*xm1+P4 $POLY$
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_QV6 = FR_QQ5, FR_XM1, FR_QQ4 // qv6 = Q5*xm1+Q4 $POLY$
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_PV4 = FR_PP3, FR_XM1, FR_PP2 // pv4 = P3*xm1+P2 $POLY$
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_QV4 = FR_QQ3, FR_XM1, FR_QQ2 // qv4 = Q3*xm1+Q2 $POLY$
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_XM12 = FR_XM1, FR_XM1, f0 // xm1^2 = xm1 * xm1 $POLY$
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_PV2 = FR_PP1, FR_XM1, FR_PP0 // pv2 = P1*xm1+P0 $POLY$
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_QV2 = FR_QQ1, FR_XM1, FR_QQ0 // qv2 = Q1*xm1+Q0 $POLY$
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_GG = FR_Rcp, FR_2XM1, f0 // g = Rcp * x ~ sqrt(x) &SQRT&
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_HH = FR_Half, FR_Rcp, f0 // h = 0.5 * Rcp &SQRT&
+ nop.i 0
+};;
+
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_PV3 = FR_XM12, FR_PV6, FR_PV4//pv3=pv6*xm1^2+pv4 $POLY$
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_QV3 = FR_XM12, FR_QV6, FR_QV4//qv3=qv6*xm1^2+qv4 $POLY$
+ nop.i 0
+};;
+
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h &SQRT&
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_PP = FR_XM12, FR_PV3, FR_PV2 //pp=pv3*xm1^2+pv2 $POLY$
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_QQ = FR_XM12, FR_QV3, FR_QV2 //qq=qv3*xm1^2+qv2 $POLY$
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g &SQRT&
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h &SQRT&
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ frcpa.s1 FR_Y0,p0 = f1,FR_QQ // y ~ 1/b, with a = pp, b = qq #DIV#
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g*h &SQRT&
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Q0 = FR_PP,FR_Y0,f0 // q = a*y #DIV#
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_E0 = FR_Y0,FR_QQ,f1 // e = 1 - b*y #DIV#
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g &SQRT&
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h &SQRT&
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_E2 = FR_E0,FR_E0,FR_E0 // e2 = e+e^2 #DIV#
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_E1 = FR_E0,FR_E0,f0 // e1 = e^2 #DIV#
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h &SQRT&
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_DD = FR_GG, FR_GG, FR_2XM1 // d = x - g * g &SQRT&
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Y1 = FR_Y0,FR_E2,FR_Y0 // y1 = y+y*e2 #DIV#
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_E3 = FR_E1,FR_E1,FR_E0 // e3 = e+e1^2 #DIV#
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_GG = FR_DD, FR_HH, FR_GG // g = d * h + g &SQRT&
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h &SQRT&
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Y2 = FR_Y1,FR_E3,FR_Y0 // y2 = y+y1*e3 #DIV#
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_R0 = FR_QQ,FR_Q0,FR_PP // r = a-b*q #DIV#
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_DD = FR_GG, FR_GG, FR_2XM1 // d = x - g * g &SQRT&
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_E4 = FR_QQ,FR_Y2,f1 // e4 = 1-b*y2 #DIV#
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_X_Hi = FR_R0,FR_Y2,FR_Q0 // x_hi = q+r*y2 (high part of a/b) #DIV#
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_GL = FR_DD, FR_HH, f0 // gl = d * h (low part of sqrt) &SQRT&
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Y3 = FR_Y2,FR_E4,FR_Y2 // y3 = y2+y2*e4 #DIV#
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_R1 = FR_QQ,FR_X_Hi,FR_PP // r1 = a-b*x_hi #DIV#
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_HH = FR_GG, FR_X_Hi, f0 // hh = gg * x_hi (FR_HH reused as a product)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_LH = FR_GL, FR_X_Hi, f0 // lh = gl * x_hi
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_X_lo = FR_R1,FR_Y3,f0 // x_lo = r1*y3 #DIV#
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_LL = FR_GL, FR_X_lo, f0 // ll = gl*x_lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_HL = FR_GG, FR_X_lo, f0 // hl = gg * x_lo
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_Res = FR_GL, f1, FR_LL // res = gl - ll (fms subtracts)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_Res = FR_Res, f1, FR_LH // res = res - lh
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_Res = FR_Res, f1, FR_HL // res = res - hl
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_Res = FR_Res, f1, FR_HH // res = res - hh
+ nop.i 0
+};;
+
+{ .mfb
+ nop.m 0
+ fma.s0 FR_Res = FR_Res, f1, FR_GG // result = res + gg
+ br.ret.sptk b0 // Exit for near 1 path
+};;
+// NEAR ONE INTERVAL END
+
+
+
+
+acoshl_lt_pone: // Domain-error path: QNaN result, invalid flag, then the error region
+{ .mfi
+ nop.m 0
+ fmerge.s FR_Arg_X = FR_Arg, FR_Arg // Copy argument; error region stores FR_Arg_X as parameter 1
+ nop.i 0
+};;
+{ .mfb
+ mov GR_Parameter_TAG = 135 // Error tag (presumably read by __libm_error_support; meaning defined elsewhere)
+ frcpa.s0 FR_Res,p0 = f0,f0 // get QNaN,and raise invalid
+ br.cond.sptk __libm_error_region // exit if x < 1.0
+};;
+
+GLOBAL_LIBM_END(acoshl)
+
+
+LOCAL_LIBM_ENTRY(__libm_error_region) // Spill args/result to a 64-byte frame, call __libm_error_support, return f8
+.prologue
+{ .mfi
+ add GR_Parameter_Y = -32,sp // Parameter 2 address (old sp-32 = new sp+32)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS = ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp = -64,sp // Create new stack (64-byte frame)
+ nop.f 0
+ mov GR_SAVE_GP = gp // Save gp
+};;
+
+{ .mmi
+ stfe [GR_Parameter_Y] = FR_Arg_Y,16 // Parameter 2 to stack; pointer advances to sp+48
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0,GR_SAVE_B0
+ mov GR_SAVE_B0 = b0 // Save b0
+};;
+
+.body
+{ .mib
+ stfe [GR_Parameter_X] = FR_Arg_X // Parameter 1 to stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
+}
+{ .mib
+ stfe [GR_Parameter_Y] = FR_Res // Parameter 3 to stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Back to the parameter 2 address (sp+32)
+ br.call.sptk b0 = __libm_error_support# // Error handling function
+};;
+
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp // Re-form the result address after the call
+};;
+
+{ .mmi
+ ldfe f8 = [GR_Parameter_RESULT] // Get return res
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region#)
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
+
+
+
+
diff --git a/sysdeps/ia64/fpu/e_acosl.S b/sysdeps/ia64/fpu/e_acosl.S
index ab1bbf41a7..daa75b18a5 100644
--- a/sysdeps/ia64/fpu/e_acosl.S
+++ b/sysdeps/ia64/fpu/e_acosl.S
@@ -1,10 +1,10 @@
.file "acosl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2001 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,1027 +20,2469 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 2/07/00 Modified calculation of acos_corr to correct acosl
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
-// set [the previously overwritten] GR_Parameter_RESULT.
-// 12/20/00 Set denormal flag properly.
+// 08/28/01 New version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/06/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
-// double-extended = acosl (double-extended)
-// input floating point f8
-// output floating point f8
+// long double acosl(long double)
//
-// Registers used
+// Overview of operation
//==============================================================
+// Background
//
-// predicate registers used:
-// p6 -> p12
+// Implementation
//
-// floating-point registers used:
-// f8 has input, then output
-// f8 -> f15, f32 ->f99
+// For |s| in [2^{-4}, sqrt(2)/2]:
+// Let t= 2^k*1.b1 b2..b6 1, where s= 2^k*1.b1 b2.. b52
+// acos(s)= pi/2-asin(t)-asin(r), where r= s*sqrt(1-t^2)-t*sqrt(1-s^2), i.e.
+// r= (s-t)*sqrt(1-t^2)-t*sqrt(1-t^2)*(sqrt((1-s^2)/(1-t^2))-1)
+// asin(r)-r evaluated as 9-degree polynomial (c3*r^3+c5*r^5+c7*r^7+c9*r^9)
+// The 64-bit significands of sqrt(1-t^2), 1/(1-t^2) are read from the table,
+// along with the high and low parts of asin(t) (stored as two double precision
+// values)
//
-// general registers used:
-// r32 -> r48
+// |s| in (sqrt(2)/2, sqrt(255/256)):
+// Let t= 2^k*1.b1 b2..b6 1, where (1-s^2)*frsqrta(1-s^2)= 2^k*1.b1 b2..b6..
+// acos(|s|)= asin(t)-asin(r)
+// acos(-|s|)=pi-asin(t)+asin(r), r= s*t-sqrt(1-s^2)*sqrt(1-t^2)
+// To minimize accumulated errors, r is computed as
+// r= (t*s)_s-t^2*y*z+z*y*(t^2-1+s^2)_s+z*y*(1-s^2)_s*x+z'*y*(1-s^2)*PS29+
+// +(t*s-(t*s)_s)+z*y*((t^2-1-(t^2-1+s^2)_s)+s^2)+z*y*(1-s^2-(1-s^2)_s)+
+// +ez*z'*y*(1-s^2)*(1-x),
+// where y= frsqrta(1-s^2), z= (sqrt(1-t^2))_s (rounded to 24 significant bits)
+// z'= sqrt(1-t^2), x= ((1-s^2)*y^2-1)/2
+//
+// |s|<2^{-4}: evaluate asin(s) as 17-degree polynomial, return pi/2-asin(s)
+// (or simply return pi/2-s, if |s|<2^{-64})
+//
+// |s| in [sqrt(255/256), 1): acos(|s|)= asin(sqrt(1-s^2))
+// acos(-|s|)= pi-asin(sqrt(1-s^2))
+// use 17-degree polynomial for asin(sqrt(1-s^2)),
+// 9-degree polynomial to evaluate sqrt(1-s^2)
+// High order term is (pi)_high-(y*(1-s^2))_high, for s<0,
+// or y*(1-s^2)_s, for s>0
//
-// Overview of operation
-//==============================================================
-// There are three paths
-// 1. |x| < 2^-25 ACOS_TINY
-// 2. 2^-25 <= |x| < 1/4 ACOS_POLY
-// 3. 1/4 <= |x| < 1 ACOS_ATAN
-#include "libm_support.h"
-// Assembly macros
+
+// Registers used
//==============================================================
+// f6-f15, f32-f36
+// r2-r3, r23-r23
+// p6, p7, p8, p12
+//
-// f8 is input, but acos_V must be put in f8
-// when __libm_atan2_reg is called, f8 must get V
-// f9 gets U when __libm_atan2_reg is called
+ GR_SAVE_B0= r33
+ GR_SAVE_PFS= r34
+ GR_SAVE_GP= r35 // This reg. can safely be used
+ GR_SAVE_SP= r36
-// __libm_atan2_reg returns
-// f8 = Z_hi
-// f10 = Z_lo
-// f11 = s_lo
+ GR_Parameter_X= r37
+ GR_Parameter_Y= r38
+ GR_Parameter_RESULT= r39
+ GR_Parameter_TAG= r40
-acos_Z_hi = f8
-acos_Z_lo = f10
-acos_S_lo = f11
+ FR_X= f10
+ FR_Y= f1
+ FR_RESULT= f8
-// When we call __libm_atan2_reg, we must save
-// the following:
-acos_corr = f12
-acos_X = f13
-acos_pi_hi = f14
-acos_pi_lo = f15
-// The rest of the assembly macros
-
-acos_P79 = f32
-acos_P59 = f33
-acos_P39 = f34
-acos_P19 = f35
+RODATA
-acos_P810 = f36
-acos_P610 = f37
-acos_P410 = f38
-acos_P210 = f39
+.align 16
-acos_A1 = f41
-acos_A2 = f42
-acos_A3 = f43
-acos_A4 = f44
-acos_A5 = f45
-acos_A6 = f46
-acos_A7 = f47
-acos_A8 = f48
-acos_A9 = f49
-acos_A10 = f50
+LOCAL_OBJECT_START(T_table)
+
+// stores 64-bit significand of 1/(1-t^2), 64-bit significand of sqrt(1-t^2),
+// asin(t)_high (double precision), asin(t)_low (double precision)
+
+data8 0x80828692b71c4391, 0xff7ddcec2d87e879
+data8 0x3fb022bc0ae531a0, 0x3c9f599c7bb42af6
+data8 0x80869f0163d0b082, 0xff79cad2247914d3
+data8 0x3fb062dd26afc320, 0x3ca4eff21bd49c5c
+data8 0x808ac7d5a8690705, 0xff75a89ed6b626b9
+data8 0x3fb0a2ff4a1821e0, 0x3cb7e33b58f164cc
+data8 0x808f0112ad8ad2e0, 0xff7176517c2cc0cb
+data8 0x3fb0e32279319d80, 0x3caee31546582c43
+data8 0x80934abba8a1da0a, 0xff6d33e949b1ed31
+data8 0x3fb12346b8101da0, 0x3cb8bfe463d087cd
+data8 0x8097a4d3dbe63d8f, 0xff68e16571015c63
+data8 0x3fb1636c0ac824e0, 0x3c8870a7c5a3556f
+data8 0x809c0f5e9662b3dd, 0xff647ec520bca0f0
+data8 0x3fb1a392756ed280, 0x3c964f1a927461ae
+data8 0x80a08a5f33fadc66, 0xff600c07846a6830
+data8 0x3fb1e3b9fc19e580, 0x3c69eb3576d56332
+data8 0x80a515d91d71acd4, 0xff5b892bc475affa
+data8 0x3fb223e2a2dfbe80, 0x3c6a4e19fd972fb6
+data8 0x80a9b1cfc86ff7cd, 0xff56f631062cf93d
+data8 0x3fb2640c6dd76260, 0x3c62041160e0849e
+data8 0x80ae5e46b78b0d68, 0xff5253166bc17794
+data8 0x3fb2a43761187c80, 0x3cac61651af678c0
+data8 0x80b31b417a4b756b, 0xff4d9fdb14463dc8
+data8 0x3fb2e46380bb6160, 0x3cb06ef23eeba7a1
+data8 0x80b7e8c3ad33c369, 0xff48dc7e1baf6738
+data8 0x3fb32490d0d910c0, 0x3caa05f480b300d5
+data8 0x80bcc6d0f9c784d6, 0xff4408fe9ad13e37
+data8 0x3fb364bf558b3820, 0x3cb01e7e403aaab9
+data8 0x80c1b56d1692492d, 0xff3f255ba75f5f4e
+data8 0x3fb3a4ef12ec3540, 0x3cb4fe8fcdf5f5f1
+data8 0x80c6b49bc72ec446, 0xff3a319453ebd961
+data8 0x3fb3e5200d171880, 0x3caf2dc089b2b7e2
+data8 0x80cbc460dc4e0ae8, 0xff352da7afe64ac6
+data8 0x3fb425524827a720, 0x3cb75a855e7c6053
+data8 0x80d0e4c033bee9c4, 0xff301994c79afb32
+data8 0x3fb46585c83a5e00, 0x3cb3264981c019ab
+data8 0x80d615bdb87556db, 0xff2af55aa431f291
+data8 0x3fb4a5ba916c73c0, 0x3c994251d94427b5
+data8 0x80db575d6291fd8a, 0xff25c0f84bae0cb9
+data8 0x3fb4e5f0a7dbdb20, 0x3cbee2fcc4c786cb
+data8 0x80e0a9a33769e535, 0xff207c6cc0ec09fd
+data8 0x3fb526280fa74620, 0x3c940656e5549b91
+data8 0x80e60c93498e32cd, 0xff1b27b703a19c98
+data8 0x3fb56660ccee2740, 0x3ca7082374d7b2cd
+data8 0x80eb8031b8d4052d, 0xff15c2d6105c72f8
+data8 0x3fb5a69ae3d0b520, 0x3c7c4d46e09ac68a
+data8 0x80f10482b25c6c8a, 0xff104dc8e0813ed4
+data8 0x3fb5e6d6586fec20, 0x3c9aa84ffd9b4958
+data8 0x80f6998a709c7cfb, 0xff0ac88e6a4ab926
+data8 0x3fb627132eed9140, 0x3cbced2cbbbe7d16
+data8 0x80fc3f4d3b657c44, 0xff053325a0c8a2ec
+data8 0x3fb667516b6c34c0, 0x3c6489c5fc68595a
+data8 0x8101f5cf67ed2af8, 0xfeff8d8d73dec2bb
+data8 0x3fb6a791120f33a0, 0x3cbe12acf159dfad
+data8 0x8107bd1558d6291f, 0xfef9d7c4d043df29
+data8 0x3fb6e7d226fabba0, 0x3ca386d099cd0dc7
+data8 0x810d95237e38766a, 0xfef411ca9f80b5f7
+data8 0x3fb72814ae53cc20, 0x3cb9f35731e71dd6
+data8 0x81137dfe55aa0e29, 0xfeee3b9dc7eef009
+data8 0x3fb76858ac403a00, 0x3c74df3dd959141a
+data8 0x811977aa6a479f0f, 0xfee8553d2cb8122c
+data8 0x3fb7a89e24e6b0e0, 0x3ca6034406ee42bc
+data8 0x811f822c54bd5ef8, 0xfee25ea7add46a91
+data8 0x3fb7e8e51c6eb6a0, 0x3cb82f8f78e68ed7
+data8 0x81259d88bb4ffac1, 0xfedc57dc2809fb1d
+data8 0x3fb8292d9700ad60, 0x3cbebb73c0e653f9
+data8 0x812bc9c451e5a257, 0xfed640d974eb6068
+data8 0x3fb8697798c5d620, 0x3ca2feee76a9701b
+data8 0x813206e3da0f3124, 0xfed0199e6ad6b585
+data8 0x3fb8a9c325e852e0, 0x3cb9e88f2f4d0efe
+data8 0x813854ec231172f9, 0xfec9e229dcf4747d
+data8 0x3fb8ea1042932a00, 0x3ca5ff40d81f66fd
+data8 0x813eb3e209ee858f, 0xfec39a7a9b36538b
+data8 0x3fb92a5ef2f247c0, 0x3cb5e3bece4d6b07
+data8 0x814523ca796f56ce, 0xfebd428f72561efe
+data8 0x3fb96aaf3b3281a0, 0x3cb7b9e499436d7c
+data8 0x814ba4aa6a2d3ff9, 0xfeb6da672bd48fe4
+data8 0x3fb9ab011f819860, 0x3cb9168143cc1a7f
+data8 0x81523686e29bbdd7, 0xfeb062008df81f50
+data8 0x3fb9eb54a40e3ac0, 0x3cb6e544197eb1e1
+data8 0x8158d964f7124614, 0xfea9d95a5bcbd65a
+data8 0x3fba2ba9cd080800, 0x3ca9a717be8f7446
+data8 0x815f8d49c9d639e4, 0xfea34073551e1ac8
+data8 0x3fba6c009e9f9260, 0x3c741e989a60938a
+data8 0x8166523a8b24f626, 0xfe9c974a367f785c
+data8 0x3fbaac591d0661a0, 0x3cb2c1290107e57d
+data8 0x816d283c793e0114, 0xfe95ddddb94166cb
+data8 0x3fbaecb34c6ef600, 0x3c9c7d5fbaec405d
+data8 0x81740f54e06d55bd, 0xfe8f142c93750c50
+data8 0x3fbb2d0f310cca00, 0x3cbc09479a9cbcfb
+data8 0x817b07891b15cd5e, 0xfe883a3577e9fceb
+data8 0x3fbb6d6ccf1455e0, 0x3cb9450bff4ee307
+data8 0x818210de91bba6c8, 0xfe814ff7162cf62f
+data8 0x3fbbadcc2abb1180, 0x3c9227fda12a8d24
+data8 0x81892b5abb0f2bf9, 0xfe7a55701a8697b1
+data8 0x3fbbee2d48377700, 0x3cb6fad72acfe356
+data8 0x819057031bf7760e, 0xfe734a9f2dfa1810
+data8 0x3fbc2e902bc10600, 0x3cb4465b588d16ad
+data8 0x819793dd479d4fbe, 0xfe6c2f82f643f68b
+data8 0x3fbc6ef4d9904580, 0x3c8b9ac54823960d
+data8 0x819ee1eedf76367a, 0xfe65041a15d8a92c
+data8 0x3fbcaf5b55dec6a0, 0x3ca2b8d28a954db2
+data8 0x81a6413d934f7a66, 0xfe5dc8632be3477f
+data8 0x3fbcefc3a4e727a0, 0x3c9380da83713ab4
+data8 0x81adb1cf21597d4b, 0xfe567c5cd44431d5
+data8 0x3fbd302dcae51600, 0x3ca995b83421756a
+data8 0x81b533a9563310b8, 0xfe4f2005a78fb50f
+data8 0x3fbd7099cc155180, 0x3caefa2f7a817d5f
+data8 0x81bcc6d20cf4f373, 0xfe47b35c3b0caaeb
+data8 0x3fbdb107acb5ae80, 0x3cb455fc372dd026
+data8 0x81c46b4f2f3d6e68, 0xfe40365f20b316d6
+data8 0x3fbdf177710518c0, 0x3cbee3dcc5b01434
+data8 0x81cc2126b53c1144, 0xfe38a90ce72abf36
+data8 0x3fbe31e91d439620, 0x3cb3e131c950aebd
+data8 0x81d3e85ea5bd8ee2, 0xfe310b6419c9c33a
+data8 0x3fbe725cb5b24900, 0x3c01d3fac6029027
+data8 0x81dbc0fd1637b9c1, 0xfe295d6340932d15
+data8 0x3fbeb2d23e937300, 0x3c6304cc44aeedd1
+data8 0x81e3ab082ad5a0a4, 0xfe219f08e03580b3
+data8 0x3fbef349bc2a77e0, 0x3cac1d2d6abe9c72
+data8 0x81eba6861683cb97, 0xfe19d0537a0946e2
+data8 0x3fbf33c332bbe020, 0x3ca0909dba4e96ca
+data8 0x81f3b37d1afc9979, 0xfe11f1418c0f94e2
+data8 0x3fbf743ea68d5b60, 0x3c937fc12a2a779a
+data8 0x81fbd1f388d4be45, 0xfe0a01d190f09063
+data8 0x3fbfb4bc1be5c340, 0x3cbf51a504b55813
+data8 0x820401efbf87e248, 0xfe020201fff9efea
+data8 0x3fbff53b970d1e80, 0x3ca625444b260078
+data8 0x82106ad2ffdca049, 0xfdf5e3940a49135e
+data8 0x3fc02aff52065460, 0x3c9125d113e22a57
+data8 0x8221343d6ea1d3e2, 0xfde581a45429b0a0
+data8 0x3fc06b84f8e03220, 0x3caccf362295894b
+data8 0x82324434adbf99c2, 0xfdd4de1a001fb775
+data8 0x3fc0ac0ed1fe7240, 0x3cc22f676096b0af
+data8 0x82439aee8d0c7747, 0xfdc3f8e8269d1f03
+data8 0x3fc0ec9cee9e4820, 0x3cca147e2886a628
+data8 0x825538a1d0fcb2f0, 0xfdb2d201a9b1ba66
+data8 0x3fc12d2f6006f0a0, 0x3cc72b36633bc2d4
+data8 0x82671d86345c5cee, 0xfda1695934d723e7
+data8 0x3fc16dc63789de60, 0x3cb11f9c47c7b83f
+data8 0x827949d46a121770, 0xfd8fbee13cbbb823
+data8 0x3fc1ae618682e620, 0x3cce1b59020cef8e
+data8 0x828bbdc61eeab9ba, 0xfd7dd28bff0c9f34
+data8 0x3fc1ef015e586c40, 0x3cafec043e0225ee
+data8 0x829e7995fb6de9e1, 0xfd6ba44b823ee1ca
+data8 0x3fc22fa5d07b90c0, 0x3cba905409caf8e3
+data8 0x82b17d7fa5bbc982, 0xfd5934119557883a
+data8 0x3fc2704eee685da0, 0x3cb5ef21838a823e
+data8 0x82c4c9bfc373d276, 0xfd4681cfcfb2c161
+data8 0x3fc2b0fcc9a5f3e0, 0x3ccc7952c5e0e312
+data8 0x82d85e93fba50136, 0xfd338d7790ca0f41
+data8 0x3fc2f1af73c6ba00, 0x3cbecf5f977d1ca9
+data8 0x82ec3c3af8c76b32, 0xfd2056f9fff97727
+data8 0x3fc33266fe6889a0, 0x3c9d329c022ebdb5
+data8 0x830062f46abf6022, 0xfd0cde480c43b327
+data8 0x3fc373237b34de60, 0x3cc95806d4928adb
+data8 0x8314d30108ea35f0, 0xfcf923526c1562b2
+data8 0x3fc3b3e4fbe10520, 0x3cbc299fe7223d54
+data8 0x83298ca29434df97, 0xfce526099d0737ed
+data8 0x3fc3f4ab922e4a60, 0x3cb59d8bb8fdbccc
+data8 0x833e901bd93c7009, 0xfcd0e65de39f1f7c
+data8 0x3fc435774fea2a60, 0x3c9ec18b43340914
+data8 0x8353ddb0b278aad8, 0xfcbc643f4b106055
+data8 0x3fc4764846ee80a0, 0x3cb90402efd87ed6
+data8 0x836975a60a70c52e, 0xfca79f9da4fab13a
+data8 0x3fc4b71e8921b860, 0xbc58f23449ed6365
+data8 0x837f5841ddfa7a46, 0xfc92986889284148
+data8 0x3fc4f7fa2876fca0, 0xbc6294812bf43acd
+data8 0x839585cb3e839773, 0xfc7d4e8f554ab12f
+data8 0x3fc538db36ee6960, 0x3cb910b773d4c578
+data8 0x83abfe8a5466246f, 0xfc67c2012cb6fa68
+data8 0x3fc579c1c6953cc0, 0x3cc5ede909fc47fc
+data8 0x83c2c2c861474d91, 0xfc51f2acf82041d5
+data8 0x3fc5baade9860880, 0x3cac63cdfc3588e5
+data8 0x83d9d2cfc2813637, 0xfc3be08165519325
+data8 0x3fc5fb9fb1e8e3a0, 0x3cbf7c8466578c29
+data8 0x83f12eebf397daac, 0xfc258b6ce6e6822f
+data8 0x3fc63c9731f39d40, 0x3cb6d2a7ffca3e9e
+data8 0x8408d76990b9296e, 0xfc0ef35db402af94
+data8 0x3fc67d947be9eec0, 0x3cb1980da09e6566
+data8 0x8420cc9659487cd7, 0xfbf81841c8082dc4
+data8 0x3fc6be97a21daf00, 0x3cc2ac8330e59aa5
+data8 0x84390ec132759ecb, 0xfbe0fa06e24cc390
+data8 0x3fc6ffa0b6ef05e0, 0x3ccc1a030fee56c4
+data8 0x84519e3a29df811a, 0xfbc9989a85ce0954
+data8 0x3fc740afcccca000, 0x3cc19692a5301ca6
+data8 0x846a7b527842d61b, 0xfbb1f3e9f8e45dc4
+data8 0x3fc781c4f633e2c0, 0x3cc0e98f3868a508
+data8 0x8483a65c8434b5f0, 0xfb9a0be244f4af45
+data8 0x3fc7c2e045b12140, 0x3cb2a8d309754420
+data8 0x849d1fabe4e97dd7, 0xfb81e070362116d1
+data8 0x3fc80401cddfd120, 0x3ca7a44544aa4ce6
+data8 0x84b6e795650817ea, 0xfb6971805af8411e
+data8 0x3fc84529a16ac020, 0x3c9e3b709c7d6f94
+data8 0x84d0fe6f0589da92, 0xfb50beff0423a2f5
+data8 0x3fc88657d30c49e0, 0x3cc60d65a7f0a278
+data8 0x84eb649000a73014, 0xfb37c8d84414755c
+data8 0x3fc8c78c758e8e80, 0x3cc94b2ee984c2b7
+data8 0x85061a50ccd13781, 0xfb1e8ef7eeaf764b
+data8 0x3fc908c79bcba900, 0x3cc8540ae794a2fe
+data8 0x8521200b1fb8916e, 0xfb05114998f76a83
+data8 0x3fc94a0958ade6c0, 0x3ca127f49839fa9c
+data8 0x853c7619f1618bf6, 0xfaeb4fb898b65d19
+data8 0x3fc98b51bf2ffee0, 0x3c8c9ba7a803909a
+data8 0x85581cd97f45e274, 0xfad14a3004259931
+data8 0x3fc9cca0e25d4ac0, 0x3cba458e91d3bf54
+data8 0x857414a74f8446b4, 0xfab7009ab1945a54
+data8 0x3fca0df6d551fe80, 0x3cc78ea1d329d2b2
+data8 0x85905de2341dea46, 0xfa9c72e3370d2fbc
+data8 0x3fca4f53ab3b6200, 0x3ccf60dca86d57ef
+data8 0x85acf8ea4e423ff8, 0xfa81a0f3e9fa0ee9
+data8 0x3fca90b777580aa0, 0x3ca4c4e2ec8a867e
+data8 0x85c9e62111a92e7d, 0xfa668ab6dec711b1
+data8 0x3fcad2224cf814e0, 0x3c303de5980d071c
+data8 0x85e725e947fbee97, 0xfa4b3015e883dbfe
+data8 0x3fcb13943f7d5f80, 0x3cc29d4eefa5cb1e
+data8 0x8604b8a7144cd054, 0xfa2f90fa9883a543
+data8 0x3fcb550d625bc6a0, 0x3c9e01a746152daf
+data8 0x86229ebff69e2415, 0xfa13ad4e3dfbe1c1
+data8 0x3fcb968dc9195ea0, 0x3ccc091bd73ae518
+data8 0x8640d89acf78858c, 0xf9f784f9e5a1877b
+data8 0x3fcbd815874eb160, 0x3cb5f4b89875e187
+data8 0x865f669fe390c7f5, 0xf9db17e65944eacf
+data8 0x3fcc19a4b0a6f9c0, 0x3cc5c0bc2b0bbf14
+data8 0x867e4938df7dc45f, 0xf9be65fc1f6c2e6e
+data8 0x3fcc5b3b58e061e0, 0x3cc1ca70df8f57e7
+data8 0x869d80d0db7e4c0c, 0xf9a16f237aec427a
+data8 0x3fcc9cd993cc4040, 0x3cbae93acc85eccf
+data8 0x86bd0dd45f4f8265, 0xf98433446a806e70
+data8 0x3fccde7f754f5660, 0x3cb22f70e64568d0
+data8 0x86dcf0b16613e37a, 0xf966b246a8606170
+data8 0x3fcd202d11620fa0, 0x3c962030e5d4c849
+data8 0x86fd29d7624b3d5d, 0xf948ec11a9d4c45b
+data8 0x3fcd61e27c10c0a0, 0x3cc7083c91d59217
+data8 0x871db9b741dbe44a, 0xf92ae08c9eca4941
+data8 0x3fcda39fc97be7c0, 0x3cc9258579e57211
+data8 0x873ea0c3722d6af2, 0xf90c8f9e71633363
+data8 0x3fcde5650dd86d60, 0x3ca4755a9ea582a9
+data8 0x875fdf6fe45529e8, 0xf8edf92dc5875319
+data8 0x3fce27325d6fe520, 0x3cbc1e2b6c1954f9
+data8 0x878176321154e2bc, 0xf8cf1d20f87270b8
+data8 0x3fce6907cca0d060, 0x3cb6ca4804750830
+data8 0x87a36580fe6bccf5, 0xf8affb5e20412199
+data8 0x3fceaae56fdee040, 0x3cad6b310d6fd46c
+data8 0x87c5add5417a5cb9, 0xf89093cb0b7c0233
+data8 0x3fceeccb5bb33900, 0x3cc16e99cedadb20
+data8 0x87e84fa9057914ca, 0xf870e64d40a15036
+data8 0x3fcf2eb9a4bcb600, 0x3cc75ee47c8b09e9
+data8 0x880b4b780f02b709, 0xf850f2c9fdacdf78
+data8 0x3fcf70b05fb02e20, 0x3cad6350d379f41a
+data8 0x882ea1bfc0f228ac, 0xf830b926379e6465
+data8 0x3fcfb2afa158b8a0, 0x3cce0ccd9f829985
+data8 0x885252ff21146108, 0xf810394699fe0e8e
+data8 0x3fcff4b77e97f3e0, 0x3c9b30faa7a4c703
+data8 0x88765fb6dceebbb3, 0xf7ef730f865f6df0
+data8 0x3fd01b6406332540, 0x3cdc5772c9e0b9bd
+data8 0x88ad1f69be2cc730, 0xf7bdc59bc9cfbd97
+data8 0x3fd04cf8ad203480, 0x3caeef44fe21a74a
+data8 0x88f763f70ae2245e, 0xf77a91c868a9c54e
+data8 0x3fd08f23ce0162a0, 0x3cd6290ab3fe5889
+data8 0x89431fc7bc0c2910, 0xf73642973c91298e
+data8 0x3fd0d1610f0c1ec0, 0x3cc67401a01f08cf
+data8 0x8990573407c7738e, 0xf6f0d71d1d7a2dd6
+data8 0x3fd113b0c65d88c0, 0x3cc7aa4020fe546f
+data8 0x89df0eb108594653, 0xf6aa4e6a05cfdef2
+data8 0x3fd156134ada6fe0, 0x3cc87369da09600c
+data8 0x8a2f4ad16e0ed78a, 0xf662a78900c35249
+data8 0x3fd19888f43427a0, 0x3cc62b220f38e49c
+data8 0x8a811046373e0819, 0xf619e180181d97cc
+data8 0x3fd1db121aed7720, 0x3ca3ede7490b52f4
+data8 0x8ad463df6ea0fa2c, 0xf5cffb504190f9a2
+data8 0x3fd21daf185fa360, 0x3caafad98c1d6c1b
+data8 0x8b294a8cf0488daf, 0xf584f3f54b8604e6
+data8 0x3fd2606046bf95a0, 0x3cdb2d704eeb08fa
+data8 0x8b7fc95f35647757, 0xf538ca65c960b582
+data8 0x3fd2a32601231ec0, 0x3cc661619fa2f126
+data8 0x8bd7e588272276f8, 0xf4eb7d92ff39fccb
+data8 0x3fd2e600a3865760, 0x3c8a2a36a99aca4a
+data8 0x8c31a45bf8e9255e, 0xf49d0c68cd09b689
+data8 0x3fd328f08ad12000, 0x3cb9efaf1d7ab552
+data8 0x8c8d0b520a35eb18, 0xf44d75cd993cfad2
+data8 0x3fd36bf614dcc040, 0x3ccacbb590bef70d
+data8 0x8cea2005d068f23d, 0xf3fcb8a23ab4942b
+data8 0x3fd3af11a079a6c0, 0x3cd9775872cf037d
+data8 0x8d48e837c8cd5027, 0xf3aad3c1e2273908
+data8 0x3fd3f2438d754b40, 0x3ca03304f667109a
+data8 0x8da969ce732f3ac7, 0xf357c60202e2fd7e
+data8 0x3fd4358c3ca032e0, 0x3caecf2504ff1a9d
+data8 0x8e0baad75555e361, 0xf3038e323ae9463a
+data8 0x3fd478ec0fd419c0, 0x3cc64bdc3d703971
+data8 0x8e6fb18807ba877e, 0xf2ae2b1c3a6057f7
+data8 0x3fd4bc6369fa40e0, 0x3cbb7122ec245cf2
+data8 0x8ed5843f4bda74d5, 0xf2579b83aa556f0c
+data8 0x3fd4fff2af11e2c0, 0x3c9cfa2dc792d394
+data8 0x8f3d29862c861fef, 0xf1ffde2612ca1909
+data8 0x3fd5439a4436d000, 0x3cc38d46d310526b
+data8 0x8fa6a81128940b2d, 0xf1a6f1bac0075669
+data8 0x3fd5875a8fa83520, 0x3cd8bf59b8153f8a
+data8 0x901206c1686317a6, 0xf14cd4f2a730d480
+data8 0x3fd5cb33f8cf8ac0, 0x3c9502b5c4d0e431
+data8 0x907f4ca5fe9cf739, 0xf0f186784a125726
+data8 0x3fd60f26e847b120, 0x3cc8a1a5e0acaa33
+data8 0x90ee80fd34aeda5e, 0xf09504ef9a212f18
+data8 0x3fd65333c7e43aa0, 0x3cae5b029cb1f26e
+data8 0x915fab35e37421c6, 0xf0374ef5daab5c45
+data8 0x3fd6975b02b8e360, 0x3cd5aa1c280c45e6
+data8 0x91d2d2f0d894d73c, 0xefd86321822dbb51
+data8 0x3fd6db9d05213b20, 0x3cbecf2c093ccd8b
+data8 0x9248000249200009, 0xef7840021aca5a72
+data8 0x3fd71ffa3cc87fc0, 0x3cb8d273f08d00d9
+data8 0x92bf3a7351f081d2, 0xef16e42021d7cbd5
+data8 0x3fd7647318b1ad20, 0x3cbce099d79cdc46
+data8 0x93388a8386725713, 0xeeb44dfce6820283
+data8 0x3fd7a908093fc1e0, 0x3ccb033ec17a30d9
+data8 0x93b3f8aa8e653812, 0xee507c126774fa45
+data8 0x3fd7edb9803e3c20, 0x3cc10aedb48671eb
+data8 0x94318d99d341ade4, 0xedeb6cd32f891afb
+data8 0x3fd83287f0e9cf80, 0x3c994c0c1505cd2a
+data8 0x94b1523e3dedc630, 0xed851eaa3168f43c
+data8 0x3fd87773cff956e0, 0x3cda3b7bce6a6b16
+data8 0x95334fc20577563f, 0xed1d8ffaa2279669
+data8 0x3fd8bc7d93a70440, 0x3cd4922edc792ce2
+data8 0x95b78f8e8f92f274, 0xecb4bf1fd2be72da
+data8 0x3fd901a5b3b9cf40, 0x3cd3fea1b00f9d0d
+data8 0x963e1b4e63a87c3f, 0xec4aaa6d08694cc1
+data8 0x3fd946eca98f2700, 0x3cdba4032d968ff1
+data8 0x96c6fcef314074fc, 0xebdf502d53d65fea
+data8 0x3fd98c52f024e800, 0x3cbe7be1ab8c95c9
+data8 0x97523ea3eab028b2, 0xeb72aea36720793e
+data8 0x3fd9d1d904239860, 0x3cd72d08a6a22b70
+data8 0x97dfeae6f4ee4a9a, 0xeb04c4096a884e94
+data8 0x3fda177f63e8ef00, 0x3cd818c3c1ebfac7
+data8 0x98700c7c6d85d119, 0xea958e90cfe1efd7
+data8 0x3fda5d468f92a540, 0x3cdf45fbfaa080fe
+data8 0x9902ae7487a9caa1, 0xea250c6224aab21a
+data8 0x3fdaa32f090998e0, 0x3cd715a9353cede4
+data8 0x9997dc2e017a9550, 0xe9b33b9ce2bb7638
+data8 0x3fdae939540d3f00, 0x3cc545c014943439
+data8 0x9a2fa158b29b649b, 0xe9401a573f8aa706
+data8 0x3fdb2f65f63f6c60, 0x3cd4a63c2f2ca8e2
+data8 0x9aca09f835466186, 0xe8cba69df9f0bf35
+data8 0x3fdb75b5773075e0, 0x3cda310ce1b217ec
+data8 0x9b672266ab1e0136, 0xe855de74266193d4
+data8 0x3fdbbc28606babc0, 0x3cdc84b75cca6c44
+data8 0x9c06f7579f0b7bd5, 0xe7debfd2f98c060b
+data8 0x3fdc02bf3d843420, 0x3cd225d967ffb922
+data8 0x9ca995db058cabdc, 0xe76648a991511c6e
+data8 0x3fdc497a9c224780, 0x3cde08101c5b825b
+data8 0x9d4f0b605ce71e88, 0xe6ec76dcbc02d9a7
+data8 0x3fdc905b0c10d420, 0x3cb1abbaa3edf120
+data8 0x9df765b9eecad5e6, 0xe6714846bdda7318
+data8 0x3fdcd7611f4b8a00, 0x3cbf6217ae80aadf
+data8 0x9ea2b320350540fe, 0xe5f4bab71494cd6b
+data8 0x3fdd1e8d6a0d56c0, 0x3cb726e048cc235c
+data8 0x9f51023562fc5676, 0xe576cbf239235ecb
+data8 0x3fdd65e082df5260, 0x3cd9e66872bd5250
+data8 0xa002620915c2a2f6, 0xe4f779b15f5ec5a7
+data8 0x3fddad5b02a82420, 0x3c89743b0b57534b
+data8 0xa0b6e21c2caf9992, 0xe476c1a233a7873e
+data8 0x3fddf4fd84bbe160, 0x3cbf7adea9ee3338
+data8 0xa16e9264cc83a6b2, 0xe3f4a16696608191
+data8 0x3fde3cc8a6ec6ee0, 0x3cce46f5a51f49c6
+data8 0xa22983528f3d8d49, 0xe3711694552da8a8
+data8 0x3fde84bd099a6600, 0x3cdc78f6490a2d31
+data8 0xa2e7c5d2e2e69460, 0xe2ec1eb4e1e0a5fb
+data8 0x3fdeccdb4fc685c0, 0x3cdd3aedb56a4825
+data8 0xa3a96b5599bd2532, 0xe265b74506fbe1c9
+data8 0x3fdf15241f23b3e0, 0x3cd440f3c6d65f65
+data8 0xa46e85d1ae49d7de, 0xe1ddddb499b3606f
+data8 0x3fdf5d98202994a0, 0x3cd6c44bd3fb745a
+data8 0xa53727ca3e11b99e, 0xe1548f662951b00d
+data8 0x3fdfa637fe27bf60, 0x3ca8ad1cd33054dd
+data8 0xa6036453bdc20186, 0xe0c9c9aeabe5e481
+data8 0x3fdfef0467599580, 0x3cc0f1ac0685d78a
+data8 0xa6d34f1969dda338, 0xe03d89d5281e4f81
+data8 0x3fe01bff067d6220, 0x3cc0731e8a9ef057
+data8 0xa7a6fc62f7246ff3, 0xdfafcd125c323f54
+data8 0x3fe04092d1ae3b40, 0x3ccabda24b59906d
+data8 0xa87e811a861df9b9, 0xdf20909061bb9760
+data8 0x3fe0653df0fd9fc0, 0x3ce94c8dcc722278
+data8 0xa959f2d2dd687200, 0xde8fd16a4e5f88bd
+data8 0x3fe08a00c1cae320, 0x3ce6b888bb60a274
+data8 0xaa3967cdeea58bda, 0xddfd8cabd1240d22
+data8 0x3fe0aedba3221c00, 0x3ced5941cd486e46
+data8 0xab904fd587263c84, 0xdd1f4472e1cf64ed
+data8 0x3fe0e651e85229c0, 0x3cdb6701042299b1
+data8 0xad686d44dd5a74bb, 0xdbf173e1f6b46e92
+data8 0x3fe1309cbf4cdb20, 0x3cbf1be7bb3f0ec5
+data8 0xaf524e15640ebee4, 0xdabd54896f1029f6
+data8 0x3fe17b4ee1641300, 0x3ce81dd055b792f1
+data8 0xb14eca24ef7db3fa, 0xd982cb9ae2f47e41
+data8 0x3fe1c66b9ffd6660, 0x3cd98ea31eb5ddc7
+data8 0xb35ec807669920ce, 0xd841bd1b8291d0b6
+data8 0x3fe211f66db3a5a0, 0x3ca480c35a27b4a2
+data8 0xb5833e4755e04dd1, 0xd6fa0bd3150b6930
+data8 0x3fe25df2e05b6c40, 0x3ca4bc324287a351
+data8 0xb7bd34c8000b7bd3, 0xd5ab9939a7d23aa1
+data8 0x3fe2aa64b32f7780, 0x3cba67314933077c
+data8 0xba0dc64d126cc135, 0xd4564563ce924481
+data8 0x3fe2f74fc9289ac0, 0x3cec1a1dc0efc5ec
+data8 0xbc76222cbbfa74a6, 0xd2f9eeed501125a8
+data8 0x3fe344b82f859ac0, 0x3ceeef218de413ac
+data8 0xbef78e31985291a9, 0xd19672e2182f78be
+data8 0x3fe392a22087b7e0, 0x3cd2619ba201204c
+data8 0xc19368b2b0629572, 0xd02baca5427e436a
+data8 0x3fe3e11206694520, 0x3cb5d0b3143fe689
+data8 0xc44b2ae8c6733e51, 0xceb975d60b6eae5d
+data8 0x3fe4300c7e945020, 0x3cbd367143da6582
+data8 0xc7206b894212dfef, 0xcd3fa6326ff0ac9a
+data8 0x3fe47f965d201d60, 0x3ce797c7a4ec1d63
+data8 0xca14e1b0622de526, 0xcbbe13773c3c5338
+data8 0x3fe4cfb4b09d1a20, 0x3cedfadb5347143c
+data8 0xcd2a6825eae65f82, 0xca34913d425a5ae9
+data8 0x3fe5206cc637e000, 0x3ce2798b38e54193
+data8 0xd06301095e1351ee, 0xc8a2f0d3679c08c0
+data8 0x3fe571c42e3d0be0, 0x3ccd7cb9c6c2ca68
+data8 0xd3c0d9f50057adda, 0xc70901152d59d16b
+data8 0x3fe5c3c0c108f940, 0x3ceb6c13563180ab
+data8 0xd74650a98cc14789, 0xc5668e3d4cbf8828
+data8 0x3fe61668a46ffa80, 0x3caa9092e9e3c0e5
+data8 0xdaf5f8579dcc8f8f, 0xc3bb61b3eed42d02
+data8 0x3fe669c251ad69e0, 0x3cccf896ef3b4fee
+data8 0xded29f9f9a6171b4, 0xc20741d7f8e8e8af
+data8 0x3fe6bdd49bea05c0, 0x3cdc6b29937c575d
+data8 0xe2df5765854ccdb0, 0xc049f1c2d1b8014b
+data8 0x3fe712a6b76c6e80, 0x3ce1ddc6f2922321
+data8 0xe71f7a9b94fcb4c3, 0xbe833105ec291e91
+data8 0x3fe76840418978a0, 0x3ccda46e85432c3d
+data8 0xeb96b72d3374b91e, 0xbcb2bb61493b28b3
+data8 0x3fe7bea9496d5a40, 0x3ce37b42ec6e17d3
+data8 0xf049183c3f53c39b, 0xbad848720223d3a8
+data8 0x3fe815ea59dab0a0, 0x3cb03ad41bfc415b
+data8 0xf53b11ec7f415f15, 0xb8f38b57c53c9c48
+data8 0x3fe86e0c84010760, 0x3cc03bfcfb17fe1f
+data8 0xfa718f05adbf2c33, 0xb70432500286b185
+data8 0x3fe8c7196b9225c0, 0x3ced99fcc6866ba9
+data8 0xfff200c3f5489608, 0xb509e6454dca33cc
+data8 0x3fe9211b54441080, 0x3cb789cb53515688
+// The following table entries are not used
+//data8 0x82e138a0fac48700, 0xb3044a513a8e6132
+//data8 0x3fe97c1d30f5b7c0, 0x3ce1eb765612d1d0
+//data8 0x85f4cc7fc670d021, 0xb0f2fb2ea6cbbc88
+//data8 0x3fe9d82ab4b5fde0, 0x3ced3fe6f27e8039
+//data8 0x89377c1387d5b908, 0xaed58e9a09014d5c
+//data8 0x3fea355065f87fa0, 0x3cbef481d25f5b58
+//data8 0x8cad7a2c98dec333, 0xacab929ce114d451
+//data8 0x3fea939bb451e2a0, 0x3c8e92b4fbf4560f
+//data8 0x905b7dfc99583025, 0xaa748cc0dbbbc0ec
+//data8 0x3feaf31b11270220, 0x3cdced8c61bd7bd5
+//data8 0x9446d8191f80dd42, 0xa82ff92687235baf
+//data8 0x3feb53de0bcffc20, 0x3cbe1722fb47509e
+//data8 0x98758ba086e4000a, 0xa5dd497a9c184f58
+//data8 0x3febb5f571cb0560, 0x3ce0c7774329a613
+//data8 0x9cee6c7bf18e4e24, 0xa37be3c3cd1de51b
+//data8 0x3fec197373bc7be0, 0x3ce08ebdb55c3177
+//data8 0xa1b944000a1b9440, 0xa10b2101b4f27e03
+//data8 0x3fec7e6bd023da60, 0x3ce5fc5fd4995959
+//data8 0xa6defd8ba04d3e38, 0x9e8a4b93cad088ec
+//data8 0x3fece4f404e29b20, 0x3cea3413401132b5
+//data8 0xac69dd408a10c62d, 0x9bf89d5d17ddae8c
+//data8 0x3fed4d2388f63600, 0x3cd5a7fb0d1d4276
+//data8 0xb265c39cbd80f97a, 0x99553d969fec7beb
+//data8 0x3fedb714101e0a00, 0x3cdbda21f01193f2
+//data8 0xb8e081a16ae4ae73, 0x969f3e3ed2a0516c
+//data8 0x3fee22e1da97bb00, 0x3ce7231177f85f71
+//data8 0xbfea427678945732, 0x93d5990f9ee787af
+//data8 0x3fee90ac13b18220, 0x3ce3c8a5453363a5
+//data8 0xc79611399b8c90c5, 0x90f72bde80febc31
+//data8 0x3fef009542b712e0, 0x3ce218fd79e8cb56
+//data8 0xcffa8425040624d7, 0x8e02b4418574ebed
+//data8 0x3fef72c3d2c57520, 0x3cd32a717f82203f
+//data8 0xd93299cddcf9cf23, 0x8af6ca48e9c44024
+//data8 0x3fefe762b77744c0, 0x3ce53478a6bbcf94
+//data8 0xe35eda760af69ad9, 0x87d1da0d7f45678b
+//data8 0x3ff02f511b223c00, 0x3ced6e11782c28fc
+//data8 0xeea6d733421da0a6, 0x84921bbe64ae029a
+//data8 0x3ff06c5c6f8ce9c0, 0x3ce71fc71c1ffc02
+//data8 0xfb3b2c73fc6195cc, 0x813589ba3a5651b6
+//data8 0x3ff0aaf2613700a0, 0x3cf2a72d2fd94ef3
+//data8 0x84ac1fcec4203245, 0xfb73a828893df19e
+//data8 0x3ff0eb367c3fd600, 0x3cf8054c158610de
+//data8 0x8ca50621110c60e6, 0xf438a14c158d867c
+//data8 0x3ff12d51caa6b580, 0x3ce6bce9748739b6
+//data8 0x95b8c2062d6f8161, 0xecb3ccdd37b369da
+//data8 0x3ff1717418520340, 0x3ca5c2732533177c
+//data8 0xa0262917caab4ad1, 0xe4dde4ddc81fd119
+//data8 0x3ff1b7d59dd40ba0, 0x3cc4c7c98e870ff5
+//data8 0xac402c688b72f3f4, 0xdcae469be46d4c8d
+//data8 0x3ff200b93cc5a540, 0x3c8dd6dc1bfe865a
+//data8 0xba76968b9eabd9ab, 0xd41a8f3df1115f7f
+//data8 0x3ff24c6f8f6affa0, 0x3cf1acb6d2a7eff7
+//data8 0xcb63c87c23a71dc5, 0xcb161074c17f54ec
+//data8 0x3ff29b5b338b7c80, 0x3ce9b5845f6ec746
+//data8 0xdfe323b8653af367, 0xc19107d99ab27e42
+//data8 0x3ff2edf6fac7f5a0, 0x3cf77f961925fa02
+//data8 0xf93746caaba3e1f1, 0xb777744a9df03bff
+//data8 0x3ff344df237486c0, 0x3cf6ddf5f6ddda43
+//data8 0x8ca77052f6c340f0, 0xacaf476f13806648
+//data8 0x3ff3a0dfa4bb4ae0, 0x3cfee01bbd761bff
+//data8 0xa1a48604a81d5c62, 0xa11575d30c0aae50
+//data8 0x3ff4030b73c55360, 0x3cf1cf0e0324d37c
+//data8 0xbe45074b05579024, 0x9478e362a07dd287
+//data8 0x3ff46ce4c738c4e0, 0x3ce3179555367d12
+//data8 0xe7a08b5693d214ec, 0x8690e3575b8a7c3b
+//data8 0x3ff4e0a887c40a80, 0x3cfbd5d46bfefe69
+//data8 0x94503d69396d91c7, 0xedd2ce885ff04028
+//data8 0x3ff561ebd9c18cc0, 0x3cf331bd176b233b
+//data8 0xced1d96c5bb209e6, 0xc965278083808702
+//data8 0x3ff5f71d7ff42c80, 0x3ce3301cc0b5a48c
+//data8 0xabac2cee0fc24e20, 0x9c4eb1136094cbbd
+//data8 0x3ff6ae4c63222720, 0x3cf5ff46874ee51e
+//data8 0x8040201008040201, 0xb4d7ac4d9acb1bf4
+//data8 0x3ff7b7d33b928c40, 0x3cfacdee584023bb
+LOCAL_OBJECT_END(T_table)
-acos_X2 = f51
-acos_X4 = f52
-acos_B = f53
-acos_Bb = f54
-acos_A = f55
-acos_Aa = f56
-acos_1mA = f57
+.align 16
-acos_W = f58
-acos_Ww = f59
+LOCAL_OBJECT_START(poly_coeffs)
+ // C_3 ~ 1/6 (asin series coefficient): 64-bit significand, then sign/exponent word; read with ldfe
+data8 0xaaaaaaaaaaaaaaab, 0x0000000000003ffc
+ // C_5 ~ 3/40, same significand + exponent layout as C_3; read with ldfe
+data8 0x999999999999999a, 0x0000000000003ffb
+ // C_7 ~ 5/112, C_9 ~ 35/1152 as a pair of doubles; read with ldfpd
+data8 0x3fa6db6db6db6db7, 0x3f9f1c71c71c71c8
+ // pi/2 (low, high) as doubles at byte offset 48, i.e. where r3 points after the three 16-byte post-increment loads above
+data8 0x3C91A62633145C07, 0x3FF921FB54442D18
+ // C_11, C_13 as a pair of doubles
+data8 0x3f96e8ba2e8ba2e9, 0x3f91c4ec4ec4ec4e
+ // C_15, C_17 as a pair of doubles
+data8 0x3f8c99999999999a, 0x3f87a87878787223
+ // pi (low, high) as doubles at byte offset 96; the LARGE_S path reaches it by adding 48 to r3 after its three coefficient loads
+data8 0x3CA1A62633145C07, 0x400921FB54442D18
+LOCAL_OBJECT_END(poly_coeffs)
+
+
+R_DBL_S = r21 // integer register aliases start here; r21 receives the input's double-format bits via getf.d
+R_EXP0 = r22 // R_EXP with the sign bit masked off
+R_EXP = r15 // sign|exponent of the input, from getf.exp
+R_SGNMASK = r23 // 0x20000, the sign bit of the getf.exp result
+R_TMP = r24
+R_TMP2 = r25
+R_INDEX = r26 // T_table index; shifted left by 5 before being added to the table base
+R_TMP3 = r27
+R_TMP03 = r27 // same physical register as R_TMP3
+R_TMP4 = r28
+R_TMP5 = r23 // same physical register as R_SGNMASK
+R_TMP6 = r22 // same physical register as R_EXP0
+R_TMP7 = r21 // same physical register as R_DBL_S
+R_T = r29 // bit pattern of t, moved to F_T with setf.d
+R_BIAS = r20 // 0xc0 = 3*2^6, subtracted from R_INDEX
+// Floating-point register aliases
+F_T = f6
+F_1S2 = f7
+F_1S2_S = f9
+F_INV_1T2 = f10
+F_SQRT_1T2 = f11
+F_S2T2 = f12
+F_X = f13
+F_D = f14
+F_2M64 = f15 // 2^{-64}, built with setf.exp from bias-64
+
+F_CS2 = f32
+F_CS3 = f33
+F_CS4 = f34
+F_CS5 = f35
+F_CS6 = f36
+F_CS7 = f37
+F_CS8 = f38
+F_CS9 = f39
+F_S23 = f40
+F_S45 = f41
+F_S67 = f42
+F_S89 = f43
+F_S25 = f44
+F_S69 = f45
+F_S29 = f46
+F_X2 = f47
+F_X4 = f48
+F_TSQRT = f49
+F_DTX = f50
+F_R = f51
+F_R2 = f52
+F_R3 = f53
+F_R4 = f54
+
+F_C3 = f55
+F_C5 = f56
+F_C7 = f57
+F_C9 = f58
+F_P79 = f59
+F_P35 = f60
+F_P39 = f61
+
+F_ATHI = f62
+F_ATLO = f63
+
+F_T1 = f64
+F_Y = f65
+F_Y2 = f66
+F_ANDMASK = f67
+F_ORMASK = f68
+F_S = f69
+F_05 = f70 // 0.5, built with setf.exp from bias-1
+F_SQRT_1S2 = f71
+F_DS = f72
+F_Z = f73
+F_1T2 = f74
+F_DZ = f75
+F_ZE = f76
+F_YZ = f77
+F_Y1S2 = f78
+F_Y1S2X = f79
+F_1X = f80
+F_ST = f81
+F_1T2_ST = f82
+F_TSS = f83
+F_Y1S2X2 = f84
+F_DZ_TERM = f85
+F_DTS = f86
+F_DS2X = f87
+F_T2 = f88
+F_ZY1S2S = f89
+F_Y1S2_1X = f90
+F_TS = f91
+F_PI2_LO = f92
+F_PI2_HI = f93
+F_S19 = f94
+F_INV1T2_2 = f95
+F_CORR = f96
+F_DZ0 = f97
+
+F_C11 = f98
+F_C13 = f99
+F_C15 = f100
+F_C17 = f101
+F_P1113 = f102
+F_P1517 = f103
+F_P1117 = f104
+F_P317 = f105
+F_R8 = f106
+F_HI = f107
+F_1S2_HI = f108
+F_DS2 = f109
+F_Y2_2 = f110
+//F_S2 = f111
+//F_S_DS2 = f112
+F_S_1S2S = f113
+F_XL = f114
+F_2M128 = f115
+F_1AS = f116
+F_AS = f117
-acos_y0 = f60
-acos_y1 = f61
-acos_y2 = f62
-acos_H = f63
-acos_Hh = f64
-acos_t1 = f65
-acos_t2 = f66
-acos_t3 = f67
-acos_t4 = f68
-acos_t5 = f69
+.section .text
+GLOBAL_LIBM_ENTRY(acosl)
-acos_Pseries = f70
-acos_NORM_f8 = f71
-acos_ABS_NORM_f8 = f72
+{.mfi
+ // get exponent, mantissa (rounded to double precision) of s
+ getf.d R_DBL_S = f8
+ // 1-s^2
+ fnma.s1 F_1S2 = f8, f8, f1
+ // r2 = pointer to T_table
+ addl r2 = @ltoff(T_table), gp
+}
-acos_2 = f73
-acos_P1P2 = f74
-acos_HALF = f75
-acos_U = f76
+{.mfi
+ // sign mask
+ mov R_SGNMASK = 0x20000
+ nop.f 0
+ // bias-63-1
+ mov R_TMP03 = 0xffff-64;;
+}
-acos_1mB = f77
-acos_V = f78
-acos_S = f79
-acos_BmUU = f80
-acos_BmUUpb = f81
-acos_2U = f82
-acos_1d2U = f83
+{.mfi
+ // get exponent of s
+ getf.exp R_EXP = f8
+ nop.f 0
+ // R_TMP4 = 2^45
+ shl R_TMP4 = R_SGNMASK, 45-17
+}
-acos_Dd = f84
+{.mlx
+ // load bias-4
+ mov R_TMP = 0xffff-4
+ // load RU(sqrt(2)/2) to integer register (in double format, shifted left by 1)
+ movl R_TMP2 = 0x7fcd413cccfe779a;;
+}
-acos_pi_by_2_hi = f85
-acos_pi_by_2_lo = f86
-acos_xmpi_by_2_lo = f87
-acos_xPmw = f88
-acos_Uu = f89
-acos_AmVV = f90
-acos_AmVVpa = f91
+{.mfi
+ // load 2^{-64} in FP register
+ setf.exp F_2M64 = R_TMP03
+ nop.f 0
+ // index = (0x7-exponent)|b1 b2.. b6
+ extr.u R_INDEX = R_DBL_S, 46, 9
+}
-acos_2V = f92
-acos_1d2V = f93
-acos_Vv = f94
+{.mfi
+ // get t = sign|exponent|b1 b2.. b6 1 x.. x
+ or R_T = R_DBL_S, R_TMP4
+ nop.f 0
+ // R_TMP4 = 2^45-1
+ sub R_TMP4 = R_TMP4, r0, 1;;
+}
-acos_Vu = f95
-acos_Uv = f96
-
-acos_2_Z_hi = f97
-acos_s_lo_Z_lo = f98
-acos_result_lo = f99
-
-acos_Z_hi = f8
-acos_Z_lo = f10
-acos_s_lo = f11
-
-acos_GR_17_ones = r33
-acos_GR_16_ones = r34
-acos_GR_signexp_f8 = r35
-acos_GR_exp = r36
-acos_GR_true_exp = r37
-acos_GR_fffe = r38
-
-GR_SAVE_PFS = r43
-GR_SAVE_B0 = r39
-GR_SAVE_GP = r41
-
-// r40 is address of table of coefficients
-// r42
-
-GR_Parameter_X = r44
-GR_Parameter_Y = r45
-GR_Parameter_RESULT = r46
-GR_Parameter_TAG = r47
-
-
-// 2^-40:
-// A true exponent of -40 is
-// : -40 + register_bias
-// : -28 + ffff = ffd7
-// A true exponent of 1 is
-// : 1 + register_bias
-// : 1 + ffff = 10000
+{.mfi
+ // get t = sign|exponent|b1 b2.. b6 1 0.. 0
+ andcm R_T = R_T, R_TMP4
+ nop.f 0
+ // eliminate sign from R_DBL_S (shift left by 1)
+ shl R_TMP3 = R_DBL_S, 1
+}
-// Data tables
-//==============================================================
+{.mfi
+ // R_BIAS = 3*2^6
+ mov R_BIAS = 0xc0
+ nop.f 0
+ // eliminate sign from R_EXP
+ andcm R_EXP0 = R_EXP, R_SGNMASK;;
+}
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
-.align 16
-acos_coefficients:
-ASM_TYPE_DIRECTIVE(acos_coefficients,@object)
-data8 0xc90fdaa22168c234, 0x00003FFF // pi_by_2_hi
-data8 0xc4c6628b80dc1cd1, 0x00003FBF // pi_by_2_lo
-data8 0xc90fdaa22168c234, 0x00004000 // pi_hi
-data8 0xc4c6628b80dc1cd1, 0x00003FC0 // pi_lo
-
-data8 0xBB08911F2013961E, 0x00003FF8 // A10
-data8 0x981F1095A23A87D3, 0x00003FF8 // A9
-data8 0xBDF09C6C4177BCC6, 0x00003FF8 // A8
-data8 0xE4C3A60B049ACCEA, 0x00003FF8 // A7
-data8 0x8E2789F4E8A8F1AD, 0x00003FF9 // A6
-data8 0xB745D09B2B0E850B, 0x00003FF9 // A5
-data8 0xF8E38E3BC4C50920, 0x00003FF9 // A4
-data8 0xB6DB6DB6D89FCD81, 0x00003FFA // A3
-data8 0x99999999999AF376, 0x00003FFB // A2
-data8 0xAAAAAAAAAAAAAA71, 0x00003FFC // A1
-ASM_SIZE_DIRECTIVE(acos_coefficients)
-
-
-.align 32
-.global acosl#
-ASM_TYPE_DIRECTIVE(acosl#,@function)
+{.mfi
+ // load start address for T_table
+ ld8 r2 = [r2]
+ nop.f 0
+ // p8 = 1 if |s|> = sqrt(2)/2
+ cmp.geu p8, p0 = R_TMP3, R_TMP2
+}
-.section .text
-.proc acosl#
-.align 32
+{.mlx
+ // p7 = 1 if |s|<2^{-4} (exponent of s<bias-4)
+ cmp.lt p7, p0 = R_EXP0, R_TMP
+ // sqrt coefficient cs8 = -33*13/128
+ movl R_TMP2 = 0xc0568000;;
+}
-acosl:
-// After normalizing f8, get its true exponent
-{ .mfi
- alloc r32 = ar.pfs,1,11,4,0
-(p0) fnorm.s1 acos_NORM_f8 = f8
-(p0) mov acos_GR_17_ones = 0x1ffff
+{.mbb
+ // load t in FP register
+ setf.d F_T = R_T
+ // if |s|<2^{-4}, take alternate path
+ (p7) br.cond.spnt SMALL_S
+ // if |s|> = sqrt(2)/2, take alternate path
+ (p8) br.cond.sptk LARGE_S
}
-{ .mmi
-(p0) mov acos_GR_16_ones = 0xffff
-(p0) addl r40 = @ltoff(acos_coefficients), gp
- nop.i 999
+{.mlx
+ // index = (4-exponent)|b1 b2.. b6
+ sub R_INDEX = R_INDEX, R_BIAS
+ // sqrt coefficient cs9 = 55*13/128
+ movl R_TMP = 0x40b2c000;;
}
-;;
-// Set denormal flag on denormal input with fcmp
-{ .mfi
- ld8 r40 = [r40]
- fcmp.eq p6,p0 = f8,f0
- nop.i 999
+
+{.mfi
+ // sqrt coefficient cs8 = -33*13/128
+ setf.s F_CS8 = R_TMP2
+ nop.f 0
+ // shift R_INDEX by 5
+ shl R_INDEX = R_INDEX, 5
}
-;;
+{.mfi
+ // sqrt coefficient cs3 = 0.5 (set exponent = bias-1)
+ mov R_TMP4 = 0xffff - 1
+ nop.f 0
+ // sqrt coefficient cs6 = -21/16
+ mov R_TMP6 = 0xbfa8;;
+}
-// Load the constants pi_by_2 and pi.
-// Each is stored as hi and lo values
-// Also load the coefficients for ACOS_POLY
-{ .mmi
-(p0) ldfe acos_pi_by_2_hi = [r40],16 ;;
-(p0) ldfe acos_pi_by_2_lo = [r40],16
- nop.i 999 ;;
+{.mlx
+ // table index
+ add r2 = r2, R_INDEX
+ // sqrt coefficient cs7 = 33/16
+ movl R_TMP2 = 0x40040000;;
}
-{ .mmi
-(p0) ldfe acos_pi_hi = [r40],16 ;;
-(p0) ldfe acos_pi_lo = [r40],16
- nop.i 999 ;;
+
+{.mmi
+ // load cs9 = 55*13/128
+ setf.s F_CS9 = R_TMP
+ // sqrt coefficient cs5 = 7/8
+ mov R_TMP3 = 0x3f60
+ // sqrt coefficient cs6 = -21/16
+ shl R_TMP6 = R_TMP6, 16;;
}
-{ .mmi
-(p0) ldfe acos_A10 = [r40],16 ;;
-(p0) ldfe acos_A9 = [r40],16
- nop.i 999 ;;
+
+{.mmi
+ // load significand of 1/(1-t^2)
+ ldf8 F_INV_1T2 = [r2], 8
+ // sqrt coefficient cs7 = 33/16
+ setf.s F_CS7 = R_TMP2
+ // sqrt coefficient cs4 = -5/8
+ mov R_TMP5 = 0xbf20;;
}
-// Take the absolute value of f8
-{ .mmf
- nop.m 999
-(p0) getf.exp acos_GR_signexp_f8 = acos_NORM_f8
-(p0) fmerge.s acos_ABS_NORM_f8 = f0, acos_NORM_f8
+
+{.mmi
+ // load significand of sqrt(1-t^2)
+ ldf8 F_SQRT_1T2 = [r2], 8
+ // sqrt coefficient cs6 = -21/16
+ setf.s F_CS6 = R_TMP6
+ // sqrt coefficient cs5 = 7/8
+ shl R_TMP3 = R_TMP3, 16;;
}
-{ .mii
-(p0) ldfe acos_A8 = [r40],16
- nop.i 999 ;;
-(p0) and acos_GR_exp = acos_GR_signexp_f8, acos_GR_17_ones ;;
+
+{.mmi
+ // sqrt coefficient cs3 = 0.5 (set exponent = bias-1)
+ setf.exp F_CS3 = R_TMP4
+ // r3 = pointer to polynomial coefficients
+ addl r3 = @ltoff(poly_coeffs), gp
+ // sqrt coefficient cs4 = -5/8
+ shl R_TMP5 = R_TMP5, 16;;
}
-// case 1: |x| < 2^-25 ==> p6 ACOS_TINY
-// case 2: 2^-25 <= |x| < 2^-2 ==> p8 ACOS_POLY
-// case 3: 2^-2 <= |x| < 1 ==> p9 ACOS_ATAN
-// case 4: 1 <= |x| ==> p11 ACOS_ERROR_RETURN
-// Admittedly |x| = 1 is not an error but this is where that case is
-// handled.
-{ .mii
-(p0) ldfe acos_A7 = [r40],16
-(p0) sub acos_GR_true_exp = acos_GR_exp, acos_GR_16_ones ;;
-(p0) cmp.ge.unc p6, p7 = -26, acos_GR_true_exp ;;
+{.mfi
+ // sqrt coefficient cs5 = 7/8
+ setf.s F_CS5 = R_TMP3
+ // d = s-t
+ fms.s1 F_D = f8, f1, F_T
+ // set p6 = 1 if s<0, p11 = 1 if s> = 0
+ cmp.ge p6, p11 = R_EXP, R_DBL_S
}
-{ .mii
-(p0) ldfe acos_A6 = [r40],16
-(p7) cmp.ge.unc p8, p9 = -3, acos_GR_true_exp ;;
-(p9) cmp.ge.unc p10, p11 = -1, acos_GR_true_exp
+{.mfi
+ // r3 = load start address to polynomial coefficients
+ ld8 r3 = [r3]
+ // s+t
+ fma.s1 F_S2T2 = f8, f1, F_T
+ nop.i 0;;
}
-{ .mmi
-(p0) ldfe acos_A5 = [r40],16 ;;
-(p0) ldfe acos_A4 = [r40],16
- nop.i 999 ;;
+
+{.mfi
+ // sqrt coefficient cs4 = -5/8
+ setf.s F_CS4 = R_TMP5
+ // s^2-t^2
+ fma.s1 F_S2T2 = F_S2T2, F_D, f0
+ nop.i 0;;
}
-{ .mmi
-(p0) ldfe acos_A3 = [r40],16 ;;
-(p0) ldfe acos_A2 = [r40],16
- nop.i 999 ;;
+
+{.mfi
+ // load C3
+ ldfe F_C3 = [r3], 16
+ // 0.5/(1-t^2) = 2^{-64}*(2^63/(1-t^2))
+ fma.s1 F_INV_1T2 = F_INV_1T2, F_2M64, f0
+ nop.i 0;;
}
-// ACOS_ERROR_RETURN ==> p11 is true
-// case 4: |x| >= 1
-{ .mib
-(p0) ldfe acos_A1 = [r40],16
- nop.i 999
-(p11) br.spnt L(ACOS_ERROR_RETURN) ;;
+{.mfi
+ // load C_5
+ ldfe F_C5 = [r3], 16
+ // set correct exponent for sqrt(1-t^2)
+ fma.s1 F_SQRT_1T2 = F_SQRT_1T2, F_2M64, f0
+ nop.i 0;;
}
-// ACOS_TINY ==> p6 is true
-// case 1: |x| < 2^-25
-{ .mfi
- nop.m 999
-(p6) fms.s1 acos_xmpi_by_2_lo = acos_NORM_f8,f1, acos_pi_by_2_lo
- nop.i 999 ;;
+
+{.mfi
+ // load C_7, C_9
+ ldfpd F_C7, F_C9 = [r3], 16
+ // x = -(s^2-t^2)/(1-t^2)/2
+ fnma.s1 F_X = F_INV_1T2, F_S2T2, f0
+ nop.i 0;;
}
-{ .mfb
- nop.m 999
-(p6) fms.s0 f8 = acos_pi_by_2_hi,f1, acos_xmpi_by_2_lo
-(p6) br.ret.spnt b0 ;;
+
+{.mmf
+ // load asin(t)_high, asin(t)_low
+ ldfpd F_ATHI, F_ATLO = [r2]
+ // load pi/2
+ ldfpd F_PI2_LO, F_PI2_HI = [r3]
+ // t*sqrt(1-t^2)
+ fma.s1 F_TSQRT = F_T, F_SQRT_1T2, f0;;
}
+{.mfi
+ nop.m 0
+ // cs9*x+cs8
+ fma.s1 F_S89 = F_CS9, F_X, F_CS8
+ nop.i 0
+}
-// ACOS_POLY ==> p8 is true
-// case 2: 2^-25 <= |x| < 2^-2
-{ .mfi
- nop.m 999
-(p8) fms.s1 acos_W = acos_pi_by_2_hi, f1, acos_NORM_f8
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // cs7*x+cs6
+ fma.s1 F_S67 = F_CS7, F_X, F_CS6
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p8) fma.s1 acos_X2 = f8,f8, f0
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // cs5*x+cs4
+ fma.s1 F_S45 = F_CS5, F_X, F_CS4
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p8) fms.s1 acos_Ww = acos_pi_by_2_hi, f1, acos_W
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // x*x
+ fma.s1 F_X2 = F_X, F_X, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p8) fma.s1 acos_X4 = acos_X2,acos_X2, f0
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // (s-t)-t*x
+ fnma.s1 F_DTX = F_T, F_X, F_D
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p8) fms.s1 acos_Ww = acos_Ww, f1, acos_NORM_f8
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // cs3*x+cs2 (cs2 = -0.5 = -cs3)
+ fms.s1 F_S23 = F_CS3, F_X, F_CS3
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p8) fma.s1 acos_P810 = acos_X4, acos_A10, acos_A8
- nop.i 999
+{.mfi
+ nop.m 0
+ // if sign is negative, negate table values: asin(t)_low
+ (p6) fnma.s1 F_ATLO = F_ATLO, f1, f0
+ nop.i 0
}
-// acos_P79 = X4*A9 + A7
-// acos_P810 = X4*A10 + A8
-{ .mfi
- nop.m 999
-(p8) fma.s1 acos_P79 = acos_X4, acos_A9, acos_A7
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // if sign is negative, negate table values: asin(t)_high
+ (p6) fnma.s1 F_ATHI = F_ATHI, f1, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p8) fma.s1 acos_Ww = acos_Ww, f1, acos_pi_by_2_lo
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // cs9*x^3+cs8*x^2+cs7*x+cs6
+ fma.s1 F_S69 = F_S89, F_X2, F_S67
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p8) fma.s1 acos_P610 = acos_X4, acos_P810, acos_A6
- nop.i 999
+{.mfi
+ nop.m 0
+ // x^4
+ fma.s1 F_X4 = F_X2, F_X2, f0
+ nop.i 0;;
}
-// acos_P59 = X4*(X4*A9 + A7) + A5
-// acos_P610 = X4*(X4*A10 + A8) + A6
-{ .mfi
- nop.m 999
-(p8) fma.s1 acos_P59 = acos_X4, acos_P79, acos_A5
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // t*sqrt(1-t^2)*x^2
+ fma.s1 F_TSQRT = F_TSQRT, F_X2, f0
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p8) fma.s1 acos_P410 = acos_X4, acos_P610, acos_A4
- nop.i 999
+{.mfi
+ nop.m 0
+ // cs5*x^3+cs4*x^2+cs3*x+cs2
+ fma.s1 F_S25 = F_S45, F_X2, F_S23
+ nop.i 0;;
}
-// acos_P39 = X4*(X4*(X4*A9 + A7) + A5) + A3
-// acos_P410 = X4*(X4*(X4*A10 + A8) + A6) + A4
-{ .mfi
- nop.m 999
-(p8) fma.s1 acos_P39 = acos_X4, acos_P59, acos_A3
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // ((s-t)-t*x)*sqrt(1-t^2)
+ fma.s1 F_DTX = F_DTX, F_SQRT_1T2, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p8) fma.s1 acos_P210 = acos_X4, acos_P410, acos_A2
- nop.i 999
+{.mfi
+ nop.m 0
+ // (pi/2)_high - asin(t)_high
+ fnma.s1 F_ATHI = F_ATHI, f1, F_PI2_HI
+ nop.i 0
}
-// acos_P19 = X4*(X4*(X4*(X4*A9 + A7) + A5) + A3) + A1 = P1
-// acos_P210 = X4*(X4*(X4*(X4*A10 + A8) + A6) + A4) + A2 = P2
-{ .mfi
- nop.m 999
-(p8) fma.s1 acos_P19 = acos_X4, acos_P39, acos_A1
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // asin(t)_low - (pi/2)_low
+ fnma.s1 F_ATLO = F_PI2_LO, f1, F_ATLO
+ nop.i 0;;
}
-// acos_P1P2 = Xsq*P2 + P1
-// acos_P1P2 = Xsq*(Xsq*P2 + P1)
-{ .mfi
- nop.m 999
-(p8) fma.s1 acos_P1P2 = acos_X2, acos_P210, acos_P19
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // PS29 = cs9*x^7+..+cs5*x^3+cs4*x^2+cs3*x+cs2
+ fma.s1 F_S29 = F_S69, F_X4, F_S25
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p8) fma.s1 acos_P1P2 = acos_X2, acos_P1P2, f0
- nop.i 999 ;;
+
+
+{.mfi
+ nop.m 0
+ // R = ((s-t)-t*x)*sqrt(1-t^2)-t*sqrt(1-t^2)*x^2*PS29
+ fnma.s1 F_R = F_S29, F_TSQRT, F_DTX
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p8) fms.s1 acos_xPmw = acos_NORM_f8, acos_P1P2, acos_Ww
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // R^2
+ fma.s1 F_R2 = F_R, F_R, f0
+ nop.i 0;;
}
-{ .mfb
- nop.m 999
-(p8) fms.s0 f8 = acos_W, f1, acos_xPmw
-(p8) br.ret.spnt b0 ;;
+
+{.mfi
+ nop.m 0
+ // c7+c9*R^2
+ fma.s1 F_P79 = F_C9, F_R2, F_C7
+ nop.i 0
}
+{.mfi
+ nop.m 0
+ // c3+c5*R^2
+ fma.s1 F_P35 = F_C5, F_R2, F_C3
+ nop.i 0;;
+}
-// ACOS_ATAN
-// case 3: 2^-2 <= |x| < 1
-// case 3: 2^-2 <= |x| < 1 ==> p9 ACOS_ATAN
+{.mfi
+ nop.m 0
+ // R^4
+ fma.s1 F_R4 = F_R2, F_R2, f0
+ nop.i 0;;
+}
-// Step 1.1: Get A,B and a,b
-// A + a = 1- |X|
-// B + b = 1+ |X|
-// Note also that we will use acos_corr (f13)
-// and acos_W
+{.mfi
+ nop.m 0
+ // R^3
+ fma.s1 F_R3 = F_R2, F_R, f0
+ nop.i 0;;
+}
-// Step 2
-// Call __libm_atan2_reg
-{ .mfi
-(p0) mov acos_GR_fffe = 0xfffe
-(p0) fma.s1 acos_B = f1,f1, acos_ABS_NORM_f8
-(p0) mov GR_SAVE_B0 = b0 ;;
+{.mfi
+ nop.m 0
+ // c3+c5*R^2+c7*R^4+c9*R^6
+ fma.s1 F_P39 = F_P79, F_R4, F_P35
+ nop.i 0;;
}
-{ .mmf
-(p0) mov GR_SAVE_GP = gp
- nop.m 999
-(p0) fms.s1 acos_A = f1,f1, acos_ABS_NORM_f8
+
+{.mfi
+ nop.m 0
+ // asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6)
+ fma.s1 F_P39 = F_P39, F_R3, F_ATLO
+ nop.i 0;;
}
-{ .mfi
-(p0) setf.exp acos_HALF = acos_GR_fffe
- nop.f 999
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // R+asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6)
+ fma.s1 F_P39 = F_P39, f1, F_R
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p0) fms.s1 acos_1mB = f1,f1, acos_B
- nop.i 999 ;;
+
+{.mfb
+ nop.m 0
+ // result = (pi/2)-asin(t)_high+R+asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6)
+ fnma.s0 f8 = F_P39, f1, F_ATHI
+ // return
+ br.ret.sptk b0;;
}
-// We want atan2(V,U)
-// so put V in f8 and U in f9
-// but save X in acos_X
-{ .mfi
- nop.m 999
-(p0) fmerge.se acos_X = f8, f8
- nop.i 999 ;;
+
+
+LARGE_S:
+
+{.mfi
+ // bias-1
+ mov R_TMP3 = 0xffff - 1
+ // y ~ 1/sqrt(1-s^2)
+ frsqrta.s1 F_Y, p7 = F_1S2
+ // c9 = 55*13*17/128
+ mov R_TMP4 = 0x10af7b
}
-// Step 1.2:
-/////////////////////////
-// Get U = sqrt(B)
-/////////////////////////
+{.mlx
+ // c8 = -33*13*15/128
+ mov R_TMP5 = 0x184923
+ movl R_TMP2 = 0xff00000000000000;;
+}
-{ .mfi
- nop.m 999
-(p0) frsqrta.s1 acos_y0,p8 = acos_B
- nop.i 999
+{.mfi
+ // set p6 = 1 if s<0, p11 = 1 if s>0
+ cmp.ge p6, p11 = R_EXP, R_DBL_S
+ // 1-s^2
+ fnma.s1 F_1S2 = f8, f8, f1
+ // set p9 = 1
+ cmp.eq p9, p0 = r0, r0;;
}
-{ .mfi
- nop.m 999
-(p0) fms.s1 acos_1mA = f1,f1, acos_A
- nop.i 999 ;;
+
+{.mfi
+ // load 0.5
+ setf.exp F_05 = R_TMP3
+ // (1-s^2) rounded to single precision
+ fnma.s.s1 F_1S2_S = f8, f8, f1
+ // c9 = 55*13*17/128
+ shl R_TMP4 = R_TMP4, 10
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_Bb = acos_1mB,f1, acos_ABS_NORM_f8
- nop.i 999 ;;
+{.mlx
+ // AND mask for getting t ~ sqrt(1-s^2)
+ setf.sig F_ANDMASK = R_TMP2
+ // OR mask
+ movl R_TMP2 = 0x0100000000000000;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_Hh = acos_HALF, acos_B, f0
- nop.i 999 ;;
+.pred.rel "mutex", p6, p11
+{.mfi
+ nop.m 0
+ // 1-|s|
+ (p6) fma.s1 F_1AS = f8, f1, f1
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_t1 = acos_y0, acos_y0, f0
- nop.i 999
+{.mfi
+ nop.m 0
+ // 1-|s|
+ (p11) fnma.s1 F_1AS = f8, f1, f1
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p0) fms.s1 acos_Aa = acos_1mA,f1, acos_ABS_NORM_f8
- nop.i 999 ;;
+
+{.mfi
+ // c9 = 55*13*17/128
+ setf.s F_CS9 = R_TMP4
+ // |s|
+ (p6) fnma.s1 F_AS = f8, f1, f0
+ // c8 = -33*13*15/128
+ shl R_TMP5 = R_TMP5, 11
}
-{ .mfi
- nop.m 999
-(p0) fnma.s1 acos_t2 = acos_t1, acos_Hh, acos_HALF
- nop.i 999 ;;
+{.mfi
+ // c7 = 33*13/16
+ mov R_TMP4 = 0x41d68
+ // |s|
+ (p11) fma.s1 F_AS = f8, f1, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_y1 = acos_t2, acos_y0, acos_y0
- nop.i 999
+
+{.mfi
+ setf.sig F_ORMASK = R_TMP2
+ // y^2
+ fma.s1 F_Y2 = F_Y, F_Y, f0
+ // c7 = 33*13/16
+ shl R_TMP4 = R_TMP4, 12
}
+{.mfi
+ // c6 = -33*7/16
+ mov R_TMP6 = 0xc1670
+ // y' ~ sqrt(1-s^2)
+ fma.s1 F_T1 = F_Y, F_1S2, f0
+ // c5 = 63/8
+ mov R_TMP7 = 0x40fc;;
+}
-// Step 1.2:
-/////////////////////////
-// Get V = sqrt(A)
-/////////////////////////
-{ .mfi
- nop.m 999
-(p0) frsqrta.s1 acos_y0,p8 = acos_A
- nop.i 999 ;;
+
+{.mlx
+ // load c8 = -33*13*15/128
+ setf.s F_CS8 = R_TMP5
+ // c4 = -35/8
+ movl R_TMP5 = 0xc08c0000;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_t3 = acos_y1, acos_Hh, f0
- nop.i 999 ;;
+{.mfi
+ // r3 = pointer to polynomial coefficients
+ addl r3 = @ltoff(poly_coeffs), gp
+ // 1-s-(1-s^2)_s
+ fnma.s1 F_DS = F_1S2_S, f1, F_1AS
+ // p9 = 0 if p7 = 1 (p9 = 1 for special cases only)
+ (p7) cmp.ne p9, p0 = r0, r0
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_t1 = acos_y0, acos_y0, f0
- nop.i 999 ;;
+{.mlx
+ // load c7 = 33*13/16
+ setf.s F_CS7 = R_TMP4
+ // c3 = 5/2
+ movl R_TMP4 = 0x40200000;;
}
-{ .mfi
- nop.m 999
-(p0) fnma.s1 acos_t4 = acos_t3, acos_y1, acos_HALF
- nop.i 999 ;;
+
+{.mlx
+ // load c4 = -35/8
+ setf.s F_CS4 = R_TMP5
+ // c2 = -3/2
+ movl R_TMP5 = 0xbfc00000;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_y2 = acos_t4, acos_y1, acos_y1
- nop.i 999 ;;
+
+{.mfi
+ // load c3 = 5/2
+ setf.s F_CS3 = R_TMP4
+ // x = (1-s^2)_s*y^2-1
+ fms.s1 F_X = F_1S2_S, F_Y2, f1
+ // c6 = -33*7/16
+ shl R_TMP6 = R_TMP6, 12
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_S = acos_B, acos_y2, f0
- nop.i 999
+{.mfi
+ nop.m 0
+ // y^2/2
+ fma.s1 F_Y2_2 = F_Y2, F_05, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_H = acos_y2, acos_HALF, f0
- nop.i 999 ;;
+
+{.mfi
+ // load c6 = -33*7/16
+ setf.s F_CS6 = R_TMP6
+ // eliminate lower bits from y'
+ fand F_T = F_T1, F_ANDMASK
+ // c5 = 63/8
+ shl R_TMP7 = R_TMP7, 16
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_t5 = acos_Hh, acos_y2, f0
- nop.i 999
+
+{.mfb
+ // r3 = load start address to polynomial coefficients
+ ld8 r3 = [r3]
+ // 1-(1-s^2)_s-s^2
+ fma.s1 F_DS = F_AS, F_1AS, F_DS
+ // p9 = 1 if s is a special input (NaN, or |s|> = 1)
+ (p9) br.cond.spnt acosl_SPECIAL_CASES;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_Hh = acos_HALF, acos_A, f0
- nop.i 999 ;;
+{.mmf
+ // get exponent, significand of y' (in single prec.)
+ getf.s R_TMP = F_T1
+ // load c2 = -3/2
+ setf.s F_CS2 = R_TMP5
+ // y*(1-s^2)
+ fma.s1 F_Y1S2 = F_Y, F_1S2, f0;;
}
-{ .mfi
- nop.m 999
-(p0) fnma.s1 acos_Dd = acos_S, acos_S, acos_B
- nop.i 999 ;;
+
+
+{.mfi
+ nop.m 0
+ // if s<0, set s = -s
+ (p6) fnma.s1 f8 = f8, f1, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p0) fnma.s1 acos_t2 = acos_t1, acos_Hh, acos_HALF
- nop.i 999 ;;
+
+{.mfi
+ // load c5 = 63/8
+ setf.s F_CS5 = R_TMP7
+ // x = (1-s^2)_s*y^2-1+(1-(1-s^2)_s-s^2)*y^2
+ fma.s1 F_X = F_DS, F_Y2, F_X
+ // for t = 2^k*1.b1 b2.., get 7-k|b1.. b6
+ extr.u R_INDEX = R_TMP, 17, 9;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_U = acos_Dd, acos_H, acos_S
- nop.i 999 ;;
+
+{.mmi
+ // index = (4-exponent)|b1 b2.. b6
+ sub R_INDEX = R_INDEX, R_BIAS
+ nop.m 0
+ // get exponent of y'
+ shr.u R_TMP2 = R_TMP, 23;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_y1 = acos_t2, acos_y0, acos_y0
- nop.i 999 ;;
+{.mmi
+ // load C3
+ ldfe F_C3 = [r3], 16
+ // set p8 = 1 if y'<2^{-4}
+ cmp.gt p8, p0 = 0x7b, R_TMP2
+ // shift R_INDEX by 5
+ shl R_INDEX = R_INDEX, 5;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_2U = acos_U, f1, acos_U
- nop.i 999 ;;
+
+{.mfb
+ // get table index for sqrt(1-t^2)
+ add r2 = r2, R_INDEX
+ // get t = 2^k*1.b1 b2.. b6 1
+ for F_T = F_T, F_ORMASK
+ (p8) br.cond.spnt VERY_LARGE_INPUT;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_t3 = acos_y1, acos_Hh, f0
- nop.i 999
+
+
+{.mmf
+ // load C5
+ ldfe F_C5 = [r3], 16
+ // load 1/(1-t^2)
+ ldfp8 F_INV_1T2, F_SQRT_1T2 = [r2], 16
+ // x = ((1-s^2)*y^2-1)/2
+ fma.s1 F_X = F_X, F_05, f0;;
}
-// Step 1.3:
-// sqrt(A + a) = V + v
-// sqrt(B + b) = U + u
-/////////////////////////
-// Get u
-/////////////////////////
+{.mmf
+ nop.m 0
+ // C7, C9
+ ldfpd F_C7, F_C9 = [r3], 16
+ // set correct exponent for t
+ fmerge.se F_T = F_T1, F_T;;
+}
-// acos_BmUU = B - UU
-// acos_BmUUpb = (B - UU) + b
-{ .mfi
- nop.m 999
-(p0) fnma.s1 acos_BmUU = acos_U, acos_U, acos_B
- nop.i 999 ;;
+
+{.mfi
+ // get address for loading pi
+ add r3 = 48, r3
+ // c9*x+c8
+ fma.s1 F_S89 = F_X, F_CS9, F_CS8
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p0) fmerge.se f9 = acos_U, acos_U
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // x^2
+ fma.s1 F_X2 = F_X, F_X, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p0) fnma.s1 acos_t4 = acos_t3, acos_y1, acos_HALF
- nop.i 999 ;;
+
+{.mfi
+ // pi (low, high)
+ ldfpd F_PI2_LO, F_PI2_HI = [r3]
+ // y*(1-s^2)*x
+ fma.s1 F_Y1S2X = F_Y1S2, F_X, f0
+ nop.i 0
}
-// acos_1d2U = frcpa(2U)
-{ .mfi
- nop.m 999
-(p0) frcpa.s1 acos_1d2U,p9 = f1, acos_2U
- nop.i 999
+{.mfi
+ nop.m 0
+ // c7*x+c6
+ fma.s1 F_S67 = F_X, F_CS7, F_CS6
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_BmUUpb = acos_BmUU, f1, acos_Bb
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // 1-x
+ fnma.s1 F_1X = F_X, f1, f1
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_y2 = acos_t4, acos_y1, acos_y1
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // c3*x+c2
+ fma.s1 F_S23 = F_X, F_CS3, F_CS2
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-// acos_Uu = ((B - UU) + b) * frcpa(2U)
-(p0) fma.s1 acos_Uu = acos_BmUUpb, acos_1d2U, f0
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // 1-t^2
+ fnma.s1 F_1T2 = F_T, F_T, f1
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_S = acos_A, acos_y2, f0
- nop.i 999
+{.mfi
+ // load asin(t)_high, asin(t)_low
+ ldfpd F_ATHI, F_ATLO = [r2]
+ // c5*x+c4
+ fma.s1 F_S45 = F_X, F_CS5, F_CS4
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_H = acos_y2, acos_HALF, f0
- nop.i 999 ;;
+
+
+{.mfi
+ nop.m 0
+ // t*s
+ fma.s1 F_TS = F_T, f8, f0
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_t5 = acos_Hh, acos_y2, f0
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // 0.5/(1-t^2)
+ fma.s1 F_INV_1T2 = F_INV_1T2, F_2M64, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p0) fnma.s1 acos_Dd = acos_S, acos_S, acos_A
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // z~sqrt(1-t^2), rounded to 24 significant bits
+ fma.s.s1 F_Z = F_SQRT_1T2, F_2M64, f0
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_V = acos_Dd, acos_H, acos_S
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // sqrt(1-t^2)
+ fma.s1 F_SQRT_1T2 = F_SQRT_1T2, F_2M64, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_2V = acos_V, f1, acos_V
- nop.i 999
+
+{.mfi
+ nop.m 0
+ // y*(1-s^2)*x^2
+ fma.s1 F_Y1S2X2 = F_Y1S2, F_X2, f0
+ nop.i 0
}
-// Step 3
-/////////////////////////
-// Calculate the correction, acos_corr
-/////////////////////////
-// acos_corr = U*v - (V*u)
+{.mfi
+ nop.m 0
+ // x^4
+ fma.s1 F_X4 = F_X2, F_X2, f0
+ nop.i 0;;
+}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_Vu = acos_V,acos_Uu, f0
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // s*t rounded to 24 significant bits
+ fma.s.s1 F_TSS = F_T, f8, f0
+ nop.i 0
}
-/////////////////////////
-// Get v
-/////////////////////////
-// acos_AmVV = A - VV
-// acos_AmVVpa = (A - VV) + a
+{.mfi
+ nop.m 0
+ // c9*x^3+..+c6
+ fma.s1 F_S69 = F_X2, F_S89, F_S67
+ nop.i 0;;
+}
-{ .mfi
- nop.m 999
-(p0) fnma.s1 acos_AmVV = acos_V, acos_V, acos_A
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // ST = (t^2-1+s^2) rounded to 24 significant bits
+ fms.s.s1 F_ST = f8, f8, F_1T2
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p0) fmerge.se f8 = acos_V, acos_V
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // c5*x^3+..+c2
+ fma.s1 F_S25 = F_X2, F_S45, F_S23
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_AmVVpa = acos_AmVV, f1, acos_Aa
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // 0.25/(1-t^2)
+ fma.s1 F_INV1T2_2 = F_05, F_INV_1T2, f0
+ nop.i 0
}
-// acos_1d2V = frcpa(2V)
-{ .mfi
- nop.m 999
-(p0) frcpa.s1 acos_1d2V,p9 = f1, acos_2V
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // t*s-sqrt(1-t^2)*(1-s^2)*y
+ fnma.s1 F_TS = F_Y1S2, F_SQRT_1T2, F_TS
+ nop.i 0;;
}
-// acos_Vv = ((A - VV) + a) * frcpa(2V)
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_Vv = acos_AmVVpa, acos_1d2V, f0
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // z*0.5/(1-t^2)
+ fma.s1 F_ZE = F_INV_1T2, F_SQRT_1T2, f0
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_Uv = acos_U,acos_Vv, f0
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // z^2+t^2-1
+ fms.s1 F_DZ0 = F_Z, F_Z, F_1T2
+ nop.i 0;;
}
-.endp acosl#
-ASM_SIZE_DIRECTIVE(acosl#)
+{.mfi
+ nop.m 0
+ // (1-s^2-(1-s^2)_s)*x
+ fma.s1 F_DS2X = F_X, F_DS, f0
+ nop.i 0;;
+}
-.proc __libm_callout
-__libm_callout:
-.prologue
-{ .mfi
- nop.m 0
- nop.f 0
-.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs
+{.mfi
+ nop.m 0
+ // t*s-(t*s)_s
+ fms.s1 F_DTS = F_T, f8, F_TSS
+ nop.i 0
}
-;;
-{ .mfi
- mov GR_SAVE_GP=gp
- nop.f 0
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0
+{.mfi
+ nop.m 0
+ // c9*x^7+..+c2
+ fma.s1 F_S29 = F_X4, F_S69, F_S25
+ nop.i 0;;
}
-.body
-{ .mfb
- nop.m 999
-(p0) fms.s1 acos_corr = acos_Uv,f1, acos_Vu
-(p0) br.call.sptk.many b0=__libm_atan2_reg# ;;
+
+{.mfi
+ nop.m 0
+ // y*z
+ fma.s1 F_YZ = F_Z, F_Y, f0
+ nop.i 0
}
+{.mfi
+ nop.m 0
+ // t^2
+ fma.s1 F_T2 = F_T, F_T, f0
+ nop.i 0;;
+}
-// p6 ==> X is negative
-// p7 ==> x is positive
-// We know that |X| >= 1/4
-{ .mfi
-(p0) mov gp = GR_SAVE_GP
-(p0) fcmp.lt.unc p6,p7 = acos_X , f0
-(p0) mov b0 = GR_SAVE_B0 ;;
+{.mfi
+ nop.m 0
+ // 1-t^2+ST
+ fma.s1 F_1T2_ST = F_ST, f1, F_1T2
+ nop.i 0;;
}
-// acos_2_Z_hi = 2 * acos_Z_hi
-// acos_s_lo_Z_lo = s_lo * Z_lo
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_2_Z_hi = acos_Z_hi, f1, acos_Z_hi
-(p0) mov ar.pfs = GR_SAVE_PFS
+{.mfi
+ nop.m 0
+ // y*(1-s^2)(1-x)
+ fma.s1 F_Y1S2_1X = F_Y1S2, F_1X, f0
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_s_lo_Z_lo = acos_s_lo, acos_Z_lo, f0
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // dz ~ sqrt(1-t^2)-z
+ fma.s1 F_DZ = F_DZ0, F_ZE, f0
+ nop.i 0;;
}
-// 2 is a constant needed later
-{ .mfi
- nop.m 999
-(p0) fma.s1 acos_2 = f1,f1,f1
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // -1+correction for sqrt(1-t^2)-z
+ fnma.s1 F_CORR = F_INV1T2_2, F_DZ0, f0
+ nop.i 0;;
}
-// X >= 1/4
-// acos_result_lo = 2(s_lo * Z_lo) - corr
-// f8 = (2*Z_hi) + (2(s_lo * Z_lo) - corr)
-{ .mfi
- nop.m 999
-(p7) fma.s1 acos_result_lo = acos_s_lo_Z_lo, acos_2, acos_corr
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // (PS29*x^2+x)*y*(1-s^2)
+ fma.s1 F_S19 = F_Y1S2X2, F_S29, F_Y1S2X
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p7) fma.s0 f8 = acos_2_Z_hi, f1, acos_result_lo
- nop.i 999
+{.mfi
+ nop.m 0
+ // z*y*(1-s^2)_s
+ fma.s1 F_ZY1S2S = F_YZ, F_1S2_S, f0
+ nop.i 0
}
-// acos_result_lo = (pi_lo - corr)
-// acos_result_lo = (pi_lo - corr) + acos_Ww
-{ .mfi
- nop.m 999
-(p6) fms.s1 acos_result_lo = acos_pi_lo, f1, acos_corr
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // s^2-(1-t^2+ST)
+ fms.s1 F_1T2_ST = f8, f8, F_1T2_ST
+ nop.i 0;;
}
-// X <= -1/4
-// acos_W = pi_hi - 2 * Z_hi
-{ .mfi
- nop.m 999
-(p6) fnma.s1 acos_W = acos_2, acos_Z_hi, acos_pi_hi
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // (t*s-(t*s)_s)+z*y*(1-s^2-(1-s^2)_s)*x
+ fma.s1 F_DTS = F_YZ, F_DS2X, F_DTS
+ nop.i 0
}
-// acos_Ww = pi_hi - W
-// acos_Ww = (pi_hi - W) + (2 * Z_hi)
-{ .mfi
- nop.m 999
-(p6) fms.s1 acos_Ww = acos_pi_hi, f1, acos_W
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // dz*y*(1-s^2)*(1-x)
+ fma.s1 F_DZ_TERM = F_DZ, F_Y1S2_1X, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p6) fms.s1 acos_Ww = acos_Ww, f1, acos_2_Z_hi
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // R = t*s-sqrt(1-t^2)*(1-s^2)*y+sqrt(1-t^2)*(1-s^2)*y*PS19
+ // (used for polynomial evaluation)
+ fma.s1 F_R = F_S19, F_SQRT_1T2, F_TS
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p6) fma.s1 acos_result_lo = acos_result_lo, f1, acos_Ww
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // (PS29*x^2)*y*(1-s^2)
+ fma.s1 F_S29 = F_Y1S2X2, F_S29, f0
+ nop.i 0
}
-// acos_Z_lo = ((pi_lo - corr) + acos_Ww) - 2 * (s_lo * Z_lo)
-{ .mfi
- nop.m 999
-(p6) fnma.s1 acos_Z_lo = acos_s_lo_Z_lo, acos_2, acos_result_lo
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // apply correction to dz*y*(1-s^2)*(1-x)
+ fma.s1 F_DZ_TERM = F_DZ_TERM, F_CORR, F_DZ_TERM
+ nop.i 0;;
}
-{ .mfb
- nop.m 999
-(p6) fma.s0 f8 = acos_W, f1, acos_Z_lo
-(p0) br.ret.sptk b0 ;;
+
+{.mfi
+ nop.m 0
+ // R^2
+ fma.s1 F_R2 = F_R, F_R, f0
+ nop.i 0;;
}
-.endp __libm_callout
-ASM_SIZE_DIRECTIVE(__libm_callout)
-.proc SPECIAL
-SPECIAL:
-L(ACOS_NAN):
-{ .mfb
- nop.m 999
-(p0) fma.s0 f8 = f8,f1,f0
-(p0) br.ret.sptk b0 ;;
+
+{.mfi
+ nop.m 0
+ // (t*s-(t*s)_s)+z*y*(1-s^2-(1-s^2)_s)*x+dz*y*(1-s^2)*(1-x)
+ fma.s1 F_DZ_TERM = F_DZ_TERM, f1, F_DTS
+ nop.i 0;;
}
-L(ACOS_ERROR_RETURN):
-// Save ar.pfs, b0, and gp; restore on exit
-// qnan snan inf norm unorm 0 -+
-// 1 1 0 0 0 0 11 = 0xc3
+{.mfi
+ nop.m 0
+ // c7+c9*R^2
+ fma.s1 F_P79 = F_C9, F_R2, F_C7
+ nop.i 0
+}
-// Coming in as X = +- 1
-// What should we return?
+{.mfi
+ nop.m 0
+ // c3+c5*R^2
+ fma.s1 F_P35 = F_C5, F_R2, F_C3
+ nop.i 0;;
+}
-// If X is 1, return (sign of X)pi/2
+{.mfi
+ nop.m 0
+ // asin(t)_low-(pi)_low (if s<0)
+ (p6) fms.s1 F_ATLO = F_ATLO, f1, F_PI2_LO
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // R^4
+ fma.s1 F_R4 = F_R2, F_R2, f0
+ nop.i 0;;
+}
-{ .mfi
- nop.m 999
-(p0) fcmp.eq.unc p6,p7 = acos_ABS_NORM_f8,f1
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // R^3
+ fma.s1 F_R3 = F_R2, F_R, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p6) fcmp.lt.unc p8,p9 = f8,f0
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // (t*s)_s-t^2*y*z
+ fnma.s1 F_TSS = F_T2, F_YZ, F_TSS
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p8) fma.s0 f8 = acos_pi_hi, f1, acos_pi_lo
- nop.i 999
+{.mfi
+ nop.m 0
+ // d(ts)+z*y*d(1-s^2)*x+dz*y*(1-s^2)*(1-x)+z*y*(s^2-1+t^2-ST)
+ fma.s1 F_DZ_TERM = F_YZ, F_1T2_ST, F_DZ_TERM
+ nop.i 0;;
}
-{ .mfb
- nop.m 999
-(p9) fmerge.s f8 = f8,f0
-(p6) br.ret.spnt b0 ;;
+
+{.mfi
+ nop.m 0
+ // (pi)_hi-asin(t)_hi (if s<0)
+ (p6) fms.s1 F_ATHI = F_PI2_HI, f1, F_ATHI
+ nop.i 0
}
-// If X is a NAN, leave
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p12,p0 = f8, 0xc3
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // c3+c5*R^2+c7*R^4+c9*R^6
+ fma.s1 F_P39 = F_P79, F_R4, F_P35
+ nop.i 0;;
}
-{ .mfb
- nop.m 999
-(p12) fma.s0 f8 = f8,f1,f0
-(p12) br.ret.spnt b0 ;;
+
+{.mfi
+ nop.m 0
+ // d(ts)+z*y*d(1-s^2)*x+dz*y*(1-s^2)*(1-x)+z*y*(s^2-1+t^2-ST)+
+ // + sqrt(1-t^2)*y*(1-s^2)*x^2*PS29
+ fma.s1 F_DZ_TERM = F_SQRT_1T2, F_S29, F_DZ_TERM
+ nop.i 0;;
}
-{ .mfi
-(p0) mov GR_Parameter_TAG = 57
-(p0) frcpa f10, p6 = f0, f0
-nop.i 999
-};;
-.endp SPECIAL
-ASM_SIZE_DIRECTIVE(SPECIAL)
+{.mfi
+ nop.m 0
+ // (t*s)_s-t^2*y*z+z*y*ST
+ fma.s1 F_TSS = F_YZ, F_ST, F_TSS
+ nop.i 0
+}
-.proc __libm_error_region
-__libm_error_region:
+{.mfi
+ nop.m 0
+ // -asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6)
+ fms.s1 F_P39 = F_P39, F_R3, F_ATLO
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // d(ts)+z*y*d(1-s^2)*x+dz*y*(1-s^2)*(1-x)+z*y*(s^2-1+t^2-ST) +
+ // + sqrt(1-t^2)*y*(1-s^2)*x^2*PS29 +
+ // - asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6)
+ fma.s1 F_DZ_TERM = F_P39, f1, F_DZ_TERM
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // d(ts)+z*y*d(1-s^2)*x+dz*y*(1-s^2)*(1-x)+z*y*(s^2-1+t^2-ST) +
+ // + sqrt(1-t^2)*y*(1-s^2)*x^2*PS29 + z*y*(1-s^2)_s*x +
+ // - asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6)
+ fma.s1 F_DZ_TERM = F_ZY1S2S, F_X, F_DZ_TERM
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // d(ts)+z*y*d(1-s^2)*x+dz*y*(1-s^2)*(1-x)+z*y*(s^2-1+t^2-ST) +
+ // + sqrt(1-t^2)*y*(1-s^2)*x^2*PS29 + z*y*(1-s^2)_s*x +
+ // - asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6) +
+ // + (t*s)_s-t^2*y*z+z*y*ST
+ fma.s1 F_DZ_TERM = F_TSS, f1, F_DZ_TERM
+ nop.i 0;;
+}
+
+
+.pred.rel "mutex", p6, p11
+{.mfi
+ nop.m 0
+ // result: add high part of table value
+ // s>0 in this case
+ (p11) fnma.s0 f8 = F_DZ_TERM, f1, F_ATHI
+ nop.i 0
+}
+
+{.mfb
+ nop.m 0
+ // result: add high part of pi-table value
+ // if s<0
+ (p6) fma.s0 f8 = F_DZ_TERM, f1, F_ATHI
+ br.ret.sptk b0;;
+}
+
+
+
+
+
+
+SMALL_S:
+
+ // use 15-term polynomial approximation
+
+{.mmi
+ // r3 = pointer to polynomial coefficients
+ addl r3 = @ltoff(poly_coeffs), gp;;
+ // load start address for coefficients
+ ld8 r3 = [r3]
+ mov R_TMP = 0x3fbf;;
+}
+
+
+{.mmi
+ add r2 = 64, r3
+ ldfe F_C3 = [r3], 16
+ // p7 = 1 if |s|<2^{-64} (exponent of s<bias-64)
+ cmp.lt p7, p0 = R_EXP0, R_TMP;;
+}
+
+{.mmf
+ ldfe F_C5 = [r3], 16
+ ldfpd F_C11, F_C13 = [r2], 16
+ nop.f 0;;
+}
+
+{.mmf
+ ldfpd F_C7, F_C9 = [r3], 16
+ ldfpd F_C15, F_C17 = [r2]
+ nop.f 0;;
+}
+
+
+
+{.mfb
+ // load pi/2
+ ldfpd F_PI2_LO, F_PI2_HI = [r3]
+ // s^2
+ fma.s1 F_R2 = f8, f8, f0
+ // |s|<2^{-64}
+ (p7) br.cond.spnt RETURN_PI2;;
+}
+
+
+{.mfi
+ nop.m 0
+ // s^3
+ fma.s1 F_R3 = f8, F_R2, f0
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // s^4
+ fma.s1 F_R4 = F_R2, F_R2, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c3+c5*s^2
+ fma.s1 F_P35 = F_C5, F_R2, F_C3
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // c11+c13*s^2
+ fma.s1 F_P1113 = F_C13, F_R2, F_C11
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c7+c9*s^2
+ fma.s1 F_P79 = F_C9, F_R2, F_C7
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // c15+c17*s^2
+ fma.s1 F_P1517 = F_C17, F_R2, F_C15
+ nop.i 0;;
+}
+
+{.mfi
+ nop.m 0
+ // (pi/2)_high-s_high
+ fnma.s1 F_T = f8, f1, F_PI2_HI
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // s^8
+ fma.s1 F_R8 = F_R4, F_R4, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c3+c5*s^2+c7*s^4+c9*s^6
+ fma.s1 F_P39 = F_P79, F_R4, F_P35
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // c11+c13*s^2+c15*s^4+c17*s^6
+ fma.s1 F_P1117 = F_P1517, F_R4, F_P1113
+ nop.i 0;;
+}
+
+{.mfi
+ nop.m 0
+ // -s_high
+ fms.s1 F_S = F_T, f1, F_PI2_HI
+ nop.i 0;;
+}
+
+{.mfi
+ nop.m 0
+ // c3+..+c17*s^14
+ fma.s1 F_P317 = F_R8, F_P1117, F_P39
+ nop.i 0;;
+}
+
+{.mfi
+ nop.m 0
+ // s_low
+ fma.s1 F_DS = f8, f1, F_S
+ nop.i 0;;
+}
+
+{.mfi
+ nop.m 0
+ // (pi/2)_low-s^3*(c3+..+c17*s^14)
+ fnma.s0 F_P317 = F_P317, F_R3, F_PI2_LO
+ nop.i 0;;
+}
+
+{.mfi
+ nop.m 0
+ // (pi/2)_low-s_low-s^3*(c3+..+c17*s^14)
+ fms.s1 F_P317 = F_P317, f1, F_DS
+ nop.i 0;;
+}
+
+{.mfb
+ nop.m 0
+ // result: pi/2-s-c3*s^3-..-c17*s^17
+ fma.s0 f8 = F_T, f1, F_P317
+ br.ret.sptk b0;;
+}
+
+
+
+
+
+RETURN_PI2:
+
+{.mfi
+ nop.m 0
+ // (pi/2)_low-s
+ fms.s0 F_PI2_LO = F_PI2_LO, f1, f8
+ nop.i 0;;
+}
+
+{.mfb
+ nop.m 0
+ // (pi/2)-s
+ fma.s0 f8 = F_PI2_HI, f1, F_PI2_LO
+ br.ret.sptk b0;;
+}
+
+
+
+
+
+VERY_LARGE_INPUT:
+
+
+{.mmf
+ // pointer to pi_low, pi_high
+ add r2 = 80, r3
+ // load C5
+ ldfe F_C5 = [r3], 16
+ // x = ((1-(s^2)_s)*y^2-1)/2-(s^2-(s^2)_s)*y^2/2
+ fma.s1 F_X = F_X, F_05, f0;;
+}
+
+.pred.rel "mutex", p6, p11
+{.mmf
+ // load pi (low, high), if s<0
+ (p6) ldfpd F_PI2_LO, F_PI2_HI = [r2]
+ // C7, C9
+ ldfpd F_C7, F_C9 = [r3], 16
+ // if s>0, set F_PI2_HI=0
+ (p11) fma.s1 F_PI2_HI = f0, f0, f0;;
+}
+
+{.mfi
+ nop.m 0
+ (p11) fma.s1 F_PI2_LO = f0, f0, f0
+ nop.i 0;;
+}
+
+{.mfi
+ // adjust address for C_11
+ add r3 = 16, r3
+ // c9*x+c8
+ fma.s1 F_S89 = F_X, F_CS9, F_CS8
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // x^2
+ fma.s1 F_X2 = F_X, F_X, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // y*(1-s^2)*x
+ fma.s1 F_Y1S2X = F_Y1S2, F_X, f0
+ nop.i 0
+}
+
+{.mfi
+ // C11, C13
+ ldfpd F_C11, F_C13 = [r3], 16
+ // c7*x+c6
+ fma.s1 F_S67 = F_X, F_CS7, F_CS6
+ nop.i 0;;
+}
+
+
+{.mfi
+ // C15, C17
+ ldfpd F_C15, F_C17 = [r3], 16
+ // c3*x+c2
+ fma.s1 F_S23 = F_X, F_CS3, F_CS2
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c5*x+c4
+ fma.s1 F_S45 = F_X, F_CS5, F_CS4
+ nop.i 0;;
+}
+
+
+
+
+{.mfi
+ nop.m 0
+ // y*(1-s^2)*x^2
+ fma.s1 F_Y1S2X2 = F_Y1S2, F_X2, f0
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // x^4
+ fma.s1 F_X4 = F_X2, F_X2, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c9*x^3+..+c6
+ fma.s1 F_S69 = F_X2, F_S89, F_S67
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c5*x^3+..+c2
+ fma.s1 F_S25 = F_X2, F_S45, F_S23
+ nop.i 0;;
+}
+
+
+
+{.mfi
+ nop.m 0
+ // (pi)_high-y*(1-s^2)_s
+ fnma.s1 F_HI = F_Y, F_1S2_S, F_PI2_HI
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c9*x^7+..+c2
+ fma.s1 F_S29 = F_X4, F_S69, F_S25
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // -(y*(1-s^2)_s)_high
+ fms.s1 F_1S2_HI = F_HI, f1, F_PI2_HI
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // (PS29*x^2+x)*y*(1-s^2)
+ fma.s1 F_S19 = F_Y1S2X2, F_S29, F_Y1S2X
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // y*(1-s^2)_s-(y*(1-s^2))_high
+ fma.s1 F_DS2 = F_Y, F_1S2_S, F_1S2_HI
+ nop.i 0;;
+}
+
+
+
+{.mfi
+ nop.m 0
+ // R ~ sqrt(1-s^2)
+ // (used for polynomial evaluation)
+ fnma.s1 F_R = F_S19, f1, F_Y1S2
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // y*(1-s^2)-(y*(1-s^2))_high
+ fma.s1 F_DS2 = F_Y, F_DS, F_DS2
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // (pi)_low+(PS29*x^2)*y*(1-s^2)
+ fma.s1 F_S29 = F_Y1S2X2, F_S29, F_PI2_LO
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // R^2
+ fma.s1 F_R2 = F_R, F_R, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // if s<0
+ // (pi)_low+(PS29*x^2)*y*(1-s^2)-(y*(1-s^2)-(y*(1-s^2))_high)
+ fms.s1 F_S29 = F_S29, f1, F_DS2
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c7+c9*R^2
+ fma.s1 F_P79 = F_C9, F_R2, F_C7
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // c3+c5*R^2
+ fma.s1 F_P35 = F_C5, F_R2, F_C3
+ nop.i 0;;
+}
+
+
+
+{.mfi
+ nop.m 0
+ // R^4
+ fma.s1 F_R4 = F_R2, F_R2, f0
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // R^3
+ fma.s1 F_R3 = F_R2, F_R, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c11+c13*R^2
+ fma.s1 F_P1113 = F_C13, F_R2, F_C11
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // c15+c17*R^2
+ fma.s1 F_P1517 = F_C17, F_R2, F_C15
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // (pi)_low+(PS29*x^2)*y*(1-s^2)-(y*(1-s^2)-(y*(1-s^2))_high)+y*(1-s^2)*x
+ fma.s1 F_S29 = F_Y1S2, F_X, F_S29
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c11+c13*R^2+c15*R^4+c17*R^6
+ fma.s1 F_P1117 = F_P1517, F_R4, F_P1113
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // c3+c5*R^2+c7*R^4+c9*R^6
+ fma.s1 F_P39 = F_P79, F_R4, F_P35
+ nop.i 0;;
+}
+
+
+
+{.mfi
+ nop.m 0
+ // R^8
+ fma.s1 F_R8 = F_R4, F_R4, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c3+c5*R^2+c7*R^4+c9*R^6+..+c17*R^14
+ fma.s1 F_P317 = F_P1117, F_R8, F_P39
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // (pi)_low+(PS29*x^2)*y*(1-s^2)-(y*(1-s^2)-
+ // -(y*(1-s^2))_high)+y*(1-s^2)*x - P3, 17
+ fnma.s1 F_S29 = F_P317, F_R3, F_S29
+ nop.i 0;;
+}
+
+.pred.rel "mutex", p6, p11
+{.mfi
+ nop.m 0
+ // Result (if s<0):
+ // (pi)_low+(PS29*x^2)*y*(1-s^2)-(y*(1-s^2)-
+ // -(y*(1-s^2))_high)+y*(1-s^2)*x - P3, 17
+ // +(pi)_high-(y*(1-s^2))_high
+ (p6) fma.s0 f8 = F_S29, f1, F_HI
+ nop.i 0
+}
+
+{.mfb
+ nop.m 0
+ // Result (if s>0):
+ // -(PS29*x^2)*y*(1-s^2)-
+ // -y*(1-s^2)*x + P3, 17
+ // +(y*(1-s^2))
+ (p11) fms.s0 f8 = F_Y, F_1S2_S, F_S29
+ br.ret.sptk b0;;
+}
+
+
+
+
+
+
+acosl_SPECIAL_CASES:
+
+{.mfi
+ alloc r32 = ar.pfs, 1, 4, 4, 0
+ // check if the input is a NaN, or unsupported format
+ // (i.e. not infinity or normal/denormal)
+ fclass.nm p7, p8 = f8, 0x3f
+ // pointer to pi/2
+ add r3 = 96, r3;;
+}
+
+
+{.mfi
+ // load pi/2
+ ldfpd F_PI2_HI, F_PI2_LO = [r3]
+ // get |s|
+ fmerge.s F_S = f0, f8
+ nop.i 0
+}
+
+{.mfb
+ nop.m 0
+ // if NaN, quietize it, and return
+ (p7) fma.s0 f8 = f8, f1, f0
+ (p7) br.ret.spnt b0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // |s| = 1 ?
+ fcmp.eq.s0 p9, p10 = F_S, f1
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // load FR_X
+ fma.s1 FR_X = f8, f1, f0
+ // load error tag
+ mov GR_Parameter_TAG = 57;;
+}
+
+
+{.mfi
+ nop.m 0
+ // if s = 1, result is 0
+ (p9) fma.s0 f8 = f0, f0, f0
+ // set p6=0 for |s|>1
+ (p10) cmp.ne p6, p0 = r0, r0;;
+}
+
+
+{.mfb
+ nop.m 0
+ // if s = -1, result is pi
+ (p6) fma.s0 f8 = F_PI2_HI, f1, F_PI2_LO
+ // return if |s| = 1
+ (p9) br.ret.sptk b0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // get Infinity
+ frcpa.s1 FR_RESULT, p0 = f1, f0
+ nop.i 0;;
+}
+
+
+{.mfb
+ nop.m 0
+ // return QNaN indefinite (0*Infinity)
+ fma.s0 FR_RESULT = f0, FR_RESULT, f0
+ nop.b 0;;
+}
+
+
+GLOBAL_LIBM_END(acosl)
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
// (1)
{ .mfi
@@ -1068,12 +2510,12 @@ __libm_error_region:
.body
// (3)
{ .mib
- stfe [GR_Parameter_X] = f8 // Store Parameter 1 on stack
+ stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y
nop.b 0 // Parameter 3 address
}
{ .mib
- stfe [GR_Parameter_Y] = f10 // Store Parameter 3 on stack
+ stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
@@ -1097,11 +2539,13 @@ __libm_error_region:
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
-.type __libm_atan2_reg#,@function
-.global __libm_atan2_reg#
+
+
+
+
+
diff --git a/sysdeps/ia64/fpu/e_asin.S b/sysdeps/ia64/fpu/e_asin.S
index bb4c242fb2..398079eae4 100644
--- a/sysdeps/ia64/fpu/e_asin.S
+++ b/sysdeps/ia64/fpu/e_asin.S
@@ -1,10 +1,10 @@
.file "asin.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003 Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,818 +35,776 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
// History
//==============================================================
-// 2/02/00 Initial version
-// 8/17/00 New and much faster algorithm.
-// 8/31/00 Avoided bank conflicts on loads, shortened |x|=1 path,
+// 02/02/00 Initial version
+// 08/17/00 New and much faster algorithm.
+// 08/31/00 Avoided bank conflicts on loads, shortened |x|=1 path,
// fixed mfb split issue stalls.
// 12/19/00 Fixed small arg cases to force inexact, or inexact and underflow.
+// 08/02/02 New and much faster algorithm II
+// 02/06/03 Reordered header: .section, .global, .proc, .align
// Description
//=========================================
-// The asin function computes the principle value of the arc sine of x.
+// The asin function computes the principal value of the arc sine of x.
// asin(0) returns 0, asin(1) returns pi/2, asin(-1) returns -pi/2.
// A doman error occurs for arguments not in the range [-1,+1].
-
+//
// The asin function returns the arc sine in the range [-pi/2, +pi/2] radians.
+//
+// There are 8 paths:
+// 1. x = +/-0.0
+// Return asin(x) = +/-0.0
+//
+// 2. 0.0 < |x| < 0.625
+// Return asin(x) = x + x^3 *PolA(x^2)
+// where PolA(x^2) = A3 + A5*x^2 + A7*x^4 +...+ A35*x^32
+//
+// 3. 0.625 <=|x| < 1.0
+// Return asin(x) = sign(x) * ( Pi/2 - sqrt(R) * PolB(R))
+// Where R = 1 - |x|,
+// PolB(R) = B0 + B1*R + B2*R^2 +...+B12*R^12
+//
+// sqrt(R) is approximated using the following sequence:
+// y0 = (1 + eps)/sqrt(R) - initial approximation by frsqrta,
+// |eps| < 2^(-8)
+// Then 3 iterations are used to refine the result:
+// H0 = 0.5*y0
+// S0 = R*y0
+//
+// d0 = 0.5 - H0*S0
+// H1 = H0 + d0*H0
+// S1 = S0 + d0*S0
+//
+// d1 = 0.5 - H1*S1
+// H2 = H1 + d1*H1
+// S2 = S1 + d1*S1
+//
+// d2 = 0.5 - H2*S2
+// S3 = S2 + d2*S2
+//
+// S3 approximates sqrt(R) with enough accuracy for this algorithm
+//
+// So, the result should be reconstructed as follows:
+// asin(x) = sign(x) * (Pi/2 - S3*PolB(R))
+//
+// But for optimization purposes the reconstruction step is slightly
+// changed:
+// asin(x) = sign(x)*(Pi/2 - PolB(R)*S2) + sign(x)*d2*S2*PolB(R)
+//
+// 4. |x| = 1.0
+// Return asin(x) = sign(x)*Pi/2
+//
+// 5. 1.0 < |x| <= +INF
+// A domain error occurs for arguments not in the range [-1,+1]
+//
+// 6. x = [S,Q]NaN
+// Return asin(x) = QNaN
+//
+// 7. x is denormal
+// Return asin(x) = x + x^3,
+//
+// 8. x is unnormal
+// Normalize input in f8 and return to the very beginning of the function
+//
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input, output
+// f6, f7, f9 -> f15, f32 -> f63
-#include "libm_support.h"
+// General registers used:
+// r3, r21 -> r31, r32 -> r38
+
+// Predicate registers used:
+// p0, p6 -> p14
//
// Assembly macros
//=========================================
-
-
-// predicate registers
-//asin_pred_LEsqrt2by2 = p7
-//asin_pred_GTsqrt2by2 = p8
-
-// integer registers
-ASIN_Addr1 = r33
-ASIN_Addr2 = r34
-ASIN_FFFE = r35
-ASIN_lnorm_sig = r36
-ASIN_snorm_exp = r37
-
-GR_SAVE_B0 = r36
-GR_SAVE_PFS = r37
-GR_SAVE_GP = r38
-
-GR_Parameter_X = r39
-GR_Parameter_Y = r40
-GR_Parameter_RESULT = r41
-GR_Parameter_Tag = r42
-
-// floating point registers
-asin_coeff_P1 = f32
-asin_coeff_P2 = f33
-asin_coeff_P3 = f34
-asin_coeff_P4 = f35
-
-asin_coeff_P5 = f36
-asin_coeff_P6 = f37
-asin_coeff_P7 = f38
-asin_coeff_P8 = f39
-asin_coeff_P9 = f40
-
-asin_coeff_P10 = f41
-asin_coeff_P11 = f42
-asin_coeff_P12 = f43
-asin_coeff_P13 = f44
-asin_coeff_P14 = f45
-
-asin_coeff_P15 = f46
-asin_coeff_P16 = f47
-asin_coeff_P17 = f48
-asin_coeff_P18 = f49
-asin_coeff_P19 = f50
-
-asin_coeff_P20 = f51
-asin_coeff_P21 = f52
-asin_const_sqrt2by2 = f53
-asin_const_piby2 = f54
-asin_abs_x = f55
-
-asin_tx = f56
-asin_tx2 = f57
-asin_tx3 = f58
-asin_tx4 = f59
-asin_tx8 = f60
-
-asin_tx11 = f61
-asin_1poly_p8 = f62
-asin_1poly_p19 = f63
-asin_1poly_p4 = f64
-asin_1poly_p15 = f65
-
-asin_1poly_p6 = f66
-asin_1poly_p17 = f67
-asin_1poly_p0 = f68
-asin_1poly_p11 = f69
-asin_1poly_p2 = f70
-
-asin_1poly_p13 = f71
-asin_series_tx = f72
-asin_t = f73
-asin_t2 = f74
-asin_t3 = f75
-
-asin_t4 = f76
-asin_t8 = f77
-asin_t11 = f78
-asin_poly_p8 = f79
-asin_poly_p19 = f80
-
-asin_poly_p4 = f81
-asin_poly_p15 = f82
-asin_poly_p6 = f83
-asin_poly_p17 = f84
-asin_poly_p0 = f85
-
-asin_poly_p11 = f86
-asin_poly_p2 = f87
-asin_poly_p13 = f88
-asin_series_t = f89
-asin_1by2 = f90
-
-asin_3by2 = f91
-asin_5by2 = f92
-asin_11by4 = f93
-asin_35by8 = f94
-asin_63by8 = f95
-
-asin_231by16 = f96
-asin_y0 = f97
-asin_H0 = f98
-asin_S0 = f99
-asin_d = f100
-
-asin_l1 = f101
-asin_d2 = f102
-asin_T0 = f103
-asin_d1 = f104
-asin_e0 = f105
-
-asin_l2 = f106
-asin_d3 = f107
-asin_T3 = f108
-asin_S1 = f109
-asin_e1 = f110
-
-asin_z = f111
-answer2 = f112
-asin_sgn_x = f113
-asin_429by16 = f114
-asin_18by4 = f115
-
-asin_3by4 = f116
-asin_l3 = f117
-asin_T6 = f118
-asin_eps_exp = f119
-asin_eps_sig = f120
-asin_eps = f120
-
+// integer registers used
+// scratch
+rTblAddr = r3
+
+rPiBy2Ptr = r21
+rTmpPtr3 = r22
+rDenoBound = r23
+rOne = r24
+rAbsXBits = r25
+rHalf = r26
+r0625 = r27
+rSign = r28
+rXBits = r29
+rTmpPtr2 = r30
+rTmpPtr1 = r31
+
+// stacked
+GR_SAVE_PFS = r32
+GR_SAVE_B0 = r33
+GR_SAVE_GP = r34
+GR_Parameter_X = r35
+GR_Parameter_Y = r36
+GR_Parameter_RESULT = r37
+GR_Parameter_TAG = r38
+
+// floating point registers used
+FR_X = f10
+FR_Y = f1
+FR_RESULT = f8
+
+
+// scratch
+fXSqr = f6
+fXCube = f7
+fXQuadr = f9
+f1pX = f10
+f1mX = f11
+f1pXRcp = f12
+f1mXRcp = f13
+fH = f14
+fS = f15
+// stacked
+fA3 = f32
+fB1 = f32
+fA5 = f33
+fB2 = f33
+fA7 = f34
+fPiBy2 = f34
+fA9 = f35
+fA11 = f36
+fB10 = f35
+fB11 = f36
+fA13 = f37
+fA15 = f38
+fB4 = f37
+fB5 = f38
+fA17 = f39
+fA19 = f40
+fB6 = f39
+fB7 = f40
+fA21 = f41
+fA23 = f42
+fB3 = f41
+fB8 = f42
+fA25 = f43
+fA27 = f44
+fB9 = f43
+fB12 = f44
+fA29 = f45
+fA31 = f46
+fA33 = f47
+fA35 = f48
+fBaseP = f49
+fB0 = f50
+fSignedS = f51
+fD = f52
+fHalf = f53
+fR = f54
+fCloseTo1Pol = f55
+fSignX = f56
+fDenoBound = f57
+fNormX = f58
+fX8 = f59
+fRSqr = f60
+fRQuadr = f61
+fR8 = f62
+fX16 = f63
// Data tables
//==============================================================
-
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
-
+RODATA
.align 16
-
-asin_coeff_1_table:
-ASM_TYPE_DIRECTIVE(asin_coeff_1_table,@object)
-data8 0xE4E7E0A423A21249 , 0x00003FF8 //P7
-data8 0xC2F7EE0200FCE2A5 , 0x0000C003 //P18
-data8 0xB745D7F6C65C20E0 , 0x00003FF9 //P5
-data8 0xF75E381A323D4D94 , 0x0000C002 //P16
-data8 0x8959C2629C1024C0 , 0x0000C002 //P20
-data8 0xAFF68E7D241292C5 , 0x00003FF8 //P9
-data8 0xB6DB6DB7260AC30D , 0x00003FFA //P3
-data8 0xD0417CE2B41CB7BF , 0x0000C000 //P14
-data8 0x81D570FEA724E3E4 , 0x0000BFFD //P12
-data8 0xAAAAAAAAAAAAC277 , 0x00003FFC //P1
-data8 0xF534912FF3E7B76F , 0x00003FFF //P21
-data8 0xc90fdaa22168c235 , 0x00003fff // pi/2
-data8 0x0000000000000000 , 0x00000000 // pad to avoid data bank conflict
-ASM_SIZE_DIRECTIVE(asin_coeff_1_table)
-
-
-asin_coeff_2_table:
-ASM_TYPE_DIRECTIVE(asin_coeff_2_table,@object)
-data8 0x8E26AF5F29B39A2A , 0x00003FF9 //P6
-data8 0xB4F118A4B1015470 , 0x00004003 //P17
-data8 0xF8E38E10C25990E0 , 0x00003FF9 //P4
-data8 0x80F50489AEF1CAC6 , 0x00004002 //P15
-data8 0x92728015172CFE1C , 0x00004003 //P19
-data8 0xBBC3D831D4595971 , 0x00003FF8 //P8
-data8 0x999999999952A5C3 , 0x00003FFB //P2
-data8 0x855576BE6F0975EC , 0x00003FFF //P13
-data8 0xF12420E778077D89 , 0x00003FFA //P11
-data8 0xB6590FF4D23DE003 , 0x00003FF3 //P10
-data8 0xb504f333f9de6484 , 0x00003ffe // sqrt(2)/2
-ASM_SIZE_DIRECTIVE(asin_coeff_2_table)
+LOCAL_OBJECT_START(asin_base_range_table)
+// Ai: Polynomial coefficients for the asin(x), |x| < .625000
+// Bi: Polynomial coefficients for the asin(x), |x| > .625000
+data8 0xBFDAAB56C01AE468 //A29
+data8 0x3FE1C470B76A5B2B //A31
+data8 0xBFDC5FF82A0C4205 //A33
+data8 0x3FC71FD88BFE93F0 //A35
+data8 0xB504F333F9DE6487, 0x00003FFF //B0
+data8 0xAAAAAAAAAAAAFC18, 0x00003FFC //A3
+data8 0x3F9F1C71BC4A7823 //A9
+data8 0x3F96E8BBAAB216B2 //A11
+data8 0x3F91C4CA1F9F8A98 //A13
+data8 0x3F8C9DDCEDEBE7A6 //A15
+data8 0x3F877784442B1516 //A17
+data8 0x3F859C0491802BA2 //A19
+data8 0x9999999998C88B8F, 0x00003FFB //A5
+data8 0x3F6BD7A9A660BF5E //A21
+data8 0x3F9FC1659340419D //A23
+data8 0xB6DB6DB798149BDF, 0x00003FFA //A7
+data8 0xBFB3EF18964D3ED3 //A25
+data8 0x3FCD285315542CF2 //A27
+data8 0xF15BEEEFF7D2966A, 0x00003FFB //B1
+data8 0x3EF0DDA376D10FB3 //B10
+data8 0xBEB83CAFE05EBAC9 //B11
+data8 0x3F65FFB67B513644 //B4
+data8 0x3F5032FBB86A4501 //B5
+data8 0x3F392162276C7CBA //B6
+data8 0x3F2435949FD98BDF //B7
+data8 0xD93923D7FA08341C, 0x00003FF9 //B2
+data8 0x3F802995B6D90BDB //B3
+data8 0x3F10DF86B341A63F //B8
+data8 0xC90FDAA22168C235, 0x00003FFF // Pi/2
+data8 0x3EFA3EBD6B0ECB9D //B9
+data8 0x3EDE18BA080E9098 //B12
+LOCAL_OBJECT_END(asin_base_range_table)
-
-.align 32
-.global asin
-
.section .text
-.proc asin
-.align 32
-
-
-asin:
-
-{ .mfi
- alloc r32 = ar.pfs,1,6,4,0
- fma.s1 asin_tx = f8,f8,f0
- addl ASIN_Addr2 = @ltoff(asin_coeff_2_table),gp
-}
-{ .mfi
- mov ASIN_FFFE = 0xFFFE
- fnma.s1 asin_t = f8,f8,f1
- addl ASIN_Addr1 = @ltoff(asin_coeff_1_table),gp
+GLOBAL_LIBM_ENTRY(asin)
+asin_unnormal_back:
+{ .mfi
+ getf.d rXBits = f8 // grab bits of input value
+ // set p12 = 1 if x is a NaN, denormal, or zero
+ fclass.m p12, p0 = f8, 0xcf
+ adds rSign = 1, r0
+}
+{ .mfi
+ addl rTblAddr = @ltoff(asin_base_range_table),gp
+ // 1 - x = 1 - |x| for positive x
+ fms.s1 f1mX = f1, f1, f8
+ addl rHalf = 0xFFFE, r0 // exponent of 1/2
}
;;
-
-
-{ .mfi
- setf.exp asin_1by2 = ASIN_FFFE
- fmerge.s asin_abs_x = f1,f8
- nop.i 999 ;;
-}
-
-{ .mmf
- ld8 ASIN_Addr1 = [ASIN_Addr1]
- ld8 ASIN_Addr2 = [ASIN_Addr2]
- fmerge.s asin_sgn_x = f8,f1 ;;
-}
-
-
-{ .mfi
- ldfe asin_coeff_P7 = [ASIN_Addr1],16
- fma.s1 asin_tx2 = asin_tx,asin_tx,f0
- nop.i 999
-}
-{ .mfi
- ldfe asin_coeff_P6 = [ASIN_Addr2],16
- fma.s1 asin_t2 = asin_t,asin_t,f0
- nop.i 999;;
+{ .mfi
+ addl r0625 = 0x3FE4, r0 // high 16 bits of 0.625
+ // set p8 = 1 if x < 0
+ fcmp.lt.s1 p8, p9 = f8, f0
+ shl rSign = rSign, 63 // sign bit
}
-
-
-{ .mmf
- ldfe asin_coeff_P18 = [ASIN_Addr1],16
- ldfe asin_coeff_P17 = [ASIN_Addr2],16
- fclass.m.unc p8,p0 = f8, 0xc3 //@qnan |@snan
-}
-;;
-
-{ .mmf
- ldfe asin_coeff_P5 = [ASIN_Addr1],16
- ldfe asin_coeff_P4 = [ASIN_Addr2],16
- frsqrta.s1 asin_y0,p0 = asin_t
-}
-;;
-
-{ .mfi
- ldfe asin_coeff_P16 = [ASIN_Addr1],16
- fcmp.gt.s1 p9,p0 = asin_abs_x,f1
- nop.i 999
-}
-{ .mfb
- ldfe asin_coeff_P15 = [ASIN_Addr2],16
-(p8) fma.d f8 = f8,f1,f0
-(p8) br.ret.spnt b0
+{ .mfi
+ // point to the beginning of the table
+ ld8 rTblAddr = [rTblAddr]
+ // 1 + x = 1 - |x| for negative x
+ fma.s1 f1pX = f1, f1, f8
+ adds rOne = 0x3FF, r0
}
;;
-
-
-{ .mmf
- ldfe asin_coeff_P20 = [ASIN_Addr1],16
- ldfe asin_coeff_P19 = [ASIN_Addr2],16
- fclass.m.unc p8,p0 = f8, 0x07 //@zero
-}
-;;
-
-
-{ .mfi
- ldfe asin_coeff_P9 = [ASIN_Addr1],16
- fma.s1 asin_t4 = asin_t2,asin_t2,f0
-(p9) mov GR_Parameter_Tag = 61
-}
-{ .mfi
- ldfe asin_coeff_P8 = [ASIN_Addr2],16
- fma.s1 asin_3by2 = asin_1by2,f1,f1
- nop.i 999;;
+{ .mfi
+ andcm rAbsXBits = rXBits, rSign // bits of |x|
+ fmerge.s fSignX = f8, f1 // signum(x)
+ shl r0625 = r0625, 48 // bits of DP representation of 0.625
}
-
-
-{ .mfi
- ldfe asin_coeff_P2 = [ASIN_Addr2],16
- fma.s1 asin_tx4 = asin_tx2,asin_tx2,f0
- nop.i 999
-}
-{ .mfb
- ldfe asin_coeff_P3 = [ASIN_Addr1],16
- fma.s1 asin_t3 = asin_t,asin_t2,f0
-(p8) br.ret.spnt b0
+{ .mfb
+ setf.exp fHalf = rHalf // load 1/2 to FP reg
+ fma.s1 fXSqr = f8, f8, f0 // x^2
+ // branch on special path if x is a NaN, denormal, or zero
+(p12) br.cond.spnt asin_special
}
;;
-
-
-{ .mfi
- ldfe asin_coeff_P13 = [ASIN_Addr2],16
- fma.s1 asin_H0 = asin_y0,asin_1by2,f0
- nop.i 999
-}
-{ .mfb
- ldfe asin_coeff_P14 = [ASIN_Addr1],16
- fma.s1 asin_S0 = asin_y0,asin_t,f0
-(p9) br.cond.spnt __libm_error_region
+{ .mfi
+ adds rPiBy2Ptr = 272, rTblAddr
+ nop.f 0
+ shl rOne = rOne, 52 // bits of 1.0
+}
+{ .mfi
+ adds rTmpPtr1 = 16, rTblAddr
+ nop.f 0
+ // set p6 = 1 if |x| < 0.625
+ cmp.lt p6, p7 = rAbsXBits, r0625
}
;;
-
-
-{ .mfi
- ldfe asin_coeff_P11 = [ASIN_Addr2],16
- fcmp.eq.s1 p6,p0 = asin_abs_x,f1
- nop.i 999
-}
-{ .mfi
- ldfe asin_coeff_P12 = [ASIN_Addr1],16
- fma.s1 asin_tx3 = asin_tx,asin_tx2,f0
- nop.i 999;;
+{ .mfi
+ ldfpd fA29, fA31 = [rTblAddr] // A29, A31
+ // 1 - x = 1 - |x| for positive x
+(p9) fms.s1 fR = f1, f1, f8
+ // point to coefficient of "near 1" polynomial
+(p7) adds rTmpPtr2 = 176, rTblAddr
}
-
-
-{ .mfi
- ldfe asin_coeff_P10 = [ASIN_Addr2],16
- fma.s1 asin_1poly_p6 = asin_tx,asin_coeff_P7,asin_coeff_P6
- nop.i 999
-}
-{ .mfi
- ldfe asin_coeff_P1 = [ASIN_Addr1],16
- fma.s1 asin_poly_p6 = asin_t,asin_coeff_P7,asin_coeff_P6
- nop.i 999;;
+{ .mfi
+ ldfpd fA33, fA35 = [rTmpPtr1], 16 // A33, A35
+ // 1 + x = 1 - |x| for negative x
+(p8) fma.s1 fR = f1, f1, f8
+(p6) adds rTmpPtr2 = 48, rTblAddr
}
-
-
-{ .mfi
- ldfe asin_const_sqrt2by2 = [ASIN_Addr2],16
- fma.s1 asin_5by2 = asin_3by2,f1,f1
- nop.i 999
-}
-{ .mfi
- ldfe asin_coeff_P21 = [ASIN_Addr1],16
- fma.s1 asin_11by4 = asin_3by2,asin_3by2,asin_1by2
- nop.i 999;;
+;;
+{ .mfi
+ ldfe fB0 = [rTmpPtr1], 16 // B0
+ nop.f 0
+ nop.i 0
}
-
-
-{ .mfi
- ldfe asin_const_piby2 = [ASIN_Addr1],16
- fma.s1 asin_poly_p17 = asin_t,asin_coeff_P18,asin_coeff_P17
- nop.i 999
-}
-{ .mfb
- nop.m 999
- fma.s1 asin_3by4 = asin_3by2,asin_1by2,f0
-(p6) br.cond.spnt L(ASIN_ABS_1) // Branch to short exit if |x|=1
+{ .mib
+ adds rTmpPtr3 = 16, rTmpPtr2
+ // set p10 = 1 if |x| = 1.0
+ cmp.eq p10, p0 = rAbsXBits, rOne
+ // branch on special path for |x| = 1.0
+(p10) br.cond.spnt asin_abs_1
}
;;
-
-
-{ .mfi
- addl ASIN_lnorm_sig = -0x1,r0 // Form significand 0xffffffffffffffff
- fma.s1 asin_poly_p15 = asin_t,asin_coeff_P16,asin_coeff_P15
- nop.i 999
-}
-{ .mfi
- addl ASIN_snorm_exp = 0x0c001,r0 // Form small exponent
- fnma.s1 asin_d = asin_S0,asin_H0,asin_1by2
- nop.i 999;;
+{ .mfi
+ ldfe fA3 = [rTmpPtr2], 48 // A3 or B1
+ nop.f 0
+ adds rTmpPtr1 = 64, rTmpPtr3
}
-
-
-// Form the exponent and significand of a small number
-{ .mfi
- setf.sig asin_eps_sig = ASIN_lnorm_sig
- fma.s1 asin_poly_p19 = asin_t,asin_coeff_P20,asin_coeff_P19
- nop.i 999
-}
-{ .mfi
- setf.exp asin_eps_exp = ASIN_snorm_exp
- fma.s1 asin_poly_p4 = asin_t,asin_coeff_P5,asin_coeff_P4
- nop.i 999;;
+{ .mib
+ ldfpd fA9, fA11 = [rTmpPtr3], 16 // A9, A11 or B10, B11
+ // set p11 = 1 if |x| > 1.0
+ cmp.gt p11, p0 = rAbsXBits, rOne
+ // branch on special path for |x| > 1.0
+(p11) br.cond.spnt asin_abs_gt_1
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_1poly_p17 = asin_tx,asin_coeff_P18,asin_coeff_P17
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_poly_p8 = asin_t,asin_coeff_P9,asin_coeff_P8
- nop.i 999;;
+;;
+{ .mfi
+ ldfpd fA17, fA19 = [rTmpPtr2], 16 // A17, A19 or B6, B7
+ // initial approximation of 1 / sqrt(1 - x)
+ frsqrta.s1 f1mXRcp, p0 = f1mX
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fms.s1 asin_35by8 = asin_5by2,asin_11by4,asin_5by2
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_63by8 = asin_5by2,asin_11by4,f1
- nop.i 999;;
+{ .mfi
+ ldfpd fA13, fA15 = [rTmpPtr3] // A13, A15 or B4, B5
+ fma.s1 fXCube = fXSqr, f8, f0 // x^3
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_poly_p13 = asin_t,asin_coeff_P14,asin_coeff_P13
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_18by4 = asin_3by2,asin_5by2,asin_3by4
- nop.i 999;;
+;;
+{ .mfi
+ ldfe fA5 = [rTmpPtr2], 48 // A5 or B2
+ // initial approximation of 1 / sqrt(1 + x)
+ frsqrta.s1 f1pXRcp, p0 = f1pX
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_l1 = asin_5by2,asin_d,asin_3by2
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_d2 = asin_d,asin_d,f0
- nop.i 999;;
+{ .mfi
+ ldfpd fA21, fA23 = [rTmpPtr1], 16 // A21, A23 or B3, B8
+ fma.s1 fXQuadr = fXSqr, fXSqr, f0 // x^4
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_poly_p15 = asin_t2,asin_poly_p17,asin_poly_p15
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_T0 = asin_d,asin_S0,f0
- nop.i 999;;
+;;
+{ .mfi
+ ldfe fA7 = [rTmpPtr1] // A7 or Pi/2
+ fma.s1 fRSqr = fR, fR, f0 // R^2
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_poly_p19 = asin_t2,asin_coeff_P21,asin_poly_p19
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_poly_p4 = asin_t2,asin_poly_p6,asin_poly_p4
- nop.i 999;;
+{ .mfb
+ ldfpd fA25, fA27 = [rTmpPtr2] // A25, A27 or B9, B12
+ nop.f 0
+(p6) br.cond.spnt asin_base_range;
}
+;;
-
-{ .mfi
- nop.m 999
- fma.s1 asin_d1 = asin_35by8,asin_d,f0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_231by16 = asin_3by2,asin_35by8,asin_63by8
- nop.i 999;;
+{ .mfi
+ nop.m 0
+(p9) fma.s1 fH = fHalf, f1mXRcp, f0 // H0 for x > 0
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_poly_p2 = asin_t,asin_coeff_P3,asin_coeff_P2
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_poly_p8 = asin_t2,asin_coeff_P10,asin_poly_p8
- nop.i 999;;
+{ .mfi
+ nop.m 0
+(p9) fma.s1 fS = f1mX, f1mXRcp, f0 // S0 for x > 0
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_poly_p11 = asin_t,asin_coeff_P12,asin_coeff_P11
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_e0 = asin_d2,asin_l1,asin_d
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+(p8) fma.s1 fH = fHalf, f1pXRcp, f0 // H0 for x < 0
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_1poly_p15 = asin_tx,asin_coeff_P16,asin_coeff_P15
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_poly_p0 = asin_t,asin_coeff_P1,f1
- nop.i 999;;
+{ .mfi
+ nop.m 0
+(p8) fma.s1 fS = f1pX, f1pXRcp, f0 // S0 for x < 0
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_1poly_p19 = asin_tx,asin_coeff_P20,asin_coeff_P19
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_1poly_p4 = asin_tx,asin_coeff_P5,asin_coeff_P4
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRQuadr = fRSqr, fRSqr, f0 // R^4
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_1poly_p8 = asin_tx,asin_coeff_P9,asin_coeff_P8
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_l2 = asin_231by16,asin_d,asin_63by8
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB11 = fB11, fR, fB10
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_d3 = asin_d2,asin_d,f0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_T3 = asin_d2,asin_T0,f0
- nop.i 999;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB1 = fB1, fR, fB0
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_429by16 = asin_18by4,asin_11by4,asin_231by16
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_S1 = asin_e0,asin_S0,asin_S0
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB5 = fB5, fR, fB4
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_poly_p4 = asin_t4,asin_poly_p8,asin_poly_p4
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_poly_p15 = asin_t4,asin_poly_p19,asin_poly_p15
- nop.i 999;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB7 = fB7, fR, fB6
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_poly_p0 = asin_t2,asin_poly_p2,asin_poly_p0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_poly_p11 = asin_t2,asin_poly_p13,asin_poly_p11
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB3 = fB3, fR, fB2
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_t8 = asin_t4,asin_t4,f0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_e1 = asin_d2,asin_l2,asin_d1
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+ fnma.s1 fD = fH, fS, fHalf // d0 = 1/2 - H0*S0
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_1poly_p4 = asin_tx2,asin_1poly_p6,asin_1poly_p4
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_1poly_p15 = asin_tx2,asin_1poly_p17,asin_1poly_p15
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fR8 = fRQuadr, fRQuadr, f0 // R^8
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_1poly_p8 = asin_tx2,asin_coeff_P10,asin_1poly_p8
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_1poly_p19 = asin_tx2,asin_coeff_P21,asin_1poly_p19
- nop.i 999;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB9 = fB9, fR, fB8
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_1poly_p2 = asin_tx,asin_coeff_P3,asin_coeff_P2
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_1poly_p13 = asin_tx,asin_coeff_P14,asin_coeff_P13
- nop.i 999;;
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fB12 = fB12, fRSqr, fB11
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_1poly_p0 = asin_tx,asin_coeff_P1,f1
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_1poly_p11 = asin_tx,asin_coeff_P12,asin_coeff_P11
- nop.i 999;;
+{.mfi
+ nop.m 0
+ fma.s1 fB7 = fB7, fRSqr, fB5
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_l3 = asin_429by16,asin_d,f0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_z = asin_e1,asin_T3,asin_S1
- nop.i 999;;
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fB3 = fB3, fRSqr, fB1
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_poly_p11 = asin_t4,asin_poly_p15,asin_poly_p11
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_T6 = asin_T3,asin_d3,f0
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fH = fH, fD, fH // H1 = H0 + H0*d0
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_t11 = asin_t8,asin_t3,f0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_poly_p0 = asin_t4,asin_poly_p4,asin_poly_p0
- nop.i 999;;
+{ .mfi
+ nop.m 0
+ fma.s1 fS = fS, fD, fS // S1 = S0 + S0*d0
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_1poly_p4 = asin_tx4,asin_1poly_p8,asin_1poly_p4
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_1poly_p15 = asin_tx4,asin_1poly_p19,asin_1poly_p15
- nop.i 999;;
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fPiBy2 = fPiBy2, fSignX, f0 // signum(x)*Pi/2
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
- fma.s1 asin_1poly_p0 = asin_tx2,asin_1poly_p2,asin_1poly_p0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_1poly_p11 = asin_tx2,asin_1poly_p13,asin_1poly_p11
- nop.i 999;;
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB12 = fB12, fRSqr, fB9
+ nop.i 0
}
-
-
-{ .mfi
- nop.m 999
-// fcmp.le.s1 asin_pred_LEsqrt2by2,asin_pred_GTsqrt2by2 = asin_abs_x,asin_const_sqrt2by2
- fcmp.le.s1 p7,p8 = asin_abs_x,asin_const_sqrt2by2
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_tx8 = asin_tx4,asin_tx4,f0
- nop.i 999;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB7 = fB7, fRQuadr, fB3
+ nop.i 0
}
-
-
-// Form a small number to force inexact flag for small args
-{ .mfi
- nop.m 999
- fmerge.se asin_eps = asin_eps_exp,asin_eps_sig
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_z = asin_l3,asin_T6,asin_z
- nop.i 999;;
-}
-
-{ .mfi
- nop.m 999
- fma.s1 asin_series_t = asin_t11,asin_poly_p11,asin_poly_p0
- nop.i 999;;
-}
-
-{ .mfi
- nop.m 999
- fma.s1 asin_1poly_p0 = asin_tx4,asin_1poly_p4,asin_1poly_p0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 asin_1poly_p11 = asin_tx4,asin_1poly_p15,asin_1poly_p11
- nop.i 999;;
+;;
+{.mfi
+ nop.m 0
+ fnma.s1 fD = fH, fS, fHalf // d1 = 1/2 - H1*S1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 fSignedS = fSignX, fS, f0 // -signum(x)*S1
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fCloseTo1Pol = fB12, fR8, fB7
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fH = fH, fD, fH // H2 = H1 + H1*d1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fS = fS, fD, fS // S2 = S1 + S1*d1
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // -signum(x)* S2 = -signum(x)*(S1 + S1*d1)
+ fma.s1 fSignedS = fSignedS, fD, fSignedS
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fnma.s1 fD = fH, fS, fHalf // d2 = 1/2 - H2*S2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // signum(x)*(Pi/2 - PolB*S2)
+ fma.s1 fPiBy2 = fSignedS, fCloseTo1Pol, fPiBy2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // -signum(x)*PolB * S2
+ fma.s1 fCloseTo1Pol = fSignedS, fCloseTo1Pol, f0
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ // final result for 0.625 <= |x| < 1
+ fma.d.s0 f8 = fCloseTo1Pol, fD, fPiBy2
+ // exit here for 0.625 <= |x| < 1
+ br.ret.sptk b0
}
+;;
-
-{ .mfi
- nop.m 999
- fma.s1 asin_tx11 = asin_tx8,asin_tx3,f0
- nop.i 999;;
-}
-
-{ .mfi
- nop.m 999
-//(asin_pred_GTsqrt2by2) fnma.s1 answer2 = asin_z,asin_series_t,asin_const_piby2
-(p8) fnma.s1 answer2 = asin_z,asin_series_t,asin_const_piby2
- nop.i 999;;
-}
-
-{ .mfi
- nop.m 999
- fma.s1 asin_series_tx = asin_tx11,asin_1poly_p11,asin_1poly_p0
- nop.i 999;;
-}
-
-{ .mfi
- nop.m 999
-//(asin_pred_GTsqrt2by2) fma.d f8 = asin_sgn_x,answer2,f0
-(p8) fma.d f8 = asin_sgn_x,answer2,f0
- nop.i 999;;
-}
-
-// asin_eps is added only to force inexact and possibly underflow flag
-// in case asin_series_tx is zero
-//
-{ .mfi
- nop.m 999
-(p7) fma.d asin_eps = f8,asin_series_tx,asin_eps
- nop.i 999
-}
-{ .mfb
- nop.m 999
-//(asin_pred_LEsqrt2by2) fma.d f8 = f8,asin_series_tx,f0
-(p7) fma.d f8 = f8,asin_series_tx,f0
- br.ret.sptk b0
-}
+
+// here if |x| < 0.625
+.align 32
+asin_base_range:
+{ .mfi
+ nop.m 0
+ fma.s1 fA33 = fA33, fXSqr, fA31
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, fXSqr, fA13
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA29 = fA29, fXSqr, fA27
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA25 = fA25, fXSqr, fA23
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA21 = fA21, fXSqr, fA19
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, fXSqr, fA7
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA5 = fA5, fXSqr, fA3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA35 = fA35, fXQuadr, fA33
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA17 = fA17, fXQuadr, fA15
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fX8 = fXQuadr, fXQuadr, f0 // x^8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA25 = fA25, fXQuadr, fA21
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, fXQuadr, fA5
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA35 = fA35, fXQuadr, fA29
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA17 = fA17, fXSqr, fA11
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fX16 = fX8, fX8, f0 // x^16
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA35 = fA35, fX8, fA25
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA17 = fA17, fX8, fA9
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fBaseP = fA35, fX16, fA17
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ // final result for |x| < 0.625
+ fma.d.s0 f8 = fBaseP, fXCube, f8
+ // exit here for |x| < 0.625 path
+ br.ret.sptk b0
+}
;;
+// here if |x| = 1
+// asin(x) = sign(x) * Pi/2
+.align 32
+asin_abs_1:
+{ .mfi
+ ldfe fPiBy2 = [rPiBy2Ptr] // Pi/2
+ nop.f 0
+ nop.i 0
+}
+;;
+{.mfb
+ nop.m 0
+ // result for |x| = 1.0
+ fma.d.s0 f8 = fPiBy2, fSignX, f0
+ // exit here for |x| = 1.0
+ br.ret.sptk b0
+}
+;;
-L(ASIN_ABS_1):
-// Here for short exit if |x|=1
-{ .mfb
- nop.m 999
- fma.d f8 = asin_sgn_x,asin_const_piby2,f0
- br.ret.sptk b0
-}
+// here if x is a NaN, denormal, or zero
+.align 32
+asin_special:
+{ .mfi
+ nop.m 0
+ // set p12 = 1 if x is a NaN
+ fclass.m p12, p0 = f8, 0xc3
+ nop.i 0
+}
+{ .mlx
+ nop.m 0
+ // smallest positive DP normalized number
+ movl rDenoBound = 0x0010000000000000
+}
+;;
+{ .mfi
+ nop.m 0
+ // set p13 = 1 if x = 0.0
+ fclass.m p13, p0 = f8, 0x07
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnorm.s1 fNormX = f8
+ nop.i 0
+}
+;;
+{ .mfb
+ // load smallest normal to FP reg
+ setf.d fDenoBound = rDenoBound
+ // answer if x is a NaN
+(p12) fma.d.s0 f8 = f8,f1,f0
+ // exit here if x is a NaN
+(p12) br.ret.spnt b0
+}
+;;
+{ .mfb
+ nop.m 0
+ nop.f 0
+ // exit here if x = 0.0
+(p13) br.ret.spnt b0
+}
+;;
+// if we are still here then x is denormal or unnormal
+{ .mfi
+ nop.m 0
+ // absolute value of normalized x
+ fmerge.s fNormX = f1, fNormX
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // set p14 = 1 if |normalized x| is greater than or
+ // equal to the smallest normalized DP value.
+ // So, if p14 is set to 1 it means that we deal with
+ // an unnormal rather than with a "true" denormal
+ fcmp.ge.s1 p14, p0 = fNormX, fDenoBound
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+(p14) fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag if x unnormal
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ // normalize unnormal input
+(p14) fnorm.s1 f8 = f8
+ // return to the main path
+(p14) br.cond.sptk asin_unnormal_back
+}
+;;
+// if we are still here it means that the input is a "true" denormal
+{ .mfb
+ nop.m 0
+ // final result if x is denormal
+ fma.d.s0 f8 = f8, fXSqr, f8
+ // exit here if x is denormal
+ br.ret.sptk b0
+}
;;
+// here if |x| > 1.0
+// error handler should be called
+.align 32
+asin_abs_gt_1:
+{ .mfi
+ alloc r32 = ar.pfs, 0, 3, 4, 0 // get some registers
+ fmerge.s FR_X = f8,f8
+ nop.i 0
+}
+{ .mfb
+ mov GR_Parameter_TAG = 61 // error code
+ frcpa.s0 FR_RESULT, p0 = f0,f0
+ // call error handler routine
+ br.cond.sptk __libm_error_region
+}
+;;
+GLOBAL_LIBM_END(asin)
-.endp asin
-ASM_SIZE_DIRECTIVE(asin)
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
- nop.f 999
+ nop.f 0
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
@@ -857,28 +815,29 @@ __libm_error_region:
mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
- stfs [GR_Parameter_Y] = f1,16 // Store Parameter 2 on stack
+ stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
-
.body
- frcpa.s0 f9,p0 = f0,f0
-;;
-
{ .mib
- stfd [GR_Parameter_X] = f8 // Store Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y
- nop.b 0 // Parameter 3 address
+ stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
}
{ .mib
- stfd [GR_Parameter_Y] = f9,-16 // Store Parameter 3 on stack
- adds r32 = 48,sp
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
- ldfd f8 = [r32] // Get return result off stack
+ add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
+};;
+{ .mmi
+ ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
@@ -887,11 +846,8 @@ __libm_error_region:
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
-
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
-
-.type __libm_error_support,@function
-.global __libm_error_support
+LOCAL_LIBM_END(__libm_error_region)
+.type __libm_error_support#,@function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_asinf.S b/sysdeps/ia64/fpu/e_asinf.S
index ddae85880b..f9a1312b26 100644
--- a/sysdeps/ia64/fpu/e_asinf.S
+++ b/sysdeps/ia64/fpu/e_asinf.S
@@ -1,10 +1,10 @@
.file "asinf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
-// Contributed 2/02/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,21 +35,25 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
// History
//==============================================================
-// 2/02/00 Initial revision
-// 6/28/00 Improved speed
-// 6/31/00 Changed register allocation because of some duplicate macros
+// 02/02/00 Initial version
+// 06/28/00 Improved speed
+// 06/31/00 Changed register allocation because of some duplicate macros
// moved nan exit bundle up to gain a cycle.
-// 8/08/00 Improved speed by avoiding SIR flush.
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 08/08/00 Improved speed by avoiding SIR flush.
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
-// 8/17/00 Changed predicate register macro-usage to direct predicate
+// 08/17/00 Changed predicate register macro-usage to direct predicate
// names due to an assembler bug.
// 10/17/00 Improved speed of x=0 and x=1 paths, set D flag if x denormal.
+// 03/13/01 Corrected sign of imm1 value in dep instruction.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/06/03 Reordered header: .section, .global, .proc, .align
+
// Description
//=========================================
// The asinf function computes the arc sine of x in the range [-pi,+pi].
@@ -119,7 +123,6 @@
// answer2 = - sign(x) z P(t) + (sign(x) pi/2)
//
-#include "libm_support.h"
// Assembly macros
//=========================================
@@ -225,42 +228,30 @@ asinf_poly_p1a = f90
// Data tables
//==============================================================
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+RODATA
.align 16
-asinf_coeff_1_table:
-ASM_TYPE_DIRECTIVE(asinf_coeff_1_table,@object)
+LOCAL_OBJECT_START(asinf_coeff_1_table)
data8 0x3FC5555607DCF816 // P1
data8 0x3F9CF81AD9BAB2C6 // P4
data8 0x3FC59E0975074DF3 // P7
data8 0xBFA6F4CC2780AA1D // P6
data8 0x3FC2DD45292E93CB // P9
data8 0x3fe6a09e667f3bcd // sqrt(2)/2
-ASM_SIZE_DIRECTIVE(asinf_coeff_1_table)
+LOCAL_OBJECT_END(asinf_coeff_1_table)
-asinf_coeff_2_table:
-ASM_TYPE_DIRECTIVE(asinf_coeff_2_table,@object)
+LOCAL_OBJECT_START(asinf_coeff_2_table)
data8 0x3FA6F108E31EFBA6 // P3
data8 0xBFCA31BF175D82A0 // P8
data8 0x3FA30C0337F6418B // P5
data8 0x3FB332C9266CB1F9 // P2
data8 0x3ff921fb54442d18 // pi_by_2
-ASM_SIZE_DIRECTIVE(asinf_coeff_2_table)
+LOCAL_OBJECT_END(asinf_coeff_2_table)
-.align 32
-.global asinf
-
.section .text
-.proc asinf
-.align 32
-
-asinf:
+GLOBAL_LIBM_ENTRY(asinf)
// Load the addresses of the two tables.
// Then, load the coefficients and other constants.
@@ -345,7 +336,7 @@ asinf:
}
{ .mfb
nop.m 999
-(p8) fma.s f8 = f8,f1,f0
+(p8) fma.s.s0 f8 = f8,f1,f0
(p8) br.ret.spnt b0 ;; // Exit if x=nan
}
@@ -370,7 +361,7 @@ asinf:
{ .mfb
nop.m 999
fma.s1 asinf_t4 = asinf_t2,asinf_t2,f0
-(p6) br.cond.spnt L(ASINF_ABS_ONE) ;; // Branch if |x|=1
+(p6) br.cond.spnt ASINF_ABS_ONE ;; // Branch if |x|=1
}
{ .mfi
@@ -572,28 +563,26 @@ asinf:
.pred.rel "mutex",p8,p7 //asinf_pred_GTsqrt2by2,asinf_pred_LEsqrt2by2
{ .mfi
nop.m 999
-(p8) fnma.s f8 = asinf_z,asinf_Pt,asinf_sgn_x_piby2
+(p8) fnma.s.s0 f8 = asinf_z,asinf_Pt,asinf_sgn_x_piby2
nop.i 999
}
{ .mfb
nop.m 999
-(p7) fma.s f8 = asinf_x11,asinf_poly_Bx,asinf_poly_Ax
+(p7) fma.s.s0 f8 = asinf_x11,asinf_poly_Bx,asinf_poly_Ax
br.ret.sptk b0 ;;
}
-L(ASINF_ABS_ONE):
+ASINF_ABS_ONE:
// Here for short exit if |x|=1
{ .mfb
nop.m 999
- fma.s f8 = asinf_sgn_x,asinf_const_piby2,f0
+ fma.s.s0 f8 = asinf_sgn_x,asinf_const_piby2,f0
br.ret.sptk b0
}
;;
-.endp asinf
-ASM_SIZE_DIRECTIVE(asinf)
-
+GLOBAL_LIBM_END(asinf)
// Stack operations when calling error support.
// (1) (2)
// sp -> + psp -> +
@@ -623,8 +612,7 @@ ASM_SIZE_DIRECTIVE(asinf)
// restore gp
// restore ar.pfs
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
@@ -680,8 +668,7 @@ __libm_error_region:
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_asinl.S b/sysdeps/ia64/fpu/e_asinl.S
index 9153832090..bf5feba155 100644
--- a/sysdeps/ia64/fpu/e_asinl.S
+++ b/sysdeps/ia64/fpu/e_asinl.S
@@ -1,10 +1,10 @@
.file "asinl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2001 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,720 +20,2448 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
-// set [the previously overwritten] GR_Parameter_RESULT.
+// 08/28/01 New version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/06/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
-// long double = asinl(long double)
-// input floating point f8
-// output floating point f8
+// long double asinl(long double)
//
-// Registers used
+// Overview of operation
//==============================================================
+// Background
//
-// predicate registers used:
-// p6 -> p12
+// Implementation
//
-// floating-point registers used:
-// f8 has input, then output
-// f32 -> f87, f8 -> f13, f32 -> f87
+// For |s| in [2^{-4}, sqrt(2)/2]:
+// Let t= 2^k*1.b1 b2..b6 1, where s= 2^k*1.b1 b2.. b52
+// asin(s)= asin(t)+asin(r), where r= s*sqrt(1-t^2)-t*sqrt(1-s^2), i.e.
+// r= (s-t)*sqrt(1-t^2)-t*sqrt(1-t^2)*(sqrt((1-s^2)/(1-t^2))-1)
+// asin(r)-r evaluated as 9-degree polynomial (c3*r^3+c5*r^5+c7*r^7+c9*r^9)
+// The 64-bit significands of sqrt(1-t^2), 1/(1-t^2) are read from the table,
+// along with the high and low parts of asin(t) (stored as two double precision
+// values)
//
-// general registers used:
-// r32 -> r47
+// |s| in (sqrt(2)/2, sqrt(255/256)):
+// Let t= 2^k*1.b1 b2..b6 1, where (1-s^2)*frsqrta(1-s^2)= 2^k*1.b1 b2..b6..
+// asin(|s|)= pi/2-asin(t)+asin(r), r= s*t-sqrt(1-s^2)*sqrt(1-t^2)
+// To minimize accumulated errors, r is computed as
+// r= (t*s)_s-t^2*y*z+z*y*(t^2-1+s^2)_s+z*y*(1-s^2)_s*x+z'*y*(1-s^2)*PS29+
+// +(t*s-(t*s)_s)+z*y*((t^2-1-(t^2-1+s^2)_s)+s^2)+z*y*(1-s^2-(1-s^2)_s)+
+// +ez*z'*y*(1-s^2)*(1-x),
+// where y= frsqrta(1-s^2), z= (sqrt(1-t^2))_s (rounded to 24 significant bits)
+// z'= sqrt(1-t^2), x= ((1-s^2)*y^2-1)/2
+//
+// |s|<2^{-4}: evaluate as 17-degree polynomial
+// (or simply return s, if |s|<2^{-64})
+//
+// |s| in [sqrt(255/256), 1): asin(|s|)= pi/2-asin(sqrt(1-s^2))
+// use 17-degree polynomial for asin(sqrt(1-s^2)),
+// 9-degree polynomial to evaluate sqrt(1-s^2)
+// High order term is (pi/2)_high-(y*(1-s^2))_high
//
-// Overview of operation
-//==============================================================
-// There are three paths
-// 1. |x| < 2^-40 ASIN_TINY
-// 2. 2^-40 <= |x| < 1/4 ASIN_POLY
-// 3. 1/4 <= |x| < 1 ASIN_ATAN
-#include "libm_support.h"
-// Assembly macros
-//==============================================================
-FR_RESULT = f10
-FR_X = f8
-FR_Y = f1
-asin_P79 = f32
-asin_P59 = f33
-asin_P39 = f34
-asin_P19 = f35
-
-asin_P810 = f36
-asin_P610 = f37
-asin_P410 = f38
-asin_P210 = f39
-
-asin_A1 = f41
-asin_A2 = f42
-asin_A3 = f43
-asin_A4 = f44
-asin_A5 = f45
-asin_A6 = f46
-asin_A7 = f47
-asin_A8 = f48
-asin_A9 = f49
-asin_A10 = f50
-
-asin_X2 = f51
-asin_X4 = f52
-
-asin_B = f53
-asin_Bb = f54
-asin_C = f55
-asin_Cc = f56
-asin_D = f57
-
-asin_W = f58
-asin_Ww = f59
-
-asin_y0 = f60
-asin_y1 = f61
-asin_y2 = f62
-
-asin_H = f63
-asin_Hh = f64
-
-asin_t1 = f65
-asin_t2 = f66
-asin_t3 = f67
-asin_t4 = f68
-asin_t5 = f69
-
-asin_Pseries = f70
-asin_NORM_f8 = f71
-asin_ABS_NORM_f8 = f72
-
-asin_2m100 = f73
-asin_P1P2 = f74
-asin_HALF = f75
-asin_1mD = f76
-
-asin_1mB = f77
-asin_1mBmC = f78
-asin_S = f79
-
-asin_BmWW = f80
-asin_BmWWpb = f81
-asin_2W = f82
-asin_1d2W = f83
-asin_Dd = f84
-
-asin_XWw = f85
-asin_low = f86
-
-asin_pi_by_2 = f87
-asin_pi_by_2_lo = f88
-
-asin_GR_17_ones = r33
-asin_GR_16_ones = r34
-asin_GR_signexp_f8 = r35
-asin_GR_exp = r36
-asin_GR_true_exp = r37
-asin_GR_ff9b = r38
-
-GR_SAVE_B0 = r39
-GR_SAVE_SP = r40
-GR_SAVE_PFS = r33
-// r33 can be used safely.
-// r40 is address of table of coefficients
-// Later it is used to save sp across calls
-GR_SAVE_GP = r41
-asin_GR_fffe = r42
-asin_GR_retval = r43
-
-GR_Parameter_X = r44
-GR_Parameter_Y = r45
-GR_Parameter_RESULT = r46
-GR_Parameter_TAG = r47
-
-
-// 2^-40:
-// A true exponent of -40 is
-// : -40 + register_bias
-// : -28 + ffff = ffd7
-
-// A true exponent of -100 is
-// : -100 + register_bias
-// : -64 + ffff = ff9b
-
-// Data tables
+
+// Registers used
//==============================================================
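+// f6-f15, f32-f36 (NOTE: the F_* equates in this file name f6-f15 and f32-f115)
+// r2-r3, r23-r23 (NOTE: the R_* equates in this file name r15 and r20-r29)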
+// p6, p7, p8, p12
+//
+
+
+ GR_SAVE_B0= r33 // NOTE(review): GR_SAVE_* and GR_Parameter_* are not referenced in the code visible here
+ GR_SAVE_PFS= r34
+ GR_SAVE_GP= r35 // This reg. can safely be used
+ GR_SAVE_SP= r36 // presumably save slots for b0, ar.pfs, gp and sp around a call -- TODO confirm
+
+ GR_Parameter_X= r37 // presumably argument registers for an error-reporting helper -- TODO confirm
+ GR_Parameter_Y= r38
+ GR_Parameter_RESULT= r39
+ GR_Parameter_TAG= r40
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+ FR_X= f10 // f10 is also named F_INV_1T2 by the F_* equates in this file
+ FR_Y= f1 // f1 is the register the entry code uses as the constant addend in 1-s^2
+ FR_RESULT= f8 // f8 is also the register the entry code reads the argument s from
+
+
+
+RODATA
.align 16
-asin_coefficients:
-ASM_TYPE_DIRECTIVE(asin_coefficients,@object)
-data8 0xBB08911F2013961E, 0x00003FF8 // A10
-data8 0x981F1095A23A87D3, 0x00003FF8 // A9
-data8 0xBDF09C6C4177BCC6, 0x00003FF8 // A8
-data8 0xE4C3A60B049ACCEA, 0x00003FF8 // A7
-data8 0x8E2789F4E8A8F1AD, 0x00003FF9 // A6
-data8 0xB745D09B2B0E850B, 0x00003FF9 // A5
-data8 0xF8E38E3BC4C50920, 0x00003FF9 // A4
-data8 0xB6DB6DB6D89FCD81, 0x00003FFA // A3
-data8 0x99999999999AF376, 0x00003FFB // A2
-data8 0xAAAAAAAAAAAAAA71, 0x00003FFC // A1
-
-data8 0xc90fdaa22168c234, 0x00003FFF // pi_by_2_hi
-data8 0xc4c6628b80dc1cd1, 0x00003FBF // pi_by_2_lo
-ASM_SIZE_DIRECTIVE(asin_coefficients)
-
-.align 32
-.global asinl#
+
+
+LOCAL_OBJECT_START(T_table)
+
+// each 32-byte entry stores, in order: 64-bit significand of 1/(1-t^2), 64-bit significand
+// of sqrt(1-t^2), asin(t)_high (double precision), asin(t)_low (double precision)
+
+data8 0x80828692b71c4391, 0xff7ddcec2d87e879
+data8 0x3fb022bc0ae531a0, 0x3c9f599c7bb42af6
+data8 0x80869f0163d0b082, 0xff79cad2247914d3
+data8 0x3fb062dd26afc320, 0x3ca4eff21bd49c5c
+data8 0x808ac7d5a8690705, 0xff75a89ed6b626b9
+data8 0x3fb0a2ff4a1821e0, 0x3cb7e33b58f164cc
+data8 0x808f0112ad8ad2e0, 0xff7176517c2cc0cb
+data8 0x3fb0e32279319d80, 0x3caee31546582c43
+data8 0x80934abba8a1da0a, 0xff6d33e949b1ed31
+data8 0x3fb12346b8101da0, 0x3cb8bfe463d087cd
+data8 0x8097a4d3dbe63d8f, 0xff68e16571015c63
+data8 0x3fb1636c0ac824e0, 0x3c8870a7c5a3556f
+data8 0x809c0f5e9662b3dd, 0xff647ec520bca0f0
+data8 0x3fb1a392756ed280, 0x3c964f1a927461ae
+data8 0x80a08a5f33fadc66, 0xff600c07846a6830
+data8 0x3fb1e3b9fc19e580, 0x3c69eb3576d56332
+data8 0x80a515d91d71acd4, 0xff5b892bc475affa
+data8 0x3fb223e2a2dfbe80, 0x3c6a4e19fd972fb6
+data8 0x80a9b1cfc86ff7cd, 0xff56f631062cf93d
+data8 0x3fb2640c6dd76260, 0x3c62041160e0849e
+data8 0x80ae5e46b78b0d68, 0xff5253166bc17794
+data8 0x3fb2a43761187c80, 0x3cac61651af678c0
+data8 0x80b31b417a4b756b, 0xff4d9fdb14463dc8
+data8 0x3fb2e46380bb6160, 0x3cb06ef23eeba7a1
+data8 0x80b7e8c3ad33c369, 0xff48dc7e1baf6738
+data8 0x3fb32490d0d910c0, 0x3caa05f480b300d5
+data8 0x80bcc6d0f9c784d6, 0xff4408fe9ad13e37
+data8 0x3fb364bf558b3820, 0x3cb01e7e403aaab9
+data8 0x80c1b56d1692492d, 0xff3f255ba75f5f4e
+data8 0x3fb3a4ef12ec3540, 0x3cb4fe8fcdf5f5f1
+data8 0x80c6b49bc72ec446, 0xff3a319453ebd961
+data8 0x3fb3e5200d171880, 0x3caf2dc089b2b7e2
+data8 0x80cbc460dc4e0ae8, 0xff352da7afe64ac6
+data8 0x3fb425524827a720, 0x3cb75a855e7c6053
+data8 0x80d0e4c033bee9c4, 0xff301994c79afb32
+data8 0x3fb46585c83a5e00, 0x3cb3264981c019ab
+data8 0x80d615bdb87556db, 0xff2af55aa431f291
+data8 0x3fb4a5ba916c73c0, 0x3c994251d94427b5
+data8 0x80db575d6291fd8a, 0xff25c0f84bae0cb9
+data8 0x3fb4e5f0a7dbdb20, 0x3cbee2fcc4c786cb
+data8 0x80e0a9a33769e535, 0xff207c6cc0ec09fd
+data8 0x3fb526280fa74620, 0x3c940656e5549b91
+data8 0x80e60c93498e32cd, 0xff1b27b703a19c98
+data8 0x3fb56660ccee2740, 0x3ca7082374d7b2cd
+data8 0x80eb8031b8d4052d, 0xff15c2d6105c72f8
+data8 0x3fb5a69ae3d0b520, 0x3c7c4d46e09ac68a
+data8 0x80f10482b25c6c8a, 0xff104dc8e0813ed4
+data8 0x3fb5e6d6586fec20, 0x3c9aa84ffd9b4958
+data8 0x80f6998a709c7cfb, 0xff0ac88e6a4ab926
+data8 0x3fb627132eed9140, 0x3cbced2cbbbe7d16
+data8 0x80fc3f4d3b657c44, 0xff053325a0c8a2ec
+data8 0x3fb667516b6c34c0, 0x3c6489c5fc68595a
+data8 0x8101f5cf67ed2af8, 0xfeff8d8d73dec2bb
+data8 0x3fb6a791120f33a0, 0x3cbe12acf159dfad
+data8 0x8107bd1558d6291f, 0xfef9d7c4d043df29
+data8 0x3fb6e7d226fabba0, 0x3ca386d099cd0dc7
+data8 0x810d95237e38766a, 0xfef411ca9f80b5f7
+data8 0x3fb72814ae53cc20, 0x3cb9f35731e71dd6
+data8 0x81137dfe55aa0e29, 0xfeee3b9dc7eef009
+data8 0x3fb76858ac403a00, 0x3c74df3dd959141a
+data8 0x811977aa6a479f0f, 0xfee8553d2cb8122c
+data8 0x3fb7a89e24e6b0e0, 0x3ca6034406ee42bc
+data8 0x811f822c54bd5ef8, 0xfee25ea7add46a91
+data8 0x3fb7e8e51c6eb6a0, 0x3cb82f8f78e68ed7
+data8 0x81259d88bb4ffac1, 0xfedc57dc2809fb1d
+data8 0x3fb8292d9700ad60, 0x3cbebb73c0e653f9
+data8 0x812bc9c451e5a257, 0xfed640d974eb6068
+data8 0x3fb8697798c5d620, 0x3ca2feee76a9701b
+data8 0x813206e3da0f3124, 0xfed0199e6ad6b585
+data8 0x3fb8a9c325e852e0, 0x3cb9e88f2f4d0efe
+data8 0x813854ec231172f9, 0xfec9e229dcf4747d
+data8 0x3fb8ea1042932a00, 0x3ca5ff40d81f66fd
+data8 0x813eb3e209ee858f, 0xfec39a7a9b36538b
+data8 0x3fb92a5ef2f247c0, 0x3cb5e3bece4d6b07
+data8 0x814523ca796f56ce, 0xfebd428f72561efe
+data8 0x3fb96aaf3b3281a0, 0x3cb7b9e499436d7c
+data8 0x814ba4aa6a2d3ff9, 0xfeb6da672bd48fe4
+data8 0x3fb9ab011f819860, 0x3cb9168143cc1a7f
+data8 0x81523686e29bbdd7, 0xfeb062008df81f50
+data8 0x3fb9eb54a40e3ac0, 0x3cb6e544197eb1e1
+data8 0x8158d964f7124614, 0xfea9d95a5bcbd65a
+data8 0x3fba2ba9cd080800, 0x3ca9a717be8f7446
+data8 0x815f8d49c9d639e4, 0xfea34073551e1ac8
+data8 0x3fba6c009e9f9260, 0x3c741e989a60938a
+data8 0x8166523a8b24f626, 0xfe9c974a367f785c
+data8 0x3fbaac591d0661a0, 0x3cb2c1290107e57d
+data8 0x816d283c793e0114, 0xfe95ddddb94166cb
+data8 0x3fbaecb34c6ef600, 0x3c9c7d5fbaec405d
+data8 0x81740f54e06d55bd, 0xfe8f142c93750c50
+data8 0x3fbb2d0f310cca00, 0x3cbc09479a9cbcfb
+data8 0x817b07891b15cd5e, 0xfe883a3577e9fceb
+data8 0x3fbb6d6ccf1455e0, 0x3cb9450bff4ee307
+data8 0x818210de91bba6c8, 0xfe814ff7162cf62f
+data8 0x3fbbadcc2abb1180, 0x3c9227fda12a8d24
+data8 0x81892b5abb0f2bf9, 0xfe7a55701a8697b1
+data8 0x3fbbee2d48377700, 0x3cb6fad72acfe356
+data8 0x819057031bf7760e, 0xfe734a9f2dfa1810
+data8 0x3fbc2e902bc10600, 0x3cb4465b588d16ad
+data8 0x819793dd479d4fbe, 0xfe6c2f82f643f68b
+data8 0x3fbc6ef4d9904580, 0x3c8b9ac54823960d
+data8 0x819ee1eedf76367a, 0xfe65041a15d8a92c
+data8 0x3fbcaf5b55dec6a0, 0x3ca2b8d28a954db2
+data8 0x81a6413d934f7a66, 0xfe5dc8632be3477f
+data8 0x3fbcefc3a4e727a0, 0x3c9380da83713ab4
+data8 0x81adb1cf21597d4b, 0xfe567c5cd44431d5
+data8 0x3fbd302dcae51600, 0x3ca995b83421756a
+data8 0x81b533a9563310b8, 0xfe4f2005a78fb50f
+data8 0x3fbd7099cc155180, 0x3caefa2f7a817d5f
+data8 0x81bcc6d20cf4f373, 0xfe47b35c3b0caaeb
+data8 0x3fbdb107acb5ae80, 0x3cb455fc372dd026
+data8 0x81c46b4f2f3d6e68, 0xfe40365f20b316d6
+data8 0x3fbdf177710518c0, 0x3cbee3dcc5b01434
+data8 0x81cc2126b53c1144, 0xfe38a90ce72abf36
+data8 0x3fbe31e91d439620, 0x3cb3e131c950aebd
+data8 0x81d3e85ea5bd8ee2, 0xfe310b6419c9c33a
+data8 0x3fbe725cb5b24900, 0x3c01d3fac6029027
+data8 0x81dbc0fd1637b9c1, 0xfe295d6340932d15
+data8 0x3fbeb2d23e937300, 0x3c6304cc44aeedd1
+data8 0x81e3ab082ad5a0a4, 0xfe219f08e03580b3
+data8 0x3fbef349bc2a77e0, 0x3cac1d2d6abe9c72
+data8 0x81eba6861683cb97, 0xfe19d0537a0946e2
+data8 0x3fbf33c332bbe020, 0x3ca0909dba4e96ca
+data8 0x81f3b37d1afc9979, 0xfe11f1418c0f94e2
+data8 0x3fbf743ea68d5b60, 0x3c937fc12a2a779a
+data8 0x81fbd1f388d4be45, 0xfe0a01d190f09063
+data8 0x3fbfb4bc1be5c340, 0x3cbf51a504b55813
+data8 0x820401efbf87e248, 0xfe020201fff9efea
+data8 0x3fbff53b970d1e80, 0x3ca625444b260078
+data8 0x82106ad2ffdca049, 0xfdf5e3940a49135e
+data8 0x3fc02aff52065460, 0x3c9125d113e22a57
+data8 0x8221343d6ea1d3e2, 0xfde581a45429b0a0
+data8 0x3fc06b84f8e03220, 0x3caccf362295894b
+data8 0x82324434adbf99c2, 0xfdd4de1a001fb775
+data8 0x3fc0ac0ed1fe7240, 0x3cc22f676096b0af
+data8 0x82439aee8d0c7747, 0xfdc3f8e8269d1f03
+data8 0x3fc0ec9cee9e4820, 0x3cca147e2886a628
+data8 0x825538a1d0fcb2f0, 0xfdb2d201a9b1ba66
+data8 0x3fc12d2f6006f0a0, 0x3cc72b36633bc2d4
+data8 0x82671d86345c5cee, 0xfda1695934d723e7
+data8 0x3fc16dc63789de60, 0x3cb11f9c47c7b83f
+data8 0x827949d46a121770, 0xfd8fbee13cbbb823
+data8 0x3fc1ae618682e620, 0x3cce1b59020cef8e
+data8 0x828bbdc61eeab9ba, 0xfd7dd28bff0c9f34
+data8 0x3fc1ef015e586c40, 0x3cafec043e0225ee
+data8 0x829e7995fb6de9e1, 0xfd6ba44b823ee1ca
+data8 0x3fc22fa5d07b90c0, 0x3cba905409caf8e3
+data8 0x82b17d7fa5bbc982, 0xfd5934119557883a
+data8 0x3fc2704eee685da0, 0x3cb5ef21838a823e
+data8 0x82c4c9bfc373d276, 0xfd4681cfcfb2c161
+data8 0x3fc2b0fcc9a5f3e0, 0x3ccc7952c5e0e312
+data8 0x82d85e93fba50136, 0xfd338d7790ca0f41
+data8 0x3fc2f1af73c6ba00, 0x3cbecf5f977d1ca9
+data8 0x82ec3c3af8c76b32, 0xfd2056f9fff97727
+data8 0x3fc33266fe6889a0, 0x3c9d329c022ebdb5
+data8 0x830062f46abf6022, 0xfd0cde480c43b327
+data8 0x3fc373237b34de60, 0x3cc95806d4928adb
+data8 0x8314d30108ea35f0, 0xfcf923526c1562b2
+data8 0x3fc3b3e4fbe10520, 0x3cbc299fe7223d54
+data8 0x83298ca29434df97, 0xfce526099d0737ed
+data8 0x3fc3f4ab922e4a60, 0x3cb59d8bb8fdbccc
+data8 0x833e901bd93c7009, 0xfcd0e65de39f1f7c
+data8 0x3fc435774fea2a60, 0x3c9ec18b43340914
+data8 0x8353ddb0b278aad8, 0xfcbc643f4b106055
+data8 0x3fc4764846ee80a0, 0x3cb90402efd87ed6
+data8 0x836975a60a70c52e, 0xfca79f9da4fab13a
+data8 0x3fc4b71e8921b860, 0xbc58f23449ed6365
+data8 0x837f5841ddfa7a46, 0xfc92986889284148
+data8 0x3fc4f7fa2876fca0, 0xbc6294812bf43acd
+data8 0x839585cb3e839773, 0xfc7d4e8f554ab12f
+data8 0x3fc538db36ee6960, 0x3cb910b773d4c578
+data8 0x83abfe8a5466246f, 0xfc67c2012cb6fa68
+data8 0x3fc579c1c6953cc0, 0x3cc5ede909fc47fc
+data8 0x83c2c2c861474d91, 0xfc51f2acf82041d5
+data8 0x3fc5baade9860880, 0x3cac63cdfc3588e5
+data8 0x83d9d2cfc2813637, 0xfc3be08165519325
+data8 0x3fc5fb9fb1e8e3a0, 0x3cbf7c8466578c29
+data8 0x83f12eebf397daac, 0xfc258b6ce6e6822f
+data8 0x3fc63c9731f39d40, 0x3cb6d2a7ffca3e9e
+data8 0x8408d76990b9296e, 0xfc0ef35db402af94
+data8 0x3fc67d947be9eec0, 0x3cb1980da09e6566
+data8 0x8420cc9659487cd7, 0xfbf81841c8082dc4
+data8 0x3fc6be97a21daf00, 0x3cc2ac8330e59aa5
+data8 0x84390ec132759ecb, 0xfbe0fa06e24cc390
+data8 0x3fc6ffa0b6ef05e0, 0x3ccc1a030fee56c4
+data8 0x84519e3a29df811a, 0xfbc9989a85ce0954
+data8 0x3fc740afcccca000, 0x3cc19692a5301ca6
+data8 0x846a7b527842d61b, 0xfbb1f3e9f8e45dc4
+data8 0x3fc781c4f633e2c0, 0x3cc0e98f3868a508
+data8 0x8483a65c8434b5f0, 0xfb9a0be244f4af45
+data8 0x3fc7c2e045b12140, 0x3cb2a8d309754420
+data8 0x849d1fabe4e97dd7, 0xfb81e070362116d1
+data8 0x3fc80401cddfd120, 0x3ca7a44544aa4ce6
+data8 0x84b6e795650817ea, 0xfb6971805af8411e
+data8 0x3fc84529a16ac020, 0x3c9e3b709c7d6f94
+data8 0x84d0fe6f0589da92, 0xfb50beff0423a2f5
+data8 0x3fc88657d30c49e0, 0x3cc60d65a7f0a278
+data8 0x84eb649000a73014, 0xfb37c8d84414755c
+data8 0x3fc8c78c758e8e80, 0x3cc94b2ee984c2b7
+data8 0x85061a50ccd13781, 0xfb1e8ef7eeaf764b
+data8 0x3fc908c79bcba900, 0x3cc8540ae794a2fe
+data8 0x8521200b1fb8916e, 0xfb05114998f76a83
+data8 0x3fc94a0958ade6c0, 0x3ca127f49839fa9c
+data8 0x853c7619f1618bf6, 0xfaeb4fb898b65d19
+data8 0x3fc98b51bf2ffee0, 0x3c8c9ba7a803909a
+data8 0x85581cd97f45e274, 0xfad14a3004259931
+data8 0x3fc9cca0e25d4ac0, 0x3cba458e91d3bf54
+data8 0x857414a74f8446b4, 0xfab7009ab1945a54
+data8 0x3fca0df6d551fe80, 0x3cc78ea1d329d2b2
+data8 0x85905de2341dea46, 0xfa9c72e3370d2fbc
+data8 0x3fca4f53ab3b6200, 0x3ccf60dca86d57ef
+data8 0x85acf8ea4e423ff8, 0xfa81a0f3e9fa0ee9
+data8 0x3fca90b777580aa0, 0x3ca4c4e2ec8a867e
+data8 0x85c9e62111a92e7d, 0xfa668ab6dec711b1
+data8 0x3fcad2224cf814e0, 0x3c303de5980d071c
+data8 0x85e725e947fbee97, 0xfa4b3015e883dbfe
+data8 0x3fcb13943f7d5f80, 0x3cc29d4eefa5cb1e
+data8 0x8604b8a7144cd054, 0xfa2f90fa9883a543
+data8 0x3fcb550d625bc6a0, 0x3c9e01a746152daf
+data8 0x86229ebff69e2415, 0xfa13ad4e3dfbe1c1
+data8 0x3fcb968dc9195ea0, 0x3ccc091bd73ae518
+data8 0x8640d89acf78858c, 0xf9f784f9e5a1877b
+data8 0x3fcbd815874eb160, 0x3cb5f4b89875e187
+data8 0x865f669fe390c7f5, 0xf9db17e65944eacf
+data8 0x3fcc19a4b0a6f9c0, 0x3cc5c0bc2b0bbf14
+data8 0x867e4938df7dc45f, 0xf9be65fc1f6c2e6e
+data8 0x3fcc5b3b58e061e0, 0x3cc1ca70df8f57e7
+data8 0x869d80d0db7e4c0c, 0xf9a16f237aec427a
+data8 0x3fcc9cd993cc4040, 0x3cbae93acc85eccf
+data8 0x86bd0dd45f4f8265, 0xf98433446a806e70
+data8 0x3fccde7f754f5660, 0x3cb22f70e64568d0
+data8 0x86dcf0b16613e37a, 0xf966b246a8606170
+data8 0x3fcd202d11620fa0, 0x3c962030e5d4c849
+data8 0x86fd29d7624b3d5d, 0xf948ec11a9d4c45b
+data8 0x3fcd61e27c10c0a0, 0x3cc7083c91d59217
+data8 0x871db9b741dbe44a, 0xf92ae08c9eca4941
+data8 0x3fcda39fc97be7c0, 0x3cc9258579e57211
+data8 0x873ea0c3722d6af2, 0xf90c8f9e71633363
+data8 0x3fcde5650dd86d60, 0x3ca4755a9ea582a9
+data8 0x875fdf6fe45529e8, 0xf8edf92dc5875319
+data8 0x3fce27325d6fe520, 0x3cbc1e2b6c1954f9
+data8 0x878176321154e2bc, 0xf8cf1d20f87270b8
+data8 0x3fce6907cca0d060, 0x3cb6ca4804750830
+data8 0x87a36580fe6bccf5, 0xf8affb5e20412199
+data8 0x3fceaae56fdee040, 0x3cad6b310d6fd46c
+data8 0x87c5add5417a5cb9, 0xf89093cb0b7c0233
+data8 0x3fceeccb5bb33900, 0x3cc16e99cedadb20
+data8 0x87e84fa9057914ca, 0xf870e64d40a15036
+data8 0x3fcf2eb9a4bcb600, 0x3cc75ee47c8b09e9
+data8 0x880b4b780f02b709, 0xf850f2c9fdacdf78
+data8 0x3fcf70b05fb02e20, 0x3cad6350d379f41a
+data8 0x882ea1bfc0f228ac, 0xf830b926379e6465
+data8 0x3fcfb2afa158b8a0, 0x3cce0ccd9f829985
+data8 0x885252ff21146108, 0xf810394699fe0e8e
+data8 0x3fcff4b77e97f3e0, 0x3c9b30faa7a4c703
+data8 0x88765fb6dceebbb3, 0xf7ef730f865f6df0
+data8 0x3fd01b6406332540, 0x3cdc5772c9e0b9bd
+data8 0x88ad1f69be2cc730, 0xf7bdc59bc9cfbd97
+data8 0x3fd04cf8ad203480, 0x3caeef44fe21a74a
+data8 0x88f763f70ae2245e, 0xf77a91c868a9c54e
+data8 0x3fd08f23ce0162a0, 0x3cd6290ab3fe5889
+data8 0x89431fc7bc0c2910, 0xf73642973c91298e
+data8 0x3fd0d1610f0c1ec0, 0x3cc67401a01f08cf
+data8 0x8990573407c7738e, 0xf6f0d71d1d7a2dd6
+data8 0x3fd113b0c65d88c0, 0x3cc7aa4020fe546f
+data8 0x89df0eb108594653, 0xf6aa4e6a05cfdef2
+data8 0x3fd156134ada6fe0, 0x3cc87369da09600c
+data8 0x8a2f4ad16e0ed78a, 0xf662a78900c35249
+data8 0x3fd19888f43427a0, 0x3cc62b220f38e49c
+data8 0x8a811046373e0819, 0xf619e180181d97cc
+data8 0x3fd1db121aed7720, 0x3ca3ede7490b52f4
+data8 0x8ad463df6ea0fa2c, 0xf5cffb504190f9a2
+data8 0x3fd21daf185fa360, 0x3caafad98c1d6c1b
+data8 0x8b294a8cf0488daf, 0xf584f3f54b8604e6
+data8 0x3fd2606046bf95a0, 0x3cdb2d704eeb08fa
+data8 0x8b7fc95f35647757, 0xf538ca65c960b582
+data8 0x3fd2a32601231ec0, 0x3cc661619fa2f126
+data8 0x8bd7e588272276f8, 0xf4eb7d92ff39fccb
+data8 0x3fd2e600a3865760, 0x3c8a2a36a99aca4a
+data8 0x8c31a45bf8e9255e, 0xf49d0c68cd09b689
+data8 0x3fd328f08ad12000, 0x3cb9efaf1d7ab552
+data8 0x8c8d0b520a35eb18, 0xf44d75cd993cfad2
+data8 0x3fd36bf614dcc040, 0x3ccacbb590bef70d
+data8 0x8cea2005d068f23d, 0xf3fcb8a23ab4942b
+data8 0x3fd3af11a079a6c0, 0x3cd9775872cf037d
+data8 0x8d48e837c8cd5027, 0xf3aad3c1e2273908
+data8 0x3fd3f2438d754b40, 0x3ca03304f667109a
+data8 0x8da969ce732f3ac7, 0xf357c60202e2fd7e
+data8 0x3fd4358c3ca032e0, 0x3caecf2504ff1a9d
+data8 0x8e0baad75555e361, 0xf3038e323ae9463a
+data8 0x3fd478ec0fd419c0, 0x3cc64bdc3d703971
+data8 0x8e6fb18807ba877e, 0xf2ae2b1c3a6057f7
+data8 0x3fd4bc6369fa40e0, 0x3cbb7122ec245cf2
+data8 0x8ed5843f4bda74d5, 0xf2579b83aa556f0c
+data8 0x3fd4fff2af11e2c0, 0x3c9cfa2dc792d394
+data8 0x8f3d29862c861fef, 0xf1ffde2612ca1909
+data8 0x3fd5439a4436d000, 0x3cc38d46d310526b
+data8 0x8fa6a81128940b2d, 0xf1a6f1bac0075669
+data8 0x3fd5875a8fa83520, 0x3cd8bf59b8153f8a
+data8 0x901206c1686317a6, 0xf14cd4f2a730d480
+data8 0x3fd5cb33f8cf8ac0, 0x3c9502b5c4d0e431
+data8 0x907f4ca5fe9cf739, 0xf0f186784a125726
+data8 0x3fd60f26e847b120, 0x3cc8a1a5e0acaa33
+data8 0x90ee80fd34aeda5e, 0xf09504ef9a212f18
+data8 0x3fd65333c7e43aa0, 0x3cae5b029cb1f26e
+data8 0x915fab35e37421c6, 0xf0374ef5daab5c45
+data8 0x3fd6975b02b8e360, 0x3cd5aa1c280c45e6
+data8 0x91d2d2f0d894d73c, 0xefd86321822dbb51
+data8 0x3fd6db9d05213b20, 0x3cbecf2c093ccd8b
+data8 0x9248000249200009, 0xef7840021aca5a72
+data8 0x3fd71ffa3cc87fc0, 0x3cb8d273f08d00d9
+data8 0x92bf3a7351f081d2, 0xef16e42021d7cbd5
+data8 0x3fd7647318b1ad20, 0x3cbce099d79cdc46
+data8 0x93388a8386725713, 0xeeb44dfce6820283
+data8 0x3fd7a908093fc1e0, 0x3ccb033ec17a30d9
+data8 0x93b3f8aa8e653812, 0xee507c126774fa45
+data8 0x3fd7edb9803e3c20, 0x3cc10aedb48671eb
+data8 0x94318d99d341ade4, 0xedeb6cd32f891afb
+data8 0x3fd83287f0e9cf80, 0x3c994c0c1505cd2a
+data8 0x94b1523e3dedc630, 0xed851eaa3168f43c
+data8 0x3fd87773cff956e0, 0x3cda3b7bce6a6b16
+data8 0x95334fc20577563f, 0xed1d8ffaa2279669
+data8 0x3fd8bc7d93a70440, 0x3cd4922edc792ce2
+data8 0x95b78f8e8f92f274, 0xecb4bf1fd2be72da
+data8 0x3fd901a5b3b9cf40, 0x3cd3fea1b00f9d0d
+data8 0x963e1b4e63a87c3f, 0xec4aaa6d08694cc1
+data8 0x3fd946eca98f2700, 0x3cdba4032d968ff1
+data8 0x96c6fcef314074fc, 0xebdf502d53d65fea
+data8 0x3fd98c52f024e800, 0x3cbe7be1ab8c95c9
+data8 0x97523ea3eab028b2, 0xeb72aea36720793e
+data8 0x3fd9d1d904239860, 0x3cd72d08a6a22b70
+data8 0x97dfeae6f4ee4a9a, 0xeb04c4096a884e94
+data8 0x3fda177f63e8ef00, 0x3cd818c3c1ebfac7
+data8 0x98700c7c6d85d119, 0xea958e90cfe1efd7
+data8 0x3fda5d468f92a540, 0x3cdf45fbfaa080fe
+data8 0x9902ae7487a9caa1, 0xea250c6224aab21a
+data8 0x3fdaa32f090998e0, 0x3cd715a9353cede4
+data8 0x9997dc2e017a9550, 0xe9b33b9ce2bb7638
+data8 0x3fdae939540d3f00, 0x3cc545c014943439
+data8 0x9a2fa158b29b649b, 0xe9401a573f8aa706
+data8 0x3fdb2f65f63f6c60, 0x3cd4a63c2f2ca8e2
+data8 0x9aca09f835466186, 0xe8cba69df9f0bf35
+data8 0x3fdb75b5773075e0, 0x3cda310ce1b217ec
+data8 0x9b672266ab1e0136, 0xe855de74266193d4
+data8 0x3fdbbc28606babc0, 0x3cdc84b75cca6c44
+data8 0x9c06f7579f0b7bd5, 0xe7debfd2f98c060b
+data8 0x3fdc02bf3d843420, 0x3cd225d967ffb922
+data8 0x9ca995db058cabdc, 0xe76648a991511c6e
+data8 0x3fdc497a9c224780, 0x3cde08101c5b825b
+data8 0x9d4f0b605ce71e88, 0xe6ec76dcbc02d9a7
+data8 0x3fdc905b0c10d420, 0x3cb1abbaa3edf120
+data8 0x9df765b9eecad5e6, 0xe6714846bdda7318
+data8 0x3fdcd7611f4b8a00, 0x3cbf6217ae80aadf
+data8 0x9ea2b320350540fe, 0xe5f4bab71494cd6b
+data8 0x3fdd1e8d6a0d56c0, 0x3cb726e048cc235c
+data8 0x9f51023562fc5676, 0xe576cbf239235ecb
+data8 0x3fdd65e082df5260, 0x3cd9e66872bd5250
+data8 0xa002620915c2a2f6, 0xe4f779b15f5ec5a7
+data8 0x3fddad5b02a82420, 0x3c89743b0b57534b
+data8 0xa0b6e21c2caf9992, 0xe476c1a233a7873e
+data8 0x3fddf4fd84bbe160, 0x3cbf7adea9ee3338
+data8 0xa16e9264cc83a6b2, 0xe3f4a16696608191
+data8 0x3fde3cc8a6ec6ee0, 0x3cce46f5a51f49c6
+data8 0xa22983528f3d8d49, 0xe3711694552da8a8
+data8 0x3fde84bd099a6600, 0x3cdc78f6490a2d31
+data8 0xa2e7c5d2e2e69460, 0xe2ec1eb4e1e0a5fb
+data8 0x3fdeccdb4fc685c0, 0x3cdd3aedb56a4825
+data8 0xa3a96b5599bd2532, 0xe265b74506fbe1c9
+data8 0x3fdf15241f23b3e0, 0x3cd440f3c6d65f65
+data8 0xa46e85d1ae49d7de, 0xe1ddddb499b3606f
+data8 0x3fdf5d98202994a0, 0x3cd6c44bd3fb745a
+data8 0xa53727ca3e11b99e, 0xe1548f662951b00d
+data8 0x3fdfa637fe27bf60, 0x3ca8ad1cd33054dd
+data8 0xa6036453bdc20186, 0xe0c9c9aeabe5e481
+data8 0x3fdfef0467599580, 0x3cc0f1ac0685d78a
+data8 0xa6d34f1969dda338, 0xe03d89d5281e4f81
+data8 0x3fe01bff067d6220, 0x3cc0731e8a9ef057
+data8 0xa7a6fc62f7246ff3, 0xdfafcd125c323f54
+data8 0x3fe04092d1ae3b40, 0x3ccabda24b59906d
+data8 0xa87e811a861df9b9, 0xdf20909061bb9760
+data8 0x3fe0653df0fd9fc0, 0x3ce94c8dcc722278
+data8 0xa959f2d2dd687200, 0xde8fd16a4e5f88bd
+data8 0x3fe08a00c1cae320, 0x3ce6b888bb60a274
+data8 0xaa3967cdeea58bda, 0xddfd8cabd1240d22
+data8 0x3fe0aedba3221c00, 0x3ced5941cd486e46
+data8 0xab904fd587263c84, 0xdd1f4472e1cf64ed
+data8 0x3fe0e651e85229c0, 0x3cdb6701042299b1
+data8 0xad686d44dd5a74bb, 0xdbf173e1f6b46e92
+data8 0x3fe1309cbf4cdb20, 0x3cbf1be7bb3f0ec5
+data8 0xaf524e15640ebee4, 0xdabd54896f1029f6
+data8 0x3fe17b4ee1641300, 0x3ce81dd055b792f1
+data8 0xb14eca24ef7db3fa, 0xd982cb9ae2f47e41
+data8 0x3fe1c66b9ffd6660, 0x3cd98ea31eb5ddc7
+data8 0xb35ec807669920ce, 0xd841bd1b8291d0b6
+data8 0x3fe211f66db3a5a0, 0x3ca480c35a27b4a2
+data8 0xb5833e4755e04dd1, 0xd6fa0bd3150b6930
+data8 0x3fe25df2e05b6c40, 0x3ca4bc324287a351
+data8 0xb7bd34c8000b7bd3, 0xd5ab9939a7d23aa1
+data8 0x3fe2aa64b32f7780, 0x3cba67314933077c
+data8 0xba0dc64d126cc135, 0xd4564563ce924481
+data8 0x3fe2f74fc9289ac0, 0x3cec1a1dc0efc5ec
+data8 0xbc76222cbbfa74a6, 0xd2f9eeed501125a8
+data8 0x3fe344b82f859ac0, 0x3ceeef218de413ac
+data8 0xbef78e31985291a9, 0xd19672e2182f78be
+data8 0x3fe392a22087b7e0, 0x3cd2619ba201204c
+data8 0xc19368b2b0629572, 0xd02baca5427e436a
+data8 0x3fe3e11206694520, 0x3cb5d0b3143fe689
+data8 0xc44b2ae8c6733e51, 0xceb975d60b6eae5d
+data8 0x3fe4300c7e945020, 0x3cbd367143da6582
+data8 0xc7206b894212dfef, 0xcd3fa6326ff0ac9a
+data8 0x3fe47f965d201d60, 0x3ce797c7a4ec1d63
+data8 0xca14e1b0622de526, 0xcbbe13773c3c5338
+data8 0x3fe4cfb4b09d1a20, 0x3cedfadb5347143c
+data8 0xcd2a6825eae65f82, 0xca34913d425a5ae9
+data8 0x3fe5206cc637e000, 0x3ce2798b38e54193
+data8 0xd06301095e1351ee, 0xc8a2f0d3679c08c0
+data8 0x3fe571c42e3d0be0, 0x3ccd7cb9c6c2ca68
+data8 0xd3c0d9f50057adda, 0xc70901152d59d16b
+data8 0x3fe5c3c0c108f940, 0x3ceb6c13563180ab
+data8 0xd74650a98cc14789, 0xc5668e3d4cbf8828
+data8 0x3fe61668a46ffa80, 0x3caa9092e9e3c0e5
+data8 0xdaf5f8579dcc8f8f, 0xc3bb61b3eed42d02
+data8 0x3fe669c251ad69e0, 0x3cccf896ef3b4fee
+data8 0xded29f9f9a6171b4, 0xc20741d7f8e8e8af
+data8 0x3fe6bdd49bea05c0, 0x3cdc6b29937c575d
+data8 0xe2df5765854ccdb0, 0xc049f1c2d1b8014b
+data8 0x3fe712a6b76c6e80, 0x3ce1ddc6f2922321
+data8 0xe71f7a9b94fcb4c3, 0xbe833105ec291e91
+data8 0x3fe76840418978a0, 0x3ccda46e85432c3d
+data8 0xeb96b72d3374b91e, 0xbcb2bb61493b28b3
+data8 0x3fe7bea9496d5a40, 0x3ce37b42ec6e17d3
+data8 0xf049183c3f53c39b, 0xbad848720223d3a8
+data8 0x3fe815ea59dab0a0, 0x3cb03ad41bfc415b
+data8 0xf53b11ec7f415f15, 0xb8f38b57c53c9c48
+data8 0x3fe86e0c84010760, 0x3cc03bfcfb17fe1f
+data8 0xfa718f05adbf2c33, 0xb70432500286b185
+data8 0x3fe8c7196b9225c0, 0x3ced99fcc6866ba9
+data8 0xfff200c3f5489608, 0xb509e6454dca33cc
+data8 0x3fe9211b54441080, 0x3cb789cb53515688
+// The following table entries are not used
+//data8 0x82e138a0fac48700, 0xb3044a513a8e6132
+//data8 0x3fe97c1d30f5b7c0, 0x3ce1eb765612d1d0
+//data8 0x85f4cc7fc670d021, 0xb0f2fb2ea6cbbc88
+//data8 0x3fe9d82ab4b5fde0, 0x3ced3fe6f27e8039
+//data8 0x89377c1387d5b908, 0xaed58e9a09014d5c
+//data8 0x3fea355065f87fa0, 0x3cbef481d25f5b58
+//data8 0x8cad7a2c98dec333, 0xacab929ce114d451
+//data8 0x3fea939bb451e2a0, 0x3c8e92b4fbf4560f
+//data8 0x905b7dfc99583025, 0xaa748cc0dbbbc0ec
+//data8 0x3feaf31b11270220, 0x3cdced8c61bd7bd5
+//data8 0x9446d8191f80dd42, 0xa82ff92687235baf
+//data8 0x3feb53de0bcffc20, 0x3cbe1722fb47509e
+//data8 0x98758ba086e4000a, 0xa5dd497a9c184f58
+//data8 0x3febb5f571cb0560, 0x3ce0c7774329a613
+//data8 0x9cee6c7bf18e4e24, 0xa37be3c3cd1de51b
+//data8 0x3fec197373bc7be0, 0x3ce08ebdb55c3177
+//data8 0xa1b944000a1b9440, 0xa10b2101b4f27e03
+//data8 0x3fec7e6bd023da60, 0x3ce5fc5fd4995959
+//data8 0xa6defd8ba04d3e38, 0x9e8a4b93cad088ec
+//data8 0x3fece4f404e29b20, 0x3cea3413401132b5
+//data8 0xac69dd408a10c62d, 0x9bf89d5d17ddae8c
+//data8 0x3fed4d2388f63600, 0x3cd5a7fb0d1d4276
+//data8 0xb265c39cbd80f97a, 0x99553d969fec7beb
+//data8 0x3fedb714101e0a00, 0x3cdbda21f01193f2
+//data8 0xb8e081a16ae4ae73, 0x969f3e3ed2a0516c
+//data8 0x3fee22e1da97bb00, 0x3ce7231177f85f71
+//data8 0xbfea427678945732, 0x93d5990f9ee787af
+//data8 0x3fee90ac13b18220, 0x3ce3c8a5453363a5
+//data8 0xc79611399b8c90c5, 0x90f72bde80febc31
+//data8 0x3fef009542b712e0, 0x3ce218fd79e8cb56
+//data8 0xcffa8425040624d7, 0x8e02b4418574ebed
+//data8 0x3fef72c3d2c57520, 0x3cd32a717f82203f
+//data8 0xd93299cddcf9cf23, 0x8af6ca48e9c44024
+//data8 0x3fefe762b77744c0, 0x3ce53478a6bbcf94
+//data8 0xe35eda760af69ad9, 0x87d1da0d7f45678b
+//data8 0x3ff02f511b223c00, 0x3ced6e11782c28fc
+//data8 0xeea6d733421da0a6, 0x84921bbe64ae029a
+//data8 0x3ff06c5c6f8ce9c0, 0x3ce71fc71c1ffc02
+//data8 0xfb3b2c73fc6195cc, 0x813589ba3a5651b6
+//data8 0x3ff0aaf2613700a0, 0x3cf2a72d2fd94ef3
+//data8 0x84ac1fcec4203245, 0xfb73a828893df19e
+//data8 0x3ff0eb367c3fd600, 0x3cf8054c158610de
+//data8 0x8ca50621110c60e6, 0xf438a14c158d867c
+//data8 0x3ff12d51caa6b580, 0x3ce6bce9748739b6
+//data8 0x95b8c2062d6f8161, 0xecb3ccdd37b369da
+//data8 0x3ff1717418520340, 0x3ca5c2732533177c
+//data8 0xa0262917caab4ad1, 0xe4dde4ddc81fd119
+//data8 0x3ff1b7d59dd40ba0, 0x3cc4c7c98e870ff5
+//data8 0xac402c688b72f3f4, 0xdcae469be46d4c8d
+//data8 0x3ff200b93cc5a540, 0x3c8dd6dc1bfe865a
+//data8 0xba76968b9eabd9ab, 0xd41a8f3df1115f7f
+//data8 0x3ff24c6f8f6affa0, 0x3cf1acb6d2a7eff7
+//data8 0xcb63c87c23a71dc5, 0xcb161074c17f54ec
+//data8 0x3ff29b5b338b7c80, 0x3ce9b5845f6ec746
+//data8 0xdfe323b8653af367, 0xc19107d99ab27e42
+//data8 0x3ff2edf6fac7f5a0, 0x3cf77f961925fa02
+//data8 0xf93746caaba3e1f1, 0xb777744a9df03bff
+//data8 0x3ff344df237486c0, 0x3cf6ddf5f6ddda43
+//data8 0x8ca77052f6c340f0, 0xacaf476f13806648
+//data8 0x3ff3a0dfa4bb4ae0, 0x3cfee01bbd761bff
+//data8 0xa1a48604a81d5c62, 0xa11575d30c0aae50
+//data8 0x3ff4030b73c55360, 0x3cf1cf0e0324d37c
+//data8 0xbe45074b05579024, 0x9478e362a07dd287
+//data8 0x3ff46ce4c738c4e0, 0x3ce3179555367d12
+//data8 0xe7a08b5693d214ec, 0x8690e3575b8a7c3b
+//data8 0x3ff4e0a887c40a80, 0x3cfbd5d46bfefe69
+//data8 0x94503d69396d91c7, 0xedd2ce885ff04028
+//data8 0x3ff561ebd9c18cc0, 0x3cf331bd176b233b
+//data8 0xced1d96c5bb209e6, 0xc965278083808702
+//data8 0x3ff5f71d7ff42c80, 0x3ce3301cc0b5a48c
+//data8 0xabac2cee0fc24e20, 0x9c4eb1136094cbbd
+//data8 0x3ff6ae4c63222720, 0x3cf5ff46874ee51e
+//data8 0x8040201008040201, 0xb4d7ac4d9acb1bf4
+//data8 0x3ff7b7d33b928c40, 0x3cfacdee584023bb
+LOCAL_OBJECT_END(T_table)
+
+
+
+.align 16
+
+LOCAL_OBJECT_START(poly_coeffs)
+ // C_3: r^3 coefficient of asin(r)-r; presumably (significand, sign/exponent) words, i.e. ~1/6 -- confirm against the load
+data8 0xaaaaaaaaaaaaaaab, 0x0000000000003ffc
+ // C_5: r^5 coefficient; same two-word layout as C_3 (would be ~3/40 under that reading)
+data8 0x999999999999999a, 0x0000000000003ffb
+ // C_7, C_9: r^7 and r^9 coefficients (presumably double-precision bit patterns)
+data8 0x3fa6db6db6db6db7, 0x3f9f1c71c71c71c8
+ // pi/2 (low, high): low-order correction first, then the leading part
+data8 0x3C91A62633145C07, 0x3FF921FB54442D18
+ // C_11, C_13: higher-order terms; the header describes a 17-degree polynomial for |s|<2^{-4} and |s| near 1
+data8 0x3f96e8ba2e8ba2e9, 0x3f91c4ec4ec4ec4e
+ // C_15, C_17: last two terms of that 17-degree polynomial
+data8 0x3f8c99999999999a, 0x3f87a87878787223
+LOCAL_OBJECT_END(poly_coeffs)
+
+
+R_DBL_S = r21 // integer-register names (notes describe the visible entry code); s as double bits from getf.d; r21 is also R_TMP7
+R_EXP0 = r22 // R_EXP with the sign-mask bit cleared (andcm); r22 is also R_TMP6
+R_EXP = r15 // sign/exponent field of s from getf.exp
+R_SGNMASK = r23 // sign mask 0x20000; r23 is also R_TMP5
+R_TMP = r24 // scratch: bias-4 (0xffff-4), later the cs9 bit pattern
+R_TMP2 = r25 // scratch: sqrt(2)/2 threshold bits, later the cs8 bit pattern
+R_INDEX = r26 // T_table index built from exponent and b1..b6 bits of s, then scaled by shl 5
+R_TMP3 = r27 // R_DBL_S shifted left by 1 (sign dropped); same register as R_TMP03
+R_TMP03 = r27 // exponent 0xffff-64 used to build 2^{-64}; same register as R_TMP3
+R_TMP4 = r28 // 2^45, then 2^45-1 (mask used to form t); later 0xffff-1
+R_TMP5 = r23 // same register as R_SGNMASK
+R_TMP6 = r22 // same register as R_EXP0; loaded with 0xbfa8 (cs6) after the exponent compare
+R_TMP7 = r21 // same register as R_DBL_S
+R_T = r29 // bit pattern of t (sign|exponent|b1..b6 1 0..0)
+R_BIAS = r20 // 0xc0 (3*2^6), subtracted from R_INDEX
+
+F_T = f6 // floating-point register names; t, loaded from R_T with setf.d
+F_1S2 = f7 // 1-s^2, computed by the fnma.s1 at the entry point
+F_1S2_S = f9
+F_INV_1T2 = f10 // f10 is also named FR_X; presumably 1/(1-t^2) from T_table -- confirm
+F_SQRT_1T2 = f11
+F_S2T2 = f12
+F_X = f13
+F_D = f14
+F_2M64 = f15 // 2^{-64}, built with setf.exp from R_TMP03
+
+F_CS2 = f32
+F_CS3 = f33
+F_CS4 = f34
+F_CS5 = f35
+F_CS6 = f36
+F_CS7 = f37
+F_CS8 = f38 // sqrt coefficient cs8 = -33*13/128, loaded with setf.s
+F_CS9 = f39
+F_S23 = f40
+F_S45 = f41
+F_S67 = f42
+F_S89 = f43
+F_S25 = f44
+F_S69 = f45
+F_S29 = f46
+F_X2 = f47
+F_X4 = f48
+F_TSQRT = f49
+F_DTX = f50
+F_R = f51
+F_R2 = f52
+F_R3 = f53
+F_R4 = f54
+
+F_C3 = f55 // presumably the c3..c9 coefficients named in the header polynomial -- confirm against the loads
+F_C5 = f56
+F_C7 = f57
+F_C9 = f58
+F_P79 = f59
+F_P35 = f60
+F_P39 = f61
+
+F_ATHI = f62
+F_ATLO = f63
+
+F_T1 = f64
+F_Y = f65
+F_Y2 = f66
+F_ANDMASK = f67
+F_ORMASK = f68
+F_S = f69
+F_05 = f70
+F_SQRT_1S2 = f71
+F_DS = f72
+F_Z = f73
+F_1T2 = f74
+F_DZ = f75
+F_ZE = f76
+F_YZ = f77
+F_Y1S2 = f78
+F_Y1S2X = f79
+F_1X = f80
+F_ST = f81
+F_1T2_ST = f82
+F_TSS = f83
+F_Y1S2X2 = f84
+F_DZ_TERM = f85
+F_DTS = f86
+F_DS2X = f87
+F_T2 = f88
+F_ZY1S2S = f89
+F_Y1S2_1X = f90
+F_TS = f91
+F_PI2_LO = f92 // presumably the pi/2 (low, high) pair stored in poly_coeffs -- confirm against the loads
+F_PI2_HI = f93
+F_S19 = f94
+F_INV1T2_2 = f95
+F_CORR = f96
+F_DZ0 = f97
+
+F_C11 = f98
+F_C13 = f99
+F_C15 = f100
+F_C17 = f101
+F_P1113 = f102
+F_P1517 = f103
+F_P1117 = f104
+F_P317 = f105
+F_R8 = f106
+F_HI = f107
+F_1S2_HI = f108
+F_DS2 = f109
+F_Y2_2 = f110
+F_S2 = f111
+F_S_DS2 = f112
+F_S_1S2S = f113
+F_XL = f114
+F_2M128 = f115 // highest FP register named by these equates
+
.section .text
-.proc asinl#
-.align 32
+GLOBAL_LIBM_ENTRY(asinl)
+
+{.mfi
+ // get exponent, mantissa (rounded to double precision) of s
+ getf.d R_DBL_S = f8
+ // 1-s^2
+ fnma.s1 F_1S2 = f8, f8, f1
+ // r2 = pointer to T_table
+ addl r2 = @ltoff(T_table), gp
+}
+{.mfi
+ // sign mask
+ mov R_SGNMASK = 0x20000
+ nop.f 0
+ // bias-63-1
+ mov R_TMP03 = 0xffff-64;;
+}
-asinl:
-{ .mfi
- alloc r32 = ar.pfs,1,11,4,0
-(p0) fnorm asin_NORM_f8 = f8
-(p0) mov asin_GR_17_ones = 0x1ffff
+{.mfi
+ // get exponent of s
+ getf.exp R_EXP = f8
+ nop.f 0
+ // R_TMP4 = 2^45
+ shl R_TMP4 = R_SGNMASK, 45-17
}
-{ .mii
-(p0) mov asin_GR_16_ones = 0xffff
-(p0) mov asin_GR_ff9b = 0xff9b ;;
- nop.i 999
+{.mlx
+ // load bias-4
+ mov R_TMP = 0xffff-4
+ // load RU(sqrt(2)/2) to integer register (in double format, shifted left by 1)
+ movl R_TMP2 = 0x7fcd413cccfe779a;;
}
-{ .mmi
-(p0) setf.exp asin_2m100 = asin_GR_ff9b
-(p0) addl r40 = @ltoff(asin_coefficients), gp
- nop.i 999
+{.mfi
+ // load 2^{-64} in FP register
+ setf.exp F_2M64 = R_TMP03
+ nop.f 0
+ // index = (0x7-exponent)|b1 b2.. b6
+ extr.u R_INDEX = R_DBL_S, 46, 9
}
-;;
-{ .mmi
- ld8 r40 = [r40]
- nop.m 999
- nop.i 999
+{.mfi
+ // get t = sign|exponent|b1 b2.. b6 1 x.. x
+ or R_T = R_DBL_S, R_TMP4
+ nop.f 0
+ // R_TMP4 = 2^45-1
+ sub R_TMP4 = R_TMP4, r0, 1;;
}
-;;
+{.mfi
+ // get t = sign|exponent|b1 b2.. b6 1 0.. 0
+ andcm R_T = R_T, R_TMP4
+ nop.f 0
+ // eliminate sign from R_DBL_S (shift left by 1)
+ shl R_TMP3 = R_DBL_S, 1
+}
-// Load the constants
+{.mfi
+ // R_BIAS = 3*2^6
+ mov R_BIAS = 0xc0
+ nop.f 0
+ // eliminate sign from R_EXP
+ andcm R_EXP0 = R_EXP, R_SGNMASK;;
+}
-{ .mmi
-(p0) ldfe asin_A10 = [r40],16 ;;
-(p0) ldfe asin_A9 = [r40],16
- nop.i 999 ;;
+
+
+{.mfi
+ // load start address for T_table
+ ld8 r2 = [r2]
+ nop.f 0
+ // p8 = 1 if |s|> = sqrt(2)/2
+ cmp.geu p8, p0 = R_TMP3, R_TMP2
}
-{ .mmi
-(p0) ldfe asin_A8 = [r40],16 ;;
-(p0) ldfe asin_A7 = [r40],16
- nop.i 999 ;;
+{.mlx
+ // p7 = 1 if |s|<2^{-4} (exponent of s<bias-4)
+ cmp.lt p7, p0 = R_EXP0, R_TMP
+ // sqrt coefficient cs8 = -33*13/128
+ movl R_TMP2 = 0xc0568000;;
}
-{ .mmi
-(p0) ldfe asin_A6 = [r40],16 ;;
-(p0) getf.exp asin_GR_signexp_f8 = asin_NORM_f8
- nop.i 999
+
+
+{.mbb
+ // load t in FP register
+ setf.d F_T = R_T
+ // if |s|<2^{-4}, take alternate path
+ (p7) br.cond.spnt SMALL_S
+ // if |s|> = sqrt(2)/2, take alternate path
+ (p8) br.cond.sptk LARGE_S
}
-{ .mmi
-(p0) ldfe asin_A5 = [r40],16 ;;
-(p0) ldfe asin_A4 = [r40],16
- nop.i 999 ;;
+{.mlx
+ // index = (4-exponent)|b1 b2.. b6
+ sub R_INDEX = R_INDEX, R_BIAS
+ // sqrt coefficient cs9 = 55*13/128
+ movl R_TMP = 0x40b2c000;;
}
-{ .mfi
- nop.m 999
-(p0) fmerge.s asin_ABS_NORM_f8 = f0, asin_NORM_f8
-(p0) and asin_GR_exp = asin_GR_signexp_f8, asin_GR_17_ones ;;
+
+{.mfi
+ // sqrt coefficient cs8 = -33*13/128
+ setf.s F_CS8 = R_TMP2
+ nop.f 0
+ // shift R_INDEX by 5
+ shl R_INDEX = R_INDEX, 5
+}
+
+{.mfi
+ // sqrt coefficient cs3 = 0.5 (set exponent = bias-1)
+ mov R_TMP4 = 0xffff - 1
+ nop.f 0
+ // sqrt coefficient cs6 = -21/16
+ mov R_TMP6 = 0xbfa8;;
}
-// case 1: |x| < 2^-40 ==> p6 (includes x = +-0)
-// case 2: 2^-40 <= |x| < 2^-2 ==> p8
-// case 3: 2^-2 <= |x| < 1 ==> p9
-// case 4: 1 <= |x| ==> p11
-// In case 4, we pick up the special case x = +-1 and return +-pi/2
-{ .mii
-(p0) ldfe asin_A3 = [r40],16
-(p0) sub asin_GR_true_exp = asin_GR_exp, asin_GR_16_ones ;;
-(p0) cmp.ge.unc p6, p7 = -41, asin_GR_true_exp ;;
+{.mlx
+ // table index
+ add r2 = r2, R_INDEX
+ // sqrt coefficient cs7 = 33/16
+ movl R_TMP2 = 0x40040000;;
}
-{ .mii
-(p0) ldfe asin_A2 = [r40],16
-(p7) cmp.ge.unc p8, p9 = -3, asin_GR_true_exp ;;
-(p9) cmp.ge.unc p10, p11 = -1, asin_GR_true_exp
+
+{.mmi
+ // load cs9 = 55*13/128
+ setf.s F_CS9 = R_TMP
+ // sqrt coefficient cs5 = 7/8
+ mov R_TMP3 = 0x3f60
+ // sqrt coefficient cs6 = -21/16
+ shl R_TMP6 = R_TMP6, 16;;
}
-{ .mmi
-(p0) ldfe asin_A1 = [r40],16 ;;
-(p0) ldfe asin_pi_by_2 = [r40],16
- nop.i 999
+
+{.mmi
+ // load significand of 1/(1-t^2)
+ ldf8 F_INV_1T2 = [r2], 8
+ // sqrt coefficient cs7 = 33/16
+ setf.s F_CS7 = R_TMP2
+ // sqrt coefficient cs4 = -5/8
+ mov R_TMP5 = 0xbf20;;
}
-// case 4: |x| >= 1
-{ .mib
- nop.m 999
- nop.i 999
-(p11) br.spnt L(ASIN_ERROR_RETURN) ;;
+
+{.mmi
+ // load significand of sqrt(1-t^2)
+ ldf8 F_SQRT_1T2 = [r2], 8
+ // sqrt coefficient cs6 = -21/16
+ setf.s F_CS6 = R_TMP6
+ // sqrt coefficient cs5 = 7/8
+ shl R_TMP3 = R_TMP3, 16;;
}
-// case 1: |x| < 2^-40
-{ .mfb
- nop.m 999
-(p6) fma.s0 f8 = asin_2m100,f8,f8
-(p6) br.ret.spnt b0 ;;
+
+{.mmi
+ // sqrt coefficient cs3 = 0.5 (set exponent = bias-1)
+ setf.exp F_CS3 = R_TMP4
+ // r3 = pointer to polynomial coefficients
+ addl r3 = @ltoff(poly_coeffs), gp
+ // sqrt coefficient cs4 = -5/8
+ shl R_TMP5 = R_TMP5, 16;;
}
-// case 2: 2^-40 <= |x| < 2^-2 ==> p8
-{ .mfi
- nop.m 999
-(p8) fma.s1 asin_X2 = f8,f8, f0
- nop.i 999 ;;
+{.mfi
+ // sqrt coefficient cs5 = 7/8
+ setf.s F_CS5 = R_TMP3
+ // d = s-t
+ fms.s1 F_D = f8, f1, F_T
+ // set p6 = 1 if s<0, p11 = 1 if s> = 0
+ cmp.ge p6, p11 = R_EXP, R_DBL_S
}
-{ .mfi
- nop.m 999
-(p8) fma.s1 asin_X4 = asin_X2,asin_X2, f0
- nop.i 999 ;;
+{.mfi
+ // r3 = load start address to polynomial coefficients
+ ld8 r3 = [r3]
+ // s+t
+ fma.s1 F_S2T2 = f8, f1, F_T
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p8) fma.s1 asin_P810 = asin_X4, asin_A10, asin_A8
- nop.i 999
+
+{.mfi
+ // sqrt coefficient cs4 = -5/8
+ setf.s F_CS4 = R_TMP5
+ // s^2-t^2
+ fma.s1 F_S2T2 = F_S2T2, F_D, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p8) fma.s1 asin_P79 = asin_X4, asin_A9, asin_A7
- nop.i 999 ;;
+
+{.mfi
+ // load C3
+ ldfe F_C3 = [r3], 16
+ // 0.5/(1-t^2) = 2^{-64}*(2^63/(1-t^2))
+ fma.s1 F_INV_1T2 = F_INV_1T2, F_2M64, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p8) fma.s1 asin_P610 = asin_X4, asin_P810, asin_A6
- nop.i 999
+{.mfi
+ // load C_5
+ ldfe F_C5 = [r3], 16
+ // set correct exponent for sqrt(1-t^2)
+ fma.s1 F_SQRT_1T2 = F_SQRT_1T2, F_2M64, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p8) fma.s1 asin_P59 = asin_X4, asin_P79, asin_A5
- nop.i 999 ;;
+
+{.mfi
+ // load C_7, C_9
+ ldfpd F_C7, F_C9 = [r3]
+ // x = -(s^2-t^2)/(1-t^2)/2
+ fnma.s1 F_X = F_INV_1T2, F_S2T2, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p8) fma.s1 asin_P410 = asin_X4, asin_P610, asin_A4
- nop.i 999
+
+{.mfi
+ // load asin(t)_high, asin(t)_low
+ ldfpd F_ATHI, F_ATLO = [r2]
+ // t*sqrt(1-t^2)
+ fma.s1 F_TSQRT = F_T, F_SQRT_1T2, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p8) fma.s1 asin_P39 = asin_X4, asin_P59, asin_A3
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // cs9*x+cs8
+ fma.s1 F_S89 = F_CS9, F_X, F_CS8
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p8) fma.s1 asin_P210 = asin_X4, asin_P410, asin_A2
- nop.i 999
+{.mfi
+ nop.m 0
+ // cs7*x+cs6
+ fma.s1 F_S67 = F_CS7, F_X, F_CS6
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p8) fma.s1 asin_P19 = asin_X4, asin_P39, asin_A1
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // cs5*x+cs4
+ fma.s1 F_S45 = F_CS5, F_X, F_CS4
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p8) fma.s1 asin_P1P2 = asin_X2, asin_P210, asin_P19
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // x*x
+ fma.s1 F_X2 = F_X, F_X, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p8) fma.s1 asin_P1P2 = asin_X2, asin_P1P2, f0
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // (s-t)-t*x
+ fnma.s1 F_DTX = F_T, F_X, F_D
+ nop.i 0
}
-{ .mfb
- nop.m 999
-(p8) fma.s0 f8 = asin_NORM_f8, asin_P1P2, asin_NORM_f8
-(p8) br.ret.spnt b0 ;;
+{.mfi
+ nop.m 0
+ // cs3*x+cs2 (cs2 = -0.5 = -cs3)
+ fms.s1 F_S23 = F_CS3, F_X, F_CS3
+ nop.i 0;;
}
-// case 3: 2^-2 <= |x| < 1
-// 1- X*X is computed as B + b
-// Step 1.1: Get B and b
-// atan2 will return
-// f8 = Z_hi
-// f10 = Z_lo
-// f11 = s_lo
+{.mfi
+ nop.m 0
+ // cs9*x^3+cs8*x^2+cs7*x+cs6
+ fma.s1 F_S69 = F_S89, F_X2, F_S67
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // x^4
+ fma.s1 F_X4 = F_X2, F_X2, f0
+ nop.i 0;;
+}
-{ .mfi
-(p0) mov asin_GR_fffe = 0xfffe
-(p0) fmerge.se f8 = asin_ABS_NORM_f8, asin_ABS_NORM_f8
-nop.i 0
-};;
-{ .mmf
-nop.m 0
-(p0) setf.exp asin_HALF = asin_GR_fffe
-(p0) fmerge.se f12 = asin_NORM_f8, asin_NORM_f8 ;;
+{.mfi
+ nop.m 0
+ // t*sqrt(1-t^2)*x^2
+ fma.s1 F_TSQRT = F_TSQRT, F_X2, f0
+ nop.i 0
}
+{.mfi
+ nop.m 0
+ // cs5*x^3+cs4*x^2+cs3*x+cs2
+ fma.s1 F_S25 = F_S45, F_X2, F_S23
+ nop.i 0;;
+}
-{ .mfi
- nop.m 999
-(p0) fcmp.lt.unc.s1 p6,p7 = asin_ABS_NORM_f8, asin_HALF
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // ((s-t)-t*x)*sqrt(1-t^2)
+ fma.s1 F_DTX = F_DTX, F_SQRT_1T2, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p7) fma.s1 asin_D = f1,f1,asin_ABS_NORM_f8
- nop.i 999
+
+{.mfi
+ nop.m 0
+ // if sign is negative, negate table values: asin(t)_low
+ (p6) fnma.s1 F_ATLO = F_ATLO, f1, f0
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p7) fms.s1 asin_C = f1,f1,asin_ABS_NORM_f8
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // PS29 = cs9*x^7+..+cs5*x^3+cs4*x^2+cs3*x+cs2
+ fma.s1 F_S29 = F_S69, F_X4, F_S25
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p7) fma.s1 asin_B = asin_C, asin_D, f0
- nop.i 999
+
+{.mfi
+ nop.m 0
+ // if sign is negative, negate table values: asin(t)_high
+ (p6) fnma.s1 F_ATHI = F_ATHI, f1, f0
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p7) fms.s1 asin_1mD = f1,f1,asin_D
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // R = ((s-t)-t*x)*sqrt(1-t^2)-t*sqrt(1-t^2)*x^2*PS29
+ fnma.s1 F_R = F_S29, F_TSQRT, F_DTX
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p7) fma.s1 asin_Dd = asin_1mD,f1, asin_ABS_NORM_f8
- nop.i 999
+
+{.mfi
+ nop.m 0
+ // R^2
+ fma.s1 F_R2 = F_R, F_R, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p7) fms.s1 asin_Bb = asin_C, asin_D, asin_B
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // c7+c9*R^2
+ fma.s1 F_P79 = F_C9, F_R2, F_C7
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p7) fma.s1 asin_Bb = asin_C, asin_Dd, asin_Bb
- nop.i 999
+{.mfi
+ nop.m 0
+ // c3+c5*R^2
+ fma.s1 F_P35 = F_C5, F_R2, F_C3
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p6) fma.s1 asin_C = asin_ABS_NORM_f8, asin_ABS_NORM_f8, f0
- nop.i 999 ;;
+{.mfi
+ nop.m 0
+ // R^4
+ fma.s1 F_R4 = F_R2, F_R2, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p6) fms.s1 asin_B = f1, f1, asin_C
- nop.i 999
+{.mfi
+ nop.m 0
+ // R^3
+ fma.s1 F_R3 = F_R2, F_R, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p6) fms.s1 asin_Cc = asin_ABS_NORM_f8, asin_ABS_NORM_f8, asin_C
- nop.i 999 ;;
+
+
+{.mfi
+ nop.m 0
+ // c3+c5*R^2+c7*R^4+c9*R^6
+ fma.s1 F_P39 = F_P79, F_R4, F_P35
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 asin_Hh = asin_HALF, asin_B, f0
- nop.i 999
+
+{.mfi
+ nop.m 0
+ // asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6)
+ fma.s1 F_P39 = F_P39, F_R3, F_ATLO
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p6) fms.s1 asin_1mB = f1, f1, asin_B
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // R+asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6)
+ fma.s1 F_P39 = F_P39, f1, F_R
+ nop.i 0;;
}
-// Step 1.2:
-// sqrt(B + b) is computed as W + w
-// Get W
-{ .mfi
- nop.m 999
-(p0) frsqrta.s1 asin_y0,p8 = asin_B
- nop.i 999 ;;
+{.mfb
+ nop.m 0
+ // result = asin(t)_high+R+asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6)
+ fma.s0 f8 = F_ATHI, f1, F_P39
+ // return
+ br.ret.sptk b0;;
}
-{ .mfi
- nop.m 999
-(p6) fms.s1 asin_1mBmC = asin_1mB, f1, asin_C
- nop.i 999 ;;
+
+
+
+LARGE_S:
+
+{.mfi
+ // bias-1
+ mov R_TMP3 = 0xffff - 1
+ // y ~ 1/sqrt(1-s^2)
+ frsqrta.s1 F_Y, p7 = F_1S2
+ // c9 = 55*13*17/128
+ mov R_TMP4 = 0x10af7b
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 asin_t1 = asin_y0, asin_y0, f0
- nop.i 999 ;;
+{.mlx
+ // c8 = -33*13*15/128
+ mov R_TMP5 = 0x184923
+ movl R_TMP2 = 0xff00000000000000;;
}
-{ .mfi
- nop.m 999
-(p6) fms.s1 asin_Bb = asin_1mBmC, f1, asin_Cc
- nop.i 999 ;;
+{.mfi
+ // set p6 = 1 if s<0, p11 = 1 if s>0
+ cmp.ge p6, p11 = R_EXP, R_DBL_S
+ // 1-s^2
+ fnma.s1 F_1S2 = f8, f8, f1
+ // set p9 = 1
+ cmp.eq p9, p0 = r0, r0;;
}
-{ .mfi
- nop.m 999
-(p0) fnma.s1 asin_t2 = asin_t1, asin_Hh, asin_HALF
- nop.i 999 ;;
+
+{.mfi
+ // load 0.5
+ setf.exp F_05 = R_TMP3
+ // (1-s^2) rounded to single precision
+ fnma.s.s1 F_1S2_S = f8, f8, f1
+ // c9 = 55*13*17/128
+ shl R_TMP4 = R_TMP4, 10
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 asin_y1 = asin_t2, asin_y0, asin_y0
- nop.i 999 ;;
+{.mlx
+ // AND mask for getting t ~ sqrt(1-s^2)
+ setf.sig F_ANDMASK = R_TMP2
+ // OR mask
+ movl R_TMP2 = 0x0100000000000000;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 asin_t3 = asin_y1, asin_Hh, f0
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // (s^2)_s
+ fma.s.s1 F_S2 = f8, f8, f0
+ nop.i 0;;
}
-{ .mfi
- nop.m 999
-(p0) fnma.s1 asin_t4 = asin_t3, asin_y1, asin_HALF
- nop.i 999 ;;
+
+{.mmi
+ // c9 = 55*13*17/128
+ setf.s F_CS9 = R_TMP4
+ // c7 = 33*13/16
+ mov R_TMP4 = 0x41d68
+ // c8 = -33*13*15/128
+ shl R_TMP5 = R_TMP5, 11;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 asin_y2 = asin_t4, asin_y1, asin_y1
- nop.i 999 ;;
+
+{.mfi
+ setf.sig F_ORMASK = R_TMP2
+ // y^2
+ fma.s1 F_Y2 = F_Y, F_Y, f0
+ // c7 = 33*13/16
+ shl R_TMP4 = R_TMP4, 12
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 asin_S = asin_B, asin_y2, f0
- nop.i 999
+{.mfi
+ // c6 = -33*7/16
+ mov R_TMP6 = 0xc1670
+ // y' ~ sqrt(1-s^2)
+ fma.s1 F_T1 = F_Y, F_1S2, f0
+ // c5 = 63/8
+ mov R_TMP7 = 0x40fc;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 asin_H = asin_y2, asin_HALF, f0
- nop.i 999 ;;
+
+{.mlx
+ // load c8 = -33*13*15/128
+ setf.s F_CS8 = R_TMP5
+ // c4 = -35/8
+ movl R_TMP5 = 0xc08c0000;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 asin_t5 = asin_Hh, asin_y2, f0
- nop.i 999 ;;
+{.mfi
+ // r3 = pointer to polynomial coefficients
+ addl r3 = @ltoff(poly_coeffs), gp
+ // 1-(1-s^2)_s
+ fnma.s1 F_DS = F_1S2_S, f1, f1
+ // p9 = 0 if p7 = 1 (p9 = 1 for special cases only)
+ (p7) cmp.ne p9, p0 = r0, r0
}
-{ .mfi
- nop.m 999
-(p0) fnma.s1 asin_Dd = asin_S, asin_S, asin_B
- nop.i 999 ;;
+{.mlx
+ // load c7 = 33*13/16
+ setf.s F_CS7 = R_TMP4
+ // c3 = 5/2
+ movl R_TMP4 = 0x40200000;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 asin_W = asin_Dd, asin_H, asin_S
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // 1-(s^2)_s
+ fnma.s1 F_S_1S2S = F_S2, f1, f1
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 asin_2W = asin_W, f1, asin_W
- nop.i 999
+{.mlx
+ // load c4 = -35/8
+ setf.s F_CS4 = R_TMP5
+ // c2 = -3/2
+ movl R_TMP5 = 0xbfc00000;;
}
-// Step 1.3
-// Get w
-{ .mfi
- nop.m 999
-(p0) fnma.s1 asin_BmWW = asin_W, asin_W, asin_B
- nop.i 999 ;;
+
+{.mfi
+ // load c3 = 5/2
+ setf.s F_CS3 = R_TMP4
+ // x = (1-s^2)_s*y^2-1
+ fms.s1 F_X = F_1S2_S, F_Y2, f1
+ // c6 = -33*7/16
+ shl R_TMP6 = R_TMP6, 12
}
-// Step 2
-// asin(x) = atan2(X,sqrt(1-X*X))
-// = atan2(X, W) -Xw
-// corr = Xw
-// asin(x) = Z_hi + (s_lo*Z_lo - corr)
-// Call atan2(X, W)
-// Save W in f9
-// Save X in f12
-// Save w in f13
+{.mfi
+ nop.m 0
+ // y^2/2
+ fma.s1 F_Y2_2 = F_Y2, F_05, f0
+ nop.i 0;;
+}
-{ .mfi
- nop.m 999
-(p0) fmerge.se f9 = asin_W, asin_W
- nop.i 999 ;;
+
+{.mfi
+ // load c6 = -33*7/16
+ setf.s F_CS6 = R_TMP6
+ // eliminate lower bits from y'
+ fand F_T = F_T1, F_ANDMASK
+ // c5 = 63/8
+ shl R_TMP7 = R_TMP7, 16
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 asin_BmWWpb = asin_BmWW, f1, asin_Bb
- nop.i 999 ;;
+{.mfb
+ // r3 = load start address to polynomial coefficients
+ ld8 r3 = [r3]
+ // 1-(1-s^2)_s-s^2
+ fnma.s1 F_DS = f8, f8, F_DS
+ // p9 = 1 if s is a special input (NaN, or |s|> = 1)
+ (p9) br.cond.spnt ASINL_SPECIAL_CASES;;
}
-{ .mfi
- nop.m 999
-(p0) frcpa.s1 asin_1d2W,p9 = f1, asin_2W
- nop.i 999 ;;
+{.mmf
+ // get exponent, significand of y' (in single prec.)
+ getf.s R_TMP = F_T1
+ // load c2 = -3/2
+ setf.s F_CS2 = R_TMP5
+ // y*(1-s^2)
+ fma.s1 F_Y1S2 = F_Y, F_1S2, f0;;
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 asin_Ww = asin_BmWWpb, asin_1d2W, f0
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // x' = (y^2/2)*(1-(s^2)_s)-0.5
+ fms.s1 F_XL = F_Y2_2, F_S_1S2S, F_05
+ nop.i 0
}
-.endp asinl
-ASM_SIZE_DIRECTIVE(asinl)
-.proc __libm_callout
-__libm_callout:
-.prologue
-{ .mfi
- nop.m 0
- nop.f 0
-.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
-};;
-{ .mfi
- mov GR_SAVE_GP=gp // Save gp
- nop.f 0
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+{.mfi
+ nop.m 0
+ // s^2-(s^2)_s
+ fms.s1 F_S_DS2 = f8, f8, F_S2
+ nop.i 0;;
}
-.body
+
+
+{.mfi
+ nop.m 0
+ // if s<0, set s = -s
+ (p6) fnma.s1 f8 = f8, f1, f0
+ nop.i 0;;
+}
+
+{.mfi
+ // load c5 = 63/8
+ setf.s F_CS5 = R_TMP7
+ // x = (1-s^2)_s*y^2-1+(1-(1-s^2)_s-s^2)*y^2
+ fma.s1 F_X = F_DS, F_Y2, F_X
+ // for t = 2^k*1.b1 b2.., get 7-k|b1.. b6
+ extr.u R_INDEX = R_TMP, 17, 9;;
+}
+
+
+{.mmi
+ // index = (4-exponent)|b1 b2.. b6
+ sub R_INDEX = R_INDEX, R_BIAS
+ nop.m 0
+ // get biased exponent of y' (single precision format)
+ shr.u R_TMP2 = R_TMP, 23;;
+}
+
+{.mmi
+ // load C3
+ ldfe F_C3 = [r3], 16
+ // set p8 = 1 if y'<2^{-4}
+ cmp.gt p8, p0 = 0x7b, R_TMP2
+ // shift R_INDEX by 5
+ shl R_INDEX = R_INDEX, 5;;
+}
+
+
{.mfb
- nop.m 0
-(p0) fmerge.se f13 = asin_Ww, asin_Ww
-(p0) br.call.sptk.many b0=__libm_atan2_reg#
-};;
-{ .mfi
- mov gp = GR_SAVE_GP // Restore gp
-(p0) fma.s1 asin_XWw = asin_ABS_NORM_f8,f13,f0
- mov b0 = GR_SAVE_B0 // Restore return address
-};;
-// asin_XWw = Xw = corr
-// asin_low = (s_lo * Z_lo - corr)
-// f8 = Z_hi + (s_lo * Z_lo - corr)
+ // get table index for sqrt(1-t^2)
+ add r2 = r2, R_INDEX
+ // get t = 2^k*1.b1 b2.. b6 1
+ for F_T = F_T, F_ORMASK
+ (p8) br.cond.spnt VERY_LARGE_INPUT;;
+}
-{ .mfi
- nop.m 999
-(p0) fms.s1 asin_low = f11, f10, asin_XWw
- mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
-};;
-{ .mfi
- nop.m 999
-(p0) fma.s0 f8 = f8, f1, asin_low
- nop.i 999 ;;
+
+{.mmf
+ // load C5
+ ldfe F_C5 = [r3], 16
+ // load 1/(1-t^2)
+ ldfp8 F_INV_1T2, F_SQRT_1T2 = [r2], 16
+ // x = ((1-s^2)*y^2-1)/2
+ fma.s1 F_X = F_X, F_05, f0;;
}
-{ .mfb
- nop.m 999
-(p0) fmerge.s f8 = f12,f8
-(p0) br.ret.sptk b0 ;;
+
+
+{.mmf
+ nop.m 0
+ // C7, C9
+ ldfpd F_C7, F_C9 = [r3], 16
+ // set correct exponent for t
+ fmerge.se F_T = F_T1, F_T;;
}
-.endp __libm_callout
-ASM_SIZE_DIRECTIVE(__libm_callout)
-.proc SPECIAL
-SPECIAL:
-L(ASIN_ERROR_RETURN):
-// If X is 1, return (sign of X)pi/2
-{ .mfi
- nop.m 999
-(p0) fcmp.eq.unc p6,p7 = asin_ABS_NORM_f8,f1
- nop.i 999 ;;
+{.mfi
+ // pi/2 (low, high)
+ ldfpd F_PI2_LO, F_PI2_HI = [r3]
+ // c9*x+c8
+ fma.s1 F_S89 = F_X, F_CS9, F_CS8
+ nop.i 0
}
-{ .mfb
-(p6) ldfe asin_pi_by_2_lo = [r40]
-(p6) fmerge.s asin_pi_by_2 = f8,asin_pi_by_2
- nop.b 0;;
+{.mfi
+ nop.m 0
+ // x^2
+ fma.s1 F_X2 = F_X, F_X, f0
+ nop.i 0;;
}
-// If X is a NAN, leave
-// qnan snan inf norm unorm 0 -+
-// 1 1 0 0 0 0 11
-{ .mfb
- nop.m 999
-(p6) fma.s0 f8 = f8,asin_pi_by_2_lo,asin_pi_by_2
-(p6) br.ret.spnt b0
+
+{.mfi
+ nop.m 0
+ // y*(1-s^2)*x
+ fma.s1 F_Y1S2X = F_Y1S2, F_X, f0
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p12,p0 = f8, 0xc3
- nop.i 999 ;;
+
+{.mfi
+ nop.m 0
+ // c7*x+c6
+ fma.s1 F_S67 = F_X, F_CS7, F_CS6
+ nop.i 0;;
}
-{ .mfb
- nop.m 999
-(p12) fma.s0 f8 = f8,f1,f0
-(p12) br.ret.spnt b0 ;;
+
+{.mfi
+ nop.m 0
+ // 1-x
+ fnma.s1 F_1X = F_X, f1, f1
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // c3*x+c2
+ fma.s1 F_S23 = F_X, F_CS3, F_CS2
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // 1-t^2
+ fnma.s1 F_1T2 = F_T, F_T, f1
+ nop.i 0
+}
+
+{.mfi
+ // load asin(t)_high, asin(t)_low
+ ldfpd F_ATHI, F_ATLO = [r2]
+ // c5*x+c4
+ fma.s1 F_S45 = F_X, F_CS5, F_CS4
+ nop.i 0;;
+}
+
+
+
+{.mfi
+ nop.m 0
+ // t*s
+ fma.s1 F_TS = F_T, f8, f0
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // 0.5/(1-t^2)
+ fma.s1 F_INV_1T2 = F_INV_1T2, F_2M64, f0
+ nop.i 0;;
+}
+
+{.mfi
+ nop.m 0
+ // z~sqrt(1-t^2), rounded to 24 significant bits
+ fma.s.s1 F_Z = F_SQRT_1T2, F_2M64, f0
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // sqrt(1-t^2)
+ fma.s1 F_SQRT_1T2 = F_SQRT_1T2, F_2M64, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // y*(1-s^2)*x^2
+ fma.s1 F_Y1S2X2 = F_Y1S2, F_X2, f0
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // x^4
+ fma.s1 F_X4 = F_X2, F_X2, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // s*t rounded to 24 significant bits
+ fma.s.s1 F_TSS = F_T, f8, f0
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // c9*x^3+..+c6
+ fma.s1 F_S69 = F_X2, F_S89, F_S67
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // ST = (t^2-1+s^2) rounded to 24 significant bits
+ fms.s.s1 F_ST = f8, f8, F_1T2
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // c5*x^3+..+c2
+ fma.s1 F_S25 = F_X2, F_S45, F_S23
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // 0.25/(1-t^2)
+ fma.s1 F_INV1T2_2 = F_05, F_INV_1T2, f0
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // t*s-sqrt(1-t^2)*(1-s^2)*y
+ fnma.s1 F_TS = F_Y1S2, F_SQRT_1T2, F_TS
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // z*0.5/(1-t^2)
+ fma.s1 F_ZE = F_INV_1T2, F_SQRT_1T2, f0
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // z^2+t^2-1
+ fms.s1 F_DZ0 = F_Z, F_Z, F_1T2
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // (1-s^2-(1-s^2)_s)*x
+ fma.s1 F_DS2X = F_X, F_DS, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // t*s-(t*s)_s
+ fms.s1 F_DTS = F_T, f8, F_TSS
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // c9*x^7+..+c2
+ fma.s1 F_S29 = F_X4, F_S69, F_S25
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // y*z
+ fma.s1 F_YZ = F_Z, F_Y, f0
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // t^2
+ fma.s1 F_T2 = F_T, F_T, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // 1-t^2+ST
+ fma.s1 F_1T2_ST = F_ST, f1, F_1T2
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // y*(1-s^2)(1-x)
+ fma.s1 F_Y1S2_1X = F_Y1S2, F_1X, f0
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // dz ~ sqrt(1-t^2)-z
+ fma.s1 F_DZ = F_DZ0, F_ZE, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // -1+correction for sqrt(1-t^2)-z
+ fnma.s1 F_CORR = F_INV1T2_2, F_DZ0, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // (PS29*x^2+x)*y*(1-s^2)
+ fma.s1 F_S19 = F_Y1S2X2, F_S29, F_Y1S2X
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // z*y*(1-s^2)_s
+ fma.s1 F_ZY1S2S = F_YZ, F_1S2_S, f0
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // s^2-(1-t^2+ST)
+ fms.s1 F_1T2_ST = f8, f8, F_1T2_ST
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // (t*s-(t*s)_s)+z*y*(1-s^2-(1-s^2)_s)*x
+ fma.s1 F_DTS = F_YZ, F_DS2X, F_DTS
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // dz*y*(1-s^2)*(1-x)
+ fma.s1 F_DZ_TERM = F_DZ, F_Y1S2_1X, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // R = t*s-sqrt(1-t^2)*(1-s^2)*y+sqrt(1-t^2)*(1-s^2)*y*PS19
+ // (used for polynomial evaluation)
+ fma.s1 F_R = F_S19, F_SQRT_1T2, F_TS
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // (PS29*x^2)*y*(1-s^2)
+ fma.s1 F_S29 = F_Y1S2X2, F_S29, f0
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // apply correction to dz*y*(1-s^2)*(1-x)
+ fma.s1 F_DZ_TERM = F_DZ_TERM, F_CORR, F_DZ_TERM
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // R^2
+ fma.s1 F_R2 = F_R, F_R, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // (t*s-(t*s)_s)+z*y*(1-s^2-(1-s^2)_s)*x+dz*y*(1-s^2)*(1-x)
+ fma.s1 F_DZ_TERM = F_DZ_TERM, f1, F_DTS
+ nop.i 0;;
}
-{ .mfi
-(p0) mov GR_Parameter_TAG = 60
-(p0) frcpa f10, p6 = f0, f0
-nop.i 0
-};;
-.endp SPECIAL
-ASM_SIZE_DIRECTIVE(SPECIAL)
-.proc __libm_error_region
-__libm_error_region:
+
+{.mfi
+ nop.m 0
+ // c7+c9*R^2
+ fma.s1 F_P79 = F_C9, F_R2, F_C7
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // c3+c5*R^2
+ fma.s1 F_P35 = F_C5, F_R2, F_C3
+ nop.i 0;;
+}
+
+{.mfi
+ nop.m 0
+ // asin(t)_low-(pi/2)_low
+ fms.s1 F_ATLO = F_ATLO, f1, F_PI2_LO
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // R^4
+ fma.s1 F_R4 = F_R2, F_R2, f0
+ nop.i 0;;
+}
+
+{.mfi
+ nop.m 0
+ // R^3
+ fma.s1 F_R3 = F_R2, F_R, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // (t*s)_s-t^2*y*z
+ fnma.s1 F_TSS = F_T2, F_YZ, F_TSS
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // d(ts)+z*y*d(1-s^2)*x+dz*y*(1-s^2)*(1-x)+z*y*(s^2-1+t^2-ST)
+ fma.s1 F_DZ_TERM = F_YZ, F_1T2_ST, F_DZ_TERM
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // (pi/2)_hi-asin(t)_hi
+ fms.s1 F_ATHI = F_PI2_HI, f1, F_ATHI
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // c3+c5*R^2+c7*R^4+c9*R^6
+ fma.s1 F_P39 = F_P79, F_R4, F_P35
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // d(ts)+z*y*d(1-s^2)*x+dz*y*(1-s^2)*(1-x)+z*y*(s^2-1+t^2-ST)+
+ // + sqrt(1-t^2)*y*(1-s^2)*x^2*PS29
+ fma.s1 F_DZ_TERM = F_SQRT_1T2, F_S29, F_DZ_TERM
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // (t*s)_s-t^2*y*z+z*y*ST
+ fma.s1 F_TSS = F_YZ, F_ST, F_TSS
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // -asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6)
+ fms.s1 F_P39 = F_P39, F_R3, F_ATLO
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // if s<0, change sign of F_ATHI
+ (p6) fnma.s1 F_ATHI = F_ATHI, f1, f0
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // d(ts)+z*y*d(1-s^2)*x+dz*y*(1-s^2)*(1-x)+z*y*(s^2-1+t^2-ST) +
+ // + sqrt(1-t^2)*y*(1-s^2)*x^2*PS29 +
+ // - asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6)
+ fma.s1 F_DZ_TERM = F_P39, f1, F_DZ_TERM
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // d(ts)+z*y*d(1-s^2)*x+dz*y*(1-s^2)*(1-x)+z*y*(s^2-1+t^2-ST) +
+ // + sqrt(1-t^2)*y*(1-s^2)*x^2*PS29 + z*y*(1-s^2)_s*x +
+ // - asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6)
+ fma.s1 F_DZ_TERM = F_ZY1S2S, F_X, F_DZ_TERM
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // d(ts)+z*y*d(1-s^2)*x+dz*y*(1-s^2)*(1-x)+z*y*(s^2-1+t^2-ST) +
+ // + sqrt(1-t^2)*y*(1-s^2)*x^2*PS29 + z*y*(1-s^2)_s*x +
+ // - asin(t)_low+R^3*(c3+c5*R^2+c7*R^4+c9*R^6) +
+ // + (t*s)_s-t^2*y*z+z*y*ST
+ fma.s1 F_DZ_TERM = F_TSS, f1, F_DZ_TERM
+ nop.i 0;;
+}
+
+
+.pred.rel "mutex", p6, p11
+{.mfi
+ nop.m 0
+ // result: add high part of pi/2-table value
+ // s>0 in this case
+ (p11) fma.s0 f8 = F_DZ_TERM, f1, F_ATHI
+ nop.i 0
+}
+
+{.mfb
+ nop.m 0
+ // result: add high part of pi/2-table value
+ // if s<0
+ (p6) fnma.s0 f8 = F_DZ_TERM, f1, F_ATHI
+ br.ret.sptk b0;;
+}
+
+
+
+
+
+
+SMALL_S:
+
+ // use 15-term polynomial approximation
+
+{.mmi
+ // r3 = pointer to polynomial coefficients
+ addl r3 = @ltoff(poly_coeffs), gp;;
+ // load start address for coefficients
+ ld8 r3 = [r3]
+ mov R_TMP = 0x3fbf;;
+}
+
+
+{.mmi
+ add r2 = 64, r3
+ ldfe F_C3 = [r3], 16
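+ // p7 = 1 if |s|<2^{-64} (exponent of s<bias-64); NOTE(review): R_TMP = 0x3fbf, but the other R_EXP0 tests use bias 0xffff (0xffff-64 = 0xffbf) - confirm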
+ cmp.lt p7, p0 = R_EXP0, R_TMP;;
+}
+
+{.mmf
+ ldfe F_C5 = [r3], 16
+ ldfpd F_C11, F_C13 = [r2], 16
+ // 2^{-128}
+ fma.s1 F_2M128 = F_2M64, F_2M64, f0;;
+}
+
+{.mmf
+ ldfpd F_C7, F_C9 = [r3]
+ ldfpd F_C15, F_C17 = [r2]
+ // if |s|<2^{-64}, return s+2^{-128}*s
+ (p7) fma.s0 f8 = f8, F_2M128, f8;;
+}
+
+
+
+{.mfb
+ nop.m 0
+ // s^2
+ fma.s1 F_R2 = f8, f8, f0
+ // if |s|<2^{-64}, return s
+ (p7) br.ret.spnt b0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // s^3
+ fma.s1 F_R3 = f8, F_R2, f0
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // s^4
+ fma.s1 F_R4 = F_R2, F_R2, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c3+c5*s^2
+ fma.s1 F_P35 = F_C5, F_R2, F_C3
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // c11+c13*s^2
+ fma.s1 F_P1113 = F_C13, F_R2, F_C11
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c7+c9*s^2
+ fma.s1 F_P79 = F_C9, F_R2, F_C7
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // c15+c17*s^2
+ fma.s1 F_P1517 = F_C17, F_R2, F_C15
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // s^8
+ fma.s1 F_R8 = F_R4, F_R4, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c3+c5*s^2+c7*s^4+c9*s^6
+ fma.s1 F_P39 = F_P79, F_R4, F_P35
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // c11+c13*s^2+c15*s^4+c17*s^6
+ fma.s1 F_P1117 = F_P1517, F_R4, F_P1113
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c3+..+c17*s^14
+ fma.s1 F_P317 = F_R8, F_P1117, F_P39
+ nop.i 0;;
+}
+
+
+{.mfb
+ nop.m 0
+ // result
+ fma.s0 f8 = F_P317, F_R3, f8
+ br.ret.sptk b0;;
+}
+
+
+{.mfb
+ nop.m 0
+ fma.s0 f8 = F_P317, F_R3, f0//F_P317, F_R3, F_S29
+ // NOTE(review): this bundle follows the unconditional br.ret above and has no label, so it is never executed
+ br.ret.sptk b0;;
+}
+
+
+
+
+
+ VERY_LARGE_INPUT:
+
+{.mfi
+ nop.m 0
+ // s rounded to 24 significant bits
+ fma.s.s1 F_S = f8, f1, f0
+ nop.i 0
+}
+
+{.mfi
+ // load C5
+ ldfe F_C5 = [r3], 16
+ // x = ((1-(s^2)_s)*y^2-1)/2-(s^2-(s^2)_s)*y^2/2
+ fnma.s1 F_X = F_S_DS2, F_Y2_2, F_XL
+ nop.i 0;;
+}
+
+
+
+{.mmf
+ nop.m 0
+ // C7, C9
+ ldfpd F_C7, F_C9 = [r3], 16
+ nop.f 0;;
+}
+
+
+
+{.mfi
+ // pi/2 (low, high)
+ ldfpd F_PI2_LO, F_PI2_HI = [r3], 16
+ // c9*x+c8
+ fma.s1 F_S89 = F_X, F_CS9, F_CS8
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // x^2
+ fma.s1 F_X2 = F_X, F_X, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // y*(1-s^2)*x
+ fma.s1 F_Y1S2X = F_Y1S2, F_X, f0
+ nop.i 0
+}
+
+{.mfi
+ // C11, C13
+ ldfpd F_C11, F_C13 = [r3], 16
+ // c7*x+c6
+ fma.s1 F_S67 = F_X, F_CS7, F_CS6
+ nop.i 0;;
+}
+
+
+{.mfi
+ // C15, C17
+ ldfpd F_C15, F_C17 = [r3], 16
+ // c3*x+c2
+ fma.s1 F_S23 = F_X, F_CS3, F_CS2
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c5*x+c4
+ fma.s1 F_S45 = F_X, F_CS5, F_CS4
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // (s_s)^2
+ fma.s1 F_DS = F_S, F_S, f0
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // 1-(s_s)^2
+ fnma.s1 F_1S2_S = F_S, F_S, f1
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // y*(1-s^2)*x^2
+ fma.s1 F_Y1S2X2 = F_Y1S2, F_X2, f0
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // x^4
+ fma.s1 F_X4 = F_X2, F_X2, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c9*x^3+..+c6
+ fma.s1 F_S69 = F_X2, F_S89, F_S67
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c5*x^3+..+c2
+ fma.s1 F_S25 = F_X2, F_S45, F_S23
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // ((s_s)^2-s^2)
+ fnma.s1 F_DS = f8, f8, F_DS
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // (pi/2)_high-y*(1-(s_s)^2)
+ fnma.s1 F_HI = F_Y, F_1S2_S, F_PI2_HI
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c9*x^7+..+c2
+ fma.s1 F_S29 = F_X4, F_S69, F_S25
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // -(y*(1-(s_s)^2))_high
+ fms.s1 F_1S2_HI = F_HI, f1, F_PI2_HI
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // (PS29*x^2+x)*y*(1-s^2)
+ fma.s1 F_S19 = F_Y1S2X2, F_S29, F_Y1S2X
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // y*(1-(s_s)^2)-(y*(1-s^2))_high
+ fma.s1 F_DS2 = F_Y, F_1S2_S, F_1S2_HI
+ nop.i 0;;
+}
+
+
+
+{.mfi
+ nop.m 0
+ // R ~ sqrt(1-s^2)
+ // (used for polynomial evaluation)
+ fnma.s1 F_R = F_S19, f1, F_Y1S2
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // y*(1-s^2)-(y*(1-s^2))_high
+ fma.s1 F_DS2 = F_Y, F_DS, F_DS2
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // (pi/2)_low+(PS29*x^2)*y*(1-s^2)
+ fma.s1 F_S29 = F_Y1S2X2, F_S29, F_PI2_LO
+ nop.i 0;;
+}
+
+
+
+{.mfi
+ nop.m 0
+ // R^2
+ fma.s1 F_R2 = F_R, F_R, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // (pi/2)_low+(PS29*x^2)*y*(1-s^2)-(y*(1-s^2)-(y*(1-s^2))_high)
+ fms.s1 F_S29 = F_S29, f1, F_DS2
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c7+c9*R^2
+ fma.s1 F_P79 = F_C9, F_R2, F_C7
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // c3+c5*R^2
+ fma.s1 F_P35 = F_C5, F_R2, F_C3
+ nop.i 0;;
+}
+
+
+
+{.mfi
+ nop.m 0
+ // R^4
+ fma.s1 F_R4 = F_R2, F_R2, f0
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // R^3
+ fma.s1 F_R3 = F_R2, F_R, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c11+c13*R^2
+ fma.s1 F_P1113 = F_C13, F_R2, F_C11
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // c15+c17*R^2
+ fma.s1 F_P1517 = F_C17, F_R2, F_C15
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // (pi/2)_low+(PS29*x^2)*y*(1-s^2)-(y*(1-s^2)-(y*(1-s^2))_high)+y*(1-s^2)*x
+ fma.s1 F_S29 = F_Y1S2, F_X, F_S29
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c11+c13*R^2+c15*R^4+c17*R^6
+ fma.s1 F_P1117 = F_P1517, F_R4, F_P1113
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // c3+c5*R^2+c7*R^4+c9*R^6
+ fma.s1 F_P39 = F_P79, F_R4, F_P35
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // R^8
+ fma.s1 F_R8 = F_R4, F_R4, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // c3+c5*R^2+c7*R^4+c9*R^6+..+c17*R^14
+ fma.s1 F_P317 = F_P1117, F_R8, F_P39
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // (pi/2)_low+(PS29*x^2)*y*(1-s^2)-(y*(1-s^2)-
+ // -(y*(1-s^2))_high)+y*(1-s^2)*x - P3, 17
+ fnma.s1 F_S29 = F_P317, F_R3, F_S29
+ nop.i 0;;
+}
+
+{.mfi
+ nop.m 0
+ // set sign
+ (p6) fnma.s1 F_S29 = F_S29, f1, f0
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ (p6) fnma.s1 F_HI = F_HI, f1, f0
+ nop.i 0;;
+}
+
+
+{.mfb
+ nop.m 0
+ // Result:
+ // (pi/2)_low+(PS29*x^2)*y*(1-s^2)-(y*(1-s^2)-
+ // -(y*(1-s^2))_high)+y*(1-s^2)*x - P3, 17
+ // +(pi/2)_high-(y*(1-s^2))_high
+ fma.s0 f8 = F_S29, f1, F_HI
+ br.ret.sptk b0;;
+}
+
+
+
+
+
+
+
+
+
+ ASINL_SPECIAL_CASES:
+
+{.mfi
+ alloc r32 = ar.pfs, 1, 4, 4, 0
+ // check if the input is a NaN, or unsupported format
+ // (i.e. not infinity or normal/denormal)
+ fclass.nm p7, p8 = f8, 0x3f
+ // pointer to pi/2
+ add r3 = 48, r3;;
+}
+
+
+{.mfi
+ // load pi/2
+ ldfpd F_PI2_HI, F_PI2_LO = [r3]
+ // get |s|
+ fmerge.s F_S = f0, f8
+ nop.i 0
+}
+
+{.mfb
+ nop.m 0
+ // if NaN, quietize it, and return
+ (p7) fma.s0 f8 = f8, f1, f0
+ (p7) br.ret.spnt b0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // |s| = 1 ?
+ fcmp.eq.s0 p9, p0 = F_S, f1
+ nop.i 0
+}
+
+{.mfi
+ nop.m 0
+ // load FR_X
+ fma.s1 FR_X = f8, f1, f0
+ // load error tag
+ mov GR_Parameter_TAG = 60;;
+}
+
+
+{.mfb
+ nop.m 0
+ // change sign if s = -1
+ (p6) fnma.s1 F_PI2_HI = F_PI2_HI, f1, f0
+ nop.b 0
+}
+
+{.mfb
+ nop.m 0
+ // change sign if s = -1
+ (p6) fnma.s1 F_PI2_LO = F_PI2_LO, f1, f0
+ nop.b 0;;
+}
+
+{.mfb
+ nop.m 0
+ // if |s| = 1, result is sign(s)*pi/2
+ (p9) fma.s0 f8 = F_PI2_HI, f1, F_PI2_LO
+ // return if |s| = 1
+ (p9) br.ret.sptk b0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // get Infinity
+ frcpa.s1 FR_RESULT, p0 = f1, f0
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // return QNaN indefinite (0*Infinity)
+ fma.s0 FR_RESULT = f0, FR_RESULT, f0
+ nop.i 0;;
+}
+
+
+GLOBAL_LIBM_END(asinl)
+
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
+// (1)
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
@@ -742,24 +2470,29 @@ __libm_error_region:
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
+ add sp=-64,sp // Create new stack
nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ mov GR_SAVE_GP=gp // Save gp
};;
+
+
+// (2)
{ .mmi
- stfe [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack
- add GR_Parameter_X = 16,sp // Parameter 1 address
+ stfe [GR_Parameter_Y] = f1,16 // Store Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0=b0 // Save b0
};;
+
.body
+// (3)
{ .mib
- stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
+ stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y
nop.b 0 // Parameter 3 address
}
{ .mib
- stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
+ stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
@@ -768,23 +2501,27 @@ __libm_error_region:
nop.m 0
add GR_Parameter_RESULT = 48,sp
};;
+
+// (4)
{ .mmi
ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
+
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
+ mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
-};;
+};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
-.type __libm_atan2_reg#,@function
-.global __libm_atan2_reg#
+
+
+
+
diff --git a/sysdeps/ia64/fpu/e_atan2.S b/sysdeps/ia64/fpu/e_atan2.S
index 38dd2f749a..8be7c6cec5 100644
--- a/sysdeps/ia64/fpu/e_atan2.S
+++ b/sysdeps/ia64/fpu/e_atan2.S
@@ -1,10 +1,10 @@
.file "atan2.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,33 +20,38 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
-// set [the previously overwritten] GR_Parameter_RESULT.
-// 8/17/00 Changed predicate register macro-usage to direct predicate
-// names due to an assembler bug.
-// 9/28/00 Updated to set invalid on SNaN inputs
-// 1/19/01 Fixed flags for small results
+// 02/02/00 Initial version
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
+// set [the previously overwritten] GR_Parameter_RESULT.
+// 08/17/00 Changed predicate register macro-usage to direct predicate
+// names due to an assembler bug.
+// 09/28/00 Updated to set invalid on SNaN inputs
+// 01/19/01 Fixed flags for small results
+// 04/13/01 Rescheduled to make all paths faster
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 08/20/02 Corrected inexact flag and directed rounding symmetry bugs
+// 02/06/03 Reordered header: .section, .global, .proc, .align
+// 04/17/03 Added missing mutex directive
//
// API
//==============================================================
@@ -55,10 +60,12 @@
// Overview of operation
//==============================================================
//
+// The atan2 function returns values in the interval [-pi,+pi].
+//
// There are two basic paths: swap true and swap false.
// atan2(Y,X) ==> atan2(V/U) where U >= V. If Y > X, we must swap.
//
-// p6 swap True |Y| > |X|
+// p6 swap True |Y| > |X|
// p7 swap False |Y| <= |X|
// p8 X+ (If swap=True p8=p9=0)
// p9 X-
@@ -66,21 +73,21 @@
// all the other predicates p10 thru p15 are false for the main path
//
// Simple trigonometric identities show
-// Region 1 (-45 to +45 degrees):
+// Region 1 (-45 to +45 degrees):
// X>0, |Y|<=X, V=Y, U=X atan2(Y,X) = sgnY * (0 + atan(V/U))
//
-// Region 2 (-90 to -45 degrees, and +45 to +90 degrees):
+// Region 2 (-90 to -45 degrees, and +45 to +90 degrees):
// X>0, |Y|>X, V=X, U=Y atan2(Y,X) = sgnY * (pi/2 - atan(V/U))
//
-// Region 3 (-135 to -90 degrees, and +90 to +135 degrees):
+// Region 3 (-135 to -90 degrees, and +90 to +135 degrees):
// X<0, |Y|>X, V=X, U=Y atan2(Y,X) = sgnY * (pi/2 + atan(V/U))
//
-// Region 4 (-180 to -135 degrees, and +135 to +180 degrees):
+// Region 4 (-180 to -135 degrees, and +135 to +180 degrees):
// X<0, |Y|<=X, V=Y, U=X atan2(Y,X) = sgnY * (pi - atan(V/U))
//
// So the result is always of the form atan2(Y,X) = P + sgnXY * atan(V/U)
//
-// We compute atan(V/U) from the identity
+// We compute atan(V/U) from the identity
// atan(z) + atan([(V/U)-z] / [1+(V/U)z])
// where z is a limited precision approximation (16 bits) to V/U
//
@@ -124,13 +131,13 @@
// +number -0 +pi/2
// -number -0 -pi/2
//
-// +0 +number +0
-// -0 +number -0
+// +0 +number +0
+// -0 +number -0
// +0 -number +pi
// -0 -number -pi
//
-// +0 +0 +0
-// -0 +0 -0
+// +0 +0 +0
+// -0 +0 -0
// +0 -0 +pi
// -0 -0 -pi
//
@@ -138,16 +145,26 @@
// anything NaN quiet X
// atan2(+-0/+-0) sets double error tag to 37
-// atan2(+-0/+-0) sets single error tag to 38
-#include "libm_support.h"
+// Registers used
+//==============================================================
+
+// predicate registers used:
+// p6 -> p15
+
+// floating-point registers used:
+// f8, f9 input
+// f32 -> f119
+
+// general registers used
+// r32 -> r41
// Assembly macros
//==============================================================
EXP_AD_P1 = r33
EXP_AD_P2 = r34
-atan2_GR_sml_exp = r35
+rsig_near_one = r35
GR_SAVE_B0 = r35
@@ -159,22 +176,23 @@ GR_Parameter_Y = r39
GR_Parameter_RESULT = r40
atan2_GR_tag = r41
-
-atan2_X = f9
atan2_Y = f8
+atan2_X = f9
atan2_u1_X = f32
atan2_u1_Y = f33
-atan2_Umax = f34
-atan2_Vmin = f35
+atan2_z2_X = f34
+atan2_z2_Y = f35
+
atan2_two = f36
-atan2_absX = f37
+atan2_B1sq_Y = f37
atan2_z1_X = f38
atan2_z1_Y = f39
atan2_B1X = f40
+
atan2_B1Y = f41
-atan2_wp = f42
-atan2_B1sq = f43
+atan2_wp_X = f42
+atan2_B1sq_X = f43
atan2_z = f44
atan2_w = f45
@@ -183,178 +201,149 @@ atan2_P1 = f47
atan2_P2 = f48
atan2_P3 = f49
atan2_P4 = f50
+
atan2_P5 = f51
atan2_P6 = f52
atan2_P7 = f53
atan2_P8 = f54
atan2_P9 = f55
+
atan2_P10 = f56
atan2_P11 = f57
atan2_P12 = f58
atan2_P13 = f59
atan2_P14 = f60
+
atan2_P15 = f61
atan2_P16 = f62
atan2_P17 = f63
atan2_P18 = f64
atan2_P19 = f65
+
atan2_P20 = f66
atan2_P21 = f67
atan2_P22 = f68
-atan2_Pi_by_2 = f69
-
+atan2_tmp = f68
+atan2_pi_by_2 = f69
+atan2_sgn_pi_by_2 = f69
atan2_V13 = f70
+
atan2_W11 = f71
atan2_E = f72
-atan2_gamma = f73
+atan2_wp_Y = f73
atan2_V11 = f74
atan2_V12 = f75
+
atan2_V7 = f76
atan2_V8 = f77
atan2_W7 = f78
atan2_W8 = f79
atan2_W3 = f80
+
atan2_W4 = f81
atan2_V3 = f82
atan2_V4 = f83
atan2_F = f84
atan2_gV = f85
+
atan2_V10 = f86
atan2_zcub = f87
atan2_V6 = f88
atan2_V9 = f89
atan2_W10 = f90
+
atan2_W6 = f91
atan2_W2 = f92
atan2_V2 = f93
-
atan2_alpha = f94
atan2_alpha_1 = f95
+
atan2_gVF = f96
atan2_V5 = f97
atan2_W12 = f98
atan2_W5 = f99
atan2_alpha_sq = f100
+
atan2_Cp = f101
atan2_V1 = f102
-
-atan2_sml_norm = f103
-atan2_FR_tmp = f103
-
+atan2_ysq = f103
atan2_W1 = f104
atan2_alpha_cub = f105
+
atan2_C = f106
-atan2_P = f107
+atan2_xsq = f107
atan2_d = f108
atan2_A_hi = f109
atan2_dsq = f110
+
atan2_pd = f111
atan2_A_lo = f112
atan2_A = f113
-
atan2_Pp = f114
+atan2_sgnY = f115
-atan2_sgnY = f116
+atan2_sig_near_one = f116
+atan2_near_one = f116
atan2_pi = f117
-atan2_sgnX = f118
-atan2_sgnXY = f119
-
-atan2_3pi_by_4 = f120
-atan2_pi_by_4 = f121
-
-//atan2_sF = p7
-//atan2_sT = p6
+atan2_sgn_pi = f117
+atan2_3pi_by_4 = f118
+atan2_pi_by_4 = f119
-// These coefficients are for atan2.
-// You can also use this set to substitute those used in the |X| <= 1 case for atan;
-// BUT NOT vice versa.
/////////////////////////////////////////////////////////////
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+RODATA
.align 16
-atan2_tb1:
-ASM_TYPE_DIRECTIVE(atan2_tb1,@object)
-data8 0xB199DD6D2675C40F , 0x0000BFFA // P10
+LOCAL_OBJECT_START(atan2_tb1)
data8 0xA21922DC45605EA1 , 0x00003FFA // P11
-data8 0xD78F28FC2A592781 , 0x0000BFFA // P8
+data8 0xB199DD6D2675C40F , 0x0000BFFA // P10
data8 0xC2F01E5DDD100DBE , 0x00003FFA // P9
-data8 0x9D89D7D55C3287A5 , 0x00003FFB // P5
+data8 0xD78F28FC2A592781 , 0x0000BFFA // P8
data8 0xF0F03ADB3FC930D3 , 0x00003FFA // P7
-data8 0xF396268151CFB11C , 0x00003FF7 // P17
-data8 0x9D3436AABE218776 , 0x00003FF5 // P19
-data8 0x80D601879218B53A , 0x00003FFA // P13
-data8 0xA2270D30A90AA220 , 0x00003FF9 // P15
-data8 0xCCCCCCCCCCC906CD , 0x00003FFC // P1
+data8 0x88887EBB209E3543 , 0x0000BFFB // P6
+data8 0x9D89D7D55C3287A5 , 0x00003FFB // P5
+data8 0xBA2E8B9793955C77 , 0x0000BFFB // P4
data8 0xE38E38E320A8A098 , 0x00003FFB // P3
-data8 0xFE7E52D2A89995B3 , 0x0000BFEC // P22
-data8 0xC90FDAA22168C235 , 0x00003FFE // pi/4
-ASM_SIZE_DIRECTIVE(atan2_tb1)
+data8 0x9249249247E37913 , 0x0000BFFC // P2
+data8 0xCCCCCCCCCCC906CD , 0x00003FFC // P1
+data8 0xAAAAAAAAAAAAA8A9 , 0x0000BFFD // P0
+data8 0xC90FDAA22168C235 , 0x00004000 // pi
+LOCAL_OBJECT_END(atan2_tb1)
-atan2_tb2:
-ASM_TYPE_DIRECTIVE(atan2_tb2,@object)
-data8 0x9F90FB984D8E39D0 , 0x0000BFF3 // P20
+LOCAL_OBJECT_START(atan2_tb2)
data8 0xCE585A259BD8374C , 0x00003FF0 // P21
-data8 0xBA2E8B9793955C77 , 0x0000BFFB // P4
-data8 0x88887EBB209E3543 , 0x0000BFFB // P6
-data8 0xD818B4BB43D84BF2 , 0x0000BFF8 // P16
+data8 0x9F90FB984D8E39D0 , 0x0000BFF3 // P20
+data8 0x9D3436AABE218776 , 0x00003FF5 // P19
data8 0xDEC343E068A6D2A8 , 0x0000BFF6 // P18
-data8 0x9297B23CCFFB291F , 0x0000BFFA // P12
+data8 0xF396268151CFB11C , 0x00003FF7 // P17
+data8 0xD818B4BB43D84BF2 , 0x0000BFF8 // P16
+data8 0xA2270D30A90AA220 , 0x00003FF9 // P15
data8 0xD5F4F2182E7A8725 , 0x0000BFF9 // P14
-data8 0xAAAAAAAAAAAAA8A9 , 0x0000BFFD // P0
-data8 0x9249249247E37913 , 0x0000BFFC // P2
+data8 0x80D601879218B53A , 0x00003FFA // P13
+data8 0x9297B23CCFFB291F , 0x0000BFFA // P12
+data8 0xFE7E52D2A89995B3 , 0x0000BFEC // P22
data8 0xC90FDAA22168C235 , 0x00003FFF // pi/2
-data8 0xC90FDAA22168C235 , 0x00004000 // pi
+data8 0xC90FDAA22168C235 , 0x00003FFE // pi/4
data8 0x96cbe3f9990e91a8 , 0x00004000 // 3pi/4
-ASM_SIZE_DIRECTIVE(atan2_tb2)
-
-
+LOCAL_OBJECT_END(atan2_tb2)
-.align 32
-.global atan2#
-#ifdef _LIBC
-.global __atan2#
-.global __ieee754_atan2#
-#endif
-////////////////////////////////////////////////////////
.section .text
-.align 32
-
-.proc atan2#
-atan2:
-#ifdef _LIBC
-.proc __atan2#
-__atan2:
-.proc __ieee754_atan2#
-__ieee754_atan2:
-#endif
-// qnan snan inf norm unorm 0 -+
-// 0 0 1 0 0 0 11
-
-
-// Y NAN? p10 p11
-// p10 ==> quiet Y and return
-// p11 X NAN? p12, p13
-// p12 ==> quiet X and return
+GLOBAL_IEEE754_ENTRY(atan2)
{ .mfi
alloc r32 = ar.pfs,1,5,4,0
frcpa.s1 atan2_u1_X,p6 = f1,atan2_X
- addl EXP_AD_P2 = @ltoff(atan2_tb2), gp
+ nop.i 999
}
{ .mfi
addl EXP_AD_P1 = @ltoff(atan2_tb1), gp
- fclass.m.unc p10,p11 = f8, 0xc3
+ fma.s1 atan2_two = f1,f1,f1
nop.i 999
;;
}
@@ -366,256 +355,233 @@ __ieee754_atan2:
}
{ .mfi
nop.m 999
- fma.s1 atan2_two = f1,f1,f1
+ fma.s1 atan2_xsq = atan2_X,atan2_X,f0
nop.i 999
;;
}
-
{ .mfi
- ld8 EXP_AD_P2 = [ EXP_AD_P2]
- famax.s1 atan2_Umax = f8,f9
+ nop.m 999
+ fclass.m p10,p0 = atan2_Y, 0xc3 // Test for y=nan
nop.i 999
}
-;;
-
{ .mfi
nop.m 999
- fmerge.s atan2_absX = f0,atan2_X
+ fma.s1 atan2_ysq = atan2_Y,atan2_Y,f0
nop.i 999
}
;;
-// p10 Y NAN, quiet and return
{ .mfi
- ldfe atan2_P10 = [EXP_AD_P1],16
- fmerge.s atan2_sgnY = atan2_Y,f1
+ add EXP_AD_P2 = 0xd0,EXP_AD_P1
+ fclass.m p12,p0 = atan2_X, 0xc3 // Test for x nan
nop.i 999
}
-{ .mfb
- nop.m 999
-(p10) fma.d f8 = f8,f9,f0
-(p10) br.ret.spnt b0
;;
-}
-{ .mmf
+// p10 Y NAN, quiet and return
+{ .mfi
ldfe atan2_P11 = [EXP_AD_P1],16
- ldfe atan2_P20 = [EXP_AD_P2],16
- fmerge.s atan2_sgnX = atan2_X,f1
+ fmerge.s atan2_sgnY = atan2_Y,f1
+ nop.i 999
+}
+{ .mfb
+ ldfe atan2_P21 = [EXP_AD_P2],16
+(p10) fma.d.s0 f8 = atan2_Y,atan2_X,f0 // If y=nan, result quietized y
+(p10) br.ret.spnt b0 // Exit if y=nan
;;
}
-{ .mfi
- ldfe atan2_P8 = [EXP_AD_P1],16
+{ .mfi
+ ldfe atan2_P10 = [EXP_AD_P1],16
fma.s1 atan2_z1_X = atan2_u1_X, atan2_Y, f0
nop.i 999
}
-{ .mfi
-
- ldfe atan2_P21 = [EXP_AD_P2],16
- fma.s1 atan2_z1_Y = atan2_u1_Y, atan2_X, f0
+{ .mfi
+ ldfe atan2_P20 = [EXP_AD_P2],16
+ fnma.s1 atan2_B1X = atan2_u1_X, atan2_X, atan2_two
nop.i 999
;;
}
-{ .mfi
+{ .mfi
ldfe atan2_P9 = [EXP_AD_P1],16
- fnma.s1 atan2_B1X = atan2_u1_X, atan2_X, atan2_two
+ fma.s1 atan2_z1_Y = atan2_u1_Y, atan2_X, f0
nop.i 999
}
-{ .mfi
-
- ldfe atan2_P4 = [EXP_AD_P2],16
+{ .mfi
+ ldfe atan2_P19 = [EXP_AD_P2],16
fnma.s1 atan2_B1Y = atan2_u1_Y, atan2_Y, atan2_two
nop.i 999
-;;
}
-
-// p6 (atan2_sT) true if swap
-// p7 (atan2_sF) true if no swap
-// p11 ==> Y !NAN; X NAN?
+;;
{ .mfi
- ldfe atan2_P5 = [EXP_AD_P1],16
-// fcmp.eq.unc.s1 atan2_sF,atan2_sT = atan2_Umax, atan2_X
- fcmp.eq.unc.s1 p7,p6 = atan2_Umax, atan2_X
+ ldfe atan2_P8 = [EXP_AD_P1],16
+ fma.s1 atan2_z2_X = atan2_u1_X, atan2_ysq, f0
nop.i 999
}
{ .mfi
- ldfe atan2_P6 = [EXP_AD_P2],16
-(p11) fclass.m.unc p12,p13 = f9, 0xc3
+ ldfe atan2_P18 = [EXP_AD_P2],16
+ fma.s1 atan2_z2_Y = atan2_u1_Y, atan2_xsq, f0
nop.i 999
-;;
}
-
-{ .mmf
- ldfe atan2_P7 = [EXP_AD_P1],16
- ldfe atan2_P16 = [EXP_AD_P2],16
- famin.s1 atan2_Vmin = f8,f9
;;
-}
-// p8 true if X positive
-// p9 true if X negative
-// both are false is swap is true
+// p10 ==> x inf y ?
+// p11 ==> x !inf y ?
{ .mfi
- ldfe atan2_P17 = [EXP_AD_P1],16
-//(atan2_sF) fcmp.eq.unc.s1 p8,p9 = atan2_sgnX,f1
-(p7) fcmp.eq.unc.s1 p8,p9 = atan2_sgnX,f1
+ ldfe atan2_P7 = [EXP_AD_P1],16
+ fclass.m p10,p11 = atan2_X, 0x23 // test for x inf
nop.i 999
}
-{ .mfi
- ldfe atan2_P18 = [EXP_AD_P2],16
- fma.s1 atan2_sgnXY = atan2_sgnX, atan2_sgnY, f0
- nop.i 999
+{ .mfb
+ ldfe atan2_P17 = [EXP_AD_P2],16
+(p12) fma.d.s0 f8 = atan2_X,atan2_Y,f0 // If x nan, result quiet x
+(p12) br.ret.spnt b0 // Exit for x nan
;;
}
+// p6 true if swap, means |y| > |x| or ysq > xsq
+// p7 true if no swap, means |x| >= |y| or xsq >= ysq
+{ .mmf
+ ldfe atan2_P6 = [EXP_AD_P1],16
+ ldfe atan2_P16 = [EXP_AD_P2],16
+ fcmp.ge.s1 p7,p6 = atan2_xsq, atan2_ysq
+;;
+}
{ .mfi
- ldfe atan2_P19 = [EXP_AD_P1],16
-//(atan2_sF) fma.s1 atan2_wp = atan2_z1_X, atan2_z1_X, f0
-(p7) fma.s1 atan2_wp = atan2_z1_X, atan2_z1_X, f0
+ ldfe atan2_P5 = [EXP_AD_P1],16
+ fma.s1 atan2_wp_X = atan2_z1_X, atan2_z1_X, f0
nop.i 999
}
{ .mfi
- ldfe atan2_P12 = [EXP_AD_P2],16
-//(atan2_sT) fma.s1 atan2_wp = atan2_z1_Y, atan2_z1_Y, f0
-(p6) fma.s1 atan2_wp = atan2_z1_Y, atan2_z1_Y, f0
+ ldfe atan2_P15 = [EXP_AD_P2],16
+ fma.s1 atan2_B1sq_X = atan2_B1X, atan2_B1X, f0
nop.i 999
;;
}
-
{ .mfi
- ldfe atan2_P13 = [EXP_AD_P1],16
-//(atan2_sF) fma.s1 atan2_z = atan2_z1_X, atan2_B1X, f0
-(p7) fma.s1 atan2_z = atan2_z1_X, atan2_B1X, f0
+ ldfe atan2_P4 = [EXP_AD_P1],16
+(p6) fma.s1 atan2_wp_Y = atan2_z1_Y, atan2_z1_Y, f0
nop.i 999
}
{ .mfi
ldfe atan2_P14 = [EXP_AD_P2],16
-//(atan2_sT) fma.s1 atan2_z = atan2_z1_Y, atan2_B1Y, f0
-(p6) fma.s1 atan2_z = atan2_z1_Y, atan2_B1Y, f0
+(p6) fma.s1 atan2_B1sq_Y = atan2_B1Y, atan2_B1Y, f0
nop.i 999
;;
}
-
{ .mfi
- ldfe atan2_P15 = [EXP_AD_P1],16
-//(atan2_sF) fma.s1 atan2_B1sq = atan2_B1X, atan2_B1X, f0
-(p7) fma.s1 atan2_B1sq = atan2_B1X, atan2_B1X, f0
+ ldfe atan2_P3 = [EXP_AD_P1],16
+(p6) fma.s1 atan2_E = atan2_z2_Y, atan2_B1Y, atan2_Y
nop.i 999
}
{ .mfi
- ldfe atan2_P0 = [EXP_AD_P2],16
-//(atan2_sT) fma.s1 atan2_B1sq = atan2_B1Y, atan2_B1Y, f0
-(p6) fma.s1 atan2_B1sq = atan2_B1Y, atan2_B1Y, f0
+ ldfe atan2_P13 = [EXP_AD_P2],16
+(p7) fma.s1 atan2_E = atan2_z2_X, atan2_B1X, atan2_X
nop.i 999
;;
}
-// p12 ==> X NAN, quiet and return
{ .mfi
- ldfe atan2_P1 = [EXP_AD_P1],16
- fmerge.s atan2_Umax = f0,atan2_Umax
+ ldfe atan2_P2 = [EXP_AD_P1],16
+(p6) fma.s1 atan2_z = atan2_z1_Y, atan2_B1Y, f0
nop.i 999
}
-{ .mfb
- ldfe atan2_P2 = [EXP_AD_P2],16
-(p12) fma.d f8 = f9,f8,f0
-(p12) br.ret.spnt b0
+{ .mfi
+ ldfe atan2_P12 = [EXP_AD_P2],16
+(p7) fma.s1 atan2_z = atan2_z1_X, atan2_B1X, f0
+ nop.i 999
;;
}
-// p10 ==> x inf y ?
-// p11 ==> x !inf y ?
{ .mfi
- ldfe atan2_P3 = [EXP_AD_P1],16
- fmerge.s atan2_Vmin = f0,atan2_Vmin
+ ldfe atan2_P1 = [EXP_AD_P1],16
+ fcmp.eq.s0 p14,p15=atan2_X,atan2_Y // Dummy for denorm and invalid
nop.i 999
}
-{ .mfi
- ldfe atan2_Pi_by_2 = [EXP_AD_P2],16
- fclass.m.unc p10,p11 = f9, 0x23
- nop.i 999
+{ .mlx
+ ldfe atan2_P22 = [EXP_AD_P2],16
+ movl rsig_near_one = 0x8000000000000001 // signif near 1.0
;;
}
+// p12 ==> x inf y inf
+// p13 ==> x inf y !inf
{ .mmf
- ldfe atan2_P22 = [EXP_AD_P1],16
- ldfe atan2_pi = [EXP_AD_P2],16
- nop.f 999
+ ldfe atan2_P0 = [EXP_AD_P1],16
+ ldfe atan2_pi_by_2 = [EXP_AD_P2],16
+(p10) fclass.m.unc p12,p13 = atan2_Y, 0x23 // x inf, test if y inf
;;
}
{ .mfi
- nop.m 999
- fcmp.eq.s0 p12,p13=f9,f8 // Dummy to catch denormal and invalid
+ ldfe atan2_pi = [EXP_AD_P1],16
+(p6) fma.s1 atan2_w = atan2_wp_Y, atan2_B1sq_Y,f0
nop.i 999
-;;
}
-
-
{ .mfi
- ldfe atan2_pi_by_4 = [EXP_AD_P1],16
-//(atan2_sT) fmerge.ns atan2_sgnXY = atan2_sgnXY, atan2_sgnXY
-(p6) fmerge.ns atan2_sgnXY = atan2_sgnXY, atan2_sgnXY
+ ldfe atan2_pi_by_4 = [EXP_AD_P2],16
+(p7) fma.s1 atan2_w = atan2_wp_X, atan2_B1sq_X,f0
nop.i 999
+;;
}
+
{ .mfi
ldfe atan2_3pi_by_4 = [EXP_AD_P2],16
- fma.s1 atan2_w = atan2_wp, atan2_B1sq,f0
+(p11) fclass.m.unc p9,p0 = atan2_Y, 0x23 // x not inf, test if y inf
nop.i 999
;;
}
-// p12 ==> x inf y inf
-// p13 ==> x inf y !inf
+{ .mfi
+ setf.sig atan2_sig_near_one = rsig_near_one
+(p12) fcmp.gt.unc.s1 p10,p11 = atan2_X,f0 // x inf, y inf, test if x +inf
+ nop.i 999
+}
{ .mfi
nop.m 999
- fmerge.s atan2_z = f0, atan2_z
+(p6) fnma.s1 atan2_gV = atan2_Y, atan2_z, atan2_X
nop.i 999
;;
}
{ .mfi
- nop.m 99
-(p10) fclass.m.unc p12,p13 = f8, 0x23
+ nop.m 999
+ frcpa.s1 atan2_F,p0 = f1, atan2_E
nop.i 999
}
{ .mfi
- nop.m 99
-(p11) fclass.m.unc p14,p15 = f8, 0x23
+ nop.m 999
+(p7) fnma.s1 atan2_gV = atan2_X, atan2_z, atan2_Y
nop.i 999
;;
}
+// p13 ==> x inf y !inf
{ .mfi
nop.m 999
-(p12) fcmp.eq.unc.s1 p10,p11 = atan2_sgnX,f1
- nop.i 99
-;;
+(p13) fcmp.gt.unc.s1 p14,p15 = atan2_X,f0 // x inf, y !inf, test if x +inf
+ nop.i 999
}
-
-
{ .mfb
- mov atan2_GR_sml_exp = 0x1 // Small exponent for making small norm
-(p14) fma.d f8 = atan2_sgnY, atan2_Pi_by_2, f0
-(p14) br.ret.spnt b0
+ nop.m 999
+(p9) fma.d.s0 f8 = atan2_sgnY, atan2_pi_by_2, f0 // +-pi/2 if x !inf, y inf
+(p9) br.ret.spnt b0 // exit if x not inf, y inf, result is +-pi/2
;;
}
-// Make a very small normal in case need to force inexact and underflow
{ .mfi
- setf.exp atan2_sml_norm = atan2_GR_sml_exp
+ nop.m 999
fma.s1 atan2_V13 = atan2_w, atan2_P11, atan2_P10
nop.i 999
}
@@ -626,58 +592,58 @@ __ieee754_atan2:
;;
}
-
{ .mfi
nop.m 999
- fma.s1 atan2_E = atan2_Vmin, atan2_z, atan2_Umax
+ fma.s1 atan2_V11 = atan2_w, atan2_P9, atan2_P8
nop.i 999
}
{ .mfi
nop.m 999
- fnma.s1 atan2_gamma = atan2_Umax, atan2_z, f1
+ fma.s1 atan2_V12 = atan2_w, atan2_w, f0
nop.i 999
;;
}
{ .mfi
nop.m 999
- fma.s1 atan2_V11 = atan2_w, atan2_P9, atan2_P8
+ fma.s1 atan2_V8 = atan2_w, atan2_P7 , atan2_P6
nop.i 999
}
{ .mfi
nop.m 999
- fma.s1 atan2_V12 = atan2_w, atan2_w, f0
+ fma.s1 atan2_W8 = atan2_w, atan2_P19, atan2_P18
nop.i 999
;;
}
{ .mfi
nop.m 999
- fma.s1 atan2_V7 = atan2_w, atan2_P5 , atan2_P4
+ fnma.s1 atan2_alpha = atan2_E, atan2_F, f1
nop.i 999
}
{ .mfi
nop.m 999
- fma.s1 atan2_V8 = atan2_w, atan2_P7 , atan2_P6
+ fnma.s1 atan2_alpha_1 = atan2_E, atan2_F, atan2_two
nop.i 999
;;
}
+
{ .mfi
nop.m 999
- fma.s1 atan2_W7 = atan2_w, atan2_P17, atan2_P16
+ fma.s1 atan2_V7 = atan2_w, atan2_P5 , atan2_P4
nop.i 999
}
{ .mfi
nop.m 999
- fma.s1 atan2_W8 = atan2_w, atan2_P19, atan2_P18
+ fma.s1 atan2_W7 = atan2_w, atan2_P17, atan2_P16
nop.i 999
;;
}
{ .mfi
nop.m 999
- fma.s1 atan2_W3 = atan2_w, atan2_P13, atan2_P12
+ fma.s1 atan2_V4 = atan2_w, atan2_P3 , atan2_P2
nop.i 999
}
{ .mfi
@@ -689,55 +655,55 @@ __ieee754_atan2:
{ .mfi
nop.m 999
- fma.s1 atan2_V3 = atan2_w, atan2_P1 , atan2_P0
+ fma.s1 atan2_V3 = atan2_w, atan2_P1 , atan2_P0
nop.i 999
}
{ .mfi
nop.m 999
- fma.s1 atan2_V4 = atan2_w, atan2_P3 , atan2_P2
+ fma.s1 atan2_W3 = atan2_w, atan2_P13, atan2_P12
nop.i 999
;;
}
{ .mfi
nop.m 999
- fma.s1 atan2_zcub = atan2_z, atan2_w, f0
+ fma.s1 atan2_V10 = atan2_V12, atan2_V13, atan2_V11
nop.i 999
}
{ .mfi
nop.m 999
- fnma.s1 atan2_gV = atan2_Umax, atan2_z, atan2_Vmin
+ fma.s1 atan2_gVF = atan2_gV, atan2_F, f0
nop.i 999
;;
}
{ .mfi
nop.m 999
- frcpa.s1 atan2_F,p15 = f1, atan2_E
+ fma.s1 atan2_alpha_sq = atan2_alpha, atan2_alpha, f0
nop.i 999
}
{ .mfi
nop.m 999
- fma.s1 atan2_V10 = atan2_V12, atan2_V13, atan2_V11
+ fma.s1 atan2_Cp = atan2_alpha, atan2_alpha_1, f1
nop.i 999
;;
}
{ .mfi
nop.m 999
- fma.s1 atan2_V6 = atan2_V12, atan2_V8 , atan2_V7
+ fma.s1 atan2_V9 = atan2_V12, atan2_V12, f0
nop.i 999
}
{ .mfi
nop.m 999
- fma.s1 atan2_V9 = atan2_V12, atan2_V12, f0
+ fma.s1 atan2_W10 = atan2_V12, atan2_P22 , atan2_W11
nop.i 999
;;
}
{ .mfi
nop.m 999
- fma.s1 atan2_W10 = atan2_V12, atan2_P22 , atan2_W11
+ fma.s1 atan2_V6 = atan2_V12, atan2_V8 , atan2_V7
nop.i 999
}
{ .mfi
@@ -749,65 +715,47 @@ __ieee754_atan2:
{ .mfi
nop.m 999
- fma.s1 atan2_W2 = atan2_V12, atan2_W4 , atan2_W3
+ fma.s1 atan2_V2 = atan2_V12, atan2_V4 , atan2_V3
nop.i 999
}
{ .mfi
nop.m 999
- fma.s1 atan2_V2 = atan2_V12, atan2_V4 , atan2_V3
+ fma.s1 atan2_W2 = atan2_V12, atan2_W4 , atan2_W3
nop.i 999
;;
}
-
-// Both X and Y are INF
-// p10 ==> X +
-// p11 ==> X -
-.pred.rel "mutex",p10,p11
-{ .mfb
- nop.m 999
-(p10) fma.d f8 = atan2_sgnY, atan2_pi_by_4, f0
-(p10) br.ret.spnt b0
-}
-{ .mfb
- nop.m 999
-(p11) fma.d f8 = atan2_sgnY, atan2_3pi_by_4, f0
-(p11) br.ret.spnt b0
-;;
-}
-
-
-.pred.rel "mutex",p8,p9,p6
+// p8 ==> y 0 x?
+// p9 ==> y !0 x?
{ .mfi
nop.m 999
- fnma.s1 atan2_alpha = atan2_E, atan2_F, f1
+ fclass.m p8,p9 = atan2_Y, 0x07 // Test for y=0
nop.i 999
}
{ .mfi
nop.m 999
- fnma.s1 atan2_alpha_1 = atan2_E, atan2_F, atan2_two
+ fma.s1 atan2_zcub = atan2_z, atan2_w, f0
nop.i 999
;;
}
-
{ .mfi
nop.m 999
-//(atan2_sT) fmerge.s atan2_P = atan2_Y, atan2_Pi_by_2
-(p6) fmerge.s atan2_P = atan2_Y, atan2_Pi_by_2
+ fma.s1 atan2_alpha_cub = atan2_alpha, atan2_alpha_sq, f0
nop.i 999
}
{ .mfi
nop.m 999
- fma.s1 atan2_gVF = atan2_gV, atan2_F, f0
+ fma.s1 atan2_C = atan2_gVF, atan2_Cp, f0
nop.i 999
;;
}
-
+// p12 ==> y0 x0
+// p13 ==> y0 x!0
{ .mfi
nop.m 999
- fma.s1 atan2_V5 = atan2_V9, atan2_V10, atan2_V6
+(p8) fclass.m.unc p12,p13 = atan2_X, 0x07 // y=0, test if x is 0
nop.i 999
}
{ .mfi
@@ -817,11 +765,9 @@ __ieee754_atan2:
;;
}
-
-
{ .mfi
nop.m 999
-(p8) fmerge.s atan2_P = atan2_sgnY, f0
+ fma.s1 atan2_V5 = atan2_V9, atan2_V10, atan2_V6
nop.i 999
}
{ .mfi
@@ -832,249 +778,214 @@ __ieee754_atan2:
}
-
-
+// p9 ==> y!0 x0
{ .mfi
nop.m 999
-(p9) fmerge.s atan2_P = atan2_sgnY, atan2_pi
+(p9) fclass.m.unc p9,p0 = atan2_X, 0x07 // y not 0, test if x is 0
nop.i 999
+}
+// p10 ==> X +INF, Y +-INF
+{ .mfb
+ nop.m 999
+(p10) fma.d.s0 f8 = atan2_sgnY, atan2_pi_by_4, f0 // x=+inf, y=inf
+(p10) br.ret.spnt b0 // Exit for x=+inf, y=inf, result is +-pi/4
;;
}
-
+.pred.rel "mutex",p11,p14
{ .mfi
nop.m 999
- fma.s1 atan2_alpha_sq = atan2_alpha, atan2_alpha, f0
+(p14) fmerge.s f8 = atan2_sgnY, f0 // x=+inf, y !inf, result +-0
nop.i 999
}
-{ .mfi
+// p11 ==> X -INF, Y +-INF
+{ .mfb
nop.m 999
- fma.s1 atan2_Cp = atan2_alpha, atan2_alpha_1, f1
- nop.i 999
+(p11) fma.d.s0 f8 = atan2_sgnY, atan2_3pi_by_4, f0 // x=-inf, y=inf
+(p11) br.ret.spnt b0 // Exit for x=-inf, y=inf, result is +-3pi/4
;;
}
-
{ .mfi
nop.m 999
- fma.s1 atan2_V1 = atan2_V9, atan2_V5, atan2_V2
+(p13) fcmp.gt.unc.s1 p10,p11 = atan2_X,f0 // x not 0, y=0, test if x>0
nop.i 999
}
-{ .mfi
+{ .mfb
nop.m 999
- fma.s1 atan2_W12 = atan2_V9, atan2_W12, f0
- nop.i 999
+ fma.s1 atan2_d = atan2_alpha_cub, atan2_C, atan2_C
+(p14) br.ret.spnt b0 // Exit if x=+inf, y !inf, result +-0
;;
}
-
-// p13 ==> x inf y !inf
{ .mfi
nop.m 999
- fma.s1 atan2_W1 = atan2_V9, atan2_W5, atan2_W2
+ fma.s1 atan2_W12 = atan2_V9, atan2_W12, f0
nop.i 999
}
-{ .mfi
+{ .mfb
nop.m 999
-(p13) fcmp.eq.unc.s1 p10,p11 = atan2_sgnX,f1
- nop.i 999
+(p9) fma.d.s0 f8 = atan2_sgnY, atan2_pi_by_2, f0 // x=0, y not 0
+(p9) br.ret.spnt b0 // Exit if x=0 and y not 0, result is +-pi/2
;;
}
-
{ .mfi
nop.m 999
- fma.s1 atan2_alpha_cub = atan2_alpha, atan2_alpha_sq, f0
+ fma.s1 atan2_V1 = atan2_V9, atan2_V5, atan2_V2
nop.i 999
}
-{ .mfi
+{ .mfb
nop.m 999
- fma.s1 atan2_C = atan2_gVF, atan2_Cp, f0
- nop.i 999
+ fma.s1 atan2_W1 = atan2_V9, atan2_W5, atan2_W2
+(p12) br.spnt ATAN2_ERROR // Branch if x=0 and y=0
;;
}
-.pred.rel "mutex",p10,p11
-// x inf y !inf
-{ .mfb
+{ .mfi
nop.m 999
-(p10) fmerge.s f8 = atan2_sgnY, f0
-(p10) br.ret.spnt b0
+(p10) fmerge.s f8 = atan2_sgnY, f0 // +-0 if x>0, y=0
+ nop.i 999
}
{ .mfb
nop.m 999
-(p11) fma.d f8 = atan2_sgnY, atan2_pi, f0
-(p11) br.ret.spnt b0
+(p11) fma.d.s0 f8 = atan2_sgnY, atan2_pi, f0 // +-pi if x<0, y=0
+(p13) br.ret.spnt b0 // Exit if x!0 and y=0
;;
}
-
-// p10 ==> y 0 x?
-// p11 ==> y !0 x?
{ .mfi
nop.m 999
- fclass.m.unc p10,p11 = f8, 0x07
+ fma.s1 atan2_pd = atan2_P0, atan2_d, f0
nop.i 999
-;;
}
-
{ .mfi
nop.m 999
-(p8) fmerge.s atan2_sml_norm = atan2_sgnY, atan2_sml_norm
+ fma.s1 atan2_dsq = atan2_d, atan2_d, f0
nop.i 999
;;
}
+
{ .mfi
nop.m 999
- fma.s1 atan2_Pp = atan2_W12, atan2_W1, atan2_V1
+ fmerge.se atan2_near_one = f1, atan2_sig_near_one // Const ~1.0
nop.i 999
}
{ .mfi
nop.m 999
- fma.s1 atan2_d = atan2_alpha_cub, atan2_C, atan2_C
+ fma.s1 atan2_Pp = atan2_W12, atan2_W1, atan2_V1
nop.i 999
;;
}
-// p12 ==> y0 x0
-// p13 ==> y0 x!0
-// p14 ==> y!0 x0
-// p15 ==> y!0 x!0
-{ .mfi
- nop.m 999
-(p10) fclass.m.unc p12,p13 = f9, 0x07
- nop.i 999
-}
+// p8 true if no swap and X positive
+// p9 true if no swap and X negative
+// both are false if swap is true
{ .mfi
nop.m 999
-(p11) fclass.m.unc p14,p15 = f9, 0x07
+(p7) fcmp.ge.unc.s1 p8,p9 = atan2_X,f0
nop.i 999
-;;
}
-
-
-
-
{ .mfb
nop.m 999
-(p13) fcmp.eq.unc.s1 p10,p11 = atan2_sgnX,f1
-(p12) br.spnt ATAN2_ERROR
+(p15) fma.d.s0 f8 = atan2_sgnY, atan2_pi, f0
+(p15) br.ret.spnt b0 // Exit if x=-inf, y !inf, result +-pi
;;
}
-
-
{ .mfi
nop.m 999
- fma.s1 atan2_pd = atan2_P0, atan2_d, f0
+ fma.s1 atan2_sgn_pi_by_2 = atan2_pi_by_2, atan2_sgnY, f0
nop.i 999
}
{ .mfi
nop.m 999
- fma.s1 atan2_dsq = atan2_d, atan2_d, f0
+ fma.s1 atan2_A_lo = atan2_pd, atan2_dsq, atan2_d
nop.i 999
;;
}
+
{ .mfi
nop.m 999
- fma.s1 atan2_A_hi = atan2_zcub, atan2_Pp, atan2_z
+ fma.s1 atan2_sgn_pi = atan2_pi, atan2_sgnY, f0
nop.i 999
}
-{ .mfb
+{ .mfi
nop.m 999
-(p14) fma.d f8 = atan2_sgnY, atan2_Pi_by_2, f0
-(p14) br.ret.spnt b0
+ fma.s1 atan2_A_hi = atan2_zcub, atan2_Pp, atan2_z
+ nop.i 999
;;
}
-
-{ .mfb
- nop.m 999
-(p10) fmerge.s f8 = atan2_sgnY, f0
-(p10) br.ret.spnt b0
-}
-{ .mfb
+// For |Y| <= |X| and X > 0, force inexact in case A_lo is zero
+{ .mfi
nop.m 999
-(p11) fma.d f8 = atan2_sgnY, atan2_pi, f0
-(p11) br.ret.spnt b0
+(p8) fmpy.s0 atan2_tmp = atan2_P22, atan2_P22
+ nop.i 999
;;
}
-
-
{ .mfi
nop.m 999
- fma.s1 atan2_A_lo = atan2_pd, atan2_dsq, atan2_d
+ fma.s1 atan2_A = atan2_A_hi, f1, atan2_A_lo
nop.i 999
-;;
}
-
-
+// For |Y| <= |X| and X > 0, result is A_hi + A_lo
{ .mfi
nop.m 999
- fma.s1 atan2_A = atan2_A_hi, f1, atan2_A_lo
+(p8) fma.d.s0 f8 = atan2_A_hi, f1, atan2_A_lo
nop.i 999
;;
}
-// Force inexact and possibly underflow if very small results
+.pred.rel "mutex",p6,p9
+// We perturb A by multiplying by 1.0+1ulp as we produce the result
+// in order to get symmetrically rounded results in directed rounding modes.
+// If we don't do this, there are a few cases where the trailing 11 bits of
+// the significand of the result, before converting to double, are zero. These
+// cases do not round symmetrically in round to +infinity or round to -infinity.
+// The perturbation also ensures that the inexact flag is set.
+// For |Y| > |X|, result is +- pi/2 - (A_hi + A_lo)
{ .mfi
nop.m 999
-(p8) fma.d atan2_FR_tmp = atan2_sgnXY, atan2_A, atan2_sml_norm
+(p6) fnma.d.s0 f8 = atan2_A, atan2_near_one, atan2_sgn_pi_by_2
nop.i 999
}
+// For |Y| <= |X|, and X < 0, result is +- pi + (A_hi + A_lo)
{ .mfb
nop.m 999
- fma.d f8 = atan2_sgnXY, atan2_A, atan2_P
- br.ret.sptk b0
+(p9) fma.d.s0 f8 = atan2_A, atan2_near_one, atan2_sgn_pi
+ br.ret.sptk b0
;;
}
ATAN2_ERROR:
-
+// Here if x=0 and y=0
{ .mfi
nop.m 999
- fcmp.eq.unc.s1 p10,p11 = atan2_sgnX,f1
+ fclass.m p10,p11 = atan2_X,0x05 // Test if x=+0
nop.i 999
}
;;
{ .mfi
- mov atan2_GR_tag = 37
-(p10) fmerge.s f10 = atan2_sgnY, f0
- nop.i 999
+ mov atan2_GR_tag = 37
+(p10) fmerge.s f10 = atan2_sgnY, f0 // x=+0, y=0
+ nop.i 999
}
{ .mfi
nop.m 999
-(p11) fma.d f10 = atan2_sgnY, atan2_pi, f0
+(p11) fma.d.s0 f10 = atan2_sgnY, atan2_pi, f0 // x=-0, y=0
nop.i 999
;;
}
-.endp atan2#
-ASM_SIZE_DIRECTIVE(atan2#)
-
-
-// Stack operations when calling error support.
-// (1) (2) (3) (call) (4)
-// sp -> + psp -> + psp -> + sp -> +
-// | | | |
-// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8
-// | | | |
-// | <-GR_Y Y2->| Y2 ->| <- GR_Y |
-// | | | |
-// | | <- GR_X X1 ->| |
-// | | | |
-// sp-64 -> + sp -> + sp -> + +
-// save ar.pfs save b0 restore gp
-// save gp restore ar.pfs
+GLOBAL_IEEE754_END(atan2)
-
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
// (1)
{ .mfi
@@ -1102,19 +1013,19 @@ __libm_error_region:
.body
// (3)
{ .mib
- stfd [GR_Parameter_X] = f9 // STORE Parameter 1 on stack
+ stfd [GR_Parameter_X] = f9 // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ nop.b 0
}
{ .mib
- stfd [GR_Parameter_Y] = f10 // STORE Parameter 3 on stack
+ stfd [GR_Parameter_Y] = f10 // STORE Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
- nop.m 0
- nop.m 0
add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
// (4)
@@ -1130,8 +1041,7 @@ __libm_error_region:
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_atan2f.S b/sysdeps/ia64/fpu/e_atan2f.S
index 03a4fed82f..c483a7ad34 100644
--- a/sysdeps/ia64/fpu/e_atan2f.S
+++ b/sysdeps/ia64/fpu/e_atan2f.S
@@ -1,10 +1,10 @@
.file "atan2f.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
-// Contributed 6/1/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,18 +35,21 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
// History
//==============================================================
-// 6/01/00 Initial version
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 06/01/00 Initial version
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
-// 8/17/00 Changed predicate register macro-usage to direct predicate
+// 08/17/00 Changed predicate register macro-usage to direct predicate
// names due to an assembler bug.
-// 1/05/01 Fixed flag settings for denormal input.
-// 1/19/01 Added documentation
-// 1/30/01 Improved speed
+// 01/05/01 Fixed flag settings for denormal input.
+// 01/19/01 Added documentation
+// 01/30/01 Improved speed
+// 02/06/02 Corrected .section statement
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/06/03 Reordered header: .section, .global, .proc, .align
// Description
//=========================================
@@ -226,7 +229,6 @@
// atan2f(+-0/+-0) sets single error tag to 38
// These are domain errors.
-#include "libm_support.h"
//
// Assembly macros
@@ -324,22 +326,20 @@ atan2f_poly_atan_U = f88
//atan2f_Pred_Xneg = p9 // x < 0
-.data
+RODATA
.align 16
-atan2f_coef_table1:
-ASM_TYPE_DIRECTIVE(atan2f_coef_table1,@object)
+LOCAL_OBJECT_START(atan2f_coef_table1)
data8 0xBFD5555512191621 // p1
data8 0x3F522E5D33BC9BAA // p10
data8 0xBFA6E10BA401393F // p7
data8 0x3FB142A73D7C54E3 // p6
data8 0xBFC2473C5145EE38 // p3
data8 0x3FC9997E7AFBFF4E // p2
-ASM_SIZE_DIRECTIVE(atan2f_coef_table1)
+LOCAL_OBJECT_END(atan2f_coef_table1)
-atan2f_coef_table2:
-ASM_TYPE_DIRECTIVE(atan2f_coef_table2,@object)
+LOCAL_OBJECT_START(atan2f_coef_table2)
data8 0xBF7DEAADAA336451 // p9
data8 0x3F97105B4160F86B // p8
data8 0xBFB68EED6A8CFA32 // p5
@@ -348,29 +348,12 @@ data8 0x3ff921fb54442d18 // pi/2
data8 0x400921fb54442d18 // pi
data8 0x3fe921fb54442d18 // pi/4
data8 0x4002d97c7f3321d2 // 3pi/4
-ASM_SIZE_DIRECTIVE(atan2f_coef_table2)
-
+LOCAL_OBJECT_END(atan2f_coef_table2)
-.global atan2f
-#ifdef _LIBC
-.global __atan2f
-.global __ieee754_atan2f
-#endif
-
-.text
-.align 32
-
-atan2f:
-.proc atan2f
-#ifdef _LIBC
-.proc __atan2f
-__atan2f:
-.proc __ieee754_atan2f
-__ieee754_atan2f:
-#endif
-
+.section .text
+GLOBAL_IEEE754_ENTRY(atan2f)
{ .mfi
alloc r32 = ar.pfs,1,5,4,0
@@ -724,7 +707,7 @@ ATAN2F_XY_INF_NAN_ZERO:
}
{ .mfb
nop.m 999
-(p10) fma.s f8 = f9,f8,f0 // Result quietized y if y is nan
+(p10) fma.s.s0 f8 = f9,f8,f0 // Result quietized y if y is nan
(p10) br.ret.spnt b0 // Exit if y is nan
}
;;
@@ -737,7 +720,7 @@ ATAN2F_XY_INF_NAN_ZERO:
}
{ .mfb
nop.m 999
-(p12) fnorm.s f8 = f9 // Result quietized x if x is nan, y not nan
+(p12) fnorm.s.s0 f8 = f9 // Result quietized x if x is nan, y not nan
(p12) br.ret.spnt b0 // Exit if x is nan, y not nan
}
;;
@@ -757,7 +740,7 @@ ATAN2F_XY_INF_NAN_ZERO:
}
{ .mfb
nop.m 999
-(p7) fma.s f8 = atan2f_sgn_Y, atan2f_const_piby4,f0 // Result +-pi/4
+(p7) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_piby4,f0 // Result +-pi/4
(p7) br.ret.spnt b0 // Exit if x +inf and y inf
}
;;
@@ -790,19 +773,19 @@ ATAN2F_XY_INF_NAN_ZERO:
}
{ .mfb
nop.m 999
-(p13) fma.s f8 = atan2f_sgn_Y, atan2f_const_piby2,f0 // Result +-pi/2
+(p13) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_piby2,f0 // Result +-pi/2
(p13) br.ret.spnt b0 // Exit if x not -inf and y inf
}
;;
{ .mfi
nop.m 999
-(p14) fma.s f8 = atan2f_sgn_Y, atan2f_const_3piby4,f0 // Result +-3pi/4
+(p14) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_3piby4,f0 // Result +-3pi/4
nop.i 999
}
{ .mfb
nop.m 999
-(p15) fma.s f8 = atan2f_sgn_Y, atan2f_const_pi,f0 // Result +-pi
+(p15) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_pi,f0 // Result +-pi
(p11) br.ret.spnt b0 // Exit if x -inf
}
;;
@@ -829,31 +812,28 @@ ATAN2F_XY_INF_NAN_ZERO:
}
{ .mfb
nop.m 999
-(p9) fma.s f8 = atan2f_sgn_Y, atan2f_const_pi,f0 // x < 0, y 0, result +-pi
+(p9) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_pi,f0 // x < 0, y 0, result +-pi
(p10) br.cond.spnt __libm_error_region // Branch if x zero and y zero
}
;;
{ .mfb
nop.m 999
-(p11) fma.s f8 = atan2f_sgn_Y, atan2f_const_piby2,f0 // x zero, y not zero
+(p11) fma.s.s0 f8 = atan2f_sgn_Y, atan2f_const_piby2,f0 // x zero, y not zero
br.ret.sptk b0 // Final special case exit
}
;;
-.endp atan2f
-ASM_SIZE_DIRECTIVE(atan2f)
-
+GLOBAL_IEEE754_END(atan2f)
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
mov GR_Parameter_TAG = 38
fclass.m p10,p11 = f9,0x5 // @zero | @pos
;;
(p10) fmerge.s f10 = f8, f0
-(p11) fma.s f10 = atan2f_sgn_Y, atan2f_const_pi,f0
+(p11) fma.s.s0 f10 = atan2f_sgn_Y, atan2f_const_pi,f0
;;
{ .mfi
@@ -913,8 +893,7 @@ __libm_error_region:
}
;;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_atanh.S b/sysdeps/ia64/fpu/e_atanh.S
new file mode 100644
index 0000000000..7ddc3e3023
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_atanh.S
@@ -0,0 +1,1069 @@
+.file "atanh.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// ==============================================================
+// History
+// ==============================================================
+// 05/03/01 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/06/03 Reordered header: .section, .global, .proc, .align
+// 05/26/03 Improved performance, fixed to handle unorms
+//
+// API
+// ==============================================================
+// double atanh(double)
+//
+// Overview of operation
+// ==============================================================
+//
+// There are 7 paths:
+// 1. x = +/-0.0
+// Return atanh(x) = +/-0.0
+//
+// 2. 0.0 < |x| < 1/4
+// Return atanh(x) = Po2l(x),
+// where Po2l(x) = (((((((((C9*x^2 + C8)*x^2 + C7)*x^2 + C6)*x^2 +
+// C5)*x^2 + C4)*x^2 + C3)*x^2 + C2)*x^2 + C1)* x^2 + C0)*x^3 + x
+// 3. 1/4 <= |x| < 1
+// Return atanh(x) = sign(x) * log((1 + |x|)/(1 - |x|))/2
+// To compute (1 + |x|)/(1 - |x|) a modified Newton-Raphson method is used
+// (3 iterations)
+// See below for the algorithm description of the log function.
+//
+// 4. |x| = 1
+// Return atanh(x) = sign(x) * +INF
+//
+// 5. 1 < |x| <= +INF
+// Return atanh(x) = QNaN
+//
+// 6. x = [S,Q]NaN
+// Return atanh(x) = QNaN
+//
+// 7. x = denormal
+// Return atanh(x) = x
+//
+//==============================================================
+// Algorithm Description for log(x) function
+// Below we are using the fact that inequality x - 1.0 > 2^(-6) is always true
+// for this atanh implementation
+//
+// Consider x = 2^N 1.f1 f2 f3 f4...f63
+// Log(x) = log(x * frcpa(x) / frcpa(x))
+// = log(x * frcpa(x)) + log(1/frcpa(x))
+// = log(x * frcpa(x)) - log(frcpa(x))
+//
+// frcpa(x) = 2^-N * frcpa(1.f1 f2 ... f63)
+//
+// -log(frcpa(x)) = -log(C)
+// = -log(2^-N) - log(frcpa(1.f1 f2 ... f63))
+//
+// -log(frcpa(x)) = -log(C)
+// = N*log2 - log(frcpa(1.f1 f2 ... f63))
+//
+//
+// Log(x) = log(1/frcpa(x)) + log(frcpa(x) x)
+//
+// Log(x) = N*log2 + log(1./frcpa(1.f1 f2 ... f63)) + log(x * frcpa(x))
+// Log(x) = N*log2 + T + log(frcpa(x) x)
+//
+// Log(x) = N*log2 + T + log(C * x)
+//
+// C * x = 1 + r
+//
+// Log(x) = N*log2 + T + log(1 + r)
+// Log(x) = N*log2 + T + Series(r - r^2/2 + r^3/3 - r^4/4 + ...)
+//
+// 1.f1 f2 ... f8 has 256 entries.
+// They are 1 + k/2^8, k = 0 ... 255
+// These 256 values are the table entries.
+//
+// Implementation
+//==============================================================
+// C = frcpa(x)
+// r = C * x - 1
+//
+// Form rseries = r + P1*r^2 + P2*r^3 + P3*r^4 + P4*r^5 + P5*r^6
+//
+// x = f * 2^N where f is 1.f_1f_2f_3...f_63
+// Nfloat = float(N) where N is the true unbiased exponent
+// pre_index = f_1f_2....f_8
+// index = pre_index * 16
+// get the dxt table entry at index + offset = T
+//
+// result = (T + Nfloat * log(2)) + rseries
+//
+// The T table is calculated as follows
+// Form x_k = 1 + k/2^8 where k goes from 0... 255
+// y_k = frcpa(x_k)
+// log(1/y_k) in quad and round to double-extended
+//
+//
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input
+// f32 -> f77
+
+// General registers used:
+// r14 -> r27, r33 -> r39
+
+// Predicate registers used:
+// p6 -> p14
+
+// p10, p11 to indicate whether the argument is negative (p10) or not (p11)
+// p12 to filter out case when x = [Q,S]NaN or +/-0
+// p13 to filter out case when x = denormal
+// p6, p7 to filter out case when |x| >= 1
+// p8 to filter out case when |x| < 1/4
+
+// Assembly macros
+//==============================================================
+Data2Ptr = r14 // DataPtr + 0x50; post-incremented while loading C9, C8, ...
+Data3Ptr = r15 // DataPtr + 0xC0; first load through it fetches C2
+RcpTablePtr = r16 // set on the main path to (advanced) DataPtr + 0xB0
+rExpbMask = r17 // 0x1ffff, masks the biased exponent out of rArgSExpb
+rBias = r18 // 0xffff, biased exponent of 1.0
+rNearZeroBound = r19 // 0xfffd, biased exponent of 1/4
+rArgSExpb = r20 // getf.exp of the input (sign and biased exponent)
+rArgExpb = r21 // rArgSExpb & rExpbMask: biased exponent of the input
+rSExpb = r22
+rExpb = r23
+rSig = r24
+rN = r25
+rInd = r26
+DataPtr = r27 // address of atanh_data, loaded through @ltoff
+
+GR_SAVE_B0 = r33
+GR_SAVE_GP = r34
+GR_SAVE_PFS = r35
+
+GR_Parameter_X = r36
+GR_Parameter_Y = r37
+GR_Parameter_RESULT = r38
+atanh_GR_tag = r39
+
+//==============================================================
+fAbsX = f32
+fOneMx = f33 // 1 - x; overwritten with 1 + x (= 1 - |x|) when x < 0
+fOnePx = f34 // 1 + x; overwritten with 1 - x (= 1 + |x|) when x < 0
+fY = f35
+fR = f36
+fR2 = f37
+fR3 = f38
+fRcp = f39
+fY4Rcp = f40 // r2 * fOnePx, r2 being the second NR iterate (fRcp2)
+fRcp0 = f41 // frcpa start value for x >= 0 (p11): approx. 1/fOneMx
+fRcp0n = f42 // frcpa start value for x < 0 (p10): approx. 1/fOnePx
+fRcp1 = f43 // NR iteration #1: t = 1 - r0*d, then r1 = r0 + r0*t
+fRcp2 = f44 // NR iteration #2: t = 1 - r1*d, then r2 = r1 + r1*t
+fRcp3 = f45 // NR iteration #3: t = 1 - r2*d
+fN4Cvt = f46
+fN = f47
+fY2 = f48
+fLog2 = f49 // 0.5*log(2), loaded from atanh_data
+fLogT = f50
+fLogT_N = f51
+fX2 = f52 // x^2
+fX3 = f53 // x^3 (fX2 * fNormX)
+fX4 = f54 // x^4
+fX8 = f55
+fP0 = f56
+fP5 = f57 // P5 from atanh_data
+fP4 = f58 // P4 from atanh_data
+fP3 = f59 // P3 from atanh_data
+fP2 = f60 // P2 from atanh_data
+fP1 = f61 // P1 from atanh_data
+fNormX = f62 // normalized copy of the input (fnorm.s1 of f8)
+fC9 = f63 // C9 from atanh_data_2
+fC8 = f64 // C8 from atanh_data_2
+fC7 = f65 // C7 from atanh_data_2
+fC6 = f66 // C6 from atanh_data_2
+fC5 = f67 // C5 from atanh_data_2
+fC4 = f68
+fC3 = f69
+fC2 = f70 // C2 from atanh_data_2, loaded through Data3Ptr
+fC1 = f71
+fC0 = f72
+fP98 = f73
+fP76 = f74
+fP54 = f75
+fP32 = f76
+fP10 = f77
+
+// Data tables
+//==============================================================
+RODATA
+.align 16
+
+LOCAL_OBJECT_START(atanh_data)
+data8 0xBFC5555DA7212371 // P5 (coefficient of r^6 in rseries)
+data8 0x3FC999A19EEF5826 // P4 (coefficient of r^5 in rseries)
+data8 0xBFCFFFFFFFFEF009 // P3 (coefficient of r^4 in rseries)
+data8 0x3FD555555554ECB2 // P2 (coefficient of r^3 in rseries)
+data8 0xBFE0000000000000 // P1 = -0.5 (coefficient of r^2 in rseries)
+data8 0x0000000000000000 // pad (keeps the next 16-byte entry at offset 0x30)
+data8 0xb17217f7d1cf79ac , 0x00003ffd // 0.5*log(2)
+data8 0x0000000000000000 , 0x00000000 // pad to eliminate bank conflicts; object totals 0x50 bytes
+LOCAL_OBJECT_END(atanh_data)
+
+LOCAL_OBJECT_START(atanh_data_2)
+data8 0x8649FB89D3AD51FB , 0x00003FFB // C9 (Po2l coefficients, highest order first)
+data8 0xCC10AABEF160077A , 0x00003FFA // C8
+data8 0xF1EDB99AC0819CE2 , 0x00003FFA // C7
+data8 0x8881E53A809AD24D , 0x00003FFB // C6
+data8 0x9D8A116EF212F271 , 0x00003FFB // C5
+data8 0xBA2E8A6D1D756453 , 0x00003FFB // C4
+data8 0xE38E38E7A0945692 , 0x00003FFB // C3
+data8 0x924924924536891A , 0x00003FFC // C2 (at DataPtr + 0xC0, the initial Data3Ptr)
+data8 0xCCCCCCCCCCD08D51 , 0x00003FFC // C1
+data8 0xAAAAAAAAAAAAAA0C , 0x00003FFD // C0
+LOCAL_OBJECT_END(atanh_data_2)
+
+
+LOCAL_OBJECT_START(atanh_data_3)
+data8 0x80200aaeac44ef38 , 0x00003ff5 // log(1/frcpa(1+0/2^8))/2
+//
+data8 0xc09090a2c35aa070 , 0x00003ff6 // log(1/frcpa(1+1/2^8))/2
+data8 0xa0c94fcb41977c75 , 0x00003ff7 // log(1/frcpa(1+2/2^8))/2
+data8 0xe18b9c263af83301 , 0x00003ff7 // log(1/frcpa(1+3/2^8))/2
+data8 0x8d35c8d6399c30ea , 0x00003ff8 // log(1/frcpa(1+4/2^8))/2
+data8 0xadd4d2ecd601cbb8 , 0x00003ff8 // log(1/frcpa(1+5/2^8))/2
+//
+data8 0xce95403a192f9f01 , 0x00003ff8 // log(1/frcpa(1+6/2^8))/2
+data8 0xeb59392cbcc01096 , 0x00003ff8 // log(1/frcpa(1+7/2^8))/2
+data8 0x862c7d0cefd54c5d , 0x00003ff9 // log(1/frcpa(1+8/2^8))/2
+data8 0x94aa63c65e70d499 , 0x00003ff9 // log(1/frcpa(1+9/2^8))/2
+data8 0xa54a696d4b62b382 , 0x00003ff9 // log(1/frcpa(1+10/2^8))/2
+//
+data8 0xb3e4a796a5dac208 , 0x00003ff9 // log(1/frcpa(1+11/2^8))/2
+data8 0xc28c45b1878340a9 , 0x00003ff9 // log(1/frcpa(1+12/2^8))/2
+data8 0xd35c55f39d7a6235 , 0x00003ff9 // log(1/frcpa(1+13/2^8))/2
+data8 0xe220f037b954f1f5 , 0x00003ff9 // log(1/frcpa(1+14/2^8))/2
+data8 0xf0f3389b036834f3 , 0x00003ff9 // log(1/frcpa(1+15/2^8))/2
+//
+data8 0xffd3488d5c980465 , 0x00003ff9 // log(1/frcpa(1+16/2^8))/2
+data8 0x87609ce2ed300490 , 0x00003ffa // log(1/frcpa(1+17/2^8))/2
+data8 0x8ede9321e8c85927 , 0x00003ffa // log(1/frcpa(1+18/2^8))/2
+data8 0x96639427f2f8e2f4 , 0x00003ffa // log(1/frcpa(1+19/2^8))/2
+data8 0x9defad3e8f73217b , 0x00003ffa // log(1/frcpa(1+20/2^8))/2
+//
+data8 0xa582ebd50097029c , 0x00003ffa // log(1/frcpa(1+21/2^8))/2
+data8 0xac06dbe75ab80fee , 0x00003ffa // log(1/frcpa(1+22/2^8))/2
+data8 0xb3a78449b2d3ccca , 0x00003ffa // log(1/frcpa(1+23/2^8))/2
+data8 0xbb4f79635ab46bb2 , 0x00003ffa // log(1/frcpa(1+24/2^8))/2
+data8 0xc2fec93a83523f3f , 0x00003ffa // log(1/frcpa(1+25/2^8))/2
+//
+data8 0xc99af2eaca4c4571 , 0x00003ffa // log(1/frcpa(1+26/2^8))/2
+data8 0xd1581106472fa653 , 0x00003ffa // log(1/frcpa(1+27/2^8))/2
+data8 0xd8002560d4355f2e , 0x00003ffa // log(1/frcpa(1+28/2^8))/2
+data8 0xdfcb43b4fe508632 , 0x00003ffa // log(1/frcpa(1+29/2^8))/2
+data8 0xe67f6dff709d4119 , 0x00003ffa // log(1/frcpa(1+30/2^8))/2
+//
+data8 0xed393b1c22351280 , 0x00003ffa // log(1/frcpa(1+31/2^8))/2
+data8 0xf5192bff087bcc35 , 0x00003ffa // log(1/frcpa(1+32/2^8))/2
+data8 0xfbdf4ff6dfef2fa3 , 0x00003ffa // log(1/frcpa(1+33/2^8))/2
+data8 0x81559a97f92f9cc7 , 0x00003ffb // log(1/frcpa(1+34/2^8))/2
+data8 0x84be72bce90266e8 , 0x00003ffb // log(1/frcpa(1+35/2^8))/2
+//
+data8 0x88bc74113f23def2 , 0x00003ffb // log(1/frcpa(1+36/2^8))/2
+data8 0x8c2ba3edf6799d11 , 0x00003ffb // log(1/frcpa(1+37/2^8))/2
+data8 0x8f9dc92f92ea08b1 , 0x00003ffb // log(1/frcpa(1+38/2^8))/2
+data8 0x9312e8f36efab5a7 , 0x00003ffb // log(1/frcpa(1+39/2^8))/2
+data8 0x968b08643409ceb6 , 0x00003ffb // log(1/frcpa(1+40/2^8))/2
+//
+data8 0x9a062cba08a1708c , 0x00003ffb // log(1/frcpa(1+41/2^8))/2
+data8 0x9d845b3abf95485c , 0x00003ffb // log(1/frcpa(1+42/2^8))/2
+data8 0xa06fd841bc001bb4 , 0x00003ffb // log(1/frcpa(1+43/2^8))/2
+data8 0xa3f3a74652fbe0db , 0x00003ffb // log(1/frcpa(1+44/2^8))/2
+data8 0xa77a8fb2336f20f5 , 0x00003ffb // log(1/frcpa(1+45/2^8))/2
+//
+data8 0xab0497015d28b0a0 , 0x00003ffb // log(1/frcpa(1+46/2^8))/2
+data8 0xae91c2be6ba6a615 , 0x00003ffb // log(1/frcpa(1+47/2^8))/2
+data8 0xb189d1b99aebb20b , 0x00003ffb // log(1/frcpa(1+48/2^8))/2
+data8 0xb51cced5de9c1b2c , 0x00003ffb // log(1/frcpa(1+49/2^8))/2
+data8 0xb819bee9e720d42f , 0x00003ffb // log(1/frcpa(1+50/2^8))/2
+//
+data8 0xbbb2a0947b093a5d , 0x00003ffb // log(1/frcpa(1+51/2^8))/2
+data8 0xbf4ec1505811684a , 0x00003ffb // log(1/frcpa(1+52/2^8))/2
+data8 0xc2535bacfa8975ff , 0x00003ffb // log(1/frcpa(1+53/2^8))/2
+data8 0xc55a3eafad187eb8 , 0x00003ffb // log(1/frcpa(1+54/2^8))/2
+data8 0xc8ff2484b2c0da74 , 0x00003ffb // log(1/frcpa(1+55/2^8))/2
+//
+data8 0xcc0b1a008d53ab76 , 0x00003ffb // log(1/frcpa(1+56/2^8))/2
+data8 0xcfb6203844b3209b , 0x00003ffb // log(1/frcpa(1+57/2^8))/2
+data8 0xd2c73949a47a19f5 , 0x00003ffb // log(1/frcpa(1+58/2^8))/2
+data8 0xd5daae18b49d6695 , 0x00003ffb // log(1/frcpa(1+59/2^8))/2
+data8 0xd8f08248cf7e8019 , 0x00003ffb // log(1/frcpa(1+60/2^8))/2
+//
+data8 0xdca7749f1b3e540e , 0x00003ffb // log(1/frcpa(1+61/2^8))/2
+data8 0xdfc28e033aaaf7c7 , 0x00003ffb // log(1/frcpa(1+62/2^8))/2
+data8 0xe2e012a5f91d2f55 , 0x00003ffb // log(1/frcpa(1+63/2^8))/2
+data8 0xe600064ed9e292a8 , 0x00003ffb // log(1/frcpa(1+64/2^8))/2
+data8 0xe9226cce42b39f60 , 0x00003ffb // log(1/frcpa(1+65/2^8))/2
+//
+data8 0xec4749fd97a28360 , 0x00003ffb // log(1/frcpa(1+66/2^8))/2
+data8 0xef6ea1bf57780495 , 0x00003ffb // log(1/frcpa(1+67/2^8))/2
+data8 0xf29877ff38809091 , 0x00003ffb // log(1/frcpa(1+68/2^8))/2
+data8 0xf5c4d0b245cb89be , 0x00003ffb // log(1/frcpa(1+69/2^8))/2
+data8 0xf8f3afd6fcdef3aa , 0x00003ffb // log(1/frcpa(1+70/2^8))/2
+//
+data8 0xfc2519756be1abc7 , 0x00003ffb // log(1/frcpa(1+71/2^8))/2
+data8 0xff59119f503e6832 , 0x00003ffb // log(1/frcpa(1+72/2^8))/2
+data8 0x8147ce381ae0e146 , 0x00003ffc // log(1/frcpa(1+73/2^8))/2
+data8 0x82e45f06cb1ad0f2 , 0x00003ffc // log(1/frcpa(1+74/2^8))/2
+data8 0x842f5c7c573cbaa2 , 0x00003ffc // log(1/frcpa(1+75/2^8))/2
+//
+data8 0x85ce471968c8893a , 0x00003ffc // log(1/frcpa(1+76/2^8))/2
+data8 0x876e8305bc04066d , 0x00003ffc // log(1/frcpa(1+77/2^8))/2
+data8 0x891012678031fbb3 , 0x00003ffc // log(1/frcpa(1+78/2^8))/2
+data8 0x8a5f1493d766a05f , 0x00003ffc // log(1/frcpa(1+79/2^8))/2
+data8 0x8c030c778c56fa00 , 0x00003ffc // log(1/frcpa(1+80/2^8))/2
+//
+data8 0x8da85df17e31d9ae , 0x00003ffc // log(1/frcpa(1+81/2^8))/2
+data8 0x8efa663e7921687e , 0x00003ffc // log(1/frcpa(1+82/2^8))/2
+data8 0x90a22b6875c6a1f8 , 0x00003ffc // log(1/frcpa(1+83/2^8))/2
+data8 0x91f62cc8f5d24837 , 0x00003ffc // log(1/frcpa(1+84/2^8))/2
+data8 0x93a06cfc3857d980 , 0x00003ffc // log(1/frcpa(1+85/2^8))/2
+//
+data8 0x94f66d5e6fd01ced , 0x00003ffc // log(1/frcpa(1+86/2^8))/2
+data8 0x96a330156e6772f2 , 0x00003ffc // log(1/frcpa(1+87/2^8))/2
+data8 0x97fb3582754ea25b , 0x00003ffc // log(1/frcpa(1+88/2^8))/2
+data8 0x99aa8259aad1bbf2 , 0x00003ffc // log(1/frcpa(1+89/2^8))/2
+data8 0x9b0492f6227ae4a8 , 0x00003ffc // log(1/frcpa(1+90/2^8))/2
+//
+data8 0x9c5f8e199bf3a7a5 , 0x00003ffc // log(1/frcpa(1+91/2^8))/2
+data8 0x9e1293b9998c1daa , 0x00003ffc // log(1/frcpa(1+92/2^8))/2
+data8 0x9f6fa31e0b41f308 , 0x00003ffc // log(1/frcpa(1+93/2^8))/2
+data8 0xa0cda11eaf46390e , 0x00003ffc // log(1/frcpa(1+94/2^8))/2
+data8 0xa22c8f029cfa45aa , 0x00003ffc // log(1/frcpa(1+95/2^8))/2
+//
+data8 0xa3e48badb7856b34 , 0x00003ffc // log(1/frcpa(1+96/2^8))/2
+data8 0xa5459a0aa95849f9 , 0x00003ffc // log(1/frcpa(1+97/2^8))/2
+data8 0xa6a79c84480cfebd , 0x00003ffc // log(1/frcpa(1+98/2^8))/2
+data8 0xa80a946d0fcb3eb2 , 0x00003ffc // log(1/frcpa(1+99/2^8))/2
+data8 0xa96e831a3ea7b314 , 0x00003ffc // log(1/frcpa(1+100/2^8))/2
+//
+data8 0xaad369e3dc544e3b , 0x00003ffc // log(1/frcpa(1+101/2^8))/2
+data8 0xac92e9588952c815 , 0x00003ffc // log(1/frcpa(1+102/2^8))/2
+data8 0xadfa035aa1ed8fdc , 0x00003ffc // log(1/frcpa(1+103/2^8))/2
+data8 0xaf6219eae1ad6e34 , 0x00003ffc // log(1/frcpa(1+104/2^8))/2
+data8 0xb0cb2e6d8160f753 , 0x00003ffc // log(1/frcpa(1+105/2^8))/2
+//
+data8 0xb2354249ad950f72 , 0x00003ffc // log(1/frcpa(1+106/2^8))/2
+data8 0xb3a056e98ef4a3b4 , 0x00003ffc // log(1/frcpa(1+107/2^8))/2
+data8 0xb50c6dba52c6292a , 0x00003ffc // log(1/frcpa(1+108/2^8))/2
+data8 0xb679882c33876165 , 0x00003ffc // log(1/frcpa(1+109/2^8))/2
+data8 0xb78c07429785cedc , 0x00003ffc // log(1/frcpa(1+110/2^8))/2
+//
+data8 0xb8faeb8dc4a77d24 , 0x00003ffc // log(1/frcpa(1+111/2^8))/2
+data8 0xba6ad77eb36ae0d6 , 0x00003ffc // log(1/frcpa(1+112/2^8))/2
+data8 0xbbdbcc915e9bee50 , 0x00003ffc // log(1/frcpa(1+113/2^8))/2
+data8 0xbd4dcc44f8cf12ef , 0x00003ffc // log(1/frcpa(1+114/2^8))/2
+data8 0xbec0d81bf5b531fa , 0x00003ffc // log(1/frcpa(1+115/2^8))/2
+//
+data8 0xc034f19c139186f4 , 0x00003ffc // log(1/frcpa(1+116/2^8))/2
+data8 0xc14cb69f7c5e55ab , 0x00003ffc // log(1/frcpa(1+117/2^8))/2
+data8 0xc2c2abbb6e5fd56f , 0x00003ffc // log(1/frcpa(1+118/2^8))/2
+data8 0xc439b2c193e6771e , 0x00003ffc // log(1/frcpa(1+119/2^8))/2
+data8 0xc553acb9d5c67733 , 0x00003ffc // log(1/frcpa(1+120/2^8))/2
+//
+data8 0xc6cc96e441272441 , 0x00003ffc // log(1/frcpa(1+121/2^8))/2
+data8 0xc8469753eca88c30 , 0x00003ffc // log(1/frcpa(1+122/2^8))/2
+data8 0xc962cf3ce072b05c , 0x00003ffc // log(1/frcpa(1+123/2^8))/2
+data8 0xcadeba8771f694aa , 0x00003ffc // log(1/frcpa(1+124/2^8))/2
+data8 0xcc5bc08d1f72da94 , 0x00003ffc // log(1/frcpa(1+125/2^8))/2
+//
+data8 0xcd7a3f99ea035c29 , 0x00003ffc // log(1/frcpa(1+126/2^8))/2
+data8 0xcef93860c8a53c35 , 0x00003ffc // log(1/frcpa(1+127/2^8))/2
+data8 0xd0192f68a7ed23df , 0x00003ffc // log(1/frcpa(1+128/2^8))/2
+data8 0xd19a201127d3c645 , 0x00003ffc // log(1/frcpa(1+129/2^8))/2
+data8 0xd2bb92f4061c172c , 0x00003ffc // log(1/frcpa(1+130/2^8))/2
+//
+data8 0xd43e80b2ee8cc8fc , 0x00003ffc // log(1/frcpa(1+131/2^8))/2
+data8 0xd56173601fc4ade4 , 0x00003ffc // log(1/frcpa(1+132/2^8))/2
+data8 0xd6e6637efb54086f , 0x00003ffc // log(1/frcpa(1+133/2^8))/2
+data8 0xd80ad9f58f3c8193 , 0x00003ffc // log(1/frcpa(1+134/2^8))/2
+data8 0xd991d1d31aca41f8 , 0x00003ffc // log(1/frcpa(1+135/2^8))/2
+//
+data8 0xdab7d02231484a93 , 0x00003ffc // log(1/frcpa(1+136/2^8))/2
+data8 0xdc40d532cde49a54 , 0x00003ffc // log(1/frcpa(1+137/2^8))/2
+data8 0xdd685f79ed8b265e , 0x00003ffc // log(1/frcpa(1+138/2^8))/2
+data8 0xde9094bbc0e17b1d , 0x00003ffc // log(1/frcpa(1+139/2^8))/2
+data8 0xe01c91b78440c425 , 0x00003ffc // log(1/frcpa(1+140/2^8))/2
+//
+data8 0xe14658f26997e729 , 0x00003ffc // log(1/frcpa(1+141/2^8))/2
+data8 0xe270cdc2391e0d23 , 0x00003ffc // log(1/frcpa(1+142/2^8))/2
+data8 0xe3ffce3a2aa64922 , 0x00003ffc // log(1/frcpa(1+143/2^8))/2
+data8 0xe52bdb274ed82887 , 0x00003ffc // log(1/frcpa(1+144/2^8))/2
+data8 0xe6589852e75d7df6 , 0x00003ffc // log(1/frcpa(1+145/2^8))/2
+//
+data8 0xe786068c79937a7d , 0x00003ffc // log(1/frcpa(1+146/2^8))/2
+data8 0xe91903adad100911 , 0x00003ffc // log(1/frcpa(1+147/2^8))/2
+data8 0xea481236f7d35bb0 , 0x00003ffc // log(1/frcpa(1+148/2^8))/2
+data8 0xeb77d48c692e6b14 , 0x00003ffc // log(1/frcpa(1+149/2^8))/2
+data8 0xeca84b83d7297b87 , 0x00003ffc // log(1/frcpa(1+150/2^8))/2
+//
+data8 0xedd977f4962aa158 , 0x00003ffc // log(1/frcpa(1+151/2^8))/2
+data8 0xef7179a22f257754 , 0x00003ffc // log(1/frcpa(1+152/2^8))/2
+data8 0xf0a450d139366ca7 , 0x00003ffc // log(1/frcpa(1+153/2^8))/2
+data8 0xf1d7e0524ff9ffdb , 0x00003ffc // log(1/frcpa(1+154/2^8))/2
+data8 0xf30c29036a8b6cae , 0x00003ffc // log(1/frcpa(1+155/2^8))/2
+//
+data8 0xf4412bc411ea8d92 , 0x00003ffc // log(1/frcpa(1+156/2^8))/2
+data8 0xf576e97564c8619d , 0x00003ffc // log(1/frcpa(1+157/2^8))/2
+data8 0xf6ad62fa1b5f172f , 0x00003ffc // log(1/frcpa(1+158/2^8))/2
+data8 0xf7e499368b55c542 , 0x00003ffc // log(1/frcpa(1+159/2^8))/2
+data8 0xf91c8d10abaffe22 , 0x00003ffc // log(1/frcpa(1+160/2^8))/2
+//
+data8 0xfa553f7018c966f3 , 0x00003ffc // log(1/frcpa(1+161/2^8))/2
+data8 0xfb8eb13e185d802c , 0x00003ffc // log(1/frcpa(1+162/2^8))/2
+data8 0xfcc8e3659d9bcbed , 0x00003ffc // log(1/frcpa(1+163/2^8))/2
+data8 0xfe03d6d34d487fd2 , 0x00003ffc // log(1/frcpa(1+164/2^8))/2
+data8 0xff3f8c7581e9f0ae , 0x00003ffc // log(1/frcpa(1+165/2^8))/2
+//
+data8 0x803e029e280173ae , 0x00003ffd // log(1/frcpa(1+166/2^8))/2
+data8 0x80dca10cc52d0757 , 0x00003ffd // log(1/frcpa(1+167/2^8))/2
+data8 0x817ba200632755a1 , 0x00003ffd // log(1/frcpa(1+168/2^8))/2
+data8 0x821b05f3b01d6774 , 0x00003ffd // log(1/frcpa(1+169/2^8))/2
+data8 0x82bacd623ff19d06 , 0x00003ffd // log(1/frcpa(1+170/2^8))/2
+//
+data8 0x835af8c88e7a8f47 , 0x00003ffd // log(1/frcpa(1+171/2^8))/2
+data8 0x83c5f8299e2b4091 , 0x00003ffd // log(1/frcpa(1+172/2^8))/2
+data8 0x8466cb43f3d87300 , 0x00003ffd // log(1/frcpa(1+173/2^8))/2
+data8 0x850803a67c80ca4b , 0x00003ffd // log(1/frcpa(1+174/2^8))/2
+data8 0x85a9a1d11a23b461 , 0x00003ffd // log(1/frcpa(1+175/2^8))/2
+//
+data8 0x864ba644a18e6e05 , 0x00003ffd // log(1/frcpa(1+176/2^8))/2
+data8 0x86ee1182dcc432f7 , 0x00003ffd // log(1/frcpa(1+177/2^8))/2
+data8 0x875a925d7e48c316 , 0x00003ffd // log(1/frcpa(1+178/2^8))/2
+data8 0x87fdaa109d23aef7 , 0x00003ffd // log(1/frcpa(1+179/2^8))/2
+data8 0x88a129ed4becfaf2 , 0x00003ffd // log(1/frcpa(1+180/2^8))/2
+//
+data8 0x89451278ecd7f9cf , 0x00003ffd // log(1/frcpa(1+181/2^8))/2
+data8 0x89b29295f8432617 , 0x00003ffd // log(1/frcpa(1+182/2^8))/2
+data8 0x8a572ac5a5496882 , 0x00003ffd // log(1/frcpa(1+183/2^8))/2
+data8 0x8afc2d0ce3b2dadf , 0x00003ffd // log(1/frcpa(1+184/2^8))/2
+data8 0x8b6a69c608cfd3af , 0x00003ffd // log(1/frcpa(1+185/2^8))/2
+//
+data8 0x8c101e106e899a83 , 0x00003ffd // log(1/frcpa(1+186/2^8))/2
+data8 0x8cb63de258f9d626 , 0x00003ffd // log(1/frcpa(1+187/2^8))/2
+data8 0x8d2539c5bd19e2b1 , 0x00003ffd // log(1/frcpa(1+188/2^8))/2
+data8 0x8dcc0e064b29e6f1 , 0x00003ffd // log(1/frcpa(1+189/2^8))/2
+data8 0x8e734f45d88357ae , 0x00003ffd // log(1/frcpa(1+190/2^8))/2
+//
+data8 0x8ee30cef034a20db , 0x00003ffd // log(1/frcpa(1+191/2^8))/2
+data8 0x8f8b0515686d1d06 , 0x00003ffd // log(1/frcpa(1+192/2^8))/2
+data8 0x90336bba039bf32f , 0x00003ffd // log(1/frcpa(1+193/2^8))/2
+data8 0x90a3edd23d1c9d58 , 0x00003ffd // log(1/frcpa(1+194/2^8))/2
+data8 0x914d0de2f5d61b32 , 0x00003ffd // log(1/frcpa(1+195/2^8))/2
+//
+data8 0x91be0c20d28173b5 , 0x00003ffd // log(1/frcpa(1+196/2^8))/2
+data8 0x9267e737c06cd34a , 0x00003ffd // log(1/frcpa(1+197/2^8))/2
+data8 0x92d962ae6abb1237 , 0x00003ffd // log(1/frcpa(1+198/2^8))/2
+data8 0x9383fa6afbe2074c , 0x00003ffd // log(1/frcpa(1+199/2^8))/2
+data8 0x942f0421651c1c4e , 0x00003ffd // log(1/frcpa(1+200/2^8))/2
+//
+data8 0x94a14a3845bb985e , 0x00003ffd // log(1/frcpa(1+201/2^8))/2
+data8 0x954d133857f861e7 , 0x00003ffd // log(1/frcpa(1+202/2^8))/2
+data8 0x95bfd96468e604c4 , 0x00003ffd // log(1/frcpa(1+203/2^8))/2
+data8 0x9632d31cafafa858 , 0x00003ffd // log(1/frcpa(1+204/2^8))/2
+data8 0x96dfaabd86fa1647 , 0x00003ffd // log(1/frcpa(1+205/2^8))/2
+//
+data8 0x9753261fcbb2a594 , 0x00003ffd // log(1/frcpa(1+206/2^8))/2
+data8 0x9800c11b426b996d , 0x00003ffd // log(1/frcpa(1+207/2^8))/2
+data8 0x9874bf4d45ae663c , 0x00003ffd // log(1/frcpa(1+208/2^8))/2
+data8 0x99231f5ee9a74f79 , 0x00003ffd // log(1/frcpa(1+209/2^8))/2
+data8 0x9997a18a56bcad28 , 0x00003ffd // log(1/frcpa(1+210/2^8))/2
+//
+data8 0x9a46c873a3267e79 , 0x00003ffd // log(1/frcpa(1+211/2^8))/2
+data8 0x9abbcfc621eb6cb6 , 0x00003ffd // log(1/frcpa(1+212/2^8))/2
+data8 0x9b310cb0d354c990 , 0x00003ffd // log(1/frcpa(1+213/2^8))/2
+data8 0x9be14cf9e1b3515c , 0x00003ffd // log(1/frcpa(1+214/2^8))/2
+data8 0x9c5710b8cbb73a43 , 0x00003ffd // log(1/frcpa(1+215/2^8))/2
+//
+data8 0x9ccd0abd301f399c , 0x00003ffd // log(1/frcpa(1+216/2^8))/2
+data8 0x9d7e67f3bdce8888 , 0x00003ffd // log(1/frcpa(1+217/2^8))/2
+data8 0x9df4ea81a99daa01 , 0x00003ffd // log(1/frcpa(1+218/2^8))/2
+data8 0x9e6ba405a54514ba , 0x00003ffd // log(1/frcpa(1+219/2^8))/2
+data8 0x9f1e21c8c7bb62b3 , 0x00003ffd // log(1/frcpa(1+220/2^8))/2
+//
+data8 0x9f956593f6b6355c , 0x00003ffd // log(1/frcpa(1+221/2^8))/2
+data8 0xa00ce1092e5498c3 , 0x00003ffd // log(1/frcpa(1+222/2^8))/2
+data8 0xa0c08309c4b912c1 , 0x00003ffd // log(1/frcpa(1+223/2^8))/2
+data8 0xa1388a8c6faa2afa , 0x00003ffd // log(1/frcpa(1+224/2^8))/2
+data8 0xa1b0ca7095b5f985 , 0x00003ffd // log(1/frcpa(1+225/2^8))/2
+//
+data8 0xa22942eb47534a00 , 0x00003ffd // log(1/frcpa(1+226/2^8))/2
+data8 0xa2de62326449d0a3 , 0x00003ffd // log(1/frcpa(1+227/2^8))/2
+data8 0xa357690f88bfe345 , 0x00003ffd // log(1/frcpa(1+228/2^8))/2
+data8 0xa3d0a93f45169a4b , 0x00003ffd // log(1/frcpa(1+229/2^8))/2
+data8 0xa44a22f7ffe65f30 , 0x00003ffd // log(1/frcpa(1+230/2^8))/2
+//
+data8 0xa500c5e5b4c1aa36 , 0x00003ffd // log(1/frcpa(1+231/2^8))/2
+data8 0xa57ad064eb2ebbc2 , 0x00003ffd // log(1/frcpa(1+232/2^8))/2
+data8 0xa5f5152dedf4384e , 0x00003ffd // log(1/frcpa(1+233/2^8))/2
+data8 0xa66f9478856233ec , 0x00003ffd // log(1/frcpa(1+234/2^8))/2
+data8 0xa6ea4e7cca02c32e , 0x00003ffd // log(1/frcpa(1+235/2^8))/2
+//
+data8 0xa765437325341ccf , 0x00003ffd // log(1/frcpa(1+236/2^8))/2
+data8 0xa81e21e6c75b4020 , 0x00003ffd // log(1/frcpa(1+237/2^8))/2
+data8 0xa899ab333fe2b9ca , 0x00003ffd // log(1/frcpa(1+238/2^8))/2
+data8 0xa9157039c51ebe71 , 0x00003ffd // log(1/frcpa(1+239/2^8))/2
+data8 0xa991713433c2b999 , 0x00003ffd // log(1/frcpa(1+240/2^8))/2
+//
+data8 0xaa0dae5cbcc048b3 , 0x00003ffd // log(1/frcpa(1+241/2^8))/2
+data8 0xaa8a27ede5eb13ad , 0x00003ffd // log(1/frcpa(1+242/2^8))/2
+data8 0xab06de228a9e3499 , 0x00003ffd // log(1/frcpa(1+243/2^8))/2
+data8 0xab83d135dc633301 , 0x00003ffd // log(1/frcpa(1+244/2^8))/2
+data8 0xac3fb076adc7fe7a , 0x00003ffd // log(1/frcpa(1+245/2^8))/2
+//
+data8 0xacbd3cbbe47988f1 , 0x00003ffd // log(1/frcpa(1+246/2^8))/2
+data8 0xad3b06b1a5dc57c3 , 0x00003ffd // log(1/frcpa(1+247/2^8))/2
+data8 0xadb90e94af887717 , 0x00003ffd // log(1/frcpa(1+248/2^8))/2
+data8 0xae3754a218f7c816 , 0x00003ffd // log(1/frcpa(1+249/2^8))/2
+data8 0xaeb5d9175437afa2 , 0x00003ffd // log(1/frcpa(1+250/2^8))/2
+//
+data8 0xaf349c322e9c7cee , 0x00003ffd // log(1/frcpa(1+251/2^8))/2
+data8 0xafb39e30d1768d1c , 0x00003ffd // log(1/frcpa(1+252/2^8))/2
+data8 0xb032df51c2c93116 , 0x00003ffd // log(1/frcpa(1+253/2^8))/2
+data8 0xb0b25fd3e6035ad9 , 0x00003ffd // log(1/frcpa(1+254/2^8))/2
+data8 0xb1321ff67cba178c , 0x00003ffd // log(1/frcpa(1+255/2^8))/2
+LOCAL_OBJECT_END(atanh_data_3)
+
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(atanh)
+// Entry point and main path (0.25 <= |x| < 1.0) of atanh; argument in f8.
+// Predicates set in the prologue:
+//   p13 = x is unorm          -> branch to ATANH_UNORM
+//   p12 = x is NaN or +/-0    -> return x + x
+//   p10 = x < 0, p11 = not (x < 0)
+// Later compares on the biased exponent send |x| >= 1 to atanh_ge_one and
+// |x| < 0.25 to atanh_near_zero.
+
+{ .mfi
+ getf.exp rArgSExpb = f8 // Must recompute if x unorm
+ fclass.m p13,p0 = f8, 0x0b // is arg denormal ?
+ mov rExpbMask = 0x1ffff
+}
+{ .mfi
+ addl DataPtr = @ltoff(atanh_data), gp
+ fnma.s1 fOneMx = f8, f1, f1 // fOneMx = 1 - x
+ mov rBias = 0xffff
+}
+;;
+
+{ .mfi
+ mov rNearZeroBound = 0xfffd // biased exp of 1/4
+ fclass.m p12,p0 = f8, 0xc7 // is arg NaN or +/-0 ?
+ nop.i 0
+}
+{ .mfi
+ ld8 DataPtr = [DataPtr]
+ fma.s1 fOnePx = f8, f1, f1 // fOnePx = 1 + x
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fcmp.lt.s1 p10,p11 = f8,f0 // is x < 0 ?
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ fnorm.s1 fNormX = f8 // Normalize x
+(p13) br.cond.spnt ATANH_UNORM // Branch if x=unorm
+}
+;;
+
+ATANH_COMMON:
+// Return here if x=unorm and not denorm
+{ .mfi
+ adds Data2Ptr = 0x50, DataPtr
+ fma.s1 fX2 = f8, f8, f0 // x^2
+ nop.i 0
+}
+{ .mfb
+ adds Data3Ptr = 0xC0, DataPtr
+(p12) fma.d.s0 f8 = f8,f1,f8 // NaN or +/-0
+(p12) br.ret.spnt b0 // Exit for x NaN or zero
+}
+;;
+
+{ .mfi
+ ldfe fC9 = [Data2Ptr], 16
+(p11) frcpa.s1 fRcp0, p0 = f1, fOneMx // x >= 0: r0 ~ 1/(1 - x)
+ nop.i 0
+}
+;;
+
+// For x < 0 the roles of (1 - x) and (1 + x) are swapped below, so that
+// from here on fOneMx holds 1 - |x| and fOnePx holds 1 + |x|.
+{ .mfi
+ ldfe fC8 = [Data2Ptr], 16
+(p10) frcpa.s1 fRcp0n, p0 = f1, fOnePx // x < 0: r0 ~ 1/(1 + x)
+ and rArgExpb = rArgSExpb, rExpbMask // biased exponent
+}
+{ .mfi
+ nop.m 0
+(p10) fma.s1 fOneMx = fOnePx, f1, f0 // fOneMx = 1 - |x|
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe fC7 = [Data2Ptr], 16
+(p10) fnma.s1 fOnePx = fNormX, f1, f1 // fOnePx = 1 + |x|
+ cmp.ge p6,p0 = rArgExpb, rBias // is Expb(Arg) >= Expb(1) ?
+}
+{ .mfb
+ nop.m 0
+ nop.f 0
+(p6) br.cond.spnt atanh_ge_one // Branch if |x| >=1.0
+}
+;;
+
+{ .mfi
+ ldfe fC6 = [Data2Ptr], 16
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe fC5 = [Data2Ptr], 16
+ fma.s1 fX4 = fX2, fX2, f0 // x^4
+ cmp.gt p8,p0 = rNearZeroBound, rArgExpb
+}
+{ .mfb
+ ldfe fC2 = [Data3Ptr], 16
+ fma.s1 fX3 = fX2, fNormX, f0 // x^3
+(p8) br.cond.spnt atanh_near_zero // Branch if 0 < |x| < 0.25
+}
+;;
+
+// Main path: 0.25 <= |x| < 1.0
+// In the NR comments below "x" stands for the divisor fOneMx = 1 - |x|.
+// NR method: iteration #1
+.pred.rel "mutex",p11,p10
+{ .mfi
+ ldfpd fP5, fP4 = [DataPtr], 16
+(p11) fnma.s1 fRcp1 = fRcp0, fOneMx, f1 // t = 1 - r0*x
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p10) fnma.s1 fRcp1 = fRcp0n, fOneMx, f1 // t = 1 - r0*x
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfpd fP3, fP2 = [DataPtr], 16
+ // r1 = r0 + r0*t = r0 + r0*(1 - r0*x)
+(p11) fma.s1 fRcp1 = fRcp0, fRcp1, fRcp0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // r1 = r0 + r0*t = r0 + r0*(1 - r0*x)
+(p10) fma.s1 fRcp1 = fRcp0n, fRcp1, fRcp0n
+ nop.i 0
+}
+;;
+
+// NR method: iteration #2
+{ .mfi
+ ldfd fP1 = [DataPtr], 16
+ fnma.s1 fRcp2 = fRcp1, fOneMx, f1 // t = 1 - r1*x
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe fLog2 = [DataPtr], 16
+ // r2 = r1 + r1*t = r1 + r1*(1 - r1*x)
+ fma.s1 fRcp2 = fRcp1, fRcp2, fRcp1
+ nop.i 0
+}
+;;
+
+// NR method: iteration #3
+{ .mfi
+ adds RcpTablePtr = 0xB0, DataPtr
+ fnma.s1 fRcp3 = fRcp2, fOneMx, f1 // t = 1 - r2*x
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fY4Rcp = fRcp2, fOnePx, f0 // fY4Rcp = r2*(1 + x)
+ nop.i 0
+}
+;;
+
+// polynomial approximation & final reconstruction
+// fY approximates (1 + |x|)/(1 - |x|); its log is formed from the exponent
+// N, a table value indexed by the top significand bits, and a series in r.
+{ .mfi
+ nop.m 0
+ frcpa.s1 fRcp, p0 = f1, fY4Rcp
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // y = r2 * (1 + x) + r2 * (1 + x) * t = (1 + x) * (r2 + r2*(1 - r2*x))
+ fma.s1 fY = fY4Rcp, fRcp3, fY4Rcp
+ nop.i 0
+}
+;;
+
+{ .mmi
+ getf.exp rSExpb = fY4Rcp // biased exponent and sign
+;;
+ getf.sig rSig = fY4Rcp // significand
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fms.s1 fR = fY, fRcp, f1 // fR = fY * fRcp - 1
+ nop.i 0
+}
+;;
+
+{ .mmi
+ and rExpb = rSExpb, rExpbMask
+;;
+ sub rN = rExpb, rBias // exponent
+ extr.u rInd = rSig,55,8 // Extract 8 bits
+}
+;;
+
+{ .mmi
+ setf.sig fN4Cvt = rN
+ shladd RcpTablePtr = rInd, 4, RcpTablePtr // table entries are 16 bytes apart
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe fLogT = [RcpTablePtr]
+ fma.s1 fR2 = fR, fR, f0 // r^2
+ nop.i 0
+}
+{
+ nop.m 0
+ fma.s1 fP54 = fP5, fR, fP4 // P5*r + P4
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fP32 = fP3, fR, fP2 // P3*r + P2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fR3 = fR2, fR, f0 // r^3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fP10 = fP1, fR2, fR // P1*r^2 + r
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fcvt.xf fN = fN4Cvt
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fP54 = fP54, fR2, fP32 // (P5*r + P4)*r^2 + P3*r + P2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fLogT_N = fN, fLog2, fLogT // N*Log2 + LogT
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // ((P5*r + P4)*r^2 + P3*r + P2)*r^3 + P1*r^2 + r
+ fma.s1 fP54 = fP54, fR3, fP10
+ nop.i 0
+}
+;;
+
+// The sign of the result follows the sign of x: p11 (x >= 0) yields the
+// positive combination, p10 (x < 0) the negated one.
+.pred.rel "mutex",p11,p10
+{ .mfi
+ nop.m 0
+ // 0.5*(((P5*r + P4)*r^2 + P3*r + P2)*r^3 + P1*r^2 + r) + 0.5*(N*Log2 + T)
+(p11) fnma.d.s0 f8 = fP54, fP1, fLogT_N
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ // -0.5*(((P5*r + P4)*r^2 + P3*r + P2)*r^3 + P1*r^2 + r) - 0.5*(N*Log2 + T)
+(p10) fms.d.s0 f8 = fP54, fP1, fLogT_N
+ br.ret.sptk b0 // Exit for 0.25 <= |x| < 1.0
+}
+;;
+
+// Here if 0 < |x| < 0.25
+// The result is the odd polynomial x + x^3*(C0 + C1*x^2 + ... + C9*x^18).
+// fC9..fC5 and fC2 were already loaded before the branch to this label;
+// the remaining coefficients are loaded here through Data2Ptr (fC4, fC3)
+// and Data3Ptr (fC1, fC0).  fX2, fX3 and fX4 also come from the caller path.
+atanh_near_zero:
+{ .mfi
+ ldfe fC4 = [Data2Ptr], 16
+ fma.s1 fP98 = fC9, fX2, fC8 // C9*x^2 + C8
+ nop.i 0
+}
+{ .mfi
+ ldfe fC1 = [Data3Ptr], 16
+ fma.s1 fP76 = fC7, fX2, fC6 // C7*x^2 + C6
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe fC3 = [Data2Ptr], 16
+ fma.s1 fX8 = fX4, fX4, f0 // x^8
+ nop.i 0
+}
+{ .mfi
+ ldfe fC0 = [Data3Ptr], 16
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fP98 = fP98, fX4, fP76 // C9*x^6 + C8*x^4 + C7*x^2 + C6
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fP54 = fC5, fX2, fC4 // C5*x^2 + C4
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fP32 = fC3, fX2, fC2 // C3*x^2 + C2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fP10 = fC1, fX2, fC0 // C1*x^2 + C0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fP54 = fP54, fX4, fP32 // C5*x^6 + C4*x^4 + C3*x^2 + C2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // C9*x^14 + C8*x^12 + C7*x^10 + C6*x^8 + C5*x^6 + C4*x^4 + C3*x^2 + C2
+ fma.s1 fP98 = fP98, fX8, fP54
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // C9*x^18 + C8*x^16 + C7*x^14 + C6*x^12 + C5*x^10 + C4*x^8 + C3*x^6 +
+ // C2*x^4 + C1*x^2 + C0
+ fma.s1 fP98 = fP98, fX4, fP10
+ nop.i 0
+}
+;;
+
+{ .mfb
+ nop.m 0
+ // C9*x^21 + C8*x^19 + C7*x^17 + C6*x^15 + C5*x^13 + C4*x^11 + C3*x^9 +
+ // C2*x^7 + C1*x^5 + C0*x^3 + x
+ // Final step rounds to double (.d) under the user status field s0.
+ fma.d.s0 f8 = fP98, fX3, fNormX
+ br.ret.sptk b0 // Exit for 0 < |x| < 0.25
+}
+;;
+
+ATANH_UNORM:
+// Here if x=unorm
+// Reached from the entry group when fclass flagged f8 as unorm.  The
+// exponent is re-read from the normalized copy fNormX, and p13 is set to
+// the complement of the unorm test on fNormX: p13 = 1 means the value is
+// no longer unorm after normalization and the common path can be used.
+// p10/p11 (x < 0 / not x < 0) were written by the fcmp.lt.s1 in the entry
+// group and select the sign of the x^2 correction below.
+{ .mfi
+ getf.exp rArgSExpb = fNormX // Recompute if x unorm
+ fclass.m p0,p13 = fNormX, 0x0b // Test x denorm
+ nop.i 0
+}
+;;
+
+{ .mfb
+ nop.m 0
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy to set denormal flag
+(p13) br.cond.sptk ATANH_COMMON // Continue if x unorm and not denorm
+}
+;;
+
+.pred.rel "mutex",p10,p11
+{ .mfi
+ nop.m 0
+(p10) fnma.d.s0 f8 = f8,f8,f8 // Result x-x^2 if x=-denorm
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p11) fma.d.s0 f8 = f8,f8,f8 // Result x+x^2 if x=+denorm
+ br.ret.spnt b0 // Exit if denorm
+}
+;;
+
+// Here if |x| >= 1.0
+// Error path.  Allocates a register frame (1 input, 3 locals, 4 outputs),
+// keeps the original argument in f10, and prepares the result in f8 and the
+// error tag in atanh_GR_tag before branching to __libm_error_region:
+//   |x| >  1.0 : f8 = QNaN (invalid raised),          tag 131
+//   |x| == 1.0 : f8 = inf with the sign of x (Z flag), tag 132
+atanh_ge_one:
+{ .mfi
+ alloc r32 = ar.pfs,1,3,4,0
+ fmerge.s fAbsX = f0, f8 // Form |x|
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fmerge.s f10 = f8, f8 // Save input for error call
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fcmp.eq.s1 p6,p7 = fAbsX, f1 // Test for |x| = 1.0
+ nop.i 0
+}
+;;
+
+// Set error tag and result, and raise invalid flag if |x| > 1.0
+{ .mfi
+(p7) mov atanh_GR_tag = 131
+(p7) frcpa.s0 f8, p0 = f0, f0 // Get QNaN, and raise invalid
+ nop.i 0
+}
+;;
+
+// Set error tag and result, and raise Z flag if |x| = 1.0
+{ .mfi
+ nop.m 0
+(p6) frcpa.s0 fRcp, p0 = f1, f0 // Get inf, and raise Z flag
+ nop.i 0
+}
+;;
+
+{ .mfb
+(p6) mov atanh_GR_tag = 132
+(p6) fmerge.s f8 = f8, fRcp // result is +-inf
+ br.cond.sptk __libm_error_region // Exit if |x| >= 1.0
+}
+;;
+
+GLOBAL_LIBM_END(atanh)
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+// Error stub entered from atanh_ge_one with f10 = original argument and
+// f8 = preliminary result.  It builds a 64-byte stack frame, stores three
+// doubles in it, calls __libm_error_support and returns the double then
+// found in the result slot.
+// Frame layout relative to the new sp:
+//   sp+16 : parameter 1 (f10, the input)      <- GR_Parameter_X
+//   sp+32 : parameter 2 (f1, a placeholder)   <- GR_Parameter_Y at the call
+//   sp+48 : result      (f8)                  <- GR_Parameter_RESULT
+// NOTE(review): the GR_Parameter_* / atanh_GR_tag register assignments are
+// defined outside this excerpt; presumably they are the output registers
+// seen by __libm_error_support -- confirm against the macro definitions.
+.prologue
+
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+
+{ .mmi
+ stfd [GR_Parameter_Y] = f1,16 // STORE Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+
+.body
+{ .mib
+ stfd [GR_Parameter_X] = f10 // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
+}
+{ .mib
+ stfd [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // back to the parameter 2 slot
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+
+// Recompute the result address from sp after the call, then reload f8.
+{ .mmi
+ add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
+};;
+
+{ .mmi
+ ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_atanhf.S b/sysdeps/ia64/fpu/e_atanhf.S
new file mode 100644
index 0000000000..3675c5f4c1
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_atanhf.S
@@ -0,0 +1,844 @@
+.file "atanhf.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 05/22/01 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 08/06/02 Improved Itanium 2 performance
+// 02/06/03 Reordered header: .section, .global, .proc, .align
+// 05/26/03 Improved performance, fixed to handle unorms
+//
+// API
+//==============================================================
+// float atanhf(float)
+//
+// Overview of operation
+//==============================================================
+// Background
+//
+//
+// There are 7 paths:
+// 1. x = +/-0.0
+// Return atanhf(x) = +/-0.0
+//
+// 2. 0.0 < |x| <= MAX_DENORMAL_ABS
+// Return atanhf(x) = x + sign(x)*x^2
+//
+// 3. MAX_DENORMAL_ABS < |x| < 2^(-20)
+// Return atanhf(x) = Pol3(x), where Pol3(x) = x + x^3
+//
+// 4. 2^(-20) <= |x| < 1
+// Return atanhf(x) = 0.5 * (log(1 + x) - log(1 - x))
+// Algorithm description for log function see below.
+//
+// 5. |x| = 1
+// Return atanhf(x) = sign(x) * +INF
+//
+// 6. 1 < |x| <= +INF
+// Return atanhf(x) = QNaN
+//
+// 7. x = [S,Q]NaN
+// Return atanhf(x) = QNaN
+//
+//==============================================================
+// Algorithm Description for log(x) function
+//
+// Consider x = 2^N * 1.f1 f2 f3 f4...f63
+// log(x) = log(x * frcpa(x) / frcpa(x))
+// = log(x * frcpa(x)) + log(1/frcpa(x))
+// = log(x * frcpa(x)) - log(frcpa(x))
+//
+// frcpa(x) = 2^(-N) * frcpa(1.f1 f2 ... f63)
+//
+// -log(frcpa(x)) = -log(C)
+// = -log(2^(-N)) - log(frcpa(1.f1 f2 ... f63))
+//
+// -log(frcpa(x)) = -log(C)
+// = N*log2 - log(frcpa(1.f1 f2 ... f63))
+//
+//
+// log(x) = log(1/frcpa(x)) + log(frcpa(x) x)
+//
+// log(x) = N*log2 + log(1./frcpa(1.f1 f2 ... f63)) + log(x * frcpa(x))
+// log(x) = N*log2 + T + log(frcpa(x) x)
+//
+// Log(x) = N*log2 + T + log(C * x)
+//
+// C * x = 1 + r
+//
+// log(x) = N*log2 + T + log(1 + r)
+// log(x) = N*log2 + T + Series(r)
+//
+// 1.f1 f2 ... f8 has 256 entries.
+// They are 1 + k/2^8, k = 0 ... 255
+// These 256 values are the table entries.
+//
+// Implementation
+//==============================================================
+// C = frcpa(x)
+// r = C * x - 1
+//
+// Form rseries = r + P1*r^2 + P2*r^3 + P3*r^4
+//
+// x = f * 2^N where f is 1.f_1f_2f_3...f_63
+// Nfloat = float(n) where n is the true unbiased exponent
+// pre-index = f_1f_2....f_8
+// index = pre_index * 8 (each table entry is one 8-byte data8 word)
+// get the dxt table entry at index + offset = T
+//
+// result = (T + Nfloat * log(2)) + rseries
+//
+// The T table is calculated as follows
+// Form x_k = 1 + k/2^8 where k goes from 0... 255
+// y_k = frcpa(x_k)
+// log(1/y_k) in quad and round to double-extended
+
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input
+// f32 -> f59
+
+// General registers used:
+// r14 -> r29, r32 -> r39
+
+// Predicate registers used:
+// p6 -> p11
+
+// p6 to filter out case when |x| >= 1
+// p7 to filter out case when x = [Q,S]NaN or +/-0
+// p8 to filter out case when |x| < 2^(-20)
+// p9 to filter out case when x = denormal
+// p10, p11 receive the result of the x < 0 compare on the unorm path
+
+
+// Assembly macros
+//==============================================================
+// Symbolic names for the registers used by atanhf.  Names ending in m/M
+// belong to the (1 - x) computation, names ending in p/P to the (1 + x)
+// computation.
+// General registers r14-r29:
+DataPtr = r14
+RcpTablePtrM = r15
+RcpTablePtrP = r16
+rExpbMask = r17
+rBias = r18
+rNearZeroBound = r19
+rArgSExpb = r20
+rArgExpb = r21
+rExpbm = r22
+rExpbp = r23
+rSigm = r24
+rSigp = r25
+rNm = r26
+rNp = r27
+rIndm = r28
+rIndp = r29
+
+// Registers r33-r39 are only used on the error path, after the alloc in
+// atanhf_ge_one: saved b0/gp/ar.pfs ...
+GR_SAVE_B0 = r33
+GR_SAVE_GP = r34
+GR_SAVE_PFS = r35
+
+// ... and the addresses / tag handed to __libm_error_support.
+GR_Parameter_X = r36
+GR_Parameter_Y = r37
+GR_Parameter_RESULT = r38
+atanh_GR_tag = r39
+
+//==============================================================
+// Floating-point registers f33-f59.
+fOneMx = f33
+fOnePx = f34
+fRm2 = f35
+fRm3 = f36
+fRp2 = f37
+fRp3 = f38
+fRcpM = f39
+fRcpP = f40
+fRp = f41
+fRm = f42
+fN4CvtM = f43
+fN4CvtP = f44
+fNm = f45
+fNp = f46
+fLogTm = f47
+fLogTp = f48
+fLog2 = f49
+// fArgAbs and fNormX deliberately name the same register (f50): fArgAbs is
+// written only in atanhf_ge_one, where fNormX is no longer read.
+fArgAbs = f50
+fNormX = f50
+fP32m = f51
+fP32p = f52
+fP10m = f53
+fP10p = f54
+fX2 = f55
+fP3 = f56
+fP2 = f57
+fP1 = f58
+fHalf = f59
+
+
+
+// Data tables
+//==============================================================
+
+RODATA
+
+.align 16
+
+// Polynomial coefficients and constants for atanhf, all pre-multiplied by
+// 0.5.  atanhf reads them in order with three 16-byte post-increment loads
+// (fP3/fP2, fP1/fHalf, fLog2); the pad word makes the third load leave
+// DataPtr at offset 48, which the code then uses as the base of the lookup
+// table, so atanhf_data2 must follow this object directly.
+LOCAL_OBJECT_START(atanhf_data)
+data8 0xbfc0001008f39d59 // P3*0.5
+data8 0x3fc5556073e0c45a // P2*0.5
+data8 0xbfcffffffffaea15 // P1*0.5
+data8 0x3fe0000000000000 // 0.5
+data8 0x3fd62e42fefa39ef // 0.5*ln(2)
+data8 0x0000000000000000 // pad
+LOCAL_OBJECT_END(atanhf_data)
+
+LOCAL_OBJECT_START(atanhf_data2)
+data8 0x3f50040155d5889e //log(1/frcpa(1+0/256))/2
+data8 0x3f68121214586b54 //log(1/frcpa(1+1/256))/2
+data8 0x3f741929f96832f0 //log(1/frcpa(1+2/256))/2
+data8 0x3f7c317384c75f06 //log(1/frcpa(1+3/256))/2
+data8 0x3f81a6b91ac73386 //log(1/frcpa(1+4/256))/2
+data8 0x3f85ba9a5d9ac039 //log(1/frcpa(1+5/256))/2
+data8 0x3f89d2a8074325f4 //log(1/frcpa(1+6/256))/2
+data8 0x3f8d6b2725979802 //log(1/frcpa(1+7/256))/2
+data8 0x3f90c58fa19dfaaa //log(1/frcpa(1+8/256))/2
+data8 0x3f92954c78cbce1b //log(1/frcpa(1+9/256))/2
+data8 0x3f94a94d2da96c56 //log(1/frcpa(1+10/256))/2
+data8 0x3f967c94f2d4bb58 //log(1/frcpa(1+11/256))/2
+data8 0x3f985188b630f068 //log(1/frcpa(1+12/256))/2
+data8 0x3f9a6b8abe73af4c //log(1/frcpa(1+13/256))/2
+data8 0x3f9c441e06f72a9e //log(1/frcpa(1+14/256))/2
+data8 0x3f9e1e6713606d07 //log(1/frcpa(1+15/256))/2
+data8 0x3f9ffa6911ab9301 //log(1/frcpa(1+16/256))/2
+data8 0x3fa0ec139c5da601 //log(1/frcpa(1+17/256))/2
+data8 0x3fa1dbd2643d190b //log(1/frcpa(1+18/256))/2
+data8 0x3fa2cc7284fe5f1c //log(1/frcpa(1+19/256))/2
+data8 0x3fa3bdf5a7d1ee64 //log(1/frcpa(1+20/256))/2
+data8 0x3fa4b05d7aa012e0 //log(1/frcpa(1+21/256))/2
+data8 0x3fa580db7ceb5702 //log(1/frcpa(1+22/256))/2
+data8 0x3fa674f089365a7a //log(1/frcpa(1+23/256))/2
+data8 0x3fa769ef2c6b568d //log(1/frcpa(1+24/256))/2
+data8 0x3fa85fd927506a48 //log(1/frcpa(1+25/256))/2
+data8 0x3fa9335e5d594989 //log(1/frcpa(1+26/256))/2
+data8 0x3faa2b0220c8e5f5 //log(1/frcpa(1+27/256))/2
+data8 0x3fab0004ac1a86ac //log(1/frcpa(1+28/256))/2
+data8 0x3fabf968769fca11 //log(1/frcpa(1+29/256))/2
+data8 0x3faccfedbfee13a8 //log(1/frcpa(1+30/256))/2
+data8 0x3fada727638446a2 //log(1/frcpa(1+31/256))/2
+data8 0x3faea3257fe10f7a //log(1/frcpa(1+32/256))/2
+data8 0x3faf7be9fedbfde6 //log(1/frcpa(1+33/256))/2
+data8 0x3fb02ab352ff25f4 //log(1/frcpa(1+34/256))/2
+data8 0x3fb097ce579d204d //log(1/frcpa(1+35/256))/2
+data8 0x3fb1178e8227e47c //log(1/frcpa(1+36/256))/2
+data8 0x3fb185747dbecf34 //log(1/frcpa(1+37/256))/2
+data8 0x3fb1f3b925f25d41 //log(1/frcpa(1+38/256))/2
+data8 0x3fb2625d1e6ddf57 //log(1/frcpa(1+39/256))/2
+data8 0x3fb2d1610c86813a //log(1/frcpa(1+40/256))/2
+data8 0x3fb340c59741142e //log(1/frcpa(1+41/256))/2
+data8 0x3fb3b08b6757f2a9 //log(1/frcpa(1+42/256))/2
+data8 0x3fb40dfb08378003 //log(1/frcpa(1+43/256))/2
+data8 0x3fb47e74e8ca5f7c //log(1/frcpa(1+44/256))/2
+data8 0x3fb4ef51f6466de4 //log(1/frcpa(1+45/256))/2
+data8 0x3fb56092e02ba516 //log(1/frcpa(1+46/256))/2
+data8 0x3fb5d23857cd74d5 //log(1/frcpa(1+47/256))/2
+data8 0x3fb6313a37335d76 //log(1/frcpa(1+48/256))/2
+data8 0x3fb6a399dabbd383 //log(1/frcpa(1+49/256))/2
+data8 0x3fb70337dd3ce41b //log(1/frcpa(1+50/256))/2
+data8 0x3fb77654128f6127 //log(1/frcpa(1+51/256))/2
+data8 0x3fb7e9d82a0b022d //log(1/frcpa(1+52/256))/2
+data8 0x3fb84a6b759f512f //log(1/frcpa(1+53/256))/2
+data8 0x3fb8ab47d5f5a310 //log(1/frcpa(1+54/256))/2
+data8 0x3fb91fe49096581b //log(1/frcpa(1+55/256))/2
+data8 0x3fb981634011aa75 //log(1/frcpa(1+56/256))/2
+data8 0x3fb9f6c407089664 //log(1/frcpa(1+57/256))/2
+data8 0x3fba58e729348f43 //log(1/frcpa(1+58/256))/2
+data8 0x3fbabb55c31693ad //log(1/frcpa(1+59/256))/2
+data8 0x3fbb1e104919efd0 //log(1/frcpa(1+60/256))/2
+data8 0x3fbb94ee93e367cb //log(1/frcpa(1+61/256))/2
+data8 0x3fbbf851c067555f //log(1/frcpa(1+62/256))/2
+data8 0x3fbc5c0254bf23a6 //log(1/frcpa(1+63/256))/2
+data8 0x3fbcc000c9db3c52 //log(1/frcpa(1+64/256))/2
+data8 0x3fbd244d99c85674 //log(1/frcpa(1+65/256))/2
+data8 0x3fbd88e93fb2f450 //log(1/frcpa(1+66/256))/2
+data8 0x3fbdedd437eaef01 //log(1/frcpa(1+67/256))/2
+data8 0x3fbe530effe71012 //log(1/frcpa(1+68/256))/2
+data8 0x3fbeb89a1648b971 //log(1/frcpa(1+69/256))/2
+data8 0x3fbf1e75fadf9bde //log(1/frcpa(1+70/256))/2
+data8 0x3fbf84a32ead7c35 //log(1/frcpa(1+71/256))/2
+data8 0x3fbfeb2233ea07cd //log(1/frcpa(1+72/256))/2
+data8 0x3fc028f9c7035c1c //log(1/frcpa(1+73/256))/2
+data8 0x3fc05c8be0d9635a //log(1/frcpa(1+74/256))/2
+data8 0x3fc085eb8f8ae797 //log(1/frcpa(1+75/256))/2
+data8 0x3fc0b9c8e32d1911 //log(1/frcpa(1+76/256))/2
+data8 0x3fc0edd060b78081 //log(1/frcpa(1+77/256))/2
+data8 0x3fc122024cf0063f //log(1/frcpa(1+78/256))/2
+data8 0x3fc14be2927aecd4 //log(1/frcpa(1+79/256))/2
+data8 0x3fc180618ef18adf //log(1/frcpa(1+80/256))/2
+data8 0x3fc1b50bbe2fc63b //log(1/frcpa(1+81/256))/2
+data8 0x3fc1df4cc7cf242d //log(1/frcpa(1+82/256))/2
+data8 0x3fc214456d0eb8d4 //log(1/frcpa(1+83/256))/2
+data8 0x3fc23ec5991eba49 //log(1/frcpa(1+84/256))/2
+data8 0x3fc2740d9f870afb //log(1/frcpa(1+85/256))/2
+data8 0x3fc29ecdabcdfa04 //log(1/frcpa(1+86/256))/2
+data8 0x3fc2d46602adccee //log(1/frcpa(1+87/256))/2
+data8 0x3fc2ff66b04ea9d4 //log(1/frcpa(1+88/256))/2
+data8 0x3fc335504b355a37 //log(1/frcpa(1+89/256))/2
+data8 0x3fc360925ec44f5d //log(1/frcpa(1+90/256))/2
+data8 0x3fc38bf1c3337e75 //log(1/frcpa(1+91/256))/2
+data8 0x3fc3c25277333184 //log(1/frcpa(1+92/256))/2
+data8 0x3fc3edf463c1683e //log(1/frcpa(1+93/256))/2
+data8 0x3fc419b423d5e8c7 //log(1/frcpa(1+94/256))/2
+data8 0x3fc44591e0539f49 //log(1/frcpa(1+95/256))/2
+data8 0x3fc47c9175b6f0ad //log(1/frcpa(1+96/256))/2
+data8 0x3fc4a8b341552b09 //log(1/frcpa(1+97/256))/2
+data8 0x3fc4d4f3908901a0 //log(1/frcpa(1+98/256))/2
+data8 0x3fc501528da1f968 //log(1/frcpa(1+99/256))/2
+data8 0x3fc52dd06347d4f6 //log(1/frcpa(1+100/256))/2
+data8 0x3fc55a6d3c7b8a8a //log(1/frcpa(1+101/256))/2
+data8 0x3fc5925d2b112a59 //log(1/frcpa(1+102/256))/2
+data8 0x3fc5bf406b543db2 //log(1/frcpa(1+103/256))/2
+data8 0x3fc5ec433d5c35ae //log(1/frcpa(1+104/256))/2
+data8 0x3fc61965cdb02c1f //log(1/frcpa(1+105/256))/2
+data8 0x3fc646a84935b2a2 //log(1/frcpa(1+106/256))/2
+data8 0x3fc6740add31de94 //log(1/frcpa(1+107/256))/2
+data8 0x3fc6a18db74a58c5 //log(1/frcpa(1+108/256))/2
+data8 0x3fc6cf31058670ec //log(1/frcpa(1+109/256))/2
+data8 0x3fc6f180e852f0ba //log(1/frcpa(1+110/256))/2
+data8 0x3fc71f5d71b894f0 //log(1/frcpa(1+111/256))/2
+data8 0x3fc74d5aefd66d5c //log(1/frcpa(1+112/256))/2
+data8 0x3fc77b79922bd37e //log(1/frcpa(1+113/256))/2
+data8 0x3fc7a9b9889f19e2 //log(1/frcpa(1+114/256))/2
+data8 0x3fc7d81b037eb6a6 //log(1/frcpa(1+115/256))/2
+data8 0x3fc8069e33827231 //log(1/frcpa(1+116/256))/2
+data8 0x3fc82996d3ef8bcb //log(1/frcpa(1+117/256))/2
+data8 0x3fc85855776dcbfb //log(1/frcpa(1+118/256))/2
+data8 0x3fc8873658327ccf //log(1/frcpa(1+119/256))/2
+data8 0x3fc8aa75973ab8cf //log(1/frcpa(1+120/256))/2
+data8 0x3fc8d992dc8824e5 //log(1/frcpa(1+121/256))/2
+data8 0x3fc908d2ea7d9512 //log(1/frcpa(1+122/256))/2
+data8 0x3fc92c59e79c0e56 //log(1/frcpa(1+123/256))/2
+data8 0x3fc95bd750ee3ed3 //log(1/frcpa(1+124/256))/2
+data8 0x3fc98b7811a3ee5b //log(1/frcpa(1+125/256))/2
+data8 0x3fc9af47f33d406c //log(1/frcpa(1+126/256))/2
+data8 0x3fc9df270c1914a8 //log(1/frcpa(1+127/256))/2
+data8 0x3fca0325ed14fda4 //log(1/frcpa(1+128/256))/2
+data8 0x3fca33440224fa79 //log(1/frcpa(1+129/256))/2
+data8 0x3fca57725e80c383 //log(1/frcpa(1+130/256))/2
+data8 0x3fca87d0165dd199 //log(1/frcpa(1+131/256))/2
+data8 0x3fcaac2e6c03f896 //log(1/frcpa(1+132/256))/2
+data8 0x3fcadccc6fdf6a81 //log(1/frcpa(1+133/256))/2
+data8 0x3fcb015b3eb1e790 //log(1/frcpa(1+134/256))/2
+data8 0x3fcb323a3a635948 //log(1/frcpa(1+135/256))/2
+data8 0x3fcb56fa04462909 //log(1/frcpa(1+136/256))/2
+data8 0x3fcb881aa659bc93 //log(1/frcpa(1+137/256))/2
+data8 0x3fcbad0bef3db165 //log(1/frcpa(1+138/256))/2
+data8 0x3fcbd21297781c2f //log(1/frcpa(1+139/256))/2
+data8 0x3fcc039236f08819 //log(1/frcpa(1+140/256))/2
+data8 0x3fcc28cb1e4d32fd //log(1/frcpa(1+141/256))/2
+data8 0x3fcc4e19b84723c2 //log(1/frcpa(1+142/256))/2
+data8 0x3fcc7ff9c74554c9 //log(1/frcpa(1+143/256))/2
+data8 0x3fcca57b64e9db05 //log(1/frcpa(1+144/256))/2
+data8 0x3fcccb130a5cebb0 //log(1/frcpa(1+145/256))/2
+data8 0x3fccf0c0d18f326f //log(1/frcpa(1+146/256))/2
+data8 0x3fcd232075b5a201 //log(1/frcpa(1+147/256))/2
+data8 0x3fcd490246defa6b //log(1/frcpa(1+148/256))/2
+data8 0x3fcd6efa918d25cd //log(1/frcpa(1+149/256))/2
+data8 0x3fcd9509707ae52f //log(1/frcpa(1+150/256))/2
+data8 0x3fcdbb2efe92c554 //log(1/frcpa(1+151/256))/2
+data8 0x3fcdee2f3445e4af //log(1/frcpa(1+152/256))/2
+data8 0x3fce148a1a2726ce //log(1/frcpa(1+153/256))/2
+data8 0x3fce3afc0a49ff40 //log(1/frcpa(1+154/256))/2
+data8 0x3fce6185206d516e //log(1/frcpa(1+155/256))/2
+data8 0x3fce882578823d52 //log(1/frcpa(1+156/256))/2
+data8 0x3fceaedd2eac990c //log(1/frcpa(1+157/256))/2
+data8 0x3fced5ac5f436be3 //log(1/frcpa(1+158/256))/2
+data8 0x3fcefc9326d16ab9 //log(1/frcpa(1+159/256))/2
+data8 0x3fcf2391a2157600 //log(1/frcpa(1+160/256))/2
+data8 0x3fcf4aa7ee03192d //log(1/frcpa(1+161/256))/2
+data8 0x3fcf71d627c30bb0 //log(1/frcpa(1+162/256))/2
+data8 0x3fcf991c6cb3b379 //log(1/frcpa(1+163/256))/2
+data8 0x3fcfc07ada69a910 //log(1/frcpa(1+164/256))/2
+data8 0x3fcfe7f18eb03d3e //log(1/frcpa(1+165/256))/2
+data8 0x3fd007c053c5002e //log(1/frcpa(1+166/256))/2
+data8 0x3fd01b942198a5a1 //log(1/frcpa(1+167/256))/2
+data8 0x3fd02f74400c64eb //log(1/frcpa(1+168/256))/2
+data8 0x3fd04360be7603ad //log(1/frcpa(1+169/256))/2
+data8 0x3fd05759ac47fe34 //log(1/frcpa(1+170/256))/2
+data8 0x3fd06b5f1911cf52 //log(1/frcpa(1+171/256))/2
+data8 0x3fd078bf0533c568 //log(1/frcpa(1+172/256))/2
+data8 0x3fd08cd9687e7b0e //log(1/frcpa(1+173/256))/2
+data8 0x3fd0a10074cf9019 //log(1/frcpa(1+174/256))/2
+data8 0x3fd0b5343a234477 //log(1/frcpa(1+175/256))/2
+data8 0x3fd0c974c89431ce //log(1/frcpa(1+176/256))/2
+data8 0x3fd0ddc2305b9886 //log(1/frcpa(1+177/256))/2
+data8 0x3fd0eb524bafc918 //log(1/frcpa(1+178/256))/2
+data8 0x3fd0ffb54213a476 //log(1/frcpa(1+179/256))/2
+data8 0x3fd114253da97d9f //log(1/frcpa(1+180/256))/2
+data8 0x3fd128a24f1d9aff //log(1/frcpa(1+181/256))/2
+data8 0x3fd1365252bf0865 //log(1/frcpa(1+182/256))/2
+data8 0x3fd14ae558b4a92d //log(1/frcpa(1+183/256))/2
+data8 0x3fd15f85a19c765b //log(1/frcpa(1+184/256))/2
+data8 0x3fd16d4d38c119fa //log(1/frcpa(1+185/256))/2
+data8 0x3fd18203c20dd133 //log(1/frcpa(1+186/256))/2
+data8 0x3fd196c7bc4b1f3b //log(1/frcpa(1+187/256))/2
+data8 0x3fd1a4a738b7a33c //log(1/frcpa(1+188/256))/2
+data8 0x3fd1b981c0c9653d //log(1/frcpa(1+189/256))/2
+data8 0x3fd1ce69e8bb106b //log(1/frcpa(1+190/256))/2
+data8 0x3fd1dc619de06944 //log(1/frcpa(1+191/256))/2
+data8 0x3fd1f160a2ad0da4 //log(1/frcpa(1+192/256))/2
+data8 0x3fd2066d7740737e //log(1/frcpa(1+193/256))/2
+data8 0x3fd2147dba47a394 //log(1/frcpa(1+194/256))/2
+data8 0x3fd229a1bc5ebac3 //log(1/frcpa(1+195/256))/2
+data8 0x3fd237c1841a502e //log(1/frcpa(1+196/256))/2
+data8 0x3fd24cfce6f80d9a //log(1/frcpa(1+197/256))/2
+data8 0x3fd25b2c55cd5762 //log(1/frcpa(1+198/256))/2
+data8 0x3fd2707f4d5f7c41 //log(1/frcpa(1+199/256))/2
+data8 0x3fd285e0842ca384 //log(1/frcpa(1+200/256))/2
+data8 0x3fd294294708b773 //log(1/frcpa(1+201/256))/2
+data8 0x3fd2a9a2670aff0c //log(1/frcpa(1+202/256))/2
+data8 0x3fd2b7fb2c8d1cc1 //log(1/frcpa(1+203/256))/2
+data8 0x3fd2c65a6395f5f5 //log(1/frcpa(1+204/256))/2
+data8 0x3fd2dbf557b0df43 //log(1/frcpa(1+205/256))/2
+data8 0x3fd2ea64c3f97655 //log(1/frcpa(1+206/256))/2
+data8 0x3fd3001823684d73 //log(1/frcpa(1+207/256))/2
+data8 0x3fd30e97e9a8b5cd //log(1/frcpa(1+208/256))/2
+data8 0x3fd32463ebdd34ea //log(1/frcpa(1+209/256))/2
+data8 0x3fd332f4314ad796 //log(1/frcpa(1+210/256))/2
+data8 0x3fd348d90e7464d0 //log(1/frcpa(1+211/256))/2
+data8 0x3fd35779f8c43d6e //log(1/frcpa(1+212/256))/2
+data8 0x3fd36621961a6a99 //log(1/frcpa(1+213/256))/2
+data8 0x3fd37c299f3c366a //log(1/frcpa(1+214/256))/2
+data8 0x3fd38ae2171976e7 //log(1/frcpa(1+215/256))/2
+data8 0x3fd399a157a603e7 //log(1/frcpa(1+216/256))/2
+data8 0x3fd3afccfe77b9d1 //log(1/frcpa(1+217/256))/2
+data8 0x3fd3be9d503533b5 //log(1/frcpa(1+218/256))/2
+data8 0x3fd3cd7480b4a8a3 //log(1/frcpa(1+219/256))/2
+data8 0x3fd3e3c43918f76c //log(1/frcpa(1+220/256))/2
+data8 0x3fd3f2acb27ed6c7 //log(1/frcpa(1+221/256))/2
+data8 0x3fd4019c2125ca93 //log(1/frcpa(1+222/256))/2
+data8 0x3fd4181061389722 //log(1/frcpa(1+223/256))/2
+data8 0x3fd42711518df545 //log(1/frcpa(1+224/256))/2
+data8 0x3fd436194e12b6bf //log(1/frcpa(1+225/256))/2
+data8 0x3fd445285d68ea69 //log(1/frcpa(1+226/256))/2
+data8 0x3fd45bcc464c893a //log(1/frcpa(1+227/256))/2
+data8 0x3fd46aed21f117fc //log(1/frcpa(1+228/256))/2
+data8 0x3fd47a1527e8a2d3 //log(1/frcpa(1+229/256))/2
+data8 0x3fd489445efffccc //log(1/frcpa(1+230/256))/2
+data8 0x3fd4a018bcb69835 //log(1/frcpa(1+231/256))/2
+data8 0x3fd4af5a0c9d65d7 //log(1/frcpa(1+232/256))/2
+data8 0x3fd4bea2a5bdbe87 //log(1/frcpa(1+233/256))/2
+data8 0x3fd4cdf28f10ac46 //log(1/frcpa(1+234/256))/2
+data8 0x3fd4dd49cf994058 //log(1/frcpa(1+235/256))/2
+data8 0x3fd4eca86e64a684 //log(1/frcpa(1+236/256))/2
+data8 0x3fd503c43cd8eb68 //log(1/frcpa(1+237/256))/2
+data8 0x3fd513356667fc57 //log(1/frcpa(1+238/256))/2
+data8 0x3fd522ae0738a3d8 //log(1/frcpa(1+239/256))/2
+data8 0x3fd5322e26867857 //log(1/frcpa(1+240/256))/2
+data8 0x3fd541b5cb979809 //log(1/frcpa(1+241/256))/2
+data8 0x3fd55144fdbcbd62 //log(1/frcpa(1+242/256))/2
+data8 0x3fd560dbc45153c7 //log(1/frcpa(1+243/256))/2
+data8 0x3fd5707a26bb8c66 //log(1/frcpa(1+244/256))/2
+data8 0x3fd587f60ed5b900 //log(1/frcpa(1+245/256))/2
+data8 0x3fd597a7977c8f31 //log(1/frcpa(1+246/256))/2
+data8 0x3fd5a760d634bb8b //log(1/frcpa(1+247/256))/2
+data8 0x3fd5b721d295f10f //log(1/frcpa(1+248/256))/2
+data8 0x3fd5c6ea94431ef9 //log(1/frcpa(1+249/256))/2
+data8 0x3fd5d6bb22ea86f6 //log(1/frcpa(1+250/256))/2
+data8 0x3fd5e6938645d390 //log(1/frcpa(1+251/256))/2
+data8 0x3fd5f673c61a2ed2 //log(1/frcpa(1+252/256))/2
+data8 0x3fd6065bea385926 //log(1/frcpa(1+253/256))/2
+data8 0x3fd6164bfa7cc06b //log(1/frcpa(1+254/256))/2
+data8 0x3fd62643fecf9743 //log(1/frcpa(1+255/256))/2
+LOCAL_OBJECT_END(atanhf_data2)
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(atanhf)
+// Entry point and main path (2^(-20) <= |x| < 1) of atanhf; argument in f8.
+// The main path evaluates 0.5*log(1 + x) and 0.5*log(1 - x) side by side
+// (suffix p / m) and subtracts them.  All table constants are already
+// scaled by 0.5, so no separate halving is needed.
+
+{ .mfi
+ getf.exp rArgSExpb = f8
+ fclass.m p9,p0 = f8, 0x0b // is arg denormal ?
+ mov rExpbMask = 0x1ffff
+}
+{ .mfi
+ addl DataPtr = @ltoff(atanhf_data), gp
+ fnma.s1 fOneMx = f8, f1, f1 // 1 - x
+ mov rBias = 0xffff
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fclass.m p7,p0 = f8, 0xc7 // is arg NaN or +/-0 ?
+ mov rNearZeroBound = 0xffeb // 2^(-20)
+}
+{ .mfi
+ ld8 DataPtr = [DataPtr]
+ fma.s1 fOnePx = f8, f1, f1 // 1 + x
+ nop.i 0
+}
+;;
+
+{ .mfb
+ nop.m 0
+ fnorm.s1 fNormX = f8 // Normalize x
+(p9) br.cond.spnt ATANH_UNORM // Branch if x=unorm
+}
+;;
+
+ATANH_COMMON:
+// Return here if x=unorm and not denorm
+{ .mfi
+ ldfpd fP3, fP2 = [DataPtr], 16 // 0.5*P3, 0.5*P2
+ fma.s1 fX2 = f8, f8, f0 // x^2
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p7) fma.s.s0 f8 = f8,f1,f8 // NaN or +/-0
+(p7) br.ret.spnt b0
+}
+;;
+
+{ .mfi
+ ldfpd fP1, fHalf = [DataPtr], 16 // 0.5*P1, 0.5
+ frcpa.s1 fRcpM, p9 = f1, fOneMx // rcpm = frcpa(1 - x)
+ nop.i 0
+}
+;;
+
+{ .mfi
+ getf.exp rExpbm = fOneMx
+ frcpa.s1 fRcpP, p0 = f1, fOnePx // rcpp = frcpa(1 + x)
+ // biased exponent
+ and rArgExpb = rArgSExpb, rExpbMask
+}
+;;
+
+{ .mmi
+ getf.exp rExpbp = fOnePx
+ // is |x| < 2^(-20) ?
+ cmp.gt p8,p0 = rNearZeroBound, rArgExpb
+ cmp.ge p6,p0 = rArgExpb, rBias // is |x| >= 1 ?
+}
+;;
+
+{ .mmb
+ getf.sig rSigm = fOneMx
+ nop.m 0
+(p6) br.cond.spnt atanhf_ge_one
+}
+;;
+
+{ .mfb
+ getf.sig rSigp = fOnePx
+(p8) fma.s.s0 f8 = fX2, f8, f8 // x + x^3
+(p8) br.ret.spnt b0 // Exit for MAX_DENORM_ABS < |x| < 2^-20
+}
+;;
+
+{ .mfi
+ ldfd fLog2 = [DataPtr], 16 // 0.5*ln(2); DataPtr now points past the pad
+ fms.s1 fRm = fRcpM, fOneMx, f1 // rm = rcpm * (1 - x) - 1
+ nop.i 0
+}
+;;
+
+{ .mmf
+ // (1 - x) is always positive here and we need not mask sign bit
+ sub rNm = rExpbm, rBias
+ // (1 + x) is always positive here and we need not mask sign bit
+ sub rNp = rExpbp, rBias
+ fms.s1 fRp = fRcpP, fOnePx, f1 // rp = rcpp * (1 + x) - 1
+}
+;;
+
+{ .mmi
+ setf.sig fN4CvtM = rNm
+ setf.sig fN4CvtP = rNp
+ extr.u rIndm = rSigm,55,8 // Extract 8 bits
+}
+;;
+
+// Table entries are 8 bytes each, hence the shift by 3.
+{ .mmi
+ shladd RcpTablePtrM = rIndm, 3, DataPtr
+ nop.m 0
+ extr.u rIndp = rSigp,55,8 // Extract 8 bits
+}
+;;
+
+{ .mmi
+ ldfd fLogTm = [RcpTablePtrM]
+ shladd RcpTablePtrP = rIndp, 3, DataPtr
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfd fLogTp = [RcpTablePtrP]
+ fma.s1 fRm2 = fRm, fRm, f0 // rm^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fP32m = fP3, fRm, fP2 // 0.5*(P3*rm + P2)
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRp2 = fRp, fRp, f0 // rp^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fP10m = fP1, fRm, fHalf // 0.5*(P1*rm + 1)
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fP32p = fP3, fRp, fP2 // 0.5*(P3*rp + P2)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fP10p = fP1, fRp, fHalf // 0.5*(P1*rp + 1)
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fcvt.xf fNm = fN4CvtM
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fcvt.xf fNp = fN4CvtP
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // 0.5*((P3*rm + P2)*rm^2 + (P1*rm + 1))
+ fma.s1 fP32m = fP32m, fRm2, fP10m
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // 0.5*((P3*rp + P2)*rp^2 + (P1*rp + 1))
+ fma.s1 fP32p = fP32p, fRp2, fP10p
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // Nm*ln(2)/2 + Tm/2
+ fma.s1 fLogTm = fNm, fLog2, fLogTm
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // Np*ln(2)/2 + Tp/2
+ fma.s1 fLogTp = fNp, fLog2, fLogTp
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // ((P3*rm + P2)*rm^2 + (P1*rm + 1))*0.5*rm + (Nm*ln(2)/2 + Tm/2)
+ fma.d.s1 fP32m = fP32m, fRm, fLogTm
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // ((P3*rp + P2)*rp^2 + (P1*rp + 1))*0.5*rp + (Np*ln(2)/2 + Tp/2)
+ fma.d.s1 fP32p = fP32p, fRp, fLogTp
+ nop.i 0
+}
+;;
+
+{ .mfb
+ nop.m 0
+ // atanhf(x) = 0.5 * (log(1 + x) - log(1 - x))
+ // fP32p - fP32m, rounded to single (.s) under status field s0
+ fnma.s.s0 f8 = fP32m, f1, fP32p
+ br.ret.sptk b0 // Exit for 2^(-20) <= |x| < 1.0
+}
+;;
+
+
+ATANH_UNORM:
+// Here if x=unorm
+// Reached from the entry group when fclass flagged f8 as unorm.  The
+// exponent is re-read from the normalized copy fNormX, and p9 is set to the
+// complement of the unorm test on fNormX: p9 = 1 means the value is no
+// longer unorm after normalization and the common path can be used.
+// Otherwise x is a true denormal and the result is x + sign(x)*x^2.
+{ .mfi
+ getf.exp rArgSExpb = fNormX // Recompute if x unorm
+ fclass.m p0,p9 = fNormX, 0x0b // Test x denorm
+ nop.i 0
+}
+;;
+
+{ .mfb
+ nop.m 0
+ fcmp.lt.s0 p10,p11 = f8, f0 // Set denormal flag; p10 = (x < 0)
+(p9) br.cond.sptk ATANH_COMMON // Continue if x unorm and not denorm
+}
+;;
+
+// The sign of x is in p10/p11, written by the fcmp above.  p6 has not been
+// written on this path and p7 (NaN or +/-0) is clear for a denormal, so
+// they must not be used to select the result.
+.pred.rel "mutex",p10,p11
+{ .mfi
+ nop.m 0
+(p10) fnma.s.s0 f8 = f8,f8,f8 // Result x-x^2 if x=-denorm
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p11) fma.s.s0 f8 = f8,f8,f8 // Result x+x^2 if x=+denorm
+ br.ret.spnt b0 // Exit if denorm
+}
+;;
+
+// Here if |x| >= 1.0
+// Error path.  Allocates a register frame (1 input, 3 locals, 4 outputs:
+// r32, r33-r35, r36-r39), keeps the original argument in f10, and prepares
+// the result in f8 and the error tag in atanh_GR_tag (r39) before
+// branching to __libm_error_region:
+//   |x| >  1.0 : f8 = QNaN (invalid raised),          tag 133
+//   |x| == 1.0 : f8 = inf with the sign of x (Z flag), tag 134
+atanhf_ge_one:
+{ .mfi
+ alloc r32 = ar.pfs,1,3,4,0
+ fmerge.s fArgAbs = f0, f8 // Form |x|
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fmerge.s f10 = f8, f8 // Save input for error call
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fcmp.eq.s1 p6,p7 = fArgAbs, f1 // Test for |x| = 1.0
+ nop.i 0
+}
+;;
+
+// Set error tag and result, and raise invalid flag if |x| > 1.0
+{ .mfi
+(p7) mov atanh_GR_tag = 133
+(p7) frcpa.s0 f8, p0 = f0, f0 // Get QNaN, and raise invalid
+ nop.i 0
+}
+;;
+
+// Set error tag and result, and raise Z flag if |x| = 1.0
+{ .mfi
+ nop.m 0
+(p6) frcpa.s0 fRm, p0 = f1, f0 // Get inf, and raise Z flag
+ nop.i 0
+}
+;;
+
+{ .mfb
+(p6) mov atanh_GR_tag = 134
+(p6) fmerge.s f8 = f8, fRm // result is +-inf
+ br.cond.sptk __libm_error_region // Exit if |x| >= 1.0
+}
+;;
+
+GLOBAL_LIBM_END(atanhf)
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+// Error stub entered from atanhf_ge_one with f10 = original argument,
+// f8 = preliminary result and the error tag in atanh_GR_tag.  It builds a
+// 64-byte stack frame, stores three single-precision values in it, calls
+// __libm_error_support and returns the float then found in the result slot.
+// Frame layout relative to the new sp:
+//   sp+16 : parameter 1 (f10, the input)      <- GR_Parameter_X
+//   sp+32 : parameter 2 (f1, a placeholder)   <- GR_Parameter_Y at the call
+//   sp+48 : result      (f8)                  <- GR_Parameter_RESULT
+// GR_Parameter_X/Y/RESULT and atanh_GR_tag are r36-r39, the four output
+// registers of the frame allocated in atanhf_ge_one.
+.prologue
+
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+
+{ .mmi
+ stfs [GR_Parameter_Y] = f1,16 // STORE Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+
+.body
+{ .mib
+ stfs [GR_Parameter_X] = f10 // STORE Parameter 1 on stack
+ // Parameter 3 address
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y
+ nop.b 0
+}
+{ .mib
+ stfs [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // back to the parameter 2 slot
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+
+// Recompute the result address from sp after the call, then reload f8.
+{ .mmi
+ add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
+};;
+
+{ .mmi
+ ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_atanhl.S b/sysdeps/ia64/fpu/e_atanhl.S
new file mode 100644
index 0000000000..8266bd56fb
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_atanhl.S
@@ -0,0 +1,1155 @@
+.file "atanhl.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
+// LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,
+// EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code,and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//*********************************************************************
+//
+// History:
+// 09/10/01 Initial version
+// 12/11/01 Corrected .restore syntax
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align;
+// used data8 for long double table values
+//
+//*********************************************************************
+//
+//*********************************************************************
+//
+// Function: atanhl(x) computes the principal value of the inverse
+// hyperbolic tangent of x.
+//
+//*********************************************************************
+//
+// Resources Used:
+//
+// Floating-Point Registers: f8 (Input and Return Value)
+// f33-f73
+//
+// General Purpose Registers:
+// r32-r52
+// r49-r52 (Used to pass arguments to error handling routine)
+//
+// Predicate Registers: p6-p15
+//
+//*********************************************************************
+//
+// IEEE Special Conditions:
+//
+// atanhl(inf) = QNaN
+// atanhl(-inf) = QNaN
+// atanhl(+/-0) = +/-0
+// atanhl(1) = +inf
+// atanhl(-1) = -inf
+// atanhl(|x|>1) = QNaN
+// atanhl(SNaN) = QNaN
+// atanhl(QNaN) = QNaN
+//
+//*********************************************************************
+//
+// Overview
+//
+// The method consists of two cases.
+//
+// If |x| < 1/32 use case atanhl_near_zero;
+// else use case atanhl_regular;
+//
+// Case atanhl_near_zero:
+//
+// atanhl(x) can be approximated by the Taylor series expansion
+// up to order 17.
+//
+// Case atanhl_regular:
+//
+// Here we use the formula atanhl(x) = sign(x)*log1pl(2*|x|/(1-|x|))/2, and
+// the calculation is subdivided into two stages. The first stage is the
+// calculation of X = 2*|x|/(1-|x|). The second one is the calculation of
+// sign(x)*log1pl(X)/2. To obtain the required accuracy we use a precise
+// division algorithm whose output is a pair of extended precision values
+// that approximate the quotient with accuracy higher than working
+// precision. This pair is passed to a modified log1pl function.
+//
+//
+// 1. calculating of X = 2*|x|/(1-|x|)
+// ( based on Peter Markstein's "IA-64 and Elementary Functions" book )
+// ********************************************************************
+//
+// a = 2*|x|
+// b = 1 - |x|
+// b_lo = (1 - b) - |x|
+//
+// y = frcpa(b) initial approximation of 1/b
+// q = a*y initial approximation of a/b
+//
+// e = 1 - b*y
+// e2 = e + e^2
+// e1 = e^2
+// y1 = y + y*e2 = y + y*(e+e^2)
+//
+// e3 = e + e1^2
+// y2 = y + y1*e3 = y + y*(e+e^2+..+e^6)
+//
+// r = a - b*q
+// e4 = 1 - b*y2
+// X = q + r*y2 high part of a/b
+//
+// y3 = y2 + y2*e4
+// r1 = a - b*X
+// r1 = r1 - b_lo*X
+// X_lo = r1*y3 low part of a/b
+//
+// 2. special log1p algorithm overview
+// ***********************************
+//
+// Here we use a table lookup method. The basic idea is that in
+// order to compute logl(Arg) = log1pl (Arg-1) for an argument Arg in [1,2),
+// we construct a value G such that G*Arg is close to 1 and that
+// logl(1/G) is obtainable easily from a table of values calculated
+// beforehand. Thus
+//
+// logl(Arg) = logl(1/G) + logl(G*Arg)
+// = logl(1/G) + logl(1 + (G*Arg - 1))
+//
+// Because |G*Arg - 1| is small, the second term on the right hand
+// side can be approximated by a short polynomial. We elaborate
+// this method in several steps.
+//
+// Step 0: Initialization
+// ------
+// We need to calculate logl(X + X_lo + 1). Obtain N, S_hi such that
+//
+// X + X_lo + 1 = 2^N * ( S_hi + S_lo ) exactly
+//
+// where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
+// that |S_lo| <= ulp(S_hi).
+//
+// For the special version of log1p we add X_lo to S_lo (S_lo = S_lo + X_lo)
+// !-----------------------------------------------------------------------!
+//
+// Step 1: Argument Reduction
+// ------
+// Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
+//
+// G := G_1 * G_2 * G_3
+// r := (G * S_hi - 1) + G * S_lo
+//
+// These G_j's have the property that the product is exactly
+// representable and that |r| < 2^(-12) as a result.
+//
+// Step 2: Approximation
+// ------
+// logl(1 + r) is approximated by a short polynomial poly(r).
+//
+// Step 3: Reconstruction
+// ------
+// Finally, log1pl(X + X_lo) = logl(X + X_lo + 1) is given by
+//
+// logl(X + X_lo + 1) = logl(2^N * (S_hi + S_lo))
+// ~=~ N*logl(2) + logl(1/G) + logl(1 + r)
+// ~=~ N*logl(2) + logl(1/G) + poly(r).
+//
+// For a detailed description see the log1pl function, regular path.
+//
+//*********************************************************************
+
+RODATA
+.align 64
+
+// ************* DO NOT CHANGE THE ORDER OF THESE TABLES *************
+
+LOCAL_OBJECT_START(Constants_TaylorSeries)
+data8 0xF0F0F0F0F0F0F0F1,0x00003FFA // C17 = 1/17 (significand, sign/exponent)
+data8 0x8888888888888889,0x00003FFB // C15 = 1/15
+data8 0x9D89D89D89D89D8A,0x00003FFB // C13 = 1/13
+data8 0xBA2E8BA2E8BA2E8C,0x00003FFB // C11 = 1/11
+data8 0xE38E38E38E38E38E,0x00003FFB // C9 = 1/9
+data8 0x9249249249249249,0x00003FFC // C7 = 1/7
+data8 0xCCCCCCCCCCCCCCCD,0x00003FFC // C5 = 1/5
+data8 0xAAAAAAAAAAAAAAAA,0x00003FFD // C3 = 1/3
+data4 0x3f000000 // 1/2 as IEEE single, at offset 0x80 (addressed via GR_ad_05)
+data4 0x00000000 // pad (fills the rest of this 16-byte slot)
+data4 0x00000000
+data4 0x00000000
+LOCAL_OBJECT_END(Constants_TaylorSeries)
+
+LOCAL_OBJECT_START(Constants_Q)
+data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 // log2_hi
+data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 // log2_lo
+data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000 // Q4
+data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000 // Q3
+data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000 // Q2
+data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000 // Q1
+LOCAL_OBJECT_END(Constants_Q)
+
+
+// Z1 - 16 bit fixed
+LOCAL_OBJECT_START(Constants_Z_1)
+data4 0x00008000
+data4 0x00007879
+data4 0x000071C8
+data4 0x00006BCB
+data4 0x00006667
+data4 0x00006187
+data4 0x00005D18
+data4 0x0000590C
+data4 0x00005556
+data4 0x000051EC
+data4 0x00004EC5
+data4 0x00004BDB
+data4 0x00004925
+data4 0x0000469F
+data4 0x00004445
+data4 0x00004211
+LOCAL_OBJECT_END(Constants_Z_1)
+
+// G1 and H1 - IEEE single and h1 - IEEE double
+LOCAL_OBJECT_START(Constants_G_H_h1)
+data4 0x3F800000,0x00000000
+data8 0x0000000000000000
+data4 0x3F70F0F0,0x3D785196
+data8 0x3DA163A6617D741C
+data4 0x3F638E38,0x3DF13843
+data8 0x3E2C55E6CBD3D5BB
+data4 0x3F579430,0x3E2FF9A0
+data8 0xBE3EB0BFD86EA5E7
+data4 0x3F4CCCC8,0x3E647FD6
+data8 0x3E2E6A8C86B12760
+data4 0x3F430C30,0x3E8B3AE7
+data8 0x3E47574C5C0739BA
+data4 0x3F3A2E88,0x3EA30C68
+data8 0x3E20E30F13E8AF2F
+data4 0x3F321640,0x3EB9CEC8
+data8 0xBE42885BF2C630BD
+data4 0x3F2AAAA8,0x3ECF9927
+data8 0x3E497F3497E577C6
+data4 0x3F23D708,0x3EE47FC5
+data8 0x3E3E6A6EA6B0A5AB
+data4 0x3F1D89D8,0x3EF8947D
+data8 0xBDF43E3CD328D9BE
+data4 0x3F17B420,0x3F05F3A1
+data8 0x3E4094C30ADB090A
+data4 0x3F124920,0x3F0F4303
+data8 0xBE28FBB2FC1FE510
+data4 0x3F0D3DC8,0x3F183EBF
+data8 0x3E3A789510FDE3FA
+data4 0x3F088888,0x3F20EC80
+data8 0x3E508CE57CC8C98F
+data4 0x3F042108,0x3F29516A
+data8 0xBE534874A223106C
+LOCAL_OBJECT_END(Constants_G_H_h1)
+
+// Z2 - 16 bit fixed
+LOCAL_OBJECT_START(Constants_Z_2)
+data4 0x00008000
+data4 0x00007F81
+data4 0x00007F02
+data4 0x00007E85
+data4 0x00007E08
+data4 0x00007D8D
+data4 0x00007D12
+data4 0x00007C98
+data4 0x00007C20
+data4 0x00007BA8
+data4 0x00007B31
+data4 0x00007ABB
+data4 0x00007A45
+data4 0x000079D1
+data4 0x0000795D
+data4 0x000078EB
+LOCAL_OBJECT_END(Constants_Z_2)
+
+// G2 and H2 - IEEE single and h2 - IEEE double
+LOCAL_OBJECT_START(Constants_G_H_h2)
+data4 0x3F800000,0x00000000
+data8 0x0000000000000000
+data4 0x3F7F00F8,0x3B7F875D
+data8 0x3DB5A11622C42273
+data4 0x3F7E03F8,0x3BFF015B
+data8 0x3DE620CF21F86ED3
+data4 0x3F7D08E0,0x3C3EE393
+data8 0xBDAFA07E484F34ED
+data4 0x3F7C0FC0,0x3C7E0586
+data8 0xBDFE07F03860BCF6
+data4 0x3F7B1880,0x3C9E75D2
+data8 0x3DEA370FA78093D6
+data4 0x3F7A2328,0x3CBDC97A
+data8 0x3DFF579172A753D0
+data4 0x3F792FB0,0x3CDCFE47
+data8 0x3DFEBE6CA7EF896B
+data4 0x3F783E08,0x3CFC15D0
+data8 0x3E0CF156409ECB43
+data4 0x3F774E38,0x3D0D874D
+data8 0xBE0B6F97FFEF71DF
+data4 0x3F766038,0x3D1CF49B
+data8 0xBE0804835D59EEE8
+data4 0x3F757400,0x3D2C531D
+data8 0x3E1F91E9A9192A74
+data4 0x3F748988,0x3D3BA322
+data8 0xBE139A06BF72A8CD
+data4 0x3F73A0D0,0x3D4AE46F
+data8 0x3E1D9202F8FBA6CF
+data4 0x3F72B9D0,0x3D5A1756
+data8 0xBE1DCCC4BA796223
+data4 0x3F71D488,0x3D693B9D
+data8 0xBE049391B6B7C239
+LOCAL_OBJECT_END(Constants_G_H_h2)
+
+// G3 and H3 - IEEE single and h3 - IEEE double
+LOCAL_OBJECT_START(Constants_G_H_h3)
+data4 0x3F7FFC00,0x38800100
+data8 0x3D355595562224CD
+data4 0x3F7FF400,0x39400480
+data8 0x3D8200A206136FF6
+data4 0x3F7FEC00,0x39A00640
+data8 0x3DA4D68DE8DE9AF0
+data4 0x3F7FE400,0x39E00C41
+data8 0xBD8B4291B10238DC
+data4 0x3F7FDC00,0x3A100A21
+data8 0xBD89CCB83B1952CA
+data4 0x3F7FD400,0x3A300F22
+data8 0xBDB107071DC46826
+data4 0x3F7FCC08,0x3A4FF51C
+data8 0x3DB6FCB9F43307DB
+data4 0x3F7FC408,0x3A6FFC1D
+data8 0xBD9B7C4762DC7872
+data4 0x3F7FBC10,0x3A87F20B
+data8 0xBDC3725E3F89154A
+data4 0x3F7FB410,0x3A97F68B
+data8 0xBD93519D62B9D392
+data4 0x3F7FAC18,0x3AA7EB86
+data8 0x3DC184410F21BD9D
+data4 0x3F7FA420,0x3AB7E101
+data8 0xBDA64B952245E0A6
+data4 0x3F7F9C20,0x3AC7E701
+data8 0x3DB4B0ECAABB34B8
+data4 0x3F7F9428,0x3AD7DD7B
+data8 0x3D9923376DC40A7E
+data4 0x3F7F8C30,0x3AE7D474
+data8 0x3DC6E17B4F2083D3
+data4 0x3F7F8438,0x3AF7CBED
+data8 0x3DAE314B811D4394
+data4 0x3F7F7C40,0x3B03E1F3
+data8 0xBDD46F21B08F2DB1
+data4 0x3F7F7448,0x3B0BDE2F
+data8 0xBDDC30A46D34522B
+data4 0x3F7F6C50,0x3B13DAAA
+data8 0x3DCB0070B1F473DB
+data4 0x3F7F6458,0x3B1BD766
+data8 0xBDD65DDC6AD282FD
+data4 0x3F7F5C68,0x3B23CC5C
+data8 0xBDCDAB83F153761A
+data4 0x3F7F5470,0x3B2BC997
+data8 0xBDDADA40341D0F8F
+data4 0x3F7F4C78,0x3B33C711
+data8 0x3DCD1BD7EBC394E8
+data4 0x3F7F4488,0x3B3BBCC6
+data8 0xBDC3532B52E3E695
+data4 0x3F7F3C90,0x3B43BAC0
+data8 0xBDA3961EE846B3DE
+data4 0x3F7F34A0,0x3B4BB0F4
+data8 0xBDDADF06785778D4
+data4 0x3F7F2CA8,0x3B53AF6D
+data8 0x3DCC3ED1E55CE212
+data4 0x3F7F24B8,0x3B5BA620
+data8 0xBDBA31039E382C15
+data4 0x3F7F1CC8,0x3B639D12
+data8 0x3D635A0B5C5AF197
+data4 0x3F7F14D8,0x3B6B9444
+data8 0xBDDCCB1971D34EFC
+data4 0x3F7F0CE0,0x3B7393BC
+data8 0x3DC7450252CD7ADA
+data4 0x3F7F04F0,0x3B7B8B6D
+data8 0xBDB68F177D7F2A42
+LOCAL_OBJECT_END(Constants_G_H_h3)
+
+
+
+// Floating Point Registers
+
+FR_C17 = f50
+FR_C15 = f51
+FR_C13 = f52
+FR_C11 = f53
+FR_C9 = f54
+FR_C7 = f55
+FR_C5 = f56
+FR_C3 = f57
+FR_x2 = f58
+FR_x3 = f59
+FR_x4 = f60
+FR_x8 = f61
+
+FR_Rcp = f61
+
+FR_A = f33
+FR_R1 = f33
+
+FR_E1 = f34
+FR_E3 = f34
+FR_Y2 = f34
+FR_Y3 = f34
+
+FR_E2 = f35
+FR_Y1 = f35
+
+FR_B = f36
+FR_Y0 = f37
+FR_E0 = f38
+FR_E4 = f39
+FR_Q0 = f40
+FR_R0 = f41
+FR_B_lo = f42
+
+FR_abs_x = f43
+FR_Bp = f44
+FR_Bn = f45
+FR_Yp = f46
+FR_Yn = f47
+
+FR_X = f48
+FR_BB = f48
+FR_X_lo = f49
+
+FR_G = f50
+FR_Y_hi = f51
+FR_H = f51
+FR_h = f52
+FR_G2 = f53
+FR_H2 = f54
+FR_h2 = f55
+FR_G3 = f56
+FR_H3 = f57
+FR_h3 = f58
+
+FR_Q4 = f59
+FR_poly_lo = f59
+FR_Y_lo = f59
+
+FR_Q3 = f60
+FR_Q2 = f61
+
+FR_Q1 = f62
+FR_poly_hi = f62
+
+FR_float_N = f63
+
+FR_AA = f64
+FR_S_lo = f64
+
+FR_S_hi = f65
+FR_r = f65
+
+FR_log2_hi = f66
+FR_log2_lo = f67
+FR_Z = f68
+FR_2_to_minus_N = f69
+FR_rcub = f70
+FR_rsq = f71
+FR_05r = f72
+FR_Half = f73
+
+FR_Arg_X = f50
+FR_Arg_Y = f0
+FR_RESULT = f8
+
+
+
+// General Purpose Registers
+
+GR_ad_05 = r33
+GR_Index1 = r34
+GR_ArgExp = r34
+GR_Index2 = r35
+GR_ExpMask = r35
+GR_NearZeroBound = r36
+GR_signif = r36
+GR_X_0 = r37
+GR_X_1 = r37
+GR_X_2 = r38
+GR_Index3 = r38
+GR_minus_N = r39
+GR_Z_1 = r40
+GR_Z_2 = r40
+GR_N = r41
+GR_Bias = r42
+GR_M = r43
+GR_ad_taylor = r44
+GR_ad_taylor_2 = r45
+GR_ad2_tbl_3 = r45
+GR_ad_tbl_1 = r46
+GR_ad_tbl_2 = r47
+GR_ad_tbl_3 = r48
+GR_ad_q = r49
+GR_ad_z_1 = r50
+GR_ad_z_2 = r51
+GR_ad_z_3 = r52
+
+//
+// Added for unwind support
+//
+GR_SAVE_PFS = r46
+GR_SAVE_B0 = r47
+GR_SAVE_GP = r48
+GR_Parameter_X = r49
+GR_Parameter_Y = r50
+GR_Parameter_RESULT = r51
+GR_Parameter_TAG = r52
+
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(atanhl)
+
+{ .mfi
+ alloc r32 = ar.pfs,0,17,4,0
+ fnma.s1 FR_Bp = f8,f1,f1 // b = 1 - |arg| (for x>0)
+ mov GR_ExpMask = 0x1ffff
+}
+{ .mfi
+ addl GR_ad_taylor = @ltoff(Constants_TaylorSeries),gp
+ fma.s1 FR_Bn = f8,f1,f1 // b = 1 - |arg| (for x<0)
+ mov GR_NearZeroBound = 0xfffa // biased exp of 1/32
+};;
+{ .mfi
+ getf.exp GR_ArgExp = f8
+ fcmp.lt.s1 p6,p7 = f8,f0 // is negative?
+ nop.i 0
+}
+{ .mfi
+ ld8 GR_ad_taylor = [GR_ad_taylor]
+ fmerge.s FR_abs_x = f1,f8
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fclass.m p8,p0 = f8,0x1C7 // is arg NaT,Q/SNaN or +/-0 ?
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_x2 = f8,f8,f0
+ nop.i 0
+};;
+{ .mfi
+ add GR_ad_z_1 = 0x0F0,GR_ad_taylor
+ fclass.m p9,p0 = f8,0x0a // is arg -denormal ?
+ add GR_ad_taylor_2 = 0x010,GR_ad_taylor
+}
+{ .mfi
+ add GR_ad_05 = 0x080,GR_ad_taylor
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ ldfe FR_C17 = [GR_ad_taylor],32
+ fclass.m p10,p0 = f8,0x09 // is arg +denormal ?
+ add GR_ad_tbl_1 = 0x040,GR_ad_z_1 // point to Constants_G_H_h1
+}
+{ .mfb
+ add GR_ad_z_2 = 0x140,GR_ad_z_1 // point to Constants_Z_2
+ (p8) fma.s0 f8 = f8,f1,f0 // NaN or +/-0
+ (p8) br.ret.spnt b0 // exit for Nan or +/-0
+};;
+{ .mfi
+ ldfe FR_C15 = [GR_ad_taylor_2],32
+ fclass.m p15,p0 = f8,0x23 // is +/-INF ?
+ add GR_ad_tbl_2 = 0x180,GR_ad_z_1 // point to Constants_G_H_h2
+}
+{ .mfb
+ ldfe FR_C13 = [GR_ad_taylor],32
+ (p9) fnma.s0 f8 = f8,f8,f8 // -denormal
+ (p9) br.ret.spnt b0 // exit for -denormal
+};;
+{ .mfi
+ ldfe FR_C11 = [GR_ad_taylor_2],32
+ fcmp.eq.s0 p13,p0 = FR_abs_x,f1 // is |arg| = 1?
+ nop.i 0
+}
+{ .mfb
+ ldfe FR_C9 = [GR_ad_taylor],32
+(p10) fma.s0 f8 = f8,f8,f8 // +denormal
+(p10) br.ret.spnt b0 // exit for +denormal
+};;
+{ .mfi
+ ldfe FR_C7 = [GR_ad_taylor_2],32
+ (p6) frcpa.s1 FR_Yn,p11 = f1,FR_Bn // y = frcpa(b)
+ and GR_ArgExp = GR_ArgExp,GR_ExpMask // biased exponent
+}
+{ .mfb
+ ldfe FR_C5 = [GR_ad_taylor],32
+ fnma.s1 FR_B = FR_abs_x,f1,f1 // b = 1 - |arg|
+(p15) br.cond.spnt atanhl_gt_one // |arg| > 1
+};;
+{ .mfb
+ cmp.gt p14,p0 = GR_NearZeroBound,GR_ArgExp // p14 = biased exp of arg below that of 1/32
+ (p7) frcpa.s1 FR_Yp,p12 = f1,FR_Bp // y = frcpa(b)
+(p13) br.cond.spnt atanhl_eq_one // |arg| = 1
+}
+{ .mfb
+ ldfe FR_C3 = [GR_ad_taylor_2],32
+ fma.s1 FR_A = FR_abs_x,f1,FR_abs_x // a = 2 * |arg|
+(p14) br.cond.spnt atanhl_near_zero // |arg| < 1/32
+};;
+{ .mfi
+ nop.m 0
+ fcmp.gt.s0 p8,p0 = FR_abs_x,f1 // is |arg| > 1 ?
+ nop.i 0
+};;
+.pred.rel "mutex",p6,p7
+{ .mfi
+ nop.m 0
+ (p6) fnma.s1 FR_B_lo = FR_Bn,f1,f1 // argt = 1 - (1 - |arg|)
+ nop.i 0
+}
+{ .mfi
+ ldfs FR_Half = [GR_ad_05]
+ (p7) fnma.s1 FR_B_lo = FR_Bp,f1,f1
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ (p6) fnma.s1 FR_E0 = FR_Yn,FR_Bn,f1 // e = 1-b*y
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ (p6) fma.s1 FR_Y0 = FR_Yn,f1,f0
+ (p8) br.cond.spnt atanhl_gt_one // |arg| > 1
+};;
+{ .mfi
+ nop.m 0
+ (p7) fnma.s1 FR_E0 = FR_Yp,FR_Bp,f1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ (p6) fma.s1 FR_Q0 = FR_A,FR_Yn,f0 // q = a*y
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ (p7) fma.s1 FR_Q0 = FR_A,FR_Yp,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ (p7) fma.s1 FR_Y0 = FR_Yp,f1,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fclass.nm p10,p0 = f8,0x1FF // test for unsupported
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_E2 = FR_E0,FR_E0,FR_E0 // e2 = e+e^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_E1 = FR_E0,FR_E0,f0 // e1 = e^2
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+// Return generated NaN or other value for unsupported values.
+(p10) fma.s0 f8 = f8, f0, f0
+(p10) br.ret.spnt b0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Y1 = FR_Y0,FR_E2,FR_Y0 // y1 = y+y*e2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_E3 = FR_E1,FR_E1,FR_E0 // e3 = e+e1^2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_B_lo = FR_abs_x,f1,FR_B_lo // b_lo = argt-|arg|
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Y2 = FR_Y1,FR_E3,FR_Y0 // y2 = y+y1*e3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_R0 = FR_B,FR_Q0,FR_A // r = a-b*q
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_E4 = FR_B,FR_Y2,f1 // e4 = 1-b*y2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_X = FR_R0,FR_Y2,FR_Q0 // x = q+r*y2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Z = FR_X,f1,f1 // x+1
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ (p6) fnma.s1 FR_Half = FR_Half,f1,f0 // sign(arg)/2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Y3 = FR_Y2,FR_E4,FR_Y2 // y3 = y2+y2*e4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_R1 = FR_B,FR_X,FR_A // r1 = a-b*x
+ nop.i 0
+};;
+{ .mfi
+ getf.sig GR_signif = FR_Z // get significand of x+1
+ nop.f 0
+ nop.i 0
+};;
+
+
+{ .mfi
+ add GR_ad_q = -0x060,GR_ad_z_1
+ nop.f 0
+ extr.u GR_Index1 = GR_signif,59,4 // get high 4 bits of signif
+}
+{ .mfi
+ add GR_ad_tbl_3 = 0x280,GR_ad_z_1 // point to Constants_G_H_h3
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ shladd GR_ad_z_1 = GR_Index1,2,GR_ad_z_1 // point to Z_1
+ nop.f 0
+ extr.u GR_X_0 = GR_signif,49,15 // get high 15 bits of significand
+};;
+{ .mfi
+ ld4 GR_Z_1 = [GR_ad_z_1] // load Z_1
+ fmax.s1 FR_AA = FR_X,f1 // for S_lo,form AA = max(X,1.0)
+ nop.i 0
+}
+{ .mfi
+ shladd GR_ad_tbl_1 = GR_Index1,4,GR_ad_tbl_1 // point to G_1
+ nop.f 0
+ mov GR_Bias = 0x0FFFF // exponent bias
+};;
+{ .mfi
+ ldfps FR_G,FR_H = [GR_ad_tbl_1],8 // load G_1,H_1
+ fmerge.se FR_S_hi = f1,FR_Z // form |x+1|
+ nop.i 0
+};;
+{ .mfi
+ getf.exp GR_N = FR_Z // get N = exponent of x+1
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfd FR_h = [GR_ad_tbl_1] // load h_1
+ fnma.s1 FR_R1 = FR_B_lo,FR_X,FR_R1 // r1 = r1-b_lo*x
+ nop.i 0
+};;
+{ .mfi
+ ldfe FR_log2_hi = [GR_ad_q],16 // load log2_hi
+ nop.f 0
+ pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // get bits 30-15 of X_0 * Z_1
+};;
+//
+// For performance,don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ ldfe FR_log2_lo = [GR_ad_q],16 // load log2_lo
+ nop.f 0
+ sub GR_N = GR_N,GR_Bias
+};;
+{ .mfi
+ ldfe FR_Q4 = [GR_ad_q],16 // load Q4
+ fms.s1 FR_S_lo = FR_AA,f1,FR_Z // form S_lo = AA - Z
+ sub GR_minus_N = GR_Bias,GR_N // form exponent of 2^(-N)
+};;
+{ .mmf
+ ldfe FR_Q3 = [GR_ad_q],16 // load Q3
+ // put integer N into rightmost significand
+ setf.sig FR_float_N = GR_N
+ fmin.s1 FR_BB = FR_X,f1 // for S_lo,form BB = min(X,1.0)
+};;
+{ .mfi
+ ldfe FR_Q2 = [GR_ad_q],16 // load Q2
+ nop.f 0
+ extr.u GR_Index2 = GR_X_1,6,4 // extract bits 6-9 of X_1
+};;
+{ .mmi
+ ldfe FR_Q1 = [GR_ad_q] // load Q1
+ shladd GR_ad_z_2 = GR_Index2,2,GR_ad_z_2 // point to Z_2
+ nop.i 0
+};;
+{ .mmi
+ ld4 GR_Z_2 = [GR_ad_z_2] // load Z_2
+ shladd GR_ad_tbl_2 = GR_Index2,4,GR_ad_tbl_2 // point to G_2
+ nop.i 0
+};;
+{ .mfi
+ ldfps FR_G2,FR_H2 = [GR_ad_tbl_2],8 // load G_2,H_2
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ ldfd FR_h2 = [GR_ad_tbl_2] // load h_2
+ fma.s1 FR_S_lo = FR_S_lo,f1,FR_BB // S_lo = S_lo + BB
+ nop.i 0
+}
+{ .mfi
+ setf.exp FR_2_to_minus_N = GR_minus_N // form 2^(-N)
+ fma.s1 FR_X_lo = FR_R1,FR_Y3,f0 // x_lo = r1*y3
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ nop.f 0
+ pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // get bits 30-15 of X_1 * Z_2
+};;
+//
+// For performance,don't use result of pmpyshr2.u for 4 cycles
+//
+{ .mfi
+ add GR_ad2_tbl_3 = 8,GR_ad_tbl_3
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ nop.f 0
+ nop.i 0
+};;
+
+//
+// Now GR_X_2 can be used
+//
+{ .mfi
+ nop.m 0
+ nop.f 0
+ extr.u GR_Index3 = GR_X_2,1,5 // extract bits 1-5 of X_2
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S_lo = FR_S_lo,f1,FR_X_lo // S_lo = S_lo + Arg_lo
+ nop.i 0
+};;
+
+{ .mfi
+ shladd GR_ad_tbl_3 = GR_Index3,4,GR_ad_tbl_3 // point to G_3
+ fcvt.xf FR_float_N = FR_float_N
+ nop.i 0
+}
+{ .mfi
+ shladd GR_ad2_tbl_3 = GR_Index3,4,GR_ad2_tbl_3 // point to h_3
+ fma.s1 FR_Q1 = FR_Q1,FR_Half,f0 // sign(arg)*Q1/2
+ nop.i 0
+};;
+{ .mmi
+ ldfps FR_G3,FR_H3 = [GR_ad_tbl_3],8 // load G_3,H_3
+ ldfd FR_h3 = [GR_ad2_tbl_3] // load h_3
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_G = FR_G,FR_G2 // G = G_1 * G_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_H = FR_H,FR_H2 // H = H_1 + H_2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_h = FR_h,FR_h2 // h = h_1 + h_2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ // S_lo = S_lo * 2^(-N)
+ fma.s1 FR_S_lo = FR_S_lo,FR_2_to_minus_N,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_G = FR_G,FR_G3 // G = (G_1 * G_2) * G_3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_H = FR_H,FR_H3 // H = (H_1 + H_2) + H_3
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_h = FR_h,FR_h3 // h = (h_1 + h_2) + h_3
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r = FR_G,FR_S_hi,f1 // r = G * S_hi - 1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // Y_hi = N * log2_hi + H
+ fma.s1 FR_Y_hi = FR_float_N,FR_log2_hi,FR_H
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_h = FR_float_N,FR_log2_lo,FR_h // h = N * log2_lo + h
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r = FR_G,FR_S_lo,FR_r // r = G * S_lo + (G * S_hi - 1)
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_lo = FR_r,FR_Q4,FR_Q3 // poly_lo = r * Q4 + Q3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_rsq = FR_r,FR_r // rsq = r * r
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_05r = FR_r,FR_Half,f0 // sign(arg)*r/2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo * r + Q2
+ fma.s1 FR_poly_lo = FR_poly_lo,FR_r,FR_Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rcub = FR_rsq,FR_r,f0 // rcub = r^3
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ // poly_hi = sign(arg)*(Q1*r^2 + r)/2
+ fma.s1 FR_poly_hi = FR_Q1,FR_rsq,FR_05r
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo*r^3 + h
+ fma.s1 FR_poly_lo = FR_poly_lo,FR_rcub,FR_h
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ // Y_lo = poly_hi + poly_lo/2
+ fma.s0 FR_Y_lo = FR_poly_lo,FR_Half,FR_poly_hi
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+ // Result = arctanh(x) = Y_hi/2 + Y_lo
+ fma.s0 f8 = FR_Y_hi,FR_Half,FR_Y_lo
+ br.ret.sptk b0
+};;
+
+// Taylor's series: atanhl(x) ~= x + x^3*(C3 + C5*x^2 + ... + C17*x^14), |x| < 1/32
+atanhl_near_zero:
+{ .mfi
+ nop.m 0
+ fma.s1 FR_x3 = FR_x2,f8,f0 // x^3 = x^2 * x
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_x4 = FR_x2,FR_x2,f0 // x^4 = x^2 * x^2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C17 = FR_C17,FR_x2,FR_C15 // C15 + C17*x^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C13 = FR_C13,FR_x2,FR_C11 // C11 + C13*x^2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C9 = FR_C9,FR_x2,FR_C7 // C7 + C9*x^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C5 = FR_C5,FR_x2,FR_C3 // C3 + C5*x^2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_x8 = FR_x4,FR_x4,f0 // x^8 = x^4 * x^4
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C17 = FR_C17,FR_x4,FR_C13 // C11 + C13*x^2 + C15*x^4 + C17*x^6
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C9 = FR_C9,FR_x4,FR_C5 // C3 + C5*x^2 + C7*x^4 + C9*x^6
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C17 = FR_C17,FR_x8,FR_C9 // full polynomial: C3 + C5*x^2 + ... + C17*x^14
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+ fma.s0 f8 = FR_C17,FR_x3,f8 // result = x + x^3 * polynomial
+ br.ret.sptk b0
+};;
+
+atanhl_eq_one: // |x| = 1: result is infinity with the sign of x, reported via __libm_error_region
+{ .mfi
+ nop.m 0
+ frcpa.s0 FR_Rcp,p0 = f1,f0 // get inf,and raise Z flag
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmerge.s FR_Arg_X = f8, f8 // copy the input x; stored as Parameter 1 by __libm_error_region
+ nop.i 0
+};;
+{ .mfb
+ mov GR_Parameter_TAG = 130 // error tag used for the |x| = 1 case
+ fmerge.s FR_RESULT = f8,FR_Rcp // result is +-inf (sign of x, magnitude of FR_Rcp)
+ br.cond.sptk __libm_error_region // exit if |x| = 1.0
+};;
+
+atanhl_gt_one: // |x| > 1 (including +/-inf): result is QNaN, reported via __libm_error_region
+{ .mfi
+ nop.m 0
+ fmerge.s FR_Arg_X = f8, f8 // copy the input x; stored as Parameter 1 by __libm_error_region
+ nop.i 0
+};;
+{ .mfb
+ mov GR_Parameter_TAG = 129 // error tag used for the |x| > 1 case
+ frcpa.s0 FR_RESULT,p0 = f0,f0 // get QNaN,and raise invalid
+ br.cond.sptk __libm_error_region // exit if |x| > 1.0
+};;
+
+GLOBAL_LIBM_END(atanhl)
+LOCAL_LIBM_ENTRY(__libm_error_region)
+// Error exit for atanhl: builds a 64-byte stack frame, stores FR_Arg_X,
+// FR_Arg_Y and FR_RESULT in it, calls __libm_error_support, then reloads f8
+// from the result slot and returns through the saved b0.
+// Frame layout relative to the new sp: +16 Parameter 1, +32 Parameter 2,
+// +48 Parameter 3 (result). GR_Parameter_TAG is set by the branching code.
+.prologue
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address (old sp - 32 = new sp + 32)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new 64-byte stack frame
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+{ .mmi
+ stfe [GR_Parameter_Y] = FR_Arg_Y,16 // Save Parameter 2 at sp+32; pointer advances to sp+48
+ add GR_Parameter_X = 16,sp // Parameter 1 address (sp+16)
+.save b0,GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+.body
+{ .mib
+ stfe [GR_Parameter_X] = FR_Arg_X // Store Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address (sp+48)
+ nop.b 0
+}
+{ .mib
+ stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 (provisional result) at sp+48
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Point back at Parameter 2 (sp+32)
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp // Re-derive the result slot address after the call
+};;
+{ .mmi
+ ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region#)
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_cosh.S b/sysdeps/ia64/fpu/e_cosh.S
index 205653d4bf..0c6c5b451e 100644
--- a/sysdeps/ia64/fpu/e_cosh.S
+++ b/sysdeps/ia64/fpu/e_cosh.S
@@ -1,10 +1,10 @@
.file "cosh.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2002, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,1081 +20,799 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 Initial version
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
-//
+// 05/07/01 Reworked to improve speed of all paths
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 11/15/02 Improved speed with new algorithm
+
// API
//==============================================================
-// double = cosh(double)
-// input floating point f8
-// output floating point f8
-
+// double cosh(double)
// Overview of operation
//==============================================================
-// There are four paths
+// Case 1: 0 < |x| < 0.25
+// Evaluate cosh(x) by a 12th order polynomial
+// Care is taken for the order of multiplication; and A2 is not exactly 1/4!,
+// A3 is not exactly 1/6!, etc.
+// cosh(x) = 1 + (A1*x^2 + A2*x^4 + A3*x^6 + A4*x^8 + A5*x^10 + A6*x^12)
+//
+// Case 2: 0.25 < |x| < 710.47586
+// Algorithm is based on the identity cosh(x) = ( exp(x) + exp(-x) ) / 2.
+// The algorithm for exp is described as below. There are a number of
+// economies from evaluating both exp(x) and exp(-x). Although we
+// are evaluating both quantities, only where the quantities diverge do we
+// duplicate the computations. The basic algorithm for exp(x) is described
+// below.
+//
+// Take the input x. w is "how many log2/128 in x?"
+// w = x * 128/log2
+// n = int(w)
+// x = n log2/128 + r + delta
-// 1. |x| < 0.25 COSH_BY_POLY
-// 2. |x| < 32 COSH_BY_TBL
-// 3. |x| < 2^14 COSH_BY_EXP
-// 4. |x_ >= 2^14 COSH_HUGE
+// n = 128M + index_1 + 2^4 index_2
+// x = M log2 + (log2/128) index_1 + (log2/8) index_2 + r + delta
-// For paths 1, and 2 SAFE is always 1.
-// For path 4, Safe is always 0.
-// SAFE = 1 means we cannot overflow.
+// exp(x) = 2^M 2^(index_1/128) 2^(index_2/8) exp(r) exp(delta)
+// Construct 2^M
+// Get 2^(index_1/128) from table_1;
+// Get 2^(index_2/8) from table_2;
+// Calculate exp(r) by 5th order polynomial
+// r = x - n (log2/128)_high
+// delta = - n (log2/128)_low
+// Calculate exp(delta) as 1 + delta
-#include "libm_support.h"
-// Assembly macros
+// Special values
//==============================================================
-cosh_FR_X = f44
-cosh_FR_SGNX = f40
-
-cosh_FR_Inv_log2by64 = f9
-cosh_FR_log2by64_lo = f11
-cosh_FR_log2by64_hi = f10
-
-cosh_FR_A1 = f9
-cosh_FR_A2 = f10
-cosh_FR_A3 = f11
-
-cosh_FR_Rcub = f12
-cosh_FR_M_temp = f13
-cosh_FR_R_temp = f13
-cosh_FR_Rsq = f13
-cosh_FR_R = f14
-
-cosh_FR_M = f38
-
-cosh_FR_B1 = f15
-cosh_FR_B2 = f32
-cosh_FR_B3 = f33
-
-cosh_FR_peven_temp1 = f34
-cosh_FR_peven_temp2 = f35
-cosh_FR_peven = f36
-
-cosh_FR_podd_temp1 = f34
-cosh_FR_podd_temp2 = f35
-cosh_FR_podd = f37
-
-cosh_FR_J_temp = f9
-cosh_FR_J = f10
+// cosh(+0) = 1.0
+// cosh(-0) = 1.0
-cosh_FR_Mmj = f39
+// cosh(+qnan) = +qnan
+// cosh(-qnan) = -qnan
+// cosh(+snan) = +qnan
+// cosh(-snan) = -qnan
-cosh_FR_N_temp1 = f11
-cosh_FR_N_temp2 = f12
-cosh_FR_N = f13
+// cosh(-inf) = +inf
+// cosh(+inf) = +inf
-cosh_FR_spos = f14
-cosh_FR_sneg = f15
-
-cosh_FR_Tjhi = f32
-cosh_FR_Tjlo = f33
-cosh_FR_Tmjhi = f34
-cosh_FR_Tmjlo = f35
-
-GR_mJ = r35
-GR_J = r36
-
-AD_mJ = r38
-AD_J = r39
-
-cosh_FR_C_hi = f9
-cosh_FR_C_hi_temp = f10
-cosh_FR_C_lo_temp1 = f11
-cosh_FR_C_lo_temp2 = f12
-cosh_FR_C_lo_temp3 = f13
-
-cosh_FR_C_lo = f38
-cosh_FR_S_hi = f39
-
-cosh_FR_S_hi_temp1 = f10
-cosh_FR_Y_hi = f11
-cosh_FR_Y_lo_temp = f12
-cosh_FR_Y_lo = f13
-cosh_FR_COSH = f9
-
-cosh_FR_X2 = f9
-cosh_FR_X4 = f10
-
-cosh_FR_P1 = f14
-cosh_FR_P2 = f15
-cosh_FR_P3 = f32
-cosh_FR_P4 = f33
-cosh_FR_P5 = f34
-cosh_FR_P6 = f35
-
-cosh_FR_TINY_THRESH = f9
-
-cosh_FR_COSH_temp = f10
-cosh_FR_SCALE = f11
+// Overflow and Underflow
+//=======================
+// cosh(x) = largest double normal when
+// x = 710.47586 = 0x408633ce8fb9f87d
+//
+// There is no underflow.
-cosh_FR_hi_lo = f10
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input, output
+// f6 -> f15, f32 -> f61
-cosh_FR_poly_podd_temp1 = f11
-cosh_FR_poly_podd_temp2 = f13
-cosh_FR_poly_peven_temp1 = f11
-cosh_FR_poly_peven_temp2 = f13
+// General registers used:
+// r14 -> r40
-GR_SAVE_PFS = r41
-GR_SAVE_B0 = r42
-GR_SAVE_GP = r43
+// Predicate registers used:
+// p6 -> p15
-GR_Parameter_X = r44
-GR_Parameter_Y = r45
-GR_Parameter_RESULT = r46
+// Assembly macros
+//==============================================================
+rRshf = r14
+rN_neg = r14
+rAD_TB1 = r15
+rAD_TB2 = r16
+rAD_P = r17
+rN = r18
+rIndex_1 = r19
+rIndex_2_16 = r20
+rM = r21
+rBiased_M = r21
+rSig_inv_ln2 = r22
+rIndex_1_neg = r22
+rExp_bias = r23
+rExp_bias_minus_1 = r23
+rExp_mask = r24
+rTmp = r24
+rGt_ln = r24
+rIndex_2_16_neg = r24
+rM_neg = r25
+rBiased_M_neg = r25
+rRshf_2to56 = r26
+rAD_T1_neg = r26
+rExp_2tom56 = r28
+rAD_T2_neg = r28
+rAD_T1 = r29
+rAD_T2 = r30
+rSignexp_x = r31
+rExp_x = r31
+
+GR_SAVE_B0 = r33
+GR_SAVE_PFS = r34
+GR_SAVE_GP = r35
+GR_SAVE_SP = r36
+
+GR_Parameter_X = r37
+GR_Parameter_Y = r38
+GR_Parameter_RESULT = r39
+GR_Parameter_TAG = r40
+
+
+FR_X = f10
+FR_Y = f1
+FR_RESULT = f8
+
+fRSHF_2TO56 = f6
+fINV_LN2_2TO63 = f7
+fW_2TO56_RSH = f9
+f2TOM56 = f11
+fP5 = f12
+fP4 = f13
+fP3 = f14
+fP2 = f15
+
+fLn2_by_128_hi = f33
+fLn2_by_128_lo = f34
+
+fRSHF = f35
+fNfloat = f36
+fNormX = f37
+fR = f38
+fF = f39
+
+fRsq = f40
+f2M = f41
+fS1 = f42
+fT1 = f42
+fS2 = f43
+fT2 = f43
+fS = f43
+fWre_urm_f8 = f44
+fAbsX = f44
+
+fMIN_DBL_OFLOW_ARG = f45
+fMAX_DBL_NORM_ARG = f46
+fXsq = f47
+fX4 = f48
+fGt_pln = f49
+fTmp = f49
+
+fP54 = f50
+fP5432 = f50
+fP32 = f51
+fP = f52
+fP54_neg = f53
+fP5432_neg = f53
+fP32_neg = f54
+fP_neg = f55
+fF_neg = f56
+
+f2M_neg = f57
+fS1_neg = f58
+fT1_neg = f58
+fS2_neg = f59
+fT2_neg = f59
+fS_neg = f59
+fExp = f60
+fExp_neg = f61
+
+fA6 = f50
+fA65 = f50
+fA6543 = f50
+fA654321 = f50
+fA5 = f51
+fA4 = f52
+fA43 = f52
+fA3 = f53
+fA2 = f54
+fA21 = f54
+fA1 = f55
// Data tables
//==============================================================
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
-
+RODATA
.align 16
-double_cosh_arg_reduction:
-ASM_TYPE_DIRECTIVE(double_cosh_arg_reduction,@object)
- data8 0xB8AA3B295C17F0BC, 0x00004005
- data8 0xB17217F7D1000000, 0x00003FF8
- data8 0xCF79ABC9E3B39804, 0x00003FD0
-ASM_SIZE_DIRECTIVE(double_cosh_arg_reduction)
-
-double_cosh_p_table:
-ASM_TYPE_DIRECTIVE(double_cosh_p_table,@object)
- data8 0x8000000000000000, 0x00003FFE
- data8 0xAAAAAAAAAAAAAB80, 0x00003FFA
- data8 0xB60B60B60B4FE884, 0x00003FF5
- data8 0xD00D00D1021D7370, 0x00003FEF
- data8 0x93F27740C0C2F1CC, 0x00003FE9
- data8 0x8FA02AC65BCBD5BC, 0x00003FE2
-ASM_SIZE_DIRECTIVE(double_cosh_p_table)
-
-double_cosh_ab_table:
-ASM_TYPE_DIRECTIVE(double_cosh_ab_table,@object)
- data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC
- data8 0x88888888884ECDD5, 0x00003FF8
- data8 0xD00D0C6DCC26A86B, 0x00003FF2
- data8 0x8000000000000002, 0x00003FFE
- data8 0xAAAAAAAAAA402C77, 0x00003FFA
- data8 0xB60B6CC96BDB144D, 0x00003FF5
-ASM_SIZE_DIRECTIVE(double_cosh_ab_table)
-
-double_cosh_j_table:
-ASM_TYPE_DIRECTIVE(double_cosh_j_table,@object)
- data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000
- data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000
- data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000
- data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000
- data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000
- data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000
- data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000
- data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000
- data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000
- data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000
- data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000
- data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000
- data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000
- data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000
- data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 0x00000000
- data8 0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000
- data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000
- data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000
- data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000
- data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000
- data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000
- data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000
- data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000
- data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000
- data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000
- data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000
- data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000
- data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000
- data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000
- data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000
- data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000
- data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000
- data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000
- data8 0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000
- data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000
- data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000
- data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000
- data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000
- data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000
- data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000
- data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000
- data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000
- data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000
- data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000
- data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000
- data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000
- data8 0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 0x00000000
- data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000
- data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000
- data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000
- data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000
- data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000
- data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000
- data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000
- data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000
- data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000
- data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000
- data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000
- data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000
- data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000
- data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000
- data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000
- data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000
- data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000
- data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000
-ASM_SIZE_DIRECTIVE(double_cosh_j_table)
-
-.align 32
-.global cosh#
-.section .text
-.proc cosh#
-.align 32
+// ************* DO NOT CHANGE ORDER OF THESE TABLES ********************
-cosh:
+// double-extended 1/ln(2)
+// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88
+// 3fff b8aa 3b29 5c17 f0bc
+// For speed the significand will be loaded directly with a movl and setf.sig
+// and the exponent will be bias+63 instead of bias+0. Thus subsequent
+// computations need to scale appropriately.
+// The constant 128/ln(2) is needed for the computation of w. This is also
+// obtained by scaling the computations.
+//
+// Two shifting constants are loaded directly with movl and setf.d.
+// 1. fRSHF_2TO56 = 1.1000..00 * 2^(63-7)
+// This constant is added to x*1/ln2 to shift the integer part of
+// x*128/ln2 into the rightmost bits of the significand.
+// The result of this fma is fW_2TO56_RSH.
+// 2. fRSHF = 1.1000..00 * 2^(63)
+// This constant is subtracted from fW_2TO56_RSH * 2^(-56) to give
+// the integer part of w, n, as a floating-point number.
+// The result of this fms is fNfloat.
+
+
+LOCAL_OBJECT_START(exp_table_1)
+data8 0x408633ce8fb9f87e // smallest dbl overflow arg
+data8 0x408633ce8fb9f87d // largest dbl arg to give normal dbl result
+data8 0xb17217f7d1cf79ab , 0x00003ff7 // ln2/128 hi
+data8 0xc9e3b39803f2f6af , 0x00003fb7 // ln2/128 lo
+//
+// Table 1 is 2^(index_1/128) where
+// index_1 goes from 0 to 15
+//
+data8 0x8000000000000000 , 0x00003FFF
+data8 0x80B1ED4FD999AB6C , 0x00003FFF
+data8 0x8164D1F3BC030773 , 0x00003FFF
+data8 0x8218AF4373FC25EC , 0x00003FFF
+data8 0x82CD8698AC2BA1D7 , 0x00003FFF
+data8 0x8383594EEFB6EE37 , 0x00003FFF
+data8 0x843A28C3ACDE4046 , 0x00003FFF
+data8 0x84F1F656379C1A29 , 0x00003FFF
+data8 0x85AAC367CC487B15 , 0x00003FFF
+data8 0x8664915B923FBA04 , 0x00003FFF
+data8 0x871F61969E8D1010 , 0x00003FFF
+data8 0x87DB357FF698D792 , 0x00003FFF
+data8 0x88980E8092DA8527 , 0x00003FFF
+data8 0x8955EE03618E5FDD , 0x00003FFF
+data8 0x8A14D575496EFD9A , 0x00003FFF
+data8 0x8AD4C6452C728924 , 0x00003FFF
+LOCAL_OBJECT_END(exp_table_1)
+
+// Table 2 is 2^(index_2/8) where
+// index_2 goes from 0 to 7
+LOCAL_OBJECT_START(exp_table_2)
+data8 0x8000000000000000 , 0x00003FFF
+data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF
+data8 0x9837F0518DB8A96F , 0x00003FFF
+data8 0xA5FED6A9B15138EA , 0x00003FFF
+data8 0xB504F333F9DE6484 , 0x00003FFF
+data8 0xC5672A115506DADD , 0x00003FFF
+data8 0xD744FCCAD69D6AF4 , 0x00003FFF
+data8 0xEAC0C6E7DD24392F , 0x00003FFF
+LOCAL_OBJECT_END(exp_table_2)
+
+LOCAL_OBJECT_START(exp_p_table)
+data8 0x3f8111116da21757 //P5
+data8 0x3fa55555d787761c //P4
+data8 0x3fc5555555555414 //P3
+data8 0x3fdffffffffffd6a //P2
+LOCAL_OBJECT_END(exp_p_table)
+
+LOCAL_OBJECT_START(cosh_p_table)
+data8 0x8FA02AC65BCBD5BC, 0x00003FE2 // A6
+data8 0xD00D00D1021D7370, 0x00003FEF // A4
+data8 0xAAAAAAAAAAAAAB80, 0x00003FFA // A2
+data8 0x93F27740C0C2F1CC, 0x00003FE9 // A5
+data8 0xB60B60B60B4FE884, 0x00003FF5 // A3
+data8 0x8000000000000000, 0x00003FFE // A1
+LOCAL_OBJECT_END(cosh_p_table)
-#ifdef _LIBC
-.global __ieee754_cosh#
-.proc __ieee754_cosh#
-__ieee754_cosh:
-#endif
-// X NAN?
+.section .text
+GLOBAL_IEEE754_ENTRY(cosh)
-{ .mfi
- alloc r32 = ar.pfs,0,12,4,0
-(p0) fclass.m.unc p6,p7 = f8, 0xc3 //@snan | @qnan
- nop.i 999
+{ .mlx
+ getf.exp rSignexp_x = f8 // Must recompute if x unorm
+ movl rSig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
}
-;;
-
-
-{ .mfb
- nop.m 999
-(p6) fma.d.s0 f8 = f8,f1,f8
-(p6) br.ret.spnt b0 ;;
+{ .mlx
+ addl rAD_TB1 = @ltoff(exp_table_1), gp
+ movl rRshf_2to56 = 0x4768000000000000 // 1.10000 2^(63+56)
}
+;;
-
-// X infinity
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p6,p0 = f8, 0x23 //@inf
- nop.i 999 ;;
-}
-
-{ .mfb
- nop.m 999
-(p6) fmerge.s f8 = f0,f8
-(p6) br.ret.spnt b0 ;;
+ ld8 rAD_TB1 = [rAD_TB1]
+ fclass.m p6,p0 = f8,0x0b // Test for x=unorm
+ mov rExp_mask = 0x1ffff
}
-
-
-
-// Put 0.25 in f9; p6 true if x < 0.25
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x000000000000fffd ;;
-}
-
{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
+ mov rExp_bias = 0xffff
+ fnorm.s1 fNormX = f8
+ mov rExp_2tom56 = 0xffff-56
}
+;;
+
+// Form two constants we need
+// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128
+// 1.1000..000 * 2^(63+63-7) to right shift int(w) into the significand
{ .mfi
- nop.m 999
-(p0) fmerge.s cosh_FR_X = f0,f8
+ setf.sig fINV_LN2_2TO63 = rSig_inv_ln2 // form 1/ln2 * 2^63
+ fclass.m p8,p0 = f8,0x07 // Test for x=0
nop.i 999
}
-
-{ .mfi
- nop.m 999
-(p0) fmerge.s cosh_FR_SGNX = f8,f1
- nop.i 999 ;;
+{ .mlx
+ setf.d fRSHF_2TO56 = rRshf_2to56 // Form const 1.100 * 2^(63+56)
+ movl rRshf = 0x43e8000000000000 // 1.10000 2^63 for right shift
}
+;;
{ .mfi
- nop.m 999
-(p0) fcmp.lt.unc p0,p7 = cosh_FR_X,f9
- nop.i 999 ;;
+ ldfpd fMIN_DBL_OFLOW_ARG, fMAX_DBL_NORM_ARG = [rAD_TB1],16
+ fclass.m p10,p0 = f8,0x1e3 // Test for x=inf, nan, NaT
+ nop.i 0
}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p7) br.cond.sptk L(COSH_BY_TBL)
+{ .mfb
+ setf.exp f2TOM56 = rExp_2tom56 // form 2^-56 for scaling Nfloat
+ nop.f 0
+(p6) br.cond.spnt COSH_UNORM // Branch if x=unorm
}
;;
-
-// COSH_BY_POLY:
-// POLY cannot overflow so there is no need to call __libm_error_support
-// Get the values of P_x from the table
-
-{ .mmi
- nop.m 999
-(p0) addl r34 = @ltoff(double_cosh_p_table), gp
- nop.i 999
+COSH_COMMON:
+{ .mfi
+ ldfe fLn2_by_128_hi = [rAD_TB1],16
+ nop.f 0
+ nop.i 0
}
-;;
-
-{ .mmi
- ld8 r34 = [r34]
- nop.m 999
- nop.i 999
+{ .mfb
+ setf.d fRSHF = rRshf // Form right shift const 1.100 * 2^63
+(p8) fma.d.s0 f8 = f1,f1,f0 // quick exit for x=0
+(p8) br.ret.spnt b0
}
;;
-
-// Calculate cosh_FR_X2 = ax*ax and cosh_FR_X4 = ax*ax*ax*ax
-{ .mmf
- nop.m 999
-(p0) ldfe cosh_FR_P1 = [r34],16
-(p0) fma.s1 cosh_FR_X2 = cosh_FR_X, cosh_FR_X, f0 ;;
-}
-
-{ .mmi
-(p0) ldfe cosh_FR_P2 = [r34],16 ;;
-(p0) ldfe cosh_FR_P3 = [r34],16
- nop.i 999 ;;
+{ .mfi
+ ldfe fLn2_by_128_lo = [rAD_TB1],16
+ nop.f 0
+ nop.i 0
}
-
-{ .mmi
-(p0) ldfe cosh_FR_P4 = [r34],16 ;;
-(p0) ldfe cosh_FR_P5 = [r34],16
- nop.i 999 ;;
+{ .mfb
+ and rExp_x = rExp_mask, rSignexp_x // Biased exponent of x
+(p10) fma.d.s0 f8 = f8,f8,f0 // Result if x=inf, nan, NaT
+(p10) br.ret.spnt b0 // quick exit for x=inf, nan, NaT
}
+;;
+// After that last load rAD_TB1 points to the beginning of table 1
{ .mfi
-(p0) ldfe cosh_FR_P6 = [r34],16
-(p0) fma.s1 cosh_FR_X4 = cosh_FR_X2, cosh_FR_X2, f0
- nop.i 999 ;;
+ nop.m 0
+ fcmp.eq.s0 p6,p0 = f8, f0 // Dummy to set D
+ sub rExp_x = rExp_x, rExp_bias // True exponent of x
}
+;;
-// Calculate cosh_FR_podd = x4 *(x4 * P_5 + P_3) + P_1
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_poly_podd_temp1 = cosh_FR_X4, cosh_FR_P5, cosh_FR_P3
- nop.i 999 ;;
+ nop.m 0
+ fmerge.s fAbsX = f0, fNormX // Form |x|
+ nop.i 0
}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_podd = cosh_FR_X4, cosh_FR_poly_podd_temp1, cosh_FR_P1
- nop.i 999
+{ .mfb
+ cmp.gt p7, p0 = -2, rExp_x // Test |x| < 2^(-2)
+ fma.s1 fXsq = fNormX, fNormX, f0 // x*x for small path
+(p7) br.cond.spnt COSH_SMALL // Branch if 0 < |x| < 2^-2
}
+;;
-// Calculate cosh_FR_peven = p_even = x4 *(x4 * (x4 * P_6 + P_4) + P_2)
-{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_poly_peven_temp1 = cosh_FR_X4, cosh_FR_P6, cosh_FR_P4
- nop.i 999 ;;
-}
+// W = X * Inv_log2_by_128
+// By adding 1.10...0*2^63 we shift and get round_int(W) in significand.
+// We actually add 1.10...0*2^56 to X * Inv_log2 to do the same thing.
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_poly_peven_temp2 = cosh_FR_X4, cosh_FR_poly_peven_temp1, cosh_FR_P2
- nop.i 999 ;;
+ add rAD_P = 0x180, rAD_TB1
+ fma.s1 fW_2TO56_RSH = fNormX, fINV_LN2_2TO63, fRSHF_2TO56
+ add rAD_TB2 = 0x100, rAD_TB1
}
+;;
+
+// Divide arguments into the following categories:
+// Certain Safe - 0.25 <= |x| <= MAX_DBL_NORM_ARG
+// Possible Overflow p14 - MAX_DBL_NORM_ARG < |x| < MIN_DBL_OFLOW_ARG
+// Certain Overflow p15 - MIN_DBL_OFLOW_ARG <= |x| < +inf
+//
+// If the input is really a double arg, then there will never be
+// "Possible Overflow" arguments.
+//
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_peven = cosh_FR_X4, cosh_FR_poly_peven_temp2, f0
- nop.i 999 ;;
+ ldfpd fP5, fP4 = [rAD_P] ,16
+ fcmp.ge.s1 p15,p14 = fAbsX,fMIN_DBL_OFLOW_ARG
+ nop.i 0
}
+;;
+
+// Nfloat = round_int(W)
+// The significand of fW_2TO56_RSH contains the rounded integer part of W,
+// as a twos complement number in the lower bits (that is, it may be negative).
+// That twos complement number (called N) is put into rN.
+
+// Since fW_2TO56_RSH is scaled by 2^56, it must be multiplied by 2^-56
+// before the shift constant 1.10000 * 2^63 is subtracted to yield fNfloat.
+// Thus, fNfloat contains the floating point version of N
-// Y_lo = x2*p_odd + p_even
-// Calculate f8 = Y_hi + Y_lo
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_Y_lo = cosh_FR_X2, cosh_FR_podd, cosh_FR_peven
- nop.i 999 ;;
+ ldfpd fP3, fP2 = [rAD_P]
+(p14) fcmp.gt.unc.s1 p14,p0 = fAbsX,fMAX_DBL_NORM_ARG
+ nop.i 0
}
-
{ .mfb
- nop.m 999
-(p0) fma.d.s0 f8 = f1, f1, cosh_FR_Y_lo
-(p0) br.ret.sptk b0 ;;
+ nop.m 0
+ fms.s1 fNfloat = fW_2TO56_RSH, f2TOM56, fRSHF
+(p15) br.cond.spnt COSH_CERTAIN_OVERFLOW
}
+;;
-
-L(COSH_BY_TBL):
-
-// Now that we are at TBL; so far all we know is that |x| >= 0.25.
-// The first two steps are the same for TBL and EXP, but if we are HUGE
-// Double
-// Go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true)
-// Single
-// Go to HUGE if |x| >= 2^7, 10006 (register-biased) is e = 7 (true)
-// we want to leave now. Go to HUGE if |x| >= 2^14
-// 1000d (register-biased) is e = 14 (true)
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x0000000000010009 ;;
+{ .mfi
+ getf.sig rN = fW_2TO56_RSH
+ nop.f 0
+ mov rExp_bias_minus_1 = 0xfffe
}
+;;
+
+// rIndex_1 has index_1
+// rIndex_2_16 has index_2 * 16
+// rBiased_M has M + (exponent bias - 1)
+// rM has true M
+// r = x - Nfloat * ln2_by_128_hi
+// f = 1 - Nfloat * ln2_by_128_lo
{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
+ and rIndex_1 = 0x0f, rN
+ fnma.s1 fR = fNfloat, fLn2_by_128_hi, fNormX
+ shr rM = rN, 0x7
}
-
{ .mfi
- nop.m 999
-(p0) fcmp.ge.unc p6,p7 = cosh_FR_X,f9
- nop.i 999 ;;
+ and rIndex_2_16 = 0x70, rN
+ fnma.s1 fF = fNfloat, fLn2_by_128_lo, f1
+ sub rN_neg = r0, rN
}
+;;
-{ .mib
- nop.m 999
- nop.i 999
-(p6) br.cond.spnt L(COSH_HUGE) ;;
+{ .mmi
+ and rIndex_1_neg = 0x0f, rN_neg
+ add rBiased_M = rExp_bias_minus_1, rM
+ shr rM_neg = rN_neg, 0x7
}
-
-// r32 = 1
-// r34 = N-1
-// r35 = N
-// r36 = j
-// r37 = N+1
-
-// TBL can never overflow
-// cosh(x) = cosh(B+R)
-// = cosh(B) cosh(R) + sinh(B) sinh(R)
-// cosh(R) can be approximated by 1 + p_even
-// sinh(R) can be approximated by p_odd
-
-// ******************************************************
-// STEP 1 (TBL and EXP)
-// ******************************************************
-// Get the following constants.
-// f9 = Inv_log2by64
-// f10 = log2by64_hi
-// f11 = log2by64_lo
-
{ .mmi
-(p0) adds r32 = 0x1,r0
-(p0) addl r34 = @ltoff(double_cosh_arg_reduction), gp
- nop.i 999
+ and rIndex_2_16_neg = 0x70, rN_neg
+ add rAD_T2 = rAD_TB2, rIndex_2_16
+ shladd rAD_T1 = rIndex_1, 4, rAD_TB1
}
;;
-// We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and
-// put them in an exponent.
-// cosh_FR_spos = 2^(N-1) and cosh_FR_sneg = 2^(-N-1)
-// r39 = 0xffff + (N-1) = 0xffff +N -1
-// r40 = 0xffff - (N +1) = 0xffff -N -1
-
-{ .mlx
- ld8 r34 = [r34]
-(p0) movl r38 = 0x000000000000fffe ;;
-}
+// rAD_T1 has address of T1
+// rAD_T2 has address of T2
{ .mmi
-(p0) ldfe cosh_FR_Inv_log2by64 = [r34],16 ;;
-(p0) ldfe cosh_FR_log2by64_hi = [r34],16
- nop.i 999 ;;
+ setf.exp f2M = rBiased_M
+ ldfe fT2 = [rAD_T2]
+ nop.i 0
}
-
-{ .mbb
-(p0) ldfe cosh_FR_log2by64_lo = [r34],16
- nop.b 999
- nop.b 999 ;;
-}
-
-// Get the A coefficients
-// f9 = A_1
-// f10 = A_2
-// f11 = A_3
-
{ .mmi
- nop.m 999
-(p0) addl r34 = @ltoff(double_cosh_ab_table), gp
- nop.i 999
+ add rBiased_M_neg = rExp_bias_minus_1, rM_neg
+ add rAD_T2_neg = rAD_TB2, rIndex_2_16_neg
+ shladd rAD_T1_neg = rIndex_1_neg, 4, rAD_TB1
}
;;
+// Create Scale = 2^M
+// Load T1 and T2
{ .mmi
- ld8 r34 = [r34]
- nop.m 999
- nop.i 999
+ ldfe fT1 = [rAD_T1]
+ nop.m 0
+ nop.i 0
+}
+{ .mmf
+ setf.exp f2M_neg = rBiased_M_neg
+ ldfe fT2_neg = [rAD_T2_neg]
+ fma.s1 fF_neg = fNfloat, fLn2_by_128_lo, f1
}
;;
-
-// Calculate M and keep it as integer and floating point.
-// M = round-to-integer(x*Inv_log2by64)
-// cosh_FR_M = M = truncate(ax/(log2/64))
-// Put the significand of M in r35
-// and the floating point representation of M in cosh_FR_M
-
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_M = cosh_FR_X, cosh_FR_Inv_log2by64, f0
- nop.i 999
+ nop.m 0
+ fma.s1 fRsq = fR, fR, f0
+ nop.i 0
}
-
{ .mfi
-(p0) ldfe cosh_FR_A1 = [r34],16
- nop.f 999
- nop.i 999 ;;
+ ldfe fT1_neg = [rAD_T1_neg]
+ fma.s1 fP54 = fR, fP5, fP4
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fcvt.fx.s1 cosh_FR_M_temp = cosh_FR_M
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fP32 = fR, fP3, fP2
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fnorm.s1 cosh_FR_M = cosh_FR_M_temp
- nop.i 999 ;;
+ nop.m 0
+ fnma.s1 fP54_neg = fR, fP5, fP4
+ nop.i 0
}
+;;
{ .mfi
-(p0) getf.sig r35 = cosh_FR_M_temp
- nop.f 999
- nop.i 999 ;;
-}
-
-// M is still in r35. Calculate j. j is the signed extension of the six lsb of M. It
-// has a range of -32 thru 31.
-// r35 = M
-// r36 = j
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p0) and r36 = 0x3f, r35 ;;
+ nop.m 0
+ fnma.s1 fP32_neg = fR, fP3, fP2
+ nop.i 0
}
-
-// Calculate R
-// f13 = f44 - f12*f10 = x - M*log2by64_hi
-// f14 = f13 - f8*f11 = R = (x - M*log2by64_hi) - M*log2by64_lo
+;;
{ .mfi
- nop.m 999
-(p0) fnma.s1 cosh_FR_R_temp = cosh_FR_M, cosh_FR_log2by64_hi, cosh_FR_X
- nop.i 999
+ nop.m 0
+ fma.s1 fP5432 = fRsq, fP54, fP32
+ nop.i 0
}
-
{ .mfi
-(p0) ldfe cosh_FR_A2 = [r34],16
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fS2 = fF,fT2,f0
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fnma.s1 cosh_FR_R = cosh_FR_M, cosh_FR_log2by64_lo, cosh_FR_R_temp
- nop.i 999
+ nop.m 0
+ fma.s1 fS1 = f2M,fT1,f0
+ nop.i 0
}
-
-// Get the B coefficients
-// f15 = B_1
-// f32 = B_2
-// f33 = B_3
-
-{ .mmi
-(p0) ldfe cosh_FR_A3 = [r34],16 ;;
-(p0) ldfe cosh_FR_B1 = [r34],16
- nop.i 999 ;;
-}
-
-{ .mmi
-(p0) ldfe cosh_FR_B2 = [r34],16 ;;
-(p0) ldfe cosh_FR_B3 = [r34],16
- nop.i 999 ;;
-}
-
-{ .mii
- nop.m 999
-(p0) shl r34 = r36, 0x2 ;;
-(p0) sxt1 r37 = r34 ;;
-}
-
-// ******************************************************
-// STEP 2 (TBL and EXP)
-// ******************************************************
-// Calculate Rsquared and Rcubed in preparation for p_even and p_odd
-// f12 = R*R*R
-// f13 = R*R
-// f14 = R <== from above
-
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_Rsq = cosh_FR_R, cosh_FR_R, f0
-(p0) shr r36 = r37, 0x2 ;;
-}
-
-// r34 = M-j = r35 - r36
-// r35 = N = (M-j)/64
-
-{ .mii
-(p0) sub r34 = r35, r36
- nop.i 999 ;;
-(p0) shr r35 = r34, 0x6 ;;
-}
-
-{ .mii
-(p0) sub r40 = r38, r35
-(p0) adds r37 = 0x1, r35
-(p0) add r39 = r38, r35 ;;
-}
-
-// Get the address of the J table, add the offset,
-// addresses are sinh_AD_mJ and sinh_AD_J, get the T value
-// f32 = T(j)_hi
-// f33 = T(j)_lo
-// f34 = T(-j)_hi
-// f35 = T(-j)_lo
-
-{ .mmi
-(p0) sub r34 = r35, r32
-(p0) addl r37 = @ltoff(double_cosh_j_table), gp
- nop.i 999
+ nop.m 0
+ fma.s1 fP5432_neg = fRsq, fP54_neg, fP32_neg
+ nop.i 0
}
;;
{ .mfi
- ld8 r37 = [r37]
-(p0) fma.s1 cosh_FR_Rcub = cosh_FR_Rsq, cosh_FR_R, f0
- nop.i 999
+ nop.m 0
+ fma.s1 fS1_neg = f2M_neg,fT1_neg,f0
+ nop.i 0
}
-
-// ******************************************************
-// STEP 3 Now decide if we need to branch to EXP
-// ******************************************************
-// Put 32 in f9; p6 true if x < 32
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x0000000000010004 ;;
-}
-
-// Calculate p_even
-// f34 = B_2 + Rsq *B_3
-// f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3)
-// f36 = peven = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3))
-
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_peven_temp1 = cosh_FR_Rsq, cosh_FR_B3, cosh_FR_B2
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fS2_neg = fF_neg,fT2_neg,f0
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_peven_temp2 = cosh_FR_Rsq, cosh_FR_peven_temp1, cosh_FR_B1
- nop.i 999
+ nop.m 0
+ fma.s1 fP = fRsq, fP5432, fR
+ nop.i 0
}
-
-// Calculate p_odd
-// f34 = A_2 + Rsq *A_3
-// f35 = A_1 + Rsq * (A_2 + Rsq *A_3)
-// f37 = podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3))
-
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_podd_temp1 = cosh_FR_Rsq, cosh_FR_A3, cosh_FR_A2
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fS = fS1,fS2,f0
+ nop.i 0
}
+;;
{ .mfi
-(p0) setf.exp cosh_FR_N_temp1 = r39
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fms.s1 fP_neg = fRsq, fP5432_neg, fR
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_peven = cosh_FR_Rsq, cosh_FR_peven_temp2, f0
- nop.i 999
+ nop.m 0
+ fma.s1 fS_neg = fS1_neg,fS2_neg,f0
+ nop.i 0
}
+;;
-{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_podd_temp2 = cosh_FR_Rsq, cosh_FR_podd_temp1, cosh_FR_A1
- nop.i 999 ;;
+{ .mfb
+ nop.m 0
+ fmpy.s0 fTmp = fLn2_by_128_lo, fLn2_by_128_lo // Force inexact
+(p14) br.cond.spnt COSH_POSSIBLE_OVERFLOW
}
+;;
{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fExp = fS, fP, fS
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_podd = cosh_FR_podd_temp2, cosh_FR_Rcub, cosh_FR_R
- nop.i 999
+ nop.m 0
+ fma.s1 fExp_neg = fS_neg, fP_neg, fS_neg
+ nop.i 0
}
+;;
-// sinh_GR_mj contains the table offset for -j
-// sinh_GR_j contains the table offset for +j
-// p6 is true when j <= 0
-
-{ .mlx
-(p0) setf.exp cosh_FR_N_temp2 = r40
-(p0) movl r40 = 0x0000000000000020 ;;
+{ .mfb
+ nop.m 0
+ fma.d.s0 f8 = fExp, f1, fExp_neg
+ br.ret.sptk b0 // Normal path exit
}
+;;
-{ .mfi
-(p0) sub GR_mJ = r40, r36
-(p0) fmerge.se cosh_FR_spos = cosh_FR_N_temp1, f1
-(p0) adds GR_J = 0x20, r36 ;;
+// Here if 0 < |x| < 0.25
+COSH_SMALL:
+{ .mmf
+ add rAD_T1 = 0x1a0, rAD_TB1
+ add rAD_T2 = 0x1d0, rAD_TB1
}
+;;
-{ .mii
- nop.m 999
-(p0) shl GR_mJ = GR_mJ, 5 ;;
-(p0) add AD_mJ = r37, GR_mJ ;;
+{ .mmf
+ ldfe fA6 = [rAD_T1],16
+ ldfe fA5 = [rAD_T2],16
+ nop.f 0
}
+;;
{ .mmi
- nop.m 999
-(p0) ldfe cosh_FR_Tmjhi = [AD_mJ],16
-(p0) shl GR_J = GR_J, 5 ;;
-}
-
-{ .mfi
-(p0) ldfs cosh_FR_Tmjlo = [AD_mJ],16
-(p0) fcmp.lt.unc.s1 p6,p7 = cosh_FR_X,f9
-(p0) add AD_J = r37, GR_J ;;
+ ldfe fA4 = [rAD_T1],16
+ ldfe fA3 = [rAD_T2],16
+ nop.i 0
}
+;;
{ .mmi
-(p0) ldfe cosh_FR_Tjhi = [AD_J],16 ;;
-(p0) ldfs cosh_FR_Tjlo = [AD_J],16
- nop.i 999 ;;
+ ldfe fA2 = [rAD_T1],16
+ ldfe fA1 = [rAD_T2],16
+ nop.i 0
}
-
-{ .mfb
- nop.m 999
-(p0) fmerge.se cosh_FR_sneg = cosh_FR_N_temp2, f1
-(p7) br.cond.spnt L(COSH_BY_EXP) ;;
-}
-
-// ******************************************************
-// If NOT branch to EXP
-// ******************************************************
-// Calculate C_hi
-// ******************************************************
-// cosh_FR_C_hi_temp = cosh_FR_sneg * cosh_FR_Tmjhi
-// cosh_FR_C_hi = cosh_FR_spos * cosh_FR_Tjhi + (cosh_FR_sneg * cosh_FR_Tmjhi)
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_C_hi_temp = cosh_FR_sneg, cosh_FR_Tmjhi, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_C_hi = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_C_hi_temp
- nop.i 999
-}
-
-// ******************************************************
-// Calculate S_hi
-// ******************************************************
-// cosh_FR_S_hi_temp1 = cosh_FR_sneg * cosh_FR_Tmjhi
-// cosh_FR_S_hi = cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_C_hi_temp1
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_S_hi_temp1 = cosh_FR_sneg, cosh_FR_Tmjhi, f0
- nop.i 999 ;;
-}
-
-// ******************************************************
-// Calculate C_lo
-// ******************************************************
-// cosh_FR_C_lo_temp1 = cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_C_hi
-// cosh_FR_C_lo_temp2 = cosh_FR_sneg * cosh_FR_Tmjlo + (cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_C_hi)
-// cosh_FR_C_lo_temp1 = cosh_FR_sneg * cosh_FR_Tmjlo
-// cosh_FR_C_lo_temp3 = cosh_FR_spos * cosh_FR_Tjlo + (cosh_FR_sneg * cosh_FR_Tmjlo)
-// cosh_FR_C_lo = cosh_FR_C_lo_temp3 + cosh_FR_C_lo_temp2
+;;
{ .mfi
- nop.m 999
-(p0) fms.s1 cosh_FR_C_lo_temp1 = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_C_hi
- nop.i 999
+ nop.m 0
+ fma.s1 fX4 = fXsq, fXsq, f0
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fms.s1 cosh_FR_S_hi = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_S_hi_temp1
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fA65 = fXsq, fA6, fA5
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_C_lo_temp2 = cosh_FR_sneg, cosh_FR_Tmjhi, cosh_FR_C_lo_temp1
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_C_lo_temp1 = cosh_FR_sneg, cosh_FR_Tmjlo, f0
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fA43 = fXsq, fA4, fA3
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_C_lo_temp3 = cosh_FR_spos, cosh_FR_Tjlo, cosh_FR_C_lo_temp1
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fA21 = fXsq, fA2, fA1
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_C_lo = cosh_FR_C_lo_temp3, f1, cosh_FR_C_lo_temp2
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fA6543 = fX4, fA65, fA43
+ nop.i 0
}
-
-// ******************************************************
-// cosh_FR_Y_lo_temp = cosh_FR_C_hi * cosh_FR_peven + cosh_FR_C_lo
-// cosh_FR_Y_lo = cosh_FR_S_hi * cosh_FR_podd + cosh_FR_Y_lo_temp
-// cosh_FR_COSH = Y_hi + Y_lo
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_Y_lo_temp = cosh_FR_C_hi, cosh_FR_peven, cosh_FR_C_lo
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fA654321 = fX4, fA6543, fA21
+ nop.i 0
}
+;;
+// Dummy multiply to generate inexact
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_Y_lo = cosh_FR_S_hi, cosh_FR_podd, cosh_FR_Y_lo_temp
- nop.i 999 ;;
+ nop.m 0
+ fmpy.s0 fTmp = fA6, fA6
+ nop.i 0
}
-
{ .mfb
- nop.m 999
-(p0) fma.d.s0 f8 = cosh_FR_C_hi, f1, cosh_FR_Y_lo
-(p0) br.ret.sptk b0 ;;
+ nop.m 0
+ fma.d.s0 f8 = fA654321, fXsq, f1
+ br.ret.sptk b0 // Exit if 0 < |x| < 0.25
}
+;;
-L(COSH_BY_EXP):
-// When p7 is true, we know that an overflow is not going to happen
-// When p7 is false, we must check for possible overflow
-// p7 is the over_SAFE flag
-// f44 = Scale * (Y_hi + Y_lo)
-// = cosh_FR_spos * (cosh_FR_Tjhi + cosh_FR_Y_lo)
+COSH_POSSIBLE_OVERFLOW:
-{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_Y_lo_temp = cosh_FR_peven, f1, cosh_FR_podd
- nop.i 999
-}
-
-// Now we are in EXP. This is the only path where an overflow is possible
-// but not for certain. So this is the only path where over_SAFE has any use.
-// r34 still has N-1
-// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe
-// There is a danger of double overflow if N-1 > 0x3fe = 1022
+// Here if fMAX_DBL_NORM_ARG < |x| < fMIN_DBL_OFLOW_ARG
+// This cannot happen if input is a double, only if input has higher precision.
+// Overflow is a possibility, not a certainty.
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x00000000000003fe ;;
-}
+// Recompute result using status field 2 with user's rounding mode,
+// and wre set. If result is larger than largest double, then we have
+// overflow
{ .mfi
-(p0) cmp.gt.unc p0,p7 = r34, r32
- nop.f 999
- nop.i 999 ;;
+ mov rGt_ln = 0x103ff // Exponent for largest dbl + 1 ulp
+ fsetc.s2 0x7F,0x42 // Get user's round mode, set wre
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_Y_lo = cosh_FR_Tjhi, cosh_FR_Y_lo_temp, cosh_FR_Tjlo
- nop.i 999 ;;
+ setf.exp fGt_pln = rGt_ln // Create largest double + 1 ulp
+ fma.d.s2 fWre_urm_f8 = fS, fP, fS // Result with wre set
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_COSH_temp = cosh_FR_Y_lo, f1, cosh_FR_Tjhi
- nop.i 999 ;;
+ nop.m 0
+ fsetc.s2 0x7F,0x40 // Turn off wre in sf2
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.d.s0 f44 = cosh_FR_spos, cosh_FR_COSH_temp, f0
- nop.i 999 ;;
+ nop.m 0
+ fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow
+ nop.i 0
}
+;;
-// If over_SAFE is set, return
{ .mfb
- nop.m 999
-(p7) fmerge.s f8 = f44,f44
-(p7) br.ret.sptk b0 ;;
-}
-
-// Else see if we overflowed
-// S0 user supplied status
-// S2 user supplied status + WRE + TD (Overflows)
-// If WRE is set then an overflow will not occur in EXP.
-// The input value that would cause a register (WRE) value to overflow is about 2^15
-// and this input would go into the HUGE path.
-// Answer with WRE is in f43.
-
-{ .mfi
- nop.m 999
-(p0) fsetc.s2 0x7F,0x42
- nop.i 999;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.d.s2 f43 = cosh_FR_spos, cosh_FR_COSH_temp, f0
- nop.i 999 ;;
-}
-
-// 103FF => 103FF -FFFF = 400(true)
-// 400 + 3FF = 7FF, which is 1 more that the exponent of the largest
-// double (7FE). So 0 103FF 8000000000000000 is one ulp more than
-// largest double in register bias
-// Now set p8 if the answer with WRE is greater than or equal this value
-// Also set p9 if the answer with WRE is less than or equal to negative this value
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x00000000000103ff ;;
+ nop.m 0
+ nop.f 0
+(p6) br.cond.spnt COSH_CERTAIN_OVERFLOW // Branch if overflow
}
+;;
-{ .mmf
- nop.m 999
-(p0) setf.exp f41 = r32
-(p0) fsetc.s2 0x7F,0x40 ;;
+{ .mfb
+ nop.m 0
+ fma.d.s0 f8 = fS, fP, fS
+ br.ret.sptk b0 // Exit if really no overflow
}
+;;
-{ .mfi
- nop.m 999
-(p0) fcmp.ge.unc.s1 p8, p0 = f43, f41
- nop.i 999
+COSH_CERTAIN_OVERFLOW:
+{ .mmi
+ sub rTmp = rExp_mask, r0, 1
+;;
+ setf.exp fTmp = rTmp
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fmerge.ns f42 = f41, f41
- nop.i 999 ;;
+ alloc r32=ar.pfs,1,4,4,0
+ fmerge.s FR_X = f8,f8
+ nop.i 0
}
-
-// The error tag for overflow is 64
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p8) mov r47 = 64 ;;
-}
-
{ .mfb
- nop.m 999
-(p0) fcmp.le.unc.s1 p9, p0 = f43, f42
-(p8) br.cond.spnt __libm_error_region ;;
-}
-
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p9) mov r47 = 64
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p9) br.cond.spnt __libm_error_region ;;
+ mov GR_Parameter_TAG = 64
+ fma.d.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result
+ br.cond.sptk __libm_error_region
}
+;;
+// Here if x unorm
+COSH_UNORM:
{ .mfb
- nop.m 999
-(p0) fmerge.s f8 = f44,f44
-(p0) br.ret.sptk b0 ;;
-}
-
-
-// for COSH_HUGE, put 24000 in exponent; take sign from input; add 1
-// SAFE: SAFE is always 0 for HUGE
-
-L(COSH_HUGE):
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x0000000000015dbf ;;
-}
-
-{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_hi_lo = f1, f9, f1
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.d.s0 f44 = f9, cosh_FR_hi_lo, f0
-(p0) mov r47 = 64
+ getf.exp rSignexp_x = fNormX // Must recompute if x unorm
+ fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag
+ br.cond.sptk COSH_COMMON
}
;;
-.endp cosh#
-ASM_SIZE_DIRECTIVE(cosh#)
-
-// Stack operations when calling error support.
-// (1) (2) (3) (call) (4)
-// sp -> + psp -> + psp -> + sp -> +
-// | | | |
-// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8
-// | | | |
-// | <-GR_Y Y2->| Y2 ->| <- GR_Y |
-// | | | |
-// | | <- GR_X X1 ->| |
-// | | | |
-// sp-64 -> + sp -> + sp -> + +
-// save ar.pfs save b0 restore gp
-// save gp restore ar.pfs
-
-.proc __libm_error_region
-__libm_error_region:
+GLOBAL_IEEE754_END(cosh)
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
-// (1)
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
@@ -1103,39 +821,32 @@ __libm_error_region:
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
+ add sp=-64,sp // Create new stack
nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ mov GR_SAVE_GP=gp // Save gp
};;
-
-
-// (2)
{ .mmi
- stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
- add GR_Parameter_X = 16,sp // Parameter 1 address
+ stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0=b0 // Save b0
};;
-
.body
-// (3)
{ .mib
- stfd [GR_Parameter_X] = f8 // STORE Parameter 1 on stack
+ stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ nop.b 0
}
{ .mib
- stfd [GR_Parameter_Y] = f44 // STORE Parameter 3 on stack
+ stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
- nop.m 0
- nop.m 0
add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
-
-// (4)
{ .mmi
ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
@@ -1148,8 +859,6 @@ __libm_error_region:
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
-
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_coshf.S b/sysdeps/ia64/fpu/e_coshf.S
index 969abc4ff6..91846e4717 100644
--- a/sysdeps/ia64/fpu/e_coshf.S
+++ b/sysdeps/ia64/fpu/e_coshf.S
@@ -1,10 +1,10 @@
.file "coshf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2002, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,1127 +20,690 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+
// History
-//==============================================================
-// 2/02/00 Initial version
-// 2/16/00 The error tag for coshf overflow changed to 65 (from 64).
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+//*********************************************************************
+// 02/02/00 Initial version
+// 02/16/00 The error tag for coshf overflow changed to 65 (from 64).
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
+// 05/07/01 Reworked to improve speed of all paths
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 11/15/02 Improved algorithm based on expf
//
// API
-//==============================================================
-// float = coshf(float)
-// input floating point f8
-// output floating point f8
-
-
+//*********************************************************************
+// float coshf(float)
+//
// Overview of operation
-//==============================================================
-// There are four paths
-
-// 1. |x| < 0.25 COSH_BY_POLY
-// 2. |x| < 32 COSH_BY_TBL
-// 3. |x| < 2^14 COSH_BY_EXP
-// 4. |x_ >= 2^14 COSH_HUGE
-
-// For paths 1, and 2 SAFE is always 1.
-// For path 4, Safe is always 0.
-// SAFE = 1 means we cannot overflow.
-
-#include "libm_support.h"
-
-// Assembly macros
-//==============================================================
-coshf_FR_X = f44
-coshf_FR_SGNX = f40
-
-coshf_FR_Inv_log2by64 = f9
-coshf_FR_log2by64_lo = f11
-coshf_FR_log2by64_hi = f10
-
-coshf_FR_A1 = f9
-coshf_FR_A2 = f10
-coshf_FR_A3 = f11
-
-coshf_FR_Rcub = f12
-coshf_FR_M_temp = f13
-coshf_FR_R_temp = f13
-coshf_FR_Rsq = f13
-coshf_FR_R = f14
-
-coshf_FR_M = f38
-
-coshf_FR_B1 = f15
-coshf_FR_B2 = f32
-coshf_FR_B3 = f33
-
-coshf_FR_peven_temp1 = f34
-coshf_FR_peven_temp2 = f35
-coshf_FR_peven = f36
-
-coshf_FR_podd_temp1 = f34
-coshf_FR_podd_temp2 = f35
-coshf_FR_podd = f37
-
-coshf_FR_J_temp = f9
-coshf_FR_J = f10
-
-coshf_FR_Mmj = f39
-
-coshf_FR_N_temp1 = f11
-coshf_FR_N_temp2 = f12
-coshf_FR_N = f13
-
-coshf_FR_spos = f14
-coshf_FR_sneg = f15
-
-coshf_FR_Tjhi = f32
-coshf_FR_Tjlo = f33
-coshf_FR_Tmjhi = f34
-coshf_FR_Tmjlo = f35
-
-GR_mJ = r35
-GR_J = r36
-
-AD_mJ = r38
-AD_J = r39
-
-
-GR_SAVE_B0 = r42
-GR_SAVE_PFS = r41
-GR_SAVE_GP = r43
-
-GR_Parameter_X = r44
-GR_Parameter_Y = r45
-GR_Parameter_RESULT = r46
-GR_Parameter_TAG = r47
-
-FR_X = f8
-FR_Y = f0
-FR_RESULT = f44
-
-
-coshf_FR_C_hi = f9
-coshf_FR_C_hi_temp = f10
-coshf_FR_C_lo_temp1 = f11
-coshf_FR_C_lo_temp2 = f12
-coshf_FR_C_lo_temp3 = f13
-
-coshf_FR_C_lo = f38
-coshf_FR_S_hi = f39
+//*********************************************************************
+// Case 1: 0 < |x| < 0.25
+// Evaluate cosh(x) by a 8th order polynomial
+// Care is taken with the order of multiplication; and A2 is not exactly 1/4!,
+// A3 is not exactly 1/6!, etc.
+// cosh(x) = 1 + (A1*x^2 + A2*x^4 + A3*x^6 + A4*x^8)
+//
+// Case 2: 0.25 <= |x| < 89.41598
+// Algorithm is based on the identity cosh(x) = ( exp(x) + exp(-x) ) / 2.
+// The algorithm for exp is described below. There are a number of
+// economies from evaluating both exp(x) and exp(-x). Although we
+// are evaluating both quantities, only where the quantities diverge do we
+// duplicate the computations. The basic algorithm for exp(x) is described
+// below.
+//
+// Take the input x. w is "how many log2/64 in x?"
+// w = x * 64/log2
+// NJ = int(w)
+// x = NJ*log2/64 + R
-coshf_FR_S_hi_temp1 = f10
-coshf_FR_Y_hi = f11
-coshf_FR_Y_lo_temp = f12
-coshf_FR_Y_lo = f13
-coshf_FR_COSH = f9
+// NJ = 64*n + j
+// x = n*log2 + (log2/64)*j + R
+//
+// So, exp(x) = 2^n * 2^(j/64)* exp(R)
+//
+// T = 2^n * 2^(j/64)
+// Construct 2^n
+// Get 2^(j/64) table
+// Actually, all the entries of the 2^(j/64) table are stored in DP
+// with exponent bits set to 0 -> multiplication by 2^n can be
+// performed by a logical "or" operation with the bits representing 2^n
+
+// exp(R) = 1 + (exp(R) - 1)
+// P = exp(R) - 1 approximated by Taylor series of 3rd degree
+// P = A3*R^3 + A2*R^2 + R, A3 = 1/6, A2 = 1/2
+//
-coshf_FR_X2 = f9
-coshf_FR_X4 = f10
+// The final result is reconstructed as follows
+// exp(x) = T + T*P
-coshf_FR_P1 = f14
-coshf_FR_P2 = f15
-coshf_FR_P3 = f32
-coshf_FR_P4 = f33
-coshf_FR_P5 = f34
-coshf_FR_P6 = f35
+// Special values
+//*********************************************************************
+// coshf(+0) = 1.0
+// coshf(-0) = 1.0
-coshf_FR_TINY_THRESH = f9
+// coshf(+qnan) = +qnan
+// coshf(-qnan) = -qnan
+// coshf(+snan) = +qnan
+// coshf(-snan) = -qnan
-coshf_FR_COSH_temp = f10
-coshf_FR_SCALE = f11
+// coshf(-inf) = +inf
+// coshf(+inf) = +inf
-coshf_FR_hi_lo = f10
+// Overflow and Underflow
+//*********************************************************************
+// coshf(x) = largest single normal when
+// x = 89.41598 = 0x42b2d4fc
+//
+// There is no underflow.
-coshf_FR_poly_podd_temp1 = f11
-coshf_FR_poly_podd_temp2 = f13
-coshf_FR_poly_peven_temp1 = f11
-coshf_FR_poly_peven_temp2 = f13
+// Registers used
+//*********************************************************************
+// Floating Point registers used:
+// f8 input, output
+// f6,f7, f9 -> f15, f32 -> f45
-// Data tables
-//==============================================================
+// General registers used:
+// r2, r3, r16 -> r38
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+// Predicate registers used:
+// p6 -> p15
+// Assembly macros
+//*********************************************************************
+// integer registers used
+// scratch
+rNJ = r2
+rNJ_neg = r3
+
+rJ_neg = r16
+rN_neg = r17
+rSignexp_x = r18
+rExp_x = r18
+rExp_mask = r19
+rExp_bias = r20
+rAd1 = r21
+rAd2 = r22
+rJ = r23
+rN = r24
+rTblAddr = r25
+rA3 = r26
+rExpHalf = r27
+rLn2Div64 = r28
+rGt_ln = r29
+r17ones_m1 = r29
+rRightShifter = r30
+rJ_mask = r30
+r64DivLn2 = r31
+rN_mask = r31
+// stacked
+GR_SAVE_PFS = r32
+GR_SAVE_B0 = r33
+GR_SAVE_GP = r34
+GR_Parameter_X = r35
+GR_Parameter_Y = r36
+GR_Parameter_RESULT = r37
+GR_Parameter_TAG = r38
+
+// floating point registers used
+FR_X = f10
+FR_Y = f1
+FR_RESULT = f8
+// scratch
+fRightShifter = f6
+f64DivLn2 = f7
+fNormX = f9
+fNint = f10
+fN = f11
+fR = f12
+fLn2Div64 = f13
+fA2 = f14
+fA3 = f15
+// stacked
+fP = f32
+fT = f33
+fMIN_SGL_OFLOW_ARG = f34
+fMAX_SGL_NORM_ARG = f35
+fRSqr = f36
+fA1 = f37
+fA21 = f37
+fA4 = f38
+fA43 = f38
+fA4321 = f38
+fX4 = f39
+fTmp = f39
+fGt_pln = f39
+fWre_urm_f8 = f40
+fXsq = f40
+fP_neg = f41
+fT_neg = f42
+fExp = f43
+fExp_neg = f44
+fAbsX = f45
+
+
+RODATA
.align 16
-single_coshf_arg_reduction:
-ASM_TYPE_DIRECTIVE(single_coshf_arg_reduction,@object)
- data8 0xB8AA3B295C17F0BC, 0x00004005
- data8 0xB17217F7D1000000, 0x00003FF8
- data8 0xCF79ABC9E3B39804, 0x00003FD0
-ASM_SIZE_DIRECTIVE(single_coshf_arg_reduction)
-
-single_coshf_p_table:
-ASM_TYPE_DIRECTIVE(single_coshf_p_table,@object)
- data8 0x8000000000000000, 0x00003FFE
- data8 0xAAAAAAAAAAAAAB80, 0x00003FFA
- data8 0xB60B60B60B4FE884, 0x00003FF5
- data8 0xD00D00D1021D7370, 0x00003FEF
- data8 0x93F27740C0C2F1CC, 0x00003FE9
- data8 0x8FA02AC65BCBD5BC, 0x00003FE2
-ASM_SIZE_DIRECTIVE(single_coshf_p_table)
-
-single_coshf_ab_table:
-ASM_TYPE_DIRECTIVE(single_coshf_ab_table,@object)
- data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC
- data8 0x88888888884ECDD5, 0x00003FF8
- data8 0xD00D0C6DCC26A86B, 0x00003FF2
- data8 0x8000000000000002, 0x00003FFE
- data8 0xAAAAAAAAAA402C77, 0x00003FFA
- data8 0xB60B6CC96BDB144D, 0x00003FF5
-ASM_SIZE_DIRECTIVE(single_coshf_ab_table)
-
-single_coshf_j_table:
-ASM_TYPE_DIRECTIVE(single_coshf_j_table,@object)
- data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000
- data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000
- data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000
- data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000
- data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000
- data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000
- data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000
- data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000
- data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000
- data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000
- data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000
- data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000
- data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000
- data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000
- data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 0x00000000
- data8 0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000
- data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000
- data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000
- data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000
- data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000
- data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000
- data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000
- data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000
- data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000
- data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000
- data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000
- data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000
- data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000
- data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000
- data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000
- data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000
- data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000
- data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000
- data8 0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000
- data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000
- data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000
- data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000
- data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000
- data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000
- data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000
- data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000
- data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000
- data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000
- data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000
- data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000
- data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000
- data8 0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 0x00000000
- data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000
- data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000
- data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000
- data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000
- data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000
- data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000
- data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000
- data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000
- data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000
- data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000
- data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000
- data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000
- data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000
- data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000
- data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000
- data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000
- data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000
- data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000
-ASM_SIZE_DIRECTIVE(single_coshf_j_table)
-
-.align 32
-.global coshf#
-
-.section .text
-.proc coshf#
-.align 32
-
-coshf:
-
-#ifdef _LIBC
-.global __ieee754_coshf#
-.proc __ieee754_coshf#
-__ieee754_coshf:
-#endif
-// X NAN?
-
-
-{ .mfi
- alloc r32 = ar.pfs,0,12,4,0
-(p0) fclass.m.unc p6,p7 = f8, 0xc3
- nop.i 999 ;;
-}
-{ .mfb
- nop.m 999
-(p6) fma.s.s0 f8 = f8,f1,f8
-(p6) br.ret.spnt b0 ;;
-}
-
-{ .mfi
- nop.m 999
- nop.f 999
- nop.i 999 ;;
-}
+LOCAL_OBJECT_START(_coshf_table)
+data4 0x42b2d4fd // Smallest single arg to overflow single result
+data4 0x42b2d4fc // Largest single arg to give normal single result
+data4 0x00000000 // pad
+data4 0x00000000 // pad
+//
+// 2^(j/64) table, j goes from 0 to 63
+data8 0x0000000000000000 // 2^(0/64)
+data8 0x00002C9A3E778061 // 2^(1/64)
+data8 0x000059B0D3158574 // 2^(2/64)
+data8 0x0000874518759BC8 // 2^(3/64)
+data8 0x0000B5586CF9890F // 2^(4/64)
+data8 0x0000E3EC32D3D1A2 // 2^(5/64)
+data8 0x00011301D0125B51 // 2^(6/64)
+data8 0x0001429AAEA92DE0 // 2^(7/64)
+data8 0x000172B83C7D517B // 2^(8/64)
+data8 0x0001A35BEB6FCB75 // 2^(9/64)
+data8 0x0001D4873168B9AA // 2^(10/64)
+data8 0x0002063B88628CD6 // 2^(11/64)
+data8 0x0002387A6E756238 // 2^(12/64)
+data8 0x00026B4565E27CDD // 2^(13/64)
+data8 0x00029E9DF51FDEE1 // 2^(14/64)
+data8 0x0002D285A6E4030B // 2^(15/64)
+data8 0x000306FE0A31B715 // 2^(16/64)
+data8 0x00033C08B26416FF // 2^(17/64)
+data8 0x000371A7373AA9CB // 2^(18/64)
+data8 0x0003A7DB34E59FF7 // 2^(19/64)
+data8 0x0003DEA64C123422 // 2^(20/64)
+data8 0x0004160A21F72E2A // 2^(21/64)
+data8 0x00044E086061892D // 2^(22/64)
+data8 0x000486A2B5C13CD0 // 2^(23/64)
+data8 0x0004BFDAD5362A27 // 2^(24/64)
+data8 0x0004F9B2769D2CA7 // 2^(25/64)
+data8 0x0005342B569D4F82 // 2^(26/64)
+data8 0x00056F4736B527DA // 2^(27/64)
+data8 0x0005AB07DD485429 // 2^(28/64)
+data8 0x0005E76F15AD2148 // 2^(29/64)
+data8 0x0006247EB03A5585 // 2^(30/64)
+data8 0x0006623882552225 // 2^(31/64)
+data8 0x0006A09E667F3BCD // 2^(32/64)
+data8 0x0006DFB23C651A2F // 2^(33/64)
+data8 0x00071F75E8EC5F74 // 2^(34/64)
+data8 0x00075FEB564267C9 // 2^(35/64)
+data8 0x0007A11473EB0187 // 2^(36/64)
+data8 0x0007E2F336CF4E62 // 2^(37/64)
+data8 0x00082589994CCE13 // 2^(38/64)
+data8 0x000868D99B4492ED // 2^(39/64)
+data8 0x0008ACE5422AA0DB // 2^(40/64)
+data8 0x0008F1AE99157736 // 2^(41/64)
+data8 0x00093737B0CDC5E5 // 2^(42/64)
+data8 0x00097D829FDE4E50 // 2^(43/64)
+data8 0x0009C49182A3F090 // 2^(44/64)
+data8 0x000A0C667B5DE565 // 2^(45/64)
+data8 0x000A5503B23E255D // 2^(46/64)
+data8 0x000A9E6B5579FDBF // 2^(47/64)
+data8 0x000AE89F995AD3AD // 2^(48/64)
+data8 0x000B33A2B84F15FB // 2^(49/64)
+data8 0x000B7F76F2FB5E47 // 2^(50/64)
+data8 0x000BCC1E904BC1D2 // 2^(51/64)
+data8 0x000C199BDD85529C // 2^(52/64)
+data8 0x000C67F12E57D14B // 2^(53/64)
+data8 0x000CB720DCEF9069 // 2^(54/64)
+data8 0x000D072D4A07897C // 2^(55/64)
+data8 0x000D5818DCFBA487 // 2^(56/64)
+data8 0x000DA9E603DB3285 // 2^(57/64)
+data8 0x000DFC97337B9B5F // 2^(58/64)
+data8 0x000E502EE78B3FF6 // 2^(59/64)
+data8 0x000EA4AFA2A490DA // 2^(60/64)
+data8 0x000EFA1BEE615A27 // 2^(61/64)
+data8 0x000F50765B6E4540 // 2^(62/64)
+data8 0x000FA7C1819E90D8 // 2^(63/64)
+LOCAL_OBJECT_END(_coshf_table)
+
+LOCAL_OBJECT_START(cosh_p_table)
+data8 0x3efa3001dcf5905b // A4
+data8 0x3f56c1437543543e // A3
+data8 0x3fa5555572601504 // A2
+data8 0x3fdfffffffe2f097 // A1
+LOCAL_OBJECT_END(cosh_p_table)
-// X infinity
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p6,p0 = f8, 0x23
- nop.i 999 ;;
-}
-{ .mfb
- nop.m 999
-(p6) fmerge.s f8 = f0,f8
-(p6) br.ret.spnt b0 ;;
-}
+.section .text
+GLOBAL_IEEE754_ENTRY(coshf)
-// Put 0.25 in f9; p6 true if x < 0.25
{ .mlx
- nop.m 999
-(p0) movl r32 = 0x000000000000fffd ;;
-}
-
-{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
+ getf.exp rSignexp_x = f8 // Must recompute if x unorm
+ movl r64DivLn2 = 0x40571547652B82FE // 64/ln(2)
}
-
-{ .mfi
- nop.m 999
-(p0) fmerge.s coshf_FR_X = f0,f8
- nop.i 999
+{ .mlx
+ addl rTblAddr = @ltoff(_coshf_table),gp
+ movl rRightShifter = 0x43E8000000000000 // DP Right Shifter
}
+;;
{ .mfi
- nop.m 999
-(p0) fmerge.s coshf_FR_SGNX = f8,f1
- nop.i 999 ;;
+ // point to the beginning of the table
+ ld8 rTblAddr = [rTblAddr]
+ fclass.m p6, p0 = f8, 0x0b // Test for x=unorm
+ addl rA3 = 0x3E2AA, r0 // high bits of 1.0/6.0 rounded to SP
}
-
{ .mfi
- nop.m 999
-(p0) fcmp.lt.unc p0,p7 = coshf_FR_X,f9
- nop.i 999 ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p7) br.cond.sptk L(COSH_BY_TBL) ;;
-}
-
-
-// COSH_BY_POLY:
-
-// POLY cannot overflow so there is no need to call __libm_error_support
-// Get the values of P_x from the table
-
-{ .mmi
- nop.m 999
-(p0) addl r34 = @ltoff(single_coshf_p_table), gp
- nop.i 999
+ nop.m 0
+ fnorm.s1 fNormX = f8 // normalized x
+ addl rExpHalf = 0xFFFE, r0 // exponent of 1/2
}
;;
-{ .mmi
- ld8 r34 = [r34]
- nop.m 999
- nop.i 999
-}
-;;
-
-// Calculate coshf_FR_X2 = ax*ax and coshf_FR_X4 = ax*ax*ax*ax
-{ .mmf
- nop.m 999
-(p0) ldfe coshf_FR_P1 = [r34],16
-(p0) fma.s1 coshf_FR_X2 = coshf_FR_X, coshf_FR_X, f0 ;;
-}
-
-{ .mmi
-(p0) ldfe coshf_FR_P2 = [r34],16 ;;
-(p0) ldfe coshf_FR_P3 = [r34],16
- nop.i 999 ;;
-}
-
-{ .mmi
-(p0) ldfe coshf_FR_P4 = [r34],16 ;;
-(p0) ldfe coshf_FR_P5 = [r34],16
- nop.i 999 ;;
-}
-
{ .mfi
-(p0) ldfe coshf_FR_P6 = [r34],16
-(p0) fma.s1 coshf_FR_X4 = coshf_FR_X2, coshf_FR_X2, f0
- nop.i 999 ;;
+ setf.d f64DivLn2 = r64DivLn2 // load 64/ln(2) to FP reg
+ fclass.m p15, p0 = f8, 0x1e3 // test for NaT,NaN,Inf
+ nop.i 0
}
-
-// Calculate coshf_FR_podd = x4 *(x4 * P_5 + P_3) + P_1
-{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_poly_podd_temp1 = coshf_FR_X4, coshf_FR_P5, coshf_FR_P3
- nop.i 999 ;;
+{ .mlx
+ // load Right Shifter to FP reg
+ setf.d fRightShifter = rRightShifter
+ movl rLn2Div64 = 0x3F862E42FEFA39EF // DP ln(2)/64 in GR
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_podd = coshf_FR_X4, coshf_FR_poly_podd_temp1, coshf_FR_P1
- nop.i 999
+ mov rExp_mask = 0x1ffff
+ fcmp.eq.s1 p13, p0 = f0, f8 // test for x = 0.0
+ shl rA3 = rA3, 12 // 0x3E2AA000, approx to 1.0/6.0 in SP
}
-
-// Calculate coshf_FR_peven = p_even = x4 *(x4 * (x4 * P_6 + P_4) + P_2)
-{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_poly_peven_temp1 = coshf_FR_X4, coshf_FR_P6, coshf_FR_P4
- nop.i 999 ;;
+{ .mfb
+ nop.m 0
+ nop.f 0
+(p6) br.cond.spnt COSH_UNORM // Branch if x=unorm
}
+;;
+COSH_COMMON:
{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_poly_peven_temp2 = coshf_FR_X4, coshf_FR_poly_peven_temp1, coshf_FR_P2
- nop.i 999 ;;
+ setf.exp fA2 = rExpHalf // load A2 to FP reg
+ nop.f 0
+ mov rExp_bias = 0xffff
}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_peven = coshf_FR_X4, coshf_FR_poly_peven_temp2, f0
- nop.i 999 ;;
+{ .mfb
+ setf.d fLn2Div64 = rLn2Div64 // load ln(2)/64 to FP reg
+(p15) fma.s.s0 f8 = f8, f8, f0 // result if x = NaT,NaN,Inf
+(p15) br.ret.spnt b0 // exit here if x = NaT,NaN,Inf
}
-
-// Y_lo = x2*p_odd + p_even
-// Calculate f8 = Y_hi + Y_lo
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_Y_lo = coshf_FR_X2, coshf_FR_podd, coshf_FR_peven
- nop.i 999 ;;
+ // min overflow and max normal threshold
+ ldfps fMIN_SGL_OFLOW_ARG, fMAX_SGL_NORM_ARG = [rTblAddr], 8
+ nop.f 0
+ and rExp_x = rExp_mask, rSignexp_x // Biased exponent of x
}
-
{ .mfb
- nop.m 999
-(p0) fma.s.s0 f8 = f1, f1, coshf_FR_Y_lo
-(p0) br.ret.sptk b0 ;;
-}
-
-
-L(COSH_BY_TBL):
-
-// Now that we are at TBL; so far all we know is that |x| >= 0.25.
-// The first two steps are the same for TBL and EXP, but if we are HUGE
-// Double
-// Go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true)
-// Single
-// Go to HUGE if |x| >= 2^7, 10006 (register-biased) is e = 7 (true)
-// we want to leave now. Go to HUGE if |x| >= 2^14
-// 1000d (register-biased) is e = 14 (true)
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x0000000000010006 ;;
+ setf.s fA3 = rA3 // load A3 to FP reg
+(p13) fma.s.s0 f8 = f1, f1, f0 // result if x = 0.0
+(p13) br.ret.spnt b0 // exit here if x =0.0
}
+;;
{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
+ sub rExp_x = rExp_x, rExp_bias // True exponent of x
+ fmerge.s fAbsX = f0, fNormX // Form |x|
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fcmp.ge.unc p6,p7 = coshf_FR_X,f9
- nop.i 999 ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p6) br.cond.spnt L(COSH_HUGE) ;;
+ nop.m 0
+ // x*(64/ln(2)) + Right Shifter
+ fma.s1 fNint = fNormX, f64DivLn2, fRightShifter
+ add rTblAddr = 8, rTblAddr
}
-
-// r32 = 1
-// r34 = N-1
-// r35 = N
-// r36 = j
-// r37 = N+1
-
-// TBL can never overflow
-// coshf(x) = coshf(B+R)
-// = coshf(B) coshf(R) + sinh(B) sinh(R)
-// coshf(R) can be approximated by 1 + p_even
-// sinh(R) can be approximated by p_odd
-
-// ******************************************************
-// STEP 1 (TBL and EXP)
-// ******************************************************
-// Get the following constants.
-// f9 = Inv_log2by64
-// f10 = log2by64_hi
-// f11 = log2by64_lo
-
-{ .mmi
-(p0) adds r32 = 0x1,r0
-(p0) addl r34 = @ltoff(single_coshf_arg_reduction), gp
- nop.i 999
+{ .mfb
+ cmp.gt p7, p0 = -2, rExp_x // Test |x| < 2^(-2)
+ fma.s1 fXsq = fNormX, fNormX, f0 // x*x for small path
+(p7) br.cond.spnt COSH_SMALL // Branch if 0 < |x| < 2^-2
}
;;
-
-// We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and
-// put them in an exponent.
-// coshf_FR_spos = 2^(N-1) and coshf_FR_sneg = 2^(-N-1)
-// r39 = 0xffff + (N-1) = 0xffff +N -1
-// r40 = 0xffff - (N +1) = 0xffff -N -1
-
-{ .mlx
- ld8 r34 = [r34]
-(p0) movl r38 = 0x000000000000fffe ;;
-}
-
-{ .mmi
-(p0) ldfe coshf_FR_Inv_log2by64 = [r34],16 ;;
-(p0) ldfe coshf_FR_log2by64_hi = [r34],16
- nop.i 999 ;;
-}
-
-{ .mbb
-(p0) ldfe coshf_FR_log2by64_lo = [r34],16
- nop.b 999
- nop.b 999 ;;
-}
-
-// Get the A coefficients
-// f9 = A_1
-// f10 = A_2
-// f11 = A_3
-
-{ .mmi
- nop.m 999
-(p0) addl r34 = @ltoff(single_coshf_ab_table), gp
- nop.i 999
+{ .mfi
+ nop.m 0
+ // check for overflow
+ fcmp.ge.s1 p12, p13 = fAbsX, fMIN_SGL_OFLOW_ARG
+ mov rJ_mask = 0x3f // 6-bit mask for J
}
;;
-{ .mmi
- ld8 r34 = [r34]
- nop.m 999
- nop.i 999
+{ .mfb
+ nop.m 0
+ fms.s1 fN = fNint, f1, fRightShifter // n in FP register
+ // branch out if overflow
+(p12) br.cond.spnt COSH_CERTAIN_OVERFLOW
}
;;
-
-// Calculate M and keep it as integer and floating point.
-// M = round-to-integer(x*Inv_log2by64)
-// coshf_FR_M = M = truncate(ax/(log2/64))
-// Put the significand of M in r35
-// and the floating point representation of M in coshf_FR_M
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_M = coshf_FR_X, coshf_FR_Inv_log2by64, f0
- nop.i 999
-}
-
-{ .mfi
-(p0) ldfe coshf_FR_A1 = [r34],16
- nop.f 999
- nop.i 999 ;;
-}
-
{ .mfi
- nop.m 999
-(p0) fcvt.fx.s1 coshf_FR_M_temp = coshf_FR_M
- nop.i 999 ;;
+ getf.sig rNJ = fNint // bits of n, j
+ // check for possible overflow
+ fcmp.gt.s1 p13, p0 = fAbsX, fMAX_SGL_NORM_ARG
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fnorm.s1 coshf_FR_M = coshf_FR_M_temp
- nop.i 999 ;;
+ addl rN = 0xFFBF - 63, rNJ // biased and shifted n-1,j
+ fnma.s1 fR = fLn2Div64, fN, fNormX // R = x - N*ln(2)/64
+ and rJ = rJ_mask, rNJ // bits of j
}
-
{ .mfi
-(p0) getf.sig r35 = coshf_FR_M_temp
- nop.f 999
- nop.i 999 ;;
-}
-
-// M is still in r35. Calculate j. j is the signed extension of the six lsb of M. It
-// has a range of -32 thru 31.
-// r35 = M
-// r36 = j
-
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p0) and r36 = 0x3f, r35 ;;
+ sub rNJ_neg = r0, rNJ // bits of n, j for -x
+ nop.f 0
+ andcm rN_mask = -1, rJ_mask // 0xff...fc0 to mask N
}
-
-// Calculate R
-// f13 = f44 - f12*f10 = x - M*log2by64_hi
-// f14 = f13 - f8*f11 = R = (x - M*log2by64_hi) - M*log2by64_lo
+;;
{ .mfi
- nop.m 999
-(p0) fnma.s1 coshf_FR_R_temp = coshf_FR_M, coshf_FR_log2by64_hi, coshf_FR_X
- nop.i 999
+ shladd rJ = rJ, 3, rTblAddr // address in the 2^(j/64) table
+ nop.f 0
+ and rN = rN_mask, rN // biased, shifted n-1
}
-
{ .mfi
-(p0) ldfe coshf_FR_A2 = [r34],16
- nop.f 999
- nop.i 999 ;;
+ addl rN_neg = 0xFFBF - 63, rNJ_neg // -x biased, shifted n-1,j
+ nop.f 0
+ and rJ_neg = rJ_mask, rNJ_neg // bits of j for -x
}
+;;
{ .mfi
- nop.m 999
-(p0) fnma.s1 coshf_FR_R = coshf_FR_M, coshf_FR_log2by64_lo, coshf_FR_R_temp
- nop.i 999
+ ld8 rJ = [rJ] // Table value
+ nop.f 0
+ shl rN = rN, 46 // 2^(n-1) bits in DP format
}
-
-// Get the B coefficients
-// f15 = B_1
-// f32 = B_2
-// f33 = B_3
-
-{ .mmi
-(p0) ldfe coshf_FR_A3 = [r34],16 ;;
-(p0) ldfe coshf_FR_B1 = [r34],16
- nop.i 999 ;;
-}
-
-{ .mmi
-(p0) ldfe coshf_FR_B2 = [r34],16 ;;
-(p0) ldfe coshf_FR_B3 = [r34],16
- nop.i 999 ;;
-}
-
-{ .mii
- nop.m 999
-(p0) shl r34 = r36, 0x2 ;;
-(p0) sxt1 r37 = r34 ;;
-}
-
-// ******************************************************
-// STEP 2 (TBL and EXP)
-// ******************************************************
-// Calculate Rsquared and Rcubed in preparation for p_even and p_odd
-// f12 = R*R*R
-// f13 = R*R
-// f14 = R <== from above
-
{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_Rsq = coshf_FR_R, coshf_FR_R, f0
-(p0) shr r36 = r37, 0x2 ;;
-}
-
-// r34 = M-j = r35 - r36
-// r35 = N = (M-j)/64
-
-{ .mii
-(p0) sub r34 = r35, r36
- nop.i 999 ;;
-(p0) shr r35 = r34, 0x6 ;;
-}
-
-{ .mii
-(p0) sub r40 = r38, r35
-(p0) adds r37 = 0x1, r35
-(p0) add r39 = r38, r35 ;;
-}
-
-// Get the address of the J table, add the offset,
-// addresses are sinh_AD_mJ and sinh_AD_J, get the T value
-// f32 = T(j)_hi
-// f33 = T(j)_lo
-// f34 = T(-j)_hi
-// f35 = T(-j)_lo
-
-{ .mmi
-(p0) sub r34 = r35, r32
-(p0) addl r37 = @ltoff(single_coshf_j_table), gp
- nop.i 999
+ shladd rJ_neg = rJ_neg, 3, rTblAddr // addr in 2^(j/64) table -x
+ nop.f 0
+ and rN_neg = rN_mask, rN_neg // biased, shifted n-1 for -x
}
;;
{ .mfi
- ld8 r37 = [r37]
-(p0) fma.s1 coshf_FR_Rcub = coshf_FR_Rsq, coshf_FR_R, f0
- nop.i 999
-}
-
-// ******************************************************
-// STEP 3 Now decide if we need to branch to EXP
-// ******************************************************
-// Put 32 in f9; p6 true if x < 32
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x0000000000010004 ;;
+ ld8 rJ_neg = [rJ_neg] // Table value for -x
+ nop.f 0
+ shl rN_neg = rN_neg, 46 // 2^(n-1) bits in DP format for -x
}
-
-// Calculate p_even
-// f34 = B_2 + Rsq *B_3
-// f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3)
-// f36 = peven = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3))
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_peven_temp1 = coshf_FR_Rsq, coshf_FR_B3, coshf_FR_B2
- nop.i 999 ;;
+ or rN = rN, rJ // bits of 2^(n-1) * 2^(j/64) in DP format
+ nop.f 0
+ nop.i 0
}
+;;
-{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_peven_temp2 = coshf_FR_Rsq, coshf_FR_peven_temp1, coshf_FR_B1
- nop.i 999
+{ .mmf
+ setf.d fT = rN // 2^(n-1) * 2^(j/64)
+ or rN_neg = rN_neg, rJ_neg // -x bits of 2^(n-1) * 2^(j/64) in DP
+ fma.s1 fRSqr = fR, fR, f0 // R^2
}
-
-// Calculate p_odd
-// f34 = A_2 + Rsq *A_3
-// f35 = A_1 + Rsq * (A_2 + Rsq *A_3)
-// f37 = podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3))
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_podd_temp1 = coshf_FR_Rsq, coshf_FR_A3, coshf_FR_A2
- nop.i 999 ;;
+ setf.d fT_neg = rN_neg // 2^(n-1) * 2^(j/64) for -x
+ fma.s1 fP = fA3, fR, fA2 // A3*R + A2
+ nop.i 0
}
-
{ .mfi
-(p0) setf.exp coshf_FR_N_temp1 = r39
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fnma.s1 fP_neg = fA3, fR, fA2 // A3*R + A2 for -x
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_peven = coshf_FR_Rsq, coshf_FR_peven_temp2, f0
- nop.i 999
+ nop.m 0
+ fma.s1 fP = fP, fRSqr, fR // P = (A3*R + A2)*R^2 + R
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_podd_temp2 = coshf_FR_Rsq, coshf_FR_podd_temp1, coshf_FR_A1
- nop.i 999 ;;
+ nop.m 0
+ fms.s1 fP_neg = fP_neg, fRSqr, fR // P = (A3*R + A2)*R^2 + R, -x
+ nop.i 0
}
+;;
{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fmpy.s0 fTmp = fLn2Div64, fLn2Div64 // Force inexact
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_podd = coshf_FR_podd_temp2, coshf_FR_Rcub, coshf_FR_R
- nop.i 999
-}
-
-// sinh_GR_mj contains the table offset for -j
-// sinh_GR_j contains the table offset for +j
-// p6 is true when j <= 0
-
-{ .mlx
-(p0) setf.exp coshf_FR_N_temp2 = r40
-(p0) movl r40 = 0x0000000000000020 ;;
+ nop.m 0
+ fma.s1 fExp = fP, fT, fT // exp(x)/2
+ nop.i 0
}
-
-{ .mfi
-(p0) sub GR_mJ = r40, r36
-(p0) fmerge.se coshf_FR_spos = coshf_FR_N_temp1, f1
-(p0) adds GR_J = 0x20, r36 ;;
+{ .mfb
+ nop.m 0
+ fma.s1 fExp_neg = fP_neg, fT_neg, fT_neg // exp(-x)/2
+ // branch out if possible overflow result
+(p13) br.cond.spnt COSH_POSSIBLE_OVERFLOW
}
+;;
-{ .mii
- nop.m 999
-(p0) shl GR_mJ = GR_mJ, 5 ;;
-(p0) add AD_mJ = r37, GR_mJ ;;
+{ .mfb
+ nop.m 0
+ // final result in the absence of overflow
+ fma.s.s0 f8 = fExp, f1, fExp_neg // result = (exp(x)+exp(-x))/2
+ // exit here in the absence of overflow
+ br.ret.sptk b0 // Exit main path, 0.25 <= |x| < 89.41598
}
+;;
+// Here if 0 < |x| < 0.25. Evaluate 8th order polynomial.
+COSH_SMALL:
{ .mmi
- nop.m 999
-(p0) ldfe coshf_FR_Tmjhi = [AD_mJ],16
-(p0) shl GR_J = GR_J, 5 ;;
-}
-
-{ .mfi
-(p0) ldfs coshf_FR_Tmjlo = [AD_mJ],16
-(p0) fcmp.lt.unc.s1 p6,p7 = coshf_FR_X,f9
-(p0) add AD_J = r37, GR_J ;;
+ add rAd1 = 0x200, rTblAddr
+ add rAd2 = 0x210, rTblAddr
+ nop.i 0
}
+;;
{ .mmi
-(p0) ldfe coshf_FR_Tjhi = [AD_J],16 ;;
-(p0) ldfs coshf_FR_Tjlo = [AD_J],16
- nop.i 999 ;;
-}
-
-{ .mfb
- nop.m 999
-(p0) fmerge.se coshf_FR_sneg = coshf_FR_N_temp2, f1
-(p7) br.cond.spnt L(COSH_BY_EXP) ;;
-}
-
-// ******************************************************
-// If NOT branch to EXP
-// ******************************************************
-// Calculate C_hi
-// ******************************************************
-// coshf_FR_C_hi_temp = coshf_FR_sneg * coshf_FR_Tmjhi
-// coshf_FR_C_hi = coshf_FR_spos * coshf_FR_Tjhi + (coshf_FR_sneg * coshf_FR_Tmjhi)
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_C_hi_temp = coshf_FR_sneg, coshf_FR_Tmjhi, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_C_hi = coshf_FR_spos, coshf_FR_Tjhi, coshf_FR_C_hi_temp
- nop.i 999
-}
-
-// ******************************************************
-// Calculate S_hi
-// ******************************************************
-// coshf_FR_S_hi_temp1 = coshf_FR_sneg * coshf_FR_Tmjhi
-// coshf_FR_S_hi = coshf_FR_spos * coshf_FR_Tjhi - coshf_FR_C_hi_temp1
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_S_hi_temp1 = coshf_FR_sneg, coshf_FR_Tmjhi, f0
- nop.i 999 ;;
-}
-
-// ******************************************************
-// Calculate C_lo
-// ******************************************************
-// coshf_FR_C_lo_temp1 = coshf_FR_spos * coshf_FR_Tjhi - coshf_FR_C_hi
-// coshf_FR_C_lo_temp2 = coshf_FR_sneg * coshf_FR_Tmjlo + (coshf_FR_spos * coshf_FR_Tjhi - coshf_FR_C_hi)
-// coshf_FR_C_lo_temp1 = coshf_FR_sneg * coshf_FR_Tmjlo
-// coshf_FR_C_lo_temp3 = coshf_FR_spos * coshf_FR_Tjlo + (coshf_FR_sneg * coshf_FR_Tmjlo)
-// coshf_FR_C_lo = coshf_FR_C_lo_temp3 + coshf_FR_C_lo_temp2
-
-{ .mfi
- nop.m 999
-(p0) fms.s1 coshf_FR_C_lo_temp1 = coshf_FR_spos, coshf_FR_Tjhi, coshf_FR_C_hi
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p0) fms.s1 coshf_FR_S_hi = coshf_FR_spos, coshf_FR_Tjhi, coshf_FR_S_hi_temp1
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_C_lo_temp2 = coshf_FR_sneg, coshf_FR_Tmjhi, coshf_FR_C_lo_temp1
- nop.i 999
+ ldfpd fA4, fA3 = [rAd1]
+ ldfpd fA2, fA1 = [rAd2]
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_C_lo_temp1 = coshf_FR_sneg, coshf_FR_Tmjlo, f0
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fX4 = fXsq, fXsq, f0
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_C_lo_temp3 = coshf_FR_spos, coshf_FR_Tjlo, coshf_FR_C_lo_temp1
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fA43 = fXsq, fA4, fA3
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_C_lo = coshf_FR_C_lo_temp3, f1, coshf_FR_C_lo_temp2
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fA21 = fXsq, fA2, fA1
+ nop.i 0
}
-
-// ******************************************************
-// coshf_FR_Y_lo_temp = coshf_FR_C_hi * coshf_FR_peven + coshf_FR_C_lo
-// coshf_FR_Y_lo = coshf_FR_S_hi * coshf_FR_podd + coshf_FR_Y_lo_temp
-// coshf_FR_COSH = Y_hi + Y_lo
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_Y_lo_temp = coshf_FR_C_hi, coshf_FR_peven, coshf_FR_C_lo
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fA4321 = fX4, fA43, fA21
+ nop.i 0
}
+;;
+// Dummy multiply to generate inexact
{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_Y_lo = coshf_FR_S_hi, coshf_FR_podd, coshf_FR_Y_lo_temp
- nop.i 999 ;;
+ nop.m 0
+ fmpy.s0 fTmp = fA4, fA4
+ nop.i 0
}
-
{ .mfb
- nop.m 999
-(p0) fma.s.s0 f8 = coshf_FR_C_hi, f1, coshf_FR_Y_lo
-(p0) br.ret.sptk b0 ;;
+ nop.m 0
+ fma.s.s0 f8 = fA4321, fXsq, f1
+ br.ret.sptk b0 // Exit if 0 < |x| < 0.25
}
+;;
+COSH_POSSIBLE_OVERFLOW:
-L(COSH_BY_EXP):
+// Here if fMAX_SGL_NORM_ARG < x < fMIN_SGL_OFLOW_ARG
+// This cannot happen if input is a single, only if input is of higher precision.
+// Overflow is a possibility, not a certainty.
-// When p7 is true, we know that an overflow is not going to happen
-// When p7 is false, we must check for possible overflow
-// p7 is the over_SAFE flag
-// f44 = Scale * (Y_hi + Y_lo)
-// = coshf_FR_spos * (coshf_FR_Tjhi + coshf_FR_Y_lo)
+// Recompute result using status field 2 with user's rounding mode,
+// and wre set. If result is larger than largest single, then we have
+// overflow
{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_Y_lo_temp = coshf_FR_peven, f1, coshf_FR_podd
- nop.i 999
-}
-
-// Now we are in EXP. This is the only path where an overflow is possible
-// but not for certain. So this is the only path where over_SAFE has any use.
-// r34 still has N-1
-// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe
-// There is a danger of double overflow if N-1 > 0x3fe = 1022
-// There is a danger of single overflow if N-1 > 0x7e = 126
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x000000000000007e ;;
-}
-
-{ .mfi
-(p0) cmp.gt.unc p0,p7 = r34, r32
- nop.f 999
- nop.i 999 ;;
+ mov rGt_ln = 0x1007f // Exponent for largest single + 1 ulp
+ fsetc.s2 0x7F,0x42 // Get user's round mode, set wre
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_Y_lo = coshf_FR_Tjhi, coshf_FR_Y_lo_temp, coshf_FR_Tjlo
- nop.i 999 ;;
+ setf.exp fGt_pln = rGt_ln // Create largest single + 1 ulp
+ fma.s.s2 fWre_urm_f8 = fP, fT, fT // Result with wre set
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_COSH_temp = coshf_FR_Y_lo, f1, coshf_FR_Tjhi
- nop.i 999 ;;
+ nop.m 0
+ fsetc.s2 0x7F,0x40 // Turn off wre in sf2
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s.s0 f44 = coshf_FR_spos, coshf_FR_COSH_temp, f0
- nop.i 999 ;;
+ nop.m 0
+ fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow
+ nop.i 0
}
+;;
-// If over_SAFE is set, return
{ .mfb
- nop.m 999
-(p7) fmerge.s f8 = f44,f44
-(p7) br.ret.sptk b0 ;;
-}
-
-// Else see if we overflowed
-// S0 user supplied status
-// S2 user supplied status + WRE + TD (Overflows)
-// If WRE is set then an overflow will not occur in EXP.
-// The input value that would cause a register (WRE) value to overflow is about 2^15
-// and this input would go into the HUGE path.
-// Answer with WRE is in f43.
-
-{ .mfi
- nop.m 999
-(p0) fsetc.s2 0x7F,0x42
- nop.i 999;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s.s2 f43 = coshf_FR_spos, coshf_FR_COSH_temp, f0
- nop.i 999 ;;
-}
-
-// 1 more that the exponent of the largest double (7FE) = 7FF
-// 7FF - 3FF = 400 (true); 400 + FFFF = 103FF (register-biased)
-// So 0 103FF 8000000000000000 is one ulp more than
-// largest double in register bias
-// 1 more that the exponent of the largest single (FE) = FF
-// FF - 7F = 80 (true); 80 + FFFF = 1007F (register-biased)
-// Now set p8 if the answer with WRE is greater than or equal this value
-// Also set p9 if the answer with WRE is less than or equal to negative this value
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x000000000001007f ;;
+ nop.m 0
+ nop.f 0
+(p6) br.cond.spnt COSH_CERTAIN_OVERFLOW // Branch if overflow
}
+;;
-{ .mmf
- nop.m 999
-(p0) setf.exp f41 = r32
-(p0) fsetc.s2 0x7F,0x40 ;;
+{ .mfb
+ nop.m 0
+ fma.s.s0 f8 = fP, fT, fT
+ br.ret.sptk b0 // Exit if really no overflow
}
+;;
-{ .mfi
- nop.m 999
-(p0) fcmp.ge.unc.s1 p8, p0 = f43, f41
- nop.i 999
+// here if overflow
+COSH_CERTAIN_OVERFLOW:
+{ .mmi
+ addl r17ones_m1 = 0x1FFFE, r0
+;;
+ setf.exp fTmp = r17ones_m1
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fmerge.ns f42 = f41, f41
- nop.i 999 ;;
-}
-
-// The error tag for overflow is 65
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p8) mov GR_Parameter_TAG = 65 ;;
+ alloc r32 = ar.pfs, 0, 3, 4, 0 // get some registers
+ fmerge.s FR_X = f8,f8
+ nop.i 0
}
-
{ .mfb
- nop.m 999
-(p0) fcmp.le.unc.s1 p9, p0 = f43, f42
-(p8) br.cond.spnt __libm_error_region ;;
-}
-
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p9) mov GR_Parameter_TAG = 64
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p9) br.cond.spnt __libm_error_region ;;
+ mov GR_Parameter_TAG = 65
+ fma.s.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result
+ br.cond.sptk __libm_error_region
}
+;;
+// Here if x unorm
+COSH_UNORM:
{ .mfb
- nop.m 999
-(p0) fmerge.s f8 = f44,f44
-(p0) br.ret.sptk b0 ;;
+ getf.exp rSignexp_x = fNormX // Must recompute if x unorm
+ fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag
+ br.cond.sptk COSH_COMMON // Return to main path
}
+;;
+GLOBAL_IEEE754_END(coshf)
-L(COSH_HUGE):
-
-// for COSH_HUGE, put 24000 in exponent; take sign from input; add 1
-// SAFE: SAFE is always 0 for HUGE
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x0000000000015dbf ;;
-}
-
-{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 coshf_FR_hi_lo = f1, f9, f1
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s.s0 f44 = f9, coshf_FR_hi_lo, f0
-(p0) mov GR_Parameter_TAG = 65
-}
-.endp coshf
-ASM_SIZE_DIRECTIVE(coshf)
-
-
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
- add GR_Parameter_Y=-32,sp // Parameter 2 value
- nop.f 0
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
+ nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
-.fframe 64
- add sp=-64,sp // Create new stack
- nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+.fframe 64
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
- stfs [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
- add GR_Parameter_X = 16,sp // Parameter 1 address
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
};;
.body
-{ .mib
- stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y
- nop.b 0 // Parameter 3 address
+{ .mfi
+ stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
+ nop.f 0
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
}
{ .mib
- stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
- add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk.many b0=__libm_error_support# // Call error handling function
+ stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
+
{ .mmi
- nop.m 0
- nop.m 0
- add GR_Parameter_RESULT = 48,sp
+ add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
+
{ .mmi
- ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
+ ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
- add sp = 64,sp // Restore stack pointer
- mov b0 = GR_SAVE_B0 // Restore return address
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
- mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
- br.ret.sptk b0 // Return
-};;
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_coshl.S b/sysdeps/ia64/fpu/e_coshl.S
index daac20d9a3..cef8be0b1a 100644
--- a/sysdeps/ia64/fpu/e_coshl.S
+++ b/sysdeps/ia64/fpu/e_coshl.S
@@ -1,10 +1,10 @@
.file "coshl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2002, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,1129 +35,1060 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 Initial version
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
-// 1/23/01 Set inexact flag for large args.
+// 01/23/01 Set inexact flag for large args.
+// 05/07/01 Reworked to improve speed of all paths
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 12/06/02 Improved performance
//
// API
//==============================================================
-// float = cosh(float)
-// double = cosh(double)
// long double = coshl(long double)
// input floating point f8
// output floating point f8
-
-
+//
+// Registers used
+//==============================================================
+// general registers:
+// r14 -> r40
+// predicate registers used:
+// p6 -> p11
+// floating-point registers used:
+// f9 -> f15; f32 -> f90;
+// f8 has input, then output
+//
// Overview of operation
//==============================================================
-// There are four paths
-
-// 1. |x| < 0.25 COSH_BY_POLY
-// 2. |x| < 32 COSH_BY_TBL
-// 3. |x| < 2^14 COSH_BY_EXP
-// 4. |x| >= 2^14 COSH_HUGE
-
-// For paths 1, and 2 SAFE is always 1.
-// For path 4, Safe is always 0.
-// SAFE = 1 means we cannot overflow.
-
-#include "libm_support.h"
-
+// There are seven paths
+// 1. 0 < |x| < 0.25 COSH_BY_POLY
+// 2. 0.25 <=|x| < 32 COSH_BY_TBL
+// 3. 32 <= |x| < 11357.21655 COSH_BY_EXP (merged path with COSH_BY_TBL)
+// 4. |x| >= 11357.21655 COSH_HUGE
+// 5. x=0 Done with early exit
+// 6. x=inf,nan Done with early exit
+// 7. x=denormal COSH_DENORM
+//
+// For double extended we get overflow for x >= 400c b174 ddc0 31ae c0ea
+// >= 11357.21655
+//
+//
+// 1. COSH_BY_POLY 0 < |x| < 0.25
+// ===============
+// Evaluate cosh(x) by a 12th order polynomial
+// Care is taken for the order of multiplication; and P2 is not exactly 1/4!,
+// P3 is not exactly 1/6!, etc.
+// cosh(x) = 1 + (P1*x^2 + P2*x^4 + P3*x^6 + P4*x^8 + P5*x^10 + P6*x^12)
+//
+// 2. COSH_BY_TBL 0.25 <= |x| < 32.0
+// =============
+// cosh(x) = cosh(B+R)
+// = cosh(B)cosh(R) + sinh(B)sinh(R)
+//
+// ax = |x| = M*log2/64 + R
+// B = M*log2/64
+// M = 64*N + j
+// We will calculate M and get N as (M-j)/64
+// The division is a shift.
+// exp(B) = exp(N*log2 + j*log2/64)
+// = 2^N * 2^(j/64)
+// cosh(B) = 1/2(e^B + e^-B)
+// = 1/2(2^N * 2^(j/64) + 2^-N * 2^(-j/64))
+// cosh(B) = (2^(N-1) * 2^(j/64) + 2^(-N-1) * 2^(-j/64))
+// sinh(B) = (2^(N-1) * 2^(j/64) - 2^(-N-1) * 2^(-j/64))
+// 2^(j/64) is stored as Tjhi + Tjlo, j = -32,...,32
+// Tjhi is double-extended (80-bit) and Tjlo is single(32-bit)
+//
+// R = ax - M*log2/64
+// R = ax - M*log2_by_64_hi - M*log2_by_64_lo
+// exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...)
+// = 1 + p_odd + p_even
+// where p_odd uses the A coefficients and p_even uses
+// the B coefficients
+//
+// So sinh(R) = ((1 + p_odd + p_even) - (1 - p_odd + p_even))/2 = p_odd
+// cosh(R) = 1 + p_even
+// cosh(B) = C_hi + C_lo
+// sinh(B) = S_hi
+// cosh(x) = cosh(B)cosh(R) + sinh(B)sinh(R)
+//
+// 3. COSH_BY_EXP 32.0 <= |x| < 11357.21655 ( 400c b174 ddc0 31ae c0ea )
+// ==============
+// Can approximate result by exp(x)/2 in this region.
+// Y_hi = Tjhi
+// Y_lo = Tjhi * (p_odd + p_even) + Tjlo
+// cosh(x) = Y_hi + Y_lo
+//
+// 4. COSH_HUGE |x| >= 11357.21655 ( 400c b174 ddc0 31ae c0ea )
+// ============
+// Set error tag and call error support
+//
+//
// Assembly macros
//==============================================================
-cosh_FR_X = f44
-FR_RESULT = f44
-cosh_FR_SGNX = f40
-cosh_FR_all_ones = f45
-
-FR_X = f8
-FR_Y = f0
-cosh_FR_Inv_log2by64 = f9
-cosh_FR_log2by64_lo = f11
-cosh_FR_log2by64_hi = f10
-
-cosh_FR_A1 = f9
-cosh_FR_A2 = f10
-cosh_FR_A3 = f11
-
-cosh_FR_Rcub = f12
-cosh_FR_M_temp = f13
-cosh_FR_R_temp = f13
-cosh_FR_Rsq = f13
-cosh_FR_R = f14
-
-cosh_FR_M = f38
-
-cosh_FR_tmp = f15
-cosh_FR_B1 = f15
-cosh_FR_B2 = f32
-cosh_FR_B3 = f33
-
-cosh_FR_peven_temp1 = f34
-cosh_FR_peven_temp2 = f35
-cosh_FR_peven = f36
-
-cosh_FR_podd_temp1 = f34
-cosh_FR_podd_temp2 = f35
-cosh_FR_podd = f37
-
-cosh_FR_J_temp = f9
-cosh_FR_J = f10
-
-cosh_FR_Mmj = f39
-
-cosh_FR_N_temp1 = f11
-cosh_FR_N_temp2 = f12
-cosh_FR_N = f13
-
-cosh_FR_spos = f14
-cosh_FR_sneg = f15
-
-cosh_FR_Tjhi = f32
-cosh_FR_Tjlo = f33
-cosh_FR_Tmjhi = f34
-cosh_FR_Tmjlo = f35
-
-GR_mJ = r35
-GR_J = r36
-
-AD_mJ = r38
-AD_J = r39
-
-cosh_GR_all_ones = r40
-
-GR_SAVE_PFS = r41
-GR_SAVE_B0 = r42
-GR_SAVE_GP = r43
-GR_Parameter_X = r44
-GR_Parameter_Y = r45
-GR_Parameter_RESULT = r46
-GR_Parameter_TAG = r47
+r_ad5 = r14
+r_rshf_2to57 = r15
+r_exp_denorm = r15
+r_ad_mJ_lo = r15
+r_ad_J_lo = r16
+r_2Nm1 = r17
+r_2mNm1 = r18
+r_exp_x = r18
+r_ad_J_hi = r19
+r_ad2o = r19
+r_ad_mJ_hi = r20
+r_mj = r21
+r_ad2e = r22
+r_ad3 = r23
+r_ad1 = r24
+r_Mmj = r24
+r_rshf = r25
+r_M = r25
+r_N = r25
+r_jshf = r26
+r_exp_2tom57 = r26
+r_j = r26
+r_exp_mask = r27
+r_signexp_x = r28
+r_signexp_0_5 = r28
+r_exp_0_25 = r29
+r_sig_inv_ln2 = r30
+r_exp_32 = r30
+r_exp_huge = r30
+r_ad4 = r31
+
+GR_SAVE_PFS = r34
+GR_SAVE_B0 = r35
+GR_SAVE_GP = r36
+
+GR_Parameter_X = r37
+GR_Parameter_Y = r38
+GR_Parameter_RESULT = r39
+GR_Parameter_TAG = r40
+
+
+f_ABS_X = f9
+f_X2 = f10
+f_X4 = f11
+f_tmp = f14
+f_RSHF = f15
+
+f_Inv_log2by64 = f32
+f_log2by64_lo = f33
+f_log2by64_hi = f34
+f_A1 = f35
+
+f_A2 = f36
+f_A3 = f37
+f_Rcub = f38
+f_M_temp = f39
+f_R_temp = f40
+
+f_Rsq = f41
+f_R = f42
+f_M = f43
+f_B1 = f44
+f_B2 = f45
+
+f_B3 = f46
+f_peven_temp1 = f47
+f_peven_temp2 = f48
+f_peven = f49
+f_podd_temp1 = f50
+
+f_podd_temp2 = f51
+f_podd = f52
+f_poly65 = f53
+f_poly6543 = f53
+f_poly6to1 = f53
+f_poly43 = f54
+f_poly21 = f55
+
+f_X3 = f56
+f_INV_LN2_2TO63 = f57
+f_RSHF_2TO57 = f58
+f_2TOM57 = f59
+f_smlst_oflow_input = f60
+
+f_pre_result = f61
+f_huge = f62
+f_spos = f63
+f_sneg = f64
+f_Tjhi = f65
+
+f_Tjlo = f66
+f_Tmjhi = f67
+f_Tmjlo = f68
+f_S_hi = f69
+f_SC_hi_temp = f70
+
+f_C_lo_temp1 = f71
+f_C_lo_temp2 = f72
+f_C_lo_temp3 = f73
+f_C_lo_temp4 = f73
+f_C_lo = f74
+f_C_hi = f75
+
+f_Y_hi = f77
+f_Y_lo_temp = f78
+f_Y_lo = f79
+f_NORM_X = f80
+
+f_P1 = f81
+f_P2 = f82
+f_P3 = f83
+f_P4 = f84
+f_P5 = f85
+
+f_P6 = f86
+f_Tjhi_spos = f87
+f_Tjlo_spos = f88
+f_huge = f89
+f_signed_hi_lo = f90
-cosh_FR_C_hi = f9
-cosh_FR_C_hi_temp = f10
-cosh_FR_C_lo_temp1 = f11
-cosh_FR_C_lo_temp2 = f12
-cosh_FR_C_lo_temp3 = f13
-
-cosh_FR_C_lo = f38
-cosh_FR_S_hi = f39
-
-cosh_FR_S_hi_temp1 = f10
-cosh_FR_Y_hi = f11
-cosh_FR_Y_lo_temp = f12
-cosh_FR_Y_lo = f13
-cosh_FR_COSH = f9
-
-cosh_FR_X2 = f9
-cosh_FR_X4 = f10
-
-cosh_FR_P1 = f14
-cosh_FR_P2 = f15
-cosh_FR_P3 = f32
-cosh_FR_P4 = f33
-cosh_FR_P5 = f34
-cosh_FR_P6 = f35
-
-cosh_FR_TINY_THRESH = f9
-
-cosh_FR_COSH_temp = f10
-cosh_FR_SCALE = f11
-
-cosh_FR_hi_lo = f10
-
-cosh_FR_poly_podd_temp1 = f11
-cosh_FR_poly_podd_temp2 = f13
-cosh_FR_poly_peven_temp1 = f11
-cosh_FR_poly_peven_temp2 = f13
// Data tables
//==============================================================
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+// DO NOT CHANGE ORDER OF THESE TABLES
+RODATA
.align 16
-double_cosh_arg_reduction:
-ASM_TYPE_DIRECTIVE(double_cosh_arg_reduction,@object)
- data8 0xB8AA3B295C17F0BC, 0x00004005
- data8 0xB17217F7D1000000, 0x00003FF8
- data8 0xCF79ABC9E3B39804, 0x00003FD0
-ASM_SIZE_DIRECTIVE(double_cosh_arg_reduction)
-
-double_cosh_p_table:
-ASM_TYPE_DIRECTIVE(double_cosh_p_table,@object)
- data8 0x8000000000000000, 0x00003FFE
- data8 0xAAAAAAAAAAAAAB80, 0x00003FFA
- data8 0xB60B60B60B4FE884, 0x00003FF5
- data8 0xD00D00D1021D7370, 0x00003FEF
- data8 0x93F27740C0C2F1CC, 0x00003FE9
- data8 0x8FA02AC65BCBD5BC, 0x00003FE2
-ASM_SIZE_DIRECTIVE(double_cosh_p_table)
-
-double_cosh_ab_table:
-ASM_TYPE_DIRECTIVE(double_cosh_ab_table,@object)
- data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC
- data8 0x88888888884ECDD5, 0x00003FF8
- data8 0xD00D0C6DCC26A86B, 0x00003FF2
- data8 0x8000000000000002, 0x00003FFE
- data8 0xAAAAAAAAAA402C77, 0x00003FFA
- data8 0xB60B6CC96BDB144D, 0x00003FF5
-ASM_SIZE_DIRECTIVE(double_cosh_ab_table)
-
-double_cosh_j_table:
-ASM_TYPE_DIRECTIVE(double_cosh_j_table,@object)
- data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000
- data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000
- data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000
- data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000
- data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000
- data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000
- data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000
- data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000
- data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000
- data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000
- data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000
- data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000
- data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000
- data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000
- data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 0x00000000
- data8 0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000
- data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000
- data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000
- data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000
- data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000
- data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000
- data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000
- data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000
- data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000
- data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000
- data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000
- data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000
- data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000
- data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000
- data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000
- data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000
- data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000
- data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000
- data8 0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000
- data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000
- data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000
- data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000
- data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000
- data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000
- data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000
- data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000
- data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000
- data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000
- data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000
- data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000
- data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000
- data8 0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 0x00000000
- data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000
- data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000
- data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000
- data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000
- data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000
- data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000
- data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000
- data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000
- data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000
- data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000
- data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000
- data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000
- data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000
- data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000
- data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000
- data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000
- data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000
- data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000
-ASM_SIZE_DIRECTIVE(double_cosh_j_table)
-
-.align 32
-.global coshl#
-
-.section .text
-.proc coshl#
-.align 32
-
-coshl:
-
-#ifdef _LIBC
-.global __ieee754_coshl#
-.proc __ieee754_coshl#
-__ieee754_coshl:
-#endif
-
-// X NAN?
-
-{ .mfi
- alloc r32 = ar.pfs,0,12,4,0
-(p0) fclass.m.unc p6,p7 = f8, 0xc3
- mov cosh_GR_all_ones = -1
-};;
-
-// This is more than we need but it is in preparation
-// for the values we add for error support. We push three
-// addresses on the stack (3*8) = 24 bytes and one tag
-
-{ .mfb
- nop.m 999
-(p6) fma.s0 f8 = f8,f1,f8
-(p6) br.ret.spnt b0 ;;
-}
-
-
-// Make constant that will generate inexact when squared
-// X infinity
-{ .mfi
- setf.sig cosh_FR_all_ones = cosh_GR_all_ones
-(p0) fclass.m.unc p6,p0 = f8, 0x23
- nop.i 999 ;;
-}
-
-{ .mfb
- nop.m 999
-(p6) fmerge.s f8 = f0,f8
-(p6) br.ret.spnt b0 ;;
-}
+LOCAL_OBJECT_START(cosh_arg_reduction)
+// data8 0xB8AA3B295C17F0BC, 0x00004005 // 64/log2 -- signif loaded with setf
+ data8 0xB17217F7D1000000, 0x00003FF8 // log2/64 high part
+ data8 0xCF79ABC9E3B39804, 0x00003FD0 // log2/64 low part
+ data8 0xb174ddc031aec0ea, 0x0000400c // Smallest x to overflow (11357.21655)
+LOCAL_OBJECT_END(cosh_arg_reduction)
+
+LOCAL_OBJECT_START(cosh_p_table)
+ data8 0x8FA02AC65BCBD5BC, 0x00003FE2 // P6
+ data8 0xD00D00D1021D7370, 0x00003FEF // P4
+ data8 0xAAAAAAAAAAAAAB80, 0x00003FFA // P2
+ data8 0x93F27740C0C2F1CC, 0x00003FE9 // P5
+ data8 0xB60B60B60B4FE884, 0x00003FF5 // P3
+ data8 0x8000000000000000, 0x00003FFE // P1
+LOCAL_OBJECT_END(cosh_p_table)
+
+LOCAL_OBJECT_START(cosh_ab_table)
+ data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC // A1
+ data8 0x88888888884ECDD5, 0x00003FF8 // A2
+ data8 0xD00D0C6DCC26A86B, 0x00003FF2 // A3
+ data8 0x8000000000000002, 0x00003FFE // B1
+ data8 0xAAAAAAAAAA402C77, 0x00003FFA // B2
+ data8 0xB60B6CC96BDB144D, 0x00003FF5 // B3
+LOCAL_OBJECT_END(cosh_ab_table)
+
+LOCAL_OBJECT_START(cosh_j_hi_table)
+ data8 0xB504F333F9DE6484, 0x00003FFE
+ data8 0xB6FD91E328D17791, 0x00003FFE
+ data8 0xB8FBAF4762FB9EE9, 0x00003FFE
+ data8 0xBAFF5AB2133E45FB, 0x00003FFE
+ data8 0xBD08A39F580C36BF, 0x00003FFE
+ data8 0xBF1799B67A731083, 0x00003FFE
+ data8 0xC12C4CCA66709456, 0x00003FFE
+ data8 0xC346CCDA24976407, 0x00003FFE
+ data8 0xC5672A115506DADD, 0x00003FFE
+ data8 0xC78D74C8ABB9B15D, 0x00003FFE
+ data8 0xC9B9BD866E2F27A3, 0x00003FFE
+ data8 0xCBEC14FEF2727C5D, 0x00003FFE
+ data8 0xCE248C151F8480E4, 0x00003FFE
+ data8 0xD06333DAEF2B2595, 0x00003FFE
+ data8 0xD2A81D91F12AE45A, 0x00003FFE
+ data8 0xD4F35AABCFEDFA1F, 0x00003FFE
+ data8 0xD744FCCAD69D6AF4, 0x00003FFE
+ data8 0xD99D15C278AFD7B6, 0x00003FFE
+ data8 0xDBFBB797DAF23755, 0x00003FFE
+ data8 0xDE60F4825E0E9124, 0x00003FFE
+ data8 0xE0CCDEEC2A94E111, 0x00003FFE
+ data8 0xE33F8972BE8A5A51, 0x00003FFE
+ data8 0xE5B906E77C8348A8, 0x00003FFE
+ data8 0xE8396A503C4BDC68, 0x00003FFE
+ data8 0xEAC0C6E7DD24392F, 0x00003FFE
+ data8 0xED4F301ED9942B84, 0x00003FFE
+ data8 0xEFE4B99BDCDAF5CB, 0x00003FFE
+ data8 0xF281773C59FFB13A, 0x00003FFE
+ data8 0xF5257D152486CC2C, 0x00003FFE
+ data8 0xF7D0DF730AD13BB9, 0x00003FFE
+ data8 0xFA83B2DB722A033A, 0x00003FFE
+ data8 0xFD3E0C0CF486C175, 0x00003FFE
+ data8 0x8000000000000000, 0x00003FFF // Center of table
+ data8 0x8164D1F3BC030773, 0x00003FFF
+ data8 0x82CD8698AC2BA1D7, 0x00003FFF
+ data8 0x843A28C3ACDE4046, 0x00003FFF
+ data8 0x85AAC367CC487B15, 0x00003FFF
+ data8 0x871F61969E8D1010, 0x00003FFF
+ data8 0x88980E8092DA8527, 0x00003FFF
+ data8 0x8A14D575496EFD9A, 0x00003FFF
+ data8 0x8B95C1E3EA8BD6E7, 0x00003FFF
+ data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF
+ data8 0x8EA4398B45CD53C0, 0x00003FFF
+ data8 0x9031DC431466B1DC, 0x00003FFF
+ data8 0x91C3D373AB11C336, 0x00003FFF
+ data8 0x935A2B2F13E6E92C, 0x00003FFF
+ data8 0x94F4EFA8FEF70961, 0x00003FFF
+ data8 0x96942D3720185A00, 0x00003FFF
+ data8 0x9837F0518DB8A96F, 0x00003FFF
+ data8 0x99E0459320B7FA65, 0x00003FFF
+ data8 0x9B8D39B9D54E5539, 0x00003FFF
+ data8 0x9D3ED9A72CFFB751, 0x00003FFF
+ data8 0x9EF5326091A111AE, 0x00003FFF
+ data8 0xA0B0510FB9714FC2, 0x00003FFF
+ data8 0xA27043030C496819, 0x00003FFF
+ data8 0xA43515AE09E6809E, 0x00003FFF
+ data8 0xA5FED6A9B15138EA, 0x00003FFF
+ data8 0xA7CD93B4E965356A, 0x00003FFF
+ data8 0xA9A15AB4EA7C0EF8, 0x00003FFF
+ data8 0xAB7A39B5A93ED337, 0x00003FFF
+ data8 0xAD583EEA42A14AC6, 0x00003FFF
+ data8 0xAF3B78AD690A4375, 0x00003FFF
+ data8 0xB123F581D2AC2590, 0x00003FFF
+ data8 0xB311C412A9112489, 0x00003FFF
+ data8 0xB504F333F9DE6484, 0x00003FFF
+LOCAL_OBJECT_END(cosh_j_hi_table)
+
+LOCAL_OBJECT_START(cosh_j_lo_table)
+ data4 0x1EB2FB13
+ data4 0x1CE2CBE2
+ data4 0x1DDC3CBC
+ data4 0x1EE9AA34
+ data4 0x9EAEFDC1
+ data4 0x9DBF517B
+ data4 0x1EF88AFB
+ data4 0x1E03B216
+ data4 0x1E78AB43
+ data4 0x9E7B1747
+ data4 0x9EFE3C0E
+ data4 0x9D36F837
+ data4 0x9DEE53E4
+ data4 0x9E24AE8E
+ data4 0x1D912473
+ data4 0x1EB243BE
+ data4 0x1E669A2F
+ data4 0x9BBC610A
+ data4 0x1E761035
+ data4 0x9E0BE175
+ data4 0x1CCB12A1
+ data4 0x1D1BFE90
+ data4 0x1DF2F47A
+ data4 0x1EF22F22
+ data4 0x9E3F4A29
+ data4 0x1EC01A5B
+ data4 0x1E8CAC3A
+ data4 0x9DBB3FAB
+ data4 0x1EF73A19
+ data4 0x9BB795B5
+ data4 0x1EF84B76
+ data4 0x9EF5818B
+ data4 0x00000000 // Center of table
+ data4 0x1F77CACA
+ data4 0x1EF8A91D
+ data4 0x1E57C976
+ data4 0x9EE8DA92
+ data4 0x1EE85C9F
+ data4 0x1F3BF1AF
+ data4 0x1D80CA1E
+ data4 0x9D0373AF
+ data4 0x9F167097
+ data4 0x1EB70051
+ data4 0x1F6EB029
+ data4 0x1DFD6D8E
+ data4 0x9EB319B0
+ data4 0x1EBA2BEB
+ data4 0x1F11D537
+ data4 0x1F0D5A46
+ data4 0x9E5E7BCA
+ data4 0x9F3AAFD1
+ data4 0x9E86DACC
+ data4 0x9F3EDDC2
+ data4 0x1E496E3D
+ data4 0x9F490BF6
+ data4 0x1DD1DB48
+ data4 0x1E65EBFB
+ data4 0x9F427496
+ data4 0x1F283C4A
+ data4 0x1F4B0047
+ data4 0x1F130152
+ data4 0x9E8367C0
+ data4 0x9F705F90
+ data4 0x1EFB3C53
+ data4 0x1F32FB13
+LOCAL_OBJECT_END(cosh_j_lo_table)
+.section .text
+GLOBAL_IEEE754_ENTRY(coshl)
-// Put 0.25 in f9; p6 true if x < 0.25
{ .mlx
- nop.m 999
-(p0) movl r32 = 0x000000000000fffd ;;
-}
-
-{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
+ getf.exp r_signexp_x = f8 // Get signexp of x, must redo if unorm
+ movl r_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
}
-
-{ .mfi
- nop.m 999
-(p0) fmerge.s cosh_FR_X = f0,f8
- nop.i 999
+{ .mlx
+ addl r_ad1 = @ltoff(cosh_arg_reduction), gp
+ movl r_rshf_2to57 = 0x4778000000000000 // 1.10000 2^(63+57)
}
+;;
{ .mfi
- nop.m 999
-(p0) fmerge.s cosh_FR_SGNX = f8,f1
- nop.i 999 ;;
+ ld8 r_ad1 = [r_ad1]
+ fmerge.s f_ABS_X = f0,f8
+ mov r_exp_0_25 = 0x0fffd // Form exponent for 0.25
}
-
{ .mfi
- nop.m 999
-(p0) fcmp.lt.unc p0,p7 = cosh_FR_X,f9
- nop.i 999 ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p7) br.cond.sptk L(COSH_BY_TBL)
+ nop.m 0
+ fnorm.s1 f_NORM_X = f8
+ mov r_exp_2tom57 = 0xffff-57
}
;;
-
-// COSH_BY_POLY:
-// POLY cannot overflow so there is no need to call __libm_error_support
-// Get the values of P_x from the table
-
-{ .mmi
- nop.m 999
-(p0) addl r34 = @ltoff(double_cosh_p_table), gp
- nop.i 999
+{ .mfi
+ setf.d f_RSHF_2TO57 = r_rshf_2to57 // Form const 1.100 * 2^120
+ fclass.m p10,p0 = f8, 0x0b // Test for denorm
+ mov r_exp_mask = 0x1ffff
}
-;;
-
-{ .mmi
- ld8 r34 = [r34]
- nop.m 999
- nop.i 999
+{ .mlx
+ setf.sig f_INV_LN2_2TO63 = r_sig_inv_ln2 // Form 1/ln2 * 2^63
+ movl r_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift
}
;;
-
-// Calculate cosh_FR_X2 = ax*ax and cosh_FR_X4 = ax*ax*ax*ax
-{ .mmf
- nop.m 999
-(p0) ldfe cosh_FR_P1 = [r34],16
-(p0) fma.s1 cosh_FR_X2 = cosh_FR_X, cosh_FR_X, f0 ;;
-}
-
-{ .mmi
-(p0) ldfe cosh_FR_P2 = [r34],16 ;;
-(p0) ldfe cosh_FR_P3 = [r34],16
- nop.i 999 ;;
-}
-
-{ .mmi
-(p0) ldfe cosh_FR_P4 = [r34],16 ;;
-(p0) ldfe cosh_FR_P5 = [r34],16
- nop.i 999 ;;
-}
-
{ .mfi
-(p0) ldfe cosh_FR_P6 = [r34],16
-(p0) fma.s1 cosh_FR_X4 = cosh_FR_X2, cosh_FR_X2, f0
- nop.i 999 ;;
+ nop.m 0
+ fclass.m p7,p0 = f8, 0x07 // Test if x=0
+ nop.i 0
}
-
-// Calculate cosh_FR_podd = x4 *(x4 * P_5 + P_3) + P_1
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_poly_podd_temp1 = cosh_FR_X4, cosh_FR_P5, cosh_FR_P3
- nop.i 999 ;;
+ setf.exp f_2TOM57 = r_exp_2tom57 // Form 2^-57 for scaling
+ nop.f 0
+ add r_ad3 = 0x90, r_ad1 // Point to ab_table
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_podd = cosh_FR_X4, cosh_FR_poly_podd_temp1, cosh_FR_P1
- nop.i 999
+ setf.d f_RSHF = r_rshf // Form right shift const 1.100 * 2^63
+ fclass.m p6,p0 = f8, 0xe3 // Test if x nan, inf
+ add r_ad4 = 0x2f0, r_ad1 // Point to j_hi_table midpoint
}
-
-// Calculate cosh_FR_peven = p_even = x4 *(x4 * (x4 * P_6 + P_4) + P_2)
-{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_poly_peven_temp1 = cosh_FR_X4, cosh_FR_P6, cosh_FR_P4
- nop.i 999 ;;
+{ .mib
+ add r_ad2e = 0x20, r_ad1 // Point to p_table
+ nop.i 0
+(p10) br.cond.spnt COSH_DENORM // Branch if x denorm
}
+;;
+// Common path -- return here from COSH_DENORM if x is unorm
+COSH_COMMON:
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_poly_peven_temp2 = cosh_FR_X4, cosh_FR_poly_peven_temp1, cosh_FR_P2
- nop.i 999 ;;
+ ldfe f_smlst_oflow_input = [r_ad2e],16
+(p7) fma.s0 f8 = f1, f1, f0 // Result = 1.0 if x=0
+ add r_ad5 = 0x580, r_ad1 // Point to j_lo_table midpoint
}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_peven = cosh_FR_X4, cosh_FR_poly_peven_temp2, f0
- nop.i 999 ;;
+{ .mib
+ ldfe f_log2by64_hi = [r_ad1],16
+ and r_exp_x = r_exp_mask, r_signexp_x
+(p7) br.ret.spnt b0 // Exit if x=0
}
+;;
-// Y_lo = x2*p_odd + p_even
-// Calculate f8 = Y_hi + Y_lo
+// Get the A coefficients for COSH_BY_TBL
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_Y_lo = cosh_FR_X2, cosh_FR_podd, cosh_FR_peven
- nop.i 999 ;;
+ ldfe f_A1 = [r_ad3],16
+ fcmp.lt.s1 p8,p9 = f8,f0 // Test for x<0
+ cmp.lt p7,p0 = r_exp_x, r_exp_0_25 // Test x < 0.25
}
-
{ .mfb
- nop.m 999
-(p0) fma.s0 f8 = f1, f1, cosh_FR_Y_lo
-(p0) br.ret.sptk b0 ;;
-}
-
-
-L(COSH_BY_TBL):
-
-// Now that we are at TBL; so far all we know is that |x| >= 0.25.
-// The first two steps are the same for TBL and EXP, but if we are HUGE
-// Double Extended
-// Go to HUGE if |x| >= 2^14, 1000d (register-biased) is e = 14 (true)
-// Double
-// Go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true)
-// Single
-// Go to HUGE if |x| >= 2^7, 10006 (register-biased) is e = 7 (true)
-// we want to leave now. Go to HUGE if |x| >= 2^14
-// 1000d (register-biased) is e = 14 (true)
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x000000000001000d ;;
-}
-
-{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
+ add r_ad2o = 0x30, r_ad2e // Point to p_table odd coeffs
+(p6) fma.s0 f8 = f8,f8,f0 // Result for x nan, inf
+(p6) br.ret.spnt b0 // Exit for x nan, inf
}
+;;
+// Calculate X2 = ax*ax for COSH_BY_POLY
{ .mfi
- nop.m 999
-(p0) fcmp.ge.unc p6,p7 = cosh_FR_X,f9
- nop.i 999 ;;
+ ldfe f_log2by64_lo = [r_ad1],16
+ nop.f 0
+ nop.i 0
}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p6) br.cond.spnt L(COSH_HUGE) ;;
+{ .mfb
+ ldfe f_A2 = [r_ad3],16
+ fma.s1 f_X2 = f_NORM_X, f_NORM_X, f0
+(p7) br.cond.spnt COSH_BY_POLY
}
+;;
-// r32 = 1
-// r34 = N-1
-// r35 = N
-// r36 = j
-// r37 = N+1
-
-// TBL can never overflow
-// cosh(x) = cosh(B+R)
-// = cosh(B) cosh(R) + sinh(B) sinh(R)
-// cosh(R) can be approximated by 1 + p_even
-// sinh(R) can be approximated by p_odd
-
+// Here if |x| >= 0.25
+COSH_BY_TBL:
// ******************************************************
-// STEP 1 (TBL and EXP)
+// STEP 1 (TBL and EXP) - Argument reduction
// ******************************************************
-// Get the following constants.
-// f9 = Inv_log2by64
-// f10 = log2by64_hi
-// f11 = log2by64_lo
+// Get the following constants.
+// Inv_log2by64
+// log2by64_hi
+// log2by64_lo
-{ .mmi
-(p0) adds r32 = 0x1,r0
-(p0) addl r34 = @ltoff(double_cosh_arg_reduction), gp
- nop.i 999
-}
-;;
// We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and
// put them in an exponent.
-// cosh_FR_spos = 2^(N-1) and cosh_FR_sneg = 2^(-N-1)
-// r39 = 0xffff + (N-1) = 0xffff +N -1
-// r40 = 0xffff - (N +1) = 0xffff -N -1
-
-{ .mlx
- ld8 r34 = [r34]
-(p0) movl r38 = 0x000000000000fffe ;;
-}
+// f_spos = 2^(N-1) and f_sneg = 2^(-N-1)
+// 0xffff + (N-1) = 0xffff +N -1
+// 0xffff - (N +1) = 0xffff -N -1
-{ .mmi
-(p0) ldfe cosh_FR_Inv_log2by64 = [r34],16 ;;
-(p0) ldfe cosh_FR_log2by64_hi = [r34],16
- nop.i 999 ;;
-}
-
-{ .mbb
-(p0) ldfe cosh_FR_log2by64_lo = [r34],16
- nop.b 999
- nop.b 999 ;;
-}
-
-// Get the A coefficients
-// f9 = A_1
-// f10 = A_2
-// f11 = A_3
-{ .mmi
- nop.m 999
-(p0) addl r34 = @ltoff(double_cosh_ab_table), gp
- nop.i 999
-}
-;;
+// Calculate M and keep it as integer and floating point.
+// M = round-to-integer(x*Inv_log2by64)
+// f_M = M = round-to-integer(ax/(log2/64))
+// Put the integer representation of M in r_M
+// and the floating point representation of M in f_M
+// Get the remaining A,B coefficients
{ .mmi
- ld8 r34 = [r34]
- nop.m 999
- nop.i 999
+ ldfe f_A3 = [r_ad3],16
+ nop.m 0
+ nop.i 0
}
;;
-
-// Calculate M and keep it as integer and floating point.
-// M = round-to-integer(x*Inv_log2by64)
-// cosh_FR_M = M = truncate(ax/(log2/64))
-// Put the significand of M in r35
-// and the floating point representation of M in cosh_FR_M
-
+// Use constant (1.100*2^(63-6)) to get rounded M into rightmost significand
+// |x| * 64 * 1/ln2 * 2^(63-6) + 1.1000 * 2^(63+(63-6))
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_M = cosh_FR_X, cosh_FR_Inv_log2by64, f0
- nop.i 999
+ nop.m 0
+ fma.s1 f_M_temp = f_ABS_X, f_INV_LN2_2TO63, f_RSHF_2TO57
+ mov r_signexp_0_5 = 0x0fffe // signexp of +0.5
}
+;;
+// Test for |x| >= overflow limit
{ .mfi
-(p0) ldfe cosh_FR_A1 = [r34],16
- nop.f 999
- nop.i 999 ;;
+ ldfe f_B1 = [r_ad3],16
+ fcmp.ge.s1 p6,p0 = f_ABS_X, f_smlst_oflow_input
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fcvt.fx.s1 cosh_FR_M_temp = cosh_FR_M
- nop.i 999 ;;
+ ldfe f_B2 = [r_ad3],16
+ nop.f 0
+ mov r_exp_32 = 0x10004
}
+;;
-{ .mfi
- nop.m 999
-(p0) fnorm.s1 cosh_FR_M = cosh_FR_M_temp
- nop.i 999 ;;
+// Subtract RSHF constant to get rounded M as a floating point value
+// M_temp * 2^-(63-6) - 1.1000 * 2^63
+{ .mfb
+ ldfe f_B3 = [r_ad3],16
+ fms.s1 f_M = f_M_temp, f_2TOM57, f_RSHF
+(p6) br.cond.spnt COSH_HUGE // Branch if result will overflow
}
+;;
{ .mfi
-(p0) getf.sig r35 = cosh_FR_M_temp
- nop.f 999
- nop.i 999 ;;
+ getf.sig r_M = f_M_temp
+ nop.f 0
+ cmp.ge p7,p6 = r_exp_x, r_exp_32 // Test if x >= 32
}
+;;
-// M is still in r35. Calculate j. j is the signed extension of the six lsb of M. It
+// Calculate j. j is the signed extension of the six lsb of M. It
// has a range of -32 thru 31.
-// r35 = M
-// r36 = j
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p0) and r36 = 0x3f, r35 ;;
-}
// Calculate R
-// f13 = f44 - f12*f10 = x - M*log2by64_hi
-// f14 = f13 - f8*f11 = R = (x - M*log2by64_hi) - M*log2by64_lo
-
-{ .mfi
- nop.m 999
-(p0) fnma.s1 cosh_FR_R_temp = cosh_FR_M, cosh_FR_log2by64_hi, cosh_FR_X
- nop.i 999
-}
+// ax - M*log2by64_hi
+// R = (ax - M*log2by64_hi) - M*log2by64_lo
{ .mfi
-(p0) ldfe cosh_FR_A2 = [r34],16
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fnma.s1 f_R_temp = f_M, f_log2by64_hi, f_ABS_X
+ and r_j = 0x3f, r_M
}
+;;
-{ .mfi
- nop.m 999
-(p0) fnma.s1 cosh_FR_R = cosh_FR_M, cosh_FR_log2by64_lo, cosh_FR_R_temp
- nop.i 999
+{ .mii
+ nop.m 0
+ shl r_jshf = r_j, 0x2 // Shift j so can sign extend it
+;;
+ sxt1 r_jshf = r_jshf
}
+;;
-// Get the B coefficients
-// f15 = B_1
-// f32 = B_2
-// f33 = B_3
-
-{ .mmi
-(p0) ldfe cosh_FR_A3 = [r34],16 ;;
-(p0) ldfe cosh_FR_B1 = [r34],16
- nop.i 999 ;;
+{ .mii
+ nop.m 0
+ shr r_j = r_jshf, 0x2 // Now j has range -32 to 31
+ nop.i 0
}
+;;
{ .mmi
-(p0) ldfe cosh_FR_B2 = [r34],16 ;;
-(p0) ldfe cosh_FR_B3 = [r34],16
- nop.i 999 ;;
+ shladd r_ad_J_hi = r_j, 4, r_ad4 // pointer to Tjhi
+ sub r_Mmj = r_M, r_j // M-j
+ sub r_mj = r0, r_j // Form -j
}
+;;
-{ .mii
- nop.m 999
-(p0) shl r34 = r36, 0x2 ;;
-(p0) sxt1 r37 = r34 ;;
+// The TBL and EXP branches are merged and predicated
+// If TBL, p6 true, 0.25 <= |x| < 32
+// If EXP, p7 true, 32 <= |x| < overflow_limit
+//
+// N = (M-j)/64
+{ .mfi
+ ldfe f_Tjhi = [r_ad_J_hi]
+ fnma.s1 f_R = f_M, f_log2by64_lo, f_R_temp
+ shr r_N = r_Mmj, 0x6 // N = (M-j)/64
}
-
-// ******************************************************
-// STEP 2 (TBL and EXP)
-// ******************************************************
-// Calculate Rsquared and Rcubed in preparation for p_even and p_odd
-// f12 = R*R*R
-// f13 = R*R
-// f14 = R <== from above
-
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_Rsq = cosh_FR_R, cosh_FR_R, f0
-(p0) shr r36 = r37, 0x2 ;;
+ shladd r_ad_mJ_hi = r_mj, 4, r_ad4 // pointer to Tmjhi
+ nop.f 0
+ shladd r_ad_mJ_lo = r_mj, 2, r_ad5 // pointer to Tmjlo
}
+;;
-// r34 = M-j = r35 - r36
-// r35 = N = (M-j)/64
-
-{ .mii
-(p0) sub r34 = r35, r36
- nop.i 999 ;;
-(p0) shr r35 = r34, 0x6 ;;
+{ .mfi
+ sub r_2mNm1 = r_signexp_0_5, r_N // signexp 2^(-N-1)
+ nop.f 0
+ shladd r_ad_J_lo = r_j, 2, r_ad5 // pointer to Tjlo
}
-
-{ .mii
-(p0) sub r40 = r38, r35
-(p0) adds r37 = 0x1, r35
-(p0) add r39 = r38, r35 ;;
+{ .mfi
+ ldfe f_Tmjhi = [r_ad_mJ_hi]
+ nop.f 0
+ add r_2Nm1 = r_signexp_0_5, r_N // signexp 2^(N-1)
}
+;;
-// Get the address of the J table, add the offset,
-// addresses are sinh_AD_mJ and sinh_AD_J, get the T value
-// f32 = T(j)_hi
-// f33 = T(j)_lo
-// f34 = T(-j)_hi
-// f35 = T(-j)_lo
-
-{ .mmi
-(p0) sub r34 = r35, r32
-(p0) addl r37 = @ltoff(double_cosh_j_table), gp
- nop.i 999
+{ .mmf
+ ldfs f_Tmjlo = [r_ad_mJ_lo]
+ setf.exp f_sneg = r_2mNm1 // Form 2^(-N-1)
+ nop.f 0
}
;;
-{ .mfi
- ld8 r37 = [r37]
-(p0) fma.s1 cosh_FR_Rcub = cosh_FR_Rsq, cosh_FR_R, f0
- nop.i 999
+{ .mmf
+ ldfs f_Tjlo = [r_ad_J_lo]
+ setf.exp f_spos = r_2Nm1 // Form 2^(N-1)
+ nop.f 0
}
+;;
// ******************************************************
-// STEP 3 Now decide if we need to branch to EXP
+// STEP 2 (TBL and EXP)
// ******************************************************
-// Put 32 in f9; p6 true if x < 32
+// Calculate Rsquared and Rcubed in preparation for p_even and p_odd
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x0000000000010004 ;;
+{ .mmf
+ nop.m 0
+ nop.m 0
+ fma.s1 f_Rsq = f_R, f_R, f0
}
+;;
-// Calculate p_even
-// f34 = B_2 + Rsq *B_3
-// f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3)
-// f36 = peven = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3))
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_peven_temp1 = cosh_FR_Rsq, cosh_FR_B3, cosh_FR_B2
- nop.i 999 ;;
-}
+// Calculate p_even
+// B_2 + Rsq *B_3
+// B_1 + Rsq * (B_2 + Rsq *B_3)
+// p_even = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3))
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_peven_temp2 = cosh_FR_Rsq, cosh_FR_peven_temp1, cosh_FR_B1
- nop.i 999
+ nop.m 0
+ fma.s1 f_peven_temp1 = f_Rsq, f_B3, f_B2
+ nop.i 0
}
-
// Calculate p_odd
-// f34 = A_2 + Rsq *A_3
-// f35 = A_1 + Rsq * (A_2 + Rsq *A_3)
-// f37 = podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3))
-
+// A_2 + Rsq *A_3
+// A_1 + Rsq * (A_2 + Rsq *A_3)
+// podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3))
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_podd_temp1 = cosh_FR_Rsq, cosh_FR_A3, cosh_FR_A2
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 f_podd_temp1 = f_Rsq, f_A3, f_A2
+ nop.i 0
}
+;;
{ .mfi
-(p0) setf.exp cosh_FR_N_temp1 = r39
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 f_Rcub = f_Rsq, f_R, f0
+ nop.i 0
}
+;;
-{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_peven = cosh_FR_Rsq, cosh_FR_peven_temp2, f0
- nop.i 999
-}
+//
+// If TBL,
+// Calculate S_hi and S_lo, and C_hi
+// SC_hi_temp = sneg * Tmjhi
+// S_hi = spos * Tjhi - SC_hi_temp
+// S_hi = spos * Tjhi - (sneg * Tmjhi)
+// C_hi = spos * Tjhi + SC_hi_temp
+// C_hi = spos * Tjhi + (sneg * Tmjhi)
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_podd_temp2 = cosh_FR_Rsq, cosh_FR_podd_temp1, cosh_FR_A1
- nop.i 999 ;;
+ nop.m 0
+(p6) fma.s1 f_SC_hi_temp = f_sneg, f_Tmjhi, f0
+ nop.i 0
}
+;;
+// If TBL,
+// C_lo_temp3 = sneg * Tmjlo
+// C_lo_temp4 = spos * Tjlo + C_lo_temp3
+// C_lo_temp4 = spos * Tjlo + (sneg * Tmjlo)
{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+(p6) fma.s1 f_C_lo_temp3 = f_sneg, f_Tmjlo, f0
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_podd = cosh_FR_podd_temp2, cosh_FR_Rcub, cosh_FR_R
- nop.i 999
-}
-
-// sinh_GR_mj contains the table offset for -j
-// sinh_GR_j contains the table offset for +j
-// p6 is true when j <= 0
-
-{ .mlx
-(p0) setf.exp cosh_FR_N_temp2 = r40
-(p0) movl r40 = 0x0000000000000020 ;;
+ nop.m 0
+ fma.s1 f_peven_temp2 = f_Rsq, f_peven_temp1, f_B1
+ nop.i 0
}
-
{ .mfi
-(p0) sub GR_mJ = r40, r36
-(p0) fmerge.se cosh_FR_spos = cosh_FR_N_temp1, f1
-(p0) adds GR_J = 0x20, r36 ;;
+ nop.m 0
+ fma.s1 f_podd_temp2 = f_Rsq, f_podd_temp1, f_A1
+ nop.i 0
}
+;;
-{ .mii
- nop.m 999
-(p0) shl GR_mJ = GR_mJ, 5 ;;
-(p0) add AD_mJ = r37, GR_mJ ;;
+// If EXP,
+// Compute 2^(N-1) * Tjhi and 2^(N-1) * Tjlo
+{ .mfi
+ nop.m 0
+(p7) fma.s1 f_Tjhi_spos = f_Tjhi, f_spos, f0
+ nop.i 0
}
-
-{ .mmi
- nop.m 999
-(p0) ldfe cosh_FR_Tmjhi = [AD_mJ],16
-(p0) shl GR_J = GR_J, 5 ;;
+{ .mfi
+ nop.m 0
+(p7) fma.s1 f_Tjlo_spos = f_Tjlo, f_spos, f0
+ nop.i 0
}
+;;
{ .mfi
-(p0) ldfs cosh_FR_Tmjlo = [AD_mJ],16
-(p0) fcmp.lt.unc.s1 p6,p7 = cosh_FR_X,f9
-(p0) add AD_J = r37, GR_J ;;
+ nop.m 0
+(p6) fma.s1 f_C_hi = f_spos, f_Tjhi, f_SC_hi_temp
+ nop.i 0
}
+;;
-{ .mmi
-(p0) ldfe cosh_FR_Tjhi = [AD_J],16 ;;
-(p0) ldfs cosh_FR_Tjlo = [AD_J],16
- nop.i 999 ;;
+{ .mfi
+ nop.m 0
+(p6) fms.s1 f_S_hi = f_spos, f_Tjhi, f_SC_hi_temp
+ nop.i 0
}
-
-{ .mfb
- nop.m 999
-(p0) fmerge.se cosh_FR_sneg = cosh_FR_N_temp2, f1
-(p7) br.cond.spnt L(COSH_BY_EXP) ;;
+{ .mfi
+ nop.m 0
+(p6) fma.s1 f_C_lo_temp4 = f_spos, f_Tjlo, f_C_lo_temp3
+ nop.i 0
}
-
-// ******************************************************
-// If NOT branch to EXP
-// ******************************************************
-// Calculate C_hi
-// ******************************************************
-// cosh_FR_C_hi_temp = cosh_FR_sneg * cosh_FR_Tmjhi
-// cosh_FR_C_hi = cosh_FR_spos * cosh_FR_Tjhi + (cosh_FR_sneg * cosh_FR_Tmjhi)
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_C_hi_temp = cosh_FR_sneg, cosh_FR_Tmjhi, f0
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 f_peven = f_Rsq, f_peven_temp2, f0
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_C_hi = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_C_hi_temp
- nop.i 999
+ nop.m 0
+ fma.s1 f_podd = f_podd_temp2, f_Rcub, f_R
+ nop.i 0
}
+;;
-// ******************************************************
-// Calculate S_hi
-// ******************************************************
-// cosh_FR_S_hi_temp1 = cosh_FR_sneg * cosh_FR_Tmjhi
-// cosh_FR_S_hi = cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_C_hi_temp1
+// If TBL,
+// C_lo_temp1 = spos * Tjhi - C_hi
+// C_lo_temp2 = sneg * Tmjhi + C_lo_temp1
+// C_lo_temp2 = sneg * Tmjhi + (spos * Tjhi - C_hi)
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_S_hi_temp1 = cosh_FR_sneg, cosh_FR_Tmjhi, f0
- nop.i 999 ;;
+ nop.m 0
+(p6) fms.s1 f_C_lo_temp1 = f_spos, f_Tjhi, f_C_hi
+ nop.i 0
}
-
-// ******************************************************
-// Calculate C_lo
-// ******************************************************
-// cosh_FR_C_lo_temp1 = cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_C_hi
-// cosh_FR_C_lo_temp2 = cosh_FR_sneg * cosh_FR_Tmjlo + (cosh_FR_spos * cosh_FR_Tjhi - cosh_FR_C_hi)
-// cosh_FR_C_lo_temp1 = cosh_FR_sneg * cosh_FR_Tmjlo
-// cosh_FR_C_lo_temp3 = cosh_FR_spos * cosh_FR_Tjlo + (cosh_FR_sneg * cosh_FR_Tmjlo)
-// cosh_FR_C_lo = cosh_FR_C_lo_temp3 + cosh_FR_C_lo_temp2
+;;
{ .mfi
- nop.m 999
-(p0) fms.s1 cosh_FR_C_lo_temp1 = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_C_hi
- nop.i 999
+ nop.m 0
+(p6) fma.s1 f_C_lo_temp2 = f_sneg, f_Tmjhi, f_C_lo_temp1
+ nop.i 0
}
+;;
+// If EXP,
+// Y_hi = 2^(N-1) * Tjhi
+// Y_lo = 2^(N-1) * Tjhi * (p_odd + p_even) + 2^(N-1) * Tjlo
{ .mfi
- nop.m 999
-(p0) fms.s1 cosh_FR_S_hi = cosh_FR_spos, cosh_FR_Tjhi, cosh_FR_S_hi_temp1
- nop.i 999 ;;
+ nop.m 0
+(p7) fma.s1 f_Y_lo_temp = f_peven, f1, f_podd
+ nop.i 0
}
+;;
+// If TBL,
+// C_lo = C_lo_temp4 + C_lo_temp2
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_C_lo_temp2 = cosh_FR_sneg, cosh_FR_Tmjhi, cosh_FR_C_lo_temp1
- nop.i 999
+ nop.m 0
+(p6) fma.s1 f_C_lo = f_C_lo_temp4, f1, f_C_lo_temp2
+ nop.i 0
}
+;;
+// If TBL,
+// Y_hi = C_hi
+// Y_lo = S_hi*p_odd + (C_hi*p_even + C_lo)
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_C_lo_temp1 = cosh_FR_sneg, cosh_FR_Tmjlo, f0
- nop.i 999 ;;
+ nop.m 0
+(p6) fma.s1 f_Y_lo_temp = f_C_hi, f_peven, f_C_lo
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_C_lo_temp3 = cosh_FR_spos, cosh_FR_Tjlo, cosh_FR_C_lo_temp1
- nop.i 999 ;;
+ nop.m 0
+(p7) fma.s1 f_Y_lo = f_Tjhi_spos, f_Y_lo_temp, f_Tjlo_spos
+ nop.i 0
}
+;;
+// Dummy multiply to generate inexact
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_C_lo = cosh_FR_C_lo_temp3, f1, cosh_FR_C_lo_temp2
- nop.i 999 ;;
+ nop.m 0
+ fmpy.s0 f_tmp = f_B2, f_B2
+ nop.i 0
}
-
-// ******************************************************
-// cosh_FR_Y_lo_temp = cosh_FR_C_hi * cosh_FR_peven + cosh_FR_C_lo
-// cosh_FR_Y_lo = cosh_FR_S_hi * cosh_FR_podd + cosh_FR_Y_lo_temp
-// cosh_FR_COSH = Y_hi + Y_lo
-
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_Y_lo_temp = cosh_FR_C_hi, cosh_FR_peven, cosh_FR_C_lo
- nop.i 999 ;;
+ nop.m 0
+(p6) fma.s1 f_Y_lo = f_S_hi, f_podd, f_Y_lo_temp
+ nop.i 0
}
+;;
+// f8 = answer = Y_hi + Y_lo
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_Y_lo = cosh_FR_S_hi, cosh_FR_podd, cosh_FR_Y_lo_temp
- nop.i 999 ;;
+ nop.m 0
+(p7) fma.s0 f8 = f_Y_lo, f1, f_Tjhi_spos
+ nop.i 0
}
+;;
+// f8 = answer = Y_hi + Y_lo
{ .mfb
- nop.m 999
-(p0) fma.s0 f8 = cosh_FR_C_hi, f1, cosh_FR_Y_lo
-(p0) br.ret.sptk b0 ;;
+ nop.m 0
+(p6) fma.s0 f8 = f_Y_lo, f1, f_C_hi
+ br.ret.sptk b0 // Exit for COSH_BY_TBL and COSH_BY_EXP
}
+;;
-L(COSH_BY_EXP):
-
-// When p7 is true, we know that an overflow is not going to happen
-// When p7 is false, we must check for possible overflow
-// p7 is the over_SAFE flag
-// f44 = Scale * (Y_hi + Y_lo)
-// = cosh_FR_spos * (cosh_FR_Tjhi + cosh_FR_Y_lo)
-{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_Y_lo_temp = cosh_FR_peven, f1, cosh_FR_podd
- nop.i 999
+// Here if 0 < |x| < 0.25
+COSH_BY_POLY:
+{ .mmf
+ ldfe f_P6 = [r_ad2e],16
+ ldfe f_P5 = [r_ad2o],16
+ nop.f 0
}
+;;
-// Now we are in EXP. This is the only path where an overflow is possible
-// but not for certain. So this is the only path where over_SAFE has any use.
-// r34 still has N-1
-// There is a danger of double-extended overflow if N-1 > 0x3ffe = 16382
-// There is a danger of double overflow if N-1 > 0x3fe = 1022
-// There is a danger of single overflow if N-1 > 0x7e = 126
+{ .mmi
+ ldfe f_P4 = [r_ad2e],16
+ ldfe f_P3 = [r_ad2o],16
+ nop.i 0
+}
+;;
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x0000000000003ffe ;;
+{ .mmi
+ ldfe f_P2 = [r_ad2e],16
+ ldfe f_P1 = [r_ad2o],16
+ nop.i 0
}
+;;
{ .mfi
-(p0) cmp.gt.unc p0,p7 = r34, r32
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 f_X3 = f_NORM_X, f_X2, f0
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_Y_lo = cosh_FR_Tjhi, cosh_FR_Y_lo_temp, cosh_FR_Tjlo
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 f_X4 = f_X2, f_X2, f0
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_COSH_temp = cosh_FR_Y_lo, f1, cosh_FR_Tjhi
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 f_poly65 = f_X2, f_P6, f_P5
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.s0 f44 = cosh_FR_spos, cosh_FR_COSH_temp, f0
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 f_poly43 = f_X2, f_P4, f_P3
+ nop.i 0
}
+;;
-// Dummy multiply to generate inexact
{ .mfi
- nop.m 999
-(p7) fmpy.s0 cosh_FR_tmp = cosh_FR_all_ones, cosh_FR_all_ones
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 f_poly21 = f_X2, f_P2, f_P1
+ nop.i 0
}
+;;
-// If over_SAFE is set, return
-{ .mfb
- nop.m 999
-(p7) fmerge.s f8 = f44,f44
-(p7) br.ret.sptk b0 ;;
+{ .mfi
+ nop.m 0
+ fma.s1 f_poly6543 = f_X4, f_poly65, f_poly43
+ nop.i 0
}
-
-// Else see if we overflowed
-// S0 user supplied status
-// S2 user supplied status + WRE + TD (Overflows)
-// If WRE is set then an overflow will not occur in EXP.
-// The input value that would cause a register (WRE) value to overflow is about 2^15
-// and this input would go into the HUGE path.
-// Answer with WRE is in f43.
+;;
{ .mfi
- nop.m 999
-(p0) fsetc.s2 0x7F,0x42
- nop.i 999;;
+ nop.m 0
+ fma.s1 f_poly6to1 = f_X4, f_poly6543, f_poly21
+ nop.i 0
}
+;;
+// Dummy multiply to generate inexact
{ .mfi
- nop.m 999
-(p0) fma.s2 f43 = cosh_FR_spos, cosh_FR_COSH_temp, f0
- nop.i 999 ;;
+ nop.m 0
+ fmpy.s0 f_tmp = f_P6, f_P6
+ nop.i 0
}
-
-// 103FF => 103FF -FFFF = 400(true)
-// 400 + 3FF = 7FF, which is 1 more than the exponent of the largest
-// double (7FE). So 0 103FF 8000000000000000 is one ulp more than
-// largest double in register bias
-
-// 13FFF => 13FFF -FFFF = 4000(true)
-
-// Now set p8 if the answer with WRE is greater than or equal this value
-// Also set p9 if the answer with WRE is less than or equal to negative this value
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x0000000000013fff ;;
+{ .mfb
+ nop.m 0
+ fma.s0 f8 = f_poly6to1, f_X2, f1
+ br.ret.sptk b0 // Exit COSH_BY_POLY
}
+;;
-{ .mmf
- nop.m 999
-(p0) setf.exp f41 = r32
-(p0) fsetc.s2 0x7F,0x40 ;;
-}
-{ .mfi
- nop.m 999
-(p0) fcmp.ge.unc.s1 p8, p0 = f43, f41
- nop.i 999
+// Here if x denorm or unorm
+COSH_DENORM:
+// Determine if x really a denorm and not a unorm
+{ .mmf
+ getf.exp r_signexp_x = f_NORM_X
+ mov r_exp_denorm = 0x0c001 // Real denorms have exp < this
+ fmerge.s f_ABS_X = f0, f_NORM_X
}
+;;
{ .mfi
- nop.m 999
-(p0) fmerge.ns f42 = f41, f41
- nop.i 999 ;;
+ nop.m 0
+ fcmp.eq.s0 p10,p0 = f8, f0 // Set denorm flag
+ nop.i 0
}
+;;
-// The error tag for overflow is 63
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p8) mov GR_Parameter_TAG = 63 ;;
+// Set p8 if really a denorm
+{ .mmi
+ and r_exp_x = r_exp_mask, r_signexp_x
+;;
+ cmp.lt p8,p9 = r_exp_x, r_exp_denorm
+ nop.i 0
}
+;;
+// Identify denormal operands.
{ .mfb
- nop.m 999
-(p0) fcmp.le.unc.s1 p9, p0 = f43, f42
-(p8) br.cond.spnt __libm_error_region ;;
-}
-
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p9) mov GR_Parameter_TAG = 63
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p9) br.cond.spnt __libm_error_region ;;
-}
-
-// Dummy multiply to generate inexact
-{ .mfi
- nop.m 999
-(p0) fmpy.s0 cosh_FR_tmp = cosh_FR_all_ones, cosh_FR_all_ones
- nop.i 999 ;;
+ nop.m 0
+(p8) fma.s0 f8 = f8,f8,f1 // If x denorm, result=1+x^2
+(p9) br.cond.sptk COSH_COMMON // Return to main path if x unorm
}
+;;
{ .mfb
- nop.m 999
-(p0) fmerge.s f8 = f44,f44
-(p0) br.ret.sptk b0 ;;
+ nop.m 0
+ nop.f 0
+ br.ret.sptk b0 // Exit if x denorm
}
+;;
-// for COSH_HUGE, put 24000 in exponent; take sign from input; add 1
-// SAFE: SAFE is always 0 for HUGE
-
-L(COSH_HUGE):
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x0000000000015dbf ;;
+// Here if |x| >= overflow limit
+COSH_HUGE:
+// for COSH_HUGE, put 24000 in exponent; take sign from input
+{ .mmi
+ mov r_exp_huge = 0x15dbf
+;;
+ setf.exp f_huge = r_exp_huge
+ nop.i 0
}
+;;
{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
+ alloc r32 = ar.pfs,0,5,4,0
+ fma.s1 f_signed_hi_lo = f_huge, f1, f1
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 cosh_FR_hi_lo = f1, f9, f1
- nop.i 999 ;;
+ nop.m 0
+ fma.s0 f_pre_result = f_signed_hi_lo, f_huge, f0
+ mov GR_Parameter_TAG = 63
}
+;;
-{ .mfi
- nop.m 999
-(p0) fma.s0 f44 = f9, cosh_FR_hi_lo, f0
-(p0) mov GR_Parameter_TAG = 63
-}
-.endp coshl
-ASM_SIZE_DIRECTIVE(coshl)
+GLOBAL_IEEE754_END(coshl)
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
+
{ .mfi
- add GR_Parameter_Y=-32,sp // Parameter 2 value
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
+ add sp=-64,sp // Create new stack
nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ mov GR_SAVE_GP=gp // Save gp
};;
+
{ .mmi
- stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
- add GR_Parameter_X = 16,sp // Parameter 1 address
+ stfe [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0=b0 // Save b0
};;
+
.body
{ .mib
- stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y
- nop.b 0 // Parameter 3 address
+ stfe [GR_Parameter_X] = f8 // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
}
{ .mib
- stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
+ stfe [GR_Parameter_Y] = f_pre_result // STORE Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
+
{ .mmi
- nop.m 0
- nop.m 0
add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
+
{ .mmi
- ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
+ ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
- add sp = 64,sp // Restore stack pointer
- mov b0 = GR_SAVE_B0 // Restore return address
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
};;
+
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
- mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
- br.ret.sptk b0 // Return
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
+
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_exp.S b/sysdeps/ia64/fpu/e_exp.S
index db02336ecf..5ae8afeb99 100644
--- a/sysdeps/ia64/fpu/e_exp.S
+++ b/sysdeps/ia64/fpu/e_exp.S
@@ -1,10 +1,10 @@
.file "exp.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2002, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,26 +20,26 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
+// 2/02/00 Initial version
// 3/07/00 exp(inf) = inf but now does NOT call error support
// exp(-inf) = 0 but now does NOT call error support
// 4/04/00 Unwind support added
@@ -48,6 +48,10 @@
// 11/30/00 Reworked to shorten main path, widen main path to include all
// args in normal range, and add quick exit for 0, nan, inf.
// 12/05/00 Loaded constants earlier with setf to save 2 cycles.
+// 02/05/02 Corrected uninitialized predicate in POSSIBLE_UNDERFLOW path
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 09/07/02 Force inexact flag
+// 11/15/02 Split underflow path into zero/nonzero; eliminated fma in main path
// API
//==============================================================
@@ -67,187 +71,167 @@
// Construct 2^M
// Get 2^(index_1/128) from table_1;
// Get 2^(index_2/8) from table_2;
-// Calculate exp(r) by series
+// Calculate exp(r) by 5th order polynomial
// r = x - n (log2/128)_high
// delta = - n (log2/128)_low
// Calculate exp(delta) as 1 + delta
-// Special values
+// Special values
//==============================================================
// exp(+0) = 1.0
// exp(-0) = 1.0
-// exp(+qnan) = +qnan
-// exp(-qnan) = -qnan
-// exp(+snan) = +qnan
-// exp(-snan) = -qnan
+// exp(+qnan) = +qnan
+// exp(-qnan) = -qnan
+// exp(+snan) = +qnan
+// exp(-snan) = -qnan
-// exp(-inf) = +0
+// exp(-inf) = +0
// exp(+inf) = +inf
-// Overfow and Underfow
+// Overflow and Underflow
//=======================
-// exp(-x) = smallest double normal when
-// x = -708.396 = c086232bdd7abcd2
-
// exp(x) = largest double normal when
-// x = 709.7827 = 40862e42fefa39ef
+// x = 709.7827 = 0x40862e42fefa39ef
+
+// exp(x) = smallest double normal when
+// x = -708.396 = 0xc086232bdd7abcd2
+// exp(x) = largest round-to-nearest double zero when
+// x = -745.1332 = 0xc0874910d52d3052
// Registers used
//==============================================================
-// Floating Point registers used:
-// f8, input
-// f9 -> f15, f32 -> f60
+// Floating Point registers used:
+// f8, input, output
+// f6 -> f15, f32 -> f49
-// General registers used:
-// r32 -> r60
+// General registers used:
+// r14 -> r40
// Predicate registers used:
// p6 -> p15
-#include "libm_support.h"
-
// Assembly macros
//==============================================================
-exp_GR_rshf = r33
-EXP_AD_TB1 = r34
-EXP_AD_TB2 = r35
-EXP_AD_P = r36
-
-exp_GR_N = r37
-exp_GR_index_1 = r38
-exp_GR_index_2_16 = r39
-
-exp_GR_biased_M = r40
-exp_GR_index_1_16 = r41
-EXP_AD_T1 = r42
-EXP_AD_T2 = r43
-exp_GR_sig_inv_ln2 = r44
-
-exp_GR_17ones = r45
-exp_GR_one = r46
-exp_TB1_size = r47
-exp_TB2_size = r48
-exp_GR_rshf_2to56 = r49
-
-exp_GR_gt_ln = r50
-exp_GR_exp_2tom56 = r51
-
-exp_GR_17ones_m1 = r52
-
-GR_SAVE_B0 = r53
-GR_SAVE_PFS = r54
-GR_SAVE_GP = r55
-GR_SAVE_SP = r56
-
-GR_Parameter_X = r57
-GR_Parameter_Y = r58
-GR_Parameter_RESULT = r59
-GR_Parameter_TAG = r60
-
-
-FR_X = f10
-FR_Y = f1
-FR_RESULT = f8
-
-EXP_RSHF_2TO56 = f6
-EXP_INV_LN2_2TO63 = f7
-EXP_W_2TO56_RSH = f9
-EXP_2TOM56 = f11
-exp_P4 = f12
-exp_P3 = f13
-exp_P2 = f14
-exp_P1 = f15
-
-exp_ln2_by_128_hi = f33
-exp_ln2_by_128_lo = f34
-
-EXP_RSHF = f35
-EXP_Nfloat = f36
-exp_W = f37
-exp_r = f38
-exp_f = f39
-
-exp_rsq = f40
-exp_rcube = f41
-
-EXP_2M = f42
-exp_S1 = f43
-exp_T1 = f44
-
-EXP_MIN_DBL_OFLOW_ARG = f45
-EXP_MAX_DBL_ZERO_ARG = f46
-EXP_MAX_DBL_NORM_ARG = f47
-EXP_MAX_DBL_UFLOW_ARG = f48
-EXP_MIN_DBL_NORM_ARG = f49
-exp_rP4pP3 = f50
-exp_P_lo = f51
-exp_P_hi = f52
-exp_P = f53
-exp_S = f54
-
-EXP_NORM_f8 = f56
-
-exp_wre_urm_f8 = f57
-exp_ftz_urm_f8 = f57
-
-exp_gt_pln = f58
-
-exp_S2 = f59
-exp_T2 = f60
+rRshf = r14
+rAD_TB1 = r15
+rAD_T1 = r15
+rAD_TB2 = r16
+rAD_T2 = r16
+rAD_P = r17
+rN = r18
+rIndex_1 = r19
+rIndex_2_16 = r20
+rM = r21
+rBiased_M = r21
+rIndex_1_16 = r21
+rSig_inv_ln2 = r22
+rExp_bias = r23
+rExp_mask = r24
+rTmp = r25
+rRshf_2to56 = r26
+rGt_ln = r27
+rExp_2tom56 = r28
+
+
+GR_SAVE_B0 = r33
+GR_SAVE_PFS = r34
+GR_SAVE_GP = r35
+GR_SAVE_SP = r36
+
+GR_Parameter_X = r37
+GR_Parameter_Y = r38
+GR_Parameter_RESULT = r39
+GR_Parameter_TAG = r40
+
+
+FR_X = f10
+FR_Y = f1
+FR_RESULT = f8
+
+fRSHF_2TO56 = f6
+fINV_LN2_2TO63 = f7
+fW_2TO56_RSH = f9
+f2TOM56 = f11
+fP5 = f12
+fP54 = f12
+fP5432 = f12
+fP4 = f13
+fP3 = f14
+fP32 = f14
+fP2 = f15
+fP = f15
+
+fLn2_by_128_hi = f33
+fLn2_by_128_lo = f34
+
+fRSHF = f35
+fNfloat = f36
+fNormX = f37
+fR = f38
+fF = f39
+
+fRsq = f40
+f2M = f41
+fS1 = f42
+fT1 = f42
+fS2 = f43
+fT2 = f43
+fS = f43
+fWre_urm_f8 = f44
+fFtz_urm_f8 = f44
+
+fMIN_DBL_OFLOW_ARG = f45
+fMAX_DBL_ZERO_ARG = f46
+fMAX_DBL_NORM_ARG = f47
+fMIN_DBL_NORM_ARG = f48
+fGt_pln = f49
+fTmp = f49
// Data tables
//==============================================================
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
-
+RODATA
.align 16
// ************* DO NOT CHANGE ORDER OF THESE TABLES ********************
// double-extended 1/ln(2)
// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88
-// 3fff b8aa 3b29 5c17 f0bc
+// 3fff b8aa 3b29 5c17 f0bc
// For speed the significand will be loaded directly with a movl and setf.sig
// and the exponent will be bias+63 instead of bias+0. Thus subsequent
// computations need to scale appropriately.
-// The constant 128/ln(2) is needed for the computation of w. This is also
+// The constant 128/ln(2) is needed for the computation of w. This is also
// obtained by scaling the computations.
//
-// Two shifting constants are loaded directly with movl and setf.d.
-// 1. EXP_RSHF_2TO56 = 1.1000..00 * 2^(63-7)
+// Two shifting constants are loaded directly with movl and setf.d.
+// 1. fRSHF_2TO56 = 1.1000..00 * 2^(63-7)
// This constant is added to x*1/ln2 to shift the integer part of
// x*128/ln2 into the rightmost bits of the significand.
-// The result of this fma is EXP_W_2TO56_RSH.
-// 2. EXP_RSHF = 1.1000..00 * 2^(63)
-// This constant is subtracted from EXP_W_2TO56_RSH * 2^(-56) to give
+// The result of this fma is fW_2TO56_RSH.
+// 2. fRSHF = 1.1000..00 * 2^(63)
+// This constant is subtracted from fW_2TO56_RSH * 2^(-56) to give
// the integer part of w, n, as a floating-point number.
-// The result of this fms is EXP_Nfloat.
+// The result of this fms is fNfloat.
-exp_table_1:
-ASM_TYPE_DIRECTIVE(exp_table_1,@object)
-data8 0x40862e42fefa39f0 // smallest dbl overflow arg
-data8 0xc0874c0000000000 // approx largest arg for zero result
-data8 0x40862e42fefa39ef // largest dbl arg to give normal dbl result
-data8 0xc086232bdd7abcd3 // largest dbl underflow arg
-data8 0xc086232bdd7abcd2 // smallest dbl arg to give normal dbl result
-data8 0x0 // pad
+LOCAL_OBJECT_START(exp_table_1)
+data8 0x40862e42fefa39f0 // smallest dbl overflow arg, +709.7827
+data8 0xc0874910d52d3052 // largest arg for rnd-to-nearest 0 result, -745.133
+data8 0x40862e42fefa39ef // largest dbl arg to give normal dbl result, +709.7827
+data8 0xc086232bdd7abcd2 // smallest dbl arg to give normal dbl result, -708.396
data8 0xb17217f7d1cf79ab , 0x00003ff7 // ln2/128 hi
data8 0xc9e3b39803f2f6af , 0x00003fb7 // ln2/128 lo
-
+//
// Table 1 is 2^(index_1/128) where
// index_1 goes from 0 to 15
-
+//
data8 0x8000000000000000 , 0x00003FFF
data8 0x80B1ED4FD999AB6C , 0x00003FFF
data8 0x8164D1F3BC030773 , 0x00003FFF
@@ -264,12 +248,11 @@ data8 0x88980E8092DA8527 , 0x00003FFF
data8 0x8955EE03618E5FDD , 0x00003FFF
data8 0x8A14D575496EFD9A , 0x00003FFF
data8 0x8AD4C6452C728924 , 0x00003FFF
-ASM_SIZE_DIRECTIVE(exp_table_1)
+LOCAL_OBJECT_END(exp_table_1)
// Table 2 is 2^(index_1/8) where
// index_2 goes from 0 to 7
-exp_table_2:
-ASM_TYPE_DIRECTIVE(exp_table_2,@object)
+LOCAL_OBJECT_START(exp_table_2)
data8 0x8000000000000000 , 0x00003FFF
data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF
data8 0x9837F0518DB8A96F , 0x00003FFF
@@ -278,413 +261,356 @@ data8 0xB504F333F9DE6484 , 0x00003FFF
data8 0xC5672A115506DADD , 0x00003FFF
data8 0xD744FCCAD69D6AF4 , 0x00003FFF
data8 0xEAC0C6E7DD24392F , 0x00003FFF
-ASM_SIZE_DIRECTIVE (exp_table_2)
-
+LOCAL_OBJECT_END(exp_table_2)
-exp_p_table:
-ASM_TYPE_DIRECTIVE(exp_p_table,@object)
-data8 0x3f8111116da21757 //P_4
-data8 0x3fa55555d787761c //P_3
-data8 0x3fc5555555555414 //P_2
-data8 0x3fdffffffffffd6a //P_1
-ASM_SIZE_DIRECTIVE(exp_p_table)
+LOCAL_OBJECT_START(exp_p_table)
+data8 0x3f8111116da21757 //P5
+data8 0x3fa55555d787761c //P4
+data8 0x3fc5555555555414 //P3
+data8 0x3fdffffffffffd6a //P2
+LOCAL_OBJECT_END(exp_p_table)
-.align 32
-.global exp#
.section .text
-.proc exp#
-.align 32
-exp:
-#ifdef _LIBC
-.global __ieee754_exp#
-__ieee754_exp:
-#endif
+GLOBAL_IEEE754_ENTRY(exp)
{ .mlx
- alloc r32=ar.pfs,1,24,4,0
- movl exp_GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
+ nop.m 0
+ movl rSig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
}
{ .mlx
- addl EXP_AD_TB1 = @ltoff(exp_table_1), gp
- movl exp_GR_rshf_2to56 = 0x4768000000000000 ;; // 1.10000 2^(63+56)
+ addl rAD_TB1 = @ltoff(exp_table_1), gp
+ movl rRshf_2to56 = 0x4768000000000000 // 1.10000 2^(63+56)
}
;;
-// We do this fnorm right at the beginning to take any enabled
-// faults and to normalize any input unnormals so that SWA is not taken.
{ .mfi
- ld8 EXP_AD_TB1 = [EXP_AD_TB1]
- fclass.m p8,p0 = f8,0x07 // Test for x=0
- mov exp_GR_17ones = 0x1FFFF
+ ld8 rAD_TB1 = [rAD_TB1]
+ fclass.m p8,p0 = f8,0x07 // Test for x=0
+ mov rExp_mask = 0x1ffff
}
{ .mfi
- mov exp_TB1_size = 0x100
- fnorm EXP_NORM_f8 = f8
- mov exp_GR_exp_2tom56 = 0xffff-56
+ mov rExp_bias = 0xffff
+ fnorm.s1 fNormX = f8
+ mov rExp_2tom56 = 0xffff-56
}
;;
// Form two constants we need
-// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128
+// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128
// 1.1000..000 * 2^(63+63-7) to right shift int(w) into the significand
-{ .mmf
- setf.sig EXP_INV_LN2_2TO63 = exp_GR_sig_inv_ln2 // form 1/ln2 * 2^63
- setf.d EXP_RSHF_2TO56 = exp_GR_rshf_2to56 // Form const 1.100 * 2^(63+56)
- fclass.m p9,p0 = f8,0x22 // Test for x=-inf
+{ .mfi
+ setf.sig fINV_LN2_2TO63 = rSig_inv_ln2 // form 1/ln2 * 2^63
+ fclass.m p9,p0 = f8,0x22 // Test for x=-inf
+ nop.i 0
+}
+{ .mlx
+ setf.d fRSHF_2TO56 = rRshf_2to56 // Form const 1.100 * 2^(63+56)
+ movl rRshf = 0x43e8000000000000 // 1.10000 2^63 for right shift
}
;;
-{ .mlx
- setf.exp EXP_2TOM56 = exp_GR_exp_2tom56 // form 2^-56 for scaling Nfloat
- movl exp_GR_rshf = 0x43e8000000000000 // 1.10000 2^63 for right shift
+{ .mfi
+ ldfpd fMIN_DBL_OFLOW_ARG, fMAX_DBL_ZERO_ARG = [rAD_TB1],16
+ fclass.m p10,p0 = f8,0x1e1 // Test for x=+inf, nan, NaT
+ nop.i 0
}
{ .mfb
- mov exp_TB2_size = 0x80
-(p8) fma.d f8 = f1,f1,f0 // quick exit for x=0
-(p8) br.ret.spnt b0
-;;
+ setf.exp f2TOM56 = rExp_2tom56 // form 2^-56 for scaling Nfloat
+(p9) fma.d.s0 f8 = f0,f0,f0 // quick exit for x=-inf
+(p9) br.ret.spnt b0
}
+;;
{ .mfi
- ldfpd EXP_MIN_DBL_OFLOW_ARG, EXP_MAX_DBL_ZERO_ARG = [EXP_AD_TB1],16
- fclass.m p10,p0 = f8,0x21 // Test for x=+inf
- nop.i 999
+ ldfpd fMAX_DBL_NORM_ARG, fMIN_DBL_NORM_ARG = [rAD_TB1],16
+ nop.f 0
+ nop.i 0
}
{ .mfb
- nop.m 999
-(p9) fma.d f8 = f0,f0,f0 // quick exit for x=-inf
-(p9) br.ret.spnt b0
-;;
+ setf.d fRSHF = rRshf // Form right shift const 1.100 * 2^63
+(p8) fma.d.s0 f8 = f1,f1,f0 // quick exit for x=0
+(p8) br.ret.spnt b0
}
-
-{ .mmf
- ldfpd EXP_MAX_DBL_NORM_ARG, EXP_MAX_DBL_UFLOW_ARG = [EXP_AD_TB1],16
- setf.d EXP_RSHF = exp_GR_rshf // Form right shift const 1.100 * 2^63
- fclass.m p11,p0 = f8,0xc3 // Test for x=nan
;;
-}
{ .mfb
- ldfd EXP_MIN_DBL_NORM_ARG = [EXP_AD_TB1],16
- nop.f 999
-(p10) br.ret.spnt b0 // quick exit for x=+inf
-;;
+ ldfe fLn2_by_128_hi = [rAD_TB1],16
+(p10) fma.d.s0 f8 = f8,f8,f0 // Result if x=+inf, nan, NaT
+(p10) br.ret.spnt b0 // quick exit for x=+inf, nan, NaT
}
+;;
{ .mfi
- ldfe exp_ln2_by_128_hi = [EXP_AD_TB1],16
- nop.f 999
- nop.i 999
-;;
+ ldfe fLn2_by_128_lo = [rAD_TB1],16
+ fcmp.eq.s0 p6,p0 = f8, f0 // Dummy to set D
+ nop.i 0
}
-
-
-{ .mfb
- ldfe exp_ln2_by_128_lo = [EXP_AD_TB1],16
-(p11) fmerge.s f8 = EXP_NORM_f8, EXP_NORM_f8
-(p11) br.ret.spnt b0 // quick exit for x=nan
;;
-}
-// After that last load, EXP_AD_TB1 points to the beginning of table 1
+// After that last load, rAD_TB1 points to the beginning of table 1
// W = X * Inv_log2_by_128
// By adding 1.10...0*2^63 we shift and get round_int(W) in significand.
// We actually add 1.10...0*2^56 to X * Inv_log2 to do the same thing.
{ .mfi
- nop.m 999
- fma.s1 EXP_W_2TO56_RSH = EXP_NORM_f8, EXP_INV_LN2_2TO63, EXP_RSHF_2TO56
- nop.i 999
-;;
+ nop.m 0
+ fma.s1 fW_2TO56_RSH = fNormX, fINV_LN2_2TO63, fRSHF_2TO56
+ nop.i 0
}
-
+;;
// Divide arguments into the following categories:
-// Certain Underflow/zero p11 - -inf < x <= MAX_DBL_ZERO_ARG
-// Certain Underflow p12 - MAX_DBL_ZERO_ARG < x <= MAX_DBL_UFLOW_ARG
-// Possible Underflow p13 - MAX_DBL_UFLOW_ARG < x < MIN_DBL_NORM_ARG
+// Certain Underflow p11 - -inf < x <= MAX_DBL_ZERO_ARG
+// Possible Underflow p13 - MAX_DBL_ZERO_ARG < x < MIN_DBL_NORM_ARG
// Certain Safe - MIN_DBL_NORM_ARG <= x <= MAX_DBL_NORM_ARG
// Possible Overflow p14 - MAX_DBL_NORM_ARG < x < MIN_DBL_OFLOW_ARG
// Certain Overflow p15 - MIN_DBL_OFLOW_ARG <= x < +inf
//
-// If the input is really a double arg, then there will never be "Possible
-// Underflow" or "Possible Overflow" arguments.
+// If the input is really a double arg, then there will never be
+// "Possible Overflow" arguments.
//
{ .mfi
- add EXP_AD_TB2 = exp_TB1_size, EXP_AD_TB1
- fcmp.ge.s1 p15,p14 = EXP_NORM_f8,EXP_MIN_DBL_OFLOW_ARG
- nop.i 999
-;;
+ add rAD_TB2 = 0x100, rAD_TB1
+ fcmp.ge.s1 p15,p0 = fNormX,fMIN_DBL_OFLOW_ARG
+ nop.i 0
}
+;;
{ .mfi
- add EXP_AD_P = exp_TB2_size, EXP_AD_TB2
- fcmp.le.s1 p11,p12 = EXP_NORM_f8,EXP_MAX_DBL_ZERO_ARG
- nop.i 999
-;;
+ add rAD_P = 0x80, rAD_TB2
+ fcmp.le.s1 p11,p0 = fNormX,fMAX_DBL_ZERO_ARG
+ nop.i 0
}
+;;
{ .mfb
- ldfpd exp_P4, exp_P3 = [EXP_AD_P] ,16
-(p14) fcmp.gt.unc.s1 p14,p0 = EXP_NORM_f8,EXP_MAX_DBL_NORM_ARG
-(p15) br.cond.spnt L(EXP_CERTAIN_OVERFLOW)
-;;
+ ldfpd fP5, fP4 = [rAD_P] ,16
+ fcmp.gt.s1 p14,p0 = fNormX,fMAX_DBL_NORM_ARG
+(p15) br.cond.spnt EXP_CERTAIN_OVERFLOW
}
+;;
-
-// Nfloat = round_int(W)
-// The signficand of EXP_W_2TO56_RSH contains the rounded integer part of W,
+// Nfloat = round_int(W)
+// The significand of fW_2TO56_RSH contains the rounded integer part of W,
// as a twos complement number in the lower bits (that is, it may be negative).
-// That twos complement number (called N) is put into exp_GR_N.
+// That twos complement number (called N) is put into rN.
-// Since EXP_W_2TO56_RSH is scaled by 2^56, it must be multiplied by 2^-56
-// before the shift constant 1.10000 * 2^63 is subtracted to yield EXP_Nfloat.
-// Thus, EXP_Nfloat contains the floating point version of N
+// Since fW_2TO56_RSH is scaled by 2^56, it must be multiplied by 2^-56
+// before the shift constant 1.10000 * 2^63 is subtracted to yield fNfloat.
+// Thus, fNfloat contains the floating point version of N
-
-{ .mfi
- nop.m 999
-(p12) fcmp.le.unc p12,p0 = EXP_NORM_f8,EXP_MAX_DBL_UFLOW_ARG
- nop.i 999
-}
{ .mfb
- ldfpd exp_P2, exp_P1 = [EXP_AD_P]
- fms.s1 EXP_Nfloat = EXP_W_2TO56_RSH, EXP_2TOM56, EXP_RSHF
-(p11) br.cond.spnt L(EXP_CERTAIN_UNDERFLOW_ZERO)
-;;
+ ldfpd fP3, fP2 = [rAD_P]
+ fms.s1 fNfloat = fW_2TO56_RSH, f2TOM56, fRSHF
+(p11) br.cond.spnt EXP_CERTAIN_UNDERFLOW
}
+;;
{ .mfi
- getf.sig exp_GR_N = EXP_W_2TO56_RSH
-(p13) fcmp.lt.unc p13,p0 = EXP_NORM_f8,EXP_MIN_DBL_NORM_ARG
- nop.i 999
-;;
+ getf.sig rN = fW_2TO56_RSH
+ nop.f 0
+ nop.i 0
}
+;;
+// rIndex_1 has index_1
+// rIndex_2_16 has index_2 * 16
+// rBiased_M has M
+// rIndex_1_16 has index_1 * 16
-// exp_GR_index_1 has index_1
-// exp_GR_index_2_16 has index_2 * 16
-// exp_GR_biased_M has M
-// exp_GR_index_1_16 has index_1 * 16
-
-// r2 has true M
+// rM has true M
+// r = x - Nfloat * ln2_by_128_hi
+// f = 1 - Nfloat * ln2_by_128_lo
{ .mfi
- and exp_GR_index_1 = 0x0f, exp_GR_N
- fnma.s1 exp_r = EXP_Nfloat, exp_ln2_by_128_hi, EXP_NORM_f8
- shr r2 = exp_GR_N, 0x7
+ and rIndex_1 = 0x0f, rN
+ fnma.s1 fR = fNfloat, fLn2_by_128_hi, fNormX
+ shr rM = rN, 0x7
}
{ .mfi
- and exp_GR_index_2_16 = 0x70, exp_GR_N
- fnma.s1 exp_f = EXP_Nfloat, exp_ln2_by_128_lo, f1
- nop.i 999
-;;
+ and rIndex_2_16 = 0x70, rN
+ fnma.s1 fF = fNfloat, fLn2_by_128_lo, f1
+ nop.i 0
}
+;;
-
-// EXP_AD_T1 has address of T1
-// EXP_AD_T2 has address if T2
+// rAD_T1 has address of T1
+// rAD_T2 has address of T2
{ .mmi
- addl exp_GR_biased_M = 0xffff, r2
- add EXP_AD_T2 = EXP_AD_TB2, exp_GR_index_2_16
- shladd EXP_AD_T1 = exp_GR_index_1, 4, EXP_AD_TB1
-;;
+ add rBiased_M = rExp_bias, rM
+ add rAD_T2 = rAD_TB2, rIndex_2_16
+ shladd rAD_T1 = rIndex_1, 4, rAD_TB1
}
-
+;;
// Create Scale = 2^M
-// r = x - Nfloat * ln2_by_128_hi
-// f = 1 - Nfloat * ln2_by_128_lo
-
{ .mmi
- setf.exp EXP_2M = exp_GR_biased_M
- ldfe exp_T2 = [EXP_AD_T2]
- nop.i 999
-;;
+ setf.exp f2M = rBiased_M
+ ldfe fT2 = [rAD_T2]
+ nop.i 0
}
+;;
// Load T1 and T2
{ .mfi
- ldfe exp_T1 = [EXP_AD_T1]
- nop.f 999
- nop.i 999
-;;
+ ldfe fT1 = [rAD_T1]
+ fmpy.s0 fTmp = fLn2_by_128_lo, fLn2_by_128_lo // Force inexact
+ nop.i 0
}
-
+;;
{ .mfi
- nop.m 999
- fma.s1 exp_rsq = exp_r, exp_r, f0
- nop.i 999
+ nop.m 0
+ fma.s1 fRsq = fR, fR, f0
+ nop.i 0
}
{ .mfi
- nop.m 999
- fma.s1 exp_rP4pP3 = exp_r, exp_P4, exp_P3
- nop.i 999
-;;
+ nop.m 0
+ fma.s1 fP54 = fR, fP5, fP4
+ nop.i 0
}
-
-
+;;
{ .mfi
- nop.m 999
- fma.s1 exp_rcube = exp_r, exp_rsq, f0
- nop.i 999
+ nop.m 0
+ fcmp.lt.s1 p13,p0 = fNormX,fMIN_DBL_NORM_ARG
+ nop.i 0
}
{ .mfi
- nop.m 999
- fma.s1 exp_P_lo = exp_r, exp_rP4pP3, exp_P2
- nop.i 999
-;;
+ nop.m 0
+ fma.s1 fP32 = fR, fP3, fP2
+ nop.i 0
}
-
+;;
{ .mfi
- nop.m 999
- fma.s1 exp_P_hi = exp_rsq, exp_P1, exp_r
- nop.i 999
+ nop.m 0
+ fma.s1 fP5432 = fRsq, fP54, fP32
+ nop.i 0
}
-{ .mfi
- nop.m 999
- fma.s1 exp_S2 = exp_f,exp_T2,f0
- nop.i 999
;;
-}
{ .mfi
- nop.m 999
- fma.s1 exp_S1 = EXP_2M,exp_T1,f0
- nop.i 999
-;;
+ nop.m 0
+ fma.s1 fS1 = f2M,fT1,f0
+ nop.i 0
}
-
-
{ .mfi
- nop.m 999
- fma.s1 exp_P = exp_rcube, exp_P_lo, exp_P_hi
- nop.i 999
-;;
+ nop.m 0
+ fma.s1 fS2 = fF,fT2,f0
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
- fma.s1 exp_S = exp_S1,exp_S2,f0
- nop.i 999
-;;
+ nop.m 0
+ fma.s1 fP = fRsq, fP5432, fR
+ nop.i 0
}
-
-{ .bbb
-(p12) br.cond.spnt L(EXP_CERTAIN_UNDERFLOW)
-(p13) br.cond.spnt L(EXP_POSSIBLE_UNDERFLOW)
-(p14) br.cond.spnt L(EXP_POSSIBLE_OVERFLOW)
-;;
+{ .mfi
+ nop.m 0
+ fma.s1 fS = fS1,fS2,f0
+ nop.i 0
}
+;;
+{ .mbb
+ nop.m 0
+(p13) br.cond.spnt EXP_POSSIBLE_UNDERFLOW
+(p14) br.cond.spnt EXP_POSSIBLE_OVERFLOW
+}
+;;
{ .mfb
- nop.m 999
- fma.d f8 = exp_S, exp_P, exp_S
- br.ret.sptk b0 ;; // Normal path exit
+ nop.m 0
+ fma.d.s0 f8 = fS, fP, fS
+ br.ret.sptk b0 // Normal path exit
}
+;;
-L(EXP_POSSIBLE_OVERFLOW):
+EXP_POSSIBLE_OVERFLOW:
-// We got an answer. EXP_MAX_DBL_NORM_ARG < x < EXP_MIN_DBL_OFLOW_ARG
-// overflow is a possibility, not a certainty
+// Here if fMAX_DBL_NORM_ARG < x < fMIN_DBL_OFLOW_ARG
+// This cannot happen if input is a double, only if input has higher precision.
+// Overflow is a possibility, not a certainty.
-{ .mfi
- nop.m 999
- fsetc.s2 0x7F,0x42
- nop.i 999 ;;
-}
+// Recompute result using status field 2 with user's rounding mode,
+// and wre set. If result is larger than largest double, then we have
+// overflow
{ .mfi
- nop.m 999
- fma.d.s2 exp_wre_urm_f8 = exp_S, exp_P, exp_S
- nop.i 999 ;;
+ mov rGt_ln = 0x103ff // Exponent for largest dbl + 1 ulp
+ fsetc.s2 0x7F,0x42 // Get user's round mode, set wre
+ nop.i 0
}
-
-// We define an overflow when the answer with
-// WRE set
-// user-defined rounding mode
-// is ldn +1
-
-// Is the exponent 1 more than the largest double?
-// If so, go to ERROR RETURN, else get the answer and
-// leave.
-
-// Largest double is 7FE (biased double)
-// 7FE - 3FF + FFFF = 103FE
-// Create + largest_double_plus_ulp
-// Create - largest_double_plus_ulp
-// Calculate answer with WRE set.
-
-// Cases when answer is ldn+1 are as follows:
-// ldn ldn+1
-// --+----------|----------+------------
-// |
-// +inf +inf -inf
-// RN RN
-// RZ
+;;
{ .mfi
- nop.m 999
- fsetc.s2 0x7F,0x40
- mov exp_GR_gt_ln = 0x103ff ;;
+ setf.exp fGt_pln = rGt_ln // Create largest double + 1 ulp
+ fma.d.s2 fWre_urm_f8 = fS, fP, fS // Result with wre set
+ nop.i 0
}
+;;
{ .mfi
- setf.exp exp_gt_pln = exp_GR_gt_ln
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fsetc.s2 0x7F,0x40 // Turn off wre in sf2
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
- fcmp.ge.unc.s1 p6, p0 = exp_wre_urm_f8, exp_gt_pln
- nop.i 999 ;;
+ nop.m 0
+ fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow
+ nop.i 0
}
+;;
{ .mfb
- nop.m 999
- nop.f 999
-(p6) br.cond.spnt L(EXP_CERTAIN_OVERFLOW) ;; // Branch if really overflow
+ nop.m 0
+ nop.f 0
+(p6) br.cond.spnt EXP_CERTAIN_OVERFLOW // Branch if overflow
}
+;;
{ .mfb
- nop.m 999
- fma.d f8 = exp_S, exp_P, exp_S
- br.ret.sptk b0 ;; // Exit if really no overflow
+ nop.m 0
+ fma.d.s0 f8 = fS, fP, fS
+ br.ret.sptk b0 // Exit if really no overflow
}
+;;
-L(EXP_CERTAIN_OVERFLOW):
+EXP_CERTAIN_OVERFLOW:
{ .mmi
- sub exp_GR_17ones_m1 = exp_GR_17ones, r0, 1 ;;
- setf.exp f9 = exp_GR_17ones_m1
- nop.i 999 ;;
+ sub rTmp = rExp_mask, r0, 1
+;;
+ setf.exp fTmp = rTmp
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
- fmerge.s FR_X = f8,f8
- nop.i 999
+ alloc r32=ar.pfs,1,4,4,0
+ fmerge.s FR_X = f8,f8
+ nop.i 0
}
{ .mfb
- mov GR_Parameter_TAG = 14
- fma.d FR_RESULT = f9, f9, f0 // Set I,O and +INF result
- br.cond.sptk __libm_error_region ;;
+ mov GR_Parameter_TAG = 14
+ fma.d.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result
+ br.cond.sptk __libm_error_region
}
+;;
-L(EXP_POSSIBLE_UNDERFLOW):
+EXP_POSSIBLE_UNDERFLOW:
-// We got an answer. EXP_MAX_DBL_UFLOW_ARG < x < EXP_MIN_DBL_NORM_ARG
-// underflow is a possibility, not a certainty
+// Here if fMAX_DBL_ZERO_ARG < x < fMIN_DBL_NORM_ARG
+// Underflow is a possibility, not a certainty
// We define an underflow when the answer with
// ftz set
@@ -709,81 +635,111 @@ L(EXP_POSSIBLE_UNDERFLOW):
// largest dn smallest normal
{ .mfi
- nop.m 999
- fsetc.s2 0x7F,0x41
- nop.i 999 ;;
+ nop.m 0
+ fsetc.s2 0x7F,0x41 // Get user's round mode, set ftz
+ nop.i 0
}
+;;
+
{ .mfi
- nop.m 999
- fma.d.s2 exp_ftz_urm_f8 = exp_S, exp_P, exp_S
- nop.i 999 ;;
+ nop.m 0
+ fma.d.s2 fFtz_urm_f8 = fS, fP, fS // Result with ftz set
+ nop.i 0
}
+;;
+
{ .mfi
- nop.m 999
- fsetc.s2 0x7F,0x40
- nop.i 999 ;;
+ nop.m 0
+ fsetc.s2 0x7F,0x40 // Turn off ftz in sf2
+ nop.i 0
}
+;;
+
{ .mfi
- nop.m 999
- fcmp.eq.unc.s1 p6, p0 = exp_ftz_urm_f8, f0
- nop.i 999 ;;
+ nop.m 0
+ fcmp.eq.s1 p6, p7 = fFtz_urm_f8, f0 // Test for underflow
+ nop.i 0
}
-{ .mfb
- nop.m 999
- nop.f 999
-(p6) br.cond.spnt L(EXP_CERTAIN_UNDERFLOW) ;; // Branch if really underflow
+{ .mfi
+ nop.m 0
+ fma.d.s0 f8 = fS, fP, fS // Compute result, set I, maybe U
+ nop.i 0
}
-{ .mfb
- nop.m 999
- fma.d f8 = exp_S, exp_P, exp_S
- br.ret.sptk b0 ;; // Exit if really no underflow
+;;
+
+{ .mbb
+ nop.m 0
+(p6) br.cond.spnt EXP_UNDERFLOW_COMMON // Branch if really underflow
+(p7) br.ret.sptk b0 // Exit if really no underflow
}
+;;
-L(EXP_CERTAIN_UNDERFLOW):
-{ .mfi
- nop.m 999
- fmerge.s FR_X = f8,f8
- nop.i 999
+EXP_CERTAIN_UNDERFLOW:
+// Here if x <= fMAX_DBL_ZERO_ARG
+// Result will be zero (or smallest denorm if round to +inf) with I, U set
+{ .mmi
+ mov rTmp = 1
+;;
+ setf.exp fTmp = rTmp // Form small normal
+ nop.i 0
}
+;;
+
{ .mfb
- mov GR_Parameter_TAG = 15
- fma.d FR_RESULT = exp_S, exp_P, exp_S // Set I,U and tiny result
- br.cond.sptk __libm_error_region ;;
+ nop.m 0
+ fma.d.s0 f8 = fTmp, fTmp, f0 // Set I,U, tiny (+0.0) result
+ br.cond.sptk EXP_UNDERFLOW_COMMON
}
+;;
-L(EXP_CERTAIN_UNDERFLOW_ZERO):
-{ .mmi
- mov exp_GR_one = 1 ;;
- setf.exp f9 = exp_GR_one
- nop.i 999 ;;
+EXP_UNDERFLOW_COMMON:
+// Determine if underflow result is zero or nonzero
+{ .mfi
+ alloc r32=ar.pfs,1,4,4,0
+ fcmp.eq.s1 p6, p0 = f8, f0
+ nop.i 0
}
+;;
-{ .mfi
- nop.m 999
- fmerge.s FR_X = f8,f8
- nop.i 999
+{ .mfb
+ nop.m 0
+ fmerge.s FR_X = fNormX,fNormX
+(p6) br.cond.spnt EXP_UNDERFLOW_ZERO
}
+;;
+
+EXP_UNDERFLOW_NONZERO:
+// Here if x < fMIN_DBL_NORM_ARG and result nonzero;
+// I, U are set
{ .mfb
- mov GR_Parameter_TAG = 15
- fma.d FR_RESULT = f9, f9, f0 // Set I,U and tiny (+0.0) result
- br.cond.sptk __libm_error_region ;;
+ mov GR_Parameter_TAG = 15
+ nop.f 0 // FR_RESULT already set
+ br.cond.sptk __libm_error_region
}
+;;
-.endp exp
-ASM_SIZE_DIRECTIVE(exp)
+EXP_UNDERFLOW_ZERO:
+// Here if x < fMIN_DBL_NORM_ARG and result zero;
+// I, U are set
+{ .mfb
+ mov GR_Parameter_TAG = 15
+ nop.f 0 // FR_RESULT already set
+ br.cond.sptk __libm_error_region
+}
+;;
+GLOBAL_IEEE754_END(exp)
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
-.fframe 64
+.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
@@ -791,24 +747,24 @@ __libm_error_region:
{ .mmi
stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
- stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
}
{ .mib
- stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
- add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
- nop.m 0
- nop.m 0
add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
{ .mmi
ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
@@ -817,12 +773,11 @@ __libm_error_region:
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
+ mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
-};;
+};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_exp10.S b/sysdeps/ia64/fpu/e_exp10.S
new file mode 100644
index 0000000000..1cc5bef406
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_exp10.S
@@ -0,0 +1,602 @@
+.file "exp10.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 08/25/00 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 09/06/02 Improved performance; no inexact flags on exact cases
+// 01/29/03 Added missing } to bundle templates
+//
+// API
+//==============================================================
+// double exp10(double)
+//
+// Overview of operation
+//==============================================================
+// Background
+//
+// Implementation
+//
+// Let x= (K + fh + fl + r)/log2(10), where
+// K is an integer, fh= 0.b1 b2 b3 b4 b5,
+// fl= 2^{-5}* 0.b6 b7 b8 b9 b10 (fh, fl >= 0),
+// and |r|<2^{-11}
+// Th is a table that stores 2^fh (32 entries) rounded to
+// double extended precision (only mantissa is stored)
+// Tl is a table that stores 2^fl (32 entries) rounded to
+// double extended precision (only mantissa is stored)
+//
+// 10^x is approximated as
+// 2^K * Th [ f ] * Tl [ f ] * (1+c1*e+c1*r+c2*r^2+c3*r^3+c4*r^4),
+// where e= (x*log2(10)_hi-RN(x*log2(10)_hi))+log2(10)_lo*x
+
+// Note there are only 22 non-zero values that produce an exact result:
+// 1.0, 2.0, ... 22.0.
+// We test for these cases and use s1 to avoid setting the inexact flag.
+
+// Special values
+//==============================================================
+// exp10(0)= 1
+// exp10(+inf)= inf
+// exp10(-inf)= 0
+//
+
+// Registers used
+//==============================================================
+// r2-r3, r14-r40
+// f6-f15, f32-f51
+// p6-p9, p12
+//
+
+
+GR_TBL_START = r2
+GR_LOG_TBL = r3
+
+GR_OF_LIMIT = r14
+GR_UF_LIMIT = r15
+GR_EXP_CORR = r16
+GR_F_low = r17
+GR_F_high = r18
+GR_K = r19
+GR_Flow_ADDR = r20
+
+GR_BIAS = r21
+GR_Fh = r22
+GR_Fh_ADDR = r23
+GR_EXPMAX = r24
+GR_BIAS53 = r25
+
+GR_ROUNDVAL = r26
+GR_MASK = r27
+GR_KF0 = r28
+GR_MASK_low = r29
+GR_COEFF_START = r30
+GR_exact_limit = r31
+
+GR_SAVE_B0 = r33
+GR_SAVE_PFS = r34
+GR_SAVE_GP = r35
+GR_SAVE_SP = r36
+
+GR_Parameter_X = r37
+GR_Parameter_Y = r38
+GR_Parameter_RESULT = r39
+GR_Parameter_TAG = r40
+
+
+FR_X = f10
+FR_Y = f1
+FR_RESULT = f8
+
+
+FR_COEFF1 = f6
+FR_COEFF2 = f7
+FR_R = f9
+FR_LOG2_10 = f10
+
+FR_2P53 = f11
+FR_KF0 = f12
+FR_COEFF3 = f13
+FR_COEFF4 = f14
+FR_UF_LIMIT = f15
+
+FR_OF_LIMIT = f32
+FR_DX_L210 = f33
+FR_ROUNDVAL = f34
+FR_KF = f35
+
+FR_2_TO_K = f36
+FR_T_low = f37
+FR_T_high = f38
+FR_P34 = f39
+FR_R2 = f40
+
+FR_P12 = f41
+FR_T_low_K = f42
+FR_P14 = f43
+FR_T = f44
+FR_P = f45
+
+FR_L2_10_low = f46
+FR_L2_10_high = f47
+FR_E0 = f48
+FR_E = f49
+FR_exact_limit = f50
+
+FR_int_x = f51
+
+
+// Data tables
+//==============================================================
+
+RODATA
+
+.align 16
+
+LOCAL_OBJECT_START(poly_coeffs)
+
+data8 0xd49a784bcd1b8afe, 0x00003fcb // log2(10)*2^(10-63)
+data8 0x9257edfe9b5fb698, 0x3fbf // log2(10)_low (bits 64...127)
+data8 0x3fac6b08d704a0c0, 0x3f83b2ab6fba4e77 // C_3 and C_4
+data8 0xb17217f7d1cf79ab, 0x00003ffe // C_1
+data8 0xf5fdeffc162c7541, 0x00003ffc // C_2
+LOCAL_OBJECT_END(poly_coeffs)
+
+
+LOCAL_OBJECT_START(T_table)
+
+// 2^{0.00000 b6 b7 b8 b9 b10}
+data8 0x8000000000000000, 0x8016302f17467628
+data8 0x802c6436d0e04f50, 0x80429c17d77c18ed
+data8 0x8058d7d2d5e5f6b0, 0x806f17687707a7af
+data8 0x80855ad965e88b83, 0x809ba2264dada76a
+data8 0x80b1ed4fd999ab6c, 0x80c83c56b50cf77f
+data8 0x80de8f3b8b85a0af, 0x80f4e5ff089f763e
+data8 0x810b40a1d81406d4, 0x81219f24a5baa59d
+data8 0x813801881d886f7b, 0x814e67cceb90502c
+data8 0x8164d1f3bc030773, 0x817b3ffd3b2f2e47
+data8 0x8191b1ea15813bfd, 0x81a827baf7838b78
+data8 0x81bea1708dde6055, 0x81d51f0b8557ec1c
+data8 0x81eba08c8ad4536f, 0x820225f44b55b33b
+data8 0x8218af4373fc25eb, 0x822f3c7ab205c89a
+data8 0x8245cd9ab2cec048, 0x825c62a423d13f0c
+data8 0x8272fb97b2a5894c, 0x828998760d01faf3
+data8 0x82a0393fe0bb0ca8, 0x82b6ddf5dbc35906
+//
+//
+// 2^{0.b1 b2 b3 b4 b5}
+data8 0x8000000000000000, 0x82cd8698ac2ba1d7
+data8 0x85aac367cc487b14, 0x88980e8092da8527
+data8 0x8b95c1e3ea8bd6e6, 0x8ea4398b45cd53c0
+data8 0x91c3d373ab11c336, 0x94f4efa8fef70961
+data8 0x9837f0518db8a96f, 0x9b8d39b9d54e5538
+data8 0x9ef5326091a111ad, 0xa27043030c496818
+data8 0xa5fed6a9b15138ea, 0xa9a15ab4ea7c0ef8
+data8 0xad583eea42a14ac6, 0xb123f581d2ac258f
+data8 0xb504f333f9de6484, 0xb8fbaf4762fb9ee9
+data8 0xbd08a39f580c36be, 0xc12c4cca66709456
+data8 0xc5672a115506dadd, 0xc9b9bd866e2f27a2
+data8 0xce248c151f8480e3, 0xd2a81d91f12ae45a
+data8 0xd744fccad69d6af4, 0xdbfbb797daf23755
+data8 0xe0ccdeec2a94e111, 0xe5b906e77c8348a8
+data8 0xeac0c6e7dd24392e, 0xefe4b99bdcdaf5cb
+data8 0xf5257d152486cc2c, 0xfa83b2db722a033a
+LOCAL_OBJECT_END(T_table)
+
+
+
+.section .text
+GLOBAL_IEEE754_ENTRY(exp10)
+// exp10(x): computes 10^x. The argument arrives in f8 and the result is
+// returned in f8, rounded to double precision by the final fma.d.
+{.mfi
+ alloc r32= ar.pfs, 1, 4, 4, 0 // 1 input, 4 locals, 4 outputs, 0 rotating
+ // will continue only for non-zero normal/denormal numbers
+ fclass.nm.unc p12, p7= f8, 0x1b // p12= x is zero, inf or NaN
+ mov GR_BIAS53= 0xffff+63-10 // biased exponent of 2^{63-10}
+}
+{.mlx
+ // GR_TBL_START= linkage-table slot of poly_coeffs (log2(10), C_1...C_4, T_table)
+ addl GR_TBL_START= @ltoff(poly_coeffs), gp
+ movl GR_ROUNDVAL= 0x3fc00000 // 1.5 (SP)
+}
+;;
+
+{.mfi
+ ld8 GR_COEFF_START= [ GR_TBL_START ] // Load pointer to coeff table
+ fcmp.lt.s1 p6, p8= f8, f0 // X<0 ? (p6= yes, p8= no)
+ nop.i 0
+}
+;;
+
+{.mlx
+ setf.exp FR_2P53= GR_BIAS53 // 2^{63-10}
+ movl GR_UF_LIMIT= 0xc07439b746e36b52 // (-2^10-51) / log2(10)
+}
+{.mlx
+ setf.s FR_ROUNDVAL= GR_ROUNDVAL // 1.5, from its single-precision bit pattern
+ movl GR_OF_LIMIT= 0x40734413509f79fe // Overflow threshold (double bit pattern)
+}
+;;
+
+{.mib
+ ldfe FR_LOG2_10= [ GR_COEFF_START ], 16 // load log2(10)*2^(10-63)
+ nop.i 0
+ (p12) br.cond.spnt SPECIAL_exp10 // Branch if nan, inf, zero
+}
+;;
+
+{.mmf
+ ldfe FR_L2_10_low= [ GR_COEFF_START ], 16 // load log2(10)_low
+ setf.d FR_OF_LIMIT= GR_OF_LIMIT // Set overflow limit
+ fma.s0 f8= f8, f1, f0 // normalize x
+}
+;;
+
+{.mfi
+ ldfpd FR_COEFF3, FR_COEFF4= [ GR_COEFF_START ], 16 // load C_3, C_4
+ (p8) fcvt.fx.s1 FR_int_x = f8 // Convert x to integer
+ nop.i 0
+}
+{.mfi
+ setf.d FR_UF_LIMIT= GR_UF_LIMIT // Set underflow limit
+ fma.s1 FR_KF0= f8, FR_LOG2_10, FR_ROUNDVAL // y= (x*log2(10)*2^10 +
+ // 1.5*2^63) * 2^(-63)
+ mov GR_EXP_CORR= 0xffff-126 // exponent bias minus 2*63
+}
+;;
+
+{.mfi
+ nop.m 0
+ fma.s1 FR_L2_10_high= FR_LOG2_10, FR_2P53, f0 // FR_L2_10_high= log2(10)_hi
+ nop.i 0
+}
+;;
+
+{.mfi
+ ldfe FR_COEFF1= [ GR_COEFF_START ], 16 // load C_1
+ fms.s1 FR_KF= FR_KF0, f1, FR_ROUNDVAL // (K+f)*2^(10-63)
+ mov GR_MASK= 1023
+}
+;;
+
+{.mfi
+ ldfe FR_COEFF2= [ GR_COEFF_START ], 16 // load C_2
+ fma.s1 FR_LOG2_10= f8, FR_L2_10_high, f0 // y0= x*log2(10)_hi (reuses FR_LOG2_10)
+ mov GR_MASK_low= 31
+}
+;;
+
+{.mlx
+ getf.sig GR_KF0= FR_KF0 // (K+f)*2^10= round_to_int(y)
+ (p8) movl GR_exact_limit= 0x41b00000 // Largest x for exact result,
+ // +22.0
+}
+;;
+
+{.mfi
+ add GR_LOG_TBL= 256, GR_COEFF_START // Pointer to high T_table
+ fcmp.gt.s1 p12, p7= f8, FR_OF_LIMIT // x>overflow threshold ?
+ nop.i 0
+}
+;;
+
+{.mfi
+ (p8) setf.s FR_exact_limit = GR_exact_limit // Largest x for exact result
+ (p8) fcvt.xf FR_int_x = FR_int_x // Integral part of x
+ shr GR_K= GR_KF0, 10 // K
+}
+{.mfi
+ and GR_F_high= GR_MASK, GR_KF0 // f_high*32 + f_low (low 10 bits)
+ fnma.s1 FR_R= FR_KF, FR_2P53, FR_LOG2_10 // r= x*log2(10)-2^{63-10}*
+ // [ (K+f)*2^{10-63} ]
+ and GR_F_low= GR_KF0, GR_MASK_low // f_low
+}
+;;
+
+{.mmi
+ shladd GR_Flow_ADDR= GR_F_low, 3, GR_COEFF_START // address of 2^{f_low}
+ add GR_BIAS= GR_K, GR_EXP_CORR // K+bias-2*63
+ shr GR_Fh= GR_F_high, 5 // f_high
+}
+;;
+
+{.mfi
+ setf.exp FR_2_TO_K= GR_BIAS // 2^{K-126}
+ (p7) fcmp.lt.s1 p12, p7= f8, FR_UF_LIMIT // x<underflow threshold ?
+ shladd GR_Fh_ADDR= GR_Fh, 3, GR_LOG_TBL // address of 2^{f_high}
+}
+{.mfi
+ ldf8 FR_T_low= [ GR_Flow_ADDR ] // load T_low= 2^{f_low}
+ fms.s1 FR_DX_L210= f8, FR_L2_10_high, FR_LOG2_10 // x*log2(10)_hi-
+ // RN(x*log2(10)_hi)
+ nop.i 0
+}
+;;
+
+{.mfi
+ ldf8 FR_T_high= [ GR_Fh_ADDR ] // load T_high= 2^{f_high}
+ fma.s1 FR_P34= FR_COEFF4, FR_R, FR_COEFF3 // P34= C_3+C_4*r
+ nop.i 0
+}
+{.mfb
+ nop.m 0
+ fma.s1 FR_R2= FR_R, FR_R, f0 // r*r
+ (p12) br.cond.spnt OUT_RANGE_exp10
+}
+;;
+
+{.mfi
+ nop.m 0
+ // e= (x*log2(10)_hi-RN(x*log2(10)_hi))+log2(10)_lo*x
+ fma.s1 FR_E0= f8, FR_L2_10_low, FR_DX_L210
+ cmp.eq p7,p9= r0,r0 // Assume inexact result
+}
+{.mfi
+ nop.m 0
+ fma.s1 FR_P12= FR_COEFF2, FR_R, FR_COEFF1 // P12= C_1+C_2*r
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ (p8) fcmp.eq.s1 p9,p7= FR_int_x, f8 // Test x positive integer
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fma.s1 FR_T_low_K= FR_T_low, FR_2_TO_K, f0 // T= 2^{K-126}*T_low
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ fma.s1 FR_E= FR_E0, FR_COEFF1, f0 // E= C_1*e
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fma.s1 FR_P14= FR_R2, FR_P34, FR_P12 // P14= P12+r2*P34
+ nop.i 0
+}
+;;
+
+// If x a positive integer, will it produce an exact result?
+// p7 result will be inexact
+// p9 result will be exact
+{.mfi
+ nop.m 0
+ (p9) fcmp.le.s1 p9,p7= f8, FR_exact_limit // Test x gives exact result
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fma.s1 FR_T= FR_T_low_K, FR_T_high, f0 // T= T*T_high
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ fma.s1 FR_P= FR_P14, FR_R, FR_E // P= P14*r+E
+ nop.i 0
+}
+;;
+
+.pred.rel "mutex",p7,p9
+{.mfi
+ nop.m 0
+ (p7) fma.d.s0 f8= FR_P, FR_T, FR_T // result= T+T*P, inexact set
+ nop.i 0
+}
+{.mfb
+ nop.m 0
+ (p9) fma.d.s1 f8= FR_P, FR_T, FR_T // result= T+T*P, exact use s1
+ br.ret.sptk b0 // return
+}
+;;
+
+// Special arguments: -inf -> 0, +inf -> +inf, +/-0 -> 1, NaN -> passed through fma
+SPECIAL_exp10:
+{.mfi
+ nop.m 0
+ fclass.m p6, p0= f8, 0x22 // x= -Infinity ?
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ fclass.m p7, p0= f8, 0x21 // x= +Infinity ?
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ fclass.m p8, p0= f8, 0x7 // x= +/-Zero ?
+ nop.i 0
+}
+{.mfb
+ nop.m 0
+ (p6) mov f8= f0 // exp10(-Infinity)= 0
+ (p6) br.ret.spnt b0
+}
+;;
+
+{.mfb
+ nop.m 0
+ nop.f 0
+ (p7) br.ret.spnt b0 // exp10(+Infinity)= +Infinity
+}
+;;
+
+{.mfb
+ nop.m 0
+ (p8) mov f8= f1 // exp10(+/-0)= 1
+ (p8) br.ret.spnt b0
+}
+;;
+
+{.mfb
+ nop.m 0
+ fma.d.s0 f8= f8, f1, f0 // Remaining cases: NaNs
+ br.ret.sptk b0
+}
+;;
+
+// Out-of-range arguments: square a huge (p8) or tiny (p6) value to raise the IEEE flags
+OUT_RANGE_exp10:
+
+// overflow: p8= 1
+
+{.mii
+ (p8) mov GR_EXPMAX= 0x1fffe
+ nop.i 0
+ nop.i 0
+}
+;;
+
+
+{.mmb
+ (p8) mov GR_Parameter_TAG= 166
+ (p8) setf.exp FR_R= GR_EXPMAX
+ nop.b 999
+}
+;;
+
+{.mfi
+ nop.m 999
+ (p8) fma.d.s0 f8= FR_R, FR_R, f0 // Create overflow
+ nop.i 999
+}
+// underflow: p6= 1
+{.mii
+ nop.m 0
+ (p6) mov GR_EXPMAX= 1
+ nop.i 0
+}
+;;
+
+{.mmb
+ nop.m 0
+ (p6) setf.exp FR_R= GR_EXPMAX
+ nop.b 999
+}
+;;
+
+{.mfb
+ nop.m 999
+ (p6) fma.d.s0 f8= FR_R, FR_R, f0 // Create underflow
+ (p6) br.ret.sptk b0 // will not call libm_error for underflow
+}
+;;
+// NOTE(review): the overflow path (p8) does not return here; it presumably falls through into __libm_error_region below - confirm
+GLOBAL_IEEE754_END(exp10)
+weak_alias (exp10, pow10)
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+// Spills FR_Y, FR_X and FR_RESULT into a 64-byte frame, calls __libm_error_support, then reloads f8 from the result slot.
+.prologue
+{.mfi
+ add GR_Parameter_Y= -32, sp // Parameter 2 address (new sp+32)
+ nop.f 0
+.save ar.pfs, GR_SAVE_PFS
+ mov GR_SAVE_PFS= ar.pfs // Save ar.pfs
+}
+
+{.mfi
+.fframe 64
+ add sp= -64, sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP= gp // Save gp
+}
+;;
+
+{.mmi
+ stfd [ GR_Parameter_Y ]= FR_Y, 16 // STORE Parameter 2 on stack, then advance to sp+48
+ add GR_Parameter_X= 16, sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0= b0 // Save b0
+}
+;;
+
+.body
+{.mib
+ stfd [ GR_Parameter_X ]= FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT= 0, GR_Parameter_Y // Parameter 3 address (sp+48)
+ nop.b 0
+}
+{.mib
+ stfd [ GR_Parameter_Y ]= FR_RESULT // STORE Parameter 3 on stack
+ add GR_Parameter_Y= -16, GR_Parameter_Y // Back to Parameter 2 address (sp+32)
+ br.call.sptk b0= __libm_error_support# // Call error handling function
+}
+;;
+
+{.mmi
+ add GR_Parameter_RESULT= 48, sp // Address of the result slot (sp+48)
+ nop.m 0
+ nop.i 0
+}
+;;
+
+{.mmi
+ ldfd f8= [ GR_Parameter_RESULT ] // Get return result off stack
+.restore sp
+ add sp= 64, sp // Restore stack pointer
+ mov b0= GR_SAVE_B0 // Restore return address
+}
+;;
+
+{.mib
+ mov gp= GR_SAVE_GP // Restore gp
+ mov ar.pfs= GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+}
+;;
+
+
+LOCAL_LIBM_END(__libm_error_region)
+// External error handler called above
+.type __libm_error_support#, @function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_exp10f.S b/sysdeps/ia64/fpu/e_exp10f.S
new file mode 100644
index 0000000000..f069b3afab
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_exp10f.S
@@ -0,0 +1,561 @@
+.file "exp10f.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 08/25/00 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 09/06/02 Improved performance and accuracy; no inexact flags on exact cases
+// 01/29/03 Added missing } to bundle templates
+//
+// API
+//==============================================================
+// float exp10f(float)
+//
+// Overview of operation
+//==============================================================
+// Background
+//
+// Implementation
+//
+// Let x= (K + fh + fl + r)/log2(10), where
+// K is an integer, fh= 0.b1 b2 b3 b4 b5,
+// fl= 2^{-5}* 0.b6 b7 b8 b9 b10 (fh, fl >= 0),
+// and |r|<2^{-11}
+// Th is a table that stores 2^fh (32 entries) rounded to
+// double extended precision (only mantissa is stored)
+// Tl is a table that stores 2^fl (32 entries) rounded to
+// double extended precision (only mantissa is stored)
+//
+// 10^x is approximated as
+// 2^K * Th [ fh ] * Tl [ fl ] * (1+c1*r+c2*r^2)
+
+// Note there are only 10 non-zero arguments that produce an exact result:
+// 1.0, 2.0, ... 10.0.
+// We test for these cases and use s1 to avoid setting the inexact flag.
+
+// Special values
+//==============================================================
+// exp10(0)= 1
+// exp10(+inf)= inf
+// exp10(-inf)= 0
+//
+
+// Registers used
+//==============================================================
+// r2-r3, r14-r40
+// f6-f15, f32-f51
+// p6-p9, p12
+//
+
+
+GR_TBL_START = r2 // linkage-table slot that holds &poly_coeffs
+GR_LOG_TBL = r3 // start of the 2^{f_high} half of T_table
+
+GR_OF_LIMIT = r14 // overflow threshold (single-precision bit pattern)
+GR_UF_LIMIT = r15 // underflow threshold (single-precision bit pattern)
+GR_EXP_CORR = r16
+GR_F_low = r17
+GR_F_high = r18
+GR_K = r19
+GR_Flow_ADDR = r20
+
+GR_BIAS = r21
+GR_Fh = r22
+GR_Fh_ADDR = r23
+GR_EXPMAX = r24
+
+GR_ROUNDVAL = r26
+GR_MASK = r27
+GR_KF0 = r28
+GR_MASK_low = r29
+GR_COEFF_START = r30 // walking pointer through poly_coeffs, then base of T_table
+GR_exact_limit = r31
+// Local registers that hold saved state across the __libm_error_support call
+GR_SAVE_B0 = r33
+GR_SAVE_PFS = r34
+GR_SAVE_GP = r35
+GR_SAVE_SP = r36 // not referenced anywhere in this file
+// Output registers set up for __libm_error_support
+GR_Parameter_X = r37
+GR_Parameter_Y = r38
+GR_Parameter_RESULT = r39
+GR_Parameter_TAG = r40
+
+// Floating-point values stored to the stack by __libm_error_region
+FR_X = f10 // NOTE(review): same register as FR_LOG2_10 below - confirm this is intended
+FR_Y = f1
+FR_RESULT = f8
+
+// Working floating-point registers
+FR_COEFF1 = f6
+FR_COEFF2 = f7
+FR_R = f9
+FR_LOG2_10 = f10
+
+FR_2P53 = f11 // not referenced anywhere in this file
+FR_KF0 = f12
+FR_COEFF3 = f13 // not referenced anywhere in this file
+FR_COEFF4 = f14 // not referenced anywhere in this file
+FR_UF_LIMIT = f15
+
+FR_OF_LIMIT = f32
+FR_DX_L210 = f33 // not referenced anywhere in this file
+FR_ROUNDVAL = f34
+FR_KF = f35
+
+FR_2_TO_K = f36
+FR_T_low = f37
+FR_T_high = f38
+
+FR_P12 = f41
+FR_T_low_K = f42
+FR_T = f44
+FR_P = f45
+
+FR_E = f49 // not referenced anywhere in this file
+FR_exact_limit = f50
+
+FR_int_x = f51
+
+
+// Data tables
+//==============================================================
+
+RODATA
+
+.align 16
+
+LOCAL_OBJECT_START(poly_coeffs)
+// Three 16-byte (significand, sign/exponent) entries read with ldfe; exp10f then indexes T_table from the end of this object
+data8 0xd49a784bcd1b8afe, 0x00003fcb // log2(10)*2^(10-63)
+data8 0xb17217f7d1cf79ab, 0x00004033 // C_1 * 2^53
+data8 0xf5fdeffc162c7541, 0x00004066 // C_2 * 2^106
+LOCAL_OBJECT_END(poly_coeffs)
+
+
+LOCAL_OBJECT_START(T_table)
+// Two runs of 32 eight-byte entries (256 bytes each); only the 64-bit significand is stored, read with ldf8
+// 2^{0.00000 b6 b7 b8 b9 b10}
+data8 0x8000000000000000, 0x8016302f17467628
+data8 0x802c6436d0e04f50, 0x80429c17d77c18ed
+data8 0x8058d7d2d5e5f6b0, 0x806f17687707a7af
+data8 0x80855ad965e88b83, 0x809ba2264dada76a
+data8 0x80b1ed4fd999ab6c, 0x80c83c56b50cf77f
+data8 0x80de8f3b8b85a0af, 0x80f4e5ff089f763e
+data8 0x810b40a1d81406d4, 0x81219f24a5baa59d
+data8 0x813801881d886f7b, 0x814e67cceb90502c
+data8 0x8164d1f3bc030773, 0x817b3ffd3b2f2e47
+data8 0x8191b1ea15813bfd, 0x81a827baf7838b78
+data8 0x81bea1708dde6055, 0x81d51f0b8557ec1c
+data8 0x81eba08c8ad4536f, 0x820225f44b55b33b
+data8 0x8218af4373fc25eb, 0x822f3c7ab205c89a
+data8 0x8245cd9ab2cec048, 0x825c62a423d13f0c
+data8 0x8272fb97b2a5894c, 0x828998760d01faf3
+data8 0x82a0393fe0bb0ca8, 0x82b6ddf5dbc35906
+//
+// Second half starts 256 bytes in; exp10f addresses it through GR_LOG_TBL
+// 2^{0.b1 b2 b3 b4 b5}
+data8 0x8000000000000000, 0x82cd8698ac2ba1d7
+data8 0x85aac367cc487b14, 0x88980e8092da8527
+data8 0x8b95c1e3ea8bd6e6, 0x8ea4398b45cd53c0
+data8 0x91c3d373ab11c336, 0x94f4efa8fef70961
+data8 0x9837f0518db8a96f, 0x9b8d39b9d54e5538
+data8 0x9ef5326091a111ad, 0xa27043030c496818
+data8 0xa5fed6a9b15138ea, 0xa9a15ab4ea7c0ef8
+data8 0xad583eea42a14ac6, 0xb123f581d2ac258f
+data8 0xb504f333f9de6484, 0xb8fbaf4762fb9ee9
+data8 0xbd08a39f580c36be, 0xc12c4cca66709456
+data8 0xc5672a115506dadd, 0xc9b9bd866e2f27a2
+data8 0xce248c151f8480e3, 0xd2a81d91f12ae45a
+data8 0xd744fccad69d6af4, 0xdbfbb797daf23755
+data8 0xe0ccdeec2a94e111, 0xe5b906e77c8348a8
+data8 0xeac0c6e7dd24392e, 0xefe4b99bdcdaf5cb
+data8 0xf5257d152486cc2c, 0xfa83b2db722a033a
+LOCAL_OBJECT_END(T_table)
+
+
+
+.section .text
+GLOBAL_IEEE754_ENTRY(exp10f)
+// exp10f(x): computes 10^x for a float argument in f8; the result is returned
+// in f8, rounded to single precision by the final fma.s.
+{.mfi
+ alloc r32= ar.pfs, 1, 4, 4, 0 // 1 input, 4 locals, 4 outputs, 0 rotating
+ // will continue only for non-zero normal/denormal numbers
+ fclass.nm.unc p12, p7= f8, 0x1b // p12= x is zero, inf or NaN
+ nop.i 0
+}
+{.mlx
+ // GR_TBL_START= linkage-table slot of poly_coeffs (log2(10), C_1, C_2, then T_table)
+ addl GR_TBL_START= @ltoff(poly_coeffs), gp
+ movl GR_ROUNDVAL= 0x3fc00000 // 1.5 (SP)
+}
+;;
+
+{.mfi
+ ld8 GR_COEFF_START= [ GR_TBL_START ] // Load pointer to coeff table
+ fcmp.lt.s1 p6, p8= f8, f0 // X<0 ? (p6= yes, p8= no)
+ nop.i 0
+}
+;;
+
+{.mlx
+ nop.m 0
+ movl GR_UF_LIMIT= 0xc2349e35 // (-2^7-22) / log2(10)
+}
+{.mlx
+ setf.s FR_ROUNDVAL= GR_ROUNDVAL // 1.5, from its single-precision bit pattern
+ movl GR_OF_LIMIT= 0x421a209a // Overflow threshold
+}
+;;
+
+{.mib
+ ldfe FR_LOG2_10= [ GR_COEFF_START ], 16 // load log2(10)*2^(10-63)
+ nop.i 0
+ (p12) br.cond.spnt SPECIAL_exp10 // Branch if nan, inf, zero
+}
+;;
+
+{.mfi
+ setf.s FR_OF_LIMIT= GR_OF_LIMIT // Set overflow limit
+ fma.s0 f8= f8, f1, f0 // normalize x
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ (p8) fcvt.fx.s1 FR_int_x = f8 // Convert x to integer
+ nop.i 0
+}
+{.mfi
+ setf.s FR_UF_LIMIT= GR_UF_LIMIT // Set underflow limit
+ fma.s1 FR_KF0= f8, FR_LOG2_10, FR_ROUNDVAL // y= (x*log2(10)*2^10 +
+ // 1.5*2^63) * 2^(-63)
+ mov GR_EXP_CORR= 0xffff-126 // exponent bias minus 2*63
+}
+;;
+
+{.mfi
+ ldfe FR_COEFF1= [ GR_COEFF_START ], 16 // load C_1
+ fms.s1 FR_KF= FR_KF0, f1, FR_ROUNDVAL // (K+f)*2^(10-63)
+ mov GR_MASK= 1023
+}
+;;
+
+{.mfi
+ ldfe FR_COEFF2= [ GR_COEFF_START ], 16 // load C_2; pointer now at T_table
+ nop.f 0
+ mov GR_MASK_low= 31
+}
+;;
+
+{.mlx
+ getf.sig GR_KF0= FR_KF0 // (K+f)*2^10= round_to_int(y)
+ (p8) movl GR_exact_limit= 0x41200000 // Largest x for exact result,
+ // +10.0
+}
+;;
+
+{.mfi
+ add GR_LOG_TBL= 256, GR_COEFF_START // Pointer to high T_table
+ fcmp.gt.s1 p12, p7= f8, FR_OF_LIMIT // x>overflow threshold ?
+ nop.i 0
+}
+;;
+
+{.mfi
+ (p8) setf.s FR_exact_limit = GR_exact_limit // Largest x for exact result
+ (p8) fcvt.xf FR_int_x = FR_int_x // Integral part of x
+ shr GR_K= GR_KF0, 10 // K
+}
+{.mfi
+ and GR_F_high= GR_MASK, GR_KF0 // f_high*32 + f_low (low 10 bits)
+ fms.s1 FR_R= f8, FR_LOG2_10, FR_KF // r*2^(-53)= [ x*log2(10)-
+ // (K+f) ] *2^{10-63}
+ and GR_F_low= GR_KF0, GR_MASK_low // f_low
+}
+;;
+
+{.mmi
+ shladd GR_Flow_ADDR= GR_F_low, 3, GR_COEFF_START // address of 2^{f_low}
+ add GR_BIAS= GR_K, GR_EXP_CORR // K+bias-2*63
+ shr GR_Fh= GR_F_high, 5 // f_high
+}
+;;
+
+{.mfi
+ setf.exp FR_2_TO_K= GR_BIAS // 2^{K-126}
+ (p7) fcmp.lt.s1 p12, p7= f8, FR_UF_LIMIT // x<underflow threshold ?
+ shladd GR_Fh_ADDR= GR_Fh, 3, GR_LOG_TBL // address of 2^{f_high}
+}
+{.mfi
+ ldf8 FR_T_low= [ GR_Flow_ADDR ] // load T_low= 2^{f_low}
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{.mfb
+ ldf8 FR_T_high= [ GR_Fh_ADDR ] // load T_high= 2^{f_high}
+ nop.f 0
+ (p12) br.cond.spnt OUT_RANGE_exp10
+}
+;;
+
+{.mfi
+ nop.m 0
+ fma.s1 FR_P12= FR_COEFF2, FR_R, FR_COEFF1 // P12= C_1+C_2*r
+ cmp.eq p7,p9= r0,r0 // Assume inexact result
+}
+;;
+
+{.mfi
+ nop.m 0
+ (p8) fcmp.eq.s1 p9,p7= FR_int_x, f8 // Test x positive integer
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fma.s1 FR_T_low_K= FR_T_low, FR_2_TO_K, f0 // T= 2^{K-126}*T_low
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ fma.s1 FR_P= FR_P12, FR_R, f0 // P= P12*r
+ nop.i 0
+}
+;;
+
+// If x a positive integer, will it produce an exact result?
+// p7 result will be inexact
+// p9 result will be exact
+{.mfi
+ nop.m 0
+ (p9) fcmp.le.s1 p9,p7= f8, FR_exact_limit // Test x gives exact result
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fma.s1 FR_T= FR_T_low_K, FR_T_high, f0 // T= T*T_high
+ nop.i 0
+}
+;;
+
+.pred.rel "mutex",p7,p9
+{.mfi
+ nop.m 0
+ (p7) fma.s.s0 f8= FR_P, FR_T, FR_T // result= T+T*P, inexact set
+ nop.i 0
+}
+{.mfb
+ nop.m 0
+ (p9) fma.s.s1 f8= FR_P, FR_T, FR_T // result= T+T*P, exact use s1
+ br.ret.sptk b0 // return
+}
+;;
+
+// Special arguments: -inf -> 0, +inf -> +inf, +/-0 -> 1, NaN -> passed through fma
+SPECIAL_exp10:
+{.mfi
+ nop.m 0
+ fclass.m p6, p0= f8, 0x22 // x= -Infinity ?
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ fclass.m p7, p0= f8, 0x21 // x= +Infinity ?
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ fclass.m p8, p0= f8, 0x7 // x= +/-Zero ?
+ nop.i 0
+}
+{.mfb
+ nop.m 0
+ (p6) mov f8= f0 // exp10(-Infinity)= 0
+ (p6) br.ret.spnt b0
+}
+;;
+
+{.mfb
+ nop.m 0
+ nop.f 0
+ (p7) br.ret.spnt b0 // exp10(+Infinity)= +Infinity
+}
+;;
+
+{.mfb
+ nop.m 0
+ (p8) mov f8= f1 // exp10(+/-0)= 1
+ (p8) br.ret.spnt b0
+}
+;;
+
+{.mfb
+ nop.m 0
+ fma.s.s0 f8= f8, f1, f0 // Remaining cases: NaNs
+ br.ret.sptk b0
+}
+;;
+
+// Out-of-range arguments: square a huge (p8) or tiny (p6) value to raise the IEEE flags
+OUT_RANGE_exp10:
+
+// overflow: p8= 1
+
+{.mii
+ (p8) mov GR_EXPMAX= 0x1fffe
+ nop.i 0
+ nop.i 0
+}
+;;
+
+
+{.mmb
+ (p8) mov GR_Parameter_TAG= 167
+ (p8) setf.exp FR_R= GR_EXPMAX
+ nop.b 999
+}
+;;
+
+{.mfi
+ nop.m 999
+ (p8) fma.s.s0 f8= FR_R, FR_R, f0 // Create overflow
+ nop.i 999
+}
+// underflow: p6= 1
+{.mii
+ nop.m 0
+ (p6) mov GR_EXPMAX= 1
+ nop.i 0
+}
+;;
+
+{.mmb
+ nop.m 0
+ (p6) setf.exp FR_R= GR_EXPMAX
+ nop.b 999
+}
+;;
+
+{.mfb
+ nop.m 999
+ (p6) fma.s.s0 f8= FR_R, FR_R, f0 // Create underflow
+ (p6) br.ret.sptk b0 // will not call libm_error for underflow
+}
+;;
+// NOTE(review): the overflow path (p8) does not return here; it presumably falls through into __libm_error_region below - confirm
+GLOBAL_IEEE754_END(exp10f)
+weak_alias (exp10f, pow10f)
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+// Spills FR_Y, FR_X and FR_RESULT into a 64-byte frame, calls __libm_error_support, then reloads f8 from the result slot.
+.prologue
+{.mfi
+ add GR_Parameter_Y= -32, sp // Parameter 2 address (new sp+32)
+ nop.f 0
+.save ar.pfs, GR_SAVE_PFS
+ mov GR_SAVE_PFS= ar.pfs // Save ar.pfs
+}
+
+{.mfi
+.fframe 64
+ add sp= -64, sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP= gp // Save gp
+}
+;;
+
+{.mmi
+ stfs [ GR_Parameter_Y ]= FR_Y, 16 // STORE Parameter 2 on stack, then advance to sp+48
+ add GR_Parameter_X= 16, sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0= b0 // Save b0
+}
+;;
+
+.body
+{.mib
+ stfs [ GR_Parameter_X ]= FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT= 0, GR_Parameter_Y // Parameter 3 address (sp+48)
+ nop.b 0
+}
+{.mib
+ stfs [ GR_Parameter_Y ]= FR_RESULT // STORE Parameter 3 on stack
+ add GR_Parameter_Y= -16, GR_Parameter_Y // Back to Parameter 2 address (sp+32)
+ br.call.sptk b0= __libm_error_support# // Call error handling function
+}
+;;
+
+{.mmi
+ add GR_Parameter_RESULT= 48, sp // Address of the result slot (sp+48)
+ nop.m 0
+ nop.i 0
+}
+;;
+
+{.mmi
+ ldfs f8= [ GR_Parameter_RESULT ] // Get return result off stack
+.restore sp
+ add sp= 64, sp // Restore stack pointer
+ mov b0= GR_SAVE_B0 // Restore return address
+}
+;;
+
+{.mib
+ mov gp= GR_SAVE_GP // Restore gp
+ mov ar.pfs= GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+}
+;;
+
+
+LOCAL_LIBM_END(__libm_error_region)
+// External error handler called above
+.type __libm_error_support#, @function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_exp10l.S b/sysdeps/ia64/fpu/e_exp10l.S
new file mode 100644
index 0000000000..1b47258e73
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_exp10l.S
@@ -0,0 +1,805 @@
+.file "exp10l.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 08/25/00 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/06/03 Reordered header: .section, .global, .proc, .align
+// 05/08/03 Reformatted assembly source; corrected overflow result for round to
+// -inf and round to zero; exact results now don't set inexact flag
+//
+// API
+//==============================================================
+// long double exp10l(long double)
+//
+// Overview of operation
+//==============================================================
+// Background
+//
+// Implementation
+//
+// Let x= (K + f + r)/log2(10), where
+// K is an integer, f= 0.b1 b2... b8 (f>= 0),
+// and |r|<2^{-9}
+// T is a table that stores 2^f (256 entries) rounded to
+// double extended precision (only mantissa is stored)
+// D stores (2^f/T [ f ] - 1), rounded to single precision
+//
+// 10^x is approximated as
+// 2^K * T [ f ] * ((1+c1*r+c2*r^2+...+c6*r^6)*(1+c1*e)+D [ f ] ),
+// where e= log2(10)_lo*x+(log2(10)_hi*x-RN(log2(10)_hi*x))
+//
+
+
+
+// Special values
+//==============================================================
+// exp10(0)= 1
+// exp10(+inf)= inf
+// exp10(-inf)= 0
+//
+
+
+// Registers used
+//==============================================================
+// f6-f15, f32-f62
+// r14-r30, r32-r40
+// p6-p8, p12-p14
+//
+
+
+ FR_X = f10 // NOTE(review): same register as FR_LOG10 below - confirm this is intended
+ FR_Y = f1
+ FR_RESULT = f8
+ // Working floating-point registers
+ FR_COEFF1 = f6
+ FR_COEFF2 = f7
+ FR_KF0 = f9
+ FR_LOG10 = f10
+ FR_CONST1 = f11
+ FR_XL10 = f12
+ FR_COEFF3 = f13
+ FR_COEFF4 = f14
+ FR_UF_TEST = f15
+ FR_OF_TEST = f32
+ FR_L10_LOW = f33
+ FR_COEFF5 = f34
+ FR_COEFF6 = f35
+ FR_L10 = f36
+ FR_C_L10 = f37
+ FR_XL10_H = f38
+ FR_XL10_L = f39
+ FR_KF = f40
+ FR_E = f41
+ FR_T = f42
+ FR_D = f43
+ FR_EXP_M_63 = f44
+ FR_R = f45
+ FR_E1 = f46
+ FR_COEFF2 = f47 // NOTE(review): FR_COEFF2 is already assigned f7 above; this second assignment looks unintended - confirm
+ FR_P34 = f48
+ FR_P56 = f49
+ FR_R2 = f50
+ FR_RE = f51
+ FR_D1 = f52
+ FR_P36 = f53
+ FR_R3E = f54
+ FR_P1 = f55
+ FR_P = f56
+ FR_T1 = f57
+ FR_XINT = f58
+ FR_XINTF = f59
+ FR_4 = f60
+ FR_28 = f61
+ FR_32 = f62
+
+ // General-register aliases
+ GR_ADDR0 = r14
+ GR_D_ADDR = r15
+ GR_ADDR = r16
+ GR_B63 = r17
+ GR_KBITS = r18
+ GR_F = r19
+ GR_K = r20
+ GR_D = r21
+ GR_BM63 = r22
+ GR_T = r23
+ GR_CONST1 = r24
+ GR_EMIN = r25
+ GR_CONST2 = r26
+ GR_BM8 = r27
+ GR_SREG = r28
+ GR_4_BIAS = r29
+ GR_32_BIAS = r30
+
+ GR_SAVE_B0 = r33
+ GR_SAVE_PFS = r34
+ GR_SAVE_GP = r35
+ GR_SAVE_SP = r36
+
+ GR_Parameter_X = r37
+ GR_Parameter_Y = r38
+ GR_Parameter_RESULT= r39
+ GR_Parameter_TAG = r40
+
+
+// Data tables
+//==============================================================
+
+RODATA
+
+.align 16
+
+LOCAL_OBJECT_START(poly_coeffs)
+ // Layout: four 16-byte (significand, sign/exponent) pairs, four 8-byte doubles, one more 16-byte pair
+ data8 0xd49a784bcd1b8afe, 0x00004008 // log2(10)*2^8
+ data8 0x9a209a84fbcff798, 0x0000400b // overflow threshold
+ data8 0xb17217f7d1cf79ab, 0x00003ffe // C_1
+ data8 0xf5fdeffc162c7541, 0x00003ffc // C_2
+ data8 0x3fac6b08d704a0c0 // C_3
+ data8 0x3f83b2ab6fba4e77 // C_4
+ data8 0x3f55d87fe78a6731 // C_5
+ data8 0x3f2430912f86c787 // C_6
+ data8 0x9257edfe9b5fb698, 0x00003fbf // log2(10)_low (bits 64...127)
+LOCAL_OBJECT_END(poly_coeffs)
+
+
+LOCAL_OBJECT_START(T_table)
+ // 256 eight-byte entries indexed by f; only the 64-bit significand of each 2^f is stored
+ // 2^{0.b1 b2 b3 b4 b5 b6 b7 b8}
+ data8 0x8000000000000000, 0x8058d7d2d5e5f6b1
+ data8 0x80b1ed4fd999ab6c, 0x810b40a1d81406d4
+ data8 0x8164d1f3bc030773, 0x81bea1708dde6056
+ data8 0x8218af4373fc25ec, 0x8272fb97b2a5894c
+ data8 0x82cd8698ac2ba1d7, 0x83285071e0fc4547
+ data8 0x8383594eefb6ee37, 0x83dea15b9541b132
+ data8 0x843a28c3acde4046, 0x8495efb3303efd30
+ data8 0x84f1f656379c1a29, 0x854e3cd8f9c8c95d
+ data8 0x85aac367cc487b15, 0x86078a2f23642a9f
+ data8 0x8664915b923fba04, 0x86c1d919caef5c88
+ data8 0x871f61969e8d1010, 0x877d2afefd4e256c
+ data8 0x87db357ff698d792, 0x88398146b919f1d4
+ data8 0x88980e8092da8527, 0x88f6dd5af155ac6b
+ data8 0x8955ee03618e5fdd, 0x89b540a7902557a4
+ data8 0x8a14d575496efd9a, 0x8a74ac9a79896e47
+ data8 0x8ad4c6452c728924, 0x8b3522a38e1e1032
+ data8 0x8b95c1e3ea8bd6e7, 0x8bf6a434adde0085
+ data8 0x8c57c9c4646f4dde, 0x8cb932c1bae97a95
+ data8 0x8d1adf5b7e5ba9e6, 0x8d7ccfc09c50e2f8
+ data8 0x8ddf042022e69cd6, 0x8e417ca940e35a01
+ data8 0x8ea4398b45cd53c0, 0x8f073af5a2013520
+ data8 0x8f6a8117e6c8e5c4, 0x8fce0c21c6726481
+ data8 0x9031dc431466b1dc, 0x9095f1abc540ca6b
+ data8 0x90fa4c8beee4b12b, 0x915eed13c89689d3
+ data8 0x91c3d373ab11c336, 0x9228ffdc10a051ad
+ data8 0x928e727d9531f9ac, 0x92f42b88f673aa7c
+ data8 0x935a2b2f13e6e92c, 0x93c071a0eef94bc1
+ data8 0x9426ff0fab1c04b6, 0x948dd3ac8ddb7ed3
+ data8 0x94f4efa8fef70961, 0x955c5336887894d5
+ data8 0x95c3fe86d6cc7fef, 0x962bf1cbb8d97560
+ data8 0x96942d3720185a00, 0x96fcb0fb20ac4ba3
+ data8 0x97657d49f17ab08e, 0x97ce9255ec4357ab
+ data8 0x9837f0518db8a96f, 0x98a1976f7597e996
+ data8 0x990b87e266c189aa, 0x9975c1dd47518c77
+ data8 0x99e0459320b7fa65, 0x9a4b13371fd166ca
+ data8 0x9ab62afc94ff864a, 0x9b218d16f441d63d
+ data8 0x9b8d39b9d54e5539, 0x9bf93118f3aa4cc1
+ data8 0x9c6573682ec32c2d, 0x9cd200db8a0774cb
+ data8 0x9d3ed9a72cffb751, 0x9dabfdff6367a2aa
+ data8 0x9e196e189d472420, 0x9e872a276f0b98ff
+ data8 0x9ef5326091a111ae, 0x9f6386f8e28ba651
+ data8 0x9fd228256400dd06, 0xa041161b3d0121be
+ data8 0xa0b0510fb9714fc2, 0xa11fd9384a344cf7
+ data8 0xa18faeca8544b6e4, 0xa1ffd1fc25cea188
+ data8 0xa27043030c496819, 0xa2e102153e918f9e
+ data8 0xa3520f68e802bb93, 0xa3c36b345991b47c
+ data8 0xa43515ae09e6809e, 0xa4a70f0c95768ec5
+ data8 0xa5195786be9ef339, 0xa58bef536dbeb6ee
+ data8 0xa5fed6a9b15138ea, 0xa6720dc0be08a20c
+ data8 0xa6e594cfeee86b1e, 0xa7596c0ec55ff55b
+ data8 0xa7cd93b4e965356a, 0xa8420bfa298f70d1
+ data8 0xa8b6d5167b320e09, 0xa92bef41fa77771b
+ data8 0xa9a15ab4ea7c0ef8, 0xaa1717a7b5693979
+ data8 0xaa8d2652ec907629, 0xab0386ef48868de1
+ data8 0xab7a39b5a93ed337, 0xabf13edf162675e9
+ data8 0xac6896a4be3fe929, 0xace0413ff83e5d04
+ data8 0xad583eea42a14ac6, 0xadd08fdd43d01491
+ data8 0xae493452ca35b80e, 0xaec22c84cc5c9465
+ data8 0xaf3b78ad690a4375, 0xafb51906e75b8661
+ data8 0xb02f0dcbb6e04584, 0xb0a957366fb7a3c9
+ data8 0xb123f581d2ac2590, 0xb19ee8e8c94feb09
+ data8 0xb21a31a66618fe3b, 0xb295cff5e47db4a4
+ data8 0xb311c412a9112489, 0xb38e0e38419fae18
+ data8 0xb40aaea2654b9841, 0xb487a58cf4a9c180
+ data8 0xb504f333f9de6484, 0xb58297d3a8b9f0d2
+ data8 0xb60093a85ed5f76c, 0xb67ee6eea3b22b8f
+ data8 0xb6fd91e328d17791, 0xb77c94c2c9d725e9
+ data8 0xb7fbefca8ca41e7c, 0xb87ba337a1743834
+ data8 0xb8fbaf4762fb9ee9, 0xb97c143756844dbf
+ data8 0xb9fcd2452c0b9deb, 0xba7de9aebe5fea09
+ data8 0xbaff5ab2133e45fb, 0xbb81258d5b704b6f
+ data8 0xbc034a7ef2e9fb0d, 0xbc85c9c560e7b269
+ data8 0xbd08a39f580c36bf, 0xbd8bd84bb67ed483
+ data8 0xbe0f6809860993e2, 0xbe935317fc378238
+ data8 0xbf1799b67a731083, 0xbf9c3c248e2486f8
+ data8 0xc0213aa1f0d08db0, 0xc0a6956e8836ca8d
+ data8 0xc12c4cca66709456, 0xc1b260f5ca0fbb33
+ data8 0xc238d2311e3d6673, 0xc2bfa0bcfad907c9
+ data8 0xc346ccda24976407, 0xc3ce56c98d21b15d
+ data8 0xc4563ecc5334cb33, 0xc4de8523c2c07baa
+ data8 0xc5672a115506dadd, 0xc5f02dd6b0bbc3d9
+ data8 0xc67990b5aa245f79, 0xc70352f04336c51e
+ data8 0xc78d74c8abb9b15d, 0xc817f681416452b2
+ data8 0xc8a2d85c8ffe2c45, 0xc92e1a9d517f0ecc
+ data8 0xc9b9bd866e2f27a3, 0xca45c15afcc72624
+ data8 0xcad2265e4290774e, 0xcb5eecd3b38597c9
+ data8 0xcbec14fef2727c5d, 0xcc799f23d11510e5
+ data8 0xcd078b86503dcdd2, 0xcd95da6a9ff06445
+ data8 0xce248c151f8480e4, 0xceb3a0ca5dc6a55d
+ data8 0xcf4318cf191918c1, 0xcfd2f4683f94eeb5
+ data8 0xd06333daef2b2595, 0xd0f3d76c75c5db8d
+ data8 0xd184df6251699ac6, 0xd2164c023056bcab
+ data8 0xd2a81d91f12ae45a, 0xd33a5457a3029054
+ data8 0xd3ccf099859ac379, 0xd45ff29e0972c561
+ data8 0xd4f35aabcfedfa1f, 0xd5872909ab75d18a
+ data8 0xd61b5dfe9f9bce07, 0xd6aff9d1e13ba2fe
+ data8 0xd744fccad69d6af4, 0xd7da67311797f56a
+ data8 0xd870394c6db32c84, 0xd9067364d44a929c
+ data8 0xd99d15c278afd7b6, 0xda3420adba4d8704
+ data8 0xdacb946f2ac9cc72, 0xdb63714f8e295255
+ data8 0xdbfbb797daf23755, 0xdc9467913a4f1c92
+ data8 0xdd2d818508324c20, 0xddc705bcd378f7f0
+ data8 0xde60f4825e0e9124, 0xdefb4e1f9d1037f2
+ data8 0xdf9612deb8f04420, 0xe031430a0d99e627
+ data8 0xe0ccdeec2a94e111, 0xe168e6cfd3295d23
+ data8 0xe2055afffe83d369, 0xe2a23bc7d7d91226
+ data8 0xe33f8972be8a5a51, 0xe3dd444c46499619
+ data8 0xe47b6ca0373da88d, 0xe51a02ba8e26d681
+ data8 0xe5b906e77c8348a8, 0xe658797368b3a717
+ data8 0xe6f85aaaee1fce22, 0xe798aadadd5b9cbf
+ data8 0xe8396a503c4bdc68, 0xe8da9958464b42ab
+ data8 0xe97c38406c4f8c57, 0xea1e4756550eb27b
+ data8 0xeac0c6e7dd24392f, 0xeb63b74317369840
+ data8 0xec0718b64c1cbddc, 0xecaaeb8ffb03ab41
+ data8 0xed4f301ed9942b84, 0xedf3e6b1d418a491
+ data8 0xee990f980da3025b, 0xef3eab20e032bc6b
+ data8 0xefe4b99bdcdaf5cb, 0xf08b3b58cbe8b76a
+ data8 0xf13230a7ad094509, 0xf1d999d8b7708cc1
+ data8 0xf281773c59ffb13a, 0xf329c9233b6bae9c
+ data8 0xf3d28fde3a641a5b, 0xf47bcbbe6db9fddf
+ data8 0xf5257d152486cc2c, 0xf5cfa433e6537290
+ data8 0xf67a416c733f846e, 0xf7255510c4288239
+ data8 0xf7d0df730ad13bb9, 0xf87ce0e5b2094d9c
+ data8 0xf92959bb5dd4ba74, 0xf9d64a46eb939f35
+ data8 0xfa83b2db722a033a, 0xfb3193cc4227c3f4
+ data8 0xfbdfed6ce5f09c49, 0xfc8ec01121e447bb
+ data8 0xfd3e0c0cf486c175, 0xfdedd1b496a89f35
+ data8 0xfe9e115c7b8f884c, 0xff4ecb59511ec8a5
+LOCAL_OBJECT_END(T_table)
+
+
+LOCAL_OBJECT_START(D_table)
+ data4 0x00000000, 0x9f55c08f, 0x1e93ffa3, 0x1dcd43a8 // 256 single-precision entries: D[f]= 2^f/T[f] - 1
+ data4 0x1f751f79, 0x9f3cdd88, 0x9f43d155, 0x1eda222c
+ data4 0x1ef35513, 0x9f597895, 0x9e698881, 0x1ec71073
+ data4 0x1e50e371, 0x9dc01e19, 0x1de74133, 0x1e2f028c
+ data4 0x9edefb47, 0x1ebbac48, 0x9e8b0330, 0x9e9e9314
+ data4 0x1edc1d11, 0x1f098529, 0x9f52827c, 0x1f50050d
+ data4 0x1f301e8e, 0x1f5b64d1, 0x9f45e3ee, 0x9ef64d6d
+ data4 0x1d6ec5e8, 0x9e61ad9a, 0x1d44ccbb, 0x9e4a8bbb
+ data4 0x9cf11576, 0x9dcce7e7, 0x9d02ac90, 0x1f26ccf0
+ data4 0x9f0877c6, 0x9ddd62ae, 0x9f4b7fc3, 0x1ea8ef6b
+ data4 0x1ea4378d, 0x1ef6fc38, 0x1db99fd9, 0x1f22bf6f
+ data4 0x1f53e172, 0x1e85504a, 0x9f37cc75, 0x1f0c5e17
+ data4 0x1dde8aac, 0x9cb42bb2, 0x1e153cd7, 0x1eb62bba
+ data4 0x9e9b941b, 0x9ea80e3c, 0x1f508823, 0x1ec3fd36
+ data4 0x1e9ffaa1, 0x1e21e2eb, 0x9d948b1d, 0x9e8ac93a
+ data4 0x1ef7ee6f, 0x9e80dda3, 0x1f0814be, 0x1dc5ddfe
+ data4 0x1eedb9d1, 0x9f2aaa26, 0x9ea5b0fc, 0x1edf702e
+ data4 0x9e391201, 0x1f1316bb, 0x1ea27fb7, 0x9e05ed18
+ data4 0x9f199ed2, 0x1ee7fd7c, 0x1f003db6, 0x9eac3793
+ data4 0x9e5b8c10, 0x9f3af17c, 0x1bc9a8be, 0x1ee3c004
+ data4 0x9f19b1b2, 0x9f242ce9, 0x9ce67dd1, 0x9e4f6275
+ data4 0x1e20742c, 0x1eb9328a, 0x9f477153, 0x1d969718
+ data4 0x9f1e6c43, 0x1f2f67f4, 0x9f39c7e4, 0x9e3c4feb
+ data4 0x1da3956b, 0x9e7c685d, 0x1f280911, 0x9f0d8afb
+ data4 0x1e314b40, 0x9eb4f250, 0x9f1a34ad, 0x1ef5d5e7
+ data4 0x9f145496, 0x1e604827, 0x9f1e5195, 0x1e9c1fc0
+ data4 0x1efde521, 0x1e69b385, 0x1f316830, 0x9f244eae
+ data4 0x1f1787ec, 0x9e939971, 0x1f0bb393, 0x9f0511d6
+ data4 0x1ed919de, 0x1d8b7b28, 0x1e5ca4a9, 0x1e7c357b
+ data4 0x9e3ff8e8, 0x1eef53b5, 0x9ed22ed7, 0x1f16659b
+ data4 0x9f2db102, 0x9e2c6a78, 0x1f328d7d, 0x9f2fec3c
+ data4 0x1eb395bd, 0x9f242b84, 0x9e2683e6, 0x1ed71e68
+ data4 0x1efd1df5, 0x9e9eeafd, 0x9ed2249c, 0x1eef129a
+ data4 0x1d1ea44c, 0x9e81f7ff, 0x1eaf77c9, 0x9ee7a285
+ data4 0x1e1864ed, 0x9ee7edbb, 0x9e15a27d, 0x9ae61655
+ data4 0x1f1ff1a2, 0x1da29755, 0x9e5f46fb, 0x1e901236
+ data4 0x9eecfb9b, 0x9f204d2f, 0x1ec64685, 0x9eb809bd
+ data4 0x9e0026c5, 0x1d9f1da1, 0x1f142b49, 0x9f20f22e
+ data4 0x1f24b067, 0x1f185a4c, 0x9f09765c, 0x9ece902f
+ data4 0x1e2ca5db, 0x1e6de464, 0x9f071f67, 0x1f1518c3
+ data4 0x1ea13ded, 0x1f0b8414, 0x1edb6ad4, 0x9e548740
+ data4 0x9ea10efb, 0x1ee48a60, 0x1e7954c5, 0x9edad013
+ data4 0x9f21517d, 0x9e9b6e0c, 0x9ee7f9a6, 0x9ebd4298
+ data4 0x9d65b24e, 0x1eed751f, 0x9f1573ea, 0x9d430377
+ data4 0x9e13fc0c, 0x1e47008a, 0x1e3d5c1d, 0x1ef41a91
+ data4 0x9e4a4ef7, 0x9e952f18, 0x1d620566, 0x1d9b8d33
+ data4 0x1db06247, 0x1e94b31e, 0x1f0730ad, 0x9d79ffb4
+ data4 0x1ed64d51, 0x9e91fd11, 0x9e28d35a, 0x9dea0ed9
+ data4 0x1e891def, 0x9ee28ac0, 0x1e1db99b, 0x9ee1ce38
+ data4 0x9bdd9bca, 0x1eb72cb9, 0x9e8c53c6, 0x1e0df6ca
+ data4 0x1e8f2ccd, 0x9e9b0886, 0x1eeb3bc7, 0x1ec7e772
+ data4 0x9e210776, 0x9daf246c, 0x1ea1f151, 0x1ece4dc6
+ data4 0x1ce741c8, 0x1ed3c88f, 0x9ec9a4fd, 0x9e0c8d30
+ data4 0x1d2fbb26, 0x9ef212a7, 0x1ee44f1c, 0x9e445550
+ data4 0x1e075f77, 0x9d9291a3, 0x1f09c2ee, 0x9e012c88
+ data4 0x1f057d62, 0x9e7bb0dc, 0x9d8758ee, 0x1ee8d6c1
+ data4 0x9e509a57, 0x9e4ca7b7, 0x1e2cb341, 0x9ec35106
+ data4 0x1ecf3baf, 0x1e11781c, 0x1ea0cc78, 0x1eb75ca6
+ data4 0x1e961e1a, 0x1eb88853, 0x1e7abf50, 0x1ee38704
+ data4 0x9dc5ab0f, 0x1afe197b, 0x9ec07523, 0x9d9b7f78
+ data4 0x1f011618, 0x1ed43b0b, 0x9f035945, 0x9e3fd014
+ data4 0x9bbda5cd, 0x9e83f8ab, 0x1e58a928, 0x1e392d61
+ data4 0x1efdbb52, 0x1ee310a8, 0x9ec7ecc1, 0x1e8c9ed6
+ data4 0x9ef82dee, 0x9e70545b, 0x9ea53fc4, 0x1e40f419
+LOCAL_OBJECT_END(D_table)
+
+
+
+.section .text
+GLOBAL_IEEE754_ENTRY(exp10l)
+// exp10l(x): NaN/Inf/zero go to SPECIAL_EXP10, out-of-range x to OUT_RANGE_EXP10; otherwise result = T+T*P
+{.mfi
+ alloc GR_SREG = ar.pfs, 1, 4, 4, 0
+ // will continue only for normal/denormal numbers
+ fclass.nm.unc p12, p7 = f8, 0x1b
+ // GR_ADDR0 = pointer to log2(10), C_1...C_6 followed by T_table
+ addl GR_ADDR0 = @ltoff(poly_coeffs), gp ;;
+}
+
+{.mfi
+ // load start address for C_1...C_6 followed by T_table
+ ld8 GR_ADDR0 = [ GR_ADDR0 ]
+ // X<0 ?
+ fcmp.lt.s1 p6, p8 = f8, f0
+ // GR_BM8 = bias-8
+ mov GR_BM8 = 0xffff-8
+}
+{.mlx
+ nop.m 0
+ // GR_EMIN = (-2^14-62)*2^{8}
+ movl GR_EMIN = 0xca807c00 ;;
+}
+
+{.mmb
+ // FR_CONST1 = 2^{-8}
+ setf.exp FR_CONST1 = GR_BM8
+ // load log2(10)*2^8
+ ldfe FR_LOG10 = [ GR_ADDR0 ], 16
+ (p12) br.cond.spnt SPECIAL_EXP10 ;;
+}
+
+{.mmf
+ setf.s FR_UF_TEST = GR_EMIN
+ // load overflow threshold
+ ldfe FR_OF_TEST = [ GR_ADDR0 ], 16
+ // normalize x
+ fma.s0 f8 = f8, f1, f0 ;;
+}
+
+{.mmi
+ // load C_1
+ ldfe FR_COEFF1 = [ GR_ADDR0 ], 16 ;;
+ // load C_2
+ ldfe FR_COEFF2 = [ GR_ADDR0 ], 16
+ nop.i 0 ;;
+}
+
+{.mmf
+ // GR_D_ADDR = pointer to D table
+ add GR_D_ADDR = 2048-64+96+16, GR_ADDR0
+ // load C_3, C_4
+ ldfpd FR_COEFF3, FR_COEFF4 = [ GR_ADDR0 ], 16
+ // y = x*log2(10)*2^8
+ fma.s1 FR_XL10 = f8, FR_LOG10, f0 ;;
+}
+
+{.mfi
+ // load C_5, C_6
+ ldfpd FR_COEFF5, FR_COEFF6 = [ GR_ADDR0 ], 16
+ // get int(x)
+ fcvt.fx.trunc.s1 FR_XINT = f8
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // FR_L10 = log2(10) = (log2(10)*2^8) * 2^{-8}
+ fma.s1 FR_L10 = FR_LOG10, FR_CONST1, f0
+ nop.i 0 ;;
+}
+
+{.mfi
+ // load log2(10)_low
+ ldfe FR_L10_LOW = [ GR_ADDR0 ], 16
+ // y0 = x*log2(10)_hi (FR_LOG10 is reused to hold y0)
+ fma.s1 FR_LOG10 = f8, FR_L10, f0
+ mov GR_EMIN = 0xffff-63
+}
+{.mfi
+ mov GR_32_BIAS = 0xffff + 5
+ // (K+f)*2^8 = round_to_int(y)
+ fcvt.fx.s1 FR_KF0 = FR_XL10
+ mov GR_4_BIAS = 0xffff + 2;;
+}
+
+{.mfi
+ nop.m 0
+ // x>overflow threshold ?
+ fcmp.gt.s1 p12, p7 = f8, FR_OF_TEST
+ nop.i 0 ;;
+}
+
+{.mfi
+ setf.exp FR_32 = GR_32_BIAS
+ // x<underflow threshold ?
+ (p7) fcmp.lt.s1 p12, p7 = FR_XL10, FR_UF_TEST
+ nop.i 0 ;;
+}
+
+{.mfi
+ setf.exp FR_4 = GR_4_BIAS
+ fcvt.xf FR_XINTF = FR_XINT
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // FR_L10 = log2(10)_h*x-RN(log2(10)_h*x)
+ fms.s1 FR_L10 = f8, FR_L10, FR_LOG10
+ nop.i 0 ;;
+}
+
+{.mfi
+ getf.sig GR_BM8 = FR_KF0 // GR_BM8 (reused) = (K+f)*2^8 as an integer
+ fcvt.xf FR_KF0 = FR_KF0 // (K+f)*2^8 back in floating-point form
+ mov GR_CONST2 = 255 ;;
+}
+
+{.mfi
+ // GR_CONST2 = f
+ and GR_CONST2 = GR_CONST2, GR_BM8
+ // FR_L10_LOW = e = log2(10)_l*x+(log2(10)_h*x-RN(log2(10)_h*x))
+ fma.s1 FR_L10_LOW = FR_L10_LOW, f8, FR_L10
+ // GR_BM8 = K
+ shr GR_BM8 = GR_BM8, 8 ;;
+}
+
+{.mmi
+ // address of D
+ shladd GR_D_ADDR = GR_CONST2, 2, GR_D_ADDR
+ // K += bias-63
+ add GR_BM8 = GR_BM8, GR_EMIN
+ // address of T
+ shladd GR_ADDR0 = GR_CONST2, 3, GR_ADDR0 ;;
+}
+
+{.mfb
+ // load D
+ ldfs FR_OF_TEST = [ GR_D_ADDR ]
+ // is input an integer ?
+ fcmp.eq.s1 p13, p14 = f8, FR_XINTF
+ (p12) br.cond.spnt OUT_RANGE_EXP10 ;;
+}
+
+{.mmf
+ // load T
+ ldf8 FR_UF_TEST = [ GR_ADDR0 ]
+ // FR_XL10 = 2^{K-63}
+ setf.exp FR_XL10 = GR_BM8
+ // r = x*log2(10)_hi-2^{-8}* [ (K+f)*2^{8} ]
+ fnma.s1 FR_KF0 = FR_KF0, FR_CONST1, FR_LOG10 ;;
+}
+
+{.mfi
+ nop.m 0
+ // get 28.0 = 32.0 - 4.0
+ fms.s1 FR_28 = FR_32, f1, FR_4
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // E = 1+C_1*e
+ fma.s1 FR_L10 = FR_L10_LOW, FR_COEFF1, f1
+ nop.i 0 ;;
+}
+
+{.mfi
+ nop.m 0
+ // P12 = C_1+C_2*r
+ fma.s1 FR_COEFF2 = FR_COEFF2, FR_KF0, FR_COEFF1
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // P34 = C_3+C_4*r
+ fma.s1 FR_COEFF4 = FR_COEFF4, FR_KF0, FR_COEFF3
+ nop.i 0 ;;
+}
+
+{.mfi
+ nop.m 0
+ // P56 = C_5+C_6*r
+ fma.s1 FR_COEFF5 = FR_COEFF6, FR_KF0, FR_COEFF5
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // r2 = r*r (held in FR_COEFF3)
+ fma.s1 FR_COEFF3 = FR_KF0, FR_KF0, f0
+ nop.i 0 ;;
+}
+
+{.mfi
+ nop.m 0
+ // if input is integer, is it positive ?
+ (p13) fcmp.ge.s1 p13, p14 = f8, f0
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // r' = r*E
+ fma.s1 FR_KF0 = FR_KF0, FR_L10, f0
+ nop.i 0 ;;
+}
+
+{.mfi
+ nop.m 0
+ // D' = D+C_1*e
+ fma.s1 FR_OF_TEST = FR_L10_LOW, FR_COEFF1, FR_OF_TEST
+ nop.i 0 ;;
+}
+
+{.mfi
+ nop.m 0
+ // P36 = P34+r2*P56
+ fma.s1 FR_COEFF4 = FR_COEFF5, FR_COEFF3, FR_COEFF4
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // r3 = r'*r2 (held in FR_COEFF3)
+ fma.s1 FR_COEFF3 = FR_COEFF3, FR_KF0, f0
+ nop.i 0 ;;
+}
+
+{.mfi
+ nop.m 0
+ // is input below 28.0 ?
+ (p13) fcmp.lt.s1 p13, p14 = f8, FR_28
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // P' = P12*r'+D'
+ fma.s1 FR_COEFF2 = FR_COEFF2, FR_KF0, FR_OF_TEST
+ nop.i 0 ;;
+}
+
+{.mfi
+ nop.m 0
+ // P = P'+r3*P36
+ fma.s1 FR_COEFF3 = FR_COEFF3, FR_COEFF4, FR_COEFF2
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // T = 2^{K-63}*T
+ fma.s1 FR_UF_TEST = FR_UF_TEST, FR_XL10, f0
+ nop.i 0 ;;
+}
+
+.pred.rel "mutex",p13,p14
+{.mfi
+ nop.m 0
+ (p13) fma.s1 f8 = FR_COEFF3, FR_UF_TEST, FR_UF_TEST // non-negative integer x below 28: same T+T*P, computed in status field s1
+ nop.i 0
+}
+{.mfb
+ nop.m 0
+ // result = T+T*P
+ (p14) fma.s0 f8 = FR_COEFF3, FR_UF_TEST, FR_UF_TEST
+ // return
+ br.ret.sptk b0 ;;
+}
+
+
+SPECIAL_EXP10:
+
+{.mfi
+ nop.m 0
+ // x = -Infinity ?
+ fclass.m p6, p0 = f8, 0x22
+ nop.i 0 ;;
+}
+
+{.mfi
+ nop.m 0
+ // x = +Infinity ?
+ fclass.m p7, p0 = f8, 0x21
+ nop.i 0 ;;
+}
+
+{.mfi
+ nop.m 0
+ // x = +/-Zero ?
+ fclass.m p8, p0 = f8, 0x7
+ nop.i 0
+}
+{.mfb
+ nop.m 0
+ // exp10(-Infinity) = 0
+ (p6) mov f8 = f0
+ (p6) br.ret.spnt b0 ;;
+}
+
+{.mfb
+ nop.m 0
+ // exp10(+Infinity) = +Infinity
+ nop.f 0
+ (p7) br.ret.spnt b0 ;;
+}
+
+{.mfb
+ nop.m 0
+ // exp10(+/-0) = 1
+ (p8) mov f8 = f1
+ (p8) br.ret.spnt b0 ;;
+}
+
+{.mfb
+ nop.m 0
+ // Remaining cases: NaNs
+ fma.s0 f8 = f8, f1, f0
+ br.ret.sptk b0 ;;
+}
+
+
+OUT_RANGE_EXP10:
+
+{.mii
+ // overflow: p8 = 1 (x >= 0, from the X<0 test at entry)
+ (p8) mov GR_CONST1 = 0x1fffe
+ nop.i 0
+ nop.i 0 ;;
+}
+
+{.mmb
+ (p8) mov GR_Parameter_TAG = 165
+ (p8) setf.exp FR_KF0 = GR_CONST1
+ nop.b 999 ;;
+}
+
+{.mfi
+ nop.m 999
+ (p8) fma.s0 f8 = FR_KF0, FR_KF0, f0 // Create overflow
+ nop.i 999
+}
+{.mii
+ nop.m 0
+ // underflow: p6 = 1 (x < 0)
+ (p6) mov GR_CONST1 = 1
+ nop.i 0 ;;
+}
+
+{.mmb
+ nop.m 0
+ (p6) setf.exp FR_KF0 = GR_CONST1
+ nop.b 999 ;;
+}
+
+{.mfb
+ nop.m 999
+ (p6) fma.s0 f8 = FR_KF0, FR_KF0, f0 // Create underflow
+ // will not call libm_error for underflow; only the overflow (p8) path continues past this return
+ (p6) br.ret.sptk b0 ;;
+}
+
+GLOBAL_IEEE754_END(exp10l)
+weak_alias (exp10l, pow10l)
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+.prologue
+{.mfi
+ add GR_Parameter_Y = -32, sp // Parameter 2 address (sp - 32, taken before the frame is created)
+ nop.f 0
+.save ar.pfs, GR_SAVE_PFS
+ mov GR_SAVE_PFS = ar.pfs // Save ar.pfs
+}
+
+{.mfi
+.fframe 64
+ add sp = -64, sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP = gp ;; // Save gp
+}
+
+{.mmi
+ stfe [ GR_Parameter_Y ] = FR_Y, 16 // STORE Parameter 2 on stack
+ add GR_Parameter_X = 16, sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0 = b0 ;; // Save b0
+}
+
+.body
+{.mib
+ stfe [ GR_Parameter_X ] = FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0, GR_Parameter_Y // Parameter 3 address
+ nop.b 0
+}
+{.mib
+ stfe [ GR_Parameter_Y ] = FR_RESULT // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16, GR_Parameter_Y // Back to Parameter 2 address (undo the post-increment)
+ br.call.sptk b0 = __libm_error_support# ;; // Call error handling function
+}
+
+{.mmi
+ add GR_Parameter_RESULT = 48, sp // Parameter 3 (result) address
+ nop.m 0
+ nop.i 0 ;;
+}
+
+{.mmi
+ ldfe f8 = [ GR_Parameter_RESULT ] // Get return result off stack
+.restore sp
+ add sp = 64, sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 ;; // Restore return address
+}
+
+{.mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 ;; // Return
+}
+
+
+LOCAL_LIBM_END(__libm_error_region)
+.type __libm_error_support#, @function
+.global __libm_error_support#
+
diff --git a/sysdeps/ia64/fpu/e_exp2.S b/sysdeps/ia64/fpu/e_exp2.S
new file mode 100644
index 0000000000..e4a1dadd73
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_exp2.S
@@ -0,0 +1,563 @@
+.file "exp2.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 08/25/00 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 09/05/02 Improved performance
+// 01/17/03 Fixed to call error support when x=1024.0
+//
+// API
+//==============================================================
+// double exp2(double)
+//
+// Overview of operation
+//==============================================================
+// Background
+//
+// Implementation
+//
+// Let x= (K + fh + fl + r), where
+// K is an integer, fh= 0.b1 b2 b3 b4 b5,
+// fl= 2^{-5}* 0.b6 b7 b8 b9 b10 (fh, fl >= 0),
+// and |r|<2^{-11}
+// Th is a table that stores 2^fh (32 entries) rounded to
+// double extended precision (only mantissa is stored)
+// Tl is a table that stores 2^fl (32 entries) rounded to
+// double extended precision (only mantissa is stored)
+//
+// 2^x is approximated as
+// 2^K * Th [ fh ] * Tl [ fl ] * (1+c1*r+c2*r^2+c3*r^3+c4*r^4)
+
+// Note: We use the following trick to speed up conversion from FP to integer:
+//
+// Let x = K + r, where K is an integer, and |r| <= 0.5
+// Let N be the number of significand bits for the FP format used
+// ( N=64 for double-extended, N=53 for double)
+//
+// Then let y = 1.5 * 2^(N-1) + x for RN mode
+// K = y - 1.5 * 2^(N-1)
+// r = x - K
+//
+// If we want to obtain the integer part and the first m fractional bits of x,
+// we can use the same trick, but with a constant of 1.5 * 2^(N-1-m):
+//
+// Let x = K + f + r
+// f = 0.b_1 b_2 ... b_m
+// |r| <= 2^(-m-1)
+//
+// Then let y = 1.5 * 2^(N-1-m) + x for RN mode
+// (K+f) = y - 1.5 * 2^(N-1-m)
+// r = x - (K+f)
+
+
+// Special values
+//==============================================================
+// exp2(0)= 1
+// exp2(+inf)= inf
+// exp2(-inf)= 0
+//
+
+// Registers used
+//==============================================================
+// r2-r3, r14-r40
+// f6-f15, f32-f45
+// p6-p8, p12
+//
+
+
+GR_TBL_START = r2
+GR_LOG_TBL = r3
+
+GR_OF_LIMIT = r14
+GR_UF_LIMIT = r15
+GR_EXP_CORR = r16
+GR_F_low = r17
+GR_F_high = r18
+GR_K = r19
+GR_Flow_ADDR = r20
+
+GR_BIAS = r21
+GR_Fh = r22
+GR_Fh_ADDR = r23
+GR_EXPMAX = r24
+GR_EMIN = r25
+
+GR_ROUNDVAL = r26
+GR_MASK = r27
+GR_KF0 = r28
+GR_MASK_low = r29
+GR_COEFF_START = r30
+
+GR_SAVE_B0 = r33
+GR_SAVE_PFS = r34
+GR_SAVE_GP = r35
+GR_SAVE_SP = r36
+
+GR_Parameter_X = r37
+GR_Parameter_Y = r38
+GR_Parameter_RESULT = r39
+GR_Parameter_TAG = r40
+
+
+FR_X = f10
+FR_Y = f1
+FR_RESULT = f8
+
+
+FR_COEFF1 = f6
+FR_COEFF2 = f7
+FR_R = f9
+
+FR_KF0 = f12
+FR_COEFF3 = f13
+FR_COEFF4 = f14
+FR_UF_LIMIT = f15
+
+FR_OF_LIMIT = f32
+FR_EXPMIN = f33
+FR_ROUNDVAL = f34
+FR_KF = f35
+
+FR_2_TO_K = f36
+FR_T_low = f37
+FR_T_high = f38
+FR_P34 = f39
+FR_R2 = f40
+
+FR_P12 = f41
+FR_T_low_K = f42
+FR_P14 = f43
+FR_T = f44
+FR_P = f45
+
+
+// Data tables
+//==============================================================
+
+RODATA
+
+.align 16
+
+LOCAL_OBJECT_START(poly_coeffs)
+
+data8 0x3fac6b08d704a0c0, 0x3f83b2ab6fba4e77 // C_3 and C_4
+data8 0xb17217f7d1cf79ab, 0x00003ffe // C_1
+data8 0xf5fdeffc162c7541, 0x00003ffc // C_2
+LOCAL_OBJECT_END(poly_coeffs)
+
+
+LOCAL_OBJECT_START(T_table)
+
+// 2^{0.00000 b6 b7 b8 b9 b10}
+data8 0x8000000000000000, 0x8016302f17467628
+data8 0x802c6436d0e04f50, 0x80429c17d77c18ed
+data8 0x8058d7d2d5e5f6b0, 0x806f17687707a7af
+data8 0x80855ad965e88b83, 0x809ba2264dada76a
+data8 0x80b1ed4fd999ab6c, 0x80c83c56b50cf77f
+data8 0x80de8f3b8b85a0af, 0x80f4e5ff089f763e
+data8 0x810b40a1d81406d4, 0x81219f24a5baa59d
+data8 0x813801881d886f7b, 0x814e67cceb90502c
+data8 0x8164d1f3bc030773, 0x817b3ffd3b2f2e47
+data8 0x8191b1ea15813bfd, 0x81a827baf7838b78
+data8 0x81bea1708dde6055, 0x81d51f0b8557ec1c
+data8 0x81eba08c8ad4536f, 0x820225f44b55b33b
+data8 0x8218af4373fc25eb, 0x822f3c7ab205c89a
+data8 0x8245cd9ab2cec048, 0x825c62a423d13f0c
+data8 0x8272fb97b2a5894c, 0x828998760d01faf3
+data8 0x82a0393fe0bb0ca8, 0x82b6ddf5dbc35906
+//
+//
+// 2^{0.b1 b2 b3 b4 b5}
+data8 0x8000000000000000, 0x82cd8698ac2ba1d7
+data8 0x85aac367cc487b14, 0x88980e8092da8527
+data8 0x8b95c1e3ea8bd6e6, 0x8ea4398b45cd53c0
+data8 0x91c3d373ab11c336, 0x94f4efa8fef70961
+data8 0x9837f0518db8a96f, 0x9b8d39b9d54e5538
+data8 0x9ef5326091a111ad, 0xa27043030c496818
+data8 0xa5fed6a9b15138ea, 0xa9a15ab4ea7c0ef8
+data8 0xad583eea42a14ac6, 0xb123f581d2ac258f
+data8 0xb504f333f9de6484, 0xb8fbaf4762fb9ee9
+data8 0xbd08a39f580c36be, 0xc12c4cca66709456
+data8 0xc5672a115506dadd, 0xc9b9bd866e2f27a2
+data8 0xce248c151f8480e3, 0xd2a81d91f12ae45a
+data8 0xd744fccad69d6af4, 0xdbfbb797daf23755
+data8 0xe0ccdeec2a94e111, 0xe5b906e77c8348a8
+data8 0xeac0c6e7dd24392e, 0xefe4b99bdcdaf5cb
+data8 0xf5257d152486cc2c, 0xfa83b2db722a033a
+LOCAL_OBJECT_END(T_table)
+
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(exp2)
+// exp2(x), double: NaN/Inf/zero go to SPECIAL_exp2, out-of-range x to OUT_RANGE_exp2; otherwise result = T+T*P
+
+{.mfi
+ alloc r32= ar.pfs, 1, 4, 4, 0
+ // will continue only for non-zero normal/denormal numbers
+ fclass.nm p12, p0= f8, 0x1b
+ // GR_TBL_START= pointer to C_1...C_4 followed by T_table
+ addl GR_TBL_START= @ltoff(poly_coeffs), gp
+}
+{.mlx
+ mov GR_OF_LIMIT= 0xffff + 10 // Exponent of overflow limit
+ movl GR_ROUNDVAL= 0x5a400000 // 1.5*2^(63-10) (SP)
+}
+;;
+
+// Form special constant 1.5*2^(63-10) to give integer part and first 10
+// fractional bits of x
+{.mfi
+ setf.s FR_ROUNDVAL= GR_ROUNDVAL // Form special constant
+ fcmp.lt.s1 p6, p8= f8, f0 // X<0 ?
+ nop.i 0
+}
+{.mfb
+ ld8 GR_COEFF_START= [ GR_TBL_START ] // Load pointer to coeff table
+ nop.f 0
+ (p12) br.cond.spnt SPECIAL_exp2 // Branch if nan, inf, zero
+}
+;;
+
+{.mlx
+ setf.exp FR_OF_LIMIT= GR_OF_LIMIT // Set overflow limit
+ movl GR_UF_LIMIT= 0xc4866000 // (-2^10-51) = -1075
+}
+;;
+
+{.mfi
+ ldfpd FR_COEFF3, FR_COEFF4= [ GR_COEFF_START ], 16 // load C_3, C_4
+ fma.s0 f8= f8, f1, f0 // normalize x
+ nop.i 0
+}
+;;
+
+{.mmi
+ setf.s FR_UF_LIMIT= GR_UF_LIMIT // Set underflow limit
+ ldfe FR_COEFF1= [ GR_COEFF_START ], 16 // load C_1
+ mov GR_EXP_CORR= 0xffff-126
+}
+;;
+
+{.mfi
+ ldfe FR_COEFF2= [ GR_COEFF_START ], 16 // load C_2
+ fma.s1 FR_KF0= f8, f1, FR_ROUNDVAL // y= x + 1.5*2^(63-10)
+ nop.i 0
+}
+;;
+
+{.mfi
+ mov GR_MASK= 1023
+ fms.s1 FR_KF= FR_KF0, f1, FR_ROUNDVAL // (K+f)
+ mov GR_MASK_low= 31
+}
+;;
+
+{.mfi
+ getf.sig GR_KF0= FR_KF0 // (K+f)*2^10= round_to_int(y)
+ fcmp.ge.s1 p12, p7= f8, FR_OF_LIMIT // x >= overflow threshold ?
+ add GR_LOG_TBL= 256, GR_COEFF_START // Pointer to high T_table
+}
+;;
+
+{.mmi
+ and GR_F_low= GR_KF0, GR_MASK_low // f_low
+ and GR_F_high= GR_MASK, GR_KF0 // f_high*32
+ shr GR_K= GR_KF0, 10 // K
+}
+;;
+
+{.mmi
+ shladd GR_Flow_ADDR= GR_F_low, 3, GR_COEFF_START // address of 2^{f_low}
+ add GR_BIAS= GR_K, GR_EXP_CORR // K + bias - 2*63
+ shr GR_Fh= GR_F_high, 5 // f_high
+}
+;;
+
+{.mfi
+ setf.exp FR_2_TO_K= GR_BIAS // 2^{K-126}
+ fnma.s1 FR_R= FR_KF, f1, f8 // r= x - (K+f)
+ shladd GR_Fh_ADDR= GR_Fh, 3, GR_LOG_TBL // address of 2^{f_high}
+}
+{.mlx
+ ldf8 FR_T_low= [ GR_Flow_ADDR ] // load T_low= 2^{f_low}
+ movl GR_EMIN= 0xc47f8000 // EMIN= -1022
+}
+;;
+
+{.mfi
+ ldf8 FR_T_high= [ GR_Fh_ADDR ] // load T_high= 2^{f_high}
+ (p7) fcmp.lt.s1 p12, p7= f8, FR_UF_LIMIT // x<underflow threshold ?
+ nop.i 0
+}
+;;
+
+{.mfi
+ setf.s FR_EXPMIN= GR_EMIN // FR_EXPMIN= EMIN
+ fma.s1 FR_P34= FR_COEFF4, FR_R, FR_COEFF3 // P34= C_3+C_4*r
+ nop.i 0
+}
+{.mfb
+ nop.m 0
+ fma.s1 FR_R2= FR_R, FR_R, f0 // r*r
+ (p12) br.cond.spnt OUT_RANGE_exp2
+}
+;;
+
+{.mfi
+ nop.m 0
+ fma.s1 FR_P12= FR_COEFF2, FR_R, FR_COEFF1 // P12= C_1+C_2*r
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ fma.s1 FR_T_low_K= FR_T_low, FR_2_TO_K, f0 // T= 2^{K-126}*T_low
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ fma.s1 FR_P14= FR_R2, FR_P34, FR_P12 // P14= P12+r2*P34
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ fma.s1 FR_T= FR_T_low_K, FR_T_high, f0 // T= T*T_high
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ fcmp.lt.s0 p6, p8= f8, FR_EXPMIN // underflow (x<EMIN) ?
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ fma.s1 FR_P= FR_P14, FR_R, f0 // P= P14*r
+ nop.i 0
+}
+;;
+
+{.mfb
+ nop.m 0
+ fma.d.s0 f8= FR_P, FR_T, FR_T // result= T+T*P
+ (p8) br.ret.sptk b0 // return unless x<EMIN
+}
+;;
+
+{.mfb
+ (p6) mov GR_Parameter_TAG= 162
+ nop.f 0
+ (p6) br.cond.sptk __libm_error_region
+}
+;;
+
+
+SPECIAL_exp2:
+{.mfi
+ nop.m 0
+ fclass.m p6, p0= f8, 0x22 // x= -Infinity ?
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ fclass.m p7, p0= f8, 0x21 // x= +Infinity ?
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ fclass.m p8, p0= f8, 0x7 // x= +/-Zero ?
+ nop.i 0
+}
+{.mfb
+ nop.m 0
+ (p6) mov f8= f0 // exp2(-Infinity)= 0
+ (p6) br.ret.spnt b0
+}
+;;
+
+{.mfb
+ nop.m 0
+ nop.f 0
+ (p7) br.ret.spnt b0 // exp2(+Infinity)= +Infinity
+}
+;;
+
+{.mfb
+ nop.m 0
+ (p8) mov f8= f1 // exp2(+/-0)= 1
+ (p8) br.ret.spnt b0
+}
+;;
+
+{.mfb
+ nop.m 0
+ fma.d.s0 f8= f8, f1, f0 // Remaining cases: NaNs
+ br.ret.sptk b0
+}
+;;
+
+
+OUT_RANGE_exp2:
+
+// overflow: p8= 1 (x >= 0, from the X<0 test at entry)
+
+{.mii
+ (p8) mov GR_EXPMAX= 0x1fffe
+ nop.i 0
+ nop.i 0
+}
+;;
+
+{.mmb
+ (p8) mov GR_Parameter_TAG= 161
+ (p8) setf.exp FR_R= GR_EXPMAX
+ nop.b 999
+}
+;;
+
+{.mfi
+ nop.m 999
+ (p8) fma.d.s0 f8= FR_R, FR_R, f0 // Create overflow
+ nop.i 999
+}
+// underflow: p6= 1 (x < 0)
+{.mii
+ (p6) mov GR_Parameter_TAG= 162
+ (p6) mov GR_EXPMAX= 1
+ nop.i 0
+}
+;;
+
+{.mmb
+ nop.m 0
+ (p6) setf.exp FR_R= GR_EXPMAX
+ nop.b 999
+}
+;;
+
+{.mfb
+ nop.m 999
+ (p6) fma.d.s0 f8= FR_R, FR_R, f0 // Create underflow
+ nop.b 0
+}
+;;
+
+GLOBAL_LIBM_END(exp2)
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+// Spill FR_Y, FR_X and FR_RESULT to a 64-byte frame, call __libm_error_support, reload f8 from the result slot
+.prologue
+{.mfi
+ add GR_Parameter_Y= -32, sp // Parameter 2 address (sp - 32, taken before the frame is created)
+ nop.f 0
+.save ar.pfs, GR_SAVE_PFS
+ mov GR_SAVE_PFS= ar.pfs // Save ar.pfs
+}
+
+{.mfi
+.fframe 64
+ add sp= -64, sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP= gp // Save gp
+}
+;;
+
+{.mmi
+ stfd [ GR_Parameter_Y ]= FR_Y, 16 // STORE Parameter 2 on stack
+ add GR_Parameter_X= 16, sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0= b0 // Save b0
+}
+;;
+
+.body
+{.mib
+ stfd [ GR_Parameter_X ]= FR_X // STORE Parameter 1 on stack; NOTE(review): FR_X (f10) is not written by the exp2 code in this file - confirm it holds x
+ add GR_Parameter_RESULT= 0, GR_Parameter_Y // Parameter 3 address
+ nop.b 0
+}
+{.mib
+ stfd [ GR_Parameter_Y ]= FR_RESULT // STORE Parameter 3 on stack
+ add GR_Parameter_Y= -16, GR_Parameter_Y // Back to Parameter 2 address (undo the post-increment)
+ br.call.sptk b0= __libm_error_support# // Call error handling function
+}
+;;
+
+{.mmi
+ add GR_Parameter_RESULT= 48, sp // Parameter 3 (result) address
+ nop.m 0
+ nop.i 0
+}
+;;
+
+{.mmi
+ ldfd f8= [ GR_Parameter_RESULT ] // Get return result off stack
+.restore sp
+ add sp= 64, sp // Restore stack pointer
+ mov b0= GR_SAVE_B0 // Restore return address
+}
+;;
+
+{.mib
+ mov gp= GR_SAVE_GP // Restore gp
+ mov ar.pfs= GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+}
+;;
+
+
+LOCAL_LIBM_END(__libm_error_region)
+
+.type __libm_error_support#, @function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_exp2f.S b/sysdeps/ia64/fpu/e_exp2f.S
new file mode 100644
index 0000000000..f785b70e65
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_exp2f.S
@@ -0,0 +1,538 @@
+.file "exp2f.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 08/25/00 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 09/05/02 Improved performance and accuracy
+// 01/17/03 Fixed to call error support when x=128.0
+//
+// API
+//==============================================================
+// float exp2f(float)
+//
+// Overview of operation
+//==============================================================
+// Background
+//
+// Implementation
+//
+// Let x= (K + fh + fl + r), where
+// K is an integer, fh= 0.b1 b2 b3 b4 b5,
+// fl= 2^{-5}* 0.b6 b7 b8 b9 b10 (fh, fl >= 0),
+// and |r|<2^{-11}
+// Th is a table that stores 2^fh (32 entries) rounded to
+// double extended precision (only mantissa is stored)
+// Tl is a table that stores 2^fl (32 entries) rounded to
+// double extended precision (only mantissa is stored)
+//
+// 2^x is approximated as
+// 2^K * Th [ fh ] * Tl [ fl ] * (1+c1*r+c2*r^2)
+
+// Note: We use the following trick to speed up conversion from FP to integer:
+//
+// Let x = K + r, where K is an integer, and |r| <= 0.5
+// Let N be the number of significand bits for the FP format used
+// ( N=64 for double-extended, N=53 for double)
+//
+// Then let y = 1.5 * 2^(N-1) + x for RN mode
+// K = y - 1.5 * 2^(N-1)
+// r = x - K
+//
+// If we want to obtain the integer part and the first m fractional bits of x,
+// we can use the same trick, but with a constant of 1.5 * 2^(N-1-m):
+//
+// Let x = K + f + r
+// f = 0.b_1 b_2 ... b_m
+// |r| <= 2^(-m-1)
+//
+// Then let y = 1.5 * 2^(N-1-m) + x for RN mode
+// (K+f) = y - 1.5 * 2^(N-1-m)
+// r = x - (K+f)
+
+
+// Special values
+//==============================================================
+// exp2(0)= 1
+// exp2(+inf)= inf
+// exp2(-inf)= 0
+//
+
+// Registers used
+//==============================================================
+// r2-r3, r14-r40
+// f6-f15, f32-f45
+// p6-p8, p12
+//
+
+
+GR_TBL_START = r2
+GR_LOG_TBL = r3
+
+GR_OF_LIMIT = r14
+GR_UF_LIMIT = r15
+GR_EXP_CORR = r16
+GR_F_low = r17
+GR_F_high = r18
+GR_K = r19
+GR_Flow_ADDR = r20
+
+GR_BIAS = r21
+GR_Fh = r22
+GR_Fh_ADDR = r23
+GR_EXPMAX = r24
+GR_EMIN = r25
+
+GR_ROUNDVAL = r26
+GR_MASK = r27
+GR_KF0 = r28
+GR_MASK_low = r29
+GR_COEFF_START = r30
+
+GR_SAVE_B0 = r33
+GR_SAVE_PFS = r34
+GR_SAVE_GP = r35
+GR_SAVE_SP = r36
+
+GR_Parameter_X = r37
+GR_Parameter_Y = r38
+GR_Parameter_RESULT = r39
+GR_Parameter_TAG = r40
+
+
+FR_X = f10
+FR_Y = f1
+FR_RESULT = f8
+
+
+FR_COEFF1 = f6
+FR_COEFF2 = f7
+FR_R = f9
+
+FR_KF0 = f12
+FR_UF_LIMIT = f15
+
+FR_OF_LIMIT = f32
+FR_EXPMIN = f33
+FR_ROUNDVAL = f34
+FR_KF = f35
+
+FR_2_TO_K = f36
+FR_T_low = f37
+FR_T_high = f38
+
+FR_P12 = f41
+FR_T_low_K = f42
+FR_T = f44
+FR_P = f45
+
+
+// Data tables
+//==============================================================
+
+RODATA
+
+.align 16
+
+LOCAL_OBJECT_START(poly_coeffs)
+
+data8 0xb17217f7d1cf79ab, 0x00003ffe // C_1
+data8 0xf5fdeffc162c7541, 0x00003ffc // C_2
+LOCAL_OBJECT_END(poly_coeffs)
+
+
+LOCAL_OBJECT_START(T_table)
+
+// 2^{0.00000 b6 b7 b8 b9 b10}
+data8 0x8000000000000000, 0x8016302f17467628
+data8 0x802c6436d0e04f50, 0x80429c17d77c18ed
+data8 0x8058d7d2d5e5f6b0, 0x806f17687707a7af
+data8 0x80855ad965e88b83, 0x809ba2264dada76a
+data8 0x80b1ed4fd999ab6c, 0x80c83c56b50cf77f
+data8 0x80de8f3b8b85a0af, 0x80f4e5ff089f763e
+data8 0x810b40a1d81406d4, 0x81219f24a5baa59d
+data8 0x813801881d886f7b, 0x814e67cceb90502c
+data8 0x8164d1f3bc030773, 0x817b3ffd3b2f2e47
+data8 0x8191b1ea15813bfd, 0x81a827baf7838b78
+data8 0x81bea1708dde6055, 0x81d51f0b8557ec1c
+data8 0x81eba08c8ad4536f, 0x820225f44b55b33b
+data8 0x8218af4373fc25eb, 0x822f3c7ab205c89a
+data8 0x8245cd9ab2cec048, 0x825c62a423d13f0c
+data8 0x8272fb97b2a5894c, 0x828998760d01faf3
+data8 0x82a0393fe0bb0ca8, 0x82b6ddf5dbc35906
+//
+//
+// 2^{0.b1 b2 b3 b4 b5}
+data8 0x8000000000000000, 0x82cd8698ac2ba1d7
+data8 0x85aac367cc487b14, 0x88980e8092da8527
+data8 0x8b95c1e3ea8bd6e6, 0x8ea4398b45cd53c0
+data8 0x91c3d373ab11c336, 0x94f4efa8fef70961
+data8 0x9837f0518db8a96f, 0x9b8d39b9d54e5538
+data8 0x9ef5326091a111ad, 0xa27043030c496818
+data8 0xa5fed6a9b15138ea, 0xa9a15ab4ea7c0ef8
+data8 0xad583eea42a14ac6, 0xb123f581d2ac258f
+data8 0xb504f333f9de6484, 0xb8fbaf4762fb9ee9
+data8 0xbd08a39f580c36be, 0xc12c4cca66709456
+data8 0xc5672a115506dadd, 0xc9b9bd866e2f27a2
+data8 0xce248c151f8480e3, 0xd2a81d91f12ae45a
+data8 0xd744fccad69d6af4, 0xdbfbb797daf23755
+data8 0xe0ccdeec2a94e111, 0xe5b906e77c8348a8
+data8 0xeac0c6e7dd24392e, 0xefe4b99bdcdaf5cb
+data8 0xf5257d152486cc2c, 0xfa83b2db722a033a
+LOCAL_OBJECT_END(T_table)
+
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(exp2f)
+// exp2f(x), float: NaN/Inf/zero go to SPECIAL_exp2, out-of-range x to OUT_RANGE_exp2; otherwise result = T+T*P
+
+{.mfi
+ alloc r32= ar.pfs, 1, 4, 4, 0
+ // will continue only for non-zero normal/denormal numbers
+ fclass.nm p12, p0= f8, 0x1b
+ // GR_TBL_START= pointer to C_1...C_2 followed by T_table
+ addl GR_TBL_START= @ltoff(poly_coeffs), gp
+}
+{.mlx
+ mov GR_OF_LIMIT= 0xffff + 7 // Exponent of overflow limit
+ movl GR_ROUNDVAL= 0x5a400000 // 1.5*2^(63-10) (SP)
+}
+;;
+
+// Form special constant 1.5*2^(63-10) to give integer part and first 10
+// fractional bits of x
+{.mfi
+ setf.s FR_ROUNDVAL= GR_ROUNDVAL // Form special constant
+ fcmp.lt.s1 p6, p8= f8, f0 // X<0 ?
+ nop.i 0
+}
+{.mfb
+ ld8 GR_COEFF_START= [ GR_TBL_START ] // Load pointer to coeff table
+ nop.f 0
+ (p12) br.cond.spnt SPECIAL_exp2 // Branch if nan, inf, zero
+}
+;;
+
+{.mlx
+ setf.exp FR_OF_LIMIT= GR_OF_LIMIT // Set overflow limit
+ movl GR_UF_LIMIT= 0xc3160000 // (-2^7-22) = -150
+}
+;;
+
+{.mfi
+ ldfe FR_COEFF1= [ GR_COEFF_START ], 16 // load C_1
+ fma.s0 f8= f8, f1, f0 // normalize x
+ nop.i 0
+}
+;;
+
+{.mmi
+ ldfe FR_COEFF2= [ GR_COEFF_START ], 16 // load C_2
+ setf.s FR_UF_LIMIT= GR_UF_LIMIT // Set underflow limit
+ mov GR_EXP_CORR= 0xffff-126
+}
+;;
+
+{.mfi
+ nop.m 0
+ fma.s1 FR_KF0= f8, f1, FR_ROUNDVAL // y= x + 1.5*2^(63-10)
+ nop.i 0
+}
+;;
+
+{.mfi
+ mov GR_MASK= 1023
+ fms.s1 FR_KF= FR_KF0, f1, FR_ROUNDVAL // (K+f)
+ mov GR_MASK_low= 31
+}
+;;
+
+{.mfi
+ getf.sig GR_KF0= FR_KF0 // (K+f)*2^10= round_to_int(y)
+ fcmp.ge.s1 p12, p7= f8, FR_OF_LIMIT // x >= overflow threshold ?
+ add GR_LOG_TBL= 256, GR_COEFF_START // Pointer to high T_table
+}
+;;
+
+{.mmi
+ and GR_F_low= GR_KF0, GR_MASK_low // f_low
+ and GR_F_high= GR_MASK, GR_KF0 // f_high*32
+ shr GR_K= GR_KF0, 10 // K
+}
+;;
+
+{.mmi
+ shladd GR_Flow_ADDR= GR_F_low, 3, GR_COEFF_START // address of 2^{f_low}
+ add GR_BIAS= GR_K, GR_EXP_CORR // K + bias - 2*63
+ shr GR_Fh= GR_F_high, 5 // f_high
+}
+;;
+
+{.mfi
+ setf.exp FR_2_TO_K= GR_BIAS // 2^{K-126}
+ fnma.s1 FR_R= FR_KF, f1, f8 // r= x - (K+f)
+ shladd GR_Fh_ADDR= GR_Fh, 3, GR_LOG_TBL // address of 2^{f_high}
+}
+{.mlx
+ ldf8 FR_T_low= [ GR_Flow_ADDR ] // load T_low= 2^{f_low}
+ movl GR_EMIN= 0xc2fc0000 // EMIN= -126
+}
+;;
+
+{.mfi
+ ldf8 FR_T_high= [ GR_Fh_ADDR ] // load T_high= 2^{f_high}
+ (p7) fcmp.lt.s1 p12, p7= f8, FR_UF_LIMIT // x<underflow threshold ?
+ nop.i 0
+}
+;;
+
+{.mfb
+ setf.s FR_EXPMIN= GR_EMIN // FR_EXPMIN= EMIN
+ fma.s1 FR_P12= FR_COEFF2, FR_R, FR_COEFF1 // P12= C_1+C_2*r
+ (p12) br.cond.spnt OUT_RANGE_exp2
+}
+;;
+
+{.mfi
+ nop.m 0
+ fma.s1 FR_T_low_K= FR_T_low, FR_2_TO_K, f0 // T= 2^{K-126}*T_low
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ fma.s1 FR_P= FR_R, FR_P12, f0 // P= P12*r
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ fma.s1 FR_T= FR_T_low_K, FR_T_high, f0 // T= T*T_high
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ fcmp.lt.s0 p6, p8= f8, FR_EXPMIN // underflow (x<EMIN) ?
+ nop.i 0
+}
+;;
+
+{.mfb
+ nop.m 0
+ fma.s.s0 f8= FR_P, FR_T, FR_T // result= T+T*P
+ (p8) br.ret.sptk b0 // return unless x<EMIN
+}
+;;
+
+{.mfb
+ (p6) mov GR_Parameter_TAG= 164
+ nop.f 0
+ (p6) br.cond.sptk __libm_error_region
+}
+;;
+
+
+SPECIAL_exp2:
+{.mfi
+ nop.m 0
+ fclass.m p6, p0= f8, 0x22 // x= -Infinity ?
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ fclass.m p7, p0= f8, 0x21 // x= +Infinity ?
+ nop.i 0
+}
+;;
+
+{.mfi
+ nop.m 0
+ fclass.m p8, p0= f8, 0x7 // x= +/-Zero ?
+ nop.i 0
+}
+{.mfb
+ nop.m 0
+ (p6) mov f8= f0 // exp2(-Infinity)= 0
+ (p6) br.ret.spnt b0
+}
+;;
+
+{.mfb
+ nop.m 0
+ nop.f 0
+ (p7) br.ret.spnt b0 // exp2(+Infinity)= +Infinity
+}
+;;
+
+{.mfb
+ nop.m 0
+ (p8) mov f8= f1 // exp2(+/-0)= 1
+ (p8) br.ret.spnt b0
+}
+;;
+
+{.mfb
+ nop.m 0
+ fma.s.s0 f8= f8, f1, f0 // Remaining cases: NaNs
+ br.ret.sptk b0
+}
+;;
+
+
+OUT_RANGE_exp2:
+
+// overflow: p8= 1 (x >= 0, from the X<0 test at entry)
+
+{.mii
+ (p8) mov GR_EXPMAX= 0x1fffe
+ nop.i 0
+ nop.i 0
+}
+;;
+
+{.mmb
+ (p8) mov GR_Parameter_TAG= 163
+ (p8) setf.exp FR_R= GR_EXPMAX
+ nop.b 999
+}
+;;
+
+{.mfi
+ nop.m 999
+ (p8) fma.s.s0 f8= FR_R, FR_R, f0 // Create overflow
+ nop.i 999
+}
+// underflow: p6= 1 (x < 0)
+{.mii
+ (p6) mov GR_Parameter_TAG= 164
+ (p6) mov GR_EXPMAX= 1
+ nop.i 0
+}
+;;
+
+{.mmb
+ nop.m 0
+ (p6) setf.exp FR_R= GR_EXPMAX
+ nop.b 999
+}
+;;
+
+{.mfb
+ nop.m 999
+ (p6) fma.s.s0 f8= FR_R, FR_R, f0 // Create underflow
+ nop.b 0
+}
+;;
+
+GLOBAL_LIBM_END(exp2f)
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+// Spill FR_Y, FR_X and FR_RESULT to a 64-byte frame, call __libm_error_support, reload f8 from the result slot
+.prologue
+{.mfi
+ add GR_Parameter_Y= -32, sp // Parameter 2 address (sp - 32, taken before the frame is created)
+ nop.f 0
+.save ar.pfs, GR_SAVE_PFS
+ mov GR_SAVE_PFS= ar.pfs // Save ar.pfs
+}
+
+{.mfi
+.fframe 64
+ add sp= -64, sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP= gp // Save gp
+}
+;;
+
+{.mmi
+ stfs [ GR_Parameter_Y ]= FR_Y, 16 // STORE Parameter 2 on stack
+ add GR_Parameter_X= 16, sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0= b0 // Save b0
+}
+;;
+
+.body
+{.mib
+ stfs [ GR_Parameter_X ]= FR_X // STORE Parameter 1 on stack; NOTE(review): FR_X (f10) is not written by the exp2f code in this file - confirm it holds x
+ add GR_Parameter_RESULT= 0, GR_Parameter_Y // Parameter 3 address
+ nop.b 0
+}
+{.mib
+ stfs [ GR_Parameter_Y ]= FR_RESULT // STORE Parameter 3 on stack
+ add GR_Parameter_Y= -16, GR_Parameter_Y // Back to Parameter 2 address (undo the post-increment)
+ br.call.sptk b0= __libm_error_support# // Call error handling function
+}
+;;
+
+{.mmi
+ add GR_Parameter_RESULT= 48, sp // Parameter 3 (result) address
+ nop.m 0
+ nop.i 0
+}
+;;
+
+{.mmi
+ ldfs f8= [ GR_Parameter_RESULT ] // Get return result off stack
+.restore sp
+ add sp= 64, sp // Restore stack pointer
+ mov b0= GR_SAVE_B0 // Restore return address
+}
+;;
+
+{.mib
+ mov gp= GR_SAVE_GP // Restore gp
+ mov ar.pfs= GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+}
+;;
+
+
+LOCAL_LIBM_END(__libm_error_region)
+
+.type __libm_error_support#, @function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_exp2l.S b/sysdeps/ia64/fpu/e_exp2l.S
new file mode 100644
index 0000000000..6e2a62ad91
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_exp2l.S
@@ -0,0 +1,806 @@
+.file "exp2l.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 07/27/00 Initial version
+// 08/15/00 Bundle added after call to __libm_error_support to properly
+// set [ the previously overwritten ] GR_Parameter_RESULT.
+// 02/02/01 Added libm_error_support calls for underflow
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/06/03 Reordered header: .section, .global, .proc, .align
+// 05/07/03 Reformatted assembly source
+//
+// API
+//==============================================================
+// long double exp2l(long double)
+//
+// Overview of operation
+//==============================================================
+// Background
+//
+// Implementation
+//
+// Let x= K + f + r, where
+// K is an integer, f= 0.b1 b2... b8 (f>= 0),
+// and |r|<2^{-8}
+// T is a table that stores 2^f (256 entries) rounded to
+// double extended precision (only mantissa is stored)
+// D stores (2^f/T [ f ] - 1), rounded to single precision
+//
+// 2^x is approximated as
+// 2^K * T [ f ] * (1+D [ f ] +c1*r+c2*r^2+...+c6*r^6)
+//
+
+
+
+// Special values
+//==============================================================
+// exp2(0)= 1
+// exp2(+inf)= inf
+// exp2(-inf)= 0
+//
+
+
+// Registers used
+//==============================================================
+// f6-f15, f32-f46
+// r2-r3, r8-r11, r14-r40
+// p6, p7, p8, p12
+
+ FR_X = f10 // error-handler parameter 1. NOTE(review): same register as FR_COEFF3 and never loaded with x in this file - confirm
+ FR_Y = f1 // error-handler parameter 2
+ FR_RESULT = f8 // argument on entry, result on return
+
+ FR_KF0 = f6
+ FR_EXP63 = f7
+ FR_T = f9
+ FR_COEFF3 = f10 // shares f10 with FR_X
+ FR_COEFF4 = f11
+ FR_COEFF5 = f12
+ FR_COEFF6 = f13
+ FR_COEFF1 = f14
+ FR_COEFF2 = f15
+ FR_2P14 = f32
+ FR_UF_TEST = f33
+ FR_D = f34
+ FR_R = f35
+ FR_2EXP = f36
+ FR_EMIN = f37
+ FR_P34 = f38
+ FR_P56 = f39
+ FR_R2 = f40
+ FR_P12 = f41
+ FR_TS = f42
+ FR_P36 = f43
+ FR_P02 = f44
+ FR_R3 = f45
+ FR_P06 = f46
+
+
+ GR_ADDR0 = r2 // address used to load the poly_coeffs pointer
+ GR_ADDR = r2 // same register, reused for the loaded pointer
+ GR_D_ADDR0 = r3 // address used to load the D_table pointer
+ GR_D_ADDR = r3 // same register, reused for the loaded pointer
+ GR_LEADBITS = r8
+ GR_256 = r9
+ GR_EM63 = r10
+ GR_255 = r11
+ GR_EXPON = r14
+ GR_BM63 = r15
+ GR_UF_TEST = r16
+ GR_INDEX = r17
+ GR_K = r18
+ GR_KF = r19
+ GR_2P14 = r19 // shares r19 with GR_KF; moved to FR_2P14 before GR_KF is written
+ GR_EMIN = r20
+ GR_IT = r21
+ GR_ID = r22
+ GR_63 = r23
+ GR_CONST1 = r24
+ GR_EBIAS = r25 // receives getf.exp of x (sign and biased exponent), not a bias constant
+ GR_CONST2 = r26
+ GR_CONST3 = r27
+ GR_SIGNIF = r28
+ GR_ARGEXP = r29
+ GR_SGN = r30
+ GR_EMIN1 = r31
+ GR_SREG = r32 // destination register of the alloc instruction
+
+ GR_SAVE_B0 = r33
+ GR_SAVE_PFS = r34
+ GR_SAVE_GP = r35
+ GR_SAVE_SP = r36 // not referenced anywhere else in this file
+
+ GR_Parameter_X = r37
+ GR_Parameter_Y = r38
+ GR_Parameter_RESULT= r39
+ GR_Parameter_TAG = r40
+
+
+// Data tables
+//==============================================================
+
+RODATA
+
+.align 16
+
+LOCAL_OBJECT_START(poly_coeffs)
+// 64 bytes read in storage order through GR_ADDR; the code then uses GR_ADDR as the T_table base, so T_table must follow directly.
+ data8 0x3fac6b08d704a0c0 // C_3 (double; loaded together with C_4 by ldfpd)
+ data8 0x3f83b2ab6fba4e77 // C_4
+ data8 0x3f55d87fe78a6731 // C_5 (double; loaded together with C_6 by ldfpd)
+ data8 0x3f2430912f86c787 // C_6
+ data8 0xb17217f7d1cf79ab, 0x00003ffe // C_1 (16-byte extended: significand, sign/exponent; loaded by ldfe)
+ data8 0xf5fdeffc162c7541, 0x00003ffc // C_2 (16-byte extended: significand, sign/exponent; loaded by ldfe)
+LOCAL_OBJECT_END(poly_coeffs)
+
+
+LOCAL_OBJECT_START(T_table)
+
+ data8 0x8000000000000000, 0x8058d7d2d5e5f6b1
+ data8 0x80b1ed4fd999ab6c, 0x810b40a1d81406d4
+ data8 0x8164d1f3bc030773, 0x81bea1708dde6056
+ data8 0x8218af4373fc25ec, 0x8272fb97b2a5894c
+ data8 0x82cd8698ac2ba1d7, 0x83285071e0fc4547
+ data8 0x8383594eefb6ee37, 0x83dea15b9541b132
+ data8 0x843a28c3acde4046, 0x8495efb3303efd30
+ data8 0x84f1f656379c1a29, 0x854e3cd8f9c8c95d
+ data8 0x85aac367cc487b15, 0x86078a2f23642a9f
+ data8 0x8664915b923fba04, 0x86c1d919caef5c88
+ data8 0x871f61969e8d1010, 0x877d2afefd4e256c
+ data8 0x87db357ff698d792, 0x88398146b919f1d4
+ data8 0x88980e8092da8527, 0x88f6dd5af155ac6b
+ data8 0x8955ee03618e5fdd, 0x89b540a7902557a4
+ data8 0x8a14d575496efd9a, 0x8a74ac9a79896e47
+ data8 0x8ad4c6452c728924, 0x8b3522a38e1e1032
+ data8 0x8b95c1e3ea8bd6e7, 0x8bf6a434adde0085
+ data8 0x8c57c9c4646f4dde, 0x8cb932c1bae97a95
+ data8 0x8d1adf5b7e5ba9e6, 0x8d7ccfc09c50e2f8
+ data8 0x8ddf042022e69cd6, 0x8e417ca940e35a01
+ data8 0x8ea4398b45cd53c0, 0x8f073af5a2013520
+ data8 0x8f6a8117e6c8e5c4, 0x8fce0c21c6726481
+ data8 0x9031dc431466b1dc, 0x9095f1abc540ca6b
+ data8 0x90fa4c8beee4b12b, 0x915eed13c89689d3
+ data8 0x91c3d373ab11c336, 0x9228ffdc10a051ad
+ data8 0x928e727d9531f9ac, 0x92f42b88f673aa7c
+ data8 0x935a2b2f13e6e92c, 0x93c071a0eef94bc1
+ data8 0x9426ff0fab1c04b6, 0x948dd3ac8ddb7ed3
+ data8 0x94f4efa8fef70961, 0x955c5336887894d5
+ data8 0x95c3fe86d6cc7fef, 0x962bf1cbb8d97560
+ data8 0x96942d3720185a00, 0x96fcb0fb20ac4ba3
+ data8 0x97657d49f17ab08e, 0x97ce9255ec4357ab
+ data8 0x9837f0518db8a96f, 0x98a1976f7597e996
+ data8 0x990b87e266c189aa, 0x9975c1dd47518c77
+ data8 0x99e0459320b7fa65, 0x9a4b13371fd166ca
+ data8 0x9ab62afc94ff864a, 0x9b218d16f441d63d
+ data8 0x9b8d39b9d54e5539, 0x9bf93118f3aa4cc1
+ data8 0x9c6573682ec32c2d, 0x9cd200db8a0774cb
+ data8 0x9d3ed9a72cffb751, 0x9dabfdff6367a2aa
+ data8 0x9e196e189d472420, 0x9e872a276f0b98ff
+ data8 0x9ef5326091a111ae, 0x9f6386f8e28ba651
+ data8 0x9fd228256400dd06, 0xa041161b3d0121be
+ data8 0xa0b0510fb9714fc2, 0xa11fd9384a344cf7
+ data8 0xa18faeca8544b6e4, 0xa1ffd1fc25cea188
+ data8 0xa27043030c496819, 0xa2e102153e918f9e
+ data8 0xa3520f68e802bb93, 0xa3c36b345991b47c
+ data8 0xa43515ae09e6809e, 0xa4a70f0c95768ec5
+ data8 0xa5195786be9ef339, 0xa58bef536dbeb6ee
+ data8 0xa5fed6a9b15138ea, 0xa6720dc0be08a20c
+ data8 0xa6e594cfeee86b1e, 0xa7596c0ec55ff55b
+ data8 0xa7cd93b4e965356a, 0xa8420bfa298f70d1
+ data8 0xa8b6d5167b320e09, 0xa92bef41fa77771b
+ data8 0xa9a15ab4ea7c0ef8, 0xaa1717a7b5693979
+ data8 0xaa8d2652ec907629, 0xab0386ef48868de1
+ data8 0xab7a39b5a93ed337, 0xabf13edf162675e9
+ data8 0xac6896a4be3fe929, 0xace0413ff83e5d04
+ data8 0xad583eea42a14ac6, 0xadd08fdd43d01491
+ data8 0xae493452ca35b80e, 0xaec22c84cc5c9465
+ data8 0xaf3b78ad690a4375, 0xafb51906e75b8661
+ data8 0xb02f0dcbb6e04584, 0xb0a957366fb7a3c9
+ data8 0xb123f581d2ac2590, 0xb19ee8e8c94feb09
+ data8 0xb21a31a66618fe3b, 0xb295cff5e47db4a4
+ data8 0xb311c412a9112489, 0xb38e0e38419fae18
+ data8 0xb40aaea2654b9841, 0xb487a58cf4a9c180
+ data8 0xb504f333f9de6484, 0xb58297d3a8b9f0d2
+ data8 0xb60093a85ed5f76c, 0xb67ee6eea3b22b8f
+ data8 0xb6fd91e328d17791, 0xb77c94c2c9d725e9
+ data8 0xb7fbefca8ca41e7c, 0xb87ba337a1743834
+ data8 0xb8fbaf4762fb9ee9, 0xb97c143756844dbf
+ data8 0xb9fcd2452c0b9deb, 0xba7de9aebe5fea09
+ data8 0xbaff5ab2133e45fb, 0xbb81258d5b704b6f
+ data8 0xbc034a7ef2e9fb0d, 0xbc85c9c560e7b269
+ data8 0xbd08a39f580c36bf, 0xbd8bd84bb67ed483
+ data8 0xbe0f6809860993e2, 0xbe935317fc378238
+ data8 0xbf1799b67a731083, 0xbf9c3c248e2486f8
+ data8 0xc0213aa1f0d08db0, 0xc0a6956e8836ca8d
+ data8 0xc12c4cca66709456, 0xc1b260f5ca0fbb33
+ data8 0xc238d2311e3d6673, 0xc2bfa0bcfad907c9
+ data8 0xc346ccda24976407, 0xc3ce56c98d21b15d
+ data8 0xc4563ecc5334cb33, 0xc4de8523c2c07baa
+ data8 0xc5672a115506dadd, 0xc5f02dd6b0bbc3d9
+ data8 0xc67990b5aa245f79, 0xc70352f04336c51e
+ data8 0xc78d74c8abb9b15d, 0xc817f681416452b2
+ data8 0xc8a2d85c8ffe2c45, 0xc92e1a9d517f0ecc
+ data8 0xc9b9bd866e2f27a3, 0xca45c15afcc72624
+ data8 0xcad2265e4290774e, 0xcb5eecd3b38597c9
+ data8 0xcbec14fef2727c5d, 0xcc799f23d11510e5
+ data8 0xcd078b86503dcdd2, 0xcd95da6a9ff06445
+ data8 0xce248c151f8480e4, 0xceb3a0ca5dc6a55d
+ data8 0xcf4318cf191918c1, 0xcfd2f4683f94eeb5
+ data8 0xd06333daef2b2595, 0xd0f3d76c75c5db8d
+ data8 0xd184df6251699ac6, 0xd2164c023056bcab
+ data8 0xd2a81d91f12ae45a, 0xd33a5457a3029054
+ data8 0xd3ccf099859ac379, 0xd45ff29e0972c561
+ data8 0xd4f35aabcfedfa1f, 0xd5872909ab75d18a
+ data8 0xd61b5dfe9f9bce07, 0xd6aff9d1e13ba2fe
+ data8 0xd744fccad69d6af4, 0xd7da67311797f56a
+ data8 0xd870394c6db32c84, 0xd9067364d44a929c
+ data8 0xd99d15c278afd7b6, 0xda3420adba4d8704
+ data8 0xdacb946f2ac9cc72, 0xdb63714f8e295255
+ data8 0xdbfbb797daf23755, 0xdc9467913a4f1c92
+ data8 0xdd2d818508324c20, 0xddc705bcd378f7f0
+ data8 0xde60f4825e0e9124, 0xdefb4e1f9d1037f2
+ data8 0xdf9612deb8f04420, 0xe031430a0d99e627
+ data8 0xe0ccdeec2a94e111, 0xe168e6cfd3295d23
+ data8 0xe2055afffe83d369, 0xe2a23bc7d7d91226
+ data8 0xe33f8972be8a5a51, 0xe3dd444c46499619
+ data8 0xe47b6ca0373da88d, 0xe51a02ba8e26d681
+ data8 0xe5b906e77c8348a8, 0xe658797368b3a717
+ data8 0xe6f85aaaee1fce22, 0xe798aadadd5b9cbf
+ data8 0xe8396a503c4bdc68, 0xe8da9958464b42ab
+ data8 0xe97c38406c4f8c57, 0xea1e4756550eb27b
+ data8 0xeac0c6e7dd24392f, 0xeb63b74317369840
+ data8 0xec0718b64c1cbddc, 0xecaaeb8ffb03ab41
+ data8 0xed4f301ed9942b84, 0xedf3e6b1d418a491
+ data8 0xee990f980da3025b, 0xef3eab20e032bc6b
+ data8 0xefe4b99bdcdaf5cb, 0xf08b3b58cbe8b76a
+ data8 0xf13230a7ad094509, 0xf1d999d8b7708cc1
+ data8 0xf281773c59ffb13a, 0xf329c9233b6bae9c
+ data8 0xf3d28fde3a641a5b, 0xf47bcbbe6db9fddf
+ data8 0xf5257d152486cc2c, 0xf5cfa433e6537290
+ data8 0xf67a416c733f846e, 0xf7255510c4288239
+ data8 0xf7d0df730ad13bb9, 0xf87ce0e5b2094d9c
+ data8 0xf92959bb5dd4ba74, 0xf9d64a46eb939f35
+ data8 0xfa83b2db722a033a, 0xfb3193cc4227c3f4
+ data8 0xfbdfed6ce5f09c49, 0xfc8ec01121e447bb
+ data8 0xfd3e0c0cf486c175, 0xfdedd1b496a89f35
+ data8 0xfe9e115c7b8f884c, 0xff4ecb59511ec8a5
+LOCAL_OBJECT_END(T_table)
+
+
+LOCAL_OBJECT_START(D_table)
+
+ data4 0x00000000, 0x9f55c08f, 0x1e93ffa3, 0x1dcd43a8
+ data4 0x1f751f79, 0x9f3cdd88, 0x9f43d155, 0x1eda222c
+ data4 0x1ef35513, 0x9f597895, 0x9e698881, 0x1ec71073
+ data4 0x1e50e371, 0x9dc01e19, 0x1de74133, 0x1e2f028c
+ data4 0x9edefb47, 0x1ebbac48, 0x9e8b0330, 0x9e9e9314
+ data4 0x1edc1d11, 0x1f098529, 0x9f52827c, 0x1f50050d
+ data4 0x1f301e8e, 0x1f5b64d1, 0x9f45e3ee, 0x9ef64d6d
+ data4 0x1d6ec5e8, 0x9e61ad9a, 0x1d44ccbb, 0x9e4a8bbb
+ data4 0x9cf11576, 0x9dcce7e7, 0x9d02ac90, 0x1f26ccf0
+ data4 0x9f0877c6, 0x9ddd62ae, 0x9f4b7fc3, 0x1ea8ef6b
+ data4 0x1ea4378d, 0x1ef6fc38, 0x1db99fd9, 0x1f22bf6f
+ data4 0x1f53e172, 0x1e85504a, 0x9f37cc75, 0x1f0c5e17
+ data4 0x1dde8aac, 0x9cb42bb2, 0x1e153cd7, 0x1eb62bba
+ data4 0x9e9b941b, 0x9ea80e3c, 0x1f508823, 0x1ec3fd36
+ data4 0x1e9ffaa1, 0x1e21e2eb, 0x9d948b1d, 0x9e8ac93a
+ data4 0x1ef7ee6f, 0x9e80dda3, 0x1f0814be, 0x1dc5ddfe
+ data4 0x1eedb9d1, 0x9f2aaa26, 0x9ea5b0fc, 0x1edf702e
+ data4 0x9e391201, 0x1f1316bb, 0x1ea27fb7, 0x9e05ed18
+ data4 0x9f199ed2, 0x1ee7fd7c, 0x1f003db6, 0x9eac3793
+ data4 0x9e5b8c10, 0x9f3af17c, 0x1bc9a8be, 0x1ee3c004
+ data4 0x9f19b1b2, 0x9f242ce9, 0x9ce67dd1, 0x9e4f6275
+ data4 0x1e20742c, 0x1eb9328a, 0x9f477153, 0x1d969718
+ data4 0x9f1e6c43, 0x1f2f67f4, 0x9f39c7e4, 0x9e3c4feb
+ data4 0x1da3956b, 0x9e7c685d, 0x1f280911, 0x9f0d8afb
+ data4 0x1e314b40, 0x9eb4f250, 0x9f1a34ad, 0x1ef5d5e7
+ data4 0x9f145496, 0x1e604827, 0x9f1e5195, 0x1e9c1fc0
+ data4 0x1efde521, 0x1e69b385, 0x1f316830, 0x9f244eae
+ data4 0x1f1787ec, 0x9e939971, 0x1f0bb393, 0x9f0511d6
+ data4 0x1ed919de, 0x1d8b7b28, 0x1e5ca4a9, 0x1e7c357b
+ data4 0x9e3ff8e8, 0x1eef53b5, 0x9ed22ed7, 0x1f16659b
+ data4 0x9f2db102, 0x9e2c6a78, 0x1f328d7d, 0x9f2fec3c
+ data4 0x1eb395bd, 0x9f242b84, 0x9e2683e6, 0x1ed71e68
+ data4 0x1efd1df5, 0x9e9eeafd, 0x9ed2249c, 0x1eef129a
+ data4 0x1d1ea44c, 0x9e81f7ff, 0x1eaf77c9, 0x9ee7a285
+ data4 0x1e1864ed, 0x9ee7edbb, 0x9e15a27d, 0x9ae61655
+ data4 0x1f1ff1a2, 0x1da29755, 0x9e5f46fb, 0x1e901236
+ data4 0x9eecfb9b, 0x9f204d2f, 0x1ec64685, 0x9eb809bd
+ data4 0x9e0026c5, 0x1d9f1da1, 0x1f142b49, 0x9f20f22e
+ data4 0x1f24b067, 0x1f185a4c, 0x9f09765c, 0x9ece902f
+ data4 0x1e2ca5db, 0x1e6de464, 0x9f071f67, 0x1f1518c3
+ data4 0x1ea13ded, 0x1f0b8414, 0x1edb6ad4, 0x9e548740
+ data4 0x9ea10efb, 0x1ee48a60, 0x1e7954c5, 0x9edad013
+ data4 0x9f21517d, 0x9e9b6e0c, 0x9ee7f9a6, 0x9ebd4298
+ data4 0x9d65b24e, 0x1eed751f, 0x9f1573ea, 0x9d430377
+ data4 0x9e13fc0c, 0x1e47008a, 0x1e3d5c1d, 0x1ef41a91
+ data4 0x9e4a4ef7, 0x9e952f18, 0x1d620566, 0x1d9b8d33
+ data4 0x1db06247, 0x1e94b31e, 0x1f0730ad, 0x9d79ffb4
+ data4 0x1ed64d51, 0x9e91fd11, 0x9e28d35a, 0x9dea0ed9
+ data4 0x1e891def, 0x9ee28ac0, 0x1e1db99b, 0x9ee1ce38
+ data4 0x9bdd9bca, 0x1eb72cb9, 0x9e8c53c6, 0x1e0df6ca
+ data4 0x1e8f2ccd, 0x9e9b0886, 0x1eeb3bc7, 0x1ec7e772
+ data4 0x9e210776, 0x9daf246c, 0x1ea1f151, 0x1ece4dc6
+ data4 0x1ce741c8, 0x1ed3c88f, 0x9ec9a4fd, 0x9e0c8d30
+ data4 0x1d2fbb26, 0x9ef212a7, 0x1ee44f1c, 0x9e445550
+ data4 0x1e075f77, 0x9d9291a3, 0x1f09c2ee, 0x9e012c88
+ data4 0x1f057d62, 0x9e7bb0dc, 0x9d8758ee, 0x1ee8d6c1
+ data4 0x9e509a57, 0x9e4ca7b7, 0x1e2cb341, 0x9ec35106
+ data4 0x1ecf3baf, 0x1e11781c, 0x1ea0cc78, 0x1eb75ca6
+ data4 0x1e961e1a, 0x1eb88853, 0x1e7abf50, 0x1ee38704
+ data4 0x9dc5ab0f, 0x1afe197b, 0x9ec07523, 0x9d9b7f78
+ data4 0x1f011618, 0x1ed43b0b, 0x9f035945, 0x9e3fd014
+ data4 0x9bbda5cd, 0x9e83f8ab, 0x1e58a928, 0x1e392d61
+ data4 0x1efdbb52, 0x1ee310a8, 0x9ec7ecc1, 0x1e8c9ed6
+ data4 0x9ef82dee, 0x9e70545b, 0x9ea53fc4, 0x1e40f419
+LOCAL_OBJECT_END(D_table)
+
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(exp2l)
+// long double exp2l(long double x): the argument arrives in f8 and the result is returned in f8.
+{.mii
+ // get sign and biased exponent of x (GR_EBIAS is not a bias constant)
+ getf.exp GR_EBIAS = f8
+ // GR_D_ADDR0 = address from which the D_table pointer is loaded
+ addl GR_D_ADDR0 = @ltoff(D_table), gp
+ // GR_ADDR0 = address from which the poly_coeffs pointer is loaded
+ addl GR_ADDR0 = @ltoff(poly_coeffs), gp ;;
+}
+
+{.mfi
+ // get significand
+ getf.sig GR_SIGNIF = f8
+ // will continue only for normal/denormal numbers (p12 = 1 otherwise)
+ fclass.nm.unc p12, p7 = f8, 0x1b
+ mov GR_63 = 63 ;;
+}
+
+{.mfi
+ nop.m 0
+ nop.f 0
+ // GR_CONST2 = bias+63-8
+ mov GR_CONST2 = 0xffff+55
+}
+{.mfi
+ // GR_CONST1 = bias+15
+ mov GR_CONST1 = 0xffff+15
+ nop.f 0
+ mov GR_CONST3 = 0x1ffff ;; // mask selecting the 17 exponent bits of getf.exp
+}
+
+{.mfi
+ // load start address of poly_coeffs (C_3..C_6, C_1, C_2), which T_table follows
+ ld8 GR_ADDR = [ GR_ADDR0 ]
+ nop.f 0
+ // get sign of argument
+ andcm GR_SGN = GR_EBIAS, GR_CONST3
+}
+{.mfi
+ // GR_D_ADDR = pointer to D_table
+ ld8 GR_D_ADDR = [ GR_D_ADDR0 ]
+ nop.f 0
+ // get argument exponent
+ and GR_ARGEXP = GR_CONST3, GR_EBIAS ;;
+}
+
+{.mfi
+ alloc GR_SREG = ar.pfs, 1, 4, 4, 0
+ nop.f 0
+ // p6 = 1 if sign = 1 (x < 0), p8 = 1 otherwise
+ cmp.ne p6, p8 = GR_SGN, r0
+}
+{.mfi
+ // p7 = 1 if exponent >= 15, i.e. |x| >= 2^15 (argument out of range)
+ cmp.ge p7, p0 = GR_ARGEXP, GR_CONST1
+ nop.f 0
+ sub GR_EXPON = GR_CONST2, GR_ARGEXP ;;
+}
+
+{.mib
+ // load C_3, C_4
+ ldfpd FR_COEFF3, FR_COEFF4 = [ GR_ADDR ], 16
+ // get first exponent+8 bits
+ shr.u GR_LEADBITS = GR_SIGNIF, GR_EXPON
+ (p12) br.cond.spnt SPECIAL_exp2l
+}
+{.mib
+ mov GR_256 = 256
+ // exponent -= 63 (the sign bit of x stays in GR_EM63)
+ sub GR_EM63 = GR_EBIAS, GR_63
+ (p7) br.cond.spnt OUT_RANGE_exp2l ;;
+}
+
+{.mlx
+ // load C_5, C_6
+ ldfpd FR_COEFF5, FR_COEFF6 = [ GR_ADDR ], 16
+ // GR_2P14 = 2^14 (single-precision bit pattern)
+ movl GR_2P14 = 0x46800000 ;;
+}
+
+{.mfi
+ // load C_1
+ ldfe FR_COEFF1 = [ GR_ADDR ], 16
+ fma.s0 f8 = f8, f1, f0 // normalize x
+ // GR_BM63 = bias-63
+ mov GR_BM63 = 0xffff-63 ;;
+}
+
+{.mlx
+ setf.s FR_2P14 = GR_2P14
+ // GR_UF_TEST = -2^14-62 (single-precision bit pattern)
+ movl GR_UF_TEST = 0xc6807c00
+}
+{.mfi
+ // load C_2; GR_ADDR now points at T_table
+ ldfe FR_COEFF2 = [ GR_ADDR ], 16
+ nop.f 0
+ mov GR_255 = 255 ;;
+}
+
+{.mib
+ // get 8-bit index
+ and GR_INDEX = GR_255, GR_LEADBITS
+ // get K = integer part
+ shr.u GR_K = GR_LEADBITS, 8
+ nop.b 0 ;;
+}
+
+{.mmi
+ // if sign = 1 && f>0, set p7 = 1
+ (p6) cmp.gt.unc p7, p0 = GR_INDEX, r0
+ setf.s FR_UF_TEST = GR_UF_TEST
+ shl GR_KF = GR_LEADBITS, GR_EXPON ;;
+}
+
+{.mfi
+ // if sign = 1 && f>0, set f = 1-f (index = 256-index)
+ (p7) sub GR_INDEX = GR_256, GR_INDEX
+ nop.f 0
+ // if sign = 1 && f>0, set K = K+1
+ (p7) add GR_K = GR_K, r0, 1 ;;
+}
+
+{.mfi
+ // FR_EXP63 = 2^{expon-63}, carrying the sign of x
+ setf.exp FR_EXP63 = GR_EM63
+ nop.f 0
+ nop.i 0 ;;
+}
+
+.pred.rel "mutex", p6, p8
+{.mfi
+ // if sign = 0, set scale factor exponent S = K+bias-63
+ (p8) add GR_K = GR_K, GR_BM63
+ nop.f 0
+ // if sign = 1, set scale factor exponent S = -K+bias-63
+ (p6) sub GR_K = GR_BM63, GR_K ;;
+}
+
+{.mmi
+ // FR_KF0 = 2^{63-expon}*(K+f)
+ setf.sig FR_KF0 = GR_KF
+ nop.m 0
+ // GR_EMIN = bit pattern of EMIN = 2-2^14, pre-shifted right by 11 (shifted back below)
+ mov GR_EMIN = 0x18cfff ;;
+}
+
+{.mfi
+ // get T_table entry address (8 bytes per entry)
+ shladd GR_IT = GR_INDEX, 3, GR_ADDR
+ // p7 = 1 if x >= 2^14
+ fcmp.ge.s1 p7, p12 = f8, FR_2P14
+ // get D_table entry address (4 bytes per entry)
+ shladd GR_ID = GR_INDEX, 2, GR_D_ADDR ;;
+}
+
+{.mfi
+ // load T_table value
+ ldf8 FR_T = [ GR_IT ]
+ // p7 = 1 if x < -2^14-62
+ (p12) fcmp.lt.s1 p7, p0 = f8, FR_UF_TEST
+ // GR_EMIN1 = single-precision bit pattern of EMIN = 2-2^14
+ shl GR_EMIN1 = GR_EMIN, 11 ;;
+}
+
+{.mmb
+ // FR_2EXP = scale factor = 2^{K-63}
+ setf.exp FR_2EXP = GR_K
+ // load D_table value
+ ldfs FR_D = [ GR_ID ]
+ (p7) br.cond.spnt OUT_RANGE_exp2l ;;
+}
+
+{.mfi
+ nop.m 0
+ // get r = x-(K+f)
+ fnma.s1 FR_R = FR_KF0, FR_EXP63, f8
+ nop.i 0 ;;
+}
+
+{.mfi
+ // FR_EMIN = EMIN
+ setf.s FR_EMIN = GR_EMIN1
+ // P34 = C_4*r+C_3
+ fma.s1 FR_P34 = FR_COEFF4, FR_R, FR_COEFF3
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // P56 = C_6*r+C_5
+ fma.s1 FR_P56 = FR_COEFF6, FR_R, FR_COEFF5
+ nop.i 0 ;;
+}
+
+{.mfi
+ nop.m 0
+ // r2 = r*r
+ fma.s1 FR_R2 = FR_R, FR_R, f0
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // P12 = C_2*r+C_1
+ fma.s1 FR_P12 = FR_COEFF2, FR_R, FR_COEFF1
+ nop.i 0 ;;
+}
+
+{.mfi
+ nop.m 0
+ // TS = T * scale factor
+ fma.s1 FR_TS = FR_T, FR_2EXP, f0
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // P36 = P34+r2*P56
+ fma.s1 FR_P36 = FR_P56, FR_R2, FR_P34
+ nop.i 0 ;;
+}
+
+{.mfi
+ nop.m 0
+ // P02 = D+r*P12
+ fma.s1 FR_P02 = FR_P12, FR_R, FR_D
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // R3 = r*r2
+ fma.s1 FR_R3 = FR_R2, FR_R, f0
+ nop.i 0 ;;
+}
+
+{.mfi
+ nop.m 0
+ // P06 = P02+r3*P36
+ fma.s1 FR_P06 = FR_P36, FR_R3, FR_P02
+ nop.i 0 ;;
+}
+
+{.mfi
+ nop.m 0
+ // underflow (x<EMIN) ? p6 = 1 if so, p8 = 1 otherwise
+ fcmp.lt.s0 p6, p8 = f8, FR_EMIN
+ nop.i 0 ;;
+}
+
+{.mfb
+ nop.m 0
+ // result = TS+TS*P06
+ fma.s0 f8 = FR_TS, FR_P06, FR_TS
+ // return
+ (p8) br.ret.sptk b0
+}
+{.mfb
+ (p6) mov GR_Parameter_TAG = 160
+ nop.f 0
+ (p6) br.cond.sptk __libm_error_region ;;
+}
+// NOTE(review): at this branch f8 already holds the result and FR_X (f10) still holds C_3, not x - confirm.
+
+SPECIAL_exp2l:
+
+{.mfi
+ nop.m 0
+ // x = -Infinity ?
+ fclass.m p6, p0 = f8, 0x22
+ nop.i 0 ;;
+}
+
+{.mfi
+ nop.m 0
+ // x = +Infinity ?
+ fclass.m p7, p0 = f8, 0x21
+ nop.i 0 ;;
+}
+
+{.mfi
+ nop.m 0
+ // x = +/-Zero ?
+ fclass.m p8, p0 = f8, 0x7
+ nop.i 0
+}
+{.mfb
+ nop.m 0
+ // exp2l(-Infinity) = 0
+ (p6) mov f8 = f0
+ (p6) br.ret.spnt b0 ;;
+}
+
+{.mfb
+ nop.m 0
+ // exp2l(+Infinity) = +Infinity (f8 is returned unchanged)
+ nop.f 0
+ (p7) br.ret.spnt b0 ;;
+}
+
+{.mfb
+ nop.m 0
+ // exp2l(+/-0) = 1
+ (p8) mov f8 = f1
+ (p8) br.ret.spnt b0 ;;
+}
+
+{.mfb
+ nop.m 0
+ // Remaining cases: NaNs
+ fma.s0 f8 = f8, f1, f0
+ br.ret.sptk b0 ;;
+}
+
+
+OUT_RANGE_exp2l:
+// On entry p8 = 1 for x > 0 (overflow case) and p6 = 1 for x < 0 (underflow case).
+
+{.mfi
+ // overflow: p8 = 1
+ (p8) mov GR_EM63 = 0x1fffe
+ // normalize input, to detect pseudo-zeroes
+ fma.s0 f8 = f8, f1, f0
+ nop.i 0 ;;
+}
+
+{.mfi
+ nop.m 0
+ // f8 = 0?
+ fcmp.eq.s1 p7, p0 = f8, f0
+ nop.i 0 ;;
+}
+
+{.mmb
+ (p8) mov GR_Parameter_TAG = 159
+ (p8) setf.exp FR_TS = GR_EM63
+ nop.b 999 ;;
+}
+
+{.mfb
+ nop.m 0
+ // pseudo-zero: return 1
+ (p7) mov f8 = f1
+ (p7) br.ret.sptk b0 ;;
+}
+
+{.mfi
+ nop.m 999
+ (p8) fma.s0 f8 = FR_TS, FR_TS, f0 // Create overflow
+ nop.i 999
+}
+{.mii
+ nop.m 0
+ // underflow: p6 = 1
+ (p6) mov GR_EM63 = 1
+ nop.i 0 ;;
+}
+
+{.mmb
+ (p6) mov GR_Parameter_TAG = 160
+ (p6) setf.exp FR_TS = GR_EM63
+ nop.b 999 ;;
+}
+
+{.mfb
+ nop.m 999
+ (p6) fma.s0 f8 = FR_TS, FR_TS, f0 // Create underflow
+ nop.b 0 ;;
+}
+
+// NOTE(review): no branch here; execution presumably falls through into __libm_error_region below - confirm the END/ENTRY macros emit no code.
+GLOBAL_LIBM_END(exp2l)
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+.prologue
+{.mfi
+ add GR_Parameter_Y = -32, sp // Parameter 2 address: old sp-32 (= new sp+32)
+ nop.f 0
+.save ar.pfs, GR_SAVE_PFS
+ mov GR_SAVE_PFS = ar.pfs // Save ar.pfs
+}
+{.mfi
+.fframe 64
+ add sp = -64, sp // Allocate the 64-byte stack frame
+ nop.f 0
+ mov GR_SAVE_GP = gp ;; // Save gp
+}
+// Frame layout (relative to the new sp): +16 = parameter 1, +32 = parameter 2, +48 = parameter 3 / result.
+{.mmi
+ stfe [ GR_Parameter_Y ] = FR_Y, 16 // STORE Parameter 2 at sp+32; pointer post-increments to sp+48
+ add GR_Parameter_X = 16, sp // Parameter 1 address (sp+16)
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0 = b0 ;; // Save b0
+}
+// NOTE(review): FR_X is f10, which exp2l loads with C_3 and never with x - confirm what parameter 1 should contain.
+.body
+{.mib
+ stfe [ GR_Parameter_X ] = FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0, GR_Parameter_Y // Parameter 3 address (sp+48)
+ nop.b 0
+}
+{.mib
+ stfe [ GR_Parameter_Y ] = FR_RESULT // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16, GR_Parameter_Y // Point back at Parameter 2 (sp+32)
+ br.call.sptk b0 = __libm_error_support# ;; // Call error handling function
+}
+
+{.mmi
+ add GR_Parameter_RESULT = 48, sp // Recompute the result address after the call
+ nop.m 0
+ nop.i 0 ;;
+}
+
+{.mmi
+ ldfe f8 = [ GR_Parameter_RESULT ] // Get return result off stack
+.restore sp
+ add sp = 64, sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 ;; // Restore return address
+}
+
+{.mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 ;; // Return
+}
+
+
+LOCAL_LIBM_END(__libm_error_region)
+.type __libm_error_support#, @function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_expf.S b/sysdeps/ia64/fpu/e_expf.S
index 2aad021335..8d620b6ffa 100644
--- a/sysdeps/ia64/fpu/e_expf.S
+++ b/sysdeps/ia64/fpu/e_expf.S
@@ -1,10 +1,10 @@
.file "expf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2002, Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,589 +35,501 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
// History
-//==============================================================
-// 4/04/00 Unwind update
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+//*********************************************************************
+// 02/02/00 Original version
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
-// 8/21/00 Improvements to save 2 cycles on main path, and shorten x=0 case
+// 08/21/00 Improvements to save 2 cycles on main path, and shorten x=0 case
// 12/07/00 Widen main path, shorten x=inf, nan paths
+// 03/15/01 Fix monotonicity problem around x=0 for round to +inf
+// 02/05/02 Corrected uninitialized predicate in POSSIBLE_UNDERFLOW path
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 07/26/02 Algorithm changed, accuracy improved
+// 09/26/02 support of higher precision inputs added, underflow threshold
+// corrected
+// 11/15/02 Improved performance on Itanium 2, added possible over/under paths
+//
+//
+// API
+//*********************************************************************
+// float expf(float)
+//
+// Overview of operation
+//*********************************************************************
+// Take the input x. w is "how many log2/64 in x?"
+// w = x * 64/log2
+// NJ = int(w)
+// x = NJ*log2/64 + R
+
+// NJ = 64*n + j
+// x = n*log2 + (log2/64)*j + R
+//
+// So, exp(x) = 2^n * 2^(j/64)* exp(R)
+//
+// T = 2^n * 2^(j/64)
+// Construct 2^n
+// Get 2^(j/64) table
+// All entries of the 2^(j/64) table are stored in DP format with the
+// exponent bits set to 0, so multiplication by 2^n can be performed
+// with a logical "or" of the bits representing 2^n
+
+// exp(R) = 1 + (exp(R) - 1)
+// P = exp(R) - 1 approximated by Taylor series of 3rd degree
+// P = A3*R^3 + A2*R^2 + R, A3 = 1/6, A2 = 1/2
//
-#include "libm_support.h"
-
-// Assembly macros
-//==============================================================
-// integer registers used
-
- exp_GR_0x0f = r33
- exp_GR_0xf0 = r34
+// The final result is reconstructed as follows
+// exp(x) = T + T*P
- EXP_AD_P_1 = r36
- EXP_AD_P_2 = r37
- EXP_AD_T1 = r38
- EXP_AD_T2 = r39
- exp_GR_Mint = r40
+// Special values
+//*********************************************************************
+// expf(+0) = 1.0
+// expf(-0) = 1.0
- exp_GR_Mint_p_128 = r41
- exp_GR_Ind1 = r42
- EXP_AD_M1 = r43
- exp_GR_Ind2 = r44
- EXP_AD_M2 = r45
+// expf(+qnan) = +qnan
+// expf(-qnan) = -qnan
+// expf(+snan) = +qnan
+// expf(-snan) = -qnan
- exp_GR_min_oflow = r46
- exp_GR_max_zero = r47
- exp_GR_max_norm = r48
- exp_GR_max_uflow = r49
- exp_GR_min_norm = r50
+// expf(-inf) = +0
+// expf(+inf) = +inf
- exp_GR_17ones = r51
- exp_GR_gt_ln = r52
- exp_GR_T2_size = r53
+// Overflow and Underflow
+//*********************************************************************
+// expf(x) = largest single normal when
+// x = 88.72283 = 0x42b17217
- exp_GR_17ones_m1 = r56
- exp_GR_one = r57
+// expf(x) = smallest single normal when
+// x = -87.33654 = 0xc2aeac4f
+// expf(x) = largest round-to-nearest single zero when
+// x = -103.97208 = 0xc2cff1b5
-GR_SAVE_B0 = r53
-GR_SAVE_PFS = r55
-GR_SAVE_GP = r54
+// Registers used
+//*********************************************************************
+// Floating Point registers used:
+// f8, input
+// f6,f7, f9 -> f15, f32 -> f40
-GR_Parameter_X = r59
-GR_Parameter_Y = r60
-GR_Parameter_RESULT = r61
-GR_Parameter_TAG = r62
+// General registers used:
+// r3, r23 -> r38
-FR_X = f10
-FR_Y = f1
-FR_RESULT = f8
+// Predicate registers used:
+// p10 -> p15
+// Assembly macros
+//*********************************************************************
+// integer registers used
+// scratch
+rNJ = r3
+
+rTmp = r23
+rJ = r23
+rN = r24
+rTblAddr = r25
+rA3 = r26
+rExpHalf = r27
+rLn2Div64 = r28
+r17ones_m1 = r29
+rGt_ln = r29
+rRightShifter = r30
+r64DivLn2 = r31
+// stacked
+GR_SAVE_PFS = r32
+GR_SAVE_B0 = r33
+GR_SAVE_GP = r34
+GR_Parameter_X = r35
+GR_Parameter_Y = r36
+GR_Parameter_RESULT = r37
+GR_Parameter_TAG = r38
// floating point registers used
-
- EXP_MIN_SGL_OFLOW_ARG = f11
- EXP_MAX_SGL_ZERO_ARG = f12
- EXP_MAX_SGL_NORM_ARG = f13
- EXP_MAX_SGL_UFLOW_ARG = f14
- EXP_MIN_SGL_NORM_ARG = f15
-
- exp_coeff_P5 = f32
- exp_coeff_P6 = f33
- exp_coeff_P3 = f34
- exp_coeff_P4 = f35
-
- exp_coeff_P1 = f36
- exp_coeff_P2 = f37
- exp_Mx = f38
- exp_Mfloat = f39
- exp_R = f40
-
- exp_P1 = f41
- exp_P2 = f42
- exp_P3 = f43
- exp_Rsq = f44
- exp_R4 = f45
-
- exp_P4 = f46
- exp_P5 = f47
- exp_P6 = f48
- exp_P7 = f49
- exp_T1 = f50
-
- exp_T2 = f51
- exp_T = f52
- exp_A = f53
- exp_norm_f8 = f54
- exp_wre_urm_f8 = f55
-
- exp_ftz_urm_f8 = f56
- exp_gt_pln = f57
-
-
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
-
+FR_X = f10
+FR_Y = f1
+FR_RESULT = f8
+// scratch
+fRightShifter = f6
+f64DivLn2 = f7
+fNormX = f9
+fNint = f10
+fN = f11
+fR = f12
+fLn2Div64 = f13
+fA2 = f14
+fA3 = f15
+// stacked
+fP = f32
+fT = f33
+fMIN_SGL_OFLOW_ARG = f34
+fMAX_SGL_ZERO_ARG = f35
+fMAX_SGL_NORM_ARG = f36
+fMIN_SGL_NORM_ARG = f37
+fRSqr = f38
+fTmp = f39
+fGt_pln = f39
+fWre_urm_f8 = f40
+fFtz_urm_f8 = f40
+
+
+RODATA
.align 16
-exp_coeff_1_table:
-ASM_TYPE_DIRECTIVE(exp_coeff_1_table,@object)
-data8 0x3F56F35FDE4F8563 // p5
-data8 0x3F2A378BEFECCFDD // p6
-data8 0x3FE00000258C581D // p1
-data8 0x3FC555557AE7B3D4 // p2
-ASM_SIZE_DIRECTIVE(exp_coeff_1_table)
-
-
-exp_coeff_2_table:
-ASM_TYPE_DIRECTIVE(exp_coeff_2_table,@object)
-data8 0x3FA5551BB6592FAE // p3
-data8 0x3F8110E8EBFFD485 // p4
-ASM_SIZE_DIRECTIVE(exp_coeff_2_table)
-
-
-exp_T2_table:
-ASM_TYPE_DIRECTIVE(exp_T2_table,@object)
-data8 0xa175cf9cd7d85844 , 0x00003f46 // exp(-128)
-data8 0xdb7279415a1f9eed , 0x00003f47 // exp(-127)
-data8 0x95213b242bd8ca5f , 0x00003f49 // exp(-126)
-data8 0xcab03c968c989f83 , 0x00003f4a // exp(-125)
-data8 0x89bdb674702961ad , 0x00003f4c // exp(-124)
-data8 0xbb35a2eec278be35 , 0x00003f4d // exp(-123)
-data8 0xfe71b17f373e7e7a , 0x00003f4e // exp(-122)
-data8 0xace9a6ec52a39b63 , 0x00003f50 // exp(-121)
-data8 0xeb03423fe393cf1c , 0x00003f51 // exp(-120)
-data8 0x9fb52c5bcaef1693 , 0x00003f53 // exp(-119)
-data8 0xd910b6377ed60bf1 , 0x00003f54 // exp(-118)
-data8 0x9382dad8a9fdbfe4 , 0x00003f56 // exp(-117)
-data8 0xc87d0a84dea869a3 , 0x00003f57 // exp(-116)
-data8 0x883efb4c6d1087b0 , 0x00003f59 // exp(-115)
-data8 0xb92d7373dce9a502 , 0x00003f5a // exp(-114)
-data8 0xfbaeb020577fb0cb , 0x00003f5b // exp(-113)
-ASM_SIZE_DIRECTIVE(exp_T2_table)
-
-
-exp_T1_table:
-ASM_TYPE_DIRECTIVE(exp_T1_table,@object)
-data8 0x8000000000000000 , 0x00003fff // exp(16 * 0)
-data8 0x87975e8540010249 , 0x00004016 // exp(16 * 1)
-data8 0x8fa1fe625b3163ec , 0x0000402d // exp(16 * 2)
-data8 0x9826b576512a59d7 , 0x00004044 // exp(16 * 3)
-data8 0xa12cc167acbe6902 , 0x0000405b // exp(16 * 4)
-data8 0xaabbcdcc279f59e4 , 0x00004072 // exp(16 * 5)
-data8 0xb4dbfaadc045d16f , 0x00004089 // exp(16 * 6)
-data8 0xbf95e372ccdbf146 , 0x000040a0 // exp(16 * 7)
-data8 0xcaf2a62eea10bbfb , 0x000040b7 // exp(16 * 8)
-data8 0xd6fbeb62fddbd340 , 0x000040ce // exp(16 * 9)
-data8 0xe3bbee32e4a440ea , 0x000040e5 // exp(16 * 10)
-data8 0xf13d8517c34199a8 , 0x000040fc // exp(16 * 11)
-data8 0xff8c2b166241eedd , 0x00004113 // exp(16 * 12)
-data8 0x875a04c0b38d6129 , 0x0000412b // exp(16 * 13)
-data8 0x8f610127db6774d7 , 0x00004142 // exp(16 * 14)
-data8 0x97e1dd87e5c20bb6 , 0x00004159 // exp(16 * 15)
-ASM_SIZE_DIRECTIVE(exp_T1_table)
-
-// Argument Reduction
-// exp_Mx = (int)f8 ==> The value of f8 rounded to int is placed into the
-// significand of exp_Mx as a two's
-// complement number.
-
-// Later we want to have exp_Mx in a general register. Do this with a getf.sig
-// and call the general register exp_GR_Mint
-
-// exp_Mfloat = (float)(int)f8 ==> the two's complement number in
-// significand of exp_Mx is turned
-// into a floating point number.
-// R = 1 - exp_Mfloat ==> reduced argument
-
-// Core Approximation
-// Calculate a series in R
-// R * p6 + p5
-// R * p4 + p3
-// R * p2 + p1
-// R^2
-// R^4
-// R^2(R * p6 + p5) + (R * p4 + p3)
-// R^2(R * p2 + p1)
-// R^4(R^2(R * p6 + p5) + (R * p4 + p3)) + (R^2(R * p2 + p1))
-// R + 1
-// exp(R) = (1 + R) + R^4(R^2(R * p6 + p5) + (R * p4 + p3)) + (R^2(R * p2 + p1))
-// exp(R) = 1 + R + R^2 * p1 + R^3 * p2 + R^4 * p3 + R^5 * p4 + R^6 * p5 + R^7 * p6
-
-// Reconstruction
-// signficand of exp_Mx is two's complement,
-// -103 < x < 89
-// The smallest single denormal is 2^-149 = ssdn
-// For e^x = ssdn
-// x = log(ssdn) = -103.279
-// But with rounding result goes to ssdn until -103.972079
-// The largest single normal is 1.<23 1's> 2^126 ~ 2^127 = lsn
-// For e^x = lsn
-// x = log(lsn) = 88.7228
+LOCAL_OBJECT_START(_expf_table)
+data4 0x42b17218 // Smallest sgl arg to overflow sgl result, +88.7228
+data4 0xc2cff1b5 // Largest sgl for rnd-to-nearest 0 result, -103.9720
+data4 0x42b17217 // Largest sgl arg to give normal sgl result, +88.7228
+data4 0xc2aeac4f // Smallest sgl arg to give normal sgl result, -87.3365
//
-// expf overflows when x > 42b17218 = 88.7228
-// expf returns largest single denormal when x = c2aeac50
-// expf goes to zero when x < c2cff1b5
-
-// Consider range of 8-bit two's complement, -128 ---> 127
-// Add 128; range becomes 0 ---> 255
-
-// The number (=i) in 0 ---> 255 is used as offset into two tables.
-
-// i = abcd efgh = abcd * 16 + efgh = i1 * 16 + i2
-
-// i1 = (exp_GR_Mint + 128) & 0xf0 (show 0xf0 as -0x10 to avoid assembler error)
-// (The immediate in the AND is an 8-bit two's complement)
-// i1 = i1 + start of T1 table (EXP_AD_T1)
-// Note that the entries in T1 are double-extended numbers on 16-byte boundaries
-// and that i1 is already shifted left by 16 after the AND.
-
-// i2 must be shifted left by 4 before adding to the start of the table.
-// i2 = ((exp_GR_Mint + 128) & 0x0f) << 4
-// i2 = i2 + start of T2 table (EXP_AD_T2)
-
-// T = T1 * T2
-// A = T * (1 + R)
-// answer = T * (R^2 * p1 + R^3 * p2 + R^4 * p3 + R^5 * p4 + R^6 * p5 + R^7 * p6) +
-// T * (1 + R)
-// = T * exp(R)
-
+// 2^(j/64) table, j goes from 0 to 63; each entry holds only the DP fraction (significand) bits
+data8 0x0000000000000000 // 2^(0/64)
+data8 0x00002C9A3E778061 // 2^(1/64)
+data8 0x000059B0D3158574 // 2^(2/64)
+data8 0x0000874518759BC8 // 2^(3/64)
+data8 0x0000B5586CF9890F // 2^(4/64)
+data8 0x0000E3EC32D3D1A2 // 2^(5/64)
+data8 0x00011301D0125B51 // 2^(6/64)
+data8 0x0001429AAEA92DE0 // 2^(7/64)
+data8 0x000172B83C7D517B // 2^(8/64)
+data8 0x0001A35BEB6FCB75 // 2^(9/64)
+data8 0x0001D4873168B9AA // 2^(10/64)
+data8 0x0002063B88628CD6 // 2^(11/64)
+data8 0x0002387A6E756238 // 2^(12/64)
+data8 0x00026B4565E27CDD // 2^(13/64)
+data8 0x00029E9DF51FDEE1 // 2^(14/64)
+data8 0x0002D285A6E4030B // 2^(15/64)
+data8 0x000306FE0A31B715 // 2^(16/64)
+data8 0x00033C08B26416FF // 2^(17/64)
+data8 0x000371A7373AA9CB // 2^(18/64)
+data8 0x0003A7DB34E59FF7 // 2^(19/64)
+data8 0x0003DEA64C123422 // 2^(20/64)
+data8 0x0004160A21F72E2A // 2^(21/64)
+data8 0x00044E086061892D // 2^(22/64)
+data8 0x000486A2B5C13CD0 // 2^(23/64)
+data8 0x0004BFDAD5362A27 // 2^(24/64)
+data8 0x0004F9B2769D2CA7 // 2^(25/64)
+data8 0x0005342B569D4F82 // 2^(26/64)
+data8 0x00056F4736B527DA // 2^(27/64)
+data8 0x0005AB07DD485429 // 2^(28/64)
+data8 0x0005E76F15AD2148 // 2^(29/64)
+data8 0x0006247EB03A5585 // 2^(30/64)
+data8 0x0006623882552225 // 2^(31/64)
+data8 0x0006A09E667F3BCD // 2^(32/64)
+data8 0x0006DFB23C651A2F // 2^(33/64)
+data8 0x00071F75E8EC5F74 // 2^(34/64)
+data8 0x00075FEB564267C9 // 2^(35/64)
+data8 0x0007A11473EB0187 // 2^(36/64)
+data8 0x0007E2F336CF4E62 // 2^(37/64)
+data8 0x00082589994CCE13 // 2^(38/64)
+data8 0x000868D99B4492ED // 2^(39/64)
+data8 0x0008ACE5422AA0DB // 2^(40/64)
+data8 0x0008F1AE99157736 // 2^(41/64)
+data8 0x00093737B0CDC5E5 // 2^(42/64)
+data8 0x00097D829FDE4E50 // 2^(43/64)
+data8 0x0009C49182A3F090 // 2^(44/64)
+data8 0x000A0C667B5DE565 // 2^(45/64)
+data8 0x000A5503B23E255D // 2^(46/64)
+data8 0x000A9E6B5579FDBF // 2^(47/64)
+data8 0x000AE89F995AD3AD // 2^(48/64)
+data8 0x000B33A2B84F15FB // 2^(49/64)
+data8 0x000B7F76F2FB5E47 // 2^(50/64)
+data8 0x000BCC1E904BC1D2 // 2^(51/64)
+data8 0x000C199BDD85529C // 2^(52/64)
+data8 0x000C67F12E57D14B // 2^(53/64)
+data8 0x000CB720DCEF9069 // 2^(54/64)
+data8 0x000D072D4A07897C // 2^(55/64)
+data8 0x000D5818DCFBA487 // 2^(56/64)
+data8 0x000DA9E603DB3285 // 2^(57/64)
+data8 0x000DFC97337B9B5F // 2^(58/64)
+data8 0x000E502EE78B3FF6 // 2^(59/64)
+data8 0x000EA4AFA2A490DA // 2^(60/64)
+data8 0x000EFA1BEE615A27 // 2^(61/64)
+data8 0x000F50765B6E4540 // 2^(62/64)
+data8 0x000FA7C1819E90D8 // 2^(63/64)
+LOCAL_OBJECT_END(_expf_table)
-.global expf#
.section .text
-.proc expf#
-.align 32
-expf:
-#ifdef _LIBC
-.global __ieee754_expf#
-__ieee754_expf:
-#endif
-
-{ .mfi
- alloc r32 = ar.pfs,1,26,4,0
- fcvt.fx.s1 exp_Mx = f8
- mov exp_GR_17ones = 0x1FFFF
+GLOBAL_IEEE754_ENTRY(expf)
+
+{ .mlx
+ addl rTblAddr = @ltoff(_expf_table),gp
+ movl r64DivLn2 = 0x40571547652B82FE // 64/ln(2)
}
{ .mlx
- addl EXP_AD_P_1 = @ltoff(exp_coeff_1_table),gp
- movl exp_GR_min_oflow = 0x42b17218
+ addl rA3 = 0x3E2AA, r0 // high bits of 1.0/6.0 rounded to SP
+ movl rRightShifter = 0x43E8000000000000 // DP Right Shifter
}
;;
-// Fnorm done to take any enabled faults
{ .mfi
- ld8 EXP_AD_P_1 = [EXP_AD_P_1]
- fclass.m p6,p0 = f8, 0x07 //@zero
- nop.i 999
+ // point to the beginning of the table
+ ld8 rTblAddr = [rTblAddr]
+ fclass.m p14, p0 = f8, 0x22 // test for -INF
+ shl rA3 = rA3, 12 // 0x3E2AA000, approx to 1.0/6.0 in SP
}
{ .mfi
- add exp_GR_max_norm = -1, exp_GR_min_oflow // 0x42b17217
- fnorm exp_norm_f8 = f8
- nop.i 999
+ nop.m 0
+ fnorm.s1 fNormX = f8 // normalized x
+ addl rExpHalf = 0xFFFE, r0 // exponent of 1/2
}
;;
{ .mfi
- setf.s EXP_MIN_SGL_OFLOW_ARG = exp_GR_min_oflow // 0x42b17218
- fclass.m p7,p0 = f8, 0x22 // Test for x=-inf
- mov exp_GR_0xf0 = 0x0f0
+ setf.d f64DivLn2 = r64DivLn2 // load 64/ln(2) to FP reg
+ fclass.m p15, p0 = f8, 0x1e1 // test for NaT,NaN,+Inf
+ nop.i 0
}
{ .mlx
- setf.s EXP_MAX_SGL_NORM_ARG = exp_GR_max_norm
- movl exp_GR_max_zero = 0xc2cff1b5
+ // load Right Shifter to FP reg
+ setf.d fRightShifter = rRightShifter
+ movl rLn2Div64 = 0x3F862E42FEFA39EF // DP ln(2)/64 in GR
}
;;
-
-{ .mlx
- mov exp_GR_0x0f = 0x00f
- movl exp_GR_max_uflow = 0xc2aeac50
+{ .mfi
+ nop.m 0
+ fcmp.eq.s1 p13, p0 = f0, f8 // test for x = 0.0
+ nop.i 0
}
{ .mfb
- nop.m 999
-(p6) fma.s f8 = f1,f1,f0
-(p6) br.ret.spnt b0 // quick exit for x=0
+ setf.s fA3 = rA3 // load A3 to FP reg
+(p14) fma.s.s0 f8 = f0, f1, f0 // result if x = -inf
+(p14) br.ret.spnt b0 // exit here if x = -inf
}
;;
{ .mfi
- setf.s EXP_MAX_SGL_ZERO_ARG = exp_GR_max_zero
- fclass.m p8,p0 = f8, 0x21 // Test for x=+inf
- adds exp_GR_min_norm = 1, exp_GR_max_uflow // 0xc2aeac51
+ setf.exp fA2 = rExpHalf // load A2 to FP reg
+ fcmp.eq.s0 p6, p0 = f8, f0 // Dummy to flag denorm
+ nop.i 0
}
{ .mfb
- ldfpd exp_coeff_P5,exp_coeff_P6 = [EXP_AD_P_1],16
-(p7) fma.s f8 = f0,f0,f0
-(p7) br.ret.spnt b0 // quick exit for x=-inf
+ setf.d fLn2Div64 = rLn2Div64 // load ln(2)/64 to FP reg
+(p15) fma.s.s0 f8 = f8, f1, f0 // result if x = NaT,NaN,+Inf
+(p15) br.ret.spnt b0 // exit here if x = NaT,NaN,+Inf
}
;;
-{ .mmf
- ldfpd exp_coeff_P1,exp_coeff_P2 = [EXP_AD_P_1],16
- setf.s EXP_MAX_SGL_UFLOW_ARG = exp_GR_max_uflow
- fclass.m p9,p0 = f8, 0xc3 // Test for x=nan
-}
-;;
-
-{ .mmb
- ldfpd exp_coeff_P3,exp_coeff_P4 = [EXP_AD_P_1],16
- setf.s EXP_MIN_SGL_NORM_ARG = exp_GR_min_norm
-(p8) br.ret.spnt b0 // quick exit for x=+inf
+{ .mfb
+ // overflow and underflow_zero threshold
+ ldfps fMIN_SGL_OFLOW_ARG, fMAX_SGL_ZERO_ARG = [rTblAddr], 8
+(p13) fma.s.s0 f8 = f1, f1, f0 // result if x = 0.0
+(p13) br.ret.spnt b0 // exit here if x =0.0
}
;;
-// EXP_AD_P_1 now points to exp_T2_table
+ // max normal and underflow_denorm threshold
{ .mfi
- mov exp_GR_T2_size = 0x100
- fcvt.xf exp_Mfloat = exp_Mx
- nop.i 999
+ ldfps fMAX_SGL_NORM_ARG, fMIN_SGL_NORM_ARG = [rTblAddr], 8
+ nop.f 0
+ nop.i 0
}
;;
-{ .mfb
- getf.sig exp_GR_Mint = exp_Mx
-(p9) fmerge.s f8 = exp_norm_f8, exp_norm_f8
-(p9) br.ret.spnt b0 // quick exit for x=nan
+{ .mfi
+ nop.m 0
+ // x*(64/ln(2)) + Right Shifter
+ fma.s1 fNint = fNormX, f64DivLn2, fRightShifter
+ nop.i 0
}
;;
-{ .mmi
- nop.m 999
- mov EXP_AD_T2 = EXP_AD_P_1
- add EXP_AD_T1 = exp_GR_T2_size,EXP_AD_P_1 ;;
-}
-
-
-{ .mmi
- adds exp_GR_Mint_p_128 = 0x80,exp_GR_Mint ;;
- and exp_GR_Ind1 = exp_GR_Mint_p_128, exp_GR_0xf0
- and exp_GR_Ind2 = exp_GR_Mint_p_128, exp_GR_0x0f ;;
-}
-
// Divide arguments into the following categories:
-// Certain Underflow/zero p11 - -inf < x <= MAX_SGL_ZERO_ARG
-// Certain Underflow p12 - MAX_SGL_ZERO_ARG < x <= MAX_SGL_UFLOW_ARG
-// Possible Underflow p13 - MAX_SGL_UFLOW_ARG < x < MIN_SGL_NORM_ARG
+// Certain Underflow p11 - -inf < x <= MAX_SGL_ZERO_ARG
+// Possible Underflow p13 - MAX_SGL_ZERO_ARG < x < MIN_SGL_NORM_ARG
// Certain Safe - MIN_SGL_NORM_ARG <= x <= MAX_SGL_NORM_ARG
// Possible Overflow p14 - MAX_SGL_NORM_ARG < x < MIN_SGL_OFLOW_ARG
// Certain Overflow p15 - MIN_SGL_OFLOW_ARG <= x < +inf
//
-// If the input is really a single arg, then there will never be "Possible
-// Underflow" or "Possible Overflow" arguments.
+// If the input is really a single arg, then there will never be
+// "Possible Overflow" arguments.
//
{ .mfi
- add EXP_AD_M1 = exp_GR_Ind1,EXP_AD_T1
- fcmp.ge.s1 p15,p14 = exp_norm_f8,EXP_MIN_SGL_OFLOW_ARG
- nop.i 999
-}
-{ .mfi
- shladd EXP_AD_M2 = exp_GR_Ind2,4,EXP_AD_T2
- fms.s1 exp_R = f1,f8,exp_Mfloat
- nop.i 999 ;;
+ nop.m 0
+ // check for overflow
+ fcmp.ge.s1 p15, p0 = fNormX, fMIN_SGL_OFLOW_ARG
+ nop.i 0
}
+;;
{ .mfi
- ldfe exp_T1 = [EXP_AD_M1]
- fcmp.le.s1 p11,p12 = exp_norm_f8,EXP_MAX_SGL_ZERO_ARG
- nop.i 999 ;;
+ nop.m 0
+ // check for underflow and tiny (+0) result
+ fcmp.le.s1 p11, p0 = fNormX, fMAX_SGL_ZERO_ARG
+ nop.i 0
}
-
{ .mfb
- ldfe exp_T2 = [EXP_AD_M2]
-(p14) fcmp.gt.s1 p14,p0 = exp_norm_f8,EXP_MAX_SGL_NORM_ARG
-(p15) br.cond.spnt L(EXP_CERTAIN_OVERFLOW) ;;
-}
-
-{ .mfb
- nop.m 999
-(p12) fcmp.le.s1 p12,p0 = exp_norm_f8,EXP_MAX_SGL_UFLOW_ARG
-(p11) br.cond.spnt L(EXP_CERTAIN_UNDERFLOW_ZERO)
+ nop.m 0
+ fms.s1 fN = fNint, f1, fRightShifter // n in FP register
+ // branch out if overflow
+(p15) br.cond.spnt EXP_CERTAIN_OVERFLOW
}
;;
-{ .mfi
- nop.m 999
-(p13) fcmp.lt.s1 p13,p0 = exp_norm_f8,EXP_MIN_SGL_NORM_ARG
- nop.i 999
+{ .mfb
+ getf.sig rNJ = fNint // bits of n, j
+ // check for underflow and denormal result
+ fcmp.lt.s1 p13, p0 = fNormX, fMIN_SGL_NORM_ARG
+ // branch out if underflow and tiny (+0) result
+(p11) br.cond.spnt EXP_CERTAIN_UNDERFLOW
}
;;
-
{ .mfi
- nop.m 999
- fma.s1 exp_Rsq = exp_R,exp_R,f0
- nop.i 999
+ nop.m 0
+ // check for possible overflow
+ fcmp.gt.s1 p14, p0 = fNormX, fMAX_SGL_NORM_ARG
+ extr.u rJ = rNJ, 0, 6 // bits of j
}
{ .mfi
- nop.m 999
- fma.s1 exp_P3 = exp_R,exp_coeff_P2,exp_coeff_P1
- nop.i 999
+ addl rN = 0xFFFF - 63, rNJ // biased and shifted n
+ fnma.s1 fR = fLn2Div64, fN, fNormX // R = x - N*ln(2)/64
+ nop.i 0
}
;;
{ .mfi
- nop.m 999
- fma.s1 exp_P1 = exp_R,exp_coeff_P6,exp_coeff_P5
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 exp_P2 = exp_R,exp_coeff_P4,exp_coeff_P3
- nop.i 999
+ shladd rJ = rJ, 3, rTblAddr // address in the 2^(j/64) table
+ nop.f 0
+ shr rN = rN, 6 // biased n
}
;;
-
{ .mfi
- nop.m 999
- fma.s1 exp_P7 = f1,exp_R,f1
- nop.i 999
+ ld8 rJ = [rJ]
+ nop.f 0
+ shl rN = rN, 52 // 2^n bits in DP format
}
;;
-
-{ .mfi
- nop.m 999
- fma.s1 exp_P5 = exp_Rsq,exp_P3,f0
- nop.i 999
-}
{ .mfi
- nop.m 999
- fma.s1 exp_R4 = exp_Rsq,exp_Rsq,f0
- nop.i 999
+ or rN = rN, rJ // bits of 2^n * 2^(j/64) in DP format
+ nop.f 0
+ nop.i 0
}
;;
{ .mfi
- nop.m 999
- fma.s1 exp_T = exp_T1,exp_T2,f0
- nop.i 999
+ setf.d fT = rN // 2^n * 2^(j/64)
+ fma.s1 fP = fA3, fR, fA2 // A3*R + A2
+ nop.i 0
}
{ .mfi
- nop.m 999
- fma.s1 exp_P4 = exp_Rsq,exp_P1,exp_P2
- nop.i 999
+ nop.m 0
+ fma.s1 fRSqr = fR, fR, f0 // R^2
+ nop.i 0
}
;;
{ .mfi
- nop.m 999
- fma.s1 exp_A = exp_T,exp_P7,f0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 exp_P6 = exp_R4,exp_P4,exp_P5
- nop.i 999
+ nop.m 0
+ fma.s1 fP = fP, fRSqr, fR // P = (A3*R + A2)*R^2 + R
+ nop.i 0
}
;;
-{ .bbb
-(p12) br.cond.spnt L(EXP_CERTAIN_UNDERFLOW)
-(p13) br.cond.spnt L(EXP_POSSIBLE_UNDERFLOW)
-(p14) br.cond.spnt L(EXP_POSSIBLE_OVERFLOW)
+{ .mbb
+ nop.m 0
+ // branch out if possible underflow
+(p13) br.cond.spnt EXP_POSSIBLE_UNDERFLOW
+ // branch out if possible overflow result
+(p14) br.cond.spnt EXP_POSSIBLE_OVERFLOW
}
;;
{ .mfb
- nop.m 999
- fma.s f8 = exp_T,exp_P6,exp_A
- br.ret.sptk b0
+ nop.m 0
+ // final result in the absence of over- and underflow
+ fma.s.s0 f8 = fP, fT, fT
+ // exit here in the absence of over- and underflow
+ br.ret.sptk b0
}
;;
-L(EXP_POSSIBLE_OVERFLOW):
-
-// We got an answer. EXP_MAX_SGL_NORM_ARG < x < EXP_MIN_SGL_OFLOW_ARG
-// overflow is a possibility, not a certainty
-// Set wre in s2 and perform the last operation with s2
-
-// We define an overflow when the answer with
-// WRE set
-// user-defined rounding mode
-// is lsn +1
-
-// Is the exponent 1 more than the largest single?
-// If so, go to ERROR RETURN, else (no overflow) get the answer and
-// leave.
-
-// Largest single is FE (biased single)
-// FE - 7F + FFFF = 1007E
+EXP_POSSIBLE_OVERFLOW:
-// Create + largest_single_plus_ulp
-// Create - largest_single_plus_ulp
+// Here if fMAX_SGL_NORM_ARG < x < fMIN_SGL_OFLOW_ARG
+// This cannot happen if the input is a single, only if the input has higher precision.
+// Overflow is a possibility, not a certainty.
-// Calculate answer with WRE set.
-
-// Cases when answer is lsn+1 are as follows:
-
-// midpoint
-// |
-// lsn | lsn+1
-// --+----------|----------+------------
-// |
-// +inf +inf -inf
-// RN RN
-// RZ
-// exp_gt_pln contains the floating point number lsn+1.
-// The setf.exp puts 0x1007f in the exponent and 0x800... in the significand.
-
-// If the answer is >= lsn+1, we have overflowed.
-// Then p6 is TRUE. Set the overflow tag, save input in FR_X,
-// do the final calculation for IEEE result, and branch to error return.
+// Recompute result using status field 2 with user's rounding mode,
+// and wre set. If result is larger than largest single, then we have
+// overflow
{ .mfi
- mov exp_GR_gt_ln = 0x1007F
- fsetc.s2 0x7F,0x42
- nop.i 999
+ mov rGt_ln = 0x1007f // Exponent for largest single + 1 ulp
+ fsetc.s2 0x7F,0x42 // Get user's round mode, set wre
+ nop.i 0
}
;;
{ .mfi
- setf.exp exp_gt_pln = exp_GR_gt_ln
- fma.s.s2 exp_wre_urm_f8 = exp_T, exp_P6, exp_A
- nop.i 999
+ setf.exp fGt_pln = rGt_ln // Create largest single + 1 ulp
+ fma.s.s2 fWre_urm_f8 = fP, fT, fT // Result with wre set
+ nop.i 0
}
;;
{ .mfi
- nop.m 999
- fsetc.s2 0x7F,0x40
- nop.i 999
+ nop.m 0
+ fsetc.s2 0x7F,0x40 // Turn off wre in sf2
+ nop.i 0
}
;;
{ .mfi
- nop.m 999
- fcmp.ge.unc.s1 p6, p0 = exp_wre_urm_f8, exp_gt_pln
- nop.i 999
+ nop.m 0
+ fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow
+ nop.i 0
}
;;
{ .mfb
- nop.m 999
- nop.f 999
-(p6) br.cond.spnt L(EXP_CERTAIN_OVERFLOW) // Branch if really overflow
+ nop.m 0
+ nop.f 0
+(p6) br.cond.spnt EXP_CERTAIN_OVERFLOW // Branch if overflow
}
;;
{ .mfb
- nop.m 999
- fma.s f8 = exp_T, exp_P6, exp_A
- br.ret.sptk b0 // Exit if really no overflow
+ nop.m 0
+ fma.s.s0 f8 = fP, fT, fT
+ br.ret.sptk b0 // Exit if really no overflow
}
;;
-L(EXP_CERTAIN_OVERFLOW):
+// here if overflow
+EXP_CERTAIN_OVERFLOW:
{ .mmi
- sub exp_GR_17ones_m1 = exp_GR_17ones, r0, 1 ;;
- setf.exp f9 = exp_GR_17ones_m1
- nop.i 999 ;;
+ addl r17ones_m1 = 0x1FFFE, r0
+;;
+ setf.exp fTmp = r17ones_m1
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
- fmerge.s FR_X = f8,f8
- nop.i 999
+ alloc r32=ar.pfs,0,3,4,0
+ fmerge.s FR_X = f8,f8
+ nop.i 0
}
{ .mfb
- mov GR_Parameter_TAG = 16
- fma.s FR_RESULT = f9, f9, f0 // Set I,O and +INF result
- br.cond.sptk __libm_error_region ;;
+ mov GR_Parameter_TAG = 16
+ fma.s.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result
+ br.cond.sptk __libm_error_region
}
+;;
-L(EXP_POSSIBLE_UNDERFLOW):
+EXP_POSSIBLE_UNDERFLOW:
-// We got an answer. EXP_MAX_SGL_UFLOW_ARG < x < EXP_MIN_SGL_NORM_ARG
-// underflow is a possibility, not a certainty
+// Here if fMAX_SGL_ZERO_ARG < x < fMIN_SGL_NORM_ARG
+// Underflow is a possibility, not a certainty
// We define an underflow when the answer with
// ftz set
@@ -637,144 +549,157 @@ L(EXP_POSSIBLE_UNDERFLOW):
// E
// -----+--------------------+--------------------+-----
// | | |
-// 1.1...10 2^-7f 1.1...11 2^-7f 1.0...00 2^-7e
-// 0.1...11 2^-7e (biased, 1)
+// 1.1...10 2^-3fff 1.1...11 2^-3fff 1.0...00 2^-3ffe
+// 0.1...11 2^-3ffe (biased, 1)
// largest dn smallest normal
-// If the answer is = 0, we have underflowed.
-// Then p6 is TRUE. Set the underflow tag, save input in FR_X,
-// do the final calculation for IEEE result, and branch to error return.
-
{ .mfi
- nop.m 999
- fsetc.s2 0x7F,0x41
- nop.i 999
+ nop.m 0
+ fsetc.s2 0x7F,0x41 // Get user's round mode, set ftz
+ nop.i 0
}
;;
{ .mfi
- nop.m 999
- fma.s.s2 exp_ftz_urm_f8 = exp_T, exp_P6, exp_A
- nop.i 999
+ nop.m 0
+ fma.s.s2 fFtz_urm_f8 = fP, fT, fT // Result with ftz set
+ nop.i 0
}
;;
-
{ .mfi
- nop.m 999
- fsetc.s2 0x7F,0x40
- nop.i 999
+ nop.m 0
+ fsetc.s2 0x7F,0x40 // Turn off ftz in sf2
+ nop.i 0
}
;;
{ .mfi
- nop.m 999
- fcmp.eq.unc.s1 p6, p0 = exp_ftz_urm_f8, f0
- nop.i 999
+ nop.m 0
+ fcmp.eq.s1 p6, p7 = fFtz_urm_f8, f0 // Test for underflow
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s.s0 f8 = fP, fT, fT // Compute result, set I, maybe U
+ nop.i 0
}
;;
-{ .mfb
- nop.m 999
- nop.f 999
-(p6) br.cond.spnt L(EXP_CERTAIN_UNDERFLOW) // Branch if really underflow
+{ .mbb
+ nop.m 0
+(p6) br.cond.spnt EXP_UNDERFLOW_COMMON // Branch if really underflow
+(p7) br.ret.sptk b0 // Exit if really no underflow
+}
+;;
+
+EXP_CERTAIN_UNDERFLOW:
+// Here if x <= fMAX_SGL_ZERO_ARG
+// Result will be zero (or smallest denorm if round to +inf) with I, U set
+{ .mmi
+ mov rTmp = 1
+;;
+ setf.exp fTmp = rTmp // Form small normal
+ nop.i 0
}
;;
{ .mfb
- nop.m 999
- fma.s f8 = exp_T, exp_P6, exp_A
- br.ret.sptk b0 // Exit if really no underflow
+ nop.m 0
+ fma.s.s0 f8 = fTmp, fTmp, f0 // Set I,U, tiny (+0.0) result
+ br.cond.sptk EXP_UNDERFLOW_COMMON
}
;;
-L(EXP_CERTAIN_UNDERFLOW):
+EXP_UNDERFLOW_COMMON:
+// Determine if underflow result is zero or nonzero
{ .mfi
- nop.m 999
- fmerge.s FR_X = f8,f8
- nop.i 999
+ alloc r32=ar.pfs,0,3,4,0
+ fcmp.eq.s1 p6, p0 = f8, f0
+ nop.i 0
}
+;;
+
{ .mfb
- mov GR_Parameter_TAG = 17
- fma.s FR_RESULT = exp_T, exp_P6, exp_A // Set I,U and tiny result
- br.cond.sptk __libm_error_region ;;
+ nop.m 0
+ fmerge.s FR_X = fNormX,fNormX
+(p6) br.cond.spnt EXP_UNDERFLOW_ZERO
}
+;;
-L(EXP_CERTAIN_UNDERFLOW_ZERO):
-{ .mmi
- mov exp_GR_one = 1 ;;
- setf.exp f9 = exp_GR_one
- nop.i 999 ;;
+EXP_UNDERFLOW_NONZERO:
+// Here if x < fMIN_SGL_NORM_ARG and result nonzero;
+// I, U are set
+{ .mfb
+ mov GR_Parameter_TAG = 17
+ nop.f 0 // FR_RESULT already set
+ br.cond.sptk __libm_error_region
}
+;;
-{ .mfi
- nop.m 999
- fmerge.s FR_X = f8,f8
- nop.i 999
-}
+EXP_UNDERFLOW_ZERO:
+// Here if x < fMIN_SGL_NORM_ARG and result zero;
+// I, U are set
{ .mfb
- mov GR_Parameter_TAG = 17
- fma.s FR_RESULT = f9, f9, f0 // Set I,U and tiny (+0.0) result
- br.cond.sptk __libm_error_region ;;
+ mov GR_Parameter_TAG = 17
+ nop.f 0 // FR_RESULT already set
+ br.cond.sptk __libm_error_region
}
+;;
-.endp expf
-ASM_SIZE_DIRECTIVE(expf)
-
+GLOBAL_IEEE754_END(expf)
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
- add GR_Parameter_Y=-32,sp // Parameter 2 value
- nop.f 999
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
+ nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
- nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
- stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack
- add GR_Parameter_X = 16,sp // Parameter 1 address
+ stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mfi
- stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
- nop.f 0
- add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
+ nop.f 0
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
}
{ .mib
- stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
- add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
- nop.m 0
- nop.m 0
- add GR_Parameter_RESULT = 48,sp
+ add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
{ .mmi
- ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
+ ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
- add sp = 64,sp // Restore stack pointer
- mov b0 = GR_SAVE_B0 // Restore return address
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
- mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
- br.ret.sptk b0 // Return
-};;
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
diff --git a/sysdeps/ia64/fpu/e_fmod.S b/sysdeps/ia64/fpu/e_fmod.S
index 2b3ee9610f..d801e0c128 100644
--- a/sysdeps/ia64/fpu/e_fmod.S
+++ b/sysdeps/ia64/fpu/e_fmod.S
@@ -1,11 +1,10 @@
.file "fmod.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
-// Bob Norin, Shane Story, and Ping Tak Peter Tang of the Computational
-// Software Lab, Intel Corporation.
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -21,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -36,38 +35,42 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//====================================================================
-// 2/02/00 Initial version
-// 3/02/00 New Algorithm
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 Initial version
+// 03/02/00 New Algorithm
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
-//11/28/00 Set FR_Y to f9
+// 11/28/00 Set FR_Y to f9
+// 03/11/02 Fixed flags for fmod(qnan,zero)
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+// 04/28/03 Fix: fmod(sNaN,0) no longer sets errno
//
// API
//====================================================================
-// double fmod(double,double);
+// double fmod(double,double);
//
// Overview of operation
//====================================================================
// fmod(a,b)=a-i*b,
-// where i is an integer such that, if b!=0,
+// where i is an integer such that, if b!=0,
// |i|<|a/b| and |a/b-i|<1
//
// Algorithm
//====================================================================
// a). if |a|<|b|, return a
-// b). get quotient and reciprocal overestimates accurate to
+// b). get quotient and reciprocal overestimates accurate to
// 33 bits (q2,y2)
// c). if the exponent difference (exponent(a)-exponent(b))
// is less than 32, truncate quotient to integer and
// finish in one iteration
// d). if exponent(a)-exponent(b)>=32 (q2>=2^32)
// round quotient estimate to single precision (k=RN(q2)),
-// calculate partial remainder (a'=a-k*b),
+// calculate partial remainder (a'=a-k*b),
// get quotient estimate (a'*y2), and repeat from c).
//
// Special cases
@@ -81,14 +84,9 @@
// General registers: r2,r29,r32 (ar.pfs), r33-r39
// Floating point registers: f6-f15
-#include "libm_support.h"
-
-.section .text
-
-
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
-GR_SAVE_GP = r35
+GR_SAVE_GP = r35
GR_SAVE_SP = r36
GR_Parameter_X = r37
@@ -101,17 +99,9 @@ FR_Y = f9
FR_RESULT = f8
-.proc fmod#
-.align 32
-.global fmod#
-.align 32
+.section .text
+GLOBAL_IEEE754_ENTRY(fmod)
-fmod:
-#ifdef _LIBC
-.global __ieee754_fmod
-.type __ieee754_fmod,@function
-__ieee754_fmod:
-#endif
// inputs in f8, f9
// result in f8
@@ -133,12 +123,12 @@ __ieee754_fmod:
// (1) y0
frcpa.s1 f10,p6=f6,f7
nop.i 0
-}
+}
// Y +-NAN, +-inf, +-0? p7
{ .mfi
nop.m 999
-(p0) fclass.m.unc p7,p0 = f9, 0xe7
+ fclass.m.unc p7,p0 = f9, 0xe7
nop.i 999;;
}
@@ -149,14 +139,14 @@ __ieee754_fmod:
{ .mfi
nop.m 999
-(p0) fclass.m.unc p9,p0 = f8, 0xe3
- nop.i 999
+ fclass.m.unc p9,p0 = f8, 0xe3
+ nop.i 999
}
// |x| < |y|? Return x p8
{ .mfi
nop.m 999
-(p0) fcmp.lt.unc.s1 p8,p0 = f6,f7
+ fcmp.lt.unc.s1 p8,p0 = f6,f7
nop.i 999 ;;
}
@@ -172,33 +162,33 @@ __ieee754_fmod:
// (2) q0=a*y0
(p6) fma.s1 f13=f6,f10,f0
nop.i 0
-}
+}
{ .mfi
nop.m 0
// (3) e0 = 1 - b * y0
(p6) fnma.s1 f12=f7,f10,f1
nop.i 0;;
-}
+}
{.mfi
nop.m 0
// normalize x (if |x|<|y|)
(p8) fma.d.s0 f8=f8,f1,f0
nop.i 0
-}
+}
{.bbb
- (p9) br.cond.spnt L(FMOD_X_NAN_INF)
- (p7) br.cond.spnt L(FMOD_Y_NAN_INF_ZERO)
+ (p9) br.cond.spnt FMOD_X_NAN_INF
+ (p7) br.cond.spnt FMOD_Y_NAN_INF_ZERO
// if |x|<|y|, return
(p8) br.ret.spnt b0;;
}
- {.mfi
+ {.mfi
nop.m 0
// normalize x
fma.s0 f6=f6,f1,f0
nop.i 0
-}
+}
{.mfi
nop.m 0
// normalize y
@@ -212,45 +202,45 @@ __ieee754_fmod:
// (4) q1=q0+e0*q0
(p6) fma.s1 f13=f12,f13,f13
nop.i 0
-}
+}
{ .mfi
nop.m 0
// (5) e1 = e0 * e0 + 2^-34
(p6) fma.s1 f14=f12,f12,f11
nop.i 0;;
-}
+}
{.mlx
nop.m 0
movl r2=0x33a00000;;
-}
+}
{ .mfi
nop.m 0
// (6) y1 = y0 + e0 * y0
(p6) fma.s1 f10=f12,f10,f10
nop.i 0;;
-}
+}
{.mfi
// set f12=1.25*2^{-24}
setf.s f12=r2
// (7) q2=q1+e1*q1
(p6) fma.s1 f13=f13,f14,f13
nop.i 0;;
-}
+}
{.mfi
nop.m 0
fmerge.s f9=f8,f9
nop.i 0
-}
+}
{ .mfi
nop.m 0
// (8) y2 = y1 + e1 * y1
(p6) fma.s1 f10=f14,f10,f10
// set p6=0, p10=0
cmp.ne.and p6,p10=r0,r0;;
-}
+}
.align 32
-L(loop53):
+loop53:
{.mfi
nop.m 0
// compare q2, 2^32
@@ -280,7 +270,7 @@ L(loop53):
// normalize truncated quotient
(p8) fcvt.xf f13=f11
nop.i 0;;
-}
+}
{ .mfi
nop.m 0
// calculate remainder (assuming f13=RZ(Q))
@@ -289,7 +279,7 @@ L(loop53):
}
{.mfi
nop.m 0
- // also if exponent>32, round quotient to single precision
+ // also if exponent>32, round quotient to single precision
// and subtract 1 ulp: q=q-q*(1.25*2^{-24})
(p7) fnma.s.s1 f11=f13,f12,f13
nop.i 0;;
@@ -332,7 +322,7 @@ L(loop53):
.pred.rel "mutex",p6,p10
{.mfb
nop.m 0
- // add b to estimated remainder (to cover the case when the quotient was overestimated)
+ // add b to estimated remainder (to cover the case when the quotient was overestimated)
// also set correct sign by using f9=|b|*sgn(a), f12=sgn(a)
(p6) fma.d.s0 f8=f11,f12,f9
nop.b 0
@@ -354,97 +344,114 @@ L(loop53):
nop.m 0
// if f14 was RZ(Q), set remainder to f14
(p9) mov f6=f14
- br.cond.sptk L(loop53);;
+ br.cond.sptk loop53;;
}
-L(FMOD_X_NAN_INF):
+FMOD_X_NAN_INF:
// Y zero ?
-{.mfi
+{.mfi
+ nop.m 0
+ fclass.m p10,p0=f8,0xc3 // Test x=nan
+ nop.i 0
+}
+{.mfi
nop.m 0
fma.s1 f10=f9,f1,f0
nop.i 0;;
}
+
{.mfi
+ nop.m 0
+ fma.s0 f8=f8,f1,f0
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+(p10) fclass.m p10,p0=f9,0x07 // Test x=nan, and y=zero
+ nop.i 0;;
+}
+
+{.mfb
nop.m 0
fcmp.eq.unc.s1 p11,p0=f10,f0
- nop.i 0;;
+(p10) br.ret.spnt b0;; // Exit with result=x if x=nan and y=zero
}
{.mib
nop.m 0
nop.i 0
// if Y zero
- (p11) br.cond.spnt L(FMOD_Y_ZERO);;
+ (p11) br.cond.spnt FMOD_Y_ZERO;;
}
// X infinity? Return QNAN indefinite
{ .mfi
nop.m 999
-(p0) fclass.m.unc p8,p9 = f8, 0x23
- nop.i 999;;
+ fclass.m.unc p8,p9 = f8, 0x23
+ nop.i 999;;
}
// Y NaN ?
{.mfi
- nop.m 999
+ nop.m 999
(p8) fclass.m p9,p8=f9,0xc3
- nop.i 0;;
+ nop.i 0;;
}
{.mfi
- nop.m 999
-(p8) frcpa.s0 f8,p0 = f8,f8
+ nop.m 999
+(p8) frcpa.s0 f8,p0 = f8,f8
nop.i 0
-}
+}
{ .mfi
nop.m 999
- // also set Denormal flag if necessary
+ // also set Denormal flag if necessary
(p8) fma.s0 f9=f9,f1,f0
nop.i 999 ;;
}
{ .mfb
nop.m 999
-(p8) fma.d f8=f8,f1,f0
- nop.b 999 ;;
+(p8) fma.d.s0 f8=f8,f1,f0
+ nop.b 999 ;;
}
{ .mfb
nop.m 999
-(p9) frcpa.s0 f8,p7=f8,f9
- br.ret.sptk b0 ;;
+(p9) frcpa.s0 f8,p7=f8,f9
+ br.ret.sptk b0 ;;
}
-L(FMOD_Y_NAN_INF_ZERO):
+FMOD_Y_NAN_INF_ZERO:
// Y INF
{ .mfi
nop.m 999
-(p0) fclass.m.unc p7,p0 = f9, 0x23
+ fclass.m.unc p7,p0 = f9, 0x23
nop.i 999 ;;
}
{ .mfb
nop.m 999
-(p7) fma.d f8=f8,f1,f0
-(p7) br.ret.spnt b0 ;;
+(p7) fma.d.s0 f8=f8,f1,f0
+(p7) br.ret.spnt b0 ;;
}
// Y NAN?
{ .mfi
nop.m 999
-(p0) fclass.m.unc p9,p0 = f9, 0xc3
+ fclass.m.unc p9,p0 = f9, 0xc3
nop.i 999 ;;
}
{ .mfb
nop.m 999
-(p9) fma.d f8=f9,f1,f0
-(p9) br.ret.spnt b0 ;;
+(p9) fma.d.s0 f8=f9,f1,f0
+(p9) br.ret.spnt b0 ;;
}
-L(FMOD_Y_ZERO):
+FMOD_Y_ZERO:
// Y zero? Must be zero at this point
// because it is the only choice left.
// Return QNAN indefinite
@@ -452,60 +459,56 @@ L(FMOD_Y_ZERO):
{.mfi
nop.m 0
// set Invalid
- frcpa f12,p0=f0,f0
+ frcpa.s0 f12,p0=f0,f0
nop.i 0
}
// X NAN?
{ .mfi
nop.m 999
-(p0) fclass.m.unc p9,p10 = f8, 0xc3
+ fclass.m.unc p9,p10 = f8, 0xc3
nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p10) fclass.nm p9,p10 = f8, 0xff
+(p10) fclass.nm p9,p10 = f8, 0xff
nop.i 999 ;;
}
{.mfi
nop.m 999
- (p9) frcpa f11,p7=f8,f0
+ (p9) frcpa.s0 f11,p7=f8,f0
nop.i 0;;
}
{ .mfi
nop.m 999
-(p10) frcpa f11,p7 = f9,f9
-(p0) mov GR_Parameter_TAG = 121 ;;
+(p10) frcpa.s0 f11,p7 = f9,f9
+ mov GR_Parameter_TAG = 121 ;;
}
{ .mfi
nop.m 999
-(p0) fmerge.s f10 = f8, f8
+ fmerge.s f10 = f8, f8
nop.i 999
}
{ .mfb
nop.m 999
-(p0) fma.d f8=f11,f1,f0
-(p0) br.sptk __libm_error_region;;
+ fma.d.s0 f8=f11,f1,f0
+ br.sptk __libm_error_region;;
}
-.endp fmod
-ASM_SIZE_DIRECTIVE(fmod)
-ASM_SIZE_DIRECTIVE(__ieee754_fmod)
-
-.proc __libm_error_region
-__libm_error_region:
+GLOBAL_IEEE754_END(fmod)
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
-.fframe 64
+.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
@@ -513,18 +516,18 @@ __libm_error_region:
{ .mmi
stfd [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
- stfd [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y
- nop.b 0 // Parameter 3 address
+ stfd [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y
+ nop.b 0 // Parameter 3 address
}
{ .mib
stfd [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
- add GR_Parameter_Y = -16,GR_Parameter_Y
+ add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
@@ -539,13 +542,17 @@ __libm_error_region:
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
+ mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
-};;
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
+
+
+
+
diff --git a/sysdeps/ia64/fpu/e_fmodf.S b/sysdeps/ia64/fpu/e_fmodf.S
index 5b6390eeec..fe1ec0304d 100644
--- a/sysdeps/ia64/fpu/e_fmodf.S
+++ b/sysdeps/ia64/fpu/e_fmodf.S
@@ -1,10 +1,10 @@
.file "fmodf.s"
-// Copyright (c) 2000, 2001, Intel Corporation
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
-// Bob Norin, Shane Story, and Ping Tak Peter Tang of the Computational
-// Software Lab, Intel Corporation.
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,9 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// WARRANTY DISCLAIMER
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -37,38 +35,42 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//====================================================================
-// 2/02/00 Initial version
-// 3/02/00 New Algorithm
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 Initial version
+// 03/02/00 New Algorithm
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
-//11/28/00 Set FR_Y to f9
+// 11/28/00 Set FR_Y to f9
+// 03/11/02 Fixed flags for fmodf(qnan,zero)
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+// 04/28/03 Fix: fmodf(sNaN,0) no longer sets errno
//
// API
//====================================================================
-// float fmodf(float,float);
+// float fmodf(float,float);
//
// Overview of operation
//====================================================================
// fmod(a,b)=a-i*b,
-// where i is an integer such that, if b!=0,
+// where i is an integer such that, if b!=0,
// |i|<|a/b| and |a/b-i|<1
// Algorithm
//====================================================================
// a). if |a|<|b|, return a
-// b). get quotient and reciprocal overestimates accurate to
+// b). get quotient and reciprocal overestimates accurate to
// 33 bits (q2,y2)
// c). if the exponent difference (exponent(a)-exponent(b))
// is less than 32, truncate quotient to integer and
// finish in one iteration
// d). if exponent(a)-exponent(b)>=32 (q2>=2^32)
// round quotient estimate to single precision (k=RN(q2)),
-// calculate partial remainder (a'=a-k*b),
+// calculate partial remainder (a'=a-k*b),
// get quotient estimate (a'*y2), and repeat from c).
// Special cases
@@ -82,13 +84,9 @@
// General registers: r2,r29,r32 (ar.pfs), r33-r39
// Floating point registers: f6-f15
-#include "libm_support.h"
-
-.section .text
-
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
-GR_SAVE_GP = r35
+GR_SAVE_GP = r35
GR_SAVE_SP = r36
GR_Parameter_X = r37
@@ -101,18 +99,9 @@ FR_Y = f9
FR_RESULT = f8
+.section .text
+GLOBAL_IEEE754_ENTRY(fmodf)
-.proc fmodf#
-.align 32
-.global fmodf#
-.align 32
-
-fmodf:
-#ifdef _LIBC
-.global __ieee754_fmodf
-.type __ieee754_fmodf,@function
-__ieee754_fmodf:
-#endif
// inputs in f8, f9
// result in f8
@@ -134,13 +123,13 @@ __ieee754_fmodf:
// (1) y0
frcpa.s1 f10,p6=f6,f7
nop.i 0
-}
+}
// eliminate special cases
// Y +-NAN, +-inf, +-0? p7
{ .mfi
nop.m 999
-(p0) fclass.m.unc p7,p0 = f9, 0xe7
+ fclass.m.unc p7,p0 = f9, 0xe7
nop.i 999;;
}
@@ -151,14 +140,14 @@ __ieee754_fmodf:
{ .mfi
nop.m 999
-(p0) fclass.m.unc p9,p0 = f8, 0xe3
- nop.i 999
+ fclass.m.unc p9,p0 = f8, 0xe3
+ nop.i 999
}
// |x| < |y|? Return x p8
{ .mfi
nop.m 999
-(p0) fcmp.lt.unc.s1 p8,p0 = f6,f7
+ fcmp.lt.unc.s1 p8,p0 = f6,f7
nop.i 999 ;;
}
@@ -174,33 +163,33 @@ __ieee754_fmodf:
// (2) q0=a*y0
(p6) fma.s1 f13=f6,f10,f0
nop.i 0
-}
+}
{ .mfi
nop.m 0
// (3) e0 = 1 - b * y0
(p6) fnma.s1 f12=f7,f10,f1
nop.i 0;;
-}
+}
{.mfi
nop.m 0
// normalize x (if |x|<|y|)
(p8) fma.s.s0 f8=f8,f1,f0
nop.i 0
-}
+}
{.bbb
- (p9) br.cond.spnt L(FMOD_X_NAN_INF)
- (p7) br.cond.spnt L(FMOD_Y_NAN_INF_ZERO)
+ (p9) br.cond.spnt FMOD_X_NAN_INF
+ (p7) br.cond.spnt FMOD_Y_NAN_INF_ZERO
// if |x|<|y|, return
(p8) br.ret.spnt b0;;
}
- {.mfi
+ {.mfi
nop.m 0
// normalize x
fma.s0 f6=f6,f1,f0
nop.i 0
-}
+}
{.mfi
nop.m 0
// normalize y
@@ -215,45 +204,45 @@ __ieee754_fmodf:
// (4) q1=q0+e0*q0
(p6) fma.s1 f13=f12,f13,f13
nop.i 0
-}
+}
{ .mfi
nop.m 0
// (5) e1 = e0 * e0 + 2^-34
(p6) fma.s1 f14=f12,f12,f11
nop.i 0;;
-}
+}
{.mlx
nop.m 0
movl r2=0x33a00000;;
-}
+}
{ .mfi
nop.m 0
// (6) y1 = y0 + e0 * y0
(p6) fma.s1 f10=f12,f10,f10
nop.i 0;;
-}
+}
{.mfi
// set f12=1.25*2^{-24}
setf.s f12=r2
// (7) q2=q1+e1*q1
(p6) fma.s1 f13=f13,f14,f13
nop.i 0;;
-}
+}
{.mfi
nop.m 0
fmerge.s f9=f8,f9
nop.i 0
-}
+}
{ .mfi
nop.m 0
// (8) y2 = y1 + e1 * y1
(p6) fma.s1 f10=f14,f10,f10
// set p6=0, p10=0
cmp.ne.and p6,p10=r0,r0;;
-}
+}
.align 32
-L(loop24):
+loop24:
{.mfi
nop.m 0
// compare q2, 2^32
@@ -283,7 +272,7 @@ L(loop24):
// normalize truncated quotient
(p8) fcvt.xf f13=f11
nop.i 0;;
-}
+}
{ .mfi
nop.m 0
// calculate remainder (assuming f13=RZ(Q))
@@ -292,7 +281,7 @@ L(loop24):
}
{.mfi
nop.m 0
- // also if exponent>32, round quotient to single precision
+ // also if exponent>32, round quotient to single precision
// and subtract 1 ulp: q=q-q*(1.25*2^{-24})
(p7) fnma.s.s1 f11=f13,f12,f13
nop.i 0;;
@@ -335,7 +324,7 @@ L(loop24):
.pred.rel "mutex",p6,p10
{.mfb
nop.m 0
- // add b to estimated remainder (to cover the case when the quotient was overestimated)
+ // add b to estimated remainder (to cover the case when the quotient was overestimated)
// also set correct sign by using f9=|b|*sgn(a), f12=sgn(a)
(p6) fma.s.s0 f8=f11,f12,f9
nop.b 0
@@ -357,102 +346,118 @@ L(loop24):
nop.m 0
// if f14 was RZ(Q), set remainder to f14
(p9) mov f6=f14
- br.cond.sptk L(loop24);;
+ br.cond.sptk loop24;;
}
{ .mmb
- nop.m 0
- nop.m 0
- br.ret.sptk b0;;
+ nop.m 0
+ nop.m 0
+ br.ret.sptk b0;;
}
-L(FMOD_X_NAN_INF):
+FMOD_X_NAN_INF:
// Y zero ?
-{.mfi
+{.mfi
+ nop.m 0
+ fclass.m p10,p0=f8,0xc3 // Test x=nan
+ nop.i 0
+}
+{.mfi
nop.m 0
fma.s1 f10=f9,f1,f0
nop.i 0;;
}
+
{.mfi
+ nop.m 0
+ fma.s0 f8=f8,f1,f0
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+(p10) fclass.m p10,p0=f9,0x07 // Test x=nan, and y=zero
+ nop.i 0;;
+}
+{.mfb
nop.m 0
fcmp.eq.unc.s1 p11,p0=f10,f0
- nop.i 0;;
+(p10) br.ret.spnt b0;; // Exit with result=x if x=nan and y=zero
}
{.mib
nop.m 0
nop.i 0
// if Y zero
- (p11) br.cond.spnt L(FMOD_Y_ZERO);;
+ (p11) br.cond.spnt FMOD_Y_ZERO;;
}
// X infinity? Return QNAN indefinite
{ .mfi
nop.m 999
-(p0) fclass.m.unc p8,p9 = f8, 0x23
- nop.i 999;;
+ fclass.m.unc p8,p9 = f8, 0x23
+ nop.i 999;;
}
// Y NaN ?
{.mfi
- nop.m 999
+ nop.m 999
(p8) fclass.m p9,p8=f9,0xc3
- nop.i 0;;
+ nop.i 0;;
}
{.mfi
- nop.m 999
-(p8) frcpa.s0 f8,p0 = f8,f8
+ nop.m 999
+(p8) frcpa.s0 f8,p0 = f8,f8
nop.i 0
-}
+}
{ .mfi
nop.m 999
- // also set Denormal flag if necessary
+ // also set Denormal flag if necessary
(p8) fma.s0 f9=f9,f1,f0
nop.i 999 ;;
}
{ .mfb
nop.m 999
-(p8) fma.s f8=f8,f1,f0
- nop.b 999 ;;
+(p8) fma.s.s0 f8=f8,f1,f0
+ nop.b 999 ;;
}
{ .mfb
nop.m 999
-(p9) frcpa.s0 f8,p7=f8,f9
- br.ret.sptk b0 ;;
+(p9) frcpa.s0 f8,p7=f8,f9
+ br.ret.sptk b0 ;;
}
-L(FMOD_Y_NAN_INF_ZERO):
+FMOD_Y_NAN_INF_ZERO:
// Y INF
{ .mfi
nop.m 999
-(p0) fclass.m.unc p7,p0 = f9, 0x23
+ fclass.m.unc p7,p0 = f9, 0x23
nop.i 999 ;;
}
{ .mfb
nop.m 999
-(p7) fma.s f8=f8,f1,f0
-(p7) br.ret.spnt b0 ;;
+(p7) fma.s.s0 f8=f8,f1,f0
+(p7) br.ret.spnt b0 ;;
}
// Y NAN?
{ .mfi
nop.m 999
-(p0) fclass.m.unc p9,p0 = f9, 0xc3
+ fclass.m.unc p9,p0 = f9, 0xc3
nop.i 999 ;;
}
{ .mfb
nop.m 999
-(p9) fma.s f8=f9,f1,f0
-(p9) br.ret.spnt b0 ;;
+(p9) fma.s.s0 f8=f9,f1,f0
+(p9) br.ret.spnt b0 ;;
}
-L(FMOD_Y_ZERO):
+FMOD_Y_ZERO:
// Y zero? Must be zero at this point
// because it is the only choice left.
// Return QNAN indefinite
@@ -460,69 +465,65 @@ L(FMOD_Y_ZERO):
{.mfi
nop.m 0
// set Invalid
- frcpa f12,p0=f0,f0
+ frcpa.s0 f12,p0=f0,f0
nop.i 999
}
// X NAN?
{ .mfi
nop.m 999
-(p0) fclass.m.unc p9,p10 = f8, 0xc3
+ fclass.m.unc p9,p10 = f8, 0xc3
nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p10) fclass.nm p9,p10 = f8, 0xff
+(p10) fclass.nm p9,p10 = f8, 0xff
nop.i 999 ;;
}
{.mfi
nop.m 999
- (p9) frcpa f11,p7=f8,f0
+ (p9) frcpa.s0 f11,p7=f8,f0
nop.i 0;;
}
{ .mfi
nop.m 999
-(p10) frcpa f11,p7 = f0,f0
+(p10) frcpa.s0 f11,p7 = f0,f0
nop.i 999;;
}
{ .mfi
nop.m 999
-(p0) fmerge.s f10 = f8, f8
+ fmerge.s f10 = f8, f8
nop.i 999
}
{ .mfi
nop.m 999
-(p0) fma.s f8=f11,f1,f0
+ fma.s.s0 f8=f11,f1,f0
nop.i 999;;
}
-L(EXP_ERROR_RETURN):
+EXP_ERROR_RETURN:
{ .mib
nop.m 0
-(p0) mov GR_Parameter_TAG=122
-(p0) br.sptk __libm_error_region;;
+ mov GR_Parameter_TAG=122
+ br.sptk __libm_error_region;;
}
-.endp fmodf
-ASM_SIZE_DIRECTIVE(fmodf)
-ASM_SIZE_DIRECTIVE(__ieee754_fmodf)
-
-.proc __libm_error_region
-__libm_error_region:
+GLOBAL_IEEE754_END(fmodf)
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
-.fframe 64
+.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
@@ -530,18 +531,18 @@ __libm_error_region:
{ .mmi
stfs [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
- stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y
- nop.b 0 // Parameter 3 address
+ stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y
+ nop.b 0 // Parameter 3 address
}
{ .mib
stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
- add GR_Parameter_Y = -16,GR_Parameter_Y
+ add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support#;; // Call error handling function
}
{ .mmi
@@ -556,13 +557,14 @@ __libm_error_region:
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
+ mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
-};;
+};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
+
+
diff --git a/sysdeps/ia64/fpu/e_fmodl.S b/sysdeps/ia64/fpu/e_fmodl.S
index 85c9f6ef82..da08ae3f5c 100644
--- a/sysdeps/ia64/fpu/e_fmodl.S
+++ b/sysdeps/ia64/fpu/e_fmodl.S
@@ -1,11 +1,10 @@
.file "fmodl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
-// Bob Norin, Shane Story, and Ping Tak Peter Tang of the Computational
-// Software Lab, Intel Corporation.
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -21,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -36,38 +35,42 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//====================================================================
-// 2/02/00 Initial version
-// 3/02/00 New Algorithm
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 Initial version
+// 03/02/00 New Algorithm
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
-//11/28/00 Set FR_Y to f9
+// 11/28/00 Set FR_Y to f9
+// 03/11/02 Fixed flags for fmodl(qnan,zero)
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+// 04/28/03 Fix: fmodl(sNaN,0) no longer sets errno
//
// API
//====================================================================
-// long double fmodl(long double,long double);
+// long double fmodl(long double,long double);
//
// Overview of operation
//====================================================================
// fmod(a,b)=a-i*b,
-// where i is an integer such that, if b!=0,
+// where i is an integer such that, if b!=0,
// |i|<|a/b| and |a/b-i|<1
//
// Algorithm
//====================================================================
// a). if |a|<|b|, return a
-// b). get quotient and reciprocal overestimates accurate to
+// b). get quotient and reciprocal overestimates accurate to
// 33 bits (q2,y2)
// c). if the exponent difference (exponent(a)-exponent(b))
// is less than 32, truncate quotient to integer and
// finish in one iteration
// d). if exponent(a)-exponent(b)>=32 (q2>=2^32)
// round quotient estimate to single precision (k=RN(q2)),
-// calculate partial remainder (a'=a-k*b),
+// calculate partial remainder (a'=a-k*b),
// get quotient estimate (a'*y2), and repeat from c).
//
// Registers used
@@ -76,13 +79,9 @@
// General registers: r2,r29,r32 (ar.pfs), r33-r39
// Floating point registers: f6-f15
-#include "libm_support.h"
-
-.section .text
-
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
-GR_SAVE_GP = r35
+GR_SAVE_GP = r35
GR_SAVE_SP = r36
GR_Parameter_X = r37
@@ -95,18 +94,9 @@ FR_Y = f9
FR_RESULT = f8
+.section .text
+GLOBAL_IEEE754_ENTRY(fmodl)
-.proc fmodl#
-.align 32
-.global fmodl#
-.align 32
-
-fmodl:
-#ifdef _LIBC
-.global __ieee754_fmodl
-.type __ieee754_fmodl,@function
-__ieee754_fmodl:
-#endif
// inputs in f8, f9
// result in f8
@@ -128,7 +118,7 @@ __ieee754_fmodl:
// (1) y0
frcpa.s1 f10,p6=f6,f7
nop.i 0;;
-}
+}
// eliminate special cases
{.mmi
@@ -141,7 +131,7 @@ cmp.eq p7,p10=r29,r0;;
// Y +-NAN, +-inf, +-0? p7
{ .mfi
nop.m 999
-(p10) fclass.m p7,p10 = f9, 0xe7
+(p10) fclass.m p7,p10 = f9, 0xe7
nop.i 999;;
}
@@ -152,14 +142,14 @@ cmp.eq p7,p10=r29,r0;;
{ .mfi
nop.m 999
-(p0) fclass.m.unc p9,p11 = f8, 0xe3
- nop.i 999
+ fclass.m.unc p9,p11 = f8, 0xe3
+ nop.i 999
}
// |x| < |y|? Return x p8
{ .mfi
nop.m 999
-(p10) fcmp.lt.unc.s1 p8,p0 = f6,f7
+(p10) fcmp.lt.unc.s1 p8,p0 = f6,f7
nop.i 999 ;;
}
@@ -173,13 +163,13 @@ cmp.eq p7,p10=r29,r0;;
// (3) e0 = 1 - b * y0
(p6) fnma.s1 f12=f7,f10,f1
nop.i 0;;
-}
+}
// Y +-NAN, +-inf, +-0? p7
{ .mfi
nop.m 999
- // pseudo-NaN ?
-(p10) fclass.nm p7,p0 = f9, 0xff
+ // pseudo-NaN ?
+(p10) fclass.nm p7,p0 = f9, 0xff
nop.i 999
}
@@ -190,7 +180,7 @@ cmp.eq p7,p10=r29,r0;;
{ .mfi
nop.m 999
-(p11) fclass.nm p9,p0 = f8, 0xff
+(p11) fclass.nm p9,p0 = f8, 0xff
nop.i 999;;
}
@@ -209,18 +199,18 @@ cmp.eq p7,p10=r29,r0;;
nop.i 0
}
{.bbb
- (p9) br.cond.spnt L(FMOD_X_NAN_INF)
- (p7) br.cond.spnt L(FMOD_Y_NAN_INF_ZERO)
+ (p9) br.cond.spnt FMOD_X_NAN_INF
+ (p7) br.cond.spnt FMOD_Y_NAN_INF_ZERO
// if |x|<|y|, return
(p8) br.ret.spnt b0;;
}
- {.mfi
+ {.mfi
nop.m 0
// x denormal ? set D flag
fnma.s0 f32=f6,f1,f6
nop.i 0
-}
+}
{.mfi
nop.m 0
// y denormal ? set D flag
@@ -234,46 +224,46 @@ cmp.eq p7,p10=r29,r0;;
// (4) q1=q0+e0*q0
(p6) fma.s1 f13=f12,f13,f13
nop.i 0
-}
+}
{ .mfi
nop.m 0
// (5) e1 = e0 * e0 + 2^-34
(p6) fma.s1 f14=f12,f12,f11
nop.i 0;;
-}
+}
{.mlx
nop.m 0
movl r2=0x33a00000;;
-}
+}
{ .mfi
nop.m 0
// (6) y1 = y0 + e0 * y0
(p6) fma.s1 f10=f12,f10,f10
nop.i 0;;
-}
+}
{.mfi
// set f12=1.25*2^{-24}
setf.s f12=r2
// (7) q2=q1+e1*q1
(p6) fma.s1 f13=f13,f14,f13
nop.i 0;;
-}
+}
{.mfi
nop.m 0
fmerge.s f9=f8,f9
nop.i 0
-}
+}
{ .mfi
nop.m 0
// (8) y2 = y1 + e1 * y1
(p6) fma.s1 f10=f14,f10,f10
// set p6=0, p10=0
cmp.ne.and p6,p10=r0,r0;;
-}
+}
.align 32
-L(loop64):
+loop64:
{.mfi
nop.m 0
// compare q2, 2^32
@@ -305,7 +295,7 @@ L(loop64):
// normalize truncated quotient
(p8) fcvt.xf f13=f11
nop.i 0;;
-}
+}
{ .mfi
nop.m 0
// calculate remainder (assuming f13=RZ(Q))
@@ -314,7 +304,7 @@ L(loop64):
}
{.mfi
nop.m 0
- // also if exponent>32, round quotient to single precision
+ // also if exponent>32, round quotient to single precision
// and subtract 1 ulp: q=q-q*(1.25*2^{-24})
(p7) fnma.s.s1 f11=f13,f12,f13
nop.i 0;;
@@ -357,7 +347,7 @@ L(loop64):
.pred.rel "mutex",p6,p10
{.mfb
nop.m 0
- // add b to estimated remainder (to cover the case when the quotient was overestimated)
+ // add b to estimated remainder (to cover the case when the quotient was overestimated)
// also set correct sign by using f9=|b|*sgn(a), f12=sgn(a)
(p6) fma.s0 f8=f11,f12,f9
nop.b 0
@@ -378,43 +368,59 @@ L(loop64):
nop.m 0
// if f14 was RZ(Q), set remainder to f14
(p9) mov f6=f14
- br.cond.sptk L(loop64);;
+ br.cond.sptk loop64;;
}
-L(FMOD_X_NAN_INF):
+FMOD_X_NAN_INF:
// Y zero ?
-{.mfi
+{.mfi
+ nop.m 0
+ fclass.m p10,p0=f8,0xc3 // Test x=nan
+ nop.i 0
+}
+{.mfi
nop.m 0
fma.s1 f10=f9,f1,f0
nop.i 0;;
}
+
+{.mfi
+ nop.m 0
+ fma.s0 f8=f8,f1,f0
+ nop.i 0
+}
{.mfi
+ nop.m 0
+(p10) fclass.m p10,p0=f9,0x07 // Test x=nan, and y=zero
+ nop.i 0;;
+}
+{.mfb
nop.m 0
fcmp.eq.unc.s1 p11,p0=f10,f0
- nop.i 0;;
+(p10) br.ret.spnt b0;; // Exit with result=x if x=nan and y=zero
}
{.mib
nop.m 0
nop.i 0
// if Y zero
- (p11) br.cond.spnt L(FMOD_Y_ZERO);;
+ (p11) br.cond.spnt FMOD_Y_ZERO;;
}
// X infinity? Return QNAN indefinite
{ .mfi
- // set p7 t0 0
- cmp.ne p7,p0=r0,r0
-(p0) fclass.m.unc p8,p9 = f8, 0x23
- nop.i 999;;
+ // set p7 to 0
+ cmp.ne p7,p0=r0,r0
+ fclass.m.unc p8,p9 = f8, 0x23
+ nop.i 999;;
}
// Y NaN ?
{.mfi
nop.m 999
(p8) fclass.m p9,p8=f9,0xc3
- nop.i 0;;
+ nop.i 0;;
}
// Y not pseudo-zero ? (r29 holds significand)
{.mii
@@ -423,63 +429,63 @@ L(FMOD_X_NAN_INF):
nop.i 0;;
}
{.mfi
- nop.m 999
-(p8) frcpa.s0 f8,p0 = f8,f8
+ nop.m 999
+(p8) frcpa.s0 f8,p0 = f8,f8
nop.i 0
-}
+}
{ .mfi
nop.m 999
- // also set Denormal flag if necessary
+ // also set Denormal flag if necessary
(p7) fnma.s0 f9=f9,f1,f9
nop.i 999 ;;
}
{ .mfb
nop.m 999
-(p8) fma.s0 f8=f8,f1,f0
- nop.b 999 ;;
+(p8) fma.s0 f8=f8,f1,f0
+ nop.b 999 ;;
}
{ .mfb
nop.m 999
-(p9) frcpa.s0 f8,p7=f8,f9
- br.ret.sptk b0 ;;
+(p9) frcpa.s0 f8,p7=f8,f9
+ br.ret.sptk b0 ;;
}
-L(FMOD_Y_NAN_INF_ZERO):
+FMOD_Y_NAN_INF_ZERO:
// Y INF
{ .mfi
nop.m 999
-(p0) fclass.m.unc p7,p0 = f9, 0x23
+ fclass.m.unc p7,p0 = f9, 0x23
nop.i 999 ;;
}
{ .mfb
nop.m 999
-(p7) fma f8=f8,f1,f0
-(p7) br.ret.spnt b0 ;;
+(p7) fma.s0 f8=f8,f1,f0
+(p7) br.ret.spnt b0 ;;
}
// Y NAN?
{ .mfi
nop.m 999
-(p0) fclass.m.unc p9,p10 = f9, 0xc3
+ fclass.m.unc p9,p10 = f9, 0xc3
nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p10) fclass.nm p9,p0 = f9, 0xff
+(p10) fclass.nm p9,p0 = f9, 0xff
nop.i 999 ;;
}
{ .mfb
nop.m 999
-(p9) fma f8=f9,f1,f0
-(p9) br.ret.spnt b0 ;;
+(p9) fma.s0 f8=f9,f1,f0
+(p9) br.ret.spnt b0 ;;
}
-L(FMOD_Y_ZERO):
+FMOD_Y_ZERO:
// Y zero? Must be zero at this point
// because it is the only choice left.
// Return QNAN indefinite
@@ -487,62 +493,59 @@ L(FMOD_Y_ZERO):
{.mfi
nop.m 0
// set Invalid
- frcpa f12,p0=f0,f0
+ frcpa.s0 f12,p0=f0,f0
nop.i 0
}
// X NAN?
{ .mfi
nop.m 999
-(p0) fclass.m.unc p9,p10 = f8, 0xc3
+ fclass.m.unc p9,p10 = f8, 0xc3
nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p10) fclass.nm p9,p10 = f8, 0xff
+(p10) fclass.nm p9,p10 = f8, 0xff
nop.i 999 ;;
}
{.mfi
nop.m 999
- (p9) frcpa f11,p7=f8,f0
+ (p9) frcpa.s0 f11,p7=f8,f0
nop.i 0;;
}
{ .mfi
nop.m 999
-(p10) frcpa f11,p7 = f9,f9
-(p0) mov GR_Parameter_TAG = 120 ;;
+(p10) frcpa.s0 f11,p7 = f9,f9
+ mov GR_Parameter_TAG = 120 ;;
}
{ .mfi
nop.m 999
-(p0) fmerge.s f10 = f8, f8
+ fmerge.s f10 = f8, f8
nop.i 999
}
{ .mfb
nop.m 999
-(p0) fma f8=f11,f1,f0
-(p0) br.sptk __libm_error_region;;
+ fma.s0 f8=f11,f1,f0
+ br.sptk __libm_error_region;;
}
-.endp fmodl
-ASM_SIZE_DIRECTIVE(fmodl)
-ASM_SIZE_DIRECTIVE(__ieee754_fmodl)
+GLOBAL_IEEE754_END(fmodl)
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
-.fframe 64
+.fframe 64
add sp=-64,sp // Create new stack
nop.f 0
mov GR_SAVE_GP=gp // Save gp
@@ -550,18 +553,18 @@ __libm_error_region:
{ .mmi
stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
- stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y
- nop.b 0 // Parameter 3 address
+ stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y
+ nop.b 0 // Parameter 3 address
}
{ .mib
stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
- add GR_Parameter_Y = -16,GR_Parameter_Y
+ add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
@@ -576,15 +579,17 @@ __libm_error_region:
mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
+ mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
-};;
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
+
+
diff --git a/sysdeps/ia64/fpu/e_gamma_r.c b/sysdeps/ia64/fpu/e_gamma_r.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_gamma_r.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/e_gammaf_r.c b/sysdeps/ia64/fpu/e_gammaf_r.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_gammaf_r.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/e_gammal_r.c b/sysdeps/ia64/fpu/e_gammal_r.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_gammal_r.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/e_hypot.S b/sysdeps/ia64/fpu/e_hypot.S
index 113aac3461..885c819326 100644
--- a/sysdeps/ia64/fpu/e_hypot.S
+++ b/sysdeps/ia64/fpu/e_hypot.S
@@ -1,11 +1,10 @@
-.file "hypot.asm"
+.file "hypot.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
-// Bob Norin, Shane Story, and Ping Tak Peter Tang of the
-// Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -21,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -36,24 +35,27 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-// *********************************************************************
+//*********************************************************************
//
// History:
-// 2/02/00 hand-optimized
-// 4/04/00 Unwind support added
-// 6/20/00 new version
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 hand-optimized
+// 04/04/00 Unwind support added
+// 06/20/00 new version
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+// 04/17/03 Added missing mutex directive
//
-// *********************************************************************
+//*********************************************************************
// ___________
// Function: hypot(x,y) = |(x^2 + y^2) = for double precision values
// x and y
// Also provides cabs functionality.
//
-// *********************************************************************
+//*********************************************************************
//
// Resources Used:
//
@@ -68,7 +70,7 @@
//
// Predicate Registers: p6 - p10
//
-// *********************************************************************
+//*********************************************************************
//
// IEEE Special Conditions:
//
@@ -78,7 +80,7 @@
// hypot(QNaN and anything) = QNaN
// hypot(SNaN and anything ) = QNaN
//
-// *********************************************************************
+//*********************************************************************
//
// Implementation:
// x2 = x * x in double-extended
@@ -86,9 +88,7 @@
// temp = x2 + y2 in double-extended
// sqrt(temp) rounded to double
//
-// *********************************************************************
-
-#include "libm_support.h"
+//*********************************************************************
GR_SAVE_PFS = r33
GR_SAVE_B0 = r34
@@ -103,23 +103,11 @@ FR_Y = f33
FR_RESULT = f8
.section .text
-#ifndef _LIBC
-.proc cabs#
-.global cabs#
-cabs:
-.endp cabs
-#endif
-.proc hypot#
-.global hypot#
-.align 64
-hypot:
-#ifdef _LIBC
-.global __hypot
-__hypot:
-.global __ieee754_hypot
-__ieee754_hypot:
-#endif
+LOCAL_LIBM_ENTRY(cabs)
+LOCAL_LIBM_END(cabs)
+GLOBAL_IEEE754_ENTRY(hypot)
+
{.mfi
alloc r32= ar.pfs,0,4,4,0
// Compute x*x
@@ -221,6 +209,7 @@ __ieee754_hypot:
mov r2=0x107fb;;
}
+.pred.rel "mutex",p7,p8
{.mfb
nop.m 0
// if f8=Infinity or f9=Zero, return |f8|
@@ -394,11 +383,8 @@ __ieee754_hypot:
// No overflow
(p9) br.ret.sptk b0;;
}
-.endp hypot
-ASM_SIZE_DIRECTIVE(hypot)
-
-.proc __libm_error_region
-__libm_error_region:
+GLOBAL_IEEE754_END(hypot)
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
@@ -445,7 +431,8 @@ __libm_error_region:
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region#)
.type __libm_error_support#,@function
.global __libm_error_support#
+
+
diff --git a/sysdeps/ia64/fpu/e_hypotf.S b/sysdeps/ia64/fpu/e_hypotf.S
index 0a11ec5b41..633bb67e59 100644
--- a/sysdeps/ia64/fpu/e_hypotf.S
+++ b/sysdeps/ia64/fpu/e_hypotf.S
@@ -1,11 +1,10 @@
-.file "hypotf.asm"
+.file "hypotf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
-// Bob Norin, Shane Story, and Ping Tak Peter Tang of the
-// Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -21,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -36,24 +35,27 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-// *********************************************************************
+//*********************************************************************
//
// History:
-// 2/02/00 hand-optimized
-// 4/04/00 Unwind support added
-// 6/26/00 new version
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 hand-optimized
+// 04/04/00 Unwind support added
+// 06/26/00 new version
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+// 04/17/03 Added missing mutex directive
//
-// *********************************************************************
+//*********************************************************************
// ___________
// Function: hypotf(x,y) = |(x^2 + y^2) = for single precision values
// x and y
// Also provides cabsf functionality.
//
-// *********************************************************************
+//*********************************************************************
//
// Resources Used:
//
@@ -68,7 +70,7 @@
//
// Predicate Registers: p6 - p10
//
-// *********************************************************************
+//*********************************************************************
//
// IEEE Special Conditions:
//
@@ -78,7 +80,7 @@
// hypotf(QNaN and anything) = QNaN
// hypotf(SNaN and anything ) = QNaN
//
-// *********************************************************************
+//*********************************************************************
//
// Implementation:
// x2 = x * x in double-extended
@@ -86,9 +88,7 @@
// temp = x2 + y2 in double-extended
// sqrt(temp) rounded to single precision
//
-// *********************************************************************
-
-#include "libm_support.h"
+//*********************************************************************
GR_SAVE_PFS = r33
GR_SAVE_B0 = r34
@@ -103,23 +103,10 @@ FR_Y = f15
FR_RESULT = f8
.section .text
-#ifndef _LIBC
-.proc cabsf#
-.global cabsf#
-cabsf:
-.endp cabsf
-#endif
-.proc hypotf#
-.global hypotf#
-.align 64
-hypotf:
-#ifdef _LIBC
-.global __hypotf
-__hypotf:
-.global __ieee754_hypotf
-__ieee754_hypotf:
-#endif
+LOCAL_LIBM_ENTRY(cabsf)
+LOCAL_LIBM_END(cabsf)
+GLOBAL_IEEE754_ENTRY(hypotf)
{.mfi
alloc r32= ar.pfs,0,4,4,0
// Compute x*x
@@ -207,6 +194,7 @@ __ieee754_hypotf:
nop.i 0;;
}
+.pred.rel "mutex",p7,p8
{.mfb
nop.m 0
// if f8=Infinity or f9=Zero, return |f8|
@@ -348,15 +336,12 @@ __ieee754_hypotf:
// No overflow
(p9) br.ret.sptk b0;;
}
-.endp hypotf
-ASM_SIZE_DIRECTIVE(hypotf)
-
-.proc __libm_error_region
-__libm_error_region:
+GLOBAL_IEEE754_END(hypotf)
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mii
add GR_Parameter_Y=-32,sp // Parameter 2 value
-(p0) mov GR_Parameter_TAG = 47
+ mov GR_Parameter_TAG = 47
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
@@ -400,8 +385,9 @@ __libm_error_region:
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
+
.type __libm_error_support#,@function
.global __libm_error_support#
+
diff --git a/sysdeps/ia64/fpu/e_hypotl.S b/sysdeps/ia64/fpu/e_hypotl.S
index 986faf6fcc..0aa94b69b8 100644
--- a/sysdeps/ia64/fpu/e_hypotl.S
+++ b/sysdeps/ia64/fpu/e_hypotl.S
@@ -1,11 +1,10 @@
-.file "hypotl.asm"
+.file "hypotl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
-// Bob Norin, Shane Story, and Ping Tak Peter Tang of the
-// Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -21,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -36,24 +35,26 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-// *********************************************************************
+//*********************************************************************
//
// History:
-// 2/02/00 hand-optimized
-// 4/04/00 Unwind support added
-// 6/20/00 new version
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 hand-optimized
+// 04/04/00 Unwind support added
+// 06/20/00 new version
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
-// *********************************************************************
+//*********************************************************************
// ___________
// Function: hypotl(x,y) = |(x^2 + y^2) = for double extended values
// x and y
// Also provides cabsl functionality.
//
-// *********************************************************************
+//*********************************************************************
//
// Resources Used:
//
@@ -68,7 +69,7 @@
//
// Predicate Registers: p6 - p10
//
-// *********************************************************************
+//*********************************************************************
//
// IEEE Special Conditions:
//
@@ -78,7 +79,7 @@
// hypotl(QNaN and anything) = QNaN
// hypotl(SNaN and anything ) = QNaN
//
-// *********************************************************************
+//*********************************************************************
//
// Implementation:
// x2 = x * x in double-extended
@@ -86,9 +87,7 @@
// temp = x2 + y2 in double-extended
// sqrt(temp) rounded to double extended
//
-// *********************************************************************
-
-#include "libm_support.h"
+//*********************************************************************
GR_SAVE_PFS = r33
GR_SAVE_B0 = r34
@@ -103,23 +102,10 @@ FR_Y = f33
FR_RESULT = f8
.section .text
-#ifndef _LIBC
-.proc cabsl#
-.global cabsl#
-cabsl:
-.endp cabsl
-#endif
-.proc hypotl#
-.global hypotl#
-.align 64
-hypotl:
-#ifdef _LIBC
-.global __hypotl
-__hypotl:
-.global __ieee754_hypotl
-__ieee754_hypotl:
-#endif
+LOCAL_LIBM_ENTRY(cabsl)
+LOCAL_LIBM_END(cabsl)
+GLOBAL_IEEE754_ENTRY(hypotl)
{.mfi
alloc r32= ar.pfs,0,4,4,0
// Compute x*x
@@ -434,11 +420,8 @@ __ieee754_hypotl:
// No overflow
(p9) br.ret.sptk b0;;
}
-.endp hypotl
-ASM_SIZE_DIRECTIVE(hypotl)
-
-.proc __libm_error_region
-__libm_error_region:
+GLOBAL_IEEE754_END(hypotl)
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
@@ -485,7 +468,9 @@ __libm_error_region:
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
+
+
+
diff --git a/sysdeps/ia64/fpu/e_lgamma_r.c b/sysdeps/ia64/fpu/e_lgamma_r.c
new file mode 100644
index 0000000000..e892635eae
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_lgamma_r.c
@@ -0,0 +1,71 @@
+/* file: lgamma_r.c */
+
+// Copyright (c) 2002 Intel Corporation
+// All rights reserved.
+//
+// Contributed 2002 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//
+
+// History
+//==============================================================
+// 02/04/02: Initial version
+// 02/22/02: Removed lgammaf_r, gammaf_r
+/*
+// FUNCTIONS: double lgamma_r(double x, int* signgam)
+// double gamma_r(double x, int* signgam)
+// Natural logarithm of GAMMA function
+*/
+
+#include "libm_support.h"
+
+/* Worker defined outside this file; callers here pass sizeof(*signgam) as the third argument.  */
+extern double __libm_lgamma(double /*x*/, int* /*signgam*/, int /*signgamsz*/);
+
+/* Log-gamma entry point: forwards x and signgam to __libm_lgamma along with the size of *signgam.  */
+double __ieee754_lgamma_r(double x, int* signgam)
+{
+ return __libm_lgamma(x, signgam, sizeof(*signgam));
+}
+weak_alias(__ieee754_lgamma_r, lgamma_r)
+
+#ifndef _LIBC /* the gamma_r variant is compiled only when _LIBC is not defined */
+double __ieee754_gamma_r(double x, int* signgam) /* same forwarding as __ieee754_lgamma_r */
+{
+ return __libm_lgamma(x, signgam, sizeof(*signgam));
+}
+weak_alias(__ieee754_gamma_r, gamma_r)
+#endif /* !_LIBC */
diff --git a/sysdeps/ia64/fpu/e_lgammaf_r.c b/sysdeps/ia64/fpu/e_lgammaf_r.c
new file mode 100644
index 0000000000..e5d4d2e0d8
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_lgammaf_r.c
@@ -0,0 +1,71 @@
+/* file: lgammaf_r.c */
+
+// Copyright (c) 2002 Intel Corporation
+// All rights reserved.
+//
+// Contributed 2002 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//
+
+// History
+//==============================================================
+// 02/04/02: Initial version
+// 02/22/02: Removed lgamma_r, gamma_r
+/*
+// FUNCTIONS: float lgammaf_r(float x, int* signgam)
+// float gammaf_r(float x, int* signgam)
+// Natural logarithm of GAMMA function
+*/
+
+#include "libm_support.h"
+
+/* Worker defined outside this file; callers here pass sizeof(*signgam) as the third argument.  */
+extern float __libm_lgammaf(float /*x*/, int* /*signgam*/, int /*signgamsz*/);
+
+/* Log-gamma entry point: forwards x and signgam to __libm_lgammaf along with the size of *signgam.  */
+float __ieee754_lgammaf_r(float x, int* signgam)
+{
+ return __libm_lgammaf(x, signgam, sizeof(*signgam));
+}
+weak_alias(__ieee754_lgammaf_r, lgammaf_r)
+
+#ifndef _LIBC /* the gammaf_r variant is compiled only when _LIBC is not defined */
+float __ieee754_gammaf_r(float x, int* signgam) /* same forwarding as __ieee754_lgammaf_r */
+{
+ return __libm_lgammaf(x, signgam, sizeof(*signgam));
+}
+weak_alias(__ieee754_gammaf_r, gammaf_r)
+#endif /* !_LIBC */
diff --git a/sysdeps/ia64/fpu/e_lgammal_r.c b/sysdeps/ia64/fpu/e_lgammal_r.c
new file mode 100644
index 0000000000..a2b36d6394
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_lgammal_r.c
@@ -0,0 +1,70 @@
+/* file: lgammal_r.c */
+
+// Copyright (c) 2002 Intel Corporation
+// All rights reserved.
+//
+// Contributed 2002 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//
+
+// History
+//==============================================================
+// 08/15/02: Initial version
+/*
+// FUNCTIONS: long double lgammal_r(long double x, int* signgam)
+// long double gammal_r(long double x, int* signgam)
+// Natural logarithm of GAMMA function
+*/
+
+#include "libm_support.h"
+
+/* Returns long double, matching the wrappers below that return its value unchanged; 3rd arg is sizeof(*signgam).  */
+extern long double __libm_lgammal(long double /*x*/, int* /*signgam*/, int /*signgamsz*/);
+
+/* Log-gamma entry point: forwards x and signgam to __libm_lgammal along with the size of *signgam.  */
+long double __ieee754_lgammal_r(long double x, int* signgam)
+{
+ return __libm_lgammal(x, signgam, sizeof(*signgam));
+}
+weak_alias(__ieee754_lgammal_r, lgammal_r)
+
+#ifndef _LIBC /* the gammal_r variant is compiled only when _LIBC is not defined */
+long double __ieee754_gammal_r(long double x, int* signgam) /* same forwarding as __ieee754_lgammal_r */
+{
+ return __libm_lgammal(x, signgam, sizeof(*signgam));
+}
+weak_alias(__ieee754_gammal_r, gammal_r)
+#endif /* !_LIBC */
diff --git a/sysdeps/ia64/fpu/e_log.S b/sysdeps/ia64/fpu/e_log.S
index 9ad1e5fe56..f80f153679 100644
--- a/sysdeps/ia64/fpu/e_log.S
+++ b/sysdeps/ia64/fpu/e_log.S
@@ -1,10 +1,10 @@
.file "log.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2002, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,1085 +20,1707 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 4/04/00 Unwind support added
-// 6/16/00 Updated table to be rounded correctly
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 Initial version
+// 04/04/00 Unwind support added
+// 06/16/00 Updated table to be rounded correctly
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
-// 8/17/00 Improved speed of main path by 5 cycles
+// 08/17/00 Improved speed of main path by 5 cycles
// Shortened path for x=1.0
-// 1/09/01 Improved speed, fixed flags for neg denormals
-//
+// 01/09/01 Improved speed, fixed flags for neg denormals
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 05/23/02 Modified algorithm. Now only one polynomial is used
+// for |x-1| >= 1/256 and for |x-1| < 1/256
+// 12/11/02 Improved performance for Itanium 2
//
// API
//==============================================================
// double log(double)
// double log10(double)
//
+//
// Overview of operation
//==============================================================
// Background
+// ----------
//
-// Consider x = 2^N 1.f1 f2 f3 f4...f63
-// Log(x) = log(frcpa(x) x/frcpa(x))
-// = log(1/frcpa(x)) + log(frcpa(x) x)
-// = -log(frcpa(x)) + log(frcpa(x) x)
+// This algorithm is based on the fact that
+// log(a b) = log(a) + log(b).
+// In our case we have x = 2^N f, where 1 <= f < 2.
+// So
+// log(x) = log(2^N f) = log(2^N) + log(f) = N*log(2) + log(f)
//
-// frcpa(x) = 2^-N frcpa((1.f1 f2 ... f63)
+// To calculate log(f) we do the following:
+// log(f) = log(f * frcpa(f) / frcpa(f)) =
+// = log(f * frcpa(f)) + log(1/frcpa(f))
//
-// -log(frcpa(x)) = -log(C)
-// = -log(2^-N) - log(frcpa(1.f1 f2 ... f63))
+// According to the definition of IA-64's frcpa instruction, frcpa(f) is a
+// floating-point value that approximates 1/f using a lookup on the
+// top 8 bits of the input number's significand, with relative
+// error < 2^(-8.886). So we have the following
//
-// -log(frcpa(x)) = -log(C)
-// = +Nlog2 - log(frcpa(1.f1 f2 ... f63))
+// |(1/f - frcpa(f)) / (1/f)| = |1 - f*frcpa(f)| < 1/256
//
-// -log(frcpa(x)) = -log(C)
-// = +Nlog2 + log(frcpa(1.f1 f2 ... f63))
+// and
//
-// Log(x) = log(1/frcpa(x)) + log(frcpa(x) x)
-
-// Log(x) = +Nlog2 + log(1./frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x)
-// Log(x) = +Nlog2 - log(/frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x)
-// Log(x) = +Nlog2 + T + log(frcpa(x) x)
+// log(f) = log(f * frcpa(f)) + log(1/frcpa(f)) =
+// = log(1 + r) + T
+//
+// The first value can be computed by polynomial P(r) approximating
+// log(1 + r) on |r| < 1/256 and the second is precomputed tabular
+// value defined by the top 8 bits of f.
//
-// Log(x) = +Nlog2 + T + log(C x)
+// Finally we have that log(x) ~ (N*log(2) + T) + P(r)
//
-// Cx = 1 + r
+// Note that if input argument is close to 1.0 (in our case it means
+// that |1 - x| < 1/256) we can use just polynomial approximation
+// because x = 2^0 * f = f = 1 + r and
+// log(x) = log(1 + r) ~ P(r)
//
-// Log(x) = +Nlog2 + T + log(1+r)
-// Log(x) = +Nlog2 + T + Series( r - r^2/2 + r^3/3 - r^4/4 ....)
//
-// 1.f1 f2 ... f8 has 256 entries.
-// They are 1 + k/2^8, k = 0 ... 255
-// These 256 values are the table entries.
+// To compute log10(x) we use the simple identity
+//
+// log10(x) = log(x)/log(10)
+//
+// so we have that
+//
+// log10(x) = (N*log(2) + T + log(1+r)) / log(10) =
+// = N*(log(2)/log(10)) + (T/log(10)) + log(1 + r)/log(10)
+//
//
// Implementation
-//===============
-// CASE 1: |x-1| >= 2^-6
-// C = frcpa(x)
-// r = C * x - 1
+// --------------
+// It can be seen that the formulas for log and log10 differ from one another
+// only by coefficients and tabular values. Namely, both log and log10 are
+// calculated as (N*L1 + T) + L2*Series(r) where in the case of log
+// L1 = log(2)
+// T = log(1/frcpa(x))
+// L2 = 1.0
+// and in case of log10
+// L1 = log(2)/log(10)
+// T = log(1/frcpa(x))/log(10)
+// L2 = 1.0/log(10)
//
-// Form rseries = r + P1*r^2 + P2*r^3 + P3*r^4 + P4*r^5 + P5*r^6
+// So the code is common, with two different entry points that set pointers
+// to the base address of the corresponding data sets containing the values
+// of L2,T and prepare the integer representation of L1 needed for the following
+// setf instruction.
//
-// x = f * 2*n where f is 1.f_1f_2f_3....f_63
-// Nfloat = float(n) where n is the true unbiased exponent
-// pre-index = f_1f_2....f_8
-// index = pre_index * 16
-// get the dxt table entry at index + offset = T
+// Note that both log and log10 use a common approximation polynomial,
+// which means we need only one set of approximation coefficients.
//
-// result = (T + Nfloat * log(2)) + rseries
//
-// The T table is calculated as follows
-// Form x_k = 1 + k/2^8 where k goes from 0... 255
-// y_k = frcpa(x_k)
-// log(1/y_k) in quad and round to double-extended
-
-// CASE 2: |x-1| < 2^-6
-// w = x - 1
+// 1. |x-1| >= 1/256
+// InvX = frcpa(x)
+// r = InvX*x - 1
+// P(r) = r*((r*A3 - A2) + r^4*((A4 + r*A5) + r^2*(A6 + r*A7))),
+// all coefficients are calculated in quad and rounded to double
+// precision. A7,A6,A5,A4 are stored in memory whereas A3 and A2
+// are created with setf.
+//
+// N = float(n) where n is true unbiased exponent of x
+//
+// T is tabular value of log(1/frcpa(x)) calculated in quad precision
+// and represented by two floating-point numbers 64-bit Thi and 32-bit Tlo.
+// To load Thi,Tlo we get bits from 55 to 62 of register format significand
+// as index and calculate two addresses
+// ad_Thi = Thi_table_base_addr + 8 * index
+// ad_Tlo = Tlo_table_base_addr + 4 * index
+//
+// L2 (1.0 or 1.0/log(10) depending on function) is calculated in quad
+// precision and rounded to double extended; it's loaded from memory.
+//
+// L1 (log(2) or log10(2) depending on function) is calculated in quad
+// precision and represented by two floating-point 64-bit numbers L1hi,L1lo
+// stored in memory.
//
-// Form wseries = w + Q1*w^2 + Q2*w^3 + ... + Q7*w^8 + Q8*w^9
+// And final result = ((L1hi*N + Thi) + (N*L1lo + Tlo)) + L2*P(r)
+//
+//
+// 2. |x-1| < 1/256
+// r = x - 1
+// P(r) = r*((r*A3 - A2) + r^4*((A4 + r*A5) + r^2*(A6 + r*A7))),
+// A7,A6,A5,A4,A3,A2 are the same as in case |x-1| >= 1/256
+//
+// And final results
+// log(x) = P(r)
+// log10(x) = L2*P(r)
+//
+// 3. How we determine whether the input argument satisfies |x-1| < 1/256.
+//
+// To do this we analyze the biased exponent and the integer representation
+// of the input argument
+//
+// a) First we test whether the biased exponent equals 0xFFFE or 0xFFFF
+// (i.e. whether 0.5 <= x < 2). This comparison can be performed using
+// the unsigned version of the cmp instruction in such a way
+// biased_exponent_of_x - 0xFFFE < 2
+//
+//
+// b) Second (in case when result of a) is true) we need to compare x
+// with 1-1/256 and 1+1/256 or in double precision memory representation
+// with 0x3FEFE00000000000 and 0x3FF0100000000000 correspondingly.
+// This comparison can be made like in a), using unsigned
+// version of cmp i.e. ix - 0x3FEFE00000000000 < 0x0000300000000000.
+// 0x0000300000000000 is difference between 0x3FF0100000000000 and
+// 0x3FEFE00000000000
+//
+// Note: NaT, any NaNs, +/-INF, +/-0, negatives and unnormalized numbers are
+// filtered and processed on special branches.
//
-// result = wseries
-// Special values
+//
+// Special values
//==============================================================
-
-
+//
// log(+0) = -inf
// log(-0) = -inf
-
-// log(+qnan) = +qnan
-// log(-qnan) = -qnan
-// log(+snan) = +qnan
-// log(-snan) = -qnan
-
+//
+// log(+qnan) = +qnan
+// log(-qnan) = -qnan
+// log(+snan) = +qnan
+// log(-snan) = -qnan
+//
// log(-n) = QNAN Indefinite
-// log(-inf) = QNAN Indefinite
-
+// log(-inf) = QNAN Indefinite
+//
// log(+inf) = +inf
-
+//
+//
// Registers used
//==============================================================
-// Floating Point registers used:
+// Floating Point registers used:
// f8, input
-// f9 -> f15, f32 -> f68
-
-// General registers used:
-// r32 -> r51
-
+// f7 -> f15, f32 -> f42
+//
+// General registers used:
+// r8 -> r11
+// r14 -> r23
+//
// Predicate registers used:
// p6 -> p15
-// p8 log base e
-// p6 log base e special
-// p9 used in the frcpa
-// p13 log base e large W
-// p14 log base e small w
-
-// p7 log base 10
-// p10 log base 10 large W
-// p11 log base 10 small w
-// p12 log base 10 special
-
-#include "libm_support.h"
-
// Assembly macros
//==============================================================
-
-log_int_Nfloat = f9
-log_Nfloat = f10
-
-log_P5 = f11
-log_P4 = f12
-log_P3 = f13
-log_P2 = f14
-log_half = f15
-
-log_log2 = f32
-log_T = f33
-
-log_rp_p4 = f34
-log_rp_p32 = f35
-log_rp_p2 = f36
-log_w6 = f37
-log_rp_p10 = f38
-log_rcube = f39
-log_rsq = f40
-
-log_T_plus_Nlog2 = f41
-log_w3 = f42
-
-log_r = f43
-log_C = f44
-
-log_w = f45
-log_Q8 = f46
-log_Q7 = f47
-log_Q4 = f48
-log_Q3 = f49
-log_Q6 = f50
-log_Q5 = f51
-log_Q2 = f52
-log_Q1 = f53
-log_P1 = f53
-
-log_rp_q7 = f54
-log_rp_q65 = f55
-log_Qlo = f56
-
-log_rp_q3 = f57
-log_rp_q21 = f58
-log_Qhi = f59
-
-log_wsq = f60
-log_w4 = f61
-log_Q = f62
-
-log_inv_ln10 = f63
-log_log10_hi = f64
-log_log10_lo = f65
-log_rp_q10 = f66
-log_NORM_f8 = f67
-log_r2P_r = f68
-
-// ===================================
-
-log_GR_exp_17_ones = r33
-log_GR_exp_16_ones = r34
-log_GR_exp_f8 = r35
-log_GR_signexp_f8 = r36
-log_GR_true_exp_f8 = r37
-log_GR_significand_f8 = r38
-log_GR_half_exp = r39
-log_GR_index = r39
-log_AD_1 = r40
-log_GR_signexp_w = r41
-log_GR_fff9 = r42
-log_AD_2 = r43
-log_GR_exp_w = r44
-
-GR_SAVE_B0 = r45
-GR_SAVE_GP = r46
-GR_SAVE_PFS = r47
-
-GR_Parameter_X = r48
-GR_Parameter_Y = r49
-GR_Parameter_RESULT = r50
-log_GR_tag = r51
-
-
-// Data tables
+GR_TAG = r8
+GR_ad_1 = r8
+GR_ad_2 = r9
+GR_Exp = r10
+GR_N = r11
+
+GR_x = r14
+GR_dx = r15
+GR_NearOne = r15
+GR_xorg = r16
+GR_mask = r16
+GR_05 = r17
+GR_A3 = r18
+GR_Sig = r19
+GR_Ind = r19
+GR_Nm1 = r20
+GR_bias = r21
+GR_ad_3 = r22
+GR_rexp = r23
+
+
+GR_SAVE_B0 = r33
+GR_SAVE_PFS = r34
+GR_SAVE_GP = r35
+GR_SAVE_SP = r36
+
+GR_Parameter_X = r37
+GR_Parameter_Y = r38
+GR_Parameter_RESULT = r39
+GR_Parameter_TAG = r40
+
+
+
+FR_NormX = f7
+FR_RcpX = f9
+FR_tmp = f9
+FR_r = f10
+FR_r2 = f11
+FR_r4 = f12
+FR_N = f13
+FR_Ln2hi = f14
+FR_Ln2lo = f15
+
+FR_A7 = f32
+FR_A6 = f33
+FR_A5 = f34
+FR_A4 = f35
+FR_A3 = f36
+FR_A2 = f37
+
+FR_Thi = f38
+FR_NxLn2hipThi = f38
+FR_NxLn2pT = f38
+FR_Tlo = f39
+FR_NxLn2lopTlo = f39
+
+FR_InvLn10 = f40
+FR_A32 = f41
+FR_A321 = f42
+
+
+FR_Y = f1
+FR_X = f10
+FR_RESULT = f8
+
+
+// Data
//==============================================================
-
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
-
+RODATA
.align 16
-log_table_1:
-ASM_TYPE_DIRECTIVE(log_table_1,@object)
-data8 0xBFC5555DA7212371 // P5
-data8 0x3FC999A19EEF5826 // P4
-data8 0x3FBC756AC654273B // Q8
-data8 0xBFC001A42489AB4D // Q7
-data8 0x3FC99999999A169B // Q4
-data8 0xBFD00000000019AC // Q3
-ASM_SIZE_DIRECTIVE(log_table_1)
-log_table_2:
-ASM_TYPE_DIRECTIVE(log_table_2,@object)
-data8 0xBFCFFFFFFFFEF009 // P3
-data8 0x3FD555555554ECB2 // P2
-data8 0x3FC2492479AA0DF8 // Q6
-data8 0xBFC5555544986F52 // Q5
-data8 0x3FD5555555555555 // Q2
-data8 0xBFE0000000000000 // Q1, P1 = -0.5
-
-
-data8 0xde5bd8a937287195, 0x00003ffd // double-extended 1/ln(10)
-data8 0xb17217f7d1cf79ac, 0x00003ffe // log2
-// b17217f7d1cf79ab c9e3b39803f2f6a
-
-
-data8 0x80200aaeac44ef38 , 0x00003ff6 // log(1/frcpa(1+ 0/2^-8))
-
-data8 0xc09090a2c35aa070 , 0x00003ff7 // log(1/frcpa(1+ 1/2^-8))
-data8 0xa0c94fcb41977c75 , 0x00003ff8 // log(1/frcpa(1+ 2/2^-8))
-data8 0xe18b9c263af83301 , 0x00003ff8 // log(1/frcpa(1+ 3/2^-8))
-data8 0x8d35c8d6399c30ea , 0x00003ff9 // log(1/frcpa(1+ 4/2^-8))
-data8 0xadd4d2ecd601cbb8 , 0x00003ff9 // log(1/frcpa(1+ 5/2^-8))
-
-data8 0xce95403a192f9f01 , 0x00003ff9 // log(1/frcpa(1+ 6/2^-8))
-data8 0xeb59392cbcc01096 , 0x00003ff9 // log(1/frcpa(1+ 7/2^-8))
-data8 0x862c7d0cefd54c5d , 0x00003ffa // log(1/frcpa(1+ 8/2^-8))
-data8 0x94aa63c65e70d499 , 0x00003ffa // log(1/frcpa(1+ 9/2^-8))
-data8 0xa54a696d4b62b382 , 0x00003ffa // log(1/frcpa(1+ 10/2^-8))
-
-data8 0xb3e4a796a5dac208 , 0x00003ffa // log(1/frcpa(1+ 11/2^-8))
-data8 0xc28c45b1878340a9 , 0x00003ffa // log(1/frcpa(1+ 12/2^-8))
-data8 0xd35c55f39d7a6235 , 0x00003ffa // log(1/frcpa(1+ 13/2^-8))
-data8 0xe220f037b954f1f5 , 0x00003ffa // log(1/frcpa(1+ 14/2^-8))
-data8 0xf0f3389b036834f3 , 0x00003ffa // log(1/frcpa(1+ 15/2^-8))
-
-data8 0xffd3488d5c980465 , 0x00003ffa // log(1/frcpa(1+ 16/2^-8))
-data8 0x87609ce2ed300490 , 0x00003ffb // log(1/frcpa(1+ 17/2^-8))
-data8 0x8ede9321e8c85927 , 0x00003ffb // log(1/frcpa(1+ 18/2^-8))
-data8 0x96639427f2f8e2f4 , 0x00003ffb // log(1/frcpa(1+ 19/2^-8))
-data8 0x9defad3e8f73217b , 0x00003ffb // log(1/frcpa(1+ 20/2^-8))
-
-data8 0xa582ebd50097029c , 0x00003ffb // log(1/frcpa(1+ 21/2^-8))
-data8 0xac06dbe75ab80fee , 0x00003ffb // log(1/frcpa(1+ 22/2^-8))
-data8 0xb3a78449b2d3ccca , 0x00003ffb // log(1/frcpa(1+ 23/2^-8))
-data8 0xbb4f79635ab46bb2 , 0x00003ffb // log(1/frcpa(1+ 24/2^-8))
-data8 0xc2fec93a83523f3f , 0x00003ffb // log(1/frcpa(1+ 25/2^-8))
-
-data8 0xc99af2eaca4c4571 , 0x00003ffb // log(1/frcpa(1+ 26/2^-8))
-data8 0xd1581106472fa653 , 0x00003ffb // log(1/frcpa(1+ 27/2^-8))
-data8 0xd8002560d4355f2e , 0x00003ffb // log(1/frcpa(1+ 28/2^-8))
-data8 0xdfcb43b4fe508632 , 0x00003ffb // log(1/frcpa(1+ 29/2^-8))
-data8 0xe67f6dff709d4119 , 0x00003ffb // log(1/frcpa(1+ 30/2^-8))
-
-data8 0xed393b1c22351280 , 0x00003ffb // log(1/frcpa(1+ 31/2^-8))
-data8 0xf5192bff087bcc35 , 0x00003ffb // log(1/frcpa(1+ 32/2^-8))
-data8 0xfbdf4ff6dfef2fa3 , 0x00003ffb // log(1/frcpa(1+ 33/2^-8))
-data8 0x81559a97f92f9cc7 , 0x00003ffc // log(1/frcpa(1+ 34/2^-8))
-data8 0x84be72bce90266e8 , 0x00003ffc // log(1/frcpa(1+ 35/2^-8))
-
-data8 0x88bc74113f23def2 , 0x00003ffc // log(1/frcpa(1+ 36/2^-8))
-data8 0x8c2ba3edf6799d11 , 0x00003ffc // log(1/frcpa(1+ 37/2^-8))
-data8 0x8f9dc92f92ea08b1 , 0x00003ffc // log(1/frcpa(1+ 38/2^-8))
-data8 0x9312e8f36efab5a7 , 0x00003ffc // log(1/frcpa(1+ 39/2^-8))
-data8 0x968b08643409ceb6 , 0x00003ffc // log(1/frcpa(1+ 40/2^-8))
-
-data8 0x9a062cba08a1708c , 0x00003ffc // log(1/frcpa(1+ 41/2^-8))
-data8 0x9d845b3abf95485c , 0x00003ffc // log(1/frcpa(1+ 42/2^-8))
-data8 0xa06fd841bc001bb4 , 0x00003ffc // log(1/frcpa(1+ 43/2^-8))
-data8 0xa3f3a74652fbe0db , 0x00003ffc // log(1/frcpa(1+ 44/2^-8))
-data8 0xa77a8fb2336f20f5 , 0x00003ffc // log(1/frcpa(1+ 45/2^-8))
-
-data8 0xab0497015d28b0a0 , 0x00003ffc // log(1/frcpa(1+ 46/2^-8))
-data8 0xae91c2be6ba6a615 , 0x00003ffc // log(1/frcpa(1+ 47/2^-8))
-data8 0xb189d1b99aebb20b , 0x00003ffc // log(1/frcpa(1+ 48/2^-8))
-data8 0xb51cced5de9c1b2c , 0x00003ffc // log(1/frcpa(1+ 49/2^-8))
-data8 0xb819bee9e720d42f , 0x00003ffc // log(1/frcpa(1+ 50/2^-8))
-
-data8 0xbbb2a0947b093a5d , 0x00003ffc // log(1/frcpa(1+ 51/2^-8))
-data8 0xbf4ec1505811684a , 0x00003ffc // log(1/frcpa(1+ 52/2^-8))
-data8 0xc2535bacfa8975ff , 0x00003ffc // log(1/frcpa(1+ 53/2^-8))
-data8 0xc55a3eafad187eb8 , 0x00003ffc // log(1/frcpa(1+ 54/2^-8))
-data8 0xc8ff2484b2c0da74 , 0x00003ffc // log(1/frcpa(1+ 55/2^-8))
-
-data8 0xcc0b1a008d53ab76 , 0x00003ffc // log(1/frcpa(1+ 56/2^-8))
-data8 0xcfb6203844b3209b , 0x00003ffc // log(1/frcpa(1+ 57/2^-8))
-data8 0xd2c73949a47a19f5 , 0x00003ffc // log(1/frcpa(1+ 58/2^-8))
-data8 0xd5daae18b49d6695 , 0x00003ffc // log(1/frcpa(1+ 59/2^-8))
-data8 0xd8f08248cf7e8019 , 0x00003ffc // log(1/frcpa(1+ 60/2^-8))
-
-data8 0xdca7749f1b3e540e , 0x00003ffc // log(1/frcpa(1+ 61/2^-8))
-data8 0xdfc28e033aaaf7c7 , 0x00003ffc // log(1/frcpa(1+ 62/2^-8))
-data8 0xe2e012a5f91d2f55 , 0x00003ffc // log(1/frcpa(1+ 63/2^-8))
-data8 0xe600064ed9e292a8 , 0x00003ffc // log(1/frcpa(1+ 64/2^-8))
-data8 0xe9226cce42b39f60 , 0x00003ffc // log(1/frcpa(1+ 65/2^-8))
-
-data8 0xec4749fd97a28360 , 0x00003ffc // log(1/frcpa(1+ 66/2^-8))
-data8 0xef6ea1bf57780495 , 0x00003ffc // log(1/frcpa(1+ 67/2^-8))
-data8 0xf29877ff38809091 , 0x00003ffc // log(1/frcpa(1+ 68/2^-8))
-data8 0xf5c4d0b245cb89be , 0x00003ffc // log(1/frcpa(1+ 69/2^-8))
-data8 0xf8f3afd6fcdef3aa , 0x00003ffc // log(1/frcpa(1+ 70/2^-8))
-
-data8 0xfc2519756be1abc7 , 0x00003ffc // log(1/frcpa(1+ 71/2^-8))
-data8 0xff59119f503e6832 , 0x00003ffc // log(1/frcpa(1+ 72/2^-8))
-data8 0x8147ce381ae0e146 , 0x00003ffd // log(1/frcpa(1+ 73/2^-8))
-data8 0x82e45f06cb1ad0f2 , 0x00003ffd // log(1/frcpa(1+ 74/2^-8))
-data8 0x842f5c7c573cbaa2 , 0x00003ffd // log(1/frcpa(1+ 75/2^-8))
-
-data8 0x85ce471968c8893a , 0x00003ffd // log(1/frcpa(1+ 76/2^-8))
-data8 0x876e8305bc04066d , 0x00003ffd // log(1/frcpa(1+ 77/2^-8))
-data8 0x891012678031fbb3 , 0x00003ffd // log(1/frcpa(1+ 78/2^-8))
-data8 0x8a5f1493d766a05f , 0x00003ffd // log(1/frcpa(1+ 79/2^-8))
-data8 0x8c030c778c56fa00 , 0x00003ffd // log(1/frcpa(1+ 80/2^-8))
-
-data8 0x8da85df17e31d9ae , 0x00003ffd // log(1/frcpa(1+ 81/2^-8))
-data8 0x8efa663e7921687e , 0x00003ffd // log(1/frcpa(1+ 82/2^-8))
-data8 0x90a22b6875c6a1f8 , 0x00003ffd // log(1/frcpa(1+ 83/2^-8))
-data8 0x91f62cc8f5d24837 , 0x00003ffd // log(1/frcpa(1+ 84/2^-8))
-data8 0x93a06cfc3857d980 , 0x00003ffd // log(1/frcpa(1+ 85/2^-8))
-
-data8 0x94f66d5e6fd01ced , 0x00003ffd // log(1/frcpa(1+ 86/2^-8))
-data8 0x96a330156e6772f2 , 0x00003ffd // log(1/frcpa(1+ 87/2^-8))
-data8 0x97fb3582754ea25b , 0x00003ffd // log(1/frcpa(1+ 88/2^-8))
-data8 0x99aa8259aad1bbf2 , 0x00003ffd // log(1/frcpa(1+ 89/2^-8))
-data8 0x9b0492f6227ae4a8 , 0x00003ffd // log(1/frcpa(1+ 90/2^-8))
-
-data8 0x9c5f8e199bf3a7a5 , 0x00003ffd // log(1/frcpa(1+ 91/2^-8))
-data8 0x9e1293b9998c1daa , 0x00003ffd // log(1/frcpa(1+ 92/2^-8))
-data8 0x9f6fa31e0b41f308 , 0x00003ffd // log(1/frcpa(1+ 93/2^-8))
-data8 0xa0cda11eaf46390e , 0x00003ffd // log(1/frcpa(1+ 94/2^-8))
-data8 0xa22c8f029cfa45aa , 0x00003ffd // log(1/frcpa(1+ 95/2^-8))
-
-data8 0xa3e48badb7856b34 , 0x00003ffd // log(1/frcpa(1+ 96/2^-8))
-data8 0xa5459a0aa95849f9 , 0x00003ffd // log(1/frcpa(1+ 97/2^-8))
-data8 0xa6a79c84480cfebd , 0x00003ffd // log(1/frcpa(1+ 98/2^-8))
-data8 0xa80a946d0fcb3eb2 , 0x00003ffd // log(1/frcpa(1+ 99/2^-8))
-data8 0xa96e831a3ea7b314 , 0x00003ffd // log(1/frcpa(1+100/2^-8))
-
-data8 0xaad369e3dc544e3b , 0x00003ffd // log(1/frcpa(1+101/2^-8))
-data8 0xac92e9588952c815 , 0x00003ffd // log(1/frcpa(1+102/2^-8))
-data8 0xadfa035aa1ed8fdc , 0x00003ffd // log(1/frcpa(1+103/2^-8))
-data8 0xaf6219eae1ad6e34 , 0x00003ffd // log(1/frcpa(1+104/2^-8))
-data8 0xb0cb2e6d8160f753 , 0x00003ffd // log(1/frcpa(1+105/2^-8))
-
-data8 0xb2354249ad950f72 , 0x00003ffd // log(1/frcpa(1+106/2^-8))
-data8 0xb3a056e98ef4a3b4 , 0x00003ffd // log(1/frcpa(1+107/2^-8))
-data8 0xb50c6dba52c6292a , 0x00003ffd // log(1/frcpa(1+108/2^-8))
-data8 0xb679882c33876165 , 0x00003ffd // log(1/frcpa(1+109/2^-8))
-data8 0xb78c07429785cedc , 0x00003ffd // log(1/frcpa(1+110/2^-8))
-
-data8 0xb8faeb8dc4a77d24 , 0x00003ffd // log(1/frcpa(1+111/2^-8))
-data8 0xba6ad77eb36ae0d6 , 0x00003ffd // log(1/frcpa(1+112/2^-8))
-data8 0xbbdbcc915e9bee50 , 0x00003ffd // log(1/frcpa(1+113/2^-8))
-data8 0xbd4dcc44f8cf12ef , 0x00003ffd // log(1/frcpa(1+114/2^-8))
-data8 0xbec0d81bf5b531fa , 0x00003ffd // log(1/frcpa(1+115/2^-8))
-
-data8 0xc034f19c139186f4 , 0x00003ffd // log(1/frcpa(1+116/2^-8))
-data8 0xc14cb69f7c5e55ab , 0x00003ffd // log(1/frcpa(1+117/2^-8))
-data8 0xc2c2abbb6e5fd56f , 0x00003ffd // log(1/frcpa(1+118/2^-8))
-data8 0xc439b2c193e6771e , 0x00003ffd // log(1/frcpa(1+119/2^-8))
-data8 0xc553acb9d5c67733 , 0x00003ffd // log(1/frcpa(1+120/2^-8))
-
-data8 0xc6cc96e441272441 , 0x00003ffd // log(1/frcpa(1+121/2^-8))
-data8 0xc8469753eca88c30 , 0x00003ffd // log(1/frcpa(1+122/2^-8))
-data8 0xc962cf3ce072b05c , 0x00003ffd // log(1/frcpa(1+123/2^-8))
-data8 0xcadeba8771f694aa , 0x00003ffd // log(1/frcpa(1+124/2^-8))
-data8 0xcc5bc08d1f72da94 , 0x00003ffd // log(1/frcpa(1+125/2^-8))
-
-data8 0xcd7a3f99ea035c29 , 0x00003ffd // log(1/frcpa(1+126/2^-8))
-data8 0xcef93860c8a53c35 , 0x00003ffd // log(1/frcpa(1+127/2^-8))
-data8 0xd0192f68a7ed23df , 0x00003ffd // log(1/frcpa(1+128/2^-8))
-data8 0xd19a201127d3c645 , 0x00003ffd // log(1/frcpa(1+129/2^-8))
-data8 0xd2bb92f4061c172c , 0x00003ffd // log(1/frcpa(1+130/2^-8))
-
-data8 0xd43e80b2ee8cc8fc , 0x00003ffd // log(1/frcpa(1+131/2^-8))
-data8 0xd56173601fc4ade4 , 0x00003ffd // log(1/frcpa(1+132/2^-8))
-data8 0xd6e6637efb54086f , 0x00003ffd // log(1/frcpa(1+133/2^-8))
-data8 0xd80ad9f58f3c8193 , 0x00003ffd // log(1/frcpa(1+134/2^-8))
-data8 0xd991d1d31aca41f8 , 0x00003ffd // log(1/frcpa(1+135/2^-8))
-
-data8 0xdab7d02231484a93 , 0x00003ffd // log(1/frcpa(1+136/2^-8))
-data8 0xdc40d532cde49a54 , 0x00003ffd // log(1/frcpa(1+137/2^-8))
-data8 0xdd685f79ed8b265e , 0x00003ffd // log(1/frcpa(1+138/2^-8))
-data8 0xde9094bbc0e17b1d , 0x00003ffd // log(1/frcpa(1+139/2^-8))
-data8 0xe01c91b78440c425 , 0x00003ffd // log(1/frcpa(1+140/2^-8))
-
-data8 0xe14658f26997e729 , 0x00003ffd // log(1/frcpa(1+141/2^-8))
-data8 0xe270cdc2391e0d23 , 0x00003ffd // log(1/frcpa(1+142/2^-8))
-data8 0xe3ffce3a2aa64922 , 0x00003ffd // log(1/frcpa(1+143/2^-8))
-data8 0xe52bdb274ed82887 , 0x00003ffd // log(1/frcpa(1+144/2^-8))
-data8 0xe6589852e75d7df6 , 0x00003ffd // log(1/frcpa(1+145/2^-8))
-
-data8 0xe786068c79937a7d , 0x00003ffd // log(1/frcpa(1+146/2^-8))
-data8 0xe91903adad100911 , 0x00003ffd // log(1/frcpa(1+147/2^-8))
-data8 0xea481236f7d35bb0 , 0x00003ffd // log(1/frcpa(1+148/2^-8))
-data8 0xeb77d48c692e6b14 , 0x00003ffd // log(1/frcpa(1+149/2^-8))
-data8 0xeca84b83d7297b87 , 0x00003ffd // log(1/frcpa(1+150/2^-8))
-
-data8 0xedd977f4962aa158 , 0x00003ffd // log(1/frcpa(1+151/2^-8))
-data8 0xef7179a22f257754 , 0x00003ffd // log(1/frcpa(1+152/2^-8))
-data8 0xf0a450d139366ca7 , 0x00003ffd // log(1/frcpa(1+153/2^-8))
-data8 0xf1d7e0524ff9ffdb , 0x00003ffd // log(1/frcpa(1+154/2^-8))
-data8 0xf30c29036a8b6cae , 0x00003ffd // log(1/frcpa(1+155/2^-8))
-
-data8 0xf4412bc411ea8d92 , 0x00003ffd // log(1/frcpa(1+156/2^-8))
-data8 0xf576e97564c8619d , 0x00003ffd // log(1/frcpa(1+157/2^-8))
-data8 0xf6ad62fa1b5f172f , 0x00003ffd // log(1/frcpa(1+158/2^-8))
-data8 0xf7e499368b55c542 , 0x00003ffd // log(1/frcpa(1+159/2^-8))
-data8 0xf91c8d10abaffe22 , 0x00003ffd // log(1/frcpa(1+160/2^-8))
-
-data8 0xfa553f7018c966f3 , 0x00003ffd // log(1/frcpa(1+161/2^-8))
-data8 0xfb8eb13e185d802c , 0x00003ffd // log(1/frcpa(1+162/2^-8))
-data8 0xfcc8e3659d9bcbed , 0x00003ffd // log(1/frcpa(1+163/2^-8))
-data8 0xfe03d6d34d487fd2 , 0x00003ffd // log(1/frcpa(1+164/2^-8))
-data8 0xff3f8c7581e9f0ae , 0x00003ffd // log(1/frcpa(1+165/2^-8))
-
-data8 0x803e029e280173ae , 0x00003ffe // log(1/frcpa(1+166/2^-8))
-data8 0x80dca10cc52d0757 , 0x00003ffe // log(1/frcpa(1+167/2^-8))
-data8 0x817ba200632755a1 , 0x00003ffe // log(1/frcpa(1+168/2^-8))
-data8 0x821b05f3b01d6774 , 0x00003ffe // log(1/frcpa(1+169/2^-8))
-data8 0x82bacd623ff19d06 , 0x00003ffe // log(1/frcpa(1+170/2^-8))
-
-data8 0x835af8c88e7a8f47 , 0x00003ffe // log(1/frcpa(1+171/2^-8))
-data8 0x83c5f8299e2b4091 , 0x00003ffe // log(1/frcpa(1+172/2^-8))
-data8 0x8466cb43f3d87300 , 0x00003ffe // log(1/frcpa(1+173/2^-8))
-data8 0x850803a67c80ca4b , 0x00003ffe // log(1/frcpa(1+174/2^-8))
-data8 0x85a9a1d11a23b461 , 0x00003ffe // log(1/frcpa(1+175/2^-8))
-
-data8 0x864ba644a18e6e05 , 0x00003ffe // log(1/frcpa(1+176/2^-8))
-data8 0x86ee1182dcc432f7 , 0x00003ffe // log(1/frcpa(1+177/2^-8))
-data8 0x875a925d7e48c316 , 0x00003ffe // log(1/frcpa(1+178/2^-8))
-data8 0x87fdaa109d23aef7 , 0x00003ffe // log(1/frcpa(1+179/2^-8))
-data8 0x88a129ed4becfaf2 , 0x00003ffe // log(1/frcpa(1+180/2^-8))
-
-data8 0x89451278ecd7f9cf , 0x00003ffe // log(1/frcpa(1+181/2^-8))
-data8 0x89b29295f8432617 , 0x00003ffe // log(1/frcpa(1+182/2^-8))
-data8 0x8a572ac5a5496882 , 0x00003ffe // log(1/frcpa(1+183/2^-8))
-data8 0x8afc2d0ce3b2dadf , 0x00003ffe // log(1/frcpa(1+184/2^-8))
-data8 0x8b6a69c608cfd3af , 0x00003ffe // log(1/frcpa(1+185/2^-8))
-
-data8 0x8c101e106e899a83 , 0x00003ffe // log(1/frcpa(1+186/2^-8))
-data8 0x8cb63de258f9d626 , 0x00003ffe // log(1/frcpa(1+187/2^-8))
-data8 0x8d2539c5bd19e2b1 , 0x00003ffe // log(1/frcpa(1+188/2^-8))
-data8 0x8dcc0e064b29e6f1 , 0x00003ffe // log(1/frcpa(1+189/2^-8))
-data8 0x8e734f45d88357ae , 0x00003ffe // log(1/frcpa(1+190/2^-8))
-
-data8 0x8ee30cef034a20db , 0x00003ffe // log(1/frcpa(1+191/2^-8))
-data8 0x8f8b0515686d1d06 , 0x00003ffe // log(1/frcpa(1+192/2^-8))
-data8 0x90336bba039bf32f , 0x00003ffe // log(1/frcpa(1+193/2^-8))
-data8 0x90a3edd23d1c9d58 , 0x00003ffe // log(1/frcpa(1+194/2^-8))
-data8 0x914d0de2f5d61b32 , 0x00003ffe // log(1/frcpa(1+195/2^-8))
-
-data8 0x91be0c20d28173b5 , 0x00003ffe // log(1/frcpa(1+196/2^-8))
-data8 0x9267e737c06cd34a , 0x00003ffe // log(1/frcpa(1+197/2^-8))
-data8 0x92d962ae6abb1237 , 0x00003ffe // log(1/frcpa(1+198/2^-8))
-data8 0x9383fa6afbe2074c , 0x00003ffe // log(1/frcpa(1+199/2^-8))
-data8 0x942f0421651c1c4e , 0x00003ffe // log(1/frcpa(1+200/2^-8))
-
-data8 0x94a14a3845bb985e , 0x00003ffe // log(1/frcpa(1+201/2^-8))
-data8 0x954d133857f861e7 , 0x00003ffe // log(1/frcpa(1+202/2^-8))
-data8 0x95bfd96468e604c4 , 0x00003ffe // log(1/frcpa(1+203/2^-8))
-data8 0x9632d31cafafa858 , 0x00003ffe // log(1/frcpa(1+204/2^-8))
-data8 0x96dfaabd86fa1647 , 0x00003ffe // log(1/frcpa(1+205/2^-8))
-
-data8 0x9753261fcbb2a594 , 0x00003ffe // log(1/frcpa(1+206/2^-8))
-data8 0x9800c11b426b996d , 0x00003ffe // log(1/frcpa(1+207/2^-8))
-data8 0x9874bf4d45ae663c , 0x00003ffe // log(1/frcpa(1+208/2^-8))
-data8 0x99231f5ee9a74f79 , 0x00003ffe // log(1/frcpa(1+209/2^-8))
-data8 0x9997a18a56bcad28 , 0x00003ffe // log(1/frcpa(1+210/2^-8))
-
-data8 0x9a46c873a3267e79 , 0x00003ffe // log(1/frcpa(1+211/2^-8))
-data8 0x9abbcfc621eb6cb6 , 0x00003ffe // log(1/frcpa(1+212/2^-8))
-data8 0x9b310cb0d354c990 , 0x00003ffe // log(1/frcpa(1+213/2^-8))
-data8 0x9be14cf9e1b3515c , 0x00003ffe // log(1/frcpa(1+214/2^-8))
-data8 0x9c5710b8cbb73a43 , 0x00003ffe // log(1/frcpa(1+215/2^-8))
-
-data8 0x9ccd0abd301f399c , 0x00003ffe // log(1/frcpa(1+216/2^-8))
-data8 0x9d7e67f3bdce8888 , 0x00003ffe // log(1/frcpa(1+217/2^-8))
-data8 0x9df4ea81a99daa01 , 0x00003ffe // log(1/frcpa(1+218/2^-8))
-data8 0x9e6ba405a54514ba , 0x00003ffe // log(1/frcpa(1+219/2^-8))
-data8 0x9f1e21c8c7bb62b3 , 0x00003ffe // log(1/frcpa(1+220/2^-8))
-
-data8 0x9f956593f6b6355c , 0x00003ffe // log(1/frcpa(1+221/2^-8))
-data8 0xa00ce1092e5498c3 , 0x00003ffe // log(1/frcpa(1+222/2^-8))
-data8 0xa0c08309c4b912c1 , 0x00003ffe // log(1/frcpa(1+223/2^-8))
-data8 0xa1388a8c6faa2afa , 0x00003ffe // log(1/frcpa(1+224/2^-8))
-data8 0xa1b0ca7095b5f985 , 0x00003ffe // log(1/frcpa(1+225/2^-8))
-
-data8 0xa22942eb47534a00 , 0x00003ffe // log(1/frcpa(1+226/2^-8))
-data8 0xa2de62326449d0a3 , 0x00003ffe // log(1/frcpa(1+227/2^-8))
-data8 0xa357690f88bfe345 , 0x00003ffe // log(1/frcpa(1+228/2^-8))
-data8 0xa3d0a93f45169a4b , 0x00003ffe // log(1/frcpa(1+229/2^-8))
-data8 0xa44a22f7ffe65f30 , 0x00003ffe // log(1/frcpa(1+230/2^-8))
-
-data8 0xa500c5e5b4c1aa36 , 0x00003ffe // log(1/frcpa(1+231/2^-8))
-data8 0xa57ad064eb2ebbc2 , 0x00003ffe // log(1/frcpa(1+232/2^-8))
-data8 0xa5f5152dedf4384e , 0x00003ffe // log(1/frcpa(1+233/2^-8))
-data8 0xa66f9478856233ec , 0x00003ffe // log(1/frcpa(1+234/2^-8))
-data8 0xa6ea4e7cca02c32e , 0x00003ffe // log(1/frcpa(1+235/2^-8))
-
-data8 0xa765437325341ccf , 0x00003ffe // log(1/frcpa(1+236/2^-8))
-data8 0xa81e21e6c75b4020 , 0x00003ffe // log(1/frcpa(1+237/2^-8))
-data8 0xa899ab333fe2b9ca , 0x00003ffe // log(1/frcpa(1+238/2^-8))
-data8 0xa9157039c51ebe71 , 0x00003ffe // log(1/frcpa(1+239/2^-8))
-data8 0xa991713433c2b999 , 0x00003ffe // log(1/frcpa(1+240/2^-8))
-
-data8 0xaa0dae5cbcc048b3 , 0x00003ffe // log(1/frcpa(1+241/2^-8))
-data8 0xaa8a27ede5eb13ad , 0x00003ffe // log(1/frcpa(1+242/2^-8))
-data8 0xab06de228a9e3499 , 0x00003ffe // log(1/frcpa(1+243/2^-8))
-data8 0xab83d135dc633301 , 0x00003ffe // log(1/frcpa(1+244/2^-8))
-data8 0xac3fb076adc7fe7a , 0x00003ffe // log(1/frcpa(1+245/2^-8))
-
-data8 0xacbd3cbbe47988f1 , 0x00003ffe // log(1/frcpa(1+246/2^-8))
-data8 0xad3b06b1a5dc57c3 , 0x00003ffe // log(1/frcpa(1+247/2^-8))
-data8 0xadb90e94af887717 , 0x00003ffe // log(1/frcpa(1+248/2^-8))
-data8 0xae3754a218f7c816 , 0x00003ffe // log(1/frcpa(1+249/2^-8))
-data8 0xaeb5d9175437afa2 , 0x00003ffe // log(1/frcpa(1+250/2^-8))
-
-data8 0xaf349c322e9c7cee , 0x00003ffe // log(1/frcpa(1+251/2^-8))
-data8 0xafb39e30d1768d1c , 0x00003ffe // log(1/frcpa(1+252/2^-8))
-data8 0xb032df51c2c93116 , 0x00003ffe // log(1/frcpa(1+253/2^-8))
-data8 0xb0b25fd3e6035ad9 , 0x00003ffe // log(1/frcpa(1+254/2^-8))
-data8 0xb1321ff67cba178c , 0x00003ffe // log(1/frcpa(1+255/2^-8))
-ASM_SIZE_DIRECTIVE(log_table_2)
-
-
-.align 32
-.global log#
-.global log10#
+LOCAL_OBJECT_START(log_data)
+// coefficients of polynomial approximation
+data8 0x3FC2494104381A8E // A7
+data8 0xBFC5556D556BBB69 // A6
+//
+// two parts of ln(2)
+data8 0x3FE62E42FEF00000,0x3DD473DE6AF278ED
+//
+data8 0x8000000000000000,0x3FFF // 1.0
+//
+data8 0x3FC999999988B5E9 // A5
+data8 0xBFCFFFFFFFF6FFF5 // A4
+//
+// hi parts of ln(1/frcpa(1+i/256)), i=0...255
+data8 0x3F60040155D5889D // 0
+data8 0x3F78121214586B54 // 1
+data8 0x3F841929F96832EF // 2
+data8 0x3F8C317384C75F06 // 3
+data8 0x3F91A6B91AC73386 // 4
+data8 0x3F95BA9A5D9AC039 // 5
+data8 0x3F99D2A8074325F3 // 6
+data8 0x3F9D6B2725979802 // 7
+data8 0x3FA0C58FA19DFAA9 // 8
+data8 0x3FA2954C78CBCE1A // 9
+data8 0x3FA4A94D2DA96C56 // 10
+data8 0x3FA67C94F2D4BB58 // 11
+data8 0x3FA85188B630F068 // 12
+data8 0x3FAA6B8ABE73AF4C // 13
+data8 0x3FAC441E06F72A9E // 14
+data8 0x3FAE1E6713606D06 // 15
+data8 0x3FAFFA6911AB9300 // 16
+data8 0x3FB0EC139C5DA600 // 17
+data8 0x3FB1DBD2643D190B // 18
+data8 0x3FB2CC7284FE5F1C // 19
+data8 0x3FB3BDF5A7D1EE64 // 20
+data8 0x3FB4B05D7AA012E0 // 21
+data8 0x3FB580DB7CEB5701 // 22
+data8 0x3FB674F089365A79 // 23
+data8 0x3FB769EF2C6B568D // 24
+data8 0x3FB85FD927506A47 // 25
+data8 0x3FB9335E5D594988 // 26
+data8 0x3FBA2B0220C8E5F4 // 27
+data8 0x3FBB0004AC1A86AB // 28
+data8 0x3FBBF968769FCA10 // 29
+data8 0x3FBCCFEDBFEE13A8 // 30
+data8 0x3FBDA727638446A2 // 31
+data8 0x3FBEA3257FE10F79 // 32
+data8 0x3FBF7BE9FEDBFDE5 // 33
+data8 0x3FC02AB352FF25F3 // 34
+data8 0x3FC097CE579D204C // 35
+data8 0x3FC1178E8227E47B // 36
+data8 0x3FC185747DBECF33 // 37
+data8 0x3FC1F3B925F25D41 // 38
+data8 0x3FC2625D1E6DDF56 // 39
+data8 0x3FC2D1610C868139 // 40
+data8 0x3FC340C59741142E // 41
+data8 0x3FC3B08B6757F2A9 // 42
+data8 0x3FC40DFB08378003 // 43
+data8 0x3FC47E74E8CA5F7C // 44
+data8 0x3FC4EF51F6466DE4 // 45
+data8 0x3FC56092E02BA516 // 46
+data8 0x3FC5D23857CD74D4 // 47
+data8 0x3FC6313A37335D76 // 48
+data8 0x3FC6A399DABBD383 // 49
+data8 0x3FC70337DD3CE41A // 50
+data8 0x3FC77654128F6127 // 51
+data8 0x3FC7E9D82A0B022D // 52
+data8 0x3FC84A6B759F512E // 53
+data8 0x3FC8AB47D5F5A30F // 54
+data8 0x3FC91FE49096581B // 55
+data8 0x3FC981634011AA75 // 56
+data8 0x3FC9F6C407089664 // 57
+data8 0x3FCA58E729348F43 // 58
+data8 0x3FCABB55C31693AC // 59
+data8 0x3FCB1E104919EFD0 // 60
+data8 0x3FCB94EE93E367CA // 61
+data8 0x3FCBF851C067555E // 62
+data8 0x3FCC5C0254BF23A5 // 63
+data8 0x3FCCC000C9DB3C52 // 64
+data8 0x3FCD244D99C85673 // 65
+data8 0x3FCD88E93FB2F450 // 66
+data8 0x3FCDEDD437EAEF00 // 67
+data8 0x3FCE530EFFE71012 // 68
+data8 0x3FCEB89A1648B971 // 69
+data8 0x3FCF1E75FADF9BDE // 70
+data8 0x3FCF84A32EAD7C35 // 71
+data8 0x3FCFEB2233EA07CD // 72
+data8 0x3FD028F9C7035C1C // 73
+data8 0x3FD05C8BE0D9635A // 74
+data8 0x3FD085EB8F8AE797 // 75
+data8 0x3FD0B9C8E32D1911 // 76
+data8 0x3FD0EDD060B78080 // 77
+data8 0x3FD122024CF0063F // 78
+data8 0x3FD14BE2927AECD4 // 79
+data8 0x3FD180618EF18ADF // 80
+data8 0x3FD1B50BBE2FC63B // 81
+data8 0x3FD1DF4CC7CF242D // 82
+data8 0x3FD214456D0EB8D4 // 83
+data8 0x3FD23EC5991EBA49 // 84
+data8 0x3FD2740D9F870AFB // 85
+data8 0x3FD29ECDABCDFA03 // 86
+data8 0x3FD2D46602ADCCEE // 87
+data8 0x3FD2FF66B04EA9D4 // 88
+data8 0x3FD335504B355A37 // 89
+data8 0x3FD360925EC44F5C // 90
+data8 0x3FD38BF1C3337E74 // 91
+data8 0x3FD3C25277333183 // 92
+data8 0x3FD3EDF463C1683E // 93
+data8 0x3FD419B423D5E8C7 // 94
+data8 0x3FD44591E0539F48 // 95
+data8 0x3FD47C9175B6F0AD // 96
+data8 0x3FD4A8B341552B09 // 97
+data8 0x3FD4D4F39089019F // 98
+data8 0x3FD501528DA1F967 // 99
+data8 0x3FD52DD06347D4F6 // 100
+data8 0x3FD55A6D3C7B8A89 // 101
+data8 0x3FD5925D2B112A59 // 102
+data8 0x3FD5BF406B543DB1 // 103
+data8 0x3FD5EC433D5C35AD // 104
+data8 0x3FD61965CDB02C1E // 105
+data8 0x3FD646A84935B2A1 // 106
+data8 0x3FD6740ADD31DE94 // 107
+data8 0x3FD6A18DB74A58C5 // 108
+data8 0x3FD6CF31058670EC // 109
+data8 0x3FD6F180E852F0B9 // 110
+data8 0x3FD71F5D71B894EF // 111
+data8 0x3FD74D5AEFD66D5C // 112
+data8 0x3FD77B79922BD37D // 113
+data8 0x3FD7A9B9889F19E2 // 114
+data8 0x3FD7D81B037EB6A6 // 115
+data8 0x3FD8069E33827230 // 116
+data8 0x3FD82996D3EF8BCA // 117
+data8 0x3FD85855776DCBFA // 118
+data8 0x3FD8873658327CCE // 119
+data8 0x3FD8AA75973AB8CE // 120
+data8 0x3FD8D992DC8824E4 // 121
+data8 0x3FD908D2EA7D9511 // 122
+data8 0x3FD92C59E79C0E56 // 123
+data8 0x3FD95BD750EE3ED2 // 124
+data8 0x3FD98B7811A3EE5B // 125
+data8 0x3FD9AF47F33D406B // 126
+data8 0x3FD9DF270C1914A7 // 127
+data8 0x3FDA0325ED14FDA4 // 128
+data8 0x3FDA33440224FA78 // 129
+data8 0x3FDA57725E80C382 // 130
+data8 0x3FDA87D0165DD199 // 131
+data8 0x3FDAAC2E6C03F895 // 132
+data8 0x3FDADCCC6FDF6A81 // 133
+data8 0x3FDB015B3EB1E790 // 134
+data8 0x3FDB323A3A635948 // 135
+data8 0x3FDB56FA04462909 // 136
+data8 0x3FDB881AA659BC93 // 137
+data8 0x3FDBAD0BEF3DB164 // 138
+data8 0x3FDBD21297781C2F // 139
+data8 0x3FDC039236F08818 // 140
+data8 0x3FDC28CB1E4D32FC // 141
+data8 0x3FDC4E19B84723C1 // 142
+data8 0x3FDC7FF9C74554C9 // 143
+data8 0x3FDCA57B64E9DB05 // 144
+data8 0x3FDCCB130A5CEBAF // 145
+data8 0x3FDCF0C0D18F326F // 146
+data8 0x3FDD232075B5A201 // 147
+data8 0x3FDD490246DEFA6B // 148
+data8 0x3FDD6EFA918D25CD // 149
+data8 0x3FDD9509707AE52F // 150
+data8 0x3FDDBB2EFE92C554 // 151
+data8 0x3FDDEE2F3445E4AE // 152
+data8 0x3FDE148A1A2726CD // 153
+data8 0x3FDE3AFC0A49FF3F // 154
+data8 0x3FDE6185206D516D // 155
+data8 0x3FDE882578823D51 // 156
+data8 0x3FDEAEDD2EAC990C // 157
+data8 0x3FDED5AC5F436BE2 // 158
+data8 0x3FDEFC9326D16AB8 // 159
+data8 0x3FDF2391A21575FF // 160
+data8 0x3FDF4AA7EE03192C // 161
+data8 0x3FDF71D627C30BB0 // 162
+data8 0x3FDF991C6CB3B379 // 163
+data8 0x3FDFC07ADA69A90F // 164
+data8 0x3FDFE7F18EB03D3E // 165
+data8 0x3FE007C053C5002E // 166
+data8 0x3FE01B942198A5A0 // 167
+data8 0x3FE02F74400C64EA // 168
+data8 0x3FE04360BE7603AC // 169
+data8 0x3FE05759AC47FE33 // 170
+data8 0x3FE06B5F1911CF51 // 171
+data8 0x3FE078BF0533C568 // 172
+data8 0x3FE08CD9687E7B0E // 173
+data8 0x3FE0A10074CF9019 // 174
+data8 0x3FE0B5343A234476 // 175
+data8 0x3FE0C974C89431CD // 176
+data8 0x3FE0DDC2305B9886 // 177
+data8 0x3FE0EB524BAFC918 // 178
+data8 0x3FE0FFB54213A475 // 179
+data8 0x3FE114253DA97D9F // 180
+data8 0x3FE128A24F1D9AFF // 181
+data8 0x3FE1365252BF0864 // 182
+data8 0x3FE14AE558B4A92D // 183
+data8 0x3FE15F85A19C765B // 184
+data8 0x3FE16D4D38C119FA // 185
+data8 0x3FE18203C20DD133 // 186
+data8 0x3FE196C7BC4B1F3A // 187
+data8 0x3FE1A4A738B7A33C // 188
+data8 0x3FE1B981C0C9653C // 189
+data8 0x3FE1CE69E8BB106A // 190
+data8 0x3FE1DC619DE06944 // 191
+data8 0x3FE1F160A2AD0DA3 // 192
+data8 0x3FE2066D7740737E // 193
+data8 0x3FE2147DBA47A393 // 194
+data8 0x3FE229A1BC5EBAC3 // 195
+data8 0x3FE237C1841A502E // 196
+data8 0x3FE24CFCE6F80D9A // 197
+data8 0x3FE25B2C55CD5762 // 198
+data8 0x3FE2707F4D5F7C40 // 199
+data8 0x3FE285E0842CA383 // 200
+data8 0x3FE294294708B773 // 201
+data8 0x3FE2A9A2670AFF0C // 202
+data8 0x3FE2B7FB2C8D1CC0 // 203
+data8 0x3FE2C65A6395F5F5 // 204
+data8 0x3FE2DBF557B0DF42 // 205
+data8 0x3FE2EA64C3F97654 // 206
+data8 0x3FE3001823684D73 // 207
+data8 0x3FE30E97E9A8B5CC // 208
+data8 0x3FE32463EBDD34E9 // 209
+data8 0x3FE332F4314AD795 // 210
+data8 0x3FE348D90E7464CF // 211
+data8 0x3FE35779F8C43D6D // 212
+data8 0x3FE36621961A6A99 // 213
+data8 0x3FE37C299F3C366A // 214
+data8 0x3FE38AE2171976E7 // 215
+data8 0x3FE399A157A603E7 // 216
+data8 0x3FE3AFCCFE77B9D1 // 217
+data8 0x3FE3BE9D503533B5 // 218
+data8 0x3FE3CD7480B4A8A2 // 219
+data8 0x3FE3E3C43918F76C // 220
+data8 0x3FE3F2ACB27ED6C6 // 221
+data8 0x3FE4019C2125CA93 // 222
+data8 0x3FE4181061389722 // 223
+data8 0x3FE42711518DF545 // 224
+data8 0x3FE436194E12B6BF // 225
+data8 0x3FE445285D68EA69 // 226
+data8 0x3FE45BCC464C893A // 227
+data8 0x3FE46AED21F117FC // 228
+data8 0x3FE47A1527E8A2D3 // 229
+data8 0x3FE489445EFFFCCB // 230
+data8 0x3FE4A018BCB69835 // 231
+data8 0x3FE4AF5A0C9D65D7 // 232
+data8 0x3FE4BEA2A5BDBE87 // 233
+data8 0x3FE4CDF28F10AC46 // 234
+data8 0x3FE4DD49CF994058 // 235
+data8 0x3FE4ECA86E64A683 // 236
+data8 0x3FE503C43CD8EB68 // 237
+data8 0x3FE513356667FC57 // 238
+data8 0x3FE522AE0738A3D7 // 239
+data8 0x3FE5322E26867857 // 240
+data8 0x3FE541B5CB979809 // 241
+data8 0x3FE55144FDBCBD62 // 242
+data8 0x3FE560DBC45153C6 // 243
+data8 0x3FE5707A26BB8C66 // 244
+data8 0x3FE587F60ED5B8FF // 245
+data8 0x3FE597A7977C8F31 // 246
+data8 0x3FE5A760D634BB8A // 247
+data8 0x3FE5B721D295F10E // 248
+data8 0x3FE5C6EA94431EF9 // 249
+data8 0x3FE5D6BB22EA86F5 // 250
+data8 0x3FE5E6938645D38F // 251
+data8 0x3FE5F673C61A2ED1 // 252
+data8 0x3FE6065BEA385926 // 253
+data8 0x3FE6164BFA7CC06B // 254
+data8 0x3FE62643FECF9742 // 255
+//
+// lo parts of ln(1/frcpa(1+i/256)), i=0...255
+data4 0x20E70672 // 0
+data4 0x1F60A5D0 // 1
+data4 0x218EABA0 // 2
+data4 0x21403104 // 3
+data4 0x20E9B54E // 4
+data4 0x21EE1382 // 5
+data4 0x226014E3 // 6
+data4 0x2095E5C9 // 7
+data4 0x228BA9D4 // 8
+data4 0x22932B86 // 9
+data4 0x22608A57 // 10
+data4 0x220209F3 // 11
+data4 0x212882CC // 12
+data4 0x220D46E2 // 13
+data4 0x21FA4C28 // 14
+data4 0x229E5BD9 // 15
+data4 0x228C9838 // 16
+data4 0x2311F954 // 17
+data4 0x221365DF // 18
+data4 0x22BD0CB3 // 19
+data4 0x223D4BB7 // 20
+data4 0x22A71BBE // 21
+data4 0x237DB2FA // 22
+data4 0x23194C9D // 23
+data4 0x22EC639E // 24
+data4 0x2367E669 // 25
+data4 0x232E1D5F // 26
+data4 0x234A639B // 27
+data4 0x2365C0E0 // 28
+data4 0x234646C1 // 29
+data4 0x220CBF9C // 30
+data4 0x22A00FD4 // 31
+data4 0x2306A3F2 // 32
+data4 0x23745A9B // 33
+data4 0x2398D756 // 34
+data4 0x23DD0B6A // 35
+data4 0x23DE338B // 36
+data4 0x23A222DF // 37
+data4 0x223164F8 // 38
+data4 0x23B4E87B // 39
+data4 0x23D6CCB8 // 40
+data4 0x220C2099 // 41
+data4 0x21B86B67 // 42
+data4 0x236D14F1 // 43
+data4 0x225A923F // 44
+data4 0x22748723 // 45
+data4 0x22200D13 // 46
+data4 0x23C296EA // 47
+data4 0x2302AC38 // 48
+data4 0x234B1996 // 49
+data4 0x2385E298 // 50
+data4 0x23175BE5 // 51
+data4 0x2193F482 // 52
+data4 0x23BFEA90 // 53
+data4 0x23D70A0C // 54
+data4 0x231CF30A // 55
+data4 0x235D9E90 // 56
+data4 0x221AD0CB // 57
+data4 0x22FAA08B // 58
+data4 0x23D29A87 // 59
+data4 0x20C4B2FE // 60
+data4 0x2381B8B7 // 61
+data4 0x23F8D9FC // 62
+data4 0x23EAAE7B // 63
+data4 0x2329E8AA // 64
+data4 0x23EC0322 // 65
+data4 0x2357FDCB // 66
+data4 0x2392A9AD // 67
+data4 0x22113B02 // 68
+data4 0x22DEE901 // 69
+data4 0x236A6D14 // 70
+data4 0x2371D33E // 71
+data4 0x2146F005 // 72
+data4 0x23230B06 // 73
+data4 0x22F1C77D // 74
+data4 0x23A89FA3 // 75
+data4 0x231D1241 // 76
+data4 0x244DA96C // 77
+data4 0x23ECBB7D // 78
+data4 0x223E42B4 // 79
+data4 0x23801BC9 // 80
+data4 0x23573263 // 81
+data4 0x227C1158 // 82
+data4 0x237BD749 // 83
+data4 0x21DDBAE9 // 84
+data4 0x23401735 // 85
+data4 0x241D9DEE // 86
+data4 0x23BC88CB // 87
+data4 0x2396D5F1 // 88
+data4 0x23FC89CF // 89
+data4 0x2414F9A2 // 90
+data4 0x2474A0F5 // 91
+data4 0x24354B60 // 92
+data4 0x23C1EB40 // 93
+data4 0x2306DD92 // 94
+data4 0x24353B6B // 95
+data4 0x23CD1701 // 96
+data4 0x237C7A1C // 97
+data4 0x245793AA // 98
+data4 0x24563695 // 99
+data4 0x23C51467 // 100
+data4 0x24476B68 // 101
+data4 0x212585A9 // 102
+data4 0x247B8293 // 103
+data4 0x2446848A // 104
+data4 0x246A53F8 // 105
+data4 0x246E496D // 106
+data4 0x23ED1D36 // 107
+data4 0x2314C258 // 108
+data4 0x233244A7 // 109
+data4 0x245B7AF0 // 110
+data4 0x24247130 // 111
+data4 0x22D67B38 // 112
+data4 0x2449F620 // 113
+data4 0x23BBC8B8 // 114
+data4 0x237D3BA0 // 115
+data4 0x245E8F13 // 116
+data4 0x2435573F // 117
+data4 0x242DE666 // 118
+data4 0x2463BC10 // 119
+data4 0x2466587D // 120
+data4 0x2408144B // 121
+data4 0x2405F0E5 // 122
+data4 0x22381CFF // 123
+data4 0x24154F9B // 124
+data4 0x23A4E96E // 125
+data4 0x24052967 // 126
+data4 0x2406963F // 127
+data4 0x23F7D3CB // 128
+data4 0x2448AFF4 // 129
+data4 0x24657A21 // 130
+data4 0x22FBC230 // 131
+data4 0x243C8DEA // 132
+data4 0x225DC4B7 // 133
+data4 0x23496EBF // 134
+data4 0x237C2B2B // 135
+data4 0x23A4A5B1 // 136
+data4 0x2394E9D1 // 137
+data4 0x244BC950 // 138
+data4 0x23C7448F // 139
+data4 0x2404A1AD // 140
+data4 0x246511D5 // 141
+data4 0x24246526 // 142
+data4 0x23111F57 // 143
+data4 0x22868951 // 144
+data4 0x243EB77F // 145
+data4 0x239F3DFF // 146
+data4 0x23089666 // 147
+data4 0x23EBFA6A // 148
+data4 0x23C51312 // 149
+data4 0x23E1DD5E // 150
+data4 0x232C0944 // 151
+data4 0x246A741F // 152
+data4 0x2414DF8D // 153
+data4 0x247B5546 // 154
+data4 0x2415C980 // 155
+data4 0x24324ABD // 156
+data4 0x234EB5E5 // 157
+data4 0x2465E43E // 158
+data4 0x242840D1 // 159
+data4 0x24444057 // 160
+data4 0x245E56F0 // 161
+data4 0x21AE30F8 // 162
+data4 0x23FB3283 // 163
+data4 0x247A4D07 // 164
+data4 0x22AE314D // 165
+data4 0x246B7727 // 166
+data4 0x24EAD526 // 167
+data4 0x24B41DC9 // 168
+data4 0x24EE8062 // 169
+data4 0x24A0C7C4 // 170
+data4 0x24E8DA67 // 171
+data4 0x231120F7 // 172
+data4 0x24401FFB // 173
+data4 0x2412DD09 // 174
+data4 0x248C131A // 175
+data4 0x24C0A7CE // 176
+data4 0x243DD4C8 // 177
+data4 0x24457FEB // 178
+data4 0x24DEEFBB // 179
+data4 0x243C70AE // 180
+data4 0x23E7A6FA // 181
+data4 0x24C2D311 // 182
+data4 0x23026255 // 183
+data4 0x2437C9B9 // 184
+data4 0x246BA847 // 185
+data4 0x2420B448 // 186
+data4 0x24C4CF5A // 187
+data4 0x242C4981 // 188
+data4 0x24DE1525 // 189
+data4 0x24F5CC33 // 190
+data4 0x235A85DA // 191
+data4 0x24A0B64F // 192
+data4 0x244BA0A4 // 193
+data4 0x24AAF30A // 194
+data4 0x244C86F9 // 195
+data4 0x246D5B82 // 196
+data4 0x24529347 // 197
+data4 0x240DD008 // 198
+data4 0x24E98790 // 199
+data4 0x2489B0CE // 200
+data4 0x22BC29AC // 201
+data4 0x23F37C7A // 202
+data4 0x24987FE8 // 203
+data4 0x22AFE20B // 204
+data4 0x24C8D7C2 // 205
+data4 0x24B28B7D // 206
+data4 0x23B6B271 // 207
+data4 0x24C77CB6 // 208
+data4 0x24EF1DCA // 209
+data4 0x24A4F0AC // 210
+data4 0x24CF113E // 211
+data4 0x2496BBAB // 212
+data4 0x23C7CC8A // 213
+data4 0x23AE3961 // 214
+data4 0x2410A895 // 215
+data4 0x23CE3114 // 216
+data4 0x2308247D // 217
+data4 0x240045E9 // 218
+data4 0x24974F60 // 219
+data4 0x242CB39F // 220
+data4 0x24AB8D69 // 221
+data4 0x23436788 // 222
+data4 0x24305E9E // 223
+data4 0x243E71A9 // 224
+data4 0x23C2A6B3 // 225
+data4 0x23FFE6CF // 226
+data4 0x2322D801 // 227
+data4 0x24515F21 // 228
+data4 0x2412A0D6 // 229
+data4 0x24E60D44 // 230
+data4 0x240D9251 // 231
+data4 0x247076E2 // 232
+data4 0x229B101B // 233
+data4 0x247B12DE // 234
+data4 0x244B9127 // 235
+data4 0x2499EC42 // 236
+data4 0x21FC3963 // 237
+data4 0x23E53266 // 238
+data4 0x24CE102D // 239
+data4 0x23CC45D2 // 240
+data4 0x2333171D // 241
+data4 0x246B3533 // 242
+data4 0x24931129 // 243
+data4 0x24405FFA // 244
+data4 0x24CF464D // 245
+data4 0x237095CD // 246
+data4 0x24F86CBD // 247
+data4 0x24E2D84B // 248
+data4 0x21ACBB44 // 249
+data4 0x24F43A8C // 250
+data4 0x249DB931 // 251
+data4 0x24A385EF // 252
+data4 0x238B1279 // 253
+data4 0x2436213E // 254
+data4 0x24F18A3B // 255
+LOCAL_OBJECT_END(log_data)
+
+
+LOCAL_OBJECT_START(log10_data)
+// coefficients of polynomial approximation
+data8 0x3FC2494104381A8E // A7
+data8 0xBFC5556D556BBB69 // A6
+//
+// two parts of ln(2)/ln(10)
+data8 0x3FD3441350900000, 0x3DCEF3FDE623E256
+//
+data8 0xDE5BD8A937287195,0x3FFD // 1/ln(10)
+//
+data8 0x3FC999999988B5E9 // A5
+data8 0xBFCFFFFFFFF6FFF5 // A4
+//
+// Hi parts of ln(1/frcpa(1+i/256))/ln(10), i=0...255
+data8 0x3F4BD27045BFD024 // 0
+data8 0x3F64E84E793A474A // 1
+data8 0x3F7175085AB85FF0 // 2
+data8 0x3F787CFF9D9147A5 // 3
+data8 0x3F7EA9D372B89FC8 // 4
+data8 0x3F82DF9D95DA961C // 5
+data8 0x3F866DF172D6372B // 6
+data8 0x3F898D79EF5EEDEF // 7
+data8 0x3F8D22ADF3F9579C // 8
+data8 0x3F9024231D30C398 // 9
+data8 0x3F91F23A98897D49 // 10
+data8 0x3F93881A7B818F9E // 11
+data8 0x3F951F6E1E759E35 // 12
+data8 0x3F96F2BCE7ADC5B4 // 13
+data8 0x3F988D362CDF359E // 14
+data8 0x3F9A292BAF010981 // 15
+data8 0x3F9BC6A03117EB97 // 16
+data8 0x3F9D65967DE3AB08 // 17
+data8 0x3F9F061167FC31E7 // 18
+data8 0x3FA05409E4F7819B // 19
+data8 0x3FA125D0432EA20D // 20
+data8 0x3FA1F85D440D299B // 21
+data8 0x3FA2AD755749617C // 22
+data8 0x3FA381772A00E603 // 23
+data8 0x3FA45643E165A70A // 24
+data8 0x3FA52BDD034475B8 // 25
+data8 0x3FA5E3966B7E9295 // 26
+data8 0x3FA6BAAF47C5B244 // 27
+data8 0x3FA773B3E8C4F3C7 // 28
+data8 0x3FA84C51EBEE8D15 // 29
+data8 0x3FA906A6786FC1CA // 30
+data8 0x3FA9C197ABF00DD6 // 31
+data8 0x3FAA9C78712191F7 // 32
+data8 0x3FAB58C09C8D637C // 33
+data8 0x3FAC15A8BCDD7B7E // 34
+data8 0x3FACD331E2C2967B // 35
+data8 0x3FADB11ED766ABF4 // 36
+data8 0x3FAE70089346A9E6 // 37
+data8 0x3FAF2F96C6754AED // 38
+data8 0x3FAFEFCA8D451FD5 // 39
+data8 0x3FB0585283764177 // 40
+data8 0x3FB0B913AAC7D3A6 // 41
+data8 0x3FB11A294F2569F5 // 42
+data8 0x3FB16B51A2696890 // 43
+data8 0x3FB1CD03ADACC8BD // 44
+data8 0x3FB22F0BDD7745F5 // 45
+data8 0x3FB2916ACA38D1E7 // 46
+data8 0x3FB2F4210DF7663C // 47
+data8 0x3FB346A6C3C49065 // 48
+data8 0x3FB3A9FEBC605409 // 49
+data8 0x3FB3FD0C10A3AA54 // 50
+data8 0x3FB46107D3540A81 // 51
+data8 0x3FB4C55DD16967FE // 52
+data8 0x3FB51940330C000A // 53
+data8 0x3FB56D620EE7115E // 54
+data8 0x3FB5D2ABCF26178D // 55
+data8 0x3FB6275AA5DEBF81 // 56
+data8 0x3FB68D4EAF26D7EE // 57
+data8 0x3FB6E28C5C54A28D // 58
+data8 0x3FB7380B9665B7C7 // 59
+data8 0x3FB78DCCC278E85B // 60
+data8 0x3FB7F50C2CF25579 // 61
+data8 0x3FB84B5FD5EAEFD7 // 62
+data8 0x3FB8A1F6BAB2B226 // 63
+data8 0x3FB8F8D144557BDF // 64
+data8 0x3FB94FEFDCD61D92 // 65
+data8 0x3FB9A752EF316149 // 66
+data8 0x3FB9FEFAE7611EDF // 67
+data8 0x3FBA56E8325F5C86 // 68
+data8 0x3FBAAF1B3E297BB3 // 69
+data8 0x3FBB079479C372AC // 70
+data8 0x3FBB6054553B12F7 // 71
+data8 0x3FBBB95B41AB5CE5 // 72
+data8 0x3FBC12A9B13FE079 // 73
+data8 0x3FBC6C4017382BEA // 74
+data8 0x3FBCB41FBA42686C // 75
+data8 0x3FBD0E38CE73393E // 76
+data8 0x3FBD689B2193F132 // 77
+data8 0x3FBDC3472B1D285F // 78
+data8 0x3FBE0C06300D528B // 79
+data8 0x3FBE6738190E394B // 80
+data8 0x3FBEC2B50D208D9A // 81
+data8 0x3FBF0C1C2B936827 // 82
+data8 0x3FBF68216C9CC726 // 83
+data8 0x3FBFB1F6381856F3 // 84
+data8 0x3FC00742AF4CE5F8 // 85
+data8 0x3FC02C64906512D2 // 86
+data8 0x3FC05AF1E63E03B4 // 87
+data8 0x3FC0804BEA723AA8 // 88
+data8 0x3FC0AF1FD6711526 // 89
+data8 0x3FC0D4B2A88059FF // 90
+data8 0x3FC0FA5EF136A06C // 91
+data8 0x3FC1299A4FB3E305 // 92
+data8 0x3FC14F806253C3EC // 93
+data8 0x3FC175805D1587C1 // 94
+data8 0x3FC19B9A637CA294 // 95
+data8 0x3FC1CB5FC26EDE16 // 96
+data8 0x3FC1F1B4E65F2590 // 97
+data8 0x3FC218248B5DC3E5 // 98
+data8 0x3FC23EAED62ADC76 // 99
+data8 0x3FC26553EBD337BC // 100
+data8 0x3FC28C13F1B118FF // 101
+data8 0x3FC2BCAA14381385 // 102
+data8 0x3FC2E3A740B7800E // 103
+data8 0x3FC30ABFD8F333B6 // 104
+data8 0x3FC331F403985096 // 105
+data8 0x3FC35943E7A6068F // 106
+data8 0x3FC380AFAC6E7C07 // 107
+data8 0x3FC3A8377997B9E5 // 108
+data8 0x3FC3CFDB771C9ADB // 109
+data8 0x3FC3EDA90D39A5DE // 110
+data8 0x3FC4157EC09505CC // 111
+data8 0x3FC43D7113FB04C0 // 112
+data8 0x3FC4658030AD1CCE // 113
+data8 0x3FC48DAC404638F5 // 114
+data8 0x3FC4B5F56CBBB869 // 115
+data8 0x3FC4DE5BE05E7582 // 116
+data8 0x3FC4FCBC0776FD85 // 117
+data8 0x3FC525561E9256EE // 118
+data8 0x3FC54E0DF3198865 // 119
+data8 0x3FC56CAB7112BDE2 // 120
+data8 0x3FC59597BA735B15 // 121
+data8 0x3FC5BEA23A506FD9 // 122
+data8 0x3FC5DD7E08DE382E // 123
+data8 0x3FC606BDD3F92355 // 124
+data8 0x3FC6301C518A501E // 125
+data8 0x3FC64F3770618915 // 126
+data8 0x3FC678CC14C1E2D7 // 127
+data8 0x3FC6981005ED2947 // 128
+data8 0x3FC6C1DB5F9BB335 // 129
+data8 0x3FC6E1488ECD2880 // 130
+data8 0x3FC70B4B2E7E41B8 // 131
+data8 0x3FC72AE209146BF8 // 132
+data8 0x3FC7551C81BD8DCF // 133
+data8 0x3FC774DD76CC43BD // 134
+data8 0x3FC79F505DB00E88 // 135
+data8 0x3FC7BF3BDE099F30 // 136
+data8 0x3FC7E9E7CAC437F8 // 137
+data8 0x3FC809FE4902D00D // 138
+data8 0x3FC82A2757995CBD // 139
+data8 0x3FC85525C625E098 // 140
+data8 0x3FC8757A79831887 // 141
+data8 0x3FC895E2058D8E02 // 142
+data8 0x3FC8C13437695531 // 143
+data8 0x3FC8E1C812EF32BE // 144
+data8 0x3FC9026F112197E8 // 145
+data8 0x3FC923294888880A // 146
+data8 0x3FC94EEA4B8334F2 // 147
+data8 0x3FC96FD1B639FC09 // 148
+data8 0x3FC990CCA66229AB // 149
+data8 0x3FC9B1DB33334842 // 150
+data8 0x3FC9D2FD740E6606 // 151
+data8 0x3FC9FF49EEDCB553 // 152
+data8 0x3FCA209A84FBCFF7 // 153
+data8 0x3FCA41FF1E43F02B // 154
+data8 0x3FCA6377D2CE9377 // 155
+data8 0x3FCA8504BAE0D9F5 // 156
+data8 0x3FCAA6A5EEEBEFE2 // 157
+data8 0x3FCAC85B878D7878 // 158
+data8 0x3FCAEA259D8FFA0B // 159
+data8 0x3FCB0C0449EB4B6A // 160
+data8 0x3FCB2DF7A5C50299 // 161
+data8 0x3FCB4FFFCA70E4D1 // 162
+data8 0x3FCB721CD17157E2 // 163
+data8 0x3FCB944ED477D4EC // 164
+data8 0x3FCBB695ED655C7C // 165
+data8 0x3FCBD8F2364AEC0F // 166
+data8 0x3FCBFB63C969F4FF // 167
+data8 0x3FCC1DEAC134D4E9 // 168
+data8 0x3FCC4087384F4F80 // 169
+data8 0x3FCC6339498F09E1 // 170
+data8 0x3FCC86010FFC076B // 171
+data8 0x3FCC9D3D065C5B41 // 172
+data8 0x3FCCC029375BA079 // 173
+data8 0x3FCCE32B66978BA4 // 174
+data8 0x3FCD0643AFD51404 // 175
+data8 0x3FCD29722F0DEA45 // 176
+data8 0x3FCD4CB70070FE43 // 177
+data8 0x3FCD6446AB3F8C95 // 178
+data8 0x3FCD87B0EF71DB44 // 179
+data8 0x3FCDAB31D1FE99A6 // 180
+data8 0x3FCDCEC96FDC888E // 181
+data8 0x3FCDE69088763579 // 182
+data8 0x3FCE0A4E4A25C1FF // 183
+data8 0x3FCE2E2315755E32 // 184
+data8 0x3FCE461322D1648A // 185
+data8 0x3FCE6A0E95C7787B // 186
+data8 0x3FCE8E216243DD60 // 187
+data8 0x3FCEA63AF26E007C // 188
+data8 0x3FCECA74ED15E0B7 // 189
+data8 0x3FCEEEC692CCD259 // 190
+data8 0x3FCF070A36B8D9C0 // 191
+data8 0x3FCF2B8393E34A2D // 192
+data8 0x3FCF5014EF538A5A // 193
+data8 0x3FCF68833AF1B17F // 194
+data8 0x3FCF8D3CD9F3F04E // 195
+data8 0x3FCFA5C61ADD93E9 // 196
+data8 0x3FCFCAA8567EBA79 // 197
+data8 0x3FCFE34CC8743DD8 // 198
+data8 0x3FD0042BFD74F519 // 199
+data8 0x3FD016BDF6A18017 // 200
+data8 0x3FD023262F907322 // 201
+data8 0x3FD035CCED8D32A1 // 202
+data8 0x3FD042430E869FFB // 203
+data8 0x3FD04EBEC842B2DF // 204
+data8 0x3FD06182E84FD4AB // 205
+data8 0x3FD06E0CB609D383 // 206
+data8 0x3FD080E60BEC8F12 // 207
+data8 0x3FD08D7E0D894735 // 208
+data8 0x3FD0A06CC96A2055 // 209
+data8 0x3FD0AD131F3B3C55 // 210
+data8 0x3FD0C01771E775FB // 211
+data8 0x3FD0CCCC3CAD6F4B // 212
+data8 0x3FD0D986D91A34A8 // 213
+data8 0x3FD0ECA9B8861A2D // 214
+data8 0x3FD0F972F87FF3D5 // 215
+data8 0x3FD106421CF0E5F7 // 216
+data8 0x3FD11983EBE28A9C // 217
+data8 0x3FD12661E35B7859 // 218
+data8 0x3FD13345D2779D3B // 219
+data8 0x3FD146A6F597283A // 220
+data8 0x3FD15399E81EA83D // 221
+data8 0x3FD16092E5D3A9A6 // 222
+data8 0x3FD17413C3B7AB5D // 223
+data8 0x3FD1811BF629D6FA // 224
+data8 0x3FD18E2A47B46685 // 225
+data8 0x3FD19B3EBE1A4418 // 226
+data8 0x3FD1AEE9017CB450 // 227
+data8 0x3FD1BC0CED7134E1 // 228
+data8 0x3FD1C93712ABC7FF // 229
+data8 0x3FD1D66777147D3E // 230
+data8 0x3FD1EA3BD1286E1C // 231
+data8 0x3FD1F77BED932C4C // 232
+data8 0x3FD204C25E1B031F // 233
+data8 0x3FD2120F28CE69B1 // 234
+data8 0x3FD21F6253C48D00 // 235
+data8 0x3FD22CBBE51D60A9 // 236
+data8 0x3FD240CE4C975444 // 237
+data8 0x3FD24E37F8ECDAE7 // 238
+data8 0x3FD25BA8215AF7FC // 239
+data8 0x3FD2691ECC29F042 // 240
+data8 0x3FD2769BFFAB2DFF // 241
+data8 0x3FD2841FC23952C9 // 242
+data8 0x3FD291AA1A384978 // 243
+data8 0x3FD29F3B0E15584A // 244
+data8 0x3FD2B3A0EE479DF7 // 245
+data8 0x3FD2C142842C09E5 // 246
+data8 0x3FD2CEEACCB7BD6C // 247
+data8 0x3FD2DC99CE82FF20 // 248
+data8 0x3FD2EA4F902FD7D9 // 249
+data8 0x3FD2F80C186A25FC // 250
+data8 0x3FD305CF6DE7B0F6 // 251
+data8 0x3FD3139997683CE7 // 252
+data8 0x3FD3216A9BB59E7C // 253
+data8 0x3FD32F4281A3CEFE // 254
+data8 0x3FD33D2150110091 // 255
+//
+// Lo parts of ln(1/frcpa(1+i/256))/ln(10), i=0...255
+data4 0x1FB0EB5A // 0
+data4 0x206E5EE3 // 1
+data4 0x208F3609 // 2
+data4 0x2070EB03 // 3
+data4 0x1F314BAE // 4
+data4 0x217A889D // 5
+data4 0x21E63650 // 6
+data4 0x21C2F4A3 // 7
+data4 0x2192A10C // 8
+data4 0x1F84B73E // 9
+data4 0x2243FBCA // 10
+data4 0x21BD9C51 // 11
+data4 0x213C542B // 12
+data4 0x21047386 // 13
+data4 0x21217D8F // 14
+data4 0x226791B7 // 15
+data4 0x204CCE66 // 16
+data4 0x2234CE9F // 17
+data4 0x220675E2 // 18
+data4 0x22B8E5BA // 19
+data4 0x22C12D14 // 20
+data4 0x211D41F0 // 21
+data4 0x228507F3 // 22
+data4 0x22F7274B // 23
+data4 0x22A7FDD1 // 24
+data4 0x2244A06E // 25
+data4 0x215DCE69 // 26
+data4 0x22F5C961 // 27
+data4 0x22EBEF29 // 28
+data4 0x222A2CB6 // 29
+data4 0x22B9FE00 // 30
+data4 0x22E79EB7 // 31
+data4 0x222F9607 // 32
+data4 0x2189D87F // 33
+data4 0x2236DB45 // 34
+data4 0x22ED77FB // 35
+data4 0x21CB70F0 // 36
+data4 0x21B8ACE8 // 37
+data4 0x22EC58C1 // 38
+data4 0x22CFCC1C // 39
+data4 0x2343E77A // 40
+data4 0x237FBC7F // 41
+data4 0x230D472E // 42
+data4 0x234686FB // 43
+data4 0x23770425 // 44
+data4 0x223977EC // 45
+data4 0x2345800A // 46
+data4 0x237BC351 // 47
+data4 0x23191502 // 48
+data4 0x232BAC12 // 49
+data4 0x22692421 // 50
+data4 0x234D409D // 51
+data4 0x22EC3214 // 52
+data4 0x2376C916 // 53
+data4 0x22B00DD1 // 54
+data4 0x2309D910 // 55
+data4 0x22F925FD // 56
+data4 0x22A63A7B // 57
+data4 0x2106264A // 58
+data4 0x234227F9 // 59
+data4 0x1ECB1978 // 60
+data4 0x23460A62 // 61
+data4 0x232ED4B1 // 62
+data4 0x226DDC38 // 63
+data4 0x1F101A73 // 64
+data4 0x21B1F82B // 65
+data4 0x22752F19 // 66
+data4 0x2320BC15 // 67
+data4 0x236EEC5E // 68
+data4 0x23404D3E // 69
+data4 0x2304C517 // 70
+data4 0x22F7441A // 71
+data4 0x230D3D7A // 72
+data4 0x2264A9DF // 73
+data4 0x22410CC8 // 74
+data4 0x2342CCCB // 75
+data4 0x23560BD4 // 76
+data4 0x237BBFFE // 77
+data4 0x2373A206 // 78
+data4 0x22C871B9 // 79
+data4 0x2354B70C // 80
+data4 0x232EDB33 // 81
+data4 0x235DB680 // 82
+data4 0x230EF422 // 83
+data4 0x235316CA // 84
+data4 0x22EEEE8B // 85
+data4 0x2375C88C // 86
+data4 0x235ABD21 // 87
+data4 0x23A0D232 // 88
+data4 0x23F5FFB5 // 89
+data4 0x23D3CEC8 // 90
+data4 0x22A92204 // 91
+data4 0x238C64DF // 92
+data4 0x23B82896 // 93
+data4 0x22D633B8 // 94
+data4 0x23861E93 // 95
+data4 0x23CB594B // 96
+data4 0x2330387E // 97
+data4 0x21CD4702 // 98
+data4 0x2284C505 // 99
+data4 0x23D6995C // 100
+data4 0x23F6C807 // 101
+data4 0x239CEF5C // 102
+data4 0x239442B0 // 103
+data4 0x22B35EE5 // 104
+data4 0x2391E9A4 // 105
+data4 0x23A390F5 // 106
+data4 0x2349AC9C // 107
+data4 0x23FA5535 // 108
+data4 0x21E3A46A // 109
+data4 0x23B44ABA // 110
+data4 0x23CEA8E0 // 111
+data4 0x23F647DC // 112
+data4 0x2390D1A8 // 113
+data4 0x23D0CFA2 // 114
+data4 0x236E0872 // 115
+data4 0x23B88B91 // 116
+data4 0x2283C359 // 117
+data4 0x232F647F // 118
+data4 0x23122CD7 // 119
+data4 0x232CF564 // 120
+data4 0x232630FD // 121
+data4 0x23BEE1C8 // 122
+data4 0x23B2BD30 // 123
+data4 0x2301F1C0 // 124
+data4 0x23CE4D67 // 125
+data4 0x23A353C9 // 126
+data4 0x238086E8 // 127
+data4 0x22D0D29E // 128
+data4 0x23A3B3C8 // 129
+data4 0x23F69F4B // 130
+data4 0x23EA3C21 // 131
+data4 0x23951C88 // 132
+data4 0x2372AFFC // 133
+data4 0x23A6D1A8 // 134
+data4 0x22BBBAF4 // 135
+data4 0x227FA3DD // 136
+data4 0x23804D9B // 137
+data4 0x232D771F // 138
+data4 0x239CB57B // 139
+data4 0x2303CF34 // 140
+data4 0x22218C2A // 141
+data4 0x23991BEE // 142
+data4 0x23EB3596 // 143
+data4 0x230487FA // 144
+data4 0x2135DF4C // 145
+data4 0x2380FD2D // 146
+data4 0x23EB75E9 // 147
+data4 0x211C62C8 // 148
+data4 0x23F518F1 // 149
+data4 0x23FEF882 // 150
+data4 0x239097C7 // 151
+data4 0x223E2BDA // 152
+data4 0x23988F89 // 153
+data4 0x22E4A4AD // 154
+data4 0x23F03D9C // 155
+data4 0x23F5018F // 156
+data4 0x23E1E250 // 157
+data4 0x23FD3D90 // 158
+data4 0x22DEE2FF // 159
+data4 0x238342AB // 160
+data4 0x22E6736F // 161
+data4 0x233AFC28 // 162
+data4 0x2395F661 // 163
+data4 0x23D8B991 // 164
+data4 0x23CD58D5 // 165
+data4 0x21941FD6 // 166
+data4 0x23352915 // 167
+data4 0x235D09EE // 168
+data4 0x22DC7EF9 // 169
+data4 0x238BC9F3 // 170
+data4 0x2397DF8F // 171
+data4 0x2380A7BB // 172
+data4 0x23EFF48C // 173
+data4 0x21E67408 // 174
+data4 0x236420F7 // 175
+data4 0x22C8DFB5 // 176
+data4 0x239B5D35 // 177
+data4 0x23BDC09D // 178
+data4 0x239E822C // 179
+data4 0x23984F0A // 180
+data4 0x23EF2119 // 181
+data4 0x23F738B8 // 182
+data4 0x23B66187 // 183
+data4 0x23B06AD7 // 184
+data4 0x2369140F // 185
+data4 0x218DACE6 // 186
+data4 0x21DF23F1 // 187
+data4 0x235D8B34 // 188
+data4 0x23460333 // 189
+data4 0x23F11D62 // 190
+data4 0x23C37147 // 191
+data4 0x22B2AE2A // 192
+data4 0x23949211 // 193
+data4 0x23B69799 // 194
+data4 0x23DBEC75 // 195
+data4 0x229A6FB3 // 196
+data4 0x23FC6C60 // 197
+data4 0x22D01FFC // 198
+data4 0x235985F0 // 199
+data4 0x23F7ECA5 // 200
+data4 0x23F924D3 // 201
+data4 0x2381B92F // 202
+data4 0x243A0FBE // 203
+data4 0x24712D72 // 204
+data4 0x24594E2F // 205
+data4 0x220CD12A // 206
+data4 0x23D87FB0 // 207
+data4 0x2338288A // 208
+data4 0x242BB2CC // 209
+data4 0x220F6265 // 210
+data4 0x23BB7FE3 // 211
+data4 0x2301C0A2 // 212
+data4 0x246709AB // 213
+data4 0x23A619E2 // 214
+data4 0x24030E3B // 215
+data4 0x233C36CC // 216
+data4 0x241AAB77 // 217
+data4 0x243D41A3 // 218
+data4 0x23834A60 // 219
+data4 0x236AC7BF // 220
+data4 0x23B6D597 // 221
+data4 0x210E9474 // 222
+data4 0x242156E6 // 223
+data4 0x243A1D68 // 224
+data4 0x2472187C // 225
+data4 0x23834E86 // 226
+data4 0x23CA0807 // 227
+data4 0x24745887 // 228
+data4 0x23E2B0E1 // 229
+data4 0x2421EB67 // 230
+data4 0x23DCC64E // 231
+data4 0x22DF71D1 // 232
+data4 0x238D5ECA // 233
+data4 0x23CDE86F // 234
+data4 0x24131F45 // 235
+data4 0x240FE4E2 // 236
+data4 0x2317731A // 237
+data4 0x24015C76 // 238
+data4 0x2301A4E8 // 239
+data4 0x23E52A6D // 240
+data4 0x247D8A0D // 241
+data4 0x23DFEEBA // 242
+data4 0x22139FEC // 243
+data4 0x2454A112 // 244
+data4 0x23C21E28 // 245
+data4 0x2460D813 // 246
+data4 0x24258924 // 247
+data4 0x2425680F // 248
+data4 0x24194D1E // 249
+data4 0x24242C2F // 250
+data4 0x243DDE5E // 251
+data4 0x23DEB388 // 252
+data4 0x23E0E6EB // 253
+data4 0x24393E74 // 254
+data4 0x241B1863 // 255
+LOCAL_OBJECT_END(log10_data)
+
+
+
+// Code
+//==============================================================
-// log10 has p7 true, p8 false
-// log has p8 true, p7 false
+// log has p13 true, p14 false
+// log10 has p14 true, p13 false
.section .text
-.proc log10#
-.align 32
-
-log10:
-#ifdef _LIBC
-.global __ieee754_log10
-.type __ieee754_log10,@function
-__ieee754_log10:
-#endif
+GLOBAL_IEEE754_ENTRY(log10)
{ .mfi
- alloc r32=ar.pfs,1,15,4,0
- frcpa.s1 log_C,p9 = f1,f8
- cmp.eq.unc p7,p8 = r0, r0
-}
-{ .mfb
- addl log_AD_1 = @ltoff(log_table_1), gp
- fnorm.s1 log_NORM_f8 = f8
- br.sptk L(LOG_LOG10_X)
+ getf.exp GR_Exp = f8 // if x is unorm then must recompute
+ frcpa.s1 FR_RcpX,p0 = f1,f8
+ mov GR_05 = 0xFFFE // biased exponent of A2=0.5
}
-;;
-
-.endp log10
-ASM_SIZE_DIRECTIVE(log10)
-ASM_SIZE_DIRECTIVE(__ieee754_log10)
-
-
-.section .text
-.proc log#
-.align 32
-log:
-#ifdef _LIBC
-.global __ieee754_log
-.type __ieee754_log,@function
-__ieee754_log:
-#endif
+{ .mlx
+ addl GR_ad_1 = @ltoff(log10_data),gp
+ movl GR_A3 = 0x3fd5555555555557 // double precision memory
+ // representation of A3
+};;
{ .mfi
- alloc r32=ar.pfs,1,15,4,0
- frcpa.s1 log_C,p9 = f1,f8
- cmp.eq.unc p8,p7 = r0, r0
+ getf.sig GR_Sig = f8 // get significand to calculate index
+ fclass.m p8,p0 = f8,9 // is x positive unorm?
+ mov GR_xorg = 0x3fefe // double precision memory msb of 255/256
}
-{ .mfi
- addl log_AD_1 = @ltoff(log_table_1), gp
- fnorm.s1 log_NORM_f8 = f8
- nop.i 999
-}
-;;
-
-L(LOG_LOG10_X):
+{ .mib
+ ld8 GR_ad_1 = [GR_ad_1]
+ cmp.eq p14,p13 = r0,r0 // set p14 to 1 for log10
+ br.cond.sptk log_log10_common
+};;
+GLOBAL_IEEE754_END(log10)
+GLOBAL_IEEE754_ENTRY(log)
{ .mfi
- ld8 log_AD_1 = [log_AD_1]
- fclass.m.unc p15,p0 = f8, 0x0b // Test for x=unorm
- mov log_GR_fff9 = 0xfff9
-}
-{ .mfi
- mov log_GR_half_exp = 0x0fffe
- fms.s1 log_w = f8,f1,f1
- mov log_GR_exp_17_ones = 0x1ffff
-}
-;;
-
-{ .mmi
- getf.exp log_GR_signexp_f8 = f8 // If x unorm then must recompute
- setf.exp log_half = log_GR_half_exp // Form 0.5 = -Q1
- nop.i 999
-}
-;;
-
-{ .mmb
- adds log_AD_2 = 0x30, log_AD_1
- mov log_GR_exp_16_ones = 0xffff
-(p15) br.cond.spnt L(LOG_DENORM)
-}
-;;
-
-L(LOG_COMMON):
-{.mfi
- ldfpd log_P5,log_P4 = [log_AD_1],16
- fclass.m.unc p6,p0 = f8, 0xc3 // Test for x=nan
- and log_GR_exp_f8 = log_GR_signexp_f8, log_GR_exp_17_ones
+ getf.exp GR_Exp = f8 // if x is unorm then must recompute
+ frcpa.s1 FR_RcpX,p0 = f1,f8
+ mov GR_05 = 0xfffe
}
-{.mfi
- ldfpd log_P3,log_P2 = [log_AD_2],16
- nop.f 999
- nop.i 999
-}
-;;
+{ .mlx
+ addl GR_ad_1 = @ltoff(log_data),gp
+ movl GR_A3 = 0x3fd5555555555557 // double precision memory
+ // representation of A3
+};;
{ .mfi
- ldfpd log_Q8,log_Q7 = [log_AD_1],16
- fclass.m.unc p11,p0 = f8, 0x21 // Test for x=+inf
- sub log_GR_true_exp_f8 = log_GR_exp_f8, log_GR_exp_16_ones
+ getf.sig GR_Sig = f8 // get significand to calculate index
+ fclass.m p8,p0 = f8,9 // is x positive unorm?
+ mov GR_xorg = 0x3fefe // double precision memory msb of 255/256
}
{ .mfi
- ldfpd log_Q6,log_Q5 = [log_AD_2],16
- nop.f 999
- nop.i 999
-}
-;;
-
+ ld8 GR_ad_1 = [GR_ad_1]
+ nop.f 0
+ cmp.eq p13,p14 = r0,r0 // set p13 to 1 for log
+};;
+log_log10_common:
{ .mfi
- ldfpd log_Q4,log_Q3 = [log_AD_1],16
- fma.s1 log_wsq = log_w, log_w, f0
- nop.i 999
-}
-{ .mfb
- ldfpd log_Q2,log_Q1 = [log_AD_2],16
-(p6) fma.d.s0 f8 = f8,f1,f0 // quietize nan result if x=nan
-(p6) br.ret.spnt b0 // Exit for x=nan
+ getf.d GR_x = f8 // double precision memory representation of x
+ fclass.m p9,p0 = f8,0x1E1 // is x NaN, NaT or +Inf?
+ dep.z GR_dx = 3, 44, 2 // Create 0x0000300000000000
+ // Difference between double precision
+ // memory representations of 257/256 and
+ // 255/256
}
-;;
-
-
{ .mfi
- setf.sig log_int_Nfloat = log_GR_true_exp_f8
- fcmp.eq.s1 p10,p0 = log_NORM_f8, f1 // Test for x=+1.0
- nop.i 999
-}
-{ .mfb
- nop.m 999
- fms.s1 log_r = log_C,f8,f1
-(p11) br.ret.spnt b0 // Exit for x=+inf
-}
-;;
-
-
-{ .mmf
- getf.sig log_GR_significand_f8 = log_NORM_f8
- ldfe log_inv_ln10 = [log_AD_2],16
- fclass.m.unc p6,p0 = f8, 0x07 // Test for x=0
-}
-;;
-
-
-{ .mfb
- nop.m 999
-(p10) fmerge.s f8 = f0, f0
-(p10) br.ret.spnt b0 // Exit for x=1.0
-;;
-}
-
+ setf.exp FR_A2 = GR_05 // create A2
+ fnorm.s1 FR_NormX = f8
+ mov GR_bias = 0xffff
+};;
+
{ .mfi
- getf.exp log_GR_signexp_w = log_w
- fclass.m.unc p12,p0 = f8, 0x3a // Test for x neg norm, unorm, inf
- shl log_GR_index = log_GR_significand_f8,1
+ setf.d FR_A3 = GR_A3 // create A3
+ fcmp.eq.s1 p12,p0 = f1,f8 // is x equal to 1.0?
+ dep.z GR_xorg = GR_xorg, 44, 19 // 0x3fefe00000000000
+ // double precision memory
+ // representation of 255/256
}
-;;
+{ .mib
+ add GR_ad_2 = 0x30,GR_ad_1 // address of A5,A4
+ add GR_ad_3 = 0x840,GR_ad_1 // address of ln(1/frcpa) lo parts
+(p8) br.cond.spnt log_positive_unorms
+};;
+log_core:
{ .mfi
- ldfe log_log2 = [log_AD_2],16
- fnma.s1 log_rp_q10 = log_half, log_wsq, log_w
- shr.u log_GR_index = log_GR_index,56
+ ldfpd FR_A7,FR_A6 = [GR_ad_1],16
+ fclass.m p10,p0 = f8,0x3A // is x < 0?
+ sub GR_Nm1 = GR_Exp,GR_05 // unbiased_exponent_of_x + 1 (GR_05 = bias - 1)
}
-{ .mfb
- nop.m 999
- fma.s1 log_w3 = log_wsq, log_w, f0
-(p6) br.cond.spnt L(LOG_ZERO_NEG) // Branch if x=0
-;;
-}
-
-
{ .mfi
- and log_GR_exp_w = log_GR_exp_17_ones, log_GR_signexp_w
- fma.s1 log_w4 = log_wsq, log_wsq, f0
- nop.i 999
-}
-{ .mfb
- shladd log_AD_2 = log_GR_index,4,log_AD_2
- fma.s1 log_rsq = log_r, log_r, f0
-(p12) br.cond.spnt L(LOG_ZERO_NEG) // Branch if x<0
-;;
-}
+ ldfpd FR_A5,FR_A4 = [GR_ad_2],16
+(p9) fma.d.s0 f8 = f8,f1,f0 // set V-flag
+ sub GR_N = GR_Exp,GR_bias // unbiased_exponent_of_x
+};;
{ .mfi
- ldfe log_T = [log_AD_2]
- fma.s1 log_rp_p4 = log_P5, log_r, log_P4
- nop.i 999
+ setf.sig FR_N = GR_N // copy unbiased exponent of x to significand
+ fms.s1 FR_r = FR_RcpX,f8,f1 // range reduction for |x-1|>=1/256
+ extr.u GR_Ind = GR_Sig,55,8 // get bits from 55 to 62 as index
}
-{ .mfi
- nop.m 999
- fma.s1 log_rp_p32 = log_P3, log_r, log_P2
- nop.i 999
-;;
-}
-
+{ .mib
+ sub GR_x = GR_x, GR_xorg // get diff between x and 255/256
+ cmp.gtu p6, p7 = 2, GR_Nm1 // p6 true if 0.5 <= x < 2
+(p9) br.ret.spnt b0 // exit for NaN, NaT and +Inf
+};;
{ .mfi
- nop.m 999
- fma.s1 log_rp_q7 = log_Q8, log_w, log_Q7
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 log_rp_q65 = log_Q6, log_w, log_Q5
- nop.i 999
-;;
+ ldfpd FR_Ln2hi,FR_Ln2lo = [GR_ad_1],16
+ fclass.m p11,p0 = f8,0x07 // is x = 0?
+ shladd GR_ad_3 = GR_Ind,2,GR_ad_3 // address of Tlo
}
+{ .mib
+ shladd GR_ad_2 = GR_Ind,3,GR_ad_2 // address of Thi
+(p6) cmp.leu p6, p7 = GR_x, GR_dx // 255/256 <= x <= 257/256
+(p10) br.cond.spnt log_negatives // jump if x is negative
+};;
-// p13 <== large w log
-// p14 <== small w log
+// p6 is true if |x-1| < 1/256
+// p7 is true if |x-1| >= 1/256
{ .mfi
-(p8) cmp.ge.unc p13,p14 = log_GR_exp_w, log_GR_fff9
- fma.s1 log_rp_q3 = log_Q4, log_w, log_Q3
- nop.i 999
-;;
-}
+ ldfd FR_Thi = [GR_ad_2]
+(p6) fms.s1 FR_r = f8,f1,f1 // range reduction for |x-1|<1/256
+ nop.i 0
+};;
-// p10 <== large w log10
-// p11 <== small w log10
-{ .mfi
-(p7) cmp.ge.unc p10,p11 = log_GR_exp_w, log_GR_fff9
- fcvt.xf log_Nfloat = log_int_Nfloat
- nop.i 999
+{ .mmi
+(p7) ldfs FR_Tlo = [GR_ad_3]
+ nop.m 0
+ nop.i 0
}
+{ .mfb
+ nop.m 0
+(p12) fma.d.s0 f8 = f0,f0,f0
+(p12) br.ret.spnt b0 // exit for +1.0
+};;
+.pred.rel "mutex",p6,p7
{ .mfi
- nop.m 999
- fma.s1 log_rp_q21 = log_Q2, log_w3, log_rp_q10
- nop.i 999 ;;
+(p6) mov GR_NearOne = 1
+ fms.s1 FR_A32 = FR_A3,FR_r,FR_A2 // A3*r-A2
+(p7) mov GR_NearOne = 0
}
+{ .mfb
+ ldfe FR_InvLn10 = [GR_ad_1],16
+ fma.s1 FR_r2 = FR_r,FR_r,f0 // r^2
+(p11) br.cond.spnt log_zeroes // jump if x is zero
+};;
{ .mfi
- nop.m 999
- fma.s1 log_rcube = log_rsq, log_r, f0
- nop.i 999
+ nop.m 0
+ fma.s1 FR_A6 = FR_A7,FR_r,FR_A6 // A7*r+A6
+ nop.i 0
}
{ .mfi
- nop.m 999
- fma.s1 log_rp_p10 = log_rsq, log_P1, log_r
- nop.i 999
-;;
-}
+(p7) cmp.eq.unc p9,p0 = r0,r0 // set p9 if |x-1| > 1/256
+ fma.s1 FR_A4 = FR_A5,FR_r,FR_A4 // A5*r+A4
+(p14) cmp.eq.unc p8,p0 = 1,GR_NearOne // set p8 to 1 if it's log10
+ // and argument near 1.0
+};;
{ .mfi
- nop.m 999
- fcmp.eq.s0 p6,p0 = f8,f0 // Sets flag on +denormal input
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 log_rp_p2 = log_rp_p4, log_rsq, log_rp_p32
- nop.i 999
-;;
-}
-
+(p6) getf.exp GR_rexp = FR_r // Get signexp of x-1
+(p7) fcvt.xf FR_N = FR_N
+(p8) cmp.eq p9,p6 = r0,r0 // Also set p9 and clear p6 if log10
+ // and arg near 1
+};;
{ .mfi
- nop.m 999
- fma.s1 log_w6 = log_w3, log_w3, f0
- nop.i 999
+ nop.m 0
+ fma.s1 FR_r4 = FR_r2,FR_r2,f0 // r^4
+ nop.i 0
}
{ .mfi
- nop.m 999
- fma.s1 log_Qlo = log_rp_q7, log_wsq, log_rp_q65
- nop.i 999
-}
-;;
+ nop.m 0
+(p8) fma.s1 FR_NxLn2pT = f0,f0,f0 // Clear NxLn2pT if log10 near 1
+ nop.i 0
+};;
{ .mfi
- nop.m 999
- fma.s1 log_Qhi = log_rp_q3, log_w4, log_rp_q21
- nop.i 999 ;;
+ nop.m 0
+ // (A3*r-A2)*r^2+r
+ fma.s1 FR_A321 = FR_A32,FR_r2,FR_r
+ mov GR_mask = 0x1ffff
}
-
-
{ .mfi
- nop.m 999
- fma.s1 log_T_plus_Nlog2 = log_Nfloat,log_log2, log_T
- nop.i 999 ;;
-}
+ nop.m 0
+ // (A7*r+A6)*r^2+(A5*r+A4)
+ fma.s1 FR_A4 = FR_A6,FR_r2,FR_A4
+ nop.i 0
+};;
{ .mfi
- nop.m 999
- fma.s1 log_r2P_r = log_rp_p2, log_rcube, log_rp_p10
- nop.i 999 ;;
+(p6) and GR_rexp = GR_rexp, GR_mask
+ // N*Ln2hi+Thi
+(p7) fma.s1 FR_NxLn2hipThi = FR_N,FR_Ln2hi,FR_Thi
+ nop.i 0
}
+{ .mfi
+ nop.m 0
+ // N*Ln2lo+Tlo
+(p7) fma.s1 FR_NxLn2lopTlo = FR_N,FR_Ln2lo,FR_Tlo
+ nop.i 0
+};;
-
-// small w, log <== p14
{ .mfi
- nop.m 999
-(p14) fma.d f8 = log_Qlo, log_w6, log_Qhi
- nop.i 999
+(p6) sub GR_rexp = GR_rexp, GR_bias // unbiased exponent of x-1
+(p9) fma.s1 f8 = FR_A4,FR_r4,FR_A321 // P(r) if |x-1| >= 1/256 or
+ // log10 and |x-1| < 1/256
+ nop.i 0
}
{ .mfi
- nop.m 999
- fma.s1 log_Q = log_Qlo, log_w6, log_Qhi
- nop.i 999 ;;
-}
-
+ nop.m 0
+ // (N*Ln2hi+Thi) + (N*Ln2lo+Tlo)
+(p7) fma.s1 FR_NxLn2pT = FR_NxLn2hipThi,f1,FR_NxLn2lopTlo
+ nop.i 0
+};;
{ .mfi
- nop.m 999
-(p10) fma.s1 log_log10_hi = log_T_plus_Nlog2, log_inv_ln10,f0
- nop.i 999 ;;
-}
+(p6) cmp.gt.unc p10, p6 = -40, GR_rexp // Test |x-1| < 2^-40
+ nop.f 0
+ nop.i 0
+};;
-// large w, log <== p13
-.pred.rel "mutex",p13,p10
{ .mfi
- nop.m 999
-(p13) fadd.d f8 = log_T_plus_Nlog2, log_r2P_r
- nop.i 999
-}
-{ .mfi
- nop.m 999
-(p10) fma.s1 log_log10_lo = log_inv_ln10, log_r2P_r,f0
- nop.i 999 ;;
-}
-
+ nop.m 0
+(p10) fma.d.s0 f8 = FR_A32,FR_r2,FR_r // log(x) if |x-1| < 2^-40
+ nop.i 0
+};;
-// small w, log10 <== p11
+.pred.rel "mutex",p6,p9
{ .mfi
- nop.m 999
-(p11) fma.d f8 = log_inv_ln10,log_Q,f0
- nop.i 999 ;;
-}
-
-// large w, log10 <== p10
-{ .mfb
- nop.m 999
-(p10) fma.d f8 = log_log10_hi, f1, log_log10_lo
- br.ret.sptk b0
-;;
+ nop.m 0
+(p6) fma.d.s0 f8 = FR_A4,FR_r4,FR_A321 // log(x) if 2^-40 <= |x-1| < 1/256
+ nop.i 0
}
-
-L(LOG_DENORM):
{ .mfb
- getf.exp log_GR_signexp_f8 = log_NORM_f8
- nop.f 999
- br.cond.sptk L(LOG_COMMON)
-}
-;;
-
-L(LOG_ZERO_NEG):
-
-// qnan snan inf norm unorm 0 -+
-// 0 0 0 0 0 1 11 0x7
-// 0 0 1 1 1 0 10 0x3a
-
-// Save x (f8) in f10
-{ .mfi
- nop.m 999
- fmerge.s f10 = f8,f8
- nop.i 999 ;;
-}
-
-// p8 p9 means ln(+-0) = -inf
-// p7 p10 means log(+-0) = -inf
-
-// p13 means ln(-)
-// p14 means log(-)
-
+ nop.m 0
+(p9) fma.d.s0 f8 = f8,FR_InvLn10,FR_NxLn2pT // result if |x-1| >= 1/256
+ // or log10 and |x-1| < 1/256
+ br.ret.sptk b0
+};;
-{ .mfi
- nop.m 999
- fmerge.ns f6 = f1,f1 // Form -1.0
- nop.i 999 ;;
-}
+.align 32
+log_positive_unorms:
+{ .mmf
+ getf.exp GR_Exp = FR_NormX // recompute biased exponent
+ getf.d GR_x = FR_NormX // recompute double precision x
+ fcmp.eq.s1 p12,p0 = f1,FR_NormX // is x equal to 1.0?
+};;
-// p9 means ln(+-0) = -inf
-// p10 means log(+-0) = -inf
-// Log(+-0) = -inf
+{ .mfb
+ getf.sig GR_Sig = FR_NormX // recompute significand
+ fcmp.eq.s0 p15, p0 = f8, f0 // set denormal flag
+ br.cond.sptk log_core
+};;
+.align 32
+log_zeroes:
{ .mfi
- nop.m 999
-(p8) fclass.m.unc p9,p0 = f10, 0x07
- nop.i 999
+ nop.m 0
+ fmerge.s FR_X = f8,f8 // keep input argument for subsequent
+ // call of __libm_error_support#
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p7) fclass.m.unc p10,p0 = f10, 0x07
- nop.i 999 ;;
-}
-
-
-// p13 ln(-)
-// p14 log(-)
+ nop.m 0
+ fms.s1 FR_tmp = f0,f0,f1 // -1.0
+ nop.i 0
+};;
-// Log(-inf, -normal, -unnormal) = QNAN indefinite
-{ .mfi
- nop.m 999
-(p8) fclass.m.unc p13,p0 = f10, 0x3a
- nop.i 999
-}
+.pred.rel "mutex",p13,p14
{ .mfi
- nop.m 999
-(p7) fclass.m.unc p14,p0 = f10, 0x3a
- nop.i 999 ;;
+(p13) mov GR_TAG = 2 // set libm error in case of log
+ frcpa.s0 f8,p0 = FR_tmp,f0 // log(+/-0) should be equal to -INF.
+ // We can get it using frcpa because it
+ // sets result to the IEEE-754 mandated
+ // quotient of FR_tmp/f0.
+ // Since FR_tmp is -1, the result is -INF
+ nop.i 0
}
+{ .mib
+(p14) mov GR_TAG = 8 // set libm error in case of log10
+ nop.i 0
+ br.cond.sptk log_libm_err
+};;
-
-.pred.rel "mutex",p9,p10
-{ .mfi
-(p9) mov log_GR_tag = 2
-(p9) frcpa f8,p11 = f6,f0
- nop.i 999
-}
+.align 32
+log_negatives:
{ .mfi
-(p10) mov log_GR_tag = 8
-(p10) frcpa f8,p12 = f6,f0
- nop.i 999 ;;
-}
+ nop.m 0
+ fmerge.s FR_X = f8,f8
+ nop.i 0
+};;
.pred.rel "mutex",p13,p14
{ .mfi
-(p13) mov log_GR_tag = 3
-(p13) frcpa f8,p11 = f0,f0
- nop.i 999
-}
-{ .mfb
-(p14) mov log_GR_tag = 9
-(p14) frcpa f8,p12 = f0,f0
- br.cond.sptk __libm_error_region ;;
-}
-.endp log
-ASM_SIZE_DIRECTIVE(log)
-ASM_SIZE_DIRECTIVE(__ieee754_log)
-
-
-// Stack operations when calling error support.
-// (1) (2) (3) (call) (4)
-// sp -> + psp -> + psp -> + sp -> +
-// | | | |
-// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8
-// | | | |
-// | <-GR_Y Y2->| Y2 ->| <- GR_Y |
-// | | | |
-// | | <- GR_X X1 ->| |
-// | | | |
-// sp-64 -> + sp -> + sp -> + +
-// save ar.pfs save b0 restore gp
-// save gp restore ar.pfs
-
+(p13) mov GR_TAG = 3 // set libm error in case of log
+ frcpa.s0 f8,p0 = f0,f0 // log(negatives) should be equal to NaN.
+ // We can get it using frcpa because it
+ // sets result to the IEEE-754 mandated
+ // quotient of f0/f0 i.e. NaN.
+(p14) mov GR_TAG = 9 // set libm error in case of log10
+};;
+.align 32
+log_libm_err:
+{ .mmi
+ alloc r32 = ar.pfs,1,4,4,0
+ mov GR_Parameter_TAG = GR_TAG
+ nop.i 0
+};;
+GLOBAL_IEEE754_END(log)
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
-
-// (1)
{ .mfi
- add GR_Parameter_Y=-32,sp // Parameter 2 value
+ add GR_Parameter_Y = -32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS = ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
+ add sp = -64,sp // Create new stack
nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ mov GR_SAVE_GP = gp // Save gp
};;
-
-// (2)
{ .mmi
- stfd [GR_Parameter_Y] = f1,16 // STORE Parameter 2 on stack
+ stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0 = b0 // Save b0
};;
.body
-// (3)
{ .mib
- stfd [GR_Parameter_X] = f10 // STORE Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
}
{ .mib
- stfd [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
+ stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
- nop.m 0
- nop.m 0
add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
-// (4)
{ .mmi
- ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
+ ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
- add sp = 64,sp // Restore stack pointer
- mov b0 = GR_SAVE_B0 // Restore return address
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
};;
+
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
- mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
- br.ret.sptk b0 // Return
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
};;
-
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
-
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
+
diff --git a/sysdeps/ia64/fpu/e_log2.S b/sysdeps/ia64/fpu/e_log2.S
new file mode 100644
index 0000000000..76793574ea
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_log2.S
@@ -0,0 +1,710 @@
+.file "log2.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//=================================================================
+// 09/11/00 Initial version
+// 03/19/01 Added one polynomial coefficient, to improve accuracy
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+// 04/18/03 Reformatted T[255]
+//
+// API
+//=================================================================
+// double log2(double)
+//
+// Overview of operation
+//=================================================================
+// Background
+//
+// Implementation
+//
+// Let x = 2^l * m, where m=1.b1 b2 ... b8 b9 ... b52
+// y=frcpa(m), r=m*y-1, f=b1 b2 .. b8 (table index)
+// j=0 if f<128; j=1 if f>=128
+// T is a table that stores log2(1/y)-j (in entries 1..255) rounded to
+// double extended precision; f is used as an index; T[255]=0
+//
+// If f=0 and b9=0, r is set to 2^{-8}* 0.b9 b10 ... b52 = m-1 (fractional part of m),
+// and 0 is used instead of T[0]
+// (polynomial evaluation only, for m=1+r, 0<=r<2^{-9})
+// If f=255, r is set to (m-2)/2 (T[255]=0, and only polynomial evaluation is used
+// for m=2(1-r'), 0<=r'<2^{-9})
+//
+// log2(x) is approximated as
+// (l+j) + T[f] + (c1*r+c2*r^2+...+c7*r^7), if f>0
+//
+
+
+// Special values
+//=================================================================
+// log2(0)=-inf, raises Divide by Zero
+// log2(+inf)=inf
+// log2(x)=NaN, raises Invalid if x<0
+//
+
+
+// Registers used
+//==============================================================
+// f6-f15, f32-f33
+// r2-r3, r23-r30
+// p6,p7,p8,p12
+//
+
+
+GR_SAVE_B0 = r33 // local reg: b0 (return address) saved here in __libm_error_region
+GR_SAVE_PFS = r34 // local reg: ar.pfs saved here in __libm_error_region
+GR_SAVE_GP = r35 // local reg: gp saved here in __libm_error_region; this reg. can safely be used
+GR_SAVE_SP = r36 // local reg: not referenced by the code in this file
+
+GR_Parameter_X = r37 // output reg: address of Parameter 1 on the stack
+GR_Parameter_Y = r38 // output reg: address of Parameter 2 on the stack
+GR_Parameter_RESULT = r39 // output reg: address of the result slot on the stack
+GR_Parameter_TAG = r40 // output reg: error tag (170 for zero, 171 for negative input)
+
+FR_X = f10 // stored as Parameter 1 by __libm_error_region
+FR_Y = f1 // stored as Parameter 2 by __libm_error_region
+FR_RESULT = f8 // stored as the result, reloaded into f8 after the call
+
+
+
+
+// Data tables
+//==============================================================
+
+RODATA
+
+.align 16
+
+LOCAL_OBJECT_START(poly_coeffs)
+// Layout is fixed: log2 reads offsets 0, 16, 32 and 48, then expects T_table at offset 64
+data8 0xbfd0000000000000, 0x3fc999999999999a // C_4=-1/4, C_5=1/5 (doubles, read with ldfpd)
+data8 0xbfc5555555555555, 0x3fc2492492492492 // C_6=-1/6, C_7=1/7 (doubles, read with ldfpd)
+data8 0xb8aa3b295c17f0bc, 0x00003fff // C_1=log2(e) (extended format, read with ldfe)
+data8 0xaaaaaaaaaaaaaaab, 0x00003ffd // C_3=1/3 (extended format, read with ldfe via r3)
+LOCAL_OBJECT_END(poly_coeffs)
+
+
+LOCAL_OBJECT_START(T_table)
+
+data8 0xb8d8752172fed131, 0x00003ff6
+data8 0x8ae7f475764180a3, 0x00003ff8
+data8 0xe7f73862e72ee35d, 0x00003ff8
+data8 0xa2b25310c941a2f2, 0x00003ff9
+data8 0xcbb91d671abb2e85, 0x00003ff9
+data8 0xfac91e34daa50483, 0x00003ff9
+data8 0x9504a5042eb495c5, 0x00003ffa
+data8 0xa9c4a0bbb580ee02, 0x00003ffa
+data8 0xc19264dc8a5e3bf9, 0x00003ffa
+data8 0xd67aa6703ebf4a77, 0x00003ffa
+data8 0xee76cac6d6e08ce7, 0x00003ffa
+data8 0x81c3f7de5434ed04, 0x00003ffb
+data8 0x8c563033a3ce01e4, 0x00003ffb
+data8 0x9876e9f09a98661c, 0x00003ffb
+data8 0xa31e0ac9b2326ce2, 0x00003ffb
+data8 0xadcf09e1fd10e4a5, 0x00003ffb
+data8 0xb889f992cf03cdb6, 0x00003ffb
+data8 0xc34eec68d901a714, 0x00003ffb
+data8 0xce1df524e9909ed9, 0x00003ffb
+data8 0xd8f726bcb0b80ad0, 0x00003ffb
+data8 0xe3da945b878e27d1, 0x00003ffb
+data8 0xeec851633b76a320, 0x00003ffb
+data8 0xf82ea4bb6101421a, 0x00003ffb
+data8 0x8197ddd7736b2864, 0x00003ffc
+data8 0x871dad4f994253f0, 0x00003ffc
+data8 0x8ca8cae3e892d549, 0x00003ffc
+data8 0x916d6e1559a4b697, 0x00003ffc
+data8 0x97028118efabeb7d, 0x00003ffc
+data8 0x9bcfbce1592ad5d5, 0x00003ffc
+data8 0xa16ee95d0da54a91, 0x00003ffc
+data8 0xa644dcf3403fa5d0, 0x00003ffc
+data8 0xab1ee14ffd659064, 0x00003ffc
+data8 0xb0cd12faebcc6757, 0x00003ffc
+data8 0xb5affdf9b3b221e0, 0x00003ffc
+data8 0xba970fb307c6ade1, 0x00003ffc
+data8 0xbf824f3a9f3e7561, 0x00003ffc
+data8 0xc544c055fde99333, 0x00003ffc
+data8 0xca39266532bdf26c, 0x00003ffc
+data8 0xcf31d124b8fa2f56, 0x00003ffc
+data8 0xd42ec7f59017b6ab, 0x00003ffc
+data8 0xd930124bea9a2c67, 0x00003ffc
+data8 0xde35b7af70e4dab3, 0x00003ffc
+data8 0xe33fbfbb8533ef03, 0x00003ffc
+data8 0xe77625911a7dcef3, 0x00003ffc
+data8 0xec884bd689cc12e3, 0x00003ffc
+data8 0xf19eeabf9e99a40a, 0x00003ffc
+data8 0xf6ba0a35e3d88051, 0x00003ffc
+data8 0xfbd9b237f7b4192b, 0x00003ffc
+data8 0x80111d4a1ee0c79e, 0x00003ffd
+data8 0x82a523a5f875bbfc, 0x00003ffd
+data8 0x84ccecdc92cd0815, 0x00003ffd
+data8 0x87653369d92c057a, 0x00003ffd
+data8 0x89ffd1742da3aa21, 0x00003ffd
+data8 0x8c2d2227d053d9b6, 0x00003ffd
+data8 0x8e5c189793f7f798, 0x00003ffd
+data8 0x90fd0a20e72f3c96, 0x00003ffd
+data8 0x932fa937301e59ae, 0x00003ffd
+data8 0x95d5061a5f0f5f7f, 0x00003ffd
+data8 0x980b5a2ef10e7023, 0x00003ffd
+data8 0x9a4361c5514d3c27, 0x00003ffd
+data8 0x9c7d1f7d541313fd, 0x00003ffd
+data8 0x9f2b16040b500d04, 0x00003ffd
+data8 0xa168a0fa9db22c98, 0x00003ffd
+data8 0xa3a7eaa1f9116293, 0x00003ffd
+data8 0xa5e8f5b4072a3d44, 0x00003ffd
+data8 0xa82bc4f11a5e88aa, 0x00003ffd
+data8 0xaa705b2001db8317, 0x00003ffd
+data8 0xacb6bb0e1e0f8005, 0x00003ffd
+data8 0xaefee78f75707221, 0x00003ffd
+data8 0xb148e37ec994dd99, 0x00003ffd
+data8 0xb394b1bdaca0bc17, 0x00003ffd
+data8 0xb5e255349707e496, 0x00003ffd
+data8 0xb831d0d2fda791cc, 0x00003ffd
+data8 0xba83278f6838ab20, 0x00003ffd
+data8 0xbcd65c67881c7d47, 0x00003ffd
+data8 0xbeb3e0f21d72dc92, 0x00003ffd
+data8 0xc10a7a03457d35dc, 0x00003ffd
+data8 0xc362f9b6f51eddd3, 0x00003ffd
+data8 0xc5bd6326ebfce656, 0x00003ffd
+data8 0xc7a0b3d0637c8f97, 0x00003ffd
+data8 0xc9fe96af0df8e4b5, 0x00003ffd
+data8 0xcc5e6c214b4a2cd7, 0x00003ffd
+data8 0xce46199f374d29cf, 0x00003ffd
+data8 0xd0a978a14c0d9ebe, 0x00003ffd
+data8 0xd293fecafec7f9b5, 0x00003ffd
+data8 0xd4faf1f6f5cf32e6, 0x00003ffd
+data8 0xd6e8595abaad34d1, 0x00003ffd
+data8 0xd952eb7a8ffc1593, 0x00003ffd
+data8 0xdb433ccd805f171e, 0x00003ffd
+data8 0xddb178dc43e6bd84, 0x00003ffd
+data8 0xdfa4bcfb333342a4, 0x00003ffd
+data8 0xe19953741ccea015, 0x00003ffd
+data8 0xe40cee16a2ff21c5, 0x00003ffd
+data8 0xe6048470cdbde8ea, 0x00003ffd
+data8 0xe7fd7308d6895b14, 0x00003ffd
+data8 0xe9f7bbb6a1ff9f87, 0x00003ffd
+data8 0xec7280138809433d, 0x00003ffd
+data8 0xee6fda4365cd051f, 0x00003ffd
+data8 0xf06e94a122ff1f12, 0x00003ffd
+data8 0xf26eb1151441fce5, 0x00003ffd
+data8 0xf470318b88a77e2f, 0x00003ffd
+data8 0xf67317f4d4c8aa58, 0x00003ffd
+data8 0xf8f8b250a9c4cde6, 0x00003ffd
+data8 0xfafec54831f1a484, 0x00003ffd
+data8 0xfd06449bf3eaea1e, 0x00003ffd
+data8 0xff0f324ddb19ab67, 0x00003ffd
+data8 0x808cc8320a9acf15, 0x00003ffe
+data8 0x8192b0748f2cef06, 0x00003ffe
+data8 0x829952f5e6a24ee5, 0x00003ffe
+data8 0x83a0b0bfafe1424e, 0x00003ffe
+data8 0x8466b29f9c41caea, 0x00003ffe
+data8 0x856f5aae0881d857, 0x00003ffe
+data8 0x8678c0eae8ee8190, 0x00003ffe
+data8 0x8782e6685676b9d7, 0x00003ffe
+data8 0x888dcc3abc4554ec, 0x00003ffe
+data8 0x89997378de7b98b8, 0x00003ffe
+data8 0x8aa5dd3be1044279, 0x00003ffe
+data8 0x8b6facdfd0360ab8, 0x00003ffe
+data8 0x8c7d6db7169e0cdb, 0x00003ffe
+data8 0x8d8bf424d6e130b2, 0x00003ffe
+data8 0x8e575b506f409fa6, 0x00003ffe
+data8 0x8f673e418776492c, 0x00003ffe
+data8 0x9077e9ed700ef9ba, 0x00003ffe
+data8 0x9144ef1baec80b20, 0x00003ffe
+data8 0x9256fcdb537f035f, 0x00003ffe
+data8 0x9369d68d75e7e1d6, 0x00003ffe
+data8 0x943880613b8f9f1e, 0x00003ffe
+data8 0x954cc1d9e0d94206, 0x00003ffe
+data8 0xd3c70a37bdf7a294, 0x0000bffd
+data8 0xd19bb053fb0284ec, 0x0000bffd
+data8 0xcffa1a3b7dafb8bf, 0x0000bffd
+data8 0xcdcbe1e2776479ee, 0x0000bffd
+data8 0xcc282218b8bfdda2, 0x0000bffd
+data8 0xc9f703a9afcb38ac, 0x0000bffd
+data8 0xc851146ab89593c6, 0x0000bffd
+data8 0xc61d08265927a860, 0x0000bffd
+data8 0xc474e39705912d26, 0x0000bffd
+data8 0xc23de19ec30c6e3e, 0x0000bffd
+data8 0xc09381cc45db45b4, 0x0000bffd
+data8 0xbee82b4e025ff90c, 0x0000bffd
+data8 0xbcace101149788ec, 0x0000bffd
+data8 0xbaff46962ea47964, 0x0000bffd
+data8 0xb950b1be5e0c14a2, 0x0000bffd
+data8 0xb7110e6ce866f2bc, 0x0000bffd
+data8 0xb5602ccc2a81db52, 0x0000bffd
+data8 0xb3ae4ce740fc8ef1, 0x0000bffd
+data8 0xb1fb6d92c8240ccc, 0x0000bffd
+data8 0xafb609c09b244abc, 0x0000bffd
+data8 0xae00d1cfdeb43cfd, 0x0000bffd
+data8 0xac4a967a8c8c9bd0, 0x0000bffd
+data8 0xaa93568c249e6c52, 0x0000bffd
+data8 0xa8db10cdff375343, 0x0000bffd
+data8 0xa68e6fc5a42376e3, 0x0000bffd
+data8 0xa4d3c25e68dc57f2, 0x0000bffd
+data8 0xa3180b0c192a3816, 0x0000bffd
+data8 0xa15b488e7aa329a0, 0x0000bffd
+data8 0x9f9d79a30f0e1d5f, 0x0000bffd
+data8 0x9dde9d050ee7d4ac, 0x0000bffd
+data8 0x9c1eb16d63d7356c, 0x0000bffd
+data8 0x9a5db592a310c36a, 0x0000bffd
+data8 0x989ba82907a9016f, 0x0000bffd
+data8 0x96d887e26cd57b79, 0x0000bffd
+data8 0x9514536e481c3a4f, 0x0000bffd
+data8 0x934f0979a3715fc9, 0x0000bffd
+data8 0x9188a8af1742a9d5, 0x0000bffd
+data8 0x8fc12fb6c470995f, 0x0000bffd
+data8 0x8df89d364e34f8f1, 0x0000bffd
+data8 0x8c2eefd0d3f67dd6, 0x0000bffd
+data8 0x8a642626eb093d54, 0x0000bffd
+data8 0x88983ed6985bae58, 0x0000bffd
+data8 0x86cb387b4a0feec6, 0x0000bffd
+data8 0x84fd11add101024b, 0x0000bffd
+data8 0x83c856dd81804b78, 0x0000bffd
+data8 0x81f84c2c62afd6f1, 0x0000bffd
+data8 0x80271d3e4be5ea5a, 0x0000bffd
+data8 0xfca991447e7b485d, 0x0000bffc
+data8 0xf90299c904793a3c, 0x0000bffc
+data8 0xf559511d2dc1ed69, 0x0000bffc
+data8 0xf2e72afee9bd2aee, 0x0000bffc
+data8 0xef39ff1d8a40770e, 0x0000bffc
+data8 0xeb8a7a2311c935dc, 0x0000bffc
+data8 0xe7d8990dc620012f, 0x0000bffc
+data8 0xe560b1e3b86e44b6, 0x0000bffc
+data8 0xe1aadb38caee80c4, 0x0000bffc
+data8 0xddf2a051f81b76a4, 0x0000bffc
+data8 0xdb7678bafcaf4b5f, 0x0000bffc
+data8 0xd7ba3a8f0df19bfc, 0x0000bffc
+data8 0xd3fb8fdbdd5cebdb, 0x0000bffc
+data8 0xd17b191905c35652, 0x0000bffc
+data8 0xcdb85d29cefd7121, 0x0000bffc
+data8 0xc9f32c3c88221ef6, 0x0000bffc
+data8 0xc76e5741a95b5dae, 0x0000bffc
+data8 0xc3a506d80d38c718, 0x0000bffc
+data8 0xbfd938ccef8b68c1, 0x0000bffc
+data8 0xbd4ff63e82eef78c, 0x0000bffc
+data8 0xb97ffa2b563865bd, 0x0000bffc
+data8 0xb6f3eb3011eddcea, 0x0000bffc
+data8 0xb31fb7d64898b3e6, 0x0000bffc
+data8 0xb090d63a409e7880, 0x0000bffc
+data8 0xacb8623c7ffa4f39, 0x0000bffc
+data8 0xa8dd5c83d2e45246, 0x0000bffc
+data8 0xa649e998a8d91f2e, 0x0000bffc
+data8 0xa26a93fed6faa94f, 0x0000bffc
+data8 0x9fd43df079d0db1f, 0x0000bffc
+data8 0x9d3cbe69aecac4c2, 0x0000bffc
+data8 0x99574f13c570d0fb, 0x0000bffc
+data8 0x96bce349bf7ee6c7, 0x0000bffc
+data8 0x92d30c9b86cee18e, 0x0000bffc
+data8 0x9035adef17c5bd5c, 0x0000bffc
+data8 0x8c4765e8e8b5f251, 0x0000bffc
+data8 0x89a70da448316ffa, 0x0000bffc
+data8 0x85b44a24474af78a, 0x0000bffc
+data8 0x8310f17aab5adf70, 0x0000bffc
+data8 0x806c6388d0965f29, 0x0000bffc
+data8 0xf8e69092bf0c5ead, 0x0000bffb
+data8 0xf397608bfd2d90e6, 0x0000bffb
+data8 0xee45be24d0eedbc4, 0x0000bffb
+data8 0xe646af233db881e9, 0x0000bffb
+data8 0xe0eee4e1ce3d06fb, 0x0000bffb
+data8 0xdb94a049e6e87a4f, 0x0000bffb
+data8 0xd3888ef9a4249f5a, 0x0000bffb
+data8 0xce280e6fbac39194, 0x0000bffb
+data8 0xc8c50b72319ad574, 0x0000bffb
+data8 0xc0abcd39f41e329b, 0x0000bffb
+data8 0xbb4279cfa7f9667b, 0x0000bffb
+data8 0xb5d69bac77ec398a, 0x0000bffb
+data8 0xb068306bf20d6233, 0x0000bffb
+data8 0xa83dc1b019ddb6a8, 0x0000bffb
+data8 0xa2c8eb1886c2d024, 0x0000bffb
+data8 0x9d517ee93f8e16c0, 0x0000bffb
+data8 0x97d77aae659b92fb, 0x0000bffb
+data8 0x8f9b91da5736d415, 0x0000bffb
+data8 0x8a1b06b09b7fd1d1, 0x0000bffb
+data8 0x8497daca0a2e077a, 0x0000bffb
+data8 0xfe241745a453f10c, 0x0000bffa
+data8 0xf3132d6708d723c5, 0x0000bffa
+data8 0xe7fcf2e21a0e7d77, 0x0000bffa
+data8 0xd75198b04afb8da9, 0x0000bffa
+data8 0xcc2dfe1a4a8ca305, 0x0000bffa
+data8 0xc10500d63aa65882, 0x0000bffa
+data8 0xb5d69bac77ec398a, 0x0000bffa
+data8 0xaaa2c95dc66abcde, 0x0000bffa
+data8 0x9f6984a342d13101, 0x0000bffa
+data8 0x942ac82e5387ac51, 0x0000bffa
+data8 0x88e68ea899a0976c, 0x0000bffa
+data8 0xefebc4409ccf872e, 0x0000bff9
+data8 0xd947b0c6642ef69e, 0x0000bff9
+data8 0xc2987d51e043d407, 0x0000bff9
+data8 0xabde1eeee6bfd257, 0x0000bff9
+data8 0x95188a9917cf2e01, 0x0000bff9
+data8 0xfc8f6a777c1b7f1e, 0x0000bff8
+data8 0xced727635c59725c, 0x0000bff8
+data8 0xa108358a4c904615, 0x0000bff8
+data8 0xe644fcbeb3ac9c90, 0x0000bff7
+data8 0x8a4bd667bf08e7de, 0x0000bff7
+data8 0x0000000000000000 // T[255] Low
+data8 0x0000000000000000 // T[255] High
+LOCAL_OBJECT_END(T_table)
+
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(log2)
+// double log2(double): x arrives in f8 and the result is returned in f8
+{ .mfi
+ alloc r32=ar.pfs,1,4,4,0
+ // y=frcpa(x)
+ frcpa.s1 f6,p0=f1,f8
+ // will form significand of 1.5 (to test whether the index is 128 or above)
+ mov r24=0xc
+}
+{.mfi
+ nop.m 0
+ // normalize x
+ fma.s1 f7=f8,f1,f0
+ // r2 = address of the linkage-table entry for poly_coeffs (T_table follows poly_coeffs)
+ addl r2 = @ltoff(poly_coeffs), gp;;
+}
+{.mfi
+ // get significand
+ getf.sig r25=f8
+ // f8 denormal ?
+ fclass.m p8,p10=f8,0x9
+ // will form significand of 1.5 (to test whether the index is 128 or above)
+ shl r24=r24,60
+}
+{.mfi
+ mov r26=0x804
+ nop.f 0
+ // r23=bias-1
+ mov r23=0xfffe;;
+}
+
+{.mmf
+ getf.exp r29=f8
+ // load start address of poly_coeffs (C_4...C_7, C_1, C_3), followed by T_table
+ ld8 r2=[r2]
+ // will continue only for positive normal/denormal numbers
+ fclass.nm.unc p12,p7 = f8, 0x19 ;;
+}
+
+.pred.rel "mutex",p8,p10
+{.mfi
+ // denormal input, repeat get significand (after normalization)
+ (p8) getf.sig r25=f7
+ // x=1 ?
+ fcmp.eq.s0 p6,p0=f8,f1
+ // get T_index
+ (p10) shr.u r28=r25,63-8
+}
+{.mfi
+ // f32=0.5
+ setf.exp f32=r23
+ nop.f 0
+ // r27=bias
+ mov r27=0xffff;;
+}
+
+{.mmi
+ // denormal input, repeat get exponent (after normalization)
+ (p8) getf.exp r29=f7
+ mov r23=0xff
+ // r26=0x80400...0 (threshold for using polynomial approximation)
+ shl r26=r26,64-12;;
+}
+
+{.mfb
+ add r3=48,r2
+ // r=x*y-1
+ fms.s1 f6=f6,f8,f1
+ (p12) br.cond.spnt SPECIAL_LOG2
+}
+{.mfi
+ // load C_4, C_5
+ ldfpd f10,f11=[r2],16
+ nop.f 0
+ cmp.geu p12,p0=r25,r24;;
+}
+
+{.mmi
+ // load C_6, C_7
+ ldfpd f12,f13=[r2],16
+ // r27=bias-1 (if index >=128, will add exponent+1)
+ (p12) mov r27=0xfffe
+ (p8) shr.u r28=r25,63-8;;
+}
+
+
+{.mfi
+ // load C_1
+ ldfe f14=[r2],32
+ fmerge.se f7=f1,f7
+ // if first 9 bits after leading 1 are all zero, then p8=1
+ cmp.ltu p8,p12=r25,r26
+}
+{.mfi
+ // load C_3
+ ldfe f15=[r3]
+ nop.f 0
+ // get T_index
+ and r28=r28,r23;;
+}
+{.mfi
+ // r29=exponent-bias
+ sub r29=r29,r27
+ // x=1, return 0
+ (p6) fma.d.s0 f8=f0,f0,f0
+ // get T address
+ shladd r2=r28,4,r2
+}
+{.mfb
+ // first 8 bits after leading 1 are all ones ?
+ cmp.eq p10,p0=r23,r28
+ // if first 9 bits after leading 1 are 0 (p8), use polynomial approx. only: r=m-1
+ (p8) fms.s1 f6=f7,f1,f1
+ // x=1, return
+ (p6) br.ret.spnt b0;;
+}
+{.mfi
+ // r26=1
+ mov r26=1
+ // if first 8 bits after leading 1 are all ones, use polynomial approx. only: r=m/2-1
+ (p10) fms.s1 f6=f7,f32,f1
+ nop.i 0;;
+}
+
+.pred.rel "mutex",p8,p12
+{.mmf
+ // load T (unless first 9 bits after leading 1 are 0)
+ (p12) ldfe f33=[r2]
+ // f8=expon - bias
+ setf.sig f8=r29
+ // set T=0 (if first 9 bits after leading 1 are 0)
+ (p8) fma.s1 f33=f0,f0,f0;;
+}
+
+{.mfi
+ nop.m 0
+ // P12=1-0.5*r
+ fnma.s1 f32=f32,f6,f1
+ // r26=2^{63}
+ shl r26=r26,63
+}
+{.mfi
+ nop.m 0
+ // r2=r*r (r squared, held in f7; not the register r2)
+ fma.s1 f7=f6,f6,f0
+ nop.i 0;;
+}
+{.mfi
+ // significand(x)=1 ? (p6 is set when it is NOT exactly 1)
+ cmp.eq p0,p6=r26,r25
+ // P67=C_6+C_7*r
+ fma.s1 f13=f13,f6,f12
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // P45=C_4+C_5*r
+ fma.s1 f10=f11,f6,f10
+ nop.i 0;;
+}
+
+{.mfi
+ nop.m 0
+ // C_1*r
+ (p6) fma.s1 f14=f14,f6,f0
+ nop.i 0;;
+}
+{.mfi
+ nop.m 0
+ // normalize additive term (l=exponent of x)
+ fcvt.xf f8=f8
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // P13=1-0.5*r+C_3*r^2
+ (p6) fma.s1 f15=f15,f7,f32
+ nop.i 0;;
+}
+
+{.mfi
+ nop.m 0
+ // P47=P45+r2*P67
+ (p6) fma.s1 f13=f13,f7,f10
+ // if significand(x)=1, return exponent (l)
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // r3=r^3 (r cubed, held in f7; not the register r3)
+ (p6) fma.s1 f7=f7,f6,f0
+ nop.i 0;;
+}
+
+{.mfi
+ nop.m 0
+ // add T+l
+ (p6) fma.s1 f8=f8,f1,f33
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // P17=P13+r3*P47
+ (p6) fma.s1 f13=f13,f7,f15
+ nop.i 0;;
+}
+
+{.mfb
+ nop.m 0
+ // result=T+l+(C_1*r)*P17
+ (p6) fma.d.s0 f8=f13,f14,f8
+ // return
+ br.ret.sptk b0;;
+}
+
+
+SPECIAL_LOG2:
+{.mfi
+ nop.m 0
+ // x=+Infinity ?
+ fclass.m p7,p0=f8,0x21
+ nop.i 0;;
+}
+{.mfi
+ nop.m 0
+ // x=+/-Zero ?
+ fclass.m p8,p0=f8,0x7
+ nop.i 0;;
+}
+{.mfi
+ nop.m 0
+ // x=-Infinity, -normal, -denormal ?
+ fclass.m p6,p0=f8,0x3a
+ nop.i 0;;
+}
+{.mfb
+ nop.m 0
+ // log2(+Infinity)=+Infinity
+ nop.f 0
+ (p7) br.ret.spnt b0;;
+}
+{.mfi
+ (p8) mov GR_Parameter_TAG = 170 // error tag for the zero-argument case
+ // log2(+/-0)=-infinity, raises Divide by Zero
+ // set f8=-0
+ (p8) fmerge.ns f8=f0,f8
+ nop.i 0;;
+}
+{.mfb
+ nop.m 0
+ (p8) frcpa.s0 f8,p0=f1,f8 // 1/(-0)
+ (p8) br.cond.sptk __libm_error_region;;
+}
+{.mfb
+ (p6) mov GR_Parameter_TAG = 171 // error tag for the negative-argument case
+ // x<0: return NaN, raise Invalid
+ (p6) frcpa.s0 f8,p0=f0,f0 // 0/0
+ (p6) br.cond.sptk __libm_error_region;;
+}
+// NOTE(review): no instruction on the two error branches above writes FR_X (f10),
+// which __libm_error_region stores as Parameter 1 -- confirm this is intended.
+{.mfb
+ nop.m 0
+ // Remaining cases: NaNs
+ fma.d.s0 f8=f8,f1,f0
+ br.ret.sptk b0;;
+}
+
+GLOBAL_LIBM_END(log2)
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+.prologue // Builds a 64-byte frame and calls __libm_error_support with the slot addresses in r37-r39
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address: old sp-32 = new sp+32
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new 64-byte stack frame
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+{ .mmi
+ stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 at sp+32, then advance pointer to sp+48
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+.body
+{ .mib
+ stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack (NOTE(review): confirm callers load x into FR_X)
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 (result) address = sp+48
+ nop.b 0
+}
+{ .mib
+ stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Point back at Parameter 2 (sp+32)
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp // Recompute result slot address after the call
+};;
+{ .mmi
+ ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+.type __libm_error_support#,@function
+.global __libm_error_support#
+
+
diff --git a/sysdeps/ia64/fpu/e_log2f.S b/sysdeps/ia64/fpu/e_log2f.S
new file mode 100644
index 0000000000..6de2f38720
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_log2f.S
@@ -0,0 +1,550 @@
+.file "log2f.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 09/11/00 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// float log2f(float)
+//
+// Overview of operation
+//==============================================================
+// Background
+//
+// Implementation
+//
+// Let x = 2^l * m, where m=1.b1 b2 ... b8 b9 ... b23
+// y=frcpa(m), r=m*y-1, f=b1 b2 .. b8 (table index)
+// j=0 if f<128; j=1 if f>=128
+// T is a table that stores log2(1/y)-j (in entries 1..255) rounded to
+// double precision; f is used as an index; T[255]=0
+//
+// If f=0 and b9=0, r is set to 2^{-8}* 0.b9 b10 ... b23 = m-1 (fractional part of m),
+// and 0 is used instead of T[0]
+// (polynomial evaluation only, for m=1+r, 0<=r<2^{-9})
+// If f=255, r is set to (m-2)/2 (T[255]=0, and only polynomial evaluation is used
+// for m=2(1-r'), 0<=r'<2^{-9})
+//
+// log2f(x) is approximated as
+// (l+j) + T[f] + (c1*r+c2*r^2+c3*r^3+c4*r^4), if f>0
+//
+
+
+// Special values
+//==============================================================
+// log2f(0)=-inf, raises Divide by Zero
+// log2f(+inf)=inf
+// log2f(x)=NaN, raises Invalid if x<0
+//
+
+
+// Registers used
+//==============================================================
+// f6-f14
+// r2-r3, r23-r30
+// p6,p7,p8,p12
+//
+
+
+GR_SAVE_B0 = r33 // local reg: b0 (return address) saved here in __libm_error_region
+GR_SAVE_PFS = r34 // local reg: ar.pfs saved here in __libm_error_region
+GR_SAVE_GP = r35 // local reg: gp saved here in __libm_error_region; this reg. can safely be used
+GR_SAVE_SP = r36 // local reg: not referenced by the code in this file
+
+GR_Parameter_X = r37 // output reg: address of Parameter 1 on the stack
+GR_Parameter_Y = r38 // output reg: address of Parameter 2 on the stack
+GR_Parameter_RESULT = r39 // output reg: address of the result slot on the stack
+GR_Parameter_TAG = r40 // output reg: error tag (172 for zero, 173 for negative input)
+
+FR_X = f10 // stored as Parameter 1 by __libm_error_region
+FR_Y = f1 // stored as Parameter 2 by __libm_error_region
+FR_RESULT = f8 // stored as the result, reloaded into f8 after the call
+
+
+
+
+// Data tables
+//==============================================================
+
+RODATA
+
+.align 16
+
+LOCAL_OBJECT_START(poly_coeffs)
+// Layout is fixed: log2f reads offsets 0, 16 and 32, then expects T_table at offset 48
+data8 0x3fdec709dc3a03fd, 0xbfd71547652b82fe // C_3=log2(e)/3, C_4=-log2(e)/4 (doubles, read with ldfpd)
+data8 0xb8aa3b295c17f0bc, 0x00003fff // C_1=log2(e) (extended format, read with ldfe)
+data8 0xb8aa3b295c17f0bc, 0x0000bffe // C_2=-log2(e)/2 (extended format, read with ldfe via r3)
+LOCAL_OBJECT_END(poly_coeffs)
+
+
+LOCAL_OBJECT_START(T_table)
+
+data8 0x3f671b0ea42e5fda, 0x3f815cfe8eaec830
+data8 0x3f8cfee70c5ce5dc, 0x3f94564a62192834
+data8 0x3f997723ace35766, 0x3f9f5923c69b54a1
+data8 0x3fa2a094a085d693, 0x3fa538941776b01e
+data8 0x3fa8324c9b914bc7, 0x3faacf54ce07d7e9
+data8 0x3fadced958dadc12, 0x3fb0387efbca869e
+data8 0x3fb18ac6067479c0, 0x3fb30edd3e13530d
+data8 0x3fb463c15936464e, 0x3fb5b9e13c3fa21d
+data8 0x3fb7113f3259e07a, 0x3fb869dd8d1b2035
+data8 0x3fb9c3bea49d3214, 0x3fbb1ee4d7961701
+data8 0x3fbc7b528b70f1c5, 0x3fbdd90a2c676ed4
+data8 0x3fbf05d4976c2028, 0x3fc032fbbaee6d65
+data8 0x3fc0e3b5a9f3284a, 0x3fc195195c7d125b
+data8 0x3fc22dadc2ab3497, 0x3fc2e050231df57d
+data8 0x3fc379f79c2b255b, 0x3fc42ddd2ba1b4a9
+data8 0x3fc4c89b9e6807f5, 0x3fc563dc29ffacb2
+data8 0x3fc619a25f5d798d, 0x3fc6b5ffbf367644
+data8 0x3fc752e1f660f8d6, 0x3fc7f049e753e7cf
+data8 0x3fc8a8980abfbd32, 0x3fc94724cca657be
+data8 0x3fc9e63a24971f46, 0x3fca85d8feb202f7
+data8 0x3fcb2602497d5346, 0x3fcbc6b6f5ee1c9b
+data8 0x3fcc67f7f770a67e, 0x3fcceec4b2234fba
+data8 0x3fcd91097ad13982, 0x3fce33dd57f3d335
+data8 0x3fced74146bc7b10, 0x3fcf7b3646fef683
+data8 0x3fd00223a943dc19, 0x3fd054a474bf0eb7
+data8 0x3fd0999d9b9259a1, 0x3fd0eca66d3b2581
+data8 0x3fd13ffa2e85b475, 0x3fd185a444fa0a7b
+data8 0x3fd1cb8312f27eff, 0x3fd21fa1441ce5e8
+data8 0x3fd265f526e603cb, 0x3fd2baa0c34be1ec
+data8 0x3fd3016b45de21ce, 0x3fd3486c38aa29a8
+data8 0x3fd38fa3efaa8262, 0x3fd3e562c0816a02
+data8 0x3fd42d141f53b646, 0x3fd474fd543f222c
+data8 0x3fd4bd1eb680e548, 0x3fd505789e234bd1
+data8 0x3fd54e0b64003b70, 0x3fd596d761c3c1f0
+data8 0x3fd5dfdcf1eeae0e, 0x3fd6291c6fd9329c
+data8 0x3fd6729637b59418, 0x3fd6bc4aa692e0fd
+data8 0x3fd7063a1a5fb4f2, 0x3fd75064f1ed0715
+data8 0x3fd79acb8cf10390, 0x3fd7d67c1e43ae5c
+data8 0x3fd8214f4068afa7, 0x3fd86c5f36dea3dc
+data8 0x3fd8b7ac64dd7f9d, 0x3fd8f4167a0c6f92
+data8 0x3fd93fd2d5e1bf1d, 0x3fd98bcd84296946
+data8 0x3fd9c8c333e6e9a5, 0x3fda152f142981b4
+data8 0x3fda527fd95fd8ff, 0x3fda9f5e3edeb9e6
+data8 0x3fdadd0b2b5755a7, 0x3fdb2a5d6f51ff83
+data8 0x3fdb686799b00be3, 0x3fdbb62f1b887cd8
+data8 0x3fdbf4979f666668, 0x3fdc332a6e8399d4
+data8 0x3fdc819dc2d45fe4, 0x3fdcc0908e19b7bd
+data8 0x3fdcffae611ad12b, 0x3fdd3ef776d43ff4
+data8 0x3fdd8e5002710128, 0x3fddcdfb486cb9a1
+data8 0x3fde0dd294245fe4, 0x3fde4dd622a28840
+data8 0x3fde8e06317114f0, 0x3fdece62fe9a9915
+data8 0x3fdf1f164a15389a, 0x3fdf5fd8a9063e35
+data8 0x3fdfa0c8937e7d5d, 0x3fdfe1e649bb6335
+data8 0x3fe011990641535a, 0x3fe032560e91e59e
+data8 0x3fe0532a5ebcd44a, 0x3fe0741617f5fc28
+data8 0x3fe08cd653f38839, 0x3fe0adeb55c1103b
+data8 0x3fe0cf181d5d1dd0, 0x3fe0f05ccd0aced7
+data8 0x3fe111b9875788ab, 0x3fe1332e6f1bcf73
+data8 0x3fe154bba77c2088, 0x3fe16df59bfa06c1
+data8 0x3fe18fadb6e2d3c2, 0x3fe1b17e849adc26
+data8 0x3fe1caeb6a0de814, 0x3fe1ece7c830eec9
+data8 0x3fe20efd3dae01df, 0x3fe2289de375d901
+data8 0x3fe24adf9b6a6fe0, 0x3fe26d3ad1aebcfc
+data8 0x3fe287100c2771f4, 0x3fe2a9983b3c1b28
+data8 0xbfda78e146f7bef4, 0xbfda33760a7f6051
+data8 0xbfd9ff43476fb5f7, 0xbfd9b97c3c4eec8f
+data8 0xbfd98504431717fc, 0xbfd93ee07535f967
+data8 0xbfd90a228d5712b2, 0xbfd8c3a104cb24f5
+data8 0xbfd88e9c72e0b226, 0xbfd847bc33d8618e
+data8 0xbfd812703988bb69, 0xbfd7dd0569c04bff
+data8 0xbfd7959c202292f1, 0xbfd75fe8d2c5d48f
+data8 0xbfd72a1637cbc183, 0xbfd6e221cd9d0cde
+data8 0xbfd6ac059985503b, 0xbfd675c99ce81f92
+data8 0xbfd63f6db2590482, 0xbfd5f6c138136489
+data8 0xbfd5c01a39fbd688, 0xbfd58952cf519193
+data8 0xbfd5526ad18493ce, 0xbfd51b6219bfe6ea
+data8 0xbfd4d1cdf8b4846f, 0xbfd49a784bcd1b8b
+data8 0xbfd4630161832547, 0xbfd42b6911cf5465
+data8 0xbfd3f3af3461e1c4, 0xbfd3bbd3a0a1dcfb
+data8 0xbfd383d62dac7ae7, 0xbfd34bb6b2546218
+data8 0xbfd313750520f520, 0xbfd2db10fc4d9aaf
+data8 0xbfd2a28a6dc90387, 0xbfd269e12f346e2c
+data8 0xbfd2311515e2e855, 0xbfd1f825f6d88e13
+data8 0xbfd1bf13a6c9c69f, 0xbfd185ddfa1a7ed0
+data8 0xbfd14c84c4dd6128, 0xbfd11307dad30b76
+data8 0xbfd0d9670f6941fe, 0xbfd09fa235ba2020
+data8 0xbfd0790adbb03009, 0xbfd03f09858c55fb
+data8 0xbfd004e3a7c97cbd, 0xbfcf9532288fcf69
+data8 0xbfcf205339208f27, 0xbfceab2a23a5b83e
+data8 0xbfce5ce55fdd37a5, 0xbfcde73fe3b1480f
+data8 0xbfcd714f44623927, 0xbfccfb1321b8c400
+data8 0xbfccac163c770dc9, 0xbfcc355b67195dd0
+data8 0xbfcbbe540a3f036f, 0xbfcb6ecf175f95e9
+data8 0xbfcaf74751e1be33, 0xbfca7f71fb7bab9d
+data8 0xbfca2f632320b86b, 0xbfc9b70ba539dfae
+data8 0xbfc93e6587910444, 0xbfc8edcae8352b6c
+data8 0xbfc874a0db01a719, 0xbfc7fb27199df16d
+data8 0xbfc7a9fec7d05ddf, 0xbfc72fff456ac70d
+data8 0xbfc6de7d66023dbc, 0xbfc663f6fac91316
+data8 0xbfc6121ac74813cf, 0xbfc5970c478fff4a
+data8 0xbfc51bab907a5c8a, 0xbfc4c93d33151b24
+data8 0xbfc44d527fdadf55, 0xbfc3fa87be0f3a1b
+data8 0xbfc3a797cd35d959, 0xbfc32ae9e278ae1a
+data8 0xbfc2d79c6937efdd, 0xbfc25a619370d9dc
+data8 0xbfc206b5bde2f8b8, 0xbfc188ecbd1d16be
+data8 0xbfc134e1b489062e, 0xbfc0b6894488e95f
+data8 0xbfc0621e2f556b5c, 0xbfc00d8c711a12cc
+data8 0xbfbf1cd21257e18c, 0xbfbe72ec117fa5b2
+data8 0xbfbdc8b7c49a1ddb, 0xbfbcc8d5e467b710
+data8 0xbfbc1ddc9c39c7a1, 0xbfbb7294093cdd0f
+data8 0xbfba7111df348494, 0xbfb9c501cdf75872
+data8 0xbfb918a16e46335b, 0xbfb81579a73e83c6
+data8 0xbfb7684f39f4ff2d, 0xbfb6bad3758efd87
+data8 0xbfb60d060d7e41ac, 0xbfb507b836033bb7
+data8 0xbfb4591d6310d85a, 0xbfb3aa2fdd27f1c3
+data8 0xbfb2faef55ccb372, 0xbfb1f3723b4ae6db
+data8 0xbfb14360d6136ffa, 0xbfb092fb594145c1
+data8 0xbfafc482e8b48a7e, 0xbfae6265ace11ae4
+data8 0xbfacff9e5c4341d0, 0xbfaaea3316095f72
+data8 0xbfa985bfc3495194, 0xbfa820a01ac754cb
+data8 0xbfa6bad3758efd87, 0xbfa554592bb8cd58
+data8 0xbfa3ed3094685a26, 0xbfa2855905ca70f6
+data8 0xbfa11cd1d5133413, 0xbf9dfd78881399f1
+data8 0xbf9b28f618cc85df, 0xbf98530faa3c087b
+data8 0xbf957bc3dddcd7fa, 0xbf92a3115322f9e6
+data8 0xbf8f91ed4eef8370, 0xbf89dae4ec6b8b2e
+data8 0xbf842106b1499209, 0xbf7cc89f97d67594
+data8 0xbf71497accf7e11d, 0x0000000000000000
+LOCAL_OBJECT_END(T_table)
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(log2f)
+// float log2f(float): x arrives in f8 and the result is returned in f8
+{ .mfi
+ alloc r32=ar.pfs,1,4,4,0
+ // y=frcpa(x)
+ frcpa.s1 f6,p0=f1,f8
+ // will form significand of 1.5 (to test whether the index is 128 or above)
+ mov r24=0xc
+}
+{.mfi
+ nop.m 0
+ // normalize x
+ fma.s1 f7=f8,f1,f0
+ // r2 = address of the linkage-table entry for poly_coeffs (T_table follows poly_coeffs)
+ addl r2 = @ltoff(poly_coeffs), gp;;
+}
+{.mfi
+ // get significand
+ getf.sig r25=f8
+ // f8 denormal ?
+ fclass.m p8,p10=f8,0x9
+ // will form significand of 1.5 (to test whether the index is 128 or above)
+ shl r24=r24,60
+}
+{.mfi
+ mov r26=0x804
+ nop.f 0
+ // r23=bias-1
+ mov r23=0xfffe;;
+}
+
+{.mmf
+ getf.exp r29=f8
+ // load start address of poly_coeffs (C_3, C_4, C_1, C_2), followed by T_table
+ ld8 r2=[r2]
+ // will continue only for positive normal/denormal numbers
+ fclass.nm.unc p12,p7 = f8, 0x19 ;;
+}
+
+.pred.rel "mutex",p8,p10
+{.mfi
+ // denormal input, repeat get significand (after normalization)
+ (p8) getf.sig r25=f7
+ // x=1 ?
+ fcmp.eq.s0 p6,p0=f8,f1
+ // get T_index
+ (p10) shr.u r28=r25,63-8
+}
+{.mfi
+ // f12=0.5
+ setf.exp f12=r23
+ nop.f 0
+ // r27=bias
+ mov r27=0xffff;;
+}
+
+{.mfb
+ // denormal input, repeat get exponent (after normalization)
+ (p8) getf.exp r29=f7
+ nop.f 0
+ (p12) br.cond.spnt SPECIAL_log2f
+}
+{.mfi
+ cmp.geu p12,p0=r25,r24
+ nop.f 0
+ mov r23=0xff;;
+}
+
+{.mfi
+ add r3=32,r2
+ // r=x*y-1
+ fms.s1 f6=f6,f8,f1
+ // r26=0x80400...0 (threshold for using polynomial approximation)
+ shl r26=r26,64-12
+}
+{.mfi
+ // load C_3, C_4
+ ldfpd f10,f11=[r2],16
+ nop.f 0
+ // r27=bias-1 (if index >=128, will add exponent+1)
+ (p12) mov r27=0xfffe;;
+}
+
+{.mfi
+ // load C_1
+ ldfe f14=[r2],32
+ // x=1, return 0
+ (p6) fma.s.s0 f8=f0,f0,f0
+ (p8) shr.u r28=r25,63-8
+}
+{.mib
+ // load C_2
+ ldfe f13=[r3]
+ // r29=exponent-bias
+ sub r29=r29,r27
+ // x=1, return
+ (p6) br.ret.spnt b0;;
+}
+
+
+{.mfi
+ // get T_index
+ and r28=r28,r23
+ fmerge.se f7=f1,f7
+ // if first 9 bits after leading 1 are all zero, then p8=1
+ cmp.ltu p8,p12=r25,r26;;
+}
+{.mfi
+ // f8=expon - bias
+ setf.sig f8=r29
+ nop.f 0
+ // get T address
+ shladd r2=r28,3,r2
+}
+{.mfi
+ // first 8 bits after leading 1 are all ones ?
+ cmp.eq p10,p0=r23,r28
+ // if first 9 bits after leading 1 are 0 (p8), use polynomial approx. only: r=m-1
+ (p8) fms.s1 f6=f7,f1,f1
+ nop.i 0;;
+}
+{.mfi
+ //r26=1
+ mov r26=1
+ // if first 8 bits after leading 1 are all ones, use polynomial approx. only: r=m/2-1
+ (p10) fms.s1 f6=f7,f12,f1
+ nop.i 0;;
+}
+
+.pred.rel "mutex",p8,p12
+{.mmf
+ // load T (unless first 9 bits after leading 1 are 0); f12 no longer holds 0.5
+ (p12) ldfd f12=[r2]
+ nop.m 0
+ // set T=0 (if first 9 bits after leading 1 are 0)
+ (p8) fma.s1 f12=f0,f0,f0;;
+}
+
+{.mfi
+ nop.m 0
+ // P34=C_3+C_4*r
+ fma.s1 f10=f11,f6,f10
+ // r26=2^{63}
+ shl r26=r26,63
+}
+{.mfi
+ nop.m 0
+ // r2=r*r (r squared, held in f11; not the register r2)
+ fma.s1 f11=f6,f6,f0
+ nop.i 0;;
+}
+{.mfi
+ // significand of x is 1 ? (p6 is set when it is NOT exactly 1)
+ cmp.eq p0,p6=r25,r26
+ // P12=C_1+C_2*r
+ fma.s1 f14=f13,f6,f14
+ nop.i 0;;
+}
+{.mfi
+ nop.m 0
+ // normalize additive term (l=exponent of x)
+ fcvt.xf f8=f8
+ // if significand(x)=1, return exponent (l)
+ nop.i 0;;
+}
+{.mfi
+ nop.m 0
+ // add T+l
+ (p6) fma.s1 f8=f8,f1,f12
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // P14=P12+r2*P34
+ (p6) fma.s1 f13=f10,f11,f14
+ nop.i 0;;
+}
+
+{.mfb
+ nop.m 0
+ // result=T+l+r*P14
+ (p6) fma.s.s0 f8=f13,f6,f8
+ // return
+ br.ret.sptk b0;;
+}
+
+
+SPECIAL_log2f:
+{.mfi
+ nop.m 0
+ // x=+Infinity ?
+ fclass.m p7,p0=f8,0x21
+ nop.i 0;;
+}
+{.mfi
+ nop.m 0
+ // x=+/-Zero ?
+ fclass.m p8,p0=f8,0x7
+ nop.i 0;;
+}
+{.mfi
+ nop.m 0
+ // x=-Infinity, -normal, -denormal ?
+ fclass.m p6,p0=f8,0x3a
+ nop.i 0;;
+}
+{.mfb
+ nop.m 0
+ // log2f(+Infinity)=+Infinity
+ nop.f 0
+ (p7) br.ret.spnt b0;;
+}
+{.mfi
+ (p8) mov GR_Parameter_TAG = 172 // error tag for the zero-argument case
+ // log2f(+/-0)=-infinity, raises Divide by Zero
+ // set f8=-0
+ (p8) fmerge.ns f8=f0,f8
+ nop.i 0;;
+}
+{.mfb
+ nop.m 0
+ (p8) frcpa.s0 f8,p0=f1,f8 // 1/(-0)
+ (p8) br.cond.sptk __libm_error_region;;
+}
+{.mfb
+ (p6) mov GR_Parameter_TAG = 173 // error tag for the negative-argument case
+ // x<0: return NaN, raise Invalid
+ (p6) frcpa.s0 f8,p0=f0,f0 // 0/0
+ (p6) br.cond.sptk __libm_error_region;;
+}
+// NOTE(review): no instruction on the two error branches above writes FR_X (f10),
+// which __libm_error_region stores as Parameter 1 -- confirm this is intended.
+{.mfb
+ nop.m 0
+ // Remaining cases: NaNs
+ fma.s.s0 f8=f8,f1,f0
+ br.ret.sptk b0;;
+}
+
+GLOBAL_LIBM_END(log2f)
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+.prologue // Builds a 64-byte frame and calls __libm_error_support with the slot addresses in r37-r39
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address: old sp-32 = new sp+32
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new 64-byte stack frame
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+{ .mmi
+ stfs [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 (single precision) at sp+32, then advance pointer to sp+48
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+.body
+{ .mib
+ stfs [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack (NOTE(review): confirm callers load x into FR_X)
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 (result) address = sp+48
+ nop.b 0
+}
+{ .mib
+ stfs [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Point back at Parameter 2 (sp+32)
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp // Recompute result slot address after the call
+};;
+{ .mmi
+ ldfs f8 = [GR_Parameter_RESULT] // Get return result (single precision) off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+.type __libm_error_support#,@function
+.global __libm_error_support#
+
+
+
+
diff --git a/sysdeps/ia64/fpu/e_log2l.S b/sysdeps/ia64/fpu/e_log2l.S
new file mode 100644
index 0000000000..37af2f2553
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_log2l.S
@@ -0,0 +1,816 @@
+.file "log2l.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 09/25/00 Initial version
+// 11/22/00 Fixed accuracy bug (for mantissas near 1, 2)
+// 12/07/00 Fixed C_1l constant, eliminated rounding errors in
+// reduced argument (x*frcpa(x)-1)
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// long double log2l(long double)
+//
+// Overview of operation
+//==============================================================
+// Background
+//
+// Implementation
+//
+// Let x = 2^l * m, where m=1.b1 b2 ... b8 b9 ... b52
+// y=frcpa(m), r=m*y-1, f=b1 b2 .. b8
+// T_hi is a table that stores the 24 most significant bits of log2(1/y)
+// (in entries 1..255) in single precision format
+// T_low is a table that stores (log2(1/y)-T_high), rounded to double
+// precision
+//
+// f is used as an index; T_high[255]=T_low[255]=0
+//
+// If f=0 and b9=0, r is set to 2^{-8}* 0.b9 b10 ... b52 = m-1 (fractional part of m),
+// and 0 is used instead of T_high[0], T_low[0]
+// (polynomial evaluation only, for m=1+r, 0<=r<2^{-9})
+// If f=255, r is set to (m-2)/2 (T[255]=0, and only polynomial evaluation is used
+// for m=2(1-r'), 0<=r'<2^{-9})
+//
+// If 2^{-9}<=m<2-2^{-8} or (input not near 1), let C1r=(2^{16}+C1*r)-2^{16}
+// and let E=((RN(m*y)-1)-r)+(m*y-RN(m*y))
+// Else let C1r=C1*r (rounded to 64 significant bits) and let E=0
+//
+// Let D=C1*r-C1r
+//
+//
+// log2l(x) is approximated as
+// (l+T_high[f]+C1r) + (D+r*(c1+c2*r+c3*r^2...+c8*r^7)+(T_low[f]+C_1*E))
+//
+
+
+// Special values
+//==============================================================
+// log2l(0)=-inf, raises Divide by Zero
+// log2l(+inf)=inf
+// log2l(x)=NaN, raises Invalid if x<0
+//
+
+
+// Registers used
+//==============================================================
+// f6-f15, f32-f36, f41-f43
+// r2-r3, r23-r29
+// p6,p7,p8,p10,p12,p13
+//
+
+
+GR_SAVE_B0 = r33 // holds b0 across the __libm_error_support call
+GR_SAVE_PFS = r34 // holds ar.pfs across the __libm_error_support call
+GR_SAVE_GP = r35 // holds gp across the call; this reg. can safely be used
+GR_SAVE_SP = r36 // defined but not referenced by the code in this file
+
+GR_Parameter_X = r37 // address of the stack slot holding the input argument
+GR_Parameter_Y = r38 // address of the stack slot holding the second parameter
+GR_Parameter_RESULT = r39 // address of the stack slot holding the result
+GR_Parameter_TAG = r40 // error tag: 168 for zero/pseudo-zero input, 169 for negative input
+
+FR_X = f10 // copy of the input; set from f8 on the special-value paths
+FR_Y = f1 // register stored into the second-parameter slot
+FR_RESULT = f8 // result register stored before, and reloaded after, the error call
+
+
+
+
+// Data tables
+//==============================================================
+
+RODATA
+
+.align 16
+
+LOCAL_OBJECT_START(poly_coeffs)
+
+data8 0xb8aa3b295c17f0bc, 0x00003fff // C_1
+data8 0x3fca61762a7aded9, 0xbfc71547652b82fe // C_7, C_8
+data8 0x3fd2776c50ef9bfe, 0xbfcec709dc3a03fd // C_5, C_6
+data8 0x3fdec709dc3a03fd, 0xbfd71547652b82fe // C_3, C_4
+//data8 0xd871319ff0342580, 0x0000bfbd // C_1l (low part of C1)
+data8 0x82f0025f2dc582ee, 0x0000bfbe // C_1l (low part of C1)
+data8 0xb8aa3b295c17f0bc, 0x0000bffe // C_2
+LOCAL_OBJECT_END(poly_coeffs)
+
+
+
+
+LOCAL_OBJECT_START(T_table)
+
+data4 0x3b38d875, 0x3c0ae7f4, 0x3c67f738, 0x3ca2b253
+data4 0x3ccbb91d, 0x3cfac91e, 0x3d1504a5, 0x3d29c4a0
+data4 0x3d419264, 0x3d567aa6, 0x3d6e76ca, 0x3d81c3f7
+data4 0x3d8c5630, 0x3d9876e9, 0x3da31e0a, 0x3dadcf09
+data4 0x3db889f9, 0x3dc34eec, 0x3dce1df5, 0x3dd8f726
+data4 0x3de3da94, 0x3deec851, 0x3df82ea4, 0x3e0197dd
+data4 0x3e071dad, 0x3e0ca8ca, 0x3e116d6e, 0x3e170281
+data4 0x3e1bcfbc, 0x3e216ee9, 0x3e2644dc, 0x3e2b1ee1
+data4 0x3e30cd12, 0x3e35affd, 0x3e3a970f, 0x3e3f824f
+data4 0x3e4544c0, 0x3e4a3926, 0x3e4f31d1, 0x3e542ec7
+data4 0x3e593012, 0x3e5e35b7, 0x3e633fbf, 0x3e677625
+data4 0x3e6c884b, 0x3e719eea, 0x3e76ba0a, 0x3e7bd9b2
+data4 0x3e80111d, 0x3e82a523, 0x3e84ccec, 0x3e876533
+data4 0x3e89ffd1, 0x3e8c2d22, 0x3e8e5c18, 0x3e90fd0a
+data4 0x3e932fa9, 0x3e95d506, 0x3e980b5a, 0x3e9a4361
+data4 0x3e9c7d1f, 0x3e9f2b16, 0x3ea168a0, 0x3ea3a7ea
+data4 0x3ea5e8f5, 0x3ea82bc4, 0x3eaa705b, 0x3eacb6bb
+data4 0x3eaefee7, 0x3eb148e3, 0x3eb394b1, 0x3eb5e255
+data4 0x3eb831d0, 0x3eba8327, 0x3ebcd65c, 0x3ebeb3e0
+data4 0x3ec10a7a, 0x3ec362f9, 0x3ec5bd63, 0x3ec7a0b3
+data4 0x3ec9fe96, 0x3ecc5e6c, 0x3ece4619, 0x3ed0a978
+data4 0x3ed293fe, 0x3ed4faf1, 0x3ed6e859, 0x3ed952eb
+data4 0x3edb433c, 0x3eddb178, 0x3edfa4bc, 0x3ee19953
+data4 0x3ee40cee, 0x3ee60484, 0x3ee7fd73, 0x3ee9f7bb
+data4 0x3eec7280, 0x3eee6fda, 0x3ef06e94, 0x3ef26eb1
+data4 0x3ef47031, 0x3ef67317, 0x3ef8f8b2, 0x3efafec5
+data4 0x3efd0644, 0x3eff0f32, 0x3f008cc8, 0x3f0192b0
+data4 0x3f029952, 0x3f03a0b0, 0x3f0466b2, 0x3f056f5a
+data4 0x3f0678c0, 0x3f0782e6, 0x3f088dcc, 0x3f099973
+data4 0x3f0aa5dd, 0x3f0b6fac, 0x3f0c7d6d, 0x3f0d8bf4
+data4 0x3f0e575b, 0x3f0f673e, 0x3f1077e9, 0x3f1144ef
+data4 0x3f1256fc, 0x3f1369d6, 0x3f143880, 0x3f154cc1
+data4 0x3f161c7a, 0x3f173227, 0x3f1802f2, 0x3f191a0f
+data4 0x3f19ebee, 0x3f1b047e, 0x3f1bd775, 0x3f1cf17b
+data4 0x3f1dc58e, 0x3f1ee10f, 0x3f1fb63f, 0x3f208bea
+data4 0x3f21a98f, 0x3f22805c, 0x3f2357a7, 0x3f247778
+data4 0x3f254fe9, 0x3f2628d9, 0x3f270249, 0x3f2824fb
+data4 0x3f28ff97, 0x3f29dab4, 0x3f2ab654, 0x3f2b9277
+data4 0x3f2cb8c8, 0x3f2d961e, 0x3f2e73fa, 0x3f2f525b
+data4 0x3f303143, 0x3f3110b1, 0x3f31f0a7, 0x3f32d125
+data4 0x3f33b22b, 0x3f3493bc, 0x3f3575d6, 0x3f36587b
+data4 0x3f373bab, 0x3f381f68, 0x3f3903b1, 0x3f39e888
+data4 0x3f3acdec, 0x3f3bb3e0, 0x3f3c9a63, 0x3f3d8177
+data4 0x3f3e1bd4, 0x3f3f03d9, 0x3f3fec71, 0x3f40d59b
+data4 0x3f41bf59, 0x3f42a9ab, 0x3f434635, 0x3f443180
+data4 0x3f451d61, 0x3f4609d9, 0x3f46a7d3, 0x3f479549
+data4 0x3f488357, 0x3f492261, 0x3f4a1171, 0x3f4b011c
+data4 0x3f4ba139, 0x3f4c91e8, 0x3f4d8334, 0x3f4e246a
+data4 0x3f4f16be, 0x3f5009b1, 0x3f50ac02, 0x3f51a001
+data4 0x3f524305, 0x3f533812, 0x3f53dbca, 0x3f54d1e7
+data4 0x3f55c8a8, 0x3f566d85, 0x3f57655b, 0x3f580af0
+data4 0x3f58b0d0, 0x3f59aa2c, 0x3f5a50c7, 0x3f5b4b3c
+data4 0x3f5bf294, 0x3f5cee26, 0x3f5d963c, 0x3f5e92ed
+data4 0x3f5f3bc3, 0x3f5fe4e7, 0x3f60e32d, 0x3f618d13
+data4 0x3f623748, 0x3f63372a, 0x3f63e223, 0x3f648d6b
+data4 0x3f658eee, 0x3f663afe, 0x3f66e75e, 0x3f67ea86
+data4 0x3f6897b0, 0x3f69452c, 0x3f69f2f9, 0x3f6af847
+data4 0x3f6ba6e2, 0x3f6c55d0, 0x3f6d0510, 0x3f6e0c8d
+data4 0x3f6ebc9f, 0x3f6f6d04, 0x3f701dbe, 0x3f70cecd
+data4 0x3f718030, 0x3f728ae6, 0x3f733d20, 0x3f73efaf
+data4 0x3f74a296, 0x3f7555d3, 0x3f760967, 0x3f76bd53
+data4 0x3f777197, 0x3f7880a1, 0x3f7935c2, 0x3f79eb3c
+data4 0x3f7aa10f, 0x3f7b573b, 0x3f7c0dc2, 0x3f7cc4a3
+data4 0x3f7d7bdf, 0x3f7e3376, 0x3f7eeb68, 0x00000000
+LOCAL_OBJECT_END(T_table)
+
+
+
+LOCAL_OBJECT_START(T_low)
+
+
+data8 0x3dc0b97f689876ef, 0x3dfd5d906028ac01
+data8 0x3df8b9cbb8d7240b, 0x3de0c941a2f220cd
+data8 0x3e09c6aecba15936, 0x3dfa6d528241827c
+data8 0x3dd0bad25714903c, 0x3e2776b01dc036a2
+data8 0x3e2b914bc77f158b, 0x3e1c0fafd29dc74a
+data8 0x3e28dadc119cd3de, 0x3e3bca869da085be
+data8 0x3e19d1e700f2200a, 0x3e3e13530cc37504
+data8 0x3e3936464d9c41ee, 0x3e3c3fa21c9499d0
+data8 0x3e3259e079b6c6e8, 0x3e2a364069c4f7f3
+data8 0x3e1274c84f6c6364, 0x3e3796170159f454
+data8 0x3e26e1e389f4364e, 0x3e28cedda8c7f658
+data8 0x3e376c2028433268, 0x3e4aee6d650c82e1
+data8 0x3e33e65094fbeeb4, 0x3e4c7d125aa92c5d
+data8 0x3e1559a4b69691d8, 0x3e18efabeb7d7221
+data8 0x3e4c2b255abaa8de, 0x3e37436952a4538b
+data8 0x3e4e6807f4ba00b8, 0x3e33ff5964190e42
+data8 0x3e4f5d798cead43c, 0x3e4f3676443bf453
+data8 0x3e4660f8d5bc1bf5, 0x3e2d4f9f3ab04f36
+data8 0x3e357f7a64ccd537, 0x3e394caf7c9b05af
+data8 0x3e225c7d17ab29b0, 0x3e4eb202f6d55a12
+data8 0x3e32faa68b19bcd2, 0x3e45ee1c9b566a8b
+data8 0x3e4770a67de054ff, 0x3e42234fb9de6d6b
+data8 0x3e4ad139825c6e19, 0x3e47f3d334814a93
+data8 0x3e2af1ec402867b6, 0x3e2bfbda0c956e3d
+data8 0x3e4287b831e77ff2, 0x3e54bf0eb77f7b89
+data8 0x3e5b9259a1029607, 0x3e4a764b015e699d
+data8 0x3e4d0b68ea883ab5, 0x3e33e829ecdadf46
+data8 0x3e52f27efef3031b, 0x3e3073979e4af89e
+data8 0x3e3b980f2cd6c253, 0x3e2a5f0f5f7f66a9
+data8 0x3e37788738117b02, 0x3e58aa29a784d52f
+data8 0x3e4f5504c4ff2466, 0x3e002d40340fa647
+data8 0x3e5f53b64592f4c3, 0x3e543f222c526802
+data8 0x3e5680e547a872fa, 0x3e5e234bd1154450
+data8 0x3e3000edc18b6d21, 0x3e1c3c1f000942a8
+data8 0x3e51eeae0e442d6e, 0x3e4fb265376623f2
+data8 0x3e57b5941782d830, 0x3e3a4b83f24ae52c
+data8 0x3e5a5fb4f23978de, 0x3e51ed071563fb02
+data8 0x3e49e2071f51a7a8, 0x3e5e43ae5b924234
+data8 0x3dfa2be9aedf374a, 0x3e56dea3dbba67d5
+data8 0x3e3375fe732b3c3e, 0x3e5a0c6f91f2e77e
+data8 0x3e55e1bf1c969e41, 0x3e30a5a5166b8eee
+data8 0x3e53e6e9a539d46c, 0x3e542981b3d7b0e6
+data8 0x3e595fd8ff36ad64, 0x3e5edeb9e65cbbb4
+data8 0x3e46aeab4d3434c1, 0x3e4ea3ff0564b010
+data8 0x3e59b00be2e3c25a, 0x3e5b887cd7b0821f
+data8 0x3e5f666668547b4d, 0x3e4d0733a805273f
+data8 0x3e26a2ff21c4aec5, 0x3e4c336f7a3a78f3
+data8 0x3e11ad12b628e2d0, 0x3e56d43ff3f0ea64
+data8 0x3e238809433cccd2, 0x3e40d9734147d40f
+data8 0x3e54245fe3e24e06, 0x3e251441fce4d48c
+data8 0x3e517114efc5d1f9, 0x3e5e9a99154b0d82
+data8 0x3e442a71337970f8, 0x3e420c7c69211fdf
+data8 0x3e537e7d5d43c6a7, 0x3e4376c66ad9ad8b
+data8 0x3e49054d678a4f1c, 0x3e5d23cb3bc19f18
+data8 0x3e6ebcd449dcab2b, 0x3e67f5fc2849c88a
+data8 0x3e63f388395d3e84, 0x3e65c1103b0ad7e9
+data8 0x3e6d5d1dd031f353, 0x3e5a159dae75c4d0
+data8 0x3e4d5e22aa75f71d, 0x3e5e379ee62e1e35
+data8 0x3e4df082213cb2dc, 0x3e6bfa06c156f521
+data8 0x3e66e2d3c19b517b, 0x3e426b7098590071
+data8 0x3e541bd027e9854e, 0x3e5061dd924b0ac0
+data8 0x3e6dae01df373a03, 0x3e3baec80b207b0b
+data8 0x3e6b6a6fe06bebac, 0x3e61aebcfc3ab5d1
+data8 0x3e584ee3e7c79d83, 0x3e6b3c1b2840cb40
+data8 0x3e6c842085d6befd, 0x3e6ac04fd7b141e0
+data8 0x3e6c48250474141d, 0x3e2d889b86125f69
+data8 0x3e6e74740225dad0, 0x3e45940d31d50a7c
+data8 0x3e695476a6c39ddc, 0x3e6d9a6d857a060a
+data8 0x3e4a3e9bb4b69337, 0x3e484f3ce4707ed6
+data8 0x3e39dd125d25fc27, 0x3e563fb400de8732
+data8 0x3e5fdd6d0ee28b48, 0x3e669d15b869bb07
+data8 0x3e40687cfad7964d, 0x3e69317990d43957
+data8 0x3e633d57e24ae1bd, 0x3e618bf03710eabb
+data8 0x3e4b4df6fccd1160, 0x3e3fb26ddaa1ec45
+data8 0x3e3810a5e1817fd4, 0x3e6857373642fa5c
+data8 0x3e673db6193add31, 0x3e63200c8acbc9c3
+data8 0x3e3d2dee448ebb62, 0x3e6a19723a80db6a
+data8 0x3e5e7cdab8fd3e6a, 0x3e671855cd660672
+data8 0x3e473c3c78a85ecd, 0x3e5f5e23056a7cf2
+data8 0x3e52538519527367, 0x3e4b573bcf2580e9
+data8 0x3e6d6f856fe90c60, 0x3e2d932a8487642e
+data8 0x3e5236fc78b6174c, 0x3e50cb91d406db50
+data8 0x3e650e8bd562aa57, 0x3e424ee3d9a82f2e
+data8 0x3e59363960e1e3d9, 0x3e379604c1150a3e
+data8 0x3e6d914f6c2ac258, 0x3e62967a451a7b48
+data8 0x3e684b5f01139cb2, 0x3e448bbfbf6d292c
+data8 0x3e6227e7fb487e73, 0x3e6d39d50290f458
+data8 0x3e58368342b4b668, 0x3e65dc0c25bd1763
+data8 0x3e61b7dc362e22b5, 0x3e671691f094bb80
+data8 0x3e5011642d5123f2, 0x3e4c4eb7f11e41be
+data8 0x3e5dcee36ca242cf, 0x3e6791cefff688f1
+data8 0x3e60e23c8dda4ecd, 0x3e48e6a22fe78cfe
+data8 0x3e6d703f244adc86, 0x3e6a281a85a5049d
+data8 0x3e570f20e6403d9e, 0x3e2211518a12956f
+data8 0x3e6737d1e54d71df, 0x3e66b1881476f5e9
+data8 0x3e6e1bbeef085376, 0x3e47cad4944a32be
+data8 0x3e527f2c738e7ee9, 0x3e699883a4b9fb29
+data8 0x3e5c17d1108740d9, 0x3e5d4a9c79a43389
+data8 0x3e49fdc24462ba3b, 0x3e24dbb3a60cceb2
+data8 0x3e5c5bf618780748, 0x3e5c38005b0c778c
+data8 0x3e6be168dd6dd3fe, 0x3e633ab9370693b0
+data8 0x3dd290556b0ae339, 0x3e607c317927096a
+data8 0x3e59651353b3d90e, 0x3e4d8751e5e0ae0d
+data8 0x3e46c81023272a85, 0x3e6b23c988f391b2
+data8 0x3e608741d215209c, 0x3e60b8ba506d758f
+data8 0x3e62ddbe74803297, 0x3e5dbb8b5087587d
+data8 0x3e642aa529048131, 0x3e3dcbda6835dcf4
+data8 0x3e6db503ce854d2a, 0x3e6dd00b49bc6849
+data8 0x3e4db2f11243bc84, 0x3e3b9848efc2ea97
+data8 0x3e58f18e17c82609, 0x3e6ed8645e16c312
+data8 0x3e4065bdb60a5dd4, 0x3e490453c6e6c30a
+data8 0x3e62373994aa31ba, 0x3e56305f0e6b2a95
+data8 0x3e68c1601a6614ee, 0x3e614e204f19d93f
+data8 0x3e6e5037ca773299, 0x3e693f98892561a6
+data8 0x3e639de4f4bf700d, 0x3e416c071e93fd97
+data8 0x3e65466991b415ef, 0x3e6896a324afac9d
+data8 0x3e44f64802e2f11c, 0x3e64d7d747e2191a
+data8 0x3e6174b7581de84c, 0x3e44c7b946e1d43c
+data8 0x3e6a3bcbe30512ec, 0x3e5d3ed411c95ce4
+data8 0x3e3e5b5735cfaf8e, 0x3e6e538ab34efb51
+data8 0x3e514e204f19d93f, 0x3e5a88e6550c89a4
+data8 0x3e66b97a5d9dfd8b, 0x3e5f46b1e14ebaf3
+data8 0x3e357665f6893f5d, 0x3e6bbf633078d1d5
+data8 0x3e5e7337a212c417, 0x3e3570fde15fc8cc
+data8 0x3e21119402da92b4, 0x3e6566e830d1ff3b
+data8 0x3e558883e480e220, 0x3e589ca3a68da411
+data8 0x3e44eb66df73d648, 0x3e1a0a629b1b7e68
+data8 0x3e54cc207b8c1116, 0x0000000000000000
+LOCAL_OBJECT_END(T_low)
+
+
+.section .text
+GLOBAL_IEEE754_ENTRY(log2l)
+
+{ .mfi
+ alloc r32=ar.pfs,1,4,4,0
+ // normalize x
+ // y=frcpa(x)
+ frcpa.s1 f41,p0=f1,f8
+ // r26=bias-1
+ mov r26=0xfffe
+}
+{.mfi
+ // r23=bias+16
+ mov r23=0xffff+16
+ fma.s1 f7=f8,f1,f0
+ // r2 = GOT entry for poly_coeffs (C_1...C_8, C_1l, C_2), which is followed by T_table
+ addl r2 = @ltoff(poly_coeffs), gp;;
+}
+{.mfi
+ // get significand
+ getf.sig r25=f8
+ // f8 denormal ?
+ fclass.m p8,p10=f8,0x9
+ // r24=bias-8
+ mov r24=0xffff-8;;
+}
+{.mfi
+ setf.exp f36=r26
+ nop.f 0
+ // r27=bias
+ mov r27=0xffff;;
+}
+
+{.mmf
+ getf.exp r29=f8
+ // load start address of poly_coeffs (C_1...C_8, C_1l, C_2), followed by T_table
+ ld8 r2=[r2]
+ // will continue only for positive normal/unnormal numbers
+ fclass.m.unc p0,p12 = f8, 0x19;;
+}
+
+
+.pred.rel "mutex",p8,p10
+{.mfi
+ // denormal input, repeat get significand (after normalization)
+ (p8) getf.sig r25=f7
+ // x=1 ?
+ fcmp.eq.s0 p6,p0=f8,f1
+ // get T_index
+ (p10) shr.u r28=r25,63-8
+}
+{.mfi
+ // f32=2^16
+ setf.exp f32=r23
+ nop.f 0
+ mov r26=0x804;;
+}
+
+{.mfi
+ // denormal input, repeat get exponent (after normalization)
+ (p8) getf.exp r29=f7
+ // f33=0
+ mov f33=f0
+ // r26=0x80400...0 (threshold for using polynomial approximation)
+ shl r26=r26,64-12;;
+}
+
+{.mfb
+ add r3=16,r2
+ // r=x*y-1
+ fms.s1 f6=f41,f8,f1
+ (p12) br.cond.spnt SPECIAL_log2l
+}
+{.mfi
+ // load C_1
+ ldfe f14=[r2],48
+ // RN(x*y)
+ fma.s1 f43=f41,f8,f0
+ mov r23=0xff;;
+}
+
+{.mmi
+ // load C_7, C_8
+ ldfpd f10,f11=[r3],16
+ // load C_3,C_4
+ ldfpd f15,f42=[r2],16
+ (p8) shr.u r28=r25,63-8;;
+}
+
+
+{.mfi
+ // load C_5, C_6
+ ldfpd f12,f13=[r3]
+ // pseudo-zero ?
+ fcmp.eq.s0 p7,p0=f7,f0
+ // if first 9 bits after leading 1 are all zero, then p8=1
+ cmp.ltu p8,p12=r25,r26
+}
+{.mfi
+ // load C1l
+ ldfe f34=[r2],16
+ fmerge.se f7=f1,f7
+ // get T_index
+ and r28=r28,r23;;
+}
+{.mfi
+ // r29=exponent-bias
+ sub r29=r29,r27
+ // if first 9 bits after leading 1 are 0, use polynomial approx. only
+ (p8) fms.s1 f6=f7,f1,f1
+ // start address of T_low
+ add r3=1024+16,r2
+}
+{.mfi
+ // load C_2
+ ldfe f35=[r2],16
+ // x=1, return 0
+ (p6) fma.s0 f8=f0,f0,f0
+ // first 8 bits after leading 1 are all ones ?
+ cmp.eq p10,p0=r23,r28;;
+}
+
+{.mfb
+ // if first 8 bits after leading 1 are all ones, use polynomial approx. only
+ // add 1 to the exponent additive term, and estimate log2(1-r)
+ (p10) add r29=1,r29
+ nop.f 0
+ (p7) br.cond.spnt LOG2_PSEUDO_ZERO
+}
+{.mfi
+ // get T_low address
+ shladd r3=r28,3,r3
+ // if first 8 bits after leading 1 are all ones, use polynomial approx. only
+ (p10) fms.s1 f6=f7,f36,f1
+ // p10 --> p8=1, p12=0
+ (p10) cmp.eq p8,p12=r0,r0;;
+}
+
+{.mfi
+ // get T_high address
+ shladd r2=r28,2,r2
+ // L(x*y)=x*y-RN(x*y)
+ fms.s1 f41=f41,f8,f43
+ nop.i 0
+}
+{.mfi
+ // p13=p12
+ (p12) cmp.eq.unc p13,p0=r0,r0
+ // RtH=RN(x*y)-1 (will eliminate rounding errors in r)
+ fms.s1 f43=f43,f1,f1
+ nop.i 0;;
+}
+
+.pred.rel "mutex",p8,p12
+{.mfb
+ // load T_high (unless first 9 bits after leading 1 are 0)
+ (p12) ldfs f7=[r2]
+ // set T_high=0 (if first 9 bits after leading 1 are 0)
+ (p8) fma.s1 f7=f0,f0,f0
+ // x=1, return
+ (p6) br.ret.spnt b0
+}
+.pred.rel "mutex",p8,p12
+{.mfi
+ // p12: load T_low
+ (p12) ldfd f36=[r3]
+ // p8: set T_low=0
+ (p8) fma.s1 f36=f0,f0,f0
+ (p8) cmp.eq p8,p12=r29,r0;; //nop.i 0;;
+}
+
+.pred.rel "mutex",p8,p12
+{.mfi
+ // f8=expon - bias
+ setf.sig f8=r29
+ // general case: 2^{16}+C1*r
+ (p12) fma.s1 f33=f6,f14,f32
+ nop.i 0
+}
+{.mfi
+ // r26=1
+ mov r26=1
+ // p8 (mantissa is close to 1, or close to 2): 0+C1*r (f33 holds 0 here)
+ (p8) fma.s1 f32=f6,f14,f33
+ nop.i 0;;
+}
+
+{.mfi
+ nop.m 0
+ // P78=C_7+C_8*r
+ fma.s1 f10=f11,f6,f10
+ // r26=2^{63}
+ shl r26=r26,63
+}
+{.mfi
+ nop.m 0
+ // P34=C_3+r*C_4
+ fma.s1 f15=f42,f6,f15
+ nop.i 0;;
+}
+{.mfi
+ nop.m 0
+ // r2=r*r
+ fma.s1 f11=f6,f6,f0
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // P56=C_5+C_6*r
+ fma.s1 f13=f13,f6,f12
+ nop.i 0;;
+}
+
+{.mfi
+ nop.m 0
+ // Rth-r
+ (p13) fms.s1 f43=f43,f1,f6
+ nop.i 0
+}
+{.mfi
+ // significand(x)=1 ?
+ cmp.eq p0,p6=r25,r26
+ // P12=C1l+C_2*r
+ fma.s1 f34=f35,f6,f34
+ nop.i 0;;
+}
+
+.pred.rel "mutex",p8,p12
+{.mfi
+ nop.m 0
+ // p12: C1r=(2^{16}+C1*r)-2^{16}
+ (p12) fms.s1 f32=f33,f1,f32
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // p8: C1r=C1*r (double extended)
+ (p8) fms.s1 f32=f32,f1,f33
+ nop.i 0;;
+}
+
+{.mfi
+ nop.m 0
+ // L(x*y)*C_1+T_low
+ (p13) fma.s1 f36=f41,f14,f36
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // P58=P56+r2*P78
+ fma.s1 f13=f11,f10,f13
+ nop.i 0;;
+}
+{.mfi
+ nop.m 0
+ // P14=P12+r2*P34
+ fma.s1 f15=f15,f11,f34
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // r4=r2*r2
+ fma.s1 f11=f11,f11,f0
+ nop.i 0;;
+}
+
+{.mfi
+ nop.m 0
+ // normalize additive term (l=exponent of x)
+ fcvt.xf f8=f8
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // D=C1*r-C1r
+ (p6) fms.s1 f12=f14,f6,f32
+ nop.i 0;;
+}
+
+{.mfi
+ nop.m 0
+ // T_low'=(Rth-r)*C1+(L(x*y)*C1+T_low)
+ (p13) fma.s1 f36=f43,f14,f36
+ nop.i 0;;
+}
+{.mfi
+ nop.m 0
+ // P18=P14+r4*P58
+ (p6) fma.s1 f13=f11,f13,f15
+ nop.i 0;;
+}
+
+{.mfi
+ nop.m 0
+ // add T_high+l
+ (p6) fma.s1 f8=f8,f1,f7
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // D+T_low
+ (p6) fma.s1 f12=f12,f1,f36
+ nop.i 0;;
+}
+
+
+{.mfi
+ nop.m 0
+ // (T_high+l)+C1r
+ (p6) fma.s1 f8=f8,f1,f32
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // (D+T_low)+r*P18
+ (p6) fma.s1 f13=f13,f6,f12
+ nop.i 0;;
+}
+
+//{.mfb
+//nop.m 0
+//mov f8=f36
+//fma.s0 f8=f13,f6,f0
+//br.ret.sptk b0;;
+//}
+
+
+{.mfb
+ nop.m 0
+ // result=((T_high+l)+C1r)+((D+T_low)+r*P18)
+ (p6) fma.s0 f8=f13,f1,f8
+ // return
+ br.ret.sptk b0;;
+}
+
+
+SPECIAL_log2l:
+{.mfi
+ nop.m 0
+ mov FR_X=f8
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ // x=+Infinity ?
+ fclass.m p7,p0=f8,0x21
+ nop.i 0;;
+}
+{.mfi
+ nop.m 0
+ // x=+/-Zero ?
+ fclass.m p8,p0=f7,0x7
+ nop.i 0;;
+}
+{.mfi
+ nop.m 0
+ // x=-Infinity, -normal, -denormal ?
+ fclass.m p6,p0=f8,0x3a
+ nop.i 0;;
+}
+{.mfb
+ nop.m 0
+ // log2l(+Infinity)=+Infinity
+ nop.f 0
+ (p7) br.ret.spnt b0;;
+}
+{.mfi
+ (p8) mov GR_Parameter_TAG = 168
+ // log2l(+/-0)=-infinity, raises Divide by Zero
+ // set f8=-0
+ (p8) fmerge.ns f8=f0,f8
+ nop.i 0;;
+}
+{.mfb
+ nop.m 0
+ (p8) frcpa.s0 f8,p0=f1,f8
+ (p8) br.cond.sptk __libm_error_region;;
+}
+{.mfb
+ (p6) mov GR_Parameter_TAG = 169
+ // x<0: return NaN, raise Invalid
+ (p6) frcpa.s0 f8,p0=f0,f0
+ (p6) br.cond.sptk __libm_error_region;;
+}
+
+
+{.mfb
+ nop.m 0
+ // Remaining cases: NaNs
+ fma.s0 f8=f8,f1,f0
+ br.ret.sptk b0;;
+}
+
+LOG2_PSEUDO_ZERO:
+
+{.mfi
+ nop.m 0
+ mov FR_X=f8
+ nop.i 0
+}
+{.mfi
+ mov GR_Parameter_TAG = 168
+ // log2l(+/-0)=-infinity, raises Divide by Zero
+ // set f8=-0
+ fmerge.ns f8=f0,f8
+ nop.i 0;;
+}
+{.mfb
+ nop.m 0
+ frcpa.s0 f8,p0=f1,f8
+ br.cond.sptk __libm_error_region;;
+}
+
+
+GLOBAL_IEEE754_END(log2l)
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+.prologue // Error path: spill FR_Y, FR_X, FR_RESULT to a 64-byte frame, call __libm_error_support, reload f8
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address (sp-32, i.e. new sp+32 once the frame exists)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Allocate the 64-byte stack frame
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+{ .mmi
+ stfe [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 at sp+32; pointer post-increments to sp+48
+ add GR_Parameter_X = 16,sp // Parameter 1 address (sp+16)
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+.body
+{ .mib
+ stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack (sp+16)
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 (result) address = sp+48
+ nop.b 0
+}
+{ .mib
+ stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 (result) on stack at sp+48
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Point GR_Parameter_Y back at Parameter 2 (sp+32)
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp // Recompute the result slot address (sp+48) after the call
+};;
+{ .mmi
+ ldfe f8 = [GR_Parameter_RESULT] // Reload the (possibly updated) result from the stack into f8
+.restore sp
+ add sp = 64,sp // Release the 64-byte frame
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return to the caller of the math function
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+.type __libm_error_support#,@function
+.global __libm_error_support#
+
+
diff --git a/sysdeps/ia64/fpu/e_logf.S b/sysdeps/ia64/fpu/e_logf.S
index 829d0abed0..0ca6d3f2c8 100644
--- a/sysdeps/ia64/fpu/e_logf.S
+++ b/sysdeps/ia64/fpu/e_logf.S
@@ -1,10 +1,10 @@
.file "logf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,861 +20,1072 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 3/01/00 Initial version
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 03/01/00 Initial version
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
-// 1/10/01 Improved speed, fixed flags for neg denormals
-//
+// 01/10/01 Improved speed, fixed flags for neg denormals
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 05/23/02 Modified algorithm. Now only one polynomial is used
+// for |x-1| >= 1/256 and for |x-1| < 1/256
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
// float logf(float)
// float log10f(float)
//
+//
// Overview of operation
//==============================================================
// Background
+// ----------
//
-// Consider x = 2^N 1.f1 f2 f3 f4...f63
-// Log(x) = log(frcpa(x) x/frcpa(x))
-// = log(1/frcpa(x)) + log(frcpa(x) x)
-// = -log(frcpa(x)) + log(frcpa(x) x)
+// This algorithm is based on fact that
+// log(a b) = log(a) + log(b).
//
-// frcpa(x) = 2^-N frcpa((1.f1 f2 ... f63)
+// In our case we have x = 2^N f, where 1 <= f < 2.
+// So
+// log(x) = log(2^N f) = log(2^N) + log(f) = n*log(2) + log(f)
//
-// -log(frcpa(x)) = -log(C)
-// = -log(2^-N) - log(frcpa(1.f1 f2 ... f63))
+// To calculate log(f) we do following
+// log(f) = log(f * frcpa(f) / frcpa(f)) =
+// = log(f * frcpa(f)) + log(1/frcpa(f))
//
-// -log(frcpa(x)) = -log(C)
-// = +Nlog2 - log(frcpa(1.f1 f2 ... f63))
+// According to definition of IA-64's frcpa instruction it's a
+// floating point that approximates 1/f using a lookup on the
+// top of 8 bits of the input number's significand with relative
+// error < 2^(-8.886). So we have following
//
-// -log(frcpa(x)) = -log(C)
-// = +Nlog2 + log(frcpa(1.f1 f2 ... f63))
+// |(1/f - frcpa(f)) / (1/f))| = |1 - f*frcpa(f)| < 1/256
//
-// Log(x) = log(1/frcpa(x)) + log(frcpa(x) x)
-
-// Log(x) = +Nlog2 + log(1./frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x)
-// Log(x) = +Nlog2 - log(/frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x)
-// Log(x) = +Nlog2 + T + log(frcpa(x) x)
+// and
+//
+// log(f) = log(f * frcpa(f)) + log(1/frcpa(f)) =
+// = log(1 + r) + T
+//
+// The first value can be computed by polynomial P(r) approximating
+// log(1 + r) on |r| < 1/256 and the second is precomputed tabular
+// value defined by top 8 bit of f.
+//
+// Finally we have that log(x) ~ (N*log(2) + T) + P(r)
+//
+// Note that if input argument is close to 1.0 (in our case it means
+// that |1 - x| < 1/256) we can use just polynomial approximation
+// because x = 2^0 * f = f = 1 + r and
+// log(x) = log(1 + r) ~ P(r)
+//
+//
+// To compute log10(x) we just use identity:
//
-// Log(x) = +Nlog2 + T + log(C x)
+// log10(x) = log(x)/log(10)
//
-// Cx = 1 + r
+// so we have that
//
-// Log(x) = +Nlog2 + T + log(1+r)
-// Log(x) = +Nlog2 + T + Series( r - r^2/2 + r^3/3 - r^4/4 ....)
+// log10(x) = (N*log(2) + T + log(1+r)) / log(10) =
+// = N*(log(2)/log(10)) + (T/log(10)) + log(1 + r)/log(10)
//
-// 1.f1 f2 ... f8 has 256 entries.
-// They are 1 + k/2^8, k = 0 ... 255
-// These 256 values are the table entries.
//
// Implementation
-//===============
-// CASE 1: |x-1| >= 2^-8
-// C = frcpa(x)
-// r = C * x - 1
+// --------------
+// It can be seen that formulas for log and log10 differ from one another
+// only by coefficients and tabular values. Namely, both log and log10 are
+// calculated as (N*L1 + T) + L2*Series(r), where in the case of log
+// L1 = log(2)
+// T = log(1/frcpa(x))
+// L2 = 1.0
+// and in case of log10
+// L1 = log(2)/log(10)
+// T = log(1/frcpa(x))/log(10)
+// L2 = 1.0/log(10)
//
-// Form rseries = r + P1*r^2 + P2*r^3 + P3*r^4
+// So common code can be used, with two different entry points that set
+// pointers to the base address of the corresponding data sets containing
+// the values of L2 and T, and prepare the integer representation of L1
+// needed for the following setf instruction.
//
-// x = f * 2*n where f is 1.f_1f_2f_3....f_63
-// Nfloat = float(n) where n is the true unbiased exponent
-// pre-index = f_1f_2....f_8
-// index = pre_index * 16
-// get the dxt table entry at index + offset = T
+// Note that both log and log10 use common approximation polynomial
+// it means we need only one set of coefficients of approximation.
//
-// result = (T + Nfloat * log(2)) + rseries
+// 1. Computation of log(x) for |x-1| >= 1/256
+// InvX = frcpa(x)
+// r = InvX*x - 1
+// P(r) = r*((1 - A2*r) + r^2*(A3 - A4*r)) = r*P2(r),
+// A4,A3,A2 are created with setf instruction.
+// We use Taylor series and so A4 = 1/4, A3 = 1/3,
+// A2 = 1/2 rounded to double.
//
-// The T table is calculated as follows
-// Form x_k = 1 + k/2^8 where k goes from 0... 255
-// y_k = frcpa(x_k)
-// log(1/y_k) in quad and round to double
-
-// CASE 2: |x-1| < 2^-6
-// w = x - 1
+// N = float(n) where n is true unbiased exponent of x
//
-// Form wseries = w + Q1*w^2 + Q2*w^3 + Q3*w^4
+// T is tabular value of log(1/frcpa(x)) calculated in quad precision
+// and rounded to double. To locate T we take bits 55 to 62 of the register
+// format significand of x as the index and calculate the address
+// ad_T = table_base_addr + 8 * index
//
-// result = wseries
-
-// Special values
+// L2 (1.0 or 1.0/log(10) depending on function) is calculated in quad
+// precision and rounded to double; it's loaded from memory
+//
+// L1 (log(2) or log10(2) depending on function) is calculated in quad
+// precision and rounded to double; it's created with setf.
+//
+// And final result = P2(r)*(r*L2) + (T + N*L1)
+//
+//
+// 2. Computation of log(x) for |x-1| < 1/256
+// r = x - 1
+// P(r) = r*((1 - A2*r) + r^2*(A3 - A4*r)) = r*P2(r),
+// A4,A3,A2 are the same as in case |x-1| >= 1/256
+//
+// And final result = P2(r)*(r*L2)
+//
+// 3. How we determine whether the input argument satisfies |x-1| < 1/256.
+//
+// To do it we analyze biased exponent and significand of input argument.
+//
+// a) First we test is biased exponent equal to 0xFFFE or 0xFFFF (i.e.
+// we test is 0.5 <= x < 2). This comparison can be performed using
+// unsigned version of cmp instruction in such a way
+// biased_exponent_of_x - 0xFFFE < 2
+//
+//
+// b) Second (in case when result of a) is true) we need to compare x
+// with 1-1/256 and 1+1/256 or in register format representation with
+// 0xFFFEFF00000000000000 and 0xFFFF8080000000000000 correspondingly.
+// As far as biased exponent of x here can be equal only to 0xFFFE or
+// 0xFFFF we need to test only last bit of it. Also significand always
+// has implicit bit set to 1 that can be excluded from comparison.
+// Thus it's quite enough to generate a 64-bit integer ix whose bits are
+// ix[63] = biased_exponent_of_x[0] and ix[62-0] = significand_of_x[62-0]
+// and compare it with 0x7F00000000000000 and 0x8080000000000000 (those
+// obtained like ix from register representations of 255/256 and
+// 257/256). This comparison can be made like in a), using unsigned
+// version of cmp i.e. ix - 0x7F00000000000000 < 0x0180000000000000.
+// 0x0180000000000000 is difference between 0x8080000000000000 and
+// 0x7F00000000000000.
+//
+// Note: NaT, any NaNs, +/-INF, +/-0, negatives and unnormalized numbers are
+// filtered and processed on special branches.
+//
+//
+// Special values
//==============================================================
-
-
-// log(+0) = -inf
-// log(-0) = -inf
-
-// log(+qnan) = +qnan
-// log(-qnan) = -qnan
-// log(+snan) = +qnan
-// log(-snan) = -qnan
-
-// log(-n) = QNAN Indefinite
-// log(-inf) = QNAN Indefinite
-
-// log(+inf) = +inf
-
+//
+// logf(+0) = -inf
+// logf(-0) = -inf
+//
+// logf(+qnan) = +qnan
+// logf(-qnan) = -qnan
+// logf(+snan) = +qnan
+// logf(-snan) = -qnan
+//
+// logf(-n) = QNAN Indefinite
+// logf(-inf) = QNAN Indefinite
+//
+// logf(+inf) = +inf
+//
// Registers used
//==============================================================
-// Floating Point registers used:
+// Floating Point registers used:
// f8, input
-// f9 -> f15, f32 -> f47
-
-// General registers used:
-// r32 -> r51
-
+// f12 -> f14, f33 -> f39
+//
+// General registers used:
+// r8 -> r11
+// r14 -> r19
+//
// Predicate registers used:
-// p6 -> p15
+// p6 -> p12
-// p8 log base e
-// p6 log base e special
-// p9 used in the frcpa
-// p13 log base e large W
-// p14 log base e small w
-
-// p7 log base 10
-// p10 log base 10 large W
-// p11 log base 10 small w
-// p12 log base 10 special
-
-#include "libm_support.h"
// Assembly macros
//==============================================================
-log_int_Nfloat = f9
-log_Nfloat = f10
-
-log_P3 = f11
-log_P2 = f12
-log_P1 = f13
-log_inv_ln10 = f14
-log_log2 = f15
-
-log_w = f32
-log_T = f33
-log_rp_p32 = f34
-log_rp_p2 = f35
-log_rp_p10 = f36
-log_rsq = f37
-log_T_plus_Nlog2 = f38
-log_r = f39
-log_C = f40
-log_rp_q32 = f41
-log_rp_q2 = f42
-log_rp_q10 = f43
-log_wsq = f44
-log_Q = f45
-log_inv_ln10 = f46
-log_NORM_f8 = f47
-
-// ===================================
-
-log_GR_exp_17_ones = r33
-log_GR_exp_16_ones = r34
-log_GR_exp_f8 = r35
-log_GR_signexp_f8 = r36
-log_GR_true_exp_f8 = r37
-log_GR_significand_f8 = r38
-log_GR_index = r39
-log_AD_1 = r40
-log_GR_signexp_w = r41
-log_GR_fff7 = r42
-log_AD_2 = r43
-log_GR_exp_w = r44
-
-GR_SAVE_B0 = r45
-GR_SAVE_GP = r46
-GR_SAVE_PFS = r47
-
-GR_Parameter_X = r48
-GR_Parameter_Y = r49
-GR_Parameter_RESULT = r50
-log_GR_tag = r51
+GR_TAG = r8 // shares r8 with GR_ad_T
+GR_ad_T = r8 // shares r8 with GR_TAG
+GR_N = r9
+GR_Exp = r10
+GR_Sig = r11
+
+GR_025 = r14
+GR_05 = r15 // r15 is also bound to GR_dx below
+GR_A3 = r16
+GR_Ind = r17
+GR_dx = r15 // same register as GR_05; the two must not be live together
+GR_Ln2 = r19
+GR_de = r20
+GR_x = r21
+GR_xorg = r22
+
+GR_SAVE_B0 = r33
+GR_SAVE_PFS = r34
+GR_SAVE_GP = r35
+GR_SAVE_SP = r36
+
+GR_Parameter_X = r37
+GR_Parameter_Y = r38
+GR_Parameter_RESULT = r39
+GR_Parameter_TAG = r40
+
+
+FR_A2 = f12
+FR_A3 = f13
+FR_A4 = f14
+
+FR_RcpX = f33
+FR_r = f34
+FR_r2 = f35 // shares f35 with FR_tmp
+FR_tmp = f35 // shares f35 with FR_r2
+FR_Ln2 = f36
+FR_T = f37
+FR_N = f38 // shares f38 with FR_NxLn2pT
+FR_NxLn2pT = f38 // shares f38 with FR_N
+FR_NormX = f39
+FR_InvLn10 = f40
+
+
+FR_Y = f1 // f1 is the IA-64 register that always reads as +1.0
+FR_X = f10
+FR_RESULT = f8 // f8 is also the input argument register (see "Registers used")
// Data tables
//==============================================================
-
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
-
+RODATA
.align 16
-
-log_table_1:
-ASM_TYPE_DIRECTIVE(log_table_1,@object)
-data8 0xbfd0001008f39d59 // p3
-data8 0x3fd5556073e0c45a // p2
-ASM_SIZE_DIRECTIVE(log_table_1)
-
-log_table_2:
-ASM_TYPE_DIRECTIVE(log_table_2,@object)
-data8 0xbfdffffffffaea15 // p1
-data8 0x3fdbcb7b1526e50e // 1/ln10
-data8 0x3fe62e42fefa39ef // Log(2)
-data8 0x0 // pad
-
-data8 0x3F60040155D5889E //log(1/frcpa(1+ 0/256)
-data8 0x3F78121214586B54 //log(1/frcpa(1+ 1/256)
-data8 0x3F841929F96832F0 //log(1/frcpa(1+ 2/256)
-data8 0x3F8C317384C75F06 //log(1/frcpa(1+ 3/256)
-data8 0x3F91A6B91AC73386 //log(1/frcpa(1+ 4/256)
-data8 0x3F95BA9A5D9AC039 //log(1/frcpa(1+ 5/256)
-data8 0x3F99D2A8074325F4 //log(1/frcpa(1+ 6/256)
-data8 0x3F9D6B2725979802 //log(1/frcpa(1+ 7/256)
-data8 0x3FA0C58FA19DFAAA //log(1/frcpa(1+ 8/256)
-data8 0x3FA2954C78CBCE1B //log(1/frcpa(1+ 9/256)
-data8 0x3FA4A94D2DA96C56 //log(1/frcpa(1+ 10/256)
-data8 0x3FA67C94F2D4BB58 //log(1/frcpa(1+ 11/256)
-data8 0x3FA85188B630F068 //log(1/frcpa(1+ 12/256)
-data8 0x3FAA6B8ABE73AF4C //log(1/frcpa(1+ 13/256)
-data8 0x3FAC441E06F72A9E //log(1/frcpa(1+ 14/256)
-data8 0x3FAE1E6713606D07 //log(1/frcpa(1+ 15/256)
-data8 0x3FAFFA6911AB9301 //log(1/frcpa(1+ 16/256)
-data8 0x3FB0EC139C5DA601 //log(1/frcpa(1+ 17/256)
-data8 0x3FB1DBD2643D190B //log(1/frcpa(1+ 18/256)
-data8 0x3FB2CC7284FE5F1C //log(1/frcpa(1+ 19/256)
-data8 0x3FB3BDF5A7D1EE64 //log(1/frcpa(1+ 20/256)
-data8 0x3FB4B05D7AA012E0 //log(1/frcpa(1+ 21/256)
-data8 0x3FB580DB7CEB5702 //log(1/frcpa(1+ 22/256)
-data8 0x3FB674F089365A7A //log(1/frcpa(1+ 23/256)
-data8 0x3FB769EF2C6B568D //log(1/frcpa(1+ 24/256)
-data8 0x3FB85FD927506A48 //log(1/frcpa(1+ 25/256)
-data8 0x3FB9335E5D594989 //log(1/frcpa(1+ 26/256)
-data8 0x3FBA2B0220C8E5F5 //log(1/frcpa(1+ 27/256)
-data8 0x3FBB0004AC1A86AC //log(1/frcpa(1+ 28/256)
-data8 0x3FBBF968769FCA11 //log(1/frcpa(1+ 29/256)
-data8 0x3FBCCFEDBFEE13A8 //log(1/frcpa(1+ 30/256)
-data8 0x3FBDA727638446A2 //log(1/frcpa(1+ 31/256)
-data8 0x3FBEA3257FE10F7A //log(1/frcpa(1+ 32/256)
-data8 0x3FBF7BE9FEDBFDE6 //log(1/frcpa(1+ 33/256)
-data8 0x3FC02AB352FF25F4 //log(1/frcpa(1+ 34/256)
-data8 0x3FC097CE579D204D //log(1/frcpa(1+ 35/256)
-data8 0x3FC1178E8227E47C //log(1/frcpa(1+ 36/256)
-data8 0x3FC185747DBECF34 //log(1/frcpa(1+ 37/256)
-data8 0x3FC1F3B925F25D41 //log(1/frcpa(1+ 38/256)
-data8 0x3FC2625D1E6DDF57 //log(1/frcpa(1+ 39/256)
-data8 0x3FC2D1610C86813A //log(1/frcpa(1+ 40/256)
-data8 0x3FC340C59741142E //log(1/frcpa(1+ 41/256)
-data8 0x3FC3B08B6757F2A9 //log(1/frcpa(1+ 42/256)
-data8 0x3FC40DFB08378003 //log(1/frcpa(1+ 43/256)
-data8 0x3FC47E74E8CA5F7C //log(1/frcpa(1+ 44/256)
-data8 0x3FC4EF51F6466DE4 //log(1/frcpa(1+ 45/256)
-data8 0x3FC56092E02BA516 //log(1/frcpa(1+ 46/256)
-data8 0x3FC5D23857CD74D5 //log(1/frcpa(1+ 47/256)
-data8 0x3FC6313A37335D76 //log(1/frcpa(1+ 48/256)
-data8 0x3FC6A399DABBD383 //log(1/frcpa(1+ 49/256)
-data8 0x3FC70337DD3CE41B //log(1/frcpa(1+ 50/256)
-data8 0x3FC77654128F6127 //log(1/frcpa(1+ 51/256)
-data8 0x3FC7E9D82A0B022D //log(1/frcpa(1+ 52/256)
-data8 0x3FC84A6B759F512F //log(1/frcpa(1+ 53/256)
-data8 0x3FC8AB47D5F5A310 //log(1/frcpa(1+ 54/256)
-data8 0x3FC91FE49096581B //log(1/frcpa(1+ 55/256)
-data8 0x3FC981634011AA75 //log(1/frcpa(1+ 56/256)
-data8 0x3FC9F6C407089664 //log(1/frcpa(1+ 57/256)
-data8 0x3FCA58E729348F43 //log(1/frcpa(1+ 58/256)
-data8 0x3FCABB55C31693AD //log(1/frcpa(1+ 59/256)
-data8 0x3FCB1E104919EFD0 //log(1/frcpa(1+ 60/256)
-data8 0x3FCB94EE93E367CB //log(1/frcpa(1+ 61/256)
-data8 0x3FCBF851C067555F //log(1/frcpa(1+ 62/256)
-data8 0x3FCC5C0254BF23A6 //log(1/frcpa(1+ 63/256)
-data8 0x3FCCC000C9DB3C52 //log(1/frcpa(1+ 64/256)
-data8 0x3FCD244D99C85674 //log(1/frcpa(1+ 65/256)
-data8 0x3FCD88E93FB2F450 //log(1/frcpa(1+ 66/256)
-data8 0x3FCDEDD437EAEF01 //log(1/frcpa(1+ 67/256)
-data8 0x3FCE530EFFE71012 //log(1/frcpa(1+ 68/256)
-data8 0x3FCEB89A1648B971 //log(1/frcpa(1+ 69/256)
-data8 0x3FCF1E75FADF9BDE //log(1/frcpa(1+ 70/256)
-data8 0x3FCF84A32EAD7C35 //log(1/frcpa(1+ 71/256)
-data8 0x3FCFEB2233EA07CD //log(1/frcpa(1+ 72/256)
-data8 0x3FD028F9C7035C1C //log(1/frcpa(1+ 73/256)
-data8 0x3FD05C8BE0D9635A //log(1/frcpa(1+ 74/256)
-data8 0x3FD085EB8F8AE797 //log(1/frcpa(1+ 75/256)
-data8 0x3FD0B9C8E32D1911 //log(1/frcpa(1+ 76/256)
-data8 0x3FD0EDD060B78081 //log(1/frcpa(1+ 77/256)
-data8 0x3FD122024CF0063F //log(1/frcpa(1+ 78/256)
-data8 0x3FD14BE2927AECD4 //log(1/frcpa(1+ 79/256)
-data8 0x3FD180618EF18ADF //log(1/frcpa(1+ 80/256)
-data8 0x3FD1B50BBE2FC63B //log(1/frcpa(1+ 81/256)
-data8 0x3FD1DF4CC7CF242D //log(1/frcpa(1+ 82/256)
-data8 0x3FD214456D0EB8D4 //log(1/frcpa(1+ 83/256)
-data8 0x3FD23EC5991EBA49 //log(1/frcpa(1+ 84/256)
-data8 0x3FD2740D9F870AFB //log(1/frcpa(1+ 85/256)
-data8 0x3FD29ECDABCDFA04 //log(1/frcpa(1+ 86/256)
-data8 0x3FD2D46602ADCCEE //log(1/frcpa(1+ 87/256)
-data8 0x3FD2FF66B04EA9D4 //log(1/frcpa(1+ 88/256)
-data8 0x3FD335504B355A37 //log(1/frcpa(1+ 89/256)
-data8 0x3FD360925EC44F5D //log(1/frcpa(1+ 90/256)
-data8 0x3FD38BF1C3337E75 //log(1/frcpa(1+ 91/256)
-data8 0x3FD3C25277333184 //log(1/frcpa(1+ 92/256)
-data8 0x3FD3EDF463C1683E //log(1/frcpa(1+ 93/256)
-data8 0x3FD419B423D5E8C7 //log(1/frcpa(1+ 94/256)
-data8 0x3FD44591E0539F49 //log(1/frcpa(1+ 95/256)
-data8 0x3FD47C9175B6F0AD //log(1/frcpa(1+ 96/256)
-data8 0x3FD4A8B341552B09 //log(1/frcpa(1+ 97/256)
-data8 0x3FD4D4F3908901A0 //log(1/frcpa(1+ 98/256)
-data8 0x3FD501528DA1F968 //log(1/frcpa(1+ 99/256)
-data8 0x3FD52DD06347D4F6 //log(1/frcpa(1+ 100/256)
-data8 0x3FD55A6D3C7B8A8A //log(1/frcpa(1+ 101/256)
-data8 0x3FD5925D2B112A59 //log(1/frcpa(1+ 102/256)
-data8 0x3FD5BF406B543DB2 //log(1/frcpa(1+ 103/256)
-data8 0x3FD5EC433D5C35AE //log(1/frcpa(1+ 104/256)
-data8 0x3FD61965CDB02C1F //log(1/frcpa(1+ 105/256)
-data8 0x3FD646A84935B2A2 //log(1/frcpa(1+ 106/256)
-data8 0x3FD6740ADD31DE94 //log(1/frcpa(1+ 107/256)
-data8 0x3FD6A18DB74A58C5 //log(1/frcpa(1+ 108/256)
-data8 0x3FD6CF31058670EC //log(1/frcpa(1+ 109/256)
-data8 0x3FD6F180E852F0BA //log(1/frcpa(1+ 110/256)
-data8 0x3FD71F5D71B894F0 //log(1/frcpa(1+ 111/256)
-data8 0x3FD74D5AEFD66D5C //log(1/frcpa(1+ 112/256)
-data8 0x3FD77B79922BD37E //log(1/frcpa(1+ 113/256)
-data8 0x3FD7A9B9889F19E2 //log(1/frcpa(1+ 114/256)
-data8 0x3FD7D81B037EB6A6 //log(1/frcpa(1+ 115/256)
-data8 0x3FD8069E33827231 //log(1/frcpa(1+ 116/256)
-data8 0x3FD82996D3EF8BCB //log(1/frcpa(1+ 117/256)
-data8 0x3FD85855776DCBFB //log(1/frcpa(1+ 118/256)
-data8 0x3FD8873658327CCF //log(1/frcpa(1+ 119/256)
-data8 0x3FD8AA75973AB8CF //log(1/frcpa(1+ 120/256)
-data8 0x3FD8D992DC8824E5 //log(1/frcpa(1+ 121/256)
-data8 0x3FD908D2EA7D9512 //log(1/frcpa(1+ 122/256)
-data8 0x3FD92C59E79C0E56 //log(1/frcpa(1+ 123/256)
-data8 0x3FD95BD750EE3ED3 //log(1/frcpa(1+ 124/256)
-data8 0x3FD98B7811A3EE5B //log(1/frcpa(1+ 125/256)
-data8 0x3FD9AF47F33D406C //log(1/frcpa(1+ 126/256)
-data8 0x3FD9DF270C1914A8 //log(1/frcpa(1+ 127/256)
-data8 0x3FDA0325ED14FDA4 //log(1/frcpa(1+ 128/256)
-data8 0x3FDA33440224FA79 //log(1/frcpa(1+ 129/256)
-data8 0x3FDA57725E80C383 //log(1/frcpa(1+ 130/256)
-data8 0x3FDA87D0165DD199 //log(1/frcpa(1+ 131/256)
-data8 0x3FDAAC2E6C03F896 //log(1/frcpa(1+ 132/256)
-data8 0x3FDADCCC6FDF6A81 //log(1/frcpa(1+ 133/256)
-data8 0x3FDB015B3EB1E790 //log(1/frcpa(1+ 134/256)
-data8 0x3FDB323A3A635948 //log(1/frcpa(1+ 135/256)
-data8 0x3FDB56FA04462909 //log(1/frcpa(1+ 136/256)
-data8 0x3FDB881AA659BC93 //log(1/frcpa(1+ 137/256)
-data8 0x3FDBAD0BEF3DB165 //log(1/frcpa(1+ 138/256)
-data8 0x3FDBD21297781C2F //log(1/frcpa(1+ 139/256)
-data8 0x3FDC039236F08819 //log(1/frcpa(1+ 140/256)
-data8 0x3FDC28CB1E4D32FD //log(1/frcpa(1+ 141/256)
-data8 0x3FDC4E19B84723C2 //log(1/frcpa(1+ 142/256)
-data8 0x3FDC7FF9C74554C9 //log(1/frcpa(1+ 143/256)
-data8 0x3FDCA57B64E9DB05 //log(1/frcpa(1+ 144/256)
-data8 0x3FDCCB130A5CEBB0 //log(1/frcpa(1+ 145/256)
-data8 0x3FDCF0C0D18F326F //log(1/frcpa(1+ 146/256)
-data8 0x3FDD232075B5A201 //log(1/frcpa(1+ 147/256)
-data8 0x3FDD490246DEFA6B //log(1/frcpa(1+ 148/256)
-data8 0x3FDD6EFA918D25CD //log(1/frcpa(1+ 149/256)
-data8 0x3FDD9509707AE52F //log(1/frcpa(1+ 150/256)
-data8 0x3FDDBB2EFE92C554 //log(1/frcpa(1+ 151/256)
-data8 0x3FDDEE2F3445E4AF //log(1/frcpa(1+ 152/256)
-data8 0x3FDE148A1A2726CE //log(1/frcpa(1+ 153/256)
-data8 0x3FDE3AFC0A49FF40 //log(1/frcpa(1+ 154/256)
-data8 0x3FDE6185206D516E //log(1/frcpa(1+ 155/256)
-data8 0x3FDE882578823D52 //log(1/frcpa(1+ 156/256)
-data8 0x3FDEAEDD2EAC990C //log(1/frcpa(1+ 157/256)
-data8 0x3FDED5AC5F436BE3 //log(1/frcpa(1+ 158/256)
-data8 0x3FDEFC9326D16AB9 //log(1/frcpa(1+ 159/256)
-data8 0x3FDF2391A2157600 //log(1/frcpa(1+ 160/256)
-data8 0x3FDF4AA7EE03192D //log(1/frcpa(1+ 161/256)
-data8 0x3FDF71D627C30BB0 //log(1/frcpa(1+ 162/256)
-data8 0x3FDF991C6CB3B379 //log(1/frcpa(1+ 163/256)
-data8 0x3FDFC07ADA69A910 //log(1/frcpa(1+ 164/256)
-data8 0x3FDFE7F18EB03D3E //log(1/frcpa(1+ 165/256)
-data8 0x3FE007C053C5002E //log(1/frcpa(1+ 166/256)
-data8 0x3FE01B942198A5A1 //log(1/frcpa(1+ 167/256)
-data8 0x3FE02F74400C64EB //log(1/frcpa(1+ 168/256)
-data8 0x3FE04360BE7603AD //log(1/frcpa(1+ 169/256)
-data8 0x3FE05759AC47FE34 //log(1/frcpa(1+ 170/256)
-data8 0x3FE06B5F1911CF52 //log(1/frcpa(1+ 171/256)
-data8 0x3FE078BF0533C568 //log(1/frcpa(1+ 172/256)
-data8 0x3FE08CD9687E7B0E //log(1/frcpa(1+ 173/256)
-data8 0x3FE0A10074CF9019 //log(1/frcpa(1+ 174/256)
-data8 0x3FE0B5343A234477 //log(1/frcpa(1+ 175/256)
-data8 0x3FE0C974C89431CE //log(1/frcpa(1+ 176/256)
-data8 0x3FE0DDC2305B9886 //log(1/frcpa(1+ 177/256)
-data8 0x3FE0EB524BAFC918 //log(1/frcpa(1+ 178/256)
-data8 0x3FE0FFB54213A476 //log(1/frcpa(1+ 179/256)
-data8 0x3FE114253DA97D9F //log(1/frcpa(1+ 180/256)
-data8 0x3FE128A24F1D9AFF //log(1/frcpa(1+ 181/256)
-data8 0x3FE1365252BF0865 //log(1/frcpa(1+ 182/256)
-data8 0x3FE14AE558B4A92D //log(1/frcpa(1+ 183/256)
-data8 0x3FE15F85A19C765B //log(1/frcpa(1+ 184/256)
-data8 0x3FE16D4D38C119FA //log(1/frcpa(1+ 185/256)
-data8 0x3FE18203C20DD133 //log(1/frcpa(1+ 186/256)
-data8 0x3FE196C7BC4B1F3B //log(1/frcpa(1+ 187/256)
-data8 0x3FE1A4A738B7A33C //log(1/frcpa(1+ 188/256)
-data8 0x3FE1B981C0C9653D //log(1/frcpa(1+ 189/256)
-data8 0x3FE1CE69E8BB106B //log(1/frcpa(1+ 190/256)
-data8 0x3FE1DC619DE06944 //log(1/frcpa(1+ 191/256)
-data8 0x3FE1F160A2AD0DA4 //log(1/frcpa(1+ 192/256)
-data8 0x3FE2066D7740737E //log(1/frcpa(1+ 193/256)
-data8 0x3FE2147DBA47A394 //log(1/frcpa(1+ 194/256)
-data8 0x3FE229A1BC5EBAC3 //log(1/frcpa(1+ 195/256)
-data8 0x3FE237C1841A502E //log(1/frcpa(1+ 196/256)
-data8 0x3FE24CFCE6F80D9A //log(1/frcpa(1+ 197/256)
-data8 0x3FE25B2C55CD5762 //log(1/frcpa(1+ 198/256)
-data8 0x3FE2707F4D5F7C41 //log(1/frcpa(1+ 199/256)
-data8 0x3FE285E0842CA384 //log(1/frcpa(1+ 200/256)
-data8 0x3FE294294708B773 //log(1/frcpa(1+ 201/256)
-data8 0x3FE2A9A2670AFF0C //log(1/frcpa(1+ 202/256)
-data8 0x3FE2B7FB2C8D1CC1 //log(1/frcpa(1+ 203/256)
-data8 0x3FE2C65A6395F5F5 //log(1/frcpa(1+ 204/256)
-data8 0x3FE2DBF557B0DF43 //log(1/frcpa(1+ 205/256)
-data8 0x3FE2EA64C3F97655 //log(1/frcpa(1+ 206/256)
-data8 0x3FE3001823684D73 //log(1/frcpa(1+ 207/256)
-data8 0x3FE30E97E9A8B5CD //log(1/frcpa(1+ 208/256)
-data8 0x3FE32463EBDD34EA //log(1/frcpa(1+ 209/256)
-data8 0x3FE332F4314AD796 //log(1/frcpa(1+ 210/256)
-data8 0x3FE348D90E7464D0 //log(1/frcpa(1+ 211/256)
-data8 0x3FE35779F8C43D6E //log(1/frcpa(1+ 212/256)
-data8 0x3FE36621961A6A99 //log(1/frcpa(1+ 213/256)
-data8 0x3FE37C299F3C366A //log(1/frcpa(1+ 214/256)
-data8 0x3FE38AE2171976E7 //log(1/frcpa(1+ 215/256)
-data8 0x3FE399A157A603E7 //log(1/frcpa(1+ 216/256)
-data8 0x3FE3AFCCFE77B9D1 //log(1/frcpa(1+ 217/256)
-data8 0x3FE3BE9D503533B5 //log(1/frcpa(1+ 218/256)
-data8 0x3FE3CD7480B4A8A3 //log(1/frcpa(1+ 219/256)
-data8 0x3FE3E3C43918F76C //log(1/frcpa(1+ 220/256)
-data8 0x3FE3F2ACB27ED6C7 //log(1/frcpa(1+ 221/256)
-data8 0x3FE4019C2125CA93 //log(1/frcpa(1+ 222/256)
-data8 0x3FE4181061389722 //log(1/frcpa(1+ 223/256)
-data8 0x3FE42711518DF545 //log(1/frcpa(1+ 224/256)
-data8 0x3FE436194E12B6BF //log(1/frcpa(1+ 225/256)
-data8 0x3FE445285D68EA69 //log(1/frcpa(1+ 226/256)
-data8 0x3FE45BCC464C893A //log(1/frcpa(1+ 227/256)
-data8 0x3FE46AED21F117FC //log(1/frcpa(1+ 228/256)
-data8 0x3FE47A1527E8A2D3 //log(1/frcpa(1+ 229/256)
-data8 0x3FE489445EFFFCCC //log(1/frcpa(1+ 230/256)
-data8 0x3FE4A018BCB69835 //log(1/frcpa(1+ 231/256)
-data8 0x3FE4AF5A0C9D65D7 //log(1/frcpa(1+ 232/256)
-data8 0x3FE4BEA2A5BDBE87 //log(1/frcpa(1+ 233/256)
-data8 0x3FE4CDF28F10AC46 //log(1/frcpa(1+ 234/256)
-data8 0x3FE4DD49CF994058 //log(1/frcpa(1+ 235/256)
-data8 0x3FE4ECA86E64A684 //log(1/frcpa(1+ 236/256)
-data8 0x3FE503C43CD8EB68 //log(1/frcpa(1+ 237/256)
-data8 0x3FE513356667FC57 //log(1/frcpa(1+ 238/256)
-data8 0x3FE522AE0738A3D8 //log(1/frcpa(1+ 239/256)
-data8 0x3FE5322E26867857 //log(1/frcpa(1+ 240/256)
-data8 0x3FE541B5CB979809 //log(1/frcpa(1+ 241/256)
-data8 0x3FE55144FDBCBD62 //log(1/frcpa(1+ 242/256)
-data8 0x3FE560DBC45153C7 //log(1/frcpa(1+ 243/256)
-data8 0x3FE5707A26BB8C66 //log(1/frcpa(1+ 244/256)
-data8 0x3FE587F60ED5B900 //log(1/frcpa(1+ 245/256)
-data8 0x3FE597A7977C8F31 //log(1/frcpa(1+ 246/256)
-data8 0x3FE5A760D634BB8B //log(1/frcpa(1+ 247/256)
-data8 0x3FE5B721D295F10F //log(1/frcpa(1+ 248/256)
-data8 0x3FE5C6EA94431EF9 //log(1/frcpa(1+ 249/256)
-data8 0x3FE5D6BB22EA86F6 //log(1/frcpa(1+ 250/256)
-data8 0x3FE5E6938645D390 //log(1/frcpa(1+ 251/256)
-data8 0x3FE5F673C61A2ED2 //log(1/frcpa(1+ 252/256)
-data8 0x3FE6065BEA385926 //log(1/frcpa(1+ 253/256)
-data8 0x3FE6164BFA7CC06B //log(1/frcpa(1+ 254/256)
-data8 0x3FE62643FECF9743 //log(1/frcpa(1+ 255/256)
-ASM_SIZE_DIRECTIVE(log_table_2)
-
-
-.align 32
-.global logf#
-.global log10f#
-
-// log10 has p7 true, p8 false
-// log has p8 true, p7 false
-
+LOCAL_OBJECT_START(logf_data)
+data8 0x3FF0000000000000 // 1.0
+//
+// ln(1/frcpa(1+i/256)), i=0...255
+data8 0x3F60040155D5889E // 0
+data8 0x3F78121214586B54 // 1
+data8 0x3F841929F96832F0 // 2
+data8 0x3F8C317384C75F06 // 3
+data8 0x3F91A6B91AC73386 // 4
+data8 0x3F95BA9A5D9AC039 // 5
+data8 0x3F99D2A8074325F4 // 6
+data8 0x3F9D6B2725979802 // 7
+data8 0x3FA0C58FA19DFAAA // 8
+data8 0x3FA2954C78CBCE1B // 9
+data8 0x3FA4A94D2DA96C56 // 10
+data8 0x3FA67C94F2D4BB58 // 11
+data8 0x3FA85188B630F068 // 12
+data8 0x3FAA6B8ABE73AF4C // 13
+data8 0x3FAC441E06F72A9E // 14
+data8 0x3FAE1E6713606D07 // 15
+data8 0x3FAFFA6911AB9301 // 16
+data8 0x3FB0EC139C5DA601 // 17
+data8 0x3FB1DBD2643D190B // 18
+data8 0x3FB2CC7284FE5F1C // 19
+data8 0x3FB3BDF5A7D1EE64 // 20
+data8 0x3FB4B05D7AA012E0 // 21
+data8 0x3FB580DB7CEB5702 // 22
+data8 0x3FB674F089365A7A // 23
+data8 0x3FB769EF2C6B568D // 24
+data8 0x3FB85FD927506A48 // 25
+data8 0x3FB9335E5D594989 // 26
+data8 0x3FBA2B0220C8E5F5 // 27
+data8 0x3FBB0004AC1A86AC // 28
+data8 0x3FBBF968769FCA11 // 29
+data8 0x3FBCCFEDBFEE13A8 // 30
+data8 0x3FBDA727638446A2 // 31
+data8 0x3FBEA3257FE10F7A // 32
+data8 0x3FBF7BE9FEDBFDE6 // 33
+data8 0x3FC02AB352FF25F4 // 34
+data8 0x3FC097CE579D204D // 35
+data8 0x3FC1178E8227E47C // 36
+data8 0x3FC185747DBECF34 // 37
+data8 0x3FC1F3B925F25D41 // 38
+data8 0x3FC2625D1E6DDF57 // 39
+data8 0x3FC2D1610C86813A // 40
+data8 0x3FC340C59741142E // 41
+data8 0x3FC3B08B6757F2A9 // 42
+data8 0x3FC40DFB08378003 // 43
+data8 0x3FC47E74E8CA5F7C // 44
+data8 0x3FC4EF51F6466DE4 // 45
+data8 0x3FC56092E02BA516 // 46
+data8 0x3FC5D23857CD74D5 // 47
+data8 0x3FC6313A37335D76 // 48
+data8 0x3FC6A399DABBD383 // 49
+data8 0x3FC70337DD3CE41B // 50
+data8 0x3FC77654128F6127 // 51
+data8 0x3FC7E9D82A0B022D // 52
+data8 0x3FC84A6B759F512F // 53
+data8 0x3FC8AB47D5F5A310 // 54
+data8 0x3FC91FE49096581B // 55
+data8 0x3FC981634011AA75 // 56
+data8 0x3FC9F6C407089664 // 57
+data8 0x3FCA58E729348F43 // 58
+data8 0x3FCABB55C31693AD // 59
+data8 0x3FCB1E104919EFD0 // 60
+data8 0x3FCB94EE93E367CB // 61
+data8 0x3FCBF851C067555F // 62
+data8 0x3FCC5C0254BF23A6 // 63
+data8 0x3FCCC000C9DB3C52 // 64
+data8 0x3FCD244D99C85674 // 65
+data8 0x3FCD88E93FB2F450 // 66
+data8 0x3FCDEDD437EAEF01 // 67
+data8 0x3FCE530EFFE71012 // 68
+data8 0x3FCEB89A1648B971 // 69
+data8 0x3FCF1E75FADF9BDE // 70
+data8 0x3FCF84A32EAD7C35 // 71
+data8 0x3FCFEB2233EA07CD // 72
+data8 0x3FD028F9C7035C1C // 73
+data8 0x3FD05C8BE0D9635A // 74
+data8 0x3FD085EB8F8AE797 // 75
+data8 0x3FD0B9C8E32D1911 // 76
+data8 0x3FD0EDD060B78081 // 77
+data8 0x3FD122024CF0063F // 78
+data8 0x3FD14BE2927AECD4 // 79
+data8 0x3FD180618EF18ADF // 80
+data8 0x3FD1B50BBE2FC63B // 81
+data8 0x3FD1DF4CC7CF242D // 82
+data8 0x3FD214456D0EB8D4 // 83
+data8 0x3FD23EC5991EBA49 // 84
+data8 0x3FD2740D9F870AFB // 85
+data8 0x3FD29ECDABCDFA04 // 86
+data8 0x3FD2D46602ADCCEE // 87
+data8 0x3FD2FF66B04EA9D4 // 88
+data8 0x3FD335504B355A37 // 89
+data8 0x3FD360925EC44F5D // 90
+data8 0x3FD38BF1C3337E75 // 91
+data8 0x3FD3C25277333184 // 92
+data8 0x3FD3EDF463C1683E // 93
+data8 0x3FD419B423D5E8C7 // 94
+data8 0x3FD44591E0539F49 // 95
+data8 0x3FD47C9175B6F0AD // 96
+data8 0x3FD4A8B341552B09 // 97
+data8 0x3FD4D4F3908901A0 // 98
+data8 0x3FD501528DA1F968 // 99
+data8 0x3FD52DD06347D4F6 // 100
+data8 0x3FD55A6D3C7B8A8A // 101
+data8 0x3FD5925D2B112A59 // 102
+data8 0x3FD5BF406B543DB2 // 103
+data8 0x3FD5EC433D5C35AE // 104
+data8 0x3FD61965CDB02C1F // 105
+data8 0x3FD646A84935B2A2 // 106
+data8 0x3FD6740ADD31DE94 // 107
+data8 0x3FD6A18DB74A58C5 // 108
+data8 0x3FD6CF31058670EC // 109
+data8 0x3FD6F180E852F0BA // 110
+data8 0x3FD71F5D71B894F0 // 111
+data8 0x3FD74D5AEFD66D5C // 112
+data8 0x3FD77B79922BD37E // 113
+data8 0x3FD7A9B9889F19E2 // 114
+data8 0x3FD7D81B037EB6A6 // 115
+data8 0x3FD8069E33827231 // 116
+data8 0x3FD82996D3EF8BCB // 117
+data8 0x3FD85855776DCBFB // 118
+data8 0x3FD8873658327CCF // 119
+data8 0x3FD8AA75973AB8CF // 120
+data8 0x3FD8D992DC8824E5 // 121
+data8 0x3FD908D2EA7D9512 // 122
+data8 0x3FD92C59E79C0E56 // 123
+data8 0x3FD95BD750EE3ED3 // 124
+data8 0x3FD98B7811A3EE5B // 125
+data8 0x3FD9AF47F33D406C // 126
+data8 0x3FD9DF270C1914A8 // 127
+data8 0x3FDA0325ED14FDA4 // 128
+data8 0x3FDA33440224FA79 // 129
+data8 0x3FDA57725E80C383 // 130
+data8 0x3FDA87D0165DD199 // 131
+data8 0x3FDAAC2E6C03F896 // 132
+data8 0x3FDADCCC6FDF6A81 // 133
+data8 0x3FDB015B3EB1E790 // 134
+data8 0x3FDB323A3A635948 // 135
+data8 0x3FDB56FA04462909 // 136
+data8 0x3FDB881AA659BC93 // 137
+data8 0x3FDBAD0BEF3DB165 // 138
+data8 0x3FDBD21297781C2F // 139
+data8 0x3FDC039236F08819 // 140
+data8 0x3FDC28CB1E4D32FD // 141
+data8 0x3FDC4E19B84723C2 // 142
+data8 0x3FDC7FF9C74554C9 // 143
+data8 0x3FDCA57B64E9DB05 // 144
+data8 0x3FDCCB130A5CEBB0 // 145
+data8 0x3FDCF0C0D18F326F // 146
+data8 0x3FDD232075B5A201 // 147
+data8 0x3FDD490246DEFA6B // 148
+data8 0x3FDD6EFA918D25CD // 149
+data8 0x3FDD9509707AE52F // 150
+data8 0x3FDDBB2EFE92C554 // 151
+data8 0x3FDDEE2F3445E4AF // 152
+data8 0x3FDE148A1A2726CE // 153
+data8 0x3FDE3AFC0A49FF40 // 154
+data8 0x3FDE6185206D516E // 155
+data8 0x3FDE882578823D52 // 156
+data8 0x3FDEAEDD2EAC990C // 157
+data8 0x3FDED5AC5F436BE3 // 158
+data8 0x3FDEFC9326D16AB9 // 159
+data8 0x3FDF2391A2157600 // 160
+data8 0x3FDF4AA7EE03192D // 161
+data8 0x3FDF71D627C30BB0 // 162
+data8 0x3FDF991C6CB3B379 // 163
+data8 0x3FDFC07ADA69A910 // 164
+data8 0x3FDFE7F18EB03D3E // 165
+data8 0x3FE007C053C5002E // 166
+data8 0x3FE01B942198A5A1 // 167
+data8 0x3FE02F74400C64EB // 168
+data8 0x3FE04360BE7603AD // 169
+data8 0x3FE05759AC47FE34 // 170
+data8 0x3FE06B5F1911CF52 // 171
+data8 0x3FE078BF0533C568 // 172
+data8 0x3FE08CD9687E7B0E // 173
+data8 0x3FE0A10074CF9019 // 174
+data8 0x3FE0B5343A234477 // 175
+data8 0x3FE0C974C89431CE // 176
+data8 0x3FE0DDC2305B9886 // 177
+data8 0x3FE0EB524BAFC918 // 178
+data8 0x3FE0FFB54213A476 // 179
+data8 0x3FE114253DA97D9F // 180
+data8 0x3FE128A24F1D9AFF // 181
+data8 0x3FE1365252BF0865 // 182
+data8 0x3FE14AE558B4A92D // 183
+data8 0x3FE15F85A19C765B // 184
+data8 0x3FE16D4D38C119FA // 185
+data8 0x3FE18203C20DD133 // 186
+data8 0x3FE196C7BC4B1F3B // 187
+data8 0x3FE1A4A738B7A33C // 188
+data8 0x3FE1B981C0C9653D // 189
+data8 0x3FE1CE69E8BB106B // 190
+data8 0x3FE1DC619DE06944 // 191
+data8 0x3FE1F160A2AD0DA4 // 192
+data8 0x3FE2066D7740737E // 193
+data8 0x3FE2147DBA47A394 // 194
+data8 0x3FE229A1BC5EBAC3 // 195
+data8 0x3FE237C1841A502E // 196
+data8 0x3FE24CFCE6F80D9A // 197
+data8 0x3FE25B2C55CD5762 // 198
+data8 0x3FE2707F4D5F7C41 // 199
+data8 0x3FE285E0842CA384 // 200
+data8 0x3FE294294708B773 // 201
+data8 0x3FE2A9A2670AFF0C // 202
+data8 0x3FE2B7FB2C8D1CC1 // 203
+data8 0x3FE2C65A6395F5F5 // 204
+data8 0x3FE2DBF557B0DF43 // 205
+data8 0x3FE2EA64C3F97655 // 206
+data8 0x3FE3001823684D73 // 207
+data8 0x3FE30E97E9A8B5CD // 208
+data8 0x3FE32463EBDD34EA // 209
+data8 0x3FE332F4314AD796 // 210
+data8 0x3FE348D90E7464D0 // 211
+data8 0x3FE35779F8C43D6E // 212
+data8 0x3FE36621961A6A99 // 213
+data8 0x3FE37C299F3C366A // 214
+data8 0x3FE38AE2171976E7 // 215
+data8 0x3FE399A157A603E7 // 216
+data8 0x3FE3AFCCFE77B9D1 // 217
+data8 0x3FE3BE9D503533B5 // 218
+data8 0x3FE3CD7480B4A8A3 // 219
+data8 0x3FE3E3C43918F76C // 220
+data8 0x3FE3F2ACB27ED6C7 // 221
+data8 0x3FE4019C2125CA93 // 222
+data8 0x3FE4181061389722 // 223
+data8 0x3FE42711518DF545 // 224
+data8 0x3FE436194E12B6BF // 225
+data8 0x3FE445285D68EA69 // 226
+data8 0x3FE45BCC464C893A // 227
+data8 0x3FE46AED21F117FC // 228
+data8 0x3FE47A1527E8A2D3 // 229
+data8 0x3FE489445EFFFCCC // 230
+data8 0x3FE4A018BCB69835 // 231
+data8 0x3FE4AF5A0C9D65D7 // 232
+data8 0x3FE4BEA2A5BDBE87 // 233
+data8 0x3FE4CDF28F10AC46 // 234
+data8 0x3FE4DD49CF994058 // 235
+data8 0x3FE4ECA86E64A684 // 236
+data8 0x3FE503C43CD8EB68 // 237
+data8 0x3FE513356667FC57 // 238
+data8 0x3FE522AE0738A3D8 // 239
+data8 0x3FE5322E26867857 // 240
+data8 0x3FE541B5CB979809 // 241
+data8 0x3FE55144FDBCBD62 // 242
+data8 0x3FE560DBC45153C7 // 243
+data8 0x3FE5707A26BB8C66 // 244
+data8 0x3FE587F60ED5B900 // 245
+data8 0x3FE597A7977C8F31 // 246
+data8 0x3FE5A760D634BB8B // 247
+data8 0x3FE5B721D295F10F // 248
+data8 0x3FE5C6EA94431EF9 // 249
+data8 0x3FE5D6BB22EA86F6 // 250
+data8 0x3FE5E6938645D390 // 251
+data8 0x3FE5F673C61A2ED2 // 252
+data8 0x3FE6065BEA385926 // 253
+data8 0x3FE6164BFA7CC06B // 254
+data8 0x3FE62643FECF9743 // 255
+LOCAL_OBJECT_END(logf_data)
+
+LOCAL_OBJECT_START(log10f_data)
+data8 0x3FDBCB7B1526E50E // 1/ln(10)
+//
+// ln(1/frcpa(1+i/256))/ln(10), i=0...255
+data8 0x3F4BD27045BFD025 // 0
+data8 0x3F64E84E793A474A // 1
+data8 0x3F7175085AB85FF0 // 2
+data8 0x3F787CFF9D9147A5 // 3
+data8 0x3F7EA9D372B89FC8 // 4
+data8 0x3F82DF9D95DA961C // 5
+data8 0x3F866DF172D6372C // 6
+data8 0x3F898D79EF5EEDF0 // 7
+data8 0x3F8D22ADF3F9579D // 8
+data8 0x3F9024231D30C398 // 9
+data8 0x3F91F23A98897D4A // 10
+data8 0x3F93881A7B818F9E // 11
+data8 0x3F951F6E1E759E35 // 12
+data8 0x3F96F2BCE7ADC5B4 // 13
+data8 0x3F988D362CDF359E // 14
+data8 0x3F9A292BAF010982 // 15
+data8 0x3F9BC6A03117EB97 // 16
+data8 0x3F9D65967DE3AB09 // 17
+data8 0x3F9F061167FC31E8 // 18
+data8 0x3FA05409E4F7819C // 19
+data8 0x3FA125D0432EA20E // 20
+data8 0x3FA1F85D440D299B // 21
+data8 0x3FA2AD755749617D // 22
+data8 0x3FA381772A00E604 // 23
+data8 0x3FA45643E165A70B // 24
+data8 0x3FA52BDD034475B8 // 25
+data8 0x3FA5E3966B7E9295 // 26
+data8 0x3FA6BAAF47C5B245 // 27
+data8 0x3FA773B3E8C4F3C8 // 28
+data8 0x3FA84C51EBEE8D15 // 29
+data8 0x3FA906A6786FC1CB // 30
+data8 0x3FA9C197ABF00DD7 // 31
+data8 0x3FAA9C78712191F7 // 32
+data8 0x3FAB58C09C8D637C // 33
+data8 0x3FAC15A8BCDD7B7E // 34
+data8 0x3FACD331E2C2967C // 35
+data8 0x3FADB11ED766ABF4 // 36
+data8 0x3FAE70089346A9E6 // 37
+data8 0x3FAF2F96C6754AEE // 38
+data8 0x3FAFEFCA8D451FD6 // 39
+data8 0x3FB0585283764178 // 40
+data8 0x3FB0B913AAC7D3A7 // 41
+data8 0x3FB11A294F2569F6 // 42
+data8 0x3FB16B51A2696891 // 43
+data8 0x3FB1CD03ADACC8BE // 44
+data8 0x3FB22F0BDD7745F5 // 45
+data8 0x3FB2916ACA38D1E8 // 46
+data8 0x3FB2F4210DF7663D // 47
+data8 0x3FB346A6C3C49066 // 48
+data8 0x3FB3A9FEBC60540A // 49
+data8 0x3FB3FD0C10A3AA54 // 50
+data8 0x3FB46107D3540A82 // 51
+data8 0x3FB4C55DD16967FE // 52
+data8 0x3FB51940330C000B // 53
+data8 0x3FB56D620EE7115E // 54
+data8 0x3FB5D2ABCF26178E // 55
+data8 0x3FB6275AA5DEBF81 // 56
+data8 0x3FB68D4EAF26D7EE // 57
+data8 0x3FB6E28C5C54A28D // 58
+data8 0x3FB7380B9665B7C8 // 59
+data8 0x3FB78DCCC278E85B // 60
+data8 0x3FB7F50C2CF2557A // 61
+data8 0x3FB84B5FD5EAEFD8 // 62
+data8 0x3FB8A1F6BAB2B226 // 63
+data8 0x3FB8F8D144557BDF // 64
+data8 0x3FB94FEFDCD61D92 // 65
+data8 0x3FB9A752EF316149 // 66
+data8 0x3FB9FEFAE7611EE0 // 67
+data8 0x3FBA56E8325F5C87 // 68
+data8 0x3FBAAF1B3E297BB4 // 69
+data8 0x3FBB079479C372AD // 70
+data8 0x3FBB6054553B12F7 // 71
+data8 0x3FBBB95B41AB5CE6 // 72
+data8 0x3FBC12A9B13FE079 // 73
+data8 0x3FBC6C4017382BEA // 74
+data8 0x3FBCB41FBA42686D // 75
+data8 0x3FBD0E38CE73393F // 76
+data8 0x3FBD689B2193F133 // 77
+data8 0x3FBDC3472B1D2860 // 78
+data8 0x3FBE0C06300D528B // 79
+data8 0x3FBE6738190E394C // 80
+data8 0x3FBEC2B50D208D9B // 81
+data8 0x3FBF0C1C2B936828 // 82
+data8 0x3FBF68216C9CC727 // 83
+data8 0x3FBFB1F6381856F4 // 84
+data8 0x3FC00742AF4CE5F8 // 85
+data8 0x3FC02C64906512D2 // 86
+data8 0x3FC05AF1E63E03B4 // 87
+data8 0x3FC0804BEA723AA9 // 88
+data8 0x3FC0AF1FD6711527 // 89
+data8 0x3FC0D4B2A8805A00 // 90
+data8 0x3FC0FA5EF136A06C // 91
+data8 0x3FC1299A4FB3E306 // 92
+data8 0x3FC14F806253C3ED // 93
+data8 0x3FC175805D1587C1 // 94
+data8 0x3FC19B9A637CA295 // 95
+data8 0x3FC1CB5FC26EDE17 // 96
+data8 0x3FC1F1B4E65F2590 // 97
+data8 0x3FC218248B5DC3E5 // 98
+data8 0x3FC23EAED62ADC76 // 99
+data8 0x3FC26553EBD337BD // 100
+data8 0x3FC28C13F1B11900 // 101
+data8 0x3FC2BCAA14381386 // 102
+data8 0x3FC2E3A740B7800F // 103
+data8 0x3FC30ABFD8F333B6 // 104
+data8 0x3FC331F403985097 // 105
+data8 0x3FC35943E7A60690 // 106
+data8 0x3FC380AFAC6E7C07 // 107
+data8 0x3FC3A8377997B9E6 // 108
+data8 0x3FC3CFDB771C9ADB // 109
+data8 0x3FC3EDA90D39A5DF // 110
+data8 0x3FC4157EC09505CD // 111
+data8 0x3FC43D7113FB04C1 // 112
+data8 0x3FC4658030AD1CCF // 113
+data8 0x3FC48DAC404638F6 // 114
+data8 0x3FC4B5F56CBBB869 // 115
+data8 0x3FC4DE5BE05E7583 // 116
+data8 0x3FC4FCBC0776FD85 // 117
+data8 0x3FC525561E9256EE // 118
+data8 0x3FC54E0DF3198865 // 119
+data8 0x3FC56CAB7112BDE2 // 120
+data8 0x3FC59597BA735B15 // 121
+data8 0x3FC5BEA23A506FDA // 122
+data8 0x3FC5DD7E08DE382F // 123
+data8 0x3FC606BDD3F92355 // 124
+data8 0x3FC6301C518A501F // 125
+data8 0x3FC64F3770618916 // 126
+data8 0x3FC678CC14C1E2D8 // 127
+data8 0x3FC6981005ED2947 // 128
+data8 0x3FC6C1DB5F9BB336 // 129
+data8 0x3FC6E1488ECD2881 // 130
+data8 0x3FC70B4B2E7E41B9 // 131
+data8 0x3FC72AE209146BF9 // 132
+data8 0x3FC7551C81BD8DCF // 133
+data8 0x3FC774DD76CC43BE // 134
+data8 0x3FC79F505DB00E88 // 135
+data8 0x3FC7BF3BDE099F30 // 136
+data8 0x3FC7E9E7CAC437F9 // 137
+data8 0x3FC809FE4902D00D // 138
+data8 0x3FC82A2757995CBE // 139
+data8 0x3FC85525C625E098 // 140
+data8 0x3FC8757A79831887 // 141
+data8 0x3FC895E2058D8E03 // 142
+data8 0x3FC8C13437695532 // 143
+data8 0x3FC8E1C812EF32BE // 144
+data8 0x3FC9026F112197E8 // 145
+data8 0x3FC923294888880B // 146
+data8 0x3FC94EEA4B8334F3 // 147
+data8 0x3FC96FD1B639FC09 // 148
+data8 0x3FC990CCA66229AC // 149
+data8 0x3FC9B1DB33334843 // 150
+data8 0x3FC9D2FD740E6607 // 151
+data8 0x3FC9FF49EEDCB553 // 152
+data8 0x3FCA209A84FBCFF8 // 153
+data8 0x3FCA41FF1E43F02B // 154
+data8 0x3FCA6377D2CE9378 // 155
+data8 0x3FCA8504BAE0D9F6 // 156
+data8 0x3FCAA6A5EEEBEFE3 // 157
+data8 0x3FCAC85B878D7879 // 158
+data8 0x3FCAEA259D8FFA0B // 159
+data8 0x3FCB0C0449EB4B6B // 160
+data8 0x3FCB2DF7A5C50299 // 161
+data8 0x3FCB4FFFCA70E4D1 // 162
+data8 0x3FCB721CD17157E3 // 163
+data8 0x3FCB944ED477D4ED // 164
+data8 0x3FCBB695ED655C7D // 165
+data8 0x3FCBD8F2364AEC0F // 166
+data8 0x3FCBFB63C969F4FF // 167
+data8 0x3FCC1DEAC134D4E9 // 168
+data8 0x3FCC4087384F4F80 // 169
+data8 0x3FCC6339498F09E2 // 170
+data8 0x3FCC86010FFC076C // 171
+data8 0x3FCC9D3D065C5B42 // 172
+data8 0x3FCCC029375BA07A // 173
+data8 0x3FCCE32B66978BA4 // 174
+data8 0x3FCD0643AFD51404 // 175
+data8 0x3FCD29722F0DEA45 // 176
+data8 0x3FCD4CB70070FE44 // 177
+data8 0x3FCD6446AB3F8C96 // 178
+data8 0x3FCD87B0EF71DB45 // 179
+data8 0x3FCDAB31D1FE99A7 // 180
+data8 0x3FCDCEC96FDC888F // 181
+data8 0x3FCDE6908876357A // 182
+data8 0x3FCE0A4E4A25C200 // 183
+data8 0x3FCE2E2315755E33 // 184
+data8 0x3FCE461322D1648A // 185
+data8 0x3FCE6A0E95C7787B // 186
+data8 0x3FCE8E216243DD60 // 187
+data8 0x3FCEA63AF26E007C // 188
+data8 0x3FCECA74ED15E0B7 // 189
+data8 0x3FCEEEC692CCD25A // 190
+data8 0x3FCF070A36B8D9C1 // 191
+data8 0x3FCF2B8393E34A2D // 192
+data8 0x3FCF5014EF538A5B // 193
+data8 0x3FCF68833AF1B180 // 194
+data8 0x3FCF8D3CD9F3F04F // 195
+data8 0x3FCFA5C61ADD93E9 // 196
+data8 0x3FCFCAA8567EBA7A // 197
+data8 0x3FCFE34CC8743DD8 // 198
+data8 0x3FD0042BFD74F519 // 199
+data8 0x3FD016BDF6A18017 // 200
+data8 0x3FD023262F907322 // 201
+data8 0x3FD035CCED8D32A1 // 202
+data8 0x3FD042430E869FFC // 203
+data8 0x3FD04EBEC842B2E0 // 204
+data8 0x3FD06182E84FD4AC // 205
+data8 0x3FD06E0CB609D383 // 206
+data8 0x3FD080E60BEC8F12 // 207
+data8 0x3FD08D7E0D894735 // 208
+data8 0x3FD0A06CC96A2056 // 209
+data8 0x3FD0AD131F3B3C55 // 210
+data8 0x3FD0C01771E775FB // 211
+data8 0x3FD0CCCC3CAD6F4B // 212
+data8 0x3FD0D986D91A34A9 // 213
+data8 0x3FD0ECA9B8861A2D // 214
+data8 0x3FD0F972F87FF3D6 // 215
+data8 0x3FD106421CF0E5F7 // 216
+data8 0x3FD11983EBE28A9D // 217
+data8 0x3FD12661E35B785A // 218
+data8 0x3FD13345D2779D3B // 219
+data8 0x3FD146A6F597283A // 220
+data8 0x3FD15399E81EA83D // 221
+data8 0x3FD16092E5D3A9A6 // 222
+data8 0x3FD17413C3B7AB5E // 223
+data8 0x3FD1811BF629D6FB // 224
+data8 0x3FD18E2A47B46686 // 225
+data8 0x3FD19B3EBE1A4418 // 226
+data8 0x3FD1AEE9017CB450 // 227
+data8 0x3FD1BC0CED7134E2 // 228
+data8 0x3FD1C93712ABC7FF // 229
+data8 0x3FD1D66777147D3F // 230
+data8 0x3FD1EA3BD1286E1C // 231
+data8 0x3FD1F77BED932C4C // 232
+data8 0x3FD204C25E1B031F // 233
+data8 0x3FD2120F28CE69B1 // 234
+data8 0x3FD21F6253C48D01 // 235
+data8 0x3FD22CBBE51D60AA // 236
+data8 0x3FD240CE4C975444 // 237
+data8 0x3FD24E37F8ECDAE8 // 238
+data8 0x3FD25BA8215AF7FC // 239
+data8 0x3FD2691ECC29F042 // 240
+data8 0x3FD2769BFFAB2E00 // 241
+data8 0x3FD2841FC23952C9 // 242
+data8 0x3FD291AA1A384978 // 243
+data8 0x3FD29F3B0E15584B // 244
+data8 0x3FD2B3A0EE479DF7 // 245
+data8 0x3FD2C142842C09E6 // 246
+data8 0x3FD2CEEACCB7BD6D // 247
+data8 0x3FD2DC99CE82FF21 // 248
+data8 0x3FD2EA4F902FD7DA // 249
+data8 0x3FD2F80C186A25FD // 250
+data8 0x3FD305CF6DE7B0F7 // 251
+data8 0x3FD3139997683CE7 // 252
+data8 0x3FD3216A9BB59E7C // 253
+data8 0x3FD32F4281A3CEFF // 254
+data8 0x3FD33D2150110092 // 255
+LOCAL_OBJECT_END(log10f_data)
+
+
+// Code
+//==============================================================
.section .text
-.proc log10f#
-.align 32
-log10f:
-#ifdef _LIBC
-.global __ieee754_log10f
-.type __ieee754_log10f,@function
-__ieee754_log10f:
-#endif
-{ .mfi
- alloc r32=ar.pfs,1,15,4,0
- frcpa.s1 log_C,p9 = f1,f8
- cmp.eq.unc p7,p8 = r0, r0
-}
-{ .mfb
- addl log_AD_1 = @ltoff(log_table_1), gp
- fnorm.s1 log_NORM_f8 = f8
- br.sptk L(LOG_LOG10_X)
-}
-;;
-
-.endp log10f
-ASM_SIZE_DIRECTIVE(log10f)
-ASM_SIZE_DIRECTIVE(__ieee754_log10f)
-
-
-
-.section .text
-.proc logf#
-.align 32
-logf:
-#ifdef _LIBC
-.global __ieee754_logf
-.type __ieee754_logf,@function
-__ieee754_logf:
-#endif
+// logf has p13 true, p14 false
+// log10f has p14 true, p13 false
+GLOBAL_IEEE754_ENTRY(log10f)
{ .mfi
- alloc r32=ar.pfs,1,15,4,0
- frcpa.s1 log_C,p9 = f1,f8
- cmp.eq.unc p8,p7 = r0, r0
+ getf.exp GR_Exp = f8 // if x is unorm then must recompute
+ frcpa.s1 FR_RcpX,p0 = f1,f8
+ mov GR_05 = 0xFFFE // biased exponent of A2=0.5
}
+{ .mlx
+ addl GR_ad_T = @ltoff(log10f_data),gp
+ movl GR_A3 = 0x3FD5555555555555 // double precision memory
+ // representation of A3
+};;
{ .mfi
- addl log_AD_1 = @ltoff(log_table_1), gp
- fnorm.s1 log_NORM_f8 = f8
- nop.i 999
-}
-;;
-
-L(LOG_LOG10_X):
-
-{ .mfi
- getf.exp log_GR_signexp_f8 = f8 // If x unorm then must recompute
- fclass.m.unc p15,p0 = f8, 0x0b // Test for x=unorm
- mov log_GR_fff7 = 0xfff7
+ getf.sig GR_Sig = f8 // if x is unorm then must recompute
+ fclass.m p8,p0 = f8,9 // is x positive unorm?
+ sub GR_025 = GR_05,r0,1 // biased exponent of A4=0.25
}
+{ .mlx
+ ld8 GR_ad_T = [GR_ad_T]
+ movl GR_Ln2 = 0x3FD34413509F79FF // double precision memory
+ // representation of
+ // log(2)/ln(10)
+};;
{ .mfi
- ld8 log_AD_1 = [log_AD_1]
- fms.s1 log_w = f8,f1,f1
- mov log_GR_exp_17_ones = 0x1ffff
+ setf.d FR_A3 = GR_A3 // create A3
+ fcmp.eq.s1 p14,p13 = f0,f0 // set p14 to 1 for log10f
+ dep.z GR_xorg = GR_05,55,8 // 0x7F00000000000000 integer number
+ // whose bits are
+ // GR_xorg[63] = last bit of biased
+ // exponent of 255/256
+ // GR_xorg[62-0] = bits from 62 to 0
+ // of significand of 255/256
}
-;;
-
-{ .mmi
- getf.sig log_GR_significand_f8 = f8 // If x unorm then must recompute
- mov log_GR_exp_16_ones = 0xffff
- nop.i 999
-}
-;;
-
-{ .mmb
- adds log_AD_2 = 0x10, log_AD_1
- and log_GR_exp_f8 = log_GR_signexp_f8, log_GR_exp_17_ones
-(p15) br.cond.spnt L(LOG_DENORM)
-}
-;;
-
-L(LOG_COMMON):
-{.mfi
- ldfpd log_P3,log_P2 = [log_AD_1],16
- fclass.m.unc p6,p0 = f8, 0xc3 // Test for x=nan
- shl log_GR_index = log_GR_significand_f8,1
-}
-{.mfi
- sub log_GR_true_exp_f8 = log_GR_exp_f8, log_GR_exp_16_ones
- nop.f 999
- nop.i 999
-}
-;;
-
+{ .mib
+ setf.exp FR_A2 = GR_05 // create A2
+ sub GR_de = GR_Exp,GR_05 // biased_exponent_of_x - 0xFFFE
+ // needed for comparison with 0.5 and 2.0
+ br.cond.sptk logf_log10f_common
+};;
+GLOBAL_IEEE754_END(log10f)
+GLOBAL_IEEE754_ENTRY(logf)
{ .mfi
- ldfpd log_P1,log_inv_ln10 = [log_AD_2],16
- fclass.m.unc p11,p0 = f8, 0x21 // Test for x=+inf
- shr.u log_GR_index = log_GR_index,56
+ getf.exp GR_Exp = f8 // if x is unorm then must recompute
+ frcpa.s1 FR_RcpX,p0 = f1,f8
+ mov GR_05 = 0xFFFE // biased exponent of A2=-0.5
}
+{ .mlx
+ addl GR_ad_T = @ltoff(logf_data),gp
+ movl GR_A3 = 0x3FD5555555555555 // double precision memory
+ // representation of A3
+};;
{ .mfi
- setf.sig log_int_Nfloat = log_GR_true_exp_f8
- nop.f 999
- nop.i 999
+ getf.sig GR_Sig = f8 // if x is unorm then must recompute
+ fclass.m p8,p0 = f8,9 // is x positive unorm?
+ dep.z GR_xorg = GR_05,55,8 // 0x7F00000000000000 integer number
+ // whose bits are
+ // GR_xorg[63] = last bit of biased
+ // exponent of 255/256
+ // GR_xorg[62-0] = bits from 62 to 0
+ // of significand of 255/256
}
-;;
-
-
{ .mfi
- ldfd log_log2 = [log_AD_2],16
- fma.s1 log_wsq = log_w, log_w, f0
- nop.i 999
-}
-{ .mfb
- nop.m 999
-(p6) fma.s.s0 f8 = f8,f1,f0 // quietize nan result if x=nan
-(p6) br.ret.spnt b0 // Exit for x=nan
-}
-;;
-
-
+ ld8 GR_ad_T = [GR_ad_T]
+ nop.f 0
+ sub GR_025 = GR_05,r0,1 // biased exponent of A4=0.25
+};;
{ .mfi
- shladd log_AD_2 = log_GR_index,3,log_AD_2
- fcmp.eq.s1 p10,p0 = log_NORM_f8, f1 // Test for x=+1.0
- nop.i 999
+ setf.d FR_A3 = GR_A3 // create A3
+ fcmp.eq.s1 p13,p14 = f0,f0 // p13 - true for logf
+ sub GR_de = GR_Exp,GR_05 // biased_exponent_of_x - 0xFFFE
+ // needed for comparison with 0.5 and 2.0
}
-{ .mfb
- nop.m 999
- fms.s1 log_r = log_C,f8,f1
-(p11) br.ret.spnt b0 // Exit for x=+inf
-}
-;;
-
-
-{ .mmf
- nop.m 999
- nop.m 999
- fclass.m.unc p6,p0 = f8, 0x07 // Test for x=0
-}
-;;
-
-
-{ .mfb
- ldfd log_T = [log_AD_2]
-(p10) fmerge.s f8 = f0, f0
-(p10) br.ret.spnt b0 // Exit for x=1.0
-;;
-}
-
+{ .mlx
+ setf.exp FR_A2 = GR_05 // create A2
+ movl GR_Ln2 = 0x3FE62E42FEFA39EF // double precision memory
+ // representation of log(2)
+};;
+logf_log10f_common:
{ .mfi
- getf.exp log_GR_signexp_w = log_w
- fclass.m.unc p12,p0 = f8, 0x3a // Test for x neg norm, unorm, inf
- nop.i 999
-}
-;;
-
-{ .mmb
- nop.m 999
- nop.m 999
-(p6) br.cond.spnt L(LOG_ZERO_NEG) // Branch if x=0
-;;
+ setf.exp FR_A4 = GR_025 // create A4=0.25
+ fclass.m p9,p0 = f8,0x3A // is x < 0 (including negative unnormals)?
+ dep GR_x = GR_Exp,GR_Sig,63,1 // produce integer whose bits are
+ // GR_x[63] = GR_Exp[0]
+ // GR_x[62-0] = GR_Sig[62-0]
}
-
-
+{ .mib
+ sub GR_N = GR_Exp,GR_05,1 // unbiased exponent of x: GR_Exp - GR_05 - 1
+ cmp.gtu p6,p7 = 2,GR_de // is 0.5 <= x < 2.0?
+(p8) br.cond.spnt logf_positive_unorm // p8 was set at entry for positive unorm input
+};;
+logf_core:
{ .mfi
- and log_GR_exp_w = log_GR_exp_17_ones, log_GR_signexp_w
- nop.f 999
- nop.i 999
+ setf.sig FR_N = GR_N // copy unbiased exponent of x to the
+ // significand field of FR_N
+ fclass.m p10,p0 = f8,0x1E1 // is x NaN, NaT or +Inf?
+ dep.z GR_dx = GR_05,54,3 // 0x0180000000000000 - difference
+ // between our integer representations
+ // of 257/256 and 255/256
}
-{ .mfb
- nop.m 999
- fma.s1 log_rsq = log_r, log_r, f0
-(p12) br.cond.spnt L(LOG_ZERO_NEG) // Branch if x<0
-;;
-}
-
{ .mfi
- nop.m 999
- fma.s1 log_rp_p32 = log_P3, log_r, log_P2
- nop.i 999
-}
+ nop.m 0
+ nop.f 0
+ sub GR_x = GR_x,GR_xorg // difference between representations
+ // of x and 255/256
+};;
{ .mfi
- nop.m 999
- fma.s1 log_rp_q32 = log_P3, log_w, log_P2
- nop.i 999
-;;
+ ldfd FR_InvLn10 = [GR_ad_T],8
+ fcmp.eq.s1 p11,p0 = f8,f1 // is x equal to 1.0?
+ extr.u GR_Ind = GR_Sig,55,8 // get bits from 55 to 62 as index
}
-
+{ .mib
+ setf.d FR_Ln2 = GR_Ln2 // create log(2) or log10(2)
+(p6) cmp.gtu p6,p7 = GR_dx,GR_x // set p6 if 255/256 <= x < 257/256
+(p9) br.cond.spnt logf_negatives // jump if input argument is negative number
+};;
+// p6 is true if |x-1| < 1/256
+// p7 is true if |x-1| >= 1/256
+.pred.rel "mutex",p6,p7
{ .mfi
- nop.m 999
- fcvt.xf log_Nfloat = log_int_Nfloat
- nop.i 999 ;;
+ shladd GR_ad_T = GR_Ind,3,GR_ad_T // calculate address of T
+(p7) fms.s1 FR_r = FR_RcpX,f8,f1 // range reduction for |x-1|>=1/256
+ extr.u GR_Exp = GR_Exp,0,17 // exponent without sign
}
-
+{ .mfb
+ nop.m 0
+(p6) fms.s1 FR_r = f8,f1,f1 // range reduction for |x-1|<1/256
+(p10) br.cond.spnt logf_nan_nat_pinf // exit for NaN, NaT or +Inf
+};;
+{ .mfb
+ ldfd FR_T = [GR_ad_T] // load T
+(p11) fma.s.s0 f8 = f0,f0,f0
+(p11) br.ret.spnt b0 // exit for x = 1.0
+};;
+{ .mib
+ nop.m 0
+ cmp.eq p12,p0 = r0,GR_Exp // is x +/-0? (here it's quite enough
+ // only to compare exponent with 0
+ // because all unnormals already
+ // have been filtered)
+(p12) br.cond.spnt logf_zeroes // Branch if input argument is +/-0
+};;
{ .mfi
- nop.m 999
- fma.s1 log_rp_p10 = log_P1, log_r, f1
- nop.i 999
+ nop.m 0
+ fnma.s1 FR_A2 = FR_A2,FR_r,f1 // A2*r+1
+ nop.i 0
}
{ .mfi
- nop.m 999
- fma.s1 log_rp_q10 = log_P1, log_w, f1
- nop.i 999
-;;
-}
-
-// p13 <== large w log
-// p14 <== small w log
+ nop.m 0
+ fma.s1 FR_r2 = FR_r,FR_r,f0 // r^2
+ nop.i 0
+};;
{ .mfi
-(p8) cmp.ge.unc p13,p14 = log_GR_exp_w, log_GR_fff7
- fcmp.eq.s0 p6,p0 = f8,f0 // Sets flag on +denormal input
- nop.i 999
-;;
+ nop.m 0
+ fcvt.xf FR_N = FR_N // convert integer N in significand of FR_N
+ // to floating-point representation
+ nop.i 0
}
-
-// p10 <== large w log10
-// p11 <== small w log10
{ .mfi
-(p7) cmp.ge.unc p10,p11 = log_GR_exp_w, log_GR_fff7
- nop.f 999
- nop.i 999 ;;
-}
-
+ nop.m 0
+ fnma.s1 FR_A3 = FR_A4,FR_r,FR_A3 // A4*r+A3
+ nop.i 0
+};;
{ .mfi
- nop.m 999
- fma.s1 log_T_plus_Nlog2 = log_Nfloat,log_log2, log_T
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 FR_r = FR_r,FR_InvLn10,f0 // For log10f we have r/log(10)
+ nop.i 0
}
-
-
{ .mfi
- nop.m 999
- fma.s1 log_rp_p2 = log_rp_p32, log_rsq, log_rp_p10
- nop.i 999
-}
+ nop.m 0
+ nop.f 0
+ nop.i 0
+};;
{ .mfi
- nop.m 999
- fma.s1 log_rp_q2 = log_rp_q32, log_wsq, log_rp_q10
- nop.i 999
-;;
+ nop.m 0
+ fma.s1 FR_A2 = FR_A3,FR_r2,FR_A2 // (A4*r+A3)*r^2+(A2*r+1)
+ nop.i 0
}
-
-
-// small w, log <== p14
{ .mfi
- nop.m 999
-(p14) fma.s f8 = log_rp_q2, log_w, f0
- nop.i 999
-}
+ nop.m 0
+ fma.s1 FR_NxLn2pT = FR_N,FR_Ln2,FR_T // N*Ln2+T
+ nop.i 0
+};;
+.pred.rel "mutex",p6,p7
{ .mfi
- nop.m 999
-(p11) fma.s1 log_Q = log_rp_q2, log_w, f0
- nop.i 999 ;;
+ nop.m 0
+(p7) fma.s.s0 f8 = FR_A2,FR_r,FR_NxLn2pT // result for |x-1|>=1/256
+ nop.i 0
}
+{ .mfb
+ nop.m 0
+(p6) fma.s.s0 f8 = FR_A2,FR_r,f0 // result for |x-1|<1/256
+ br.ret.sptk b0
+};;
-
-// large w, log <== p13
-.pred.rel "mutex",p13,p10
+.align 32
+logf_positive_unorm:
{ .mfi
- nop.m 999
-(p13) fma.s f8 = log_rp_p2, log_r, log_T_plus_Nlog2
- nop.i 999
-}
+ nop.m 0
+(p8) fma.s0 f8 = f8,f1,f0 // Normalize & set D-flag
+ nop.i 0
+};;
{ .mfi
- nop.m 999
-(p10) fma.s1 log_Q = log_rp_p2, log_r, log_T_plus_Nlog2
- nop.i 999 ;;
-}
-
-
-// log10
-{ .mfb
- nop.m 999
-(p7) fma.s f8 = log_inv_ln10,log_Q,f0
- br.ret.sptk b0
-;;
-}
-
-
-L(LOG_DENORM):
-{ .mmi
- getf.exp log_GR_signexp_f8 = log_NORM_f8
- nop.m 999
- nop.i 999
-}
-;;
-{ .mmb
- getf.sig log_GR_significand_f8 = log_NORM_f8
- and log_GR_exp_f8 = log_GR_signexp_f8, log_GR_exp_17_ones
- br.cond.sptk L(LOG_COMMON)
-}
-;;
-
-L(LOG_ZERO_NEG):
-
-// qnan snan inf norm unorm 0 -+
-// 0 0 0 0 0 1 11 0x7
-// 0 0 1 1 1 0 10 0x3a
-
-// Save x (f8) in f10
+ getf.exp GR_Exp = f8 // recompute biased exponent
+ nop.f 0
+ cmp.ne p6,p7 = r0,r0 // p6 <- 0, p7 <- 1 because
+ // in case of unorm we are outside the
+ // interval [255/256; 257/256]
+};;
{ .mfi
- nop.m 999
- fmerge.s f10 = f8,f8
- nop.i 999 ;;
-}
-
-// p8 p9 means ln(+-0) = -inf
-// p7 p10 means log(+-0) = -inf
-
-// p13 means ln(-)
-// p14 means log(-)
-
+ getf.sig GR_Sig = f8 // recompute significand
+ nop.f 0
+ nop.i 0
+};;
+{ .mib
+ sub GR_N = GR_Exp,GR_05,1 // unbiased exponent N
+ nop.i 0
+ br.cond.sptk logf_core // return into main path
+};;
+.align 32
+logf_nan_nat_pinf:
{ .mfi
- nop.m 999
- fmerge.ns f6 = f1,f1 // Form -1.0
- nop.i 999 ;;
+ nop.m 0
+ fma.s.s0 f8 = f8,f1,f0 // set V-flag
+ nop.i 0
}
+{ .mfb
+ nop.m 0
+ nop.f 0
+ br.ret.sptk b0 // exit for NaN, NaT or +Inf
+};;
-// p9 means ln(+-0) = -inf
-// p10 means log(+-0) = -inf
-// Log(+-0) = -inf
-
-{ .mfi
- nop.m 999
-(p8) fclass.m.unc p9,p0 = f10, 0x07
- nop.i 999
-}
+.align 32
+logf_zeroes:
{ .mfi
- nop.m 999
-(p7) fclass.m.unc p10,p0 = f10, 0x07
- nop.i 999 ;;
+ nop.m 0
+ fmerge.s FR_X = f8,f8 // keep input argument for subsequent
+ // call of __libm_error_support#
+ nop.i 0
}
-
-
-// p13 ln(-)
-// p14 log(-)
-
-// Log(-inf, -normal, -unnormal) = QNAN indefinite
{ .mfi
- nop.m 999
-(p8) fclass.m.unc p13,p0 = f10, 0x3a
- nop.i 999
-}
+(p13) mov GR_TAG = 4 // set libm error in case of logf
+ fms.s1 FR_tmp = f0,f0,f1 // -1.0
+ nop.i 0
+};;
{ .mfi
- nop.m 999
-(p7) fclass.m.unc p14,p0 = f10, 0x3a
- nop.i 999 ;;
+ nop.m 0
+ frcpa.s0 f8,p0 = FR_tmp,f0 // log(+/-0) should be equal to -INF.
+ // We can get it using frcpa because it
+ // sets result to the IEEE-754 mandated
+ // quotient of FR_tmp/f0.
+ // As far as FR_tmp is -1 it'll be -INF
+ nop.i 0
}
+{ .mib
+(p14) mov GR_TAG = 10 // set libm error in case of log10f
+ nop.i 0
+ br.cond.sptk logf_libm_err
+};;
-
-.pred.rel "mutex",p9,p10
-{ .mfi
-(p9) mov log_GR_tag = 4
-(p9) frcpa f8,p11 = f6,f0
- nop.i 999
-}
+.align 32
+logf_negatives:
{ .mfi
-(p10) mov log_GR_tag = 10
-(p10) frcpa f8,p12 = f6,f0
- nop.i 999 ;;
-}
-
-.pred.rel "mutex",p13,p14
+(p13) mov GR_TAG = 5 // set libm error in case of logf
+ fmerge.s FR_X = f8,f8 // keep input argument for subsequent
+ // call of __libm_error_support#
+ nop.i 0
+};;
{ .mfi
-(p13) mov log_GR_tag = 5
-(p13) frcpa f8,p11 = f0,f0
- nop.i 999
-}
-{ .mfb
-(p14) mov log_GR_tag = 11
-(p14) frcpa f8,p12 = f0,f0
- br.cond.sptk __libm_error_region ;;
-}
-.endp logf
-ASM_SIZE_DIRECTIVE(logf)
-ASM_SIZE_DIRECTIVE(__ieee754_logf)
+(p14) mov GR_TAG = 11 // set libm error in case of log10f
+ frcpa.s0 f8,p0 = f0,f0 // log(negatives) should be equal to NaN.
+ // We can get it using frcpa because it
+ // sets result to the IEEE-754 mandated
+ // quotient of f0/f0 i.e. NaN.
+ nop.i 0
+};;
+.align 32
+logf_libm_err:
+{ .mmi
+ alloc r32 = ar.pfs,1,4,4,0
+ mov GR_Parameter_TAG = GR_TAG
+ nop.i 0
+};;
+GLOBAL_IEEE754_END(logf)
// Stack operations when calling error support.
// (1) (2) (3) (call) (4)
@@ -890,70 +1101,56 @@ ASM_SIZE_DIRECTIVE(__ieee754_logf)
// save ar.pfs save b0 restore gp
// save gp restore ar.pfs
-
-
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
-
-// (1)
{ .mfi
- add GR_Parameter_Y=-32,sp // Parameter 2 value
- nop.f 0
-.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
- nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
};;
-
-
-// (2)
{ .mmi
- stfs [GR_Parameter_Y] = f1,16 // STORE Parameter 2 on stack
- add GR_Parameter_X = 16,sp // Parameter 1 address
+ stfs [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0=b0 // Save b0
};;
-
.body
-// (3)
{ .mib
- stfs [GR_Parameter_X] = f10 // STORE Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ stfs [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
}
{ .mib
- stfs [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
- add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ stfs [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
-
{ .mmi
- nop.m 0
- nop.m 0
- add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp
};;
-
-// (4)
{ .mmi
- ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
+ ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
- add sp = 64,sp // Restore stack pointer
- mov b0 = GR_SAVE_B0 // Restore return address
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
- mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
- br.ret.sptk b0 // Return
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
-
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
+
diff --git a/sysdeps/ia64/fpu/e_logl.S b/sysdeps/ia64/fpu/e_logl.S
new file mode 100644
index 0000000000..ba6b55bb9c
--- /dev/null
+++ b/sysdeps/ia64/fpu/e_logl.S
@@ -0,0 +1,1198 @@
+.file "logl.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//*********************************************************************
+//
+// History:
+// 05/21/01 Extracted logl and log10l from log1pl.s file, and optimized
+// all paths.
+// 06/20/01 Fixed error tag for x=-inf.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align;
+// used data8 for long double table values
+//
+//*********************************************************************
+//
+//*********************************************************************
+//
+// Function: Combined logl(x) and log10l(x) where
+// logl(x) = ln(x), for double-extended precision x values
+// log10l(x) = log (x), for double-extended precision x values
+// 10
+//
+//*********************************************************************
+//
+// Resources Used:
+//
+// Floating-Point Registers: f8 (Input and Return Value)
+// f34-f76
+//
+// General Purpose Registers:
+// r32-r56
+// r53-r56 (Used to pass arguments to error handling routine)
+//
+// Predicate Registers: p6-p14
+//
+//*********************************************************************
+//
+// IEEE Special Conditions:
+//
+// Denormal fault raised on denormal inputs
+// Overflow exceptions cannot occur
+// Underflow exceptions raised when appropriate for log1p
+// (Error Handling Routine called for underflow)
+// Inexact raised when appropriate by algorithm
+//
+// logl(inf) = inf
+// logl(-inf) = QNaN
+// logl(+/-0) = -inf
+// logl(SNaN) = QNaN
+// logl(QNaN) = QNaN
+// logl(EM_special Values) = QNaN
+// log10l(inf) = inf
+// log10l(-inf) = QNaN
+// log10l(+/-0) = -inf
+// log10l(SNaN) = QNaN
+// log10l(QNaN) = QNaN
+// log10l(EM_special Values) = QNaN
+//
+//*********************************************************************
+//
+// Overview
+//
+// The method consists of two cases.
+//
+// If |X-1| < 2^(-7) use case log_near1;
+// else use case log_regular;
+//
+// Case log_near1:
+//
+// logl( 1 + X ) can be approximated by a simple polynomial
+// in W = X-1. This polynomial resembles the truncated Taylor
+// series W - W^2/2 + W^3/3 - ...
+//
+// Case log_regular:
+//
+// Here we use a table lookup method. The basic idea is that in
+// order to compute logl(Arg) for an argument Arg in [1,2), we
+// construct a value G such that G*Arg is close to 1 and that
+// logl(1/G) is obtainable easily from a table of values calculated
+// beforehand. Thus
+//
+// logl(Arg) = logl(1/G) + logl(G*Arg)
+// = logl(1/G) + logl(1 + (G*Arg - 1))
+//
+// Because |G*Arg - 1| is small, the second term on the right hand
+// side can be approximated by a short polynomial. We elaborate
+// this method in four steps.
+//
+// Step 0: Initialization
+//
+// We need to calculate logl( X ). Obtain N, S_hi such that
+//
+// X = 2^N * S_hi exactly
+//
+// where S_hi in [1,2)
+//
+// Step 1: Argument Reduction
+//
+// Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
+//
+// G := G_1 * G_2 * G_3
+// r := (G * S_hi - 1)
+//
+// These G_j's have the property that the product is exactly
+// representable and that |r| < 2^(-12) as a result.
+//
+// Step 2: Approximation
+//
+//
+// logl(1 + r) is approximated by a short polynomial poly(r).
+//
+// Step 3: Reconstruction
+//
+//
+// Finally, logl( X ) is given by
+//
+// logl( X ) = logl( 2^N * S_hi )
+// ~=~ N*logl(2) + logl(1/G) + logl(1 + r)
+// ~=~ N*logl(2) + logl(1/G) + poly(r).
+//
+// **** Algorithm ****
+//
+// Case log_near1:
+//
+// Here we compute a simple polynomial. To exploit parallelism, we split
+// the polynomial into two portions.
+//
+// W := X - 1
+// Wsq := W * W
+// W4 := Wsq*Wsq
+// W6 := W4*Wsq
+// Y_hi := W + Wsq*(P_1 + W*(P_2 + W*(P_3 + W*P_4)))
+// Y_lo := W6*(P_5 + W*(P_6 + W*(P_7 + W*P_8)))
+//
+// Case log_regular:
+//
+// We present the algorithm in four steps.
+//
+// Step 0. Initialization
+// ----------------------
+//
+// Z := X
+// N := unbiased exponent of Z
+// S_hi := 2^(-N) * Z
+//
+// Step 1. Argument Reduction
+// --------------------------
+//
+// Let
+//
+// Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63
+//
+// We obtain G_1, G_2, G_3 by the following steps.
+//
+//
+// Define X_0 := 1.d_1 d_2 ... d_14. This is extracted
+// from S_hi.
+//
+// Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated
+// to lsb = 2^(-4).
+//
+// Define index_1 := [ d_1 d_2 d_3 d_4 ].
+//
+// Fetch Z_1 := (1/A_1) rounded UP in fixed point with
+// fixed point lsb = 2^(-15).
+// Z_1 looks like z_0.z_1 z_2 ... z_15
+// Note that the fetching is done using index_1.
+// A_1 is actually not needed in the implementation
+// and is used here only to explain how the value
+// Z_1 is defined.
+//
+// Fetch G_1 := (1/A_1) truncated to 21 sig. bits.
+// floating pt. Again, fetching is done using index_1. A_1
+// explains how G_1 is defined.
+//
+// Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14)
+// = 1.0 0 0 0 d_5 ... d_14
+// This is accomplished by integer multiplication.
+// It is proved that X_1 indeed always begins
+// with 1.0000 in fixed point.
+//
+//
+// Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1
+// truncated to lsb = 2^(-8). Similar to A_1,
+// A_2 is not needed in actual implementation. It
+// helps explain how some of the values are defined.
+//
+// Define index_2 := [ d_5 d_6 d_7 d_8 ].
+//
+// Fetch Z_2 := (1/A_2) rounded UP in fixed point with
+// fixed point lsb = 2^(-15). Fetch done using index_2.
+// Z_2 looks like z_0.z_1 z_2 ... z_15
+//
+// Fetch G_2 := (1/A_2) truncated to 21 sig. bits.
+// floating pt.
+//
+// Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14)
+// = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14
+// This is accomplished by integer multiplication.
+// It is proved that X_2 indeed always begins
+// with 1.00000000 in fixed point.
+//
+//
+// Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1.
+// This is 2^(-14) + X_2 truncated to lsb = 2^(-13).
+//
+// Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ].
+//
+// Fetch G_3 := (1/A_3) truncated to 21 sig. bits.
+// floating pt. Fetch is done using index_3.
+//
+// Compute G := G_1 * G_2 * G_3.
+//
+// This is done exactly since each of G_j only has 21 sig. bits.
+//
+// Compute
+//
+// r := (G*S_hi - 1)
+//
+//
+// Step 2. Approximation
+// ---------------------
+//
+// This step computes an approximation to logl( 1 + r ) where r is the
+// reduced argument just obtained. It is proved that |r| <= 1.9*2^(-13);
+// thus logl(1+r) can be approximated by a short polynomial:
+//
+// logl(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5
+//
+//
+// Step 3. Reconstruction
+// ----------------------
+//
+// This step computes the desired result of logl(X):
+//
+// logl(X) = logl( 2^N * S_hi )
+// = N*logl(2) + logl( S_hi )
+// = N*logl(2) + logl(1/G) +
+// logl(1 + G*S_hi - 1 )
+//
+// logl(2), logl(1/G_j) are stored as pairs of (single,double) numbers:
+// log2_hi, log2_lo, log1byGj_hi, log1byGj_lo. The high parts are
+// single-precision numbers and the low parts are double precision
+// numbers. These have the property that
+//
+// N*log2_hi + SUM ( log1byGj_hi )
+//
+// is computable exactly in double-extended precision (64 sig. bits).
+// Finally
+//
+// Y_hi := N*log2_hi + SUM ( log1byGj_hi )
+// Y_lo := poly_hi + [ poly_lo +
+// ( SUM ( log1byGj_lo ) + N*log2_lo ) ]
+//
+
+RODATA
+.align 64
+
+// ************* DO NOT CHANGE THE ORDER OF THESE TABLES *************
+
+// P_8, P_7, P_6, P_5, P_4, P_3, P_2, and P_1
+
+LOCAL_OBJECT_START(Constants_P)
+data8 0xE3936754EFD62B15,0x00003FFB
+data8 0x8003B271A5E56381,0x0000BFFC
+data8 0x9249248C73282DB0,0x00003FFC
+data8 0xAAAAAA9F47305052,0x0000BFFC
+data8 0xCCCCCCCCCCD17FC9,0x00003FFC
+data8 0x8000000000067ED5,0x0000BFFD
+data8 0xAAAAAAAAAAAAAAAA,0x00003FFD
+data8 0xFFFFFFFFFFFFFFFE,0x0000BFFD
+LOCAL_OBJECT_END(Constants_P)
+
+// log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1
+
+LOCAL_OBJECT_START(Constants_Q)
+data8 0xB172180000000000,0x00003FFE
+data8 0x82E308654361C4C6,0x0000BFE2
+data8 0xCCCCCAF2328833CB,0x00003FFC
+data8 0x80000077A9D4BAFB,0x0000BFFD
+data8 0xAAAAAAAAAAABE3D2,0x00003FFD
+data8 0xFFFFFFFFFFFFDAB7,0x0000BFFD
+LOCAL_OBJECT_END(Constants_Q)
+
+// 1/ln10_hi, 1/ln10_lo
+
+LOCAL_OBJECT_START(Constants_1_by_LN10)
+data8 0xDE5BD8A937287195,0x00003FFD
+data8 0xD56EAABEACCF70C8,0x00003FBB
+LOCAL_OBJECT_END(Constants_1_by_LN10)
+
+
+// Z1 - 16 bit fixed
+
+LOCAL_OBJECT_START(Constants_Z_1)
+data4 0x00008000
+data4 0x00007879
+data4 0x000071C8
+data4 0x00006BCB
+data4 0x00006667
+data4 0x00006187
+data4 0x00005D18
+data4 0x0000590C
+data4 0x00005556
+data4 0x000051EC
+data4 0x00004EC5
+data4 0x00004BDB
+data4 0x00004925
+data4 0x0000469F
+data4 0x00004445
+data4 0x00004211
+LOCAL_OBJECT_END(Constants_Z_1)
+
+// G1 and H1 - IEEE single and h1 - IEEE double
+
+LOCAL_OBJECT_START(Constants_G_H_h1)
+data4 0x3F800000,0x00000000
+data8 0x0000000000000000
+data4 0x3F70F0F0,0x3D785196
+data8 0x3DA163A6617D741C
+data4 0x3F638E38,0x3DF13843
+data8 0x3E2C55E6CBD3D5BB
+data4 0x3F579430,0x3E2FF9A0
+data8 0xBE3EB0BFD86EA5E7
+data4 0x3F4CCCC8,0x3E647FD6
+data8 0x3E2E6A8C86B12760
+data4 0x3F430C30,0x3E8B3AE7
+data8 0x3E47574C5C0739BA
+data4 0x3F3A2E88,0x3EA30C68
+data8 0x3E20E30F13E8AF2F
+data4 0x3F321640,0x3EB9CEC8
+data8 0xBE42885BF2C630BD
+data4 0x3F2AAAA8,0x3ECF9927
+data8 0x3E497F3497E577C6
+data4 0x3F23D708,0x3EE47FC5
+data8 0x3E3E6A6EA6B0A5AB
+data4 0x3F1D89D8,0x3EF8947D
+data8 0xBDF43E3CD328D9BE
+data4 0x3F17B420,0x3F05F3A1
+data8 0x3E4094C30ADB090A
+data4 0x3F124920,0x3F0F4303
+data8 0xBE28FBB2FC1FE510
+data4 0x3F0D3DC8,0x3F183EBF
+data8 0x3E3A789510FDE3FA
+data4 0x3F088888,0x3F20EC80
+data8 0x3E508CE57CC8C98F
+data4 0x3F042108,0x3F29516A
+data8 0xBE534874A223106C
+LOCAL_OBJECT_END(Constants_G_H_h1)
+
+// Z2 - 16 bit fixed
+
+LOCAL_OBJECT_START(Constants_Z_2)
+data4 0x00008000
+data4 0x00007F81
+data4 0x00007F02
+data4 0x00007E85
+data4 0x00007E08
+data4 0x00007D8D
+data4 0x00007D12
+data4 0x00007C98
+data4 0x00007C20
+data4 0x00007BA8
+data4 0x00007B31
+data4 0x00007ABB
+data4 0x00007A45
+data4 0x000079D1
+data4 0x0000795D
+data4 0x000078EB
+LOCAL_OBJECT_END(Constants_Z_2)
+
+// G2 and H2 - IEEE single and h2 - IEEE double
+
+LOCAL_OBJECT_START(Constants_G_H_h2)
+data4 0x3F800000,0x00000000
+data8 0x0000000000000000
+data4 0x3F7F00F8,0x3B7F875D
+data8 0x3DB5A11622C42273
+data4 0x3F7E03F8,0x3BFF015B
+data8 0x3DE620CF21F86ED3
+data4 0x3F7D08E0,0x3C3EE393
+data8 0xBDAFA07E484F34ED
+data4 0x3F7C0FC0,0x3C7E0586
+data8 0xBDFE07F03860BCF6
+data4 0x3F7B1880,0x3C9E75D2
+data8 0x3DEA370FA78093D6
+data4 0x3F7A2328,0x3CBDC97A
+data8 0x3DFF579172A753D0
+data4 0x3F792FB0,0x3CDCFE47
+data8 0x3DFEBE6CA7EF896B
+data4 0x3F783E08,0x3CFC15D0
+data8 0x3E0CF156409ECB43
+data4 0x3F774E38,0x3D0D874D
+data8 0xBE0B6F97FFEF71DF
+data4 0x3F766038,0x3D1CF49B
+data8 0xBE0804835D59EEE8
+data4 0x3F757400,0x3D2C531D
+data8 0x3E1F91E9A9192A74
+data4 0x3F748988,0x3D3BA322
+data8 0xBE139A06BF72A8CD
+data4 0x3F73A0D0,0x3D4AE46F
+data8 0x3E1D9202F8FBA6CF
+data4 0x3F72B9D0,0x3D5A1756
+data8 0xBE1DCCC4BA796223
+data4 0x3F71D488,0x3D693B9D
+data8 0xBE049391B6B7C239
+LOCAL_OBJECT_END(Constants_G_H_h2)
+
+// G3 and H3 - IEEE single and h3 - IEEE double
+// 32 entries of 16 bytes (data4 G3,H3 then data8 h3), selected by GR_Index3
+LOCAL_OBJECT_START(Constants_G_H_h3)
+data4 0x3F7FFC00,0x38800100
+data8 0x3D355595562224CD
+data4 0x3F7FF400,0x39400480
+data8 0x3D8200A206136FF6
+data4 0x3F7FEC00,0x39A00640
+data8 0x3DA4D68DE8DE9AF0
+data4 0x3F7FE400,0x39E00C41
+data8 0xBD8B4291B10238DC
+data4 0x3F7FDC00,0x3A100A21
+data8 0xBD89CCB83B1952CA
+data4 0x3F7FD400,0x3A300F22
+data8 0xBDB107071DC46826
+data4 0x3F7FCC08,0x3A4FF51C
+data8 0x3DB6FCB9F43307DB
+data4 0x3F7FC408,0x3A6FFC1D
+data8 0xBD9B7C4762DC7872
+data4 0x3F7FBC10,0x3A87F20B
+data8 0xBDC3725E3F89154A
+data4 0x3F7FB410,0x3A97F68B
+data8 0xBD93519D62B9D392
+data4 0x3F7FAC18,0x3AA7EB86
+data8 0x3DC184410F21BD9D
+data4 0x3F7FA420,0x3AB7E101
+data8 0xBDA64B952245E0A6
+data4 0x3F7F9C20,0x3AC7E701
+data8 0x3DB4B0ECAABB34B8
+data4 0x3F7F9428,0x3AD7DD7B
+data8 0x3D9923376DC40A7E
+data4 0x3F7F8C30,0x3AE7D474
+data8 0x3DC6E17B4F2083D3
+data4 0x3F7F8438,0x3AF7CBED
+data8 0x3DAE314B811D4394
+data4 0x3F7F7C40,0x3B03E1F3
+data8 0xBDD46F21B08F2DB1
+data4 0x3F7F7448,0x3B0BDE2F
+data8 0xBDDC30A46D34522B
+data4 0x3F7F6C50,0x3B13DAAA
+data8 0x3DCB0070B1F473DB
+data4 0x3F7F6458,0x3B1BD766
+data8 0xBDD65DDC6AD282FD
+data4 0x3F7F5C68,0x3B23CC5C
+data8 0xBDCDAB83F153761A
+data4 0x3F7F5470,0x3B2BC997
+data8 0xBDDADA40341D0F8F
+data4 0x3F7F4C78,0x3B33C711
+data8 0x3DCD1BD7EBC394E8
+data4 0x3F7F4488,0x3B3BBCC6
+data8 0xBDC3532B52E3E695
+data4 0x3F7F3C90,0x3B43BAC0
+data8 0xBDA3961EE846B3DE
+data4 0x3F7F34A0,0x3B4BB0F4
+data8 0xBDDADF06785778D4
+data4 0x3F7F2CA8,0x3B53AF6D
+data8 0x3DCC3ED1E55CE212
+data4 0x3F7F24B8,0x3B5BA620
+data8 0xBDBA31039E382C15
+data4 0x3F7F1CC8,0x3B639D12
+data8 0x3D635A0B5C5AF197
+data4 0x3F7F14D8,0x3B6B9444
+data8 0xBDDCCB1971D34EFC
+data4 0x3F7F0CE0,0x3B7393BC
+data8 0x3DC7450252CD7ADA
+data4 0x3F7F04F0,0x3B7B8B6D
+data8 0xBDB68F177D7F2A42
+LOCAL_OBJECT_END(Constants_G_H_h3)
+
+
+// Floating Point Registers
+// Symbolic names only; several names deliberately share one register.
+FR_Input_X = f8 // x on entry; f8 also receives the final result
+
+FR_Y_hi = f34
+FR_Y_lo = f35
+
+FR_Scale = f36
+FR_X_Prime = f37
+FR_S_hi = f38
+FR_W = f39
+FR_G = f40
+
+FR_H = f41
+FR_wsq = f42
+FR_w4 = f43
+FR_h = f44
+FR_w6 = f45
+
+FR_G2 = f46
+FR_H2 = f47
+FR_poly_lo = f48
+FR_P8 = f49
+FR_poly_hi = f50
+
+FR_P7 = f51
+FR_h2 = f52
+FR_rsq = f53
+FR_P6 = f54
+FR_r = f55
+
+FR_log2_hi = f56
+FR_log2_lo = f57
+FR_p87 = f58 // p87, p876, p8765 share f58: each Horner step overwrites it
+FR_p876 = f58
+FR_p8765 = f58
+FR_float_N = f59
+FR_Q4 = f60
+
+FR_p43 = f61 // p43, p432, p4321 share f61: each Horner step overwrites it
+FR_p432 = f61
+FR_p4321 = f61
+FR_P4 = f62
+FR_G3 = f63
+FR_H3 = f64
+FR_h3 = f65
+
+FR_Q3 = f66
+FR_P3 = f67
+FR_Q2 = f68
+FR_P2 = f69
+FR_1LN10_hi = f70
+
+FR_Q1 = f71
+FR_P1 = f72
+FR_1LN10_lo = f73
+FR_P5 = f74
+FR_rcub = f75
+
+FR_Output_X_tmp = f76 // Same register as FR_RESULT below
+// Names used by __libm_error_region when storing its parameters
+FR_X = f8 // Parameter 1: same register as FR_Input_X
+FR_Y = f0 // Parameter 2 slot is filled from f0
+FR_RESULT = f76 // Parameter 3: tentative result built in FR_Output_X_tmp
+
+
+// General Purpose Registers
+// All names fall in r33-r56, inside the frame requested by the alloc at entry.
+GR_ad_p = r33
+GR_Index1 = r34
+GR_Index2 = r35
+GR_signif = r36
+GR_X_0 = r37
+GR_X_1 = r38
+GR_X_2 = r39
+GR_Z_1 = r40
+GR_Z_2 = r41
+GR_N = r42
+GR_Bias = r43
+GR_M = r44
+GR_Index3 = r45
+GR_ad_p2 = r46
+GR_exp_mask = r47
+GR_exp_2tom7 = r48
+GR_ad_ln10 = r49
+GR_ad_tbl_1 = r50
+GR_ad_tbl_2 = r51
+GR_ad_tbl_3 = r52
+GR_ad_q = r53
+GR_ad_z_1 = r54
+GR_ad_z_2 = r55
+GR_ad_z_3 = r56
+
+//
+// Added for unwind support
+// These reuse r50-r56; __libm_error_region does not use the table pointers.
+
+GR_SAVE_PFS = r50
+GR_SAVE_B0 = r51
+GR_SAVE_GP = r52
+GR_Parameter_X = r53 // r53-r56 are the four output registers of the alloc
+GR_Parameter_Y = r54
+GR_Parameter_RESULT = r55
+GR_Parameter_TAG = r56
+
+.section .text
+
+GLOBAL_IEEE754_ENTRY(logl)
+{ .mfi
+ alloc r32 = ar.pfs,0,21,4,0 // 0 inputs, 21 locals (r32-r52), 4 outputs (r53-r56)
+ fclass.m p6, p0 = FR_Input_X, 0x1E3 // Test for natval, nan, inf
+ cmp.eq p7, p14 = r0, r0 // Set p7 if logl
+}
+{ .mfb
+ addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp
+ fnorm.s1 FR_X_Prime = FR_Input_X // Normalize x
+ br.cond.sptk LOGL_BEGIN // Unpredicated: always joins code shared with log10l
+}
+;;
+
+GLOBAL_IEEE754_END(logl)
+
+GLOBAL_IEEE754_ENTRY(log10l)
+{ .mfi
+ alloc r32 = ar.pfs,0,21,4,0 // Same frame layout as the logl entry
+ fclass.m p6, p0 = FR_Input_X, 0x1E3 // Test for natval, nan, inf
+ cmp.ne p7, p14 = r0, r0 // Set p14 if log10l
+}
+{ .mfb
+ addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp
+ fnorm.s1 FR_X_Prime = FR_Input_X // Normalize x
+ nop.b 999 // No branch: execution falls through to LOGL_BEGIN
+}
+;;
+
+
+// Common code for logl and log10l (p7 set for logl, p14 set for log10l)
+LOGL_BEGIN:
+{ .mfi
+ ld8 GR_ad_z_1 = [GR_ad_z_1] // Get pointer to Constants_Z_1
+ fclass.m p10, p0 = FR_Input_X, 0x0b // Test for denormal
+ mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7
+}
+;;
+
+{ .mfb
+ getf.sig GR_signif = FR_Input_X // Get significand of x
+ fcmp.eq.s1 p9, p0 = FR_Input_X, f1 // Test for x=1.0
+(p6) br.cond.spnt LOGL_64_special // Branch for nan, inf, natval
+}
+;;
+
+{ .mfi
+ add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1
+ fcmp.lt.s1 p13, p0 = FR_Input_X, f0 // Test for x<0
+ add GR_ad_p = -0x100, GR_ad_z_1 // Point to Constants_P
+}
+{ .mib
+ add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
+ add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2
+(p10) br.cond.spnt LOGL_64_denormal // Branch for denormal
+}
+;;
+
+LOGL_64_COMMON:
+{ .mfi
+ add GR_ad_q = 0x080, GR_ad_p // Point to Constants_Q
+ fcmp.eq.s1 p8, p0 = FR_Input_X, f0 // Test for x=0
+ extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif
+}
+{ .mfb
+ add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3
+(p9) fma.s0 f8 = FR_Input_X, f0, f0 // If x=1, return +0.0
+(p9) br.ret.spnt b0 // Exit if x=1
+}
+;;
+
+{ .mfi
+ shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
+ fclass.nm p10, p0 = FR_Input_X, 0x1FF // Test for unsupported
+ extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of significand
+}
+{ .mfi
+ ldfe FR_P8 = [GR_ad_p],16 // Load P_8 for near1 path
+ fsub.s1 FR_W = FR_X_Prime, f1 // W = x - 1
+ add GR_ad_ln10 = 0x060, GR_ad_q // Point to Constants_1_by_LN10
+}
+;;
+
+{ .mfi
+ ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
+ nop.f 999
+ mov GR_exp_mask = 0x1FFFF // Create exponent mask
+}
+{ .mib
+ shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1
+ mov GR_Bias = 0x0FFFF // Create exponent bias
+(p13) br.cond.spnt LOGL_64_negative // Branch if x<0
+}
+;;
+
+{ .mfb
+ ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
+ fmerge.se FR_S_hi = f1,FR_X_Prime // Form |x|
+(p8) br.cond.spnt LOGL_64_zero // Branch if x=0
+}
+;;
+
+{ .mmb
+ getf.exp GR_N = FR_X_Prime // Get N = exponent of x
+ ldfd FR_h = [GR_ad_tbl_1] // Load h_1
+(p10) br.cond.spnt LOGL_64_unsupported // Branch for unsupported type
+}
+;;
+
+{ .mfi
+ ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
+ fcmp.eq.s0 p8, p0 = FR_Input_X, f0 // Dummy op to flag denormals
+ pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1
+}
+;;
+
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mmi
+ ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
+(p14) ldfe FR_1LN10_hi = [GR_ad_ln10],16 // If log10l, load 1/ln10_hi
+ sub GR_N = GR_N, GR_Bias // N = exponent of x with the bias removed
+}
+;;
+
+{ .mmi
+ ldfe FR_Q4 = [GR_ad_q],16 // Load Q4
+(p14) ldfe FR_1LN10_lo = [GR_ad_ln10] // If log10l, load 1/ln10_lo
+ nop.i 999
+}
+;;
+
+{ .mmi
+ ldfe FR_Q3 = [GR_ad_q],16 // Load Q3
+ setf.sig FR_float_N = GR_N // Put integer N into rightmost significand
+ nop.i 999
+}
+;;
+
+{ .mmi
+ getf.exp GR_M = FR_W // Get signexp of w = x - 1
+ ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+}
+;;
+
+{ .mmi
+ ldfe FR_Q1 = [GR_ad_q] // Load Q1
+ shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
+ add GR_ad_p2 = 0x30,GR_ad_p // Point to P_4
+}
+;;
+
+{ .mmi
+ ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
+ shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2
+ and GR_M = GR_exp_mask, GR_M // Get exponent of w = x - 1
+}
+;;
+
+{ .mmi
+ ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
+ cmp.lt p8, p9 = GR_M, GR_exp_2tom7 // Test |x-1| < 2^-7
+ nop.i 999
+}
+;;
+
+// Paths are merged.
+// p8 is for the near1 path: |x-1| < 2^-7
+// p9 is for regular path: |x-1| >= 2^-7
+
+{ .mmi
+ ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
+ nop.m 999
+ nop.i 999
+}
+;;
+
+{ .mmi
+(p8) ldfe FR_P7 = [GR_ad_p],16 // Load P_7 for near1 path
+(p8) ldfe FR_P4 = [GR_ad_p2],16 // Load P_4 for near1 path
+(p9) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
+}
+;;
+
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mmi
+(p8) ldfe FR_P6 = [GR_ad_p],16 // Load P_6 for near1 path
+(p8) ldfe FR_P3 = [GR_ad_p2],16 // Load P_3 for near1 path
+ nop.i 999
+}
+;;
+
+{ .mmf
+(p8) ldfe FR_P5 = [GR_ad_p],16 // Load P_5 for near1 path
+(p8) ldfe FR_P2 = [GR_ad_p2],16 // Load P_2 for near1 path
+(p8) fmpy.s1 FR_wsq = FR_W, FR_W // wsq = w * w for near1 path
+}
+;;
+
+{ .mmi
+(p8) ldfe FR_P1 = [GR_ad_p2],16 ;; // Load P_1 for near1 path
+ nop.m 999
+(p9) extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
+}
+;;
+
+{ .mfi
+(p9) shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3
+(p9) fcvt.xf FR_float_N = FR_float_N // Convert integer N to floating point
+ nop.i 999
+}
+;;
+
+{ .mfi
+(p9) ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
+ nop.f 999
+ nop.i 999
+}
+;;
+
+{ .mfi
+(p9) ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
+(p9) fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+(p9) fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
+ nop.i 999
+}
+;;
+
+{ .mmf
+ nop.m 999
+ nop.m 999
+(p9) fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p8) fmpy.s1 FR_w4 = FR_wsq, FR_wsq // w4 = w^4 for near1 path
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+(p8) fma.s1 FR_p87 = FR_W, FR_P8, FR_P7 // p87 = w * P8 + P7
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p8) fma.s1 FR_p43 = FR_W, FR_P4, FR_P3 // p43 = w * P4 + P3
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p9) fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+(p9) fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p9) fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+(p8) fmpy.s1 FR_w6 = FR_w4, FR_wsq // w6 = w^6 for near1 path
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p8) fma.s1 FR_p432 = FR_W, FR_p43, FR_P2 // p432 = w * p43 + P2
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+(p8) fma.s1 FR_p876 = FR_W, FR_p87, FR_P6 // p876 = w * p87 + P6
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p9) fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+(p9) fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi = N * log2_hi + H
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p9) fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h = N * log2_lo + h
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p8) fma.s1 FR_p4321 = FR_W, FR_p432, FR_P1 // p4321 = w * p432 + P1
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+(p8) fma.s1 FR_p8765 = FR_W, FR_p876, FR_P5 // p8765 = w * p876 + P5
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p9) fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+(p9) fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p8) fma.s1 FR_Y_lo = FR_wsq, FR_p4321, f0 // Y_lo = wsq * p4321
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+(p8) fma.s1 FR_Y_hi = FR_W, f1, f0 // Y_hi = w for near1 path
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p9) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo = poly_lo * r + Q2
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+(p9) fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p8) fma.s1 FR_Y_lo = FR_w6, FR_p8765,FR_Y_lo // Y_lo = w6 * p8765 + w2 * p4321
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p9) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1 * rsq + r
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p9) fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h // poly_lo = poly_lo*r^3 + h
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p9) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo = poly_hi + poly_lo
+ nop.i 999
+}
+;;
+
+// Remainder of code is common for near1 and regular paths
+{ .mfi
+ nop.m 999
+(p7) fadd.s0 f8 = FR_Y_lo,FR_Y_hi // If logl, result=Y_lo+Y_hi
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+(p14) fmpy.s1 FR_Output_X_tmp = FR_Y_lo,FR_1LN10_hi // If log10l, tmp = Y_lo * 1/ln10_hi
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p14) fma.s1 FR_Output_X_tmp = FR_Y_hi,FR_1LN10_lo,FR_Output_X_tmp // tmp += Y_hi * 1/ln10_lo
+ nop.i 999
+}
+;;
+
+{ .mfb
+ nop.m 999
+(p14) fma.s0 f8 = FR_Y_hi,FR_1LN10_hi,FR_Output_X_tmp // If log10l, result=Y_hi*1/ln10_hi+tmp
+ br.ret.sptk b0 // Common exit for 0 < x < inf
+}
+;;
+
+
+// Here if x=+-0
+LOGL_64_zero:
+//
+// If x=+-0 raise divide by zero and return -inf
+// Error tag is 0 for logl (p7) and 6 for log10l (p14).
+{ .mfi
+(p7) mov GR_Parameter_TAG = 0
+ fsub.s1 FR_Output_X_tmp = f0, f1 // tmp = 0 - 1 = -1.0
+ nop.i 999
+}
+;;
+
+{ .mfb
+(p14) mov GR_Parameter_TAG = 6
+ frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0 // -1.0/0: tentative result
+ br.cond.sptk __libm_error_region
+}
+;;
+
+LOGL_64_special:
+{ .mfi
+ nop.m 999
+ fclass.m.unc p8, p0 = FR_Input_X, 0x1E1 // Test for natval, nan, +inf
+ nop.i 999
+}
+;;
+
+//
+// For SNaN raise invalid and return QNaN.
+// For QNaN return QNaN. NOTE(review): a quiet NaN should not raise invalid.
+// For +Inf return +Inf.
+//
+{ .mfb
+ nop.m 999
+(p8) fmpy.s0 f8 = FR_Input_X, f1
+(p8) br.ret.sptk b0 // Return for natval, nan, +inf
+}
+;;
+
+//
+// For -Inf raise invalid and return QNaN.
+// Error tag is 1 for logl (p7) and 7 for log10l (p14).
+{ .mmi
+(p7) mov GR_Parameter_TAG = 1
+ nop.m 999
+ nop.i 999
+}
+;;
+
+{ .mfb
+(p14) mov GR_Parameter_TAG = 7
+ fmpy.s0 FR_Output_X_tmp = FR_Input_X, f0 // -inf * 0: tentative result
+ br.cond.sptk __libm_error_region
+}
+;;
+
+// Here if x denormal or unnormal
+LOGL_64_denormal:
+{ .mmi
+ getf.sig GR_signif = FR_X_Prime // Get significand of normalized input
+ nop.m 999
+ nop.i 999
+}
+;;
+
+{ .mmb
+ getf.exp GR_N = FR_X_Prime // Get exponent of normalized input
+ nop.m 999
+ br.cond.sptk LOGL_64_COMMON // Branch back to common code
+}
+;;
+
+LOGL_64_unsupported:
+//
+// Return generated NaN or other value.
+// Returns directly; this path does not go through __libm_error_region.
+{ .mfb
+ nop.m 999
+ fmpy.s0 f8 = FR_Input_X, f0 // Result is x * 0
+ br.ret.sptk b0
+}
+;;
+
+// Here if -inf < x < 0
+LOGL_64_negative:
+//
+// Deal with x < 0 in a special way - raise
+// invalid and produce QNaN indefinite.
+// Error tag is 1 for logl (p7) and 7 for log10l (p14).
+{ .mfi
+(p7) mov GR_Parameter_TAG = 1
+ frcpa.s0 FR_Output_X_tmp, p8 = f0, f0 // 0/0: tentative result
+ nop.i 999
+}
+;;
+
+{ .mib
+(p14) mov GR_Parameter_TAG = 7
+ nop.i 999
+ br.cond.sptk __libm_error_region
+}
+;;
+// Error exit below: stores x, f0 and the tentative result in a 64-byte frame,
+// calls __libm_error_support, then reloads f8 from the result slot.
+GLOBAL_IEEE754_END(log10l)
+LOCAL_LIBM_ENTRY(__libm_error_region)
+.prologue
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+{ .mmi
+ stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+.body
+{ .mib
+ stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
+}
+{ .mib
+ stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Back to where Parameter 2 was stored
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 999
+ nop.m 999
+ add GR_Parameter_RESULT = 48,sp // Recompute result slot address after the call
+};;
+{ .mmi
+ ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region#)
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_pow.S b/sysdeps/ia64/fpu/e_pow.S
index 56f7f078ba..11fae53d72 100644
--- a/sysdeps/ia64/fpu/e_pow.S
+++ b/sysdeps/ia64/fpu/e_pow.S
@@ -1,10 +1,10 @@
.file "pow.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,30 +35,41 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 2/03/00 Added p12 to definite over/under path. With odd power we did not
+// 02/02/00 Initial version
+// 02/03/00 Added p12 to definite over/under path. With odd power we did not
// maintain the sign of x in this path.
-// 4/04/00 Unwind support added
-// 4/19/00 pow(+-1,inf) now returns NaN
-// pow(+-val, +-inf) returns 0 or inf, but now does not call error support
+// 04/04/00 Unwind support added
+// 04/19/00 pow(+-1,inf) now returns NaN
+// pow(+-val, +-inf) returns 0 or inf, but now does not call error
+// support
// Added s1 to fcvt.fx because invalid flag was incorrectly set.
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
-// 9/07/00 Improved performance by eliminating bank conflicts and other stalls,
+// 09/07/00 Improved performance by eliminating bank conflicts and other stalls,
// and tweaking the critical path
-// 9/08/00 Per c99, pow(+-1,inf) now returns 1, and pow(+1,nan) returns 1
-// 9/28/00 Updated NaN**0 path
-// 1/20/01 Fixed denormal flag settings.
-// 2/12/01 Improved speed.
+// 09/08/00 Per c99, pow(+-1,inf) now returns 1, and pow(+1,nan) returns 1
+// 09/28/00 Updated NaN**0 path
+// 01/20/01 Fixed denormal flag settings.
+// 02/13/01 Improved speed.
+// 03/19/01 Reordered exp polynomial to improve speed and eliminate monotonicity
+// problem in round up, down, and to zero modes. Also corrected
+// overflow result when x negative, y odd in round up, down, zero.
+// 06/14/01 Added brace missing from bundle
+// 12/10/01 Corrected case where x negative, 2^52 <= |y| < 2^53, y odd integer.
+// 12/20/01 Fixed monotonicity problem in round to nearest.
+// 02/08/02 Fixed overflow/underflow cases that were not calling error support.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 08/29/02 Improved Itanium 2 performance
+// 09/21/02 Added branch for |y*log(x)|<2^-11 to fix monotonicity problems.
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
-// double pow(double)
-// float powf(float)
+// double pow(double x, double y)
//
// Overview of operation
//==============================================================
@@ -67,51 +78,51 @@
// 1. Log(x)
// 2. y Log(x)
// 3. exp(y log(x))
-//
+//
// This means we work with the absolute value of x and merge in the sign later.
// Log(x) = G + delta + r -rsq/2 + p
// G,delta depend on the exponent of x and table entries. The table entries are
// indexed by the exponent of x, called K.
-//
+//
// The G and delta come out of the reduction; r is the reduced x.
-//
+//
// B = frcpa(x)
// xB-1 is small means that B is the approximate inverse of x.
-//
+//
// Log(x) = Log( (1/B)(Bx) )
// = Log(1/B) + Log(Bx)
// = Log(1/B) + Log( 1 + (Bx-1))
-//
+//
// x = 2^K 1.x_1x_2.....x_52
+// B = frcpa(x) = 2^-K Cm
+// B= frcpa(x) = 2^-k Cm
// Log(1/B) = Log(1/(2^-K Cm))
// Log(1/B) = Log((2^K/ Cm))
// Log(1/B) = K Log(2) + Log(1/Cm)
-//
+//
// Log(x) = K Log(2) + Log(1/Cm) + Log( 1 + (Bx-1))
-//
+//
// If you take the significand of x, set the exponent to true 0, then Cm is
// the frcpa. We tabulate the Log(1/Cm) values. There are 256 of them.
// The frcpa table is indexed by 8 bits, the x_1 thru x_8.
// m = x_1x_2...x_8 is an 8-bit index.
-//
+//
// Log(1/Cm) = log(1/frcpa(1+m/256)) where m goes from 0 to 255.
-//
+//
// We tabulate as two doubles, T and t, where T + t is the value itself.
-//
+//
// Log(x) = (K Log(2)_hi + T) + (K Log(2)_lo + t) + Log( 1 + (Bx-1))
// Log(x) = G + delta + Log( 1 + (Bx-1))
-//
+//
// The Log( 1 + (Bx-1)) can be calculated as a series in r = Bx-1.
-//
+//
// Log( 1 + (Bx-1)) = r - rsq/2 + p
-//
+//
// Then,
-//
+//
// yLog(x) = yG + y delta + y(r-rsq/2) + yp
// yLog(x) = Z1 + e3 + Z2 + Z3 + (e2 + e3)
-//
-//
+//
+//
// exp(yLog(x)) = exp(Z1 + Z2 + Z3) exp(e1 + e2 + e3)
//
//
@@ -133,7 +144,7 @@
// exp(r) = exp(Z - N log2/128)
//
// r = s + d = (Z - N (log2/128)_hi) -N (log2/128)_lo
-// = Z - N (log2/128)
+// = Z - N (log2/128)
//
// Z = s+d +N (log2/128)
//
@@ -149,22 +160,22 @@
// n log2/128 = n_7n_6n_5 log2/8 + n_4n_3n_2n_1 log2/128
// n log2/128 = I2 log2/8 + I1 log2/128
//
-// N log2/128 = M log2 + I2 log2/8 + I1 log2/128
+// N log2/128 = M log2 + I2 log2/8 + I1 log2/128
//
// exp(Z) = exp(s) (1+d) exp(log(2^M) + log(2^I2/8) + log(2^I1/128))
// exp(Z) = exp(s) (1+d1) (1+d2)(2^M) 2^I2/8 2^I1/128
// exp(Z) = exp(s) f1 f2 (2^M) 2^I2/8 2^I1/128
//
// I1, I2 are table indices. Use a series for exp(s).
-// Then get exp(Z)
+// Then get exp(Z)
//
// exp(yLog(x)) = exp(Z1 + Z2 + Z3) exp(e1 + e2 + e3)
-// exp(yLog(x)) = exp(Z) exp(Z3) f3
-// exp(yLog(x)) = exp(Z)f3 exp(Z3)
-// exp(yLog(x)) = A exp(Z3)
+// exp(yLog(x)) = exp(Z) exp(Z3) f3
+// exp(yLog(x)) = exp(Z)f3 exp(Z3)
+// exp(yLog(x)) = A exp(Z3)
//
// We actually calculate exp(Z3) -1.
-// Then,
+// Then,
// exp(yLog(x)) = A + A( exp(Z3) -1)
//
@@ -175,142 +186,146 @@
// ==============
// The operation (K*log2_hi) must be exact. K is the true exponent of x.
// If we allow gradual underflow (denormals), K can be represented in 12 bits
-// (as a two's complement number). We assume 13 bits as an engineering precaution.
-//
+// (as a two's complement number). We assume 13 bits as an engineering
+// precaution.
+//
// +------------+----------------+-+
// | 13 bits | 50 bits | |
// +------------+----------------+-+
// 0 1 66
// 2 34
-//
+//
// So we want the lsb(log2_hi) to be 2^-50
// We get log2 as a quad-extended (15-bit exponent, 128-bit significand)
-//
+//
// 0 fffe b17217f7d1cf79ab c9e3b39803f2f6af (4...)
-//
+//
// Consider numbering the bits left to right, starting at 0 thru 127.
// Bit 0 is the 2^-1 bit; bit 49 is the 2^-50 bit.
-//
+//
// ...79ab
// 0111 1001 1010 1011
// 44
// 89
-//
-// So if we shift off the rightmost 14 bits, then (shift back only
+//
+// So if we shift off the rightmost 14 bits, then (shift back only
// the top half) we get
-//
+//
// 0 fffe b17217f7d1cf4000 e6af278ece600fcb dabc000000000000
-//
+//
// Put the right 64-bit significand in an FR register, convert to double;
// it is exact. Put the next 128 bits into a quad register and round to double.
// The true exponent of the low part is -51.
-//
+//
// hi is 0 fffe b17217f7d1cf4000
// lo is 0 ffcc e6af278ece601000
-//
+//
// Convert to double memory format and get
-//
+//
// hi is 0x3fe62e42fefa39e8
-// lo is 0x3cccd5e4f1d9cc02
-//
+// lo is 0x3cccd5e4f1d9cc02
+//
// log2_hi + log2_lo is an accurate value for log2.
-//
-//
+//
+//
// The T and t values
// ==================
// A similar method is used to generate the T and t values.
-//
+//
// K * log2_hi + T must be exact.
-//
+//
// Smallest T,t
// ----------
-// The smallest T,t is
+// The smallest T,t is
// T t
-// data8 0x3f60040155d58800, 0x3c93bce0ce3ddd81 log(1/frcpa(1+0/256))= +1.95503e-003
-//
+// 0x3f60040155d58800, 0x3c93bce0ce3ddd81 log(1/frcpa(1+0/256))= +1.95503e-003
+//
// The exponent is 0x3f6 (biased) or -9 (true).
// For the smallest T value, what we want is to clip the significand such that
-// when it is shifted right by 9, its lsb is in the bit for 2^-51. The 9 is the specific
-// for the first entry. In general, it is 0xffff - (biased 15-bit exponent).
+// when it is shifted right by 9, its lsb is in the bit for 2^-51. The 9 is
+// specific to the first entry. In general, it is 0xffff - (biased 15-bit
+// exponent).
-// Independently, what we have calculated is the table value as a quad precision number.
+// Independently, what we have calculated is the table value as a quad
+// precision number.
// Table entry 1 is
// 0 fff6 80200aaeac44ef38 338f77605fdf8000
-//
+//
// We store this quad precision number in a data structure that is
-// sign: 1
+// sign: 1
// exponent: 15
// significand_hi: 64 (includes explicit bit)
// significand_lo: 49
// Because the explicit bit is included, the significand is 113 bits.
-//
+//
// Consider significand_hi for table entry 1.
-//
-//
+//
+//
// +-+--- ... -------+--------------------+
// | |
// +-+--- ... -------+--------------------+
// 0 1 4444444455555555556666
// 2345678901234567890123
-//
+//
// Labeled as above, bit 0 is 2^0, bit 1 is 2^-1, etc.
// Bit 42 is 2^-42. If we shift to the right by 9, the bit in
// bit 42 goes in 51.
-//
+//
// So what we want to do is shift bits 43 thru 63 into significand_lo.
-// This is shifting bit 42 into bit 63, taking care to retain the shifted-off bits.
-// Then shifting (just with signficaand_hi) back into bit 42.
-//
-// The shift_value is 63-42 = 21. In general, this is
+// This is shifting bit 42 into bit 63, taking care to retain shifted-off bits.
+// Then shifting (just with significand_hi) back into bit 42.
+//
+// The shift_value is 63-42 = 21. In general, this is
// 63 - (51 -(0xffff - 0xfff6))
// For this example, it is
// 63 - (51 - 9) = 63 - 42 = 21
-//
-// This means we are shifting 21 bits into significand_lo. We must maintain more
-// that a 128-bit signficand not to lose bits. So before the shift we put the 128-bit
-// significand into a 256-bit signficand and then shift.
+//
+// This means we are shifting 21 bits into significand_lo. We must maintain more
+// than a 128-bit significand not to lose bits. So before the shift we put the
+// 128-bit significand into a 256-bit significand and then shift.
// The 256-bit significand has four parts: hh, hl, lh, and ll.
-//
+//
// Start off with
// hh hl lh ll
// <64> <49><15_0> <64_0> <64_0>
-//
+//
// After shift by 21 (then return for significand_hi),
// <43><21_0> <21><43> <6><58_0> <64_0>
-//
+//
// Take the hh part and convert to a double. There is no rounding here.
-// The conversion is exact. The true exponent of the high part is the same as the
-// true exponent of the input quad.
-//
-// We have some 64 plus significand bits for the low part. In this example, we have
-// 70 bits. We want to round this to a double. Put them in a quad and then do a quad fnorm.
-// For this example the true exponent of the low part is
+// The conversion is exact. The true exponent of the high part is the same as
+// the true exponent of the input quad.
+//
+// We have some 64 plus significand bits for the low part. In this example, we
+// have 70 bits. We want to round this to a double. Put them in a quad and then
+// do a quad fnorm.
+// For this example the true exponent of the low part is
// true_exponent_of_high - 43 = true_exponent_of_high - (64-21)
-// In general, this is
-// true_exponent_of_high - (64 - shift_value)
-//
-//
+// In general, this is
+// true_exponent_of_high - (64 - shift_value)
+//
+//
// Largest T,t
// ----------
// The largest T,t is
-// data8 0x3fe62643fecf9742, 0x3c9e3147684bd37d log(1/frcpa(1+255/256))= +6.92171e-001
-//
+// 0x3fe62643fecf9742, 0x3c9e3147684bd37d log(1/frcpa(1+255/256))=+6.92171e-001
+//
// Table entry 256 is
// 0 fffe b1321ff67cba178c 51da12f4df5a0000
-//
-// The shift value is
+//
+// The shift value is
// 63 - (51 -(0xffff - 0xfffe)) = 13
-//
-// The true exponent of the low part is
+//
+// The true exponent of the low part is
// true_exponent_of_high - (64 - shift_value)
// -1 - (64-13) = -52
// Biased as a double, this is 0x3cb
-//
-//
-//
+//
+//
+//
// So then lsb(T) must be >= 2^-51
// msb(Klog2_hi) <= 2^12
-//
+//
// +--------+---------+
// | 51 bits | <== largest T
// +--------+---------+
@@ -320,7 +335,6 @@
// +------------+----------------+-+
-
// Special Cases
//==============================================================
@@ -385,63 +399,67 @@
// X any Y =0 +1
-#include "libm_support.h"
-
// Assembly macros
//==============================================================
// integer registers used
-pow_AD_Tt = r33
-pow_GR_FFF7 = r34
-pow_GR_exp_Y = r34 // duplicate
-pow_GR_17ones = r35
-
-pow_AD_P = r36
-pow_AD_Q = r37
-pow_AD_tbl1 = r38
-pow_AD_tbl2 = r39
-pow_GR_exp_X = r40
-pow_GR_true_exp_X = r40 // duplicate
-
-pow_GR_offset = r41
-pow_GR_exp_Xm1 = r42
-pow_GR_sig_X = r43
-pow_GR_signexp_X = r44
-
-pow_GR_signexp_Xm1 = r46
-pow_GR_int_W1 = r47
-pow_GR_int_W2 = r48
-pow_GR_int_N = r49
-pow_GR_index1 = r50
-
-pow_GR_index2 = r51
-pow_AD_T1 = r52
-pow_AD_T2 = r53
-pow_GR_gt_ln = r53 // duplicate
-pow_int_GR_M = r54
-pow_GR_10033 = r55
-
-pow_GR_16ones = r56
-pow_GR_sig_int_Y = r57
-pow_GR_sign_Y_Gpr = r58
-pow_GR_17ones_m1 = r59
-pow_GR_one = r60
-pow_GR_sign_Y = r60
-
-pow_GR_signexp_Y_Gpr = r61
-pow_GR_exp_Y_Gpr = r62
-pow_GR_true_exp_Y_Gpr = r63
-pow_GR_signexp_Y = r64
-
-GR_SAVE_B0 = r65
-GR_SAVE_GP = r66
-GR_SAVE_PFS = r67
-
-GR_Parameter_X = r68
-GR_Parameter_Y = r69
-GR_Parameter_RESULT = r70
-pow_GR_tag = r71
+pow_GR_signexp_X = r14
+pow_GR_17ones = r15
+pow_AD_P = r16
+pow_GR_exp_2tom8 = r17
+pow_GR_sig_X = r18
+pow_GR_10033 = r19
+pow_GR_16ones = r20
+
+pow_AD_Tt = r21
+pow_GR_exp_X = r22
+pow_AD_Q = r23
+pow_GR_true_exp_X = r24
+pow_GR_y_zero = r25
+
+pow_GR_exp_Y = r26
+pow_AD_tbl1 = r27
+pow_AD_tbl2 = r28
+pow_GR_offset = r29
+pow_GR_exp_Xm1 = r30
+pow_GR_xneg_yodd = r31
+
+pow_GR_signexp_Xm1 = r35
+pow_GR_int_W1 = r36
+pow_GR_int_W2 = r37
+pow_GR_int_N = r38
+pow_GR_index1 = r39
+pow_GR_index2 = r40
+
+pow_AD_T1 = r41
+pow_AD_T2 = r42
+pow_int_GR_M = r43
+pow_GR_sig_int_Y = r44
+pow_GR_sign_Y_Gpr = r45
+
+pow_GR_17ones_m1 = r46
+pow_GR_one = r47
+pow_GR_sign_Y = r48
+pow_GR_signexp_Y_Gpr = r49
+pow_GR_exp_Y_Gpr = r50
+
+pow_GR_true_exp_Y_Gpr = r51
+pow_GR_signexp_Y = r52
+pow_GR_x_one = r53
+pow_GR_exp_2toM63 = r54
+pow_GR_big_pos = r55
+
+pow_GR_big_neg = r56
+
+GR_SAVE_B0 = r50
+GR_SAVE_GP = r51
+GR_SAVE_PFS = r52
+
+GR_Parameter_X = r53
+GR_Parameter_Y = r54
+GR_Parameter_RESULT = r55
+pow_GR_tag = r56
// floating point registers used
@@ -464,7 +482,8 @@ POW_log2_lo = f43
POW_r = f44
POW_Q0_half = f45
-POW_Q1 = f46
+POW_Q1 = f46
+POW_tmp = f47
POW_log2_hi = f48
POW_Q4 = f49
POW_P1 = f50
@@ -476,6 +495,7 @@ POW_Yrcub = f54
POW_log2_by_128_lo = f55
POW_v6 = f56
+POW_xsq = f57
POW_v4 = f58
POW_v2 = f59
POW_T = f60
@@ -484,6 +504,7 @@ POW_Tt = f61
POW_RSHF = f62
POW_v21ps = f63
POW_s4 = f64
+POW_twoV = f65
POW_U = f66
POW_G = f67
@@ -533,44 +554,45 @@ POW_1ps = f103
POW_A = f104
POW_es = f105
+POW_Xp1 = f106
POW_int_K = f107
POW_K = f108
POW_f123 = f109
POW_Gpr = f110
-POW_Y_Gpr = f111
+POW_Y_Gpr = f111
POW_int_Y = f112
+POW_abs_q = f114
+POW_2toM63 = f115
POW_float_int_Y = f116
POW_ftz_urm_f8 = f117
POW_wre_urm_f8 = f118
-POW_abs_A = f119
-POW_gt_pln = f120
+POW_big_neg = f119
+POW_big_pos = f120
-POW_xsq = f121
-
-POW_twoV = f122
-POW_Xp1 = f123
+POW_GY_Z2 = f121
+POW_pYrcub_e3 = f122
+POW_d = f123
+POW_d2 = f124
+POW_poly_d_hi = f121
+POW_poly_d_lo = f122
+POW_poly_d = f121
// Data tables
//==============================================================
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+RODATA
.align 16
-pow_table_P:
-ASM_TYPE_DIRECTIVE(pow_table_P,@object)
+LOCAL_OBJECT_START(pow_table_P)
data8 0x8000F7B249FF332D, 0x0000BFFC // P_5
data8 0xAAAAAAA9E7902C7F, 0x0000BFFC // P_3
data8 0x80000000000018E5, 0x0000BFFD // P_1
data8 0xb8aa3b295c17f0bc, 0x00004006 // inv_ln2_by_128
-
-
+//
+//
data8 0x3FA5555555554A9E // Q_2
data8 0x3F8111124F4DD9F9 // Q_3
data8 0x3FE0000000000000 // Q_0
@@ -580,20 +602,18 @@ data8 0x43e8000000000000 // Right shift constant for exp
data8 0xc9e3b39803f2f6af, 0x00003fb7 // ln2_by_128_lo
data8 0x0000000000000000 // pad to eliminate bank conflicts with pow_table_Q
data8 0x0000000000000000 // pad to eliminate bank conflicts with pow_table_Q
-ASM_SIZE_DIRECTIVE(pow_table_P)
+LOCAL_OBJECT_END(pow_table_P)
-pow_table_Q:
-ASM_TYPE_DIRECTIVE(pow_table_Q,@object)
+LOCAL_OBJECT_START(pow_table_Q)
data8 0x9249FE7F0DC423CF, 0x00003FFC // P_4
data8 0xCCCCCCCC4ED2BA7F, 0x00003FFC // P_2
data8 0xAAAAAAAAAAAAB505, 0x00003FFD // P_0
data8 0x3fe62e42fefa39e8, 0x3cccd5e4f1d9cc02 // log2 hi lo = +6.93147e-001
data8 0xb17217f7d1cf79ab, 0x00003ff7 // ln2_by_128_hi
-ASM_SIZE_DIRECTIVE(pow_table_Q)
+LOCAL_OBJECT_END(pow_table_Q)
-pow_Tt:
-ASM_TYPE_DIRECTIVE(pow_Tt,@object)
+LOCAL_OBJECT_START(pow_Tt)
data8 0x3f60040155d58800, 0x3c93bce0ce3ddd81 // log(1/frcpa(1+0/256))= +1.95503e-003
data8 0x3f78121214586a00, 0x3cb540e0a5cfc9bc // log(1/frcpa(1+1/256))= +5.87661e-003
data8 0x3f841929f9683200, 0x3cbdf1d57404da1f // log(1/frcpa(1+2/256))= +9.81362e-003
@@ -850,13 +870,12 @@ data8 0x3fe5f673c61a2ed0, 0x3caa385eef5f2789 // log(1/frcpa(1+252/256))= +6.863
data8 0x3fe6065bea385924, 0x3cb11624f165c5b4 // log(1/frcpa(1+253/256))= +6.88276e-001
data8 0x3fe6164bfa7cc068, 0x3cbad884f87073fa // log(1/frcpa(1+254/256))= +6.90222e-001
data8 0x3fe62643fecf9740, 0x3cb78c51da12f4df // log(1/frcpa(1+255/256))= +6.92171e-001
-ASM_SIZE_DIRECTIVE(pow_Tt)
+LOCAL_OBJECT_END(pow_Tt)
// Table 1 is 2^(index_1/128) where
// index_1 goes from 0 to 15
-pow_tbl1:
-ASM_TYPE_DIRECTIVE(pow_tbl1,@object)
+LOCAL_OBJECT_START(pow_tbl1)
data8 0x8000000000000000 , 0x00003FFF
data8 0x80B1ED4FD999AB6C , 0x00003FFF
data8 0x8164D1F3BC030773 , 0x00003FFF
@@ -873,13 +892,12 @@ data8 0x88980E8092DA8527 , 0x00003FFF
data8 0x8955EE03618E5FDD , 0x00003FFF
data8 0x8A14D575496EFD9A , 0x00003FFF
data8 0x8AD4C6452C728924 , 0x00003FFF
-ASM_SIZE_DIRECTIVE(pow_tbl1)
+LOCAL_OBJECT_END(pow_tbl1)
// Table 2 is 2^(index_1/8) where
// index_2 goes from 0 to 7
-pow_tbl2:
-ASM_TYPE_DIRECTIVE(pow_tbl2,@object)
+LOCAL_OBJECT_START(pow_tbl2)
data8 0x8000000000000000 , 0x00003FFF
data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF
data8 0x9837F0518DB8A96F , 0x00003FFF
@@ -888,402 +906,319 @@ data8 0xB504F333F9DE6484 , 0x00003FFF
data8 0xC5672A115506DADD , 0x00003FFF
data8 0xD744FCCAD69D6AF4 , 0x00003FFF
data8 0xEAC0C6E7DD24392F , 0x00003FFF
-ASM_SIZE_DIRECTIVE(pow_tbl2)
-
-.global pow
+LOCAL_OBJECT_END(pow_tbl2)
.section .text
-.proc pow
-.align 32
-
-pow:
+GLOBAL_LIBM_ENTRY(pow)
+// Get exponent of x. Will be used to calculate K.
{ .mfi
- alloc r32=ar.pfs,1,35,4,0
- fms.s1 POW_Xm1 = f8,f1,f1 // Will be used for r1 if x>0
- mov pow_GR_17ones = 0x1FFFF
+ getf.exp pow_GR_signexp_X = f8
+ fms.s1 POW_Xm1 = f8,f1,f1 // Will be used for r1 if x>0
+ mov pow_GR_17ones = 0x1FFFF
}
{ .mfi
-(p0) addl pow_AD_P = @ltoff(pow_table_P), gp
- fma.s1 POW_Xp1 = f8,f1,f1 // Will be used for r1 if x<0
+ addl pow_AD_P = @ltoff(pow_table_P), gp
+ fma.s1 POW_Xp1 = f8,f1,f1 // Will be used for r1 if x<0
nop.i 999
;;
}
-
-// Get exponent of x. Will be used to calculate K.
+// Get significand of x. Will be used to get index to fetch T, Tt.
{ .mfi
- getf.exp pow_GR_signexp_X = f8
- frcpa.s1 POW_B, p6 = f1,f8
+ getf.sig pow_GR_sig_X = f8
+ frcpa.s1 POW_B, p6 = f1,f8
nop.i 999
}
{ .mfi
ld8 pow_AD_P = [pow_AD_P]
- fma.s1 POW_NORM_X = f8,f1,f0
- mov pow_GR_FFF7 = 0xFFF7
+ fma.s1 POW_NORM_X = f8,f1,f0
+ mov pow_GR_exp_2tom8 = 0xFFF7
}
;;
-
-
-// Get significand of x. Will be used to get index to fetch T, Tt.
// p13 = TRUE ==> X is unorm
// DOUBLE 0x10033 exponent limit at which y is an integer
-// SINGLE 0x10016
{ .mfi
- getf.sig pow_GR_sig_X = f8
- fclass.m p13,p0 = f8, 0x0b // Test for x unorm
- addl pow_GR_10033 = 0x10033, r0
+ nop.m 999
+ fclass.m p13,p0 = f8, 0x0b // Test for x unorm
+ addl pow_GR_10033 = 0x10033, r0
}
{ .mfi
mov pow_GR_16ones = 0xFFFF
- fma.s1 POW_NORM_Y = f9,f1,f0
+ fma.s1 POW_NORM_Y = f9,f1,f0
nop.i 999
}
;;
-
// p14 = TRUE ==> X is ZERO
{ .mfi
adds pow_AD_Tt = pow_Tt - pow_table_P, pow_AD_P
- fclass.m p14,p15 = f8, 0x07
- and pow_GR_exp_X = pow_GR_signexp_X, pow_GR_17ones
+ fclass.m p14,p0 = f8, 0x07
+ and pow_GR_exp_X = pow_GR_signexp_X, pow_GR_17ones
}
{ .mfi
- adds pow_AD_Q = pow_table_Q - pow_table_P, pow_AD_P
+ adds pow_AD_Q = pow_table_Q - pow_table_P, pow_AD_P
nop.f 999
nop.i 999
}
;;
{ .mfi
- ldfe POW_P5 = [pow_AD_P], 16
- fcmp.lt.s1 p8,p9 = f8, f0 // Test for x<0
- shl pow_GR_offset = pow_GR_sig_X, 1
+ ldfe POW_P5 = [pow_AD_P], 16
+ fcmp.lt.s1 p8,p9 = f8, f0 // Test for x<0
+ nop.i 999
}
{ .mib
- ldfe POW_P4 = [pow_AD_Q], 16
- sub pow_GR_true_exp_X = pow_GR_exp_X, pow_GR_16ones
-(p13) br.cond.spnt L(POW_X_DENORM)
+ ldfe POW_P4 = [pow_AD_Q], 16
+ sub pow_GR_true_exp_X = pow_GR_exp_X, pow_GR_16ones
+(p13) br.cond.spnt POW_X_DENORM
}
;;
-
// Continue normal and denormal paths here
-L(POW_COMMON):
+POW_COMMON:
// p11 = TRUE ==> Y is a NAN
{ .mfi
- ldfe POW_P3 = [pow_AD_P], 16
- fclass.m.unc p11,p0 = f9, 0xc3
- shr.u pow_GR_offset = pow_GR_offset,56
+ ldfe POW_P3 = [pow_AD_P], 16
+ fclass.m p11,p0 = f9, 0xc3
+ nop.i 999
}
{ .mfi
- ldfe POW_P2 = [pow_AD_Q], 16
+ ldfe POW_P2 = [pow_AD_Q], 16
nop.f 999
- nop.i 999
+ mov pow_GR_y_zero = 0
}
;;
-
-
-// Compute xsq to decide later if |x|=1
-// p11 = TRUE ==> Y is a NaN
+// Note POW_Xm1 and POW_r1 are used interchangeably
{ .mfi
- setf.sig POW_int_K = pow_GR_true_exp_X
-(p15) fms.s1 POW_r = POW_B, POW_NORM_X,f1
- shladd pow_AD_Tt = pow_GR_offset, 4, pow_AD_Tt
+ alloc r32=ar.pfs,2,19,4,0
+ fms.s1 POW_r = POW_B, POW_NORM_X,f1
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p8) fnma.s1 POW_Xm1 = POW_Xp1,f1,f0
+ setf.sig POW_int_K = pow_GR_true_exp_X
+(p8) fnma.s1 POW_Xm1 = POW_Xp1,f1,f0
nop.i 999
}
;;
-
-
-// p12 = TRUE ==> X is ZERO and Y is ZERO
+// p12 = TRUE if Y is ZERO
+// Compute xsq to decide later if |x|=1
{ .mfi
- ldfe POW_P1 = [pow_AD_P], 16
-(p14) fclass.m.unc p12,p0 = f9, 0x07
- nop.i 999
+ ldfe POW_P1 = [pow_AD_P], 16
+ fclass.m p12,p0 = f9, 0x07
+ shl pow_GR_offset = pow_GR_sig_X, 1
}
{ .mfb
- ldfe POW_P0 = [pow_AD_Q], 16
+ ldfe POW_P0 = [pow_AD_Q], 16
fma.s1 POW_xsq = POW_NORM_X, POW_NORM_X, f0
-(p11) br.cond.spnt L(POW_Y_NAN)
+(p11) br.cond.spnt POW_Y_NAN // Branch if y=nan
}
;;
-
-.pred.rel "mutex",p8,p9
// Get exponent of |x|-1 to use in comparison to 2^-8
-{ .mmf
-(p8) getf.exp pow_GR_signexp_Xm1 = POW_Xp1
-(p9) getf.exp pow_GR_signexp_Xm1 = POW_Xm1
- fcvt.fx.s1 POW_int_Y = POW_NORM_Y
+{ .mfi
+ getf.exp pow_GR_signexp_Xm1 = POW_Xm1
+ fcvt.fx.s1 POW_int_Y = POW_NORM_Y
+ shr.u pow_GR_offset = pow_GR_offset,56
}
;;
-
// p11 = TRUE ==> X is a NAN
{ .mfi
ldfpd POW_log2_hi, POW_log2_lo = [pow_AD_Q], 16
- fclass.m.unc p11,p0 = f8, 0xc3
- nop.i 999
-}
-{ .mib
- ldfpd POW_T, POW_Tt = [pow_AD_Tt], 16
- nop.i 999
-(p12) br.cond.spnt L(POW_X_0_Y_0)
+ fclass.m p11,p0 = f8, 0xc3
+ shladd pow_AD_Tt = pow_GR_offset, 4, pow_AD_Tt
}
-;;
-
-
-// p14 = TRUE ==> X is zero
-// p15 = TRUE ==> X is zero AND Y is negative
-// p10 = TRUE ==> X is zero AND Y is >= zero
{ .mfi
ldfe POW_inv_log2_by_128 = [pow_AD_P], 16
-(p14) fcmp.lt.unc.s1 p15, p10 = f9,f0
- nop.i 999
+ fma.s1 POW_delta = f0,f0,f0 // delta=0 in case |x| near 1
+(p12) mov pow_GR_y_zero = 1
}
-{ .mfi
- nop.m 999
- nop.f 999
- and pow_GR_exp_Xm1 = pow_GR_signexp_Xm1, pow_GR_17ones
-}
-;;
-
-
-// Determine if we will use the |x| near 1 path (p6) or normal path (p7)
-// p12 = TRUE ==> X is a NAN and Y is a zero
-// p13 = TRUE ==> X is a NAN and Y is anything else
-{ .mfi
- getf.exp pow_GR_signexp_Y = POW_NORM_Y
-(p11) fclass.m.unc p12,p13 = f9, 0x07
- cmp.lt.unc p6,p7 = pow_GR_exp_Xm1, pow_GR_FFF7
-}
-{ .mfi
- ldfpd POW_Q2, POW_Q3 = [pow_AD_P], 16
- fma.s1 POW_rsq = POW_r, POW_r,f0
- nop.i 999
;;
-}
-// If on the x near 1 path, assign r1 to r and r1*r1 to rsq
{ .mfi
- ldfpd POW_Q0_half, POW_Q1 = [pow_AD_P], 16
-(p6) fma.s1 POW_r = POW_r1, f1, f0
- nop.i 999
+ ldfpd POW_Q2, POW_Q3 = [pow_AD_P], 16
+ fma.s1 POW_G = f0,f0,f0 // G=0 in case |x| near 1
+ and pow_GR_exp_Xm1 = pow_GR_signexp_Xm1, pow_GR_17ones
}
-{ .mfi
- nop.m 999
-(p6) fma.s1 POW_rsq = POW_r1, POW_r1, f0
- nop.i 999
;;
-}
-
+// Determine if we will use the |x| near 1 path (p6) or normal path (p7)
{ .mfi
- ldfpd POW_Q4, POW_RSHF = [pow_AD_P], 16
-(p7) fma.s1 POW_v6 = POW_r, POW_P5, POW_P4
- and pow_GR_exp_Y = pow_GR_signexp_Y, pow_GR_17ones
+ getf.exp pow_GR_signexp_Y = POW_NORM_Y
+ nop.f 999
+ cmp.lt p6,p7 = pow_GR_exp_Xm1, pow_GR_exp_2tom8
}
{ .mfb
- nop.m 999
-(p6) fma.s1 POW_v6 = POW_r1, POW_P5, POW_P4
-(p12) br.cond.spnt L(POW_X_NAN_Y_0)
+ ldfpd POW_T, POW_Tt = [pow_AD_Tt], 16
+ fma.s1 POW_rsq = POW_r, POW_r,f0
+(p11) br.cond.spnt POW_X_NAN // Branch if x=nan and y not nan
}
;;
-
+// If on the x near 1 path, assign r1 to r and r1*r1 to rsq
{ .mfi
- nop.m 999
-(p7) fma.s1 POW_v4 = POW_P3, POW_r, POW_P2
- andcm pow_GR_sign_Y = pow_GR_signexp_Y, pow_GR_17ones
+ ldfpd POW_Q0_half, POW_Q1 = [pow_AD_P], 16
+(p6) fma.s1 POW_r = POW_r1, f1, f0
+ nop.i 999
}
{ .mfb
nop.m 999
-(p6) fma.s1 POW_v4 = POW_P3, POW_r1, POW_P2
-(p12) br.cond.spnt L(POW_X_NAN_Y_0)
+(p6) fma.s1 POW_rsq = POW_r1, POW_r1, f0
+(p14) br.cond.spnt POW_X_0 // Branch if x zero and y not nan
}
;;
{ .mfi
- nop.m 999
- fcvt.xf POW_K = POW_int_K
+ ldfpd POW_Q4, POW_RSHF = [pow_AD_P], 16
+(p7) fma.s1 POW_v6 = POW_r, POW_P5, POW_P4
nop.i 999
}
-{ .mfb
- nop.m 999
-(p13) fma.d f8 = f8,f1,f0
-(p13) br.ret.spnt b0 // Exit if x nan, y anything but zero
+{ .mfi
+ mov pow_GR_exp_2toM63 = 0xffc0 // Exponent of 2^-63
+(p6) fma.s1 POW_v6 = POW_r1, POW_P5, POW_P4
+ nop.i 999
}
;;
-
-// p10 = TRUE ==> X is zero AND Y is positive
-// p8 = TRUE ==> X is zero AND Y is outside integer range (treat as even int)
-// return +0
-// p9 = TRUE ==> X is zero AND Y is within integer range (may not be integer)
+
{ .mfi
-(p10) cmp.gt.unc p8,p9 = pow_GR_exp_Y, pow_GR_10033
-(p6) fmerge.s POW_delta = f0,f0
+ setf.exp POW_2toM63 = pow_GR_exp_2toM63 // Form 2^-63 for test of q
+(p7) fma.s1 POW_v4 = POW_P3, POW_r, POW_P2
nop.i 999
}
{ .mfi
nop.m 999
-(p6) fma.s1 POW_G = f0,f0,f0
+(p6) fma.s1 POW_v4 = POW_P3, POW_r1, POW_P2
nop.i 999
}
;;
{ .mfi
- getf.sig pow_GR_sig_int_Y = POW_int_Y
- fnma.s1 POW_twoV = POW_NORM_Y, POW_rsq,f0
- nop.i 999
-}
-{ .mfi
nop.m 999
- fma.s1 POW_U = POW_NORM_Y,POW_r,f0
+ fcvt.xf POW_K = POW_int_K
nop.i 999
}
;;
{ .mfi
- ldfe POW_log2_by_128_lo = [pow_AD_P], 16
-(p6) fma.s1 POW_v2 = POW_P1, POW_r1, POW_P0
- nop.i 999
+ getf.sig pow_GR_sig_int_Y = POW_int_Y
+ fnma.s1 POW_twoV = POW_NORM_Y, POW_rsq,f0
+ and pow_GR_exp_Y = pow_GR_signexp_Y, pow_GR_17ones
}
-{ .mfi
- ldfe POW_log2_by_128_hi = [pow_AD_Q], 16
-(p7) fma.s1 POW_v2 = POW_P1, POW_r, POW_P0
- nop.i 999
+{ .mfb
+ andcm pow_GR_sign_Y = pow_GR_signexp_Y, pow_GR_17ones
+ fma.s1 POW_U = POW_NORM_Y,POW_r,f0
+(p12) br.cond.spnt POW_Y_0 // Branch if y=zero, x not zero or nan
}
;;
-
+// p11 = TRUE ==> X is NEGATIVE but not inf
{ .mfi
- nop.m 999
- fcvt.xf POW_float_int_Y = POW_int_Y
+ ldfe POW_log2_by_128_lo = [pow_AD_P], 16
+ fclass.m p11,p0 = f8, 0x1a
nop.i 999
}
{ .mfi
- nop.m 999
- fma.s1 POW_v3 = POW_v6, POW_rsq, POW_v4
- adds pow_AD_tbl1 = pow_tbl1 - pow_Tt, pow_AD_Q
+ ldfe POW_log2_by_128_hi = [pow_AD_Q], 16
+ fma.s1 POW_v2 = POW_P1, POW_r, POW_P0
+ nop.i 999
}
;;
{ .mfi
nop.m 999
-(p7) fma.s1 POW_delta = POW_K, POW_log2_lo, POW_Tt
+ fcvt.xf POW_float_int_Y = POW_int_Y
nop.i 999
}
{ .mfi
nop.m 999
-(p7) fma.s1 POW_G = POW_K, POW_log2_hi, POW_T
- adds pow_AD_tbl2 = pow_tbl2 - pow_tbl1, pow_AD_tbl1
+ fma.s1 POW_v3 = POW_v6, POW_rsq, POW_v4
+ adds pow_AD_tbl1 = pow_tbl1 - pow_Tt, pow_AD_Q
}
;;
-
{ .mfi
nop.m 999
- fms.s1 POW_e2 = POW_NORM_Y, POW_r, POW_U
+(p7) fma.s1 POW_delta = POW_K, POW_log2_lo, POW_Tt
nop.i 999
}
{ .mfi
nop.m 999
- fma.s1 POW_Z2 = POW_twoV, POW_Q0_half, POW_U
- nop.i 999
+(p7) fma.s1 POW_G = POW_K, POW_log2_hi, POW_T
+ adds pow_AD_tbl2 = pow_tbl2 - pow_tbl1, pow_AD_tbl1
}
;;
-// p11 = TRUE ==> X is NEGATIVE
-// p8 = TRUE ==> X is zero AND Y is outside intger range (treat as even int)
-// return +0
{ .mfi
nop.m 999
- fclass.m.unc p11,p0 = f8, 0x1a
+ fms.s1 POW_e2 = POW_NORM_Y, POW_r, POW_U
nop.i 999
}
-{ .mfb
+{ .mfi
nop.m 999
-(p8) fma.d f8 = f0,f0,f0
-(p8) br.ret.spnt b0
+ fma.s1 POW_Z2 = POW_twoV, POW_Q0_half, POW_U
+ nop.i 999
}
;;
-{ .mfi
+{ .mfi
nop.m 999
- fma.s1 POW_Yrcub = POW_rsq, POW_U, f0
+ fma.s1 POW_Yrcub = POW_rsq, POW_U, f0
nop.i 999
}
-{ .mfi
+{ .mfi
nop.m 999
- fma.s1 POW_p = POW_rsq, POW_v3, POW_v2
+ fma.s1 POW_p = POW_rsq, POW_v3, POW_v2
nop.i 999
}
;;
-
-// p11 = TRUE ==> X is NEGATIVE
-// p12 = TRUE ==> X is NEGATIVE AND Y already int
+// p11 = TRUE ==> X is NEGATIVE but not inf
+// p12 = TRUE ==> X is NEGATIVE AND Y already even int
// p13 = TRUE ==> X is NEGATIVE AND Y possible int
{ .mfi
nop.m 999
- fma.s1 POW_Z1 = POW_NORM_Y, POW_G, f0
-(p11) cmp.ge.unc p12,p13 = pow_GR_exp_Y, pow_GR_10033
+ fma.s1 POW_Z1 = POW_NORM_Y, POW_G, f0
+(p11) cmp.gt.unc p12,p13 = pow_GR_exp_Y, pow_GR_10033
}
{ .mfi
nop.m 999
- fma.s1 POW_e3 = POW_NORM_Y, POW_delta, f0
+ fma.s1 POW_Gpr = POW_G, f1, POW_r
nop.i 999
}
;;
-// p9 = TRUE ==> X is zero AND Y is within integer range (may not be integer)
-// p6 = TRUE ==> X is zero AND Y is an integer (may be even or odd)
-// p7 = TRUE ==> X is zero AND Y is NOT an integer, return +0
+// By adding RSHF (1.1000...*2^63) we put integer part in rightmost significand
{ .mfi
nop.m 999
-(p9) fcmp.eq.unc.s1 p6,p7 = POW_float_int_Y, POW_NORM_Y
+ fma.s1 POW_W2 = POW_Z2, POW_inv_log2_by_128, POW_RSHF
nop.i 999
}
-{ .mfi
+{ .mfi
nop.m 999
- fma.s1 POW_Gpr = POW_G, f1, POW_r
+ fms.s1 POW_UmZ2 = POW_U, f1, POW_Z2
nop.i 999
}
;;
-// By adding RSHF (1.1000...*2^63) we put integer part in rightmost significand
{ .mfi
nop.m 999
- fma.s1 POW_W2 = POW_Z2, POW_inv_log2_by_128, POW_RSHF
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fms.s1 POW_UmZ2 = POW_U, f1, POW_Z2
+ fma.s1 POW_e3 = POW_NORM_Y, POW_delta, f0
nop.i 999
}
;;
-
-// If x=0 and y>0, test y and flag denormal
-// p6 = TRUE ==> X is zero AND Y is an integer (may be even or odd)
-// p8 = TRUE ==> X is zero AND Y is an odd integer
-// p9 = TRUE ==> X is zero AND Y is an even integer
{ .mfi
nop.m 999
-(p10) fcmp.eq.s0 p15,p0 = f9,f0
-(p6) tbit.nz.unc p8,p9 = pow_GR_sig_int_Y,0
+ fma.s1 POW_Z3 = POW_p, POW_Yrcub, f0
+ nop.i 999
}
{ .mfi
nop.m 999
- fma.s1 POW_Z3 = POW_p, POW_Yrcub, f0
+ fma.s1 POW_GY_Z2 = POW_G, POW_NORM_Y, POW_Z2
nop.i 999
}
;;
@@ -1291,7 +1226,7 @@ L(POW_COMMON):
// By adding RSHF (1.1000...*2^63) we put integer part in rightmost significand
{ .mfi
nop.m 999
- fms.s1 POW_e1 = POW_NORM_Y, POW_G, POW_Z1
+ fms.s1 POW_e1 = POW_NORM_Y, POW_G, POW_Z1
nop.i 999
}
{ .mfi
@@ -1301,81 +1236,60 @@ L(POW_COMMON):
}
;;
+// p13 = TRUE ==> X is NEGATIVE AND Y possible int
+// p10 = TRUE ==> X is NEG and Y is an int
+// p12 = TRUE ==> X is NEG and Y is not an int
{ .mfi
nop.m 999
-(p7) fma.d f8 = f0,f0,f0 // Result +0 if x zero and y not integer
- nop.i 999
+(p13) fcmp.eq.unc.s1 p10,p12 = POW_float_int_Y, POW_NORM_Y
+ mov pow_GR_xneg_yodd = 0
}
-{ .mfb
+{ .mfi
nop.m 999
- fma.s1 POW_Y_Gpr = POW_NORM_Y, POW_Gpr, f0
-(p8) br.ret.spnt b0 // Exit if x zero and y odd integer
+ fma.s1 POW_Y_Gpr = POW_NORM_Y, POW_Gpr, f0
+ nop.i 999
}
;;
// By subtracting RSHF we get rounded integer POW_N2float
-// p15 = TRUE ==> X_0_Y_NEG
{ .mfi
nop.m 999
fms.s1 POW_N2float = POW_W2, f1, POW_RSHF
nop.i 999
}
-{ .mfb
+{ .mfi
nop.m 999
- fma.s1 POW_UmZ2pV = POW_twoV,POW_Q0_half,POW_UmZ2
-(p15) br.cond.spnt L(POW_X_0_Y_NEG)
+ fma.s1 POW_UmZ2pV = POW_twoV,POW_Q0_half,POW_UmZ2
+ nop.i 999
}
;;
-
-
{ .mfi
nop.m 999
- fma.s1 POW_Z3sq = POW_Z3, POW_Z3, f0
+ fma.s1 POW_Z3sq = POW_Z3, POW_Z3, f0
nop.i 999
}
-{ .mfb
+{ .mfi
nop.m 999
- fma.s1 POW_v4 = POW_Z3, POW_Q3, POW_Q2
-(p7) br.ret.spnt b0 // Exit if x zero and y not an integer
+ fma.s1 POW_v4 = POW_Z3, POW_Q3, POW_Q2
+ nop.i 999
}
;;
-
-
// Extract rounded integer from rightmost significand of POW_W2
// By subtracting RSHF we get rounded integer POW_N1float
{ .mfi
- getf.sig pow_GR_int_W2 = POW_W2
+ getf.sig pow_GR_int_W2 = POW_W2
fms.s1 POW_N1float = POW_W1, f1, POW_RSHF
nop.i 999
}
{ .mfi
nop.m 999
- fma.s1 POW_v2 = POW_Z3, POW_Q1, POW_Q0_half
+ fma.s1 POW_v2 = POW_Z3, POW_Q1, POW_Q0_half
nop.i 999
}
;;
-
-
-
-// p13 = TRUE ==> X is NEGATIVE AND Y possible int
-// p10 = TRUE ==> X is NEG and Y is an int
-// p12 = TRUE ==> X is NEG and Y is not an int
-{ .mfi
- nop.m 999
-(p13) fcmp.eq.unc.s1 p10,p12 = POW_float_int_Y, POW_NORM_Y
- nop.i 999
-}
-{ .mfb
- nop.m 999
-(p9) fma.d f8 = f0,f0,f0 // Result +0 if x zero and y even integer
-(p9) br.ret.spnt b0 // Exit if x zero and y even integer
-}
-;;
-
-
{ .mfi
nop.m 999
fnma.s1 POW_s2 = POW_N2float, POW_log2_by_128_hi, POW_Z2
@@ -1383,7 +1297,7 @@ L(POW_COMMON):
}
{ .mfi
nop.m 999
- fma.s1 POW_e2 = POW_e2,f1,POW_UmZ2pV
+ fma.s1 POW_e2 = POW_e2,f1,POW_UmZ2pV
nop.i 999
}
;;
@@ -1391,278 +1305,283 @@ L(POW_COMMON):
// Extract rounded integer from rightmost significand of POW_W1
// Test if x inf
{ .mfi
- getf.sig pow_GR_int_W1 = POW_W1
- fclass.m.unc p15,p0 = POW_NORM_X, 0x23
+ getf.sig pow_GR_int_W1 = POW_W1
+ fclass.m p15,p0 = POW_NORM_X, 0x23
nop.i 999
}
{ .mfb
nop.m 999
fnma.s1 POW_f2 = POW_N2float, POW_log2_by_128_lo, f1
-(p12) br.cond.spnt L(POW_X_NEG_Y_NONINT) // Branch if x neg, y not integer
+(p12) br.cond.spnt POW_X_NEG_Y_NONINT // Branch if x neg, y not integer
}
;;
+// p11 = TRUE ==> X is +1.0
// p12 = TRUE ==> X is NEGATIVE AND Y is an odd integer
{ .mfi
- getf.exp pow_GR_signexp_Y_Gpr = POW_Y_Gpr
- fma.s1 POW_v3 = POW_Z3sq, POW_Q4, POW_v4
-(p10) tbit.nz.unc p12,p0 = pow_GR_sig_int_Y,0
+ getf.exp pow_GR_signexp_Y_Gpr = POW_Y_Gpr
+ fcmp.eq.s1 p11,p0 = POW_NORM_X, f1
+(p10) tbit.nz.unc p12,p0 = pow_GR_sig_int_Y,0
+}
+{ .mfi
+ nop.m 999
+ fma.s1 POW_v3 = POW_Z3sq, POW_Q4, POW_v4
+ nop.i 999
}
;;
-
{ .mfi
- add pow_GR_int_N = pow_GR_int_W1, pow_GR_int_W2
+ nop.m 999
fnma.s1 POW_f1 = POW_N1float, POW_log2_by_128_lo, f1
nop.i 999
}
{ .mfb
nop.m 999
fnma.s1 POW_s1 = POW_N1float, POW_log2_by_128_hi, POW_Z1
-(p15) br.cond.spnt L(POW_X_INF)
+(p15) br.cond.spnt POW_X_INF
}
;;
-
// Test x and y and flag denormal
{ .mfi
- and pow_GR_index1 = 0x0f, pow_GR_int_N
+ nop.m 999
fcmp.eq.s0 p15,p0 = f8,f9
- shr r2 = pow_GR_int_N, 7
+ nop.i 999
}
{ .mfi
- and pow_GR_exp_Y_Gpr = pow_GR_signexp_Y_Gpr, pow_GR_17ones
- nop.f 999
- and pow_GR_index2 = 0x70, pow_GR_int_N
+ nop.m 999
+ fma.s1 POW_pYrcub_e3 = POW_p, POW_Yrcub, POW_e3
+ nop.i 999
}
;;
-
-
{ .mfi
- shladd pow_AD_T1 = pow_GR_index1, 4, pow_AD_tbl1
+ nop.m 999
fcmp.eq.s1 p7,p0 = POW_NORM_Y, f1 // Test for y=1.0
- sub pow_GR_true_exp_Y_Gpr = pow_GR_exp_Y_Gpr, pow_GR_16ones
+ nop.i 999
}
{ .mfi
- addl pow_int_GR_M = 0xFFFF, r2
- fma.s1 POW_e12 = POW_e1,f1,POW_e2
- add pow_AD_T2 = pow_AD_tbl2, pow_GR_index2
+ nop.m 999
+ fma.s1 POW_e12 = POW_e1,f1,POW_e2
+ nop.i 999
}
;;
-
-{ .mmi
- ldfe POW_T1 = [pow_AD_T1],16
- setf.exp POW_2M = pow_int_GR_M
- andcm pow_GR_sign_Y_Gpr = pow_GR_signexp_Y_Gpr, pow_GR_17ones
+{ .mfi
+ add pow_GR_int_N = pow_GR_int_W1, pow_GR_int_W2
+(p11) fma.d.s0 f8 = f1,f1,f0 // If x=1, result is +1
+ nop.i 999
+}
+{ .mib
+(p12) mov pow_GR_xneg_yodd = 1
+ nop.i 999
+(p11) br.ret.spnt b0 // Early exit if x=1.0, result is +1
}
;;
-
-{ .mfb
- ldfe POW_T2 = [pow_AD_T2],16
- fma.s1 POW_q = POW_Z3sq, POW_v3, POW_v2
+{ .mfi
+ and pow_GR_index1 = 0x0f, pow_GR_int_N
+ fma.s1 POW_q = POW_Z3sq, POW_v3, POW_v2
+ shr pow_int_GR_M = pow_GR_int_N, 7 // M = N/128
+}
+{ .mib
+ and pow_GR_index2 = 0x70, pow_GR_int_N
+ cmp.eq p6, p0 = pow_GR_xneg_yodd, r0
(p7) br.ret.spnt b0 // Early exit if y=1.0, result is x
}
;;
-
-// double: p8 TRUE ==> |Y(G + r)| >= 10
-// single: p8 TRUE ==> |Y(G + r)| >= 7
-
-// double
-// -2^10 -2^9 2^9 2^10
-// -----+-----+----+ ... +-----+-----+-----
-// p8 | p9 | p8
-// | | p10 | |
-// single
-// -2^7 -2^6 2^6 2^7
-// -----+-----+----+ ... +-----+-----+-----
-// p8 | p9 | p8
-// | | p10 | |
-
-
{ .mfi
-(p0) cmp.le.unc p8,p9 = 10, pow_GR_true_exp_Y_Gpr
- fma.s1 POW_s = POW_s1, f1, POW_s2
- nop.i 999
+ shladd pow_AD_T1 = pow_GR_index1, 4, pow_AD_tbl1
+ fma.s1 POW_s = POW_s1, f1, POW_s2
+ add pow_int_GR_M = pow_GR_16ones, pow_int_GR_M
}
{ .mfi
- nop.m 999
- fma.s1 POW_f12 = POW_f1, POW_f2,f0
- nop.i 999
+ add pow_AD_T2 = pow_AD_tbl2, pow_GR_index2
+ fma.s1 POW_f12 = POW_f1, POW_f2,f0
+ and pow_GR_exp_Y_Gpr = pow_GR_signexp_Y_Gpr, pow_GR_17ones
}
;;
-
-{ .mfi
- nop.f 999
-(p9) cmp.le.unc p0,p10 = 9, pow_GR_true_exp_Y_Gpr
+{ .mmi
+ ldfe POW_T1 = [pow_AD_T1]
+ ldfe POW_T2 = [pow_AD_T2]
+ sub pow_GR_true_exp_Y_Gpr = pow_GR_exp_Y_Gpr, pow_GR_16ones
}
;;
-
-
+{ .mfi
+ setf.exp POW_2M = pow_int_GR_M
+ fma.s1 POW_e123 = POW_e12, f1, POW_e3
+ nop.i 999
+}
{ .mfb
- nop.m 999
- fma.s1 POW_e123 = POW_e12, f1, POW_e3
-(p8) br.cond.spnt L(POW_OVER_UNDER_X_NOT_INF)
+(p6) cmp.gt p6, p0 = -11, pow_GR_true_exp_Y_Gpr
+ fma.s1 POW_d = POW_GY_Z2, f1, POW_pYrcub_e3
+(p6) br.cond.spnt POW_NEAR_ONE // branch if |y*log(x)| < 2^(-11)
}
;;
-
-{ .mmf
- fma.s1 POW_q = POW_Z3sq, POW_q, POW_Z3
+{ .mfi
+ nop.m 999
+ fma.s1 POW_q = POW_Z3sq, POW_q, POW_Z3
+ nop.i 999
}
;;
+// p8 TRUE ==> |Y(G + r)| >= 2^10
+// double
+// -2^10 -2^9 2^9 2^10
+// -----+-----+----+ ... +-----+-----+-----
+// p8 | p9 | p8
+// | | p10 | |
+
+// Form signexp of constants to indicate overflow
{ .mfi
- nop.m 999
- fma.s1 POW_ssq = POW_s, POW_s, f0
- nop.i 999
+ mov pow_GR_big_pos = 0x103ff
+ fma.s1 POW_ssq = POW_s, POW_s, f0
+ cmp.le p8,p9 = 10, pow_GR_true_exp_Y_Gpr
}
{ .mfi
- nop.m 999
- fma.s1 POW_v4 = POW_s, POW_Q3, POW_Q2
- nop.i 999
+ mov pow_GR_big_neg = 0x303ff
+ fma.s1 POW_v4 = POW_s, POW_Q3, POW_Q2
+ andcm pow_GR_sign_Y_Gpr = pow_GR_signexp_Y_Gpr, pow_GR_17ones
}
;;
+// Form big positive and negative constants to test for possible overflow
{ .mfi
- nop.m 999
- fma.s1 POW_v2 = POW_s, POW_Q1, POW_Q0_half
- nop.i 999
+ setf.exp POW_big_pos = pow_GR_big_pos
+ fma.s1 POW_v2 = POW_s, POW_Q1, POW_Q0_half
+(p9) cmp.le.unc p0,p10 = 9, pow_GR_true_exp_Y_Gpr
}
-{ .mfi
- nop.m 999
- fma.s1 POW_1ps = f1,f1,POW_s
- nop.i 999
+{ .mfb
+ setf.exp POW_big_neg = pow_GR_big_neg
+ fma.s1 POW_1ps = f1,f1,POW_s
+(p8) br.cond.spnt POW_OVER_UNDER_X_NOT_INF
}
;;
+// f123 = f12*(e123+1) = f12*e123+f12
{ .mfi
nop.m 999
- fma.s1 POW_f3 = POW_e123,f1,f1
+ fma.s1 POW_f123 = POW_e123,POW_f12,POW_f12
nop.i 999
}
;;
{ .mfi
nop.m 999
- fma.s1 POW_T1T2 = POW_T1, POW_T2, f0
+ fma.s1 POW_T1T2 = POW_T1, POW_T2, f0
nop.i 999
}
-;;
-
{ .mfi
nop.m 999
- fma.s1 POW_v3 = POW_ssq, POW_Q4, POW_v4
- nop.i 999
+ fma.s1 POW_v3 = POW_ssq, POW_Q4, POW_v4
+ cmp.ne p12,p13 = pow_GR_xneg_yodd, r0
}
;;
{ .mfi
nop.m 999
- fma.s1 POW_v21ps = POW_ssq, POW_v2, POW_1ps
+ fma.s1 POW_v21ps = POW_ssq, POW_v2, POW_1ps
nop.i 999
}
{ .mfi
nop.m 999
- fma.s1 POW_s4 = POW_ssq, POW_ssq, f0
+ fma.s1 POW_s4 = POW_ssq, POW_ssq, f0
nop.i 999
}
;;
{ .mfi
nop.m 999
- fma.s1 POW_f123 = POW_f12, POW_f3, f0
+(p12) fnma.s1 POW_A = POW_2M, POW_f123, f0
nop.i 999
}
+{ .mfi
+ nop.m 999
+(p13) fma.s1 POW_A = POW_2M, POW_f123, f0
+ cmp.eq p14,p11 = r0,r0 // Initialize p14 on, p11 off
+}
;;
{ .mfi
nop.m 999
- fma.s1 POW_A = POW_2M, POW_T1T2, f0
+ fmerge.s POW_abs_q = f0, POW_q // Form |q| so can test its size
nop.i 999
}
;;
-
-
{ .mfi
- nop.m 999
-(p12) fmerge.s POW_f123 = f8,POW_f123 // if x neg, y odd int
+(p10) cmp.eq p0,p14 = r0,r0 // Turn off p14 if no overflow
+ fma.s1 POW_es = POW_s4, POW_v3, POW_v21ps
nop.i 999
}
{ .mfi
nop.m 999
-// fma.s1 POW_es = POW_ssq, POW_v3, POW_v2
+ fma.s1 POW_A = POW_A, POW_T1T2, f0
nop.i 999
}
;;
{ .mfi
+// Test for |q| < 2^-63. If so then reverse last two steps of the result
+// to avoid monotonicity problems for results near 1.0 in round up/down/zero.
+// p11 will be set if need to reverse the order, p14 if not.
nop.m 999
- fma.s1 POW_es = POW_s4, POW_v3, POW_v21ps
+(p10) fcmp.lt.s0 p11,p14 = POW_abs_q, POW_2toM63 // Test |q| <2^-63
nop.i 999
}
;;
-
+.pred.rel "mutex",p11,p14
{ .mfi
nop.m 999
- fma.s1 POW_A = POW_A, POW_f123, f0
+(p14) fma.s1 POW_A = POW_A, POW_es, f0
nop.i 999
}
{ .mfi
nop.m 999
-// fma.s1 POW_es = POW_es, POW_ssq, POW_1ps
+(p11) fma.s1 POW_A = POW_A, POW_q, POW_A
nop.i 999
}
;;
-
+// Dummy op to set inexact if |q| < 2^-63
{ .mfi
nop.m 999
- fma.s1 POW_A = POW_A, POW_es,f0
+(p11) fma.d.s0 POW_tmp = POW_A, POW_q, POW_A
nop.i 999
}
;;
-
-
+{ .mfi
+ nop.m 999
+(p14) fma.d.s0 f8 = POW_A, POW_q, POW_A
+ nop.i 999
+}
{ .mfb
nop.m 999
-(p10) fma.d f8 = POW_A, POW_q, POW_A
-(p10) br.ret.sptk b0
+(p11) fma.d.s0 f8 = POW_A, POW_es, f0
+(p10) br.ret.sptk b0 // Exit main branch if no over/underflow
}
;;
-
-
-
-
// POSSIBLE_OVER_UNDER
-// p6 = TRUE ==> Y negative
+// p6 = TRUE ==> Y_Gpr negative
+// Result is already computed. We just need to know if over/underflow occurred.
-{ .mfi
- nop.m 999
- fmerge.s POW_abs_A = f0, POW_A
- cmp.eq.unc p0,p6 = pow_GR_sign_Y, r0
-}
-;;
-
-{ .mib
- nop.m 999
- nop.i 999
-(p6) br.cond.spnt L(POW_POSSIBLE_UNDER)
+{ .mfb
+ cmp.eq p0,p6 = pow_GR_sign_Y_Gpr, r0
+ nop.f 999
+(p6) br.cond.spnt POW_POSSIBLE_UNDER
}
;;
// POSSIBLE_OVER
-// We got an answer.
+// We got an answer.
// overflow is a possibility, not a certainty
@@ -1692,21 +1611,20 @@ L(POW_COMMON):
// RN RN
// RZ
-
// Put in s2 (td set, wre set)
{ .mfi
- mov pow_GR_gt_ln = 0x103ff
+ nop.m 999
fsetc.s2 0x7F,0x42
- nop.i 999
+ nop.i 999
}
;;
-
{ .mfi
- setf.exp POW_gt_pln = pow_GR_gt_ln
- fma.d.s2 POW_wre_urm_f8 = POW_abs_A, POW_q, POW_abs_A
- nop.i 999 ;;
+ nop.m 999
+ fma.d.s2 POW_wre_urm_f8 = POW_A, POW_q, POW_A
+ nop.i 999
}
+;;
// Return s2 to default
{ .mfi
@@ -1716,31 +1634,67 @@ L(POW_COMMON):
}
;;
-
// p7 = TRUE ==> yes, we have an overflow
{ .mfi
nop.m 999
- fcmp.ge.unc.s1 p7, p0 = POW_wre_urm_f8, POW_gt_pln
+ fcmp.ge.s1 p7, p8 = POW_wre_urm_f8, POW_big_pos
nop.i 999
}
;;
+{ .mfi
+ nop.m 999
+(p8) fcmp.le.s1 p7, p0 = POW_wre_urm_f8, POW_big_neg
+ nop.i 999
+}
+;;
+{ .mbb
+(p7) mov pow_GR_tag = 24
+(p7) br.cond.spnt __libm_error_region // Branch if overflow
+ br.ret.sptk b0 // Exit if did not overflow
+}
+;;
-{ .mfb
-(p7) mov pow_GR_tag = 24
- fma.d f8 = POW_A, POW_q, POW_A
-(p7) br.cond.spnt __libm_error_region
+// Here if |y*log(x)| < 2^(-11)
+// pow(x,y) ~ exp(d) ~ 1 + d + 0.5*d^2 + Q1*d^3 + Q2*d^4, where d = y*log(x)
+.align 32
+POW_NEAR_ONE:
+
+{ .mfi
+ nop.m 999
+ fma.s1 POW_d2 = POW_d, POW_d, f0
+ nop.i 999
}
-{ .mfb
- nop.m 999
- nop.f 999
-(p0) br.ret.sptk b0
+;;
+
+{ .mfi
+ nop.m 999
+ fma.s1 POW_poly_d_hi = POW_d, POW_Q0_half, f1
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+ fma.s1 POW_poly_d_lo = POW_d, POW_Q2, POW_Q1
+ nop.i 999
}
;;
+{ .mfi
+ nop.m 999
+ fma.s1 POW_poly_d = POW_d2, POW_poly_d_lo, POW_poly_d_hi
+ nop.i 999
+}
+;;
+
+{ .mfb
+ nop.m 999
+ fma.d.s0 f8 = POW_d, POW_poly_d, f1
+ br.ret.sptk b0 // exit function for arguments |y*log(x)| < 2^(-11)
+}
+;;
-L(POW_POSSIBLE_UNDER):
+POW_POSSIBLE_UNDER:
// We got an answer. input was < -2^9 but > -2^10 (double)
// We got an answer. input was < -2^6 but > -2^7 (float)
// underflow is a possibility, not a certainty
@@ -1763,124 +1717,250 @@ L(POW_POSSIBLE_UNDER):
// 0.1...11 2^-3ffe (biased, 1)
// largest dn smallest normal
-
// Put in s2 (td set, ftz set)
{ .mfi
nop.m 999
fsetc.s2 0x7F,0x41
- nop.i 999
+ nop.i 999
}
;;
-
-
{ .mfi
nop.m 999
- fma.d.s2 POW_ftz_urm_f8 = POW_A, POW_q, POW_A
+ fma.d.s2 POW_ftz_urm_f8 = POW_A, POW_q, POW_A
nop.i 999
}
;;
-
// Return s2 to default
{ .mfi
nop.m 999
fsetc.s2 0x7F,0x40
- nop.i 999
+ nop.i 999
}
;;
-
// p7 = TRUE ==> yes, we have an underflow
{ .mfi
nop.m 999
- fcmp.eq.unc.s1 p7, p0 = POW_ftz_urm_f8, f0
- nop.i 999
+ fcmp.eq.s1 p7, p0 = POW_ftz_urm_f8, f0
+ nop.i 999
}
;;
+{ .mbb
+(p7) mov pow_GR_tag = 25
+(p7) br.cond.spnt __libm_error_region // Branch if underflow
+ br.ret.sptk b0 // Exit if did not underflow
+}
+;;
+
+POW_X_DENORM:
+// Here if x unorm. Use the NORM_X for getf instructions, and then back
+// to normal path
+{ .mfi
+ getf.exp pow_GR_signexp_X = POW_NORM_X
+ nop.f 999
+ nop.i 999
+}
+;;
+{ .mmi
+ getf.sig pow_GR_sig_X = POW_NORM_X
+;;
+ and pow_GR_exp_X = pow_GR_signexp_X, pow_GR_17ones
+ nop.i 999
+}
+;;
+
+{ .mib
+ sub pow_GR_true_exp_X = pow_GR_exp_X, pow_GR_16ones
+ nop.i 999
+ br.cond.sptk POW_COMMON
+}
+;;
+POW_X_0:
+// Here if x=0 and y not nan
+//
+// We have the following cases:
+// p6 x=0 and y>0 and is an integer (may be even or odd)
+// p7 x=0 and y>0 and is NOT an integer, return +0
+// p8 x=0 and y>0 and so big as to always be an even integer, return +0
+// p9 x=0 and y>0 and may not be integer
+// p10 x=0 and y>0 and is an odd integer, return x
+// p11 x=0 and y>0 and is an even integer, return +0
+// p12 used in dummy fcmp to set denormal flag if y=unorm
+// p13 x=0 and y>0
+// p14 x=0 and y=0, branch to code for calling error handling
+// p15 x=0 and y<0, branch to code for calling error handling
+//
+{ .mfi
+ getf.sig pow_GR_sig_int_Y = POW_int_Y // Get signif of int_Y
+ fcmp.lt.s1 p15,p13 = f9, f0 // Test for y<0
+ and pow_GR_exp_Y = pow_GR_signexp_Y, pow_GR_17ones
+}
+{ .mfb
+ cmp.ne p14,p0 = pow_GR_y_zero,r0 // Test for y=0
+ fcvt.xf POW_float_int_Y = POW_int_Y
+(p14) br.cond.spnt POW_X_0_Y_0 // Branch if x=0 and y=0
+}
+;;
+// If x=0 and y>0, test y and flag denormal
{ .mfb
-(p7) mov pow_GR_tag = 25
- fma.d f8 = POW_A, POW_q, POW_A
-(p7) br.cond.spnt __libm_error_region
+(p13) cmp.gt.unc p8,p9 = pow_GR_exp_Y, pow_GR_10033 // Test y +big = even int
+(p13) fcmp.eq.s0 p12,p0 = f9,f0 // If x=0, y>0 dummy op to flag denormal
+(p15) br.cond.spnt POW_X_0_Y_NEG // Branch if x=0 and y<0
}
;;
+// Here if x=0 and y>0
+{ .mfi
+ nop.m 999
+(p9) fcmp.eq.unc.s1 p6,p7 = POW_float_int_Y, POW_NORM_Y // Test y=int
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+(p8) fma.d.s0 f8 = f0,f0,f0 // If x=0, y>0 and large even int, return +0
+ nop.i 999
+}
+;;
+{ .mfi
+ nop.m 999
+(p7) fma.d.s0 f8 = f0,f0,f0 // Result +0 if x=0 and y>0 and not integer
+(p6) tbit.nz.unc p10,p11 = pow_GR_sig_int_Y,0 // If y>0 int, test y even/odd
+}
+;;
+
+// Note if x=0, y>0 and odd integer, just return x
{ .mfb
nop.m 999
- nop.f 999
- br.ret.sptk b0
+(p11) fma.d.s0 f8 = f0,f0,f0 // Result +0 if x=0 and y even integer
+ br.ret.sptk b0 // Exit if x=0 and y>0
}
;;
+POW_X_0_Y_0:
+// When X is +-0 and Y is +-0, IEEE returns 1.0
+// We call error support with this value
-L(POW_X_DENORM):
-// Here if x unorm. Use the NORM_X for getf instructions, and the back
-// to normal path
-{ .mfi
- getf.exp pow_GR_signexp_X = POW_NORM_X
- nop.f 999
- nop.i 999
+{ .mfb
+ mov pow_GR_tag = 26
+ fma.d.s0 f8 = f1,f1,f0
+ br.cond.sptk __libm_error_region
}
;;
+POW_X_0_Y_NEG:
+// When X is +-0 and Y is negative, IEEE returns
+// X Y answer
+// +0 -odd int +inf
+// -0 -odd int -inf
+
+// +0 !-odd int +inf
+// -0 !-odd int +inf
+
+// p6 == Y is a floating point number outside the integer range.
+// Hence it is an integer and is even.
+// return +inf
+
+// p7 == Y is a floating point number within the integer range.
+// p9 == (int_Y = NORM_Y), Y is an integer, which may be odd or even.
+// p11 odd
+// return (sign_of_x)inf
+// p12 even
+// return +inf
+// p10 == Y is not an integer
+// return +inf
+//
+
{ .mfi
- getf.sig pow_GR_sig_X = POW_NORM_X
- nop.f 999
- nop.i 999
+ nop.m 999
+ nop.f 999
+ cmp.gt p6,p7 = pow_GR_exp_Y, pow_GR_10033
}
;;
{ .mfi
- and pow_GR_exp_X = pow_GR_signexp_X, pow_GR_17ones
- nop.f 999
+ mov pow_GR_tag = 27
+(p7) fcmp.eq.unc.s1 p9,p10 = POW_float_int_Y, POW_NORM_Y
+ nop.i 999
}
;;
-{ .mib
- sub pow_GR_true_exp_X = pow_GR_exp_X, pow_GR_16ones
- shl pow_GR_offset = pow_GR_sig_X, 1
- br.cond.sptk L(POW_COMMON)
+{ .mfb
+ nop.m 999
+(p6) frcpa.s0 f8,p13 = f1, f0
+(p6) br.cond.sptk __libm_error_region // x=0, y<0, y large neg int
+}
+;;
+
+{ .mfb
+ nop.m 999
+(p10) frcpa.s0 f8,p13 = f1, f0
+(p10) br.cond.sptk __libm_error_region // x=0, y<0, y not int
}
;;
+// x=0, y<0, y an int
+{ .mib
+ nop.m 999
+(p9) tbit.nz.unc p11,p12 = pow_GR_sig_int_Y,0
+ nop.b 999
+}
+;;
-L(POW_X_0_Y_0):
-// When X is +-0 and Y is +-0, IEEE returns 1.0
-// We call error support with this value
+{ .mfi
+ nop.m 999
+(p12) frcpa.s0 f8,p13 = f1,f0
+ nop.i 999
+}
+;;
{ .mfb
- mov pow_GR_tag = 26
- fma.d f8 = f1,f1,f0
- br.cond.sptk __libm_error_region
+ nop.m 999
+(p11) frcpa.s0 f8,p13 = f1,f8
+ br.cond.sptk __libm_error_region
}
;;
+POW_Y_0:
+// Here for y zero, x anything but zero and nan
+// Set flag if x denormal
+// Result is +1.0
+{ .mfi
+ nop.m 999
+ fcmp.eq.s0 p6,p0 = f8,f0 // Sets flag if x denormal
+ nop.i 999
+}
+{ .mfb
+ nop.m 999
+ fma.d.s0 f8 = f1,f1,f0
+ br.ret.sptk b0
+}
+;;
-L(POW_X_INF):
-// When X is +-inf and Y is +-, IEEE returns
+POW_X_INF:
+// Here when X is +-inf
-// overflow
-// X +inf Y +inf +inf
-// X -inf Y +inf +inf
+// X +inf Y +inf +inf
+// X -inf Y +inf +inf
-// X +inf Y >0 +inf
+// X +inf Y >0 +inf
// X -inf Y >0, !odd integer +inf <== (-inf)^0.5 = +inf !!
-// X -inf Y >0, odd integer -inf
+// X -inf Y >0, odd integer -inf
-// underflow
-// X +inf Y -inf +0
-// X -inf Y -inf +0
+// X +inf Y -inf +0
+// X -inf Y -inf +0
-// X +inf Y <0 +0
-// X -inf Y <0, !odd integer +0
-// X -inf Y <0, odd integer -0
+// X +inf Y <0 +0
+// X -inf Y <0, !odd integer +0
+// X -inf Y <0, odd integer -0
// X + inf Y=+0 +1
// X + inf Y=-0 +1
@@ -1892,32 +1972,30 @@ L(POW_X_INF):
// p6 == Y is a floating point number outside the integer.
// Hence it is an integer and is even.
-// p13 == (Y negative)
+// p13 == (Y negative)
// return +inf
// p14 == (Y positive)
// return +0
-
-
// p7 == Y is a floating point number within the integer range.
// p9 == (int_Y = NORM_Y), Y is an integer, which may be odd or even.
// p11 odd
-// p13 == (Y negative)
+// p13 == (Y negative)
// return (sign_of_x)inf
-// p14 == (Y positive)
+// p14 == (Y positive)
// return (sign_of_x)0
-// pxx even
-// p13 == (Y negative)
-// return +inf
+// pxx even
+// p13 == (Y negative)
+// return +inf
// p14 == (Y positive)
-// return +0
+// return +0
// pxx == Y is not an integer
-// p13 == (Y negative)
+// p13 == (Y negative)
// return +inf
// p14 == (Y positive)
// return +0
-//
+//
// If x=inf, test y and flag denormal
{ .mfi
@@ -1929,207 +2007,131 @@ L(POW_X_INF):
{ .mfi
nop.m 999
- fcmp.lt p13,p14 = POW_NORM_Y,f0
- cmp.gt.unc p6,p7 = pow_GR_exp_Y, pow_GR_10033
+ fcmp.lt.s0 p13,p14 = POW_NORM_Y,f0
+ cmp.gt p6,p7 = pow_GR_exp_Y, pow_GR_10033
}
{ .mfi
nop.m 999
- fclass.m p12,p0 = f9, 0x23
+ fclass.m p12,p0 = f9, 0x23 //@inf
nop.i 999
}
;;
-
{ .mfi
nop.m 999
- fclass.m p15,p0 = f9, 0x07 //@zero
+ fclass.m p15,p0 = f9, 0x07 //@zero
nop.i 999
}
;;
{ .mfb
nop.m 999
-(p15) fmerge.s f8 = f1,f1
-(p15) br.ret.spnt b0
+(p15) fmerge.s f8 = f1,f1 // Return +1.0 if x=inf, y=0
+(p15) br.ret.spnt b0 // Exit if x=inf, y=0
}
;;
-
{ .mfi
-(p13) mov pow_GR_tag = 25
-(p14) frcpa.s1 f8,p10 = f1,f0
+ nop.m 999
+(p14) frcpa.s1 f8,p10 = f1,f0 // If x=inf, y>0, assume result +inf
nop.i 999
}
{ .mfb
-(p14) mov pow_GR_tag = 24
-(p13) fma.s1 f8 = f0,f0,f0
-(p12) br.ret.spnt b0
-}
-;;
-
-
-
-{ .mfb
nop.m 999
-(p7) fcmp.eq.unc.s1 p9,p0 = POW_float_int_Y, POW_NORM_Y
- nop.b 999
+(p13) fma.d.s0 f8 = f0,f0,f0 // If x=inf, y<0, assume result +0.0
+(p12) br.ret.spnt b0 // Exit if x=inf, y=inf
}
;;
+// Here if x=inf, and 0 < |y| < inf. Need to correct results if y odd integer.
{ .mfi
nop.m 999
- nop.f 999
-(p9) tbit.nz.unc p11,p0 = pow_GR_sig_int_Y,0
-}
-;;
-
-{ .mfb
- nop.m 999
-(p11) fmerge.s f8 = POW_NORM_X,f8
- br.ret.sptk b0
+(p7) fcmp.eq.unc.s1 p9,p0 = POW_float_int_Y, POW_NORM_Y // Is y integer?
+ nop.i 999
}
;;
-
-
-L(POW_X_0_Y_NEG):
-// When X is +-0 and Y is negative, IEEE returns
-// X Y answer
-// +0 -odd int +inf
-// -0 -odd int -inf
-
-// +0 !-odd int +inf
-// -0 !-odd int +inf
-
-
-// p6 == Y is a floating point number outside the integer.
-// Hence it is an integer and is even.
-// return +inf
-
-// p7 == Y is a floating point number within the integer range.
-// p9 == (int_Y = NORM_Y), Y is an integer, which may be odd or even.
-// p11 odd
-// return (sign_of_x)inf
-// p12 even
-// return +inf
-// p10 == Y is not an integer
-// return +inf
-//
-//
-
{ .mfi
nop.m 999
nop.f 999
- cmp.gt.unc p6,p7 = pow_GR_exp_Y, pow_GR_10033
-}
-;;
-
-
-{ .mfi
- mov pow_GR_tag = 27
-(p7) fcmp.eq.unc.s1 p9,p10 = POW_float_int_Y, POW_NORM_Y
- nop.i 999
-}
-;;
-
-
-{ .mfb
- nop.m 999
-(p6) frcpa.s0 f8,p13 = f1, f0
-(p6) br.cond.sptk __libm_error_region
+(p9) tbit.nz.unc p11,p0 = pow_GR_sig_int_Y,0 // Test for y odd integer
}
;;
{ .mfb
nop.m 999
-(p10) frcpa.s0 f8,p13 = f1, f0
-(p10) br.cond.sptk __libm_error_region
+(p11) fmerge.s f8 = POW_NORM_X,f8 // If y odd integer use sign of x
+ br.ret.sptk b0 // Exit for x=inf, 0 < |y| < inf
}
;;
+POW_X_NEG_Y_NONINT:
+// When X is negative and Y is a non-integer, IEEE
+// returns a qnan indefinite.
+// We call error support with this value
-{ .mib
- nop.m 999
-(p9) tbit.nz.unc p11,p12 = pow_GR_sig_int_Y,0
- nop.b 999
+{ .mfb
+ mov pow_GR_tag = 28
+ frcpa.s0 f8,p6 = f0,f0
+ br.cond.sptk __libm_error_region
}
;;
-
-
+POW_X_NAN:
+// Here if x=nan, y not nan
{ .mfi
- nop.m 999
-(p12) frcpa.s0 f8,p13 = f1,f0
- nop.i 999
+ nop.m 999
+ fclass.m p9,p13 = f9, 0x07 // Test y=zero
+ nop.i 999
}
;;
{ .mfb
- nop.m 999
-(p11) frcpa f8,p13 = f1,f8
- br.cond.sptk __libm_error_region
+ nop.m 999
+(p13) fma.d.s0 f8 = f8,f1,f0
+(p13) br.ret.sptk b0 // Exit if x nan, y anything but zero or nan
}
;;
-
-
-
-L(POW_X_NEG_Y_NONINT):
-// When X is negative and Y is a non-integer, IEEE
-// returns a qnan indefinite.
-// We call error support with this value
-
-{ .mfb
- mov pow_GR_tag = 28
- frcpa f8,p6 = f0,f0
- br.cond.sptk __libm_error_region
-}
-;;
-
-
-
-
-L(POW_X_NAN_Y_0):
+POW_X_NAN_Y_0:
// When X is a NAN and Y is zero, IEEE returns 1.
// We call error support with this value.
-
{ .mfi
- nop.m 0
- fma.d.s0 f10 = f8,f1,f0
- nop.i 0
+ nop.m 999
+ fcmp.eq.s0 p6,p0 = f8,f0 // Dummy op to set invalid on snan
+ nop.i 999
}
{ .mfb
- mov pow_GR_tag = 29
- fma.d.s0 f8 = f0,f0,f1
+ mov pow_GR_tag = 29
+ fma.d.s0 f8 = f0,f0,f1
br.cond.sptk __libm_error_region
}
;;
-L(POW_OVER_UNDER_X_NOT_INF):
+POW_OVER_UNDER_X_NOT_INF:
// p8 is TRUE for overflow
// p9 is TRUE for underflow
// if y is infinity, we should not over/underflow
-
{ .mfi
nop.m 999
- fcmp.eq.unc.s1 p14, p13 = POW_xsq,f1
- cmp.eq.unc p8,p9 = pow_GR_sign_Y_Gpr, r0
+ fcmp.eq.s1 p14, p13 = POW_xsq,f1 // Test |x|=1
+ cmp.eq p8,p9 = pow_GR_sign_Y_Gpr, r0
}
;;
{ .mfi
nop.m 999
-(p14) fclass.m.unc p15, p0 = f9, 0x23
+(p14) fclass.m.unc p15, p0 = f9, 0x23 // If |x|=1, test y=inf
nop.i 999
}
{ .mfi
nop.m 999
-(p13) fclass.m.unc p11,p0 = f9, 0x23
+(p13) fclass.m.unc p11,p0 = f9, 0x23 // If |x| not 1, test y=inf
nop.i 999
}
;;
@@ -2137,31 +2139,33 @@ L(POW_OVER_UNDER_X_NOT_INF):
// p15 = TRUE if |x|=1, y=inf, return +1
{ .mfb
nop.m 999
-(p15) fma.d f8 = f1,f1,f0
-(p15) br.ret.spnt b0
+(p15) fma.d.s0 f8 = f1,f1,f0 // If |x|=1, y=inf, result +1
+(p15) br.ret.spnt b0 // Exit if |x|=1, y=inf
}
;;
.pred.rel "mutex",p8,p9
{ .mfb
-(p8) setf.exp f8 = pow_GR_17ones
-(p9) fmerge.s f8 = f0,f0
-(p11) br.ret.sptk b0
+(p8) setf.exp f8 = pow_GR_17ones // If exp(+big), result inf
+(p9) fmerge.s f8 = f0,f0 // If exp(-big), result 0
+(p11) br.ret.sptk b0 // Exit if |x| not 1, y=inf
}
+;;
{ .mfb
nop.m 999
nop.f 999
- br.cond.sptk L(POW_OVER_UNDER_ERROR)
+ br.cond.sptk POW_OVER_UNDER_ERROR // Branch if y not inf
}
;;
-L(POW_Y_NAN):
-// Is x = +1 then result is +1, else result is quiet Y
+POW_Y_NAN:
+// Here if y=nan, x anything
+// If x = +1 then result is +1, else result is quiet Y
{ .mfi
nop.m 999
- fcmp.eq.s1 p10,p9 = POW_NORM_X, f1
+ fcmp.eq.s1 p10,p9 = POW_NORM_X, f1
nop.i 999
}
;;
@@ -2175,148 +2179,117 @@ L(POW_Y_NAN):
{ .mfi
nop.m 999
-(p10) fma.d f8 = f1,f1,f0
+(p10) fma.d.s0 f8 = f1,f1,f0
nop.i 999
}
{ .mfb
nop.m 999
-(p9) fma.d f8 = f9,f8,f0
- br.ret.sptk b0
+(p9) fma.d.s0 f8 = f9,f8,f0
+ br.ret.sptk b0 // Exit y=nan
}
;;
-L(POW_OVER_UNDER_ERROR):
+POW_OVER_UNDER_ERROR:
+// Here if we have overflow or underflow.
+// Enter with p12 true if x negative and y odd int to force -0 or -inf
{ .mfi
- nop.m 999
- fmerge.s f10 = POW_NORM_X,POW_NORM_X
- nop.i 999
-}
-{ .mfi
- sub pow_GR_17ones_m1 = pow_GR_17ones, r0, 1
- nop.f 999
- mov pow_GR_one = 0x1
+ sub pow_GR_17ones_m1 = pow_GR_17ones, r0, 1
+ nop.f 999
+ mov pow_GR_one = 0x1
}
;;
-// overflow
+// overflow, force inf with O flag
{ .mmb
-(p8) mov pow_GR_tag = 24
-(p8) setf.exp f11 = pow_GR_17ones_m1
+(p8) mov pow_GR_tag = 24
+(p8) setf.exp POW_tmp = pow_GR_17ones_m1
nop.b 999
}
;;
-
-// underflow
+// underflow, force zero with I, U flags
{ .mmi
-(p9) mov pow_GR_tag = 25
-(p9) setf.exp f11 = pow_GR_one
+(p9) mov pow_GR_tag = 25
+(p9) setf.exp POW_tmp = pow_GR_one
nop.i 999
}
;;
-
-// p12 x is negative and y is an odd integer
-
-
{ .mfi
nop.m 999
- fma.d f8 = f11, f11, f0
+ fma.d.s0 f8 = POW_tmp, POW_tmp, f0
nop.i 999
}
;;
+// p12 x is negative and y is an odd integer, change sign of result
{ .mfi
nop.m 999
-(p12) fmerge.ns f8 = f8, f8
+(p12) fnma.d.s0 f8 = POW_tmp, POW_tmp, f0
nop.i 999
}
;;
+GLOBAL_LIBM_END(pow)
-.endp pow
-ASM_SIZE_DIRECTIVE(pow)
-
-
-// Stack operations when calling error support.
-// (1) (2) (3) (call) (4)
-// sp -> + psp -> + psp -> + sp -> +
-// | | | |
-// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8
-// | | | |
-// | <-GR_Y Y2->| Y2 ->| <- GR_Y |
-// | | | |
-// | | <- GR_X X1 ->| |
-// | | | |
-// sp-64 -> + sp -> + sp -> + +
-// save ar.pfs save b0 restore gp
-// save gp restore ar.pfs
-
-
+LOCAL_LIBM_ENTRY(__libm_error_region)
-.proc __libm_error_region
-__libm_error_region:
-
-// Answer is inf for overflow and 0 for underflow.
.prologue
-// (1)
{ .mfi
- add GR_Parameter_Y=-32,sp // Parameter 2 value
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
+ add sp=-64,sp // Create new stack
nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ mov GR_SAVE_GP=gp // Save gp
};;
-
-// (2)
{ .mmi
stfd [GR_Parameter_Y] = POW_NORM_Y,16 // STORE Parameter 2 on stack
- add GR_Parameter_X = 16,sp // Parameter 1 address
+ add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0=b0 // Save b0
};;
.body
-// (3)
{ .mib
- stfd [GR_Parameter_X] = POW_NORM_X // STORE Parameter 1 on stack
+ stfd [GR_Parameter_X] = POW_NORM_X // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ nop.b 0
}
{ .mib
- stfd [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
+ stfd [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
+
{ .mmi
- nop.m 0
- nop.m 0
add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
-// (4)
{ .mmi
- ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
+ ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
- add sp = 64,sp // Restore stack pointer
- mov b0 = GR_SAVE_B0 // Restore return address
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
};;
+
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
- mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
- br.ret.sptk b0 // Return
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
+
diff --git a/sysdeps/ia64/fpu/e_powf.S b/sysdeps/ia64/fpu/e_powf.S
index d464058262..275843f1e2 100644
--- a/sysdeps/ia64/fpu/e_powf.S
+++ b/sysdeps/ia64/fpu/e_powf.S
@@ -1,10 +1,10 @@
.file "powf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,30 +35,39 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 2/03/00 Added p12 to definite over/under path. With odd power we did not
+// 02/02/00 Initial version
+// 02/03/00 Added p12 to definite over/under path. With odd power we did not
// maintain the sign of x in this path.
-// 4/04/00 Unwind support added
-// 4/19/00 pow(+-1,inf) now returns NaN
-// pow(+-val, +-inf) returns 0 or inf, but now does not call error support
+// 04/04/00 Unwind support added
+// 04/19/00 pow(+-1,inf) now returns NaN
+// pow(+-val, +-inf) returns 0 or inf, but now does not call error
+// support
// Added s1 to fcvt.fx because invalid flag was incorrectly set.
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
-// 9/07/00 Improved performance by eliminating bank conflicts and other stalls,
+// 09/07/00 Improved performance by eliminating bank conflicts and other stalls,
// and tweaking the critical path
-// 9/08/00 Per c99, pow(+-1,inf) now returns 1, and pow(+1,nan) returns 1
-// 9/28/00 Updated NaN**0 path
-// 1/20/01 Fixed denormal flag settings.
-// 2/12/01 Improved speed.
+// 09/08/00 Per c99, pow(+-1,inf) now returns 1, and pow(+1,nan) returns 1
+// 09/28/00 Updated NaN**0 path
+// 01/20/01 Fixed denormal flag settings.
+// 02/13/01 Improved speed.
+// 03/19/01 Reordered exp polynomial to improve speed and eliminate monotonicity
+// problem in round up, down, and to zero modes. Also corrected
+// overflow result when x negative, y odd in round up, down, zero.
+// 06/14/01 Added brace missing from bundle
+// 12/10/01 Corrected case where x negative, 2^23 <= |y| < 2^24, y odd integer.
+// 02/08/02 Fixed overflow/underflow cases that were not calling error support.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 08/29/02 Improved Itanium 2 performance
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
-// double pow(double)
-// float powf(float)
+// float powf(float x, float y)
//
// Overview of operation
//==============================================================
@@ -67,51 +76,51 @@
// 1. Log(x)
// 2. y Log(x)
// 3. exp(y log(x))
-//
+//
// This means we work with the absolute value of x and merge in the sign later.
// Log(x) = G + delta + r -rsq/2 + p
// G,delta depend on the exponent of x and table entries. The table entries are
// indexed by the exponent of x, called K.
-//
+//
// The G and delta come out of the reduction; r is the reduced x.
-//
+//
// B = frcpa(x)
// xB-1 is small means that B is the approximate inverse of x.
-//
+//
// Log(x) = Log( (1/B)(Bx) )
// = Log(1/B) + Log(Bx)
// = Log(1/B) + Log( 1 + (Bx-1))
-//
+//
// x = 2^K 1.x_1x_2.....x_52
-// B= frcpa(x) = 2^-k Cm
+// B= frcpa(x) = 2^-k Cm
// Log(1/B) = Log(1/(2^-K Cm))
// Log(1/B) = Log((2^K/ Cm))
// Log(1/B) = K Log(2) + Log(1/Cm)
-//
+//
// Log(x) = K Log(2) + Log(1/Cm) + Log( 1 + (Bx-1))
-//
+//
// If you take the significand of x, set the exponent to true 0, then Cm is
// the frcpa. We tabulate the Log(1/Cm) values. There are 256 of them.
// The frcpa table is indexed by 8 bits, the x_1 thru x_8.
// m = x_1x_2...x_8 is an 8-bit index.
-//
+//
// Log(1/Cm) = log(1/frcpa(1+m/256)) where m goes from 0 to 255.
-//
+//
// We tabluate as two doubles, T and t, where T +t is the value itself.
-//
+//
// Log(x) = (K Log(2)_hi + T) + (Log(2)_hi + t) + Log( 1 + (Bx-1))
// Log(x) = G + delta + Log( 1 + (Bx-1))
-//
+//
// The Log( 1 + (Bx-1)) can be calculated as a series in r = Bx-1.
-//
+//
// Log( 1 + (Bx-1)) = r - rsq/2 + p
-//
+//
// Then,
-//
+//
// yLog(x) = yG + y delta + y(r-rsq/2) + yp
// yLog(x) = Z1 + e3 + Z2 + Z3 + (e2 + e3)
-//
-//
+//
+//
// exp(yLog(x)) = exp(Z1 + Z2 + Z3) exp(e1 + e2 + e3)
//
//
@@ -133,7 +142,7 @@
// exp(r) = exp(Z - N log2/128)
//
// r = s + d = (Z - N (log2/128)_hi) -N (log2/128)_lo
-// = Z - N (log2/128)
+// = Z - N (log2/128)
//
// Z = s+d +N (log2/128)
//
@@ -149,22 +158,22 @@
// n log2/128 = n_7n_6n_5 log2/8 + n_4n_3n_2n_1 log2/128
// n log2/128 = I2 log2/8 + I1 log2/128
//
-// N log2/128 = M log2 + I2 log2/8 + I1 log2/128
+// N log2/128 = M log2 + I2 log2/8 + I1 log2/128
//
// exp(Z) = exp(s) (1+d) exp(log(2^M) + log(2^I2/8) + log(2^I1/128))
// exp(Z) = exp(s) (1+d1) (1+d2)(2^M) 2^I2/8 2^I1/128
// exp(Z) = exp(s) f1 f2 (2^M) 2^I2/8 2^I1/128
//
// I1, I2 are table indices. Use a series for exp(s).
-// Then get exp(Z)
+// Then get exp(Z)
//
// exp(yLog(x)) = exp(Z1 + Z2 + Z3) exp(e1 + e2 + e3)
-// exp(yLog(x)) = exp(Z) exp(Z3) f3
-// exp(yLog(x)) = exp(Z)f3 exp(Z3)
-// exp(yLog(x)) = A exp(Z3)
+// exp(yLog(x)) = exp(Z) exp(Z3) f3
+// exp(yLog(x)) = exp(Z)f3 exp(Z3)
+// exp(yLog(x)) = A exp(Z3)
//
// We actually calculate exp(Z3) -1.
-// Then,
+// Then,
// exp(yLog(x)) = A + A( exp(Z3) -1)
//
@@ -175,142 +184,146 @@
// ==============
// The operation (K*log2_hi) must be exact. K is the true exponent of x.
// If we allow gradual underflow (denormals), K can be represented in 12 bits
-// (as a two's complement number). We assume 13 bits as an engineering precaution.
-//
+// (as a two's complement number). We assume 13 bits as an engineering
+// precaution.
+//
// +------------+----------------+-+
// | 13 bits | 50 bits | |
// +------------+----------------+-+
// 0 1 66
// 2 34
-//
+//
// So we want the lsb(log2_hi) to be 2^-50
// We get log2 as a quad-extended (15-bit exponent, 128-bit significand)
-//
+//
// 0 fffe b17217f7d1cf79ab c9e3b39803f2f6af (4...)
-//
+//
// Consider numbering the bits left to right, starting at 0 thru 127.
// Bit 0 is the 2^-1 bit; bit 49 is the 2^-50 bit.
-//
+//
// ...79ab
// 0111 1001 1010 1011
// 44
// 89
-//
-// So if we shift off the rightmost 14 bits, then (shift back only
+//
+// So if we shift off the rightmost 14 bits, then (shift back only
// the top half) we get
-//
+//
// 0 fffe b17217f7d1cf4000 e6af278ece600fcb dabc000000000000
-//
+//
// Put the right 64-bit signficand in an FR register, convert to double;
// it is exact. Put the next 128 bits into a quad register and round to double.
// The true exponent of the low part is -51.
-//
+//
// hi is 0 fffe b17217f7d1cf4000
// lo is 0 ffcc e6af278ece601000
-//
+//
// Convert to double memory format and get
-//
+//
// hi is 0x3fe62e42fefa39e8
-// lo is 0x3cccd5e4f1d9cc02
-//
+// lo is 0x3cccd5e4f1d9cc02
+//
// log2_hi + log2_lo is an accurate value for log2.
-//
-//
+//
+//
// The T and t values
// ==================
// A similar method is used to generate the T and t values.
-//
+//
// K * log2_hi + T must be exact.
-//
+//
// Smallest T,t
// ----------
-// The smallest T,t is
+// The smallest T,t is
// T t
-// data8 0x3f60040155d58800, 0x3c93bce0ce3ddd81 log(1/frcpa(1+0/256))= +1.95503e-003
-//
+// 0x3f60040155d58800, 0x3c93bce0ce3ddd81 log(1/frcpa(1+0/256))= +1.95503e-003
+//
// The exponent is 0x3f6 (biased) or -9 (true).
// For the smallest T value, what we want is to clip the significand such that
-// when it is shifted right by 9, its lsb is in the bit for 2^-51. The 9 is the specific
-// for the first entry. In general, it is 0xffff - (biased 15-bit exponent).
+// when it is shifted right by 9, its lsb is in the bit for 2^-51. The 9 is
+// specific to the first entry. In general, it is 0xffff - (biased 15-bit
+// exponent).
-// Independently, what we have calculated is the table value as a quad precision number.
+// Independently, what we have calculated is the table value as a quad
+// precision number.
// Table entry 1 is
// 0 fff6 80200aaeac44ef38 338f77605fdf8000
-//
+//
// We store this quad precision number in a data structure that is
-// sign: 1
+// sign: 1
// exponent: 15
// signficand_hi: 64 (includes explicit bit)
// signficand_lo: 49
// Because the explicit bit is included, the significand is 113 bits.
-//
+//
// Consider significand_hi for table entry 1.
-//
-//
+//
+//
// +-+--- ... -------+--------------------+
// | |
// +-+--- ... -------+--------------------+
// 0 1 4444444455555555556666
// 2345678901234567890123
-//
+//
// Labeled as above, bit 0 is 2^0, bit 1 is 2^-1, etc.
// Bit 42 is 2^-42. If we shift to the right by 9, the bit in
// bit 42 goes in 51.
-//
+//
// So what we want to do is shift bits 43 thru 63 into significand_lo.
-// This is shifting bit 42 into bit 63, taking care to retain the shifted-off bits.
-// Then shifting (just with signficaand_hi) back into bit 42.
-//
-// The shift_value is 63-42 = 21. In general, this is
+// This is shifting bit 42 into bit 63, taking care to retain shifted-off bits.
+// Then shifting (just with significand_hi) back into bit 42.
+//
+// The shift_value is 63-42 = 21. In general, this is
// 63 - (51 -(0xffff - 0xfff6))
// For this example, it is
// 63 - (51 - 9) = 63 - 42 = 21
-//
-// This means we are shifting 21 bits into significand_lo. We must maintain more
-// that a 128-bit signficand not to lose bits. So before the shift we put the 128-bit
-// significand into a 256-bit signficand and then shift.
+//
+// This means we are shifting 21 bits into significand_lo. We must maintain more
+// than a 128-bit significand not to lose bits. So before the shift we put the
+// 128-bit significand into a 256-bit significand and then shift.
// The 256-bit significand has four parts: hh, hl, lh, and ll.
-//
+//
// Start off with
// hh hl lh ll
// <64> <49><15_0> <64_0> <64_0>
-//
+//
// After shift by 21 (then return for significand_hi),
// <43><21_0> <21><43> <6><58_0> <64_0>
-//
+//
// Take the hh part and convert to a double. There is no rounding here.
-// The conversion is exact. The true exponent of the high part is the same as the
-// true exponent of the input quad.
-//
-// We have some 64 plus significand bits for the low part. In this example, we have
-// 70 bits. We want to round this to a double. Put them in a quad and then do a quad fnorm.
-// For this example the true exponent of the low part is
+// The conversion is exact. The true exponent of the high part is the same as
+// the true exponent of the input quad.
+//
+// We have some 64 plus significand bits for the low part. In this example, we
+// have 70 bits. We want to round this to a double. Put them in a quad and then
+// do a quad fnorm.
+// For this example the true exponent of the low part is
// true_exponent_of_high - 43 = true_exponent_of_high - (64-21)
-// In general, this is
-// true_exponent_of_high - (64 - shift_value)
-//
-//
+// In general, this is
+// true_exponent_of_high - (64 - shift_value)
+//
+//
// Largest T,t
// ----------
// The largest T,t is
-// data8 0x3fe62643fecf9742, 0x3c9e3147684bd37d log(1/frcpa(1+255/256))= +6.92171e-001
-//
+// 0x3fe62643fecf9742, 0x3c9e3147684bd37d log(1/frcpa(1+255/256))=+6.92171e-001
+//
// Table entry 256 is
// 0 fffe b1321ff67cba178c 51da12f4df5a0000
-//
-// The shift value is
+//
+// The shift value is
// 63 - (51 -(0xffff - 0xfffe)) = 13
-//
-// The true exponent of the low part is
+//
+// The true exponent of the low part is
// true_exponent_of_high - (64 - shift_value)
// -1 - (64-13) = -52
// Biased as a double, this is 0x3cb
-//
-//
-//
+//
+//
+//
// So then lsb(T) must be >= 2^-51
// msb(Klog2_hi) <= 2^12
-//
+//
// +--------+---------+
// | 51 bits | <== largest T
// +--------+---------+
@@ -320,7 +333,6 @@
// +------------+----------------+-+
-
// Special Cases
//==============================================================
@@ -385,63 +397,66 @@
// X any Y =0 +1
-#include "libm_support.h"
-
// Assembly macros
//==============================================================
// integer registers used
-pow_AD_Tt = r33
-pow_GR_FFF7 = r34
-pow_GR_exp_Y = r34 // duplicate
-pow_GR_17ones = r35
-
-pow_AD_P = r36
-pow_AD_Q = r37
-pow_AD_tbl1 = r38
-pow_AD_tbl2 = r39
-pow_GR_exp_X = r40
-pow_GR_true_exp_X = r40 // duplicate
-
-pow_GR_offset = r41
-pow_GR_exp_Xm1 = r42
-pow_GR_sig_X = r43
-pow_GR_signexp_X = r44
-
-pow_GR_signexp_Xm1 = r46
-pow_GR_int_W1 = r47
-pow_GR_int_W2 = r48
-pow_GR_int_N = r49
-pow_GR_index1 = r50
-
-pow_GR_index2 = r51
-pow_AD_T1 = r52
-pow_AD_T2 = r53
-pow_GR_gt_ln = r53 // duplicate
-pow_int_GR_M = r54
-pow_GR_10033 = r55
-
-pow_GR_16ones = r56
-pow_GR_sig_int_Y = r57
-pow_GR_sign_Y_Gpr = r58
-pow_GR_17ones_m1 = r59
-pow_GR_one = r60
-pow_GR_sign_Y = r60
-
-pow_GR_signexp_Y_Gpr = r61
-pow_GR_exp_Y_Gpr = r62
-pow_GR_true_exp_Y_Gpr = r63
-pow_GR_signexp_Y = r64
-
-GR_SAVE_B0 = r65
-GR_SAVE_GP = r66
-GR_SAVE_PFS = r67
-
-GR_Parameter_X = r68
-GR_Parameter_Y = r69
-GR_Parameter_RESULT = r70
-pow_GR_tag = r71
+pow_GR_signexp_X = r14
+pow_GR_17ones = r15
+pow_AD_P = r16
+pow_GR_exp_2tom8 = r17
+pow_GR_sig_X = r18
+pow_GR_10033 = r19
+pow_GR_16ones = r20
+
+pow_AD_Tt = r21
+pow_GR_exp_X = r22
+pow_AD_Q = r23
+pow_GR_true_exp_X = r24
+pow_GR_y_zero = r25
+
+pow_GR_exp_Y = r26
+pow_AD_tbl1 = r27
+pow_AD_tbl2 = r28
+pow_GR_offset = r29
+pow_GR_exp_Xm1 = r30
+pow_GR_xneg_yodd = r31
+
+pow_GR_signexp_Xm1 = r35
+pow_GR_int_W1 = r36
+pow_GR_int_W2 = r37
+pow_GR_int_N = r38
+pow_GR_index1 = r39
+pow_GR_index2 = r40
+
+pow_AD_T1 = r41
+pow_AD_T2 = r42
+pow_int_GR_M = r43
+pow_GR_sig_int_Y = r44
+pow_GR_sign_Y_Gpr = r45
+
+pow_GR_17ones_m1 = r46
+pow_GR_one = r47
+pow_GR_sign_Y = r48
+pow_GR_signexp_Y_Gpr = r49
+pow_GR_exp_Y_Gpr = r50
+
+pow_GR_true_exp_Y_Gpr = r51
+pow_GR_signexp_Y = r52
+pow_GR_x_one = r53
+pow_GR_big_pos = r55
+
+pow_GR_big_neg = r56
+
+GR_SAVE_B0 = r50
+GR_SAVE_GP = r51
+GR_SAVE_PFS = r52
+
+GR_Parameter_X = r53
+GR_Parameter_Y = r54
+GR_Parameter_RESULT = r55
+pow_GR_tag = r56
// floating point registers used
@@ -464,7 +479,8 @@ POW_log2_lo = f43
POW_r = f44
POW_Q0_half = f45
-POW_Q1 = f46
+POW_Q1 = f46
+POW_tmp = f47
POW_log2_hi = f48
POW_Q4 = f49
POW_P1 = f50
@@ -476,6 +492,7 @@ POW_Yrcub = f54
POW_log2_by_128_lo = f55
POW_v6 = f56
+POW_xsq = f57
POW_v4 = f58
POW_v2 = f59
POW_T = f60
@@ -484,6 +501,7 @@ POW_Tt = f61
POW_RSHF = f62
POW_v21ps = f63
POW_s4 = f64
+POW_twoV = f65
POW_U = f66
POW_G = f67
@@ -533,44 +551,36 @@ POW_1ps = f103
POW_A = f104
POW_es = f105
+POW_Xp1 = f106
POW_int_K = f107
POW_K = f108
POW_f123 = f109
POW_Gpr = f110
-POW_Y_Gpr = f111
+POW_Y_Gpr = f111
POW_int_Y = f112
+POW_2Mqp1 = f113
POW_float_int_Y = f116
POW_ftz_urm_f8 = f117
POW_wre_urm_f8 = f118
-POW_abs_A = f119
-POW_gt_pln = f120
-
-POW_xsq = f121
-
-POW_twoV = f122
-POW_Xp1 = f123
+POW_big_neg = f119
+POW_big_pos = f120
// Data tables
//==============================================================
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+RODATA
.align 16
-pow_table_P:
-ASM_TYPE_DIRECTIVE(pow_table_P,@object)
+LOCAL_OBJECT_START(pow_table_P)
data8 0x8000F7B249FF332D, 0x0000BFFC // P_5
data8 0xAAAAAAA9E7902C7F, 0x0000BFFC // P_3
data8 0x80000000000018E5, 0x0000BFFD // P_1
data8 0xb8aa3b295c17f0bc, 0x00004006 // inv_ln2_by_128
-
-
+//
+//
data8 0x3FA5555555554A9E // Q_2
data8 0x3F8111124F4DD9F9 // Q_3
data8 0x3FE0000000000000 // Q_0
@@ -580,20 +590,18 @@ data8 0x43e8000000000000 // Right shift constant for exp
data8 0xc9e3b39803f2f6af, 0x00003fb7 // ln2_by_128_lo
data8 0x0000000000000000 // pad to eliminate bank conflicts with pow_table_Q
data8 0x0000000000000000 // pad to eliminate bank conflicts with pow_table_Q
-ASM_SIZE_DIRECTIVE(pow_table_P)
+LOCAL_OBJECT_END(pow_table_P)
-pow_table_Q:
-ASM_TYPE_DIRECTIVE(pow_table_Q,@object)
+LOCAL_OBJECT_START(pow_table_Q)
data8 0x9249FE7F0DC423CF, 0x00003FFC // P_4
data8 0xCCCCCCCC4ED2BA7F, 0x00003FFC // P_2
data8 0xAAAAAAAAAAAAB505, 0x00003FFD // P_0
data8 0x3fe62e42fefa39e8, 0x3cccd5e4f1d9cc02 // log2 hi lo = +6.93147e-001
data8 0xb17217f7d1cf79ab, 0x00003ff7 // ln2_by_128_hi
-ASM_SIZE_DIRECTIVE(pow_table_Q)
+LOCAL_OBJECT_END(pow_table_Q)
-pow_Tt:
-ASM_TYPE_DIRECTIVE(pow_Tt,@object)
+LOCAL_OBJECT_START(pow_Tt)
data8 0x3f60040155d58800, 0x3c93bce0ce3ddd81 // log(1/frcpa(1+0/256))= +1.95503e-003
data8 0x3f78121214586a00, 0x3cb540e0a5cfc9bc // log(1/frcpa(1+1/256))= +5.87661e-003
data8 0x3f841929f9683200, 0x3cbdf1d57404da1f // log(1/frcpa(1+2/256))= +9.81362e-003
@@ -850,13 +858,12 @@ data8 0x3fe5f673c61a2ed0, 0x3caa385eef5f2789 // log(1/frcpa(1+252/256))= +6.863
data8 0x3fe6065bea385924, 0x3cb11624f165c5b4 // log(1/frcpa(1+253/256))= +6.88276e-001
data8 0x3fe6164bfa7cc068, 0x3cbad884f87073fa // log(1/frcpa(1+254/256))= +6.90222e-001
data8 0x3fe62643fecf9740, 0x3cb78c51da12f4df // log(1/frcpa(1+255/256))= +6.92171e-001
-ASM_SIZE_DIRECTIVE(pow_Tt)
+LOCAL_OBJECT_END(pow_Tt)
// Table 1 is 2^(index_1/128) where
// index_1 goes from 0 to 15
-pow_tbl1:
-ASM_TYPE_DIRECTIVE(pow_tbl1,@object)
+LOCAL_OBJECT_START(pow_tbl1)
data8 0x8000000000000000 , 0x00003FFF
data8 0x80B1ED4FD999AB6C , 0x00003FFF
data8 0x8164D1F3BC030773 , 0x00003FFF
@@ -873,13 +880,12 @@ data8 0x88980E8092DA8527 , 0x00003FFF
data8 0x8955EE03618E5FDD , 0x00003FFF
data8 0x8A14D575496EFD9A , 0x00003FFF
data8 0x8AD4C6452C728924 , 0x00003FFF
-ASM_SIZE_DIRECTIVE(pow_tbl1)
+LOCAL_OBJECT_END(pow_tbl1)
// Table 2 is 2^(index_1/8) where
// index_2 goes from 0 to 7
-pow_tbl2:
-ASM_TYPE_DIRECTIVE(pow_tbl2,@object)
+LOCAL_OBJECT_START(pow_tbl2)
data8 0x8000000000000000 , 0x00003FFF
data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF
data8 0x9837F0518DB8A96F , 0x00003FFF
@@ -888,372 +894,287 @@ data8 0xB504F333F9DE6484 , 0x00003FFF
data8 0xC5672A115506DADD , 0x00003FFF
data8 0xD744FCCAD69D6AF4 , 0x00003FFF
data8 0xEAC0C6E7DD24392F , 0x00003FFF
-ASM_SIZE_DIRECTIVE(pow_tbl2)
-
-.global powf
+LOCAL_OBJECT_END(pow_tbl2)
.section .text
-.proc powf
-.align 32
-
-powf:
+GLOBAL_LIBM_ENTRY(powf)
+// Get exponent of x. Will be used to calculate K.
{ .mfi
- alloc r32=ar.pfs,1,35,4,0
- fms.s1 POW_Xm1 = f8,f1,f1 // Will be used for r1 if x>0
- mov pow_GR_17ones = 0x1FFFF
+ getf.exp pow_GR_signexp_X = f8
+ fms.s1 POW_Xm1 = f8,f1,f1 // Will be used for r1 if x>0
+ mov pow_GR_17ones = 0x1FFFF
}
{ .mfi
-(p0) addl pow_AD_P = @ltoff(pow_table_P), gp
- fma.s1 POW_Xp1 = f8,f1,f1 // Will be used for r1 if x<0
+ addl pow_AD_P = @ltoff(pow_table_P), gp
+ fma.s1 POW_Xp1 = f8,f1,f1 // Will be used for r1 if x<0
nop.i 999
;;
}
-
-// Get exponent of x. Will be used to calculate K.
+// Get significand of x. Will be used to get index to fetch T, Tt.
{ .mfi
- getf.exp pow_GR_signexp_X = f8
- frcpa.s1 POW_B, p6 = f1,f8
+ getf.sig pow_GR_sig_X = f8
+ frcpa.s1 POW_B, p6 = f1,f8
nop.i 999
}
{ .mfi
ld8 pow_AD_P = [pow_AD_P]
- fma.s1 POW_NORM_X = f8,f1,f0
- mov pow_GR_FFF7 = 0xFFF7
+ fma.s1 POW_NORM_X = f8,f1,f0
+ mov pow_GR_exp_2tom8 = 0xFFF7
}
;;
-
-
-// Get significand of x. Will be used to get index to fetch T, Tt.
// p13 = TRUE ==> X is unorm
// DOUBLE 0x10033 exponent limit at which y is an integer
-// SINGLE 0x10016
{ .mfi
- getf.sig pow_GR_sig_X = f8
- fclass.m p13,p0 = f8, 0x0b // Test for x unorm
- addl pow_GR_10033 = 0x10033, r0
+ nop.m 999
+ fclass.m p13,p0 = f8, 0x0b // Test for x unorm
+ addl pow_GR_10033 = 0x10033, r0
}
{ .mfi
mov pow_GR_16ones = 0xFFFF
- fma.s1 POW_NORM_Y = f9,f1,f0
+ fma.s1 POW_NORM_Y = f9,f1,f0
nop.i 999
}
;;
-
// p14 = TRUE ==> X is ZERO
{ .mfi
adds pow_AD_Tt = pow_Tt - pow_table_P, pow_AD_P
- fclass.m p14,p15 = f8, 0x07
- and pow_GR_exp_X = pow_GR_signexp_X, pow_GR_17ones
+ fclass.m p14,p0 = f8, 0x07
+ and pow_GR_exp_X = pow_GR_signexp_X, pow_GR_17ones
}
{ .mfi
- adds pow_AD_Q = pow_table_Q - pow_table_P, pow_AD_P
+ adds pow_AD_Q = pow_table_Q - pow_table_P, pow_AD_P
nop.f 999
nop.i 999
}
;;
{ .mfi
- ldfe POW_P5 = [pow_AD_P], 16
- fcmp.lt.s1 p8,p9 = f8, f0 // Test for x<0
- shl pow_GR_offset = pow_GR_sig_X, 1
+ ldfe POW_P5 = [pow_AD_P], 16
+ fcmp.lt.s1 p8,p9 = f8, f0 // Test for x<0
+ nop.i 999
}
{ .mib
- ldfe POW_P4 = [pow_AD_Q], 16
- sub pow_GR_true_exp_X = pow_GR_exp_X, pow_GR_16ones
-(p13) br.cond.spnt L(POW_X_DENORM)
+ ldfe POW_P4 = [pow_AD_Q], 16
+ sub pow_GR_true_exp_X = pow_GR_exp_X, pow_GR_16ones
+(p13) br.cond.spnt POW_X_DENORM
}
;;
-
// Continue normal and denormal paths here
-L(POW_COMMON):
+POW_COMMON:
// p11 = TRUE ==> Y is a NAN
{ .mfi
- ldfe POW_P3 = [pow_AD_P], 16
- fclass.m.unc p11,p0 = f9, 0xc3
- shr.u pow_GR_offset = pow_GR_offset,56
+ ldfe POW_P3 = [pow_AD_P], 16
+ fclass.m p11,p0 = f9, 0xc3
+ nop.i 999
}
{ .mfi
- ldfe POW_P2 = [pow_AD_Q], 16
+ ldfe POW_P2 = [pow_AD_Q], 16
nop.f 999
- nop.i 999
+ mov pow_GR_y_zero = 0
}
;;
-
-
-// Compute xsq to decide later if |x|=1
-// p11 = TRUE ==> Y is a NaN
+// Note POW_Xm1 and POW_r1 are used interchangeably
{ .mfi
- setf.sig POW_int_K = pow_GR_true_exp_X
-(p15) fms.s1 POW_r = POW_B, POW_NORM_X,f1
- shladd pow_AD_Tt = pow_GR_offset, 4, pow_AD_Tt
+ alloc r32=ar.pfs,2,19,4,0
+ fms.s1 POW_r = POW_B, POW_NORM_X,f1
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p8) fnma.s1 POW_Xm1 = POW_Xp1,f1,f0
+ setf.sig POW_int_K = pow_GR_true_exp_X
+(p8) fnma.s1 POW_Xm1 = POW_Xp1,f1,f0
nop.i 999
}
;;
-
-
-// p12 = TRUE ==> X is ZERO and Y is ZERO
+// p12 = TRUE if Y is ZERO
+// Compute xsq to decide later if |x|=1
{ .mfi
- ldfe POW_P1 = [pow_AD_P], 16
-(p14) fclass.m.unc p12,p0 = f9, 0x07
- nop.i 999
+ ldfe POW_P1 = [pow_AD_P], 16
+ fclass.m p12,p0 = f9, 0x07
+ shl pow_GR_offset = pow_GR_sig_X, 1
}
{ .mfb
- ldfe POW_P0 = [pow_AD_Q], 16
+ ldfe POW_P0 = [pow_AD_Q], 16
fma.s1 POW_xsq = POW_NORM_X, POW_NORM_X, f0
-(p11) br.cond.spnt L(POW_Y_NAN)
+(p11) br.cond.spnt POW_Y_NAN // Branch if y=nan
}
;;
-
-.pred.rel "mutex",p8,p9
// Get exponent of |x|-1 to use in comparison to 2^-8
-{ .mmf
-(p8) getf.exp pow_GR_signexp_Xm1 = POW_Xp1
-(p9) getf.exp pow_GR_signexp_Xm1 = POW_Xm1
- fcvt.fx.s1 POW_int_Y = POW_NORM_Y
+{ .mfi
+ getf.exp pow_GR_signexp_Xm1 = POW_Xm1
+ fcvt.fx.s1 POW_int_Y = POW_NORM_Y
+ shr.u pow_GR_offset = pow_GR_offset,56
}
;;
-
// p11 = TRUE ==> X is a NAN
{ .mfi
ldfpd POW_log2_hi, POW_log2_lo = [pow_AD_Q], 16
- fclass.m.unc p11,p0 = f8, 0xc3
- nop.i 999
+ fclass.m p11,p0 = f8, 0xc3
+ shladd pow_AD_Tt = pow_GR_offset, 4, pow_AD_Tt
}
-{ .mib
- ldfpd POW_T, POW_Tt = [pow_AD_Tt], 16
- nop.i 999
-(p12) br.cond.spnt L(POW_X_0_Y_0)
+{ .mfi
+ ldfe POW_inv_log2_by_128 = [pow_AD_P], 16
+ fma.s1 POW_delta = f0,f0,f0 // delta=0 in case |x| near 1
+(p12) mov pow_GR_y_zero = 1
}
;;
-
-// p14 = TRUE ==> X is zero
-// p15 = TRUE ==> X is zero AND Y is negative
-// p10 = TRUE ==> X is zero AND Y is >= zero
{ .mfi
- ldfe POW_inv_log2_by_128 = [pow_AD_P], 16
-(p14) fcmp.lt.unc.s1 p15, p10 = f9,f0
- nop.i 999
+ ldfpd POW_Q2, POW_Q3 = [pow_AD_P], 16
+ fma.s1 POW_G = f0,f0,f0 // G=0 in case |x| near 1
+ and pow_GR_exp_Xm1 = pow_GR_signexp_Xm1, pow_GR_17ones
}
-{ .mfi
- nop.m 999
- nop.f 999
- and pow_GR_exp_Xm1 = pow_GR_signexp_Xm1, pow_GR_17ones
-}
;;
-
// Determine if we will use the |x| near 1 path (p6) or normal path (p7)
-// p12 = TRUE ==> X is a NAN and Y is a zero
-// p13 = TRUE ==> X is a NAN and Y is anything else
{ .mfi
- getf.exp pow_GR_signexp_Y = POW_NORM_Y
-(p11) fclass.m.unc p12,p13 = f9, 0x07
- cmp.lt.unc p6,p7 = pow_GR_exp_Xm1, pow_GR_FFF7
+ getf.exp pow_GR_signexp_Y = POW_NORM_Y
+ nop.f 999
+ cmp.lt p6,p7 = pow_GR_exp_Xm1, pow_GR_exp_2tom8
}
-{ .mfi
- ldfpd POW_Q2, POW_Q3 = [pow_AD_P], 16
- fma.s1 POW_rsq = POW_r, POW_r,f0
- nop.i 999
-;;
+{ .mfb
+ ldfpd POW_T, POW_Tt = [pow_AD_Tt], 16
+ fma.s1 POW_rsq = POW_r, POW_r,f0
+(p11) br.cond.spnt POW_X_NAN // Branch if x=nan and y not nan
}
+;;
// If on the x near 1 path, assign r1 to r and r1*r1 to rsq
{ .mfi
- ldfpd POW_Q0_half, POW_Q1 = [pow_AD_P], 16
-(p6) fma.s1 POW_r = POW_r1, f1, f0
- nop.i 999
-}
-{ .mfi
- nop.m 999
-(p6) fma.s1 POW_rsq = POW_r1, POW_r1, f0
+ ldfpd POW_Q0_half, POW_Q1 = [pow_AD_P], 16
+(p6) fma.s1 POW_r = POW_r1, f1, f0
nop.i 999
-;;
-}
-
-
-{ .mfi
- ldfpd POW_Q4, POW_RSHF = [pow_AD_P], 16
-(p7) fma.s1 POW_v6 = POW_r, POW_P5, POW_P4
- and pow_GR_exp_Y = pow_GR_signexp_Y, pow_GR_17ones
}
{ .mfb
nop.m 999
-(p6) fma.s1 POW_v6 = POW_r1, POW_P5, POW_P4
-(p12) br.cond.spnt L(POW_X_NAN_Y_0)
+(p6) fma.s1 POW_rsq = POW_r1, POW_r1, f0
+(p14) br.cond.spnt POW_X_0 // Branch if x zero and y not nan
}
;;
-
{ .mfi
- nop.m 999
-(p7) fma.s1 POW_v4 = POW_P3, POW_r, POW_P2
- andcm pow_GR_sign_Y = pow_GR_signexp_Y, pow_GR_17ones
+ ldfpd POW_Q4, POW_RSHF = [pow_AD_P], 16
+(p7) fma.s1 POW_v6 = POW_r, POW_P5, POW_P4
+ nop.i 999
}
-{ .mfb
+{ .mfi
nop.m 999
-(p6) fma.s1 POW_v4 = POW_P3, POW_r1, POW_P2
-(p12) br.cond.spnt L(POW_X_NAN_Y_0)
+(p6) fma.s1 POW_v6 = POW_r1, POW_P5, POW_P4
+ nop.i 999
}
;;
{ .mfi
nop.m 999
- fcvt.xf POW_K = POW_int_K
+(p7) fma.s1 POW_v4 = POW_P3, POW_r, POW_P2
nop.i 999
}
-{ .mfb
- nop.m 999
-(p13) fma.s f8 = f8,f1,f0
-(p13) br.ret.spnt b0 // Exit if x nan, y anything but zero
-}
-;;
-
-// p10 = TRUE ==> X is zero AND Y is positive
-// p8 = TRUE ==> X is zero AND Y is outside integer range (treat as even int)
-// return +0
-// p9 = TRUE ==> X is zero AND Y is within integer range (may not be integer)
{ .mfi
-(p10) cmp.gt.unc p8,p9 = pow_GR_exp_Y, pow_GR_10033
-(p6) fmerge.s POW_delta = f0,f0
+ nop.m 999
+(p6) fma.s1 POW_v4 = POW_P3, POW_r1, POW_P2
nop.i 999
}
+;;
+
{ .mfi
nop.m 999
-(p6) fma.s1 POW_G = f0,f0,f0
+ fcvt.xf POW_K = POW_int_K
nop.i 999
}
;;
{ .mfi
- getf.sig pow_GR_sig_int_Y = POW_int_Y
- fnma.s1 POW_twoV = POW_NORM_Y, POW_rsq,f0
- nop.i 999
+ getf.sig pow_GR_sig_int_Y = POW_int_Y
+ fnma.s1 POW_twoV = POW_NORM_Y, POW_rsq,f0
+ and pow_GR_exp_Y = pow_GR_signexp_Y, pow_GR_17ones
}
-{ .mfi
- nop.m 999
- fma.s1 POW_U = POW_NORM_Y,POW_r,f0
- nop.i 999
+{ .mfb
+ andcm pow_GR_sign_Y = pow_GR_signexp_Y, pow_GR_17ones
+ fma.s1 POW_U = POW_NORM_Y,POW_r,f0
+(p12) br.cond.spnt POW_Y_0 // Branch if y=zero, x not zero or nan
}
;;
+// p11 = TRUE ==> X is NEGATIVE but not inf
{ .mfi
- ldfe POW_log2_by_128_lo = [pow_AD_P], 16
-(p6) fma.s1 POW_v2 = POW_P1, POW_r1, POW_P0
+ ldfe POW_log2_by_128_lo = [pow_AD_P], 16
+ fclass.m p11,p0 = f8, 0x1a
nop.i 999
}
{ .mfi
- ldfe POW_log2_by_128_hi = [pow_AD_Q], 16
-(p7) fma.s1 POW_v2 = POW_P1, POW_r, POW_P0
+ ldfe POW_log2_by_128_hi = [pow_AD_Q], 16
+ fma.s1 POW_v2 = POW_P1, POW_r, POW_P0
nop.i 999
}
;;
-
{ .mfi
nop.m 999
- fcvt.xf POW_float_int_Y = POW_int_Y
+ fcvt.xf POW_float_int_Y = POW_int_Y
nop.i 999
}
{ .mfi
nop.m 999
- fma.s1 POW_v3 = POW_v6, POW_rsq, POW_v4
- adds pow_AD_tbl1 = pow_tbl1 - pow_Tt, pow_AD_Q
+ fma.s1 POW_v3 = POW_v6, POW_rsq, POW_v4
+ adds pow_AD_tbl1 = pow_tbl1 - pow_Tt, pow_AD_Q
}
;;
{ .mfi
nop.m 999
-(p7) fma.s1 POW_delta = POW_K, POW_log2_lo, POW_Tt
+(p7) fma.s1 POW_delta = POW_K, POW_log2_lo, POW_Tt
nop.i 999
}
{ .mfi
nop.m 999
-(p7) fma.s1 POW_G = POW_K, POW_log2_hi, POW_T
- adds pow_AD_tbl2 = pow_tbl2 - pow_tbl1, pow_AD_tbl1
+(p7) fma.s1 POW_G = POW_K, POW_log2_hi, POW_T
+ adds pow_AD_tbl2 = pow_tbl2 - pow_tbl1, pow_AD_tbl1
}
;;
-
{ .mfi
nop.m 999
- fms.s1 POW_e2 = POW_NORM_Y, POW_r, POW_U
+ fms.s1 POW_e2 = POW_NORM_Y, POW_r, POW_U
nop.i 999
}
{ .mfi
nop.m 999
- fma.s1 POW_Z2 = POW_twoV, POW_Q0_half, POW_U
+ fma.s1 POW_Z2 = POW_twoV, POW_Q0_half, POW_U
nop.i 999
}
;;
-// p11 = TRUE ==> X is NEGATIVE
-// p8 = TRUE ==> X is zero AND Y is outside intger range (treat as even int)
-// return +0
{ .mfi
nop.m 999
- fclass.m.unc p11,p0 = f8, 0x1a
- nop.i 999
-}
-{ .mfb
- nop.m 999
-(p8) fma.s f8 = f0,f0,f0
-(p8) br.ret.spnt b0
-}
-;;
-
-{ .mfi
- nop.m 999
- fma.s1 POW_Yrcub = POW_rsq, POW_U, f0
+ fma.s1 POW_Yrcub = POW_rsq, POW_U, f0
nop.i 999
}
-{ .mfi
+{ .mfi
nop.m 999
- fma.s1 POW_p = POW_rsq, POW_v3, POW_v2
+ fma.s1 POW_p = POW_rsq, POW_v3, POW_v2
nop.i 999
}
;;
-
-// p11 = TRUE ==> X is NEGATIVE
-// p12 = TRUE ==> X is NEGATIVE AND Y already int
+// p11 = TRUE ==> X is NEGATIVE but not inf
+// p12 = TRUE ==> X is NEGATIVE AND Y already even int
// p13 = TRUE ==> X is NEGATIVE AND Y possible int
{ .mfi
nop.m 999
- fma.s1 POW_Z1 = POW_NORM_Y, POW_G, f0
-(p11) cmp.ge.unc p12,p13 = pow_GR_exp_Y, pow_GR_10033
+ fma.s1 POW_Z1 = POW_NORM_Y, POW_G, f0
+(p11) cmp.gt.unc p12,p13 = pow_GR_exp_Y, pow_GR_10033
}
{ .mfi
nop.m 999
- fma.s1 POW_e3 = POW_NORM_Y, POW_delta, f0
- nop.i 999
-}
-;;
-
-// p9 = TRUE ==> X is zero AND Y is within integer range (may not be integer)
-// p6 = TRUE ==> X is zero AND Y is an integer (may be even or odd)
-// p7 = TRUE ==> X is zero AND Y is NOT an integer, return +0
-{ .mfi
- nop.m 999
-(p9) fcmp.eq.unc.s1 p6,p7 = POW_float_int_Y, POW_NORM_Y
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 POW_Gpr = POW_G, f1, POW_r
+ fma.s1 POW_Gpr = POW_G, f1, POW_r
nop.i 999
}
;;
@@ -1266,24 +1187,14 @@ L(POW_COMMON):
}
{ .mfi
nop.m 999
- fms.s1 POW_UmZ2 = POW_U, f1, POW_Z2
+ fms.s1 POW_UmZ2 = POW_U, f1, POW_Z2
nop.i 999
}
;;
-
-// If x=0 and y>0, test y and flag denormal
-// p6 = TRUE ==> X is zero AND Y is an integer (may be even or odd)
-// p8 = TRUE ==> X is zero AND Y is an odd integer
-// p9 = TRUE ==> X is zero AND Y is an even integer
-{ .mfi
- nop.m 999
-(p10) fcmp.eq.s0 p15,p0 = f9,f0
-(p6) tbit.nz.unc p8,p9 = pow_GR_sig_int_Y,0
-}
{ .mfi
nop.m 999
- fma.s1 POW_Z3 = POW_p, POW_Yrcub, f0
+ fma.s1 POW_Z3 = POW_p, POW_Yrcub, f0
nop.i 999
}
;;
@@ -1291,7 +1202,7 @@ L(POW_COMMON):
// By adding RSHF (1.1000...*2^63) we put integer part in rightmost significand
{ .mfi
nop.m 999
- fms.s1 POW_e1 = POW_NORM_Y, POW_G, POW_Z1
+ fms.s1 POW_e1 = POW_NORM_Y, POW_G, POW_Z1
nop.i 999
}
{ .mfi
@@ -1301,81 +1212,60 @@ L(POW_COMMON):
}
;;
+// p13 = TRUE ==> X is NEGATIVE AND Y possible int
+// p10 = TRUE ==> X is NEG and Y is an int
+// p12 = TRUE ==> X is NEG and Y is not an int
{ .mfi
nop.m 999
-(p7) fma.s f8 = f0,f0,f0 // Result +0 if x zero and y not integer
- nop.i 999
+(p13) fcmp.eq.unc.s1 p10,p12 = POW_float_int_Y, POW_NORM_Y
+ mov pow_GR_xneg_yodd = 0
}
-{ .mfb
+{ .mfi
nop.m 999
- fma.s1 POW_Y_Gpr = POW_NORM_Y, POW_Gpr, f0
-(p8) br.ret.spnt b0 // Exit if x zero and y odd integer
+ fma.s1 POW_Y_Gpr = POW_NORM_Y, POW_Gpr, f0
+ nop.i 999
}
;;
// By subtracting RSHF we get rounded integer POW_N2float
-// p15 = TRUE ==> X_0_Y_NEG
{ .mfi
nop.m 999
fms.s1 POW_N2float = POW_W2, f1, POW_RSHF
nop.i 999
}
-{ .mfb
+{ .mfi
nop.m 999
- fma.s1 POW_UmZ2pV = POW_twoV,POW_Q0_half,POW_UmZ2
-(p15) br.cond.spnt L(POW_X_0_Y_NEG)
+ fma.s1 POW_UmZ2pV = POW_twoV,POW_Q0_half,POW_UmZ2
+ nop.i 999
}
;;
-
-
{ .mfi
nop.m 999
- fma.s1 POW_Z3sq = POW_Z3, POW_Z3, f0
+ fma.s1 POW_Z3sq = POW_Z3, POW_Z3, f0
nop.i 999
}
-{ .mfb
+{ .mfi
nop.m 999
- fma.s1 POW_v4 = POW_Z3, POW_Q3, POW_Q2
-(p7) br.ret.spnt b0 // Exit if x zero and y not an integer
+ fma.s1 POW_v4 = POW_Z3, POW_Q3, POW_Q2
+ nop.i 999
}
;;
-
-
// Extract rounded integer from rightmost significand of POW_W2
// By subtracting RSHF we get rounded integer POW_N1float
{ .mfi
- getf.sig pow_GR_int_W2 = POW_W2
+ getf.sig pow_GR_int_W2 = POW_W2
fms.s1 POW_N1float = POW_W1, f1, POW_RSHF
nop.i 999
}
{ .mfi
nop.m 999
- fma.s1 POW_v2 = POW_Z3, POW_Q1, POW_Q0_half
- nop.i 999
-}
-;;
-
-
-
-
-// p13 = TRUE ==> X is NEGATIVE AND Y possible int
-// p10 = TRUE ==> X is NEG and Y is an int
-// p12 = TRUE ==> X is NEG and Y is not an int
-{ .mfi
- nop.m 999
-(p13) fcmp.eq.unc.s1 p10,p12 = POW_float_int_Y, POW_NORM_Y
+ fma.s1 POW_v2 = POW_Z3, POW_Q1, POW_Q0_half
nop.i 999
}
-{ .mfb
- nop.m 999
-(p9) fma.s f8 = f0,f0,f0 // Result +0 if x zero and y even integer
-(p9) br.ret.spnt b0 // Exit if x zero and y even integer
-}
;;
-
{ .mfi
nop.m 999
fnma.s1 POW_s2 = POW_N2float, POW_log2_by_128_hi, POW_Z2
@@ -1383,7 +1273,7 @@ L(POW_COMMON):
}
{ .mfi
nop.m 999
- fma.s1 POW_e2 = POW_e2,f1,POW_UmZ2pV
+ fma.s1 POW_e2 = POW_e2,f1,POW_UmZ2pV
nop.i 999
}
;;
@@ -1391,278 +1281,250 @@ L(POW_COMMON):
// Extract rounded integer from rightmost significand of POW_W1
// Test if x inf
{ .mfi
- getf.sig pow_GR_int_W1 = POW_W1
- fclass.m.unc p15,p0 = POW_NORM_X, 0x23
+ getf.sig pow_GR_int_W1 = POW_W1
+ fclass.m p15,p0 = POW_NORM_X, 0x23
nop.i 999
}
{ .mfb
nop.m 999
fnma.s1 POW_f2 = POW_N2float, POW_log2_by_128_lo, f1
-(p12) br.cond.spnt L(POW_X_NEG_Y_NONINT) // Branch if x neg, y not integer
+(p12) br.cond.spnt POW_X_NEG_Y_NONINT // Branch if x neg, y not integer
}
;;
+// p11 = TRUE ==> X is +1.0
// p12 = TRUE ==> X is NEGATIVE AND Y is an odd integer
{ .mfi
- getf.exp pow_GR_signexp_Y_Gpr = POW_Y_Gpr
- fma.s1 POW_v3 = POW_Z3sq, POW_Q4, POW_v4
-(p10) tbit.nz.unc p12,p0 = pow_GR_sig_int_Y,0
+ getf.exp pow_GR_signexp_Y_Gpr = POW_Y_Gpr
+ fcmp.eq.s1 p11,p0 = POW_NORM_X, f1
+(p10) tbit.nz.unc p12,p0 = pow_GR_sig_int_Y,0
+}
+{ .mfi
+ nop.m 999
+ fma.s1 POW_v3 = POW_Z3sq, POW_Q4, POW_v4
+ nop.i 999
}
;;
-
{ .mfi
- add pow_GR_int_N = pow_GR_int_W1, pow_GR_int_W2
+ nop.m 999
fnma.s1 POW_f1 = POW_N1float, POW_log2_by_128_lo, f1
nop.i 999
}
{ .mfb
nop.m 999
fnma.s1 POW_s1 = POW_N1float, POW_log2_by_128_hi, POW_Z1
-(p15) br.cond.spnt L(POW_X_INF)
+(p15) br.cond.spnt POW_X_INF
}
;;
-
// Test x and y and flag denormal
{ .mfi
- and pow_GR_index1 = 0x0f, pow_GR_int_N
+ nop.m 999
fcmp.eq.s0 p15,p0 = f8,f9
- shr r2 = pow_GR_int_N, 7
+ nop.i 999
}
{ .mfi
- and pow_GR_exp_Y_Gpr = pow_GR_signexp_Y_Gpr, pow_GR_17ones
- nop.f 999
- and pow_GR_index2 = 0x70, pow_GR_int_N
+ nop.m 999
+ fma.s1 POW_e3 = POW_NORM_Y, POW_delta, f0
+ nop.i 999
}
;;
-
-
{ .mfi
- shladd pow_AD_T1 = pow_GR_index1, 4, pow_AD_tbl1
+ nop.m 999
fcmp.eq.s1 p7,p0 = POW_NORM_Y, f1 // Test for y=1.0
- sub pow_GR_true_exp_Y_Gpr = pow_GR_exp_Y_Gpr, pow_GR_16ones
+ nop.i 999
}
{ .mfi
- addl pow_int_GR_M = 0xFFFF, r2
- fma.s1 POW_e12 = POW_e1,f1,POW_e2
- add pow_AD_T2 = pow_AD_tbl2, pow_GR_index2
+ nop.m 999
+ fma.s1 POW_e12 = POW_e1,f1,POW_e2
+ nop.i 999
}
;;
-
-{ .mmi
- ldfe POW_T1 = [pow_AD_T1],16
- setf.exp POW_2M = pow_int_GR_M
- andcm pow_GR_sign_Y_Gpr = pow_GR_signexp_Y_Gpr, pow_GR_17ones
+{ .mfi
+ add pow_GR_int_N = pow_GR_int_W1, pow_GR_int_W2
+(p11) fma.s.s0 f8 = f1,f1,f0 // If x=1, result is +1
+ nop.i 999
+}
+{ .mib
+(p12) mov pow_GR_xneg_yodd = 1
+ nop.i 999
+(p11) br.ret.spnt b0 // Early exit if x=1.0, result is +1
}
;;
-
-{ .mfb
- ldfe POW_T2 = [pow_AD_T2],16
- fma.s1 POW_q = POW_Z3sq, POW_v3, POW_v2
+{ .mfi
+ and pow_GR_index1 = 0x0f, pow_GR_int_N
+ fma.s1 POW_q = POW_Z3sq, POW_v3, POW_v2
+ shr pow_int_GR_M = pow_GR_int_N, 7 // M = N/128
+}
+{ .mib
+ and pow_GR_index2 = 0x70, pow_GR_int_N
+ nop.i 999
(p7) br.ret.spnt b0 // Early exit if y=1.0, result is x
}
;;
-
-// double: p8 TRUE ==> |Y(G + r)| >= 10
-// single: p8 TRUE ==> |Y(G + r)| >= 7
-
-// double
-// -2^10 -2^9 2^9 2^10
-// -----+-----+----+ ... +-----+-----+-----
-// p8 | p9 | p8
-// | | p10 | |
-// single
-// -2^7 -2^6 2^6 2^7
-// -----+-----+----+ ... +-----+-----+-----
-// p8 | p9 | p8
-// | | p10 | |
-
-
{ .mfi
-(p0) cmp.le.unc p8,p9 = 7, pow_GR_true_exp_Y_Gpr
- fma.s1 POW_s = POW_s1, f1, POW_s2
- nop.i 999
+ shladd pow_AD_T1 = pow_GR_index1, 4, pow_AD_tbl1
+ fma.s1 POW_s = POW_s1, f1, POW_s2
+ add pow_int_GR_M = pow_GR_16ones, pow_int_GR_M
}
{ .mfi
- nop.m 999
- fma.s1 POW_f12 = POW_f1, POW_f2,f0
+ add pow_AD_T2 = pow_AD_tbl2, pow_GR_index2
+ fma.s1 POW_f12 = POW_f1, POW_f2,f0
nop.i 999
}
;;
-
-{ .mfi
+{ .mmf
+ ldfe POW_T1 = [pow_AD_T1]
+ ldfe POW_T2 = [pow_AD_T2]
nop.f 999
-(p9) cmp.le.unc p0,p10 = 6, pow_GR_true_exp_Y_Gpr
}
;;
-
-
-{ .mfb
- nop.m 999
- fma.s1 POW_e123 = POW_e12, f1, POW_e3
-(p8) br.cond.spnt L(POW_OVER_UNDER_X_NOT_INF)
+{ .mfi
+ setf.exp POW_2M = pow_int_GR_M
+ fma.s1 POW_e123 = POW_e12, f1, POW_e3
+ and pow_GR_exp_Y_Gpr = pow_GR_signexp_Y_Gpr, pow_GR_17ones
}
;;
-
-{ .mmf
- fma.s1 POW_q = POW_Z3sq, POW_q, POW_Z3
+{ .mfi
+ nop.m 999
+ fma.s1 POW_q = POW_Z3sq, POW_q, POW_Z3
+ sub pow_GR_true_exp_Y_Gpr = pow_GR_exp_Y_Gpr, pow_GR_16ones
}
;;
+// p8 TRUE ==> |Y(G + r)| >= 7
+// single
+// -2^7 -2^6 2^6 2^7
+// -----+-----+----+ ... +-----+-----+-----
+// p8 | p9 | p8
+// | | p10 | |
+
+// Form signexp of constants to indicate overflow
{ .mfi
- nop.m 999
- fma.s1 POW_ssq = POW_s, POW_s, f0
- nop.i 999
+ mov pow_GR_big_pos = 0x1007f
+ fma.s1 POW_ssq = POW_s, POW_s, f0
+ cmp.le p8,p9 = 7, pow_GR_true_exp_Y_Gpr
}
{ .mfi
- nop.m 999
- fma.s1 POW_v4 = POW_s, POW_Q3, POW_Q2
- nop.i 999
+ mov pow_GR_big_neg = 0x3007f
+ fma.s1 POW_v4 = POW_s, POW_Q3, POW_Q2
+ andcm pow_GR_sign_Y_Gpr = pow_GR_signexp_Y_Gpr, pow_GR_17ones
}
;;
+// Form big positive and negative constants to test for possible overflow
{ .mfi
- nop.m 999
- fma.s1 POW_v2 = POW_s, POW_Q1, POW_Q0_half
- nop.i 999
+ setf.exp POW_big_pos = pow_GR_big_pos
+ fma.s1 POW_v2 = POW_s, POW_Q1, POW_Q0_half
+(p9) cmp.le.unc p0,p10 = 6, pow_GR_true_exp_Y_Gpr
}
-{ .mfi
- nop.m 999
- fma.s1 POW_1ps = f1,f1,POW_s
- nop.i 999
+{ .mfb
+ setf.exp POW_big_neg = pow_GR_big_neg
+ fma.s1 POW_1ps = f1,f1,POW_s
+(p8) br.cond.spnt POW_OVER_UNDER_X_NOT_INF
}
;;
+// f123 = f12*(e123+1) = f12*e123+f12
{ .mfi
nop.m 999
- fma.s1 POW_f3 = POW_e123,f1,f1
+ fma.s1 POW_f123 = POW_e123,POW_f12,POW_f12
nop.i 999
}
;;
{ .mfi
nop.m 999
- fma.s1 POW_T1T2 = POW_T1, POW_T2, f0
+ fma.s1 POW_T1T2 = POW_T1, POW_T2, f0
nop.i 999
}
-;;
-
{ .mfi
nop.m 999
- fma.s1 POW_v3 = POW_ssq, POW_Q4, POW_v4
- nop.i 999
+ fma.s1 POW_v3 = POW_ssq, POW_Q4, POW_v4
+ cmp.ne p12,p13 = pow_GR_xneg_yodd, r0
}
;;
{ .mfi
nop.m 999
- fma.s1 POW_v21ps = POW_ssq, POW_v2, POW_1ps
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 POW_s4 = POW_ssq, POW_ssq, f0
+ fma.s1 POW_2Mqp1 = POW_2M, POW_q, POW_2M
nop.i 999
}
;;
{ .mfi
nop.m 999
- fma.s1 POW_f123 = POW_f12, POW_f3, f0
+ fma.s1 POW_v21ps = POW_ssq, POW_v2, POW_1ps
nop.i 999
}
-;;
-
{ .mfi
nop.m 999
- fma.s1 POW_A = POW_2M, POW_T1T2, f0
+ fma.s1 POW_s4 = POW_ssq, POW_ssq, f0
nop.i 999
}
;;
-
-
{ .mfi
nop.m 999
-(p12) fmerge.s POW_f123 = f8,POW_f123 // if x neg, y odd int
+(p12) fnma.s1 POW_A = POW_T1T2, POW_f123, f0
nop.i 999
}
{ .mfi
nop.m 999
-// fma.s1 POW_es = POW_ssq, POW_v3, POW_v2
+(p13) fma.s1 POW_A = POW_T1T2, POW_f123, f0
nop.i 999
}
;;
{ .mfi
nop.m 999
- fma.s1 POW_es = POW_s4, POW_v3, POW_v21ps
+ fma.s1 POW_es = POW_s4, POW_v3, POW_v21ps
nop.i 999
}
-;;
-
-
{ .mfi
nop.m 999
- fma.s1 POW_A = POW_A, POW_f123, f0
- nop.i 999
-}
-{ .mfi
- nop.m 999
-// fma.s1 POW_es = POW_es, POW_ssq, POW_1ps
+ fma.s1 POW_A = POW_A, POW_2Mqp1, f0
nop.i 999
}
;;
-
+// Dummy op to set inexact
{ .mfi
nop.m 999
- fma.s1 POW_A = POW_A, POW_es,f0
+ fma.s0 POW_tmp = POW_2M, POW_q, POW_2M
nop.i 999
}
;;
-
-
{ .mfb
nop.m 999
-(p10) fma.s f8 = POW_A, POW_q, POW_A
-(p10) br.ret.sptk b0
+ fma.s.s0 f8 = POW_A, POW_es, f0
+(p10) br.ret.sptk b0 // Exit main branch if no over/underflow
}
;;
-
-
-
-
// POSSIBLE_OVER_UNDER
-// p6 = TRUE ==> Y negative
+// p6 = TRUE ==> Y_Gpr negative
+// Result is already computed. We just need to know if over/underflow occurred.
-{ .mfi
- nop.m 999
- fmerge.s POW_abs_A = f0, POW_A
- cmp.eq.unc p0,p6 = pow_GR_sign_Y, r0
-}
-;;
-
-{ .mib
- nop.m 999
- nop.i 999
-(p6) br.cond.spnt L(POW_POSSIBLE_UNDER)
+{ .mfb
+ cmp.eq p0,p6 = pow_GR_sign_Y_Gpr, r0
+ nop.f 999
+(p6) br.cond.spnt POW_POSSIBLE_UNDER
}
;;
// POSSIBLE_OVER
-// We got an answer.
+// We got an answer.
// overflow is a possibility, not a certainty
@@ -1692,21 +1554,20 @@ L(POW_COMMON):
// RN RN
// RZ
-
// Put in s2 (td set, wre set)
{ .mfi
- mov pow_GR_gt_ln = 0x1007f
+ nop.m 999
fsetc.s2 0x7F,0x42
- nop.i 999
+ nop.i 999
}
;;
-
{ .mfi
- setf.exp POW_gt_pln = pow_GR_gt_ln
- fma.s.s2 POW_wre_urm_f8 = POW_abs_A, POW_q, POW_abs_A
- nop.i 999 ;;
+ nop.m 999
+ fma.s.s2 POW_wre_urm_f8 = POW_A, POW_es, f0
+ nop.i 999
}
+;;
// Return s2 to default
{ .mfi
@@ -1716,31 +1577,30 @@ L(POW_COMMON):
}
;;
-
// p7 = TRUE ==> yes, we have an overflow
{ .mfi
nop.m 999
- fcmp.ge.unc.s1 p7, p0 = POW_wre_urm_f8, POW_gt_pln
+ fcmp.ge.s1 p7, p8 = POW_wre_urm_f8, POW_big_pos
nop.i 999
}
;;
-
-
-{ .mfb
-(p7) mov pow_GR_tag = 30
- fma.s f8 = POW_A, POW_q, POW_A
-(p7) br.cond.spnt __libm_error_region
+{ .mfi
+ nop.m 999
+(p8) fcmp.le.s1 p7, p0 = POW_wre_urm_f8, POW_big_neg
+ nop.i 999
}
-{ .mfb
- nop.m 999
- nop.f 999
-(p0) br.ret.sptk b0
+;;
+
+{ .mbb
+(p7) mov pow_GR_tag = 30
+(p7) br.cond.spnt __libm_error_region // Branch if overflow
+ br.ret.sptk b0 // Exit if did not overflow
}
;;
-L(POW_POSSIBLE_UNDER):
+POW_POSSIBLE_UNDER:
// We got an answer. input was < -2^9 but > -2^10 (double)
// We got an answer. input was < -2^6 but > -2^7 (float)
// underflow is a possibility, not a certainty
@@ -1763,124 +1623,250 @@ L(POW_POSSIBLE_UNDER):
// 0.1...11 2^-3ffe (biased, 1)
// largest dn smallest normal
-
// Put in s2 (td set, ftz set)
{ .mfi
nop.m 999
fsetc.s2 0x7F,0x41
- nop.i 999
+ nop.i 999
}
;;
-
-
{ .mfi
nop.m 999
- fma.s.s2 POW_ftz_urm_f8 = POW_A, POW_q, POW_A
+ fma.s.s2 POW_ftz_urm_f8 = POW_A, POW_es, f0
nop.i 999
}
;;
-
// Return s2 to default
{ .mfi
nop.m 999
fsetc.s2 0x7F,0x40
- nop.i 999
+ nop.i 999
}
;;
-
// p7 = TRUE ==> yes, we have an underflow
{ .mfi
nop.m 999
- fcmp.eq.unc.s1 p7, p0 = POW_ftz_urm_f8, f0
- nop.i 999
+ fcmp.eq.s1 p7, p0 = POW_ftz_urm_f8, f0
+ nop.i 999
}
;;
+{ .mbb
+(p7) mov pow_GR_tag = 31
+(p7) br.cond.spnt __libm_error_region // Branch if underflow
+ br.ret.sptk b0 // Exit if did not underflow
+}
+;;
+POW_X_DENORM:
+// Here if x unorm. Use the NORM_X for getf instructions, and then back
+// to normal path
+{ .mfi
+ getf.exp pow_GR_signexp_X = POW_NORM_X
+ nop.f 999
+ nop.i 999
+}
+;;
+{ .mmi
+ getf.sig pow_GR_sig_X = POW_NORM_X
+;;
+ and pow_GR_exp_X = pow_GR_signexp_X, pow_GR_17ones
+ nop.i 999
+}
+;;
+
+{ .mib
+ sub pow_GR_true_exp_X = pow_GR_exp_X, pow_GR_16ones
+ nop.i 999
+ br.cond.sptk POW_COMMON
+}
+;;
+
+POW_X_0:
+// Here if x=0 and y not nan
+//
+// We have the following cases:
+// p6 x=0 and y>0 and is an integer (may be even or odd)
+// p7 x=0 and y>0 and is NOT an integer, return +0
+// p8 x=0 and y>0 and so big as to always be an even integer, return +0
+// p9 x=0 and y>0 and may not be integer
+// p10 x=0 and y>0 and is an odd integer, return x
+// p11 x=0 and y>0 and is an even integer, return +0
+// p12 used in dummy fcmp to set denormal flag if y=unorm
+// p13 x=0 and y>0
+// p14 x=0 and y=0, branch to code for calling error handling
+// p15 x=0 and y<0, branch to code for calling error handling
+//
+{ .mfi
+ getf.sig pow_GR_sig_int_Y = POW_int_Y // Get signif of int_Y
+ fcmp.lt.s1 p15,p13 = f9, f0 // Test for y<0
+ and pow_GR_exp_Y = pow_GR_signexp_Y, pow_GR_17ones
+}
+{ .mfb
+ cmp.ne p14,p0 = pow_GR_y_zero,r0 // Test for y=0
+ fcvt.xf POW_float_int_Y = POW_int_Y
+(p14) br.cond.spnt POW_X_0_Y_0 // Branch if x=0 and y=0
+}
+;;
+// If x=0 and y>0, test y and flag denormal
{ .mfb
-(p7) mov pow_GR_tag = 31
- fma.s f8 = POW_A, POW_q, POW_A
-(p7) br.cond.spnt __libm_error_region
+(p13) cmp.gt.unc p8,p9 = pow_GR_exp_Y, pow_GR_10033 // Test y +big = even int
+(p13) fcmp.eq.s0 p12,p0 = f9,f0 // If x=0, y>0 dummy op to flag denormal
+(p15) br.cond.spnt POW_X_0_Y_NEG // Branch if x=0 and y<0
}
;;
+// Here if x=0 and y>0
+{ .mfi
+ nop.m 999
+(p9) fcmp.eq.unc.s1 p6,p7 = POW_float_int_Y, POW_NORM_Y // Test y=int
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+(p8) fma.s.s0 f8 = f0,f0,f0 // If x=0, y>0 and large even int, return +0
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p7) fma.s.s0 f8 = f0,f0,f0 // Result +0 if x=0 and y>0 and not integer
+(p6) tbit.nz.unc p10,p11 = pow_GR_sig_int_Y,0 // If y>0 int, test y even/odd
+}
+;;
+// Note if x=0, y>0 and odd integer, just return x
{ .mfb
nop.m 999
- nop.f 999
- br.ret.sptk b0
+(p11) fma.s.s0 f8 = f0,f0,f0 // Result +0 if x=0 and y even integer
+ br.ret.sptk b0 // Exit if x=0 and y>0
}
;;
+POW_X_0_Y_0:
+// When X is +-0 and Y is +-0, IEEE returns 1.0
+// We call error support with this value
-L(POW_X_DENORM):
-// Here if x unorm. Use the NORM_X for getf instructions, and the back
-// to normal path
-{ .mfi
- getf.exp pow_GR_signexp_X = POW_NORM_X
- nop.f 999
- nop.i 999
+{ .mfb
+ mov pow_GR_tag = 32
+ fma.s.s0 f8 = f1,f1,f0
+ br.cond.sptk __libm_error_region
}
;;
+POW_X_0_Y_NEG:
+// When X is +-0 and Y is negative, IEEE returns
+// X Y answer
+// +0 -odd int +inf
+// -0 -odd int -inf
+
+// +0 !-odd int +inf
+// -0 !-odd int +inf
+
+// p6 == Y is a floating point number outside the integer range.
+// Hence it is an integer and is even.
+// return +inf
+
+// p7 == Y is a floating point number within the integer range.
+// p9 == (int_Y = NORM_Y), Y is an integer, which may be odd or even.
+// p11 odd
+// return (sign_of_x)inf
+// p12 even
+// return +inf
+// p10 == Y is not an integer
+// return +inf
+//
+
{ .mfi
- getf.sig pow_GR_sig_X = POW_NORM_X
- nop.f 999
- nop.i 999
+ nop.m 999
+ nop.f 999
+ cmp.gt p6,p7 = pow_GR_exp_Y, pow_GR_10033
}
;;
{ .mfi
- and pow_GR_exp_X = pow_GR_signexp_X, pow_GR_17ones
- nop.f 999
+ mov pow_GR_tag = 33
+(p7) fcmp.eq.unc.s1 p9,p10 = POW_float_int_Y, POW_NORM_Y
+ nop.i 999
}
;;
-{ .mib
- sub pow_GR_true_exp_X = pow_GR_exp_X, pow_GR_16ones
- shl pow_GR_offset = pow_GR_sig_X, 1
- br.cond.sptk L(POW_COMMON)
+{ .mfb
+ nop.m 999
+(p6) frcpa.s0 f8,p13 = f1, f0
+(p6) br.cond.sptk __libm_error_region // x=0, y<0, y large neg int
}
;;
+{ .mfb
+ nop.m 999
+(p10) frcpa.s0 f8,p13 = f1, f0
+(p10) br.cond.sptk __libm_error_region // x=0, y<0, y not int
+}
+;;
-L(POW_X_0_Y_0):
-// When X is +-0 and Y is +-0, IEEE returns 1.0
-// We call error support with this value
+// x=0, y<0, y an int
+{ .mib
+ nop.m 999
+(p9) tbit.nz.unc p11,p12 = pow_GR_sig_int_Y,0
+ nop.b 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p12) frcpa.s0 f8,p13 = f1,f0
+ nop.i 999
+}
+;;
{ .mfb
- mov pow_GR_tag = 32
- fma.s f8 = f1,f1,f0
- br.cond.sptk __libm_error_region
+ nop.m 999
+(p11) frcpa.s0 f8,p13 = f1,f8
+ br.cond.sptk __libm_error_region
}
;;
+POW_Y_0:
+// Here for y zero, x anything but zero and nan
+// Set flag if x denormal
+// Result is +1.0
+{ .mfi
+ nop.m 999
+ fcmp.eq.s0 p6,p0 = f8,f0 // Sets flag if x denormal
+ nop.i 999
+}
+{ .mfb
+ nop.m 999
+ fma.s.s0 f8 = f1,f1,f0
+ br.ret.sptk b0
+}
+;;
-L(POW_X_INF):
-// When X is +-inf and Y is +-, IEEE returns
+POW_X_INF:
+// Here when X is +-inf
-// overflow
-// X +inf Y +inf +inf
-// X -inf Y +inf +inf
+// X +inf Y +inf +inf
+// X -inf Y +inf +inf
-// X +inf Y >0 +inf
+// X +inf Y >0 +inf
// X -inf Y >0, !odd integer +inf <== (-inf)^0.5 = +inf !!
-// X -inf Y >0, odd integer -inf
+// X -inf Y >0, odd integer -inf
-// underflow
-// X +inf Y -inf +0
-// X -inf Y -inf +0
+// X +inf Y -inf +0
+// X -inf Y -inf +0
-// X +inf Y <0 +0
-// X -inf Y <0, !odd integer +0
-// X -inf Y <0, odd integer -0
+// X +inf Y <0 +0
+// X -inf Y <0, !odd integer +0
+// X -inf Y <0, odd integer -0
// X + inf Y=+0 +1
// X + inf Y=-0 +1
@@ -1892,32 +1878,30 @@ L(POW_X_INF):
// p6 == Y is a floating point number outside the integer.
// Hence it is an integer and is even.
-// p13 == (Y negative)
+// p13 == (Y negative)
// return +inf
// p14 == (Y positive)
// return +0
-
-
// p7 == Y is a floating point number within the integer range.
// p9 == (int_Y = NORM_Y), Y is an integer, which may be odd or even.
// p11 odd
-// p13 == (Y negative)
+// p13 == (Y negative)
// return (sign_of_x)inf
-// p14 == (Y positive)
+// p14 == (Y positive)
// return (sign_of_x)0
-// pxx even
-// p13 == (Y negative)
-// return +inf
+// pxx even
+// p13 == (Y negative)
+// return +inf
// p14 == (Y positive)
-// return +0
+// return +0
// pxx == Y is not an integer
-// p13 == (Y negative)
+// p13 == (Y negative)
// return +inf
// p14 == (Y positive)
// return +0
-//
+//
// If x=inf, test y and flag denormal
{ .mfi
@@ -1929,207 +1913,131 @@ L(POW_X_INF):
{ .mfi
nop.m 999
- fcmp.lt p13,p14 = POW_NORM_Y,f0
- cmp.gt.unc p6,p7 = pow_GR_exp_Y, pow_GR_10033
+ fcmp.lt.s0 p13,p14 = POW_NORM_Y,f0
+ cmp.gt p6,p7 = pow_GR_exp_Y, pow_GR_10033
}
{ .mfi
nop.m 999
- fclass.m p12,p0 = f9, 0x23
+ fclass.m p12,p0 = f9, 0x23 //@inf
nop.i 999
}
;;
-
{ .mfi
nop.m 999
- fclass.m p15,p0 = f9, 0x07 //@zero
+ fclass.m p15,p0 = f9, 0x07 //@zero
nop.i 999
}
;;
{ .mfb
nop.m 999
-(p15) fmerge.s f8 = f1,f1
-(p15) br.ret.spnt b0
+(p15) fmerge.s f8 = f1,f1 // Return +1.0 if x=inf, y=0
+(p15) br.ret.spnt b0 // Exit if x=inf, y=0
}
;;
-
{ .mfi
-(p13) mov pow_GR_tag = 31
-(p14) frcpa.s1 f8,p10 = f1,f0
+ nop.m 999
+(p14) frcpa.s1 f8,p10 = f1,f0 // If x=inf, y>0, assume result +inf
nop.i 999
}
{ .mfb
-(p14) mov pow_GR_tag = 30
-(p13) fma.s1 f8 = f0,f0,f0
-(p12) br.ret.spnt b0
-}
-;;
-
-
-
-{ .mfb
nop.m 999
-(p7) fcmp.eq.unc.s1 p9,p0 = POW_float_int_Y, POW_NORM_Y
- nop.b 999
+(p13) fma.s.s0 f8 = f0,f0,f0 // If x=inf, y<0, assume result +0.0
+(p12) br.ret.spnt b0 // Exit if x=inf, y=inf
}
;;
+// Here if x=inf, and 0 < |y| < inf. Need to correct results if y odd integer.
{ .mfi
nop.m 999
- nop.f 999
-(p9) tbit.nz.unc p11,p0 = pow_GR_sig_int_Y,0
-}
-;;
-
-{ .mfb
- nop.m 999
-(p11) fmerge.s f8 = POW_NORM_X,f8
- br.ret.sptk b0
+(p7) fcmp.eq.unc.s1 p9,p0 = POW_float_int_Y, POW_NORM_Y // Is y integer?
+ nop.i 999
}
;;
-
-
-L(POW_X_0_Y_NEG):
-// When X is +-0 and Y is negative, IEEE returns
-// X Y answer
-// +0 -odd int +inf
-// -0 -odd int -inf
-
-// +0 !-odd int +inf
-// -0 !-odd int +inf
-
-
-// p6 == Y is a floating point number outside the integer.
-// Hence it is an integer and is even.
-// return +inf
-
-// p7 == Y is a floating point number within the integer range.
-// p9 == (int_Y = NORM_Y), Y is an integer, which may be odd or even.
-// p11 odd
-// return (sign_of_x)inf
-// p12 even
-// return +inf
-// p10 == Y is not an integer
-// return +inf
-//
-//
-
{ .mfi
nop.m 999
nop.f 999
- cmp.gt.unc p6,p7 = pow_GR_exp_Y, pow_GR_10033
-}
-;;
-
-
-{ .mfi
- mov pow_GR_tag = 33
-(p7) fcmp.eq.unc.s1 p9,p10 = POW_float_int_Y, POW_NORM_Y
- nop.i 999
-}
-;;
-
-
-{ .mfb
- nop.m 999
-(p6) frcpa.s0 f8,p13 = f1, f0
-(p6) br.cond.sptk __libm_error_region
+(p9) tbit.nz.unc p11,p0 = pow_GR_sig_int_Y,0 // Test for y odd integer
}
;;
{ .mfb
nop.m 999
-(p10) frcpa.s0 f8,p13 = f1, f0
-(p10) br.cond.sptk __libm_error_region
+(p11) fmerge.s f8 = POW_NORM_X,f8 // If y odd integer use sign of x
+ br.ret.sptk b0 // Exit for x=inf, 0 < |y| < inf
}
;;
+POW_X_NEG_Y_NONINT:
+// When X is negative and Y is a non-integer, IEEE
+// returns a qnan indefinite.
+// We call error support with this value
-{ .mib
- nop.m 999
-(p9) tbit.nz.unc p11,p12 = pow_GR_sig_int_Y,0
- nop.b 999
+{ .mfb
+ mov pow_GR_tag = 34
+ frcpa.s0 f8,p6 = f0,f0
+ br.cond.sptk __libm_error_region
}
;;
-
-
+POW_X_NAN:
+// Here if x=nan, y not nan
{ .mfi
- nop.m 999
-(p12) frcpa.s0 f8,p13 = f1,f0
- nop.i 999
-}
-;;
-
-{ .mfb
- nop.m 999
-(p11) frcpa f8,p13 = f1,f8
- br.cond.sptk __libm_error_region
+ nop.m 999
+ fclass.m p9,p13 = f9, 0x07 // Test y=zero
+ nop.i 999
}
;;
-
-
-
-L(POW_X_NEG_Y_NONINT):
-// When X is negative and Y is a non-integer, IEEE
-// returns a qnan indefinite.
-// We call error support with this value
-
{ .mfb
- mov pow_GR_tag = 34
- frcpa f8,p6 = f0,f0
- br.cond.sptk __libm_error_region
+ nop.m 999
+(p13) fma.s.s0 f8 = f8,f1,f0
+(p13) br.ret.sptk b0 // Exit if x nan, y anything but zero or nan
}
;;
-
-
-
-L(POW_X_NAN_Y_0):
+POW_X_NAN_Y_0:
// When X is a NAN and Y is zero, IEEE returns 1.
// We call error support with this value.
-
{ .mfi
- nop.m 0
- fma.s.s0 f10 = f8,f1,f0
- nop.i 0
+ nop.m 999
+ fcmp.eq.s0 p6,p0 = f8,f0 // Dummy op to set invalid on snan
+ nop.i 999
}
{ .mfb
- mov pow_GR_tag = 35
- fma.s.s0 f8 = f0,f0,f1
+ mov pow_GR_tag = 35
+ fma.s.s0 f8 = f0,f0,f1
br.cond.sptk __libm_error_region
}
;;
-L(POW_OVER_UNDER_X_NOT_INF):
+POW_OVER_UNDER_X_NOT_INF:
// p8 is TRUE for overflow
// p9 is TRUE for underflow
// if y is infinity, we should not over/underflow
-
{ .mfi
nop.m 999
- fcmp.eq.unc.s1 p14, p13 = POW_xsq,f1
- cmp.eq.unc p8,p9 = pow_GR_sign_Y_Gpr, r0
+ fcmp.eq.s1 p14, p13 = POW_xsq,f1 // Test |x|=1
+ cmp.eq p8,p9 = pow_GR_sign_Y_Gpr, r0
}
;;
{ .mfi
nop.m 999
-(p14) fclass.m.unc p15, p0 = f9, 0x23
+(p14) fclass.m.unc p15, p0 = f9, 0x23 // If |x|=1, test y=inf
nop.i 999
}
{ .mfi
nop.m 999
-(p13) fclass.m.unc p11,p0 = f9, 0x23
+(p13) fclass.m.unc p11,p0 = f9, 0x23 // If |x| not 1, test y=inf
nop.i 999
}
;;
@@ -2137,31 +2045,33 @@ L(POW_OVER_UNDER_X_NOT_INF):
// p15 = TRUE if |x|=1, y=inf, return +1
{ .mfb
nop.m 999
-(p15) fma.s f8 = f1,f1,f0
-(p15) br.ret.spnt b0
+(p15) fma.s.s0 f8 = f1,f1,f0 // If |x|=1, y=inf, result +1
+(p15) br.ret.spnt b0 // Exit if |x|=1, y=inf
}
;;
.pred.rel "mutex",p8,p9
{ .mfb
-(p8) setf.exp f8 = pow_GR_17ones
-(p9) fmerge.s f8 = f0,f0
-(p11) br.ret.sptk b0
+(p8) setf.exp f8 = pow_GR_17ones // If exp(+big), result inf
+(p9) fmerge.s f8 = f0,f0 // If exp(-big), result 0
+(p11) br.ret.sptk b0 // Exit if |x| not 1, y=inf
}
+;;
{ .mfb
nop.m 999
nop.f 999
- br.cond.sptk L(POW_OVER_UNDER_ERROR)
+ br.cond.sptk POW_OVER_UNDER_ERROR // Branch if y not inf
}
;;
-L(POW_Y_NAN):
-// Is x = +1 then result is +1, else result is quiet Y
+POW_Y_NAN:
+// Here if y=nan, x anything
+// If x = +1 then result is +1, else result is quiet Y
{ .mfi
nop.m 999
- fcmp.eq.s1 p10,p9 = POW_NORM_X, f1
+ fcmp.eq.s1 p10,p9 = POW_NORM_X, f1
nop.i 999
}
;;
@@ -2175,148 +2085,117 @@ L(POW_Y_NAN):
{ .mfi
nop.m 999
-(p10) fma.s f8 = f1,f1,f0
+(p10) fma.s.s0 f8 = f1,f1,f0
nop.i 999
}
{ .mfb
nop.m 999
-(p9) fma.s f8 = f9,f8,f0
- br.ret.sptk b0
+(p9) fma.s.s0 f8 = f9,f8,f0
+ br.ret.sptk b0 // Exit y=nan
}
;;
-L(POW_OVER_UNDER_ERROR):
+POW_OVER_UNDER_ERROR:
+// Here if we have overflow or underflow.
+// Enter with p12 true if x negative and y odd int to force -0 or -inf
{ .mfi
- nop.m 999
- fmerge.s f10 = POW_NORM_X,POW_NORM_X
- nop.i 999
-}
-{ .mfi
- sub pow_GR_17ones_m1 = pow_GR_17ones, r0, 1
- nop.f 999
- mov pow_GR_one = 0x1
+ sub pow_GR_17ones_m1 = pow_GR_17ones, r0, 1
+ nop.f 999
+ mov pow_GR_one = 0x1
}
;;
-// overflow
+// overflow, force inf with O flag
{ .mmb
-(p8) mov pow_GR_tag = 30
-(p8) setf.exp f11 = pow_GR_17ones_m1
+(p8) mov pow_GR_tag = 30
+(p8) setf.exp POW_tmp = pow_GR_17ones_m1
nop.b 999
}
;;
-
-// underflow
+// underflow, force zero with I, U flags
{ .mmi
-(p9) mov pow_GR_tag = 31
-(p9) setf.exp f11 = pow_GR_one
+(p9) mov pow_GR_tag = 31
+(p9) setf.exp POW_tmp = pow_GR_one
nop.i 999
}
;;
-
-// p12 x is negative and y is an odd integer
-
-
{ .mfi
nop.m 999
- fma.s f8 = f11, f11, f0
+ fma.s.s0 f8 = POW_tmp, POW_tmp, f0
nop.i 999
}
;;
+// p12 x is negative and y is an odd integer, change sign of result
{ .mfi
nop.m 999
-(p12) fmerge.ns f8 = f8, f8
+(p12) fnma.s.s0 f8 = POW_tmp, POW_tmp, f0
nop.i 999
}
;;
+GLOBAL_LIBM_END(powf)
-.endp powf
-ASM_SIZE_DIRECTIVE(powf)
-
-
-// Stack operations when calling error support.
-// (1) (2) (3) (call) (4)
-// sp -> + psp -> + psp -> + sp -> +
-// | | | |
-// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8
-// | | | |
-// | <-GR_Y Y2->| Y2 ->| <- GR_Y |
-// | | | |
-// | | <- GR_X X1 ->| |
-// | | | |
-// sp-64 -> + sp -> + sp -> + +
-// save ar.pfs save b0 restore gp
-// save gp restore ar.pfs
-
-
+LOCAL_LIBM_ENTRY(__libm_error_region)
-.proc __libm_error_region
-__libm_error_region:
-
-// Answer is inf for overflow and 0 for underflow.
.prologue
-// (1)
{ .mfi
- add GR_Parameter_Y=-32,sp // Parameter 2 value
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
+ add sp=-64,sp // Create new stack
nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ mov GR_SAVE_GP=gp // Save gp
};;
-
-// (2)
{ .mmi
stfs [GR_Parameter_Y] = POW_NORM_Y,16 // STORE Parameter 2 on stack
- add GR_Parameter_X = 16,sp // Parameter 1 address
+ add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0=b0 // Save b0
};;
.body
-// (3)
{ .mib
- stfs [GR_Parameter_X] = POW_NORM_X // STORE Parameter 1 on stack
+ stfs [GR_Parameter_X] = POW_NORM_X // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ nop.b 0
}
{ .mib
- stfs [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
+ stfs [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
+
{ .mmi
- nop.m 0
- nop.m 0
add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
-// (4)
{ .mmi
- ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
+ ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
- add sp = 64,sp // Restore stack pointer
- mov b0 = GR_SAVE_B0 // Restore return address
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
};;
+
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
- mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
- br.ret.sptk b0 // Return
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
+
diff --git a/sysdeps/ia64/fpu/e_powl.S b/sysdeps/ia64/fpu/e_powl.S
index d286e9abad..0896c19aac 100644
--- a/sysdeps/ia64/fpu/e_powl.S
+++ b/sysdeps/ia64/fpu/e_powl.S
@@ -1,10 +1,10 @@
.file "powl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,61 +20,69 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-// *********************************************************************
+//*********************************************************************
//
// Function: powl(x,y), where
-// y
+// y
// powl(x,y) = x , for double extended precision x and y values
//
-// *********************************************************************
+//*********************************************************************
//
-// History:
-// 2/02/00 (Hand Optimized)
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// History:
+// 02/02/00 (Hand Optimized)
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
-// 1/22/01 Corrected results for powl(1,inf), powl(1,nan), and
+// 01/22/01 Corrected results for powl(1,inf), powl(1,nan), and
// powl(snan,0) to be 1 per C99, not nan. Fixed many flag settings.
-// 2/06/01 Call __libm_error support if over/underflow when y=2.
+// 02/06/01 Call __libm_error_support if over/underflow when y=2.
+// 04/17/01 Support added for y close to 1 and x a non-special value.
+// Shared software under/overflow detection for all paths
+// 02/07/02 Corrected sf3 setting to disable traps
+// 05/13/02 Improved performance of all paths
+// 02/10/03 Reordered header: .section, .global, .proc, .align;
+// used data8 for long double table values
+// 04/17/03 Added missing mutex directive
//
-// *********************************************************************
+//*********************************************************************
//
// Resources Used:
//
-// Floating-Point Registers:
-// f8 (Input and Return Value)
-// f9-f15,f32-f63,f99
+// Floating-Point Registers:
+// f8 (Input x and Return Value)
+// f9 (Input y)
+// f10-f15,f32-f79
//
// General Purpose Registers:
-// Locals r32 - r61
+// Locals r14-r24,r32-r65
// Parameters to __libm_error_support r62,r63,r64,r65
//
// Predicate Registers: p6-p15
//
-// *********************************************************************
+//*********************************************************************
//
// Special Cases and IEEE special conditions:
//
// Denormal fault raised on denormal inputs
-// Overflow exceptions raised when appropriate for pow
-// Underflow exceptions raised when appropriate for pow
+// Overflow exceptions raised when appropriate for pow
+// Underflow exceptions raised when appropriate for pow
// (Error Handling Routine called for overflow and Underflow)
// Inexact raised when appropriate by algorithm
//
@@ -102,8 +110,8 @@
// 22. X or Y denorm/unorm and denorm/unorm operand trap is enabled,
// generate denorm/unorm fault except if invalid or div_0 raised.
//
-// *********************************************************************
-//
+//*********************************************************************
+//
// Algorithm
// =========
//
@@ -113,23 +121,23 @@
// If Y = 0.5, return sqrt(X).
//
// Compute log(X) to extra precision.
-//
+//
// ker_log_80( X, logX_hi, logX_lo, Safe );
//
-// ...logX_hi + logX_lo approximates log(X) to roughly 80
+// ...logX_hi + logX_lo approximates log(X) to roughly 80
// ...significant bits of accuracy.
//
// Compute Y*log(X) to extra precision.
//
// P_hi := Y * logX_hi
-// P_lo := Y * logX_hi - P_hi ...using FMA
-// P_lo := Y * logX_lo + P_lo ...using FMA
+// P_lo := Y * logX_hi - P_hi ...using FMA
+// P_lo := Y * logX_lo + P_lo ...using FMA
//
// Compute exp(P_hi + P_lo)
//
-// Flag := 2;
+// Flag := 2;
// Expo_Range := 2; (assuming double-extended power function)
-// ker_exp_64( P_hi, P_lo, Flag, Expo_Range,
+// ker_exp_64( P_hi, P_lo, Flag, Expo_Range,
// Z_hi, Z_lo, scale, Safe )
//
// scale := sgn * scale
@@ -138,7 +146,7 @@
// return scale*Z_hi + (scale*Z_lo)
// quickly
// Else
-// take necessary precaution in computing
+// take necessary precaution in computing
// scale*Z_hi + (scale*Z_lo)
// to set possible exceptions correctly.
// End If
@@ -152,8 +160,8 @@
// If Y is qNaN, return Y without exception.
// If X is qNaN, return X without exception.
//
-// At this point, X is real and Y is +-inf.
-// Thus |X| can only be 1, strictly bigger than 1, or
+// At this point, X is real and Y is +-inf.
+// Thus |X| can only be 1, strictly bigger than 1, or
// strictly less than 1.
//
// If |X| < 1, then
@@ -169,8 +177,8 @@
// ...Note that Y is real, finite, non-zero, and not +1.
//
// If X is qNaN, return X without exception.
-//
-// If X is +-0,
+//
+// If X is +-0,
// return ( Y > 0 ? +0 : +inf )
//
// If X is +inf
@@ -180,11 +188,11 @@
// return -0 ** -Y
// return ( Y > 0 ? +inf : +0 )
//
-// Case_Invalid
+// Case_Invalid
//
// Return 0 * inf to generate a quiet NaN together
// with an invalid exception.
-//
+//
// Implementation
// ==============
//
@@ -193,15 +201,15 @@
//
// STAGE 1
// -------
-// This stage contains two threads.
+// This stage contains two threads.
//
// Stage1.Thread1
//
// fclass.m X_excep, X_ok = X, (NatVal or s/qNaN) or
-// +-0, +-infinity
+// +-0, +-infinity
//
// fclass.nm X_unsupp, X_supp = X, (NatVal or s/qNaN) or
-// +-(0, unnorm, norm, infinity)
+// +-(0, unnorm, norm, infinity)
//
// X_norm := fnorm( X ) with traps disabled
//
@@ -209,26 +217,26 @@
// If (X_unsupp) goto Filtering (Step 2)
//
// Stage1.Thread2
-// ..............
+// ..............
//
// fclass.m Y_excep, Y_ok = Y, (NatVal or s/qNaN) or
-// +-0, +-infinity
+// +-0, +-infinity
//
// fclass.nm Y_unsupp, Y_supp = Y, (NatVal or s/qNaN) or
-// +-(0, unnorm, norm, infinity)
+// +-(0, unnorm, norm, infinity)
//
// Y_norm := fnorm( Y ) with traps disabled
//
// If (Y_excep) goto Filtering (Step 2)
// If (Y_unsupp) goto Filtering (Step 2)
//
-//
+//
// STAGE 2
// -------
// This stage contains two threads.
//
-// Stage2.Thread1
-// ..............
+// Stage2.Thread1
+// ..............
//
// Set X_lt_0 if X < 0 (using fcmp)
// sgn := +1.0
@@ -245,14 +253,14 @@
// This stage contains two threads.
//
//
-// Stage3.Thread1
-// ..............
+// Stage3.Thread1
+// ..............
//
// X := fnorm(X) in prevailing traps
//
//
-// Stage3.Thread2
-// ..............
+// Stage3.Thread2
+// ..............
//
// Y := fnorm(Y) in prevailing traps
//
@@ -262,60 +270,56 @@
// Go to Case_Normal.
//
-#include "libm_support.h"
-
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
-
-// Inv_L, L_hi, L_lo
-.align 64
-Constants_exp_64_Arg:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_Arg,@object)
-data4 0x5C17F0BC,0xB8AA3B29,0x0000400B,0x00000000
-data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000
-data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000
-ASM_SIZE_DIRECTIVE(Constants_exp_64_Arg)
-
-.align 64
-Constants_exp_64_Exponents:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_Exponents,@object)
-data4 0x0000007E,0x00000000,0xFFFFFF83,0xFFFFFFFF
-data4 0x000003FE,0x00000000,0xFFFFFC03,0xFFFFFFFF
-data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF
-data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF
-data4 0xFFFFFFE2,0xFFFFFFFF,0xFFFFFFC4,0xFFFFFFFF
-data4 0xFFFFFFBA,0xFFFFFFFF,0xFFFFFFBA,0xFFFFFFFF
-ASM_SIZE_DIRECTIVE(Constants_exp_64_Exponents)
-
-.align 64
-Constants_exp_64_A:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_A,@object)
-// Reversed
-data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000
-data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000
-data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000
-ASM_SIZE_DIRECTIVE(Constants_exp_64_A)
-
-.align 64
-Constants_exp_64_P:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_P,@object)
-// Reversed
-data4 0x43914A8A,0xD00D6C81,0x00003FF2,0x00000000
-data4 0x30304B30,0xB60BC4AC,0x00003FF5,0x00000000
-data4 0x7474C518,0x88888888,0x00003FF8,0x00000000
-data4 0x8DAE729D,0xAAAAAAAA,0x00003FFA,0x00000000
-data4 0xAAAAAF61,0xAAAAAAAA,0x00003FFC,0x00000000
-data4 0x000004C7,0x80000000,0x00003FFE,0x00000000
-ASM_SIZE_DIRECTIVE(Constants_exp_64_P)
-
-.align 64
-Constants_exp_64_T1:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_T1,@object)
-data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
-data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
+
+// ************* DO NOT CHANGE ORDER OF THESE TABLES ********************
+
+// double-extended 1/ln(2)
+// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88
+// 3fff b8aa 3b29 5c17 f0bc
+// For speed the significand will be loaded directly with a movl and setf.sig
+// and the exponent will be bias+63 instead of bias+0. Thus subsequent
+// computations need to scale appropriately.
+// The constant 2^12/ln(2) is needed for the computation of N. This is also
+// obtained by scaling the computations.
+//
+// Two shifting constants are loaded directly with movl and setf.d.
+// 1. RSHF_2TO51 = 1.1000..00 * 2^(63-12)
+// This constant is added to x*1/ln2 to shift the integer part of
+// x*2^12/ln2 into the rightmost bits of the significand.
+// The result of this fma is N_signif.
+// 2. RSHF = 1.1000..00 * 2^(63)
+// This constant is subtracted from N_signif * 2^(-51) to give
+// the integer part of N, N_fix, as a floating-point number.
+// The result of this fms is float_N.
+RODATA
+
+.align 16
+// L_hi, L_lo
+LOCAL_OBJECT_START(Constants_exp_64_Arg)
+data8 0xB17217F400000000,0x00003FF2 // L_hi = hi part log(2)/2^12
+data8 0xF473DE6AF278ECE6,0x00003FD4 // L_lo = lo part log(2)/2^12
+LOCAL_OBJECT_END(Constants_exp_64_Arg)
+
+LOCAL_OBJECT_START(Constants_exp_64_A)
+// Reversed
+data8 0xAAAAAAABB1B736A0,0x00003FFA
+data8 0xAAAAAAAB90CD6327,0x00003FFC
+data8 0xFFFFFFFFFFFFFFFF,0x00003FFD
+LOCAL_OBJECT_END(Constants_exp_64_A)
+
+LOCAL_OBJECT_START(Constants_exp_64_P)
+// Reversed
+data8 0xD00D6C8143914A8A,0x00003FF2
+data8 0xB60BC4AC30304B30,0x00003FF5
+data8 0x888888887474C518,0x00003FF8
+data8 0xAAAAAAAA8DAE729D,0x00003FFA
+data8 0xAAAAAAAAAAAAAF61,0x00003FFC
+data8 0x80000000000004C7,0x00003FFE
+LOCAL_OBJECT_END(Constants_exp_64_P)
+
+LOCAL_OBJECT_START(Constants_exp_64_T1)
+data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
+data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC
data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D
data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA
@@ -330,274 +334,263 @@ data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5
data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A
data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177
data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C
-ASM_SIZE_DIRECTIVE(Constants_exp_64_T1)
-
-.align 64
-Constants_exp_64_T2:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_T2,@object)
-data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
-data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
-data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
-data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
-data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
-data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
-data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
-data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
-data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
-data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
-data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
-data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
-data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
-data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
-data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
+LOCAL_OBJECT_END(Constants_exp_64_T1)
+
+LOCAL_OBJECT_START(Constants_exp_64_T2)
+data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
+data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
+data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
+data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
+data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
+data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
+data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
+data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
+data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
+data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
+data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
+data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
+data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
+data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
+data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37
-ASM_SIZE_DIRECTIVE(Constants_exp_64_T2)
-
-.align 64
-Constants_exp_64_W1:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_W1,@object)
-data4 0x00000000,0x00000000,0x171EC4B4,0xBE384454
-data4 0x4AA72766,0xBE694741,0xD42518F8,0xBE5D32B6
-data4 0x3A319149,0x3E68D96D,0x62415F36,0xBE68F4DA
-data4 0xC9C86A3B,0xBE6DDA2F,0xF49228FE,0x3E6B2E50
-data4 0x1188B886,0xBE49C0C2,0x1A4C2F1F,0x3E64BFC2
-data4 0x2CB98B54,0xBE6A2FBB,0x9A55D329,0x3E5DC5DE
-data4 0x39A7AACE,0x3E696490,0x5C66DBA5,0x3E54728B
-data4 0xBA1C7D7D,0xBE62B0DB,0x09F1AF5F,0x3E576E04
-data4 0x1A0DD6A1,0x3E612500,0x795FBDEF,0xBE66A419
-data4 0xE1BD41FC,0xBE5CDE8C,0xEA54964F,0xBE621376
-data4 0x476E76EE,0x3E6370BE,0x3427EB92,0x3E390D1A
-data4 0x2BF82BF8,0x3E1336DE,0xD0F7BD9E,0xBE5FF1CB
-data4 0x0CEB09DD,0xBE60A355,0x0980F30D,0xBE5CA37E
-data4 0x4C082D25,0xBE5C541B,0x3B467D29,0xBE5BBECA
-data4 0xB9D946C5,0xBE400D8A,0x07ED374A,0xBE5E2A08
-data4 0x365C8B0A,0xBE66CB28,0xD3403BCA,0x3E3AAD5B
-data4 0xC7EA21E0,0x3E526055,0xE72880D6,0xBE442C75
-data4 0x85222A43,0x3E58B2BB,0x522C42BF,0xBE5AAB79
-data4 0x469DC2BC,0xBE605CB4,0xA48C40DC,0xBE589FA7
-data4 0x1AA42614,0xBE51C214,0xC37293F4,0xBE48D087
-data4 0xA2D673E0,0x3E367A1C,0x114F7A38,0xBE51BEBB
-data4 0x661A4B48,0xBE6348E5,0x1D3B9962,0xBDF52643
-data4 0x35A78A53,0x3E3A3B5E,0x1CECD788,0xBE46C46C
-data4 0x7857D689,0xBE60B7EC,0xD14F1AD7,0xBE594D3D
-data4 0x4C9A8F60,0xBE4F9C30,0x02DFF9D2,0xBE521873
-data4 0x55E6D68F,0xBE5E4C88,0x667F3DC4,0xBE62140F
-data4 0x3BF88747,0xBE36961B,0xC96EC6AA,0x3E602861
-data4 0xD57FD718,0xBE3B5151,0xFC4A627B,0x3E561CD0
-data4 0xCA913FEA,0xBE3A5217,0x9A5D193A,0x3E40A3CC
-data4 0x10A9C312,0xBE5AB713,0xC5F57719,0x3E4FDADB
-data4 0xDBDF59D5,0x3E361428,0x61B4180D,0x3E5DB5DB
-data4 0x7408D856,0xBE42AD5F,0x31B2B707,0x3E2A3148
-ASM_SIZE_DIRECTIVE(Constants_exp_64_W1)
-
-.align 64
-Constants_exp_64_W2:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_W2,@object)
-data4 0x00000000,0x00000000,0x37A3D7A2,0xBE641F25
-data4 0xAD028C40,0xBE68DD57,0xF212B1B6,0xBE5C77D8
-data4 0x1BA5B070,0x3E57878F,0x2ECAE6FE,0xBE55A36A
-data4 0x569DFA3B,0xBE620608,0xA6D300A3,0xBE53B50E
-data4 0x223F8F2C,0x3E5B5EF2,0xD6DE0DF4,0xBE56A0D9
-data4 0xEAE28F51,0xBE64EEF3,0x367EA80B,0xBE5E5AE2
-data4 0x5FCBC02D,0x3E47CB1A,0x9BDAFEB7,0xBE656BA0
-data4 0x805AFEE7,0x3E6E70C6,0xA3415EBA,0xBE6E0509
-data4 0x49BFF529,0xBE56856B,0x00508651,0x3E66DD33
-data4 0xC114BC13,0x3E51165F,0xC453290F,0x3E53333D
-data4 0x05539FDA,0x3E6A072B,0x7C0A7696,0xBE47CD87
-data4 0xEB05C6D9,0xBE668BF4,0x6AE86C93,0xBE67C3E3
-data4 0xD0B3E84B,0xBE533904,0x556B53CE,0x3E63E8D9
-data4 0x63A98DC8,0x3E212C89,0x032A7A22,0xBE33138F
-data4 0xBC584008,0x3E530FA9,0xCCB93C97,0xBE6ADF82
-data4 0x8370EA39,0x3E5F9113,0xFB6A05D8,0x3E5443A4
-data4 0x181FEE7A,0x3E63DACD,0xF0F67DEC,0xBE62B29D
-data4 0x3DDE6307,0x3E65C483,0xD40A24C1,0x3E5BF030
-data4 0x14E437BE,0x3E658B8F,0xED98B6C7,0xBE631C29
-data4 0x04CF7C71,0x3E6335D2,0xE954A79D,0x3E529EED
-data4 0xF64A2FB8,0x3E5D9257,0x854ED06C,0xBE6BED1B
-data4 0xD71405CB,0x3E5096F6,0xACB9FDF5,0xBE3D4893
-data4 0x01B68349,0xBDFEB158,0xC6A463B9,0x3E628D35
-data4 0xADE45917,0xBE559725,0x042FC476,0xBE68C29C
-data4 0x01E511FA,0xBE67593B,0x398801ED,0xBE4A4313
-data4 0xDA7C3300,0x3E699571,0x08062A9E,0x3E5349BE
-data4 0x755BB28E,0x3E5229C4,0x77A1F80D,0x3E67E426
-data4 0x6B69C352,0xBE52B33F,0x084DA57F,0xBE6B3550
-data4 0xD1D09A20,0xBE6DB03F,0x2161B2C1,0xBE60CBC4
-data4 0x78A2B771,0x3E56ED9C,0x9D0FA795,0xBE508E31
-data4 0xFD1A54E9,0xBE59482A,0xB07FD23E,0xBE2A17CE
-data4 0x17365712,0x3E68BF5C,0xB3785569,0x3E3956F9
-ASM_SIZE_DIRECTIVE(Constants_exp_64_W2)
-
-.align 64
-Constants_log_80_P:
-ASM_TYPE_DIRECTIVE(Constants_log_80_P,@object)
-// 1/2, P_8, P_7, ..., P_1
-data4 0x00000000, 0x80000000, 0x00003FFE, 0x00000000
-data4 0x3B1042BC, 0xCCCE8B88, 0x0000BFFB, 0x00000000
-data4 0xCADC2149, 0xE38997B7, 0x00003FFB, 0x00000000
-data4 0xB1ACB090, 0xFFFFFFFE, 0x0000BFFB, 0x00000000
-data4 0x06481C81, 0x92492498, 0x00003FFC, 0x00000000
-data4 0xAAAAB0EF, 0xAAAAAAAA, 0x0000BFFC, 0x00000000
-data4 0xCCC91416, 0xCCCCCCCC, 0x00003FFC, 0x00000000
-data4 0x00000000, 0x80000000, 0x0000BFFD, 0x00000000
-data4 0xAAAAAAAB, 0xAAAAAAAA, 0x00003FFD
-ASM_SIZE_DIRECTIVE(Constants_log_80_P)
-
-.align 64
-Constants_log_80_Q:
-ASM_TYPE_DIRECTIVE(Constants_log_80_Q,@object)
-// log2_hi, log2_lo, Q_6, Q_5, Q_4, Q_3, Q_2, Q_1
-data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
-data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
-data4 0xA51BE0AF,0x92492453,0x00003FFC,0x00000000
-data4 0xA0CFD29F,0xAAAAAB73,0x0000BFFC,0x00000000
-data4 0xCCCE3872,0xCCCCCCCC,0x00003FFC,0x00000000
-data4 0xFFFFB4FB,0xFFFFFFFF,0x0000BFFC,0x00000000
-data4 0xAAAAAAAB,0xAAAAAAAA,0x00003FFD,0x00000000
-data4 0x00000000,0x80000000,0x0000BFFE,0x00000000
-ASM_SIZE_DIRECTIVE(Constants_log_80_Q)
-
-.align 64
-Constants_log_80_Z_G_H_h1:
-ASM_TYPE_DIRECTIVE(Constants_log_80_Z_G_H_h1,@object)
-// Z1 - 16 bit fixed, G1 and H1 IEEE single, h1 IEEE double
+LOCAL_OBJECT_END(Constants_exp_64_T2)
+
+LOCAL_OBJECT_START(Constants_exp_64_W1)
+data8 0x0000000000000000, 0xBE384454171EC4B4
+data8 0xBE6947414AA72766, 0xBE5D32B6D42518F8
+data8 0x3E68D96D3A319149, 0xBE68F4DA62415F36
+data8 0xBE6DDA2FC9C86A3B, 0x3E6B2E50F49228FE
+data8 0xBE49C0C21188B886, 0x3E64BFC21A4C2F1F
+data8 0xBE6A2FBB2CB98B54, 0x3E5DC5DE9A55D329
+data8 0x3E69649039A7AACE, 0x3E54728B5C66DBA5
+data8 0xBE62B0DBBA1C7D7D, 0x3E576E0409F1AF5F
+data8 0x3E6125001A0DD6A1, 0xBE66A419795FBDEF
+data8 0xBE5CDE8CE1BD41FC, 0xBE621376EA54964F
+data8 0x3E6370BE476E76EE, 0x3E390D1A3427EB92
+data8 0x3E1336DE2BF82BF8, 0xBE5FF1CBD0F7BD9E
+data8 0xBE60A3550CEB09DD, 0xBE5CA37E0980F30D
+data8 0xBE5C541B4C082D25, 0xBE5BBECA3B467D29
+data8 0xBE400D8AB9D946C5, 0xBE5E2A0807ED374A
+data8 0xBE66CB28365C8B0A, 0x3E3AAD5BD3403BCA
+data8 0x3E526055C7EA21E0, 0xBE442C75E72880D6
+data8 0x3E58B2BB85222A43, 0xBE5AAB79522C42BF
+data8 0xBE605CB4469DC2BC, 0xBE589FA7A48C40DC
+data8 0xBE51C2141AA42614, 0xBE48D087C37293F4
+data8 0x3E367A1CA2D673E0, 0xBE51BEBB114F7A38
+data8 0xBE6348E5661A4B48, 0xBDF526431D3B9962
+data8 0x3E3A3B5E35A78A53, 0xBE46C46C1CECD788
+data8 0xBE60B7EC7857D689, 0xBE594D3DD14F1AD7
+data8 0xBE4F9C304C9A8F60, 0xBE52187302DFF9D2
+data8 0xBE5E4C8855E6D68F, 0xBE62140F667F3DC4
+data8 0xBE36961B3BF88747, 0x3E602861C96EC6AA
+data8 0xBE3B5151D57FD718, 0x3E561CD0FC4A627B
+data8 0xBE3A5217CA913FEA, 0x3E40A3CC9A5D193A
+data8 0xBE5AB71310A9C312, 0x3E4FDADBC5F57719
+data8 0x3E361428DBDF59D5, 0x3E5DB5DB61B4180D
+data8 0xBE42AD5F7408D856, 0x3E2A314831B2B707
+LOCAL_OBJECT_END(Constants_exp_64_W1)
+
+LOCAL_OBJECT_START(Constants_exp_64_W2)
+data8 0x0000000000000000, 0xBE641F2537A3D7A2
+data8 0xBE68DD57AD028C40, 0xBE5C77D8F212B1B6
+data8 0x3E57878F1BA5B070, 0xBE55A36A2ECAE6FE
+data8 0xBE620608569DFA3B, 0xBE53B50EA6D300A3
+data8 0x3E5B5EF2223F8F2C, 0xBE56A0D9D6DE0DF4
+data8 0xBE64EEF3EAE28F51, 0xBE5E5AE2367EA80B
+data8 0x3E47CB1A5FCBC02D, 0xBE656BA09BDAFEB7
+data8 0x3E6E70C6805AFEE7, 0xBE6E0509A3415EBA
+data8 0xBE56856B49BFF529, 0x3E66DD3300508651
+data8 0x3E51165FC114BC13, 0x3E53333DC453290F
+data8 0x3E6A072B05539FDA, 0xBE47CD877C0A7696
+data8 0xBE668BF4EB05C6D9, 0xBE67C3E36AE86C93
+data8 0xBE533904D0B3E84B, 0x3E63E8D9556B53CE
+data8 0x3E212C8963A98DC8, 0xBE33138F032A7A22
+data8 0x3E530FA9BC584008, 0xBE6ADF82CCB93C97
+data8 0x3E5F91138370EA39, 0x3E5443A4FB6A05D8
+data8 0x3E63DACD181FEE7A, 0xBE62B29DF0F67DEC
+data8 0x3E65C4833DDE6307, 0x3E5BF030D40A24C1
+data8 0x3E658B8F14E437BE, 0xBE631C29ED98B6C7
+data8 0x3E6335D204CF7C71, 0x3E529EEDE954A79D
+data8 0x3E5D9257F64A2FB8, 0xBE6BED1B854ED06C
+data8 0x3E5096F6D71405CB, 0xBE3D4893ACB9FDF5
+data8 0xBDFEB15801B68349, 0x3E628D35C6A463B9
+data8 0xBE559725ADE45917, 0xBE68C29C042FC476
+data8 0xBE67593B01E511FA, 0xBE4A4313398801ED
+data8 0x3E699571DA7C3300, 0x3E5349BE08062A9E
+data8 0x3E5229C4755BB28E, 0x3E67E42677A1F80D
+data8 0xBE52B33F6B69C352, 0xBE6B3550084DA57F
+data8 0xBE6DB03FD1D09A20, 0xBE60CBC42161B2C1
+data8 0x3E56ED9C78A2B771, 0xBE508E319D0FA795
+data8 0xBE59482AFD1A54E9, 0xBE2A17CEB07FD23E
+data8 0x3E68BF5C17365712, 0x3E3956F9B3785569
+LOCAL_OBJECT_END(Constants_exp_64_W2)
+
+LOCAL_OBJECT_START(Constants_log_80_P)
+// P_8, P_7, ..., P_1
+data8 0xCCCE8B883B1042BC, 0x0000BFFB // P_8
+data8 0xE38997B7CADC2149, 0x00003FFB // P_7
+data8 0xFFFFFFFEB1ACB090, 0x0000BFFB // P_6
+data8 0x9249249806481C81, 0x00003FFC // P_5
+data8 0x0000000000000000, 0x00000000 // Pad for bank conflicts
+data8 0xAAAAAAAAAAAAB0EF, 0x0000BFFC // P_4
+data8 0xCCCCCCCCCCC91416, 0x00003FFC // P_3
+data8 0x8000000000000000, 0x0000BFFD // P_2
+data8 0xAAAAAAAAAAAAAAAB, 0x00003FFD // P_1
+LOCAL_OBJECT_END(Constants_log_80_P)
+
+LOCAL_OBJECT_START(Constants_log_80_Q)
+// log2_hi, log2_lo, Q_6, Q_5, Q_4, Q_3, Q_2, Q_1
+data8 0xB172180000000000,0x00003FFE
+data8 0x82E308654361C4C6,0x0000BFE2
+data8 0x92492453A51BE0AF,0x00003FFC
+data8 0xAAAAAB73A0CFD29F,0x0000BFFC
+data8 0xCCCCCCCCCCCE3872,0x00003FFC
+data8 0xFFFFFFFFFFFFB4FB,0x0000BFFC
+data8 0xAAAAAAAAAAAAAAAB,0x00003FFD
+data8 0x8000000000000000,0x0000BFFE
+LOCAL_OBJECT_END(Constants_log_80_Q)
+
+LOCAL_OBJECT_START(Constants_log_80_Z_G_H_h1)
+// Z1 - 16 bit fixed, G1 and H1 IEEE single, h1 double-extended (64-bit significand, then sign/exponent word)
data4 0x00008000,0x3F800000,0x00000000,0x00000000
-data4 0x00000000,0x00000000,0x00000000,0x00000000
+data4 0x00000000,0x00000000,0x00000000,0x00000000
data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000
data4 0xEBA0E0D1,0x8B1D330B,0x00003FDA,0x00000000
data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000
data4 0x9EADD553,0xE2AF365E,0x00003FE2,0x00000000
data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000
-data4 0x752F34A2,0xF585FEC3,0x0000BFE3,0x00000000
+data4 0x752F34A2,0xF585FEC3,0x0000BFE3,0x00000000
data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000
-data4 0x893B03F3,0xF3546435,0x00003FE2,0x00000000
-data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000
-data4 0x39CDD2AC,0xBABA62E0,0x00003FE4,0x00000000
-data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000
+data4 0x893B03F3,0xF3546435,0x00003FE2,0x00000000
+data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000
+data4 0x39CDD2AC,0xBABA62E0,0x00003FE4,0x00000000
+data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000
data4 0x457978A1,0x8718789F,0x00003FE2,0x00000000
-data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000
-data4 0x3185E56A,0x9442DF96,0x0000BFE4,0x00000000
-data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000
-data4 0x2BBE2CBD,0xCBF9A4BF,0x00003FE4,0x00000000
-data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000
-data4 0x852D5935,0xF3537535,0x00003FE3,0x00000000
-data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000
-data4 0x46CDF32F,0xA1F1E699,0x0000BFDF,0x00000000
-data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000
-data4 0xD8484CE3,0x84A61856,0x00003FE4,0x00000000
+data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000
+data4 0x3185E56A,0x9442DF96,0x0000BFE4,0x00000000
+data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000
+data4 0x2BBE2CBD,0xCBF9A4BF,0x00003FE4,0x00000000
+data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000
+data4 0x852D5935,0xF3537535,0x00003FE3,0x00000000
+data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000
+data4 0x46CDF32F,0xA1F1E699,0x0000BFDF,0x00000000
+data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000
+data4 0xD8484CE3,0x84A61856,0x00003FE4,0x00000000
data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000
-data4 0xFF28821B,0xC7DD97E0,0x0000BFE2,0x00000000
-data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000
-data4 0xEF1FD32F,0xD3C4A887,0x00003FE3,0x00000000
-data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000
-data4 0x464C76DA,0x84672BE6,0x00003FE5,0x00000000
+data4 0xFF28821B,0xC7DD97E0,0x0000BFE2,0x00000000
+data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000
+data4 0xEF1FD32F,0xD3C4A887,0x00003FE3,0x00000000
+data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000
+data4 0x464C76DA,0x84672BE6,0x00003FE5,0x00000000
data4 0x00004211,0x3F042108,0x3F29516A,0x00000000
-data4 0x18835FB9,0x9A43A511,0x0000BFE5,0x00000000
-ASM_SIZE_DIRECTIVE(Constants_log_80_Z_G_H_h1)
-
-.align 64
-Constants_log_80_Z_G_H_h2:
-ASM_TYPE_DIRECTIVE(Constants_log_80_Z_G_H_h2,@object)
-// Z2 - 16 bit fixed, G2 and H2 IEEE single, h2 IEEE double
-data4 0x00008000,0x3F800000,0x00000000,0x00000000
-data4 0x00000000,0x00000000,0x00000000,0x00000000
-data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000
+data4 0x18835FB9,0x9A43A511,0x0000BFE5,0x00000000
+LOCAL_OBJECT_END(Constants_log_80_Z_G_H_h1)
+
+LOCAL_OBJECT_START(Constants_log_80_Z_G_H_h2)
+// Z2 - 16 bit fixed, G2 and H2 IEEE single, h2 double-extended (64-bit significand, then sign/exponent word)
+data4 0x00008000,0x3F800000,0x00000000,0x00000000
+data4 0x00000000,0x00000000,0x00000000,0x00000000
+data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000
data4 0x211398BF,0xAD08B116,0x00003FDB,0x00000000
-data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000
-data4 0xC376958E,0xB106790F,0x00003FDE,0x00000000
-data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000
-data4 0x79A7679A,0xFD03F242,0x0000BFDA,0x00000000
-data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000
-data4 0x05E7AE08,0xF03F81C3,0x0000BFDF,0x00000000
-data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000
+data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000
+data4 0xC376958E,0xB106790F,0x00003FDE,0x00000000
+data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000
+data4 0x79A7679A,0xFD03F242,0x0000BFDA,0x00000000
+data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000
+data4 0x05E7AE08,0xF03F81C3,0x0000BFDF,0x00000000
+data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000
data4 0x049EB22F,0xD1B87D3C,0x00003FDE,0x00000000
-data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000
-data4 0x3A9E81E0,0xFABC8B95,0x00003FDF,0x00000000
+data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000
+data4 0x3A9E81E0,0xFABC8B95,0x00003FDF,0x00000000
data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000
-data4 0x7C4B5443,0xF5F3653F,0x00003FDF,0x00000000
-data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000
-data4 0xF65A1773,0xE78AB204,0x00003FE0,0x00000000
-data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000
-data4 0x7B8EF695,0xDB7CBFFF,0x0000BFE0,0x00000000
-data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000
-data4 0xCF773FB3,0xC0241AEA,0x0000BFE0,0x00000000
-data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000
-data4 0xC9539FDF,0xFC8F4D48,0x00003FE1,0x00000000
-data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000
-data4 0x954665C2,0x9CD035FB,0x0000BFE1,0x00000000
-data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000
-data4 0xDD367A30,0xEC9017C7,0x00003FE1,0x00000000
-data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000
-data4 0xCB11189C,0xEE6625D3,0x0000BFE1,0x00000000
-data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000
+data4 0x7C4B5443,0xF5F3653F,0x00003FDF,0x00000000
+data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000
+data4 0xF65A1773,0xE78AB204,0x00003FE0,0x00000000
+data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000
+data4 0x7B8EF695,0xDB7CBFFF,0x0000BFE0,0x00000000
+data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000
+data4 0xCF773FB3,0xC0241AEA,0x0000BFE0,0x00000000
+data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000
+data4 0xC9539FDF,0xFC8F4D48,0x00003FE1,0x00000000
+data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000
+data4 0x954665C2,0x9CD035FB,0x0000BFE1,0x00000000
+data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000
+data4 0xDD367A30,0xEC9017C7,0x00003FE1,0x00000000
+data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000
+data4 0xCB11189C,0xEE6625D3,0x0000BFE1,0x00000000
+data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000
data4 0xBE11C424,0xA49C8DB5,0x0000BFE0,0x00000000
-ASM_SIZE_DIRECTIVE(Constants_log_80_Z_G_H_h2)
-
-.align 64
-Constants_log_80_h3_G_H:
-ASM_TYPE_DIRECTIVE(Constants_log_80_h3_G_H,@object)
-// h3 IEEE double extended, H3 and G3 IEEE single
-data4 0x112666B0,0xAAACAAB1,0x00003FD3,0x3F7FFC00
+LOCAL_OBJECT_END(Constants_log_80_Z_G_H_h2)
+
+LOCAL_OBJECT_START(Constants_log_80_h3_G_H)
+// h3 IEEE double extended, H3 and G3 IEEE single
+data4 0x112666B0,0xAAACAAB1,0x00003FD3,0x3F7FFC00
data4 0x9B7FAD21,0x90051030,0x00003FD8,0x3F7FF400
-data4 0xF4D783C4,0xA6B46F46,0x00003FDA,0x3F7FEC00
-data4 0x11C6DDCA,0xDA148D88,0x0000BFD8,0x3F7FE400
+data4 0xF4D783C4,0xA6B46F46,0x00003FDA,0x3F7FEC00
+data4 0x11C6DDCA,0xDA148D88,0x0000BFD8,0x3F7FE400
data4 0xCA964D95,0xCE65C1D8,0x0000BFD8,0x3F7FDC00
-data4 0x23412D13,0x883838EE,0x0000BFDB,0x3F7FD400
-data4 0x983ED687,0xB7E5CFA1,0x00003FDB,0x3F7FCC08
-data4 0xE3C3930B,0xDBE23B16,0x0000BFD9,0x3F7FC408
-data4 0x48AA4DFC,0x9B92F1FC,0x0000BFDC,0x3F7FBC10
-data4 0xCE9C8F7E,0x9A8CEB15,0x0000BFD9,0x3F7FB410
-data4 0x0DECE74A,0x8C220879,0x00003FDC,0x3F7FAC18
+data4 0x23412D13,0x883838EE,0x0000BFDB,0x3F7FD400
+data4 0x983ED687,0xB7E5CFA1,0x00003FDB,0x3F7FCC08
+data4 0xE3C3930B,0xDBE23B16,0x0000BFD9,0x3F7FC408
+data4 0x48AA4DFC,0x9B92F1FC,0x0000BFDC,0x3F7FBC10
+data4 0xCE9C8F7E,0x9A8CEB15,0x0000BFD9,0x3F7FB410
+data4 0x0DECE74A,0x8C220879,0x00003FDC,0x3F7FAC18
data4 0x2F053150,0xB25CA912,0x0000BFDA,0x3F7FA420
-data4 0xD9A5BE20,0xA5876555,0x00003FDB,0x3F7F9C20
-data4 0x2053F087,0xC919BB6E,0x00003FD9,0x3F7F9428
-data4 0x041E9A77,0xB70BDA79,0x00003FDC,0x3F7F8C30
-data4 0xEA1C9C30,0xF18A5C08,0x00003FDA,0x3F7F8438
-data4 0x796D89E5,0xA3790D84,0x0000BFDD,0x3F7F7C40
-data4 0xA2915A3A,0xE1852369,0x0000BFDD,0x3F7F7448
-data4 0xA39ED868,0xD803858F,0x00003FDC,0x3F7F6C50
-data4 0x9417EBB7,0xB2EEE356,0x0000BFDD,0x3F7F6458
-data4 0x9BB0D07F,0xED5C1F8A,0x0000BFDC,0x3F7F5C68
-data4 0xE87C740A,0xD6D201A0,0x0000BFDD,0x3F7F5470
-data4 0x1CA74025,0xE8DEBF5E,0x00003FDC,0x3F7F4C78
+data4 0xD9A5BE20,0xA5876555,0x00003FDB,0x3F7F9C20
+data4 0x2053F087,0xC919BB6E,0x00003FD9,0x3F7F9428
+data4 0x041E9A77,0xB70BDA79,0x00003FDC,0x3F7F8C30
+data4 0xEA1C9C30,0xF18A5C08,0x00003FDA,0x3F7F8438
+data4 0x796D89E5,0xA3790D84,0x0000BFDD,0x3F7F7C40
+data4 0xA2915A3A,0xE1852369,0x0000BFDD,0x3F7F7448
+data4 0xA39ED868,0xD803858F,0x00003FDC,0x3F7F6C50
+data4 0x9417EBB7,0xB2EEE356,0x0000BFDD,0x3F7F6458
+data4 0x9BB0D07F,0xED5C1F8A,0x0000BFDC,0x3F7F5C68
+data4 0xE87C740A,0xD6D201A0,0x0000BFDD,0x3F7F5470
+data4 0x1CA74025,0xE8DEBF5E,0x00003FDC,0x3F7F4C78
data4 0x1F34A7EB,0x9A995A97,0x0000BFDC,0x3F7F4488
-data4 0x359EED97,0x9CB0F742,0x0000BFDA,0x3F7F3C90
-data4 0xBBC6A1C8,0xD6F833C2,0x0000BFDD,0x3F7F34A0
-data4 0xE71090EC,0xE1F68F2A,0x00003FDC,0x3F7F2CA8
-data4 0xC160A74F,0xD1881CF1,0x0000BFDB,0x3F7F24B8
-data4 0xD78CB5A4,0x9AD05AE2,0x00003FD6,0x3F7F1CC8
-data4 0x9A77DC4B,0xE658CB8E,0x0000BFDD,0x3F7F14D8
-data4 0x6BD6D312,0xBA281296,0x00003FDC,0x3F7F0CE0
-data4 0xF95210D0,0xB478BBEB,0x0000BFDB,0x3F7F04F0
-data4 0x38800100,0x39400480,0x39A00640,0x39E00C41 // H's start here
-data4 0x3A100A21,0x3A300F22,0x3A4FF51C,0x3A6FFC1D
+data4 0x359EED97,0x9CB0F742,0x0000BFDA,0x3F7F3C90
+data4 0xBBC6A1C8,0xD6F833C2,0x0000BFDD,0x3F7F34A0
+data4 0xE71090EC,0xE1F68F2A,0x00003FDC,0x3F7F2CA8
+data4 0xC160A74F,0xD1881CF1,0x0000BFDB,0x3F7F24B8
+data4 0xD78CB5A4,0x9AD05AE2,0x00003FD6,0x3F7F1CC8
+data4 0x9A77DC4B,0xE658CB8E,0x0000BFDD,0x3F7F14D8
+data4 0x6BD6D312,0xBA281296,0x00003FDC,0x3F7F0CE0
+data4 0xF95210D0,0xB478BBEB,0x0000BFDB,0x3F7F04F0
+data4 0x38800100,0x39400480,0x39A00640,0x39E00C41 // H's start here
+data4 0x3A100A21,0x3A300F22,0x3A4FF51C,0x3A6FFC1D
data4 0x3A87F20B,0x3A97F68B,0x3AA7EB86,0x3AB7E101
-data4 0x3AC7E701,0x3AD7DD7B,0x3AE7D474,0x3AF7CBED
-data4 0x3B03E1F3,0x3B0BDE2F,0x3B13DAAA,0x3B1BD766
-data4 0x3B23CC5C,0x3B2BC997,0x3B33C711,0x3B3BBCC6
-data4 0x3B43BAC0,0x3B4BB0F4,0x3B53AF6D,0x3B5BA620
-data4 0x3B639D12,0x3B6B9444,0x3B7393BC,0x3B7B8B6D
-ASM_SIZE_DIRECTIVE(Constants_log_80_h3_G_H)
-
-.align 64
-Constant_half:
-ASM_TYPE_DIRECTIVE(Constant_half,@object)
-data4 0x00000000,0x80000000,0x00003FFE
-ASM_SIZE_DIRECTIVE(Constant_half)
-
-GR_Expo_Range = r32
-GR_Flag = r33
+data4 0x3AC7E701,0x3AD7DD7B,0x3AE7D474,0x3AF7CBED
+data4 0x3B03E1F3,0x3B0BDE2F,0x3B13DAAA,0x3B1BD766
+data4 0x3B23CC5C,0x3B2BC997,0x3B33C711,0x3B3BBCC6
+data4 0x3B43BAC0,0x3B4BB0F4,0x3B53AF6D,0x3B5BA620
+data4 0x3B639D12,0x3B6B9444,0x3B7393BC,0x3B7B8B6D
+LOCAL_OBJECT_END(Constants_log_80_h3_G_H)
+
+GR_sig_inv_ln2 = r14
+GR_rshf_2to51 = r15
+GR_exp_2tom51 = r16
+GR_rshf = r17
+GR_exp_half = r18
+GR_sign_mask = r19
+GR_exp_square_oflow = r20
+GR_exp_square_uflow = r21
+GR_exp_ynear1_oflow = r22
+GR_exp_ynear1_uflow = r23
+GR_signif_Z = r24
+
+GR_signexp_x = r32
+
+GR_exp_x = r33
+
GR_Table_Ptr = r34
GR_Table_Ptr1 = r35
-GR_BIAS = r35
GR_Index1 = r36
-GR_sign_mask = r36
GR_Index2 = r37
GR_Expo_X = r37
-GR_signif_Z = r38
GR_M = r38
GR_X_0 = r39
@@ -620,45 +613,49 @@ GR_k = r44
GR_Big_Pos_Exp = r45
+GR_exp_pos_max = r46
-GR_BIAS_p_k = r47
-GR_BIASed_exp_y = r47
+GR_exp_bias_p_k = r47
-GR_Big_Neg_Exp = r48
GR_Index3 = r48
GR_temp = r48
GR_vsm_expo = r49
-GR_y_sign = r49
GR_T1_ptr = r50
+GR_P_ptr1 = r50
GR_T2_ptr = r51
+GR_P_ptr2 = r51
GR_N_fix = r52
GR_exp_y = r53
GR_signif_y = r54
-GR_exp_and_sign_y = r55
+GR_signexp_y = r55
+GR_fraction_y = r55
GR_low_order_bit = r56
-GR_get_exp_mask = r57
-GR_exponent_zero = r58
-
-// ** Registers for unwind support
+GR_exp_mask = r57
+GR_exp_bias = r58
+GR_y_sign = r59
+GR_table_base = r60
+GR_ptr_exp_Arg = r61
+GR_Delta_Exp = r62
+GR_Special_Exp = r63
+GR_exp_neg_max = r64
+GR_Big_Neg_Exp = r65
+
+//** Registers for unwind support
GR_SAVE_PFS = r59
GR_SAVE_B0 = r60
GR_SAVE_GP = r61
-GR_Parameter_X = r62
-GR_Parameter_Y = r63
-GR_Parameter_RESULT = r64
-GR_Parameter_TAG = r65
-
-FR_X = f8
-FR_Y = f9
-FR_RESULT = f99
+GR_Parameter_X = r62
+GR_Parameter_Y = r63
+GR_Parameter_RESULT = r64
+GR_Parameter_TAG = r65
-// **
+//**
FR_Input_X = f8
-FR_Output = f8
+FR_Result = f8
FR_Input_Y = f9
FR_Neg = f10
@@ -671,7 +668,6 @@ FR_poly_hi = f11
FR_Sgn = f12
-FR_Neg_X = f13
FR_half_W = f13
FR_X_cor = f14
@@ -698,13 +694,11 @@ FR_Scale = f36
FR_G_1 = f37
FR_G = f37
FR_Wsq = f37
-FR_L_Inv = f37
FR_temp = f37
FR_H_1 = f38
FR_H = f38
FR_W4 = f38
-FR_float_N = f38
FR_h = f39
FR_h_1 = f39
@@ -720,9 +714,7 @@ FR_L_lo = f41
FR_A_1 = f41
FR_h_2 = f42
-FR_P_6 = f42
-FR_abs_W = f43
FR_W1 = f43
FR_G_3 = f44
@@ -740,7 +732,6 @@ FR_H_3 = f47
FR_float_N = f48
-FR_P_4 = f49
FR_A_2 = f49
FR_Q_4 = f50
@@ -768,7 +759,6 @@ FR_Two = f56
FR_Big = f57
FR_neg_2_mK = f58
-FR_NBig = f58
FR_r = f59
@@ -777,1652 +767,1253 @@ FR_poly_lo = f60
FR_poly = f61
FR_P_5 = f62
+FR_Result_small = f62
FR_rsq = f63
-FR_Result = f99
-FR_Result_small = f100
-FR_Result_big = f101
+FR_Delta = f64
-.section .text
-.proc powl#
-.global powl#
-.align 64
+FR_save_Input_X = f65
+FR_norm_X = f66
+FR_norm_Y = f67
+FR_Y_lo_2 = f68
-powl:
-{ .mfi
-alloc GR_Expo_Range = ar.pfs,0,30,4,0
-(p0) fclass.m.unc p7, p13 = FR_Input_Y, 0x1E7
-nop.i 0
-}
-{ .mfi
-(p0) getf.exp GR_exp_and_sign_y = FR_Input_Y
+FR_P_6 = f69
+FR_Result_big = f69
+
+FR_RSHF_2TO51 = f70
+FR_INV_LN2_2TO63 = f71
+FR_2TOM51 = f72
+FR_RSHF = f73
+FR_TMP1 = f74
+FR_TMP2 = f75
+FR_TMP3 = f76
+FR_Tscale = f77
+FR_P_4 = f78
+FR_NBig = f79
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(powl)
//
-// Save State
+// Get significand of x. It is the critical path.
//
-(p0) fclass.m.unc p6, p12 = FR_Input_X, 0x1E7
-nop.i 0
-};;
{ .mfi
-(p0) getf.sig GR_signif_y = FR_Input_Y
-(p0) fcmp.eq.unc.s1 p12, p13 = FR_Input_X, f1
-nop.i 0
+ getf.sig GR_signif_Z = FR_Input_X // Get significand of x
+ fclass.m p11, p12 = FR_Input_X, 0x0b // Test x unorm
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Check for y = 1
-// Identify EM unsupporteds.
-// Load FR_half = .5
-//
-(p0) fadd.s1 FR_Two = f1, f1
-//
-// Load 1/2 in GP register
-//
-nop.i 0
+ nop.m 999
+ fnorm.s1 FR_norm_X = FR_Input_X // Normalize x
+ mov GR_exp_half = 0xffff - 1 // Exponent for 0.5
}
;;
-{ .mmi
- nop.m 999
-(p0) addl GR_Table_Ptr = @ltoff(Constant_half#), gp
- nop.i 999
+{ .mfi
+ alloc r32 = ar.pfs,0,30,4,0
+ fclass.m p7, p0 = FR_Input_Y, 0x1E7 // Test y natval, nan, inf, zero
+ mov GR_exp_pos_max = 0x13fff // Max exponent for pos oflow test
+}
+{ .mfi
+ addl GR_table_base = @ltoff(Constants_exp_64_Arg#), gp // Ptr to tables
+ fnorm.s1 FR_norm_Y = FR_Input_Y // Normalize y
+ mov GR_exp_neg_max = 0x33fff // Max exponent for neg oflow test
}
;;
-{ .mmi
- ld8 GR_Table_Ptr = [GR_Table_Ptr]
- nop.m 999
- nop.i 999
+{ .mfi
+ getf.exp GR_signexp_y = FR_Input_Y // Get sign and exp of y
+(p12) fclass.m p11, p0 = FR_Input_Y, 0x0b // Test y unorm
+ mov GR_sign_mask = 0x20000 // Sign mask
+}
+{ .mfi
+ ld8 GR_table_base = [GR_table_base] // Get base address for tables
+ fadd.s1 FR_Two = f1, f1 // Form 2.0 for square test
+ mov GR_exp_mask = 0x1FFFF // Exponent mask
}
;;
-{ .mlx
-(p0) ldfe FR_Half =[GR_Table_Ptr],0
-(p0) movl GR_get_exp_mask = 0x1FFFF ;;
+{ .mfi
+ getf.sig GR_signif_y = FR_Input_Y // Get significand of y
+ fclass.m p6, p0 = FR_Input_X, 0x1E7 // Test x natval, nan, inf, zero
+ nop.i 999
}
+;;
{ .mfi
- nop.m 999
-(p0) fclass.nm.unc p9, p15 = FR_Input_Y, 0x1FF
-//
-// Create FR_Two = 2
-// Get exp and significand of Y
-// Crate Masks
-// sgn = 1
-//
-(p0) and GR_exp_y = GR_get_exp_mask,GR_exp_and_sign_y
+ getf.exp GR_signexp_x = FR_Input_X // Get signexp of x
+ fmerge.s FR_save_Input_X = FR_Input_X, FR_Input_X
+ extr.u GR_Index1 = GR_signif_Z, 59, 4 // Extract upper 4 signif bits of x
}
-{ .mlx
- nop.m 999
-(p0) movl GR_exponent_zero = 0xFFFF ;;
+{ .mfb
+ setf.exp FR_Half = GR_exp_half // Load half
+ nop.f 999
+(p11) br.cond.spnt POWL_DENORM // Branch if x or y denorm/unorm
}
+;;
+
+// Return here from POWL_DENORM
+POWL_COMMON:
{ .mfi
- nop.m 999
-(p0) mov FR_Sgn = f1
- nop.i 999
+ setf.exp FR_Big = GR_exp_pos_max // Form big pos value for oflow test
+ fclass.nm p11, p0 = FR_Input_Y, 0x1FF // Test Y unsupported
+ shl GR_Index1 = GR_Index1,5 // Adjust index1 pointer x 32
}
{ .mfi
- nop.m 999
-(p0) fcmp.eq.unc.s1 p10, p11 = FR_Input_Y, f1
- nop.i 999 ;;
+ add GR_Table_Ptr = 0x7c0, GR_table_base // Constants_log_80_Z_G_H_h1
+ fma.s1 FR_Sgn = f1,f1,f0 // Assume result positive
+ mov GR_exp_bias = 0xFFFF // Form exponent bias
}
-{ .mfb
- nop.m 999
+;;
+
//
// Identify NatVals, NaNs, Infs, and Zeros.
-// Load Half
//
-(p0) fclass.nm.unc p8, p14 = FR_Input_X, 0x1FF
-//
-// Remove sign bit from exponent of y.
-// Check for x = 1
-//
-(p6) br.cond.spnt L(POWL_64_SPECIAL) ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p7) br.cond.spnt L(POWL_64_SPECIAL) ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p8) br.cond.spnt L(POWL_64_UNSUPPORT) ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p9) br.cond.spnt L(POWL_64_UNSUPPORT) ;;
-}
-{ .mfi
-(p0) cmp.lt.unc p9, p0 = GR_exp_y,GR_exponent_zero
-(p0) fcmp.lt.unc.s1 p6, p13 = FR_Input_X, f0
//
+// Remove sign bit from exponent of y.
+// Check for x = 1
// Branch on Infs, Nans, Zeros, and Natvals
// Check to see that exponent < 0
//
-(p0) sub GR_exp_y = GR_exp_y,GR_exponent_zero
-}
-// x not zero, is y ==2?
{ .mfi
- nop.m 999
-(p11) fcmp.eq.unc.s1 p7, p14 = FR_Input_Y, FR_Two
- nop.i 999 ;;
+ setf.exp FR_NBig = GR_exp_neg_max // Form big neg value for oflow test
+ fclass.nm p8, p0 = FR_Input_X, 0x1FF // Test X unsupported
+ and GR_exp_y = GR_exp_mask,GR_signexp_y // Get biased exponent of y
}
{ .mfb
- nop.m 999
-(p9) fcmp.lt.unc.s1 p9, p0 = FR_Input_X, f0
-(p7) br.cond.spnt L(POWL_64_SQUARE) ;; // Branch if x not zero and y=2
+ add GR_Index1 = GR_Index1,GR_Table_Ptr
+ nop.f 999
+(p6) br.cond.spnt POWL_64_SPECIAL // Branch if x natval, nan, inf, zero
}
-{ .mfi
- nop.m 999
-(p6) fmerge.ns FR_Neg_X = FR_Input_X, FR_Input_X
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p10) fmpy.s0 FR_Result = FR_Input_X, f1
-//
-// For y = 1, compute result = x
-// For x = 1, compute 1
-// When Y is one return X and possible raise
-// denormal operand exception.
-// Remove exponent BIAS
+;;
+
+// load Z_1 from Index1
+
+// There is logic starting here to determine if y is an integer when x < 0.
+// If 0 < |y| < 1 then clearly y is not an integer.
+// If |y| > 1, then the significand of y is shifted left by the size of
+// the exponent of y. This preserves the lsb of the integer part + the
+// fractional bits. The lsb of the integer can be tested to determine if
+// the integer is even or odd. The fractional bits can be tested. If zero,
+// then y is an integer.
//
-(p6) shl GR_exp_and_sign_y= GR_signif_y,GR_exp_y ;;
-}
{ .mfi
-(p9) or GR_exp_and_sign_y = 0xF,GR_signif_y
-(p12) fma.s0 FR_Result = FR_Input_Y, f0, f1
- nop.i 999 ;;
+ ld2 GR_Z_1 =[GR_Index1],4 // Load Z_1
+ fmerge.s FR_Z = f0, FR_norm_X // Z = |x|
+ extr.u GR_X_0 = GR_signif_Z, 49, 15 // Extract X_0 from significand
}
-{ .mii
- nop.m 999
-(p6) extr.u GR_exp_y = GR_exp_and_sign_y,63,1 ;;
-(p6) cmp.ne.unc p9, p0 = GR_exp_y, r0
+{ .mfb
+ cmp.lt p9, p0 = GR_exp_y,GR_exp_bias // Test 0 < |y| < 1
+ nop.f 999
+(p7) br.cond.spnt POWL_64_SPECIAL // Branch if y natval, nan, inf, zero
}
-{ .mii
- nop.m 999
-//
-// Both predicates can be set.
-// Don't consider y's < 1.
-//
-(p6) shl GR_signif_y= GR_exp_and_sign_y,1 ;;
-//
-// Is shift off integer part of y.
-// Get y's even or odd bit.
-//
-(p6) cmp.ne.unc p8, p0 = GR_signif_y, r0
+;;
+
+{ .mfb
+ ldfs FR_G_1 = [GR_Index1],4 // Load G_1
+ fcmp.eq.s1 p10, p0 = FR_Input_Y, f1 // Test Y = +1.0
+(p8) br.cond.spnt POWL_64_UNSUPPORT // Branch if x unsupported
}
-{ .mib
- nop.m 999
- nop.i 999
+;;
+
//
-// Is the fractional part of the y = 0?
-// Is the integer even or odd.
+// X_0 = High order 15 bit of Z
//
-(p10) br.cond.spnt L(POWL_64_RETURN) ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p12) br.cond.spnt L(POWL_64_RETURN) ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p8) br.cond.spnt L(POWL_64_XNEG) ;;
+{ .mfb
+ ldfs FR_H_1 = [GR_Index1],8 // Load H_1
+(p9) fcmp.lt.unc.s1 p9, p0 = FR_Input_X, f0 // Test x<0, 0 <|y|<1
+(p11) br.cond.spnt POWL_64_UNSUPPORT // Branch if y unsupported
}
+;;
+
{ .mfi
- nop.m 999
-(p9) fmerge.ns FR_Sgn = FR_Sgn, FR_Sgn
- nop.i 999
+ ldfe FR_h_1 = [GR_Index1] // Load h_1
+ fcmp.eq.s1 p7, p0 = FR_Input_Y, FR_Two // Test y = 2.0
+ pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // X_1 = X_0 * Z_1 (bits 15-30)
+ // Wait 4 cycles to use result
}
{ .mfi
- nop.m 999
-(p0) fcmp.eq.unc.s0 p11, p0 = FR_Input_Y, FR_Half
- nop.i 999 ;;
+ add GR_Table_Ptr = 0x9c0, GR_table_base // Constants_log_80_Z_G_H_h2
+ nop.f 999
+ sub GR_exp_y = GR_exp_y,GR_exp_bias // Get true exponent of y
}
+;;
+
//
-// Raise possible denormal operand exception for both
-// X and Y.
+// Branch for (x < 0) and Y not an integer.
//
{ .mfb
- nop.m 999
-//
-// Branch for (x < 0) and Y not an integer.
-//
-(p0) fcmp.eq.unc.s0 p12, p0 = FR_Input_X, f1
-//
-// For x < 0 and y integer, make x positive
-// For x < 0 and y odd integer,, set sign = -1.
-//
-(p11) br.cond.spnt L(POWL_64_SQRT) ;;
-}
-{ .mmf
-(p0) cmp.eq.unc p15, p14 = r0, r0
- nop.m 999
-(p13) fnorm.s1 FR_Z = FR_Input_X ;;
-}
-{ .mfi
- nop.m 999
-(p6) fnorm.s1 FR_Z = FR_Neg_X
- nop.i 999
+ nop.m 999
+ fcmp.lt.s1 p6, p0 = FR_Input_X, f0 // Test x < 0
+(p9) br.cond.spnt POWL_64_XNEG // Branch if x < 0, 0 < |y| < 1
}
;;
-//
-// Branch to embedded sqrt(x)
-//
-//
-// Computes ln( x ) to extra precision
-// Input FR 1: FR_X
-// Output FR 2: FR_Y_hi
-// Output FR 3: FR_Y_lo
-// Output PR 1: PR_Safe
-//
-
-{ .mmi
+{ .mfi
nop.m 999
-(p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_Z_G_H_h1#), gp
+ fcmp.eq.s1 p12, p0 = FR_Input_X, f1 // Test x=+1.0
nop.i 999
}
+{ .mfb
+ nop.m 999
+ fsub.s1 FR_W = FR_Z, f1 // W = Z - 1
+(p7) br.cond.spnt POWL_64_SQUARE // Branch if y=2
+}
;;
-{ .mmi
- ld8 GR_Table_Ptr = [GR_Table_Ptr]
+{ .mfi
nop.m 999
- nop.i 999
+(p10) fmpy.s0 FR_Result = FR_Input_X, f1 // If y=+1.0, result=x
+(p6) shl GR_fraction_y= GR_signif_y,GR_exp_y // Get lsb of int + fraction
+ // Wait 4 cycles to use result
}
;;
-
-{ .mlx
- nop.m 999
-(p0) movl GR_BIAS = 0x000000000000FFFF ;;
-}
{ .mfi
- nop.m 999
-(p0) fsub.s1 FR_W = FR_Z, f1
- nop.i 999 ;;
-}
-//
-// Z = Norm(X) - both + and - case
-// Set Safe = True
-//
-{ .mmb
-(p0) getf.sig GR_signif_Z = FR_Z
-(p0) getf.exp GR_N = FR_Z
- nop.b 999 ;;
-}
-{ .mii
- nop.m 999
-//
-// Get significand of Z
-// W = Z - 1
-//
-(p0) extr.u GR_Index1 = GR_signif_Z, 59, 4 ;;
-//
-// Index1 = High order 4 bits of Z
-// X_0 = High order 15 bit of Z
-//
-(p0) shl GR_Index1 = GR_Index1,5 ;;
-}
-{ .mfi
- nop.m 999
-//
-// Add offset to Index1 ptr.
-//
-(p0) fabs FR_abs_W = FR_W
-//
-// BIAS = 0x000...FFFF
-// Adjust Index1 ptr ( x 32) .
-//
-(p0) add GR_Index1 = GR_Index1,GR_Table_Ptr
+ nop.m 999
+(p12) fma.s0 FR_Result = FR_Input_Y, f0, f1 // If x=1.0, result=1, chk denorm
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract index2
}
-{ .mmi
- nop.m 999 ;;
-(p0) ld2 GR_Z_1 =[GR_Index1],4
-(p0) extr.u GR_X_0 = GR_signif_Z, 49, 15
+;;
+
+//
+// N = exponent of Z
+//
+{ .mib
+ getf.exp GR_N = FR_Z // Get exponent of Z (also x)
+ shl GR_Index2=GR_Index2,5 // Index2 x 32 bytes
+(p10) br.ret.spnt b0 // Exit if y=+1.0
}
;;
-{ .mmi
- nop.m 999
-(p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_Z_G_H_h2#), gp
+{ .mib
+ add GR_Index2 = GR_Index2, GR_Table_Ptr // Pointer to table 2
nop.i 999
+(p12) br.ret.spnt b0 // Exit if x=+1.0
}
;;
{ .mmi
- ld8 GR_Table_Ptr = [GR_Table_Ptr]
- nop.m 999
+ ld2 GR_Z_2 =[GR_Index2],4 // Load Z_2
+;;
+ ldfs FR_G_2 = [GR_Index2],4 // Load G_2
nop.i 999
}
;;
-
-{ .mmi
-(p0) ldfs FR_G_1 = [GR_Index1],4 ;;
-(p0) ldfs FR_H_1 = [GR_Index1],8
- nop.i 999 ;;
+{ .mii
+ ldfs FR_H_2 = [GR_Index2],8 // Load H_2
+(p6) tbit.nz.unc p9, p0 = GR_fraction_y, 63 // Test x<0 and y odd integer
+ add GR_Table_Ptr = 0xbcc, GR_table_base // Constants_log_80_h3_G_H, G_3
}
+;;
+
//
-// Adjust Index2 (x 32).
+// For x < 0 and y odd integer, set sign = -1.
//
{ .mfi
-(p0) ldfe FR_h_1 = [GR_Index1],0
- nop.f 999
-(p0) pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 ;;
-}
-{ .mmi
- nop.m 999 ;;
-//
-// load Z_1 from Index1
-// abs_W = |W|
-// Point to Table2
-//
-(p0) getf.exp GR_M = FR_abs_W
-//
-// M = M - BIAS
-// Load G_1
-// N = exponent of Z
-//
- nop.i 999;;
+ getf.exp GR_M = FR_W // Get signexp of W
+ nop.f 999
+ pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // X_2 = X_1 * Z_2 (bits 15-30)
}
-{ .mmi
- nop.m 999
- nop.m 999
- nop.i 999;;
+{ .mfi
+ ldfe FR_h_2 = [GR_Index2] // Load h_2
+(p9) fnma.s1 FR_Sgn = f1, f1, f0 // If x<0, y odd int, result negative
+ sub GR_N = GR_N, GR_exp_bias // Get true exponent of x = N
}
-{ .mmi
- nop.m 999
- nop.m 999
- nop.i 999;;
+;;
+
+{ .mfi
+ add GR_Table_Ptr1 = 0xdc0, GR_table_base // Ptr to H_3
+ fcmp.eq.s0 p11, p0 = FR_Input_Y, FR_Half // Test y=0.5, also set denorm
+(p6) shl GR_fraction_y= GR_fraction_y, 1 // Shift left 1 to get fraction
}
-{ .mmi
- nop.m 999
- nop.m 999
-(p0) extr.u GR_Index2 = GR_X_1, 6, 4 ;;
+;;
+
+{ .mmb
+ setf.sig FR_float_N = GR_N
+(p6) cmp.ne.unc p8, p0 = GR_fraction_y, r0 // Test x<0 and y not integer
+(p8) br.cond.spnt POWL_64_XNEG // Branch if x<0 and y not int
}
-{ .mii
- nop.m 999
-//
-// Extract Index2
-// Load H_1
-// Is -8 > M ?
+;;
+
//
-(p0) shl GR_Index2=GR_Index2,5 ;;
-(p0) add GR_Index2 = GR_Index2, GR_Table_Ptr
-}
+// Raise possible denormal operand exception for both X and Y.
+// Set pointers in case |x| near 1
+// Branch to embedded sqrt(x) if y=0.5
//
-// M = exponent of abs_W
-// X_1 = X_0 * Z_1
-//
-{ .mii
-(p0) sub GR_M = GR_M, GR_BIAS
- nop.i 999 ;;
-(p0) cmp.gt.unc p7, p14 = -8, GR_M
+{ .mfi
+ add GR_P_ptr1 = 0x6b0, GR_table_base // Constants_log_80_P, P8, NEAR path
+ fcmp.eq.s0 p12, p0 = FR_Input_X, FR_Input_Y // Dummy to set denormal
+ add GR_P_ptr2 = 0x700, GR_table_base // Constants_log_80_P, P4, NEAR path
}
-{ .mib
- nop.m 999
- nop.i 999
-(p7) br.cond.spnt L(LOGL80_NEAR) ;;
+{ .mfb
+ cmp.eq p15, p14 = r0, r0 // Assume result safe (no over/under)
+ fsub.s1 FR_Delta = FR_Input_Y,f1 // Delta = y - 1.0
+(p11) br.cond.spnt POWL_64_SQRT // Branch if y=0.5
}
+;;
+
//
-// Load h_1
-// Possible branch out.
-// Add offset of table to Index2
+// Computes ln( x ) to extra precision
+// Input FR 1: FR_X
+// Output FR 2: FR_Y_hi
+// Output FR 3: FR_Y_lo
+// Output PR 1: PR_Safe
//
{ .mfi
-(p0) ld2 GR_Z_2 =[GR_Index2],4
-(p0) fmerge.se FR_S = f1,FR_Z
-(p0) sub GR_N = GR_N, GR_BIAS
+ and GR_M = GR_exp_mask, GR_M // Mask to get exponent of W
+ nop.f 999
+ extr.u GR_Index3 = GR_X_2, 1, 5 // Get index3
}
;;
{ .mmi
- nop.m 999
-(p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_h3_G_H#), gp
- nop.i 999
+ shladd GR_Table_Ptr1 = GR_Index3,2,GR_Table_Ptr1 // Ptr to H_3
+ shladd GR_Index3 = GR_Index3,4,GR_Table_Ptr // Ptr to G_3
+ sub GR_M = GR_M, GR_exp_bias // Get true exponent of W
}
;;
-{ .mmi
- ld8 GR_Table_Ptr = [GR_Table_Ptr]
- nop.m 999
- nop.i 999
+{ .mib
+ ldfs FR_G_3 = [GR_Index3],-12 // Load G_3
+ cmp.gt p7, p14 = -8, GR_M // Test if |x-1| < 2^-8
+(p7) br.cond.spnt LOGL80_NEAR // Branch if |x-1| < 2^-8
}
;;
-//
-// load Z_2
-// N - BIAS
-// Point to Table 3.
-// S = merging of Z and 1.0
-//
-{ .mmi
-(p0) ldfs FR_G_2 = [GR_Index2],4
-(p0) setf.sig FR_float_N = GR_N
-(p0) add GR_Table_Ptr1 = 0x200,GR_Table_Ptr ;;
-}
-//
-// load G_2
-// X_2 = X_1 * Z_2
-// Add offset to Table 2 ptr.
-// float_N = significand of N
-//
-{ .mmi
-(p0) ldfs FR_H_2 = [GR_Index2],8 ;;
-//
-// load H_2
-// G = G * G_2
-//
-(p0) ldfe FR_h_2 = [GR_Index2],0
-(p0) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 ;;
-}
-{ .mmi
- nop.m 999
- nop.m 999
- nop.i 999;;
-}
-{ .mmi
- nop.m 999
- nop.m 999
- nop.i 999;;
-}
-{ .mmi
- nop.m 999
- nop.m 999
- nop.i 999;;
+// Here if |x-1| >= 2^-8
+{ .mmf
+ ldfs FR_H_3 = [GR_Table_Ptr1] // Load H_3
+ nop.m 999
+ nop.f 999
}
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p0) extr.u GR_Index3 = GR_X_2, 1, 5 ;;
+;;
+
+{ .mfi
+ ldfe FR_h_3 = [GR_Index3] // Load h_3
+ fmerge.se FR_S = f1,FR_Z // S = merge of 1.0 and signif(Z)
+ nop.i 999
}
{ .mfi
-(p0) shladd GR_Table_Ptr1 = GR_Index3,2,GR_Table_Ptr1
- nop.f 999
+ add GR_Table_Ptr = 0x740, GR_table_base // Constants_log_80_Q
+ fmpy.s1 FR_G = FR_G_1, FR_G_2 // G = G_1 * G_2
+ nop.i 999
+}
+;;
+
//
-// h = h_1 + h_2
-// Adjust Index3
+// Begin Loading Q's - load log2_hi part
//
-(p0) shladd GR_Index3 = GR_Index3,4,GR_Table_Ptr ;;
-}
-{ .mmb
- nop.m 999
-(p0) ldfe FR_h_3 = [GR_Index3],12
- nop.b 999 ;;
-}
-{ .mmf
-(p0) ldfs FR_H_3 = [GR_Table_Ptr1],0
+{ .mfi
+ ldfe FR_log2_hi = [GR_Table_Ptr],16 // Load log2_hi
+ fadd.s1 FR_H = FR_H_1, FR_H_2 // H = H_1 + H_2
+ nop.i 999
+};;
+
//
-// float_N = Make N a fp number
-// Load h_3
-// Get pointer to Q table.
+// h = h_1 + h_2
//
-(p0) ldfs FR_G_3 = [GR_Index3],0
-(p0) fmpy.s1 FR_G = FR_G_1, FR_G_2
+{ .mfi
+ ldfe FR_log2_lo = [GR_Table_Ptr],16 // Load log2_lo
+ fadd.s1 FR_h = FR_h_1, FR_h_2 // h = h_1 + h_2
+ nop.i 999
}
;;
-{ .mmi
- nop.m 999
-(p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_Q#), gp
+{ .mfi
+ ldfe FR_Q_6 = [GR_Table_Ptr],16 // Load Q_6
+ fcvt.xf FR_float_N = FR_float_N
nop.i 999
}
;;
-{ .mmi
- ld8 GR_Table_Ptr = [GR_Table_Ptr]
- nop.m 999
+{ .mfi
+ ldfe FR_Q_5 = [GR_Table_Ptr],16 // Load Q_5
+ nop.f 999
nop.i 999
}
;;
-
-
-{ .mfi
-(p0) ldfe FR_log2_hi = [GR_Table_Ptr],16
-(p0) fadd.s1 FR_H = FR_H_1, FR_H_2
- nop.i 999 ;;
-}
-{ .mmf
- nop.m 999
-//
-// G = G_1 * G_2 * G_3
-//
-(p0) ldfe FR_log2_lo = [GR_Table_Ptr],16
-//
-// load h_2
-// H = H_1 + H_2
-// Get Index3
//
-(p0) fadd.s1 FR_h = FR_h_1, FR_h_2 ;;
-}
-//
-// Load log2_lo part
-// r = G*S -1
+// G = G_1 * G_2 * G_3
//
{ .mfi
-(p0) ldfe FR_Q_6 = [GR_Table_Ptr],16
-//
-// Load H_3
-//
-(p0) fcvt.xf FR_float_N = FR_float_N
- nop.i 999 ;;
+ ldfe FR_Q_4 = [GR_Table_Ptr],16 // Load Q_4
+ fmpy.s1 FR_G = FR_G, FR_G_3
+ nop.i 999
}
+;;
+
//
-// Load Q_6
+// H = H_1 + H_2 + H_3
//
-{ .mmi
-(p0) ldfe FR_Q_5 = [GR_Table_Ptr],16 ;;
-(p0) ldfe FR_Q_4 = [GR_Table_Ptr],16
- nop.i 999 ;;
-}
-{ .mmi
-(p0) ldfe FR_Q_3 = [GR_Table_Ptr],16 ;;
-(p0) ldfe FR_Q_2 = [GR_Table_Ptr],16
- nop.i 999 ;;
+{ .mfi
+ ldfe FR_Q_3 = [GR_Table_Ptr],16 // Load Q_3
+ fadd.s1 FR_H = FR_H, FR_H_3
+ nop.i 999
}
-{ .mmf
- nop.m 999
-//
-// poly_lo = Q_5 + r * Q_6
-// Load Q_2
-// rsq = r * r
+;;
+
//
-(p0) ldfe FR_Q_1 = [GR_Table_Ptr],16
+// Y_lo = poly + Y_lo
//
-// h = h_1 + h_2 + h_3
-// H = H_1 + H_2 + H_3
-// Load G_3.
-// Begin Loading Q's - load log2_hi part
+// h = h_1 + h_2 + h_3
//
-(p0) fmpy.s1 FR_G = FR_G, FR_G_3
-}
{ .mfi
- nop.m 999
-(p0) fadd.s1 FR_H = FR_H, FR_H_3
- nop.i 999
+ ldfe FR_Q_2 = [GR_Table_Ptr],16 // Load Q_2
+ fadd.s1 FR_h = FR_h, FR_h_3
+ nop.i 999
}
;;
//
-// Y_lo = poly + Y_lo
+// GS_hi = G*S
+// r = G*S -1
//
-
-{ .mmi
- nop.m 999
-(p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_Arg#), gp
+{ .mfi
+ ldfe FR_Q_1 = [GR_Table_Ptr],16 // Load Q_1
+ fmpy.s1 FR_GS_hi = FR_G, FR_S
nop.i 999
}
-;;
-
-{ .mmi
- ld8 GR_Table_Ptr = [GR_Table_Ptr]
+{ .mfi
nop.m 999
+ fms.s1 FR_r = FR_G, FR_S, f1
nop.i 999
}
;;
-
-{ .mfi
- nop.m 999
-(p0) fadd.s1 FR_h = FR_h, FR_h_3
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
//
-// Load Q_5
+// poly_lo = Q_5 + r * Q_6
//
-(p0) fmpy.s1 FR_GS_hi = FR_G, FR_S
- nop.i 999
-}
-{ .mfi
- nop.m 999
-(p0) fms.s1 FR_r = FR_G, FR_S, f1
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_poly_lo = FR_r, FR_Q_6, FR_Q_5
- nop.i 999
+ getf.exp GR_Delta_Exp = FR_Delta // Get signexp of y-1 for exp calc
+ fma.s1 FR_poly_lo = FR_r, FR_Q_6, FR_Q_5
+ nop.i 999
}
-{ .mfi
- nop.m 999
//
-// GS_hi = G*S
-// Load Q_4
+// r_cor = GS_hi -1
//
-(p0) fsub.s1 FR_r_cor = FR_GS_hi, f1
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p0) fms.s1 FR_GS_lo = FR_G, FR_S, FR_GS_hi
- nop.i 999
-}
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_poly = FR_r, FR_Q_2, FR_Q_1
- nop.i 999 ;;
+ nop.m 999
+ fsub.s1 FR_r_cor = FR_GS_hi, f1
+ nop.i 999
}
-{ .mfi
- nop.m 999
+;;
+
//
-// Load Q_3
-// r_cor = GS_hi -1
// GS_lo = G*S - GS_hi
//
-(p0) fmpy.s1 FR_rsq = FR_r, FR_r
- nop.i 999
-}
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_G = FR_float_N, FR_log2_hi, FR_H
- nop.i 999 ;;
+ nop.m 999
+ fms.s1 FR_GS_lo = FR_G, FR_S, FR_GS_hi
+ nop.i 999
}
-{ .mfi
- nop.m 999
+;;
+
//
-// poly = poly_hi + rsq * poly_lo
-// Tbl = float_N*log2_hi + H
+// rsq = r * r
//
-(p0) fma.s1 FR_Y_lo = FR_float_N, FR_log2_lo, FR_h
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-//
-// r_cor = r_cor - r
-// poly_hi = r * Q_2 + Q_1
-//
-(p0) fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_4
- nop.i 999
+ nop.m 999
+ fmpy.s1 FR_rsq = FR_r, FR_r
+ nop.i 999
}
-{ .mfi
- nop.m 999
//
-// Load Q_1
+// G = float_N*log2_hi + H
//
-(p0) fsub.s1 FR_r_cor = FR_r_cor, FR_r
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-//
-// Y_lo = float_N*log2_lo + h
-//
-(p0) fadd.s1 FR_Y_hi = FR_G, FR_r
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_G = FR_float_N, FR_log2_hi, FR_H
+ nop.i 999
}
-{ .mfi
- nop.m 999
+;;
+
//
-// poly_lo = Q_4 + r * poly_lo;;
-// r_cor = r_cor + GS_lo;;
+// Y_lo = float_N*log2_lo + h
//
-(p0) fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_3
- nop.i 999
-}
{ .mfi
- nop.m 999
-(p0) fadd.s1 FR_r_cor = FR_r_cor, FR_GS_lo
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p0) fadd.s1 FR_r_cor = FR_r_cor, FR_Y_lo
- nop.i 999
+ nop.m 999
+ fma.s1 FR_Y_lo = FR_float_N, FR_log2_lo, FR_h
+ nop.i 999
}
-{ .mfi
- nop.m 999
+;;
+
//
-// poly_lo = Q_3 + r * poly_lo;;
+// poly_lo = Q_4 + r * poly_lo
+// r_cor = r_cor - r
//
-(p0) fma.s1 FR_poly = FR_rsq, FR_poly_lo, FR_poly
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-(p0) fsub.s1 FR_Y_lo = FR_G, FR_Y_hi
- nop.i 999
-}
-{ .mmi
-(p0) ldfe FR_L_Inv = [GR_Table_Ptr],16 ;;
-(p0) ldfe FR_L_hi = [GR_Table_Ptr],16
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_4
+ nop.i 999
}
{ .mfi
-(p0) ldfe FR_L_lo = [GR_Table_Ptr],16
- nop.f 999
- nop.i 999 ;;
+ nop.m 999
+ fsub.s1 FR_r_cor = FR_r_cor, FR_r
+ nop.i 999
}
-{ .mfi
- nop.m 999
+;;
+
//
-// Y_hi = Tbl + r
-// r_cor = r_cor + Y_lo
+// poly_hi = r * Q_2 + Q_1
+// Y_hi = G + r
//
-(p0) fma.s1 FR_poly = FR_rsq, FR_poly, FR_r_cor
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-// Y_lo = Tbl - Y_hi
-// poly = rsq * poly + r_cor
-//
-(p0) fadd.s1 FR_Y_lo = FR_Y_lo, FR_r
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_poly = FR_r, FR_Q_2, FR_Q_1
+ nop.i 999
}
-{ .mfb
- nop.m 999
-//
-// Y_lo = Y_lo + r
-//
-(p0) fadd.s1 FR_Y_lo = FR_Y_lo, FR_poly
-//
-// Load L_Inv
-// Load L_hi
-// Load L_lo
-// all long before they are needed.
-// They are used in LOGL_RETURN PATH
-//
-br.cond.sptk L(LOGL_RETURN) ;;
+{ .mfi
+ nop.m 999
+ fadd.s1 FR_Y_hi = FR_G, FR_r
+ nop.i 999
}
-L(LOGL80_NEAR):
+;;
+
//
-// Branch LOGL80_NEAR
+// poly_lo = Q_3 + r * poly_lo
+// r_cor = r_cor + GS_lo
//
-
-{ .mmi
+{ .mfi
nop.m 999
-(p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_P#), gp
+ fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_3
nop.i 999
}
-;;
-
-{ .mmi
- ld8 GR_Table_Ptr = [GR_Table_Ptr]
+{ .mfi
nop.m 999
+ fadd.s1 FR_r_cor = FR_r_cor, FR_GS_lo
nop.i 999
}
;;
-{ .mfi
- nop.m 999
-(p0) fmpy.s1 FR_Wsq = FR_W, FR_W
-(p0) add GR_Table_Ptr1 = 0x50,GR_Table_Ptr
-}
//
-// Adjust ptr to 1/2
-// Adjust Ptr1 to P_4
+// Y_lo = G - Y_hi
//
-{ .mmi
-(p0) ldfe FR_Half = [GR_Table_Ptr],16 ;;
-(p0) ldfe FR_P_4 = [GR_Table_Ptr1],16
- nop.i 999
+{ .mfi
+ nop.m 999
+ fsub.s1 FR_Y_lo_2 = FR_G, FR_Y_hi
+ nop.i 999
}
+;;
+
//
-// Load 1/2
+// r_cor = r_cor + Y_lo
+// poly = poly_hi + rsq * poly_lo
//
-{ .mmi
-(p0) ldfe FR_P_8 = [GR_Table_Ptr],16 ;;
-(p0) ldfe FR_P_3 = [GR_Table_Ptr1],16
- nop.i 999
+{ .mfi
+ add GR_Table_Ptr = 0x0, GR_table_base // Constants_exp_64_Arg
+ fadd.s1 FR_r_cor = FR_r_cor, FR_Y_lo
+ nop.i 999
}
-{ .mmi
-(p0) ldfe FR_P_7 = [GR_Table_Ptr],16 ;;
-(p0) ldfe FR_P_2 = [GR_Table_Ptr1],16
- nop.i 999
+{ .mfi
+ nop.m 999
+ fma.s1 FR_poly = FR_rsq, FR_poly_lo, FR_poly
+ nop.i 999
}
+;;
+
//
-// Load P_7
-// half_W = .5 * W
-// Load P_3
-//
-{ .mmi
-(p0) ldfe FR_P_6 = [GR_Table_Ptr],16 ;;
-(p0) ldfe FR_P_1 = [GR_Table_Ptr1],16
- nop.i 999 ;;
-}
+// Load L_hi
+// Load L_lo
+// all long before they are needed.
+// They are used in LOGL_RETURN PATH
//
-// Load P_6
-// Wsq = w * w
-// poly = w*P_4 + P_3
-// Load P_2
+// Y_lo = Y_lo + r
+// poly = rsq * poly + r_cor
//
{ .mfi
-(p0) ldfe FR_P_5 = [GR_Table_Ptr],16
-//
-// Load P_5
-// poly_lo = w * P_8 + P_7
-// Y_hi = w - (1/2)w*w
-// Load P_1
-//
-(p0) fmpy.s1 FR_W4 = FR_Wsq, FR_Wsq
- nop.i 999
+ ldfe FR_L_hi = [GR_Table_Ptr],16 // Load L_hi
+ fadd.s1 FR_Y_lo = FR_Y_lo_2, FR_r
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fmpy.s1 FR_W3 = FR_Wsq, FR_W
- nop.i 999
+ nop.m 999
+ fma.s1 FR_poly = FR_rsq, FR_poly, FR_r_cor
+ nop.i 999
}
;;
+{ .mfb
+ ldfe FR_L_lo = [GR_Table_Ptr],16 // Load L_lo
+ fadd.s1 FR_Y_lo = FR_Y_lo, FR_poly
+ br.cond.sptk LOGL_RETURN // Branch to common code
+}
+;;
+
+
+LOGL80_NEAR:
+// Here if |x-1| < 2^-8
//
-// Y_lo = W3 * poly + Y_lo
+// Branch LOGL80_NEAR
//
+{ .mmf
+ ldfe FR_P_8 = [GR_P_ptr1],16 // Load P_8
+ ldfe FR_P_4 = [GR_P_ptr2],16 // Load P_4
+ fmpy.s1 FR_Wsq = FR_W, FR_W
+}
+;;
+
{ .mmi
- nop.m 999
-(p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_Arg#), gp
+ ldfe FR_P_7 = [GR_P_ptr1],16 // Load P_7
+ ldfe FR_P_3 = [GR_P_ptr2],16 // Load P_3
nop.i 999
}
;;
{ .mmi
- ld8 GR_Table_Ptr = [GR_Table_Ptr]
- nop.m 999
+ ldfe FR_P_6 = [GR_P_ptr1],16 // Load P_6
+ ldfe FR_P_2 = [GR_P_ptr2],16 // Load P_2
nop.i 999
}
;;
-
{ .mmi
-(p0) ldfe FR_L_Inv = [GR_Table_Ptr],16 ;;
-(p0) ldfe FR_L_hi = [GR_Table_Ptr],16
- nop.i 999 ;;
-}
-{ .mfi
-(p0) ldfe FR_L_lo = [GR_Table_Ptr],16
-//
-// Load P_8
-// Load P_4
-//
-(p0) fmpy.s1 FR_half_W = FR_Half, FR_W
- nop.i 999 ;;
+ ldfe FR_P_5 = [GR_P_ptr1],16 // Load P_5
+ ldfe FR_P_1 = [GR_P_ptr2],16 // Load P_1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_poly_lo = FR_W, FR_P_8,FR_P_7
- nop.i 999
+ getf.exp GR_Delta_Exp = FR_Delta // Get signexp of y-1 for exp calc
+ fmpy.s1 FR_W4 = FR_Wsq, FR_Wsq
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_poly = FR_W, FR_P_4, FR_P_3
- nop.i 999 ;;
+ add GR_Table_Ptr = 0x0, GR_table_base // Constants_exp_64_Arg
+ fmpy.s1 FR_W3 = FR_Wsq, FR_W
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fnma.s1 FR_Y_hi = FR_W, FR_half_W, FR_W
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 FR_half_W = FR_Half, FR_W
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// W4 = Wsq * Wsq
-// poly = w *poly + P_2
-//
-(p0) fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_6
- nop.i 999
+ ldfe FR_L_hi = [GR_Table_Ptr],16
+ fma.s1 FR_poly_lo = FR_W, FR_P_8,FR_P_7
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_poly = FR_W, FR_poly, FR_P_2
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_poly = FR_W, FR_P_4, FR_P_3
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fsub.s1 FR_Y_lo = FR_W, FR_Y_hi
- nop.i 999 ;;
+ ldfe FR_L_lo = [GR_Table_Ptr],16
+ fnma.s1 FR_Y_hi = FR_W, FR_half_W, FR_W
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// poly = w * poly + P_1
-// w3 = wsq * w
-//
-(p0) fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_5
- nop.i 999
+ nop.m 999
+ fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_6
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// poly_lo = w * poly_lo + P_6
-// Y_lo = W - Y_hi
-//
-(p0) fma.s1 FR_poly = FR_W, FR_poly, FR_P_1
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_poly = FR_W, FR_poly, FR_P_2
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fnma.s1 FR_Y_lo = FR_W, FR_half_W, FR_Y_lo
- nop.i 999 ;;
+ nop.m 999
+ fsub.s1 FR_Y_lo = FR_W, FR_Y_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// poly_lo = w * poly_lo +
-// Y_lo = Y_lo - w * (1/2)w
-//
-(p0) fma.s1 FR_poly = FR_poly_lo, FR_W4, FR_poly
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_5
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Y_lo = (W-Y_hi) - w * (1/2)w
-// poly = W4* poly_lo + poly
-//
-(p0) fma.s1 FR_Y_lo = FR_poly, FR_W3, FR_Y_lo
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_poly = FR_W, FR_poly, FR_P_1
+ nop.i 999
}
-L(LOGL_RETURN):
+;;
+
{ .mfi
-(p0) add GR_Expo_Range = 0x2,r0
-//
-// Load L_Inv
-// Load L_hi
-// Load L_lo
-// all long before they are needed.
-//
-//
-// kernel_log_80 computed ln(X)
-// and return logX_hi and logX_lo as results.
-// PR_pow_Safe set as well.
-//
-(p0) fmpy.s1 FR_X_lo = FR_Input_Y, FR_logx_lo
-//
-// Compute Y * (logX_hi + logX_lo)
-// P_hi -> X
-// P_lo -> X_cor
-// (Manipulate names so that inputs are in
-// the place kernel_exp expects them)
-// Set GR_Flag to 2
-// Set GR_Expo_Range to Double
-//
-// This function computes exp( x + x_cor)
-// Input FR 1: FR_X
-// Input FR 2: FR_X_cor
-// Input GR 1: GR_Flag
-// Input GR 2: GR_Expo_Range
-// Output FR 3: FR_Y_hi
-// Output FR 4: FR_Y_lo
-// Output FR 5: FR_Scale
-// Output PR 1: PR_Safe
-//
-(p0) cmp.eq.unc p15, p0 = r0, r0
+ nop.m 999
+ fnma.s1 FR_Y_lo = FR_W, FR_half_W, FR_Y_lo
+ nop.i 999
}
;;
-{ .mmi
-(p0) addl GR_W1_ptr = @ltoff(Constants_exp_64_W1#), gp
-(p0) addl GR_W2_ptr = @ltoff(Constants_exp_64_W2#), gp
-(p0) add GR_Flag = 0x2,r0
+{ .mfi
+ nop.m 999
+ fma.s1 FR_poly = FR_poly_lo, FR_W4, FR_poly
+ nop.i 999
}
;;
-{ .mmi
- ld8 GR_W1_ptr = [GR_W1_ptr]
- ld8 GR_W2_ptr = [GR_W2_ptr]
-(p0) cmp.ne.unc p7, p0 = 0x1, GR_Flag
+{ .mfi
+ nop.m 999
+ fma.s1 FR_Y_lo = FR_poly, FR_W3, FR_Y_lo
+ nop.i 999
}
;;
-{ .mlx
- nop.m 999
-(p0) movl GR_Mask = 0x1FFFF ;;
-}
+LOGL_RETURN:
+// Common code for completion of both logx paths
-{ .mlx
- nop.m 999
-(p0) movl GR_BIAS = 0x0FFFF ;;
-}
-{ .mfi
- nop.m 999
//
-// X_lo = Y * logX_lo
+// L_hi, L_lo already loaded.
//
-(p0) fma.s1 FR_P_hi = FR_Input_Y, FR_logx_hi,FR_X_lo
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
//
-// Set Safe=True
-// Flag is always 2 for this routine
+// kernel_log_80 computed ln(X)
+// and return logX_hi and logX_lo as results.
+// PR_pow_Safe set as well.
//
-(p0) fmpy.s1 FR_float_N = FR_X, FR_L_Inv
- nop.i 999
-}
-{ .mfi
- nop.m 999
//
-// X_hi = Y * logX_hi + X_lo
-// Set GR_Flag = 2 for exp(x + xcor)
+// Compute Y * (logX_hi + logX_lo)
+// P_hi -> X
+// P_lo -> X_cor
+// (Manipulate names so that inputs are in
+// the place kernel_exp expects them)
//
-(p0) fms.s1 FR_P_lo= FR_Input_Y, FR_logx_hi, FR_P_hi
- nop.i 999 ;;
+// This function computes exp( x + x_cor)
+// Input FR 1: FR_X
+// Input FR 2: FR_X_cor
+// Output FR 3: FR_Y_hi
+// Output FR 4: FR_Y_lo
+// Output FR 5: FR_Scale
+// Output PR 1: PR_Safe
+//
+// P15 is True
+//
+// Load constants used in computing N using right-shift technique
+{ .mlx
+ mov GR_exp_2tom51 = 0xffff-51
+ movl GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
}
-{ .mmi
- nop.m 999 ;;
-(p0) getf.exp GR_Expo_X = FR_X
- nop.i 999 ;;
+{ .mlx
+ add GR_Special_Exp = -50,GR_exp_bias
+ movl GR_rshf_2to51 = 0x4718000000000000 // 1.10000 2^(63+51)
}
-{ .mfi
-(p0) and GR_Expo_X = GR_Expo_X, GR_Mask
+;;
+
//
-// Calculate unBIASed exponent of X
// Point to Table of W1s
// Point to Table of W2s
//
-(p0) fcvt.fx.s1 FR_N = FR_float_N
- nop.i 999 ;;
-}
+{ .mmi
+ add GR_W1_ptr = 0x2b0, GR_table_base // Constants_exp_64_W1
+ add GR_W2_ptr = 0x4b0, GR_table_base // Constants_exp_64_W2
+ cmp.le p6,p0= GR_Delta_Exp,GR_Special_Exp
+};;
+
+// Form two constants we need
+// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128
+// 1.1000..000 * 2^(63+63-12) to right shift int(N) into the significand
+
{ .mfi
- nop.m 999
-(p0) fadd.s1 FR_P_lo = FR_P_lo, FR_X_lo
-//
-// Float_N = X * L_Inv
-// Create exponent BIAS
-// Get BIASed exponent of X
-//
-(p0) sub GR_Expo_X = GR_Expo_X, GR_BIAS ;;
+ setf.sig FR_INV_LN2_2TO63 = GR_sig_inv_ln2 // form 1/ln2 * 2^63
+ nop.f 999
+ and GR_Delta_Exp=GR_Delta_Exp,GR_exp_mask // Get exponent of y-1
}
-{ .mib
-(p0) cmp.gt.unc p9, p0 = -6, GR_Expo_X
- nop.i 999
-//
-// N = fcvt.fx(float_N)
-// If -6 > Expo_X, set P9
-//
-(p9) br.cond.spnt L(EXPL_SMALL)
+{ .mlx
+ setf.d FR_RSHF_2TO51 = GR_rshf_2to51 // Form const 1.1000 * 2^(63+51)
+ movl GR_rshf = 0x43e8000000000000 // 1.10000 2^63 for right shift
}
;;
-//
-// If expo_X < -6 goto exp_small
-//
-{ .mmi
+{ .mfi
nop.m 999
-(p0) addl GR_T1_ptr = @ltoff(Constants_exp_64_T1#), gp
-(p0) cmp.lt.unc p10, p0 = 14, GR_Expo_X
+ fmpy.s1 FR_X_lo = FR_Input_Y, FR_logx_lo // logx_lo is Y_lo
+ cmp.eq p15, p0= r0, r0 // Set p15, assume safe
+};;
+
+{ .mmi
+ setf.exp FR_2TOM51 = GR_exp_2tom51 // Form 2^-51 for scaling float_N
+ setf.d FR_RSHF = GR_rshf // Form right shift const 1.1000 * 2^63
+ add GR_Table_Ptr1 = 0x50, GR_table_base // Constants_exp_64_P for
+ // EXPL_SMALL path
}
;;
{ .mmi
- ld8 GR_T1_ptr = [GR_T1_ptr]
- nop.m 999
+ ldfe FR_P_6 = [GR_Table_Ptr1],16 // Load P_6 for EXPL_SMALL path
+;;
+ ldfe FR_P_5 = [GR_Table_Ptr1],16 // Load P_5 for EXPL_SMALL path
nop.i 999
}
;;
-{ .mib
- nop.m 999
- nop.i 999
-//
-// If 14 < Expo_X, set P10
-// Create pointer to T1 table
-//
-(p10) br.cond.spnt L(EXPL_HUGE) ;;
+{ .mfi
+ ldfe FR_P_4 = [GR_Table_Ptr1],16 // Load P_4 for EXPL_SMALL path
+ fma.s1 FR_P_hi = FR_Input_Y, FR_logx_hi,FR_X_lo // logx_hi is Y_hi
+ nop.i 999
}
-
+;;
{ .mmi
-(p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_Exponents#), gp
-(p0) addl GR_T2_ptr = @ltoff(Constants_exp_64_T2#), gp
+ ldfe FR_P_3 = [GR_Table_Ptr1],16 // Load P_3 for EXPL_SMALL path
+;;
+ ldfe FR_P_2 = [GR_Table_Ptr1],16 // Load P_2 for EXPL_SMALL path
nop.i 999
}
;;
-{ .mmi
- ld8 GR_Table_Ptr = [GR_Table_Ptr]
- ld8 GR_T2_ptr = [GR_T2_ptr]
+// N = X * Inv_log2_by_2^12
+// By adding 1.10...0*2^63 we shift and get round_int(N_signif) in significand.
+// We actually add 1.10...0*2^51 to X * Inv_log2 to do the same thing.
+{ .mfi
+ ldfe FR_P_1 = [GR_Table_Ptr1] // Load P_1 for EXPL_SMALL path
+ fma.s1 FR_N = FR_X, FR_INV_LN2_2TO63, FR_RSHF_2TO51
nop.i 999
}
+{ .mfb
+ nop.m 999
+ fms.s1 FR_P_lo= FR_Input_Y, FR_logx_hi, FR_P_hi // P_hi is X
+(p6) br.cond.spnt POWL_Y_ALMOST_1 // Branch if |y-1| < 2^-50
+}
;;
-
{ .mmi
-(p0) shladd GR_Table_Ptr = GR_Expo_Range,4,GR_Table_Ptr ;;
-//
-// Adjust T1_ptr by x 4 for single-precision values
-// Adjust T2_ptr by x 4 for single-precision values
-//
-(p0) ld8 GR_Big_Pos_Exp = [GR_Table_Ptr],8
- nop.i 999 ;;
-}
-//
-// Load double W1
-// Load +max exponent
-//
-{ .mfi
-(p0) ld8 GR_Big_Neg_Exp = [GR_Table_Ptr],0
-//
-// If 14 < Expo_X, goto exp_huge
-//
-(p0) fcvt.xf FR_float_N = FR_N
- nop.i 999
+ getf.exp GR_Expo_X = FR_X
+ add GR_T1_ptr = 0x0b0, GR_table_base // Constants_exp_64_T1
+ add GR_T2_ptr = 0x1b0, GR_table_base // Constants_exp_64_T2
}
;;
-//
-// Load double W2
-// Load -max exponent
-// Load ptr to A's
-//
+// float_N = round_int(N)
+// The significand of N contains the rounded integer part of X * 2^12/ln2,
+// as a twos complement number in the lower bits (that is, it may be negative).
+// That twos complement number (called N) is put into GR_N_fix.
-{ .mmi
-(p0) getf.sig GR_N_fix = FR_N
-(p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_A#), gp
+// Since N is scaled by 2^51, it must be multiplied by 2^-51
+// before the shift constant 1.10000 * 2^63 is subtracted to yield float_N.
+// Thus, float_N contains the floating point version of N
+
+
+{ .mfi
+ add GR_Table_Ptr = 0x20, GR_table_base // Constants_exp_64_A
+ fms.s1 FR_float_N = FR_N, FR_2TOM51, FR_RSHF // Form float_N
nop.i 999
}
-;;
+// Create low part of Y(ln(x)_hi + ln(x)_lo) as P_lo
+{ .mfi
+ mov GR_Big_Pos_Exp = 0x3ffe // 16382, largest safe exponent
+ fadd.s1 FR_P_lo = FR_P_lo, FR_X_lo
+ mov GR_Big_Neg_Exp = -0x3ffd // -16381 smallest safe exponent
+};;
-{ .mmi
- ld8 GR_Table_Ptr = [GR_Table_Ptr]
+{ .mfi
nop.m 999
- nop.i 999
+ fmpy.s1 FR_rsq = FR_X, FR_X // rsq = X*X for EXPL_SMALL path
+ mov GR_vsm_expo = -70 // Exponent for very small path
+}
+{ .mfi
+ nop.m 999
+ fma.s1 FR_poly_lo = FR_P_6, FR_X, FR_P_5 // poly_lo for EXPL_SMALL path
+ add GR_temp = 0x1,r0 // For tiny signif if small path
}
;;
//
-// Load single T1
-// Load single T2
-// W_1_p1 = W_1 + 1
-//
-{ .mmi
-(p0) ldfe FR_A_3 = [GR_Table_Ptr],16 ;;
-//
-// Load A_3
-// if k > big_pos_exp, set p14 and Safe=False
-//
-(p0) ldfe FR_A_2 = [GR_Table_Ptr],16
-(p0) extr.u GR_M1 = GR_N_fix, 6, 6
-}
-{ .mmi
- nop.m 999 ;;
-(p0) shladd GR_W1_ptr = GR_M1,3,GR_W1_ptr
-//
-// float_N = fcvt.xf(N)
-// N_fix = significand of N
-// Create pointer to T2 table
-//
-(p0) extr.u GR_M2 = GR_N_fix, 0, 6
-}
-//
-// r = r + X_cor
-// Adjust W1_ptr by x 8 for double-precision values
-// Adjust W2_ptr by x 8 for double-precision values
-// Adjust Table_ptr by Expo_Rangex16
+// If expo_X < -6 goto exp_small
//
{ .mmi
-(p0) shladd GR_T1_ptr = GR_M1,2,GR_T1_ptr ;;
-(p0) ldfd FR_W1 = [GR_W1_ptr],0
-(p0) shladd GR_W2_ptr = GR_M2,3,GR_W2_ptr
+ getf.sig GR_N_fix = FR_N
+ ldfe FR_A_3 = [GR_Table_Ptr],16 // Load A_3
+ and GR_Expo_X = GR_Expo_X, GR_exp_mask // Get exponent of X
}
-//
-// Load ptr to A's
-//
+;;
+
{ .mfi
-(p0) ldfs FR_T1 = [GR_T1_ptr],0
-(p0) fnma.s1 FR_r = FR_L_hi, FR_float_N, FR_X
-(p0) shladd GR_T2_ptr = GR_M2,2,GR_T2_ptr ;;
+ ldfe FR_A_2 = [GR_Table_Ptr],16 // Load A_2
+ nop.f 999
+ sub GR_Expo_X = GR_Expo_X, GR_exp_bias // Get true exponent of X
}
-{ .mmi
-(p0) ldfd FR_W2 = [GR_W2_ptr],0
-(p0) ldfs FR_T2 = [GR_T2_ptr],0
+;;
+
//
-// r = x - L_hi * float_N
-// M2 = extr.u(N_fix,0,6)
-// M1 = extr.u(N_fix,6,6)
+// If -6 > Expo_X, set P9 and branch
//
-(p0) extr GR_k = GR_N_fix, 12, 52 ;;
+{ .mfb
+ cmp.gt p9, p0 = -6, GR_Expo_X
+ fnma.s1 FR_r = FR_L_hi, FR_float_N, FR_X // r = X - L_hi * float_N
+(p9) br.cond.spnt EXPL_SMALL // Branch if |X| < 2^-6
}
+;;
+
//
-// Load A_1
-// poly = A_3 * r + A_2
-// rsq = r*r
+// If 14 <= Expo_X, set P10
//
-{ .mii
-(p0) add GR_BIAS_p_k = GR_BIAS, GR_k
-(p0) cmp.gt.unc p14,p15 = GR_k,GR_Big_Pos_Exp ;;
-(p15) cmp.lt p14,p15 = GR_k,GR_Big_Neg_Exp
+{ .mib
+ cmp.le p10, p0 = 14, GR_Expo_X
+ nop.i 999
+(p10) br.cond.spnt EXPL_HUGE // Branch if |X| >= 2^14
}
+;;
+
//
-// BIAS_p_K = BIAS + k
-// T = T1 * T2
+// Load single T1
+// Load single T2
+// W_1_p1 = W_1 + 1
//
-{ .mfi
-(p0) setf.exp FR_Scale = GR_BIAS_p_k
- nop.f 999
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p0) fnma.s1 FR_r = FR_L_lo, FR_float_N, FR_r
- nop.i 999
+{ .mmi
+ nop.m 999
+ nop.m 999
+ extr.u GR_M1 = GR_N_fix, 6, 6 // Extract index M_1
}
+;;
+
//
-// W = W_1_p1 * W2 + W1
+// M2 = extr.u(N_fix,0,6)
//
-{ .mfi
-(p0) ldfe FR_A_1 = [GR_Table_Ptr],16
- nop.f 999
- nop.i 999 ;;
+{ .mmi
+ shladd GR_W1_ptr = GR_M1,3,GR_W1_ptr // Point to W1
+ shladd GR_T1_ptr = GR_M1,2,GR_T1_ptr // Point to T1
+ extr.u GR_M2 = GR_N_fix, 0, 6 // Extract index M_2
}
-{ .mfi
- nop.m 999
-(p0) fadd.s1 FR_W_1_p1 = FR_W1, f1
- nop.i 999 ;;
+;;
+
+// N_fix is only correct up to 50 bits because of our right shift technique.
+// Actually in the normal path we will have restricted K to about 14 bits.
+// Somewhat arbitrarily we extract 32 bits.
+{ .mmi
+ ldfd FR_W1 = [GR_W1_ptr]
+ shladd GR_W2_ptr = GR_M2,3,GR_W2_ptr // Point to W2
+ extr GR_k = GR_N_fix, 12, 32 // Extract k
}
+;;
+
{ .mfi
- nop.m 999
-//
-// k = extr.u(N_fix,0,6)
-// r = r - N * L_lo
-// Load ptr to Table of exponent thresholds.
-//
-(p0) fadd.s1 FR_r = FR_r, FR_X_cor
- nop.i 999
+ ldfs FR_T1 = [GR_T1_ptr]
+ fnma.s1 FR_r = FR_L_lo, FR_float_N, FR_r
+ shladd GR_T2_ptr = GR_M2,2,GR_T2_ptr // Point to T2
}
{ .mfi
- nop.m 999
-(p0) fmpy.s1 FR_T = FR_T1, FR_T2
- nop.i 999 ;;
+ add GR_exp_bias_p_k = GR_exp_bias, GR_k
+ nop.f 999
+ cmp.gt p14,p15 = GR_k,GR_Big_Pos_Exp
}
-{ .mfi
- nop.m 999
+;;
+
//
-// if k < big_neg_exp, set p14 and Safe=False
-// Load A_2
+// if k < big_neg_exp, set p14 and Safe=False
//
-(p0) fma.s1 FR_W = FR_W2, FR_W_1_p1, FR_W1
- nop.i 999 ;;
+{ .mmi
+ ldfs FR_T2 = [GR_T2_ptr]
+(p15) cmp.lt p14,p15 = GR_k,GR_Big_Neg_Exp
+ nop.i 999
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 FR_poly = FR_r, FR_A_3, FR_A_2
- nop.i 999
+;;
+
+{ .mmi
+ setf.exp FR_Scale = GR_exp_bias_p_k
+ ldfd FR_W2 = [GR_W2_ptr]
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fmpy.s1 FR_rsq = FR_r, FR_r
- nop.i 999 ;;
+ ldfe FR_A_1 = [GR_Table_Ptr],16
+ fadd.s1 FR_r = FR_r, FR_X_cor
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) mov FR_Y_hi = FR_T
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 FR_W_1_p1 = FR_W1, f1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Scale = set_exp(BIAS_p_k)
-// poly = r * poly + A_1
-//
-(p0) fadd.s1 FR_Wp1 = FR_W, f1
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_poly = FR_r, FR_A_3, FR_A_2
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_poly = FR_r, FR_poly, FR_A_1
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 FR_rsq = FR_r, FR_r
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_poly = FR_rsq, FR_poly,FR_r
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 FR_T = FR_T1, FR_T2
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Wp1 = W + 1
-// poly = rsq * poly + rk
-//
-(p0) fma.s1 FR_Y_lo = FR_Wp1, FR_poly, FR_W
- nop.i 999 ;;
-}
-{ .mfb
- nop.m 999
-//
-// Y_lo = poly * Wp1 + W
-// Y_hi = T
-//
-(p0) fmpy.s1 FR_Y_lo = FR_Y_lo, FR_T
-//
-// Y_lo = T * Y_lo
-//
-(p0) br.cond.sptk L(EXPL_RETURN) ;;
+ nop.m 999
+ fma.s1 FR_W = FR_W2, FR_W_1_p1, FR_W1
+ nop.i 999
}
+;;
-L(EXPL_SMALL):
-
-//
-// r4 = rsq * rsq
-//
-
-{ .mmi
+{ .mfi
nop.m 999
-(p0) addl GR_Table_Ptr1 = @ltoff(Constants_exp_64_P), gp
+ fma.s1 FR_TMP1 = FR_Scale, FR_Sgn, f0
nop.i 999
}
;;
-{ .mmi
- ld8 GR_Table_Ptr1 = [GR_Table_Ptr1]
+{ .mfi
nop.m 999
+ fma.s1 FR_poly = FR_r, FR_poly, FR_A_1
nop.i 999
}
;;
-{ .mmf
- nop.m 999
-(p0) ldfe FR_P_6 = [GR_Table_Ptr1],16
-//
-// Return
-//
-(p0) fadd.s1 FR_r = FR_X,f0 ;;
+{ .mfi
+ nop.m 999
+ fma.s1 FR_TMP2 = FR_T, f1, f0 // TMP2 = Y_hi = T
+ nop.i 999
}
+;;
-{ .mmi
+{ .mfi
nop.m 999
-(p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_Exponents#), gp
+ fadd.s1 FR_Wp1 = FR_W, f1
nop.i 999
}
;;
-{ .mmi
- ld8 GR_Table_Ptr = [GR_Table_Ptr]
-(p0) ldfe FR_P_5 = [GR_Table_Ptr1],16
+{ .mfi
+ nop.m 999
+ fma.s1 FR_poly = FR_rsq, FR_poly,FR_r
nop.i 999
}
;;
-//
-// Is input very small?
-// Load P_5
-//
-{ .mii
-(p0) ldfe FR_P_4 = [GR_Table_Ptr1],16
-(p0) add GR_Table_Ptr = 0x040,GR_Table_Ptr ;;
-(p0) shladd GR_Table_Ptr = GR_Expo_Range,3,GR_Table_Ptr ;;
-}
-{ .mmb
-(p0) ldfe FR_P_3 = [GR_Table_Ptr1],16
-//
-// Adjust ptr.
-//
-(p0) ld8 GR_vsm_expo = [GR_Table_Ptr],0
- nop.b 999 ;;
-}
{ .mfi
- nop.m 999
-//
-// r = X (don't seem to need X_Cor)
-// Load the threshold exponents
-//
-(p0) fmpy.s1 FR_rsq = FR_r, FR_r
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_Tscale = FR_T, FR_TMP1, f0 // Scale * Sgn * T
+ nop.i 999
}
-//
-// Load the negative integer
-// Load P_5
-//
{ .mfi
-(p0) cmp.lt.unc p12, p0 = GR_Expo_X, GR_vsm_expo
- nop.f 999
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_Y_lo = FR_Wp1, FR_poly, FR_W
+ nop.i 999
}
+;;
+
{ .mfb
- nop.m 999
-//
-// rsq = r * r
-// Offset into exponents
-//
-(p0) fmpy.s1 FR_r4 = FR_rsq, FR_rsq
-(p12) br.cond.spnt L(EXPL_VERY_SMALL) ;;
+ nop.m 999
+ fmpy.s1 FR_TMP3 = FR_Y_lo, FR_Tscale
+ br.cond.sptk POWL_64_SHARED
}
-{ .mfi
-(p0) ldfe FR_P_2 = [GR_Table_Ptr1],16
-//
-// Load p4,p3,p2,p1
-//
-(p0) fma.s1 FR_poly_lo = FR_P_6, FR_r, FR_P_5
+;;
+
+
+EXPL_SMALL:
+// Here if |ylogx| < 2^-6
//
-// Y_lo = r4 * poly_lo + poly_hi
-// Scale = 1.0
+// Begin creating lsb to perturb final result
//
-(p0) add GR_temp = 0x1,r0 ;;
+{ .mfi
+ setf.sig FR_temp = GR_temp
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_X, FR_P_4
+ cmp.lt p12, p0 = GR_Expo_X, GR_vsm_expo // Test |ylogx| < 2^-70
}
-{ .mmf
- nop.m 999
-(p0) ldfe FR_P_1 = [GR_Table_Ptr1],0
-(p0) mov FR_Scale = f1
+{ .mfi
+ nop.m 999
+ fma.s1 FR_poly_hi = FR_P_2, FR_X, FR_P_1
+ nop.i 999
}
-//
-// Begin creating lsb to perturb final result
-//
+;;
+
{ .mfi
-(p0) setf.sig FR_temp = GR_temp
-(p0) mov FR_Y_hi = f1
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 FR_TMP2 = f1, f1
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// poly_lo = p_5 + p_6 * r
-// poly_hi = p_1 + p_2 * r
-//
-(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_P_4
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 FR_TMP1 = FR_Sgn, f1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// poly_lo = p_4 + poly_lo * r
-// poly_hi = r + poly_hi * rsq
-//
-(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_P_3
- nop.i 999
+ nop.m 999
+ fmpy.s1 FR_r4 = FR_rsq, FR_rsq
+(p12) cmp.eq p15, p0 = r0, r0 // Set safe if |ylogx| < 2^-70
}
+{ .mfb
+ nop.m 999
+(p12) fmpy.s1 FR_TMP3 = FR_Sgn, FR_X
+(p12) br.cond.spnt POWL_64_SHARED // Branch if |ylogx| < 2^-70
+}
+;;
+
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_poly_hi = FR_P_2, FR_r, FR_P_1
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_X, FR_P_3
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, FR_r
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, FR_X
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// poly_lo = p_3 + poly_lo * r
-// Y_hi = 1, always
-//
-(p0) fma.s1 FR_Y_lo = FR_poly_lo, FR_r4, FR_poly_hi
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_Y_lo = FR_poly_lo, FR_r4, FR_poly_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Set lsb in fp register
-//
-(p0) for FR_temp = FR_Y_lo,FR_temp
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 FR_TMP3 = FR_Y_lo, FR_TMP1 // Add sign info
+ nop.i 999
}
-{ .mfb
- nop.m 999
+;;
+
//
// Toggle on last bit of Y_lo
-//
-(p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_temp
-//
// Set lsb of Y_lo to 1
//
-(p0) br.cond.sptk L(EXPL_RETURN) ;;
-}
-L(EXPL_VERY_SMALL):
{ .mfi
- nop.m 999
-(p0) mov FR_Y_lo = FR_r
-(p0) cmp.eq.unc p15, p0 = r0, r0
+ nop.m 999
+ for FR_temp = FR_Y_lo,FR_temp
+ nop.i 999
}
-{ .mfi
- nop.m 999
-(p0) mov FR_Scale = f1
- nop.i 999
-};;
+;;
+
{ .mfb
- nop.m 999
-(p0) mov FR_Y_hi = f1
-//
-// If flag_not_1,
-// Y_hi = 1.0
-// Y_lo = X + X_cor
-// PR_Safe = true
-//
-(p0) br.cond.sptk L(EXPL_RETURN) ;;
+ nop.m 999
+ fmerge.se FR_TMP3 = FR_TMP3,FR_temp
+ br.cond.sptk POWL_64_SHARED
}
-L(EXPL_HUGE):
+;;
+
+
+EXPL_HUGE:
+// Here if |ylogx| >= 2^14
{ .mfi
- nop.m 999
-//
-// Return for flag=2
-//
-(p0) fcmp.gt.unc.s1 p12, p13 = FR_X, f0
-(p0) cmp.eq.unc p14, p15 = r0, r0 ;;
+ mov GR_temp = 0x0A1DC // If X < 0, exponent -24100
+ fcmp.gt.s1 p12, p13 = FR_X, f0 // Test X > 0
+ cmp.eq p14, p15 = r0, r0 // Set Safe to false
}
-{ .mlx
- nop.m 999
-//
-// Set Safe to false
-// Is x > 0
-//
-(p12) movl GR_Mask = 0x15DC0 ;;
-}
-{ .mlx
-(p12) setf.exp FR_Y_hi = GR_Mask
-(p13) movl GR_Mask = 0xA240 ;;
+;;
+
+{ .mmi
+(p12) mov GR_Mask = 0x15DC0 // If X > 0, exponent +24000
+(p13) mov GR_Mask = 0x0A240 // If X < 0, exponent -24000
+ nop.i 999
}
-{ .mlx
-(p13) setf.exp FR_Y_hi = GR_Mask
-//
-// x > 0: Create mask for Y_hi = 2**(24,000)
-// x <= 0: Create mask for Y_hi = 2**(-24,000)
-//
-(p13) movl GR_temp = 0xA1DC ;;
+;;
+
+{ .mmf
+ setf.exp FR_TMP2 = GR_Mask // Form Y_hi = TMP2
+(p13) setf.exp FR_Y_lo = GR_temp // If X < 0, Y_lo = 2^-24100
+(p12) mov FR_Y_lo = f1 // IF X > 0, Y_lo = 1.0
}
+;;
+
{ .mfi
-(p13) setf.exp FR_Y_lo = GR_temp
-//
-// x < =0: Create mask for 2**(-24,100)
-// x <= 0: Y_lo = w**(-24,100)
-//
-(p12) mov FR_Y_lo = f1
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 FR_TMP1 = FR_TMP2, FR_Sgn // TMP1 = Y_hi * Sgn
+ nop.i 999
}
-{ .mfi
- nop.m 999
-(p12) mov FR_Scale = FR_Y_hi
- nop.i 999 ;;
+;;
+
+{ .mfb
+ nop.m 999
+ fmpy.s1 FR_TMP3 = FR_Y_lo,FR_TMP1 // TMP3 = Y_lo * (Y_hi * Sgn)
+ br.cond.sptk POWL_64_SHARED
}
-{ .mfi
- nop.m 999
+;;
+
+POWL_Y_ALMOST_1:
+// Here if delta = |y-1| < 2^-50
//
-// x > 0: Y_lo = 1.0
-// x > 0: Scale = 2**(24,000)
+// x**(1 + delta) = x * e**(ln(x)*delta) ~= x * (1 + ln(x)*delta)
//
-(p13) mov FR_Scale = FR_Y_hi
- nop.i 999 ;;
-}
-L(EXPL_RETURN):
+// Computation will be safe for 2^-16381 <= x < 2^16383
+
{ .mfi
- nop.m 999
-//
-// Scale = 2**(24,000)
-//
-//
-// exp(y *ln(x)) almost complete
-// FR_Scale is Scale
-// f34 is Z_hi
-// f35 is Z_lo
-//
-(p0) fmpy.s1 FR_Sgn = FR_Scale, FR_Sgn
- nop.i 999 ;;
+ mov GR_exp_ynear1_oflow = 0xffff + 16383
+ fma.s1 FR_TMP1 = FR_Input_X,FR_Delta,f0
+ and GR_exp_x = GR_exp_mask, GR_signexp_x
}
+;;
+
{ .mfi
- nop.m 999
-//
-// sgn * scale
-//
-(p0) fmpy.s1 FR_Y_lo = FR_Y_lo,FR_Sgn
- nop.i 999 ;;
+ cmp.lt p15, p14 = GR_exp_x, GR_exp_ynear1_oflow
+ fma.s1 FR_TMP2 = FR_logx_hi,f1,FR_X_lo
+ mov GR_exp_ynear1_uflow = 0xffff - 16381
}
+;;
+
{ .mfb
- nop.m 999
-//
-// Z_lo * (sgn * scale)
+(p15) cmp.ge p15, p14 = GR_exp_x, GR_exp_ynear1_uflow
+ fma.s1 FR_TMP3 = FR_Input_X,f1,f0
+ br.cond.sptk POWL_64_SHARED
+};;
+
+POWL_64_SQUARE:
//
-(p0) fma.s0 FR_Result = FR_Y_hi, FR_Sgn, FR_Y_lo
+// Here if x not zero and y=2.
//
-// Z_hi * (sgn * scale) + Z_lo
+// Setup for multipath code
//
-(p15) br.cond.sptk L(POWL_64_RETURN) ;;
-}
{ .mfi
- nop.m 999
-(p0) fsetc.s3 0x7F,0x01
- nop.i 999
-}
-{ .mlx
- nop.m 999
-//
-// Z_hi * (sgn * scale) + Z_lo with wre & td
-// Z_hi * (sgn * scale) + Z_lo with fz & td
-//
-(p0) movl GR_T1_ptr = 0x00000000013FFF ;;
+ mov GR_exp_square_oflow = 0xffff + 8192 // Exponent where x*x overflows
+ fmerge.se FR_TMP1 = FR_Input_X, FR_Input_X
+ and GR_exp_x = GR_exp_mask, GR_signexp_x // Get exponent of x
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fma.s3 FR_Result_small = FR_Y_hi, FR_Sgn, FR_Y_lo
- nop.i 999
+ cmp.lt p15, p14 = GR_exp_x, GR_exp_square_oflow // Decide safe/unsafe
+ fmerge.se FR_TMP2 = FR_Input_X, FR_Input_X
+ mov GR_exp_square_uflow = 0xffff - 8191 // Exponent where x*x underflows
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fsetc.s3 0x7F,0x40
- nop.i 999 ;;
+(p15) cmp.ge p15, p14 = GR_exp_x, GR_exp_square_uflow // Decide safe/unsafe
+ fma.s1 FR_TMP3 = f0,f0,f0
+ nop.i 999
}
-{ .mfi
- nop.m 999
+;;
+
//
-// Return if no danger of over of underflow.
+// This is the shared path that will set overflow and underflow.
//
-(p0) fsetc.s2 0x7F,0x42
- nop.i 999;;
-}
-{ .mfi
- nop.m 999
+POWL_64_SHARED:
+
//
-// S0 user supplied status
-// S2 user supplied status + WRE + TD (Overflows)
-// S3 user supplied status + FZ + TD (Underflows)
+// Return if no danger of over or underflow.
//
-(p0) fma.s2 FR_Result_big = FR_Y_hi, FR_Sgn, FR_Y_lo
- nop.i 999 ;;
+{ .mfb
+ nop.m 999
+ fma.s0 FR_Result = FR_TMP1, FR_TMP2, FR_TMP3
+(p15) br.ret.sptk b0 // Main path return if certain no over/underflow
}
+;;
+
//
-// S0 user supplied status
-// S2 user supplied status + WRE + TD (Overflows)
-// S3 user supplied status + FZ + TD (Underflows)
+// S0 user supplied status
+// S2 user supplied status + WRE + TD (Overflows)
+// S2 user supplied status + FZ + TD (Underflows)
//
//
// If (Safe) is true, then
@@ -2430,973 +2021,741 @@ L(EXPL_RETURN):
// No overflow or underflow here, but perhaps inexact.
// Return
// Else
-// Determine if overflow or underflow was raised.
-// Fetch +/- overflow threshold for IEEE single, double,
-// double extended
-//
-{ .mfi
-(p0) setf.exp FR_Big = GR_T1_ptr
-(p0) fsetc.s2 0x7F,0x40
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p11, p0 = FR_Result_small, 0x00F
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p0) fmerge.ns FR_NBig = FR_Big, FR_Big
- nop.i 999
-}
-{ .mfi
- nop.m 999
-//
-// Create largest double exponent + 1.
-// Create smallest double exponent - 1.
-// Identify denormals
-//
-(p0) fcmp.ge.unc.s1 p8, p0 = FR_Result_big , FR_Big
- nop.i 999 ;;
-}
-{ .mii
- nop.m 999
- nop.i 999 ;;
-//
-// fcmp: resultS2 <= - overflow threshold
-// fclass: resultS3 is denorm/unorm/0
-//
-(p8) mov GR_Parameter_TAG = 18 ;;
-}
-{ .mfb
- nop.m 999
-//
-// fcmp: resultS2 >= + overflow threshold
-//
-(p0) fcmp.le.unc.s1 p9, p0 = FR_Result_big, FR_NBig
-(p8) br.cond.spnt __libm_error_region ;;
-}
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p9) mov GR_Parameter_TAG = 18
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p9) br.cond.spnt __libm_error_region ;;
-}
-//
-// Report that pow overflowed - either +Inf, or -Inf
-//
-{ .mmb
-(p11) mov GR_Parameter_TAG = 19
- nop.m 999
-(p11) br.cond.spnt __libm_error_region ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-//
-// Report that pow underflowed
-//
-(p0) br.cond.sptk L(POWL_64_RETURN) ;;
-}
-
+// Determine if overflow or underflow was raised.
+// Fetch +/- overflow threshold for IEEE double extended
-L(POWL_64_SQUARE):
-// Here if x not zero and y=2.
-// Must call __libm_error_support for overflow or underflow
-//
-// S0 user supplied status
-// S2 user supplied status + WRE + TD (Overflows)
-// S3 user supplied status + FZ + TD (Underflows)
-//
{ .mfi
- nop.m 999
-(p0) fma.s0 FR_Result = FR_Input_X, FR_Input_X, f0
- nop.i 999
-}
-{ .mfi
- nop.m 999
-(p0) fsetc.s3 0x7F,0x01
- nop.i 999
-}
-{ .mlx
- nop.m 999
-(p0) movl GR_T1_ptr = 0x00000000013FFF ;;
-}
-{ .mfi
- nop.m 999
-(p0) fma.s3 FR_Result_small = FR_Input_X, FR_Input_X, f0
- nop.i 999
-}
-{ .mfi
- nop.m 999
-(p0) fsetc.s3 0x7F,0x40
- nop.i 999 ;;
+ nop.m 999
+ fsetc.s2 0x7F,0x41 // For underflow test, set S2=User+TD+FTZ
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Return if no danger of over of underflow.
-//
-(p0) fsetc.s2 0x7F,0x42
- nop.i 999;;
+ nop.m 999
+ fma.s2 FR_Result_small = FR_TMP1, FR_TMP2, FR_TMP3
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fma.s2 FR_Result_big = FR_Input_X, FR_Input_X, f0
- nop.i 999 ;;
+ nop.m 999
+ fsetc.s2 0x7F,0x42 // For overflow test, set S2=User+TD+WRE
+ nop.i 999
}
-//
-// S0 user supplied status
-// S2 user supplied status + WRE + TD (Overflows)
-// S3 user supplied status + FZ + TD (Underflows)
-//
-//
-// If (Safe) is true, then
-// Compute result using user supplied status field.
-// No overflow or underflow here, but perhaps inexact.
-// Return
-// Else
-// Determine if overflow or underflow was raised.
-// Fetch +/- overflow threshold for IEEE single, double,
-// double extended
-//
+;;
+
{ .mfi
-(p0) setf.exp FR_Big = GR_T1_ptr
-(p0) fsetc.s2 0x7F,0x40
- nop.i 999 ;;
+ nop.m 999
+ fma.s2 FR_Result_big = FR_TMP1, FR_TMP2,FR_TMP3
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p11, p0 = FR_Result_small, 0x00F
- nop.i 999 ;;
+ nop.m 999
+ fsetc.s2 0x7F,0x40 // Reset S2=User
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fmerge.ns FR_NBig = FR_Big, FR_Big
- nop.i 999
+ nop.m 999
+ fclass.m p11, p0 = FR_Result_small, 0x00F // Test small result unorm/zero
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Create largest double exponent + 1.
-// Create smallest double exponent - 1.
-// Identify denormals
-//
-(p0) fcmp.ge.unc.s1 p8, p0 = FR_Result_big , FR_Big
- nop.i 999 ;;
-}
-{ .mii
- nop.m 999
- nop.i 999 ;;
-//
-// fcmp: resultS2 <= - overflow threshold
-// fclass: resultS3 is denorm/unorm/0
-//
-(p8) mov GR_Parameter_TAG = 18 ;;
+ nop.m 999
+ fcmp.ge.s1 p8, p0 = FR_Result_big , FR_Big // Test >= + oflow threshold
+ nop.i 999
}
+;;
+
{ .mfb
- nop.m 999
-//
-// fcmp: resultS2 >= + overflow threshold
-//
-(p0) fcmp.le.unc.s1 p9, p0 = FR_Result_big, FR_NBig
-(p8) br.cond.spnt __libm_error_region ;;
-}
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p9) mov GR_Parameter_TAG = 18
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p9) br.cond.spnt __libm_error_region ;;
-}
-//
-// Report that pow overflowed - either +Inf, or -Inf
-//
-{ .mmb
-(p11) mov GR_Parameter_TAG = 19
- nop.m 999
-(p11) br.cond.spnt __libm_error_region ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-//
-// Report that pow underflowed
-//
-(p0) br.cond.sptk L(POWL_64_RETURN) ;;
+(p11) mov GR_Parameter_TAG = 19 // Set tag for underflow
+ fcmp.le.s1 p9, p0 = FR_Result_big, FR_NBig // Test <= - oflow threshold
+(p11) br.cond.spnt __libm_error_region // Branch if pow underflowed
}
+;;
+{ .mfb
+(p8) mov GR_Parameter_TAG = 18 // Set tag for overflow
+ nop.f 999
+(p8) br.cond.spnt __libm_error_region // Branch if pow +overflow
+}
+;;
+{ .mbb
+(p9) mov GR_Parameter_TAG = 18 // Set tag for overflow
+(p9) br.cond.spnt __libm_error_region // Branch if pow -overflow
+ br.ret.sptk b0 // Branch if result really ok
+}
+;;
-L(POWL_64_SPECIAL):
+POWL_64_SPECIAL:
+// Here if x or y is NatVal, nan, inf, or zero
{ .mfi
- nop.m 999
-(p0) fcmp.eq.s1 p15, p0 = FR_Input_X, f1 // Is x=+1
- nop.i 999 ;;
+ nop.m 999
+ fcmp.eq.s1 p15, p0 = FR_Input_X, f1 // Test x=+1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p14, p0 = FR_Input_Y, 0x023
- nop.i 999 ;;
+ nop.m 999
+ fclass.m p8, p0 = FR_Input_X, 0x143 // Test x natval, snan
+ nop.i 999
}
+;;
{ .mfi
- nop.m 999
-(p15) fcmp.eq.unc.s0 p6,p0 = FR_Input_Y, f0 // If x=1, flag invalid if y=SNaN
- nop.i 999
+ nop.m 999
+(p15) fcmp.eq.unc.s0 p6,p0 = FR_Input_Y, f0 // If x=1, flag invalid if y=SNaN
+ nop.i 999
}
{ .mfb
- nop.m 999
-(p15) fmpy.s0 FR_Result = f1,f1 // If x=1, result=1
-(p15) br.cond.spnt L(POWL_64_RETURN) ;; // Exit if x=1
+ nop.m 999
+(p15) fmpy.s0 FR_Result = f1,f1 // If x=1, result=1
+(p15) br.ret.spnt b0 // Exit if x=1
}
+;;
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p13, p0 = FR_Input_X, 0x023
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p8, p0 = FR_Input_X, 0x143
- nop.i 999
+ nop.m 999
+ fclass.m p6, p0 = FR_Input_Y, 0x007 // Test y zero
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p9, p0 = FR_Input_Y, 0x143
- nop.i 999 ;;
+ nop.m 999
+ fclass.m p9, p0 = FR_Input_Y, 0x143 // Test y natval, snan
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p10, p0 = FR_Input_X, 0x083
- nop.i 999
+ nop.m 999
+ fclass.m p10, p0 = FR_Input_X, 0x083 // Test x qnan
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p11, p0 = FR_Input_Y, 0x083
- nop.i 999 ;;
+ nop.m 999
+(p8) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X // If x=snan, result=qnan
+(p6) cmp.ne p8,p0 = r0,r0 // Don't exit if x=snan, y=0 ==> result=+1
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p6, p0 = FR_Input_Y, 0x007
- nop.i 999
+ nop.m 999
+(p6) fclass.m.unc p15, p0 = FR_Input_X,0x007 // Test x=0, y=0
+ nop.i 999
}
-{ .mfi
- nop.m 999
-(p0) fcmp.eq.unc.s1 p7, p0 = FR_Input_Y, f1
- nop.i 999 ;;
+{ .mfb
+ nop.m 999
+(p9) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X // If y=snan, result=qnan
+(p8) br.ret.spnt b0 // Exit if x=snan, y not 0,
+ // result=qnan
}
+;;
+
{ .mfi
- nop.m 999
-//
-// set p13 if x +/- Inf
-// set p14 if y +/- Inf
-// set p8 if x Natval or +/-SNaN
-// set p9 if y Natval or +/-SNaN
-// set p10 if x QNaN
-// set p11 if y QNaNs
-// set p6 if y is +/-0
-// set p7 if y is 1
-//
-(p8) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X
-(p6) cmp.ne p8,p0 = r0,r0 ;; // Don't exit if x=snan, y=0 ==> result=+1
-}
-{ .mfb
- nop.m 999
-(p9) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X
-(p8) br.cond.spnt L(POWL_64_RETURN) ;;
+ nop.m 999
+ fcmp.eq.s1 p7, p0 = FR_Input_Y, f1 // Test y +1.0
+ nop.i 999
}
{ .mfb
- nop.m 999
-(p10) fmpy.s0 FR_Result = FR_Input_X, f0
-(p9) br.cond.spnt L(POWL_64_RETURN) ;;
-}
-{ .mfi
- nop.m 999
-//
-// Produce result for SNaN and NatVals and return
-//
-(p6) fclass.m.unc p15, p0 = FR_Input_X,0x007
- nop.i 999
+ nop.m 999
+(p10) fmpy.s0 FR_Result = FR_Input_X, f0 // If x=qnan, result=qnan
+(p9) br.ret.spnt b0 // Exit if y=snan, result=qnan
}
+;;
+
{ .mfi
- nop.m 999
-//
-// If Y +/- 0, set p15 if x +/- 0
-//
-(p6) fclass.m.unc p8, p0 = FR_Input_X,0x0C3
- nop.i 999 ;;
+ nop.m 999
+(p6) fclass.m.unc p8, p0 = FR_Input_X,0x0C3 // Test x=nan, y=0
+ nop.i 999
}
+;;
{ .mfi
- nop.m 999
-(p6) fcmp.eq.s0 p9,p0 = FR_Input_X, f0 // If y=0, flag if x denormal
- nop.i 999
+ nop.m 999
+(p6) fcmp.eq.s0 p9,p0 = FR_Input_X, f0 // If y=0, flag if x denormal
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p6) fadd.s0 FR_Result = f1, f0
- nop.i 999 ;;
+ nop.m 999
+(p6) fadd.s0 FR_Result = f1, f0 // If y=0, result=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Set p8 if y = +/-0 and X is a QNaN/SNaN
-// If y = +/-0, let result = 1.0
-//
-(p7) fmpy.s0 FR_Result = FR_Input_X,f1
-//
-// If y == 1, result = x * 1
-//
-(p15) mov GR_Parameter_TAG = 20
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p15) br.cond.spnt __libm_error_region ;;
-}
-{ .mib
- nop.m 999
-//
-// If x and y are both zero, result = 1.0 and call error
-// support.
-//
-(p8) mov GR_Parameter_TAG = 23
-(p8) br.cond.spnt __libm_error_region ;;
+ nop.m 999
+ fclass.m p11, p0 = FR_Input_Y, 0x083 // Test y qnan
+ nop.i 999
}
-{ .mib
- nop.m 999
- nop.i 999
-//
-// If y = +/-0 and x is a QNaN, result = 1.0 and call error
-// support.
-//
-(p6) br.cond.spnt L(POWL_64_RETURN) ;;
+{ .mfb
+(p15) mov GR_Parameter_TAG = 20 // Error tag for x=0, y=0
+(p7) fmpy.s0 FR_Result = FR_Input_X,f1 // If y=1, result=x
+(p15) br.cond.spnt __libm_error_region // Branch if x=0, y=0, result=1
}
+;;
-// If x=0, y=-inf, go to the X_IS_ZERO path
{ .mfb
- nop.m 999
-(p14) fcmp.eq.unc.s1 p0,p14 = FR_Input_X,f0
-(p7) br.cond.spnt L(POWL_64_RETURN) ;;
+(p8) mov GR_Parameter_TAG = 23 // Error tag for x=nan, y=0
+ fclass.m p14, p0 = FR_Input_Y, 0x023 // Test y inf
+(p8) br.cond.spnt __libm_error_region // Branch if x=nan, y=0,
+ // result=1
}
+;;
-{ .mfi
- nop.m 999
-//
-// Produce all results for x**0 and x**1
-// Let all the result x ** 0 == 1 and return
-// Let all x ** 1 == x and return
-//
-(p10) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X
- nop.i 999 ;;
-}
{ .mfb
- nop.m 999
-(p11) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X
-(p10) br.cond.spnt L(POWL_64_RETURN) ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p11) br.cond.spnt L(POWL_64_RETURN) ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-//
-// Return result for x or y QNaN input with QNaN result
-//
-(p14) br.cond.spnt L(POWL_64_Y_IS_INF) ;;
+ nop.m 999
+ fclass.m p13, p0 = FR_Input_X, 0x023 // Test x inf
+(p6) br.ret.spnt b0 // Exit y=0, x not nan or 0,
+ // result=1
}
-{ .mib
- nop.m 999
- nop.i 999
-(p13) br.cond.spnt L(POWL_64_X_IS_INF) ;;
+;;
+
+{ .mfb
+ nop.m 999
+(p14) fcmp.eq.unc.s1 p0,p14 = FR_Input_X,f0 // Test x not 0, y=inf
+(p7) br.ret.spnt b0 // Exit y=1, x not snan,
+ // result=x
}
-L(POWL_64_X_IS_ZERO):
-{ .mmb
-(p0) getf.sig GR_signif_y = FR_Input_Y
-(p0) getf.exp GR_BIASed_exp_y = FR_Input_Y
- nop.b 999 ;;
+;;
+
+{ .mfb
+ nop.m 999
+(p10) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X // If x=qnan, y not snan,
+ // result=qnan
+(p10) br.ret.spnt b0 // Exit x=qnan, y not snan,
+ // result=qnan
}
-{ .mlx
- nop.m 999
-(p0) movl GR_Mask = 0x1FFFF
+;;
+
+{ .mfb
+ nop.m 999
+(p11) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X // If y=qnan, x not nan or 1,
+ // result=qnan
+(p11) br.ret.spnt b0 // Exit y=qnan, x not nan or 1,
+ // result=qnan
}
-{ .mlx
- nop.m 999
-(p0) movl GR_y_sign = 0x20000 ;;
+;;
+
+{ .mbb
+ nop.m 999
+(p14) br.cond.spnt POWL_64_Y_IS_INF // Branch if y=inf, x not 0 or 1 or nan
+(p13) br.cond.spnt POWL_64_X_IS_INF // Branch if x=inf, y not 1 or nan
}
-//
-// Get BIASed exp and significand of y
+;;
+
+
+POWL_64_X_IS_ZERO:
+// Here if x=0, y not nan or 1 or 0 (y may be inf)
+
+// There is logic starting here to determine if y is an integer when x = 0.
+// If 0 < |y| < 1 then clearly y is not an integer.
+// If |y| > 1, then the significand of y is shifted left by the size of
+// the exponent of y. This preserves the lsb of the integer part + the
+// fractional bits. The lsb of the integer can be tested to determine if
+// the integer is even or odd. The fractional bits can be tested. If zero,
+// then y is an integer.
//
{ .mfi
-(p0) and GR_exp_y = GR_Mask,GR_BIASed_exp_y
- nop.f 999
-(p0) and GR_y_sign = GR_y_sign,GR_BIASed_exp_y
-}
-{ .mlx
- nop.m 999
-(p0) movl GR_BIAS = 0xFFFF ;;
+ and GR_exp_y = GR_exp_mask,GR_signexp_y // Get biased exponent of y
+ nop.f 999
+ and GR_y_sign = GR_sign_mask,GR_signexp_y // Get sign of y
}
-{ .mfi
-(p0) cmp.lt.unc p9, p8 = GR_exp_y,GR_BIAS
- nop.f 999
+;;
+
//
// Maybe y is < 1 already, so
// can never be an integer.
-// Remove sign bit from exponent.
-//
-(p0) sub GR_exp_y = GR_exp_y,GR_BIAS ;;
-}
-{ .mii
- nop.m 999
- nop.i 999 ;;
-//
-// Remove exponent BIAS
//
-(p8) shl GR_exp_y= GR_signif_y,GR_exp_y ;;
-}
{ .mfi
-(p9) or GR_exp_y= 0xF,GR_signif_y
- nop.f 999
- nop.i 999 ;;
+ cmp.lt p9, p8 = GR_exp_y,GR_exp_bias // Test 0 < |y| < 1
+ nop.f 999
+ sub GR_exp_y = GR_exp_y,GR_exp_bias // Get true exponent of y
}
-{ .mii
- nop.m 999
+;;
+
//
// Shift significand of y looking for nonzero bits
// For y > 1, shift signif_y exp_y bits to the left
-// For y < 1, turn on 4 low order bits of significand of y
+// For y < 1, turn on 4 low order bits of significand of y
// so that the fraction will always be non-zero
//
-(p0) shl GR_signif_y= GR_exp_y,1 ;;
-(p0) extr.u GR_low_order_bit = GR_exp_y,63,1
+{ .mmi
+(p9) or GR_exp_y= 0xF,GR_signif_y // Force nonzero fraction if y<1
+;;
+ nop.m 999
+(p8) shl GR_exp_y= GR_signif_y,GR_exp_y // Get lsb of int + fraction
+ // Wait 4 cycles to use result
+}
+;;
+
+{ .mmi
+ nop.m 999
+;;
+ nop.m 999
+ nop.i 999
+}
+;;
+
+{ .mmi
+ nop.m 999
+;;
+ nop.m 999
+ shl GR_fraction_y= GR_exp_y,1 // Shift left 1 to get fraction
}
+;;
+
//
// Integer part of y shifted off.
// Get y's low even or odd bit - y might not be an int.
//
{ .mii
-(p0) cmp.eq.unc p13,p0 = GR_signif_y, r0
-(p0) cmp.eq.unc p8,p9 = GR_y_sign, r0 ;;
-//
-// Is y an int?
-// Is y positive
-//
-(p13) cmp.ne.unc p13,p0 = GR_low_order_bit, r0 ;;
+ cmp.eq p13,p0 = GR_fraction_y, r0 // Test for y integer
+ cmp.eq p8,p0 = GR_y_sign, r0 // Test for y > 0
+;;
+(p13) tbit.nz.unc p13,p0 = GR_exp_y, 63 // Test if y an odd integer
}
+;;
+
+{ .mfi
+(p13) cmp.eq.unc p13,p14 = GR_y_sign, r0 // Test y pos odd integer
+(p8) fcmp.eq.s0 p12,p0 = FR_Input_Y, f0 // If x=0 and y>0 flag if y denormal
+ nop.i 999
+}
+;;
+
//
-// Is y and int and odd?
+// Return +/-0 when x=+/-0 and y is positive odd integer
//
{ .mfb
-(p13) cmp.eq.unc p13,p14 = GR_y_sign, r0
-(p8) fcmp.eq.s0 p12,p0 = FR_Input_Y, f0 // If x=0 and y>0 flag if y denormal
- nop.b 999 ;;
+ nop.m 999
+(p13) mov FR_Result = FR_Input_X // If x=0, y pos odd int, result=x
+(p13) br.ret.spnt b0 // Exit x=0, y pos odd int, result=x
}
-{ .mfb
- nop.m 999
+;;
+
//
-// Is y and int and odd and positive?
+// Return +/-inf when x=+/-0 and y is negative odd int
//
-(p13) mov FR_Result = FR_Input_X
-(p13) br.cond.sptk L(POWL_64_RETURN) ;;
+{ .mfb
+(p14) mov GR_Parameter_TAG = 21
+(p14) frcpa.s0 FR_Result, p0 = f1, FR_Input_X // Result +-inf, set Z flag
+(p14) br.cond.spnt __libm_error_region
}
-{ .mfi
- nop.m 999
+;;
+
//
-// Return +/-0 when x=+/-0 and y is and odd pos. int
+// Return +0 when x=+/-0 and y positive and not an odd integer
//
-(p14) frcpa.s0 FR_Result, p10 = f1, FR_Input_X
-(p14) mov GR_Parameter_TAG = 21
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p14) br.cond.spnt __libm_error_region ;;
+{ .mfb
+ nop.m 999
+(p8) mov FR_Result = f0 // If x=0, y>0 and not odd integer, result=+0
+(p8) br.ret.sptk b0 // Exit x=0, y>0 and not odd integer, result=+0
}
+;;
-{ .mfb
- nop.m 999
//
-// Return +/-0 when x=+/-Inf and y is and odd neg int
-// and raise dz exception
+// Return +inf when x=+/-0 and y is negative and not odd int
//
-(p8) mov FR_Result = f0
-(p8) br.cond.sptk L(POWL_64_RETURN) ;;
+{ .mfb
+ mov GR_Parameter_TAG = 21
+ frcpa.s0 FR_Result, p10 = f1,f0 // Result +inf, raise Z flag
+ br.cond.sptk __libm_error_region
}
-{ .mfi
- nop.m 999
+;;
+
+
+POWL_64_X_IS_INF:
//
-// Return +0 when x=+/-0 and y > 0 and not odd.
+// Here if x=inf, y not 1 or nan
//
-(p9) frcpa.s0 FR_Result, p10 = f1,f0
-(p9) mov GR_Parameter_TAG = 21
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p9) br.cond.sptk __libm_error_region ;;
-}
-L(POWL_64_X_IS_INF):
{ .mfi
-(p0) getf.exp GR_exp_y = FR_Input_Y
-(p0) fclass.m.unc p13, p0 = FR_Input_X,0x022
-(p0) mov GR_Mask = 0x1FFFF ;;
+ and GR_exp_y = GR_exp_mask,GR_signexp_y // Get biased exponent y
+ fclass.m p13, p0 = FR_Input_X,0x022 // Test x=-inf
+ nop.i 999
}
+;;
{ .mfi
-(p0) getf.sig GR_signif_y = FR_Input_Y
-(p0) fcmp.eq.s0 p9,p0 = FR_Input_Y, f0 // Flag if y denormal
- nop.i 999 ;;
+ and GR_y_sign = GR_sign_mask,GR_signexp_y // Get sign of y
+ fcmp.eq.s0 p9,p0 = FR_Input_Y, f0 // Dummy to set flag if y denorm
+ nop.i 999
}
+;;
//
-// Get exp and significand of y
-// Create exponent mask and sign mask
+// Maybe y is < 1 already, so
+// isn't an int.
//
-{ .mlx
-(p0) and GR_low_order_bit = GR_Mask,GR_exp_y
-(p0) movl GR_BIAS = 0xFFFF
+{ .mfi
+(p13) cmp.lt.unc p9, p8 = GR_exp_y,GR_exp_bias // Test 0 < |y| < 1 if x=-inf
+ fclass.m p11, p0 = FR_Input_X,0x021 // Test x=+inf
+ sub GR_exp_y = GR_exp_y,GR_exp_bias // Get true exponent y
}
-{ .mmi
- nop.m 999 ;;
+;;
+
//
-// Remove sign bit from exponent.
+// Shift significand of y looking for nonzero bits
+// For y > 1, shift signif_y exp_y bits to the left
+// For y < 1, turn on 4 low order bits of significand of y
+// so that the fraction will always be non-zero
//
-(p0) cmp.lt.unc p9, p8 = GR_low_order_bit,GR_BIAS
+{ .mmi
+(p9) or GR_exp_y= 0xF,GR_signif_y // Force nonzero fraction if y<1
+;;
+(p11) cmp.eq.unc p14,p12 = GR_y_sign, r0 // Test x=+inf, y>0
+(p8) shl GR_exp_y= GR_signif_y,GR_exp_y // Get lsb of int + fraction
+ // Wait 4 cycles to use result
+}
+;;
+
//
-// Maybe y is < 1 already, so
-// isn't an int.
+// Return +inf for x=+inf, y > 0
+// Return +0 for x=+inf, y < 0
//
-(p0) sub GR_low_order_bit = GR_low_order_bit,GR_BIAS
+{ .mfi
+ nop.m 999
+(p12) mov FR_Result = f0 // If x=+inf, y<0, result=+0
+ nop.i 999
}
-{ .mlx
- nop.m 999
-(p0) movl GR_sign_mask = 0x20000 ;;
+{ .mfb
+ nop.m 999
+(p14) fma.s0 FR_Result = FR_Input_X,f1,f0 // If x=+inf, y>0, result=+inf
+(p11) br.ret.sptk b0 // Exit x=+inf
}
-{ .mfi
-(p0) and GR_sign_mask = GR_sign_mask,GR_exp_y
+;;
+
//
-// Return +Inf when x=+/-0 and y < 0 and not odd and raise
-// divide-by-zero exception.
+// Here only if x=-inf. Wait until can use result of shl...
//
-(p0) fclass.m.unc p11, p0 = FR_Input_X,0x021
- nop.i 999 ;;
-}
{ .mmi
- nop.m 999 ;;
-//
-// Is shift off integer part of y.
-// Get y's even or odd bit - y might not be an int.
-//
-(p11) cmp.eq.unc p11,p12 = GR_sign_mask, r0
-//
-// Remove exponent BIAS
-//
-(p8) shl GR_exp_y = GR_signif_y,GR_low_order_bit ;;
+ nop.m 999
+;;
+ nop.m 999
+ nop.i 999
}
+;;
+
{ .mfi
-(p9) or GR_exp_y = 0xF,GR_signif_y
-//
-// Is y positive or negative when x is +Inf?
-// Is y and int when x = -Inf
-//
-(p11) mov FR_Result = FR_Input_X
- nop.i 999 ;;
+ cmp.eq p8,p9 = GR_y_sign, r0 // Test y pos
+ nop.f 999
+ shl GR_fraction_y = GR_exp_y,1 // Shift left 1 to get fraction
}
-{ .mfi
- nop.m 999
-(p12) mov FR_Result = f0
- nop.i 999 ;;
+;;
+
+{ .mmi
+ cmp.eq p13,p0 = GR_fraction_y, r0 // Test y integer
+;;
+ nop.m 999
+(p13) tbit.nz.unc p13,p0 = GR_exp_y, 63 // Test y odd integer
}
-{ .mii
- nop.m 999
+;;
+
//
-// Shift signficand looking for nonzero bits
-// For y non-ints, upset the significand.
+// Is y even or odd?
//
-(p0) shl GR_signif_y = GR_exp_y,1 ;;
-(p13) cmp.eq.unc p13,p0 = GR_signif_y, r0
-}
{ .mii
- nop.m 999
-(p0) extr.u GR_low_order_bit = GR_exp_y,63,1 ;;
-(p13) cmp.ne.unc p13,p0 = GR_low_order_bit, r0
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p11) br.cond.sptk L(POWL_64_RETURN) ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p12) br.cond.sptk L(POWL_64_RETURN) ;;
+(p13) cmp.eq.unc p14,p10 = GR_y_sign, r0 // Test x=-inf, y pos odd int
+(p13) cmp.ne.and p8,p9 = r0,r0 // If y odd int, turn off p8,p9
+ nop.i 999
}
+;;
+
//
-// Return Inf for y > 0
-// Return +0 for y < 0
-// Is y even or odd?
+// Return -0 for x = -inf and y < 0 and odd int.
+// Return -Inf for x = -inf and y > 0 and odd int.
//
-{ .mii
-(p13) cmp.eq.unc p13,p10 = GR_sign_mask, r0
-(p0) cmp.eq.unc p8,p9 = GR_sign_mask, r0 ;;
- nop.i 999
+{ .mfi
+ nop.m 999
+(p10) fmerge.ns FR_Result = f0, f0 // If x=-inf, y neg odd int, result=-0
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
+(p14) fmpy.s0 FR_Result = FR_Input_X,f1 // If x=-inf, y pos odd int, result=-inf
+ nop.i 999
+}
+;;
+
//
-// For x = -inf, y is and int, positive
-// and odd
-// Is y positive in general?
+// Return Inf for x = -inf and y > 0 not an odd int.
+// Return +0 for x = -inf and y < 0 not an odd int.
//
-(p13) mov FR_Result = FR_Input_X
- nop.i 999 ;;
+.pred.rel "mutex",p8,p9
+{ .mfi
+ nop.m 999
+(p8) fmerge.ns FR_Result = FR_Input_X, FR_Input_X // If x=-inf, y>0 not odd int
+ // result=+inf
+ nop.i 999
}
{ .mfb
- nop.m 999
-(p10) fmerge.ns FR_Result = f0, f0
-(p13) br.cond.sptk L(POWL_64_RETURN) ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p10) br.cond.sptk L(POWL_64_RETURN) ;;
+ nop.m 999
+(p9) fmpy.s0 FR_Result = f0,f0 // If x=-inf, y<0 not odd int
+ // result=+0
+ br.ret.sptk b0 // Exit for x=-inf
}
-{ .mfi
- nop.m 999
+;;
+
+
+POWL_64_Y_IS_INF:
+// Here if y=inf, x not 1 or nan
//
-// Return -Inf for x = -inf and y > 0 and odd int.
-// Return -0 for x = -inf and y < 0 and odd int.
+// For y = +Inf and |x| < 1 returns 0
+// For y = +Inf and |x| > 1 returns Inf
+// For y = -Inf and |x| < 1 returns Inf
+// For y = -Inf and |x| > 1 returns 0
+// For y = Inf and |x| = 1 returns 1
//
-(p8) fmerge.ns FR_Result = FR_Input_X, FR_Input_X
- nop.i 999 ;;
-}
-{ .mfb
- nop.m 999
-(p9) mov FR_Result = f0
-(p8) br.cond.sptk L(POWL_64_RETURN) ;;
+{ .mfi
+ nop.m 999
+ fclass.m p8, p0 = FR_Input_Y, 0x021 // Test y=+inf
+ nop.i 999
}
-{ .mib
- nop.m 999
- nop.i 999
-(p9) br.cond.sptk L(POWL_64_RETURN) ;;
+;;
+
+{ .mfi
+ nop.m 999
+ fclass.m p9, p0 = FR_Input_Y, 0x022 // Test y=-inf
+ nop.i 999
}
-L(POWL_64_Y_IS_INF):
+;;
+
{ .mfi
- nop.m 999
-//
-// Return Inf for x = -inf and y > 0 not an odd int.
-// Return +0 for x = -inf and y < 0 and not an odd int.
-//
-(p0) fclass.m.unc p8, p0 = FR_Input_Y, 0x021
- nop.i 999
+ nop.m 999
+ fabs FR_X = FR_Input_X // Form |x|
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p9, p0 = FR_Input_Y, 0x022
- nop.i 999 ;;
+ nop.m 999
+ fcmp.eq.s0 p10,p0 = FR_Input_X, f0 // flag if x denormal
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fabs FR_X = FR_Input_X
- nop.i 999 ;;
+ nop.m 999
+(p8) fcmp.lt.unc.s1 p6, p0 = FR_X, f1 // Test y=+inf, |x|<1
+ nop.i 999
}
+;;
{ .mfi
- nop.m 999
-(p0) fcmp.eq.s0 p10,p0 = FR_Input_X, f0 // flag if x denormal
- nop.i 999 ;;
+ nop.m 999
+(p8) fcmp.gt.unc.s1 p7, p0 = FR_X, f1 // Test y=+inf, |x|>1
+ nop.i 999
}
+;;
{ .mfi
- nop.m 999
-//
-// Find y = +/- Inf
-// Compute |x|
-//
-(p8) fcmp.lt.unc.s1 p6, p0 = FR_X, f1
- nop.i 999
+ nop.m 999
+(p9) fcmp.lt.unc.s1 p12, p0 = FR_X, f1 // Test y=-inf, |x|<1
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p8) fcmp.gt.unc.s1 p7, p0 = FR_X, f1
- nop.i 999 ;;
+ nop.m 999
+(p6) fmpy.s0 FR_Result = f0,f0 // If y=+inf, |x|<1, result=+0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p9) fcmp.lt.unc.s1 p12, p0 = FR_X, f1
- nop.i 999
+ nop.m 999
+(p9) fcmp.gt.unc.s1 p13, p0 = FR_X, f1 // Test y=-inf, |x|>1
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p9) fcmp.gt.unc.s1 p13, p0 = FR_X, f1
- nop.i 999 ;;
+ nop.m 999
+(p7) fmpy.s0 FR_Result = FR_Input_Y, f1 // If y=+inf, |x|>1, result=+inf
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// For y = +Inf and |x| < 1 returns 0
-// For y = +Inf and |x| > 1 returns Inf
-// For y = -Inf and |x| < 1 returns Inf
-// For y = -Inf and |x| > 1 returns 0
-//
-(p6) mov FR_Result = f0
- nop.i 999 ;;
+ nop.m 999
+ fcmp.eq.s1 p14, p0 = FR_X, f1 // Test y=inf, |x|=1
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p7) mov FR_Result = FR_Input_Y
- nop.i 999 ;;
+ nop.m 999
+(p12) fnma.s0 FR_Result = FR_Input_Y, f1, f0 // If y=-inf, |x|<1, result=+inf
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p12) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_Y
- nop.i 999 ;;
+ nop.m 999
+(p13) mov FR_Result = f0 // If y=-inf, |x|>1, result=+0
+ nop.i 999
}
+;;
+
{ .mfb
- nop.m 999
-(p13) mov FR_Result = f0
-//
-// Produce x ** +/- Inf results
-//
-(p6) br.cond.spnt L(POWL_64_RETURN) ;;
+ nop.m 999
+(p14) fmpy.s0 FR_Result = f1,f1 // If y=inf, |x|=1, result=+1
+ br.ret.sptk b0 // Common return for y=inf
}
-{ .mib
- nop.m 999
- nop.i 999
-(p7) br.cond.spnt L(POWL_64_RETURN) ;;
+;;
+
+
+// Here if x or y denorm/unorm
+POWL_DENORM:
+{ .mmi
+ getf.sig GR_signif_Z = FR_norm_X // Get significand of x
+;;
+ getf.exp GR_signexp_y = FR_norm_Y // Get sign and exp of y
+ nop.i 999
}
-{ .mib
- nop.m 999
- nop.i 999
-(p12) br.cond.spnt L(POWL_64_RETURN) ;;
+;;
+
+{ .mfi
+ getf.sig GR_signif_y = FR_norm_Y // Get significand of y
+ nop.f 999
+ nop.i 999
}
+;;
+
{ .mib
- nop.m 999
- nop.i 999
-(p13) br.cond.spnt L(POWL_64_RETURN) ;;
+ getf.exp GR_signexp_x = FR_norm_X // Get sign and exp of x
+ extr.u GR_Index1 = GR_signif_Z, 59, 4 // Extract upper 4 signif bits of x
+ br.cond.sptk POWL_COMMON // Branch back to main path
}
-{ .mfb
- nop.m 999
+;;
+
+
+POWL_64_UNSUPPORT:
//
-// +/-1 ** +/-Inf, result is +1
+// Raise exceptions for specific
+// values - pseudo NaN and
+// infinities.
+// Return NaN and raise invalid
//
-(p0) fmpy.s0 FR_Result = f1,f1
-(p0) br.cond.sptk L(POWL_64_RETURN) ;;
-}
-L(POWL_64_UNSUPPORT):
{ .mfb
- nop.m 999
+ nop.m 999
+ fmpy.s0 FR_Result = FR_Input_X,f0
+ br.ret.sptk b0
+}
+;;
+
+POWL_64_XNEG:
//
-// Return NaN and raise invalid
+// Raise invalid for x < 0 and
+// y not an integer
//
-(p0) fmpy.s0 FR_Result = FR_Input_X,f0
-//
-// Raise exceptions for specific
-// values - pseudo NaN and
-// infinities.
-//
-(p0) br.cond.sptk L(POWL_64_RETURN) ;;
-}
-L(POWL_64_XNEG):
{ .mfi
- nop.m 999
-(p0) frcpa.s0 FR_Result, p8 = f0, f0
-//
-// Raise invalid for x < 0 and
-// y not an integer and
-//
-(p0) mov GR_Parameter_TAG = 22
+ nop.m 999
+ frcpa.s0 FR_Result, p8 = f0, f0
+ mov GR_Parameter_TAG = 22
}
{ .mib
- nop.m 999
- nop.i 999
-(p0) br.cond.sptk __libm_error_region ;;
+ nop.m 999
+ nop.i 999
+ br.cond.sptk __libm_error_region
}
-L(POWL_64_SQRT):
+;;
+
+POWL_64_SQRT:
{ .mfi
- nop.m 999
-(p0) frsqrta.s0 FR_Result,p10 = FR_Input_X
- nop.i 999 ;;
+ nop.m 999
+ frsqrta.s0 FR_Result,p10 = FR_save_Input_X
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-(p10) fma.s1 f62=FR_Half,FR_Input_X,f0
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 f62=FR_Half,FR_save_Input_X,f0
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-//
-// Step (2)
-// h = 1/2 * a in f9
-//
-(p10) fma.s1 f63=FR_Result,FR_Result,f0
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 f63=FR_Result,FR_Result,f0
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-//
-// Step (3)
-// t1 = y0 * y0 in f10
-//
-(p10) fnma.s1 f32=f63,f62,f11
- nop.i 999 ;;
+ nop.m 999
+(p10) fnma.s1 f32=f63,f62,FR_Half
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-//
-// Step (4)
-// t2 = 1/2 - t1 * h in f10
-//
-(p10) fma.s1 f33=f32,FR_Result,FR_Result
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 f33=f32,FR_Result,FR_Result
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-//
-// Step (5)
-// y1 = y0 + t2 * y0 in f13
-//
-(p10) fma.s1 f34=f33,f62,f0
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 f34=f33,f62,f0
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-//
-// Step (6)
-// t3 = y1 * h in f10
-//
-(p10) fnma.s1 f35=f34,f33,f11
- nop.i 999 ;;
+ nop.m 999
+(p10) fnma.s1 f35=f34,f33,FR_Half
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-//
-// Step (7)
-// t4 = 1/2 - t3 * y1 in f10
-//
-(p10) fma.s1 f63=f35,f33,f33
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 f63=f35,f33,f33
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-//
-// Step (8)
-// y2 = y1 + t4 * y1 in f13
-//
-(p10) fma.s1 f32=FR_Input_X,f63,f0
- nop.i 999
+ nop.m 999
+(p10) fma.s1 f32=FR_save_Input_X,f63,f0
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Step (9)
-// S = a * y2 in f10
-//
-(p10) fma.s1 FR_Result=f63,f62,f0
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 FR_Result=f63,f62,f0
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-//
-// Step (10)
-// t5 = y2 * h in f9
-//
-(p10) fma.s1 f33=f11,f63,f0
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 f33=f11,f63,f0
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-//
-// Step (11)
-// H = 1/2 * y2 in f11
-//
-(p10) fnma.s1 f34=f32,f32,f8
- nop.i 999
+ nop.m 999
+(p10) fnma.s1 f34=f32,f32,FR_save_Input_X
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Step (12)
-// d = a - S * S in f12
-//
-(p10) fnma.s1 f35=FR_Result,f63,f11
- nop.i 999 ;;
+ nop.m 999
+(p10) fnma.s1 f35=FR_Result,f63,FR_Half
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-//
-// Step (13)
-// t6 = 1/2 - t5 * y2 in f7
-//
-(p10) fma.s1 f62=f33,f34,f32
- nop.i 999
+ nop.m 999
+(p10) fma.s1 f62=f33,f34,f32
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Step (14)
-// S1 = S + d * H in f13
-//
-(p10) fma.s1 f63=f33,f35,f33
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 f63=f33,f35,f33
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-//
-// Step (15)
-// H1 = H + t6 * h in f7
-//
-(p10) fnma.s1 f32=f62,f62,FR_Input_X
- nop.i 999 ;;
+ nop.m 999
+(p10) fnma.s1 f32=f62,f62,FR_save_Input_X
+ nop.i 999 ;;
}
{ .mfb
- nop.m 999
-//
-// Step (16)
-// d1 = a - S1 * S1
-//
-(p10) fma.s0 FR_Result=f32,f63,f62
-//
-// Step (17)
-// R = S1 + d1 * H1
-//
-(p10) br.cond.sptk L(POWL_64_RETURN) ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-//
-// Do the Newton-Raphson iteration from the EAS.
-//
-(p0) br.cond.sptk L(POWL_64_RETURN) ;;
+ nop.m 999
+(p10) fma.s0 FR_Result=f32,f63,f62
+ br.ret.sptk b0 // Exit for x > 0, y = 0.5
}
-//
-// Take care of the degenerate cases.
-//
+;;
-L(POWL_64_RETURN):
-{ .mfb
- nop.m 999
-(p0) mov FR_Output = FR_Result
-(p0) br.ret.sptk b0 ;;
-}
-.endp powl
-ASM_SIZE_DIRECTIVE(powl)
+GLOBAL_LIBM_END(powl)
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
@@ -3411,32 +2770,32 @@ __libm_error_region:
mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
- stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
+ stfe [GR_Parameter_Y] = FR_Input_Y,16 // Save Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
- stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
+ stfe [GR_Parameter_X] = FR_save_Input_X // Store Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y
nop.b 0 // Parameter 3 address
}
{ .mib
- stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
+ stfe [GR_Parameter_Y] = FR_Result // Store Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
- nop.m 0
- nop.m 0
add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
{ .mmi
- ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
+ ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
- add sp = 64,sp // Restore stack pointer
- mov b0 = GR_SAVE_B0 // Restore return address
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
mov gp = GR_SAVE_GP // Restore gp
@@ -3444,7 +2803,6 @@ __libm_error_region:
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+.endp
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_remainder.S b/sysdeps/ia64/fpu/e_remainder.S
index d8a27722de..2f6e90f994 100644
--- a/sysdeps/ia64/fpu/e_remainder.S
+++ b/sysdeps/ia64/fpu/e_remainder.S
@@ -1,10 +1,10 @@
- .file "remainder.asm"
-// Copyright (C) 2000, 2001, Intel Corporation
+.file "remainder.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska, Bob Norin,
-// Shane Story, and Ping Tak Peter Tang of the Computational Software Lab,
-// Intel Corporation.
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,17 +35,19 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//====================================================================
-// 2/02/00 Initial version
-// 3/02/00 New Algorithm
-// 4/04/00 Unwind support added
-// 7/21/00 Fixed quotient=2^{24*m+23}*1.q1...q23 1 bug
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 Initial version
+// 03/02/00 New Algorithm
+// 04/04/00 Unwind support added
+// 07/21/00 Fixed quotient=2^{24*m+23}*1.q1...q23 1 bug
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
-//11/29/00 Set FR_Y to f9
+// 11/29/00 Set FR_Y to f9
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//====================================================================
@@ -78,16 +80,12 @@
// a=+/- Inf, or b=+/-0: return NaN, call libm_error_support
// a=NaN or b=NaN: return NaN
-#include "libm_support.h"
-
// Registers used
//====================================================================
// Predicate registers: p6-p14
// General registers: r2,r3,r28,r29,r32 (ar.pfs), r33-r39
// Floating point registers: f6-f15,f32
- .section .text
-
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
GR_SAVE_GP = r35
@@ -103,18 +101,9 @@ FR_Y = f9
FR_RESULT = f8
+.section .text
+GLOBAL_IEEE754_ENTRY(remainder)
- .proc remainder#
- .align 32
- .global remainder#
- .align 32
-
-remainder:
-#ifdef _LIBC
-.global __remainder
-.type __remainder,@function
-__remainder:
-#endif
// inputs in f8, f9
// result in f8
@@ -139,7 +128,7 @@ __remainder:
// Y +-NAN, +-inf, +-0? p11
{ .mfi
setf.exp f32=r28
-(p0) fclass.m.unc p11,p0 = f9, 0xe7
+ fclass.m.unc p11,p0 = f9, 0xe7
nop.i 999
}
// qnan snan inf norm unorm 0 -+
@@ -148,7 +137,7 @@ __remainder:
// X +-NAN, +-inf, ? p9
{ .mfi
nop.m 999
-(p0) fclass.m.unc p9,p0 = f8, 0xe3
+ fclass.m.unc p9,p0 = f8, 0xe3
nop.i 999;;
}
@@ -167,8 +156,8 @@ __remainder:
}
{.bbb
- (p9) br.cond.spnt L(FREM_X_NAN_INF)
- (p11) br.cond.spnt L(FREM_Y_NAN_INF_ZERO)
+ (p9) br.cond.spnt FREM_X_NAN_INF
+ (p11) br.cond.spnt FREM_Y_NAN_INF_ZERO
nop.b 0
} {.mfi
nop.m 0
@@ -178,7 +167,7 @@ __remainder:
}
-L(remloop24):
+remloop24:
{ .mfi
nop.m 0
// Step (2)
@@ -200,7 +189,7 @@ L(remloop24):
{.mfi
nop.m 0
// q1=q0*(1+e0)
- fma.s1 f15=f12,f7,f12
+ (p6) fma.s1 f15=f12,f7,f12
nop.i 0
}
{ .mfi
@@ -331,7 +320,7 @@ L(remloop24):
// (p9) set r=r2 (new a, if not last iteration)
// (p10) new a =r
(p10) mov f13=f6
- (p12) br.cond.sptk L(remloop24);;
+ (p12) br.cond.sptk remloop24;;
}
// last iteration
@@ -388,7 +377,7 @@ L(remloop24):
}
-L(FREM_X_NAN_INF):
+FREM_X_NAN_INF:
// Y zero ?
{.mfi
@@ -405,19 +394,19 @@ L(FREM_X_NAN_INF):
nop.m 0
nop.i 0
// if Y zero
- (p11) br.cond.spnt L(FREM_Y_ZERO);;
+ (p11) br.cond.spnt FREM_Y_ZERO;;
}
// X infinity? Return QNAN indefinite
{ .mfi
nop.m 999
-(p0) fclass.m.unc p8,p0 = f8, 0x23
+ fclass.m.unc p8,p0 = f8, 0x23
nop.i 999
}
// X infinity? Return QNAN indefinite
{ .mfi
nop.m 999
-(p0) fclass.m.unc p11,p0 = f8, 0x23
+ fclass.m.unc p11,p0 = f8, 0x23
nop.i 999;;
}
// Y NaN ?
@@ -445,14 +434,14 @@ L(FREM_X_NAN_INF):
}
{ .mfi
nop.m 999
-(p8) fma.d f8=f8,f1,f0
+(p8) fma.d.s0 f8=f8,f1,f0
nop.i 0 ;;
}
{ .mfb
nop.m 999
frcpa.s0 f8,p7=f8,f9
- (p11) br.cond.spnt L(EXP_ERROR_RETURN);;
+ (p11) br.cond.spnt EXP_ERROR_RETURN;;
}
{ .mib
nop.m 0
@@ -461,35 +450,35 @@ L(FREM_X_NAN_INF):
}
-L(FREM_Y_NAN_INF_ZERO):
+FREM_Y_NAN_INF_ZERO:
// Y INF
{ .mfi
nop.m 999
-(p0) fclass.m.unc p7,p0 = f9, 0x23
+ fclass.m.unc p7,p0 = f9, 0x23
nop.i 999 ;;
}
{ .mfb
nop.m 999
-(p7) fma.d f8=f8,f1,f0
+(p7) fma.d.s0 f8=f8,f1,f0
(p7) br.ret.spnt b0 ;;
}
// Y NAN?
{ .mfi
nop.m 999
-(p0) fclass.m.unc p9,p0 = f9, 0xc3
+ fclass.m.unc p9,p0 = f9, 0xc3
nop.i 999 ;;
}
{ .mfb
nop.m 999
-(p9) fma.d f8=f9,f1,f0
+(p9) fma.d.s0 f8=f9,f1,f0
(p9) br.ret.spnt b0 ;;
}
-L(FREM_Y_ZERO):
+FREM_Y_ZERO:
// Y zero? Must be zero at this point
// because it is the only choice left.
// Return QNAN indefinite
@@ -497,7 +486,7 @@ L(FREM_Y_ZERO):
// X NAN?
{ .mfi
nop.m 999
-(p0) fclass.m.unc p9,p10 = f8, 0xc3
+ fclass.m.unc p9,p10 = f8, 0xc3
nop.i 999 ;;
}
{ .mfi
@@ -508,47 +497,41 @@ L(FREM_Y_ZERO):
{.mfi
nop.m 999
- (p9) frcpa f11,p7=f8,f0
+ (p9) frcpa.s0 f11,p7=f8,f0
nop.i 0;;
}
{ .mfi
nop.m 999
-(p10) frcpa f11,p7 = f0,f0
+(p10) frcpa.s0 f11,p7 = f0,f0
nop.i 999;;
}
{ .mfi
nop.m 999
-(p0) fmerge.s f10 = f8, f8
+ fmerge.s f10 = f8, f8
nop.i 999
}
{ .mfi
nop.m 999
-(p0) fma.d f8=f11,f1,f0
+ fma.d.s0 f8=f11,f1,f0
nop.i 999
}
-L(EXP_ERROR_RETURN):
+EXP_ERROR_RETURN:
{ .mib
-(p0) mov GR_Parameter_TAG = 124
+ mov GR_Parameter_TAG = 124
nop.i 999
-(p0) br.sptk __libm_error_region;;
+ br.sptk __libm_error_region;;
}
-.endp remainder
-ASM_SIZE_DIRECTIVE(remainder)
-#ifdef _LIBC
-ASM_SIZE_DIRECTIVE(__remainder)
-#endif
-
+GLOBAL_IEEE754_END(remainder)
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
@@ -596,10 +579,11 @@ __libm_error_region:
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
+
+
diff --git a/sysdeps/ia64/fpu/e_remainderf.S b/sysdeps/ia64/fpu/e_remainderf.S
index 40f9b32921..bbb5fd0e0f 100644
--- a/sysdeps/ia64/fpu/e_remainderf.S
+++ b/sysdeps/ia64/fpu/e_remainderf.S
@@ -1,11 +1,10 @@
- .file "remainderf.asm"
-// Copyright (C) 2000, 2001, Intel Corporation
+.file "remainderf.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
-// Bob Norin, Shane Story, and Ping Tak Peter Tang of the Computational
-// Software Lab,
-// Intel Corporation.
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -21,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -36,17 +35,19 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//====================================================================
-// 2/02/00 Initial version
-// 3/02/00 New algorithm
-// 4/04/00 Unwind support added
-// 7/21/00 Fixed quotient=2^{24*m+23} bug
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 Initial version
+// 03/02/00 New algorithm
+// 04/04/00 Unwind support added
+// 07/21/00 Fixed quotient=2^{24*m+23} bug
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
-//11/29/00 Set FR_Y to f9
+// 11/29/00 Set FR_Y to f9
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//====================================================================
@@ -78,9 +79,6 @@
//====================================================================
// a=+/- Inf, or b=+/-0: return NaN, call libm_error_support
// a=NaN or b=NaN: return NaN
-
-#include "libm_support.h"
-
//
// Registers used
//====================================================================
@@ -89,8 +87,6 @@
// Floating point registers: f6-f15
//
-.section .text
-
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
GR_SAVE_GP = r35
@@ -106,17 +102,9 @@ FR_Y = f9
FR_RESULT = f8
- .proc remainderf#
- .align 32
- .global remainderf#
- .align 32
+.section .text
+GLOBAL_IEEE754_ENTRY(remainderf)
-remainderf:
-#ifdef _LIBC
-.global __remainderf
-.type __remainderf,@function
-__remainderf:
-#endif
// inputs in f8, f9
// result in f8
@@ -141,7 +129,7 @@ __remainderf:
// Y +-NAN, +-inf, +-0? p11
{ .mfi
nop.m 999
-(p0) fclass.m.unc p11,p0 = f9, 0xe7
+ fclass.m.unc p11,p0 = f9, 0xe7
nop.i 999
}
// qnan snan inf norm unorm 0 -+
@@ -150,7 +138,7 @@ __remainderf:
// X +-NAN, +-inf, ? p9
{ .mfi
nop.m 999
-(p0) fclass.m.unc p9,p0 = f8, 0xe3
+ fclass.m.unc p9,p0 = f8, 0xe3
nop.i 999;;
}
@@ -168,8 +156,8 @@ __remainderf:
nop.i 0;;
}
{.bbb
- (p9) br.cond.spnt L(FREM_X_NAN_INF)
- (p11) br.cond.spnt L(FREM_Y_NAN_INF_ZERO)
+ (p9) br.cond.spnt FREM_X_NAN_INF
+ (p11) br.cond.spnt FREM_Y_NAN_INF_ZERO
nop.b 0
} {.mfi
nop.m 0
@@ -179,7 +167,7 @@ __remainderf:
}
.align 32
-L(remloop24):
+remloop24:
{ .mfi
// f12=2^{24}-2
setf.s f12=r3
@@ -347,7 +335,7 @@ L(remloop24):
// (p9) set r=r2 (new a, if not last iteration)
// (p10) new a =r
(p10) mov f13=f6
- (p12) br.cond.sptk L(remloop24);;
+ (p12) br.cond.sptk remloop24;;
}
// last iteration
@@ -408,7 +396,7 @@ L(remloop24):
}
-L(FREM_X_NAN_INF):
+FREM_X_NAN_INF:
// Y zero ?
{.mfi
@@ -425,19 +413,19 @@ L(FREM_X_NAN_INF):
nop.m 0
nop.i 0
// if Y zero
- (p11) br.cond.spnt L(FREM_Y_ZERO);;
+ (p11) br.cond.spnt FREM_Y_ZERO;;
}
// X infinity? Return QNAN indefinite
{ .mfi
nop.m 999
-(p0) fclass.m.unc p8,p0 = f8, 0x23
+ fclass.m.unc p8,p0 = f8, 0x23
nop.i 999
}
// X infinity? Return QNAN indefinite
{ .mfi
nop.m 999
-(p0) fclass.m.unc p11,p0 = f8, 0x23
+ fclass.m.unc p11,p0 = f8, 0x23
nop.i 999;;
}
// Y NaN ?
@@ -465,14 +453,14 @@ L(FREM_X_NAN_INF):
}
{ .mfi
nop.m 999
-(p8) fma.s f8=f8,f1,f0
+(p8) fma.s.s0 f8=f8,f1,f0
nop.i 0 ;;
}
{ .mfb
nop.m 999
frcpa.s0 f8,p7=f8,f9
- (p11) br.cond.spnt L(EXP_ERROR_RETURN);;
+ (p11) br.cond.spnt EXP_ERROR_RETURN;;
}
{ .mib
nop.m 0
@@ -481,35 +469,35 @@ L(FREM_X_NAN_INF):
}
-L(FREM_Y_NAN_INF_ZERO):
+FREM_Y_NAN_INF_ZERO:
// Y INF
{ .mfi
nop.m 999
-(p0) fclass.m.unc p7,p0 = f9, 0x23
+ fclass.m.unc p7,p0 = f9, 0x23
nop.i 999 ;;
}
{ .mfb
nop.m 999
-(p7) fma.s f8=f8,f1,f0
+(p7) fma.s.s0 f8=f8,f1,f0
(p7) br.ret.spnt b0 ;;
}
// Y NAN?
{ .mfi
nop.m 999
-(p0) fclass.m.unc p9,p0 = f9, 0xc3
+ fclass.m.unc p9,p0 = f9, 0xc3
nop.i 999 ;;
}
{ .mfb
nop.m 999
-(p9) fma.s f8=f9,f1,f0
+(p9) fma.s.s0 f8=f9,f1,f0
(p9) br.ret.spnt b0 ;;
}
-L(FREM_Y_ZERO):
+FREM_Y_ZERO:
// Y zero? Must be zero at this point
// because it is the only choice left.
// Return QNAN indefinite
@@ -517,7 +505,7 @@ L(FREM_Y_ZERO):
// X NAN?
{ .mfi
nop.m 999
-(p0) fclass.m.unc p9,p10 = f8, 0xc3
+ fclass.m.unc p9,p10 = f8, 0xc3
nop.i 999 ;;
}
{ .mfi
@@ -528,47 +516,41 @@ L(FREM_Y_ZERO):
{.mfi
nop.m 999
- (p9) frcpa f11,p7=f8,f0
+ (p9) frcpa.s0 f11,p7=f8,f0
nop.i 0;;
}
{ .mfi
nop.m 999
-(p10) frcpa f11,p7 = f0,f0
+(p10) frcpa.s0 f11,p7 = f0,f0
nop.i 999;;
}
{ .mfi
nop.m 999
-(p0) fmerge.s f10 = f8, f8
+ fmerge.s f10 = f8, f8
nop.i 999
}
{ .mfi
nop.m 999
-(p0) fma.s f8=f11,f1,f0
+ fma.s.s0 f8=f11,f1,f0
nop.i 999
}
-L(EXP_ERROR_RETURN):
+EXP_ERROR_RETURN:
{ .mib
-(p0) mov GR_Parameter_TAG = 125
+ mov GR_Parameter_TAG = 125
nop.i 999
-(p0) br.sptk __libm_error_region;;
+ br.sptk __libm_error_region;;
}
-.endp remainderf
-ASM_SIZE_DIRECTIVE(remainderf)
-#ifdef _LIBC
-ASM_SIZE_DIRECTIVE(__remainderf)
-#endif
-
+GLOBAL_IEEE754_END(remainderf)
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
@@ -616,9 +598,11 @@ __libm_error_region:
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
+
+
+
diff --git a/sysdeps/ia64/fpu/e_remainderl.S b/sysdeps/ia64/fpu/e_remainderl.S
index 5856861442..1c1a3c3072 100644
--- a/sysdeps/ia64/fpu/e_remainderl.S
+++ b/sysdeps/ia64/fpu/e_remainderl.S
@@ -1,10 +1,10 @@
-.file "remainderl.asm"
-// Copyright (C) 2000, 2001, Intel Corporation
+.file "remainderl.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
-// Bob Norin, Shane Story, and Ping Tak Peter Tang of the Computational
-// Software Lab, Intel Corporation.
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,17 +35,19 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//====================================================================
-// 2/02/00 Initial version
-// 3/02/00 New algorithm
-// 4/04/00 Unwind support added
-// 7/21/00 Fixed quotient=2^{24*m+23}*1.q1...q23 1 bug
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 Initial version
+// 03/02/00 New algorithm
+// 04/04/00 Unwind support added
+// 07/21/00 Fixed quotient=2^{24*m+23}*1.q1...q23 1 bug
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
-//11/29/00 Set FR_Y to f9
+// 11/29/00 Set FR_Y to f9
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//====================================================================
@@ -77,9 +79,6 @@
//====================================================================
// a=+/- Inf, or b=+/-0: return NaN, call libm_error_support
// a=NaN or b=NaN: return NaN
-
-#include "libm_support.h"
-
//
// Registers used
//====================================================================
@@ -87,8 +86,6 @@
// General registers: r2,r3,r28,r29,r32 (ar.pfs), r33-r39
// Floating point registers: f6-f15,f32
//
-.section .text
-
GR_SAVE_B0 = r33
GR_SAVE_PFS = r34
@@ -105,19 +102,9 @@ FR_Y = f9
FR_RESULT = f8
+.section .text
+GLOBAL_IEEE754_ENTRY(remainderl)
-
- .proc remainderl#
- .align 32
- .global remainderl#
- .align 32
-
-remainderl:
-#ifdef _LIBC
-.global __remainderl
-.type __remainderl,@function
-__remainderl:
-#endif
// inputs in f8, f9
// result in f8
@@ -159,7 +146,7 @@ cmp.eq p11,p10=r29,r0;;
// X +-NAN, +-inf, ? p9
{ .mfi
nop.m 999
-(p0) fclass.m.unc p9,p8 = f8, 0xe3
+ fclass.m.unc p9,p8 = f8, 0xe3
nop.i 999;;
}
@@ -196,8 +183,8 @@ cmp.eq p11,p10=r29,r0;;
}
{.bbb
- (p9) br.cond.spnt L(FREM_X_NAN_INF)
- (p11) br.cond.spnt L(FREM_Y_NAN_INF_ZERO)
+ (p9) br.cond.spnt FREM_X_NAN_INF
+ (p11) br.cond.spnt FREM_Y_NAN_INF_ZERO
nop.b 0
} {.mfi
nop.m 0
@@ -206,7 +193,7 @@ cmp.eq p11,p10=r29,r0;;
nop.i 0;;
}
-L(remloop24):
+remloop24:
{ .mfi
nop.m 0
// Step (2)
@@ -228,7 +215,7 @@ L(remloop24):
{.mfi
nop.m 0
// q1=q0*(1+e0)
- fma.s1 f15=f12,f7,f12
+ (p6) fma.s1 f15=f12,f7,f12
nop.i 0
}
{ .mfi
@@ -358,7 +345,7 @@ L(remloop24):
// (p9) set r=r2 (new a, if not last iteration)
// (p10) new a =r
(p10) mov f13=f6
- (p12) br.cond.sptk L(remloop24);;
+ (p12) br.cond.sptk remloop24;;
}
// last iteration
@@ -416,7 +403,7 @@ L(remloop24):
-L(FREM_X_NAN_INF):
+FREM_X_NAN_INF:
// Y zero ?
{.mfi
@@ -433,19 +420,19 @@ L(FREM_X_NAN_INF):
nop.m 0
nop.i 0
// if Y zero
- (p11) br.cond.spnt L(FREM_Y_ZERO);;
+ (p11) br.cond.spnt FREM_Y_ZERO;;
}
// X infinity? Return QNAN indefinite
{ .mfi
nop.m 999
-(p0) fclass.m.unc p8,p0 = f8, 0x23
+ fclass.m.unc p8,p0 = f8, 0x23
nop.i 999
}
// X infinity? Return QNAN indefinite
{ .mfi
nop.m 999
-(p0) fclass.m.unc p11,p0 = f8, 0x23
+ fclass.m.unc p11,p0 = f8, 0x23
nop.i 999;;
}
// Y NaN ?
@@ -473,14 +460,14 @@ L(FREM_X_NAN_INF):
}
{ .mfi
nop.m 999
-(p8) fma f8=f8,f1,f0
+(p8) fma.s0 f8=f8,f1,f0
nop.i 0 ;;
}
{ .mfb
nop.m 999
frcpa.s0 f8,p7=f8,f9
- (p11) br.cond.spnt L(EXP_ERROR_RETURN);;
+ (p11) br.cond.spnt EXP_ERROR_RETURN;;
}
{ .mib
nop.m 0
@@ -489,24 +476,24 @@ L(FREM_X_NAN_INF):
}
-L(FREM_Y_NAN_INF_ZERO):
+FREM_Y_NAN_INF_ZERO:
// Y INF
{ .mfi
nop.m 999
-(p0) fclass.m.unc p7,p0 = f9, 0x23
+ fclass.m.unc p7,p0 = f9, 0x23
nop.i 999 ;;
}
{ .mfb
nop.m 999
-(p7) fma f8=f8,f1,f0
+(p7) fma.s0 f8=f8,f1,f0
(p7) br.ret.spnt b0 ;;
}
// Y NAN?
{ .mfi
nop.m 999
-(p0) fclass.m.unc p9,p10 = f9, 0xc3
+ fclass.m.unc p9,p10 = f9, 0xc3
nop.i 999 ;;
}
{ .mfi
@@ -517,11 +504,11 @@ L(FREM_Y_NAN_INF_ZERO):
{ .mfb
nop.m 999
-(p9) fma f8=f9,f1,f0
+(p9) fma.s0 f8=f9,f1,f0
(p9) br.ret.spnt b0 ;;
}
-L(FREM_Y_ZERO):
+FREM_Y_ZERO:
// Y zero? Must be zero at this point
// because it is the only choice left.
// Return QNAN indefinite
@@ -529,7 +516,7 @@ L(FREM_Y_ZERO):
// X NAN?
{ .mfi
nop.m 999
-(p0) fclass.m.unc p9,p10 = f8, 0xc3
+ fclass.m.unc p9,p10 = f8, 0xc3
nop.i 999 ;;
}
{ .mfi
@@ -540,43 +527,37 @@ L(FREM_Y_ZERO):
{.mfi
nop.m 999
- (p9) frcpa f11,p7=f8,f0
+ (p9) frcpa.s0 f11,p7=f8,f0
nop.i 0;;
}
{ .mfi
nop.m 999
-(p10) frcpa f11,p7 = f0,f0
+(p10) frcpa.s0 f11,p7 = f0,f0
nop.i 999;;
}
{ .mfi
nop.m 999
-(p0) fmerge.s f10 = f8, f8
+ fmerge.s f10 = f8, f8
nop.i 999
}
{ .mfi
nop.m 999
-(p0) fma f8=f11,f1,f0
+ fma.s0 f8=f11,f1,f0
nop.i 999;;
}
-L(EXP_ERROR_RETURN):
+EXP_ERROR_RETURN:
{ .mib
-(p0) mov GR_Parameter_TAG = 123
+ mov GR_Parameter_TAG = 123
nop.i 999
-(p0) br.sptk __libm_error_region;;
+ br.sptk __libm_error_region;;
}
-.endp remainderl
-ASM_SIZE_DIRECTIVE(remainderl)
-#ifdef _LIBC
-ASM_SIZE_DIRECTIVE(__remainderl)
-#endif
-
-.proc __libm_error_region
-__libm_error_region:
+GLOBAL_IEEE754_END(remainderl)
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
@@ -624,9 +605,12 @@ __libm_error_region:
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
+
+
+
+
diff --git a/sysdeps/ia64/fpu/e_scalb.S b/sysdeps/ia64/fpu/e_scalb.S
index 7f5b5796de..82e914e259 100644
--- a/sysdeps/ia64/fpu/e_scalb.S
+++ b/sysdeps/ia64/fpu/e_scalb.S
@@ -1,10 +1,10 @@
.file "scalb.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,12 +35,14 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 1/26/01 Scalb completely reworked and now standalone version
+// 02/02/00 Initial version
+// 01/26/01 Scalb completely reworked and now standalone version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
@@ -53,8 +55,6 @@
//
//
-#include "libm_support.h"
-
FR_Floating_X = f8
FR_Result = f8
FR_Floating_N = f9
@@ -84,19 +84,8 @@ GR_Parameter_Y = r36
GR_Parameter_RESULT = r37
GR_Tag = r38
-.align 32
-.global scalb
-
.section .text
-.proc scalb
-.align 32
-
-scalb:
-#ifdef _LIBC
-.global __ieee754_scalb
-.type __ieee754_scalb,@function
-__ieee754_scalb:
-#endif
+GLOBAL_IEEE754_ENTRY(scalb)
//
// Is x NAN, INF, ZERO, +-?
@@ -140,12 +129,12 @@ __ieee754_scalb:
{ .mib
setf.exp FR_Big = GR_Scratch
nop.i 0
-(p6) br.cond.spnt L(SCALB_NAN_INF_ZERO)
+(p6) br.cond.spnt SCALB_NAN_INF_ZERO
}
{ .mib
setf.exp FR_NBig = GR_Scratch1
nop.i 0
-(p7) br.cond.spnt L(SCALB_NAN_INF_ZERO)
+(p7) br.cond.spnt SCALB_NAN_INF_ZERO
};;
//
@@ -212,7 +201,7 @@ __ieee754_scalb:
}
{ .mfb
nop.m 0
-(p7) frcpa f8,p11 = f0,f0
+(p7) frcpa.s0 f8,p11 = f0,f0
(p7) br.ret.spnt b0
};;
@@ -246,7 +235,7 @@ __ieee754_scalb:
}
{ .mlx
nop.m 999
-(p0) movl GR_Scratch = 0x00000000000303FF
+ movl GR_Scratch = 0x00000000000303FF
};;
{ .mfi
nop.m 0
@@ -255,7 +244,7 @@ __ieee754_scalb:
}
{ .mlx
nop.m 999
-(p0) movl GR_Scratch1= 0x00000000000103FF
+ movl GR_Scratch1= 0x00000000000103FF
};;
// Set up necessary status fields
@@ -266,12 +255,12 @@ __ieee754_scalb:
//
{ .mfi
nop.m 999
-(p0) fsetc.s3 0x7F,0x41
+ fsetc.s3 0x7F,0x41
nop.i 999
}
{ .mfi
nop.m 999
-(p0) fsetc.s2 0x7F,0x42
+ fsetc.s2 0x7F,0x42
nop.i 999
};;
@@ -345,7 +334,7 @@ __ieee754_scalb:
{ .mfb
(p6) addl GR_Tag = 54, r0
(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig
-(p6) br.cond.spnt L(SCALB_UNDERFLOW)
+(p6) br.cond.spnt SCALB_UNDERFLOW
};;
//
@@ -353,8 +342,8 @@ __ieee754_scalb:
//
{ .mbb
nop.m 0
-(p7) br.cond.spnt L(SCALB_OVERFLOW)
-(p9) br.cond.spnt L(SCALB_OVERFLOW)
+(p7) br.cond.spnt SCALB_OVERFLOW
+(p9) br.cond.spnt SCALB_OVERFLOW
};;
//
@@ -366,7 +355,7 @@ __ieee754_scalb:
br.ret.sptk b0;;
}
-L(SCALB_NAN_INF_ZERO):
+SCALB_NAN_INF_ZERO:
//
// Convert N to a fp integer
@@ -471,16 +460,11 @@ L(SCALB_NAN_INF_ZERO):
br.ret.sptk b0
};;
-.endp scalb
-ASM_SIZE_DIRECTIVE(scalb)
-#ifdef _LIBC
-ASM_SIZE_DIRECTIVE(__ieee754_scalb)
-#endif
-.proc __libm_error_region
+GLOBAL_IEEE754_END(scalb)
__libm_error_region:
-L(SCALB_OVERFLOW):
-L(SCALB_UNDERFLOW):
+SCALB_OVERFLOW:
+SCALB_UNDERFLOW:
//
// Get stack address of N
@@ -557,8 +541,7 @@ L(SCALB_UNDERFLOW):
br.ret.sptk b0
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_scalbf.S b/sysdeps/ia64/fpu/e_scalbf.S
index 40af080d38..07acb3297e 100644
--- a/sysdeps/ia64/fpu/e_scalbf.S
+++ b/sysdeps/ia64/fpu/e_scalbf.S
@@ -1,10 +1,10 @@
.file "scalbf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,12 +35,14 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 1/26/01 Scalb completely reworked and now standalone version
+// 02/02/00 Initial version
+// 01/26/01 Scalb completely reworked and now standalone version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
@@ -53,8 +55,6 @@
//
//
-#include "libm_support.h"
-
FR_Floating_X = f8
FR_Result = f8
FR_Floating_N = f9
@@ -84,19 +84,8 @@ GR_Parameter_Y = r36
GR_Parameter_RESULT = r37
GR_Tag = r38
-.align 32
-.global scalbf
-
.section .text
-.proc scalbf
-.align 32
-
-scalbf:
-#ifdef _LIBC
-.global __ieee754_scalbf
-.type __ieee754_scalbf,@function
-__ieee754_scalbf:
-#endif
+GLOBAL_IEEE754_ENTRY(scalbf)
//
// Is x NAN, INF, ZERO, +-?
@@ -140,12 +129,12 @@ __ieee754_scalbf:
{ .mib
setf.exp FR_Big = GR_Scratch
nop.i 0
-(p6) br.cond.spnt L(SCALBF_NAN_INF_ZERO)
+(p6) br.cond.spnt SCALBF_NAN_INF_ZERO
}
{ .mib
setf.exp FR_NBig = GR_Scratch1
nop.i 0
-(p7) br.cond.spnt L(SCALBF_NAN_INF_ZERO)
+(p7) br.cond.spnt SCALBF_NAN_INF_ZERO
};;
//
@@ -212,7 +201,7 @@ __ieee754_scalbf:
}
{ .mfb
nop.m 0
-(p7) frcpa f8,p11 = f0,f0
+(p7) frcpa.s0 f8,p11 = f0,f0
(p7) br.ret.spnt b0
};;
@@ -246,7 +235,7 @@ __ieee754_scalbf:
}
{ .mlx
nop.m 999
-(p0) movl GR_Scratch = 0x000000000003007F
+ movl GR_Scratch = 0x000000000003007F
};;
{ .mfi
nop.m 0
@@ -255,7 +244,7 @@ __ieee754_scalbf:
}
{ .mlx
nop.m 999
-(p0) movl GR_Scratch1= 0x000000000001007F
+ movl GR_Scratch1= 0x000000000001007F
};;
// Set up necessary status fields
@@ -266,12 +255,12 @@ __ieee754_scalbf:
//
{ .mfi
nop.m 999
-(p0) fsetc.s3 0x7F,0x41
+ fsetc.s3 0x7F,0x41
nop.i 999
}
{ .mfi
nop.m 999
-(p0) fsetc.s2 0x7F,0x42
+ fsetc.s2 0x7F,0x42
nop.i 999
};;
@@ -345,7 +334,7 @@ __ieee754_scalbf:
{ .mfb
(p6) addl GR_Tag = 56, r0
(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig
-(p6) br.cond.spnt L(SCALBF_UNDERFLOW)
+(p6) br.cond.spnt SCALBF_UNDERFLOW
};;
//
@@ -353,8 +342,8 @@ __ieee754_scalbf:
//
{ .mbb
nop.m 0
-(p7) br.cond.spnt L(SCALBF_OVERFLOW)
-(p9) br.cond.spnt L(SCALBF_OVERFLOW)
+(p7) br.cond.spnt SCALBF_OVERFLOW
+(p9) br.cond.spnt SCALBF_OVERFLOW
};;
//
@@ -366,7 +355,7 @@ __ieee754_scalbf:
br.ret.sptk b0;;
}
-L(SCALBF_NAN_INF_ZERO):
+SCALBF_NAN_INF_ZERO:
//
// Convert N to a fp integer
@@ -471,16 +460,11 @@ L(SCALBF_NAN_INF_ZERO):
br.ret.sptk b0
};;
-.endp scalbf
-ASM_SIZE_DIRECTIVE(scalbf)
-#ifdef _LIBC
-ASM_SIZE_DIRECTIVE(__ieee754_scalbf)
-#endif
-.proc __libm_error_region
+GLOBAL_IEEE754_END(scalbf)
__libm_error_region:
-L(SCALBF_OVERFLOW):
-L(SCALBF_UNDERFLOW):
+SCALBF_OVERFLOW:
+SCALBF_UNDERFLOW:
//
// Get stack address of N
@@ -557,8 +541,7 @@ L(SCALBF_UNDERFLOW):
br.ret.sptk b0
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_scalbl.S b/sysdeps/ia64/fpu/e_scalbl.S
index 43eac7a2ad..d22d029155 100644
--- a/sysdeps/ia64/fpu/e_scalbl.S
+++ b/sysdeps/ia64/fpu/e_scalbl.S
@@ -1,10 +1,10 @@
.file "scalbl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,12 +35,14 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 1/26/01 Scalb completely reworked and now standalone version
+// 02/02/00 Initial version
+// 01/26/01 Scalb completely reworked and now standalone version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
@@ -53,8 +55,6 @@
//
//
-#include "libm_support.h"
-
FR_Floating_X = f8
FR_Result = f8
FR_Floating_N = f9
@@ -84,19 +84,8 @@ GR_Parameter_Y = r36
GR_Parameter_RESULT = r37
GR_Tag = r38
-.align 32
-.global scalbl
-
.section .text
-.proc scalbl
-.align 32
-
-scalbl:
-#ifdef _LIBC
-.global __ieee754_scalbl
-.type __ieee754_scalbl,@function
-__ieee754_scalbl:
-#endif
+GLOBAL_IEEE754_ENTRY(scalbl)
//
// Is x NAN, INF, ZERO, +-?
@@ -140,12 +129,12 @@ __ieee754_scalbl:
{ .mib
setf.exp FR_Big = GR_Scratch
nop.i 0
-(p6) br.cond.spnt L(SCALBL_NAN_INF_ZERO)
+(p6) br.cond.spnt SCALBL_NAN_INF_ZERO
}
{ .mib
setf.exp FR_NBig = GR_Scratch1
nop.i 0
-(p7) br.cond.spnt L(SCALBL_NAN_INF_ZERO)
+(p7) br.cond.spnt SCALBL_NAN_INF_ZERO
};;
//
@@ -212,7 +201,7 @@ __ieee754_scalbl:
}
{ .mfb
nop.m 0
-(p7) frcpa f8,p11 = f0,f0
+(p7) frcpa.s0 f8,p11 = f0,f0
(p7) br.ret.spnt b0
};;
@@ -246,7 +235,7 @@ __ieee754_scalbl:
}
{ .mlx
nop.m 999
-(p0) movl GR_Scratch = 0x0000000000033FFF
+ movl GR_Scratch = 0x0000000000033FFF
};;
{ .mfi
nop.m 0
@@ -255,7 +244,7 @@ __ieee754_scalbl:
}
{ .mlx
nop.m 999
-(p0) movl GR_Scratch1= 0x0000000000013FFF
+ movl GR_Scratch1= 0x0000000000013FFF
};;
// Set up necessary status fields
@@ -266,12 +255,12 @@ __ieee754_scalbl:
//
{ .mfi
nop.m 999
-(p0) fsetc.s3 0x7F,0x41
+ fsetc.s3 0x7F,0x41
nop.i 999
}
{ .mfi
nop.m 999
-(p0) fsetc.s2 0x7F,0x42
+ fsetc.s2 0x7F,0x42
nop.i 999
};;
@@ -345,7 +334,7 @@ __ieee754_scalbl:
{ .mfb
(p6) addl GR_Tag = 52, r0
(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig
-(p6) br.cond.spnt L(SCALBL_UNDERFLOW)
+(p6) br.cond.spnt SCALBL_UNDERFLOW
};;
//
@@ -353,8 +342,8 @@ __ieee754_scalbl:
//
{ .mbb
nop.m 0
-(p7) br.cond.spnt L(SCALBL_OVERFLOW)
-(p9) br.cond.spnt L(SCALBL_OVERFLOW)
+(p7) br.cond.spnt SCALBL_OVERFLOW
+(p9) br.cond.spnt SCALBL_OVERFLOW
};;
//
@@ -366,7 +355,7 @@ __ieee754_scalbl:
br.ret.sptk b0;;
}
-L(SCALBL_NAN_INF_ZERO):
+SCALBL_NAN_INF_ZERO:
//
// Convert N to a fp integer
@@ -471,16 +460,11 @@ L(SCALBL_NAN_INF_ZERO):
br.ret.sptk b0
};;
-.endp scalbl
-ASM_SIZE_DIRECTIVE(scalbl)
-#ifdef _LIBC
-ASM_SIZE_DIRECTIVE(__ieee754_scalbl)
-#endif
-.proc __libm_error_region
+GLOBAL_IEEE754_END(scalbl)
__libm_error_region:
-L(SCALBL_OVERFLOW):
-L(SCALBL_UNDERFLOW):
+SCALBL_OVERFLOW:
+SCALBL_UNDERFLOW:
//
// Get stack address of N
@@ -557,8 +541,7 @@ L(SCALBL_UNDERFLOW):
br.ret.sptk b0
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_sinh.S b/sysdeps/ia64/fpu/e_sinh.S
index 4415dc7524..84c312c2b7 100644
--- a/sysdeps/ia64/fpu/e_sinh.S
+++ b/sysdeps/ia64/fpu/e_sinh.S
@@ -1,10 +1,10 @@
.file "sinh.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2002, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,1249 +20,838 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 Initial version
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
// 10/12/00 Update to set denormal operand and underflow flags
-// 1/22/01 Fixed to set inexact flag for small args.
-//
+// 01/22/01 Fixed to set inexact flag for small args.
+// 05/02/01 Reworked to improve speed of all paths
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 11/20/02 Improved speed with new algorithm
+
// API
//==============================================================
-// double = sinh(double)
-// input floating point f8
-// output floating point f8
-//
-// Registers used
-//==============================================================
-// general registers:
-// r32 -> r47
-// predicate registers used:
-// p6 p7 p8 p9
-// floating-point registers used:
-// f9 -> f15; f32 -> f45;
-// f8 has input, then output
-//
+// double sinh(double)
+
// Overview of operation
//==============================================================
-// There are four paths
-// 1. |x| < 0.25 SINH_BY_POLY
-// 2. |x| < 32 SINH_BY_TBL
-// 3. |x| < 2^14 SINH_BY_EXP
-// 4. |x_ >= 2^14 SINH_HUGE
-//
-// For double extended we get infinity for x >= 400c b174 ddc0 31ae c0ea
-// >= 1.0110001.... x 2^13
-// >= 11357.2166
-//
-// But for double we get infinity for x >= 408633ce8fb9f87e
-// >= 1.0110...x 2^9
-// >= +7.10476e+002
-//
-// And for single we get infinity for x >= 42b3a496
-// >= 1.0110... 2^6
-// >= 89.8215
+// Case 1: 0 < |x| < 2^-60
+// Result = x, computed by x+sgn(x)*x^2 to handle flags and rounding
//
-// SAFE: If there is danger of overflow set SAFE to 0
-// NOT implemented: if there is danger of underflow, set SAFE to 0
-// SAFE for all paths listed below
+// Case 2: 2^-60 < |x| < 0.25
+// Evaluate sinh(x) by a 13th order polynomial
+// Care is taken for the order of multiplication; and A1 is not exactly 1/3!,
+// A2 is not exactly 1/5!, etc.
+// sinh(x) = x + (A1*x^3 + A2*x^5 + A3*x^7 + A4*x^9 + A5*x^11 + A6*x^13)
//
-// 1. SINH_BY_POLY
-// ===============
-// If |x| is less than the tiny threshold, then clear SAFE
-// For double, the tiny threshold is -1022 = -0x3fe => -3fe + ffff = fc01
-// register-biased, this is fc01
-// For single, the tiny threshold is -126 = -7e => -7e + ffff = ff81
-// If |x| < tiny threshold, set SAFE = 0
+// Case 3: 0.25 < |x| < 710.47586
+// Algorithm is based on the identity sinh(x) = ( exp(x) - exp(-x) ) / 2.
+// The algorithm for exp is described as below. There are a number of
+// economies from evaluating both exp(x) and exp(-x). Although we
+// are evaluating both quantities, only where the quantities diverge do we
+// duplicate the computations. The basic algorithm for exp(x) is described
+// below.
//
-// 2. SINH_BY_TBL
-// =============
-// SAFE: SAFE is always 1 for TBL;
-//
-// 3. SINH_BY_EXP
-// ==============
-// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe
-// r34 has N-1; 16382 is in register biased form, 0x13ffd
-// There is danger of double overflow if N-1 > 0x3fe
-// in register biased form, 0x103fd
-// Analagously, there is danger of single overflow if N-1 > 0x7e
-// in register biased form, 0x1007d
-// SAFE: If there is danger of overflow set SAFE to 0
-//
-// 4. SINH_HUGE
-// ============
-// SAFE: SAFE is always 0 for HUGE
+// Take the input x. w is "how many log2/128 in x?"
+// w = x * 128/log2
+// n = int(w)
+// x = n log2/128 + r + delta
-#include "libm_support.h"
+// n = 128M + index_1 + 2^4 index_2
+// x = M log2 + (log2/128) index_1 + (log2/8) index_2 + r + delta
-//
-// Assembly macros
-//==============================================================
-sinh_FR_X = f44
-sinh_FR_X2 = f9
-sinh_FR_X4 = f10
-sinh_FR_SGNX = f40
-sinh_FR_all_ones = f45
-sinh_FR_tmp = f42
-
-sinh_FR_Inv_log2by64 = f9
-sinh_FR_log2by64_lo = f11
-sinh_FR_log2by64_hi = f10
-
-sinh_FR_A1 = f9
-sinh_FR_A2 = f10
-sinh_FR_A3 = f11
-
-sinh_FR_Rcub = f12
-sinh_FR_M_temp = f13
-sinh_FR_R_temp = f13
-sinh_FR_Rsq = f13
-sinh_FR_R = f14
-
-sinh_FR_M = f38
-
-sinh_FR_B1 = f15
-sinh_FR_B2 = f32
-sinh_FR_B3 = f33
-
-sinh_FR_peven_temp1 = f34
-sinh_FR_peven_temp2 = f35
-sinh_FR_peven = f36
-
-sinh_FR_podd_temp1 = f34
-sinh_FR_podd_temp2 = f35
-sinh_FR_podd = f37
-
-sinh_FR_poly_podd_temp1 = f11
-sinh_FR_poly_podd_temp2 = f13
-sinh_FR_poly_peven_temp1 = f11
-sinh_FR_poly_peven_temp2 = f13
+// exp(x) = 2^M 2^(index_1/128) 2^(index_2/8) exp(r) exp(delta)
+// Construct 2^M
+// Get 2^(index_1/128) from table_1;
+// Get 2^(index_2/8) from table_2;
+// Calculate exp(r) by 5th order polynomial
+// r = x - n (log2/128)_high
+// delta = - n (log2/128)_low
+// Calculate exp(delta) as 1 + delta
-sinh_FR_J_temp = f9
-sinh_FR_J = f10
-sinh_FR_Mmj = f39
-
-sinh_FR_N_temp1 = f11
-sinh_FR_N_temp2 = f12
-sinh_FR_N = f13
-
-sinh_FR_spos = f14
-sinh_FR_sneg = f15
-
-sinh_FR_Tjhi = f32
-sinh_FR_Tjlo = f33
-sinh_FR_Tmjhi = f34
-sinh_FR_Tmjlo = f35
-
-sinh_GR_mJ = r35
-sinh_GR_J = r36
-
-sinh_AD_mJ = r38
-sinh_AD_J = r39
-sinh_GR_all_ones = r40
-
-sinh_FR_S_hi = f9
-sinh_FR_S_hi_temp = f10
-sinh_FR_S_lo_temp1 = f11
-sinh_FR_S_lo_temp2 = f12
-sinh_FR_S_lo_temp3 = f13
-
-sinh_FR_S_lo = f38
-sinh_FR_C_hi = f39
+// Special values
+//==============================================================
+// sinh(+0) = +0
+// sinh(-0) = -0
-sinh_FR_C_hi_temp1 = f10
-sinh_FR_Y_hi = f11
-sinh_FR_Y_lo_temp = f12
-sinh_FR_Y_lo = f13
-sinh_FR_SINH = f9
+// sinh(+qnan) = +qnan
+// sinh(-qnan) = -qnan
+// sinh(+snan) = +qnan
+// sinh(-snan) = -qnan
-sinh_FR_P1 = f14
-sinh_FR_P2 = f15
-sinh_FR_P3 = f32
-sinh_FR_P4 = f33
-sinh_FR_P5 = f34
-sinh_FR_P6 = f35
+// sinh(-inf) = -inf
+// sinh(+inf) = +inf
-sinh_FR_TINY_THRESH = f9
+// Overflow and Underflow
+//=======================
+// sinh(x) = largest double normal when
+// |x| = 710.47586 = 0x408633ce8fb9f87d
+//
+// Underflow is handled as described in case 1 above
-sinh_FR_SINH_temp = f10
-sinh_FR_SCALE = f11
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input, output
+// f6 -> f15, f32 -> f61
-sinh_FR_signed_hi_lo = f10
+// General registers used:
+// r14 -> r40
+// Predicate registers used:
+// p6 -> p15
-GR_SAVE_PFS = r41
-GR_SAVE_B0 = r42
-GR_SAVE_GP = r43
+// Assembly macros
+//==============================================================
-GR_Parameter_X = r44
-GR_Parameter_Y = r45
-GR_Parameter_RESULT = r46
+rRshf = r14
+rN_neg = r14
+rAD_TB1 = r15
+rAD_TB2 = r16
+rAD_P = r17
+rN = r18
+rIndex_1 = r19
+rIndex_2_16 = r20
+rM = r21
+rBiased_M = r21
+rSig_inv_ln2 = r22
+rIndex_1_neg = r22
+rExp_bias = r23
+rExp_bias_minus_1 = r23
+rExp_mask = r24
+rTmp = r24
+rGt_ln = r24
+rIndex_2_16_neg = r24
+rM_neg = r25
+rBiased_M_neg = r25
+rRshf_2to56 = r26
+rAD_T1_neg = r26
+rExp_2tom56 = r28
+rAD_T2_neg = r28
+rAD_T1 = r29
+rAD_T2 = r30
+rSignexp_x = r31
+rExp_x = r31
+
+GR_SAVE_B0 = r33
+GR_SAVE_PFS = r34
+GR_SAVE_GP = r35
+
+GR_Parameter_X = r37
+GR_Parameter_Y = r38
+GR_Parameter_RESULT = r39
+GR_Parameter_TAG = r40
+
+
+FR_X = f10
+FR_Y = f1
+FR_RESULT = f8
+
+fRSHF_2TO56 = f6
+fINV_LN2_2TO63 = f7
+fW_2TO56_RSH = f9
+f2TOM56 = f11
+fP5 = f12
+fP4 = f13
+fP3 = f14
+fP2 = f15
+
+fLn2_by_128_hi = f33
+fLn2_by_128_lo = f34
+
+fRSHF = f35
+fNfloat = f36
+fNormX = f37
+fR = f38
+fF = f39
+
+fRsq = f40
+f2M = f41
+fS1 = f42
+fT1 = f42
+fS2 = f43
+fT2 = f43
+fS = f43
+fWre_urm_f8 = f44
+fAbsX = f44
+
+fMIN_DBL_OFLOW_ARG = f45
+fMAX_DBL_NORM_ARG = f46
+fXsq = f47
+fX4 = f48
+fGt_pln = f49
+fTmp = f49
+
+fP54 = f50
+fP5432 = f50
+fP32 = f51
+fP = f52
+fP54_neg = f53
+fP5432_neg = f53
+fP32_neg = f54
+fP_neg = f55
+fF_neg = f56
+
+f2M_neg = f57
+fS1_neg = f58
+fT1_neg = f58
+fS2_neg = f59
+fT2_neg = f59
+fS_neg = f59
+fExp = f60
+fExp_neg = f61
+
+fA6 = f50
+fA65 = f50
+fA6543 = f50
+fA654321 = f50
+fA5 = f51
+fA4 = f52
+fA43 = f52
+fA3 = f53
+fA2 = f54
+fA21 = f54
+fA1 = f55
+fX3 = f56
// Data tables
//==============================================================
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
-
+RODATA
.align 16
-double_sinh_arg_reduction:
-ASM_TYPE_DIRECTIVE(double_sinh_arg_reduction,@object)
- data8 0xB8AA3B295C17F0BC, 0x00004005
- data8 0xB17217F7D1000000, 0x00003FF8
- data8 0xCF79ABC9E3B39804, 0x00003FD0
-ASM_SIZE_DIRECTIVE(double_sinh_arg_reduction)
-
-double_sinh_p_table:
-ASM_TYPE_DIRECTIVE(double_sinh_p_table,@object)
- data8 0xAAAAAAAAAAAAAAAB, 0x00003FFC
- data8 0x8888888888888412, 0x00003FF8
- data8 0xD00D00D00D4D39F2, 0x00003FF2
- data8 0xB8EF1D28926D8891, 0x00003FEC
- data8 0xD732377688025BE9, 0x00003FE5
- data8 0xB08AF9AE78C1239F, 0x00003FDE
-ASM_SIZE_DIRECTIVE(double_sinh_p_table)
-
-double_sinh_ab_table:
-ASM_TYPE_DIRECTIVE(double_sinh_ab_table,@object)
- data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC
- data8 0x88888888884ECDD5, 0x00003FF8
- data8 0xD00D0C6DCC26A86B, 0x00003FF2
- data8 0x8000000000000002, 0x00003FFE
- data8 0xAAAAAAAAAA402C77, 0x00003FFA
- data8 0xB60B6CC96BDB144D, 0x00003FF5
-ASM_SIZE_DIRECTIVE(double_sinh_ab_table)
-
-double_sinh_j_table:
-ASM_TYPE_DIRECTIVE(double_sinh_j_table,@object)
- data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000
- data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000
- data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000
- data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000
- data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000
- data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000
- data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000
- data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000
- data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000
- data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000
- data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000
- data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000
- data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000
- data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000
- data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 0x00000000
- data8 0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000
- data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000
- data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000
- data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000
- data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000
- data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000
- data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000
- data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000
- data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000
- data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000
- data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000
- data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000
- data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000
- data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000
- data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000
- data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000
- data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000
- data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000
- data8 0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000
- data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000
- data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000
- data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000
- data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000
- data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000
- data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000
- data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000
- data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000
- data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000
- data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000
- data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000
- data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000
- data8 0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 0x00000000
- data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000
- data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000
- data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000
- data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000
- data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000
- data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000
- data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000
- data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000
- data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000
- data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000
- data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000
- data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000
- data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000
- data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000
- data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000
- data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000
- data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000
- data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000
-ASM_SIZE_DIRECTIVE(double_sinh_j_table)
-
-.align 32
-.global sinh#
-.section .text
-.proc sinh#
-.align 32
-
-sinh:
-#ifdef _LIBC
-.global __ieee754_sinh
-.type __ieee754_sinh,@function
-__ieee754_sinh:
-#endif
-
-// X infinity or NAN?
-// Take invalid fault if enabled
+// ************* DO NOT CHANGE ORDER OF THESE TABLES ********************
+// double-extended 1/ln(2)
+// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88
+// 3fff b8aa 3b29 5c17 f0bc
+// For speed the significand will be loaded directly with a movl and setf.sig
+// and the exponent will be bias+63 instead of bias+0. Thus subsequent
+// computations need to scale appropriately.
+// The constant 128/ln(2) is needed for the computation of w. This is also
+// obtained by scaling the computations.
+//
+// Two shifting constants are loaded directly with movl and setf.d.
+// 1. fRSHF_2TO56 = 1.1000..00 * 2^(63+56)
+// This constant is added to x*1/ln2 to shift the integer part of
+// x*128/ln2 into the rightmost bits of the significand.
+// The result of this fma is fW_2TO56_RSH.
+// 2. fRSHF = 1.1000..00 * 2^(63)
+// This constant is subtracted from fW_2TO56_RSH * 2^(-56) to give
+// the integer part of w, n, as a floating-point number.
+// The result of this fms is fNfloat.
+
+
+LOCAL_OBJECT_START(exp_table_1)
+data8 0x408633ce8fb9f87e // smallest dbl overflow arg
+data8 0x408633ce8fb9f87d // largest dbl arg to give normal dbl result
+data8 0xb17217f7d1cf79ab , 0x00003ff7 // ln2/128 hi
+data8 0xc9e3b39803f2f6af , 0x00003fb7 // ln2/128 lo
+//
+// Table 1 is 2^(index_1/128) where
+// index_1 goes from 0 to 15
+//
+data8 0x8000000000000000 , 0x00003FFF
+data8 0x80B1ED4FD999AB6C , 0x00003FFF
+data8 0x8164D1F3BC030773 , 0x00003FFF
+data8 0x8218AF4373FC25EC , 0x00003FFF
+data8 0x82CD8698AC2BA1D7 , 0x00003FFF
+data8 0x8383594EEFB6EE37 , 0x00003FFF
+data8 0x843A28C3ACDE4046 , 0x00003FFF
+data8 0x84F1F656379C1A29 , 0x00003FFF
+data8 0x85AAC367CC487B15 , 0x00003FFF
+data8 0x8664915B923FBA04 , 0x00003FFF
+data8 0x871F61969E8D1010 , 0x00003FFF
+data8 0x87DB357FF698D792 , 0x00003FFF
+data8 0x88980E8092DA8527 , 0x00003FFF
+data8 0x8955EE03618E5FDD , 0x00003FFF
+data8 0x8A14D575496EFD9A , 0x00003FFF
+data8 0x8AD4C6452C728924 , 0x00003FFF
+LOCAL_OBJECT_END(exp_table_1)
+
+// Table 2 is 2^(index_2/8) where
+// index_2 goes from 0 to 7
+LOCAL_OBJECT_START(exp_table_2)
+data8 0x8000000000000000 , 0x00003FFF
+data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF
+data8 0x9837F0518DB8A96F , 0x00003FFF
+data8 0xA5FED6A9B15138EA , 0x00003FFF
+data8 0xB504F333F9DE6484 , 0x00003FFF
+data8 0xC5672A115506DADD , 0x00003FFF
+data8 0xD744FCCAD69D6AF4 , 0x00003FFF
+data8 0xEAC0C6E7DD24392F , 0x00003FFF
+LOCAL_OBJECT_END(exp_table_2)
+
+
+LOCAL_OBJECT_START(exp_p_table)
+data8 0x3f8111116da21757 //P5
+data8 0x3fa55555d787761c //P4
+data8 0x3fc5555555555414 //P3
+data8 0x3fdffffffffffd6a //P2
+LOCAL_OBJECT_END(exp_p_table)
+
+LOCAL_OBJECT_START(sinh_p_table)
+data8 0xB08AF9AE78C1239F, 0x00003FDE // A6
+data8 0xB8EF1D28926D8891, 0x00003FEC // A4
+data8 0x8888888888888412, 0x00003FF8 // A2
+data8 0xD732377688025BE9, 0x00003FE5 // A5
+data8 0xD00D00D00D4D39F2, 0x00003FF2 // A3
+data8 0xAAAAAAAAAAAAAAAB, 0x00003FFC // A1
+LOCAL_OBJECT_END(sinh_p_table)
-{ .mfi
- alloc r32 = ar.pfs,0,12,4,0
-(p0) fclass.m.unc p6,p0 = f8, 0xe3 //@qnan | @snan | @inf
- mov sinh_GR_all_ones = -1
-}
-;;
+.section .text
+GLOBAL_IEEE754_ENTRY(sinh)
-{ .mfb
- nop.m 999
-(p6) fma.d.s0 f8 = f8,f1,f8
-(p6) br.ret.spnt b0 ;;
+{ .mlx
+ getf.exp rSignexp_x = f8 // Must recompute if x unorm
+ movl rSig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
}
-
-// Put 0.25 in f9; p6 true if x < 0.25
-// Make constant that will generate inexact when squared
{ .mlx
- setf.sig sinh_FR_all_ones = sinh_GR_all_ones
-(p0) movl r32 = 0x000000000000fffd ;;
+ addl rAD_TB1 = @ltoff(exp_table_1), gp
+ movl rRshf_2to56 = 0x4768000000000000 // 1.10000 2^(63+56)
}
+;;
{ .mfi
-(p0) setf.exp f9 = r32
-(p0) fclass.m.unc p7,p0 = f8, 0x07 //@zero
- nop.i 999 ;;
+ ld8 rAD_TB1 = [rAD_TB1]
+ fclass.m p6,p0 = f8,0x0b // Test for x=unorm
+ mov rExp_mask = 0x1ffff
}
-
-{ .mfb
- nop.m 999
-(p0) fmerge.s sinh_FR_X = f0,f8
-(p7) br.ret.spnt b0 ;;
-}
-
-// Identify denormal operands.
{ .mfi
- nop.m 999
- fclass.m.unc p10,p0 = f8, 0x09 // + denorm
- nop.i 999
-};;
-{ .mfi
- nop.m 999
- fclass.m.unc p11,p0 = f8, 0x0a // - denorm
- nop.i 999
+ mov rExp_bias = 0xffff
+ fnorm.s1 fNormX = f8
+ mov rExp_2tom56 = 0xffff-56
}
+;;
+
+// Form two constants we need
+// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128
+// 1.1000..000 * 2^(63+63-7) to right shift int(w) into the significand
{ .mfi
- nop.m 999
-(p0) fmerge.s sinh_FR_SGNX = f8,f1
- nop.i 999 ;;
+ setf.sig fINV_LN2_2TO63 = rSig_inv_ln2 // form 1/ln2 * 2^63
+ fclass.m p8,p0 = f8,0x07 // Test for x=0
+ nop.i 999
}
+{ .mlx
+ setf.d fRSHF_2TO56 = rRshf_2to56 // Form const 1.100 * 2^(63+56)
+ movl rRshf = 0x43e8000000000000 // 1.10000 2^63 for right shift
+}
+;;
{ .mfi
- nop.m 999
-(p0) fcmp.lt.unc.s1 p0,p7 = sinh_FR_X,f9
- nop.i 999 ;;
+ ldfpd fMIN_DBL_OFLOW_ARG, fMAX_DBL_NORM_ARG = [rAD_TB1],16
+ fclass.m p10,p0 = f8,0x1e3 // Test for x=inf, nan, NaT
+ nop.i 0
}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p7) br.cond.sptk L(SINH_BY_TBL) ;;
-}
-
-
-L(SINH_BY_POLY):
-
-// POLY cannot overflow so there is no need to call __libm_error_support
-// Set tiny_SAFE (p7) to 1(0) if answer is not tiny
-// Currently we do not use tiny_SAFE. So the setting of tiny_SAFE is
-// commented out.
-//(p0) movl r32 = 0x000000000000fc01
-//(p0) setf.exp f10 = r32
-//(p0) fcmp.lt.unc.s1 p6,p7 = f8,f10
-// Here is essentially the algorithm for SINH_BY_POLY. Care is take for the order
-// of multiplication; and P_1 is not exactly 1/3!, P_2 is not exactly 1/5!, etc.
-// Note that ax = |x|
-// sinh(x) = sign * (series(e^x) - series(e^-x))/2
-// = sign * (ax + ax^3/3! + ax^5/5! + ax^7/7! + ax^9/9! + ax^11/11! + ax^13/13!)
-// = sign * (ax + ax * ( ax^2 * (1/3! + ax^4 * (1/7! + ax^4*1/11!)) )
-// + ax * ( ax^4 * (1/5! + ax^4 * (1/9! + ax^4*1/13!)) ) )
-// = sign * (ax + ax*p_odd + (ax*p_even))
-// = sign * (ax + Y_lo)
-// sinh(x) = sign * (Y_hi + Y_lo)
-// Get the values of P_x from the table
{ .mfb
-(p0) addl r34 = @ltoff(double_sinh_p_table), gp
-(p10) fma.d.s0 f8 = f8,f8,f8
-(p10) br.ret.spnt b0
+ setf.exp f2TOM56 = rExp_2tom56 // form 2^-56 for scaling Nfloat
+ nop.f 0
+(p6) br.cond.spnt SINH_UNORM // Branch if x=unorm
}
;;
+SINH_COMMON:
+{ .mfi
+ ldfe fLn2_by_128_hi = [rAD_TB1],16
+ nop.f 0
+ nop.i 0
+}
{ .mfb
- ld8 r34 = [r34]
-(p11) fnma.d.s0 f8 = f8,f8,f8
-(p11) br.ret.spnt b0
+ setf.d fRSHF = rRshf // Form right shift const 1.100 * 2^63
+ nop.f 0
+(p8) br.ret.spnt b0 // Exit for x=0, result=x
}
;;
-// Calculate sinh_FR_X2 = ax*ax and sinh_FR_X4 = ax*ax*ax*ax
-{ .mmf
- nop.m 999
-(p0) ldfe sinh_FR_P1 = [r34],16
-(p0) fma.s1 sinh_FR_X2 = sinh_FR_X, sinh_FR_X, f0 ;;
-}
-
-{ .mmi
-(p0) ldfe sinh_FR_P2 = [r34],16 ;;
-(p0) ldfe sinh_FR_P3 = [r34],16
- nop.i 999 ;;
+{ .mfi
+ ldfe fLn2_by_128_lo = [rAD_TB1],16
+ nop.f 0
+ nop.i 0
}
-
-{ .mmi
-(p0) ldfe sinh_FR_P4 = [r34],16 ;;
-(p0) ldfe sinh_FR_P5 = [r34],16
- nop.i 999 ;;
+{ .mfb
+ and rExp_x = rExp_mask, rSignexp_x // Biased exponent of x
+(p10) fma.d.s0 f8 = f8,f1,f0 // Result if x=inf, nan, NaT
+(p10) br.ret.spnt b0 // quick exit for x=inf, nan, NaT
}
+;;
+// After that last load rAD_TB1 points to the beginning of table 1
{ .mfi
-(p0) ldfe sinh_FR_P6 = [r34],16
-(p0) fma.s1 sinh_FR_X4 = sinh_FR_X2, sinh_FR_X2, f0
- nop.i 999 ;;
+ nop.m 0
+ fcmp.eq.s0 p6,p0 = f8, f0 // Dummy to set D
+ sub rExp_x = rExp_x, rExp_bias // True exponent of x
}
+;;
-// Calculate sinh_FR_podd = p_odd and sinh_FR_peven = p_even
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_poly_podd_temp1 = sinh_FR_X4, sinh_FR_P5, sinh_FR_P3
- nop.i 999 ;;
+ nop.m 0
+ fmerge.s fAbsX = f0, fNormX // Form |x|
+ nop.i 0
}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_poly_podd_temp2 = sinh_FR_X4, sinh_FR_poly_podd_temp1, sinh_FR_P1
- nop.i 999
+{ .mfb
+ cmp.gt p7, p0 = -2, rExp_x // Test |x| < 2^(-2)
+ fma.s1 fXsq = fNormX, fNormX, f0 // x*x for small path
+(p7) br.cond.spnt SINH_SMALL // Branch if 0 < |x| < 2^-2
}
+;;
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_poly_peven_temp1 = sinh_FR_X4, sinh_FR_P6, sinh_FR_P4
- nop.i 999 ;;
-}
+// W = X * Inv_log2_by_128
+// By adding 1.10...0*2^63 we shift and get round_int(W) in significand.
+// We actually add 1.10...0*2^(63+56) to X * (Inv_log2 * 2^63) to do the same thing.
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_podd = sinh_FR_X2, sinh_FR_poly_podd_temp2, f0
- nop.i 999
+ add rAD_P = 0x180, rAD_TB1
+ fma.s1 fW_2TO56_RSH = fNormX, fINV_LN2_2TO63, fRSHF_2TO56
+ add rAD_TB2 = 0x100, rAD_TB1
}
+;;
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_poly_peven_temp2 = sinh_FR_X4, sinh_FR_poly_peven_temp1, sinh_FR_P2
- nop.i 999 ;;
-}
+// Divide arguments into the following categories:
+// Certain Safe - 0.25 <= |x| <= MAX_DBL_NORM_ARG
+// Possible Overflow p14 - MAX_DBL_NORM_ARG < |x| < MIN_DBL_OFLOW_ARG
+// Certain Overflow p15 - MIN_DBL_OFLOW_ARG <= |x| < +inf
+//
+// If the input is really a double arg, then there will never be
+// "Possible Overflow" arguments.
+//
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_peven = sinh_FR_X4, sinh_FR_poly_peven_temp2, f0
- nop.i 999 ;;
+ ldfpd fP5, fP4 = [rAD_P] ,16
+ fcmp.ge.s1 p15,p14 = fAbsX,fMIN_DBL_OFLOW_ARG
+ nop.i 0
}
+;;
-// Calculate sinh_FR_Y_lo = ax*p_odd + (ax*p_even)
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_X, sinh_FR_peven, f0
- nop.i 999 ;;
-}
+// Nfloat = round_int(W)
+// The significand of fW_2TO56_RSH contains the rounded integer part of W,
+// as a twos complement number in the lower bits (that is, it may be negative).
+// That twos complement number (called N) is put into rN.
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_X, sinh_FR_podd, sinh_FR_Y_lo_temp
- nop.i 999 ;;
-}
+// Since fW_2TO56_RSH is scaled by 2^56, it must be multiplied by 2^-56
+// before the shift constant 1.10000 * 2^63 is subtracted to yield fNfloat.
+// Thus, fNfloat contains the floating point version of N
-// Calculate sinh_FR_SINH = Y_hi + Y_lo. Note that ax = Y_hi
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_SINH = sinh_FR_X, f1, sinh_FR_Y_lo
- nop.i 999 ;;
+ ldfpd fP3, fP2 = [rAD_P]
+(p14) fcmp.gt.unc.s1 p14,p0 = fAbsX,fMAX_DBL_NORM_ARG
+ nop.i 0
}
-// Dummy multiply to generate inexact
-{ .mfi
- nop.m 999
-(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones
- nop.i 999
-}
-
-// Calculate f8 = sign * (Y_hi + Y_lo)
-// Go to return
{ .mfb
- nop.m 999
-(p0) fma.d.s0 f8 = sinh_FR_SGNX,sinh_FR_SINH,f0
-(p0) br.ret.sptk b0 ;;
+ nop.m 0
+ fms.s1 fNfloat = fW_2TO56_RSH, f2TOM56, fRSHF
+(p15) br.cond.spnt SINH_CERTAIN_OVERFLOW
}
+;;
-
-L(SINH_BY_TBL):
-
-// Now that we are at TBL; so far all we know is that |x| >= 0.25.
-// The first two steps are the same for TBL and EXP, but if we are HUGE
-// we want to leave now.
-// Double-extended:
-// Go to HUGE if |x| >= 2^14, 1000d (register-biased) is e = 14 (true)
-// Double
-// Go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true)
-// Single
-// Go to HUGE if |x| >= 2^7, 10006 (register-biased) is e = 7 (true)
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x0000000000010009 ;;
+{ .mfi
+ getf.sig rN = fW_2TO56_RSH
+ nop.f 0
+ mov rExp_bias_minus_1 = 0xfffe
}
+;;
+// rIndex_1 has index_1
+// rIndex_2_16 has index_2 * 16
+// rBiased_M has M
+
+// rM has true M
+// r = x - Nfloat * ln2_by_128_hi
+// f = 1 - Nfloat * ln2_by_128_lo
{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
+ and rIndex_1 = 0x0f, rN
+ fnma.s1 fR = fNfloat, fLn2_by_128_hi, fNormX
+ shr rM = rN, 0x7
}
-
{ .mfi
- nop.m 999
-(p0) fcmp.ge.unc.s1 p6,p7 = sinh_FR_X,f9
- nop.i 999 ;;
+ and rIndex_2_16 = 0x70, rN
+ fnma.s1 fF = fNfloat, fLn2_by_128_lo, f1
+ sub rN_neg = r0, rN
}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p6) br.cond.spnt L(SINH_HUGE) ;;
-}
-
-// r32 = 1
-// r34 = N-1
-// r35 = N
-// r36 = j
-// r37 = N+1
-
-// TBL can never overflow
-// sinh(x) = sinh(B+R)
-// = sinh(B)cosh(R) + cosh(B)sinh(R)
-//
-// ax = |x| = M*log2/64 + R
-// B = M*log2/64
-// M = 64*N + j
-// We will calcualte M and get N as (M-j)/64
-// The division is a shift.
-// exp(B) = exp(N*log2 + j*log2/64)
-// = 2^N * 2^(j*log2/64)
-// sinh(B) = 1/2(e^B -e^-B)
-// = 1/2(2^N * 2^(j*log2/64) - 2^-N * 2^(-j*log2/64))
-// sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64))
-// cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64))
-// 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32
-// Tjhi is double-extended (80-bit) and Tjlo is single(32-bit)
-// R = ax - M*log2/64
-// R = ax - M*log2_by_64_hi - M*log2_by_64_lo
-// exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...)
-// = 1 + p_odd + p_even
-// where the p_even uses the A coefficients and the p_even uses the B coefficients
-// So sinh(R) = 1 + p_odd + p_even -(1 -p_odd -p_even)/2 = p_odd
-// cosh(R) = 1 + p_even
-// sinh(B) = S_hi + S_lo
-// cosh(B) = C_hi
-// sinh(x) = sinh(B)cosh(R) + cosh(B)sinh(R)
-// ******************************************************
-// STEP 1 (TBL and EXP)
-// ******************************************************
-// Get the following constants.
-// f9 = Inv_log2by64
-// f10 = log2by64_hi
-// f11 = log2by64_lo
+;;
{ .mmi
-(p0) adds r32 = 0x1,r0
-(p0) addl r34 = @ltoff(double_sinh_arg_reduction), gp
- nop.i 999
+ and rIndex_1_neg = 0x0f, rN_neg
+ add rBiased_M = rExp_bias_minus_1, rM
+ shr rM_neg = rN_neg, 0x7
}
-;;
-
{ .mmi
- ld8 r34 = [r34]
- nop.m 999
- nop.i 999
+ and rIndex_2_16_neg = 0x70, rN_neg
+ add rAD_T2 = rAD_TB2, rIndex_2_16
+ shladd rAD_T1 = rIndex_1, 4, rAD_TB1
}
;;
-
-// We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and
-// put them in an exponent.
-// sinh_FR_spos = 2^(N-1) and sinh_FR_sneg = 2^(-N-1)
-// r39 = 0xffff + (N-1) = 0xffff +N -1
-// r40 = 0xffff - (N +1) = 0xffff -N -1
-
-{ .mlx
- nop.m 999
-(p0) movl r38 = 0x000000000000fffe ;;
-}
+// rAD_T1 has address of T1
+// rAD_T2 has address of T2
{ .mmi
-(p0) ldfe sinh_FR_Inv_log2by64 = [r34],16 ;;
-(p0) ldfe sinh_FR_log2by64_hi = [r34],16
- nop.i 999 ;;
-}
-
-{ .mbb
-(p0) ldfe sinh_FR_log2by64_lo = [r34],16
- nop.b 999
- nop.b 999 ;;
+ setf.exp f2M = rBiased_M
+ ldfe fT2 = [rAD_T2]
+ nop.i 0
}
-
-// Get the A coefficients
-// f9 = A_1
-// f10 = A_2
-// f11 = A_3
-
{ .mmi
- nop.m 999
-(p0) addl r34 = @ltoff(double_sinh_ab_table), gp
- nop.i 999
+ add rBiased_M_neg = rExp_bias_minus_1, rM_neg
+ add rAD_T2_neg = rAD_TB2, rIndex_2_16_neg
+ shladd rAD_T1_neg = rIndex_1_neg, 4, rAD_TB1
}
;;
+// Create Scale = 2^M
+// Load T1 and T2
{ .mmi
- ld8 r34 = [r34]
- nop.m 999
- nop.i 999
+ ldfe fT1 = [rAD_T1]
+ nop.m 0
+ nop.i 0
}
-;;
-
-
-// Calculate M and keep it as integer and floating point.
-// f38 = M = round-to-integer(x*Inv_log2by64)
-// sinh_FR_M = M = truncate(ax/(log2/64))
-// Put the significand of M in r35
-// and the floating point representation of M in sinh_FR_M
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_M = sinh_FR_X, sinh_FR_Inv_log2by64, f0
- nop.i 999
-}
-
-{ .mfi
-(p0) ldfe sinh_FR_A1 = [r34],16
- nop.f 999
- nop.i 999 ;;
+{ .mmf
+ setf.exp f2M_neg = rBiased_M_neg
+ ldfe fT2_neg = [rAD_T2_neg]
+ fma.s1 fF_neg = fNfloat, fLn2_by_128_lo, f1
}
+;;
{ .mfi
- nop.m 999
-(p0) fcvt.fx.s1 sinh_FR_M_temp = sinh_FR_M
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fRsq = fR, fR, f0
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fnorm.s1 sinh_FR_M = sinh_FR_M_temp
- nop.i 999 ;;
+ ldfe fT1_neg = [rAD_T1_neg]
+ fma.s1 fP54 = fR, fP5, fP4
+ nop.i 0
}
+;;
{ .mfi
-(p0) getf.sig r35 = sinh_FR_M_temp
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fP32 = fR, fP3, fP2
+ nop.i 0
}
-
-// M is still in r35. Calculate j. j is the signed extension of the six lsb of M. It
-// has a range of -32 thru 31.
-// r35 = M
-// r36 = j
-
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p0) and r36 = 0x3f, r35 ;;
-}
-
-// Calculate R
-// f13 = f44 - f12*f10 = ax - M*log2by64_hi
-// f14 = f13 - f8*f11 = R = (ax - M*log2by64_hi) - M*log2by64_lo
-
{ .mfi
- nop.m 999
-(p0) fnma.s1 sinh_FR_R_temp = sinh_FR_M, sinh_FR_log2by64_hi, sinh_FR_X
- nop.i 999
+ nop.m 0
+ fnma.s1 fP54_neg = fR, fP5, fP4
+ nop.i 0
}
+;;
{ .mfi
-(p0) ldfe sinh_FR_A2 = [r34],16
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fnma.s1 fP32_neg = fR, fP3, fP2
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fnma.s1 sinh_FR_R = sinh_FR_M, sinh_FR_log2by64_lo, sinh_FR_R_temp
- nop.i 999
-}
-
-// Get the B coefficients
-// f15 = B_1
-// f32 = B_2
-// f33 = B_3
-
-{ .mmi
-(p0) ldfe sinh_FR_A3 = [r34],16 ;;
-(p0) ldfe sinh_FR_B1 = [r34],16
- nop.i 999 ;;
-}
-
-{ .mmi
-(p0) ldfe sinh_FR_B2 = [r34],16 ;;
-(p0) ldfe sinh_FR_B3 = [r34],16
- nop.i 999 ;;
-}
-
-{ .mii
- nop.m 999
-(p0) shl r34 = r36, 0x2 ;;
-(p0) sxt1 r37 = r34 ;;
+ nop.m 0
+ fma.s1 fP5432 = fRsq, fP54, fP32
+ nop.i 0
}
-
-// ******************************************************
-// STEP 2 (TBL and EXP)
-// ******************************************************
-// Calculate Rsquared and Rcubed in preparation for p_even and p_odd
-// f12 = R*R*R
-// f13 = R*R
-// f14 = R <== from above
-
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Rsq = sinh_FR_R, sinh_FR_R, f0
-(p0) shr r36 = r37, 0x2 ;;
-}
-
-// r34 = M-j = r35 - r36
-// r35 = N = (M-j)/64
-
-{ .mii
-(p0) sub r34 = r35, r36
- nop.i 999 ;;
-(p0) shr r35 = r34, 0x6 ;;
-}
-
-{ .mii
-(p0) sub r40 = r38, r35
-(p0) adds r37 = 0x1, r35
-(p0) add r39 = r38, r35 ;;
-}
-
-// Get the address of the J table, add the offset,
-// addresses are sinh_AD_mJ and sinh_AD_J, get the T value
-// f32 = T(j)_hi
-// f33 = T(j)_lo
-// f34 = T(-j)_hi
-// f35 = T(-j)_lo
-
-{ .mmi
-(p0) sub r34 = r35, r32
-(p0) addl r37 = @ltoff(double_sinh_j_table), gp
- nop.i 999
+ nop.m 0
+ fma.s1 fS2 = fF,fT2,f0
+ nop.i 0
}
;;
-{ .mmi
- ld8 r37 = [r37]
- nop.m 999
- nop.i 999
-}
-;;
-
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Rcub = sinh_FR_Rsq, sinh_FR_R, f0
- nop.i 999
-}
-
-// ******************************************************
-// STEP 3 Now decide if we need to branch to EXP
-// ******************************************************
-// Put 32 in f9; p6 true if x < 32
-// Go to EXP if |x| >= 32
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x0000000000010004 ;;
-}
-
-// Calculate p_even
-// f34 = B_2 + Rsq *B_3
-// f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3)
-// f36 = p_even = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3))
-
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_peven_temp1 = sinh_FR_Rsq, sinh_FR_B3, sinh_FR_B2
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fS1 = f2M,fT1,f0
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_peven_temp2 = sinh_FR_Rsq, sinh_FR_peven_temp1, sinh_FR_B1
- nop.i 999
+ nop.m 0
+ fma.s1 fP5432_neg = fRsq, fP54_neg, fP32_neg
+ nop.i 0
}
-
-// Calculate p_odd
-// f34 = A_2 + Rsq *A_3
-// f35 = A_1 + Rsq * (A_2 + Rsq *A_3)
-// f37 = podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3))
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_podd_temp1 = sinh_FR_Rsq, sinh_FR_A3, sinh_FR_A2
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fS1_neg = f2M_neg,fT1_neg,f0
+ nop.i 0
}
-
{ .mfi
-(p0) setf.exp sinh_FR_N_temp1 = r39
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fS2_neg = fF_neg,fT2_neg,f0
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_peven = sinh_FR_Rsq, sinh_FR_peven_temp2, f0
- nop.i 999
+ nop.m 0
+ fma.s1 fP = fRsq, fP5432, fR
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_podd_temp2 = sinh_FR_Rsq, sinh_FR_podd_temp1, sinh_FR_A1
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fS = fS1,fS2,f0
+ nop.i 0
}
+;;
{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fms.s1 fP_neg = fRsq, fP5432_neg, fR
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_podd = sinh_FR_podd_temp2, sinh_FR_Rcub, sinh_FR_R
- nop.i 999
+ nop.m 0
+ fma.s1 fS_neg = fS1_neg,fS2_neg,f0
+ nop.i 0
}
+;;
-// sinh_GR_mj contains the table offset for -j
-// sinh_GR_j contains the table offset for +j
-// p6 is true when j <= 0
-
-{ .mlx
-(p0) setf.exp sinh_FR_N_temp2 = r40
-(p0) movl r40 = 0x0000000000000020 ;;
+{ .mfb
+ nop.m 0
+ fmpy.s0 fTmp = fLn2_by_128_lo, fLn2_by_128_lo // Force inexact
+(p14) br.cond.spnt SINH_POSSIBLE_OVERFLOW
}
+;;
{ .mfi
-(p0) sub sinh_GR_mJ = r40, r36
-(p0) fmerge.se sinh_FR_spos = sinh_FR_N_temp1, f1
-(p0) adds sinh_GR_J = 0x20, r36 ;;
+ nop.m 0
+ fma.s1 fExp = fS, fP, fS
+ nop.i 0
}
-
-{ .mii
- nop.m 999
-(p0) shl sinh_GR_mJ = sinh_GR_mJ, 5 ;;
-(p0) add sinh_AD_mJ = r37, sinh_GR_mJ ;;
-}
-
-{ .mmi
- nop.m 999
-(p0) ldfe sinh_FR_Tmjhi = [sinh_AD_mJ],16
-(p0) shl sinh_GR_J = sinh_GR_J, 5 ;;
-}
-
{ .mfi
-(p0) ldfs sinh_FR_Tmjlo = [sinh_AD_mJ],16
-(p0) fcmp.lt.unc.s1 p0,p7 = sinh_FR_X,f9
-(p0) add sinh_AD_J = r37, sinh_GR_J ;;
-}
-
-{ .mmi
-(p0) ldfe sinh_FR_Tjhi = [sinh_AD_J],16 ;;
-(p0) ldfs sinh_FR_Tjlo = [sinh_AD_J],16
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fExp_neg = fS_neg, fP_neg, fS_neg
+ nop.i 0
}
+;;
{ .mfb
- nop.m 999
-(p0) fmerge.se sinh_FR_sneg = sinh_FR_N_temp2, f1
-(p7) br.cond.spnt L(SINH_BY_EXP) ;;
+ nop.m 0
+ fms.d.s0 f8 = fExp, f1, fExp_neg
+ br.ret.sptk b0 // Normal path exit
}
+;;
+// Here if 0 < |x| < 0.25
+SINH_SMALL:
{ .mfi
- nop.m 999
- nop.f 999
- nop.i 999 ;;
+ add rAD_T1 = 0x1a0, rAD_TB1
+ fcmp.lt.s1 p7, p8 = fNormX, f0 // Test sign of x
+ cmp.gt p6, p0 = -60, rExp_x // Test |x| < 2^(-60)
}
-
-// ******************************************************
-// If NOT branch to EXP
-// ******************************************************
-// Calculate S_hi and S_lo
-// sinh_FR_S_hi_temp = sinh_FR_sneg * sinh_FR_Tmjhi
-// sinh_FR_S_hi = sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi_temp
-// sinh_FR_S_hi = sinh_FR_spos * sinh_FR_Tjhi - (sinh_FR_sneg * sinh_FR_Tmjlo)
-
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_S_hi_temp = sinh_FR_sneg, sinh_FR_Tmjhi, f0
- nop.i 999 ;;
+ add rAD_T2 = 0x1d0, rAD_TB1
+ nop.f 0
+ nop.i 0
}
+;;
-{ .mfi
- nop.m 999
-(p0) fms.s1 sinh_FR_S_hi = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_S_hi_temp
- nop.i 999
+{ .mmb
+ ldfe fA6 = [rAD_T1],16
+ ldfe fA5 = [rAD_T2],16
+(p6) br.cond.spnt SINH_VERY_SMALL // Branch if |x| < 2^(-60)
}
+;;
-// Calculate C_hi
-// sinh_FR_C_hi_temp1 = sinh_FR_sneg * sinh_FR_Tmjhi
-// sinh_FR_C_hi = sinh_FR_spos * sinh_FR_Tjhi + sinh_FR_C_hi_temp1
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_C_hi_temp1 = sinh_FR_sneg, sinh_FR_Tmjhi, f0
- nop.i 999 ;;
+{ .mmi
+ ldfe fA4 = [rAD_T1],16
+ ldfe fA3 = [rAD_T2],16
+ nop.i 0
}
+;;
-// sinh_FR_S_lo_temp1 = sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi
-// sinh_FR_S_lo_temp2 = -sinh_FR_sneg * sinh_FR_Tmjlo + (sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi)
-// sinh_FR_S_lo_temp2 = -sinh_FR_sneg * sinh_FR_Tmjlo + (sinh_FR_S_lo_temp1 )
-
-{ .mfi
- nop.m 999
-(p0) fms.s1 sinh_FR_S_lo_temp1 = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_S_hi
- nop.i 999
+{ .mmi
+ ldfe fA2 = [rAD_T1]
+ ldfe fA1 = [rAD_T2]
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_C_hi = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_C_hi_temp1
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fX3 = fNormX, fXsq, f0
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fnma.s1 sinh_FR_S_lo_temp2 = sinh_FR_sneg, sinh_FR_Tmjhi, sinh_FR_S_lo_temp1
- nop.i 999
+ nop.m 0
+ fma.s1 fX4 = fXsq, fXsq, f0
+ nop.i 0
}
-
-// sinh_FR_S_lo_temp1 = sinh_FR_sneg * sinh_FR_Tmjlo
-// sinh_FR_S_lo_temp3 = sinh_FR_spos * sinh_FR_Tjlo - sinh_FR_S_lo_temp1
-// sinh_FR_S_lo_temp3 = sinh_FR_spos * sinh_FR_Tjlo -(sinh_FR_sneg * sinh_FR_Tmjlo)
-// sinh_FR_S_lo = sinh_FR_S_lo_temp3 + sinh_FR_S_lo_temp2
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_S_lo_temp1 = sinh_FR_sneg, sinh_FR_Tmjlo, f0
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fA65 = fXsq, fA6, fA5
+ nop.i 0
}
-
-/////////// BUG FIX fma to fms -TK
{ .mfi
- nop.m 999
-(p0) fms.s1 sinh_FR_S_lo_temp3 = sinh_FR_spos, sinh_FR_Tjlo, sinh_FR_S_lo_temp1
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fA43 = fXsq, fA4, fA3
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_S_lo = sinh_FR_S_lo_temp3, f1, sinh_FR_S_lo_temp2
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fA21 = fXsq, fA2, fA1
+ nop.i 0
}
-
-// Y_hi = S_hi
-// Y_lo = C_hi*p_odd + (S_hi*p_even + S_lo)
-// sinh_FR_Y_lo_temp = sinh_FR_S_hi * sinh_FR_peven + sinh_FR_S_lo
-// sinh_FR_Y_lo = sinh_FR_C_hi * sinh_FR_podd + sinh_FR_Y_lo_temp
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_S_hi, sinh_FR_peven, sinh_FR_S_lo
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fA6543 = fX4, fA65, fA43
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_C_hi, sinh_FR_podd, sinh_FR_Y_lo_temp
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fA654321 = fX4, fA6543, fA21
+ nop.i 0
}
-
-// sinh_FR_SINH = Y_hi + Y_lo
-// f8 = answer = sinh_FR_SGNX * sinh_FR_SINH
+;;
// Dummy multiply to generate inexact
{ .mfi
- nop.m 999
-(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones
- nop.i 999
+ nop.m 0
+ fmpy.s0 fTmp = fA6, fA6
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_SINH = sinh_FR_S_hi, f1, sinh_FR_Y_lo
- nop.i 999 ;;
+{ .mfb
+ nop.m 0
+ fma.d.s0 f8 = fA654321, fX3, fNormX
+ br.ret.sptk b0 // Exit if 2^-60 < |x| < 0.25
}
+;;
+SINH_VERY_SMALL:
+// Here if 0 < |x| < 2^-60
+// Compute result by x + sgn(x)*x^2 to get properly rounded result
+.pred.rel "mutex",p7,p8
+{ .mfi
+ nop.m 0
+(p7) fnma.d.s0 f8 = fNormX, fNormX, fNormX // If x<0 result ~ x-x^2
+ nop.i 0
+}
{ .mfb
- nop.m 999
-(p0) fma.d.s0 f8 = sinh_FR_SGNX, sinh_FR_SINH,f0
-(p0) br.ret.sptk b0 ;;
+ nop.m 0
+(p8) fma.d.s0 f8 = fNormX, fNormX, fNormX // If x>0 result ~ x+x^2
+ br.ret.sptk b0 // Exit if |x| < 2^-60
}
+;;
-L(SINH_BY_EXP):
+SINH_POSSIBLE_OVERFLOW:
-// When p7 is true, we know that an overflow is not going to happen
-// When p7 is false, we must check for possible overflow
-// p7 is the over_SAFE flag
-// Y_hi = Tjhi
-// Y_lo = Tjhi * (p_odd + p_even) +Tjlo
-// Scale = sign * 2^(N-1)
-// sinh_FR_Y_lo = sinh_FR_Tjhi * (sinh_FR_peven + sinh_FR_podd)
-// sinh_FR_Y_lo = sinh_FR_Tjhi * (sinh_FR_Y_lo_temp )
+// Here if fMAX_DBL_NORM_ARG < |x| < fMIN_DBL_OFLOW_ARG
+// This cannot happen if input is a double, only if input is higher precision.
+// Overflow is a possibility, not a certainty.
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_peven, f1, sinh_FR_podd
- nop.i 999
-}
-
-// Now we are in EXP. This is the only path where an overflow is possible
-// but not for certain. So this is the only path where over_SAFE has any use.
-// r34 still has N-1
-// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe
-// There is a danger of double overflow if N-1 > 0x3fe = 1022
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x00000000000003fe ;;
-}
+// Recompute result using status field 2 with user's rounding mode,
+// and wre set. If result is larger than largest double, then we have
+// overflow
{ .mfi
-(p0) cmp.gt.unc p0,p7 = r34, r32
-(p0) fmerge.s sinh_FR_SCALE = sinh_FR_SGNX, sinh_FR_spos
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_Tjhi, sinh_FR_Y_lo_temp, sinh_FR_Tjlo
- nop.i 999 ;;
+ mov rGt_ln = 0x103ff // Exponent for largest dbl + 1 ulp
+ fsetc.s2 0x7F,0x42 // Get user's round mode, set wre
+ nop.i 0
}
+;;
-// f8 = answer = scale * (Y_hi + Y_lo)
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_SINH_temp = sinh_FR_Y_lo, f1, sinh_FR_Tjhi
- nop.i 999 ;;
+ setf.exp fGt_pln = rGt_ln // Create largest double + 1 ulp
+ fma.d.s2 fWre_urm_f8 = fS, fP, fS // Result with wre set
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.d.s0 f44 = sinh_FR_SCALE, sinh_FR_SINH_temp, f0
- nop.i 999 ;;
+ nop.m 0
+ fsetc.s2 0x7F,0x40 // Turn off wre in sf2
+ nop.i 0
}
+;;
-// Dummy multiply to generate inexact
{ .mfi
- nop.m 999
-(p7) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones
- nop.i 999 ;;
+ nop.m 0
+ fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow
+ nop.i 0
}
+;;
-// If over_SAFE is set, return
{ .mfb
- nop.m 999
-(p7) fmerge.s f8 = f44,f44
-(p7) br.ret.sptk b0 ;;
+ nop.m 0
+ nop.f 0
+(p6) br.cond.spnt SINH_CERTAIN_OVERFLOW // Branch if overflow
}
+;;
-// Else see if we overflowed
-// S0 user supplied status
-// S2 user supplied status + WRE + TD (Overflows)
-// If WRE is set then an overflow will not occur in EXP.
-// The input value that would cause a register (WRE) value to overflow is about 2^15
-// and this input would go into the HUGE path.
-// Answer with WRE is in f43.
-
-{ .mfi
- nop.m 999
-(p0) fsetc.s2 0x7F,0x42
- nop.i 999;;
+{ .mfb
+ nop.m 0
+ fma.d.s0 f8 = fS, fP, fS
+ br.ret.sptk b0 // Exit if really no overflow
}
+;;
+SINH_CERTAIN_OVERFLOW:
{ .mfi
- nop.m 999
-(p0) fma.d.s2 f43 = sinh_FR_SCALE, sinh_FR_SINH_temp, f0
- nop.i 999 ;;
-}
-
-// 103FF => 103FF -FFFF = 400(true)
-// 400 + 3FF = 7FF, which is 1 more that the exponent of the largest
-// double (7FE). So 0 103FF 8000000000000000 is one ulp more than
-// largest double in register bias
-// Now set p8 if the answer with WRE is greater than or equal this value
-// Also set p9 if the answer with WRE is less than or equal to negative this value
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x000000000103FF ;;
+ sub rTmp = rExp_mask, r0, 1
+ fcmp.lt.s1 p6, p7 = fNormX, f0 // Test for x < 0
+ nop.i 0
}
+;;
{ .mmf
- nop.m 999
-(p0) setf.exp f41 = r32
-(p0) fsetc.s2 0x7F,0x40 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fcmp.ge.unc.s1 p8, p0 = f43, f41
- nop.i 999
+ alloc r32=ar.pfs,1,4,4,0
+ setf.exp fTmp = rTmp
+ fmerge.s FR_X = f8,f8
}
+;;
{ .mfi
- nop.m 999
-(p0) fmerge.ns f42 = f41, f41
- nop.i 999 ;;
-}
-
-// The error tag for overflow is 127
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p8) mov r47 = 127 ;;
+ mov GR_Parameter_TAG = 127
+(p6) fnma.d.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and -INF result
+ nop.i 0
}
-
{ .mfb
- nop.m 999
-(p0) fcmp.le.unc.s1 p9, p0 = f43, f42
-(p8) br.cond.spnt L(SINH_ERROR_SUPPORT) ;;
-}
-
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p9) mov r47 = 127
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p9) br.cond.spnt L(SINH_ERROR_SUPPORT) ;;
-}
-
-// Dummy multiply to generate inexact
-{ .mfi
- nop.m 999
-(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones
- nop.i 999 ;;
+ nop.m 0
+(p7) fma.d.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result
+ br.cond.sptk __libm_error_region
}
+;;
+// Here if x unorm
+SINH_UNORM:
{ .mfb
- nop.m 999
-(p0) fmerge.s f8 = f44,f44
-(p0) br.ret.sptk b0 ;;
-}
-
-L(SINH_HUGE):
-
-// for SINH_HUGE, put 24000 in exponent; take sign from input; add 1
-// SAFE: SAFE is always 0 for HUGE
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x0000000000015dbf ;;
-}
-
-{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
+ getf.exp rSignexp_x = fNormX // Must recompute if x unorm
+ fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag
+ br.cond.sptk SINH_COMMON
}
+;;
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_signed_hi_lo = sinh_FR_SGNX, f9, f1
- nop.i 999 ;;
-}
+GLOBAL_IEEE754_END(sinh)
-{ .mfi
- nop.m 999
-(p0) fma.d.s0 f44 = sinh_FR_signed_hi_lo, f9, f0
-(p0) mov r47 = 127
-}
-.endp sinh
-ASM_SIZE_DIRECTIVE(sinh)
-#ifdef _LIBC
-ASM_SIZE_DIRECTIVE(__ieee754_sinh)
-#endif
-
-// Stack operations when calling error support.
-// (1) (2) (3) (call) (4)
-// sp -> + psp -> + psp -> + sp -> +
-// | | | |
-// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8
-// | | | |
-// | <-GR_Y Y2->| Y2 ->| <- GR_Y |
-// | | | |
-// | | <- GR_X X1 ->| |
-// | | | |
-// sp-64 -> + sp -> + sp -> + +
-// save ar.pfs save b0 restore gp
-// save gp restore ar.pfs
-
-.proc __libm_error_region
-__libm_error_region:
-L(SINH_ERROR_SUPPORT):
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
-
-// (1)
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
@@ -1271,39 +860,32 @@ L(SINH_ERROR_SUPPORT):
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
+ add sp=-64,sp // Create new stack
nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ mov GR_SAVE_GP=gp // Save gp
};;
-
-
-// (2)
{ .mmi
- stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
- add GR_Parameter_X = 16,sp // Parameter 1 address
+ stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0=b0 // Save b0
};;
-
.body
-// (3)
{ .mib
- stfd [GR_Parameter_X] = f8 // STORE Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
}
{ .mib
- stfd [GR_Parameter_Y] = f44 // STORE Parameter 3 on stack
+ stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
- nop.m 0
- nop.m 0
add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
-
-// (4)
{ .mmi
ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
@@ -1316,8 +898,6 @@ L(SINH_ERROR_SUPPORT):
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
-
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_sinhf.S b/sysdeps/ia64/fpu/e_sinhf.S
index d5aa2dca16..4a407b7f3c 100644
--- a/sysdeps/ia64/fpu/e_sinhf.S
+++ b/sysdeps/ia64/fpu/e_sinhf.S
@@ -1,10 +1,10 @@
.file "sinhf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2002, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,1305 +20,727 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+
// History
-//==============================================================
-// 2/02/00 Initial version
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+//*********************************************************************
+// 02/02/00 Initial version
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
// 10/12/00 Update to set denormal operand and underflow flags
-// 1/22/01 Fixed to set inexact flag for small args.
+// 01/22/01 Fixed to set inexact flag for small args.
+// 05/02/01 Reworked to improve speed of all paths
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 11/20/02 Improved algorithm based on expf
//
// API
-//==============================================================
-// float = sinhf(float)
-// input floating point f8
-// output floating point f8
-//
-// Registers used
-//==============================================================
-// general registers:
-// r32 -> r47
-// predicate registers used:
-// p6 p7 p8 p9
-// floating-point registers used:
-// f9 -> f15; f32 -> f45;
-// f8 has input, then output
+//*********************************************************************
+// float sinhf(float)
//
// Overview of operation
-//==============================================================
-// There are four paths
-// 1. |x| < 0.25 SINH_BY_POLY
-// 2. |x| < 32 SINH_BY_TBL
-// 3. |x| < 2^14 SINH_BY_EXP
-// 4. |x_ >= 2^14 SINH_HUGE
-//
-// For double extended we get infinity for x >= 400c b174 ddc0 31ae c0ea
-// >= 1.0110001.... x 2^13
-// >= 11357.2166
+//*********************************************************************
+// Case 1: 0 < |x| < 2^-60
+// Result = x, computed by x+sgn(x)*x^2 to handle flags and rounding
//
-// But for double we get infinity for x >= 408633ce8fb9f87e
-// >= 1.0110...x 2^9
-// >= +7.10476e+002
+// Case 2: 2^-60 < |x| < 0.25
+// Evaluate sinh(x) by a 9th order polynomial
+// Care is taken for the order of multiplication; and A2 is not exactly 1/5!,
+// A3 is not exactly 1/7!, etc.
+// sinh(x) = x + (A1*x^3 + A2*x^5 + A3*x^7 + A4*x^9)
//
-// And for single we get infinity for x >= 42b3a496
-// >= 1.0110... 2^6
-// >= 89.8215
+// Case 3: 0.25 < |x| < 89.41598
+// Algorithm is based on the identity sinh(x) = ( exp(x) - exp(-x) ) / 2.
+// The algorithm for exp is described as below. There are a number of
+// economies from evaluating both exp(x) and exp(-x). Although we
+// are evaluating both quantities, only where the quantities diverge do we
+// duplicate the computations. The basic algorithm for exp(x) is described
+// below.
//
-// SAFE: If there is danger of overflow set SAFE to 0
-// NOT implemented: if there is danger of underflow, set SAFE to 0
-// SAFE for all paths listed below
-//
-// 1. SINH_BY_POLY
-// ===============
-// If |x| is less than the tiny threshold, then clear SAFE
-// For double, the tiny threshold is -1022 = -0x3fe => -3fe + ffff = fc01
-// register-biased, this is fc01
-// For single, the tiny threshold is -126 = -7e => -7e + ffff = ff81
-// If |x| < tiny threshold, set SAFE = 0
-//
-// 2. SINH_BY_TBL
-// =============
-// SAFE: SAFE is always 1 for TBL;
+// Take the input x. w is "how many log2/64 in x?"
+// w = x * 64/log2
+// NJ = int(w)
+// x = NJ*log2/64 + R
+
+// NJ = 64*n + j
+// x = n*log2 + (log2/64)*j + R
//
-// 3. SINH_BY_EXP
-// ==============
-// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe
-// r34 has N-1; 16382 is in register biased form, 0x13ffd
-// There is danger of double overflow if N-1 > 0x3fe
-// in register biased form, 0x103fd
-// Analagously, there is danger of single overflow if N-1 > 0x7e
-// in register biased form, 0x1007d
-// SAFE: If there is danger of overflow set SAFE to 0
+// So, exp(x) = 2^n * 2^(j/64)* exp(R)
//
-// 4. SINH_HUGE
-// ============
-// SAFE: SAFE is always 0 for HUGE
+// T = 2^n * 2^(j/64)
+// Construct 2^n
+// Get 2^(j/64) table
+// Actually, all the entries of the 2^(j/64) table are stored in DP
+// with exponent bits set to 0 -> multiplication by 2^n can be
+// performed by a logical "or" operation with the bits representing 2^n
+
+// exp(R) = 1 + (exp(R) - 1)
+// P = exp(R) - 1 approximated by Taylor series of 3rd degree
+// P = A3*R^3 + A2*R^2 + R, A3 = 1/6, A2 = 1/2
//
-#include "libm_support.h"
-
-// Assembly macros
-//==============================================================
-sinh_FR_X = f44
-sinh_FR_X2 = f9
-sinh_FR_X4 = f10
-sinh_FR_SGNX = f40
-sinh_FR_all_ones = f45
-sinh_FR_tmp = f42
-
-sinh_FR_Inv_log2by64 = f9
-sinh_FR_log2by64_lo = f11
-sinh_FR_log2by64_hi = f10
-
-sinh_FR_A1 = f9
-sinh_FR_A2 = f10
-sinh_FR_A3 = f11
-
-sinh_FR_Rcub = f12
-sinh_FR_M_temp = f13
-sinh_FR_R_temp = f13
-sinh_FR_Rsq = f13
-sinh_FR_R = f14
-
-sinh_FR_M = f38
-
-sinh_FR_B1 = f15
-sinh_FR_B2 = f32
-sinh_FR_B3 = f33
+// The final result is reconstructed as follows
+// exp(x) = T + T*P
-sinh_FR_peven_temp1 = f34
-sinh_FR_peven_temp2 = f35
-sinh_FR_peven = f36
+// Special values
+//*********************************************************************
+// sinhf(+0) = +0
+// sinhf(-0) = -0
-sinh_FR_podd_temp1 = f34
-sinh_FR_podd_temp2 = f35
-sinh_FR_podd = f37
+// sinhf(+qnan) = +qnan
+// sinhf(-qnan) = -qnan
+// sinhf(+snan) = +qnan
+// sinhf(-snan) = -qnan
-sinh_FR_poly_podd_temp1 = f11
-sinh_FR_poly_podd_temp2 = f13
-sinh_FR_poly_peven_temp1 = f11
-sinh_FR_poly_peven_temp2 = f13
+// sinhf(-inf) = -inf
+// sinhf(+inf) = +inf
-sinh_FR_J_temp = f9
-sinh_FR_J = f10
-
-sinh_FR_Mmj = f39
-
-sinh_FR_N_temp1 = f11
-sinh_FR_N_temp2 = f12
-sinh_FR_N = f13
-
-sinh_FR_spos = f14
-sinh_FR_sneg = f15
-
-sinh_FR_Tjhi = f32
-sinh_FR_Tjlo = f33
-sinh_FR_Tmjhi = f34
-sinh_FR_Tmjlo = f35
-
-sinh_GR_mJ = r35
-sinh_GR_J = r36
-
-sinh_AD_mJ = r38
-sinh_AD_J = r39
-sinh_GR_all_ones = r40
-
-sinh_FR_S_hi = f9
-sinh_FR_S_hi_temp = f10
-sinh_FR_S_lo_temp1 = f11
-sinh_FR_S_lo_temp2 = f12
-sinh_FR_S_lo_temp3 = f13
-
-sinh_FR_S_lo = f38
-sinh_FR_C_hi = f39
-
-sinh_FR_C_hi_temp1 = f10
-sinh_FR_Y_hi = f11
-sinh_FR_Y_lo_temp = f12
-sinh_FR_Y_lo = f13
-sinh_FR_SINH = f9
-
-sinh_FR_P1 = f14
-sinh_FR_P2 = f15
-sinh_FR_P3 = f32
-sinh_FR_P4 = f33
-sinh_FR_P5 = f34
-sinh_FR_P6 = f35
-
-sinh_FR_TINY_THRESH = f9
-
-sinh_FR_SINH_temp = f10
-sinh_FR_SCALE = f11
-
-sinh_FR_signed_hi_lo = f10
-
-
-GR_SAVE_PFS = r41
-GR_SAVE_B0 = r42
-GR_SAVE_GP = r43
+// Overflow and Underflow
+//*********************************************************************
+// sinhf(x) = largest single normal when
+// x = 89.41598 = 0x42b2d4fc
+//
+// Underflow is handled as described in case 1 above
-GR_Parameter_X = r44
-GR_Parameter_Y = r45
-GR_Parameter_RESULT = r46
+// Registers used
+//*********************************************************************
+// Floating Point registers used:
+// f8 input, output
+// f6,f7, f9 -> f15, f32 -> f45
-// Data tables
-//==============================================================
+// General registers used:
+// r2, r3, r16 -> r38
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+// Predicate registers used:
+// p6 -> p15
+// Assembly macros
+//*********************************************************************
+// integer registers used
+// scratch
+rNJ = r2
+rNJ_neg = r3
+
+rJ_neg = r16
+rN_neg = r17
+rSignexp_x = r18
+rExp_x = r18
+rExp_mask = r19
+rExp_bias = r20
+rAd1 = r21
+rAd2 = r22
+rJ = r23
+rN = r24
+rTblAddr = r25
+rA3 = r26
+rExpHalf = r27
+rLn2Div64 = r28
+rGt_ln = r29
+r17ones_m1 = r29
+rRightShifter = r30
+rJ_mask = r30
+r64DivLn2 = r31
+rN_mask = r31
+// stacked
+GR_SAVE_PFS = r32
+GR_SAVE_B0 = r33
+GR_SAVE_GP = r34
+GR_Parameter_X = r35
+GR_Parameter_Y = r36
+GR_Parameter_RESULT = r37
+GR_Parameter_TAG = r38
+
+// floating point registers used
+FR_X = f10
+FR_Y = f1
+FR_RESULT = f8
+// scratch
+fRightShifter = f6
+f64DivLn2 = f7
+fNormX = f9
+fNint = f10
+fN = f11
+fR = f12
+fLn2Div64 = f13
+fA2 = f14
+fA3 = f15
+// stacked
+fP = f32
+fT = f33
+fMIN_SGL_OFLOW_ARG = f34
+fMAX_SGL_NORM_ARG = f35
+fRSqr = f36
+fA1 = f37
+fA21 = f37
+fA4 = f38
+fA43 = f38
+fA4321 = f38
+fX4 = f39
+fTmp = f39
+fGt_pln = f39
+fWre_urm_f8 = f40
+fXsq = f40
+fP_neg = f41
+fX3 = f41
+fT_neg = f42
+fExp = f43
+fExp_neg = f44
+fAbsX = f45
+
+
+RODATA
.align 16
-double_sinh_arg_reduction:
-ASM_TYPE_DIRECTIVE(double_sinh_arg_reduction,@object)
- data8 0xB8AA3B295C17F0BC, 0x00004005
- data8 0xB17217F7D1000000, 0x00003FF8
- data8 0xCF79ABC9E3B39804, 0x00003FD0
-ASM_SIZE_DIRECTIVE(double_sinh_arg_reduction)
-
-double_sinh_p_table:
-ASM_TYPE_DIRECTIVE(double_sinh_p_table,@object)
- data8 0xAAAAAAAAAAAAAAAB, 0x00003FFC
- data8 0x8888888888888412, 0x00003FF8
- data8 0xD00D00D00D4D39F2, 0x00003FF2
- data8 0xB8EF1D28926D8891, 0x00003FEC
- data8 0xD732377688025BE9, 0x00003FE5
- data8 0xB08AF9AE78C1239F, 0x00003FDE
-ASM_SIZE_DIRECTIVE(double_sinh_p_table)
-
-double_sinh_ab_table:
-ASM_TYPE_DIRECTIVE(double_sinh_ab_table,@object)
- data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC
- data8 0x88888888884ECDD5, 0x00003FF8
- data8 0xD00D0C6DCC26A86B, 0x00003FF2
- data8 0x8000000000000002, 0x00003FFE
- data8 0xAAAAAAAAAA402C77, 0x00003FFA
- data8 0xB60B6CC96BDB144D, 0x00003FF5
-ASM_SIZE_DIRECTIVE(double_sinh_ab_table)
-
-double_sinh_j_table:
-ASM_TYPE_DIRECTIVE(double_sinh_j_table,@object)
- data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000
- data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000
- data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000
- data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000
- data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000
- data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000
- data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000
- data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000
- data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000
- data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000
- data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000
- data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000
- data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000
- data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000
- data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 0x00000000
- data8 0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000
- data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000
- data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000
- data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000
- data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000
- data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000
- data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000
- data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000
- data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000
- data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000
- data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000
- data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000
- data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000
- data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000
- data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000
- data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000
- data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000
- data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000
- data8 0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000
- data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000
- data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000
- data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000
- data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000
- data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000
- data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000
- data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000
- data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000
- data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000
- data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000
- data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000
- data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000
- data8 0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 0x00000000
- data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000
- data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000
- data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000
- data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000
- data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000
- data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000
- data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000
- data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000
- data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000
- data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000
- data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000
- data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000
- data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000
- data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000
- data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000
- data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000
- data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000
- data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000
-ASM_SIZE_DIRECTIVE(double_sinh_j_table)
-
-.align 32
-.global sinhf#
-
-.section .text
-.proc sinhf#
-.align 32
-
-sinhf:
-#ifdef _LIBC
-.global __ieee754_sinhf
-.type __ieee754_sinhf,@function
-__ieee754_sinhf:
-#endif
-
-// X infinity or NAN?
-// Take invalid fault if enabled
-
-{ .mfi
- alloc r32 = ar.pfs,0,12,4,0
-(p0) fclass.m.unc p6,p0 = f8, 0xe3 //@qnan | @snan | @inf
- mov sinh_GR_all_ones = -1
-}
-;;
+LOCAL_OBJECT_START(_sinhf_table)
+data4 0x42b2d4fd // Smallest single arg to overflow single result
+data4 0x42b2d4fc // Largest single arg to give normal single result
+data4 0x00000000 // pad
+data4 0x00000000 // pad
+//
+// 2^(j/64) table, j goes from 0 to 63
+data8 0x0000000000000000 // 2^(0/64)
+data8 0x00002C9A3E778061 // 2^(1/64)
+data8 0x000059B0D3158574 // 2^(2/64)
+data8 0x0000874518759BC8 // 2^(3/64)
+data8 0x0000B5586CF9890F // 2^(4/64)
+data8 0x0000E3EC32D3D1A2 // 2^(5/64)
+data8 0x00011301D0125B51 // 2^(6/64)
+data8 0x0001429AAEA92DE0 // 2^(7/64)
+data8 0x000172B83C7D517B // 2^(8/64)
+data8 0x0001A35BEB6FCB75 // 2^(9/64)
+data8 0x0001D4873168B9AA // 2^(10/64)
+data8 0x0002063B88628CD6 // 2^(11/64)
+data8 0x0002387A6E756238 // 2^(12/64)
+data8 0x00026B4565E27CDD // 2^(13/64)
+data8 0x00029E9DF51FDEE1 // 2^(14/64)
+data8 0x0002D285A6E4030B // 2^(15/64)
+data8 0x000306FE0A31B715 // 2^(16/64)
+data8 0x00033C08B26416FF // 2^(17/64)
+data8 0x000371A7373AA9CB // 2^(18/64)
+data8 0x0003A7DB34E59FF7 // 2^(19/64)
+data8 0x0003DEA64C123422 // 2^(20/64)
+data8 0x0004160A21F72E2A // 2^(21/64)
+data8 0x00044E086061892D // 2^(22/64)
+data8 0x000486A2B5C13CD0 // 2^(23/64)
+data8 0x0004BFDAD5362A27 // 2^(24/64)
+data8 0x0004F9B2769D2CA7 // 2^(25/64)
+data8 0x0005342B569D4F82 // 2^(26/64)
+data8 0x00056F4736B527DA // 2^(27/64)
+data8 0x0005AB07DD485429 // 2^(28/64)
+data8 0x0005E76F15AD2148 // 2^(29/64)
+data8 0x0006247EB03A5585 // 2^(30/64)
+data8 0x0006623882552225 // 2^(31/64)
+data8 0x0006A09E667F3BCD // 2^(32/64)
+data8 0x0006DFB23C651A2F // 2^(33/64)
+data8 0x00071F75E8EC5F74 // 2^(34/64)
+data8 0x00075FEB564267C9 // 2^(35/64)
+data8 0x0007A11473EB0187 // 2^(36/64)
+data8 0x0007E2F336CF4E62 // 2^(37/64)
+data8 0x00082589994CCE13 // 2^(38/64)
+data8 0x000868D99B4492ED // 2^(39/64)
+data8 0x0008ACE5422AA0DB // 2^(40/64)
+data8 0x0008F1AE99157736 // 2^(41/64)
+data8 0x00093737B0CDC5E5 // 2^(42/64)
+data8 0x00097D829FDE4E50 // 2^(43/64)
+data8 0x0009C49182A3F090 // 2^(44/64)
+data8 0x000A0C667B5DE565 // 2^(45/64)
+data8 0x000A5503B23E255D // 2^(46/64)
+data8 0x000A9E6B5579FDBF // 2^(47/64)
+data8 0x000AE89F995AD3AD // 2^(48/64)
+data8 0x000B33A2B84F15FB // 2^(49/64)
+data8 0x000B7F76F2FB5E47 // 2^(50/64)
+data8 0x000BCC1E904BC1D2 // 2^(51/64)
+data8 0x000C199BDD85529C // 2^(52/64)
+data8 0x000C67F12E57D14B // 2^(53/64)
+data8 0x000CB720DCEF9069 // 2^(54/64)
+data8 0x000D072D4A07897C // 2^(55/64)
+data8 0x000D5818DCFBA487 // 2^(56/64)
+data8 0x000DA9E603DB3285 // 2^(57/64)
+data8 0x000DFC97337B9B5F // 2^(58/64)
+data8 0x000E502EE78B3FF6 // 2^(59/64)
+data8 0x000EA4AFA2A490DA // 2^(60/64)
+data8 0x000EFA1BEE615A27 // 2^(61/64)
+data8 0x000F50765B6E4540 // 2^(62/64)
+data8 0x000FA7C1819E90D8 // 2^(63/64)
+LOCAL_OBJECT_END(_sinhf_table)
+
+LOCAL_OBJECT_START(sinh_p_table)
+data8 0x3ec749d84bc96d7d // A4
+data8 0x3f2a0168d09557cf // A3
+data8 0x3f811111326ed15a // A2
+data8 0x3fc55555552ed1e2 // A1
+LOCAL_OBJECT_END(sinh_p_table)
-{ .mfb
- nop.m 999
-(p6) fma.s.s0 f8 = f8,f1,f8
-(p6) br.ret.spnt b0 ;;
-}
+.section .text
+GLOBAL_IEEE754_ENTRY(sinhf)
-// Put 0.25 in f9; p6 true if x < 0.25
-// Make constant that will generate inexact when squared
{ .mlx
- setf.sig sinh_FR_all_ones = sinh_GR_all_ones
-(p0) movl r32 = 0x000000000000fffd ;;
-}
-
-{ .mfi
-(p0) setf.exp f9 = r32
-(p0) fclass.m.unc p7,p0 = f8, 0x07 //@zero
- nop.i 999 ;;
-}
-
-{ .mfb
- nop.m 999
-(p0) fmerge.s sinh_FR_X = f0,f8
-(p7) br.ret.spnt b0 ;;
+ getf.exp rSignexp_x = f8 // Must recompute if x unorm
+ movl r64DivLn2 = 0x40571547652B82FE // 64/ln(2)
}
-
-// Identify denormal operands.
-{ .mfi
- nop.m 999
- fclass.m.unc p10,p0 = f8, 0x09 // + denorm
- nop.i 999
-};;
-{ .mfi
- nop.m 999
- fclass.m.unc p11,p0 = f8, 0x0a // - denorm
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p0) fmerge.s sinh_FR_SGNX = f8,f1
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fcmp.lt.unc.s1 p0,p7 = sinh_FR_X,f9
- nop.i 999 ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p7) br.cond.sptk L(SINH_BY_TBL) ;;
-}
-
-
-L(SINH_BY_POLY):
-
-// POLY cannot overflow so there is no need to call __libm_error_support
-// Set tiny_SAFE (p7) to 1(0) if answer is not tiny
-// Currently we do not use tiny_SAFE. So the setting of tiny_SAFE is
-// commented out.
-//(p0) movl r32 = 0x000000000000fc01
-//(p0) setf.exp f10 = r32
-//(p0) fcmp.lt.unc.s1 p6,p7 = f8,f10
-// Here is essentially the algorithm for SINH_BY_POLY. Care is take for the order
-// of multiplication; and P_1 is not exactly 1/3!, P_2 is not exactly 1/5!, etc.
-// Note that ax = |x|
-// sinh(x) = sign * (series(e^x) - series(e^-x))/2
-// = sign * (ax + ax^3/3! + ax^5/5! + ax^7/7! + ax^9/9! + ax^11/11! + ax^13/13!)
-// = sign * (ax + ax * ( ax^2 * (1/3! + ax^4 * (1/7! + ax^4*1/11!)) )
-// + ax * ( ax^4 * (1/5! + ax^4 * (1/9! + ax^4*1/13!)) ) )
-// = sign * (ax + ax*p_odd + (ax*p_even))
-// = sign * (ax + Y_lo)
-// sinh(x) = sign * (Y_hi + Y_lo)
-// Get the values of P_x from the table
-{ .mfb
-(p0) addl r34 = @ltoff(double_sinh_p_table), gp
-(p10) fma.s.s0 f8 = f8,f8,f8
-(p10) br.ret.spnt b0
-}
-;;
-
-{ .mfb
- ld8 r34 = [r34]
-(p11) fnma.s.s0 f8 = f8,f8,f8
-(p11) br.ret.spnt b0
+{ .mlx
+ addl rTblAddr = @ltoff(_sinhf_table),gp
+ movl rRightShifter = 0x43E8000000000000 // DP Right Shifter
}
;;
-// Calculate sinh_FR_X2 = ax*ax and sinh_FR_X4 = ax*ax*ax*ax
-{ .mmf
- nop.m 999
-(p0) ldfe sinh_FR_P1 = [r34],16
-(p0) fma.s1 sinh_FR_X2 = sinh_FR_X, sinh_FR_X, f0 ;;
-}
-
-{ .mmi
-(p0) ldfe sinh_FR_P2 = [r34],16 ;;
-(p0) ldfe sinh_FR_P3 = [r34],16
- nop.i 999 ;;
-}
-
-{ .mmi
-(p0) ldfe sinh_FR_P4 = [r34],16 ;;
-(p0) ldfe sinh_FR_P5 = [r34],16
- nop.i 999 ;;
-}
-
-{ .mfi
-(p0) ldfe sinh_FR_P6 = [r34],16
-(p0) fma.s1 sinh_FR_X4 = sinh_FR_X2, sinh_FR_X2, f0
- nop.i 999 ;;
-}
-
-// Calculate sinh_FR_podd = p_odd and sinh_FR_peven = p_even
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_poly_podd_temp1 = sinh_FR_X4, sinh_FR_P5, sinh_FR_P3
- nop.i 999 ;;
+ // point to the beginning of the table
+ ld8 rTblAddr = [rTblAddr]
+ fclass.m p6, p0 = f8, 0x0b // Test for x=unorm
+ addl rA3 = 0x3E2AA, r0 // high bits of 1.0/6.0 rounded to SP
}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_poly_podd_temp2 = sinh_FR_X4, sinh_FR_poly_podd_temp1, sinh_FR_P1
- nop.i 999
+ nop.m 0
+ fnorm.s1 fNormX = f8 // normalized x
+ addl rExpHalf = 0xFFFE, r0 // exponent of 1/2
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_poly_peven_temp1 = sinh_FR_X4, sinh_FR_P6, sinh_FR_P4
- nop.i 999 ;;
+ setf.d f64DivLn2 = r64DivLn2 // load 64/ln(2) to FP reg
+ fclass.m p15, p0 = f8, 0x1e3 // test for NaT,NaN,Inf
+ nop.i 0
}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_podd = sinh_FR_X2, sinh_FR_poly_podd_temp2, f0
- nop.i 999
+{ .mlx
+ // load Right Shifter to FP reg
+ setf.d fRightShifter = rRightShifter
+ movl rLn2Div64 = 0x3F862E42FEFA39EF // DP ln(2)/64 in GR
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_poly_peven_temp2 = sinh_FR_X4, sinh_FR_poly_peven_temp1, sinh_FR_P2
- nop.i 999 ;;
+ mov rExp_mask = 0x1ffff
+ fcmp.eq.s1 p13, p0 = f0, f8 // test for x = 0.0
+ shl rA3 = rA3, 12 // 0x3E2AA000, approx to 1.0/6.0 in SP
}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_peven = sinh_FR_X4, sinh_FR_poly_peven_temp2, f0
- nop.i 999 ;;
+{ .mfb
+ nop.m 0
+ nop.f 0
+(p6) br.cond.spnt SINH_UNORM // Branch if x=unorm
}
+;;
-// Calculate sinh_FR_Y_lo = ax*p_odd + (ax*p_even)
+SINH_COMMON:
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_X, sinh_FR_peven, f0
- nop.i 999 ;;
+ setf.exp fA2 = rExpHalf // load A2 to FP reg
+ nop.f 0
+ mov rExp_bias = 0xffff
}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_X, sinh_FR_podd, sinh_FR_Y_lo_temp
- nop.i 999 ;;
+{ .mfb
+ setf.d fLn2Div64 = rLn2Div64 // load ln(2)/64 to FP reg
+(p15) fma.s.s0 f8 = f8, f1, f0 // result if x = NaT,NaN,Inf
+(p15) br.ret.spnt b0 // exit here if x = NaT,NaN,Inf
}
+;;
-// Calculate sinh_FR_SINH = Y_hi + Y_lo. Note that ax = Y_hi
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_SINH = sinh_FR_X, f1, sinh_FR_Y_lo
- nop.i 999 ;;
+ // min overflow and max normal threshold
+ ldfps fMIN_SGL_OFLOW_ARG, fMAX_SGL_NORM_ARG = [rTblAddr], 8
+ nop.f 0
+ and rExp_x = rExp_mask, rSignexp_x // Biased exponent of x
}
-// Dummy multiply to generate inexact
-{ .mfi
- nop.m 999
-(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones
- nop.i 999
-}
-
-// Calculate f8 = sign * (Y_hi + Y_lo)
-// Go to return
{ .mfb
- nop.m 999
-(p0) fma.s.s0 f8 = sinh_FR_SGNX,sinh_FR_SINH,f0
-(p0) br.ret.sptk b0 ;;
-}
-
-
-L(SINH_BY_TBL):
-
-// Now that we are at TBL; so far all we know is that |x| >= 0.25.
-// The first two steps are the same for TBL and EXP, but if we are HUGE
-// we want to leave now.
-// Double-extended:
-// Go to HUGE if |x| >= 2^14, 1000d (register-biased) is e = 14 (true)
-// Double
-// Go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true)
-// Single
-// Go to HUGE if |x| >= 2^7, 10006 (register-biased) is e = 7 (true)
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x0000000000010006 ;;
+ setf.s fA3 = rA3 // load A3 to FP reg
+ nop.f 0
+(p13) br.ret.spnt b0 // exit here if x=0.0, return x
}
+;;
{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
+ sub rExp_x = rExp_x, rExp_bias // True exponent of x
+ fmerge.s fAbsX = f0, fNormX // Form |x|
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fcmp.ge.unc.s1 p6,p7 = sinh_FR_X,f9
- nop.i 999 ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p6) br.cond.spnt L(SINH_HUGE) ;;
-}
-
-// r32 = 1
-// r34 = N-1
-// r35 = N
-// r36 = j
-// r37 = N+1
-
-// TBL can never overflow
-// sinh(x) = sinh(B+R)
-// = sinh(B)cosh(R) + cosh(B)sinh(R)
-//
-// ax = |x| = M*log2/64 + R
-// B = M*log2/64
-// M = 64*N + j
-// We will calcualte M and get N as (M-j)/64
-// The division is a shift.
-// exp(B) = exp(N*log2 + j*log2/64)
-// = 2^N * 2^(j*log2/64)
-// sinh(B) = 1/2(e^B -e^-B)
-// = 1/2(2^N * 2^(j*log2/64) - 2^-N * 2^(-j*log2/64))
-// sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64))
-// cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64))
-// 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32
-// Tjhi is double-extended (80-bit) and Tjlo is single(32-bit)
-// R = ax - M*log2/64
-// R = ax - M*log2_by_64_hi - M*log2_by_64_lo
-// exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...)
-// = 1 + p_odd + p_even
-// where the p_even uses the A coefficients and the p_even uses the B coefficients
-// So sinh(R) = 1 + p_odd + p_even -(1 -p_odd -p_even)/2 = p_odd
-// cosh(R) = 1 + p_even
-// sinh(B) = S_hi + S_lo
-// cosh(B) = C_hi
-// sinh(x) = sinh(B)cosh(R) + cosh(B)sinh(R)
-// ******************************************************
-// STEP 1 (TBL and EXP)
-// ******************************************************
-// Get the following constants.
-// f9 = Inv_log2by64
-// f10 = log2by64_hi
-// f11 = log2by64_lo
-
-{ .mmi
-(p0) adds r32 = 0x1,r0
-(p0) addl r34 = @ltoff(double_sinh_arg_reduction), gp
- nop.i 999
+ nop.m 0
+ // x*(64/ln(2)) + Right Shifter
+ fma.s1 fNint = fNormX, f64DivLn2, fRightShifter
+ add rTblAddr = 8, rTblAddr
}
-;;
-
-{ .mmi
- ld8 r34 = [r34]
- nop.m 999
- nop.i 999
+{ .mfb
+ cmp.gt p7, p0 = -2, rExp_x // Test |x| < 2^(-2)
+ fma.s1 fXsq = fNormX, fNormX, f0 // x*x for small path
+(p7) br.cond.spnt SINH_SMALL // Branch if 0 < |x| < 2^-2
}
;;
-
-// We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and
-// put them in an exponent.
-// sinh_FR_spos = 2^(N-1) and sinh_FR_sneg = 2^(-N-1)
-// r39 = 0xffff + (N-1) = 0xffff +N -1
-// r40 = 0xffff - (N +1) = 0xffff -N -1
-
-{ .mlx
- nop.m 999
-(p0) movl r38 = 0x000000000000fffe ;;
-}
-
-{ .mmi
-(p0) ldfe sinh_FR_Inv_log2by64 = [r34],16 ;;
-(p0) ldfe sinh_FR_log2by64_hi = [r34],16
- nop.i 999 ;;
-}
-
-{ .mbb
-(p0) ldfe sinh_FR_log2by64_lo = [r34],16
- nop.b 999
- nop.b 999 ;;
-}
-
-// Get the A coefficients
-// f9 = A_1
-// f10 = A_2
-// f11 = A_3
-
-{ .mmi
- nop.m 999
-(p0) addl r34 = @ltoff(double_sinh_ab_table), gp
- nop.i 999
+{ .mfi
+ nop.m 0
+ // check for overflow
+ fcmp.ge.s1 p12, p13 = fAbsX, fMIN_SGL_OFLOW_ARG
+ mov rJ_mask = 0x3f // 6-bit mask for J
}
;;
-{ .mmi
- ld8 r34 = [r34]
- nop.m 999
- nop.i 999
+{ .mfb
+ nop.m 0
+ fms.s1 fN = fNint, f1, fRightShifter // n in FP register
+ // branch out if overflow
+(p12) br.cond.spnt SINH_CERTAIN_OVERFLOW
}
;;
-
-// Calculate M and keep it as integer and floating point.
-// f38 = M = round-to-integer(x*Inv_log2by64)
-// sinh_FR_M = M = truncate(ax/(log2/64))
-// Put the significand of M in r35
-// and the floating point representation of M in sinh_FR_M
-
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_M = sinh_FR_X, sinh_FR_Inv_log2by64, f0
- nop.i 999
+ getf.sig rNJ = fNint // bits of n, j
+ // check for possible overflow
+ fcmp.gt.s1 p13, p0 = fAbsX, fMAX_SGL_NORM_ARG
+ nop.i 0
}
+;;
{ .mfi
-(p0) ldfe sinh_FR_A1 = [r34],16
- nop.f 999
- nop.i 999 ;;
+ addl rN = 0xFFBF - 63, rNJ // biased and shifted n-1,j
+ fnma.s1 fR = fLn2Div64, fN, fNormX // R = x - N*ln(2)/64
+ and rJ = rJ_mask, rNJ // bits of j
}
-
{ .mfi
- nop.m 999
-(p0) fcvt.fx.s1 sinh_FR_M_temp = sinh_FR_M
- nop.i 999 ;;
+ sub rNJ_neg = r0, rNJ // bits of n, j for -x
+ nop.f 0
+ andcm rN_mask = -1, rJ_mask // 0xff...fc0 to mask N
}
+;;
{ .mfi
- nop.m 999
-(p0) fnorm.s1 sinh_FR_M = sinh_FR_M_temp
- nop.i 999 ;;
+ shladd rJ = rJ, 3, rTblAddr // address in the 2^(j/64) table
+ nop.f 0
+ and rN = rN_mask, rN // biased, shifted n-1
}
-
{ .mfi
-(p0) getf.sig r35 = sinh_FR_M_temp
- nop.f 999
- nop.i 999 ;;
+ addl rN_neg = 0xFFBF - 63, rNJ_neg // -x biased, shifted n-1,j
+ nop.f 0
+ and rJ_neg = rJ_mask, rNJ_neg // bits of j for -x
}
-
-// M is still in r35. Calculate j. j is the signed extension of the six lsb of M. It
-// has a range of -32 thru 31.
-// r35 = M
-// r36 = j
-
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p0) and r36 = 0x3f, r35 ;;
-}
-
-// Calculate R
-// f13 = f44 - f12*f10 = ax - M*log2by64_hi
-// f14 = f13 - f8*f11 = R = (ax - M*log2by64_hi) - M*log2by64_lo
+;;
{ .mfi
- nop.m 999
-(p0) fnma.s1 sinh_FR_R_temp = sinh_FR_M, sinh_FR_log2by64_hi, sinh_FR_X
- nop.i 999
+ ld8 rJ = [rJ] // Table value
+ nop.f 0
+ shl rN = rN, 46 // 2^(n-1) bits in DP format
}
-
{ .mfi
-(p0) ldfe sinh_FR_A2 = [r34],16
- nop.f 999
- nop.i 999 ;;
+ shladd rJ_neg = rJ_neg, 3, rTblAddr // addr in 2^(j/64) table -x
+ nop.f 0
+ and rN_neg = rN_mask, rN_neg // biased, shifted n-1 for -x
}
+;;
{ .mfi
- nop.m 999
-(p0) fnma.s1 sinh_FR_R = sinh_FR_M, sinh_FR_log2by64_lo, sinh_FR_R_temp
- nop.i 999
+ ld8 rJ_neg = [rJ_neg] // Table value for -x
+ nop.f 0
+ shl rN_neg = rN_neg, 46 // 2^(n-1) bits in DP format for -x
}
-
-// Get the B coefficients
-// f15 = B_1
-// f32 = B_2
-// f33 = B_3
-
-{ .mmi
-(p0) ldfe sinh_FR_A3 = [r34],16 ;;
-(p0) ldfe sinh_FR_B1 = [r34],16
- nop.i 999 ;;
-}
-
-{ .mmi
-(p0) ldfe sinh_FR_B2 = [r34],16 ;;
-(p0) ldfe sinh_FR_B3 = [r34],16
- nop.i 999 ;;
-}
-
-{ .mii
- nop.m 999
-(p0) shl r34 = r36, 0x2 ;;
-(p0) sxt1 r37 = r34 ;;
-}
-
-// ******************************************************
-// STEP 2 (TBL and EXP)
-// ******************************************************
-// Calculate Rsquared and Rcubed in preparation for p_even and p_odd
-// f12 = R*R*R
-// f13 = R*R
-// f14 = R <== from above
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Rsq = sinh_FR_R, sinh_FR_R, f0
-(p0) shr r36 = r37, 0x2 ;;
-}
-
-// r34 = M-j = r35 - r36
-// r35 = N = (M-j)/64
-
-{ .mii
-(p0) sub r34 = r35, r36
- nop.i 999 ;;
-(p0) shr r35 = r34, 0x6 ;;
-}
-
-{ .mii
-(p0) sub r40 = r38, r35
-(p0) adds r37 = 0x1, r35
-(p0) add r39 = r38, r35 ;;
-}
-
-// Get the address of the J table, add the offset,
-// addresses are sinh_AD_mJ and sinh_AD_J, get the T value
-// f32 = T(j)_hi
-// f33 = T(j)_lo
-// f34 = T(-j)_hi
-// f35 = T(-j)_lo
-
-{ .mmi
-(p0) sub r34 = r35, r32
-(p0) addl r37 = @ltoff(double_sinh_j_table), gp
- nop.i 999
+ or rN = rN, rJ // bits of 2^(n-1) * 2^(j/64) in DP format
+ nop.f 0
+ nop.i 0
}
;;
-{ .mmi
- ld8 r37 = [r37]
- nop.m 999
- nop.i 999
+{ .mmf
+ setf.d fT = rN // 2^(n-1) * 2^(j/64)
+ or rN_neg = rN_neg, rJ_neg // -x bits of 2^(n-1) * 2^(j/64) in DP
+ fma.s1 fRSqr = fR, fR, f0 // R^2
}
;;
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Rcub = sinh_FR_Rsq, sinh_FR_R, f0
- nop.i 999
-}
-
-// ******************************************************
-// STEP 3 Now decide if we need to branch to EXP
-// ******************************************************
-// Put 32 in f9; p6 true if x < 32
-// Go to EXP if |x| >= 32
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x0000000000010004 ;;
-}
-
-// Calculate p_even
-// f34 = B_2 + Rsq *B_3
-// f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3)
-// f36 = p_even = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3))
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_peven_temp1 = sinh_FR_Rsq, sinh_FR_B3, sinh_FR_B2
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_peven_temp2 = sinh_FR_Rsq, sinh_FR_peven_temp1, sinh_FR_B1
- nop.i 999
-}
-
-// Calculate p_odd
-// f34 = A_2 + Rsq *A_3
-// f35 = A_1 + Rsq * (A_2 + Rsq *A_3)
-// f37 = podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3))
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_podd_temp1 = sinh_FR_Rsq, sinh_FR_A3, sinh_FR_A2
- nop.i 999 ;;
-}
-
{ .mfi
-(p0) setf.exp sinh_FR_N_temp1 = r39
- nop.f 999
- nop.i 999 ;;
+ setf.d fT_neg = rN_neg // 2^(n-1) * 2^(j/64) for -x
+ fma.s1 fP = fA3, fR, fA2 // A3*R + A2
+ nop.i 0
}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_peven = sinh_FR_Rsq, sinh_FR_peven_temp2, f0
- nop.i 999
-}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_podd_temp2 = sinh_FR_Rsq, sinh_FR_podd_temp1, sinh_FR_A1
- nop.i 999 ;;
+ nop.m 0
+ fnma.s1 fP_neg = fA3, fR, fA2 // A3*R + A2 for -x
+ nop.i 0
}
+;;
{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fP = fP, fRSqr, fR // P = (A3*R + A2)*R^2 + R
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_podd = sinh_FR_podd_temp2, sinh_FR_Rcub, sinh_FR_R
- nop.i 999
-}
-
-// sinh_GR_mj contains the table offset for -j
-// sinh_GR_j contains the table offset for +j
-// p6 is true when j <= 0
-
-{ .mlx
-(p0) setf.exp sinh_FR_N_temp2 = r40
-(p0) movl r40 = 0x0000000000000020 ;;
+ nop.m 0
+ fms.s1 fP_neg = fP_neg, fRSqr, fR // P = (A3*R + A2)*R^2 + R, -x
+ nop.i 0
}
+;;
{ .mfi
-(p0) sub sinh_GR_mJ = r40, r36
-(p0) fmerge.se sinh_FR_spos = sinh_FR_N_temp1, f1
-(p0) adds sinh_GR_J = 0x20, r36 ;;
-}
-
-{ .mii
- nop.m 999
-(p0) shl sinh_GR_mJ = sinh_GR_mJ, 5 ;;
-(p0) add sinh_AD_mJ = r37, sinh_GR_mJ ;;
-}
-
-{ .mmi
- nop.m 999
-(p0) ldfe sinh_FR_Tmjhi = [sinh_AD_mJ],16
-(p0) shl sinh_GR_J = sinh_GR_J, 5 ;;
+ nop.m 0
+ fmpy.s0 fTmp = fLn2Div64, fLn2Div64 // Force inexact
+ nop.i 0
}
+;;
{ .mfi
-(p0) ldfs sinh_FR_Tmjlo = [sinh_AD_mJ],16
-(p0) fcmp.lt.unc.s1 p0,p7 = sinh_FR_X,f9
-(p0) add sinh_AD_J = r37, sinh_GR_J ;;
+ nop.m 0
+ fma.s1 fExp = fP, fT, fT // exp(x)/2
+ nop.i 0
}
-
-{ .mmi
-(p0) ldfe sinh_FR_Tjhi = [sinh_AD_J],16 ;;
-(p0) ldfs sinh_FR_Tjlo = [sinh_AD_J],16
- nop.i 999 ;;
+{ .mfb
+ nop.m 0
+ fma.s1 fExp_neg = fP_neg, fT_neg, fT_neg // exp(-x)/2
+ // branch out if possible overflow result
+(p13) br.cond.spnt SINH_POSSIBLE_OVERFLOW
}
+;;
{ .mfb
- nop.m 999
-(p0) fmerge.se sinh_FR_sneg = sinh_FR_N_temp2, f1
-(p7) br.cond.spnt L(SINH_BY_EXP) ;;
+ nop.m 0
+ // final result in the absence of overflow
+ fms.s.s0 f8 = fExp, f1, fExp_neg // result = (exp(x)-exp(-x))/2
+ // exit here in the absence of overflow
+ br.ret.sptk b0 // Exit main path, 0.25 <= |x| < 89.41598
}
+;;
+// Here if 0 < |x| < 0.25. Evaluate 9th order polynomial.
+SINH_SMALL:
{ .mfi
- nop.m 999
- nop.f 999
- nop.i 999 ;;
+ add rAd1 = 0x200, rTblAddr
+ fcmp.lt.s1 p7, p8 = fNormX, f0 // Test sign of x
+ cmp.gt p6, p0 = -60, rExp_x // Test |x| < 2^(-60)
}
-
-// ******************************************************
-// If NOT branch to EXP
-// ******************************************************
-// Calculate S_hi and S_lo
-// sinh_FR_S_hi_temp = sinh_FR_sneg * sinh_FR_Tmjhi
-// sinh_FR_S_hi = sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi_temp
-// sinh_FR_S_hi = sinh_FR_spos * sinh_FR_Tjhi - (sinh_FR_sneg * sinh_FR_Tmjlo)
-
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_S_hi_temp = sinh_FR_sneg, sinh_FR_Tmjhi, f0
- nop.i 999 ;;
+ add rAd2 = 0x210, rTblAddr
+ nop.f 0
+ nop.i 0
}
+;;
-{ .mfi
- nop.m 999
-(p0) fms.s1 sinh_FR_S_hi = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_S_hi_temp
- nop.i 999
+{ .mmb
+ ldfpd fA4, fA3 = [rAd1]
+ ldfpd fA2, fA1 = [rAd2]
+(p6) br.cond.spnt SINH_VERY_SMALL // Branch if |x| < 2^(-60)
}
-
-// Calculate C_hi
-// sinh_FR_C_hi_temp1 = sinh_FR_sneg * sinh_FR_Tmjhi
-// sinh_FR_C_hi = sinh_FR_spos * sinh_FR_Tjhi + sinh_FR_C_hi_temp1
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_C_hi_temp1 = sinh_FR_sneg, sinh_FR_Tmjhi, f0
- nop.i 999 ;;
-}
-
-// sinh_FR_S_lo_temp1 = sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi
-// sinh_FR_S_lo_temp2 = -sinh_FR_sneg * sinh_FR_Tmjlo + (sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi)
-// sinh_FR_S_lo_temp2 = -sinh_FR_sneg * sinh_FR_Tmjlo + (sinh_FR_S_lo_temp1 )
+;;
{ .mfi
- nop.m 999
-(p0) fms.s1 sinh_FR_S_lo_temp1 = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_S_hi
- nop.i 999
+ nop.m 0
+ fma.s1 fX3 = fXsq, fNormX, f0
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_C_hi = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_C_hi_temp1
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fX4 = fXsq, fXsq, f0
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fnma.s1 sinh_FR_S_lo_temp2 = sinh_FR_sneg, sinh_FR_Tmjhi, sinh_FR_S_lo_temp1
- nop.i 999
+ nop.m 0
+ fma.s1 fA43 = fXsq, fA4, fA3
+ nop.i 0
}
-
-// sinh_FR_S_lo_temp1 = sinh_FR_sneg * sinh_FR_Tmjlo
-// sinh_FR_S_lo_temp3 = sinh_FR_spos * sinh_FR_Tjlo - sinh_FR_S_lo_temp1
-// sinh_FR_S_lo_temp3 = sinh_FR_spos * sinh_FR_Tjlo -(sinh_FR_sneg * sinh_FR_Tmjlo)
-// sinh_FR_S_lo = sinh_FR_S_lo_temp3 + sinh_FR_S_lo_temp2
-
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_S_lo_temp1 = sinh_FR_sneg, sinh_FR_Tmjlo, f0
- nop.i 999 ;;
-}
-
-/////////// BUG FIX fma to fms -TK
-{ .mfi
- nop.m 999
-(p0) fms.s1 sinh_FR_S_lo_temp3 = sinh_FR_spos, sinh_FR_Tjlo, sinh_FR_S_lo_temp1
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fA21 = fXsq, fA2, fA1
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_S_lo = sinh_FR_S_lo_temp3, f1, sinh_FR_S_lo_temp2
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fA4321 = fX4, fA43, fA21
+ nop.i 0
}
+;;
-// Y_hi = S_hi
-// Y_lo = C_hi*p_odd + (S_hi*p_even + S_lo)
-// sinh_FR_Y_lo_temp = sinh_FR_S_hi * sinh_FR_peven + sinh_FR_S_lo
-// sinh_FR_Y_lo = sinh_FR_C_hi * sinh_FR_podd + sinh_FR_Y_lo_temp
-
+// Dummy multiply to generate inexact
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_S_hi, sinh_FR_peven, sinh_FR_S_lo
- nop.i 999 ;;
+ nop.m 0
+ fmpy.s0 fTmp = fA4, fA4
+ nop.i 0
}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_C_hi, sinh_FR_podd, sinh_FR_Y_lo_temp
- nop.i 999 ;;
+{ .mfb
+ nop.m 0
+ fma.s.s0 f8 = fA4321, fX3, fNormX
+ br.ret.sptk b0 // Exit if 2^-60 <= |x| < 0.25
}
+;;
-// sinh_FR_SINH = Y_hi + Y_lo
-// f8 = answer = sinh_FR_SGNX * sinh_FR_SINH
-
-// Dummy multiply to generate inexact
-{ .mfi
- nop.m 999
-(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones
- nop.i 999
-}
+SINH_VERY_SMALL:
+// Here if 0 < |x| < 2^-60
+// Compute result by x + sgn(x)*x^2 to get properly rounded result
+.pred.rel "mutex",p7,p8
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_SINH = sinh_FR_S_hi, f1, sinh_FR_Y_lo
- nop.i 999 ;;
+ nop.m 0
+(p7) fnma.s.s0 f8 = fNormX, fNormX, fNormX // If x<0 result ~ x-x^2
+ nop.i 0
}
-
{ .mfb
- nop.m 999
-(p0) fma.s.s0 f8 = sinh_FR_SGNX, sinh_FR_SINH,f0
-(p0) br.ret.sptk b0 ;;
+ nop.m 0
+(p8) fma.s.s0 f8 = fNormX, fNormX, fNormX // If x>0 result ~ x+x^2
+ br.ret.sptk b0 // Exit if |x| < 2^-60
}
+;;
+SINH_POSSIBLE_OVERFLOW:
-L(SINH_BY_EXP):
+// Here if fMAX_SGL_NORM_ARG < x < fMIN_SGL_OFLOW_ARG
+// This cannot happen if input is a single, only if input is higher precision.
+// Overflow is a possibility, not a certainty.
-// When p7 is true, we know that an overflow is not going to happen
-// When p7 is false, we must check for possible overflow
-// p7 is the over_SAFE flag
-// Y_hi = Tjhi
-// Y_lo = Tjhi * (p_odd + p_even) +Tjlo
-// Scale = sign * 2^(N-1)
-// sinh_FR_Y_lo = sinh_FR_Tjhi * (sinh_FR_peven + sinh_FR_podd)
-// sinh_FR_Y_lo = sinh_FR_Tjhi * (sinh_FR_Y_lo_temp )
+// Recompute result using status field 2 with user's rounding mode,
+// and wre set. If result is larger than largest single, then we have
+// overflow
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_peven, f1, sinh_FR_podd
- nop.i 999
-}
-
-// Now we are in EXP. This is the only path where an overflow is possible
-// but not for certain. So this is the only path where over_SAFE has any use.
-// r34 still has N-1
-// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe
-// There is a danger of double overflow if N-1 > 0x3fe = 1022
-// There is a danger of single overflow if N-1 > 0x7e = 126
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x000000000000007e ;;
-}
-
-{ .mfi
-(p0) cmp.gt.unc p0,p7 = r34, r32
-(p0) fmerge.s sinh_FR_SCALE = sinh_FR_SGNX, sinh_FR_spos
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_Tjhi, sinh_FR_Y_lo_temp, sinh_FR_Tjlo
- nop.i 999 ;;
+ mov rGt_ln = 0x1007f // Exponent for largest single + 1 ulp
+ fsetc.s2 0x7F,0x42 // Get user's round mode, set wre
+ nop.i 0
}
+;;
-// f8 = answer = scale * (Y_hi + Y_lo)
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_SINH_temp = sinh_FR_Y_lo, f1, sinh_FR_Tjhi
- nop.i 999 ;;
+ setf.exp fGt_pln = rGt_ln // Create largest single + 1 ulp
+ fma.s.s2 fWre_urm_f8 = fP, fT, fT // Result with wre set
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s.s0 f44 = sinh_FR_SCALE, sinh_FR_SINH_temp, f0
- nop.i 999 ;;
+ nop.m 0
+ fsetc.s2 0x7F,0x40 // Turn off wre in sf2
+ nop.i 0
}
+;;
-// Dummy multiply to generate inexact
{ .mfi
- nop.m 999
-(p7) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones
- nop.i 999 ;;
+ nop.m 0
+ fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow
+ nop.i 0
}
+;;
-// If over_SAFE is set, return
{ .mfb
- nop.m 999
-(p7) fmerge.s f8 = f44,f44
-(p7) br.ret.sptk b0 ;;
+ nop.m 0
+ nop.f 0
+(p6) br.cond.spnt SINH_CERTAIN_OVERFLOW // Branch if overflow
}
+;;
-// Else see if we overflowed
-// S0 user supplied status
-// S2 user supplied status + WRE + TD (Overflows)
-// If WRE is set then an overflow will not occur in EXP.
-// The input value that would cause a register (WRE) value to overflow is about 2^15
-// and this input would go into the HUGE path.
-// Answer with WRE is in f43.
-
-{ .mfi
- nop.m 999
-(p0) fsetc.s2 0x7F,0x42
- nop.i 999;;
+{ .mfb
+ nop.m 0
+ fma.s.s0 f8 = fP, fT, fT
+ br.ret.sptk b0 // Exit if really no overflow
}
+;;
+// here if overflow
+SINH_CERTAIN_OVERFLOW:
{ .mfi
- nop.m 999
-(p0) fma.s.s2 f43 = sinh_FR_SCALE, sinh_FR_SINH_temp, f0
- nop.i 999 ;;
-}
-
-// 1007F => 1007F -FFFF = 80(true)
-// 80 + 7F = FF, which is 1 more that the exponent of the largest
-// double (FE). So 0 1007F 8000000000000000 is one ulp more than
-// largest single in register bias
-// Now set p8 if the answer with WRE is greater than or equal this value
-// Also set p9 if the answer with WRE is less than or equal to negative this value
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x0000000001007F ;;
+ addl r17ones_m1 = 0x1FFFE, r0
+ fcmp.lt.s1 p6, p7 = fNormX, f0 // Test for x < 0
+ nop.i 0
}
+;;
{ .mmf
- nop.m 999
-(p0) setf.exp f41 = r32
-(p0) fsetc.s2 0x7F,0x40 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fcmp.ge.unc.s1 p8, p0 = f43, f41
- nop.i 999
+ alloc r32 = ar.pfs, 0, 3, 4, 0 // get some registers
+ setf.exp fTmp = r17ones_m1
+ fmerge.s FR_X = f8,f8
}
+;;
{ .mfi
- nop.m 999
-(p0) fmerge.ns f42 = f41, f41
- nop.i 999 ;;
-}
-
-// The error tag for overflow is 128
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p8) mov r47 = 128 ;;
+ mov GR_Parameter_TAG = 128
+(p6) fnma.s.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and -INF result
+ nop.i 0
}
-
{ .mfb
- nop.m 999
-(p0) fcmp.le.unc.s1 p9, p0 = f43, f42
-(p8) br.cond.spnt L(SINH_ERROR_SUPPORT) ;;
-}
-
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p9) mov r47 = 128
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p9) br.cond.spnt L(SINH_ERROR_SUPPORT) ;;
-}
-
-// Dummy multiply to generate inexact
-{ .mfi
- nop.m 999
-(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones
- nop.i 999 ;;
+ nop.m 0
+(p7) fma.s.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result
+ br.cond.sptk __libm_error_region
}
+;;
+// Here if x unorm
+SINH_UNORM:
{ .mfb
- nop.m 999
-(p0) fmerge.s f8 = f44,f44
-(p0) br.ret.sptk b0 ;;
-}
-
-L(SINH_HUGE):
-
-// for SINH_HUGE, put 24000 in exponent; take sign from input; add 1
-// SAFE: SAFE is always 0 for HUGE
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x0000000000015dbf ;;
-}
-
-{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
+ getf.exp rSignexp_x = fNormX // Must recompute if x unorm
+ fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag
+ br.cond.sptk SINH_COMMON // Return to main path
}
+;;
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_signed_hi_lo = sinh_FR_SGNX, f9, f1
- nop.i 999 ;;
-}
+GLOBAL_IEEE754_END(sinhf)
-{ .mfi
- nop.m 999
-(p0) fma.s.s0 f44 = sinh_FR_signed_hi_lo, f9, f0
-(p0) mov r47 = 128
-}
-.endp sinhf
-ASM_SIZE_DIRECTIVE(sinhf)
-#ifdef _LIBC
-ASM_SIZE_DIRECTIVE(__ieee754_sinhf)
-#endif
-
-// Stack operations when calling error support.
-// (1) (2) (3) (call) (4)
-// sp -> + psp -> + psp -> + sp -> +
-// | | | |
-// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8
-// | | | |
-// | <-GR_Y Y2->| Y2 ->| <- GR_Y |
-// | | | |
-// | | <- GR_X X1 ->| |
-// | | | |
-// sp-64 -> + sp -> + sp -> + +
-// save ar.pfs save b0 restore gp
-// save gp restore ar.pfs
-
-.proc __libm_error_region
-__libm_error_region:
-L(SINH_ERROR_SUPPORT):
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
-
-// (1)
{ .mfi
- add GR_Parameter_Y=-32,sp // Parameter 2 value
- nop.f 0
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
+ nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
- nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
};;
-
-
-// (2)
{ .mmi
- stfs [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
- add GR_Parameter_X = 16,sp // Parameter 1 address
+ stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0=b0 // Save b0
};;
-
.body
-// (3)
-{ .mib
- stfs [GR_Parameter_X] = f8 // STORE Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+{ .mfi
+ stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
+ nop.f 0
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
}
{ .mib
- stfs [GR_Parameter_Y] = f44 // STORE Parameter 3 on stack
- add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
+
{ .mmi
- nop.m 0
- nop.m 0
- add GR_Parameter_RESULT = 48,sp
+ add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
-// (4)
{ .mmi
- ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
+ ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
- add sp = 64,sp // Restore stack pointer
- mov b0 = GR_SAVE_B0 // Restore return address
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
- mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
- br.ret.sptk b0 // Return
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
+
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_sinhl.S b/sysdeps/ia64/fpu/e_sinhl.S
index b880b95b64..ccc996a8cc 100644
--- a/sysdeps/ia64/fpu/e_sinhl.S
+++ b/sysdeps/ia64/fpu/e_sinhl.S
@@ -1,10 +1,10 @@
.file "sinhl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2002, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,17 +35,20 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 Initial version
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
// 10/12/00 Update to set denormal operand and underflow flags
-// 1/22/01 Fixed to set inexact flag for small args. Fixed incorrect
+// 01/22/01 Fixed to set inexact flag for small args. Fixed incorrect
// call to __libm_error_support for 710.476 < x < 11357.2166.
+// 05/02/01 Reworked to improve speed of all paths
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 12/04/02 Improved performance
//
// API
//==============================================================
@@ -56,1269 +59,1058 @@
// Registers used
//==============================================================
// general registers:
-// r32 -> r47
+// r14 -> r40
// predicate registers used:
-// p6 p7 p8 p9
+// p6 -> p11
// floating-point registers used:
-// f9 -> f15; f32 -> f45;
+// f9 -> f15; f32 -> f90;
// f8 has input, then output
//
// Overview of operation
//==============================================================
-// There are four paths
-// 1. |x| < 0.25 SINH_BY_POLY
-// 2. |x| < 32 SINH_BY_TBL
-// 3. |x| < 2^14 SINH_BY_EXP
-// 4. |x_ >= 2^14 SINH_HUGE
-//
-// For double extended we get infinity for x >= 400c b174 ddc0 31ae c0ea
-// >= 1.0110001.... x 2^13
-// >= 11357.2166
+// There are seven paths
+// 1. 0 < |x| < 0.25 SINH_BY_POLY
+// 2. 0.25 <=|x| < 32 SINH_BY_TBL
+// 3. 32 <= |x| < 11357.21655 SINH_BY_EXP (merged path with SINH_BY_TBL)
+// 4. |x| >= 11357.21655 SINH_HUGE
+// 5. x=0 Done with early exit
+// 6. x=inf,nan Done with early exit
+// 7. x=denormal SINH_DENORM
//
-// But for double we get infinity for x >= 408633ce8fb9f87e
-// >= 1.0110...x 2^9
-// >= +7.10476e+002
+// For double extended we get overflow for x >= 400c b174 ddc0 31ae c0ea
+// >= 11357.21655
//
-// And for single we get infinity for x >= 42b3a496
-// >= 1.0110... 2^6
-// >= 89.8215
//
-// SAFE: If there is danger of overflow set SAFE to 0
-// NOT implemented: if there is danger of underflow, set SAFE to 0
-// SAFE for all paths listed below
-//
-// 1. SINH_BY_POLY
+// 1. SINH_BY_POLY 0 < |x| < 0.25
// ===============
-// If |x| is less than the tiny threshold, then clear SAFE
-// For double, the tiny threshold is -1022 = -0x3fe => -3fe + ffff = fc01
-// register-biased, this is fc01
-// For single, the tiny threshold is -126 = -7e => -7e + ffff = ff81
-// If |x| < tiny threshold, set SAFE = 0
+// Evaluate sinh(x) by a 13th order polynomial
+// Care is taken for the order of multiplication; and P_1 is not exactly 1/3!,
+// P_2 is not exactly 1/5!, etc.
+// sinh(x) = sign * (series(e^x) - series(e^-x))/2
+// = sign * (ax + ax^3/3! + ax^5/5! + ax^7/7! + ax^9/9! + ax^11/11!
+// + ax^13/13!)
+// = sign * (ax + ax * ( ax^2 * (1/3! + ax^4 * (1/7! + ax^4*1/11!)) )
+// + ax * ( ax^4 * (1/5! + ax^4 * (1/9! + ax^4*1/13!)) ))
+// = sign * (ax + ax*p_odd + (ax*p_even))
+// = sign * (ax + Y_lo)
+// sinh(x) = sign * (Y_hi + Y_lo)
+// Note that ax = |x|
//
-// 2. SINH_BY_TBL
+// 2. SINH_BY_TBL 0.25 <= |x| < 32.0
// =============
-// SAFE: SAFE is always 1 for TBL;
+// sinh(x) = sinh(B+R)
+// = sinh(B)cosh(R) + cosh(B)sinh(R)
+//
+// ax = |x| = M*log2/64 + R
+// B = M*log2/64
+// M = 64*N + j
+// We will calculate M and get N as (M-j)/64
+// The division is a shift.
+// exp(B) = exp(N*log2 + j*log2/64)
+// = 2^N * 2^(j*log2/64)
+// sinh(B) = 1/2(e^B -e^-B)
+// = 1/2(2^N * 2^(j*log2/64) - 2^-N * 2^(-j*log2/64))
+// sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64))
+// cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64))
+// 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32
+// Tjhi is double-extended (80-bit) and Tjlo is single(32-bit)
+//
+// R = ax - M*log2/64
+// R = ax - M*log2_by_64_hi - M*log2_by_64_lo
+// exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...)
+// = 1 + p_odd + p_even
+// where the p_odd uses the A coefficients and the p_even uses
+// the B coefficients
+//
+// So sinh(R) = ((1 + p_odd + p_even) - (1 - p_odd + p_even))/2 = p_odd
+// cosh(R) = 1 + p_even
+// sinh(B) = S_hi + S_lo
+// cosh(B) = C_hi
+// sinh(x) = sinh(B)cosh(R) + cosh(B)sinh(R)
//
-// 3. SINH_BY_EXP
+// 3. SINH_BY_EXP 32.0 <= |x| < 11357.21655 ( 400c b174 ddc0 31ae c0ea )
// ==============
-// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe
-// r34 has N-1; 16382 is in register biased form, 0x13ffd
-// There is danger of double overflow if N-1 > 0x3fe
-// in register biased form, 0x103fd
-// Analagously, there is danger of single overflow if N-1 > 0x7e
-// in register biased form, 0x1007d
-// SAFE: If there is danger of overflow set SAFE to 0
+// Can approximate result by exp(x)/2 in this region.
+// Y_hi = Tjhi
+// Y_lo = Tjhi * (p_odd + p_even) + Tjlo
+// sinh(x) = Y_hi + Y_lo
//
-// 4. SINH_HUGE
+// 4. SINH_HUGE |x| >= 11357.21655 ( 400c b174 ddc0 31ae c0ea )
// ============
-// SAFE: SAFE is always 0 for HUGE
+// Set error tag and call error support
+//
//
-
-#include "libm_support.h"
-
// Assembly macros
//==============================================================
-sinh_FR_X = f44
-sinh_FR_X2 = f9
-sinh_FR_X4 = f10
-sinh_FR_SGNX = f40
-sinh_FR_all_ones = f45
-sinh_FR_tmp = f42
-
-sinh_FR_Inv_log2by64 = f9
-sinh_FR_log2by64_lo = f11
-sinh_FR_log2by64_hi = f10
-
-sinh_FR_A1 = f9
-sinh_FR_A2 = f10
-sinh_FR_A3 = f11
-
-sinh_FR_Rcub = f12
-sinh_FR_M_temp = f13
-sinh_FR_R_temp = f13
-sinh_FR_Rsq = f13
-sinh_FR_R = f14
-
-sinh_FR_M = f38
-
-sinh_FR_B1 = f15
-sinh_FR_B2 = f32
-sinh_FR_B3 = f33
+r_ad5 = r14 // address of j_lo_table midpoint
+r_rshf_2to57 = r15 // bit pattern of 1.1000 * 2^(63+57) (r15 is shared by three names)
+r_exp_denorm = r15
+r_ad_mJ_lo = r15 // pointer to Tmjlo
+r_ad_J_lo = r16 // pointer to Tjlo
+r_2Nm1 = r17 // signexp of sgnx * 2^(N-1)
+r_2mNm1 = r18 // signexp of sgnx * 2^(-N-1) (shares r18 with r_exp_x)
+r_exp_x = r18 // signexp of x masked with r_exp_mask
+r_ad_J_hi = r19 // pointer to Tjhi (shares r19 with r_ad2o)
+r_ad2o = r19 // pointer to p_table odd coefficients
+r_ad_mJ_hi = r20 // pointer to Tmjhi
+r_mj = r21 // -j
+r_ad2e = r22 // starts at the overflow-limit entry, then walks into p_table
+r_ad3 = r23 // pointer into ab_table
+r_ad1 = r24 // address of sinh_arg_reduction (shares r24 with r_Mmj)
+r_Mmj = r24 // M - j
+r_rshf = r25 // bit pattern of 1.1000 * 2^63 (r25 is shared by three names)
+r_M = r25 // significand bits of f_M_temp, holding integer M
+r_N = r25 // N = (M-j)/64
+r_jshf = r26 // j shifted left by 2 so it can be sign extended (r26 is shared by three names)
+r_exp_2tom57 = r26 // biased exponent of 2^-57
+r_j = r26 // j, range -32 to 31
+r_exp_mask = r27 // 0x1ffff
+r_signexp_x = r28 // sign and exponent of x (shares r28 with r_signexp_sgnx_0_5)
+r_signexp_sgnx_0_5 = r28 // signexp of -0.5 or +0.5 according to the sign of x
+r_exp_0_25 = r29 // biased exponent of 0.25
+r_sig_inv_ln2 = r30 // significand of 1/ln2 (r30 is shared by three names)
+r_exp_32 = r30 // biased exponent of 32
+r_exp_huge = r30
+r_ad4 = r31 // address of j_hi_table midpoint
+
+GR_SAVE_PFS = r34
+GR_SAVE_B0 = r35
+GR_SAVE_GP = r36
+
+GR_Parameter_X = r37 // presumably arguments for the error support call -- TODO confirm
+GR_Parameter_Y = r38
+GR_Parameter_RESULT = r39
+GR_Parameter_TAG = r40
+
+
+f_ABS_X = f9 // |x|
+f_X2 = f10 // x*x, computed from f_NORM_X
+f_X4 = f11
+f_tmp = f14
+f_RSHF = f15 // right shift constant 1.100 * 2^63
+
+f_Inv_log2by64 = f32
+f_log2by64_lo = f33 // log2/64 low part
+f_log2by64_hi = f34 // log2/64 high part
+f_A1 = f35 // A coefficients are used by p_odd
+
+f_A2 = f36
+f_A3 = f37
+f_Rcub = f38 // R*R*R
+f_M_temp = f39 // M still biased by the 1.100 * 2^(63+57) shift constant
+f_R_temp = f40 // ax - M*log2by64_hi
+
+f_Rsq = f41 // R*R
+f_R = f42 // reduced argument
+f_M = f43 // rounded M as a floating point value
+f_B1 = f44 // B coefficients are used by p_even
+f_B2 = f45
+
+f_B3 = f46
+f_peven_temp1 = f47
+f_peven_temp2 = f48
+f_peven = f49
+f_podd_temp1 = f50
+
+f_podd_temp2 = f51
+f_podd = f52
+f_poly65 = f53 // f53 is shared by three poly names
+f_poly6543 = f53
+f_poly6to1 = f53
+f_poly43 = f54
+f_poly21 = f55
+
+f_X3 = f56
+f_INV_LN2_2TO63 = f57 // 1/ln2 * 2^63
+f_RSHF_2TO57 = f58 // 1.100 * 2^120
+f_2TOM57 = f59 // 2^-57 for scaling
+f_smlst_oflow_input = f60 // smallest |x| that overflows
+
+f_pre_result = f61
+f_huge = f62 // NOTE(review): f_huge is assigned again below (f89) -- confirm which value is intended
+f_spos = f63 // sgnx * 2^(N-1)
+f_sneg = f64 // sgnx * 2^(-N-1)
+f_Tjhi = f65
+
+f_Tjlo = f66
+f_Tmjhi = f67
+f_Tmjlo = f68
+f_S_hi = f69
+f_SC_hi_temp = f70 // sneg * Tmjhi
+
+f_S_lo_temp1 = f71
+f_S_lo_temp2 = f72
+f_S_lo_temp3 = f73 // sneg * Tmjlo (shares f73 with f_S_lo_temp4)
+f_S_lo_temp4 = f73
+f_S_lo = f74
+f_C_hi = f75
+
+f_Y_hi = f77
+f_Y_lo_temp = f78
+f_Y_lo = f79
+f_NORM_X = f80 // normalized x
+
+f_P1 = f81
+f_P2 = f82
+f_P3 = f83
+f_P4 = f84
+f_P5 = f85
+
+f_P6 = f86
+f_Tjhi_spos = f87
+f_Tjlo_spos = f88
+f_huge = f89 // NOTE(review): duplicate of f_huge above (f62) -- confirm which value is intended
+f_signed_hi_lo = f90
-sinh_FR_peven_temp1 = f34
-sinh_FR_peven_temp2 = f35
-sinh_FR_peven = f36
-
-sinh_FR_podd_temp1 = f34
-sinh_FR_podd_temp2 = f35
-sinh_FR_podd = f37
-
-sinh_FR_poly_podd_temp1 = f11
-sinh_FR_poly_podd_temp2 = f13
-sinh_FR_poly_peven_temp1 = f11
-sinh_FR_poly_peven_temp2 = f13
-
-sinh_FR_J_temp = f9
-sinh_FR_J = f10
-
-sinh_FR_Mmj = f39
-
-sinh_FR_N_temp1 = f11
-sinh_FR_N_temp2 = f12
-sinh_FR_N = f13
-
-sinh_FR_spos = f14
-sinh_FR_sneg = f15
-
-sinh_FR_Tjhi = f32
-sinh_FR_Tjlo = f33
-sinh_FR_Tmjhi = f34
-sinh_FR_Tmjlo = f35
-
-sinh_GR_mJ = r35
-sinh_GR_J = r36
-
-sinh_AD_mJ = r38
-sinh_AD_J = r39
-sinh_GR_all_ones = r40
-
-sinh_FR_S_hi = f9
-sinh_FR_S_hi_temp = f10
-sinh_FR_S_lo_temp1 = f11
-sinh_FR_S_lo_temp2 = f12
-sinh_FR_S_lo_temp3 = f13
-
-sinh_FR_S_lo = f38
-sinh_FR_C_hi = f39
-
-sinh_FR_C_hi_temp1 = f10
-sinh_FR_Y_hi = f11
-sinh_FR_Y_lo_temp = f12
-sinh_FR_Y_lo = f13
-sinh_FR_SINH = f9
-
-sinh_FR_P1 = f14
-sinh_FR_P2 = f15
-sinh_FR_P3 = f32
-sinh_FR_P4 = f33
-sinh_FR_P5 = f34
-sinh_FR_P6 = f35
-
-sinh_FR_TINY_THRESH = f9
-
-sinh_FR_SINH_temp = f10
-sinh_FR_SCALE = f11
-
-sinh_FR_signed_hi_lo = f10
-
-
-GR_SAVE_PFS = r41
-GR_SAVE_B0 = r42
-GR_SAVE_GP = r43
-
-GR_Parameter_X = r44
-GR_Parameter_Y = r45
-GR_Parameter_RESULT = r46
// Data tables
//==============================================================
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+// DO NOT CHANGE ORDER OF THESE TABLES
+RODATA
.align 16
-double_sinh_arg_reduction:
-ASM_TYPE_DIRECTIVE(double_sinh_arg_reduction,@object)
- data8 0xB8AA3B295C17F0BC, 0x00004005
- data8 0xB17217F7D1000000, 0x00003FF8
- data8 0xCF79ABC9E3B39804, 0x00003FD0
-ASM_SIZE_DIRECTIVE(double_sinh_arg_reduction)
-
-double_sinh_p_table:
-ASM_TYPE_DIRECTIVE(double_sinh_p_table,@object)
- data8 0xAAAAAAAAAAAAAAAB, 0x00003FFC
- data8 0x8888888888888412, 0x00003FF8
- data8 0xD00D00D00D4D39F2, 0x00003FF2
- data8 0xB8EF1D28926D8891, 0x00003FEC
- data8 0xD732377688025BE9, 0x00003FE5
- data8 0xB08AF9AE78C1239F, 0x00003FDE
-ASM_SIZE_DIRECTIVE(double_sinh_p_table)
-
-double_sinh_ab_table:
-ASM_TYPE_DIRECTIVE(double_sinh_ab_table,@object)
- data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC
- data8 0x88888888884ECDD5, 0x00003FF8
- data8 0xD00D0C6DCC26A86B, 0x00003FF2
- data8 0x8000000000000002, 0x00003FFE
- data8 0xAAAAAAAAAA402C77, 0x00003FFA
- data8 0xB60B6CC96BDB144D, 0x00003FF5
-ASM_SIZE_DIRECTIVE(double_sinh_ab_table)
-
-double_sinh_j_table:
-ASM_TYPE_DIRECTIVE(double_sinh_j_table,@object)
- data8 0xB504F333F9DE6484, 0x00003FFE, 0x1EB2FB13, 0x00000000
- data8 0xB6FD91E328D17791, 0x00003FFE, 0x1CE2CBE2, 0x00000000
- data8 0xB8FBAF4762FB9EE9, 0x00003FFE, 0x1DDC3CBC, 0x00000000
- data8 0xBAFF5AB2133E45FB, 0x00003FFE, 0x1EE9AA34, 0x00000000
- data8 0xBD08A39F580C36BF, 0x00003FFE, 0x9EAEFDC1, 0x00000000
- data8 0xBF1799B67A731083, 0x00003FFE, 0x9DBF517B, 0x00000000
- data8 0xC12C4CCA66709456, 0x00003FFE, 0x1EF88AFB, 0x00000000
- data8 0xC346CCDA24976407, 0x00003FFE, 0x1E03B216, 0x00000000
- data8 0xC5672A115506DADD, 0x00003FFE, 0x1E78AB43, 0x00000000
- data8 0xC78D74C8ABB9B15D, 0x00003FFE, 0x9E7B1747, 0x00000000
- data8 0xC9B9BD866E2F27A3, 0x00003FFE, 0x9EFE3C0E, 0x00000000
- data8 0xCBEC14FEF2727C5D, 0x00003FFE, 0x9D36F837, 0x00000000
- data8 0xCE248C151F8480E4, 0x00003FFE, 0x9DEE53E4, 0x00000000
- data8 0xD06333DAEF2B2595, 0x00003FFE, 0x9E24AE8E, 0x00000000
- data8 0xD2A81D91F12AE45A, 0x00003FFE, 0x1D912473, 0x00000000
- data8 0xD4F35AABCFEDFA1F, 0x00003FFE, 0x1EB243BE, 0x00000000
- data8 0xD744FCCAD69D6AF4, 0x00003FFE, 0x1E669A2F, 0x00000000
- data8 0xD99D15C278AFD7B6, 0x00003FFE, 0x9BBC610A, 0x00000000
- data8 0xDBFBB797DAF23755, 0x00003FFE, 0x1E761035, 0x00000000
- data8 0xDE60F4825E0E9124, 0x00003FFE, 0x9E0BE175, 0x00000000
- data8 0xE0CCDEEC2A94E111, 0x00003FFE, 0x1CCB12A1, 0x00000000
- data8 0xE33F8972BE8A5A51, 0x00003FFE, 0x1D1BFE90, 0x00000000
- data8 0xE5B906E77C8348A8, 0x00003FFE, 0x1DF2F47A, 0x00000000
- data8 0xE8396A503C4BDC68, 0x00003FFE, 0x1EF22F22, 0x00000000
- data8 0xEAC0C6E7DD24392F, 0x00003FFE, 0x9E3F4A29, 0x00000000
- data8 0xED4F301ED9942B84, 0x00003FFE, 0x1EC01A5B, 0x00000000
- data8 0xEFE4B99BDCDAF5CB, 0x00003FFE, 0x1E8CAC3A, 0x00000000
- data8 0xF281773C59FFB13A, 0x00003FFE, 0x9DBB3FAB, 0x00000000
- data8 0xF5257D152486CC2C, 0x00003FFE, 0x1EF73A19, 0x00000000
- data8 0xF7D0DF730AD13BB9, 0x00003FFE, 0x9BB795B5, 0x00000000
- data8 0xFA83B2DB722A033A, 0x00003FFE, 0x1EF84B76, 0x00000000
- data8 0xFD3E0C0CF486C175, 0x00003FFE, 0x9EF5818B, 0x00000000
- data8 0x8000000000000000, 0x00003FFF, 0x00000000, 0x00000000
- data8 0x8164D1F3BC030773, 0x00003FFF, 0x1F77CACA, 0x00000000
- data8 0x82CD8698AC2BA1D7, 0x00003FFF, 0x1EF8A91D, 0x00000000
- data8 0x843A28C3ACDE4046, 0x00003FFF, 0x1E57C976, 0x00000000
- data8 0x85AAC367CC487B15, 0x00003FFF, 0x9EE8DA92, 0x00000000
- data8 0x871F61969E8D1010, 0x00003FFF, 0x1EE85C9F, 0x00000000
- data8 0x88980E8092DA8527, 0x00003FFF, 0x1F3BF1AF, 0x00000000
- data8 0x8A14D575496EFD9A, 0x00003FFF, 0x1D80CA1E, 0x00000000
- data8 0x8B95C1E3EA8BD6E7, 0x00003FFF, 0x9D0373AF, 0x00000000
- data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF, 0x9F167097, 0x00000000
- data8 0x8EA4398B45CD53C0, 0x00003FFF, 0x1EB70051, 0x00000000
- data8 0x9031DC431466B1DC, 0x00003FFF, 0x1F6EB029, 0x00000000
- data8 0x91C3D373AB11C336, 0x00003FFF, 0x1DFD6D8E, 0x00000000
- data8 0x935A2B2F13E6E92C, 0x00003FFF, 0x9EB319B0, 0x00000000
- data8 0x94F4EFA8FEF70961, 0x00003FFF, 0x1EBA2BEB, 0x00000000
- data8 0x96942D3720185A00, 0x00003FFF, 0x1F11D537, 0x00000000
- data8 0x9837F0518DB8A96F, 0x00003FFF, 0x1F0D5A46, 0x00000000
- data8 0x99E0459320B7FA65, 0x00003FFF, 0x9E5E7BCA, 0x00000000
- data8 0x9B8D39B9D54E5539, 0x00003FFF, 0x9F3AAFD1, 0x00000000
- data8 0x9D3ED9A72CFFB751, 0x00003FFF, 0x9E86DACC, 0x00000000
- data8 0x9EF5326091A111AE, 0x00003FFF, 0x9F3EDDC2, 0x00000000
- data8 0xA0B0510FB9714FC2, 0x00003FFF, 0x1E496E3D, 0x00000000
- data8 0xA27043030C496819, 0x00003FFF, 0x9F490BF6, 0x00000000
- data8 0xA43515AE09E6809E, 0x00003FFF, 0x1DD1DB48, 0x00000000
- data8 0xA5FED6A9B15138EA, 0x00003FFF, 0x1E65EBFB, 0x00000000
- data8 0xA7CD93B4E965356A, 0x00003FFF, 0x9F427496, 0x00000000
- data8 0xA9A15AB4EA7C0EF8, 0x00003FFF, 0x1F283C4A, 0x00000000
- data8 0xAB7A39B5A93ED337, 0x00003FFF, 0x1F4B0047, 0x00000000
- data8 0xAD583EEA42A14AC6, 0x00003FFF, 0x1F130152, 0x00000000
- data8 0xAF3B78AD690A4375, 0x00003FFF, 0x9E8367C0, 0x00000000
- data8 0xB123F581D2AC2590, 0x00003FFF, 0x9F705F90, 0x00000000
- data8 0xB311C412A9112489, 0x00003FFF, 0x1EFB3C53, 0x00000000
- data8 0xB504F333F9DE6484, 0x00003FFF, 0x1F32FB13, 0x00000000
-ASM_SIZE_DIRECTIVE(double_sinh_j_table)
-
-.align 32
-.global sinhl#
-
-.section .text
-.proc sinhl#
-.align 32
-
-sinhl:
-#ifdef _LIBC
-.global __ieee754_sinhl
-.type __ieee754_sinhl,@function
-__ieee754_sinhl:
-#endif
-
-// X infinity or NAN?
-// Take invalid fault if enabled
-
+LOCAL_OBJECT_START(sinh_arg_reduction)
+// data8 0xB8AA3B295C17F0BC, 0x00004005 // 64/log2 -- not stored; signif loaded with movl/setf.sig
+ data8 0xB17217F7D1000000, 0x00003FF8 // +0x00: log2/64 high part
+ data8 0xCF79ABC9E3B39804, 0x00003FD0 // +0x10: log2/64 low part
+ data8 0xb174ddc031aec0ea, 0x0000400c // +0x20: Smallest x to overflow (11357.21655)
+LOCAL_OBJECT_END(sinh_arg_reduction)
+
+LOCAL_OBJECT_START(sinh_p_table)
+ data8 0xB08AF9AE78C1239F, 0x00003FDE // P6 ~ 1/13!  (even-index coefficients first)
+ data8 0xB8EF1D28926D8891, 0x00003FEC // P4 ~ 1/9!
+ data8 0x8888888888888412, 0x00003FF8 // P2 ~ 1/5!
+ data8 0xD732377688025BE9, 0x00003FE5 // P5 ~ 1/11! (odd-index coefficients start at +0x30)
+ data8 0xD00D00D00D4D39F2, 0x00003FF2 // P3 ~ 1/7!
+ data8 0xAAAAAAAAAAAAAAAB, 0x00003FFC // P1 ~ 1/3!
+LOCAL_OBJECT_END(sinh_p_table)
+
+LOCAL_OBJECT_START(sinh_ab_table)
+ data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC // A1 ~ 1/3! (A1..A3 are the p_odd coefficients)
+ data8 0x88888888884ECDD5, 0x00003FF8 // A2 ~ 1/5!
+ data8 0xD00D0C6DCC26A86B, 0x00003FF2 // A3 ~ 1/7!
+ data8 0x8000000000000002, 0x00003FFE // B1 ~ 1/2! (B1..B3 are the p_even coefficients)
+ data8 0xAAAAAAAAAA402C77, 0x00003FFA // B2 ~ 1/4!
+ data8 0xB60B6CC96BDB144D, 0x00003FF5 // B3 ~ 1/6!
+LOCAL_OBJECT_END(sinh_ab_table)
+
+LOCAL_OBJECT_START(sinh_j_hi_table)
+ data8 0xB504F333F9DE6484, 0x00003FFE // j = -32: Tjhi = 2^(-32/64); 16-byte double-extended entries
+ data8 0xB6FD91E328D17791, 0x00003FFE
+ data8 0xB8FBAF4762FB9EE9, 0x00003FFE
+ data8 0xBAFF5AB2133E45FB, 0x00003FFE
+ data8 0xBD08A39F580C36BF, 0x00003FFE
+ data8 0xBF1799B67A731083, 0x00003FFE
+ data8 0xC12C4CCA66709456, 0x00003FFE
+ data8 0xC346CCDA24976407, 0x00003FFE
+ data8 0xC5672A115506DADD, 0x00003FFE
+ data8 0xC78D74C8ABB9B15D, 0x00003FFE
+ data8 0xC9B9BD866E2F27A3, 0x00003FFE
+ data8 0xCBEC14FEF2727C5D, 0x00003FFE
+ data8 0xCE248C151F8480E4, 0x00003FFE
+ data8 0xD06333DAEF2B2595, 0x00003FFE
+ data8 0xD2A81D91F12AE45A, 0x00003FFE
+ data8 0xD4F35AABCFEDFA1F, 0x00003FFE
+ data8 0xD744FCCAD69D6AF4, 0x00003FFE
+ data8 0xD99D15C278AFD7B6, 0x00003FFE
+ data8 0xDBFBB797DAF23755, 0x00003FFE
+ data8 0xDE60F4825E0E9124, 0x00003FFE
+ data8 0xE0CCDEEC2A94E111, 0x00003FFE
+ data8 0xE33F8972BE8A5A51, 0x00003FFE
+ data8 0xE5B906E77C8348A8, 0x00003FFE
+ data8 0xE8396A503C4BDC68, 0x00003FFE
+ data8 0xEAC0C6E7DD24392F, 0x00003FFE
+ data8 0xED4F301ED9942B84, 0x00003FFE
+ data8 0xEFE4B99BDCDAF5CB, 0x00003FFE
+ data8 0xF281773C59FFB13A, 0x00003FFE
+ data8 0xF5257D152486CC2C, 0x00003FFE
+ data8 0xF7D0DF730AD13BB9, 0x00003FFE
+ data8 0xFA83B2DB722A033A, 0x00003FFE
+ data8 0xFD3E0C0CF486C175, 0x00003FFE
+ data8 0x8000000000000000, 0x00003FFF // Center of table: j = 0, Tjhi = 1.0
+ data8 0x8164D1F3BC030773, 0x00003FFF
+ data8 0x82CD8698AC2BA1D7, 0x00003FFF
+ data8 0x843A28C3ACDE4046, 0x00003FFF
+ data8 0x85AAC367CC487B15, 0x00003FFF
+ data8 0x871F61969E8D1010, 0x00003FFF
+ data8 0x88980E8092DA8527, 0x00003FFF
+ data8 0x8A14D575496EFD9A, 0x00003FFF
+ data8 0x8B95C1E3EA8BD6E7, 0x00003FFF
+ data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF
+ data8 0x8EA4398B45CD53C0, 0x00003FFF
+ data8 0x9031DC431466B1DC, 0x00003FFF
+ data8 0x91C3D373AB11C336, 0x00003FFF
+ data8 0x935A2B2F13E6E92C, 0x00003FFF
+ data8 0x94F4EFA8FEF70961, 0x00003FFF
+ data8 0x96942D3720185A00, 0x00003FFF
+ data8 0x9837F0518DB8A96F, 0x00003FFF
+ data8 0x99E0459320B7FA65, 0x00003FFF
+ data8 0x9B8D39B9D54E5539, 0x00003FFF
+ data8 0x9D3ED9A72CFFB751, 0x00003FFF
+ data8 0x9EF5326091A111AE, 0x00003FFF
+ data8 0xA0B0510FB9714FC2, 0x00003FFF
+ data8 0xA27043030C496819, 0x00003FFF
+ data8 0xA43515AE09E6809E, 0x00003FFF
+ data8 0xA5FED6A9B15138EA, 0x00003FFF
+ data8 0xA7CD93B4E965356A, 0x00003FFF
+ data8 0xA9A15AB4EA7C0EF8, 0x00003FFF
+ data8 0xAB7A39B5A93ED337, 0x00003FFF
+ data8 0xAD583EEA42A14AC6, 0x00003FFF
+ data8 0xAF3B78AD690A4375, 0x00003FFF
+ data8 0xB123F581D2AC2590, 0x00003FFF
+ data8 0xB311C412A9112489, 0x00003FFF
+ data8 0xB504F333F9DE6484, 0x00003FFF // j = +32: Tjhi = 2^(32/64)
+LOCAL_OBJECT_END(sinh_j_hi_table)
+
+LOCAL_OBJECT_START(sinh_j_lo_table)
+ data4 0x1EB2FB13 // j = -32: Tjlo; 4-byte single-precision entries, same order as j_hi_table
+ data4 0x1CE2CBE2
+ data4 0x1DDC3CBC
+ data4 0x1EE9AA34
+ data4 0x9EAEFDC1
+ data4 0x9DBF517B
+ data4 0x1EF88AFB
+ data4 0x1E03B216
+ data4 0x1E78AB43
+ data4 0x9E7B1747
+ data4 0x9EFE3C0E
+ data4 0x9D36F837
+ data4 0x9DEE53E4
+ data4 0x9E24AE8E
+ data4 0x1D912473
+ data4 0x1EB243BE
+ data4 0x1E669A2F
+ data4 0x9BBC610A
+ data4 0x1E761035
+ data4 0x9E0BE175
+ data4 0x1CCB12A1
+ data4 0x1D1BFE90
+ data4 0x1DF2F47A
+ data4 0x1EF22F22
+ data4 0x9E3F4A29
+ data4 0x1EC01A5B
+ data4 0x1E8CAC3A
+ data4 0x9DBB3FAB
+ data4 0x1EF73A19
+ data4 0x9BB795B5
+ data4 0x1EF84B76
+ data4 0x9EF5818B
+ data4 0x00000000 // Center of table: j = 0, Tjlo = 0
+ data4 0x1F77CACA
+ data4 0x1EF8A91D
+ data4 0x1E57C976
+ data4 0x9EE8DA92
+ data4 0x1EE85C9F
+ data4 0x1F3BF1AF
+ data4 0x1D80CA1E
+ data4 0x9D0373AF
+ data4 0x9F167097
+ data4 0x1EB70051
+ data4 0x1F6EB029
+ data4 0x1DFD6D8E
+ data4 0x9EB319B0
+ data4 0x1EBA2BEB
+ data4 0x1F11D537
+ data4 0x1F0D5A46
+ data4 0x9E5E7BCA
+ data4 0x9F3AAFD1
+ data4 0x9E86DACC
+ data4 0x9F3EDDC2
+ data4 0x1E496E3D
+ data4 0x9F490BF6
+ data4 0x1DD1DB48
+ data4 0x1E65EBFB
+ data4 0x9F427496
+ data4 0x1F283C4A
+ data4 0x1F4B0047
+ data4 0x1F130152
+ data4 0x9E8367C0
+ data4 0x9F705F90
+ data4 0x1EFB3C53
+ data4 0x1F32FB13 // j = +32: Tjlo
+LOCAL_OBJECT_END(sinh_j_lo_table)
-{ .mfi
- alloc r32 = ar.pfs,0,12,4,0
-(p0) fclass.m.unc p6,p0 = f8, 0xe3 //@qnan | @snan | @inf
- mov sinh_GR_all_ones = -1
-}
-;;
+.section .text
+GLOBAL_IEEE754_ENTRY(sinhl)
-{ .mfb
- nop.m 999
-(p6) fma.s0 f8 = f8,f1,f8
-(p6) br.ret.spnt b0 ;;
-}
-
-// Put 0.25 in f9; p6 true if x < 0.25
-// Make constant that will generate inexact when squared
{ .mlx
- setf.sig sinh_FR_all_ones = sinh_GR_all_ones
-(p0) movl r32 = 0x000000000000fffd ;;
-}
-
-{ .mfi
-(p0) setf.exp f9 = r32
-(p0) fclass.m.unc p7,p0 = f8, 0x07 //@zero
- nop.i 999 ;;
-}
-
-{ .mfb
- nop.m 999
-(p0) fmerge.s sinh_FR_X = f0,f8
-(p7) br.ret.spnt b0 ;;
+ getf.exp r_signexp_x = f8 // Get signexp of x, must redo if unorm
+ movl r_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
}
-
-// Identify denormal operands.
-{ .mfi
- nop.m 999
- fclass.m.unc p10,p0 = f8, 0x09 // + denorm
- nop.i 999
-};;
-{ .mfi
- nop.m 999
- fclass.m.unc p11,p0 = f8, 0x0a // - denorm
- nop.i 999
+{ .mlx
+ addl r_ad1 = @ltoff(sinh_arg_reduction), gp
+ movl r_rshf_2to57 = 0x4778000000000000 // 1.10000 2^(63+57)
}
+;;
{ .mfi
- nop.m 999
-(p0) fmerge.s sinh_FR_SGNX = f8,f1
- nop.i 999 ;;
+ ld8 r_ad1 = [r_ad1]
+ fmerge.s f_ABS_X = f0,f8
+ mov r_exp_0_25 = 0x0fffd // Form exponent for 0.25
}
-
{ .mfi
- nop.m 999
-(p0) fcmp.lt.unc.s1 p0,p7 = sinh_FR_X,f9
- nop.i 999 ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p7) br.cond.sptk L(SINH_BY_TBL) ;;
-}
-
-
-L(SINH_BY_POLY):
-
-// POLY cannot overflow so there is no need to call __libm_error_support
-// Set tiny_SAFE (p7) to 1(0) if answer is not tiny
-// Currently we do not use tiny_SAFE. So the setting of tiny_SAFE is
-// commented out.
-//(p0) movl r32 = 0x000000000000fc01
-//(p0) setf.exp f10 = r32
-//(p0) fcmp.lt.unc.s1 p6,p7 = f8,f10
-// Here is essentially the algorithm for SINH_BY_POLY. Care is take for the order
-// of multiplication; and P_1 is not exactly 1/3!, P_2 is not exactly 1/5!, etc.
-// Note that ax = |x|
-// sinh(x) = sign * (series(e^x) - series(e^-x))/2
-// = sign * (ax + ax^3/3! + ax^5/5! + ax^7/7! + ax^9/9! + ax^11/11! + ax^13/13!)
-// = sign * (ax + ax * ( ax^2 * (1/3! + ax^4 * (1/7! + ax^4*1/11!)) )
-// + ax * ( ax^4 * (1/5! + ax^4 * (1/9! + ax^4*1/13!)) ) )
-// = sign * (ax + ax*p_odd + (ax*p_even))
-// = sign * (ax + Y_lo)
-// sinh(x) = sign * (Y_hi + Y_lo)
-// Get the values of P_x from the table
-{ .mfb
-(p0) addl r34 = @ltoff(double_sinh_p_table), gp
-(p10) fma.s0 f8 = f8,f8,f8
-(p10) br.ret.spnt b0
-}
-;;
-
-{ .mfb
- ld8 r34 = [r34]
-(p11) fnma.s0 f8 = f8,f8,f8
-(p11) br.ret.spnt b0
+ nop.m 0
+ fnorm.s1 f_NORM_X = f8
+ mov r_exp_2tom57 = 0xffff-57
}
;;
-// Calculate sinh_FR_X2 = ax*ax and sinh_FR_X4 = ax*ax*ax*ax
-{ .mmf
- nop.m 999
-(p0) ldfe sinh_FR_P1 = [r34],16
-(p0) fma.s1 sinh_FR_X2 = sinh_FR_X, sinh_FR_X, f0 ;;
-}
-
-{ .mmi
-(p0) ldfe sinh_FR_P2 = [r34],16 ;;
-(p0) ldfe sinh_FR_P3 = [r34],16
- nop.i 999 ;;
-}
-
-{ .mmi
-(p0) ldfe sinh_FR_P4 = [r34],16 ;;
-(p0) ldfe sinh_FR_P5 = [r34],16
- nop.i 999 ;;
-}
-
{ .mfi
-(p0) ldfe sinh_FR_P6 = [r34],16
-(p0) fma.s1 sinh_FR_X4 = sinh_FR_X2, sinh_FR_X2, f0
- nop.i 999 ;;
+ setf.d f_RSHF_2TO57 = r_rshf_2to57 // Form const 1.100 * 2^120
+ fclass.m p10,p0 = f8, 0x0b // Test for denorm
+ mov r_exp_mask = 0x1ffff
}
-
-// Calculate sinh_FR_podd = p_odd and sinh_FR_peven = p_even
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_poly_podd_temp1 = sinh_FR_X4, sinh_FR_P5, sinh_FR_P3
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_poly_podd_temp2 = sinh_FR_X4, sinh_FR_poly_podd_temp1, sinh_FR_P1
- nop.i 999
+{ .mlx
+ setf.sig f_INV_LN2_2TO63 = r_sig_inv_ln2 // Form 1/ln2 * 2^63
+ movl r_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_poly_peven_temp1 = sinh_FR_X4, sinh_FR_P6, sinh_FR_P4
- nop.i 999 ;;
+ nop.m 0
+ fclass.m p7,p0 = f8, 0x07 // Test if x=0
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_podd = sinh_FR_X2, sinh_FR_poly_podd_temp2, f0
- nop.i 999
+ setf.exp f_2TOM57 = r_exp_2tom57 // Form 2^-57 for scaling
+ nop.f 0
+ add r_ad3 = 0x90, r_ad1 // Point to ab_table
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_poly_peven_temp2 = sinh_FR_X4, sinh_FR_poly_peven_temp1, sinh_FR_P2
- nop.i 999 ;;
+ setf.d f_RSHF = r_rshf // Form right shift const 1.100 * 2^63
+ fclass.m p6,p0 = f8, 0xe3 // Test if x nan, inf
+ add r_ad4 = 0x2f0, r_ad1 // Point to j_hi_table midpoint
}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_peven = sinh_FR_X4, sinh_FR_poly_peven_temp2, f0
- nop.i 999 ;;
+{ .mib
+ add r_ad2e = 0x20, r_ad1 // Point to p_table
+ nop.i 0
+(p10) br.cond.spnt SINH_DENORM // Branch if x denorm
}
+;;
-// Calculate sinh_FR_Y_lo = ax*p_odd + (ax*p_even)
+// Common path -- return here from SINH_DENORM if x is unnorm
+SINH_COMMON:
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_X, sinh_FR_peven, f0
- nop.i 999 ;;
+ ldfe f_smlst_oflow_input = [r_ad2e],16
+ nop.f 0
+ add r_ad5 = 0x580, r_ad1 // Point to j_lo_table midpoint
}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_X, sinh_FR_podd, sinh_FR_Y_lo_temp
- nop.i 999 ;;
+{ .mib
+ ldfe f_log2by64_hi = [r_ad1],16
+ and r_exp_x = r_exp_mask, r_signexp_x
+(p7) br.ret.spnt b0 // Exit if x=0
}
+;;
-// Calculate sinh_FR_SINH = Y_hi + Y_lo. Note that ax = Y_hi
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_SINH = sinh_FR_X, f1, sinh_FR_Y_lo
- nop.i 999 ;;
-}
-// Dummy multiply to generate inexact
+// Get the A coefficients for SINH_BY_TBL
{ .mfi
- nop.m 999
-(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones
- nop.i 999
+ ldfe f_A1 = [r_ad3],16
+ fcmp.lt.s1 p8,p9 = f8,f0 // Test for x<0
+ cmp.lt p7,p0 = r_exp_x, r_exp_0_25 // Test x < 0.25
}
-
-// Calculate f8 = sign * (Y_hi + Y_lo)
-// Go to return
{ .mfb
- nop.m 999
-(p0) fma.s0 f8 = sinh_FR_SGNX,sinh_FR_SINH,f0
-(p0) br.ret.sptk b0 ;;
-}
-
-
-L(SINH_BY_TBL):
-
-// Now that we are at TBL; so far all we know is that |x| >= 0.25.
-// The first two steps are the same for TBL and EXP, but if we are HUGE
-// we want to leave now.
-// Double-extended:
-// Go to HUGE if |x| >= 2^14, 1000d (register-biased) is e = 14 (true)
-// Double
-// Go to HUGE if |x| >= 2^10, 10009 (register-biased) is e = 10 (true)
-// Single
-// Go to HUGE if |x| >= 2^7, 10006 (register-biased) is e = 7 (true)
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x000000000001000d ;;
-}
-
-{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
+ add r_ad2o = 0x30, r_ad2e // Point to p_table odd coeffs
+(p6) fma.s0 f8 = f8,f1,f0 // Result for x nan, inf
+(p6) br.ret.spnt b0 // Exit for x nan, inf
}
+;;
+// Calculate X2 = ax*ax for SINH_BY_POLY
{ .mfi
- nop.m 999
-(p0) fcmp.ge.unc.s1 p6,p7 = sinh_FR_X,f9
- nop.i 999 ;;
+ ldfe f_log2by64_lo = [r_ad1],16
+ nop.f 0
+ nop.i 0
}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p6) br.cond.spnt L(SINH_HUGE) ;;
+{ .mfb
+ ldfe f_A2 = [r_ad3],16
+ fma.s1 f_X2 = f_NORM_X, f_NORM_X, f0
+(p7) br.cond.spnt SINH_BY_POLY
}
+;;
-// r32 = 1
-// r34 = N-1
-// r35 = N
-// r36 = j
-// r37 = N+1
-
-// TBL can never overflow
-// sinh(x) = sinh(B+R)
-// = sinh(B)cosh(R) + cosh(B)sinh(R)
-//
-// ax = |x| = M*log2/64 + R
-// B = M*log2/64
-// M = 64*N + j
-// We will calcualte M and get N as (M-j)/64
-// The division is a shift.
-// exp(B) = exp(N*log2 + j*log2/64)
-// = 2^N * 2^(j*log2/64)
-// sinh(B) = 1/2(e^B -e^-B)
-// = 1/2(2^N * 2^(j*log2/64) - 2^-N * 2^(-j*log2/64))
-// sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64))
-// cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64))
-// 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32
-// Tjhi is double-extended (80-bit) and Tjlo is single(32-bit)
-// R = ax - M*log2/64
-// R = ax - M*log2_by_64_hi - M*log2_by_64_lo
-// exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...)
-// = 1 + p_odd + p_even
-// where the p_even uses the A coefficients and the p_even uses the B coefficients
-// So sinh(R) = 1 + p_odd + p_even -(1 -p_odd -p_even)/2 = p_odd
-// cosh(R) = 1 + p_even
-// sinh(B) = S_hi + S_lo
-// cosh(B) = C_hi
-// sinh(x) = sinh(B)cosh(R) + cosh(B)sinh(R)
+// Here if |x| >= 0.25
+SINH_BY_TBL:
// ******************************************************
-// STEP 1 (TBL and EXP)
+// STEP 1 (TBL and EXP) - Argument reduction
// ******************************************************
// Get the following constants.
-// f9 = Inv_log2by64
-// f10 = log2by64_hi
-// f11 = log2by64_lo
-
-{ .mmi
-(p0) adds r32 = 0x1,r0
-(p0) addl r34 = @ltoff(double_sinh_arg_reduction), gp
- nop.i 999
-}
-;;
-
-{ .mmi
- ld8 r34 = [r34]
- nop.m 999
- nop.i 999
-}
-;;
+// Inv_log2by64
+// log2by64_hi
+// log2by64_lo
// We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and
// put them in an exponent.
-// sinh_FR_spos = 2^(N-1) and sinh_FR_sneg = 2^(-N-1)
-// r39 = 0xffff + (N-1) = 0xffff +N -1
-// r40 = 0xffff - (N +1) = 0xffff -N -1
-
-{ .mlx
- nop.m 999
-(p0) movl r38 = 0x000000000000fffe ;;
-}
-
-{ .mmi
-(p0) ldfe sinh_FR_Inv_log2by64 = [r34],16 ;;
-(p0) ldfe sinh_FR_log2by64_hi = [r34],16
- nop.i 999 ;;
-}
-
-{ .mbb
-(p0) ldfe sinh_FR_log2by64_lo = [r34],16
- nop.b 999
- nop.b 999 ;;
-}
+// f_spos = 2^(N-1) and f_sneg = 2^(-N-1)
+// 0xffff + (N-1) = 0xffff +N -1
+// 0xffff - (N +1) = 0xffff -N -1
-// Get the A coefficients
-// f9 = A_1
-// f10 = A_2
-// f11 = A_3
-{ .mmi
- nop.m 999
-(p0) addl r34 = @ltoff(double_sinh_ab_table), gp
- nop.i 999
-}
-;;
+// Calculate M and keep it as integer and floating point.
+// M = round-to-integer(ax*Inv_log2by64)
+// f_M = M = round-to-integer(ax/(log2/64))
+// Put the integer representation of M in r_M
+// and the floating point representation of M in f_M
+// Get the remaining A,B coefficients
{ .mmi
- ld8 r34 = [r34]
- nop.m 999
- nop.i 999
+ ldfe f_A3 = [r_ad3],16
+ nop.m 0
+ nop.i 0
}
;;
-
-// Calculate M and keep it as integer and floating point.
-// f38 = M = round-to-integer(x*Inv_log2by64)
-// sinh_FR_M = M = truncate(ax/(log2/64))
-// Put the significand of M in r35
-// and the floating point representation of M in sinh_FR_M
-
+.pred.rel "mutex",p8,p9
+// Use constant (1.100*2^(63+(63-6))) to get rounded M into rightmost significand
+// |x| * 64 * 1/ln2 * 2^(63-6) + 1.1000 * 2^(63+(63-6))
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_M = sinh_FR_X, sinh_FR_Inv_log2by64, f0
- nop.i 999
+(p8) mov r_signexp_sgnx_0_5 = 0x2fffe // signexp of -0.5
+ fma.s1 f_M_temp = f_ABS_X, f_INV_LN2_2TO63, f_RSHF_2TO57
+(p9) mov r_signexp_sgnx_0_5 = 0x0fffe // signexp of +0.5
}
+;;
+// Test for |x| >= overflow limit
{ .mfi
-(p0) ldfe sinh_FR_A1 = [r34],16
- nop.f 999
- nop.i 999 ;;
+ ldfe f_B1 = [r_ad3],16
+ fcmp.ge.s1 p6,p0 = f_ABS_X, f_smlst_oflow_input
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fcvt.fx.s1 sinh_FR_M_temp = sinh_FR_M
- nop.i 999 ;;
+ ldfe f_B2 = [r_ad3],16
+ nop.f 0
+ mov r_exp_32 = 0x10004
}
+;;
-{ .mfi
- nop.m 999
-(p0) fnorm.s1 sinh_FR_M = sinh_FR_M_temp
- nop.i 999 ;;
+// Subtract RSHF constant to get rounded M as a floating point value
+// M_temp * 2^-(63-6) - 1.1000 * 2^63
+{ .mfb
+ ldfe f_B3 = [r_ad3],16
+ fms.s1 f_M = f_M_temp, f_2TOM57, f_RSHF
+(p6) br.cond.spnt SINH_HUGE // Branch if result will overflow
}
+;;
{ .mfi
-(p0) getf.sig r35 = sinh_FR_M_temp
- nop.f 999
- nop.i 999 ;;
+ getf.sig r_M = f_M_temp
+ nop.f 0
+ cmp.ge p7,p6 = r_exp_x, r_exp_32 // Test if x >= 32
}
+;;
-// M is still in r35. Calculate j. j is the signed extension of the six lsb of M. It
+// Calculate j. j is the signed extension of the six lsb of M. It
// has a range of -32 thru 31.
-// r35 = M
-// r36 = j
-
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p0) and r36 = 0x3f, r35 ;;
-}
// Calculate R
-// f13 = f44 - f12*f10 = ax - M*log2by64_hi
-// f14 = f13 - f8*f11 = R = (ax - M*log2by64_hi) - M*log2by64_lo
-
-{ .mfi
- nop.m 999
-(p0) fnma.s1 sinh_FR_R_temp = sinh_FR_M, sinh_FR_log2by64_hi, sinh_FR_X
- nop.i 999
-}
+// ax - M*log2by64_hi
+// R = (ax - M*log2by64_hi) - M*log2by64_lo
{ .mfi
-(p0) ldfe sinh_FR_A2 = [r34],16
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fnma.s1 f_R_temp = f_M, f_log2by64_hi, f_ABS_X
+ and r_j = 0x3f, r_M
}
+;;
-{ .mfi
- nop.m 999
-(p0) fnma.s1 sinh_FR_R = sinh_FR_M, sinh_FR_log2by64_lo, sinh_FR_R_temp
- nop.i 999
+{ .mii
+ nop.m 0
+ shl r_jshf = r_j, 0x2 // Shift j so can sign extend it
+;;
+ sxt1 r_jshf = r_jshf
}
+;;
-// Get the B coefficients
-// f15 = B_1
-// f32 = B_2
-// f33 = B_3
-
-{ .mmi
-(p0) ldfe sinh_FR_A3 = [r34],16 ;;
-(p0) ldfe sinh_FR_B1 = [r34],16
- nop.i 999 ;;
+{ .mii
+ nop.m 0
+ shr r_j = r_jshf, 0x2 // Now j has range -32 to 31
+ nop.i 0
}
+;;
{ .mmi
-(p0) ldfe sinh_FR_B2 = [r34],16 ;;
-(p0) ldfe sinh_FR_B3 = [r34],16
- nop.i 999 ;;
-}
-
-{ .mii
- nop.m 999
-(p0) shl r34 = r36, 0x2 ;;
-(p0) sxt1 r37 = r34 ;;
+ shladd r_ad_J_hi = r_j, 4, r_ad4 // pointer to Tjhi
+ sub r_Mmj = r_M, r_j // M-j
+ sub r_mj = r0, r_j // Form -j
}
+;;
-// ******************************************************
-// STEP 2 (TBL and EXP)
-// ******************************************************
-// Calculate Rsquared and Rcubed in preparation for p_even and p_odd
-// f12 = R*R*R
-// f13 = R*R
-// f14 = R <== from above
-
+// The TBL and EXP branches are merged and predicated
+// If TBL, p6 true, 0.25 <= |x| < 32
+// If EXP, p7 true, 32 <= |x| < overflow_limit
+//
+// N = (M-j)/64
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Rsq = sinh_FR_R, sinh_FR_R, f0
-(p0) shr r36 = r37, 0x2 ;;
+ ldfe f_Tjhi = [r_ad_J_hi]
+ fnma.s1 f_R = f_M, f_log2by64_lo, f_R_temp
+ shr r_N = r_Mmj, 0x6 // N = (M-j)/64
}
-
-// r34 = M-j = r35 - r36
-// r35 = N = (M-j)/64
-
-{ .mii
-(p0) sub r34 = r35, r36
- nop.i 999 ;;
-(p0) shr r35 = r34, 0x6 ;;
+{ .mfi
+ shladd r_ad_mJ_hi = r_mj, 4, r_ad4 // pointer to Tmjhi
+ nop.f 0
+ shladd r_ad_mJ_lo = r_mj, 2, r_ad5 // pointer to Tmjlo
}
+;;
-{ .mii
-(p0) sub r40 = r38, r35
-(p0) adds r37 = 0x1, r35
-(p0) add r39 = r38, r35 ;;
+{ .mfi
+ sub r_2mNm1 = r_signexp_sgnx_0_5, r_N // signexp sgnx*2^(-N-1)
+ nop.f 0
+ shladd r_ad_J_lo = r_j, 2, r_ad5 // pointer to Tjlo
}
-
-// Get the address of the J table, add the offset,
-// addresses are sinh_AD_mJ and sinh_AD_J, get the T value
-// f32 = T(j)_hi
-// f33 = T(j)_lo
-// f34 = T(-j)_hi
-// f35 = T(-j)_lo
-
-{ .mmi
-(p0) sub r34 = r35, r32
-(p0) addl r37 = @ltoff(double_sinh_j_table), gp
- nop.i 999
+{ .mfi
+ ldfe f_Tmjhi = [r_ad_mJ_hi]
+ nop.f 0
+ add r_2Nm1 = r_signexp_sgnx_0_5, r_N // signexp sgnx*2^(N-1)
}
;;
-{ .mmi
- ld8 r37 = [r37]
- nop.m 999
- nop.i 999
+{ .mmf
+ ldfs f_Tmjlo = [r_ad_mJ_lo]
+ setf.exp f_sneg = r_2mNm1 // Form sgnx * 2^(-N-1)
+ nop.f 0
}
;;
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Rcub = sinh_FR_Rsq, sinh_FR_R, f0
- nop.i 999
+{ .mmf
+ ldfs f_Tjlo = [r_ad_J_lo]
+ setf.exp f_spos = r_2Nm1 // Form sgnx * 2^(N-1)
+ nop.f 0
}
+;;
// ******************************************************
-// STEP 3 Now decide if we need to branch to EXP
+// STEP 2 (TBL and EXP)
// ******************************************************
-// Put 32 in f9; p6 true if x < 32
-// Go to EXP if |x| >= 32
+// Calculate Rsquared and Rcubed in preparation for p_even and p_odd
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x0000000000010004 ;;
+{ .mmf
+ nop.m 0
+ nop.m 0
+ fma.s1 f_Rsq = f_R, f_R, f0
}
+;;
-// Calculate p_even
-// f34 = B_2 + Rsq *B_3
-// f35 = B_1 + Rsq*f34 = B_1 + Rsq * (B_2 + Rsq *B_3)
-// f36 = p_even = Rsq * f35 = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3))
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_peven_temp1 = sinh_FR_Rsq, sinh_FR_B3, sinh_FR_B2
- nop.i 999 ;;
-}
+// Calculate p_even
+// B_2 + Rsq *B_3
+// B_1 + Rsq * (B_2 + Rsq *B_3)
+// p_even = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3))
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_peven_temp2 = sinh_FR_Rsq, sinh_FR_peven_temp1, sinh_FR_B1
- nop.i 999
+ nop.m 0
+ fma.s1 f_peven_temp1 = f_Rsq, f_B3, f_B2
+ nop.i 0
}
-
// Calculate p_odd
-// f34 = A_2 + Rsq *A_3
-// f35 = A_1 + Rsq * (A_2 + Rsq *A_3)
-// f37 = podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3))
-
+// A_2 + Rsq *A_3
+// A_1 + Rsq * (A_2 + Rsq *A_3)
+// podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3))
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_podd_temp1 = sinh_FR_Rsq, sinh_FR_A3, sinh_FR_A2
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 f_podd_temp1 = f_Rsq, f_A3, f_A2
+ nop.i 0
}
+;;
{ .mfi
-(p0) setf.exp sinh_FR_N_temp1 = r39
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 f_Rcub = f_Rsq, f_R, f0
+ nop.i 0
}
+;;
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_peven = sinh_FR_Rsq, sinh_FR_peven_temp2, f0
- nop.i 999
-}
+//
+// If TBL,
+// Calculate S_hi and S_lo, and C_hi
+// SC_hi_temp = sneg * Tmjhi
+// S_hi = spos * Tjhi - SC_hi_temp
+// S_hi = spos * Tjhi - (sneg * Tmjhi)
+// C_hi = spos * Tjhi + SC_hi_temp
+// C_hi = spos * Tjhi + (sneg * Tmjhi)
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_podd_temp2 = sinh_FR_Rsq, sinh_FR_podd_temp1, sinh_FR_A1
- nop.i 999 ;;
+ nop.m 0
+(p6) fma.s1 f_SC_hi_temp = f_sneg, f_Tmjhi, f0
+ nop.i 0
}
+;;
+// If TBL,
+// S_lo_temp3 = sneg * Tmjlo
+// S_lo_temp4 = spos * Tjlo - S_lo_temp3
+// S_lo_temp4 = spos * Tjlo -(sneg * Tmjlo)
{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+(p6) fma.s1 f_S_lo_temp3 = f_sneg, f_Tmjlo, f0
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_podd = sinh_FR_podd_temp2, sinh_FR_Rcub, sinh_FR_R
- nop.i 999
-}
-
-// sinh_GR_mj contains the table offset for -j
-// sinh_GR_j contains the table offset for +j
-// p6 is true when j <= 0
-
-{ .mlx
-(p0) setf.exp sinh_FR_N_temp2 = r40
-(p0) movl r40 = 0x0000000000000020 ;;
+ nop.m 0
+ fma.s1 f_peven_temp2 = f_Rsq, f_peven_temp1, f_B1
+ nop.i 0
}
-
{ .mfi
-(p0) sub sinh_GR_mJ = r40, r36
-(p0) fmerge.se sinh_FR_spos = sinh_FR_N_temp1, f1
-(p0) adds sinh_GR_J = 0x20, r36 ;;
-}
-
-{ .mii
- nop.m 999
-(p0) shl sinh_GR_mJ = sinh_GR_mJ, 5 ;;
-(p0) add sinh_AD_mJ = r37, sinh_GR_mJ ;;
-}
-
-{ .mmi
- nop.m 999
-(p0) ldfe sinh_FR_Tmjhi = [sinh_AD_mJ],16
-(p0) shl sinh_GR_J = sinh_GR_J, 5 ;;
+ nop.m 0
+ fma.s1 f_podd_temp2 = f_Rsq, f_podd_temp1, f_A1
+ nop.i 0
}
+;;
+// If EXP,
+// Compute sgnx * 2^(N-1) * Tjhi and sgnx * 2^(N-1) * Tjlo
{ .mfi
-(p0) ldfs sinh_FR_Tmjlo = [sinh_AD_mJ],16
-(p0) fcmp.lt.unc.s1 p0,p7 = sinh_FR_X,f9
-(p0) add sinh_AD_J = r37, sinh_GR_J ;;
-}
-
-{ .mmi
-(p0) ldfe sinh_FR_Tjhi = [sinh_AD_J],16 ;;
-(p0) ldfs sinh_FR_Tjlo = [sinh_AD_J],16
- nop.i 999 ;;
-}
-
-{ .mfb
- nop.m 999
-(p0) fmerge.se sinh_FR_sneg = sinh_FR_N_temp2, f1
-(p7) br.cond.spnt L(SINH_BY_EXP) ;;
+ nop.m 0
+(p7) fma.s1 f_Tjhi_spos = f_Tjhi, f_spos, f0
+ nop.i 0
}
-
{ .mfi
- nop.m 999
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+(p7) fma.s1 f_Tjlo_spos = f_Tjlo, f_spos, f0
+ nop.i 0
}
-
-// ******************************************************
-// If NOT branch to EXP
-// ******************************************************
-// Calculate S_hi and S_lo
-// sinh_FR_S_hi_temp = sinh_FR_sneg * sinh_FR_Tmjhi
-// sinh_FR_S_hi = sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi_temp
-// sinh_FR_S_hi = sinh_FR_spos * sinh_FR_Tjhi - (sinh_FR_sneg * sinh_FR_Tmjlo)
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_S_hi_temp = sinh_FR_sneg, sinh_FR_Tmjhi, f0
- nop.i 999 ;;
+ nop.m 0
+(p6) fms.s1 f_S_hi = f_spos, f_Tjhi, f_SC_hi_temp
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fms.s1 sinh_FR_S_hi = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_S_hi_temp
- nop.i 999
+ nop.m 0
+(p6) fma.s1 f_C_hi = f_spos, f_Tjhi, f_SC_hi_temp
+ nop.i 0
}
-
-// Calculate C_hi
-// sinh_FR_C_hi_temp1 = sinh_FR_sneg * sinh_FR_Tmjhi
-// sinh_FR_C_hi = sinh_FR_spos * sinh_FR_Tjhi + sinh_FR_C_hi_temp1
-
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_C_hi_temp1 = sinh_FR_sneg, sinh_FR_Tmjhi, f0
- nop.i 999 ;;
+ nop.m 0
+(p6) fms.s1 f_S_lo_temp4 = f_spos, f_Tjlo, f_S_lo_temp3
+ nop.i 0
}
-
-// sinh_FR_S_lo_temp1 = sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi
-// sinh_FR_S_lo_temp2 = -sinh_FR_sneg * sinh_FR_Tmjlo + (sinh_FR_spos * sinh_FR_Tjhi - sinh_FR_S_hi)
-// sinh_FR_S_lo_temp2 = -sinh_FR_sneg * sinh_FR_Tmjlo + (sinh_FR_S_lo_temp1 )
+;;
{ .mfi
- nop.m 999
-(p0) fms.s1 sinh_FR_S_lo_temp1 = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_S_hi
- nop.i 999
+ nop.m 0
+ fma.s1 f_peven = f_Rsq, f_peven_temp2, f0
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_C_hi = sinh_FR_spos, sinh_FR_Tjhi, sinh_FR_C_hi_temp1
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 f_podd = f_podd_temp2, f_Rcub, f_R
+ nop.i 0
}
+;;
+
+// If TBL,
+// S_lo_temp1 = spos * Tjhi - S_hi
+// S_lo_temp2 = -sneg * Tmjlo + S_lo_temp1
+// S_lo_temp2 = -sneg * Tmjlo + (spos * Tjhi - S_hi)
{ .mfi
- nop.m 999
-(p0) fnma.s1 sinh_FR_S_lo_temp2 = sinh_FR_sneg, sinh_FR_Tmjhi, sinh_FR_S_lo_temp1
- nop.i 999
+ nop.m 0
+(p6) fms.s1 f_S_lo_temp1 = f_spos, f_Tjhi, f_S_hi
+ nop.i 0
}
-
-// sinh_FR_S_lo_temp1 = sinh_FR_sneg * sinh_FR_Tmjlo
-// sinh_FR_S_lo_temp3 = sinh_FR_spos * sinh_FR_Tjlo - sinh_FR_S_lo_temp1
-// sinh_FR_S_lo_temp3 = sinh_FR_spos * sinh_FR_Tjlo -(sinh_FR_sneg * sinh_FR_Tmjlo)
-// sinh_FR_S_lo = sinh_FR_S_lo_temp3 + sinh_FR_S_lo_temp2
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_S_lo_temp1 = sinh_FR_sneg, sinh_FR_Tmjlo, f0
- nop.i 999 ;;
+ nop.m 0
+(p6) fnma.s1 f_S_lo_temp2 = f_sneg, f_Tmjhi, f_S_lo_temp1
+ nop.i 0
}
+;;
-/////////// BUG FIX fma to fms -TK
+// If EXP,
+// Y_hi = sgnx * 2^(N-1) * Tjhi
+// Y_lo = sgnx * 2^(N-1) * Tjhi * (p_odd + p_even) + sgnx * 2^(N-1) * Tjlo
{ .mfi
- nop.m 999
-(p0) fms.s1 sinh_FR_S_lo_temp3 = sinh_FR_spos, sinh_FR_Tjlo, sinh_FR_S_lo_temp1
- nop.i 999 ;;
+ nop.m 0
+(p7) fma.s1 f_Y_lo_temp = f_peven, f1, f_podd
+ nop.i 0
}
+;;
+// If TBL,
+// S_lo = S_lo_temp4 + S_lo_temp2
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_S_lo = sinh_FR_S_lo_temp3, f1, sinh_FR_S_lo_temp2
- nop.i 999 ;;
+ nop.m 0
+(p6) fma.s1 f_S_lo = f_S_lo_temp4, f1, f_S_lo_temp2
+ nop.i 0
}
+;;
+// If TBL,
// Y_hi = S_hi
// Y_lo = C_hi*p_odd + (S_hi*p_even + S_lo)
-// sinh_FR_Y_lo_temp = sinh_FR_S_hi * sinh_FR_peven + sinh_FR_S_lo
-// sinh_FR_Y_lo = sinh_FR_C_hi * sinh_FR_podd + sinh_FR_Y_lo_temp
-
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_S_hi, sinh_FR_peven, sinh_FR_S_lo
- nop.i 999 ;;
+ nop.m 0
+(p6) fma.s1 f_Y_lo_temp = f_S_hi, f_peven, f_S_lo
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_C_hi, sinh_FR_podd, sinh_FR_Y_lo_temp
- nop.i 999 ;;
+ nop.m 0
+(p7) fma.s1 f_Y_lo = f_Tjhi_spos, f_Y_lo_temp, f_Tjlo_spos
+ nop.i 0
}
-
-// sinh_FR_SINH = Y_hi + Y_lo
-// f8 = answer = sinh_FR_SGNX * sinh_FR_SINH
+;;
// Dummy multiply to generate inexact
{ .mfi
- nop.m 999
-(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones
- nop.i 999
+ nop.m 0
+ fmpy.s0 f_tmp = f_B2, f_B2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p6) fma.s1 f_Y_lo = f_C_hi, f_podd, f_Y_lo_temp
+ nop.i 0
}
+;;
+
+// f8 = answer = Y_hi + Y_lo
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_SINH = sinh_FR_S_hi, f1, sinh_FR_Y_lo
- nop.i 999 ;;
+ nop.m 0
+(p7) fma.s0 f8 = f_Y_lo, f1, f_Tjhi_spos
+ nop.i 0
}
+;;
+// f8 = answer = Y_hi + Y_lo
{ .mfb
- nop.m 999
-(p0) fma.s0 f8 = sinh_FR_SGNX, sinh_FR_SINH,f0
-(p0) br.ret.sptk b0 ;;
+ nop.m 0
+(p6) fma.s0 f8 = f_Y_lo, f1, f_S_hi
+ br.ret.sptk b0 // Exit for SINH_BY_TBL and SINH_BY_EXP
}
+;;
-L(SINH_BY_EXP):
-
-// When p7 is true, we know that an overflow is not going to happen
-// When p7 is false, we must check for possible overflow
-// p7 is the over_SAFE flag
-// Y_hi = Tjhi
-// Y_lo = Tjhi * (p_odd + p_even) +Tjlo
-// Scale = sign * 2^(N-1)
-// sinh_FR_Y_lo = sinh_FR_Tjhi * (sinh_FR_peven + sinh_FR_podd)
-// sinh_FR_Y_lo = sinh_FR_Tjhi * (sinh_FR_Y_lo_temp )
+// Here if 0 < |x| < 0.25
+SINH_BY_POLY:
+{ .mmf
+ ldfe f_P6 = [r_ad2e],16
+ ldfe f_P5 = [r_ad2o],16
+ nop.f 0
+}
+;;
-{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Y_lo_temp = sinh_FR_peven, f1, sinh_FR_podd
- nop.i 999
+{ .mmi
+ ldfe f_P4 = [r_ad2e],16
+ ldfe f_P3 = [r_ad2o],16
+ nop.i 0
}
+;;
-// Now we are in EXP. This is the only path where an overflow is possible
-// but not for certain. So this is the only path where over_SAFE has any use.
-// r34 still has N-1
-// There is a danger of double-extended overflow if N-1 > 16382 = 0x3ffe
-// There is a danger of double overflow if N-1 > 0x3fe = 1022
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x0000000000003ffe ;;
+{ .mmi
+ ldfe f_P2 = [r_ad2e],16
+ ldfe f_P1 = [r_ad2o],16
+ nop.i 0
}
+;;
{ .mfi
-(p0) cmp.gt.unc p0,p7 = r34, r32
-(p0) fmerge.s sinh_FR_SCALE = sinh_FR_SGNX, sinh_FR_spos
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 f_X3 = f_NORM_X, f_X2, f0
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_Y_lo = sinh_FR_Tjhi, sinh_FR_Y_lo_temp, sinh_FR_Tjlo
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 f_X4 = f_X2, f_X2, f0
+ nop.i 0
}
+;;
-// f8 = answer = scale * (Y_hi + Y_lo)
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_SINH_temp = sinh_FR_Y_lo, f1, sinh_FR_Tjhi
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 f_poly65 = f_X2, f_P6, f_P5
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.s0 f44 = sinh_FR_SCALE, sinh_FR_SINH_temp, f0
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 f_poly43 = f_X2, f_P4, f_P3
+ nop.i 0
}
+;;
-// Dummy multiply to generate inexact
{ .mfi
- nop.m 999
-(p7) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 f_poly21 = f_X2, f_P2, f_P1
+ nop.i 0
}
+;;
-// If over_SAFE is set, return
-{ .mfb
- nop.m 999
-(p7) fmerge.s f8 = f44,f44
-(p7) br.ret.sptk b0 ;;
+{ .mfi
+ nop.m 0
+ fma.s1 f_poly6543 = f_X4, f_poly65, f_poly43
+ nop.i 0
}
-
-// Else see if we overflowed
-// S0 user supplied status
-// S2 user supplied status + WRE + TD (Overflows)
-// If WRE is set then an overflow will not occur in EXP.
-// The input value that would cause a register (WRE) value to overflow is about 2^15
-// and this input would go into the HUGE path.
-// Answer with WRE is in f43.
+;;
{ .mfi
- nop.m 999
-(p0) fsetc.s2 0x7F,0x42
- nop.i 999;;
+ nop.m 0
+ fma.s1 f_poly6to1 = f_X4, f_poly6543, f_poly21
+ nop.i 0
}
+;;
+// Dummy multiply to generate inexact
{ .mfi
- nop.m 999
-(p0) fma.s2 f43 = sinh_FR_SCALE, sinh_FR_SINH_temp, f0
- nop.i 999 ;;
+ nop.m 0
+ fmpy.s0 f_tmp = f_P6, f_P6
+ nop.i 0
}
-
-// 13FFF => 13FFF -FFFF = 4000(true)
-// 4000 + 3FFF = 7FFF, which is 1 more that the exponent of the largest
-// long double (7FFE). So 0 13FFF 8000000000000000 is one ulp more than
-// largest long double in register bias
-// Now set p8 if the answer with WRE is greater than or equal this value
-// Also set p9 if the answer with WRE is less than or equal to negative this value
-
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x00000000013FFF ;;
+{ .mfb
+ nop.m 0
+ fma.s0 f8 = f_poly6to1, f_X3, f_NORM_X
+ br.ret.sptk b0 // Exit SINH_BY_POLY
}
+;;
-{ .mmf
- nop.m 999
-(p0) setf.exp f41 = r32
-(p0) fsetc.s2 0x7F,0x40 ;;
-}
-{ .mfi
- nop.m 999
-(p0) fcmp.ge.unc.s1 p8, p0 = f43, f41
- nop.i 999
+// Here if x denorm or unorm
+SINH_DENORM:
+// Determine if x really a denorm and not a unorm
+{ .mmf
+ getf.exp r_signexp_x = f_NORM_X
+ mov r_exp_denorm = 0x0c001 // Real denorms have exp < this
+ fmerge.s f_ABS_X = f0, f_NORM_X
}
+;;
{ .mfi
- nop.m 999
-(p0) fmerge.ns f42 = f41, f41
- nop.i 999 ;;
+ nop.m 0
+ fcmp.eq.s0 p10,p0 = f8, f0 // Set denorm flag
+ nop.i 0
}
+;;
-// The error tag for overflow is 126
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p8) mov r47 = 126 ;;
+// Set p8 if really a denorm
+{ .mmi
+ and r_exp_x = r_exp_mask, r_signexp_x
+;;
+ cmp.lt p8,p9 = r_exp_x, r_exp_denorm
+ nop.i 0
}
+;;
+// Identify denormal operands.
{ .mfb
- nop.m 999
-(p0) fcmp.le.unc.s1 p9, p0 = f43, f42
-(p8) br.cond.spnt L(SINH_ERROR_SUPPORT) ;;
-}
-
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p9) mov r47 = 126
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p9) br.cond.spnt L(SINH_ERROR_SUPPORT) ;;
+ nop.m 0
+(p8) fcmp.ge.unc.s1 p6,p7 = f8, f0 // Test sign of denorm
+(p9) br.cond.sptk SINH_COMMON // Return to main path if x unorm
}
+;;
-// Dummy multiply to generate inexact
{ .mfi
- nop.m 999
-(p0) fmpy.s0 sinh_FR_tmp = sinh_FR_all_ones, sinh_FR_all_ones
- nop.i 999 ;;
+ nop.m 0
+(p6) fma.s0 f8 = f8,f8,f8 // If x +denorm, result=x+x^2
+ nop.i 0
}
-
{ .mfb
- nop.m 999
-(p0) fmerge.s f8 = f44,f44
-(p0) br.ret.sptk b0 ;;
+ nop.m 0
+(p7) fnma.s0 f8 = f8,f8,f8 // If x -denorm, result=x-x^2
+ br.ret.sptk b0 // Exit if x denorm
}
+;;
-L(SINH_HUGE):
-
-// for SINH_HUGE, put 24000 in exponent; take sign from input; add 1
-// SAFE: SAFE is always 0 for HUGE
-{ .mlx
- nop.m 999
-(p0) movl r32 = 0x0000000000015dbf ;;
+// Here if |x| >= overflow limit
+SINH_HUGE:
+// for SINH_HUGE, put 24000 in exponent; take sign from input
+{ .mmi
+ mov r_exp_huge = 0x15dbf
+;;
+ setf.exp f_huge = r_exp_huge
+ nop.i 0
}
+;;
+.pred.rel "mutex",p8,p9
{ .mfi
-(p0) setf.exp f9 = r32
- nop.f 999
- nop.i 999 ;;
+ alloc r32 = ar.pfs,0,5,4,0
+(p8) fnma.s1 f_signed_hi_lo = f_huge, f1, f1
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 sinh_FR_signed_hi_lo = sinh_FR_SGNX, f9, f1
- nop.i 999 ;;
+ nop.m 0
+(p9) fma.s1 f_signed_hi_lo = f_huge, f1, f1
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s0 f44 = sinh_FR_signed_hi_lo, f9, f0
-(p0) mov r47 = 126
-}
-.endp sinhl
-ASM_SIZE_DIRECTIVE(sinhl)
-#ifdef _LIBC
-ASM_SIZE_DIRECTIVE(__ieee754_sinhl)
-#endif
-
-// Stack operations when calling error support.
-// (1) (2) (3) (call) (4)
-// sp -> + psp -> + psp -> + sp -> +
-// | | | |
-// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8
-// | | | |
-// | <-GR_Y Y2->| Y2 ->| <- GR_Y |
-// | | | |
-// | | <- GR_X X1 ->| |
-// | | | |
-// sp-64 -> + sp -> + sp -> + +
-// save ar.pfs save b0 restore gp
-// save gp restore ar.pfs
-
-.proc __libm_error_region
-__libm_error_region:
-L(SINH_ERROR_SUPPORT):
+ nop.m 0
+ fma.s0 f_pre_result = f_signed_hi_lo, f_huge, f0
+ mov GR_Parameter_TAG = 126
+}
+;;
+
+GLOBAL_IEEE754_END(sinhl)
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
-// (1)
{ .mfi
- add GR_Parameter_Y=-32,sp // Parameter 2 value
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
+ add sp=-64,sp // Create new stack
nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ mov GR_SAVE_GP=gp // Save gp
};;
-
-// (2)
{ .mmi
- stfe [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
- add GR_Parameter_X = 16,sp // Parameter 1 address
+ stfe [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0=b0 // Save b0
};;
.body
-// (3)
{ .mib
- stfe [GR_Parameter_X] = f8 // STORE Parameter 1 on stack
+ stfe [GR_Parameter_X] = f8 // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
nop.b 0
}
{ .mib
- stfe [GR_Parameter_Y] = f44 // STORE Parameter 3 on stack
+ stfe [GR_Parameter_Y] = f_pre_result // STORE Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
+
{ .mmi
- nop.m 0
- nop.m 0
add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
-// (4)
{ .mmi
- ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
+ ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
- add sp = 64,sp // Restore stack pointer
- mov b0 = GR_SAVE_B0 // Restore return address
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
};;
+
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
- mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
- br.ret.sptk b0 // Return
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
+
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/e_sqrt.S b/sysdeps/ia64/fpu/e_sqrt.S
index dd057f58ee..0e208b3de1 100644
--- a/sysdeps/ia64/fpu/e_sqrt.S
+++ b/sysdeps/ia64/fpu/e_sqrt.S
@@ -1,11 +1,11 @@
.file "sqrt.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
-//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,27 +35,28 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-// ********************************************************************
+//********************************************************************
// History
-// ********************************************************************
-// 2/02/00 Initial version
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+//********************************************************************
+// 02/02/00 Initial version
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
-// ********************************************************************
+//********************************************************************
//
// Function: Combined sqrt(x), where
// _
// sqrt(x) = |x, for double precision x values
//
-// ********************************************************************
+//********************************************************************
//
// Accuracy: Correctly Rounded
//
-// ********************************************************************
+//********************************************************************
//
// Resources Used:
//
@@ -68,7 +69,7 @@
//
// Predicate Registers: p6, p7, p8
//
-// *********************************************************************
+//*********************************************************************
//
// IEEE Special Conditions:
//
@@ -78,15 +79,13 @@
// sqrt(+/-0) = +/-0
// sqrt(negative) = QNaN and error handling is called
//
-// *********************************************************************
+//*********************************************************************
//
// Implementation:
//
// Modified Newton-Raphson Algorithm
//
-// *********************************************************************
-
-#include "libm_support.h"
+//*********************************************************************
GR_SAVE_PFS = r33
GR_SAVE_B0 = r34
@@ -98,19 +97,7 @@ GR_Parameter_RESULT = r39
.section .text
-.proc sqrt#
-.global sqrt#
-.align 64
-
-sqrt:
-#ifdef _LIBC
-.global __sqrt
-.type __sqrt,@function
-__sqrt:
-.global __ieee754_sqrt
-.type __ieee754_sqrt,@function
-__ieee754_sqrt:
-#endif
+GLOBAL_IEEE754_ENTRY(sqrt)
{ .mfi
alloc r32= ar.pfs,0,5,4,0
frsqrta.s0 f7,p6=f8
@@ -255,7 +242,7 @@ __ieee754_sqrt:
{ .mfb
nop.m 0
- (p0) mov f8 = f7
+ mov f8 = f7
(p8) br.ret.sptk b0 ;;
}
{ .mfb
@@ -264,13 +251,7 @@ __ieee754_sqrt:
(p7) br.cond.sptk __libm_error_region ;;
}
// END DOUBLE PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM
-.endp sqrt#
-ASM_SIZE_DIRECTIVE(sqrt)
-#ifdef _LIBC
-ASM_SIZE_DIRECTIVE(__sqrt)
-ASM_SIZE_DIRECTIVE(__ieee754_sqrt)
-#endif
-
+GLOBAL_IEEE754_END(sqrt)
// Stack operations when calling error support.
// (1) (2) (3) (call) (4)
// sp -> + psp -> + psp -> + sp -> +
@@ -286,8 +267,7 @@ ASM_SIZE_DIRECTIVE(__ieee754_sqrt)
// save gp restore ar.pfs
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
//
// This branch includes all those special values that are not negative,
@@ -352,8 +332,9 @@ __libm_error_region:
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
+
+
.type __libm_error_support#,@function
diff --git a/sysdeps/ia64/fpu/e_sqrtf.S b/sysdeps/ia64/fpu/e_sqrtf.S
index 1799845d6d..bee0df7414 100644
--- a/sysdeps/ia64/fpu/e_sqrtf.S
+++ b/sysdeps/ia64/fpu/e_sqrtf.S
@@ -1,10 +1,10 @@
.file "sqrtf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,27 +35,29 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-// *********************************************************************
+//*********************************************************************
// History:
//
-// 2/02/00 Initial version
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 Initial version
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
-// *********************************************************************
+//*********************************************************************
//
// Function: Combined sqrtf(x), where
// _
// sqrtf(x) = |x, for single precision x values
//
-// ********************************************************************
+//********************************************************************
//
// Accuracy: Correctly Rounded
//
-// ********************************************************************
+//********************************************************************
//
// Resources Used:
//
@@ -68,7 +70,7 @@
//
// Predicate Registers: p6, p7, p8
//
-// ********************************************************************
+//********************************************************************
//
// IEEE Special Conditions:
//
@@ -78,15 +80,14 @@
// sqrtf(+/-0) = +/-0
// sqrtf(negative) = QNaN and error handling is called
//
-// ********************************************************************
+//********************************************************************
//
// Implementation:
//
// Modified Newton-Raphson Algorithm
//
-// ********************************************************************
+//********************************************************************
-#include "libm_support.h"
GR_SAVE_B0 = r34
GR_SAVE_PFS = r33
@@ -102,21 +103,8 @@ FR_Y = f0
FR_RESULT = f8
-
.section .text
-.proc sqrtf#
-.global sqrtf#
-.align 64
-
-sqrtf:
-#ifdef _LIBC
-.global __sqrtf
-.type __sqrtf,@function
-__sqrtf:
-.global __ieee754_sqrtf
-.type __ieee754_sqrtf,@function
-__ieee754_sqrtf:
-#endif
+GLOBAL_IEEE754_ENTRY(sqrtf)
{ .mlx
// BEGIN SINGLE PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM
alloc r32= ar.pfs,0,5,4,0
@@ -197,7 +185,7 @@ __ieee754_sqrtf:
// Step (10)
// d1 = a - S1 * S1 in f9
(p6) fnma.s1 f9=f7,f7,f8
- nop.i 0;;;
+ nop.i 0;;
} { .mfb
nop.m 0
// Step (11)
@@ -207,27 +195,20 @@ __ieee754_sqrtf:
// END SINGLE PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM
} { .mfb
nop.m 0
- (p0) mov f8 = f7
+ mov f8 = f7
(p8) br.ret.sptk b0 ;;
}
//
// This branch includes all those special values that are not negative,
// with the result equal to frcpa(x)
//
-.endp sqrtf
-ASM_SIZE_DIRECTIVE(sqrtf)
-#ifdef _LIBC
-ASM_SIZE_DIRECTIVE(__sqrtf)
-ASM_SIZE_DIRECTIVE(__ieee754_sqrtf)
-#endif
-
+GLOBAL_IEEE754_END(sqrtf)
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mii
add GR_Parameter_Y=-32,sp // Parameter 2 value
-(p0) mov GR_Parameter_TAG = 50
+ mov GR_Parameter_TAG = 50
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
@@ -271,8 +252,7 @@ __libm_error_region:
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
diff --git a/sysdeps/ia64/fpu/e_sqrtl.S b/sysdeps/ia64/fpu/e_sqrtl.S
index e41148243a..ec1475626d 100644
--- a/sysdeps/ia64/fpu/e_sqrtl.S
+++ b/sysdeps/ia64/fpu/e_sqrtl.S
@@ -1,10 +1,10 @@
.file "sqrtl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,23 +35,25 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-// ********************************************************************
+//********************************************************************
//
// History:
-// 2/02/00 (hand-optimized)
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 (hand-optimized)
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
-// ********************************************************************
+//********************************************************************
//
// Function: Combined sqrtl(x), where
// _
// sqrtl(x) = |x, for double-extended precision x values
//
-// ********************************************************************
+//********************************************************************
//
// Resources Used:
//
@@ -64,7 +66,7 @@
//
// Predicate Registers: p6, p7, p8
//
-// ********************************************************************
+//********************************************************************
//
// IEEE Special Conditions:
//
@@ -74,15 +76,13 @@
// sqrtl(+/-0) = +/-0
// sqrtl(negative) = QNaN and error handling is called
//
-// ********************************************************************
+//********************************************************************
//
// Implementation:
//
// Modified Newton-Raphson Algorithm
//
-// ********************************************************************
-
-#include "libm_support.h"
+//********************************************************************
GR_SAVE_PFS = r33
GR_SAVE_B0 = r34
@@ -97,19 +97,7 @@ FR_Y = f0
FR_RESULT = f8
.section .text
-.proc sqrtl#
-.global sqrtl#
-.align 64
-
-sqrtl:
-#ifdef _LIBC
-.global __sqrtl
-.type __sqrtl,@function
-__sqrtl:
-.global __ieee754_sqrtl
-.type __ieee754_sqrtl,@function
-__ieee754_sqrtl:
-#endif
+GLOBAL_IEEE754_ENTRY(sqrtl)
{ .mlx
alloc r32= ar.pfs,0,5,4,0
// exponent of +1/2 in r2
@@ -151,7 +139,7 @@ alloc r32= ar.pfs,0,5,4,0
}
{ .mfi
nop.m 0
- (p0) mov f15=f8
+ mov f15=f8
nop.i 0;;
} { .mfi
nop.m 0
@@ -221,8 +209,8 @@ alloc r32= ar.pfs,0,5,4,0
(p6) br.ret.sptk b0 ;;
}
{ .mfb
- (p0) mov GR_Parameter_TAG = 48
- (p0) mov f8 = f7
+ mov GR_Parameter_TAG = 48
+ mov f8 = f7
(p8) br.ret.sptk b0 ;;
}
//
@@ -232,15 +220,8 @@ alloc r32= ar.pfs,0,5,4,0
// END DOUBLE EXTENDED PRECISION MINIMUM LATENCY SQUARE ROOT ALGORITHM
-.endp sqrtl#
-ASM_SIZE_DIRECTIVE(sqrtl)
-#ifdef _LIBC
-ASM_SIZE_DIRECTIVE(__sqrtl)
-ASM_SIZE_DIRECTIVE(__ieee754_sqrtl)
-#endif
-
-.proc __libm_error_region
-__libm_error_region:
+GLOBAL_IEEE754_END(sqrtl)
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
@@ -288,7 +269,6 @@ __libm_error_region:
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region#)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/gen_import_file_list b/sysdeps/ia64/fpu/gen_import_file_list
new file mode 100644
index 0000000000..a02bb3155b
--- /dev/null
+++ b/sysdeps/ia64/fpu/gen_import_file_list
@@ -0,0 +1,94 @@
+#!/bin/sh
+#
+# Print the import list for the libm sources, one entry per line:
+#
+#     <name-pattern> <libm_dir>/<source-file> <destination-file>
+#
+# Usage: gen_import_file_list LIBM_DIR
+
+libm_dir=$1
+
+# entry NAME SRC DST -- print a single list entry.
+entry() {
+    echo "$1 $libm_dir/$2 $3"
+}
+
+# import NAME SRC DST -- entry for an assembly source (.s in, .S out).
+import() {
+    entry "$1" "$2" "$3"
+}
+
+# import_c NAME SRC DST -- entry for a C source or header.
+import_c() {
+    entry "$1" "$2" "$3"
+}
+
+import_c DUMMY libm_support.h libm_support.h
+import_c DUMMY libm_error.c libm_error.c
+import_c scalblnf scalblnf.c s_scalblnf.c
+
+# Functions whose "", f and l variants each map to their own e_*.S file.
+for func in acos acosh asin atanh cosh exp2 exp10 fmod log2 pow \
+            remainder scalb sinh sqrt
+do
+    for sfx in "" f l; do
+        import "${func}${sfx}" "${func}${sfx}.s" "e_${func}${sfx}.S"
+    done
+done
+
+# atan2 and exp: only "" and f here; the l names are matched by the
+# combined atanl.s and expl_m1.s entries below.
+for func in atan2 exp; do
+    for sfx in "" f; do
+        import "${func}${sfx}" "${func}${sfx}.s" "e_${func}${sfx}.S"
+    done
+done
+import "atan" atan.s s_atan.S
+import "atanf" atanf.s s_atanf.S
+import "atan(2)?l" atanl.s s_atanl.S
+import "exp(m1)?l" expl_m1.s s_expm1l.S
+
+# log, tgamma and hypot; NAME may be a pattern such as "log(10)?".
+for sfx in "" f l; do
+    import "log(10)?${sfx}" "log${sfx}.s" "e_log${sfx}.S"
+    import "tgamma${sfx}" "tgamma${sfx}.s" "w_tgamma${sfx}.S"
+    import "(hypot|cabs)${sfx}" "hypot${sfx}.s" "e_hypot${sfx}.S"
+done
+
+# Functions whose "", f and l variants each map to their own s_*.S file.
+for func in asinh cbrt ceil erf erfc fabs floor \
+            ilogb log1p logb modf nearbyint nextafter nexttoward \
+            rint round significand fdim fma fmax tanh trunc
+do
+    for sfx in "" f l; do
+        import "${func}${sfx}" "${func}${sfx}.s" "s_${func}${sfx}.S"
+    done
+done
+
+for sfx in "" f l; do
+    import "(tan|cot)${sfx}" "tancot${sfx}.s" "s_tan${sfx}.S"
+done
+
+for sfx in "" f l; do
+    import "(sin|cos)${sfx}" "sincos${sfx}.s" "s_cos${sfx}.S"
+    import_c "frexp${sfx}" "frexp${sfx}.c" "s_frexp${sfx}.c"
+    import_c "ldexp${sfx}" "ldexp${sfx}.c" "s_ldexp${sfx}.c"
+    import_c "scalbn${sfx}" "scalbn${sfx}.c" "s_scalbn${sfx}.c"
+done
+
+import expm1 exp_m1.s s_expm1.S
+import expm1f expf_m1.s s_expm1f.S
+
+for func in frexp frexpf frexpl reduce; do
+    import "__libm_${func}" "libm_${func}.s" "libm_${func}.S"
+done
+
+for sfx in "" f l; do
+    import "__libm_ldexp${sfx}" "libm_ldexp${sfx}.s" "s_libm_ldexp${sfx}.S"
+    import "(__libm_)?(sincos|cis)${sfx}" "libm_sincos${sfx}.s" "libm_sincos${sfx}.S"
+    import "__libm_lgamma${sfx}" "libm_lgamma${sfx}.s" "libm_lgamma${sfx}.S"
+    import "__libm_scalbn${sfx}" "libm_scalbn${sfx}.s" "s_libm_scalbn${sfx}.S"
+done
+import __libm_scalblnf libm_scalblnf.s libm_scalblnf.S
+import "__libm_(sin|cos|sincos)_large" libm_sincos_large.s \
+       libm_sincos_large.S
diff --git a/sysdeps/ia64/fpu/import_check b/sysdeps/ia64/fpu/import_check
new file mode 100644
index 0000000000..21176f578d
--- /dev/null
+++ b/sysdeps/ia64/fpu/import_check
@@ -0,0 +1,114 @@
+#!/bin/sh
+#
+# Sanity-check the objects built from the imported assembly sources.
+#
+# Usage: import_check OBJDIR
+#
+# Reads import_file_list from the current directory.  For every
+# destination file ending in ".S", OBJDIR/<name>.o is examined:
+#   - the sizes of its OBJECT symbols must add up to the size of the
+#     first section whose name starts with ".rodata";
+#   - every undefined symbol must start with "__";
+#   - an object with more than one defined symbol in "nm -g" output must
+#     not have a strong one whose name lacks the "__" prefix.
+# Exits with status 1 if any check failed, 0 otherwise.
+
+objdir="$1"
+
+num_errors=0
+
+# check_syms: apply the symbol-name checks to the "nm -g" output of
+# $file, read from stdin.  Updates num_errors, so it must run in the
+# current shell rather than at the end of a pipeline.
+check_syms() {
+    global_count=0
+    entry_count=0
+    while read value type name; do
+        if [ "$value" = "U" ]; then
+            # Undefined symbols have no value column, so the name was read
+            # into $type.  They must start with double-underscore.
+            name=$type
+            case "$name" in
+                __*) ;;
+                *)
+                    printf "%s:\tError: undefined reference %s doesn't start with \"__\".\n" \
+                        "$(basename "$file")" "$name"
+                    num_errors=$(($num_errors + 1))
+                    ;;
+            esac
+            continue
+        fi
+
+        # Every defined symbol counts as an entry point; those that are
+        # not weak (type W) and lack the "__" prefix are strong globals.
+        entry_count=$(($entry_count + 1))
+        if [ "$type" != "W" ]; then
+            case "$name" in
+                __*) ;;
+                *) global_count=$(($global_count + 1)) ;;
+            esac
+        fi
+    done
+    if [ "$entry_count" -gt 1 ] && [ "$global_count" -gt 0 ]; then
+        printf "%s:\tError: detected %s strong  global and %s entry-points.\n" \
+            "$(basename "$file")" "$global_count" "$entry_count"
+        num_errors=$(($num_errors + 1))
+    fi
+}
+
+# check_file FILE: run the .rodata size check and the symbol checks on
+# one object file.  Sets $file, which the error messages refer to.
+check_file() {
+    file=$1
+    # Find the first section line containing " .rodata"; the first word
+    # of the line after it is taken as the hexadecimal section size.
+    # The size stays 0 if no such section is found.
+    size=$(readelf -S "$file" | \
+        (sz=0; while read line; do
+            case "$line" in
+                (*" .rodata"*)
+                    read sz rest
+                    break
+                    ;;
+            esac
+          done;
+          printf "%d" "0x$sz"))
+
+    # Build "s1+s2+...+0" from the 4th field of every OBJECT symbol line
+    # and let the shell add it up.
+    summands=$(readelf -s "$file" | grep -F " OBJECT " | tr -s ' ' |
+        cut -f4 -d' ' | sed 's,$,+,')0
+    sum=$(($summands))
+    if [ "$sum" != "$size" ]; then
+        printf "%s:\tError: sum of objects=%s bytes, .rodata size=%s bytes\n" \
+            "$(basename "$file")" "$sum" "$size"
+        num_errors=$(($num_errors + 1))
+    fi
+
+    # Feed nm's output through a temporary file, not a pipe, so that
+    # check_syms runs in this shell and its num_errors updates are kept.
+    tmp=$(mktemp "${TMPDIR:-/tmp}/syms.XXXXXX") || exit 1
+    nm -g "$file" > "$tmp"
+    check_syms < "$tmp"
+    rm -f "$tmp"
+}
+
+# do_checks: read "pattern source destination" records from stdin and
+# check the object corresponding to every destination ending in ".S".
+do_checks() {
+    echo "Note: 1 error expected in w_tgammal.o due to 64-byte alignment-padding."
+    while read func_pattern src_file dst_file; do
+        case "$dst_file" in
+            *.S) check_file "$objdir/${dst_file%.S}.o" ;;
+        esac
+    done
+}
+
+do_checks < import_file_list
+
+if [ "$num_errors" -gt 0 ]; then
+    echo "FAILURE: Detected $num_errors error(s)."
+    exit 1
+fi
+echo SUCCESS
+exit 0
diff --git a/sysdeps/ia64/fpu/import_diffs b/sysdeps/ia64/fpu/import_diffs
new file mode 100644
index 0000000000..147280d5fd
--- /dev/null
+++ b/sysdeps/ia64/fpu/import_diffs
@@ -0,0 +1,7 @@
+#!/bin/sh
+do_diffs() {
+ while read func_pattern src_file dst_file; do
+ diff -up $src_file $dst_file
+ done
+}
+do_diffs < import_file_list
diff --git a/sysdeps/ia64/fpu/import_file.awk b/sysdeps/ia64/fpu/import_file.awk
new file mode 100644
index 0000000000..c6335dc1df
--- /dev/null
+++ b/sysdeps/ia64/fpu/import_file.awk
@@ -0,0 +1,191 @@
+# Rewrite one file of the Intel libm sources into the form used here.
+#
+# Set on the command line:
+#   FUNC          regular expression, matched anchored as ^FUNC$ against
+#                 symbol names; the matching function is wrapped in
+#                 *_ENTRY()/*_END() macro calls.
+#   LICENSE_ONLY  when "y", only the file header is rewritten and the
+#                 remainder of the input is copied unchanged.
+
+# Load the next input line into $0.  Running out of input here means the
+# file lacks the expected layout; fail instead of looping on a stale $0.
+function next_line() {
+  if ((getline) <= 0) {
+    print "import_file.awk: unexpected end of input" > "/dev/stderr";
+    exit 1;
+  }
+}
+
+BEGIN {
+  # Functions emitted with LOCAL_LIBM_* and GLOBAL_IEEE754_* macros;
+  # any other function matching FUNC gets GLOBAL_LIBM_*.
+  local_funcs=("^(" \
+    "cis|cisf|cisl" \
+    "|cabs|cabsf|cabsl" \
+    "|cot|cotf|cotl" \
+    ")$");
+  ieee754_funcs=("^(" \
+    "atan2|atan2f|atan2l|atanl" \
+    "|cos|cosf|cosl" \
+    "|cosh|coshf|coshl" \
+    "|exp|expf|expl" \
+    "|exp10|exp10f|exp10l" \
+    "|expm1|expm1f|expm1l" \
+    "|fmod|fmodf|fmodl" \
+    "|hypot|hypotf|hypotl" \
+    "|fabs|fabsf|fabsl" \
+    "|floor|floorf|floorl" \
+    "|log1p|log1pf|log1pl" \
+    "|log|log10|log10f|log10l|log2l|logf|logl" \
+    "|remainder|remainderf|remainderl" \
+    "|rint|rintf|rintl" \
+    "|scalb|scalbf|scalbl" \
+    "|sin|sinf|sinl" \
+    "|sincos|sincosf|sincosl" \
+    "|sinh|sinhf|sinhl" \
+    "|sqrt|sqrtf|sqrtl" \
+    "|tan|tanf|tanl" \
+    ")$");
+
+  # Copy everything before the cvs_id line; the line itself is dropped.
+  next_line();
+  while (!match($0, "^/[/*] static char cvs_id")) {
+    print;
+    next_line();
+  }
+  # Copy up to the WARRANTY DISCLAIMER line, which is dropped as well.
+  next_line();
+  while (!match($0, "^// WARRANTY DISCLAIMER")) {
+    print;
+    next_line();
+  }
+  # Load the line after it; only LICENSE_ONLY mode (below) prints that
+  # line, otherwise it is not copied.
+  next_line();
+  printf \
+"// Redistribution and use in source and binary forms, with or without\n" \
+"// modification, are permitted provided that the following conditions are\n" \
+"// met:\n" \
+"//\n" \
+"// * Redistributions of source code must retain the above copyright\n" \
+"// notice, this list of conditions and the following disclaimer.\n" \
+"//\n" \
+"// * Redistributions in binary form must reproduce the above copyright\n" \
+"// notice, this list of conditions and the following disclaimer in the\n" \
+"// documentation and/or other materials provided with the distribution.\n" \
+"//\n" \
+"// * The name of Intel Corporation may not be used to endorse or promote\n" \
+"// products derived from this software without specific prior written\n" \
+"// permission.\n\n";
+  if (LICENSE_ONLY == "y") {
+    # Copy the rest of the input, starting with the current line.
+    do {
+      print;
+    } while ((getline) > 0);
+  }
+}
+
+# A .data directive becomes the RODATA macro.
+/^[.]data/ {
+  print "RODATA";
+  next;
+}
+# Labels that look like data tables: bracket the table with
+# LOCAL_OBJECT_START/END.  Lines up to the first data directive are
+# copied, then every line that is a data directive or contains "//".
+# The first line after the table is not copied.
+/^([a-zA-Z_0-9]*_(tb[l0-9]|Tt|[tT]able|data|low|coeffs|constants|CONSTANTS|reduction|Stirling)(_?([1-9cdimpqstPQT]+|tail))?|(Constants|Poly|coeff)_.+|(double_sin_?cos|double_cis)[fl]?_.+):/ {
+  table_name=substr($1,1,length($1)-1);
+  printf "LOCAL_OBJECT_START(%s)\n", table_name;
+  next_line();
+  while (!match($0, "^[ \t]*data")) {
+    print;
+    next_line();
+  }
+  while (match($0, "(//|^[ \t]*data)")) {
+    print;
+    # A table may be the last thing in the file.
+    if ((getline) <= 0)
+      break;
+  }
+  printf "LOCAL_OBJECT_END(%s)\n\n", table_name;
+  next;
+}
+# __libm_error_region / __libm_callout: replace .proc and the line after
+# it with LOCAL_LIBM_ENTRY, and .endp with LOCAL_LIBM_END.
+/^[.]proc[ \t]+__libm_(error_region|callout)/ {
+  printf "LOCAL_LIBM_ENTRY(%s)\n", $2;
+  getline;
+  next;
+}
+/^[.]endp[ \t]+__libm_(error_region|callout)/ {
+  printf "LOCAL_LIBM_END(%s)\n", $2;
+  next;
+}
+# Drop the .global line of a function matching FUNC.
+/^[.]global/ {
+  split($2, part, "#");
+  name=part[1];
+  if (match(name, "^"FUNC"$")) {
+    next;
+  }
+}
+# .proc of a function matching FUNC: emit <type>_ENTRY, skip input through
+# the function's label, copy the body up to .endp, emit <type>_END.
+# Neither the .endp line nor the line following it is copied.
+/^[.]proc/ {
+  split($2, part, "#");
+  name=part[1];
+  if (match(name, "^"FUNC"$")) {
+    if (match(name, ieee754_funcs)) {
+      type="GLOBAL_IEEE754";
+    } else if (match (name, local_funcs)) {
+      type="LOCAL_LIBM";
+    } else {
+      type="GLOBAL_LIBM";
+    }
+    printf "%s_ENTRY(%s)\n", type, name;
+    next_line();
+    while (!match($0, "^"name"#?:")) {
+      next_line();
+    }
+    next_line();
+    while (!match($0, "^[.]endp")) {
+      print;
+      next_line();
+    }
+    getline;
+    printf "%s_END(%s)\n", type, name;
+    # exp10* additionally gets a pow10* weak alias.
+    if (match(name, "^exp10[fl]?$")) {
+      t=substr(name,6)
+      printf "weak_alias (exp10%s, pow10%s)\n", t, t
+    }
+    next;
+  }
+}
+# A label matching FUNC that reaches this rule: same treatment as above,
+# always with the GLOBAL_LIBM macros.
+/^[a-zA-Z_]+:/ {
+  split($1, part, ":");
+  name=part[1];
+  if (match(name, "^"FUNC"$")) {
+    printf "GLOBAL_LIBM_ENTRY(%s)\n", name;
+    next_line();
+    while (!match($0, "^"name"#?:")) {
+      next_line();
+    }
+    next_line();
+    while (!match($0, "^[.]endp")) {
+      print;
+      next_line();
+    }
+    getline;
+    printf "GLOBAL_LIBM_END(%s)\n", name;
+    next;
+  }
+}
+
+# Everything else is copied unchanged.
+{ print }
+
diff --git a/sysdeps/ia64/fpu/import_intel_libm b/sysdeps/ia64/fpu/import_intel_libm
new file mode 100644
index 0000000000..752ba37478
--- /dev/null
+++ b/sysdeps/ia64/fpu/import_intel_libm
@@ -0,0 +1,42 @@
+#!/bin/sh
+
+# Notes:
+
+# We don't import copysign finite, fpclassify, isinf, isnan, and signbit
+# since our own versions are nicer and just as correct and fast (except
+# perhaps that they don't handle non-finite arguments well?).
+#
+# Also, leave out cabs for now since it doesn't seem overridable in
+# glibc.
+
+libm_dir=$1
+
+import_s() {
+ # $1 = name
+ # $2 = source file-name
+ # $3 = destination file-name
+ echo "Importing $1 from $2 -> $3"
+ awk -f import_file.awk FUNC=$1 $2 > $3
+}
+
+import_c() {
+ # $1 = name
+ # $2 = source file-name
+ # $3 = destination file-name
+ echo "Importing $1 from $2 -> $3"
+ awk -f import_file.awk LICENSE_ONLY=y $2 > $3
+}
+
+do_imports() {
+ while read func_pattern src_file dst_file; do
+ if [ "$(expr $src_file : '.*\(c\)$')" = "c" ]; then
+ import_c "$func_pattern" "$src_file" "$dst_file"
+ else
+ import_s "$func_pattern" "$src_file" "$dst_file"
+ fi
+ done
+}
+
+./gen_import_file_list $libm_dir > import_file_list
+
+do_imports < import_file_list
diff --git a/sysdeps/ia64/fpu/libm-symbols.h b/sysdeps/ia64/fpu/libm-symbols.h
new file mode 100644
index 0000000000..3d0eb37d2a
--- /dev/null
+++ b/sysdeps/ia64/fpu/libm-symbols.h
@@ -0,0 +1,64 @@
+#include <sysdep.h>
+#undef ret /* get rid of the stupid "ret" macro; it breaks br.ret */
+
+/* Support for compatible assembler handling: with __ELF__ these emit .size/.type for NAME, otherwise they expand to nothing. */
+
+#ifdef __ELF__
+# define ASM_SIZE_DIRECTIVE(name) .size name,.-name
+# define ASM_TYPE_DIRECTIVE(name,T) .type name,T
+#else
+# define ASM_SIZE_DIRECTIVE(name)
+# define ASM_TYPE_DIRECTIVE(name,T)
+#endif
+/* Procedure without a visibility directive: ENTRY emits .proc and the label, END emits .endp and the symbol size. */
+#define LOCAL_LIBM_ENTRY(name) \
+ .proc name; \
+ name:
+
+#define LOCAL_LIBM_END(name) \
+ .endp name; \
+ ASM_SIZE_DIRECTIVE(name)
+
+/* Data objects: RODATA expands to .rodata; START emits the label and marks it @object, END emits its size. */
+#define RODATA .rodata
+#define LOCAL_OBJECT_START(name) \
+ name:; \
+ ASM_TYPE_DIRECTIVE(name, @object)
+#define LOCAL_OBJECT_END(name) \
+ ASM_SIZE_DIRECTIVE(name)
+/* As LOCAL_LIBM_ENTRY/END, with NAME additionally declared .global. */
+#define GLOBAL_LIBM_ENTRY(name) \
+ LOCAL_LIBM_ENTRY(name); \
+ .global name
+#define GLOBAL_LIBM_END(name) LOCAL_LIBM_END(name)
+/* GLOBAL_LIBM_ENTRY/END applied to __libm_NAME; the trailing .global repeats the one GLOBAL_LIBM_ENTRY already emits. */
+#define INTERNAL_LIBM_ENTRY(name) \
+ GLOBAL_LIBM_ENTRY(__libm_##name); \
+ .global __libm_##name
+#define INTERNAL_LIBM_END(name) GLOBAL_LIBM_END(__libm_##name)
+/* Procedure NAME after .align 32, plus a .global label __NAME at the same place; END passes both to weak_alias (defined elsewhere), marks __NAME .hidden and emits its size/type. */
+#define WEAK_LIBM_ENTRY(name) \
+ .align 32; \
+ LOCAL_LIBM_ENTRY(name); \
+ .global __##name; \
+ __##name:
+#define WEAK_LIBM_END(name) \
+ weak_alias (__##name, name); \
+ .hidden __##name; \
+ LOCAL_LIBM_END(name); \
+ ASM_SIZE_DIRECTIVE(__##name); \
+ ASM_TYPE_DIRECTIVE(__##name, @function)
+/* WEAK_LIBM_ENTRY/END plus one more label, __ieee754_NAME, declared .global and .hidden, with its own size/type directives. */
+#define GLOBAL_IEEE754_ENTRY(name) \
+ WEAK_LIBM_ENTRY(name); \
+ .global __ieee754_##name; \
+ .hidden __ieee754_##name; \
+ __ieee754_##name:
+#define GLOBAL_IEEE754_END(name) \
+ WEAK_LIBM_END(name); \
+ ASM_SIZE_DIRECTIVE(__ieee754_##name); \
+ ASM_TYPE_DIRECTIVE(__ieee754_##name, @function)
+/* With ASSEMBLER defined and NOT_IN_libc undefined, wrap __libm_error_support in HIDDEN_JUMPTARGET (defined elsewhere). */
+#if defined ASSEMBLER && !defined NOT_IN_libc
+# define __libm_error_support HIDDEN_JUMPTARGET(__libm_error_support)
+#endif
diff --git a/sysdeps/ia64/fpu/libm_atan2_reg.S b/sysdeps/ia64/fpu/libm_atan2_reg.S
deleted file mode 100644
index 5649670d19..0000000000
--- a/sysdeps/ia64/fpu/libm_atan2_reg.S
+++ /dev/null
@@ -1,1234 +0,0 @@
-.file "libm_atan2_reg.s"
-
-// Copyright (C) 2000, 2001, Intel Corporation
-// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// * The name of Intel Corporation may not be used to endorse or promote
-// products derived from this software without specific prior written
-// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
-//
-// History
-//==============================================================
-// 2/02/00: Initial version
-// 4/04/00 Unwind support added
-
-#include "libm_support.h"
-
-.data
-
-.align 64
-ASM_TYPE_DIRECTIVE(Constants_atan#,@object)
-Constants_atan:
-data4 0x54442D18, 0x3FF921FB, 0x248D3132, 0x3E000000
-// double pi/2, single lo_pi/2, two**(-3)
-data4 0xAAAAAAA3, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // P_1
-data4 0xCCCC54B2, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // P_2
-data4 0x47E4D0C2, 0x92492492, 0x0000BFFC, 0x00000000 // P_3
-data4 0x58870889, 0xE38E38E0, 0x00003FFB, 0x00000000 // P_4
-data4 0x290149F8, 0xBA2E895B, 0x0000BFFB, 0x00000000 // P_5
-data4 0x250F733D, 0x9D88E6D4, 0x00003FFB, 0x00000000 // P_6
-data4 0xFB8745A0, 0x884E51FF, 0x0000BFFB, 0x00000000 // P_7
-data4 0x394396BD, 0xE1C7412B, 0x00003FFA, 0x00000000 // P_8
-data4 0xAAAAA52F, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // Q_1
-data4 0xC75B60D3, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // Q_2
-data4 0x011F1940, 0x924923AD, 0x0000BFFC, 0x00000000 // Q_3
-data4 0x2A5F89BD, 0xE36F716D, 0x00003FFB, 0x00000000 // Q_4
-// Entries Tbl_hi (double precision)
-// B = 1+Index/16+1/32 Index = 0
-// Entries Tbl_lo (single precision)
-// B = 1+Index/16+1/32 Index = 0
-data4 0xA935BD8E, 0x3FE9A000, 0x23ACA08F, 0x00000000
-// Entries Tbl_hi (double precision) Index = 0,1,...,15
-// B = 2^(-1)*(1+Index/16+1/32)
-// Entries Tbl_lo (single precision)
-// Index = 0,1,...,15 B = 2^(-1)*(1+Index/16+1/32)
-data4 0x7F175A34, 0x3FDE77EB, 0x238729EE, 0x00000000
-data4 0x73C1A40B, 0x3FE0039C, 0x249334DB, 0x00000000
-data4 0x5B5B43DA, 0x3FE0C614, 0x22CBA7D1, 0x00000000
-data4 0x88BE7C13, 0x3FE1835A, 0x246310E7, 0x00000000
-data4 0xE2CC9E6A, 0x3FE23B71, 0x236210E5, 0x00000000
-data4 0x8406CBCA, 0x3FE2EE62, 0x2462EAF5, 0x00000000
-data4 0x1CD41719, 0x3FE39C39, 0x24B73EF3, 0x00000000
-data4 0x5B795B55, 0x3FE44506, 0x24C11260, 0x00000000
-data4 0x5BB6EC04, 0x3FE4E8DE, 0x242519EE, 0x00000000
-data4 0x1F732FBA, 0x3FE587D8, 0x24D4346C, 0x00000000
-data4 0x115D7B8D, 0x3FE6220D, 0x24ED487B, 0x00000000
-data4 0x920B3D98, 0x3FE6B798, 0x2495FF1E, 0x00000000
-data4 0x8FBA8E0F, 0x3FE74897, 0x223D9531, 0x00000000
-data4 0x289FA093, 0x3FE7D528, 0x242B0411, 0x00000000
-data4 0x576CC2C5, 0x3FE85D69, 0x2335B374, 0x00000000
-data4 0xA99CC05D, 0x3FE8E17A, 0x24C27CFB, 0x00000000
-//
-// Entries Tbl_hi (double precision) Index = 0,1,...,15
-// B = 2^(-2)*(1+Index/16+1/32)
-// Entries Tbl_lo (single precision)
-// Index = 0,1,...,15 B = 2^(-2)*(1+Index/16+1/32)
-//
-data4 0x510665B5, 0x3FD025FA, 0x24263482, 0x00000000
-data4 0x362431C9, 0x3FD1151A, 0x242C8DC9, 0x00000000
-data4 0x67E47C95, 0x3FD20255, 0x245CF9BA, 0x00000000
-data4 0x7A823CFE, 0x3FD2ED98, 0x235C892C, 0x00000000
-data4 0x29271134, 0x3FD3D6D1, 0x2389BE52, 0x00000000
-data4 0x586890E6, 0x3FD4BDEE, 0x24436471, 0x00000000
-data4 0x175E0F4E, 0x3FD5A2E0, 0x2389DBD4, 0x00000000
-data4 0x9F5FA6FD, 0x3FD68597, 0x2476D43F, 0x00000000
-data4 0x52817501, 0x3FD76607, 0x24711774, 0x00000000
-data4 0xB8DF95D7, 0x3FD84422, 0x23EBB501, 0x00000000
-data4 0x7CD0C662, 0x3FD91FDE, 0x23883A0C, 0x00000000
-data4 0x66168001, 0x3FD9F930, 0x240DF63F, 0x00000000
-data4 0x5422058B, 0x3FDAD00F, 0x23FE261A, 0x00000000
-data4 0x378624A5, 0x3FDBA473, 0x23A8CD0E, 0x00000000
-data4 0x0AAD71F8, 0x3FDC7655, 0x2422D1D0, 0x00000000
-data4 0xC9EC862B, 0x3FDD45AE, 0x2344A109, 0x00000000
-//
-// Entries Tbl_hi (double precision) Index = 0,1,...,15
-// B = 2^(-3)*(1+Index/16+1/32)
-// Entries Tbl_lo (single precision)
-// Index = 0,1,...,15 B = 2^(-3)*(1+Index/16+1/32)
-//
-data4 0x84212B3D, 0x3FC068D5, 0x239874B6, 0x00000000
-data4 0x41060850, 0x3FC16465, 0x2335E774, 0x00000000
-data4 0x171A535C, 0x3FC25F6E, 0x233E36BE, 0x00000000
-data4 0xEDEB99A3, 0x3FC359E8, 0x239680A3, 0x00000000
-data4 0xC6092A9E, 0x3FC453CE, 0x230FB29E, 0x00000000
-data4 0xBA11570A, 0x3FC54D18, 0x230C1418, 0x00000000
-data4 0xFFB3AA73, 0x3FC645BF, 0x23F0564A, 0x00000000
-data4 0xE8A7D201, 0x3FC73DBD, 0x23D4A5E1, 0x00000000
-data4 0xE398EBC7, 0x3FC8350B, 0x23D4ADDA, 0x00000000
-data4 0x7D050271, 0x3FC92BA3, 0x23BCB085, 0x00000000
-data4 0x601081A5, 0x3FCA217E, 0x23BC841D, 0x00000000
-data4 0x574D780B, 0x3FCB1696, 0x23CF4A8E, 0x00000000
-data4 0x4D768466, 0x3FCC0AE5, 0x23BECC90, 0x00000000
-data4 0x4E1D5395, 0x3FCCFE65, 0x2323DCD2, 0x00000000
-data4 0x864C9D9D, 0x3FCDF110, 0x23F53F3A, 0x00000000
-data4 0x451D980C, 0x3FCEE2E1, 0x23CCB11F, 0x00000000
-data4 0x54442D18, 0x400921FB, 0x33145C07, 0x3CA1A626 // I two doubles
-data4 0x54442D18, 0x3FF921FB, 0x33145C07, 0x3C91A626 // I_by_2 two dbls
-data4 0x54442D18, 0x3FE921FB, 0x33145C07, 0x3C81A626 // I_by_4 two dbls
-data4 0x7F3321D2, 0x4002D97C, 0x4C9E8A0A, 0x3C9A7939 // 3I_by_4 two dbls
-ASM_SIZE_DIRECTIVE(Constants_atan#)
-.section .text
-
-.proc __libm_atan2_reg#
-.global __libm_atan2_reg#
-.align 64
-__libm_atan2_reg:
-
-
-{ .mfi
- alloc r32 = ar.pfs,0,20,4,0
-(p0) mov f32 = f8
- nop.i 0
-}
-{ .mmi
- nop.m 0
-(p0) addl r39 = @ltoff(Constants_atan#), gp
- nop.i 999
-}
-;;
-
-{ .mmi
- ld8 r39 = [r39]
- nop.m 999
- nop.i 999
-}
-;;
-
-{ .mfi
- nop 999 // EMbo added ...
-(p0) mov f33 = f9
- nop.i 0
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fclass.nm.unc p9,p0 = f32 ,0x1FF
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fclass.nm.unc p8,p0 = f33 ,0x1FF
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fclass.m.unc p6,p0 = f33 ,0x103
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fclass.m.unc p7,p0 = f32 ,0x103
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fclass.m.unc p12,p0 = f33 ,0x0C3
- nop 999;; // EMbo added ...
- } { .mfb
- nop 999 // EMbo added ...
-//
-// Check for NatVals.
-// Check for EM Unsupporteds
-// Check for NaNs.
-//
-(p0) fclass.m.unc p13,p0 = f32 ,0x0C3
-(p6) br.cond.sptk L(ATAN_NATVAL);;
- } { .mbb
- nop 999 // EMbo added ...
-(p7) br.cond.sptk L(ATAN_NATVAL)
-(p8) br.cond.sptk L(ATAN_UNSUPPORTED);;
- } { .mib
-(p0) add r40 = 96, r39
- nop 999 // EMbo added ...
-(p9) br.cond.sptk L(ATAN_UNSUPPORTED);;
- } { .mib
-(p0) ldfd f50 = [r39],8
- nop 999 // EMbo added ...
-(p12) br.cond.sptk L(ATAN_NAN);;
- } { .mfb
- nop 999 // EMbo added ...
-(p0) fnorm.s1 f33 = f33
-(p13) br.cond.sptk L(ATAN_NAN);;
- } { .mfi
-(p0) ldfs f51 = [r39],4
-//
-// Remove sign bits from exponents
-// Load 2**(-3)
-// Normalize the input argument.
-//
-(p0) fnorm.s1 f32 = f32
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) mov f82 = f1
- nop 999;; // EMbo added ...
- } { .mmi
- nop 999;; // EMbo added ...
-(p0) ldfs f78 = [r39],180
- nop 999;; // EMbo added ...
- } { .mmi
-(p0) getf.exp r36 = f33;;
-//
-// Get exp and sign of ArgX
-// Get exp and sign of ArgY
-// Load 2**(-3) and increment ptr to Q_4.
-//
-(p0) getf.exp r37 = f32
-(p0) shr.u r36 = r36,17;;
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fmerge.s f84 = f1,f32
-(p0) shr.u r37 = r37,17;;
- } { .mfi
- nop 999 // EMbo added ...
-//
-// ArgX_abs = |ArgX|
-// ArgY_abs = |ArgY|
-// sign_X is sign bit of ArgX
-// sign_Y is sign bit of ArgY
-//
-(p0) fmerge.s f83 = f1,f33
-(p0) cmp.eq.unc p8,p9 = 0x00000, r37;;
- } { .mfi
- nop 999 // EMbo added ...
-(p8) fadd.s1 f34 = f0, f1
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p9) fsub.s1 f34 = f0, f1
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fmin.s1 f36 = f83, f84
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fmax.s1 f35 = f83, f84
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// Is ArgX_abs >= ArgY_abs
-// Is sign_Y == 0?
-//
-(p0) fcmp.ge.s1 p6,p7 = f83,f84
- nop 999;; // EMbo added ...
- } { .mii
-(p6) cmp.eq.unc p10, p11 = 0x00000, r36
-(p6) add r38 = r0, r0;;
-//
-// U = max(ArgX_abs,ArgY_abs)
-// V = min(ArgX_abs,ArgY_abs)
-// if p6, swap = 0
-// if p7, swap = 1
-//
-//
-// Let M = 1.0
-// if p8, s_Y = 1.0
-// if p9, s_Y = -1.0
-//
-(p7) add r38 = 1,r0;;
- } { .mfi
- nop 999 // EMbo added ...
-(p0) frcpa.s1 f37, p6 = f36, f35
- nop 999;; // EMbo added ...
- } { .mfb
- nop 999 // EMbo added ...
-//
-// E = frcpa(V,U)
-//
-(p10) fsub.s1 f82 = f82, f1
-(p6) br.cond.sptk L(ATAN_STEP2);;
- } { .mib
- nop 999 // EMbo added ...
- nop 999 // EMbo added ...
-// /**************************************************/
-// /********************* STEP2 **********************/
-// /**************************************************/
-(p0) br.cond.spnt L(ATAN_SPECIAL_HANDLING);;
- }
-L(ATAN_STEP2):
- { .mlx
- nop 999 // EMbo added ...
-(p0) movl r47 = 0x8400000000000000
- } { .mlx
- nop 999 // EMbo added ...
-(p0) movl r48 = 0x0000000000000100;;
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fmpy.s1 f38 = f37, f36
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fcmp.lt.unc.s0 p0,p9 = f9,f1
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fcmp.lt.unc.s0 p0,p8 = f8,f1
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// Q = E * V
-//
-(p11) fadd.s1 f82 = f82, f1
- nop 999;; // EMbo added ...
- } { .mfi
-(p0) getf.sig r46 = f38
-(p0) fcmp.lt.unc p6,p7 = f38,f78
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fmpy.s1 f38 = f37, f36
-(p0) extr.u r42 = r46, 59, 4;;
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fmpy.s1 f50 = f82, f50
-(p0) dep r47 = r42, r47, 59, 4
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fmpy.s1 f51 = f82, f51
- nop 999;; // EMbo added ...
- } { .mmi
- nop 999;; // EMbo added ...
-//
-// Is Q < 2**(-3)?
-//
-//
-// Do fcmp to raise any denormal operand
-// exceptions.
-//
-(p0) getf.exp r45 = f38
- nop 999;; // EMbo added ...
- } { .mib
-//
-// lookup = b_1 b_2 b_3 B_4
-//
-//
-// Generate 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
-//
-(p0) andcm r41 = 0x0003, r45
- nop 999 // EMbo added ...
-//
-// We waited a few extra cycles so P_lo and P_hi could be calculated.
-// Load the constant 256 for loading up table entries.
-//
-// /**************************************************/
-// /********************* STEP3 **********************/
-// /**************************************************/
-(p6) br.cond.spnt L(ATAN_POLY);;
- } { .mii
-(p0) setf.sig f39 = r47
-(p0) cmp.eq.unc p8, p9 = 0x0000, r41
-//
-// z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
-// point to beginning of Tbl_hi entries - k = 0.
-//
-(p0) add r40 = 16, r39
- } { .mmi
-(p0) ldfe f73 = [r39],-16;;
-(p9) sub r41 = r41,r0,1
-(p9) add r40 = 16,r40
- } { .mfi
-(p8) ldfd f48 = [r40],8
-(p0) fmpy.s1 f50 = f34, f50
-(p0) xor r38 = r36,r38;;
- } { .mmi
-(p0) ldfe f71 = [r39],-16;;
-(p8) ldfs f49 = [r40],8
-(p9) pmpy2.r r41 = r41,r48;;
- } { .mfi
-(p0) ldfe f69 = [r39],-16
-//
-// Let z_hi have exponent and sign of original Q
-// Load the Tbl_hi(0) else, increment pointer.
-//
-(p0) fmerge.se f39 = f38,f39
-(p9) shladd r42 = r42,0x0004,r41;;
- } { .mmi
-(p9) add r40 = r40, r42;;
-(p9) ldfd f48 = [r40],8
- nop 999;; // EMbo added ...
- } { .mmi
-(p0) ldfe f67 = [r39],-16;;
-(p9) ldfs f49 = [r40],8
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// U_prime_hi = U + V * z_hi
-// Load the Tbl_lo(0)
-//
-(p0) fma.s1 f40 = f36, f39, f35
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fnma.s1 f42 = f35, f39, f36
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) mov f52 = f48
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) frcpa.s1 f43, p6 = f1, f40
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// U_prime_lo = U - U_prime_hi
-// k = k * 256 - result can be 0, 256, or 512.
-//
-(p0) fsub.s1 f41 = f35, f40
-(p0) cmp.eq.unc p7, p6 = 0x00000, r38
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fmpy.s1 f52 = f34, f52
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p7) fadd.s1 f54 = f0, f1
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p6) fsub.s1 f54 = f0, f1
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fnma.s1 f80 = f43, f40, f1
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fadd.s1 f79 = f41, f40
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fma.s1 f41 = f36, f39, f41
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fma.s1 f56 = f54, f52, f50
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fma.s1 f43 = f80, f43, f43
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// U_prime_lo = U - U_hold
-// lookup -> lookup * 16 + k
-//
-//
-// V_prime = V - U * z_hi
-// U_prime_lo = V * z_hi + U_prime_lo
-//
-(p0) fsub.s1 f79 = f35, f79
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fnma.s1 f80 = f43, f40, f1
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// C_hi = frcpa(1,U_prime_hi)
-// U_prime_lo = U_prime_lo + U_hold
-//
-//
-// C_hi_hold = 1 - C_hi * U_prime_hi (1)
-//
-//
-// C_hi = C_hi + C_hi * C_hi_hold (1)
-//
-//
-// C_hi_hold = 1 - C_hi * U_prime_hi (2)
-//
-(p0) fadd.s1 f41 = f41, f79
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// C_hi = C_hi + C_hi * C_hi_hold (2)
-//
-(p0) fma.s1 f43 = f80, f43, f43
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// C_hi_hold = 1 - C_hi * U_prime_hi (3)
-//
-(p0) fnma.s1 f80 = f43, f40, f1
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// C_hi = C_hi + C_hi * C_hi_hold (3)
-//
-(p0) fma.s1 f43 = f80, f43, f43
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// w_hi = V_prime * C_hi
-//
-(p0) fmpy.s1 f44 = f42, f43
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fmpy.s1 f46 = f44, f44
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// wsq = w_hi * w_hi
-// w_lo = = V_prime - w_hi * U_prime_hi
-//
-(p0) fnma.s1 f45 = f44, f40, f42
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fma.s1 f47 = f46, f73, f71
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// poly = Q_3 + wsq * Q_4
-// w_lo = = w_lo - w_hi * U_prime_lo
-//
-(p0) fnma.s1 f45 = f44, f41, f45
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fma.s1 f47 = f46, f47, f69
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// poly = Q_2 + wsq * poly
-// w_lo = = w_lo * C_hi
-//
-(p0) fmpy.s1 f45 = f43, f45
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fma.s1 f47 = f46, f47, f67
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// poly = Q_1 + wsq * poly
-// A_lo = Tbl_lo + w_lo
-// swap = xor(swap,sign_X)
-//
-(p0) fadd.s1 f53 = f49, f45
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// Is (swap) != 0 ?
-// poly = wsq * poly
-// A_hi = Tbl_hi
-//
-(p0) fmpy.s1 f47 = f46, f47
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// poly = wsq * poly
-//
-//
-// if (p6) sigma = -1.0
-// if (p7) sigma = 1.0
-//
-(p0) fmpy.s1 f47 = f44, f47
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// P_hi = s_Y * P_hi
-// A_lo = A_lo + poly
-//
-(p0) fadd.s1 f53 = f53, f47
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// A_lo = A_lo + w_hi
-// A_hi = s_Y * A_hi
-//
-(p0) fadd.s1 f53 = f53, f44
- nop 999;; // EMbo added ...
- } { .mfb
- nop 999 // EMbo added ...
-//
-// result_hi = P_hi + sigma * A_hi
-// result_lo = P_lo + sigma * A_lo
-//
-(p0) fma.s1 f55 = f54, f53, f51
-(p0) br.cond.sptk L(RETURN_ATAN);;
-}
-//
-// result = result_hi + result_lo * s_Y (User Supplied Rounding Mode)
-//
-// (p0) fma.d.s0 f57 = f55, f34, f56
-//
-// /**************************************************/
-// /********************* STEP4 **********************/
-// /**************************************************/
-//
-L(ATAN_POLY):
-{ .mmi
-(p0) xor r38 = r36,r38
-(p0) addl r39 = @ltoff(Constants_atan#), gp
- nop.i 999
-}
-;;
-
-{ .mmi
- ld8 r39 = [r39]
- nop.m 999
- nop.i 999
-}
-;;
-
-
-{ .mlx
- nop 999 // EMbo added ...
-(p0) movl r47 = 0x24005;;
- } { .mfi
-(p0) add r39 = 128, r39
-(p0) fnma.s1 f81 = f37, f35, f1
-(p0) cmp.eq.unc p7, p6 = 0x00000, r38;;
- } { .mmf
- nop 999 // EMbo added ...
-(p0) ldfe f77 = [r39],-16
-//
-// Iterate 3 times E = E + E*(1.0 - E*U)
-// Also load P_8, P_7, P_6, P_5, P_4
-// E_hold = 1.0 - E * U (1)
-// A_temp = Q
-//
-(p0) mov f85 = f38;;
- } { .mmf
- nop 999 // EMbo added ...
-(p0) ldfe f76 = [r39],-16
-(p6) fsub.s1 f54 = f0, f1;;
- } { .mmf
- nop 999 // EMbo added ...
-(p0) ldfe f75 = [r39],-16
-//
-// E = E + E_hold*E (1)
-// Point to P_8.
-//
-(p0) fma.s1 f37 = f37, f81, f37;;
- } { .mmf
- nop 999 // EMbo added ...
-(p0) ldfe f74 = [r39],-16
-(p0) fnma.s1 f64 = f85, f35, f36;;
- } { .mmf
- nop 999 // EMbo added ...
-(p0) ldfe f72 = [r39],-16
-(p7) fadd.s1 f54 = f0, f1;;
- } { .mmf
- nop 999 // EMbo added ...
-(p0) ldfe f70 = [r39],-16
-//
-// E_hold = 1.0 - E * U (2)
-//
-(p0) fnma.s1 f81 = f37, f35, f1;;
- } { .mmf
- nop 999 // EMbo added ...
-(p0) ldfe f68 = [r39],-16
-(p0) fmpy.s1 f50 = f34, f50;;
- } { .mmf
- nop 999 // EMbo added ...
-(p0) ldfe f66 = [r39],-16
-(p0) fmpy.d.s0 f67 = f67, f67
- } { .mfi
- nop 999 // EMbo added ...
-//
-// E = E + E_hold*E (2)
-//
-(p0) fma.s1 f37 = f37, f81, f37
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// E_hold = 1.0 - E * U (3)
-//
-(p0) fnma.s1 f81 = f37, f35, f1
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// E = E + E_hold*E (3)
-// At this point E approximates 1/U to roughly working precision
-// z = V*E approximates V/U
-//
-(p0) fma.s1 f37 = f37, f81, f37
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// z = V * E
-//
-(p0) fmpy.s1 f59 = f36, f37
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fmpy.s1 f64 = f64, f37
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// zsq = z * z
-// Also load P_3
-//
-(p0) fmpy.s1 f60 = f59, f59
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fadd.s1 f52 = f85, f64
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fma.s1 f62 = f60, f77, f76
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fma.s1 f63 = f60, f70, f68
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// z8 = zsq * zsq
-// Also load P_2
-//
-(p0) fmpy.s1 f61 = f60, f60
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fsub.s1 f85 = f85, f52
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fmerge.s f65 = f52,f52
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fma.s1 f62 = f60, f62, f75
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fma.s1 f63 = f60, f63, f66
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// z8 = z8 * z8
-// Also load P_1
-// poly1 = _4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
-// poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3))
-//
-//
-// poly1 = P_7 + zsq * P_8
-// poly2 = P_2 + zsq * P_3
-// poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*poly1))
-// poly2 = zsq*(P_1 + zsq*poly2)
-//
-//
-// poly1 = P_6 + zsq * poly1
-// poly2 = P_1 + zsq * poly2
-// poly1 = P_4 + zsq*(P_5 + zsq*poly1)
-// poly2 = zsq*poly2
-//
-(p0) fmpy.s1 f61 = f61, f61
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fadd.s1 f64 = f85, f64
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fma.s1 f62 = f60, f62, f74
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// poly1 = P_5 + zsq * poly1
-// poly2 = zsq * poly2
-// poly1 = P_4 + zsq*poly1
-//
-(p0) fmpy.s1 f63 = f63, f60
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// poly1 = P_4 + zsq * poly1
-// swap = xor(swap,sign_X)
-//
-(p0) fma.s1 f62 = f60, f62, f72
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// poly = z8*poly1 + poly2 (Typo in writeup)
-// Is (swap) != 0 ?
-//
-//
-// z_lo = V - A_temp * U
-// if (p7) sigma = 1.0
-// Writeup shows A_temp as A_hi
-//
-//
-// z_lo = z_lo * E
-// if (p6) sigma = -1.0
-// z_lo = (V - A_temp * U) *E
-//
-//
-// Fixup added to force inexact later -
-// A_hi = A_temp + z_lo
-// z_lo = (A_temp - A_hi) + z_lo
-// z_lo = A_hi - z_lo -A_hi + z_lo = about 0
-//
-(p0) fma.s1 f47 = f61, f62, f63
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// A_lo = z * poly + z_lo
-//
-(p0) fma.s1 f53 = f59, f47, f64
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fadd.s1 f52 = f65, f53
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fsub.s1 f65 = f65, f52
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fmpy.s1 f52 = f34, f52
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fadd.s1 f53 = f65, f53
- nop 999 // EMbo added ...
- } { .mfi
-(p0) setf.exp f65 = r47
-(p0) fma.s1 f56 = f54, f52, f50
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fclass.m.unc p6,p0 = f53,0x007
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// P_hi = s_Y * P_hi
-// A_hi = s_Y * A_hi
-//
-//
-// result_hi = P_hi + sigma * A_hi
-//
-(p6) mov f53 = f65
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// tmp = P_hi - result_hi
-//
-(p0) fsub.s1 f65 = f50, f56
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fma.s1 f65 = f52, f54, f65
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// tmp = sigma * A_hi + tmp
-// sigma = A_lo * sigma + P_lo
-//
-(p0) fma.s1 f54 = f53, f54, f51
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// result_lo = s_Y * sigma + tmp
-//
-(p0) fma.s1 f55 = f34, f54, f65
- nop 999;; // EMbo added ...
- } { .mfb
- nop.m 0
- mov f34 = f1
-(p0) br.cond.sptk L(RETURN_ATAN);;
-}
-//
-// result = result_hi + result_lo (User Supplied Rounding Mode)
-//
-// (p0) fadd.d.s0 f57 = f55, f56
-L(ATAN_UNSUPPORTED):
-L(ATAN_NATVAL):
- { .mfb
- nop 999 // EMbo added ...
-//
-// Deal with the NatVal and unsupported cases.
-// Raise invalid if warrented.
-//
-(p0) fmpy.d.s0 f57 = f8, f9
-br.cond.sptk L(RETURN_ATAN);;
- }
-L(ATAN_NAN):
- { .mfb
- nop 999 // EMbo added ...
-//
-// If only one NaN, then generate the resulting
-// NaN and return - may raise invalid.
-//
-(p0) fmpy.d.s0 f57 = f8, f9
-(p0) br.cond.sptk L(RETURN_ATAN);;
- }
-L(ATAN_SPECIAL_HANDLING):
-
- { .mmf
-(p0) addl r39 = @ltoff(Constants_atan#), gp
- nop.m 999
-(p0) fcmp.lt.s0 p0,p7 = f8,f1
- }
-;;
-
-//
-// Raise denormal operand faults if necessary
-//
-
-{ .mfi
- ld8 r39 = [r39]
-(p0) fcmp.lt.s0 p0,p6 = f9,f1
- nop 999;; // EMbo added ...
-}
-;;
-
-
-
-{ .mfi
- nop 999 // EMbo added ...
-(p0) fclass.m.unc p6,p7 = f32,0x007
- nop 999;; // EMbo added ...
- } { .mlx
- nop 999 // EMbo added ...
-(p0) movl r47 = 992;;
- } { .mib
-(p0) add r39 = r39, r47
- nop 999 // EMbo added ...
-(p7) br.cond.sptk L(ATAN_ArgY_Not_ZERO);;
- } { .mfi
- nop 999 // EMbo added ...
-(p6) fclass.m.unc p14,p0 = f33,0x035
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p6) fclass.m.unc p15,p0 = f33,0x036
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p6) fclass.m.unc p13,p0 = f33,0x007
- nop 999 // EMbo added ...
- } { .mfi
-(p0) ldfd f56 = [r39],8
- nop 999 // EMbo added ...
- nop 999;; // EMbo added ...
- } { .mfi
-(p0) ldfd f55 = [r39],-8
-(p14) fmerge.s f56 = f32,f0
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// Return sign_Y * 0 when Y = +/-0 and X > 0
-//
-(p14) fmerge.s f55 = f32,f0
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p15) fmerge.s f56 = f32,f56
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// Return sign_Y * PI when X < -0
-//
-//
-(p15) fmerge.s f55 = f32,f55
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fadd.d.s0 f57 = f56,f55
- nop.i 0
- } { .bbb
-//
-// Call error support function for atan(0,0)
-// - expected value already computed.
-//
- nop.b 0
- nop.b 0
-(p0) br.cond.sptk L(RETURN_ATAN)
- }
-L(ATAN_ArgY_Not_ZERO):
- { .mfi
- nop 999 // EMbo added ...
-(p0) fclass.m.unc p9,p10 = f32,0x023
- nop 999;; // EMbo added ...
- } { .mfb
- nop 999 // EMbo added ...
-(p9) fclass.m.unc p6,p0 = f33,0x017
-(p10) br.cond.sptk L(ATAN_ArgY_Not_INF);;
- } { .mfi
-(p6) add r39 = 16,r39
-(p9) fclass.m.unc p7,p0 = f33,0x021
- nop 999;; // EMbo added ...
- } { .mmf
- nop 999 // EMbo added ...
-(p0) ldfd f56 = [r39],8
-(p9) fclass.m.unc p8,p0 = f33,0x022;;
- } { .mbb
-(p0) ldfd f55 = [r39],-8
- nop 999 // EMbo added ...
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p6) fmerge.s f56 = f32,f56
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p6) fmerge.s f55 = f32,f55
- nop 999;; // EMbo added ...
- } { .mfb
- nop 999 // EMbo added ...
-//
-// Load I/2 and adjust its sign.
-// Return +I/2 when ArgY = +Inf and ArgX = +/-0,normal
-// Return -I/2 when ArgY = -Inf and ArgX = +/-0,normal
-//
-(p6) fadd.d.s0 f57 = f56, f55
-(p6) br.cond.sptk L(RETURN_ATAN);;
- } { .mmi
-(p7) add r39 = 32,r39;;
-(p7) ldfd f56 = [r39],8
- nop 999;; // EMbo added ...
- } { .mmi
- nop 999;; // EMbo added ...
-(p7) ldfd f55 = [r39],-8
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p7) fmerge.s f56 = f32,f56
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p7) fmerge.s f55 = f32,f55
- nop 999;; // EMbo added ...
- } { .mfb
- nop 999 // EMbo added ...
-//
-// Load PI/4 and adjust its sign.
-// Return +PI/4 when ArgY = +Inf and ArgX = +Inf
-// Return -PI/4 when ArgY = -Inf and ArgX = +Inf
-//
-(p7) fadd.d.s0 f57 = f56, f55
-(p7) br.cond.sptk L(RETURN_ATAN);;
- } { .mmi
-(p8) add r39 = 48,r39;;
-(p8) ldfd f56 =[r39],8
- nop 999;; // EMbo added ...
- } { .mmi
- nop 999;; // EMbo added ...
-(p8) ldfd f55 =[r39],-8
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p8) fmerge.s f56 = f32,f56
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p8) fmerge.s f55 = f32,f55
- nop 999;; // EMbo added ...
- } { .mfb
- nop 999 // EMbo added ...
-//
-// Load I/4 and adjust its sign.
-// Return +3I/4 when ArgY = +Inf and ArgX = -Inf
-// Return -3I/4 when ArgY = -Inf and ArgX = -Inf
-//
-(p8) fadd.d.s0 f57 = f56, f55
-(p8) br.cond.sptk L(RETURN_ATAN);;
- }
-L(ATAN_ArgY_Not_INF):
- { .mfi
- nop 999 // EMbo added ...
-(p0) fclass.m.unc p6,p0 = f33,0x007
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fclass.m.unc p7,p0 = f33,0x021
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p0) fclass.m.unc p8,p0 = f33,0x022
-(p6) add r39 = 16,r39;;
- } { .mfi
-(p6) ldfd f56 =[r39],8
- nop 999 // EMbo added ...
- nop 999;; // EMbo added ...
- } { .mmi
- nop 999;; // EMbo added ...
-(p6) ldfd f55 =[r39],-8
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p6) fmerge.s f56 = f32,f56
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p6) fmerge.s f55 = f32,f55
- nop 999;; // EMbo added ...
- } { .mfb
- nop 999 // EMbo added ...
-//
-// return = sign_Y * I/2 when ArgX = +/-0
-//
-(p6) fadd.d.s0 f57 = f56, f55
-(p6) br.cond.sptk L(RETURN_ATAN);;
- } { .mfi
- nop 999 // EMbo added ...
-(p7) fmerge.s f56 = f32,f0
- nop 999 // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p7) fmerge.s f55 = f32,f0
- nop 999;; // EMbo added ...
- } { .mfb
- nop 999 // EMbo added ...
-//
-// return = sign_Y * 0 when ArgX = Inf
-//
-(p7) fadd.d.s0 f57 = f56, f55
-(p7) br.cond.sptk L(RETURN_ATAN);;
- } { .mfi
-(p8) ldfd f56 = [r39],8
- nop 999 // EMbo added ...
- nop 999;; // EMbo added ...
- } { .mmi
- nop 999;; // EMbo added ...
-(p8) ldfd f55 = [r39],-8
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p8) fmerge.s f56 = f32,f56
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-(p8) fmerge.s f55 = f32,f55
- nop 999;; // EMbo added ...
- } { .mfi
- nop 999 // EMbo added ...
-//
-// return = sign_Y * I when ArgX = -Inf
-//
-(p8) fadd.d.s0 f57 = f56, f55
- nop 999 // EMbo added ...
- };;
-L(RETURN_ATAN):
-// mov f8 = f57 ;;
-// The answer is in f57.
-// But Z_hi is f56
-// Z_lo is f55
-// s_Y is f34
-// W is in f9 and untouched
-
-{ .mfi
- nop 999
-mov f8 = f56
- nop.i 0
-};;
-
-{ .mfi
- nop 999
-mov f10 = f55
- nop.i 999
-}
-{ .mfb
- nop 999
-mov f11 = f34
-br.ret.sptk b0
-};;
-
-.endp __libm_atan2_reg
-ASM_SIZE_DIRECTIVE(__libm_atan2_reg)
diff --git a/sysdeps/ia64/fpu/libm_error.c b/sysdeps/ia64/fpu/libm_error.c
index ebbaad02ad..42ca36d98f 100644
--- a/sysdeps/ia64/fpu/libm_error.c
+++ b/sysdeps/ia64/fpu/libm_error.c
@@ -1,9 +1,10 @@
-//
-// Copyright (C) 2000, 2001, Intel Corporation
+/* file: libm_error.c */
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, James
-// Edwards, and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -19,14 +20,15 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
+
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
@@ -34,19 +36,39 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
// 2/02/00: Initial version
-// 3/22/00: Updated to support flexible and dynamic error handling.
-// 8/16/00: Changed all matherr function-calls to use the pmatherr
+// 3/22/00: Updated to support flexible and dynamic error handling.
+// 8/16/00: Changed all matherr function-calls to use the pmatherr
// function-pointers.
// 10/03/00: Corrected a scalb type.
// 11/28/00: Changed INPUT_XL to INPUT_XD for scalb_underflow case.
// 12/07/00: Added code to make scalbn error support equivalent to ldexp.
// 2/07/01: Added __declspec(align(16)) to long double constants to correct
// alignment problem.
+// 4/23/01: Added code for remquo
+// 6/07/01: Added code for fdim, lrint, lround, llrint, llround
+// Deleted code for remquo
+// 8/15/01: Added code for scalbln, nexttoward
+// 12/10/01: Added code for erfc
+// 12/27/01: Added code for degree argument functions
+// 01/02/02: Added code for tand, cotd
+// 01/15/02: Corrected SVID/XOPEN code for log1p, pow, and acosh
+// 01/25/02: Corrected ISOC for lgamma and gamma to return EDOM for neg ints
+// 01/28/02: Corrected SVID/XOPEN stderr message for log2
+// 05/20/02: Added code for cot
+// 07/01/02: Added code for sinhcosh
+// 10/04/02: Underflow detection in ISOC path redefined to
+// be zero rather than tiny and inexact
+// 12/06/02: Added code for annuity and compound
+// 01/30/03: Corrected test for underflow in ISOC path to not set denormal
+// 04/10/03: Corrected ISOC branch for gamma/lgamma to return ERANGE for neg ints.
+// Added code for tgamma
+// 04/11/03: Corrected POSIX/SVID/XOPEN branches for gamma/lgamma
+// to return EDOM for neg ints.
//
#include <errno.h>
@@ -54,38 +76,41 @@
#include <stdlib.h>
#include "libm_support.h"
-#ifndef _LIBC
+#ifdef _LIBC
+# define pmatherr matherr
+# define pmatherrf matherrf
+# define pmatherrl matherrl
+#else
_LIB_VERSION_TYPE
#if defined( __POSIX__ )
-_LIB_VERSION = _POSIX_;
+_LIB_VERSIONIMF = _POSIX_;
#elif defined( __XOPEN__ )
-_LIB_VERSION = _XOPEN_;
+_LIB_VERSIONIMF = _XOPEN_;
#elif defined( __SVID__ )
-_LIB_VERSION = _SVID_;
+_LIB_VERSIONIMF = _SVID_;
#elif defined( __IEEE__ )
-_LIB_VERSION = _IEEE_;
+_LIB_VERSIONIMF = _IEEE_;
#else
-_LIB_VERSION = _ISOC_;
-#endif
+_LIB_VERSIONIMF = _ISOC_;
#endif
/************************************************************/
/* matherrX function pointers and setusermatherrX functions */
/************************************************************/
-#if 0
int (*pmatherrf)(struct exceptionf*) = MATHERR_F;
int (*pmatherr)(struct EXC_DECL_D*) = MATHERR_D;
int (*pmatherrl)(struct exceptionl*) = matherrl;
void __libm_setusermatherrf( int(*user_merrf)(struct exceptionf*) )
-{ pmatherrf = ( (user_merrf==NULL)? (MATHERR_F) : (user_merrf) ); }
+{ pmatherrf = ( (user_merrf==NULL)? (MATHERR_F) : (user_merrf) ); }
void __libm_setusermatherr( int(*user_merr)(struct EXC_DECL_D*) )
-{ pmatherr = ( (user_merr==NULL)? (MATHERR_D) : (user_merr) ); }
+{ pmatherr = ( (user_merr==NULL)? (MATHERR_D) : (user_merr) ); }
void __libm_setusermatherrl( int(*user_merrl)(struct exceptionl*) )
-{ pmatherrl = ( (user_merrl==NULL)? (matherrl) : (user_merrl) ); }
-#endif
+{ pmatherrl = ( (user_merrl==NULL)? (matherrl) : (user_merrl) ); }
+
+#endif /* !_LIBC */
/***********************************************/
/* error-handling function, libm_error_support */
@@ -93,22 +118,27 @@ void __libm_setusermatherrl( int(*user_merrl)(struct exceptionl*) )
void __libm_error_support(void *arg1,void *arg2,void *retval,error_types input_tag)
{
-
# ifdef __cplusplus
struct __exception exc;
-# else
+# else
struct exception exc;
-# endif
+# endif
struct exceptionf excf;
struct exceptionl excl;
-# if defined opensource || defined _LIBC
+# if defined(__GNUC__)
+#define ALIGNIT __attribute__ ((__aligned__ (16)))
+# elif defined opensource
#define ALIGNIT
-#define ALIGNATTR __attribute__ ((__aligned__ (16)))
# else
#define ALIGNIT __declspec(align(16))
-#define ALIGNATTR
+# endif
+
+# ifdef SIZE_LONG_INT_64
+#define __INT_64__ signed long
+# else
+#define __INT_64__ __int64
# endif
const char float_inf[4] = {0x00,0x00,0x80,0x7F};
@@ -118,66 +148,74 @@ const char float_neg_inf[4] = {0x00,0x00,0x80,0xFF};
const char float_neg_huge[4] = {0xFF,0xFF,0x7F,0xFF};
const char float_neg_zero[4] = {0x00,0x00,0x00,0x80};
ALIGNIT
-const char double_inf[8] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0xF0,0x7F};
+const char double_inf[8] = {0x00,0x00,0x00,0x00,0x00,0x00,0xF0,0x7F};
+#if 0 /* unused */
ALIGNIT
-//const char double_huge[8] ALIGNATTR = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xEF,0x7F};
+const char double_huge[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xEF,0x7F};
+#endif
ALIGNIT
-const char double_zero[8] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
+const char double_zero[8] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
ALIGNIT
-const char double_neg_inf[8] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0xF0,0xFF};
+const char double_neg_inf[8] = {0x00,0x00,0x00,0x00,0x00,0x00,0xF0,0xFF};
+#if 0 /* unused */
ALIGNIT
-//const char double_neg_huge[8] ALIGNATTR = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xEF,0xFF};
+const char double_neg_huge[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xEF,0xFF};
+#endif
ALIGNIT
-const char double_neg_zero[8] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80};
+const char double_neg_zero[8] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80};
ALIGNIT
-const char long_double_inf[16] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0xFF,0x7F,0x00,0x00,0x00,0x00,0x00,0x00};
+const char long_double_inf[16] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0xFF,0x7F,0x00,0x00,0x00,0x00,0x00,0x00};
+#if 0 /* unused */
ALIGNIT
-//const char long_double_huge[16] ALIGNATTR = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,0x7F,0x00,0x00,0x00,0x00,0x00,0x00};
+const char long_double_huge[16] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,0x7F,0x00,0x00,0x00,0x00,0x00,0x00};
+#endif
ALIGNIT
-const char long_double_zero[16] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
+const char long_double_zero[16] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
ALIGNIT
-const char long_double_neg_inf[16] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0xFF,0xFF,0x00,0x00,0x00,0x00,0x00,0x00};
+const char long_double_neg_inf[16] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0xFF,0xFF,0x00,0x00,0x00,0x00,0x00,0x00};
+#if 0 /* unused */
ALIGNIT
-//const char long_double_neg_huge[16] ALIGNATTR = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0xFE,0xFF,0x00,0x00,0x00,0x00,0x00,0x00};
+const char long_double_neg_huge[16] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,0xFF,0x00,0x00,0x00,0x00,0x00,0x00};
+#endif
ALIGNIT
-const char long_double_neg_zero[16] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x80,0x00,0x00,0x00,0x00,0x00,0x00};
+const char long_double_neg_zero[16] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,0x00};
-#define RETVAL_HUGE_VALL *(long double *)retval = *(long double *)long_double_inf
-#define RETVAL_NEG_HUGE_VALL *(long double *)retval = *(long double *)long_double_neg_inf
-#define RETVAL_HUGEL *(long double *)retval = (long double)*(float *)float_huge
-#define RETVAL_NEG_HUGEL *(long double *)retval =(long double)*(float*)float_neg_huge
+#define RETVAL_HUGE_VALL *(long double *)retval = *(long double *)long_double_inf
+#define RETVAL_NEG_HUGE_VALL *(long double *)retval = *(long double *)long_double_neg_inf
+#define RETVAL_HUGEL *(long double *)retval = (long double)*(float *)float_huge
+#define RETVAL_NEG_HUGEL *(long double *)retval =(long double)*(float*)float_neg_huge
#define RETVAL_HUGE_VALD *(double *)retval = *(double *) double_inf
#define RETVAL_NEG_HUGE_VALD *(double *)retval = *(double *) double_neg_inf
#define RETVAL_HUGED *(double *)retval = (double) *(float *)float_huge
-#define RETVAL_NEG_HUGED *(double *)retval = (double) *(float *) float_neg_huge
+#define RETVAL_NEG_HUGED *(double *)retval = (double) *(float *) float_neg_huge
#define RETVAL_HUGE_VALF *(float *)retval = *(float *) float_inf
#define RETVAL_NEG_HUGE_VALF *(float *)retval = *(float *) float_neg_inf
#define RETVAL_HUGEF *(float *)retval = *(float *) float_huge
-#define RETVAL_NEG_HUGEF *(float *)retval = *(float *) float_neg_huge
+#define RETVAL_NEG_HUGEF *(float *)retval = *(float *) float_neg_huge
-#define RETVAL_ZEROL *(long double *)retval = *(long double *)long_double_zero
-#define RETVAL_ZEROD *(double *)retval = *(double *)double_zero
-#define RETVAL_ZEROF *(float *)retval = *(float *)float_zero
+#define RETVAL_ZEROL *(long double *)retval = *(long double *)long_double_zero
+#define RETVAL_ZEROD *(double *)retval = *(double *)double_zero
+#define RETVAL_ZEROF *(float *)retval = *(float *)float_zero
-#define RETVAL_NEG_ZEROL *(long double *)retval = *(long double *)long_double_neg_zero
-#define RETVAL_NEG_ZEROD *(double *)retval = *(double *)double_neg_zero
-#define RETVAL_NEG_ZEROF *(float *)retval = *(float *)float_neg_zero
+#define RETVAL_NEG_ZEROL *(long double *)retval = *(long double *)long_double_neg_zero
+#define RETVAL_NEG_ZEROD *(double *)retval = *(double *)double_neg_zero
+#define RETVAL_NEG_ZEROF *(float *)retval = *(float *)float_neg_zero
-#define RETVAL_ONEL *(long double *)retval = (long double) 1.0
-#define RETVAL_ONED *(double *)retval = 1.0
-#define RETVAL_ONEF *(float *)retval = 1.0f
+#define RETVAL_ONEL *(long double *)retval = (long double) 1.0
+#define RETVAL_ONED *(double *)retval = 1.0
+#define RETVAL_ONEF *(float *)retval = 1.0f
-#define NOT_MATHERRL excl.arg1=*(long double *)arg1;excl.arg2=*(long double *)arg2;excl.retval=*(long double *)retval;if(!matherrl(&excl))
-#define NOT_MATHERRD exc.arg1=*(double *)arg1;exc.arg2=*(double *)arg2;exc.retval=*(double *)retval;if(!MATHERR_D(&exc))
-#define NOT_MATHERRF excf.arg1=*(float *)arg1;excf.arg2=*(float *)arg2;excf.retval=*(float *)retval;if(!MATHERR_F(&excf))
+#define NOT_MATHERRL excl.arg1=*(long double *)arg1;excl.arg2=*(long double *)arg2;excl.retval=*(long double *)retval;if(!pmatherrl(&excl))
+#define NOT_MATHERRD exc.arg1=*(double *)arg1;exc.arg2=*(double *)arg2;exc.retval=*(double *)retval;if(!pmatherr(&exc))
+#define NOT_MATHERRF excf.arg1=*(float *)arg1;excf.arg2=*(float *)arg2;excf.retval=*(float *)retval;if(!pmatherrf(&excf))
-#define ifSVID if(_LIB_VERSION==_SVID_)
+#define ifSVID if(_LIB_VERSIONIMF==_SVID_)
-#define NAMEL excl.name
-#define NAMED exc.name
-#define NAMEF excf.name
+#define NAMEL excl.name
+#define NAMED exc.name
+#define NAMEF excf.name
//
// These should work OK for MS because they are ints -
@@ -192,28 +230,28 @@ const char long_double_neg_zero[16] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0
#define PLOSS 6
#define SINGL excl.type = SING
-#define DOMAINL excl.type = DOMAIN
-#define OVERFLOWL excl.type = OVERFLOW
-#define UNDERFLOWL excl.type = UNDERFLOW
-#define TLOSSL excl.type = TLOSS
+#define DOMAINL excl.type = DOMAIN
+#define OVERFLOWL excl.type = OVERFLOW
+#define UNDERFLOWL excl.type = UNDERFLOW
+#define TLOSSL excl.type = TLOSS
#define SINGD exc.type = SING
-#define DOMAIND exc.type = DOMAIN
-#define OVERFLOWD exc.type = OVERFLOW
-#define UNDERFLOWD exc.type = UNDERFLOW
-#define TLOSSD exc.type = TLOSS
+#define DOMAIND exc.type = DOMAIN
+#define OVERFLOWD exc.type = OVERFLOW
+#define UNDERFLOWD exc.type = UNDERFLOW
+#define TLOSSD exc.type = TLOSS
#define SINGF excf.type = SING
-#define DOMAINF excf.type = DOMAIN
-#define OVERFLOWF excf.type = OVERFLOW
-#define UNDERFLOWF excf.type = UNDERFLOW
-#define TLOSSF excf.type = TLOSS
+#define DOMAINF excf.type = DOMAIN
+#define OVERFLOWF excf.type = OVERFLOW
+#define UNDERFLOWF excf.type = UNDERFLOW
+#define TLOSSF excf.type = TLOSS
#define INPUT_XL (excl.arg1=*(long double*)arg1)
#define INPUT_XD (exc.arg1=*(double*)arg1)
#define INPUT_XF (excf.arg1=*(float*)arg1)
-#define INPUT_YL (excl.arg1=*(long double*)arg2)
-#define INPUT_YD (exc.arg1=*(double*)arg2)
-#define INPUT_YF (excf.arg1=*(float*)arg2)
-#define INPUT_RESL (*(long double *)retval)
+#define INPUT_YL (excl.arg2=*(long double*)arg2)
+#define INPUT_YD (exc.arg2=*(double*)arg2)
+#define INPUT_YF (excf.arg2=*(float*)arg2)
+#define INPUT_RESL (*(long double *)retval)
#define INPUT_RESD (*(double *)retval)
#define INPUT_RESF (*(float *)retval)
@@ -248,11 +286,17 @@ const char long_double_neg_zero[16] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0
#define WRITED_LOG1P_NEGATIVE fputs("log1p: DOMAIN error\n",stderr)
#define WRITEF_LOG1P_NEGATIVE fputs("log1pf: DOMAIN error\n",stderr)
#define WRITEL_LOG10_ZERO fputs("log10l: SING error\n",stderr)
-#define WRITED_LOG10_ZERO fputs("log10: SING error\n",stderr)
+#define WRITED_LOG10_ZERO fputs("log10: SING error\n",stderr)
#define WRITEF_LOG10_ZERO fputs("log10f: SING error\n",stderr)
#define WRITEL_LOG10_NEGATIVE fputs("log10l: DOMAIN error\n",stderr)
#define WRITED_LOG10_NEGATIVE fputs("log10: DOMAIN error\n",stderr)
#define WRITEF_LOG10_NEGATIVE fputs("log10f: DOMAIN error\n",stderr)
+#define WRITEL_LOG2_ZERO fputs("log2l: SING error\n",stderr)
+#define WRITED_LOG2_ZERO fputs("log2: SING error\n",stderr)
+#define WRITEF_LOG2_ZERO fputs("log2f: SING error\n",stderr)
+#define WRITEL_LOG2_NEGATIVE fputs("log2l: DOMAIN error\n",stderr)
+#define WRITED_LOG2_NEGATIVE fputs("log2: DOMAIN error\n",stderr)
+#define WRITEF_LOG2_NEGATIVE fputs("log2f: DOMAIN error\n",stderr)
#define WRITEL_POW_ZERO_TO_ZERO fputs("powl(0,0): DOMAIN error\n",stderr)
#define WRITED_POW_ZERO_TO_ZERO fputs("pow(0,0): DOMAIN error\n",stderr)
#define WRITEF_POW_ZERO_TO_ZERO fputs("powf(0,0): DOMAIN error\n",stderr)
@@ -295,6 +339,9 @@ const char long_double_neg_zero[16] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0
#define WRITEL_GAMMA_NEGATIVE fputs("gammal: SING error\n",stderr)
#define WRITED_GAMMA_NEGATIVE fputs("gamma: SING error\n",stderr)
#define WRITEF_GAMMA_NEGATIVE fputs("gammaf: SING error\n",stderr)
+#define WRITEL_TGAMMA_NEGATIVE fputs("tgammal: DOMAIN error\n",stderr)
+#define WRITED_TGAMMA_NEGATIVE fputs("tgamma: DOMAIN error\n",stderr)
+#define WRITEF_TGAMMA_NEGATIVE fputs("tgammaf: DOMAIN error\n",stderr)
#define WRITEL_J0_TLOSS fputs("j0l: TLOSS error\n",stderr)
#define WRITEL_Y0_TLOSS fputs("y0l: TLOSS error\n",stderr)
#define WRITEL_J1_TLOSS fputs("j1l: TLOSS error\n",stderr)
@@ -313,16 +360,26 @@ const char long_double_neg_zero[16] ALIGNATTR = {0x00,0x00,0x00,0x00,0x00,0x00,0
#define WRITEF_Y1_TLOSS fputs("y1f: TLOSS error\n",stderr)
#define WRITEF_JN_TLOSS fputs("jnf: TLOSS error\n",stderr)
#define WRITEF_YN_TLOSS fputs("ynf: TLOSS error\n",stderr)
+#define WRITEL_ACOSD fputs("acosdl: DOMAIN error\n",stderr)
+#define WRITED_ACOSD fputs("acosd: DOMAIN error\n",stderr)
+#define WRITEF_ACOSD fputs("acosdf: DOMAIN error\n",stderr)
+#define WRITEL_ASIND fputs("asindl: DOMAIN error\n",stderr)
+#define WRITED_ASIND fputs("asind: DOMAIN error\n",stderr)
+#define WRITEF_ASIND fputs("asindf: DOMAIN error\n",stderr)
+#define WRITEL_ATAN2D_ZERO_BY_ZERO fputs("atan2dl: DOMAIN error\n",stderr)
+#define WRITED_ATAN2D_ZERO_BY_ZERO fputs("atan2d: DOMAIN error\n",stderr)
+#define WRITEF_ATAN2D_ZERO_BY_ZERO fputs("atan2df: DOMAIN error\n",stderr)
+
/***********************/
/* IEEE Path */
/***********************/
-if(_LIB_VERSION==_IEEE_) return;
+if(_LIB_VERSIONIMF==_IEEE_) return;
/***********************/
/* C9X Path */
/***********************/
-else if(_LIB_VERSION==_ISOC_)
+else if(_LIB_VERSIONIMF==_ISOC_)
{
switch(input_tag)
{
@@ -339,80 +396,146 @@ else if(_LIB_VERSION==_ISOC_)
case log1p_zero:
case log1pf_zero:
case powl_overflow:
- case pow_overflow:
- case powf_overflow:
- case powl_underflow:
- case pow_underflow:
- case powf_underflow:
+ case pow_overflow:
+ case powf_overflow:
case expl_overflow:
- case exp_overflow:
- case expf_overflow:
- case expl_underflow:
- case exp_underflow:
- case expf_underflow:
+ case exp_overflow:
+ case expf_overflow:
case exp2l_overflow:
- case exp2_overflow:
- case exp2f_overflow:
- case exp2l_underflow:
- case exp2_underflow:
- case exp2f_underflow:
+ case exp2_overflow:
+ case exp2f_overflow:
case exp10l_overflow:
- case exp10_overflow:
- case exp10f_overflow:
+ case exp10_overflow:
+ case exp10f_overflow:
case expm1l_overflow:
- case expm1_overflow:
- case expm1f_overflow:
+ case expm1_overflow:
+ case expm1f_overflow:
case hypotl_overflow:
case hypot_overflow:
case hypotf_overflow:
- case sinhl_overflow:
- case sinh_overflow:
- case sinhf_overflow:
- case atanhl_eq_one:
- case atanh_eq_one:
- case atanhf_eq_one:
+ case sinhl_overflow:
+ case sinh_overflow:
+ case sinhf_overflow:
+ case atanhl_eq_one:
+ case atanh_eq_one:
+ case atanhf_eq_one:
case scalbl_overflow:
case scalb_overflow:
case scalbf_overflow:
- case scalbl_underflow:
- case scalb_underflow:
- case scalbf_underflow:
case coshl_overflow:
case cosh_overflow:
case coshf_overflow:
case nextafterl_overflow:
case nextafter_overflow:
case nextafterf_overflow:
+ case nexttowardl_overflow:
+ case nexttoward_overflow:
+ case nexttowardf_overflow:
case scalbnl_overflow:
case scalbn_overflow:
case scalbnf_overflow:
- case scalbnl_underflow:
- case scalbn_underflow:
- case scalbnf_underflow:
+ case scalblnl_overflow:
+ case scalbln_overflow:
+ case scalblnf_overflow:
case ldexpl_overflow:
case ldexp_overflow:
case ldexpf_overflow:
- case ldexpl_underflow:
- case ldexp_underflow:
- case ldexpf_underflow:
case lgammal_overflow:
case lgamma_overflow:
case lgammaf_overflow:
- case lgammal_negative:
- case lgamma_negative:
- case lgammaf_negative:
case gammal_overflow:
case gamma_overflow:
case gammaf_overflow:
+ case lgammal_negative:
+ case lgamma_negative:
+ case lgammaf_negative:
case gammal_negative:
case gamma_negative:
case gammaf_negative:
case ilogbl_zero:
- case ilogb_zero:
+ case ilogb_zero:
case ilogbf_zero:
+ case fdiml_overflow:
+ case fdim_overflow:
+ case fdimf_overflow:
+ case llrintl_large:
+ case llrint_large:
+ case llrintf_large:
+ case llroundl_large:
+ case llround_large:
+ case llroundf_large:
+ case lrintl_large:
+ case lrint_large:
+ case lrintf_large:
+ case lroundl_large:
+ case lround_large:
+ case lroundf_large:
+ case tandl_overflow:
+ case tand_overflow:
+ case tandf_overflow:
+ case cotdl_overflow:
+ case cotd_overflow:
+ case cotdf_overflow:
+ case cotl_overflow:
+ case cot_overflow:
+ case cotf_overflow:
+ case sinhcoshl_overflow:
+ case sinhcosh_overflow:
+ case sinhcoshf_overflow:
+ case annuityl_overflow:
+ case annuity_overflow:
+ case annuityf_overflow:
+ case compoundl_overflow:
+ case compound_overflow:
+ case compoundf_overflow:
+ case tgammal_overflow:
+ case tgamma_overflow:
+ case tgammaf_overflow:
{
ERRNO_RANGE; break;
}
+ case powl_underflow:
+ case expl_underflow:
+ case exp2l_underflow:
+ case scalbl_underflow:
+ case scalbnl_underflow:
+ case scalblnl_underflow:
+ case ldexpl_underflow:
+ case erfcl_underflow:
+ case annuityl_underflow:
+ case compoundl_underflow:
+ {
+ if ( *(__INT_64__*)retval == 0 ) ERRNO_RANGE;
+ break;
+ }
+ case pow_underflow:
+ case exp_underflow:
+ case exp2_underflow:
+ case scalb_underflow:
+ case scalbn_underflow:
+ case scalbln_underflow:
+ case ldexp_underflow:
+ case erfc_underflow:
+ case annuity_underflow:
+ case compound_underflow:
+ {
+ if ( ((*(__INT_64__*)retval)<<1) == 0 ) ERRNO_RANGE;
+ break;
+ }
+ case powf_underflow:
+ case expf_underflow:
+ case exp2f_underflow:
+ case scalbf_underflow:
+ case scalbnf_underflow:
+ case scalblnf_underflow:
+ case ldexpf_underflow:
+ case erfcf_underflow:
+ case annuityf_underflow:
+ case compoundf_underflow:
+ {
+ if ( ((*(__INT_64__*)retval)<<33) == 0 ) ERRNO_RANGE;
+ break;
+ }
case logl_negative:
case log_negative:
case logf_negative:
@@ -440,17 +563,17 @@ else if(_LIB_VERSION==_ISOC_)
case fmodl_by_zero:
case fmod_by_zero:
case fmodf_by_zero:
- case atanhl_gt_one:
- case atanh_gt_one:
- case atanhf_gt_one:
- case acosl_gt_one:
- case acos_gt_one:
- case acosf_gt_one:
- case asinl_gt_one:
- case asin_gt_one:
- case asinf_gt_one:
+ case atanhl_gt_one:
+ case atanh_gt_one:
+ case atanhf_gt_one:
+ case acosl_gt_one:
+ case acos_gt_one:
+ case acosf_gt_one:
+ case asinl_gt_one:
+ case asin_gt_one:
+ case asinf_gt_one:
case logbl_zero:
- case logb_zero:
+ case logb_zero:
case logbf_zero:
case acoshl_lt_one:
case acosh_lt_one:
@@ -473,6 +596,30 @@ else if(_LIB_VERSION==_ISOC_)
case ynl_negative:
case yn_negative:
case ynf_negative:
+ case acosdl_gt_one:
+ case acosd_gt_one:
+ case acosdf_gt_one:
+ case asindl_gt_one:
+ case asind_gt_one:
+ case asindf_gt_one:
+ case atan2dl_zero:
+ case atan2d_zero:
+ case atan2df_zero:
+ case annuityl_by_zero:
+ case annuity_by_zero:
+ case annuityf_by_zero:
+ case annuityl_less_m1:
+ case annuity_less_m1:
+ case annuityf_less_m1:
+ case compoundl_by_zero:
+ case compound_by_zero:
+ case compoundf_by_zero:
+ case compoundl_less_m1:
+ case compound_less_m1:
+ case compoundf_less_m1:
+ case tgammal_negative:
+ case tgamma_negative:
+ case tgammaf_negative:
{
ERRNO_DOMAIN; break;
}
@@ -486,31 +633,37 @@ else if(_LIB_VERSION==_ISOC_)
/* _POSIX_ Path */
/***********************/
-else if(_LIB_VERSION==_POSIX_)
+else if(_LIB_VERSIONIMF==_POSIX_)
{
switch(input_tag)
{
case gammal_overflow:
case lgammal_overflow:
+ case tgammal_overflow:
{
RETVAL_HUGE_VALL; ERRNO_RANGE; break;
}
case gamma_overflow:
case lgamma_overflow:
+ case tgamma_overflow:
{
RETVAL_HUGE_VALD; ERRNO_RANGE; break;
}
case gammaf_overflow:
case lgammaf_overflow:
+ case tgammaf_overflow:
{
RETVAL_HUGE_VALF; ERRNO_RANGE; break;
}
case gammal_negative:
- case gamma_negative:
- case gammaf_negative:
case lgammal_negative:
+ case gamma_negative:
case lgamma_negative:
+ case gammaf_negative:
case lgammaf_negative:
+ case tgammal_negative:
+ case tgamma_negative:
+ case tgammaf_negative:
{
ERRNO_DOMAIN; break;
}
@@ -526,38 +679,56 @@ switch(input_tag)
case scalbn_underflow:
case scalbnf_overflow:
case scalbnf_underflow:
+ case scalblnl_overflow:
+ case scalblnl_underflow:
+ case scalbln_overflow:
+ case scalbln_underflow:
+ case scalblnf_overflow:
+ case scalblnf_underflow:
+ case tandl_overflow:
+ case tand_overflow:
+ case tandf_overflow:
+ case cotdl_overflow:
+ case cotd_overflow:
+ case cotdf_overflow:
+ case cotl_overflow:
+ case cot_overflow:
+ case cotf_overflow:
+ case sinhcoshl_overflow:
+ case sinhcosh_overflow:
+ case sinhcoshf_overflow:
{
ERRNO_RANGE; break;
}
- case atanhl_gt_one:
- case atanhl_eq_one:
+ case atanhl_gt_one:
+ case atanhl_eq_one:
/* atanhl(|x| >= 1) */
{
ERRNO_DOMAIN; break;
}
- case atanh_gt_one:
- case atanh_eq_one:
+ case atanh_gt_one:
+ case atanh_eq_one:
/* atanh(|x| >= 1) */
{
ERRNO_DOMAIN; break;
}
- case atanhf_gt_one:
- case atanhf_eq_one:
+ case atanhf_gt_one:
+ case atanhf_eq_one:
/* atanhf(|x| >= 1) */
{
ERRNO_DOMAIN; break;
}
- case sqrtl_negative:
+ case sqrtl_negative:
/* sqrtl(x < 0) */
{
ERRNO_DOMAIN; break;
}
- case sqrt_negative:
+ case sqrt_negative:
/* sqrt(x < 0) */
{
ERRNO_DOMAIN; break;
}
- case sqrtf_negative:
+ case sqrtf_negative:
/* sqrtf(x < 0) */
{
ERRNO_DOMAIN; break;
@@ -606,7 +777,7 @@ switch(input_tag)
/* yn(x < 0) */
{
RETVAL_NEG_HUGE_VALD; ERRNO_DOMAIN; break;
- }
+ }
case y0f_negative:
case y1f_negative:
case ynf_negative:
@@ -615,10 +786,11 @@ switch(input_tag)
/* ynf(x < 0) */
{
RETVAL_NEG_HUGE_VALF; ERRNO_DOMAIN; break;
- }
+ }
case logl_zero:
case log1pl_zero:
case log10l_zero:
+ case log2l_zero:
/* logl(0) */
/* log1pl(0) */
/* log10l(0) */
@@ -628,7 +800,7 @@ switch(input_tag)
case log_zero:
case log1p_zero:
case log10_zero:
- case log2l_zero:
+ case log2_zero:
/* log(0) */
/* log1p(0) */
/* log10(0) */
@@ -638,6 +810,7 @@ switch(input_tag)
case logf_zero:
case log1pf_zero:
case log10f_zero:
+ case log2f_zero:
/* logf(0) */
/* log1pf(0) */
/* log10f(0) */
@@ -652,6 +825,9 @@ switch(input_tag)
/* log1pl(x < 0) */
/* log10l(x < 0) */
{
+#ifndef _LIBC
+ RETVAL_NEG_HUGE_VALL;
+#endif
ERRNO_DOMAIN; break;
}
case log_negative:
@@ -662,8 +838,11 @@ switch(input_tag)
/* log1p(x < 0) */
/* log10(x < 0) */
{
+#ifndef _LIBC
+ RETVAL_NEG_HUGE_VALD;
+#endif
ERRNO_DOMAIN; break;
- }
+ }
case logf_negative:
case log1pf_negative:
case log10f_negative:
@@ -672,34 +851,46 @@ switch(input_tag)
/* log1pf(x < 0) */
/* log10f(x < 0) */
{
+#ifndef _LIBC
+ RETVAL_NEG_HUGE_VALF;
+#endif
ERRNO_DOMAIN; break;
- }
+ }
case expl_overflow:
+ case exp2l_overflow:
+ case exp10l_overflow:
/* expl overflow */
{
RETVAL_HUGE_VALL; ERRNO_RANGE; break;
}
case exp_overflow:
+ case exp2_overflow:
+ case exp10_overflow:
/* exp overflow */
{
RETVAL_HUGE_VALD; ERRNO_RANGE; break;
}
case expf_overflow:
+ case exp2f_overflow:
+ case exp10f_overflow:
/* expf overflow */
{
RETVAL_HUGE_VALF; ERRNO_RANGE; break;
}
case expl_underflow:
+ case exp2l_underflow:
/* expl underflow */
{
RETVAL_ZEROL; ERRNO_RANGE; break;
}
case exp_underflow:
+ case exp2_underflow:
/* exp underflow */
{
RETVAL_ZEROD; ERRNO_RANGE; break;
}
case expf_underflow:
+ case exp2f_underflow:
/* expf underflow */
{
RETVAL_ZEROF; ERRNO_RANGE; break;
@@ -750,13 +941,17 @@ switch(input_tag)
break;
}
case powl_overflow:
+ case annuityl_overflow:
+ case compoundl_overflow:
/* powl(x,y) overflow */
{
if (INPUT_RESL < 0) RETVAL_NEG_HUGE_VALL;
else RETVAL_HUGE_VALL;
- ERRNO_RANGE; break;
+ ERRNO_RANGE; break;
}
case pow_overflow:
+ case annuity_overflow:
+ case compound_overflow:
/* pow(x,y) overflow */
{
if (INPUT_RESD < 0) RETVAL_NEG_HUGE_VALD;
@@ -764,6 +959,8 @@ switch(input_tag)
ERRNO_RANGE; break;
}
case powf_overflow:
+ case annuityf_overflow:
+ case compoundf_overflow:
/* powf(x,y) overflow */
{
if (INPUT_RESF < 0) RETVAL_NEG_HUGE_VALF;
@@ -771,20 +968,41 @@ switch(input_tag)
ERRNO_RANGE; break;
}
case powl_underflow:
+ case annuityl_underflow:
+ case compoundl_underflow:
/* powl(x,y) underflow */
{
RETVAL_ZEROL; ERRNO_RANGE; break;
}
case pow_underflow:
+ case annuity_underflow:
+ case compound_underflow:
/* pow(x,y) underflow */
{
RETVAL_ZEROD; ERRNO_RANGE; break;
}
- case powf_underflow:
+ case powf_underflow:
+ case annuityf_underflow:
+ case compoundf_underflow:
/* powf(x,y) underflow */
{
RETVAL_ZEROF; ERRNO_RANGE; break;
}
+ case annuityl_by_zero:
+ case annuityl_less_m1:
+ case compoundl_by_zero:
+ case compoundl_less_m1:
+ case annuity_by_zero:
+ case annuity_less_m1:
+ case compound_by_zero:
+ case compound_less_m1:
+ case annuityf_by_zero:
+ case annuityf_less_m1:
+ case compoundf_by_zero:
+ case compoundf_less_m1:
+ {
+ ERRNO_DOMAIN; break;
+ }
case powl_zero_to_negative:
/* 0**neg */
{
@@ -820,7 +1038,7 @@ switch(input_tag)
/* Special Error */
{
break;
- }
+ }
case pow_nan_to_zero:
/* pow(NaN,0.0) */
{
@@ -832,36 +1050,51 @@ switch(input_tag)
break;
}
case atan2l_zero:
- /* atan2l(0,0) */
+ case atan2dl_zero:
+ /* atan2dl(0,0) */
{
- /* XXX arg1 and arg2 are switched!!!! */
+#ifndef _LIBC
+ RETVAL_ZEROL;
+#else
+ /* XXX arg1 and arg2 are switched!!!! */
if (signbit (*(long double *) arg1))
/* y == -0 */
- *(long double *) retval = copysignl (M_PIl, *(long double *) arg2);
+ *(long double *) retval = __libm_copysignl (M_PIl, *(long double *) arg2);
else
*(long double *) retval = *(long double *) arg2;
+#endif
ERRNO_DOMAIN; break;
}
case atan2_zero:
- /* atan2(0,0) */
+ case atan2d_zero:
+ /* atan2d(0,0) */
{
- /* XXX arg1 and arg2 are switched!!!! */
+#ifndef _LIBC
+ RETVAL_ZEROD;
+#else
+ /* XXX arg1 and arg2 are switched!!!! */
if (signbit (*(double *) arg1))
/* y == -0 */
- *(double *) retval = copysign (M_PI, *(double *) arg2);
+ *(double *) retval = __libm_copysign (M_PI, *(double *) arg2);
else
*(double *) retval = *(double *) arg2;
+#endif
ERRNO_DOMAIN; break;
}
- case
- atan2f_zero:
+ case atan2f_zero:
+ case atan2df_zero:
/* atan2f(0,0) */
+ /* atan2df(0,0) */
{
+#ifndef _LIBC
+ RETVAL_ZEROF;
+#else
if (signbit (*(float *) arg2))
/* y == -0 */
- *(float *) retval = copysignf (M_PI, *(float *) arg1);
+ *(float *) retval = __libm_copysignf (M_PI, *(float *) arg1);
else
*(float *) retval = *(float *) arg1;
+#endif
ERRNO_DOMAIN; break;
}
case expm1l_overflow:
@@ -912,42 +1145,42 @@ switch(input_tag)
case scalbl_underflow:
/* scalbl underflow */
{
- if (INPUT_XL < 0) RETVAL_NEG_ZEROL;
+ if (INPUT_XL < 0) RETVAL_NEG_ZEROL;
else RETVAL_ZEROL;
ERRNO_RANGE; break;
}
case scalb_underflow:
/* scalb underflow */
{
- if (INPUT_XD < 0) RETVAL_NEG_ZEROD;
+ if (INPUT_XD < 0) RETVAL_NEG_ZEROD;
else RETVAL_ZEROD;
ERRNO_RANGE; break;
}
case scalbf_underflow:
/* scalbf underflow */
{
- if (INPUT_XF < 0) RETVAL_NEG_ZEROF;
+ if (INPUT_XF < 0) RETVAL_NEG_ZEROF;
else RETVAL_ZEROF;
ERRNO_RANGE; break;
}
case scalbl_overflow:
/* scalbl overflow */
{
- if (INPUT_XL < 0) RETVAL_NEG_HUGE_VALL;
+ if (INPUT_XL < 0) RETVAL_NEG_HUGE_VALL;
else RETVAL_HUGE_VALL;
ERRNO_RANGE; break;
}
case scalb_overflow:
/* scalb overflow */
{
- if (INPUT_XD < 0) RETVAL_NEG_HUGE_VALD;
+ if (INPUT_XD < 0) RETVAL_NEG_HUGE_VALD;
else RETVAL_HUGE_VALD;
ERRNO_RANGE; break;
}
case scalbf_overflow:
/* scalbf overflow */
{
- if (INPUT_XF < 0) RETVAL_NEG_HUGE_VALF;
+ if (INPUT_XF < 0) RETVAL_NEG_HUGE_VALF;
else RETVAL_HUGE_VALF;
ERRNO_RANGE; break;
}
@@ -967,33 +1200,62 @@ switch(input_tag)
ERRNO_DOMAIN; break;
}
case acosl_gt_one:
+ case acosdl_gt_one:
/* acosl(x > 1) */
+ /* acosdl(x > 1) */
{
+#ifndef _LIBC
+ RETVAL_ZEROL;
+#endif
ERRNO_DOMAIN; break;
}
case acos_gt_one:
+ case acosd_gt_one:
/* acos(x > 1) */
+ /* acosd(x > 1) */
{
- ERRNO_DOMAIN; break;
+#ifndef _LIBC
+ RETVAL_ZEROD;
+#endif
+ ERRNO_DOMAIN; break;
}
case acosf_gt_one:
+ case acosdf_gt_one:
/* acosf(x > 1) */
+ /* acosdf(x > 1) */
{
- ERRNO_DOMAIN; break;
+#ifndef _LIBC
+ RETVAL_ZEROF;
+#endif
+ ERRNO_DOMAIN; break;
}
case asinl_gt_one:
+ case asindl_gt_one:
/* asinl(x > 1) */
+ /* asindl(x > 1) */
{
+#ifndef _LIBC
+ RETVAL_ZEROL;
+#endif
ERRNO_DOMAIN; break;
}
case asin_gt_one:
+ case asind_gt_one:
/* asin(x > 1) */
+ /* asind(x > 1) */
{
+#ifndef _LIBC
+ RETVAL_ZEROD;
+#endif
ERRNO_DOMAIN; break;
}
case asinf_gt_one:
- /* asinf(x > 1) */
+ case asindf_gt_one:
+ /* asindf(x > 1) */
{
+#ifndef _LIBC
+ RETVAL_ZEROF;
+#endif
ERRNO_DOMAIN; break;
}
case remainderl_by_zero:
@@ -1029,6 +1291,15 @@ switch(input_tag)
{
RETVAL_HUGE_VALF; ERRNO_RANGE; break;
}
+ case nextafterl_overflow:
+ case nextafter_overflow:
+ case nextafterf_overflow:
+ case nexttowardl_overflow:
+ case nexttoward_overflow:
+ case nexttowardf_overflow:
+ {
+ ERRNO_RANGE; break;
+ }
case sinhl_overflow:
/* sinhl overflows */
{
@@ -1090,7 +1361,7 @@ return;
/*******************************/
/* __SVID__ and __XOPEN__ Path */
/*******************************/
-else
+else
{
switch(input_tag)
{
@@ -1106,15 +1377,57 @@ else
case scalbn_underflow:
case scalbnf_overflow:
case scalbnf_underflow:
+ case scalblnl_overflow:
+ case scalblnl_underflow:
+ case scalbln_overflow:
+ case scalbln_underflow:
+ case scalblnf_overflow:
+ case scalblnf_underflow:
+ case tandl_overflow:
+ case tand_overflow:
+ case tandf_overflow:
+ case cotdl_overflow:
+ case cotd_overflow:
+ case cotdf_overflow:
+ case cotl_overflow:
+ case cot_overflow:
+ case cotf_overflow:
+ case annuityl_overflow:
+ case annuityl_underflow:
+ case annuity_overflow:
+ case annuity_underflow:
+ case annuityf_overflow:
+ case annuityf_underflow:
+ case compoundl_overflow:
+ case compoundl_underflow:
+ case compound_overflow:
+ case compound_underflow:
+ case compoundf_overflow:
+ case compoundf_underflow:
{
ERRNO_RANGE; break;
}
- case sqrtl_negative:
+ case annuityl_by_zero:
+ case annuityl_less_m1:
+ case annuity_by_zero:
+ case annuity_less_m1:
+ case annuityf_by_zero:
+ case annuityf_less_m1:
+ case compoundl_by_zero:
+ case compoundl_less_m1:
+ case compound_by_zero:
+ case compound_less_m1:
+ case compoundf_by_zero:
+ case compoundf_less_m1:
+ {
+ ERRNO_DOMAIN; break;
+ }
+ case sqrtl_negative:
/* sqrtl(x < 0) */
{
DOMAINL; NAMEL = (char *) "sqrtl";
- ifSVID
- {
+ ifSVID
+ {
RETVAL_ZEROL;
NOT_MATHERRL
{
@@ -1122,22 +1435,22 @@ else
ERRNO_DOMAIN;
}
}
- else
+ else
{ /* NaN already computed */
NOT_MATHERRL {ERRNO_DOMAIN;}
}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
- case sqrt_negative:
+ case sqrt_negative:
/* sqrt(x < 0) */
{
DOMAIND; NAMED = (char *) "sqrt";
- ifSVID
+ ifSVID
{
-
+
RETVAL_ZEROD;
- NOT_MATHERRD
+ NOT_MATHERRD
{
WRITED_SQRT;
ERRNO_DOMAIN;
@@ -1146,18 +1459,18 @@ else
else
{ /* NaN already computed */
NOT_MATHERRD {ERRNO_DOMAIN;}
- }
- *(double *)retval = exc.retval;
+ }
+ *(double *)retval = exc.retval;
break;
}
- case sqrtf_negative:
+ case sqrtf_negative:
/* sqrtf(x < 0) */
{
DOMAINF; NAMEF = (char *) "sqrtf";
- ifSVID
+ ifSVID
{
RETVAL_ZEROF;
- NOT_MATHERRF
+ NOT_MATHERRF
{
WRITEF_SQRT;
ERRNO_DOMAIN;
@@ -1166,62 +1479,59 @@ else
else
{
NOT_MATHERRF {ERRNO_DOMAIN;}
- }
- *(float *)retval = excf.retval;
+ }
+ *(float *)retval = excf.retval;
break;
}
case logl_zero:
- case log2l_zero:
/* logl(0) */
{
SINGL; NAMEL = (char *) "logl";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEL;
- NOT_MATHERRL
+ NOT_MATHERRL
{
WRITEL_LOG_ZERO;
ERRNO_DOMAIN;
- }
+ }
}
else
{
RETVAL_NEG_HUGE_VALL;
NOT_MATHERRL {ERRNO_DOMAIN;}
- }
- *(long double *)retval = excl.retval;
+ }
+ *(long double *)retval = excl.retval;
break;
}
case log_zero:
- case log2_zero:
/* log(0) */
{
SINGD; NAMED = (char *) "log";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGED;
- NOT_MATHERRD
+ NOT_MATHERRD
{
WRITED_LOG_ZERO;
ERRNO_DOMAIN;
- }
+ }
}
else
{
RETVAL_NEG_HUGE_VALD;
NOT_MATHERRD {ERRNO_DOMAIN;}
}
- *(double *)retval = exc.retval;
+ *(double *)retval = exc.retval;
break;
}
case logf_zero:
- case log2f_zero:
/* logf(0) */
{
SINGF; NAMEF = (char *) "logf";
- ifSVID
+ ifSVID
{
- RETVAL_NEG_HUGEF;
+ RETVAL_NEG_HUGEF;
NOT_MATHERRF
{
WRITEF_LOG_ZERO;
@@ -1230,22 +1540,21 @@ else
}
else
{
- RETVAL_NEG_HUGE_VALF;
+ RETVAL_NEG_HUGE_VALF;
NOT_MATHERRF {ERRNO_DOMAIN;}
}
- *(float *)retval = excf.retval;
+ *(float *)retval = excf.retval;
break;
}
case logl_negative:
- case log2l_negative:
/* logl(x < 0) */
{
DOMAINL; NAMEL = (char *) "logl";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEL;
- NOT_MATHERRL
+ NOT_MATHERRL
{
WRITEL_LOG_NEGATIVE;
ERRNO_DOMAIN;
@@ -1253,21 +1562,20 @@ else
}
else
{
- RETVAL_NEG_HUGE_VALL;
+ RETVAL_NEG_HUGE_VALL;
NOT_MATHERRL {ERRNO_DOMAIN;}
}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
case log_negative:
- case log2_negative:
/* log(x < 0) */
{
DOMAIND; NAMED = (char *) "log";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGED;
- NOT_MATHERRD
+ NOT_MATHERRD
{
WRITED_LOG_NEGATIVE;
ERRNO_DOMAIN;
@@ -1275,39 +1583,38 @@ else
}
else
{
- RETVAL_NEG_HUGE_VALD;
+ RETVAL_NEG_HUGE_VALD;
NOT_MATHERRD {ERRNO_DOMAIN;}
}
- *(double *)retval = exc.retval;
+ *(double *)retval = exc.retval;
break;
- }
+ }
case logf_negative:
- case log2f_negative:
/* logf(x < 0) */
{
DOMAINF; NAMEF = (char *) "logf";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEF;
- NOT_MATHERRF
+ NOT_MATHERRF
{
WRITEF_LOG_NEGATIVE;
ERRNO_DOMAIN;
}
- }
+ }
else
{
- RETVAL_NEG_HUGE_VALF;
+ RETVAL_NEG_HUGE_VALF;
NOT_MATHERRF{ERRNO_DOMAIN;}
}
- *(float *)retval = excf.retval;
+ *(float *)retval = excf.retval;
break;
}
case log1pl_zero:
/* log1pl(-1) */
{
SINGL; NAMEL = (char *) "log1pl";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEL;
NOT_MATHERRL
@@ -1328,7 +1635,7 @@ else
/* log1p(-1) */
{
SINGD; NAMED = (char *) "log1p";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGED;
NOT_MATHERRD
@@ -1349,7 +1656,7 @@ else
/* log1pf(-1) */
{
SINGF; NAMEF = (char *) "log1pf";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEF;
NOT_MATHERRF
@@ -1361,11 +1668,11 @@ else
else
{
RETVAL_NEG_HUGE_VALF;
- NOT_MATHERRF {}ERRNO_DOMAIN;
+ NOT_MATHERRF {ERRNO_DOMAIN;}
}
*(float *)retval = excf.retval;
break;
- }
+ }
case log1pl_negative:
/* log1pl(x < -1) */
{
@@ -1379,7 +1686,7 @@ else
ERRNO_DOMAIN;
}
}
- else
+ else
{
RETVAL_NEG_HUGE_VALL;
NOT_MATHERRL {ERRNO_DOMAIN;}
@@ -1400,7 +1707,7 @@ else
ERRNO_DOMAIN;
}
}
- else
+ else
{
RETVAL_NEG_HUGE_VALD;
NOT_MATHERRD {ERRNO_DOMAIN;}
@@ -1421,7 +1728,7 @@ else
ERRNO_DOMAIN;
}
}
- else
+ else
{
RETVAL_NEG_HUGE_VALF;
NOT_MATHERRF {ERRNO_DOMAIN;}
@@ -1433,7 +1740,7 @@ else
/* log10l(0) */
{
SINGL; NAMEL = (char *) "log10l";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEL;
NOT_MATHERRL
@@ -1447,14 +1754,14 @@ else
RETVAL_NEG_HUGE_VALL;
NOT_MATHERRL {ERRNO_DOMAIN;}
}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
case log10_zero:
/* log10(0) */
{
SINGD; NAMED = (char *) "log10";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGED;
NOT_MATHERRD
@@ -1468,14 +1775,14 @@ else
RETVAL_NEG_HUGE_VALD;
NOT_MATHERRD {ERRNO_DOMAIN;}
}
- *(double *)retval = exc.retval;
+ *(double *)retval = exc.retval;
break;
}
case log10f_zero:
/* log10f(0) */
{
SINGF; NAMEF = (char *) "log10f";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEF;
NOT_MATHERRF
@@ -1489,17 +1796,17 @@ else
RETVAL_NEG_HUGE_VALF;
NOT_MATHERRF {ERRNO_DOMAIN;}
}
- *(float *)retval = excf.retval;
+ *(float *)retval = excf.retval;
break;
}
case log10l_negative:
/* log10l(x < 0) */
{
DOMAINL; NAMEL = (char *) "log10l";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEL;
- NOT_MATHERRL
+ NOT_MATHERRL
{
WRITEL_LOG10_NEGATIVE;
ERRNO_DOMAIN;
@@ -1510,38 +1817,38 @@ else
RETVAL_NEG_HUGE_VALL;
NOT_MATHERRL {ERRNO_DOMAIN;}
}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
case log10_negative:
/* log10(x < 0) */
{
DOMAIND; NAMED = (char *) "log10";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGED;
- NOT_MATHERRD
+ NOT_MATHERRD
{
WRITED_LOG10_NEGATIVE;
ERRNO_DOMAIN;
}
- }
+ }
else
{
RETVAL_NEG_HUGE_VALD;
NOT_MATHERRD {ERRNO_DOMAIN;}
}
- *(double *)retval = exc.retval;
+ *(double *)retval = exc.retval;
break;
}
case log10f_negative:
/* log10f(x < 0) */
{
DOMAINF; NAMEF = (char *) "log10f";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEF;
- NOT_MATHERRF
+ NOT_MATHERRF
{
WRITEF_LOG10_NEGATIVE;
ERRNO_DOMAIN;
@@ -1552,14 +1859,119 @@ else
RETVAL_NEG_HUGE_VALF;
NOT_MATHERRF {ERRNO_DOMAIN;}
}
- *(float *)retval = excf.retval;
+ *(float *)retval = excf.retval;
+ break;
+ }
+ case log2_zero:
+ /* log2(0) */
+ {
+ SINGD; NAMED = (char *) "log2";
+ ifSVID
+ {
+ RETVAL_NEG_HUGED;
+ NOT_MATHERRD
+ {
+ WRITED_LOG2_ZERO;
+ ERRNO_DOMAIN;
+ }
+ }
+ else
+ {
+ RETVAL_NEG_HUGE_VALD;
+ NOT_MATHERRD {ERRNO_DOMAIN;}
+ }
+ *(double *)retval = exc.retval;
+ break;
+ }
+ case log2f_zero:
+ /* log2f(0) */
+ {
+ SINGF; NAMEF = (char *) "log2f";
+ ifSVID
+ {
+ RETVAL_NEG_HUGEF;
+ NOT_MATHERRF
+ {
+ WRITEF_LOG2_ZERO;
+ ERRNO_DOMAIN;
+ }
+ }
+ else
+ {
+ RETVAL_NEG_HUGE_VALF;
+ NOT_MATHERRF {ERRNO_DOMAIN;}
+ }
+ *(float *)retval = excf.retval;
+ break;
+ }
+ case log2l_negative:
+ /* log2l(x < 0) */
+ {
+ DOMAINL; NAMEL = (char *) "log2l";
+ ifSVID
+ {
+ RETVAL_NEG_HUGEL;
+ NOT_MATHERRL
+ {
+ WRITEL_LOG2_NEGATIVE;
+ ERRNO_DOMAIN;
+ }
+ }
+ else
+ {
+ RETVAL_NEG_HUGE_VALL;
+ NOT_MATHERRL {ERRNO_DOMAIN;}
+ }
+ *(long double *)retval = excl.retval;
+ break;
+ }
+ case log2_negative:
+ /* log2(x < 0) */
+ {
+ DOMAIND; NAMED = (char *) "log2";
+ ifSVID
+ {
+ RETVAL_NEG_HUGED;
+ NOT_MATHERRD
+ {
+ WRITED_LOG2_NEGATIVE;
+ ERRNO_DOMAIN;
+ }
+ }
+ else
+ {
+ RETVAL_NEG_HUGE_VALD;
+ NOT_MATHERRD {ERRNO_DOMAIN;}
+ }
+ *(double *)retval = exc.retval;
+ break;
+ }
+ case log2f_negative:
+ /* log2f(x < 0) */
+ {
+ DOMAINF; NAMEF = (char *) "log2f";
+ ifSVID
+ {
+ RETVAL_NEG_HUGEF;
+ NOT_MATHERRF
+ {
+ WRITEF_LOG2_NEGATIVE;
+ ERRNO_DOMAIN;
+ }
+ }
+ else
+ {
+ RETVAL_NEG_HUGE_VALF;
+ NOT_MATHERRF {ERRNO_DOMAIN;}
+ }
+ *(float *)retval = excf.retval;
break;
}
case expl_overflow:
/* expl overflow */
{
OVERFLOWL; NAMEL = (char *) "expl";
- ifSVID
+ ifSVID
{
RETVAL_HUGEL;
}
@@ -1568,14 +1980,14 @@ else
RETVAL_HUGE_VALL;
}
NOT_MATHERRL {ERRNO_RANGE;}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
case exp_overflow:
/* exp overflow */
{
OVERFLOWD; NAMED = (char *) "exp";
- ifSVID
+ ifSVID
{
RETVAL_HUGED;
}
@@ -1584,14 +1996,14 @@ else
RETVAL_HUGE_VALD;
}
NOT_MATHERRD {ERRNO_RANGE;}
- *(double *)retval = exc.retval;
+ *(double *)retval = exc.retval;
break;
}
case expf_overflow:
/* expf overflow */
{
OVERFLOWF; NAMEF = (char *) "expf";
- ifSVID
+ ifSVID
{
RETVAL_HUGEF;
}
@@ -1600,7 +2012,7 @@ else
RETVAL_HUGE_VALF;
}
NOT_MATHERRF {ERRNO_RANGE;}
- *(float *)retval = excf.retval;
+ *(float *)retval = excf.retval;
break;
}
case expl_underflow:
@@ -1608,7 +2020,7 @@ else
{
UNDERFLOWL; NAMEL = (char *) "expl"; RETVAL_ZEROL;
NOT_MATHERRL {ERRNO_RANGE;}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
case exp_underflow:
@@ -1616,7 +2028,7 @@ else
{
UNDERFLOWD; NAMED = (char *) "exp"; RETVAL_ZEROD;
NOT_MATHERRD {ERRNO_RANGE;}
- *(double *)retval = exc.retval;
+ *(double *)retval = exc.retval;
break;
}
case expf_underflow:
@@ -1624,22 +2036,22 @@ else
{
UNDERFLOWF; NAMEF = (char *) "expf"; RETVAL_ZEROF;
NOT_MATHERRF {ERRNO_RANGE;}
- *(float *)retval = excf.retval;
+ *(float *)retval = excf.retval;
break;
}
case powl_zero_to_zero:
/* powl 0**0 */
{
DOMAINL; NAMEL = (char *) "powl";
- ifSVID
+ ifSVID
{
RETVAL_ZEROL;
- NOT_MATHERRL
+ NOT_MATHERRL
{
WRITEL_POW_ZERO_TO_ZERO;
- ERRNO_RANGE;
+ ERRNO_DOMAIN;
}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
}
else RETVAL_ONEL;
break;
@@ -1648,15 +2060,15 @@ else
/* pow 0**0 */
{
DOMAIND; NAMED = (char *) "pow";
- ifSVID
+ ifSVID
{
RETVAL_ZEROD;
- NOT_MATHERRD
+ NOT_MATHERRD
{
WRITED_POW_ZERO_TO_ZERO;
- ERRNO_RANGE;
+ ERRNO_DOMAIN;
}
- *(double *)retval = exc.retval;
+ *(double *)retval = exc.retval;
}
else RETVAL_ONED;
break;
@@ -1665,15 +2077,15 @@ else
/* powf 0**0 */
{
DOMAINF; NAMEF = (char *) "powf";
- ifSVID
+ ifSVID
{
RETVAL_ZEROF;
- NOT_MATHERRF
+ NOT_MATHERRF
{
WRITEF_POW_ZERO_TO_ZERO;
- ERRNO_RANGE;
+ ERRNO_DOMAIN;
}
- *(float *)retval = excf.retval;
+ *(float *)retval = excf.retval;
}
else RETVAL_ONEF;
break;
@@ -1682,54 +2094,54 @@ else
/* powl(x,y) overflow */
{
OVERFLOWL; NAMEL = (char *) "powl";
- ifSVID
+ ifSVID
{
if (INPUT_XL < 0) RETVAL_NEG_HUGEL;
else RETVAL_HUGEL;
}
else
- {
+ {
if (INPUT_XL < 0) RETVAL_NEG_HUGE_VALL;
else RETVAL_HUGE_VALL;
}
NOT_MATHERRL {ERRNO_RANGE;}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
case pow_overflow:
/* pow(x,y) overflow */
{
OVERFLOWD; NAMED = (char *) "pow";
- ifSVID
+ ifSVID
{
if (INPUT_XD < 0) RETVAL_NEG_HUGED;
else RETVAL_HUGED;
}
else
- {
+ {
if (INPUT_XD < 0) RETVAL_NEG_HUGE_VALD;
else RETVAL_HUGE_VALD;
}
NOT_MATHERRD {ERRNO_RANGE;}
- *(double *)retval = exc.retval;
+ *(double *)retval = exc.retval;
break;
}
case powf_overflow:
/* powf(x,y) overflow */
{
OVERFLOWF; NAMEF = (char *) "powf";
- ifSVID
+ ifSVID
{
if (INPUT_XF < 0) RETVAL_NEG_HUGEF;
- else RETVAL_HUGEF;
+ else RETVAL_HUGEF;
}
else
- {
+ {
if (INPUT_XF < 0) RETVAL_NEG_HUGE_VALF;
else RETVAL_HUGE_VALF;
}
NOT_MATHERRF {ERRNO_RANGE;}
- *(float *)retval = excf.retval;
+ *(float *)retval = excf.retval;
break;
}
case powl_underflow:
@@ -1737,7 +2149,7 @@ else
{
UNDERFLOWL; NAMEL = (char *) "powl"; RETVAL_ZEROL;
NOT_MATHERRL {ERRNO_RANGE;}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
case pow_underflow:
@@ -1745,7 +2157,7 @@ else
{
UNDERFLOWD; NAMED = (char *) "pow"; RETVAL_ZEROD;
NOT_MATHERRD {ERRNO_RANGE;}
- *(double *)retval = exc.retval;
+ *(double *)retval = exc.retval;
break;
}
case powf_underflow:
@@ -1753,17 +2165,17 @@ else
{
UNDERFLOWF; NAMEF = (char *) "powf"; RETVAL_ZEROF;
NOT_MATHERRF {ERRNO_RANGE;}
- *(float *)retval = excf.retval;
+ *(float *)retval = excf.retval;
break;
}
case powl_zero_to_negative:
/* 0 to neg */
{
DOMAINL; NAMEL = (char *) "powl";
- ifSVID
- {
+ ifSVID
+ {
RETVAL_ZEROL;
- NOT_MATHERRL
+ NOT_MATHERRL
{
WRITEL_POW_ZERO_TO_NEGATIVE;
ERRNO_DOMAIN;
@@ -1774,17 +2186,17 @@ else
RETVAL_NEG_HUGE_VALL;
NOT_MATHERRL {ERRNO_DOMAIN;}
}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
case pow_zero_to_negative:
/* 0**neg */
{
DOMAIND; NAMED = (char *) "pow";
- ifSVID
- {
+ ifSVID
+ {
RETVAL_ZEROD;
- NOT_MATHERRD
+ NOT_MATHERRD
{
WRITED_POW_ZERO_TO_NEGATIVE;
ERRNO_DOMAIN;
@@ -1795,7 +2207,7 @@ else
RETVAL_NEG_HUGE_VALD;
NOT_MATHERRD {ERRNO_DOMAIN;}
}
- *(double *)retval = exc.retval;
+ *(double *)retval = exc.retval;
break;
}
case powf_zero_to_negative:
@@ -1803,10 +2215,10 @@ else
{
DOMAINF; NAMEF = (char *) "powf";
RETVAL_NEG_HUGE_VALF;
- ifSVID
- {
+ ifSVID
+ {
RETVAL_ZEROF;
- NOT_MATHERRF
+ NOT_MATHERRF
{
WRITEF_POW_ZERO_TO_NEGATIVE;
ERRNO_DOMAIN;
@@ -1817,17 +2229,17 @@ else
RETVAL_NEG_HUGE_VALF;
NOT_MATHERRF {ERRNO_DOMAIN;}
}
- *(float *)retval = excf.retval;
+ *(float *)retval = excf.retval;
break;
}
case powl_neg_to_non_integer:
/* neg**non_integral */
{
DOMAINL; NAMEL = (char *) "powl";
- ifSVID
- {
+ ifSVID
+ {
RETVAL_ZEROF;
- NOT_MATHERRL
+ NOT_MATHERRL
{
WRITEL_POW_NEG_TO_NON_INTEGER;
ERRNO_DOMAIN;
@@ -1837,17 +2249,17 @@ else
{
NOT_MATHERRL {ERRNO_DOMAIN;}
}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
case pow_neg_to_non_integer:
/* neg**non_integral */
{
DOMAIND; NAMED = (char *) "pow";
- ifSVID
- {
+ ifSVID
+ {
RETVAL_ZEROD;
- NOT_MATHERRD
+ NOT_MATHERRD
{
WRITED_POW_NEG_TO_NON_INTEGER;
ERRNO_DOMAIN;
@@ -1857,17 +2269,17 @@ else
{
NOT_MATHERRD {ERRNO_DOMAIN;}
}
- *(double *)retval = exc.retval;
+ *(double *)retval = exc.retval;
break;
}
case powf_neg_to_non_integer:
/* neg**non-integral */
{
DOMAINF; NAMEF = (char *) "powf";
- ifSVID
- {
+ ifSVID
+ {
RETVAL_ZEROF;
- NOT_MATHERRF
+ NOT_MATHERRF
{
WRITEF_POW_NEG_TO_NON_INTEGER;
ERRNO_DOMAIN;
@@ -1877,37 +2289,37 @@ else
{
NOT_MATHERRF {ERRNO_DOMAIN;}
}
- *(float *)retval = excf.retval;
+ *(float *)retval = excf.retval;
break;
}
case powl_nan_to_zero:
/* pow(NaN,0.0) */
/* Special Error */
{
- DOMAINL; NAMEL = (char *) "powl"; INPUT_XL; INPUT_YL;
- excl.retval = *(long double *)arg1;
+ DOMAINL; NAMEL = (char *) "powl";
+ *(long double *)retval = *(long double *)arg1;
NOT_MATHERRL {ERRNO_DOMAIN;}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
- }
+ }
case pow_nan_to_zero:
/* pow(NaN,0.0) */
/* Special Error */
{
- DOMAIND; NAMED = (char *) "pow"; INPUT_XD; INPUT_YD;
- exc.retval = *(double *)arg1;
+ DOMAIND; NAMED = (char *) "pow";
+ *(double *)retval = *(double *)arg1;
NOT_MATHERRD {ERRNO_DOMAIN;}
- *(double *)retval = exc.retval;
+ *(double *)retval = exc.retval;
break;
}
case powf_nan_to_zero:
/* powf(NaN,0.0) */
/* Special Error */
{
- DOMAINF; NAMEF = (char *) "powf"; INPUT_XF; INPUT_YF;
- excf.retval = *(float *)arg1;
+ DOMAINF; NAMEF = (char *) "powf";
+ *(float *)retval = *(float *)arg1;
NOT_MATHERRF {ERRNO_DOMAIN;}
- *(float *)retval = excf.retval;
+ *(float *)retval = excf.retval;
break;
}
case atan2l_zero:
@@ -1915,15 +2327,15 @@ else
{
DOMAINL; NAMEL = (char *) "atan2l";
RETVAL_ZEROL;
- NOT_MATHERRL
+ NOT_MATHERRL
{
- ifSVID
+ ifSVID
{
WRITEL_ATAN2_ZERO_BY_ZERO;
}
ERRNO_DOMAIN;
}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
case atan2_zero:
@@ -1931,15 +2343,15 @@ else
{
DOMAIND; NAMED = (char *) "atan2";
RETVAL_ZEROD;
- NOT_MATHERRD
+ NOT_MATHERRD
{
- ifSVID
- {
+ ifSVID
+ {
WRITED_ATAN2_ZERO_BY_ZERO;
}
ERRNO_DOMAIN;
}
- *(double *)retval = exc.retval;
+ *(double *)retval = exc.retval;
break;
}
case atan2f_zero:
@@ -1947,13 +2359,59 @@ else
{
DOMAINF; NAMEF = (char *) "atan2f";
RETVAL_ZEROF;
- NOT_MATHERRF
- ifSVID
+ NOT_MATHERRF
+ ifSVID
{
WRITEF_ATAN2_ZERO_BY_ZERO;
}
ERRNO_DOMAIN;
- *(float *)retval = excf.retval;
+ *(float *)retval = excf.retval;
+ break;
+ }
+ case atan2dl_zero:
+ /* atan2dl(0.0,0.0) */
+ {
+ DOMAINL; NAMEL = (char *) "atan2dl";
+ RETVAL_ZEROL;
+ NOT_MATHERRL
+ {
+ ifSVID
+ {
+ WRITEL_ATAN2D_ZERO_BY_ZERO;
+ }
+ ERRNO_DOMAIN;
+ }
+ *(long double *)retval = excl.retval;
+ break;
+ }
+ case atan2d_zero:
+ /* atan2d(0.0,0.0) */
+ {
+ DOMAIND; NAMED = (char *) "atan2d";
+ RETVAL_ZEROD;
+ NOT_MATHERRD
+ {
+ ifSVID
+ {
+ WRITED_ATAN2D_ZERO_BY_ZERO;
+ }
+ ERRNO_DOMAIN;
+ }
+ *(double *)retval = exc.retval;
+ break;
+ }
+ case atan2df_zero:
+ /* atan2df(0.0,0.0) */
+ {
+ DOMAINF; NAMEF = (char *) "atan2df";
+ RETVAL_ZEROF;
+ NOT_MATHERRF
+ ifSVID
+ {
+ WRITEF_ATAN2D_ZERO_BY_ZERO;
+ }
+ ERRNO_DOMAIN;
+ *(float *)retval = excf.retval;
break;
}
case expm1_overflow:
@@ -1990,8 +2448,8 @@ else
UNDERFLOWL; NAMEL = (char *) "scalbl";
if (INPUT_XL < 0.0L) RETVAL_NEG_ZEROL;
else RETVAL_ZEROL;
- NOT_MATHERRL {ERRNO_RANGE;}
- *(long double *)retval = excf.retval;
+ NOT_MATHERRL {ERRNO_RANGE;}
+ *(long double *)retval = excl.retval;
break;
}
case scalb_underflow:
@@ -2000,8 +2458,8 @@ else
UNDERFLOWD; NAMED = (char *) "scalb";
if (INPUT_XD < 0.0) RETVAL_NEG_ZEROD;
else RETVAL_ZEROD;
- NOT_MATHERRD {ERRNO_RANGE;}
- *(double *)retval = exc.retval;
+ NOT_MATHERRD {ERRNO_RANGE;}
+ *(double *)retval = exc.retval;
break;
}
case scalbf_underflow:
@@ -2010,8 +2468,8 @@ else
UNDERFLOWF; NAMEF = (char *) "scalbf";
if (INPUT_XF < 0.0) RETVAL_NEG_ZEROF;
else RETVAL_ZEROF;
- NOT_MATHERRF {ERRNO_RANGE;}
- *(float *)retval = excf.retval;
+ NOT_MATHERRF {ERRNO_RANGE;}
+ *(float *)retval = excf.retval;
break;
}
case scalbl_overflow:
@@ -2020,8 +2478,8 @@ else
OVERFLOWL; NAMEL = (char *) "scalbl";
if (INPUT_XL < 0) RETVAL_NEG_HUGE_VALL;
else RETVAL_HUGE_VALL;
- NOT_MATHERRL {ERRNO_RANGE;}
- *(long double *)retval = excl.retval;
+ NOT_MATHERRL {ERRNO_RANGE;}
+ *(long double *)retval = excl.retval;
break;
}
case scalb_overflow:
@@ -2030,8 +2488,8 @@ else
OVERFLOWD; NAMED = (char *) "scalb";
if (INPUT_XD < 0) RETVAL_NEG_HUGE_VALD;
else RETVAL_HUGE_VALD;
- NOT_MATHERRD {ERRNO_RANGE;}
- *(double *)retval = exc.retval;
+ NOT_MATHERRD {ERRNO_RANGE;}
+ *(double *)retval = exc.retval;
break;
}
case scalbf_overflow:
@@ -2040,8 +2498,8 @@ else
OVERFLOWF; NAMEF = (char *) "scalbf";
if (INPUT_XF < 0) RETVAL_NEG_HUGE_VALF;
else RETVAL_HUGE_VALF;
- NOT_MATHERRF {ERRNO_RANGE;}
- *(float *)retval = excf.retval;
+ NOT_MATHERRF {ERRNO_RANGE;}
+ *(float *)retval = excf.retval;
break;
}
case hypotl_overflow:
@@ -2049,7 +2507,7 @@ else
{
OVERFLOWL; NAMEL = (char *) "hypotl";
ifSVID
- {
+ {
RETVAL_HUGEL;
}
else
@@ -2057,7 +2515,7 @@ else
RETVAL_HUGE_VALL;
}
NOT_MATHERRL {ERRNO_RANGE;}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
case hypot_overflow:
@@ -2065,7 +2523,7 @@ else
{
OVERFLOWD; NAMED = (char *) "hypot";
ifSVID
- {
+ {
RETVAL_HUGED;
}
else
@@ -2073,14 +2531,14 @@ else
RETVAL_HUGE_VALD;
}
NOT_MATHERRD {ERRNO_RANGE;}
- *(double *)retval = exc.retval;
+ *(double *)retval = exc.retval;
break;
}
case hypotf_overflow:
/* hypotf overflow */
- {
+ {
OVERFLOWF; NAMEF = (char *) "hypotf";
- ifSVID
+ ifSVID
{
RETVAL_HUGEF;
}
@@ -2089,7 +2547,7 @@ else
RETVAL_HUGE_VALF;
}
NOT_MATHERRF {ERRNO_RANGE;}
- *(float *)retval = excf.retval;
+ *(float *)retval = excf.retval;
break;
}
case acosl_gt_one:
@@ -2097,7 +2555,7 @@ else
{
DOMAINL; NAMEL = (char *) "acosl";
RETVAL_ZEROL;
- ifSVID
+ ifSVID
{
NOT_MATHERRL
{
@@ -2117,7 +2575,7 @@ else
{
DOMAIND; NAMED = (char *) "acos";
RETVAL_ZEROD;
- ifSVID
+ ifSVID
{
NOT_MATHERRD
{
@@ -2137,9 +2595,9 @@ else
{
DOMAINF; NAMEF = (char *) "acosf";
RETVAL_ZEROF;
- ifSVID
+ ifSVID
{
- NOT_MATHERRF
+ NOT_MATHERRF
{
WRITEF_ACOS;
ERRNO_DOMAIN;
@@ -2148,8 +2606,8 @@ else
else
{
NOT_MATHERRF {ERRNO_DOMAIN;}
- }
- *(float *)retval = excf.retval;
+ }
+ *(float *)retval = excf.retval;
break;
}
case asinl_gt_one:
@@ -2157,7 +2615,7 @@ else
{
DOMAINL; NAMEL = (char *) "asinl";
RETVAL_ZEROL;
- ifSVID
+ ifSVID
{
NOT_MATHERRL
{
@@ -2177,7 +2635,7 @@ else
{
DOMAIND; NAMED = (char *) "asin";
RETVAL_ZEROD;
- ifSVID
+ ifSVID
{
NOT_MATHERRD
{
@@ -2197,9 +2655,9 @@ else
{
DOMAINF; NAMEF = (char *) "asinf";
RETVAL_ZEROF;
- ifSVID
+ ifSVID
{
- NOT_MATHERRF
+ NOT_MATHERRF
{
WRITEF_ASIN;
ERRNO_DOMAIN;
@@ -2208,8 +2666,128 @@ else
else
{
NOT_MATHERRF {ERRNO_DOMAIN;}
+ }
+ *(float *)retval = excf.retval;
+ break;
+ }
+ case acosdl_gt_one:
+ /* acosdl(x > 1) */
+ {
+ DOMAINL; NAMEL = (char *) "acosdl";
+ RETVAL_ZEROL;
+ ifSVID
+ {
+ NOT_MATHERRL
+ {
+ WRITEL_ACOSD;
+ ERRNO_DOMAIN;
+ }
}
- *(float *)retval = excf.retval;
+ else
+ {
+ NOT_MATHERRL {ERRNO_DOMAIN;}
+ }
+ *(long double *)retval = excl.retval;
+ break;
+ }
+ case acosd_gt_one:
+ /* acosd(x > 1) */
+ {
+ DOMAIND; NAMED = (char *) "acosd";
+ RETVAL_ZEROD;
+ ifSVID
+ {
+ NOT_MATHERRD
+ {
+ WRITED_ACOSD;
+ ERRNO_DOMAIN;
+ }
+ }
+ else
+ {
+ NOT_MATHERRD {ERRNO_DOMAIN;}
+ }
+ *(double *)retval = exc.retval;
+ break;
+ }
+ case acosdf_gt_one:
+ /* acosdf(x > 1) */
+ {
+ DOMAINF; NAMEF = (char *) "acosdf";
+ RETVAL_ZEROF;
+ ifSVID
+ {
+ NOT_MATHERRF
+ {
+ WRITEF_ACOSD;
+ ERRNO_DOMAIN;
+ }
+ }
+ else
+ {
+ NOT_MATHERRF {ERRNO_DOMAIN;}
+ }
+ *(float *)retval = excf.retval;
+ break;
+ }
+ case asindl_gt_one:
+ /* asindl(x > 1) */
+ {
+ DOMAINL; NAMEL = (char *) "asindl";
+ RETVAL_ZEROL;
+ ifSVID
+ {
+ NOT_MATHERRL
+ {
+ WRITEL_ASIND;
+ ERRNO_DOMAIN;
+ }
+ }
+ else
+ {
+ NOT_MATHERRL {ERRNO_DOMAIN;}
+ }
+ *(long double *)retval = excl.retval;
+ break;
+ }
+ case asind_gt_one:
+ /* asind(x > 1) */
+ {
+ DOMAIND; NAMED = (char *) "asind";
+ RETVAL_ZEROD;
+ ifSVID
+ {
+ NOT_MATHERRD
+ {
+ WRITED_ASIND;
+ ERRNO_DOMAIN;
+ }
+ }
+ else
+ {
+ NOT_MATHERRD {ERRNO_DOMAIN;}
+ }
+ *(double *)retval = exc.retval;
+ break;
+ }
+ case asindf_gt_one:
+ /* asindf(x > 1) */
+ {
+ DOMAINF; NAMEF = (char *) "asindf";
+ RETVAL_ZEROF;
+ ifSVID
+ {
+ NOT_MATHERRF
+ {
+ WRITEF_ASIND;
+ ERRNO_DOMAIN;
+ }
+ }
+ else
+ {
+ NOT_MATHERRF {ERRNO_DOMAIN;}
+ }
+ *(float *)retval = excf.retval;
break;
}
case coshl_overflow:
@@ -2220,7 +2798,7 @@ else
{
RETVAL_HUGEL;
}
- else
+ else
{
RETVAL_HUGE_VALL;
}
@@ -2236,7 +2814,7 @@ else
{
RETVAL_HUGED;
}
- else
+ else
{
RETVAL_HUGE_VALD;
}
@@ -2252,7 +2830,7 @@ else
{
RETVAL_HUGEF;
}
- else
+ else
{
RETVAL_HUGE_VALF;
}
@@ -2269,7 +2847,7 @@ else
if (INPUT_XL > 0.0) RETVAL_HUGEL;
else RETVAL_NEG_HUGEL;
}
- else
+ else
{
if (INPUT_XL > 0.0) RETVAL_HUGE_VALL;
else RETVAL_NEG_HUGE_VALL;
@@ -2287,7 +2865,7 @@ else
if (INPUT_XD > 0.0) RETVAL_HUGED;
else RETVAL_NEG_HUGED;
}
- else
+ else
{
if (INPUT_XD > 0.0) RETVAL_HUGE_VALD;
else RETVAL_NEG_HUGE_VALD;
@@ -2305,7 +2883,7 @@ else
if( INPUT_XF > 0.0) RETVAL_HUGEF;
else RETVAL_NEG_HUGEF;
}
- else
+ else
{
if (INPUT_XF > 0.0) RETVAL_HUGE_VALF;
else RETVAL_NEG_HUGE_VALF;
@@ -2318,7 +2896,7 @@ else
/* acoshl(x < 1) */
{
DOMAINL; NAMEL = (char *) "acoshl";
- ifSVID
+ ifSVID
{
NOT_MATHERRL
{
@@ -2326,7 +2904,10 @@ else
ERRNO_DOMAIN;
}
}
- else NOT_MATHERRL {ERRNO_DOMAIN;}
+ else
+ {
+ NOT_MATHERRL {ERRNO_DOMAIN;}
+ }
*(long double *)retval = excl.retval;
break;
}
@@ -2334,7 +2915,7 @@ else
/* acosh(x < 1) */
{
DOMAIND; NAMED = (char *) "acosh";
- ifSVID
+ ifSVID
{
NOT_MATHERRD
{
@@ -2342,7 +2923,10 @@ else
ERRNO_DOMAIN;
}
}
- else NOT_MATHERRD {ERRNO_DOMAIN;}
+ else
+ {
+ NOT_MATHERRD {ERRNO_DOMAIN;}
+ }
*(double *)retval = exc.retval;
break;
}
@@ -2350,7 +2934,7 @@ else
/* acoshf(x < 1) */
{
DOMAINF; NAMEF = (char *) "acoshf";
- ifSVID
+ ifSVID
{
NOT_MATHERRF
{
@@ -2369,7 +2953,7 @@ else
/* atanhl(|x| > 1) */
{
DOMAINL; NAMEL = (char *) "atanhl";
- ifSVID
+ ifSVID
{
NOT_MATHERRL
{
@@ -2387,7 +2971,7 @@ else
/* atanh(|x| > 1) */
{
DOMAIND; NAMED = (char *) "atanh";
- ifSVID
+ ifSVID
{
NOT_MATHERRD
{
@@ -2405,7 +2989,7 @@ else
/* atanhf(|x| > 1) */
{
DOMAINF; NAMEF = (char *) "atanhf";
- ifSVID
+ ifSVID
{
NOT_MATHERRF
{
@@ -2422,8 +3006,8 @@ else
case atanhl_eq_one:
/* atanhl(|x| == 1) */
{
- SINGL; NAMEL = (char *)"atanhl";
- ifSVID
+ SINGL; NAMEL = (char *) "atanhl";
+ ifSVID
{
NOT_MATHERRL
{
@@ -2441,7 +3025,7 @@ else
/* atanh(|x| == 1) */
{
SINGD; NAMED = (char *) "atanh";
- ifSVID
+ ifSVID
{
NOT_MATHERRD
{
@@ -2459,7 +3043,7 @@ else
/* atanhf(|x| == 1) */
{
SINGF; NAMEF = (char *) "atanhf";
- ifSVID
+ ifSVID
{
NOT_MATHERRF
{
@@ -2477,7 +3061,7 @@ else
/* gammal overflow */
{
OVERFLOWL; NAMEL = (char *) "gammal";
- ifSVID
+ ifSVID
{
RETVAL_HUGEL;
}
@@ -2485,15 +3069,15 @@ else
{
RETVAL_HUGE_VALL;
}
- NOT_MATHERRL {ERRNO_RANGE;}
- *(long double *)retval = excl.retval;
+ NOT_MATHERRL{ERRNO_RANGE;}
+ *(long double*)retval = excl.retval;
break;
}
case gamma_overflow:
/* gamma overflow */
{
OVERFLOWD; NAMED = (char *) "gamma";
- ifSVID
+ ifSVID
{
RETVAL_HUGED;
}
@@ -2501,31 +3085,94 @@ else
{
RETVAL_HUGE_VALD;
}
- NOT_MATHERRD {ERRNO_RANGE;}
- *(double *)retval = exc.retval;
+ NOT_MATHERRD{ERRNO_RANGE;}
+ *(double*)retval = exc.retval;
break;
}
case gammaf_overflow:
/* gammaf overflow */
{
OVERFLOWF; NAMEF = (char *) "gammaf";
+ ifSVID
+ {
+ RETVAL_HUGEF;
+ }
+ else
+ {
+ RETVAL_HUGE_VALF;
+ }
+ NOT_MATHERRF{ERRNO_RANGE;}
+ *(float*)retval = excf.retval;
+ break;
+ }
+ case gammal_negative:
+ /* gammal -int or 0 */
+ {
+ SINGL; NAMEL = (char *) "gammal";
ifSVID
{
+ RETVAL_HUGEL;
+ NOT_MATHERRL
+ {
+ WRITEL_GAMMA_NEGATIVE;
+ ERRNO_DOMAIN;
+ }
+ }
+ else
+ {
+ RETVAL_HUGE_VALL;
+ NOT_MATHERRL{ERRNO_DOMAIN;}
+ }
+ *(long double*)retval = excl.retval;
+ break;
+ }
+ case gamma_negative:
+ /* gamma -int or 0 */
+ {
+ SINGD; NAMED = (char *) "gamma";
+ ifSVID
+ {
+ RETVAL_HUGED;
+ NOT_MATHERRD
+ {
+ WRITED_GAMMA_NEGATIVE;
+ ERRNO_DOMAIN;
+ }
+ }
+ else
+ {
+ RETVAL_HUGE_VALD;
+ NOT_MATHERRD{ERRNO_DOMAIN;}
+ }
+ *(double*)retval = exc.retval;
+ break;
+ }
+ case gammaf_negative:
+ /* gammaf -int or 0 */
+ {
+ SINGF; NAMEF = (char *) "gammaf";
+ ifSVID
+ {
RETVAL_HUGEF;
+ NOT_MATHERRF
+ {
+ WRITEF_GAMMA_NEGATIVE;
+ ERRNO_DOMAIN;
+ }
}
else
{
RETVAL_HUGE_VALF;
+ NOT_MATHERRF{ERRNO_DOMAIN;}
}
- NOT_MATHERRF {ERRNO_RANGE;}
- *(float *)retval = excf.retval;
+ *(float*)retval = excf.retval;
break;
}
case lgammal_overflow:
/* lgammal overflow */
{
OVERFLOWL; NAMEL = (char *) "lgammal";
- ifSVID
+ ifSVID
{
RETVAL_HUGEL;
}
@@ -2533,15 +3180,15 @@ else
{
RETVAL_HUGE_VALL;
}
- NOT_MATHERRL {ERRNO_RANGE;}
- *(long double *)retval = excl.retval;
+ NOT_MATHERRL{ERRNO_RANGE;}
+ *(long double*)retval = excl.retval;
break;
}
case lgamma_overflow:
/* lgamma overflow */
{
OVERFLOWD; NAMED = (char *) "lgamma";
- ifSVID
+ ifSVID
{
RETVAL_HUGED;
}
@@ -2549,15 +3196,15 @@ else
{
RETVAL_HUGE_VALD;
}
- NOT_MATHERRD {ERRNO_RANGE;}
- *(double *)retval = exc.retval;
+ NOT_MATHERRD{ERRNO_RANGE;}
+ *(double*)retval = exc.retval;
break;
}
case lgammaf_overflow:
/* lgammaf overflow */
{
OVERFLOWF; NAMEF = (char *) "lgammaf";
- ifSVID
+ ifSVID
{
RETVAL_HUGEF;
}
@@ -2565,8 +3212,8 @@ else
{
RETVAL_HUGE_VALF;
}
- NOT_MATHERRF {ERRNO_RANGE;}
- *(float *)retval = excf.retval;
+ NOT_MATHERRF{ERRNO_RANGE;}
+ *(float*)retval = excf.retval;
break;
}
case lgammal_negative:
@@ -2578,16 +3225,16 @@ else
RETVAL_HUGEL;
NOT_MATHERRL
{
- WRITEL_LGAMMA_NEGATIVE;
- ERRNO_DOMAIN;
+ WRITEL_GAMMA_NEGATIVE;
+ ERRNO_DOMAIN;
}
}
else
{
RETVAL_HUGE_VALL;
- NOT_MATHERRL {ERRNO_DOMAIN;}
+ NOT_MATHERRL{ERRNO_DOMAIN;}
}
- *(long double *)retval = excl.retval;
+ *(long double*)retval = excl.retval;
break;
}
case lgamma_negative:
@@ -2606,16 +3253,16 @@ else
else
{
RETVAL_HUGE_VALD;
- NOT_MATHERRD {ERRNO_DOMAIN;}
+ NOT_MATHERRD{ERRNO_DOMAIN;}
}
- *(double *)retval = exc.retval;
+ *(double*)retval = exc.retval;
break;
}
case lgammaf_negative:
/* lgammaf -int or 0 */
{
SINGF; NAMEF = (char *) "lgammaf";
- ifSVID
+ ifSVID
{
RETVAL_HUGEF;
NOT_MATHERRF
@@ -2627,72 +3274,114 @@ else
else
{
RETVAL_HUGE_VALF;
- NOT_MATHERRF {ERRNO_DOMAIN;}
+ NOT_MATHERRF{ERRNO_DOMAIN;}
}
- *(float *)retval = excf.retval;
+ *(float*)retval = excf.retval;
break;
}
- case gammal_negative:
- /* gammal -int or 0 */
+ case tgammal_overflow:
+ /* tgammal overflow */
{
- SINGL; NAMEL = (char *) "gammal";
- ifSVID
+ OVERFLOWL; NAMEL = (char *) "tgammal";
+ ifSVID
{
RETVAL_HUGEL;
+ }
+ else
+ {
+ RETVAL_HUGE_VALL;
+ }
+ NOT_MATHERRL{ERRNO_RANGE;}
+ *(long double*)retval = excl.retval;
+ break;
+ }
+ case tgamma_overflow:
+ /* tgamma overflow */
+ {
+ OVERFLOWD; NAMED = (char *) "tgamma";
+ ifSVID
+ {
+ RETVAL_HUGED;
+ }
+ else
+ {
+ RETVAL_HUGE_VALD;
+ }
+ NOT_MATHERRD{ERRNO_RANGE;}
+ *(double*)retval = exc.retval;
+ break;
+ }
+ case tgammaf_overflow:
+ /* tgammaf overflow */
+ {
+ OVERFLOWF; NAMEF = (char *) "tgammaf";
+ ifSVID
+ {
+ RETVAL_HUGEF;
+ }
+ else
+ {
+ RETVAL_HUGE_VALF;
+ }
+ NOT_MATHERRF{ERRNO_RANGE;}
+ *(float*)retval = excf.retval;
+ break;
+ }
+ case tgammal_negative:
+ /* tgammal -int or 0 */
+ {
+ SINGL; NAMEL = (char *) "tgammal";
+ ifSVID
+ {
NOT_MATHERRL
{
- WRITEL_GAMMA_NEGATIVE;
- ERRNO_DOMAIN;
+ WRITEL_TGAMMA_NEGATIVE;
+ ERRNO_DOMAIN;
}
}
else
{
- RETVAL_HUGE_VALL;
- NOT_MATHERRL {ERRNO_DOMAIN;}
+ NOT_MATHERRL{ERRNO_DOMAIN;}
}
- *(long double *)retval = excl.retval;
+ *(long double*)retval = excl.retval;
break;
}
- case gamma_negative:
- /* gamma -int or 0 */
+ case tgamma_negative:
+ /* tgamma -int or 0 */
{
- SINGD; NAMED = (char *) "gamma";
- ifSVID
+ SINGD; NAMED = (char *) "tgamma";
+ ifSVID
{
- RETVAL_HUGED;
NOT_MATHERRD
{
- WRITED_GAMMA_NEGATIVE;
- ERRNO_DOMAIN;
+ WRITED_TGAMMA_NEGATIVE;
+ ERRNO_DOMAIN;
}
}
else
{
- RETVAL_HUGE_VALD;
- NOT_MATHERRD {ERRNO_DOMAIN;}
+ NOT_MATHERRD{ERRNO_DOMAIN;}
}
- *(double *)retval = exc.retval;
+ *(double*)retval = exc.retval;
break;
}
- case gammaf_negative:
- /* gammaf -int or 0 */
+ case tgammaf_negative:
+ /* tgammaf -int or 0 */
{
- SINGF; NAMEF = (char *) "gammaf";
- ifSVID
+ SINGF; NAMEF = (char *) "tgammaf";
+ ifSVID
{
- RETVAL_HUGEF;
NOT_MATHERRF
{
- WRITEF_GAMMA_NEGATIVE;
- ERRNO_DOMAIN;
+ WRITEF_TGAMMA_NEGATIVE;
+ ERRNO_DOMAIN;
}
}
else
{
- RETVAL_HUGE_VALF;
- NOT_MATHERRF {ERRNO_DOMAIN;}
+ NOT_MATHERRF{ERRNO_DOMAIN;}
}
- *(float *)retval = excf.retval;
+ *(float*)retval = excf.retval;
break;
}
case j0l_gt_loss:
@@ -2700,7 +3389,7 @@ else
{
TLOSSL; NAMEL = (char *) "j0l";
RETVAL_ZEROL;
- ifSVID
+ ifSVID
{
NOT_MATHERRL
{
@@ -2712,7 +3401,7 @@ else
{
NOT_MATHERRL {ERRNO_RANGE;}
}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
case j0_gt_loss:
@@ -2720,7 +3409,7 @@ else
{
TLOSSD; NAMED = (char *) "j0";
RETVAL_ZEROD;
- ifSVID
+ ifSVID
{
NOT_MATHERRD
{
@@ -2732,7 +3421,7 @@ else
{
NOT_MATHERRD {ERRNO_RANGE;}
}
- *(double*)retval = exc.retval;
+ *(double*)retval = exc.retval;
break;
}
case j0f_gt_loss:
@@ -2740,7 +3429,7 @@ else
{
TLOSSF; NAMEF = (char *) "j0f";
RETVAL_ZEROF;
- ifSVID
+ ifSVID
{
NOT_MATHERRF
{
@@ -2760,7 +3449,7 @@ else
{
TLOSSL; NAMEL = (char *) "j1l";
RETVAL_ZEROL;
- ifSVID
+ ifSVID
{
NOT_MATHERRL
{
@@ -2772,7 +3461,7 @@ else
{
NOT_MATHERRL {ERRNO_RANGE;}
}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
case j1_gt_loss:
@@ -2780,7 +3469,7 @@ else
{
TLOSSD; NAMED = (char *) "j1";
RETVAL_ZEROD;
- ifSVID
+ ifSVID
{
NOT_MATHERRD
{
@@ -2792,7 +3481,7 @@ else
{
NOT_MATHERRD {ERRNO_RANGE;}
}
- *(double*)retval = exc.retval;
+ *(double*)retval = exc.retval;
break;
}
case j1f_gt_loss:
@@ -2800,7 +3489,7 @@ else
{
TLOSSF; NAMEF = (char *) "j1f";
RETVAL_ZEROF;
- ifSVID
+ ifSVID
{
NOT_MATHERRF
{
@@ -2820,7 +3509,7 @@ else
{
TLOSSL; NAMEL = (char *) "jnl";
RETVAL_ZEROL;
- ifSVID
+ ifSVID
{
NOT_MATHERRL
{
@@ -2832,7 +3521,7 @@ else
{
NOT_MATHERRL {ERRNO_RANGE;}
}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
case jn_gt_loss:
@@ -2840,7 +3529,7 @@ else
{
TLOSSD; NAMED = (char *) "jn";
RETVAL_ZEROD;
- ifSVID
+ ifSVID
{
NOT_MATHERRD
{
@@ -2852,7 +3541,7 @@ else
{
NOT_MATHERRD {ERRNO_RANGE;}
}
- *(double*)retval = exc.retval;
+ *(double*)retval = exc.retval;
break;
}
case jnf_gt_loss:
@@ -2860,7 +3549,7 @@ else
{
TLOSSF; NAMEF = (char *) "jnf";
RETVAL_ZEROF;
- ifSVID
+ ifSVID
{
NOT_MATHERRF
{
@@ -2880,7 +3569,7 @@ else
{
TLOSSL; NAMEL = (char *) "y0l";
RETVAL_ZEROL;
- ifSVID
+ ifSVID
{
NOT_MATHERRL
{
@@ -2900,7 +3589,7 @@ else
{
TLOSSD; NAMED = (char *) "y0";
RETVAL_ZEROD;
- ifSVID
+ ifSVID
{
NOT_MATHERRD
{
@@ -2920,7 +3609,7 @@ else
{
TLOSSF; NAMEF = (char *) "y0f";
RETVAL_ZEROF;
- ifSVID
+ ifSVID
{
NOT_MATHERRF
{
@@ -2939,10 +3628,10 @@ else
/* y0l(0) */
{
DOMAINL; NAMEL = (char *) "y0l";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEL;
- NOT_MATHERRL
+ NOT_MATHERRL
{
WRITEL_Y0_ZERO;
ERRNO_DOMAIN;
@@ -2950,20 +3639,20 @@ else
}
else
{
- RETVAL_NEG_HUGE_VALL;
+ RETVAL_NEG_HUGE_VALL;
NOT_MATHERRL {ERRNO_DOMAIN;}
}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
case y0_zero:
/* y0(0) */
{
DOMAIND; NAMED = (char *) "y0";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGED;
- NOT_MATHERRD
+ NOT_MATHERRD
{
WRITED_Y0_ZERO;
ERRNO_DOMAIN;
@@ -2971,20 +3660,20 @@ else
}
else
{
- RETVAL_NEG_HUGE_VALD;
+ RETVAL_NEG_HUGE_VALD;
NOT_MATHERRD {ERRNO_DOMAIN;}
}
- *(double *)retval = exc.retval;
+ *(double *)retval = exc.retval;
break;
}
case y0f_zero:
/* y0f(0) */
{
DOMAINF; NAMEF = (char *) "y0f";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEF;
- NOT_MATHERRF
+ NOT_MATHERRF
{
WRITEF_Y0_ZERO;
ERRNO_DOMAIN;
@@ -2992,10 +3681,10 @@ else
}
else
{
- RETVAL_NEG_HUGE_VALF;
+ RETVAL_NEG_HUGE_VALF;
NOT_MATHERRF {ERRNO_DOMAIN;}
}
- *(float *)retval = excf.retval;
+ *(float *)retval = excf.retval;
break;
}
case y1l_gt_loss:
@@ -3003,7 +3692,7 @@ else
{
TLOSSL; NAMEL = (char *) "y1l";
RETVAL_ZEROL;
- ifSVID
+ ifSVID
{
NOT_MATHERRL
{
@@ -3023,7 +3712,7 @@ else
{
TLOSSD; NAMED = (char *) "y1";
RETVAL_ZEROD;
- ifSVID
+ ifSVID
{
NOT_MATHERRD
{
@@ -3043,7 +3732,7 @@ else
{
TLOSSF; NAMEF = (char *) "y1f";
RETVAL_ZEROF;
- ifSVID
+ ifSVID
{
NOT_MATHERRF
{
@@ -3062,10 +3751,10 @@ else
/* y1l(0) */
{
DOMAINL; NAMEL = (char *) "y1l";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEL;
- NOT_MATHERRL
+ NOT_MATHERRL
{
WRITEL_Y1_ZERO;
ERRNO_DOMAIN;
@@ -3073,20 +3762,20 @@ else
}
else
{
- RETVAL_NEG_HUGE_VALL;
+ RETVAL_NEG_HUGE_VALL;
NOT_MATHERRL {ERRNO_DOMAIN;}
}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
case y1_zero:
/* y1(0) */
{
DOMAIND; NAMED = (char *) "y1";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGED;
- NOT_MATHERRD
+ NOT_MATHERRD
{
WRITED_Y1_ZERO;
ERRNO_DOMAIN;
@@ -3094,30 +3783,30 @@ else
}
else
{
- RETVAL_NEG_HUGE_VALD;
+ RETVAL_NEG_HUGE_VALD;
NOT_MATHERRD {ERRNO_DOMAIN;}
}
- *(double *)retval = exc.retval;
+ *(double *)retval = exc.retval;
break;
}
case y1f_zero:
/* y1f(0) */
{
DOMAINF; NAMEF = (char *) "y1f";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEF;
- NOT_MATHERRF
+ NOT_MATHERRF
{
WRITEF_Y1_ZERO;
ERRNO_DOMAIN;
}
}else
{
- RETVAL_NEG_HUGE_VALF;
+ RETVAL_NEG_HUGE_VALF;
NOT_MATHERRF {ERRNO_DOMAIN;}
}
- *(float *)retval = excf.retval;
+ *(float *)retval = excf.retval;
break;
}
case ynl_gt_loss:
@@ -3125,7 +3814,7 @@ else
{
TLOSSL; NAMEL = (char *) "ynl";
RETVAL_ZEROL;
- ifSVID
+ ifSVID
{
NOT_MATHERRL
{
@@ -3145,7 +3834,7 @@ else
{
TLOSSD; NAMED = (char *) "yn";
RETVAL_ZEROD;
- ifSVID
+ ifSVID
{
NOT_MATHERRD
{
@@ -3165,7 +3854,7 @@ else
{
TLOSSF; NAMEF = (char *) "ynf";
RETVAL_ZEROF;
- ifSVID
+ ifSVID
{
NOT_MATHERRF
{
@@ -3184,10 +3873,10 @@ else
/* ynl(0) */
{
DOMAINL; NAMEL = (char *) "ynl";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEL;
- NOT_MATHERRL
+ NOT_MATHERRL
{
WRITEL_YN_ZERO;
ERRNO_DOMAIN;
@@ -3195,20 +3884,20 @@ else
}
else
{
- RETVAL_NEG_HUGE_VALL;
+ RETVAL_NEG_HUGE_VALL;
NOT_MATHERRL {ERRNO_DOMAIN;}
}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
case yn_zero:
/* yn(0) */
{
DOMAIND; NAMED = (char *) "yn";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGED;
- NOT_MATHERRD
+ NOT_MATHERRD
{
WRITED_YN_ZERO;
ERRNO_DOMAIN;
@@ -3216,20 +3905,20 @@ else
}
else
{
- RETVAL_NEG_HUGE_VALD;
+ RETVAL_NEG_HUGE_VALD;
NOT_MATHERRD {ERRNO_DOMAIN;}
}
- *(double *)retval = exc.retval;
+ *(double *)retval = exc.retval;
break;
}
case ynf_zero:
/* ynf(0) */
{
DOMAINF; NAMEF = (char *) "ynf";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEF;
- NOT_MATHERRF
+ NOT_MATHERRF
{
WRITEF_YN_ZERO;
ERRNO_DOMAIN;
@@ -3237,20 +3926,20 @@ else
}
else
{
- RETVAL_NEG_HUGE_VALF;
+ RETVAL_NEG_HUGE_VALF;
NOT_MATHERRF {ERRNO_DOMAIN;}
}
- *(float *)retval = excf.retval;
+ *(float *)retval = excf.retval;
break;
}
case y0l_negative:
/* y0l(x<0) */
{
DOMAINL; NAMEL = (char *) "y0l";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEL;
- NOT_MATHERRL
+ NOT_MATHERRL
{
WRITEL_Y0_NEGATIVE;
ERRNO_DOMAIN;
@@ -3258,20 +3947,20 @@ else
}
else
{
- RETVAL_NEG_HUGE_VALL;
+ RETVAL_NEG_HUGE_VALL;
NOT_MATHERRL {ERRNO_DOMAIN;}
}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
case y0_negative:
/* y0(x<0) */
{
DOMAIND; NAMED = (char *) "y0";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGED;
- NOT_MATHERRD
+ NOT_MATHERRD
{
WRITED_Y0_NEGATIVE;
ERRNO_DOMAIN;
@@ -3279,20 +3968,20 @@ else
}
else
{
- RETVAL_NEG_HUGE_VALD;
+ RETVAL_NEG_HUGE_VALD;
NOT_MATHERRD {ERRNO_DOMAIN;}
}
- *(double *)retval = exc.retval;
+ *(double *)retval = exc.retval;
break;
}
case y0f_negative:
/* y0f(x<0) */
{
DOMAINF; NAMEF = (char *) "y0f";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEF;
- NOT_MATHERRF
+ NOT_MATHERRF
{
WRITEF_Y0_NEGATIVE;
ERRNO_DOMAIN;
@@ -3300,20 +3989,20 @@ else
}
else
{
- RETVAL_NEG_HUGE_VALF;
+ RETVAL_NEG_HUGE_VALF;
NOT_MATHERRF {ERRNO_DOMAIN;}
}
- *(float *)retval = excf.retval;
+ *(float *)retval = excf.retval;
break;
}
case y1l_negative:
/* y1l(x<0) */
{
DOMAINL; NAMEL = (char *) "y1l";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEL;
- NOT_MATHERRL
+ NOT_MATHERRL
{
WRITEL_Y1_NEGATIVE;
ERRNO_DOMAIN;
@@ -3321,20 +4010,20 @@ else
}
else
{
- RETVAL_NEG_HUGE_VALL;
+ RETVAL_NEG_HUGE_VALL;
NOT_MATHERRL {ERRNO_DOMAIN;}
}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
case y1_negative:
/* y1(x<0) */
{
DOMAIND; NAMED = (char *) "y1";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGED;
- NOT_MATHERRD
+ NOT_MATHERRD
{
WRITED_Y1_NEGATIUE;
ERRNO_DOMAIN;
@@ -3342,20 +4031,20 @@ else
}
else
{
- RETVAL_NEG_HUGE_VALD;
+ RETVAL_NEG_HUGE_VALD;
NOT_MATHERRD {ERRNO_DOMAIN;}
}
- *(double *)retval = exc.retval;
+ *(double *)retval = exc.retval;
break;
}
case y1f_negative:
/* y1f(x<0) */
{
DOMAINF; NAMEF = (char *) "y1f";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEF;
- NOT_MATHERRF
+ NOT_MATHERRF
{
WRITEF_Y1_NEGATIVE;
ERRNO_DOMAIN;
@@ -3363,20 +4052,20 @@ else
}
else
{
- RETVAL_NEG_HUGE_VALF;
+ RETVAL_NEG_HUGE_VALF;
NOT_MATHERRF {ERRNO_DOMAIN;}
}
- *(float *)retval = excf.retval;
+ *(float *)retval = excf.retval;
break;
}
case ynl_negative:
/* ynl(x<0) */
{
DOMAINL; NAMEL = (char *) "ynl";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEL;
- NOT_MATHERRL
+ NOT_MATHERRL
{
WRITEL_YN_NEGATIVE;
ERRNO_DOMAIN;
@@ -3384,20 +4073,20 @@ else
}
else
{
- RETVAL_NEG_HUGE_VALL;
+ RETVAL_NEG_HUGE_VALL;
NOT_MATHERRL {ERRNO_DOMAIN;}
}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
case yn_negative:
/* yn(x<0) */
{
DOMAIND; NAMED = (char *) "yn";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGED;
- NOT_MATHERRD
+ NOT_MATHERRD
{
WRITED_YN_NEGATIVE;
ERRNO_DOMAIN;
@@ -3405,20 +4094,20 @@ else
}
else
{
- RETVAL_NEG_HUGE_VALD;
+ RETVAL_NEG_HUGE_VALD;
NOT_MATHERRD {ERRNO_DOMAIN;}
}
- *(double *)retval = exc.retval;
+ *(double *)retval = exc.retval;
break;
}
case ynf_negative:
/* ynf(x<0) */
{
DOMAINF; NAMEF = (char *) "ynf";
- ifSVID
+ ifSVID
{
RETVAL_NEG_HUGEF;
- NOT_MATHERRF
+ NOT_MATHERRF
{
WRITEF_YN_NEGATIVE;
ERRNO_DOMAIN;
@@ -3426,18 +4115,18 @@ else
}
else
{
- RETVAL_NEG_HUGE_VALF;
+ RETVAL_NEG_HUGE_VALF;
NOT_MATHERRF {ERRNO_DOMAIN;}
}
- *(float *)retval = excf.retval;
+ *(float *)retval = excf.retval;
break;
}
- case fmodl_by_zero:
+ case fmodl_by_zero:
/* fmodl(x,0) */
{
DOMAINL; NAMEL = (char *) "fmodl";
- ifSVID
- {
+ ifSVID
+ {
*(long double *)retval = *(long double *)arg1;
NOT_MATHERRL
{
@@ -3445,21 +4134,21 @@ else
ERRNO_DOMAIN;
}
}
- else
+ else
{ /* NaN already computed */
NOT_MATHERRL {ERRNO_DOMAIN;}
}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
- case fmod_by_zero:
+ case fmod_by_zero:
/* fmod(x,0) */
{
DOMAIND; NAMED = (char *) "fmod";
- ifSVID
+ ifSVID
{
*(double *)retval = *(double *)arg1;
- NOT_MATHERRD
+ NOT_MATHERRD
{
WRITED_FMOD;
ERRNO_DOMAIN;
@@ -3468,18 +4157,18 @@ else
else
{ /* NaN already computed */
NOT_MATHERRD {ERRNO_DOMAIN;}
- }
- *(double *)retval = exc.retval;
+ }
+ *(double *)retval = exc.retval;
break;
}
- case fmodf_by_zero:
+ case fmodf_by_zero:
/* fmodf(x,0) */
{
DOMAINF; NAMEF = (char *) "fmodf";
- ifSVID
+ ifSVID
{
*(float *)retval = *(float *)arg1;
- NOT_MATHERRF
+ NOT_MATHERRF
{
WRITEF_FMOD;
ERRNO_DOMAIN;
@@ -3488,36 +4177,36 @@ else
else
{
NOT_MATHERRF {ERRNO_DOMAIN;}
- }
- *(float *)retval = excf.retval;
+ }
+ *(float *)retval = excf.retval;
break;
}
- case remainderl_by_zero:
+ case remainderl_by_zero:
/* remainderl(x,0) */
{
DOMAINL; NAMEL = (char *) "remainderl";
- ifSVID
- {
+ ifSVID
+ {
NOT_MATHERRL
{
WRITEL_REM;
ERRNO_DOMAIN;
}
}
- else
+ else
{ /* NaN already computed */
NOT_MATHERRL {ERRNO_DOMAIN;}
}
- *(long double *)retval = excl.retval;
+ *(long double *)retval = excl.retval;
break;
}
- case remainder_by_zero:
+ case remainder_by_zero:
/* remainder(x,0) */
{
DOMAIND; NAMED = (char *) "remainder";
- ifSVID
+ ifSVID
{
- NOT_MATHERRD
+ NOT_MATHERRD
{
WRITED_REM;
ERRNO_DOMAIN;
@@ -3526,17 +4215,17 @@ else
else
{ /* NaN already computed */
NOT_MATHERRD {ERRNO_DOMAIN;}
- }
- *(double *)retval = exc.retval;
+ }
+ *(double *)retval = exc.retval;
break;
}
- case remainderf_by_zero:
+ case remainderf_by_zero:
/* remainderf(x,0) */
{
DOMAINF; NAMEF = (char *) "remainderf";
- ifSVID
+ ifSVID
{
- NOT_MATHERRF
+ NOT_MATHERRF
{
WRITEF_REM;
ERRNO_DOMAIN;
@@ -3545,12 +4234,14 @@ else
else
{
NOT_MATHERRF {ERRNO_DOMAIN;}
- }
- *(float *)retval = excf.retval;
+ }
+ *(float *)retval = excf.retval;
break;
}
default:
- abort();
+ /* We don't want to abort () since SVID doesn't cover all math
+ library functions. */
+ break;
}
return;
}
diff --git a/sysdeps/ia64/fpu/libm_frexp.S b/sysdeps/ia64/fpu/libm_frexp.S
new file mode 100644
index 0000000000..c6bd676a40
--- /dev/null
+++ b/sysdeps/ia64/fpu/libm_frexp.S
@@ -0,0 +1,209 @@
+.file "libm_frexp.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 02/02/00 Initial version
+// 03/20/00 Improved speed
+// 06/01/00 Fixed bug when x a double-extended denormal
+// 12/08/00 Corrected label on .endp
+// 01/23/02 Added handling for int 32 or 64 bits
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// double __libm_frexp(double x, int* y, int int_type)
+// input floating point f8, pointer to y (r33), int int_type (r34)
+// output floating point f8, returns the fraction of x, 0.5 <= fraction < 1.0
+// output int* y, returns the true exponent of x
+//
+// int_type = 0 if int is 32 bits
+// int_type = 1 if int is 64 bits
+//
+// int* y is returned as a 32 bit integer if int_type = 0
+// int* y is returned as a 64 bit integer if int_type = 1
+//
+// Overview of operation
+//==============================================================
+// Break a floating point number x into a fraction and an exponent.
+// The fraction is returned as a double.
+// The exponent is returned as an integer pointed to by y.
+// This is a true (not a biased) exponent, but 0xfffe is subtracted
+// as a bias instead of 0xffff. This is because the fraction returned
+// is between 0.5 and 1.0, not the expected IEEE range.
+//
+// The fraction is 0.5 <= fraction < 1.0
+//
+// Registers used
+//==============================================================
+//
+// general registers:
+// r14 exponent bias for x negative
+// r15 exponent bias for x positive
+// r16 signexp of x
+// r17 exponent mask
+// r18 exponent of x
+// r19 exponent result
+// r20 signexp of 2^64
+// r32 on input contains the 64-bit IEEE double that is in f8
+// r33 on input pointer to 32-bit or 64-bit integer for exponent
+// r34 on input contains 0 if output int is 32 bits, else output int is 64 bits
+//
+// predicate registers:
+// p6 set if x is NaN, zero, or infinity
+// p7 set if x negative
+// p8 set if x positive
+// p9 set if x double-extended denormal
+// p10 set if int_type = 0, 32-bit integer
+// p11 set if int_type = 1, 64-bit integer
+//
+// floating-point registers:
+// f8 input, output
+// f9 normalized x
+// f10 signexp for significand result for x positive
+// f11 signexp for significand result for x negative
+// f12 2^64
+
+.section .text
+GLOBAL_LIBM_ENTRY(__libm_frexp)
+// Return the fraction of x (f8) in f8 as a double; store the exponent through r33.
+// Set signexp for significand result for x>0 (r15)
+// If x is a NaN, zero, or infinity, return it
+// and put 0 in the int pointer.
+// Is x NaN, zero, or infinity?  p6 if so, else p7.
+// Set signexp for significand result for x<0 (r14)
+{ .mfi
+ mov r15 = 0x0fffe
+ fclass.m p6,p7 = f8, 0xe7
+ mov r14 = 0x2fffe
+}
+// Form signexp of 2^64 in case x is a double-extended denormal
+// Save the normalized value of input in f9
+// The normalization also sets fault flags and takes faults if necessary
+{ .mfi
+ mov r20 = 0x1003f
+ fnorm.s0 f9 = f8
+ nop.i 999 ;;
+}
+
+// Move signexp for significand result for x>0 to FP reg
+// Form 2^64 in case x is a double-extended denormal
+{ .mmi
+ setf.exp f10 = r15
+ setf.exp f12 = r20
+ nop.i 999 ;;
+}
+
+// Move signexp for significand result for x<0 to FP reg
+// p7 if x<0, else p8 (compare only executes when p7 is already set)
+// If x is zero, NaN, or infinity, set p10 if output int is 32 bits, or p11 if 64 bits
+{ .mfi
+ setf.exp f11 = r14
+(p7) fcmp.lt.s0 p7,p8 = f8,f0
+(p6) cmp.eq.unc p10,p11 = r34, r0 ;;
+}
+
+// If x is NaN, zero, or infinity, set *y=0 and exit
+{ .mmb
+(p10) st4 [r33] = r0 // Store *y=0 as 32-bit integer
+(p11) st8 [r33] = r0 // Store *y=0 as 64-bit integer
+(p6) br.ret.spnt b0 ;;
+}
+
+// Form exponent mask
+// Test for fnorm(x) denormal, which means x is a double-extended denormal
+{ .mfi
+ mov r17 = 0x1ffff
+ fclass.m p9,p0 = f9, 0x0b
+ nop.i 999 ;;
+}
+
+// If x is a double-extended denormal, add 64 to the exponent bias for scaling
+// If x is a double-extended denormal, multiply x by 2^64, which is normal
+// Set p10 if output int is to be 32 bits, or set p11 if 64 bits
+{ .mfi
+(p9) add r15 = 64, r15
+(p9) fmpy.s0 f9 = f9, f12
+ cmp.eq p10,p11 = r34, r0 ;;
+}
+
+// The true exponent is stored to the int pointer.
+// The bias is treated as 0xfffe instead of the
+// normal 0xffff because we want the significand
+// to be in the range 0.5 <= sig < 1.0
+// Store the value of the exponent at the pointer in r33
+
+// If x>0, form significand result
+{ .mfi
+ nop.m 999
+(p8) fmerge.se f8 = f10,f9
+ nop.i 999 ;;
+}
+
+// Get signexp of normalized x
+// If x<0, form significand result
+{ .mfi
+ getf.exp r16 = f9
+(p7) fmerge.se f8 = f11,f9
+ nop.i 999 ;;
+}
+
+// Get exp of normalized x by applying the exponent mask in r17
+// Subtract off bias to get true exponent of x
+{ .mmi
+ and r18 = r17,r16 ;;
+ sub r19 = r18,r15
+ nop.i 999 ;;
+}
+
+// Store int *y as a 32-bit integer (when p10 is set)
+// Make the value a double
+{ .mfi
+(p10) st4 [r33] = r19 // Store *y as 32-bit integer
+ fnorm.d.s0 f8 = f8
+ nop.i 999
+}
+{ .mfb
+(p11) st8 [r33] = r19 // Store *y as 64-bit integer
+ nop.f 999
+ br.ret.sptk b0 ;;
+}
+
+GLOBAL_LIBM_END(__libm_frexp)
diff --git a/sysdeps/ia64/fpu/libm_frexpf.S b/sysdeps/ia64/fpu/libm_frexpf.S
new file mode 100644
index 0000000000..dde2d09b4b
--- /dev/null
+++ b/sysdeps/ia64/fpu/libm_frexpf.S
@@ -0,0 +1,209 @@
+.file "libm_frexpf.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 02/02/00 Initial version
+// 03/20/00 Improved speed
+// 06/01/00 Fixed bug when x a double-extended denormal
+// 12/08/00 Corrected label on .endp
+// 01/23/02 Added handling for int 32 or 64 bits
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// float __libm_frexpf(float x, int* y, int int_type)
+// input floating point f8, pointer to y (r33), int int_type (r34)
+// output floating point f8, returns the fraction of x, 0.5 <= fraction < 1.0
+// output int* y, returns the true exponent of x
+//
+// int_type = 0 if int is 32 bits
+// int_type = 1 if int is 64 bits
+//
+// int* y is returned as a 32 bit integer if int_type = 0
+// int* y is returned as a 64 bit integer if int_type = 1
+//
+// Overview of operation
+//==============================================================
+// Break a floating point number x into a fraction and an exponent.
+// The fraction is returned as a float.
+// The exponent is returned as an integer pointed to by y.
+// This is a true (not a biased) exponent, but 0xfffe is subtracted
+// as a bias instead of 0xffff. This is because the fraction returned
+// is between 0.5 and 1.0, not the expected IEEE range.
+//
+// The fraction is 0.5 <= fraction < 1.0
+//
+// Registers used
+//==============================================================
+//
+// general registers:
+// r14 exponent bias for x negative
+// r15 exponent bias for x positive
+// r16 signexp of x
+// r17 exponent mask
+// r18 exponent of x
+// r19 exponent result
+// r20 signexp of 2^64
+// r32 on input contains the 32-bit IEEE float that is in f8
+// r33 on input pointer to 32-bit or 64-bit integer for exponent
+// r34 on input contains 0 if output int is 32 bits, else output int is 64 bits
+//
+// predicate registers:
+// p6 set if x is NaN, zero, or infinity
+// p7 set if x negative
+// p8 set if x positive
+// p9 set if x double-extended denormal
+// p10 set if int_type = 0, 32-bit integer
+// p11 set if int_type = 1, 64-bit integer
+//
+// floating-point registers:
+// f8 input, output
+// f9 normalized x
+// f10 signexp for significand result for x positive
+// f11 signexp for significand result for x negative
+// f12 2^64
+
+.section .text
+GLOBAL_LIBM_ENTRY(__libm_frexpf)
+// Return the fraction of x (f8) in f8 as a float; store the exponent through r33.
+// Set signexp for significand result for x>0 (r15)
+// If x is a NaN, zero, or infinity, return it
+// and put 0 in the int pointer.
+// Is x NaN, zero, or infinity?  p6 if so, else p7.
+// Set signexp for significand result for x<0 (r14)
+{ .mfi
+ mov r15 = 0x0fffe
+ fclass.m p6,p7 = f8, 0xe7
+ mov r14 = 0x2fffe
+}
+// Form signexp of 2^64 in case x is a double-extended denormal
+// Save the normalized value of input in f9
+// The normalization also sets fault flags and takes faults if necessary
+{ .mfi
+ mov r20 = 0x1003f
+ fnorm.s0 f9 = f8
+ nop.i 999 ;;
+}
+
+// Move signexp for significand result for x>0 to FP reg
+// Form 2^64 in case x is a double-extended denormal
+{ .mmi
+ setf.exp f10 = r15
+ setf.exp f12 = r20
+ nop.i 999 ;;
+}
+
+// Move signexp for significand result for x<0 to FP reg
+// p7 if x<0, else p8 (compare only executes when p7 is already set)
+// If x is zero, NaN, or infinity, set p10 if output int is 32 bits, or p11 if 64 bits
+{ .mfi
+ setf.exp f11 = r14
+(p7) fcmp.lt.s0 p7,p8 = f8,f0
+(p6) cmp.eq.unc p10,p11 = r34, r0 ;;
+}
+
+// If x is NaN, zero, or infinity, set *y=0 and exit
+{ .mmb
+(p10) st4 [r33] = r0 // Store *y=0 as 32-bit integer
+(p11) st8 [r33] = r0 // Store *y=0 as 64-bit integer
+(p6) br.ret.spnt b0 ;;
+}
+
+// Form exponent mask
+// Test for fnorm(x) denormal, which means x is a double-extended denormal
+{ .mfi
+ mov r17 = 0x1ffff
+ fclass.m p9,p0 = f9, 0x0b
+ nop.i 999 ;;
+}
+
+// If x is a double-extended denormal, add 64 to the exponent bias for scaling
+// If x is a double-extended denormal, multiply x by 2^64, which is normal
+// Set p10 if output int is to be 32 bits, or set p11 if 64 bits
+{ .mfi
+(p9) add r15 = 64, r15
+(p9) fmpy.s0 f9 = f9, f12
+ cmp.eq p10,p11 = r34, r0 ;;
+}
+
+// The true exponent is stored to the int pointer.
+// The bias is treated as 0xfffe instead of the
+// normal 0xffff because we want the significand
+// to be in the range 0.5 <= sig < 1.0
+// Store the value of the exponent at the pointer in r33
+
+// If x>0, form significand result
+{ .mfi
+ nop.m 999
+(p8) fmerge.se f8 = f10,f9
+ nop.i 999 ;;
+}
+
+// Get signexp of normalized x
+// If x<0, form significand result
+{ .mfi
+ getf.exp r16 = f9
+(p7) fmerge.se f8 = f11,f9
+ nop.i 999 ;;
+}
+
+// Get exp of normalized x by applying the exponent mask in r17
+// Subtract off bias to get true exponent of x
+{ .mmi
+ and r18 = r17,r16 ;;
+ sub r19 = r18,r15
+ nop.i 999 ;;
+}
+
+// Store int *y as a 32-bit integer (when p10 is set)
+// Make the value a float
+{ .mfi
+(p10) st4 [r33] = r19 // Store *y as 32-bit integer
+ fnorm.s.s0 f8 = f8
+ nop.i 999
+}
+{ .mfb
+(p11) st8 [r33] = r19 // Store *y as 64-bit integer
+ nop.f 999
+ br.ret.sptk b0 ;;
+}
+
+GLOBAL_LIBM_END(__libm_frexpf)
diff --git a/sysdeps/ia64/fpu/libm_frexpl.S b/sysdeps/ia64/fpu/libm_frexpl.S
new file mode 100644
index 0000000000..64f30b6364
--- /dev/null
+++ b/sysdeps/ia64/fpu/libm_frexpl.S
@@ -0,0 +1,209 @@
+.file "libm_frexpl.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 02/02/00 Initial version
+// 03/20/00 Improved speed
+// 06/01/00 Fixed bug when x a double-extended denormal
+// 12/08/00 Corrected label on .endp
+// 01/23/02 Added handling for int 32 or 64 bits
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// long double __libm_frexpl(long double x, int* y, int int_type)
+// input floating point f8, pointer to y (r34), int int_type (r35)
+// output floating point f8, returns the fraction of x, 0.5 <= fraction < 1.0
+// output int* y, returns the true exponent of x
+//
+// int_type = 0 if int is 32 bits
+// int_type = 1 if int is 64 bits
+//
+// int* y is returned as a 32 bit integer if int_type = 0
+// int* y is returned as a 64 bit integer if int_type = 1
+//
+// Overview of operation
+//==============================================================
+// break a floating point x number into fraction and an exponent
+// The fraction is returned as a long double
+// The exponent is returned as an integer pointed to by y
+// This is a true (not a biased) exponent, but 0xfffe is subtracted
+// as a bias instead of 0xffff. This is because the fraction returned
+// is between 0.5 and 1.0, not the usual IEEE range of 1.0 to 2.0.
+//
+// The fraction is 0.5 <= fraction < 1.0
+//
+// Registers used
+//==============================================================
+//
+// general registers:
+// r14 signexp (0x2fffe) for significand result for x negative
+// r15 signexp (0x0fffe) for x positive; also the bias subtracted from exponent
+// r16 signexp of x
+// r17 exponent mask
+// r18 exponent of x
+// r19 exponent result
+// r20 signexp of 2^64
+// r32-33 on input contains the 80-bit IEEE long double that is in f8
+// r34 on input pointer to 32-bit or 64-bit integer for exponent
+// r35 on input contains 0 if output int is 32 bits, else output int is 64 bits
+//
+// predicate registers:
+// p6 set if x is NaN, zero, or infinity
+// p7 set if x negative
+// p8 set if x positive
+// p9 set if x double-extended denormal
+// p10 set if int_type = 0, 32-bit integer
+// p11 set if int_type = 1, 64-bit integer
+//
+// floating-point registers:
+// f8 input, output
+// f9 normalized x
+// f10 signexp for significand result for x positive
+// f11 signexp for significand result for x negative
+// f12 2^64
+
+.section .text
+GLOBAL_LIBM_ENTRY(__libm_frexpl)
+
+// __libm_frexpl: split x (f8) into a fraction returned in f8 and an
+// exponent stored through the pointer in r34 (32 or 64 bits, per r35).
+// r15 = signexp for significand result for x>0 (also the exponent bias)
+// p6 = x is NaN, zero, or infinity (x is then returned and *y is set to 0)
+// r14 = signexp for significand result for x<0
+{ .mfi
+ mov r15 = 0x0fffe
+ fclass.m p6,p7 = f8, 0xe7
+ mov r14 = 0x2fffe
+}
+// Form signexp of 2^64 in case x double-extended denormal
+// Save the normalized value of input in f9
+// The normalization also sets fault flags and takes faults if necessary
+{ .mfi
+ mov r20 = 0x1003f
+ fnorm.s0 f9 = f8
+ nop.i 999 ;;
+}
+
+// Move signexp for significand result for x>0 to FP reg
+// Form 2^64 in case x double-extended denormal
+{ .mmi
+ setf.exp f10 = r15
+ setf.exp f12 = r20
+ nop.i 999 ;;
+}
+
+// Move signexp for significand result for x<0 to FP reg
+// If x is not zero/NaN/inf (p7 from fclass): set p7 if x<0, else p8
+// If x=0,nan,inf, set p10 if output int to be 32 bits, or set p11 if 64 bits
+{ .mfi
+ setf.exp f11 = r14
+(p7) fcmp.lt.s0 p7,p8 = f8,f0
+(p6) cmp.eq.unc p10,p11 = r35, r0 ;;
+}
+
+// If x NAN, ZERO, INFINITY, set *y=0 and exit (f8 still holds the input x)
+{ .mmb
+(p10) st4 [r34] = r0 // Store *y=0 as 32-bit integer
+(p11) st8 [r34] = r0 // Store *y=0 as 64-bit integer
+(p6) br.ret.spnt b0 ;;
+}
+
+// Form exponent mask
+// Test for fnorm(x) denormal, means x double-extended denormal
+{ .mfi
+ mov r17 = 0x1ffff
+ fclass.m p9,p0 = f9, 0x0b
+ nop.i 999 ;;
+}
+
+// If x double-extended denormal add 64 to exponent bias for scaling
+// If x double-extended denormal multiply x * 2^64 which is normal
+// Set p10 if output int to be 32 bits, or set p11 if 64 bits
+{ .mfi
+(p9) add r15 = 64, r15
+(p9) fmpy.s0 f9 = f9, f12
+ cmp.eq p10,p11 = r35, r0 ;;
+}
+
+// true exponent stored to int pointer
+// the bias is treated as 0xfffe instead of
+// normal 0xffff because we want the significand
+// to be in the range 0.5 <= sig < 1.0
+// Store the value of the exponent at the pointer in r34
+
+// If x>0 form significand result: signexp from f10, significand from f9
+{ .mfi
+ nop.m 999
+(p8) fmerge.se f8 = f10,f9
+ nop.i 999 ;;
+}
+
+// Get signexp of normalized x
+// If x<0 form significand result: signexp from f11, significand from f9
+{ .mfi
+ getf.exp r16 = f9
+(p7) fmerge.se f8 = f11,f9
+ nop.i 999 ;;
+}
+
+// Get exp of normalized x (mask the signexp in r16 with r17)
+// Subtract off bias (r15) to get true exponent of x
+{ .mmi
+ and r18 = r17,r16 ;;
+ sub r19 = r18,r15
+ nop.i 999 ;;
+}
+
+// Store int *y as a 32-bit (p10) or 64-bit (p11) integer
+// Normalize the result in f8 so it is returned as a long double
+{ .mfi
+(p10) st4 [r34] = r19 // Store *y as 32-bit integer
+ fnorm.s0 f8 = f8
+ nop.i 999
+}
+{ .mfb
+(p11) st8 [r34] = r19 // Store *y as 64-bit integer
+ nop.f 999
+ br.ret.sptk b0 ;;
+}
+
+GLOBAL_LIBM_END(__libm_frexpl)
diff --git a/sysdeps/ia64/fpu/libm_lgamma.S b/sysdeps/ia64/fpu/libm_lgamma.S
new file mode 100644
index 0000000000..5c13fc3feb
--- /dev/null
+++ b/sysdeps/ia64/fpu/libm_lgamma.S
@@ -0,0 +1,3594 @@
+.file "libm_lgamma.s"
+
+
+// Copyright (c) 2002 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2002 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
+// LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,
+// EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code,and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//*********************************************************************
+//
+// History:
+// 01/10/02 Initial version
+// 01/25/02 Corrected error tag numbers
+// 02/04/02 Added support of SIGN(GAMMA(x)) calculation
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 09/15/02 Fixed bug on the branch lgamma_negrecursion
+// 10/21/02 Now it returns SIGN(GAMMA(x))=-1 for negative zero
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+//*********************************************************************
+//
+//*********************************************************************
+//
+// Function: __libm_lgamma(double x, int* signgam, int szsigngam)
+// computes the principal value of the logarithm of the GAMMA function
+// of x. Signum of GAMMA(x) is stored to memory starting at the address
+// specified by the signgam.
+//
+//*********************************************************************
+//
+// Resources Used:
+//
+// Floating-Point Registers: f6-f15
+// f32-f122
+//
+// General Purpose Registers:
+// r8-r11
+// r14-r31
+// r32-r36
+// r37-r40 (Used to pass arguments to error handling routine)
+//
+// Predicate Registers: p6-p15
+//
+//*********************************************************************
+//
+// IEEE Special Conditions:
+//
+// __libm_lgamma(+inf) = +inf
+// __libm_lgamma(-inf) = QNaN
+// __libm_lgamma(+/-0) = +inf
+// __libm_lgamma(x<0, x - integer) = +inf
+// __libm_lgamma(SNaN) = QNaN
+// __libm_lgamma(QNaN) = QNaN
+//
+//*********************************************************************
+//
+// Overview
+//
+// The method consists of the following cases.
+//
+// If 512 <= x < OVERFLOW_BOUNDARY use case lgamma_pstirling;
+// else if 1 < x < 512 use case lgamma_regular;
+// else if -17 < x < 1 use case lgamma_negrecursion;
+// else if -512 < x < -17 use case lgamma_negpoly;
+// else if x < -512 use case lgamma_negstirling;
+// else if x is close to negative
+// roots of ln(GAMMA(x)) use case lgamma_negroots;
+//
+//
+// Case 512 <= x < OVERFLOW_BOUNDARY
+// ---------------------------------
+// Here we use algorithm based on the Stirling formula:
+// ln(GAMMA(x)) = ln(sqrt(2*Pi)) + (x-0.5)ln(x) - x + (W2 + W4/x^2)/x
+//
+// Case 1 < x < 512
+// ----------------
+// To calculate GAMMA(x) on this interval we use polynomial approximation
+// on following intervals [0.875; 1.25), [1.25; 1.75), [1.75, 2.25),
+// [2.25; 4), [2^i; 2^(i+1)), i=2..8
+//
+// Following variants of approximation and argument reduction are used:
+// 1. [0.875; 1.25)
+// ln(GAMMA(x)) ~ (x-1.0)*P17(x-1.0)
+//
+// 2. [1.25; 1.75)
+// ln(GAMMA(x)) ~ (x-LocalMinimum)*P17(x-LocalMinimum)
+//
+// 3. [1.75, 2.25)
+// ln(GAMMA(x)) ~ (x-2.0)*P17(x-2.0)
+//
+// 4. [2.25; 4)
+// ln(GAMMA(x)) ~ P22(x)
+//
+// 5. [2^i; 2^(i+1)), i=2..8
+// ln(GAMMA(x)) ~ P22((x-2^i)/2^i)
+//
+// Case -17 < x < 1
+// ----------------
+// Here we use the recursive formula:
+// ln(GAMMA(x)) = ln(GAMMA(x+1)) - ln(x)
+//
+// Using this formula we reduce argument to base interval [1.0; 2.0]
+//
+// Case -512 < x < -17
+// --------------------
+// Here we use the formula:
+// ln(GAMMA(-x)) = ln(Pi/(x*GAMMA(x)*sin(Pi*x))) =
+// = -ln(x) - ln(GAMMA(x)) - ln(sin(Pi*r)/(Pi*r)) - ln(|r|)
+// where r = x - rounded_to_nearest(x), i.e |r| <= 0.5 and
+// ln(sin(Pi*r)/(Pi*r)) is approximated by 14-degree polynomial of r^2
+//
+//
+// Case x < -512
+// -------------
+// Here we use algorithm based on the Stirling formula:
+// ln(GAMMA(-x)) = -ln(sqrt(2*Pi)) + (-x-0.5)ln(x) + x - (W2 + W4/x^2)/x -
+// - ln(sin(Pi*r)/(Pi*r)) - ln(|r|)
+// where r = x - rounded_to_nearest(x).
+//
+// Neighbourhoods of negative roots
+// --------------------------------
+// Here we use polynomial approximation
+// ln(GAMMA(x)) = ln(GAMMA(x0)) + (x-x0)*P14(x-x0),
+// where x0 is a root of ln(GAMMA(x)) rounded to nearest double
+// precision number.
+//
+
+//*********************************************************************
+
+FR_X = f10 // FR_* are symbolic names for FP registers; f10 is also FR_N
+FR_Y = f1 // __libm_lgamma is single argument function
+FR_RESULT = f8
+
+FR_B11 = f6
+FR_B10 = f7
+
+FR_int_N = f9
+FR_N = f10 // same register as FR_X above
+FR_P5 = f11
+FR_P4 = f12
+FR_P3 = f13
+FR_P2 = f14
+FR_NormX = f15
+
+FR_Ln2 = f32
+FR_C01 = f33 // from f33 on, several names are bound to the same register
+FR_A17 = f33 // NOTE(review): shared names presumably serve different code paths - confirm before reusing
+FR_C00 = f34
+FR_Xp2 = f34
+FR_A00 = f34
+FR_A16 = f34
+FR_C11 = f35
+FR_A15 = f35
+FR_C10 = f36
+FR_Xp3 = f36
+FR_A14 = f36
+FR_B1 = f36
+FR_C21 = f37
+FR_A13 = f37
+FR_PR01 = f37
+FR_C20 = f38
+FR_Xp6 = f38
+FR_A12 = f38
+FR_C31 = f39
+FR_Xp7 = f39
+FR_B0 = f39
+FR_A11 = f39
+FR_C30 = f40
+FR_Xp8 = f40
+FR_A10 = f40
+FR_PR00 = f40
+FR_C41 = f41
+FR_Xp9 = f41
+FR_A9 = f41
+FR_PR11 = f41
+FR_C40 = f42
+FR_A8 = f42
+FR_C51 = f43
+FR_Xp11 = f43
+FR_A7 = f43
+FR_C50 = f44
+FR_C = f44
+FR_Xp12 = f44
+FR_A6 = f44
+FR_Xm2 = f45
+FR_Xp13 = f45
+FR_A5 = f45
+FR_PR10 = f45
+FR_C61 = f46
+FR_Xp14 = f46
+FR_A4 = f46
+FR_PR21 = f46
+FR_C60 = f47
+FR_Xp15 = f47
+FR_A3 = f47
+FR_PR20 = f47
+FR_C71 = f48
+FR_Xp16 = f48
+FR_A2 = f48
+FR_PR31 = f48
+FR_C70 = f49
+FR_Xp17 = f49
+FR_A1 = f49
+FR_PR30 = f49
+FR_C81 = f50
+FR_B17 = f50
+FR_A0 = f50
+FR_C80 = f51
+FR_B16 = f51
+FR_C91 = f52
+FR_B15 = f52
+FR_C90 = f53
+FR_B14 = f53
+FR_CA1 = f54
+FR_B13 = f54
+FR_CA0 = f55
+FR_B12 = f55
+FR_CN = f56
+FR_Qlo = f56
+FR_PRN = f56
+FR_B7 = f57
+FR_B6 = f58
+FR_Qhi = f59
+FR_x = f60
+FR_x2 = f61
+FR_TpNxLn2 = f62
+FR_W2 = f63
+FR_x4 = f64
+FR_r4 = f64
+FR_x8 = f65
+FR_r8 = f65
+FR_r05 = f66
+FR_Xm05 = f66
+FR_B5 = f66
+FR_LnSqrt2Pi = f67
+FR_B4 = f67
+FR_InvX = f68
+FR_B3 = f68
+FR_InvX2 = f69
+FR_B2 = f69
+FR_W4 = f70
+FR_OvfBound = f71
+FR_05 = f72
+FR_LocalMin = f73
+FR_tmp = f73
+FR_LnX = f74
+FR_Xf = f75
+FR_InvXf = f76
+FR_rf = f77
+FR_rf2 = f78
+FR_P54f = f79
+FR_P32f = f80
+FR_rf3 = f81
+FR_P10f = f82
+FR_TpNxLn2f = f83
+FR_Nf = f84
+FR_LnXf = f85
+FR_int_Nf = f86
+FR_Tf = f87
+FR_Xf2 = f88
+FR_Xp10 = f89
+FR_w3 = f90
+FR_S28 = f90
+FR_w2 = f91
+FR_S26 = f91
+FR_w6 = f92
+FR_S24 = f92
+FR_w4 = f93
+FR_S22 = f93
+FR_w = f94
+FR_S20 = f94
+FR_Q8 = f95
+FR_S18 = f95
+FR_Q7 = f96
+FR_S16 = f96
+FR_Q4 = f97
+FR_S14 = f97
+FR_Q3 = f98
+FR_S12 = f98
+FR_Q6 = f99
+FR_S10 = f99
+FR_Q5 = f100
+FR_S8 = f100
+FR_Q2 = f101
+FR_S6 = f101
+FR_Root = f101
+FR_S4 = f102
+FR_Q1 = f102
+FR_S2 = f103
+FR_Xp1 = f104
+FR_Xf4 = f105
+FR_Xf8 = f106
+FR_Xfr = f107
+FR_Xf6 = f108
+FR_Ntrunc = f109
+FR_B9 = f110
+FR_2 = f110
+FR_B8 = f111
+FR_3 = f111
+FR_5 = f112
+FR_Xp4 = f113
+FR_Xp5 = f114
+FR_P54 = f115
+FR_P32 = f116
+FR_P10 = f117
+FR_r = f118
+FR_r2 = f119
+FR_r3 = f120
+FR_T = f121
+FR_int_Ntrunc = f122
+
+//===================================
+
+GR_TAG = r8 // GR_* are symbolic names for general registers; many share a register
+GR_ExpMask = r8
+GR_ExpBias = r9
+GR_ad_Roots = r9
+GR_Expf = r10
+GR_Arg = r10
+GR_SignExp = r11
+GR_ArgXfr = r11
+
+GR_Exp = r14
+GR_Arg125 = r14
+GR_RootInd = r14
+GR_ArgAsIs = r15
+GR_Arg175 = r15
+GR_Sig = r16
+GR_Ind = r17 // NOTE(review): GR_Ind is assigned again below (r24) - confirm which binding is intended
+GR_ad_Dx = r17
+GR_ad_1 = r18
+GR_SignExp_w = r19
+GR_2_25 = r19
+GR_Arg025 = r19
+GR_Arg15 = r19
+GR_Arg17 = r19
+GR_Exp_w = r19//21 - NOTE(review): meaning of the bare "21" is not evident here; possibly a former r21 - confirm
+GR_ad_2 = r20
+GR_2xDx = r21
+GR_SignOfGamma = r21
+GR_fff9 = r22
+GR_Offs = r22
+GR_ad_Co7 = r23
+GR_Arg075 = r23
+GR_Arg0875 = r23
+GR_ad_T = r24
+GR_ad_Root = r24
+GR_Ind = r24 // NOTE(review): second assignment of GR_Ind (first was r17 above)
+GR_ad_Co = r25
+GR_ad_Ce = r26
+GR_ad_Ce7 = r27
+GR_Arg05 = r27
+GR_Offs7 = r28
+GR_ArgXfrAsIs = r28
+GR_ExpOf2 = r29
+GR_ad_LnT = r29
+GR_Dx = r29
+GR_ExpOf256 = r30
+GR_0x30033 = r30
+GR_Root = r30
+GR_PseudoRoot = r30
+GR_ad_Data = r31
+GR_ad_SignGam = r31
+
+
+GR_SAVE_B0 = r33
+GR_SAVE_PFS = r34
+GR_SAVE_GP = r35
+GR_SAVE_SP = r36
+
+GR_Parameter_X = r37 // r37-r40: arguments to the error handling routine (per the file header)
+GR_Parameter_Y = r38
+GR_Parameter_RESULT = r39
+GR_Parameter_TAG = r40
+
+
+
+// Data tables
+//==============================================================
+
+RODATA
+.align 16
+LOCAL_OBJECT_START(lgamma_data)
+// polynomial approximation of ln(GAMMA(x)), 2.25 <= x < 512
+// [2.25; 4)
+data8 0xF888E8D7892718A2,0xC001 // C01
+data8 0xF62F273BA12A4639,0x3FFD // C11
+data8 0xA93AC50A37EC8D38,0xBFFC // C21
+data8 0xB4CC43D2C161E057,0xBFFF // C31
+data8 0xC6AC672F0C1392C7,0xC000 // C41
+data8 0xA292B9AE3276942E,0xC001 // C51
+data8 0xE554E4CCCA6C7B7B,0xC001 // C61
+data8 0x92F0F55FBC87F860,0xC002 // C71
+data8 0xAF60D0112843F6C1,0xC002 // C81
+data8 0xC5956500FA3D92E7,0xC002 // C91
+data8 0xD3B22CCBD8587750,0xC002 // CA1
+data8 0xD888B6CF34159B54,0x4001 // C00
+data8 0xBCB79C8329FD9F44,0x3FFE // C10
+data8 0xCB8896FAD69C455D,0x4000 // C20
+data8 0xE510A424639EBF5E,0x4001 // C30
+data8 0xC65ED41B097486B3,0x4002 // C40
+// [4; 8)
+data8 0x9F1F3C822D03080E,0xC001 // C01
+data8 0x941CACFA9C0FA8A6,0xC001 // C11
+data8 0xFE34336391D99CB7,0xC000 // C21
+data8 0xC40BAEAA165F81A1,0xC000 // C31
+data8 0xFE3AE166E9B4DE8F,0xBFFF // C41
+data8 0xD744F91AF7DAF873,0xBFFE // C51
+data8 0x87871851E9C32D02,0x3FFD // C61
+data8 0x9C93C03C502E808F,0x3FFF // C71
+data8 0xF78BED07501D6A8E,0x3FFF // C81
+data8 0x92FE41BA8BEADF70,0x4000 // C91
+data8 0xA021878E1903A2C6,0x3FFF // CA1
+data8 0xC85EFAC379FAFEE2,0x4001 // C00
+data8 0xC10D7AAB7CEC7FF2,0x4001 // C10
+data8 0xB3537BDF603E454C,0x4001 // C20
+data8 0xA0D44E3D5BBE44C4,0x4001 // C30
+data8 0x8B9C229B6241E7B3,0x4001 // C40
+// [8; 16)
+data8 0xD16AB33AEC220DF6,0x3FFF // C01
+data8 0x987483646E150BCD,0x4000 // C11
+data8 0x80C10A24C863999B,0x4000 // C21
+data8 0xA39A8EB6F8AACE75,0x3FFF // C31
+data8 0x93E04A1379BEC764,0x3FFD // C41
+data8 0xD9F59C4BD3A69BD1,0xBFFE // C51
+data8 0x82094EC891179B1A,0xC000 // C61
+data8 0xC90CFE3A24F70659,0xC000 // C71
+data8 0x827984EA7C155184,0xC001 // C81
+data8 0x981BFDF79D1E0D80,0xC001 // C91
+data8 0xA37209A8B97D230D,0xC001 // CA1
+data8 0xAA1989737D6BA66D,0x3FFE // C00
+data8 0xDBC013A351630AF8,0x3FFF // C10
+data8 0x8B8D47698299389D,0x4000 // C20
+data8 0xACCDD1315DE06EB0,0x4000 // C30
+data8 0xD3414A5AC81BBB2D,0x4000 // C40
+// [16; 32)
+data8 0xECB2B0BE75C5F995,0x3FFF // C01
+data8 0x9DD28BD6DBC96500,0x4000 // C11
+data8 0x8521431B99C6244F,0x4000 // C21
+data8 0xA95F92612B8413C3,0x3FFF // C31
+data8 0x9C76E643B22D9544,0x3FFD // C41
+data8 0xDD90EA99417C8038,0xBFFE // C51
+data8 0x84EA6B6D32E5F906,0xC000 // C61
+data8 0xCDBFE499E05AA622,0xC000 // C71
+data8 0x8594A7DE35427100,0xC001 // C81
+data8 0x9BC1CB2C10DC702F,0xC001 // C91
+data8 0xA7602268762666B0,0xC001 // CA1
+data8 0xDA082BCC6BDB8F7B,0x3FFE // C00
+data8 0xEEBFE1C99322B85E,0x3FFF // C10
+data8 0x96FED4C785361946,0x4000 // C20
+data8 0xB9E3A7207C16B2FE,0x4000 // C30
+data8 0xE1E8170CED48E2C7,0x4000 // C40
+// [32; 64)
+data8 0xFD481EB9AEDD53E7,0x3FFF // C01
+data8 0xA216FB66AC8C53E1,0x4000 // C11
+data8 0x885FF935787553BA,0x4000 // C21
+data8 0xAD471CD89A313327,0x3FFF // C31
+data8 0x9FF13FBA139D21E0,0x3FFD // C41
+data8 0xE25E1663A6EE0266,0xBFFE // C51
+data8 0x87BE51DD5D262FA2,0xC000 // C61
+data8 0xD211A9D4CCE55696,0xC000 // C71
+data8 0x885BEFC29FDED3C9,0xC001 // C81
+data8 0x9EFA48E6367A67F6,0xC001 // C91
+data8 0xAAD3978FC0791297,0xC001 // CA1
+data8 0xF96D210DF37A0AEA,0x3FFE // C00
+data8 0xFE11DC6783917C82,0x3FFF // C10
+data8 0x9FFCD928291B7DDE,0x4000 // C20
+data8 0xC4518F4A80E09AE1,0x4000 // C30
+data8 0xEDDFE9E0FD297C63,0x4000 // C40
+// [64; 128)
+data8 0x840E2E62609B0AD3,0x4000 // C01
+data8 0xA5275A0DD0D3DDF8,0x4000 // C11
+data8 0x8AADC6ABFC441731,0x4000 // C21
+data8 0xB041C6696BE90E50,0x3FFF // C31
+data8 0xA4A8C9153F4B037E,0x3FFD // C41
+data8 0xE3C6A461A7B86736,0xBFFE // C51
+data8 0x89047681C6DE7673,0xC000 // C61
+data8 0xD42DF77A480092DF,0xC000 // C71
+data8 0x89C25D17F086FB20,0xC001 // C81
+data8 0xA09F907D02E34EC7,0xC001 // C91
+data8 0xAC998A9CB79805B7,0xC001 // CA1
+data8 0x875CC9B69AE964CC,0x3FFF // C00
+data8 0x847836BA85DD4C12,0x4000 // C10
+data8 0xA5F3CB2B32E74936,0x4000 // C20
+data8 0xCAE2197C96CB5A0F,0x4000 // C30
+data8 0xF50F7EB60DE5CD09,0x4000 // C40
+// [128; 256)
+data8 0x87D9065DD1876926,0x4000 // C01
+data8 0xA781C28FDAD7CC25,0x4000 // C11
+data8 0x8C6A4FCE35A7EC8D,0x4000 // C21
+data8 0xB27BA081728354F9,0x3FFF // C31
+data8 0xA82FEA7124B0EB2B,0x3FFD // C41
+data8 0xE4C996E42ECBF77A,0xBFFE // C51
+data8 0x89F1A92C84FA538F,0xC000 // C61
+data8 0xD5B6CFF7DB7F6070,0xC000 // C71
+data8 0x8AC6B561FAE38B66,0xC001 // C81
+data8 0xA1D1505C438D8F46,0xC001 // C91
+data8 0xADE2DC1C924FEC81,0xC001 // CA1
+data8 0x8EF6CC62A7E0EB5A,0x3FFF // C00
+data8 0x88A2FFC0ABCB00C0,0x4000 // C10
+data8 0xAA6EA8FCB75B065B,0x4000 // C20
+data8 0xCFC4B82B3D5C9363,0x4000 // C30
+data8 0xFA60FD85DE861771,0x4000 // C40
+// [256; 512)
+data8 0x8AAA7CE4ED5C1EFD,0x4000 // C01
+data8 0xA9679234FB56F1E1,0x4000 // C11
+data8 0x8DCE02287789D841,0x4000 // C21
+data8 0xB44328EF30A8DE7E,0x3FFF // C31
+data8 0xAB0DC564BFA1AB12,0x3FFD // C41
+data8 0xE5882B16FCF2D3CB,0xBFFE // C51
+data8 0x8AA7F48993006A86,0xC000 // C61
+data8 0xD6E63752D192750D,0xC000 // C71
+data8 0x8B90080B17853295,0xC001 // C81
+data8 0xA2BDD4253128D1AB,0xC001 // C91
+data8 0xAEE1A042F96B8121,0xC001 // CA1
+data8 0x94A9C37A42E43BA7,0x3FFF // C00
+data8 0x8BFA54E703878F5A,0x4000 // C10
+data8 0xADFA426DDF14647B,0x4000 // C20
+data8 0xD39C7F7B3958EAF0,0x4000 // C30
+data8 0xFE8C3987853C01E3,0x4000 // C40
+//
+// [2.25; 4)
+data8 0x943AF77763601441,0x4003 // C50
+data8 0xC8A93F9ECB06E891,0x4003 // C60
+data8 0xFC2E5A4AD33DE19D,0x4003 // C70
+data8 0x9526B75B38670119,0x4004 // C80
+data8 0xA7675879D68B587E,0x4004 // C90
+data8 0xB31DFA672D7FB8C0,0x4004 // CA0
+data8 0x83A27775D86F9A81,0xBFD7 // CN
+// [4; 8)
+data8 0xEB8049BA5E79ADA3,0x4000 // C50
+data8 0xC20C95EA99037228,0x4000 // C60
+data8 0x9D4A8C864053CEB8,0x4000 // C70
+data8 0xFC7716544AB0C5C9,0x3FFF // C80
+data8 0xC7EB985259EABA5F,0x3FFF // C90
+data8 0xC042FB3B4C95096D,0x3FFD // CA0
+data8 0xCC2A7F930856177B,0x3FEE // CN
+// [8; 16)
+data8 0xFE1903679D078C7A,0x4000 // C50
+data8 0x957C221AB90171F1,0x4001 // C60
+data8 0xAB2C53B2A78F4031,0x4001 // C70
+data8 0xBE080AE6063AE387,0x4001 // C80
+data8 0xCC019A0311605CB9,0x4001 // C90
+data8 0xD3739D85A12C8ADF,0x4001 // CA0
+data8 0x81FA4D2B7BD7A82D,0x3FEF // CN
+// [16; 32)
+data8 0x871F69E2DD221F02,0x4001 // C50
+data8 0x9E3EF2D477442A9C,0x4001 // C60
+data8 0xB48733582B3C82C5,0x4001 // C70
+data8 0xC7DB9B3C25854A2A,0x4001 // C80
+data8 0xD628B87975BE898F,0x4001 // C90
+data8 0xDDC569C321FF119C,0x4001 // CA0
+data8 0xB27B65560DF7ADA7,0x3FEF // CN
+// [32; 64)
+data8 0x8DE4127349719B22,0x4001 // C50
+data8 0xA5C30A7760F5FBB2,0x4001 // C60
+data8 0xBCB4096055AA2A4E,0x4001 // C70
+data8 0xD08F5F2FB4E7B899,0x4001 // C80
+data8 0xDF39ED39DC91F9CF,0x4001 // C90
+data8 0xE7063E45322F072E,0x4001 // CA0
+data8 0x85A9E11DDDDE67C8,0x3FF0 // CN
+// [64; 128)
+data8 0x91CA191EB80E8893,0x4001 // C50
+data8 0xA9F1D5A55397334A,0x4001 // C60
+data8 0xC1222710295094E3,0x4001 // C70
+data8 0xD52FFABBA6CBE5C6,0x4001 // C80
+data8 0xE3FD9D5282052E1D,0x4001 // C90
+data8 0xEBDBE47BB662F3EF,0x4001 // CA0
+data8 0xEF889F489D88FD31,0x3FF0 // CN
+// [128; 256)
+data8 0x94AA029C2286F8D2,0x4001 // C50
+data8 0xAD0549E55A72389F,0x4001 // C60
+data8 0xC4628899DAF94BA4,0x4001 // C70
+data8 0xD89432A4161C72CB,0x4001 // C80
+data8 0xE77ABA75E9C38F3A,0x4001 // C90
+data8 0xEF65BFFFF71347FF,0x4001 // CA0
+data8 0xE2627460064D918D,0x3FF1 // CN
+// [256; 512)
+data8 0x96E9890D722C2FC1,0x4001 // C50
+data8 0xAF6C2236F6A1CEC4,0x4001 // C60
+data8 0xC6EBB8C9F987D20D,0x4001 // C70
+data8 0xDB38CEFD5EF328CC,0x4001 // C80
+data8 0xEA3265DC66C9A0B4,0x4001 // C90
+data8 0xF2272D6B368C70B1,0x4001 // CA0
+data8 0xDBFF93ECEBCEF1F3,0x3FF2 // CN
+//
+data8 0x3FDD8B618D5AF8FE // point of local minimum on [1;2]
+data8 0x3FE0000000000000 // 0.5
+data8 0xBFC5555DA7212371 // P5
+data8 0x3FC999A19EEF5826 // P4
+data8 0xb17217f7d1cf79ac,0x3ffe // ln(2)
+data8 0xEB3F8E4325F5A535,0x3FFE // ln(sqrt(4*arcsin(1)))
+//
+data8 0xBFCFFFFFFFFEF009 // P3
+data8 0x3FD555555554ECB2 // P2
+data8 0xBF66C16C16C16C17 // W4=B4/12=-1/360
+data8 0x7F5754D9278B51A8 // overflow boundary (first inf result)
+data8 0xAAAAAAAAAAAAAAAB,0x3FFB // W2=B2/2=1/12
+//
+data8 0x3FBC756AC654273B // Q8
+data8 0xBFC001A42489AB4D // Q7 ;
+data8 0x3FC99999999A169B // Q4
+data8 0xBFD00000000019AC // Q3
+data8 0x3FC2492479AA0DF8 // Q6
+data8 0xBFC5555544986F52 // Q5
+data8 0x3FD5555555555555 // Q2
+data8 0xBFE0000000000000 // Q1, P1 = -0.5
+//
+data8 0x80200aaeac44ef38,0x3ff6 // ln(1/frcpa(1+ 0/2^-8))
+data8 0xc09090a2c35aa070,0x3ff7 // ln(1/frcpa(1+ 1/2^-8))
+data8 0xa0c94fcb41977c75,0x3ff8 // ln(1/frcpa(1+ 2/2^-8))
+data8 0xe18b9c263af83301,0x3ff8 // ln(1/frcpa(1+ 3/2^-8))
+data8 0x8d35c8d6399c30ea,0x3ff9 // ln(1/frcpa(1+ 4/2^-8))
+data8 0xadd4d2ecd601cbb8,0x3ff9 // ln(1/frcpa(1+ 5/2^-8))
+data8 0xce95403a192f9f01,0x3ff9 // ln(1/frcpa(1+ 6/2^-8))
+data8 0xeb59392cbcc01096,0x3ff9 // ln(1/frcpa(1+ 7/2^-8))
+data8 0x862c7d0cefd54c5d,0x3ffa // ln(1/frcpa(1+ 8/2^-8))
+data8 0x94aa63c65e70d499,0x3ffa // ln(1/frcpa(1+ 9/2^-8))
+data8 0xa54a696d4b62b382,0x3ffa // ln(1/frcpa(1+ 10/2^-8))
+data8 0xb3e4a796a5dac208,0x3ffa // ln(1/frcpa(1+ 11/2^-8))
+data8 0xc28c45b1878340a9,0x3ffa // ln(1/frcpa(1+ 12/2^-8))
+data8 0xd35c55f39d7a6235,0x3ffa // ln(1/frcpa(1+ 13/2^-8))
+data8 0xe220f037b954f1f5,0x3ffa // ln(1/frcpa(1+ 14/2^-8))
+data8 0xf0f3389b036834f3,0x3ffa // ln(1/frcpa(1+ 15/2^-8))
+data8 0xffd3488d5c980465,0x3ffa // ln(1/frcpa(1+ 16/2^-8))
+data8 0x87609ce2ed300490,0x3ffb // ln(1/frcpa(1+ 17/2^-8))
+data8 0x8ede9321e8c85927,0x3ffb // ln(1/frcpa(1+ 18/2^-8))
+data8 0x96639427f2f8e2f4,0x3ffb // ln(1/frcpa(1+ 19/2^-8))
+data8 0x9defad3e8f73217b,0x3ffb // ln(1/frcpa(1+ 20/2^-8))
+data8 0xa582ebd50097029c,0x3ffb // ln(1/frcpa(1+ 21/2^-8))
+data8 0xac06dbe75ab80fee,0x3ffb // ln(1/frcpa(1+ 22/2^-8))
+data8 0xb3a78449b2d3ccca,0x3ffb // ln(1/frcpa(1+ 23/2^-8))
+data8 0xbb4f79635ab46bb2,0x3ffb // ln(1/frcpa(1+ 24/2^-8))
+data8 0xc2fec93a83523f3f,0x3ffb // ln(1/frcpa(1+ 25/2^-8))
+data8 0xc99af2eaca4c4571,0x3ffb // ln(1/frcpa(1+ 26/2^-8))
+data8 0xd1581106472fa653,0x3ffb // ln(1/frcpa(1+ 27/2^-8))
+data8 0xd8002560d4355f2e,0x3ffb // ln(1/frcpa(1+ 28/2^-8))
+data8 0xdfcb43b4fe508632,0x3ffb // ln(1/frcpa(1+ 29/2^-8))
+data8 0xe67f6dff709d4119,0x3ffb // ln(1/frcpa(1+ 30/2^-8))
+data8 0xed393b1c22351280,0x3ffb // ln(1/frcpa(1+ 31/2^-8))
+data8 0xf5192bff087bcc35,0x3ffb // ln(1/frcpa(1+ 32/2^-8))
+data8 0xfbdf4ff6dfef2fa3,0x3ffb // ln(1/frcpa(1+ 33/2^-8))
+data8 0x81559a97f92f9cc7,0x3ffc // ln(1/frcpa(1+ 34/2^-8))
+data8 0x84be72bce90266e8,0x3ffc // ln(1/frcpa(1+ 35/2^-8))
+data8 0x88bc74113f23def2,0x3ffc // ln(1/frcpa(1+ 36/2^-8))
+data8 0x8c2ba3edf6799d11,0x3ffc // ln(1/frcpa(1+ 37/2^-8))
+data8 0x8f9dc92f92ea08b1,0x3ffc // ln(1/frcpa(1+ 38/2^-8))
+data8 0x9312e8f36efab5a7,0x3ffc // ln(1/frcpa(1+ 39/2^-8))
+data8 0x968b08643409ceb6,0x3ffc // ln(1/frcpa(1+ 40/2^-8))
+data8 0x9a062cba08a1708c,0x3ffc // ln(1/frcpa(1+ 41/2^-8))
+data8 0x9d845b3abf95485c,0x3ffc // ln(1/frcpa(1+ 42/2^-8))
+data8 0xa06fd841bc001bb4,0x3ffc // ln(1/frcpa(1+ 43/2^-8))
+data8 0xa3f3a74652fbe0db,0x3ffc // ln(1/frcpa(1+ 44/2^-8))
+data8 0xa77a8fb2336f20f5,0x3ffc // ln(1/frcpa(1+ 45/2^-8))
+data8 0xab0497015d28b0a0,0x3ffc // ln(1/frcpa(1+ 46/2^-8))
+data8 0xae91c2be6ba6a615,0x3ffc // ln(1/frcpa(1+ 47/2^-8))
+data8 0xb189d1b99aebb20b,0x3ffc // ln(1/frcpa(1+ 48/2^-8))
+data8 0xb51cced5de9c1b2c,0x3ffc // ln(1/frcpa(1+ 49/2^-8))
+data8 0xb819bee9e720d42f,0x3ffc // ln(1/frcpa(1+ 50/2^-8))
+data8 0xbbb2a0947b093a5d,0x3ffc // ln(1/frcpa(1+ 51/2^-8))
+data8 0xbf4ec1505811684a,0x3ffc // ln(1/frcpa(1+ 52/2^-8))
+data8 0xc2535bacfa8975ff,0x3ffc // ln(1/frcpa(1+ 53/2^-8))
+data8 0xc55a3eafad187eb8,0x3ffc // ln(1/frcpa(1+ 54/2^-8))
+data8 0xc8ff2484b2c0da74,0x3ffc // ln(1/frcpa(1+ 55/2^-8))
+data8 0xcc0b1a008d53ab76,0x3ffc // ln(1/frcpa(1+ 56/2^-8))
+data8 0xcfb6203844b3209b,0x3ffc // ln(1/frcpa(1+ 57/2^-8))
+data8 0xd2c73949a47a19f5,0x3ffc // ln(1/frcpa(1+ 58/2^-8))
+data8 0xd5daae18b49d6695,0x3ffc // ln(1/frcpa(1+ 59/2^-8))
+data8 0xd8f08248cf7e8019,0x3ffc // ln(1/frcpa(1+ 60/2^-8))
+data8 0xdca7749f1b3e540e,0x3ffc // ln(1/frcpa(1+ 61/2^-8))
+data8 0xdfc28e033aaaf7c7,0x3ffc // ln(1/frcpa(1+ 62/2^-8))
+data8 0xe2e012a5f91d2f55,0x3ffc // ln(1/frcpa(1+ 63/2^-8))
+data8 0xe600064ed9e292a8,0x3ffc // ln(1/frcpa(1+ 64/2^-8))
+data8 0xe9226cce42b39f60,0x3ffc // ln(1/frcpa(1+ 65/2^-8))
+data8 0xec4749fd97a28360,0x3ffc // ln(1/frcpa(1+ 66/2^-8))
+data8 0xef6ea1bf57780495,0x3ffc // ln(1/frcpa(1+ 67/2^-8))
+data8 0xf29877ff38809091,0x3ffc // ln(1/frcpa(1+ 68/2^-8))
+data8 0xf5c4d0b245cb89be,0x3ffc // ln(1/frcpa(1+ 69/2^-8))
+data8 0xf8f3afd6fcdef3aa,0x3ffc // ln(1/frcpa(1+ 70/2^-8))
+data8 0xfc2519756be1abc7,0x3ffc // ln(1/frcpa(1+ 71/2^-8))
+data8 0xff59119f503e6832,0x3ffc // ln(1/frcpa(1+ 72/2^-8))
+data8 0x8147ce381ae0e146,0x3ffd // ln(1/frcpa(1+ 73/2^-8))
+data8 0x82e45f06cb1ad0f2,0x3ffd // ln(1/frcpa(1+ 74/2^-8))
+data8 0x842f5c7c573cbaa2,0x3ffd // ln(1/frcpa(1+ 75/2^-8))
+data8 0x85ce471968c8893a,0x3ffd // ln(1/frcpa(1+ 76/2^-8))
+data8 0x876e8305bc04066d,0x3ffd // ln(1/frcpa(1+ 77/2^-8))
+data8 0x891012678031fbb3,0x3ffd // ln(1/frcpa(1+ 78/2^-8))
+data8 0x8a5f1493d766a05f,0x3ffd // ln(1/frcpa(1+ 79/2^-8))
+data8 0x8c030c778c56fa00,0x3ffd // ln(1/frcpa(1+ 80/2^-8))
+data8 0x8da85df17e31d9ae,0x3ffd // ln(1/frcpa(1+ 81/2^-8))
+data8 0x8efa663e7921687e,0x3ffd // ln(1/frcpa(1+ 82/2^-8))
+data8 0x90a22b6875c6a1f8,0x3ffd // ln(1/frcpa(1+ 83/2^-8))
+data8 0x91f62cc8f5d24837,0x3ffd // ln(1/frcpa(1+ 84/2^-8))
+data8 0x93a06cfc3857d980,0x3ffd // ln(1/frcpa(1+ 85/2^-8))
+data8 0x94f66d5e6fd01ced,0x3ffd // ln(1/frcpa(1+ 86/2^-8))
+data8 0x96a330156e6772f2,0x3ffd // ln(1/frcpa(1+ 87/2^-8))
+data8 0x97fb3582754ea25b,0x3ffd // ln(1/frcpa(1+ 88/2^-8))
+data8 0x99aa8259aad1bbf2,0x3ffd // ln(1/frcpa(1+ 89/2^-8))
+data8 0x9b0492f6227ae4a8,0x3ffd // ln(1/frcpa(1+ 90/2^-8))
+data8 0x9c5f8e199bf3a7a5,0x3ffd // ln(1/frcpa(1+ 91/2^-8))
+data8 0x9e1293b9998c1daa,0x3ffd // ln(1/frcpa(1+ 92/2^-8))
+data8 0x9f6fa31e0b41f308,0x3ffd // ln(1/frcpa(1+ 93/2^-8))
+data8 0xa0cda11eaf46390e,0x3ffd // ln(1/frcpa(1+ 94/2^-8))
+data8 0xa22c8f029cfa45aa,0x3ffd // ln(1/frcpa(1+ 95/2^-8))
+data8 0xa3e48badb7856b34,0x3ffd // ln(1/frcpa(1+ 96/2^-8))
+data8 0xa5459a0aa95849f9,0x3ffd // ln(1/frcpa(1+ 97/2^-8))
+data8 0xa6a79c84480cfebd,0x3ffd // ln(1/frcpa(1+ 98/2^-8))
+data8 0xa80a946d0fcb3eb2,0x3ffd // ln(1/frcpa(1+ 99/2^-8))
+data8 0xa96e831a3ea7b314,0x3ffd // ln(1/frcpa(1+100/2^-8))
+data8 0xaad369e3dc544e3b,0x3ffd // ln(1/frcpa(1+101/2^-8))
+data8 0xac92e9588952c815,0x3ffd // ln(1/frcpa(1+102/2^-8))
+data8 0xadfa035aa1ed8fdc,0x3ffd // ln(1/frcpa(1+103/2^-8))
+data8 0xaf6219eae1ad6e34,0x3ffd // ln(1/frcpa(1+104/2^-8))
+data8 0xb0cb2e6d8160f753,0x3ffd // ln(1/frcpa(1+105/2^-8))
+data8 0xb2354249ad950f72,0x3ffd // ln(1/frcpa(1+106/2^-8))
+data8 0xb3a056e98ef4a3b4,0x3ffd // ln(1/frcpa(1+107/2^-8))
+data8 0xb50c6dba52c6292a,0x3ffd // ln(1/frcpa(1+108/2^-8))
+data8 0xb679882c33876165,0x3ffd // ln(1/frcpa(1+109/2^-8))
+data8 0xb78c07429785cedc,0x3ffd // ln(1/frcpa(1+110/2^-8))
+data8 0xb8faeb8dc4a77d24,0x3ffd // ln(1/frcpa(1+111/2^-8))
+data8 0xba6ad77eb36ae0d6,0x3ffd // ln(1/frcpa(1+112/2^-8))
+data8 0xbbdbcc915e9bee50,0x3ffd // ln(1/frcpa(1+113/2^-8))
+data8 0xbd4dcc44f8cf12ef,0x3ffd // ln(1/frcpa(1+114/2^-8))
+data8 0xbec0d81bf5b531fa,0x3ffd // ln(1/frcpa(1+115/2^-8))
+data8 0xc034f19c139186f4,0x3ffd // ln(1/frcpa(1+116/2^-8))
+data8 0xc14cb69f7c5e55ab,0x3ffd // ln(1/frcpa(1+117/2^-8))
+data8 0xc2c2abbb6e5fd56f,0x3ffd // ln(1/frcpa(1+118/2^-8))
+data8 0xc439b2c193e6771e,0x3ffd // ln(1/frcpa(1+119/2^-8))
+data8 0xc553acb9d5c67733,0x3ffd // ln(1/frcpa(1+120/2^-8))
+data8 0xc6cc96e441272441,0x3ffd // ln(1/frcpa(1+121/2^-8))
+data8 0xc8469753eca88c30,0x3ffd // ln(1/frcpa(1+122/2^-8))
+data8 0xc962cf3ce072b05c,0x3ffd // ln(1/frcpa(1+123/2^-8))
+data8 0xcadeba8771f694aa,0x3ffd // ln(1/frcpa(1+124/2^-8))
+data8 0xcc5bc08d1f72da94,0x3ffd // ln(1/frcpa(1+125/2^-8))
+data8 0xcd7a3f99ea035c29,0x3ffd // ln(1/frcpa(1+126/2^-8))
+data8 0xcef93860c8a53c35,0x3ffd // ln(1/frcpa(1+127/2^-8))
+data8 0xd0192f68a7ed23df,0x3ffd // ln(1/frcpa(1+128/2^-8))
+data8 0xd19a201127d3c645,0x3ffd // ln(1/frcpa(1+129/2^-8))
+data8 0xd2bb92f4061c172c,0x3ffd // ln(1/frcpa(1+130/2^-8))
+data8 0xd43e80b2ee8cc8fc,0x3ffd // ln(1/frcpa(1+131/2^-8))
+data8 0xd56173601fc4ade4,0x3ffd // ln(1/frcpa(1+132/2^-8))
+data8 0xd6e6637efb54086f,0x3ffd // ln(1/frcpa(1+133/2^-8))
+data8 0xd80ad9f58f3c8193,0x3ffd // ln(1/frcpa(1+134/2^-8))
+data8 0xd991d1d31aca41f8,0x3ffd // ln(1/frcpa(1+135/2^-8))
+data8 0xdab7d02231484a93,0x3ffd // ln(1/frcpa(1+136/2^-8))
+data8 0xdc40d532cde49a54,0x3ffd // ln(1/frcpa(1+137/2^-8))
+data8 0xdd685f79ed8b265e,0x3ffd // ln(1/frcpa(1+138/2^-8))
+data8 0xde9094bbc0e17b1d,0x3ffd // ln(1/frcpa(1+139/2^-8))
+data8 0xe01c91b78440c425,0x3ffd // ln(1/frcpa(1+140/2^-8))
+data8 0xe14658f26997e729,0x3ffd // ln(1/frcpa(1+141/2^-8))
+data8 0xe270cdc2391e0d23,0x3ffd // ln(1/frcpa(1+142/2^-8))
+data8 0xe3ffce3a2aa64922,0x3ffd // ln(1/frcpa(1+143/2^-8))
+data8 0xe52bdb274ed82887,0x3ffd // ln(1/frcpa(1+144/2^-8))
+data8 0xe6589852e75d7df6,0x3ffd // ln(1/frcpa(1+145/2^-8))
+data8 0xe786068c79937a7d,0x3ffd // ln(1/frcpa(1+146/2^-8))
+data8 0xe91903adad100911,0x3ffd // ln(1/frcpa(1+147/2^-8))
+data8 0xea481236f7d35bb0,0x3ffd // ln(1/frcpa(1+148/2^-8))
+data8 0xeb77d48c692e6b14,0x3ffd // ln(1/frcpa(1+149/2^-8))
+data8 0xeca84b83d7297b87,0x3ffd // ln(1/frcpa(1+150/2^-8))
+data8 0xedd977f4962aa158,0x3ffd // ln(1/frcpa(1+151/2^-8))
+data8 0xef7179a22f257754,0x3ffd // ln(1/frcpa(1+152/2^-8))
+data8 0xf0a450d139366ca7,0x3ffd // ln(1/frcpa(1+153/2^-8))
+data8 0xf1d7e0524ff9ffdb,0x3ffd // ln(1/frcpa(1+154/2^-8))
+data8 0xf30c29036a8b6cae,0x3ffd // ln(1/frcpa(1+155/2^-8))
+data8 0xf4412bc411ea8d92,0x3ffd // ln(1/frcpa(1+156/2^-8))
+data8 0xf576e97564c8619d,0x3ffd // ln(1/frcpa(1+157/2^-8))
+data8 0xf6ad62fa1b5f172f,0x3ffd // ln(1/frcpa(1+158/2^-8))
+data8 0xf7e499368b55c542,0x3ffd // ln(1/frcpa(1+159/2^-8))
+data8 0xf91c8d10abaffe22,0x3ffd // ln(1/frcpa(1+160/2^-8))
+data8 0xfa553f7018c966f3,0x3ffd // ln(1/frcpa(1+161/2^-8))
+data8 0xfb8eb13e185d802c,0x3ffd // ln(1/frcpa(1+162/2^-8))
+data8 0xfcc8e3659d9bcbed,0x3ffd // ln(1/frcpa(1+163/2^-8))
+data8 0xfe03d6d34d487fd2,0x3ffd // ln(1/frcpa(1+164/2^-8))
+data8 0xff3f8c7581e9f0ae,0x3ffd // ln(1/frcpa(1+165/2^-8))
+data8 0x803e029e280173ae,0x3ffe // ln(1/frcpa(1+166/2^-8))
+data8 0x80dca10cc52d0757,0x3ffe // ln(1/frcpa(1+167/2^-8))
+data8 0x817ba200632755a1,0x3ffe // ln(1/frcpa(1+168/2^-8))
+data8 0x821b05f3b01d6774,0x3ffe // ln(1/frcpa(1+169/2^-8))
+data8 0x82bacd623ff19d06,0x3ffe // ln(1/frcpa(1+170/2^-8))
+data8 0x835af8c88e7a8f47,0x3ffe // ln(1/frcpa(1+171/2^-8))
+data8 0x83c5f8299e2b4091,0x3ffe // ln(1/frcpa(1+172/2^-8))
+data8 0x8466cb43f3d87300,0x3ffe // ln(1/frcpa(1+173/2^-8))
+data8 0x850803a67c80ca4b,0x3ffe // ln(1/frcpa(1+174/2^-8))
+data8 0x85a9a1d11a23b461,0x3ffe // ln(1/frcpa(1+175/2^-8))
+data8 0x864ba644a18e6e05,0x3ffe // ln(1/frcpa(1+176/2^-8))
+data8 0x86ee1182dcc432f7,0x3ffe // ln(1/frcpa(1+177/2^-8))
+data8 0x875a925d7e48c316,0x3ffe // ln(1/frcpa(1+178/2^-8))
+data8 0x87fdaa109d23aef7,0x3ffe // ln(1/frcpa(1+179/2^-8))
+data8 0x88a129ed4becfaf2,0x3ffe // ln(1/frcpa(1+180/2^-8))
+data8 0x89451278ecd7f9cf,0x3ffe // ln(1/frcpa(1+181/2^-8))
+data8 0x89b29295f8432617,0x3ffe // ln(1/frcpa(1+182/2^-8))
+data8 0x8a572ac5a5496882,0x3ffe // ln(1/frcpa(1+183/2^-8))
+data8 0x8afc2d0ce3b2dadf,0x3ffe // ln(1/frcpa(1+184/2^-8))
+data8 0x8b6a69c608cfd3af,0x3ffe // ln(1/frcpa(1+185/2^-8))
+data8 0x8c101e106e899a83,0x3ffe // ln(1/frcpa(1+186/2^-8))
+data8 0x8cb63de258f9d626,0x3ffe // ln(1/frcpa(1+187/2^-8))
+data8 0x8d2539c5bd19e2b1,0x3ffe // ln(1/frcpa(1+188/2^-8))
+data8 0x8dcc0e064b29e6f1,0x3ffe // ln(1/frcpa(1+189/2^-8))
+data8 0x8e734f45d88357ae,0x3ffe // ln(1/frcpa(1+190/2^-8))
+data8 0x8ee30cef034a20db,0x3ffe // ln(1/frcpa(1+191/2^-8))
+data8 0x8f8b0515686d1d06,0x3ffe // ln(1/frcpa(1+192/2^-8))
+data8 0x90336bba039bf32f,0x3ffe // ln(1/frcpa(1+193/2^-8))
+data8 0x90a3edd23d1c9d58,0x3ffe // ln(1/frcpa(1+194/2^-8))
+data8 0x914d0de2f5d61b32,0x3ffe // ln(1/frcpa(1+195/2^-8))
+data8 0x91be0c20d28173b5,0x3ffe // ln(1/frcpa(1+196/2^-8))
+data8 0x9267e737c06cd34a,0x3ffe // ln(1/frcpa(1+197/2^-8))
+data8 0x92d962ae6abb1237,0x3ffe // ln(1/frcpa(1+198/2^-8))
+data8 0x9383fa6afbe2074c,0x3ffe // ln(1/frcpa(1+199/2^-8))
+data8 0x942f0421651c1c4e,0x3ffe // ln(1/frcpa(1+200/2^-8))
+data8 0x94a14a3845bb985e,0x3ffe // ln(1/frcpa(1+201/2^-8))
+data8 0x954d133857f861e7,0x3ffe // ln(1/frcpa(1+202/2^-8))
+data8 0x95bfd96468e604c4,0x3ffe // ln(1/frcpa(1+203/2^-8))
+data8 0x9632d31cafafa858,0x3ffe // ln(1/frcpa(1+204/2^-8))
+data8 0x96dfaabd86fa1647,0x3ffe // ln(1/frcpa(1+205/2^-8))
+data8 0x9753261fcbb2a594,0x3ffe // ln(1/frcpa(1+206/2^-8))
+data8 0x9800c11b426b996d,0x3ffe // ln(1/frcpa(1+207/2^-8))
+data8 0x9874bf4d45ae663c,0x3ffe // ln(1/frcpa(1+208/2^-8))
+data8 0x99231f5ee9a74f79,0x3ffe // ln(1/frcpa(1+209/2^-8))
+data8 0x9997a18a56bcad28,0x3ffe // ln(1/frcpa(1+210/2^-8))
+data8 0x9a46c873a3267e79,0x3ffe // ln(1/frcpa(1+211/2^-8))
+data8 0x9abbcfc621eb6cb6,0x3ffe // ln(1/frcpa(1+212/2^-8))
+data8 0x9b310cb0d354c990,0x3ffe // ln(1/frcpa(1+213/2^-8))
+data8 0x9be14cf9e1b3515c,0x3ffe // ln(1/frcpa(1+214/2^-8))
+data8 0x9c5710b8cbb73a43,0x3ffe // ln(1/frcpa(1+215/2^-8))
+data8 0x9ccd0abd301f399c,0x3ffe // ln(1/frcpa(1+216/2^-8))
+data8 0x9d7e67f3bdce8888,0x3ffe // ln(1/frcpa(1+217/2^-8))
+data8 0x9df4ea81a99daa01,0x3ffe // ln(1/frcpa(1+218/2^-8))
+data8 0x9e6ba405a54514ba,0x3ffe // ln(1/frcpa(1+219/2^-8))
+data8 0x9f1e21c8c7bb62b3,0x3ffe // ln(1/frcpa(1+220/2^-8))
+data8 0x9f956593f6b6355c,0x3ffe // ln(1/frcpa(1+221/2^-8))
+data8 0xa00ce1092e5498c3,0x3ffe // ln(1/frcpa(1+222/2^-8))
+data8 0xa0c08309c4b912c1,0x3ffe // ln(1/frcpa(1+223/2^-8))
+data8 0xa1388a8c6faa2afa,0x3ffe // ln(1/frcpa(1+224/2^-8))
+data8 0xa1b0ca7095b5f985,0x3ffe // ln(1/frcpa(1+225/2^-8))
+data8 0xa22942eb47534a00,0x3ffe // ln(1/frcpa(1+226/2^-8))
+data8 0xa2de62326449d0a3,0x3ffe // ln(1/frcpa(1+227/2^-8))
+data8 0xa357690f88bfe345,0x3ffe // ln(1/frcpa(1+228/2^-8))
+data8 0xa3d0a93f45169a4b,0x3ffe // ln(1/frcpa(1+229/2^-8))
+data8 0xa44a22f7ffe65f30,0x3ffe // ln(1/frcpa(1+230/2^-8))
+data8 0xa500c5e5b4c1aa36,0x3ffe // ln(1/frcpa(1+231/2^-8))
+data8 0xa57ad064eb2ebbc2,0x3ffe // ln(1/frcpa(1+232/2^-8))
+data8 0xa5f5152dedf4384e,0x3ffe // ln(1/frcpa(1+233/2^-8))
+data8 0xa66f9478856233ec,0x3ffe // ln(1/frcpa(1+234/2^-8))
+data8 0xa6ea4e7cca02c32e,0x3ffe // ln(1/frcpa(1+235/2^-8))
+data8 0xa765437325341ccf,0x3ffe // ln(1/frcpa(1+236/2^-8))
+data8 0xa81e21e6c75b4020,0x3ffe // ln(1/frcpa(1+237/2^-8))
+data8 0xa899ab333fe2b9ca,0x3ffe // ln(1/frcpa(1+238/2^-8))
+data8 0xa9157039c51ebe71,0x3ffe // ln(1/frcpa(1+239/2^-8))
+data8 0xa991713433c2b999,0x3ffe // ln(1/frcpa(1+240/2^-8))
+data8 0xaa0dae5cbcc048b3,0x3ffe // ln(1/frcpa(1+241/2^-8))
+data8 0xaa8a27ede5eb13ad,0x3ffe // ln(1/frcpa(1+242/2^-8))
+data8 0xab06de228a9e3499,0x3ffe // ln(1/frcpa(1+243/2^-8))
+data8 0xab83d135dc633301,0x3ffe // ln(1/frcpa(1+244/2^-8))
+data8 0xac3fb076adc7fe7a,0x3ffe // ln(1/frcpa(1+245/2^-8))
+data8 0xacbd3cbbe47988f1,0x3ffe // ln(1/frcpa(1+246/2^-8))
+data8 0xad3b06b1a5dc57c3,0x3ffe // ln(1/frcpa(1+247/2^-8))
+data8 0xadb90e94af887717,0x3ffe // ln(1/frcpa(1+248/2^-8))
+data8 0xae3754a218f7c816,0x3ffe // ln(1/frcpa(1+249/2^-8))
+data8 0xaeb5d9175437afa2,0x3ffe // ln(1/frcpa(1+250/2^-8))
+data8 0xaf349c322e9c7cee,0x3ffe // ln(1/frcpa(1+251/2^-8))
+data8 0xafb39e30d1768d1c,0x3ffe // ln(1/frcpa(1+252/2^-8))
+data8 0xb032df51c2c93116,0x3ffe // ln(1/frcpa(1+253/2^-8))
+data8 0xb0b25fd3e6035ad9,0x3ffe // ln(1/frcpa(1+254/2^-8))
+data8 0xb1321ff67cba178c,0x3ffe // ln(1/frcpa(1+255/2^-8))
+//
+data8 0xC7DC2985D3B44557,0x3FCA // A00
+//
+// polynomial approximation of ln(GAMMA(x)), 1 <= x < 2.25
+// [0.875,1.25)
+data8 0xBF9A04F7E40C8498,0x3FAB79D8D9380F03 // C17,C16
+data8 0xBFB3B63609CA0CBD,0x3FB5564EA1675539 // C13,C12
+data8 0xBFBC806766F48C41,0x3FC010B36CDA773A // C9,C8
+data8 0xD45CE0BD54BE3D67,0xBFFC // C5
+data8 0xCD26AADF559676D0,0xBFFD // C3
+data8 0x93C467E37DB0C7A7,0xBFFE // C1
+data8 0xBFB10C251723B123,0x3FB2669DAD69A12D // C15,C14
+data8 0xBFB748A3CFCE4717,0x3FB9A01DEE29966A // C11,C10
+data8 0xBFC2703A1D85497E,0x3FC5B40CB0FD353C // C7,C6
+data8 0x8A8991563ECBBA5D,0x3FFD // C4
+data8 0xD28D3312983E9844,0x3FFE // C2
+data8 0,0 // C0
+// [1.25,1.75)
+data8 0xBF12680486396DE6,0x3F23C51FC332CD9D // C17,C16
+data8 0xBF422633DA3A1496,0x3F4CC70680768857 // C13,C12
+data8 0xBF6E2F1A1F804B5D,0x3F78FCE02A032428 // C9,C8
+data8 0x864D46FA895985C1,0xBFFA // C5
+data8 0x97213C6E35E12043,0xBFFC // C3
+data8 0x8A8A42A401D979B7,0x3FC7 // C1
+data8 0xBF2E098A8A2332A8,0x3F370E61B73B205C // C15,C14
+data8 0xBF56F9849D3BC6CC,0x3F6283126F58D7F4 // C11,C10
+data8 0xBF851F9F9516A98F,0x3F9266E797A1433F // C7,C6
+data8 0x845A14A6A81B0638,0x3FFB // C4
+data8 0xF7B95E4771C55C99,0x3FFD // C2
+data8 0xF8CDCDE61C520E0F,0xBFFB // C0
+// [1.75,2.25)
+data8 0xBEA01D7AFA5D8F52,0x3EB1010986E60253 // C17,C16
+data8 0xBEE3CBEDB4C918AA,0x3EF580F6D9D0F72D // C13,C12
+data8 0xBF2D3FD4C7F68563,0x3F40B36AF884AE9A // C9,C8
+data8 0xF2027E10C7B051EC,0xBFF7 // C5
+data8 0x89F000D2ABB03401,0xBFFB // C3
+data8 0xD8773039049E70B6,0x3FFD // C1
+data8 0xBEC112CD07CFC31A,0x3ED2528A428D30E1 // C15,C14
+data8 0xBF078DE5618D8C9F,0x3F1A127AD811A53D // C11,C10
+data8 0xBF538AC5C2BF540D,0x3F67ADD6EADB5718 // C7,C6
+data8 0xA8991563EC243383,0x3FF9 // C4
+data8 0xA51A6625307D3230,0x3FFD // C2
+data8 0,0 // C0
+//
+// polynomial approximation of ln(sin(Pi*x)/(Pi*x)), 0 < x <= 0.5
+data8 0xBFDC1BF0931AE591,0x3FD36D6D6CE263D7 //S28,S26
+data8 0xBFBD516F4FD9FB18,0xBFBBE1703F315086 //S20,S18
+data8 0xAAB5A3CCEFCD3628,0xBFFC //S12
+data8 0x80859B5C318E19A5,0xBFFD //S8
+data8 0x8A8991563EC7EB33,0xBFFE //S4
+data8 0xBFD23AB9E6CC88AC,0xBF9957F5146FC7AF //S24,S22
+data8 0xBFC007B324E23040,0xBFC248DEC29CAC4A //S16,S14
+data8 0xCD00EFF2F8F86899,0xBFFC //S10
+data8 0xADA06587FACD668B,0xBFFD //S6
+data8 0xD28D3312983E98A0,0xBFFF //S2
+//
+data8 0x8090F777D7942F73,0x4001 // PR01
+data8 0xE5B521193CF61E63,0x4000 // PR11
+data8 0xC02C000000001939,0x0000000000000233 // (-15;-14)
+data8 0xC02A000000016124,0x0000000000002BFB // (-14;-13)
+data8 0xC02800000011EED9,0x0000000000025CBB // (-13;-12)
+data8 0xC026000000D7322A,0x00000000001E1095 // (-12;-11)
+data8 0xC0240000093F2777,0x00000000013DD3DC // (-11;-10)
+data8 0xC02200005C7768FB,0x000000000C9539B9 // (-10;-9)
+data8 0xC02000034028B3F9,0x000000007570C565 // (-9;-8)
+data8 0xC01C0033FDEDFE1F,0x00000007357E670E // (-8;-7)
+data8 0xC018016B25897C8D,0x000000346DC5D639 // (-7;-6)
+data8 0xC014086A57F0B6D9,0x0000010624DD2F1B // (-6;-5)
+data8 0xC010284E78599581,0x0000051EB851EB85 // (-5;-4)
+data8 0xC009260DBC9E59AF,0x000028F5C28F5C29 // (-4;-3)
+data8 0xC003A7FC9600F86C,0x0000666666666666 // (-3;-2)
+data8 0xCC15879606130890,0x4000 // PR21
+data8 0xB42FE3281465E1CC,0x4000 // PR31
+//
+data8 0x828185F0B95C9916,0x4001 // PR00
+//
+data8 0xD4D3C819E4E5654B,0x4000 // PR10
+data8 0xA82FBBA4FCC75298,0x4000 // PR20
+data8 0xC02DFFFFFFFFFE52,0x000000000000001C // (-15;-14)
+data8 0xC02BFFFFFFFFE6C7,0x00000000000001A6 // (-14;-13)
+data8 0xC029FFFFFFFE9EDC,0x0000000000002BFB // (-13;-12)
+data8 0xC027FFFFFFEE1127,0x000000000001EEC8 // (-12;-11)
+data8 0xC025FFFFFF28CDD4,0x00000000001E1095 // (-11;-10)
+data8 0xC023FFFFF6C0D7C0,0x000000000101B2B3 // (-10;-9)
+data8 0xC021FFFFA3884BD0,0x000000000D6BF94D // (-9;-8)
+data8 0xC01FFFF97F8159CF,0x00000000C9539B89 // (-8;-7)
+data8 0xC01BFFCBF76B86F0,0x00000007357E670E // (-7;-6)
+data8 0xC017FE92F591F40D,0x000000346DC5D639 // (-6;-5)
+data8 0xC013F7577A6EEAFD,0x00000147AE147AE1 // (-5;-4)
+data8 0xC00FA471547C2FE5,0x00000C49BA5E353F // (-4;-3)
+data8 0xC005FB410A1BD901,0x000053F7CED91687 // (-3;-2)
+data8 0x80151BB918A293AA,0x4000 // PR30
+data8 0xB3C9F8F47422A314,0x400B // PRN
+//
+// right negative roots
+//(-3;-2)
+data8 0x40BFCF8B90BE7F6B,0x40B237623345EFC3 // A15,A14
+data8 0x407A92EFB03B281E,0x40728700C7819759 // A11,A10
+data8 0x403809F04EF4D0F2,0x4038D32F682D9593 // A7,A6
+data8 0xB4A5302C53C2F2D8,0x3FFF // A3
+data8 0xC1FF4B357A9B0383,0x3FFF // A1
+data8 0x409C46632EB4B2D3,0x4091A72AFA2148F5 // A13,A12
+data8 0x4059297AC79A88DB,0x40548EAA7BE7FA6B // A9,A8
+data8 0x4017339FE04B227F,0x4021718D7CA09E02 // A5,A4
+data8 0x9B775D8017AAE668,0x4001 // A2
+data8 0x8191DB68FF4366A1,0x3FC9 // A0
+//(-4;-3)
+data8 0x425260910D35307B,0x422668F5BE7983BB // A15,A14
+data8 0x41A4454DBE4BEE43,0x41799CA93F6EA817 // A11,A10
+data8 0x40FBB97AA1400F31,0x40D293C3F7ADAB15 // A7,A6
+data8 0xE089B8926AE4517B,0x4005 // A3
+data8 0xF90532F97D630C69,0x4001 // A1
+data8 0x41F9F0CF98C5F2EA,0x41D026336C6BF394 // A13,A12
+data8 0x415057F61156D5B8,0x41251EA3055CB754 // A9,A8
+data8 0x40A99A6337D9FC2B,0x408267203D776151 // A5,A4
+data8 0xCEA694BB8A8827A9,0x4003 // A2
+data8 0xF4B02F1D73D30EED,0x3FCD // A0
+//(-5;-4)
+data8 0x4412365489340979,0x43C86441BAFDEE39 // A15,A14
+data8 0x42ED68FCB19352DD,0x42A45FCE3905CD6F // A11,A10
+data8 0x41CD14FE49FD4FCA,0x41855E3DBFA89744 // A7,A6
+data8 0xAACD88D954E0EC16,0x400B // A3
+data8 0xD652E7A490B0DCDF,0x4003 // A1
+data8 0x437F52608E0E752A,0x433560E0633E33D5 // A13,A12
+data8 0x425C83998976DE3D,0x421433DCCD3B473B // A9,A8
+data8 0x4140261EB5732106,0x40F96D18E21AE6CC // A5,A4
+data8 0xA220AE6C09FA8A0E,0x4007 // A2
+data8 0xCC1682D17A2B5A58,0xBFCF // A0
+//(-6;-5)
+data8 0x4630E41D6386CF5A,0x45C2E7992C628C8C // A15,A14
+data8 0x447AABEC714F913A,0x440EDCAB45339F3A // A11,A10
+data8 0x42C9A8D00C97E3CE,0x425F7D8D5BEAB44D // A7,A6
+data8 0x929EC2B1FB95BB5B,0x4012 // A3
+data8 0xF6B970414D717D38,0x4005 // A1
+data8 0x45545E578976F6A2,0x44E738288DD52686 // A13,A12
+data8 0x43A20921FEC49492,0x433557FD7C6A41B3 // A9,A8
+data8 0x41F3E01773761DB4,0x418A225DF2DA6C47 // A5,A4
+data8 0xE7661976117F9312,0x400B // A2
+data8 0xC33C13FEE07494DE,0x3FCF // A0
+//(-7;-6)
+data8 0x4898F1E6133305AD,0x4802C5306FE4A850 // A15,A14
+data8 0x463FD37946B44094,0x45A8D489B784C2DD // A11,A10
+data8 0x43E9500995815F06,0x4354F21E2FEE6DF5 // A7,A6
+data8 0xEF281D1E1BBE10BD,0x4019 // A3
+data8 0xB4EF24F1D78C2029,0x4008 // A1
+data8 0x476AB1D5930011E5,0x46D4867E77BFB622 // A13,A12
+data8 0x45139151ECDEF7C5,0x447F3A2BC6BF466F // A9,A8
+data8 0x42C1D3D50713FA40,0x422F9C7B52556A1B // A5,A4
+data8 0xFE711A4267CEA83A,0x4010 // A2
+data8 0xD11E91B3FF8F4B94,0xBFD2 // A0
+//(-8;-7)
+data8 0x4B39E57569811B6E,0x4A7656073EB1FA21 // A15,A14
+data8 0x482C9B24A516B0BB,0x47698FF55139C62B // A11,A10
+data8 0x452393E2BC8E8D04,0x44628E1C710DA478 // A7,A6
+data8 0x9F2A95AF1B7A773F,0x4022 // A3
+data8 0x9DA03D51C303C918,0x400B // A1
+data8 0x49B24C241A3D5BCB,0x48F01CB936ECDA67 // A13,A12
+data8 0x46A712B3425C6797,0x45E5164114BD6DA1 // A9,A8
+data8 0x43A216A356069D01,0x42E25E42A45E2108 // A5,A4
+data8 0xC1F42ED57BBC2529,0x4016 // A2
+data8 0xB1C7B615A7DCA8A9,0xBFD7 // A0
+//(-9;-8)
+data8 0x4E09D478E5EE857D,0x4D1647782106E9AB // A15,A14
+data8 0x4A3C7F4D51927548,0x49497954796D743A // A11,A10
+data8 0x467387BD6AF0CBDF,0x4582843E134111D2 // A7,A6
+data8 0x9F003C6DE9666513,0x402B // A3
+data8 0x9D8447F6BF99950A,0x400E // A1
+data8 0x4C22364D238C61A9,0x4B300B18050AB940 // A13,A12
+data8 0x4857004D64215772,0x4765074E448C3C9A // A9,A8
+data8 0x44920E9EA07BF624,0x43A257BEC94BBF48 // A5,A4
+data8 0xC1D1C49AC5B2A4B4,0x401C // A2
+data8 0x9A749AF9F2D2E688,0x3FDB // A0
+//(-10;-9)
+data8 0x5102C7C43EA26C83,0x4FDCD174DEB0426B // A15,A14
+data8 0x4C6A036195CD5BAD,0x4B44ABB52B65628A // A11,A10
+data8 0x47D6439374B98FED,0x46B2C3903EF44D7D // A7,A6
+data8 0xE25BAF73AB8A7DB3,0x4034 // A3
+data8 0xB130901CA6D81B61,0x4011 // A1
+data8 0x4EB50BB0726AE206,0x4D907A96E6D2B6E2 // A13,A12
+data8 0x4A20975D78EAF01A,0x48FAF79C9C3E7908 // A9,A8
+data8 0x459044144129A247,0x446D6043FA3150A3 // A5,A4
+data8 0xF547997E083D9BA7,0x4022 // A2
+data8 0x977AF525A6ECA1BC,0x3FDC // A0
+//(-11;-10)
+data8 0x5420A5D5E90C6D73,0x52C4710A503DC67A // A15,A14
+data8 0x4EB2ED07BA88D2A8,0x4D581001ED9A5ECE // A11,A10
+data8 0x494A8A28E9E3DFEF,0x47F1E4E1E476793E // A7,A6
+data8 0xDD0C97E12D4A3378,0x403E // A3
+data8 0xDD7C12D5182FD543,0x4014 // A1
+data8 0x5167ED536877A072,0x500DF9AF21DDC0B6 // A13,A12
+data8 0x4BFEE6F04BC34FF8,0x4AA4175CEF736A5E // A9,A8
+data8 0x4698D1B4388FEC78,0x4541EDE7607A600D // A5,A4
+data8 0xBF9F645F282AC552,0x4029 // A2
+data8 0xAE1BBE4D3CDACCF4,0x3FE1 // A0
+//(-12;-11)
+data8 0x575F0EEF5FB7D4C0,0x55CBB7302B211A7C // A15,A14
+data8 0x5113A4F1825C7CB2,0x4F822A0D46E0605A // A11,A10
+data8 0x4ACED38FC8BE069A,0x493E3B56D2649F18 // A7,A6
+data8 0x8FA8FF5DF8B72D5E,0x4049 // A3
+data8 0x9845417E8598D642,0x4018 // A1
+data8 0x5437780541C3F2D3,0x52A56279B563C1B2 // A13,A12
+data8 0x4DF0F71A48C50188,0x4C600B358988DEBF // A9,A8
+data8 0x47AE7EE95BDA3DE9,0x46200599DC16B18F // A5,A4
+data8 0xB5249F914932E55D,0x4030 // A2
+data8 0xEAE760CD2C086094,0x3FE5 // A0
+//(-13;-12)
+data8 0x5ABA5848651F6D18,0x58EF60D8A817650B // A15,A14
+data8 0x538A8CA86E13EFB1,0x51C05DBD4D01076D // A11,A10
+data8 0x4C607594C339D259,0x4A9585BD5BF932BB // A7,A6
+data8 0xF26D282C36EC3611,0x4053 // A3
+data8 0xE467DF4810EE7EEE,0x401B // A1
+data8 0x5721D9BA485E8CC3,0x5555AF2CCFB2104D // A13,A12
+data8 0x4FF4619A17B14EA6,0x4E29B2F29EB9F8C4 // A9,A8
+data8 0x48CCF27629D46E79,0x47044715F991A63D // A5,A4
+data8 0xCBC92FB9BDAA95A9,0x4037 // A2
+data8 0xFB743A426163665B,0xBFE6 // A0
+//(-14;-13)
+data8 0x5E3295B24B353EAA,0x5C2B447E29796F20 // A15,A14
+data8 0x5615A35CB5EAFAE5,0x54106AB089C95CAF // A11,A10
+data8 0x4DFEC7D93501900A,0x4BF8C4C685F01B83 // A7,A6
+data8 0x820899603D9A74D5,0x405F // A3
+data8 0xB9949919933821CB,0x401F // A1
+data8 0x5A23373DB9A995AC,0x581CBA0AF7F53009 // A13,A12
+data8 0x520929836BB304CD,0x500386409A7076DA // A9,A8
+data8 0x49F480173FEAF90B,0x47F1ACB14B810793 // A5,A4
+data8 0x86881B8674DBF205,0x403F // A2
+data8 0x8CF3CC35AA2C5F90,0x3FED // A0
+//(-15;-14)
+data8 0x61C37D53BE0029D6,0x5F80667CD9D68354 // A15,A14
+data8 0x58B3F01898E6605B,0x567149652116DB6A // A11,A10
+data8 0x4FA82FA4F5D35B00,0x4D663DB00832DF8F // A7,A6
+data8 0xAE426731C9B94996,0x406A // A3
+data8 0xA264C84BE3708F3F,0x4023 // A1
+data8 0x5D3B254BC1C806A8,0x5AF72E736048B553 // A13,A12
+data8 0x542E476505104BB0,0x51EAD96CDC4FB48F // A9,A8
+data8 0x4B25095F498DB134,0x48E4B9FDEBFE24AB // A5,A4
+data8 0xCE076A5A116C1D34,0x4046 // A2
+data8 0x940013871A15050B,0x3FF1 // A0
+//
+// left negative roots
+//(-3;-2)
+data8 0x41AEB7998DBE2B2C,0xC19053D8FAC05DF7 // A16,A15
+data8 0x4133197BF1ADEAF9,0xC1150728B9B82072 // A12,A11
+data8 0x40BDBA65E74F4526,0xC0A12239BEEF8F72 // A8,A7
+data8 0xFA8256664F99E2AA,0x4004 // A4
+data8 0x9933F9E132D2A5DB,0x4002 // A2
+data8 0x416FFB167B85F77C,0xC15166AE0ACCF87C // A14,A13
+data8 0x40F75815106322C0,0xC0DA2D23C59C348D // A10,A9
+data8 0x4084373F7CC42043,0xC0685884581F8C61 // A6,A5
+data8 0xA0C2D6186460FF9D,0xC003 // A3
+data8 0xF5096D48258CA0AD,0xBFFF // A1
+//(-4;-3)
+data8 0xC3E5BD233016D4B9,0x43A084DAD2D94AB1 // A15,A14
+data8 0xC2CCFFF5E5AED722,0x4286D143AC7D29A6 // A11,A10
+data8 0xC1B7DBBE0680D07B,0x4173E8F3ABB79CED // A7,A6
+data8 0xE929ACEA59799BAF,0xC00A // A3
+data8 0xA5CCECB362B21E1C,0xC003 // A1
+data8 0xC357EED873871B81,0x43128E0B873204FC // A13,A12
+data8 0xC242225FA76E8450,0x41FD2F76AE7386CE // A9,A8
+data8 0xC13116F7806D0C7A,0x40EE8F829F141025 // A5,A4
+data8 0xFBB6F57021B5B397,0x4006 // A2
+data8 0xEEE019B4C05AC269,0xBFCB // A0
+//(-5;-4)
+data8 0xC626A52FE8AAA100,0x45B9FD1F4DDFE31E // A15,A14
+data8 0xC473812A5675F08B,0x440738530AECC254 // A11,A10
+data8 0xC2C5068B3F94AC27,0x425A8C5C539A500B // A7,A6
+data8 0x869FBFF732F20C3A,0xC012 // A3
+data8 0xE91251F7CF25A655,0xC005 // A1
+data8 0xC54C18CB48E5DA0F,0x44E07BD36FF561DF // A13,A12
+data8 0xC39BEC120D2FEBEA,0x4330FFA5388435BE // A9,A8
+data8 0xC1F13D5D163B7FB5,0x418752A6F5AC0F39 // A5,A4
+data8 0xDA99E33C51D360F0,0x400B // A2
+data8 0x9F47A66A2F53D9B9,0x3FD1 // A0
+//(-6;-5)
+data8 0xC8970DAC16B6D59E,0x480170728306FD76 // A15,A14
+data8 0xC63E0E5030604CF3,0x45A7924D74D57C65 // A11,A10
+data8 0xC3E8684E41730FC6,0x43544D54EA2E5B9A // A7,A6
+data8 0xEB7404450C47C5F4,0xC019 // A3
+data8 0xB30FB521D2C19F8B,0xC008 // A1
+data8 0xC768F34D35DF6320,0x46D348B3BB2E68B8 // A13,A12
+data8 0xC512AC2FE5EA638E,0x447DF44BC7FC5E17 // A9,A8
+data8 0xC2C15EA6B0AAFEF9,0x422EF5D308DBC420 // A5,A4
+data8 0xFBCEE5BCA70FD3A3,0x4010 // A2
+data8 0x8589A7CFFE0A3E86,0xBFD5 // A0
+//(-7;-6)
+data8 0xCB3995A0CC961E5A,0x4A7615C6C7116ADD // A15,A14
+data8 0xC82C5AFE0BF9C427,0x47695BD2F367668B // A11,A10
+data8 0xC52377E70BA14CF5,0x4462775E859E4392 // A7,A6
+data8 0x9EC8ED6E4C3D4DBE,0xC022 // A3
+data8 0x9D5FBD2E75520E65,0xC00B // A1
+data8 0xC9B21BB881A4DDF8,0x48EFEAB06FBA0207 // A13,A12
+data8 0xC6A6E8550CBC188F,0x45E4F3D26238B099 // A9,A8
+data8 0xC3A20427DF1B110A,0x42E24F3D636F2E4E // A5,A4
+data8 0xC1A4D12A82280CFB,0x4016 // A2
+data8 0xEF46D8DCCA9E8197,0x3FD2 // A0
+//(-8;-7)
+data8 0xCE0946982B27DE5B,0x4D15DBC6664E2DD2 // A15,A14
+data8 0xCA3C769F6B3B2B93,0x49497251CD0C4363 // A11,A10
+data8 0xC67384066C47F489,0x458281393433AB28 // A7,A6
+data8 0x9EF3459926D0F14F,0xC02B // A3
+data8 0x9D7BB7F2600DFF0B,0xC00E // A1
+data8 0xCC22351326C939A7,0x4B3009431C4F1D3F // A13,A12
+data8 0xC856FAADDD48815D,0x476502BC3ECA040C // A9,A8
+data8 0xC4920C2A84173810,0x43A255C052525F99 // A5,A4
+data8 0xC1C73B6554011EFA,0x401C // A2
+data8 0x954612700ADF8317,0xBFD8 // A0
+//(-9;-8)
+data8 0xD102F5CC7B590D3A,0x4FDD0F1C30E4EB22 // A15,A14
+data8 0xCC6A02912B0DF650,0x4B44AB18E4FCC159 // A11,A10
+data8 0xC7D64314B4A2FAAB,0x46B2C334AE5E2D34 // A7,A6
+data8 0xE2598724F7E28E99,0xC034 // A3
+data8 0xB12F6FE2E195452C,0xC011 // A1
+data8 0xCEB507747AF9356A,0x4D907802C08BA48F // A13,A12
+data8 0xCA2096E3DC29516F,0x48FAF6ED046A1DB7 // A9,A8
+data8 0xC59043D21BA5EE56,0x446D5FE468B30450 // A5,A4
+data8 0xF5460A8196B59C83,0x4022 // A2
+data8 0xB108F35A8EDA92D5,0xBFDD // A0
+//(-10;-9)
+data8 0xD420430D91F8265B,0x52C406CAAAC9E0EE // A15,A14
+data8 0xCEB2ECDDDAA3DAD1,0x4D580FDA97F92E3A // A11,A10
+data8 0xC94A8A192341B5D4,0x47F1E4D8C690D07B // A7,A6
+data8 0xDD0C5F920C2F0D2B,0xC03E // A3
+data8 0xDD7BED3631657B48,0xC014 // A1
+data8 0xD167F410E64E90A4,0x500DFFED20F714A7 // A13,A12
+data8 0xCBFEE6D9043169E9,0x4AA4174F64B40AA7 // A9,A8
+data8 0xC698D1A9AF0AB9C2,0x4541EDE14987A887 // A5,A4
+data8 0xBF9F43D461B3DE6E,0x4029 // A2
+data8 0xF3891A50642FAF26,0x3FE1 // A0
+//(-11;-10)
+data8 0xD75F0EEAF769D42A,0x55CBB72C8869183A // A15,A14
+data8 0xD113A4EF80394F77,0x4F822A0B96B3ECA9 // A11,A10
+data8 0xCACED38DC75763CB,0x493E3B5522D2D028 // A7,A6
+data8 0x8FA8FB5C92533701,0xC049 // A3
+data8 0x98453EDB9339C24E,0xC018 // A1
+data8 0xD43778026CCD4B20,0x52A5627753273B9B // A13,A12
+data8 0xCDF0F718DD7E1214,0x4C600B34582911EB // A9,A8
+data8 0xC7AE7EE7F112362C,0x46200599439C264F // A5,A4
+data8 0xB5249C335342B5BC,0x4030 // A2
+data8 0x881550711D143475,0x3FE4 // A0
+//(-12;-11)
+data8 0xDAB9C724EEEE2BBB,0x58EEC971340EDDBA // A15,A14
+data8 0xD38A8C8AE63BD8BF,0x51C05DB21CEE00D3 // A11,A10
+data8 0xCC607594C311C12D,0x4A9585BD5BE6AB57 // A7,A6
+data8 0xF26D282C36EC0E66,0xC053 // A3
+data8 0xE467DF1FA674BFAE,0xC01B // A1
+data8 0xD721DE506999AA9C,0x5555B34F71B45132 // A13,A12
+data8 0xCFF4619A476BF76F,0x4E29B2F2BBE7A67E // A9,A8
+data8 0xC8CCF27629D48EDC,0x47044715F991AB46 // A5,A4
+data8 0xCBC92FB9BDAA928D,0x4037 // A2
+data8 0xCE27C4F01CF53284,0xBFE6 // A0
+//(-13;-12)
+data8 0xDE3295B24355C5A1,0x5C2B447E298B562D // A15,A14
+data8 0xD615A35CB5E92103,0x54106AB089C95E8C // A11,A10
+data8 0xCDFEC7D935019005,0x4BF8C4C685F01B83 // A7,A6
+data8 0x820899603D9A74D5,0xC05F // A3
+data8 0xB9949916F8DF4AC4,0xC01F // A1
+data8 0xDA23373DBA0B7548,0x581CBA0AF7F45C01 // A13,A12
+data8 0xD20929836BB30934,0x500386409A7076D6 // A9,A8
+data8 0xC9F480173FEAF90B,0x47F1ACB14B810793 // A5,A4
+data8 0x86881B8674DBF205,0x403F // A2
+data8 0x8CFAFA9A142C1FF0,0x3FED // A0
+//(-14;-13)
+data8 0xE1C33F356FA2C630,0x5F8038B8AA919DD7 // A15,A14
+data8 0xD8B3F0167E14982D,0x5671496400BAE0DB // A11,A10
+data8 0xCFA82FA4F5D25C3E,0x4D663DB008328C58 // A7,A6
+data8 0xAE426731C9B94980,0xC06A // A3
+data8 0xA264C84BB8A66F86,0xC023 // A1
+data8 0xDD3B26E34762ED1E,0x5AF72F76E3C1B793 // A13,A12
+data8 0xD42E476507E3D06E,0x51EAD96CDD881DFA // A9,A8
+data8 0xCB25095F498DB15F,0x48E4B9FDEBFE24B5 // A5,A4
+data8 0xCE076A5A116C1D32,0x4046 // A2
+data8 0x94001BF5A24966F5,0x3FF1 // A0
+//(-15;-14)
+data8 0xE56DB8B72D7156FF,0x62EAB0CDB22539BE // A15,A14
+data8 0xDB63D76B0D3457E7,0x58E254823D0AE4FF // A11,A10
+data8 0xD15F060BF548404A,0x4EDE65C20CD4E961 // A7,A6
+data8 0x900DA565ED76C19D,0xC076 // A3
+data8 0x9868C809852DA712,0xC027 // A1
+data8 0xE067CCDA0408AAF0,0x5DE5A79C5C5C54AF // A13,A12
+data8 0xD6611ADBF5958ED0,0x53E0294092BE9677 // A9,A8
+data8 0xCC5EA28D90EE8C5D,0x49E014930EF336EE // A5,A4
+data8 0xB57930DCE7A61AE8,0x404E // A2
+data8 0x976BEC1F30DF151C,0x3FF5 // A0
+LOCAL_OBJECT_END(lgamma_data)
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(__libm_lgamma)
+
+{ .mfi
+ getf.exp GR_SignExp = f8
+ frcpa.s1 FR_C,p9 = f1,f8
+ mov GR_ExpMask = 0x1ffff
+}
+{ .mfi
+ addl GR_ad_Data = @ltoff(lgamma_data),gp
+ fcvt.fx.s1 FR_int_N = f8
+ mov GR_2_25 = 0x4002 // 2.25
+};;
+{ .mfi
+ getf.d GR_ArgAsIs = f8
+ fclass.m p13,p0 = f8,0x1EF // is x NaTVal, NaN,
+ // +/-0, +/-INF or +/-denormal?
+ mov GR_ExpBias = 0xFFFF
+}
+{ .mfi
+ ld8 GR_ad_Data = [GR_ad_Data]
+ fcvt.fx.trunc.s1 FR_int_Ntrunc = f8
+ mov GR_ExpOf256 = 0x10007
+};;
+{ .mfi
+ mov GR_ExpOf2 = 0x10000
+ fcmp.lt.s1 p14,p15 = f8,f0 // p14 if x<0
+ dep.z GR_Ind = GR_SignExp,8,4
+}
+{ .mfi
+ and GR_Exp = GR_SignExp,GR_ExpMask
+ fma.s1 FR_2 = f1,f1,f1
+ cmp.lt p10,p0 = GR_SignExp,GR_ExpBias
+};;
+{ .mfi
+ add GR_ad_1 = 0xB80,GR_ad_Data
+ fnorm.s1 FR_NormX = f8
+ shr.u GR_Arg = GR_ArgAsIs,48
+}
+{ .mib
+ add GR_ad_Co = GR_Ind,GR_ad_Data
+ add GR_ad_Ce = 0x10,GR_ad_Data
+ // jump if the input argument is NaTVal, NaN, +/-0, +/-INF or +/-denormal
+(p13) br.cond.spnt lgamma_spec
+};;
+lgamma_common:
+{ .mfi
+ ldfpd FR_LocalMin,FR_05 = [GR_ad_1],16
+ fmerge.se FR_x = f1,f8
+ add GR_ad_2 = 0xBC0,GR_ad_Data
+}
+{ .mfb
+ add GR_ad_Ce = GR_Ind,GR_ad_Ce
+ fms.s1 FR_w = f8,f1,f1 // x-1
+ // jump if the input argument is positive and less than 1.0
+(p10) br.cond.spnt lgamma_0_1
+};;
+{ .mfi
+ ldfe FR_C01 = [GR_ad_Co],32
+ fnma.s1 FR_InvX = FR_C,f8,f1 // NR iteration #1
+(p15) cmp.lt.unc p8,p0 = GR_ExpOf256,GR_SignExp
+}
+{ .mib
+ ldfe FR_C11 = [GR_ad_Ce],32
+(p15) cmp.lt.unc p11,p0 = GR_Arg,GR_2_25
+ // jump if the input argument isn't less than 512.0
+(p8) br.cond.spnt lgamma_pstirling
+};;
+{ .mfi
+ ldfe FR_C21 = [GR_ad_Co],32
+(p14) fms.s1 FR_r = FR_C,f8,f1 // reduced arg for log(x)
+(p14) cmp.lt.unc p0,p9 = GR_Exp,GR_ExpOf256
+}
+{ .mib
+ ldfe FR_C31 = [GR_ad_Ce],32
+ add GR_ad_Co7 = 0x12C0,GR_ad_2
+ // jump if the input argument is from range [1.0; 2.25)
+(p11) br.cond.spnt lgamma_1_2
+};;
+{ .mfi
+ ldfe FR_C41 = [GR_ad_Co],32
+ fcvt.xf FR_N = FR_int_N
+ add GR_ad_Ce7 = 0x1310,GR_ad_2
+}
+{ .mfb
+ ldfe FR_C51 = [GR_ad_Ce],32
+(p14) fma.s1 FR_5 = FR_2,FR_2,f1
+ // jump if the input argument is less than or equal to -512.0
+(p9) br.cond.spnt lgamma_negstirling
+};;
+{ .mfi
+ ldfe FR_C61 = [GR_ad_Co],32
+(p14) fcvt.xf FR_Ntrunc = FR_int_Ntrunc
+ shr GR_Ind = GR_Ind,4
+}
+{ .mfi
+ ldfe FR_C71 = [GR_ad_Ce],32
+(p14) fma.s1 FR_Xp1 = f1,f1,FR_NormX // x+1
+ cmp.eq p6,p7 = GR_ExpOf2,GR_SignExp
+};;
+.pred.rel "mutex",p6,p7
+{ .mfi
+ ldfe FR_C81 = [GR_ad_Co],32
+(p6) fma.s1 FR_x = f0,f0,FR_NormX
+ shladd GR_Offs7 = GR_Ind,2,GR_Ind // (ind*16)*5
+}
+{ .mfi
+ ldfe FR_C91 = [GR_ad_Ce],32
+(p7) fms.s1 FR_x = FR_x,f1,f1
+ add GR_ad_Co7 = 0x800,GR_ad_Data
+};;
+{ .mfi
+ ldfe FR_CA1 = [GR_ad_Co],32
+(p14) fma.s1 FR_3 = f1,f1,FR_2
+ shladd GR_Offs7 = GR_Ind,1,GR_Offs7 // (ind*16)*7
+}
+{ .mfi
+ ldfe FR_C00 = [GR_ad_Ce],32
+(p14) fma.s1 FR_Xp4 = FR_2,FR_2,FR_NormX
+ add GR_ad_Ce7 = 0x810,GR_ad_Data
+};;
+{ .mfi
+ ldfe FR_C10 = [GR_ad_Co],32
+(p6) fms.s1 FR_Xm2 = FR_w,f1,f1
+ add GR_ad_Co7 = GR_ad_Co7,GR_Offs7
+}
+{ .mfi
+ ldfe FR_C20 = [GR_ad_Ce],32
+(p14) fma.s1 FR_r2 = FR_r,FR_r,f0 // log(x)
+ add GR_ad_Ce7 = GR_ad_Ce7,GR_Offs7
+};;
+{ .mfi
+ ldfe FR_C30 = [GR_ad_Co],32
+(p14) fms.s1 FR_Xf = FR_NormX,f1,FR_N // xf = x - [x]
+(p14) mov GR_Arg17 = 0xC031 // -17
+}
+{ .mfi
+ ldfe FR_C40 = [GR_ad_Ce],32
+(p14) fma.s1 FR_Xp5 = FR_5,f1,FR_NormX
+(p14) sub GR_Exp = GR_Exp,GR_ExpBias
+};;
+{ .mfi
+ ldfe FR_C50 = [GR_ad_Co7],32
+(p14) fms.s1 FR_Xfr = FR_Xp1,f1,FR_Ntrunc // xfr = (x+1) - [x]
+(p14) cmp.lt.unc p13,p0 = GR_Arg,GR_Arg17
+}
+{ .mfb
+ ldfe FR_C60 = [GR_ad_Ce7],32
+(p14) fma.s1 FR_Xp10 = FR_5,FR_2,FR_NormX
+ // jump if the input argument is negative and greater than -17.0
+(p13) br.cond.spnt lgamma_negrecursion
+};;
+{ .mfi
+ ldfe FR_C70 = [GR_ad_Co7],32
+ fma.s1 FR_C01 = FR_x,f1,FR_C01
+(p14) add GR_ad_Ce = 0x1310,GR_ad_2
+}
+{ .mfi
+ ldfe FR_C80 = [GR_ad_Ce7],32
+ fma.s1 FR_C11 = FR_x,f1,FR_C11
+(p14) add GR_ad_Co = 0x12C0,GR_ad_2
+};;
+{ .mfi
+ ldfe FR_C90 = [GR_ad_Co7],32
+ fma.s1 FR_C21 = FR_x,f1,FR_C21
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_CA0 = [GR_ad_Ce7],32
+ fma.s1 FR_C31 = FR_x,f1,FR_C31
+ nop.i 0
+};;
+{ .mfi
+ ldfe FR_CN = [GR_ad_Co7],32
+ fma.s1 FR_C41 = FR_x,f1,FR_C41
+ nop.i 0
+}
+{ .mfi
+(p14) ldfpd FR_P5,FR_P4 = [GR_ad_1],16
+ fma.s1 FR_C51 = FR_x,f1,FR_C51
+ nop.i 0
+};;
+{ .mfi
+(p14) ldfpd FR_P3,FR_P2 = [GR_ad_2],16
+ fma.s1 FR_C61 = FR_x,f1,FR_C61
+ nop.i 0
+}
+{ .mfi
+(p14) ldfe FR_Ln2 = [GR_ad_1]
+ fma.s1 FR_C71 = FR_x,f1,FR_C71
+ nop.i 0
+};;
+{ .mfi
+(p14) ldfpd FR_S28,FR_S26 = [GR_ad_Co],16
+ fma.s1 FR_C81 = FR_x,f1,FR_C81
+ add GR_ad_2 = 0x60,GR_ad_2
+}
+{ .mfi
+(p14) ldfpd FR_S24,FR_S22 = [GR_ad_Ce],16
+ fma.s1 FR_C91 = FR_x,f1,FR_C91
+ nop.i 0
+};;
+{ .mfi
+(p14) ldfpd FR_S20,FR_S18 = [GR_ad_Co],16
+ fma.s1 FR_CA1 = FR_x,f1,FR_CA1
+ nop.i 0
+}
+{ .mfi
+(p14) ldfpd FR_S16,FR_S14 = [GR_ad_Ce],16
+ fma.s1 FR_C01 = FR_C01,FR_x,FR_C00
+ nop.i 0
+};;
+{ .mfi
+(p14) getf.exp GR_SignExp = FR_Xf
+ fma.s1 FR_C11 = FR_C11,FR_x,FR_C10
+ nop.i 0
+}
+{ .mfi
+(p14) ldfe FR_S12 = [GR_ad_Co],16
+ fma.s1 FR_C21 = FR_C21,FR_x,FR_C20
+ nop.i 0
+};;
+{ .mfi
+(p14) getf.sig GR_Sig = FR_Xf
+(p14) frcpa.s1 FR_InvXf,p0 = f1,FR_Xf
+ nop.i 0
+}
+{ .mfi
+(p14) ldfe FR_S10 = [GR_ad_Ce],16
+ fma.s1 FR_C41 = FR_C41,FR_x,FR_C40
+ nop.i 0
+};;
+{ .mfi
+(p14) ldfe FR_S8 = [GR_ad_Co],16
+ fma.s1 FR_C51 = FR_C51,FR_x,FR_C50
+ nop.i 0
+}
+{ .mfi
+(p14) ldfe FR_S6 = [GR_ad_Ce],16
+ fma.s1 FR_C61 = FR_C61,FR_x,FR_C60
+(p14) and GR_Expf = GR_SignExp,GR_ExpMask
+};;
+{ .mfi
+(p14) sub GR_Expf = GR_Expf,GR_ExpBias
+ fma.s1 FR_C71 = FR_C71,FR_x,FR_C70
+(p14) shl GR_Ind = GR_Sig,1
+}
+{ .mfi
+(p14) ldfe FR_S4 = [GR_ad_Co],16
+ fma.s1 FR_C81 = FR_C81,FR_x,FR_C80
+(p14) cmp.eq.unc p8,p0 = 0,GR_Sig
+};;
+{ .mfi
+(p14) setf.sig FR_int_Nf = GR_Expf
+ fma.s1 FR_C91 = FR_C91,FR_x,FR_C90
+(p14) shr.u GR_Ind = GR_Ind,56
+}
+{ .mfb
+(p14) ldfe FR_S2 = [GR_ad_Ce],16
+ fma.s1 FR_CA1 = FR_CA1,FR_x,FR_CA0
+ // jump if the input argument is an integer from the range (-512.0;-17.0]
+(p8) br.cond.spnt lgamma_singularity
+};;
+{ .mfi
+(p14) getf.sig GR_Sig = FR_int_Ntrunc
+ fma.s1 FR_C01 = FR_C01,FR_C11,f0
+ nop.i 0
+}
+{ .mfi
+(p14) shladd GR_ad_T = GR_Ind,4,GR_ad_2
+ fma.s1 FR_C31 = FR_C31,FR_x,FR_C30
+ nop.i 0
+};;
+{ .mfi
+(p14) ldfe FR_Tf = [GR_ad_T]
+(p14) fms.s1 FR_rf = FR_InvXf,FR_Xf,f1 // reduced arg for log({x})
+(p14) extr.u GR_Ind = GR_ArgAsIs,44,8
+}
+{ .mfi
+ // set p9 if signgum is 32-bit int
+ // set p10 if signgum is 64-bit int
+ cmp.eq p10,p9 = 8,r34
+ fma.s1 FR_C21 = FR_C21,FR_C41,f0
+ mov GR_SignOfGamma = 1
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C51 = FR_C51,FR_C61,f0
+(p14) tbit.z.unc p8,p0 = GR_Sig,0
+}
+{ .mfi
+(p14) shladd GR_ad_T = GR_Ind,4,GR_ad_2
+(p6) fma.s1 FR_CN = FR_CN,FR_Xm2,f0
+ nop.i 0
+};;
+{ .mfi
+(p14) setf.sig FR_int_N = GR_Exp
+ fma.s1 FR_C71 = FR_C71,FR_C81,f0
+(p8) sub GR_SignOfGamma = r0,GR_SignOfGamma
+}
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_Xf2 = FR_Xf,FR_Xf,f0
+ nop.i 0
+};;
+{ .mfi
+(p14) ldfe FR_T = [GR_ad_T]
+ fma.s1 FR_C91 = FR_C91,FR_CA1,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_r2 = FR_r,FR_r,f0
+ nop.i 0
+};;
+.pred.rel "mutex",p9,p10
+{ .mfi
+ // store sign of gamma(x) as 32-bit int
+(p9) st4 [r33] = GR_SignOfGamma
+ fma.s1 FR_C01 = FR_C01,FR_C31,f0
+ nop.i 0
+}
+{ .mfi
+ // store sign of gamma(x) as 64-bit int
+(p10) st8 [r33] = GR_SignOfGamma
+(p14) fma.s1 FR_P54 = FR_P5,FR_r,FR_P4
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_P32 = FR_P3,FR_r,FR_P2
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p14) fma.s1 FR_P54f = FR_P5,FR_rf,FR_P4
+ // jump if the input argument is non-integer from range (-512.0;-17.0]
+(p14) br.cond.spnt lgamma_negpoly
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C21 = FR_C21,FR_C51,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C71 = FR_C71,FR_C91,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_CN = FR_C01,FR_CN,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C21 = FR_C21,FR_C71,f0
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+ fma.d.s0 f8 = FR_C21,FR_CN,f0
+ br.ret.sptk b0 // exit for arguments from range [2.25; 512.0)
+};;
+// Branch computing ln(GAMMA(x)) for non-integer x with -512 < x < -17
+//---------------------------------------------------------------------
+.align 32
+lgamma_negpoly:
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Xf4 = FR_Xf2,FR_Xf2,f0 // Xf4 = Xf2*Xf2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S28 = FR_S28,FR_Xf2,FR_S26 // pair up S coefficients in Xf2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S24 = FR_S24,FR_Xf2,FR_S22
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S20 = FR_S20,FR_Xf2,FR_S18
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S16 = FR_S16,FR_Xf2,FR_S14
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S12 = FR_S12,FR_Xf2,FR_S10
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S8 = FR_S8,FR_Xf2,FR_S6
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S4 = FR_S4,FR_Xf2,FR_S2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rf2 = FR_rf,FR_rf,f0 // rf2 = rf*rf
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P32f = FR_P3,FR_rf,FR_P2 // log(x)
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r3 = FR_r2,FR_r,f0 // log(x)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fcvt.xf FR_Nf = FR_int_Nf // log({x})
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S28 = FR_S28,FR_Xf4,FR_S24 // merge S pairs using Xf4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Xf8 = FR_Xf4,FR_Xf4,f0 // Xf8 = Xf4*Xf4
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S20 = FR_S20,FR_Xf4,FR_S16
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C21 = FR_C21,FR_C51,f0 // continue the product of FR_C* factors
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S12 = FR_S12,FR_Xf4,FR_S8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C71 = FR_C71,FR_C91,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_P10 = FR_r2,FR_05,FR_r // log(x)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P54 = FR_P54,FR_r2,FR_P32 // log(x)
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_P10f = FR_rf2,FR_05,FR_rf // log({x})
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fcvt.xf FR_N = FR_int_N // log(x)
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rf3 = FR_rf2,FR_rf,f0 // log({x})
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P54f = FR_P54f,FR_rf2,FR_P32f // log({x})
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S28 = FR_S28,FR_Xf8,FR_S20
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_TpNxLn2f = FR_Nf,FR_Ln2,FR_Tf // log({x})
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_CN = FR_C01,FR_CN,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C21 = FR_C21,FR_C71,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P54 = FR_P54,FR_r3,FR_P10 // log(x)
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_TpNxLn2 = FR_N,FR_Ln2,FR_T // log(x)
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P54f = FR_P54f,FR_rf3,FR_P10f // log({x})
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S28 = FR_S28,FR_Xf8,FR_S12
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_C21 = FR_C21,FR_CN,f0 // C21 = -(C21*CN)
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_LnX = FR_TpNxLn2,f1,FR_P54 // log(x)
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_LnXf = FR_TpNxLn2f,f1,FR_P54f // log({x})
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S28 = FR_S28,FR_Xf4,FR_S4
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_LnX = FR_LnX,f1,FR_LnXf // LnX = log(x) + log({x})
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_S28 = FR_S28,FR_Xf2,FR_C21 // S28 = C21 - S28*Xf2
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+ fms.d.s0 f8 = FR_S28,f1,FR_LnX // result = S28 - LnX, rounded to double
+ br.ret.sptk b0
+};;
+// Branch computing ln(GAMMA(x)) for x >= 512 (Stirling-type formula)
+//---------------------------------------------------------------------
+.align 32
+lgamma_pstirling:
+{ .mfi
+ ldfpd FR_P5,FR_P4 = [GR_ad_1],16
+ nop.f 0
+ and GR_Exp = GR_SignExp,GR_ExpMask
+}
+{ .mfi
+ ldfpd FR_P3,FR_P2 = [GR_ad_2],16
+ fma.s1 FR_InvX = FR_C,FR_InvX,FR_C // NR iteration #1
+ mov GR_ExpBias = 0xffff
+};;
+{ .mfi
+ ldfe FR_Ln2 = [GR_ad_1],16
+ nop.f 0
+ sub GR_Exp = GR_Exp,GR_ExpBias // remove the 0xffff exponent bias
+};;
+{ .mfi
+ ldfpd FR_W4,FR_OvfBound = [GR_ad_2],16
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ setf.sig FR_int_N = GR_Exp
+ fms.s1 FR_r = FR_C,f8,f1 // r = C*x - 1
+ nop.i 0
+};;
+{ .mmf
+ getf.sig GR_Sig = FR_NormX
+ ldfe FR_LnSqrt2Pi = [GR_ad_1],16
+ nop.f 0
+};;
+{ .mmf
+ ldfe FR_W2 = [GR_ad_2],16
+ nop.m 0
+ fnma.s1 FR_InvX2 = FR_InvX,FR_NormX,f1 // NR iteration #2
+};;
+{ .mfi
+ add GR_ad_2 = 0x40,GR_ad_2
+ nop.f 0
+ shl GR_Ind = GR_Sig,1 // shift out significand bit 63
+};;
+{ .mfi
+ mov GR_SignOfGamma = 1 // value stored through r33 below
+ nop.f 0
+ shr.u GR_Ind = GR_Ind,56 // keep significand bits 62..55 as index
+};;
+{ .mfi
+ shladd GR_ad_2 = GR_Ind,4,GR_ad_2 // address = base + 16*index
+ fma.s1 FR_r2 = FR_r,FR_r,f0
+ // set p9 if signgam is 32-bit int
+ // set p10 if signgam is 64-bit int
+ cmp.eq p10,p9 = 8,r34
+};;
+{ .mfi
+ ldfe FR_T = [GR_ad_2]
+ fma.s1 FR_P54 = FR_P5,FR_r,FR_P4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P32 = FR_P3,FR_r,FR_P2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fcmp.le.s1 p6,p0 = FR_OvfBound,FR_NormX // p6 <- (OvfBound <= x)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_InvX2 = FR_InvX,FR_InvX2,FR_InvX // NR iteration #2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fcvt.xf FR_N = FR_int_N
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ nop.f 0
+ // jump if x is greater than or equal to OVERFLOW_BOUNDARY
+(p6) br.cond.spnt lgamma_overflow
+};;
+.pred.rel "mutex",p9,p10
+{ .mfi
+ // store sign of gamma(x) as 32-bit int
+(p9) st4 [r33] = GR_SignOfGamma
+ fma.s1 FR_r3 = FR_r2,FR_r,f0
+ nop.i 0
+}
+{ .mfi
+ // store sign of gamma(x) as 64-bit int
+(p10) st8 [r33] = GR_SignOfGamma
+ fnma.s1 FR_P10 = FR_r2,FR_05,FR_r
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P54 = FR_P54,FR_r2,FR_P32
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_InvX = FR_InvX2,FR_NormX,f1 // NR iteration #3
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fms.s1 FR_Xm05 = FR_NormX,f1,FR_05 // (x-1/2)
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_TpNxLn2 = FR_N,FR_Ln2,FR_T // T + N*Ln2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P54 = FR_P54,FR_r3,FR_P10
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_InvX = FR_InvX2,FR_InvX,FR_InvX2 // NR iteration #3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 FR_LnSqrt2Pi = FR_LnSqrt2Pi,f1,FR_NormX // ln(sqrt(2*Pi))-x
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_LnX = FR_TpNxLn2,f1,FR_P54 // LnX = (T + N*Ln2) + P54
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_InvX2 = FR_InvX,FR_InvX,f0 // InvX2 = InvX*InvX
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ // (x-1/2)*ln(x)+ln(sqrt(2*Pi))-x
+ fma.s1 FR_LnX = FR_LnX,FR_Xm05,FR_LnSqrt2Pi
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_W2 = FR_W4,FR_InvX2,FR_W2 // W2 + W4/x^2
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+ fma.d.s0 f8 = FR_InvX,FR_W2,FR_LnX // result = InvX*W2 + LnX, rounded to double
+ br.ret.sptk b0
+};;
+// Branch computing ln(GAMMA(x)) for x < -512
+//---------------------------------------------------------------------
+.align 32
+lgamma_negstirling:
+{ .mfi
+ ldfpd FR_P5,FR_P4 = [GR_ad_1],16
+ fms.s1 FR_Xf = FR_NormX,f1,FR_N // xf = x - [x]
+ and GR_Exp = GR_SignExp,GR_ExpMask
+}
+{ .mfi
+ ldfpd FR_P3,FR_P2 = [GR_ad_2],16
+ fma.s1 FR_InvX = FR_C,FR_InvX,FR_C // NR iteration #1
+ mov GR_0x30033 = 0x30033
+};;
+{ .mfi
+ ldfe FR_Ln2 = [GR_ad_1],16
+ nop.f 0
+ extr.u GR_Ind = GR_ArgAsIs,44,8 // 8-bit index from bits 44..51
+}
+{ .mib
+ ldfd FR_W4 = [GR_ad_2],16
+ // jump if x is less or equal to -2^52, i.e. x is big negative integer
+ cmp.leu.unc p7,p0 = GR_0x30033,GR_SignExp
+(p7) br.cond.spnt lgamma_singularity
+};;
+{ .mfi
+ ldfpd FR_S28,FR_S26 = [GR_ad_Co7],16
+ nop.f 0
+ add GR_ad_LnT = 0x50,GR_ad_2
+}
+{ .mfi
+ ldfpd FR_S24,FR_S22 = [GR_ad_Ce7],16
+ nop.f 0
+ mov GR_ExpBias = 0xffff
+};;
+{ .mfi
+ ldfpd FR_S20,FR_S18 = [GR_ad_Co7],16
+ nop.f 0
+ shladd GR_ad_T = GR_Ind,4,GR_ad_LnT // address = GR_ad_LnT + 16*index
+}
+{ .mfi
+ ldfpd FR_S16,FR_S14 = [GR_ad_Ce7],16
+ nop.f 0
+ sub GR_Exp = GR_Exp,GR_ExpBias // remove the 0xffff exponent bias
+};;
+{ .mfi
+ ldfe FR_S12 = [GR_ad_Co7],16
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_S10 = [GR_ad_Ce7],16
+ fms.s1 FR_r = FR_C,f8,f1 // r = C*x - 1
+ nop.i 0
+};;
+{ .mmf
+ ldfe FR_S8 = [GR_ad_Co7],16
+ ldfe FR_S6 = [GR_ad_Ce7],16
+ nop.f 0
+};;
+{ .mfi
+ ldfe FR_S4 = [GR_ad_Co7],16
+ fma.s1 FR_Xf2 = FR_Xf,FR_Xf,f0 // Xf2 = xf*xf
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_S2 = [GR_ad_Ce7],16
+ fnma.s1 FR_InvX2 = FR_InvX,FR_NormX,f1 // NR iteration #2
+ nop.i 0
+};;
+{ .mfi
+ setf.sig FR_int_N = GR_Exp
+ frcpa.s1 FR_InvXf,p9 = f1,FR_Xf // 1/xf
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_LnSqrt2Pi = [GR_ad_1],16
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ getf.exp GR_SignExp = FR_Xf
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_W2 = [GR_ad_2],16
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ getf.sig GR_Sig = FR_Xf
+ fma.s1 FR_P54 = FR_P5,FR_r,FR_P4
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_T = [GR_ad_T]
+ fma.s1 FR_P32 = FR_P3,FR_r,FR_P2
+ nop.i 0
+};;
+{ .mfi
+ and GR_Exp = GR_SignExp,GR_ExpMask
+ fma.s1 FR_r2 = FR_r,FR_r,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 FR_Xm05 = FR_NormX,f1,FR_05 // (x-1/2)
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_InvX2 = FR_InvX,FR_InvX2,FR_InvX // NR iteration #2
+ extr.u GR_Ind = GR_Sig,55,8 // 8-bit index from xf significand bits 55..62
+}
+{ .mfi
+ sub GR_Exp = GR_Exp,GR_ExpBias
+ fma.s1 FR_Xf4 = FR_Xf2,FR_Xf2,f0
+ cmp.eq p6,p0 = 0,GR_Sig // p6 <- significand of xf is zero
+};;
+{ .mfi
+ setf.sig FR_int_Nf = GR_Exp
+ fma.s1 FR_S28 = FR_S28,FR_Xf2,FR_S26
+ shladd GR_ad_T = GR_Ind,4,GR_ad_LnT
+}
+{ .mfb
+ nop.m 0
+ fma.s1 FR_S24 = FR_S24,FR_Xf2,FR_S22
+ // jump if xf == 0, i.e. the input argument is an integer number
+(p6) br.cond.spnt lgamma_singularity
+};;
+{ .mfi
+ getf.sig GR_Sig = FR_int_Ntrunc
+ fma.s1 FR_S20 = FR_S20,FR_Xf2,FR_S18
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S16 = FR_S16,FR_Xf2,FR_S14
+ nop.i 0
+};;
+{ .mfi
+ ldfe FR_Tf = [GR_ad_T]
+ fma.s1 FR_S12 = FR_S12,FR_Xf2,FR_S10
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S8 = FR_S8,FR_Xf2,FR_S6
+ mov GR_SignOfGamma = 1
+};;
+{ .mfi
+ nop.m 0
+ fms.s1 FR_rf = FR_InvXf,FR_Xf,f1 // reduced arg rf
+ tbit.z p8,p0 = GR_Sig,0 // p8 <- bit 0 of GR_Sig is zero
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r3 = FR_r2,FR_r,f0
+ // set p9 if signgam is 32-bit int
+ // set p10 if signgam is 64-bit int
+ cmp.eq p10,p9 = 8,r34
+};;
+{ .mfi
+ nop.m 0
+ fcvt.xf FR_N = FR_int_N
+(p8) sub GR_SignOfGamma = r0,GR_SignOfGamma // sign becomes -1 under p8
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_InvX = FR_InvX2,FR_NormX,f1 // NR iteration #3
+ nop.i 0
+};;
+.pred.rel "mutex",p9,p10
+{ .mfi
+ // store sign of gamma(x) as 32-bit int
+(p9) st4 [r33] = GR_SignOfGamma
+ fma.s1 FR_P54 = FR_P54,FR_r2,FR_P32
+ nop.i 0
+}
+{ .mfi
+ // store sign of gamma(x) as 64-bit int
+(p10) st8 [r33] = GR_SignOfGamma
+ fnma.s1 FR_P10 = FR_r2,FR_05,FR_r
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Xf8 = FR_Xf4,FR_Xf4,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S28 = FR_S28,FR_Xf4,FR_S24
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S20 = FR_S20,FR_Xf4,FR_S16
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S12 = FR_S12,FR_Xf4,FR_S8
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rf2 = FR_rf,FR_rf,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P54f = FR_P5,FR_rf,FR_P4
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P32f = FR_P3,FR_rf,FR_P2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_InvX = FR_InvX2,FR_InvX,FR_InvX2 // NR iteration #3
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fcvt.xf FR_Nf = FR_int_Nf
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_LnSqrt2Pi = FR_NormX,f1,FR_LnSqrt2Pi // x+ln(sqrt(2*Pi))
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P54 = FR_P54,FR_r3,FR_P10
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S28 = FR_S28,FR_Xf8,FR_S20
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rf3 = FR_rf2,FR_rf,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_P10f = FR_rf2,FR_05,FR_rf
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_TpNxLn2 = FR_N,FR_Ln2,FR_T // T + N*Ln2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P54f = FR_P54f,FR_rf2,FR_P32f
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_InvX2 = FR_InvX,FR_InvX,f0 // InvX2 = InvX*InvX
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S28 = FR_S28,FR_Xf8,FR_S12
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S4 = FR_S4,FR_Xf2,FR_S2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P54f = FR_P54f,FR_rf3,FR_P10f
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_TpNxLn2f = FR_Nf,FR_Ln2,FR_Tf // Tf + Nf*Ln2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_LnX = FR_TpNxLn2,f1,FR_P54
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_W2 = FR_W4,FR_InvX2,FR_W2 // W2 + W4*InvX2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S28 = FR_S28,FR_Xf4,FR_S4
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_LnXf = FR_TpNxLn2f,f1,FR_P54f
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fms.s1 FR_LnX = FR_LnX,FR_Xm05,FR_LnSqrt2Pi // LnX*(x-1/2) - (x+ln(sqrt(2*Pi)))
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_LnX = FR_InvX,FR_W2,FR_LnX // add the InvX*W2 correction
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_LnX = FR_S28,FR_Xf2,FR_LnX // LnX = LnX - S28*Xf2
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+ fms.d.s0 f8 = FR_LnX,f1,FR_LnXf // result = LnX - LnXf, rounded to double
+ br.ret.sptk b0
+};;
+// Branch computing ln(GAMMA(x)) for 0 <= x < 1; selects the coefficient
+// set and reduced argument, then falls through to lgamma_common_0_2
+.align 32
+lgamma_0_1:
+{ .mfi
+ ldfpd FR_P5,FR_P4 = [GR_ad_1],16
+ fms.s1 FR_x = FR_NormX,f1,f0 // x
+ mov GR_Arg025 = 0x3FD0 // threshold compared against GR_Arg for 0.25
+}
+{ .mfi
+ ldfpd FR_P3,FR_P2 = [GR_ad_2],16
+ nop.f 0
+ add GR_ad_Co = 0x1C40,GR_ad_Data
+};;
+{ .mfi
+ ldfe FR_Ln2 = [GR_ad_1],0x50
+ nop.f 0
+ // p6 if arg < 0.25
+ cmp.lt p6,p9 = GR_Arg,GR_Arg025
+}
+{ .mfi
+ add GR_ad_2 = 0x40,GR_ad_2
+ nop.f 0
+ mov GR_Arg075 = 0x3FE8 // threshold compared against GR_Arg for 0.75
+};;
+{ .mfi
+ ldfpd FR_Q8,FR_Q7 = [GR_ad_1],16
+ fma.s1 FR_w2 = FR_w,FR_w,f0
+ // p7 if 0.25 <= arg < 0.75
+ // p8 if 0.75 <= arg < 1.0
+(p9) cmp.lt.unc p7,p8 = GR_Arg,GR_Arg075
+}
+{ .mfi
+ mov GR_Arg0875 = 0x3FEC // threshold compared against GR_Arg for 0.875
+ nop.f 0
+ sub GR_Exp = GR_Exp,GR_ExpBias
+};;
+{ .mfi
+ ldfpd FR_Q6,FR_Q5 = [GR_ad_2],16
+ nop.f 0
+(p8) cmp.lt p9,p0 = GR_Arg,GR_Arg0875 // under p8 only: p9 <- (arg < 0.875)
+}
+{ .mfi
+ ldfpd FR_Q4,FR_Q3 = [GR_ad_1],16
+ nop.f 0
+ add GR_ad_Ce = 0x60,GR_ad_Co
+};;
+.pred.rel "mutex",p7,p8
+{ .mfi
+ ldfd FR_Q2 = [GR_ad_2],16
+ fms.s1 FR_r = FR_C,f8,f1 // r = C*x - 1
+(p7) mov GR_Offs = 0xC0
+}
+{ .mfi
+ setf.sig FR_int_N = GR_Exp
+ nop.f 0
+(p8) mov GR_Offs = 0x180
+};;
+.pred.rel "mutex",p6,p7
+{ .mfi
+(p9) add GR_ad_Co = GR_Offs,GR_ad_Co // apply the selected table offset
+(p8) fms.s1 FR_x = FR_NormX,f1,f1 // x-1
+ nop.i 0
+}
+{ .mfi
+(p9) add GR_ad_Ce = GR_Offs,GR_ad_Ce
+(p7) fms.s1 FR_x = FR_NormX,f1,FR_LocalMin // x-LocalMin
+ cmp.lt p10,p0 = GR_Arg,GR_Arg0875 // p10 <- (arg < 0.875); gates the log path below
+};;
+lgamma_common_0_2: // shared tail: entered by fall-through from lgamma_0_1 and by branch from lgamma_1_2
+{ .mfi
+ ldfpd FR_A17,FR_A16 = [GR_ad_Co],16
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd FR_A15,FR_A14 = [GR_ad_Ce],16
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ ldfpd FR_A13,FR_A12 = [GR_ad_Co],16
+ nop.f 0
+(p10) extr.u GR_Ind = GR_ArgAsIs,44,8 // 8-bit index from bits 44..51
+}
+{ .mfi
+ ldfpd FR_A11,FR_A10 = [GR_ad_Ce],16
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ ldfpd FR_A9,FR_A8 = [GR_ad_Co],16
+(p10) fnma.s1 FR_Q1 = FR_05,FR_w2,FR_w // Q1 = w - w2/2
+ nop.i 0
+}
+{ .mfi
+ ldfpd FR_A7,FR_A6 = [GR_ad_Ce],16
+(p10) fma.s1 FR_w3 = FR_w2,FR_w,f0
+ nop.i 0
+};;
+{ .mfi
+(p10) getf.exp GR_SignExp_w = FR_w
+(p10) fma.s1 FR_w4 = FR_w2,FR_w2,f0
+ nop.i 0
+}
+{ .mfi
+(p10) shladd GR_ad_2 = GR_Ind,4,GR_ad_2 // address = base + 16*index
+(p10) fma.s1 FR_r2 = FR_r,FR_r,f0
+ nop.i 0
+};;
+{ .mfi
+(p10) ldfe FR_T = [GR_ad_2]
+(p10) fma.s1 FR_P54 = FR_P5,FR_r,FR_P4
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_A5 = [GR_ad_Co],16
+(p10) fma.s1 FR_P32 = FR_P3,FR_r,FR_P2
+ nop.i 0
+};;
+{ .mfi
+ ldfe FR_A4 = [GR_ad_Ce],16
+ fma.s1 FR_x2 = FR_x,FR_x,f0
+(p10) and GR_Exp_w = GR_ExpMask, GR_SignExp_w
+}
+{ .mfi
+ ldfe FR_A3 = [GR_ad_Co],16
+ nop.f 0
+(p10) mov GR_fff9 = 0xfff9
+};;
+// p13 <== large w (exponent field of w >= 0xfff9), only when p10 is set
+// p14 <== small w (exponent field of w <  0xfff9), only when p10 is set
+{ .mfi
+ ldfe FR_A2 = [GR_ad_Ce],16
+(p10) fma.s1 FR_Q8 = FR_Q8,FR_w,FR_Q7
+(p10) cmp.ge.unc p13,p14 = GR_Exp_w,GR_fff9
+}
+{ .mfi
+ ldfe FR_A1 = [GR_ad_Co],16
+(p10) fma.s1 FR_Q6 = FR_Q6,FR_w,FR_Q5
+ nop.i 0
+};;
+{ .mfi
+ ldfe FR_A0 = [GR_ad_Ce],16
+(p10) fma.s1 FR_Q4 = FR_Q4,FR_w,FR_Q3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p10) fma.s1 FR_Q2 = FR_Q2,FR_w3,FR_Q1
+ nop.i 0
+};;
+{ .mfi
+ // set p11 if signgam is 32-bit int
+ // set p12 if signgam is 64-bit int
+ cmp.eq p12,p11 = 8,r34
+(p10) fma.s1 FR_r3 = FR_r2,FR_r,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p10) fnma.s1 FR_P10 = FR_r2,FR_05,FR_r
+ mov GR_SignOfGamma = 1
+};;
+.pred.rel "mutex",p11,p12
+{ .mfi
+ // store sign of gamma(x) as 32-bit int
+(p11) st4 [r33] = GR_SignOfGamma
+ fma.s1 FR_A17 = FR_A17,FR_x,FR_A16
+ nop.i 0
+}
+{ .mfi
+ // store sign of gamma(x) as 64-bit int
+(p12) st8 [r33] = GR_SignOfGamma
+ fma.s1 FR_A15 = FR_A15,FR_x,FR_A14
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p10) fcvt.xf FR_N = FR_int_N
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p10) fma.s1 FR_P54 = FR_P54,FR_r2,FR_P32
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A13 = FR_A13,FR_x,FR_A12
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A11 = FR_A11,FR_x,FR_A10
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A9 = FR_A9,FR_x,FR_A8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A7 = FR_A7,FR_x,FR_A6
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p10) fma.s1 FR_Qlo = FR_Q8,FR_w2,FR_Q6
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p10) fma.s1 FR_w6 = FR_w3,FR_w3,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p10) fma.s1 FR_Qhi = FR_Q4,FR_w4,FR_Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A5 = FR_A5,FR_x,FR_A4
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p10) fma.s1 FR_TpNxLn2 = FR_N,FR_Ln2,FR_T // T + N*Ln2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A3 = FR_A3,FR_x,FR_A2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p10) fma.s1 FR_P54 = FR_P54,FR_r3,FR_P10
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A1 = FR_A1,FR_x,FR_A0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A17 = FR_A17,FR_x2,FR_A15
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A13 = FR_A13,FR_x2,FR_A11
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A9 = FR_A9,FR_x2,FR_A7
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_x4 = FR_x2,FR_x2,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_LnX = FR_Qlo,FR_w6,FR_Qhi // small-w path: LnX from the Q polynomial in w
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A5 = FR_A5,FR_x2,FR_A3
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_LnX = FR_TpNxLn2,f1,FR_P54 // large-w path: LnX = (T + N*Ln2) + P54
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A17 = FR_A17,FR_x4,FR_A13
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_x8 = FR_x4,FR_x4,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A9 = FR_A9,FR_x4,FR_A5
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A17 = FR_A17,FR_x8,FR_A9
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p10) fms.s1 FR_A1 = FR_A1,f1,FR_LnX // subtract LnX from the low-order term when p10
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+ fma.d.s0 f8 = FR_A17,FR_x2,FR_A1 // result = A17*x2 + A1, rounded to double
+ br.ret.sptk b0
+};;
+// Branch computing ln(GAMMA(x)) for 1.0 <= x < 2.25; ends in lgamma_common_0_2
+//---------------------------------------------------------------------
+.align 32
+lgamma_1_2:
+{ .mfi
+ add GR_ad_Co = 0x10B0,GR_ad_1
+ fcmp.eq.s1 p12,p0 = f1,FR_w // p12 <- (w == 1.0)
+ mov GR_Arg125 = 0x3FF4
+}
+{ .mfi
+ add GR_ad_Ce = 0x1110,GR_ad_1
+ nop.f 0
+ mov GR_Arg175 = 0x3FFC
+};;
+{ .mfi
+ mov GR_SignOfGamma = 1
+ fcmp.eq.s1 p13,p0 = f1,FR_NormX // p13 <- (x == 1.0)
+ cmp.lt p6,p9 = GR_Arg,GR_Arg125 // 1.0 <= x < 1.25
+}
+{ .mfi
+ // set p10 if signgam is 32-bit int
+ // set p11 if signgam is 64-bit int
+ cmp.eq p11,p10 = 8,r34
+ nop.f 0
+ cmp.ge p8,p0 = GR_Arg,GR_Arg175 // x >= 1.75
+};;
+.pred.rel "mutex",p10,p11
+{ .mfi
+ // store sign of gamma(x) as 32-bit int
+(p10) st4 [r33] = GR_SignOfGamma
+(p12) fma.d.s0 f8 = f0,f0,f0 // result is +0.0 on the fast exit for 2.0
+(p9) cmp.lt.unc p7,p0 = GR_Arg,GR_Arg175 // 1.25 <= x < 1.75
+}
+{ .mib
+ // store sign of gamma(x) as 64-bit int
+(p11) st8 [r33] = GR_SignOfGamma
+ mov GR_Offs = 0
+(p12) br.ret.spnt b0 // fast exit for 2.0
+};;
+.pred.rel "mutex",p7,p8
+{ .mfi
+(p7) mov GR_Offs = 0xC0
+(p7) fms.s1 FR_x = FR_w,f1,FR_LocalMin // w-LocalMin
+ nop.i 0
+}
+{ .mfb
+(p8) mov GR_Offs = 0x180
+(p13) fma.d.s0 f8 = f0,f0,f0 // result is +0.0 on the fast exit for 1.0
+(p13) br.ret.spnt b0 // fast exit for 1.0
+};;
+.pred.rel "mutex",p6,p8
+{ .mfi
+ add GR_ad_Co = GR_ad_Co,GR_Offs
+(p8) fms.s1 FR_x = FR_w,f1,f1 // w-1
+ cmp.eq p0,p10 = r0,r0 // clears p10: the p10-gated log path of the shared tail is skipped
+}
+{ .mfb
+ add GR_ad_Ce = GR_ad_Ce,GR_Offs
+(p6) fma.s1 FR_x = f0,f0,FR_w // w
+ br.cond.sptk lgamma_common_0_2
+};;
+// Branch computing ln(GAMMA(x)) for -17 < x < 0
+//---------------------------------------------------------------------
+.align 32
+lgamma_negrecursion:
+{ .mfi
+ getf.d GR_ArgXfrAsIs = FR_Xfr
+ fma.s1 FR_Xp2 = FR_2,f1,FR_NormX // Xp2 = x + 2
+ mov GR_Arg05 = 0x3FE
+}
+{ .mfi
+ add GR_ad_Roots = 0x1390,GR_ad_1
+ fma.s1 FR_NormX = FR_NormX,FR_Xfr,f0 // NormX now accumulates a product
+ mov GR_Arg075 = 0x3FE8
+};;
+{ .mfi
+ getf.sig GR_Sig = FR_int_Ntrunc
+ fma.s1 FR_Xp3 = FR_2,f1,FR_Xp1
+ shl GR_Arg05 = GR_Arg05,52 // 0x3FE << 52: bit pattern of the double 0.5
+}
+{ .mfi
+ mov GR_Arg025 = 0x3FD0
+ fma.s1 FR_Xp6 = FR_5,f1,FR_Xp1
+ add GR_ad_Co = 0x1C40,GR_ad_Data
+};;
+{ .mfi
+ add GR_ad_Dx = 8,GR_ad_Roots
+ fma.s1 FR_Xp7 = FR_2,f1,FR_Xp5
+ shr.u GR_ArgXfr = GR_ArgXfrAsIs,48 // keep the top 16 bits of Xfr
+}
+{ .mfi
+ add GR_ad_Ce = 0x60,GR_ad_Co
+ fma.s1 FR_Xp8 = FR_3,f1,FR_Xp5
+ cmp.lt p6,p0 = GR_ArgXfrAsIs,GR_Arg05 // p6 <- (Xfr bits < bits of 0.5)
+};;
+{ .mfi
+ and GR_RootInd = 0xF,GR_Sig
+ fma.s1 FR_Xp9 = FR_2,FR_2,FR_Xp5
+ // p10 if arg < 0.25
+ cmp.lt p10,p14 = GR_ArgXfr,GR_Arg025
+}
+{ .mfi
+(p6) add GR_ad_Roots = 0x120,GR_ad_Roots
+ fma.s1 FR_Xp11 = f1,f1,FR_Xp10
+(p6) add GR_ad_Dx = 0x120,GR_ad_Dx
+};;
+{ .mfi
+ shladd GR_ad_Root = GR_RootInd,4,GR_ad_Roots // 16 bytes per root entry
+ fma.s1 FR_Xp12 = FR_2,f1,FR_Xp10
+ // p11 if 0.25 <= arg < 0.75
+ // p12 if 0.75 <= arg < 1.0
+(p14) cmp.lt.unc p11,p12 = GR_ArgXfr,GR_Arg075
+}
+{ .mfi
+ shladd GR_ad_Dx = GR_RootInd,4,GR_ad_Dx
+ fma.s1 FR_Xp13 = FR_3,f1,FR_Xp10
+ cmp.eq p0,p13 = 0,GR_Sig // p13 <- (GR_Sig != 0)
+};;
+{ .mfi
+ ld8 GR_Root = [GR_ad_Root]
+ fma.s1 FR_Xp14 = FR_2,FR_2,FR_Xp10
+(p12) mov GR_Offs = 0x180
+}
+{ .mfi
+ ldfd FR_Root = [GR_ad_Root]
+ fma.s1 FR_Xp15 = FR_5,f1,FR_Xp10
+ and GR_Sig = 0xF,GR_Sig
+};;
+{ .mfi
+ ld8 GR_Dx = [GR_ad_Dx]
+ fma.s1 FR_Xp16 = FR_3,FR_2,FR_Xp10
+(p13) cmp.ge.unc p6,p0 = 0xD,GR_Sig
+}
+{ .mfi
+(p11) mov GR_Offs = 0xC0
+(p13) fma.s1 FR_NormX = FR_NormX,FR_Xp1,f0
+(p13) cmp.ge.unc p7,p0 = 0xB,GR_Sig
+};;
+{ .mfi
+(p14) add GR_ad_Co = GR_Offs,GR_ad_Co
+(p6) fma.s1 FR_Xp2 = FR_Xp2,FR_Xp3,f0
+(p13) cmp.ge.unc p8,p0 = 0x9,GR_Sig
+}
+{ .mfi
+(p14) add GR_ad_Ce = GR_Offs,GR_ad_Ce
+(p7) fma.s1 FR_Xp4 = FR_Xp4,FR_Xp5,f0
+(p13) cmp.ge.unc p9,p0 = 0x7,GR_Sig
+};;
+{ .mfi
+ ldfpd FR_B17,FR_B16 = [GR_ad_Co],16
+(p8) fma.s1 FR_Xp6 = FR_Xp6,FR_Xp7,f0
+(p13) cmp.ge.unc p6,p0 = 0x5,GR_Sig
+}
+{ .mfi
+ ldfpd FR_B15,FR_B14 = [GR_ad_Ce],16
+(p9) fma.s1 FR_Xp8 = FR_Xp8,FR_Xp9,f0
+(p13) cmp.ge.unc p7,p0 = 0x3,GR_Sig
+};;
+{ .mfi
+ ldfpd FR_B13,FR_B12 = [GR_ad_Co],16
+(p6) fma.s1 FR_Xp10 = FR_Xp10,FR_Xp11,f0
+(p13) cmp.ge.unc p8,p0 = 0x1,GR_Sig
+}
+{ .mfi
+ ldfpd FR_B11,FR_B10 = [GR_ad_Ce],16
+(p7) fma.s1 FR_Xp12 = FR_Xp12,FR_Xp13,f0
+(p13) cmp.eq.unc p9,p0 = 0,GR_Sig
+};;
+{ .mfi
+ ldfpd FR_B9,FR_B8 = [GR_ad_Co],16
+(p8) fma.s1 FR_Xp14 = FR_Xp14,FR_Xp15,f0
+ mov GR_Arg15 = 0xC02E // -15
+}
+{ .mfi
+ ldfpd FR_B7,FR_B6 = [GR_ad_Ce],16
+ fcmp.eq.s1 p15,p0 = f0,FR_Xf // p15 <- (Xf == 0)
+(p13) cmp.ge.unc p6,p0 = 0xC,GR_Sig
+};;
+{ .mfi
+ ldfe FR_B5 = [GR_ad_Co],16
+(p9) fma.s1 FR_NormX = FR_NormX,FR_Xp16,f0
+ sub GR_Root = GR_ArgAsIs,GR_Root // integer difference of argument and root bits
+}
+{ .mfi
+ sub GR_RootInd = 0xE,GR_RootInd
+(p11) fms.s1 FR_x = FR_Xfr,f1,FR_LocalMin // x-LocalMin
+(p13) cmp.ge.unc p7,p0 = 0x8,GR_Sig
+};;
+.pred.rel "mutex",p10,p12
+{ .mfi
+ ldfe FR_B4 = [GR_ad_Ce],16
+(p10) fms.s1 FR_x = FR_Xfr,f1,f0 // x
+ add GR_Root = GR_Root,GR_Dx
+}
+{ .mfb
+ cmp.gtu p14,p0 = 0xE,GR_RootInd
+(p12) fms.s1 FR_x = FR_Xfr,f1,f1 // x-1
+(p15) br.cond.spnt lgamma_singularity // taken when Xf == 0
+};;
+{ .mfi
+ ldfe FR_B3 = [GR_ad_Co],16
+(p6) fma.s1 FR_Xp2 = FR_Xp2,FR_Xp4,f0
+(p14) cmp.lt.unc p11,p0 = GR_Arg,GR_Arg15
+}
+{ .mfi
+ ldfe FR_B2 = [GR_ad_Ce],16
+(p7) fma.s1 FR_Xp6 = FR_Xp6,FR_Xp8,f0
+ add GR_2xDx = GR_Dx,GR_Dx
+};;
+{ .mfi
+ ldfe FR_B1 = [GR_ad_Co],16
+ fms.s1 FR_r = f8,f1,FR_Root // r = x - Root
+(p13) cmp.ge.unc p6,p0 = 0x4,GR_Sig
+}
+{ .mib
+ ldfe FR_B0 = [GR_ad_Ce],16
+(p11) cmp.leu.unc p10,p0 = GR_Root,GR_2xDx // p10 <- argument within 2*Dx window of the root
+(p10) br.cond.spnt lgamma_negroots
+};;
+{ .mfi
+ ldfpd FR_P5,FR_P4 = [GR_ad_1],16
+(p6) fma.s1 FR_Xp10 = FR_Xp10,FR_Xp12,f0
+ tbit.z p14,p15 = GR_Sig,0 // p14/p15 <- bit 0 of GR_Sig clear/set
+}
+{ .mfi
+ ldfpd FR_P3,FR_P2 = [GR_ad_2],16
+ fnma.d.s0 FR_T = f1,f1,f8 // nop.f 0
+
+(p13) cmp.ge.unc p7,p0 = 0x2,GR_Sig
+};;
+{ .mfi
+ ldfe FR_Ln2 = [GR_ad_1],0x50
+(p7) fma.s1 FR_NormX = FR_NormX,FR_Xp14,f0
+ mov GR_PseudoRoot = 0xBFFBC
+}
+{ .mlx
+ add GR_ad_2 = 0x40,GR_ad_2
+ movl GR_2xDx = 0x00002346DC5D6389
+};;
+{ .mfi
+ ldfpd FR_Q8,FR_Q7 = [GR_ad_1],16
+ fma.s1 FR_x2 = FR_x,FR_x,f0
+ shl GR_PseudoRoot = GR_PseudoRoot,44 // 0xBFFBC << 44: double bit pattern in (-2;-1)
+}
+{ .mfi
+ ldfpd FR_Q6,FR_Q5 = [GR_ad_2],16
+ fma.s1 FR_B17 = FR_B17,FR_x,FR_B16
+(p13) cmp.ge.unc p6,p0 = 0xA,GR_Sig
+};;
+{ .mfi
+ ldfpd FR_Q4,FR_Q3 = [GR_ad_1],16
+(p6) fma.s1 FR_Xp2 = FR_Xp2,FR_Xp6,f0
+ sub GR_PseudoRoot = GR_ArgAsIs,GR_PseudoRoot
+}
+{ .mfi
+ ldfpd FR_Q2,FR_Q1 = [GR_ad_2],16
+ fma.s1 FR_B15 = FR_B15,FR_x,FR_B14
+(p13) cmp.ge.unc p7,p0 = 0x6,GR_Sig
+};;
+{ .mfi
+ add GR_ad_Co = 0x12F0,GR_ad_2
+ fma.s1 FR_B13 = FR_B13,FR_x,FR_B12
+ cmp.leu.unc p10,p0 = GR_PseudoRoot,GR_2xDx // p10 <- argument within the pseudo-root window
+}
+{ .mfi
+ add GR_ad_Ce = 0x1300,GR_ad_2
+ fma.s1 FR_B11 = FR_B11,FR_x,FR_B10
+ mov GR_ExpMask = 0x1ffff
+};;
+{ .mfi
+(p10) ldfe FR_PR01 = [GR_ad_Co],0xF0
+ fma.s1 FR_B9 = FR_B9,FR_x,FR_B8
+ mov GR_ExpBias = 0xFFFF
+}
+{ .mfb
+(p10) ldfe FR_PR11 = [GR_ad_Ce],0xF0
+ fma.s1 FR_B7 = FR_B7,FR_x,FR_B6
+(p10) br.cond.spnt lgamma_pseudoroot
+};;
+{ .mfi
+(p13) cmp.ge.unc p6,p0 = 0xE,GR_Sig
+(p7) fma.s1 FR_NormX = FR_NormX,FR_Xp10,f0
+ tbit.z.unc p8,p0 = GR_Sig,0 // p8 <- bit 0 of GR_Sig is zero
+}
+{ .mfi
+ mov GR_SignOfGamma = 1
+ fma.s1 FR_B5 = FR_B5,FR_x,FR_B4
+ // set p9 if signgam is 32-bit int
+ // set p10 if signgam is 64-bit int
+ cmp.eq p10,p9 = 8,r34
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_B3 = FR_B3,FR_x,FR_B2
+(p8) sub GR_SignOfGamma = r0,GR_SignOfGamma // sign becomes -1 under p8
+}
+{ .mfi
+ nop.m 0
+(p14) fms.s1 FR_w = f0,f0,f1 // w = -1
+ nop.i 0
+};;
+.pred.rel "mutex",p9,p10
+{ .mfi
+ // store sign of gamma(x) as 32-bit int
+(p9) st4 [r33] = GR_SignOfGamma
+ fma.s1 FR_B1 = FR_B1,FR_x,FR_B0
+ nop.i 0
+}
+{ .mfi
+ // store sign of gamma(x) as 64-bit int
+(p10) st8 [r33] = GR_SignOfGamma
+ fma.s1 FR_B17 = FR_B17,FR_x2,FR_B15
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_B13 = FR_B13,FR_x2,FR_B11
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_B9 = FR_B9,FR_x2,FR_B7
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_x4 = FR_x2,FR_x2,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p6) fma.s1 FR_NormX = FR_NormX,FR_Xp2,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_B5 = FR_B5,FR_x2,FR_B3
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_B17 = FR_B17,FR_x4,FR_B13
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_x8 = FR_x4,FR_x4,f0
+ nop.i 0
+};;
+.pred.rel "mutex",p14,p15
+{ .mfi
+ nop.m 0
+(p15) fms.s1 FR_w = FR_NormX,f1,f1 // w = NormX - 1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p14) fnma.s1 FR_w = FR_NormX,f1,FR_w // w = -NormX - 1 (w was set to -1 above)
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_B9 = FR_B9,FR_x4,FR_B5
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ frcpa.s1 FR_C,p0 = f1,FR_NormX // reciprocal approximation of NormX
+ nop.i 0
+};;
+{ .mfi
+ getf.exp GR_Exp = FR_NormX
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ getf.d GR_ArgAsIs = FR_NormX
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_w2 = FR_w,FR_w,f0
+ nop.i 0
+}
+{ .mfi
+ and GR_Exp = GR_Exp,GR_ExpMask
+ fma.s1 FR_Q8 = FR_Q8,FR_w,FR_Q7
+ nop.i 0
+};;
+{ .mfi
+ sub GR_Exp = GR_Exp,GR_ExpBias // remove the 0xFFFF exponent bias
+ fma.s1 FR_B17 = FR_B17,FR_x8,FR_B9
+ extr.u GR_Ind = GR_ArgAsIs,44,8 // 8-bit index from bits 44..51
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Q6 = FR_Q6,FR_w,FR_Q5
+ nop.i 0
+};;
+{ .mfi
+ setf.sig FR_int_N = GR_Exp
+ fms.s1 FR_r = FR_C,FR_NormX,f1 // r = C*NormX - 1
+ nop.i 0
+}
+{ .mfi
+ shladd GR_ad_2 = GR_Ind,4,GR_ad_2 // address = base + 16*index
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ getf.exp GR_SignExp_w = FR_w
+ fma.s1 FR_Q4 = FR_Q4,FR_w,FR_Q3
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_T = [GR_ad_2]
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ and GR_Exp_w = GR_ExpMask, GR_SignExp_w
+ fnma.s1 FR_Q1 = FR_05,FR_w2,FR_w // Q1 = w - w2/2
+ mov GR_fff9 = 0xfff9
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_w3 = FR_w2,FR_w,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_w4 = FR_w2,FR_w2,f0
+// p13 <== large w (exponent field of w >= 0xfff9)
+// p14 <== small w (exponent field of w <  0xfff9)
+ cmp.ge p13,p14 = GR_Exp_w,GR_fff9
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Qlo = FR_Q8,FR_w2,FR_Q6
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_r2 = FR_r,FR_r,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_B17 = FR_B17,FR_x2,FR_B1
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_P32 = FR_P3,FR_r,FR_P2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_P54 = FR_P5,FR_r,FR_P4
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_Q2 = FR_Q2,FR_w3,FR_Q1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_w6 = FR_w3,FR_w3,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p13) fcvt.xf FR_N = FR_int_N
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_r3 = FR_r2,FR_r,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p13) fnma.s1 FR_P10 = FR_r2,FR_05,FR_r
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_P54 = FR_P54,FR_r2,FR_P32
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_Qhi = FR_Q4,FR_w4,FR_Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p14) fnma.s1 FR_Qlo = FR_Qlo,FR_w6,FR_B17 // Qlo = B17 - Qlo*w6
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_TpNxLn2 = FR_N,FR_Ln2,FR_T // T + N*Ln2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_P54 = FR_P54,FR_r3,FR_P10
+ nop.i 0
+};;
+.pred.rel "mutex",p13,p14
+{ .mfi
+ nop.m 0
+(p14) fms.d.s0 f8 = FR_Qlo,f1,FR_Qhi // small-w result = Qlo - Qhi, rounded to double
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_LnX = FR_TpNxLn2,f1,FR_P54
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+(p13) fms.d.s0 f8 = FR_B17,f1,FR_LnX // large-w result = B17 - LnX, rounded to double
+ br.ret.sptk b0
+};;
+// Branch computing ln(GAMMA(x)) near negative roots (polynomial in r)
+//---------------------------------------------------------------------
+.align 32
+lgamma_negroots:
+{ .mfi
+ shladd GR_Offs = GR_RootInd,3,r0 //GR_RootInd*8
+ fma.s1 FR_r2 = FR_r,FR_r,f0
+ add GR_ad_Co = 0x15C0,GR_ad_1//0x1590,GR_ad_1
+}
+{ .mfi
+ add GR_ad_Ce = 0x1610,GR_ad_1//0x15E0,GR_ad_1
+ nop.f 0
+ cmp.lt p6,p0 = GR_ArgXfrAsIs,GR_Arg05
+};;
+{ .mfi
+ add GR_ad_Roots = 0x10A0,GR_ad_1
+ nop.f 0
+(p6) add GR_ad_Co = 0x820,GR_ad_Co // p6 selects the tables 0x820 bytes further on
+}
+{ .mfi
+(p6) add GR_ad_Ce = 0x820,GR_ad_Ce
+ nop.f 0
+ shladd GR_Offs = GR_RootInd,1,GR_Offs //GR_RootInd*10
+};;
+{ .mmi
+ shladd GR_ad_Co = GR_Offs,4,GR_ad_Co // advance by 16*GR_Offs bytes
+ shladd GR_ad_Ce = GR_Offs,4,GR_ad_Ce
+ cmp.eq p8,p7 = r0,r0 // default: p8 = 1, p7 = 0
+};;
+{ .mmi
+ ldfpd FR_A15,FR_A14 = [GR_ad_Co],16
+ ldfpd FR_A13,FR_A12 = [GR_ad_Ce],16
+ mov GR_SignOfGamma = 1
+};;
+{ .mmi
+ ldfpd FR_A11,FR_A10 = [GR_ad_Co],16
+ ldfpd FR_A9,FR_A8 = [GR_ad_Ce],16
+(p6) cmp.eq p7,p8 = r0,GR_RootInd // under p6: p7 <- (GR_RootInd == 0), p8 <- opposite
+};;
+{ .mmi
+ ldfpd FR_A7,FR_A6 = [GR_ad_Co],16
+ ldfpd FR_A5,FR_A4 = [GR_ad_Ce],16
+ tbit.z p11,p0 = GR_Sig,0 // p11 <- bit 0 of GR_Sig is zero
+};;
+{ .mmi
+ ldfe FR_A3 = [GR_ad_Co],16
+ ldfe FR_A2 = [GR_ad_Ce],16
+ // set p9 if signgam is 32-bit int
+ // set p10 if signgam is 64-bit int
+ cmp.eq p10,p9 = 8,r34
+};;
+{ .mmi
+ ldfe FR_A1 = [GR_ad_Co],16
+ ldfe FR_A0 = [GR_ad_Ce],16
+(p11) sub GR_SignOfGamma = r0,GR_SignOfGamma // sign becomes -1 under p11
+};;
+{ .mfi
+ ldfe FR_A00 = [GR_ad_Roots]
+ fma.s1 FR_r4 = FR_r2,FR_r2,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A15 = FR_A15,FR_r,FR_A14
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A13 = FR_A13,FR_r,FR_A12
+ nop.i 0
+};;
+.pred.rel "mutex",p9,p10
+{ .mfi
+ // store sign of gamma(x) as 32-bit int
+(p9) st4 [r33] = GR_SignOfGamma
+ fma.s1 FR_A11 = FR_A11,FR_r,FR_A10
+ nop.i 0
+}
+{ .mfi
+ // store sign of gamma(x) as 64-bit int
+(p10) st8 [r33] = GR_SignOfGamma
+ fma.s1 FR_A9 = FR_A9,FR_r,FR_A8
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A7 = FR_A7,FR_r,FR_A6
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A5 = FR_A5,FR_r,FR_A4
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A3 = FR_A3,FR_r,FR_A2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r8 = FR_r4,FR_r4,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A1 = FR_A1,FR_r,FR_A0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A15 = FR_A15,FR_r2,FR_A13
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A11 = FR_A11,FR_r2,FR_A9
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A7 = FR_A7,FR_r2,FR_A5
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A3 = FR_A3,FR_r2,FR_A1
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A15 = FR_A15,FR_r4,FR_A11
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A7 = FR_A7,FR_r4,FR_A3
+ nop.i 0
+};;
+.pred.rel "mutex",p7,p8
+{ .mfi
+ nop.m 0
+(p7) fma.s1 FR_A1 = FR_A15,FR_r8,FR_A7 // p7: keep the polynomial value in A1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p8) fma.d.s0 f8 = FR_A15,FR_r8,FR_A7 // p8: result = A15*r8 + A7, rounded to double
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+(p7) fma.d.s0 f8 = FR_A1,FR_r,FR_A00 // p7: result = A1*r + A00, rounded to double
+ br.ret.sptk b0
+};;
+// branch for handling pseudo root on (-2;-1)
+//---------------------------------------------------------------------
+// The result is formed as a product of four quadratics in x (x is in f8):
+//   f8 = PRN * prod over k=0..3 of ((x + PRk1)*x + PRk0)
+// FR_PR01 and FR_PR11 are read below without being loaded in this branch,
+// so they are expected to be set up before control arrives here, and
+// GR_ad_Co/GR_ad_Ce are expected to already address the coefficients
+// (the dispatch code is not visible here - TODO confirm).
+.align 32
+lgamma_pseudoroot:
+{ .mmi
+ ldfe FR_PR21 = [GR_ad_Co],32
+ ldfe FR_PR31 = [GR_ad_Ce],32
+ // set p9 if signgam is 32-bit int
+ // set p10 if signgam is 64-bit int
+ // (r34 == 8 selects the 8-byte store, anything else the 4-byte store)
+ cmp.eq p10,p9 = 8,r34
+};;
+{ .mmi
+ ldfe FR_PR00 = [GR_ad_Co],32
+ ldfe FR_PR10 = [GR_ad_Ce],0xF0
+ mov GR_SignOfGamma = 1
+};;
+{ .mmi
+ ldfe FR_PR20 = [GR_ad_Co],0xF0
+ ldfe FR_PR30 = [GR_ad_Ce]
+ // p8 is set when bit 0 of GR_Sig is clear; the sign is negated below
+ tbit.z p8,p0 = GR_Sig,0
+};;
+{ .mfi
+ ldfe FR_PRN = [GR_ad_Co]
+ // PRk1 = x + PRk1 (first step of each quadratic)
+ fma.s1 FR_PR01 = f8,f1,FR_PR01
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_PR11 = f8,f1,FR_PR11
+(p8) sub GR_SignOfGamma = r0,GR_SignOfGamma
+};;
+.pred.rel "mutex",p9,p10
+{ .mfi
+ // store sign of gamma(x) as 32-bit int
+(p9) st4 [r33] = GR_SignOfGamma
+ fma.s1 FR_PR21 = f8,f1,FR_PR21
+ nop.i 0
+}
+{ .mfi
+ // store sign of gamma(x) as 64-bit int
+(p10) st8 [r33] = GR_SignOfGamma
+ fma.s1 FR_PR31 = f8,f1,FR_PR31
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ // PRk1 = x*PRk1 + PRk0 (completes each quadratic)
+ fma.s1 FR_PR01 = f8,FR_PR01,FR_PR00
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_PR11 = f8,FR_PR11,FR_PR10
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_PR21 = f8,FR_PR21,FR_PR20
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_PR31 = f8,FR_PR31,FR_PR30
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ // multiply the four quadratics together pairwise
+ fma.s1 FR_PR01 = FR_PR11,FR_PR01,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_PR21 = FR_PR31,FR_PR21,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_PR01 = FR_PR21,FR_PR01,f0
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+ // scale by PRN; final result is produced in double precision under s0
+ fma.d.s0 f8 = FR_PR01,FR_PRN,f0
+ br.ret.sptk b0
+};;
+// branch for handling +/-0, NaT, QNaN, +/-INF and denormalised numbers
+//---------------------------------------------------------------------
+// signgam is always stored as 1 on this path.  Outcomes:
+//   +INF        -> return f8 unchanged
+//   NaT / NaN   -> return x*1 - x
+//   denormal    -> re-derive the integer views of the argument from
+//                  FR_NormX and jump to lgamma_common
+//   -INF        -> return f8 with the sign taken from f1
+//   otherwise   -> fall through into lgamma_singularity
+.align 32
+lgamma_spec:
+{ .mfi
+ getf.exp GR_SignExp = FR_NormX
+ fclass.m p6,p0 = f8,0x21 // is arg +INF?
+ mov GR_SignOfGamma = 1
+};;
+{ .mfi
+ getf.sig GR_ArgAsIs = FR_NormX
+ fclass.m p7,p0 = f8,0xB // is x deno?
+ // set p11 if signgam is 32-bit int
+ // set p12 if signgam is 64-bit int
+ // (p11/p12 are used here instead of p9/p10, which are written below)
+ cmp.eq p12,p11 = 8,r34
+};;
+.pred.rel "mutex",p11,p12
+{ .mfi
+ // store sign of gamma(x) as 32-bit int
+(p11) st4 [r33] = GR_SignOfGamma
+ fclass.m p8,p0 = f8,0x1C0 // is arg NaT or NaN?
+ // GR_Ind = (low 4 bits of GR_SignExp) << 8
+ dep.z GR_Ind = GR_SignExp,8,4
+}
+{ .mib
+ // store sign of gamma(x) as 64-bit int
+(p12) st8 [r33] = GR_SignOfGamma
+ // p10 is not consumed in this branch; presumably read by lgamma_common
+ cmp.lt p10,p0 = GR_SignExp,GR_ExpBias
+(p6) br.ret.spnt b0 // exit for +INF
+};;
+{ .mfi
+ and GR_Exp = GR_SignExp,GR_ExpMask
+ fclass.m p9,p0 = f8,0x22 // is arg -INF?
+ nop.i 0
+};;
+{ .mfi
+ add GR_ad_Co = GR_Ind,GR_ad_Data
+ // FR_tmp is not read in this branch; presumably computed only for its
+ // effect on the s0 status flags for a denormal input - TODO confirm
+(p7) fma.s0 FR_tmp = f8,f8,f8
+ // keep 52 bits of the significand starting at bit 11
+ extr.u GR_ArgAsIs = GR_ArgAsIs,11,52
+}
+{ .mfb
+ nop.m 0
+(p8) fms.d.s0 f8 = f8,f1,f8
+(p8) br.ret.spnt b0 // exit for NaT and NaN
+};;
+{ .mib
+ nop.m 0
+ shr.u GR_Arg = GR_ArgAsIs,48
+(p7) br.cond.sptk lgamma_common
+};;
+{ .mfb
+ nop.m 0
+ // sign comes from f1, exponent and significand from f8
+(p9) fmerge.s f8 = f1,f8
+(p9) br.ret.spnt b0 // exit -INF
+};;
+// branch for handling negative integers and +/-0
+//---------------------------------------------------------------------
+// Saves the original argument in FR_X, replaces f8 with the frcpa result
+// for 1/0, selects error tag 106 and continues at lgamma_libm_err.
+// GR_SignOfGamma is 1, or -1 when x is -0.  The actual store of the sign
+// (predicated on p9/p10 set here) is done in lgamma_libm_err.
+.align 32
+lgamma_singularity:
+{ .mfi
+ // keep the signgam pointer in a scratch register for lgamma_libm_err
+ mov GR_ad_SignGam = r33
+ fclass.m p6,p0 = f8, 0x6 // is x -0?
+ mov GR_SignOfGamma = 1
+}
+{ .mfi
+ // set p9 if signgam is 32-bit int
+ // set p10 if signgam is 64-bit int
+ cmp.eq p10,p9 = 8,r34
+ // FR_X = 0*0 + x, i.e. a copy of the argument for the error handler
+ fma.s1 FR_X = f0,f0,f8
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ // 1/0 under s0: produces the returned value for the singularity
+ frcpa.s0 f8,p0 = f1,f0
+ mov GR_TAG = 106 // negative
+}
+{ .mib
+ nop.m 0
+(p6) sub GR_SignOfGamma = r0,GR_SignOfGamma
+ br.cond.sptk lgamma_libm_err
+};;
+// overflow (x > OVERFLOW_BOUNDARY)
+//---------------------------------------------------------------------
+// Saves the argument in FR_X, selects error tag 105, forces an
+// overflowing double-precision result into f8 and falls through into
+// lgamma_libm_err.  GR_SignOfGamma is 1 on this path.
+.align 32
+lgamma_overflow:
+{ .mfi
+ mov GR_SignOfGamma = 1
+ nop.f 0
+ // exponent field value for the large operand built below
+ mov r8 = 0x1FFFE
+};;
+{ .mfi
+ setf.exp f9 = r8
+ // copy of the argument for the error handler
+ fmerge.s FR_X = f8,f8
+ mov GR_TAG = 105 // overflow
+};;
+{ .mfi
+ // keep the signgam pointer in a scratch register for lgamma_libm_err
+ mov GR_ad_SignGam = r33
+ nop.f 0
+ // set p9 if signgam is 32-bit int
+ // set p10 if signgam is 64-bit int
+ cmp.eq p10,p9 = 8,r34
+}
+{ .mfi
+ nop.m 0
+ fma.d.s0 f8 = f9,f9,f0 // Set I,O and +INF result
+ nop.i 0
+};;
+// common tail of the error exits (reached from lgamma_singularity by
+// branch and from lgamma_overflow by fall-through): allocates the
+// register frame, copies the error tag, stores the sign of gamma(x) and
+// then runs on into __libm_error_region, which follows directly.
+// Expects GR_TAG, GR_ad_SignGam, GR_SignOfGamma and p9/p10 to be set.
+//---------------------------------------------------------------------
+.align 32
+lgamma_libm_err:
+{ .mmi
+ alloc r32 = ar.pfs,1,4,4,0
+ mov GR_Parameter_TAG = GR_TAG
+ nop.i 0
+};;
+.pred.rel "mutex",p9,p10
+{ .mmi
+ // store sign of gamma(x) as 32-bit int
+(p9) st4 [GR_ad_SignGam] = GR_SignOfGamma
+ // store sign of gamma(x) as 64-bit int
+(p10) st8 [GR_ad_SignGam] = GR_SignOfGamma
+ nop.i 0
+};;
+GLOBAL_LIBM_END(__libm_lgamma)
+
+// Error-reporting stub: spills the argument(s) and the provisional result
+// to a 64-byte stack frame, calls __libm_error_support and returns the
+// (possibly modified) result reloaded from the frame.
+// Frame layout relative to the new sp, as established below:
+//   sp+16 : FR_X       (address in GR_Parameter_X)
+//   sp+32 : FR_Y       (address in GR_Parameter_Y at the time of the call)
+//   sp+48 : FR_RESULT  (address in GR_Parameter_RESULT)
+// GR_Parameter_TAG is expected to have been set before entry.
+LOCAL_LIBM_ENTRY(__libm_error_region)
+.prologue
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+{ .mmi
+ // the post-increment leaves GR_Parameter_Y at sp+48 (the result slot)
+ stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+.body
+{ .mib
+ stfd [GR_Parameter_X] = FR_X // STORE Parameter 1
+ // on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
+}
+{ .mib
+ stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3
+ // on stack
+ // step GR_Parameter_Y back to sp+32, where FR_Y was stored
+ add GR_Parameter_Y = -16,GR_Parameter_Y
+ br.call.sptk b0=__libm_error_support# // Call error handling
+ // function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ // recompute the result slot address from sp after the call
+ add GR_Parameter_RESULT = 48,sp
+};;
+{ .mmi
+ ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+.type __libm_error_support#,@function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/libm_lgammaf.S b/sysdeps/ia64/fpu/libm_lgammaf.S
new file mode 100644
index 0000000000..83cffd60fa
--- /dev/null
+++ b/sysdeps/ia64/fpu/libm_lgammaf.S
@@ -0,0 +1,2189 @@
+.file "libm_lgammaf.s"
+
+
+// Copyright (c) 2002 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2002 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
+// LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,
+// EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code,and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//*********************************************************************
+//
+// History:
+// 01/10/02 Initial version
+// 01/25/02 Corrected parameter store, load, and tag for __libm_error_support
+// 02/01/02 Added support of SIGN(GAMMA(x)) calculation
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 09/16/02 Improved accuracy on intervals reduced to [1;1.25]
+// 10/21/02 Now it returns SIGN(GAMMA(x))=-1 for negative zero
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+//*********************************************************************
+//
+//*********************************************************************
+//
+// Function: __libm_lgammaf(float x, int* signgam, int szsigngam)
+// computes the principal value of the logarithm of the GAMMA function
+// of x. Signum of GAMMA(x) is stored to memory starting at the address
+// specified by the signgam.
+//
+//*********************************************************************
+//
+// Resources Used:
+//
+// Floating-Point Registers: f6-f15
+// f32-f97
+//
+// General Purpose Registers:
+// r8-r11
+// r14-r31
+// r32-r36
+// r37-r40 (Used to pass arguments to error handling routine)
+//
+// Predicate Registers: p6-p15
+//
+//*********************************************************************
+//
+// IEEE Special Conditions:
+//
+// lgamma(+inf) = +inf
+// lgamma(-inf) = +inf
+// lgamma(+/-0) = +inf
+// lgamma(x<0, x - integer) = +inf
+// lgamma(SNaN) = QNaN
+// lgamma(QNaN) = QNaN
+//
+//*********************************************************************
+//
+// Overview
+//
+// The method consists of the following cases.
+//
+// If 2^13 <= x < OVERFLOW_BOUNDARY use case lgammaf_pstirling;
+// else if 1 < x < 2^13 use case lgammaf_regular;
+// else if -9 < x < 1 use case lgammaf_negrecursion;
+// else if -2^13 < x < -9 use case lgammaf_negpoly;
+// else if x < -2^13 use case lgammaf_negstirling;
+// else if x is close to negative
+// roots of ln(GAMMA(x)) use case lgammaf_negroots;
+//
+//
+// Case 2^13 <= x < OVERFLOW_BOUNDARY
+// ----------------------------------
+// Here we use algorithm based on the Stirling formula:
+// ln(GAMMA(x)) = ln(sqrt(2*Pi)) + (x-0.5)*ln(x) - x
+//
+// Case 1 < x < 2^13
+// -----------------
+// To calculate ln(GAMMA(x)) for such arguments we use polynomial
+// approximation on the following intervals: [1.0; 1.25), [1.25; 1.5),
+// [1.5; 1.75), [1.75; 2), [2; 4), [2^i; 2^(i+1)), i=2..12
+//
+// Following variants of approximation and argument reduction are used:
+// 1. [1.0; 1.25)
+// ln(GAMMA(x)) ~ (x-1.0)*P7(x)
+//
+// 2. [1.25; 1.5)
+// ln(GAMMA(x)) ~ ln(GAMMA(x0))+(x-x0)*P8(x-x0),
+// where x0 - point of local minimum on [1;2] rounded to nearest double
+// precision number.
+//
+// 3. [1.5; 1.75)
+// ln(GAMMA(x)) ~ P8(x)
+//
+// 4. [1.75; 2.0)
+// ln(GAMMA(x)) ~ (x-2)*P7(x)
+//
+// 5. [2; 4)
+// ln(GAMMA(x)) ~ (x-2)*P10(x)
+//
+// 6. [2^i; 2^(i+1)), i=2..12
+// ln(GAMMA(x)) ~ P10((x-2^i)/2^i)
+//
+// Case -9 < x < 1
+// ---------------
+// Here we use the recursive formula:
+// ln(GAMMA(x)) = ln(GAMMA(x+1)) - ln(x)
+//
+// Using this formula we reduce argument to base interval [1.0; 2.0]
+//
+// Case -2^13 < x < -9
+// --------------------
+// Here we use the formula:
+// ln(GAMMA(x)) = ln(Pi/(|x|*GAMMA(|x|)*sin(Pi*|x|))) =
+// = -ln(|x|) - ln(GAMMA(|x|)) - ln(sin(Pi*r)/(Pi*r)) - ln(|r|)
+// where r = x - rounded_to_nearest(x), i.e |r| <= 0.5 and
+// ln(sin(Pi*r)/(Pi*r)) is approximated by 8-degree polynomial of r^2
+//
+// Case x < -2^13
+// --------------
+// Here we use algorithm based on the Stirling formula:
+// ln(GAMMA(x)) = -ln(sqrt(2*Pi)) + (|x|-0.5)ln(x) - |x| -
+// - ln(sin(Pi*r)/(Pi*r)) - ln(|r|)
+// where r = x - rounded_to_nearest(x).
+//
+// Neighbourhoods of negative roots
+// --------------------------------
+// Here we use polynomial approximation
+// ln(GAMMA(x)) = ln(GAMMA(x0)) + (x-x0)*P14(x-x0),
+// where x0 is a root of ln(GAMMA(x)) rounded to nearest double
+// precision number.
+//
+//
+// Calculation of logarithm
+// ------------------------
+// Consider x = 2^N * xf so
+// ln(x) = ln(frcpa(x)*x/frcpa(x))
+// = ln(1/frcpa(x)) + ln(frcpa(x)*x)
+//
+// frcpa(x) = 2^(-N) * frcpa(xf)
+//
+// ln(1/frcpa(x)) = -ln(2^(-N)) - ln(frcpa(xf))
+// = N*ln(2) - ln(frcpa(xf))
+// = N*ln(2) + ln(1/frcpa(xf))
+//
+// ln(x) = ln(1/frcpa(x)) + ln(frcpa(x)*x) =
+// = N*ln(2) + ln(1/frcpa(xf)) + ln(frcpa(x)*x)
+// = N*ln(2) + T + ln(frcpa(x)*x)
+//
+// Let r = 1 - frcpa(x)*x, note that r is quite small by
+// absolute value so
+//
+// ln(x) = N*ln(2) + T + ln(1+r) ~ N*ln(2) + T + Series(r),
+// where T - is precomputed tabular value,
+// Series(r) = (P3*r + P2)*r^2 + (P1*r + 1)
+//
+//*********************************************************************
+
+// Symbolic names for the general registers used by this file.
+// Several names are bound to the same physical register (for example
+// GR_TAG and GR_ad_Data are both r8, and r25/r26/r28 each carry four or
+// five names), so names that share a register cannot hold distinct live
+// values at the same time.
+GR_TAG = r8
+GR_ad_Data = r8
+GR_ad_Co = r9
+GR_ad_SignGam = r10
+GR_ad_Ce = r10
+GR_SignExp = r11
+
+GR_ad_C650 = r14
+GR_ad_RootCo = r14
+GR_ad_C0 = r15
+GR_Dx = r15
+GR_Ind = r16
+GR_Offs = r17
+GR_IntNum = r17
+GR_ExpBias = r18
+GR_ExpMask = r19
+GR_Ind4T = r20
+GR_RootInd = r20
+GR_Sig = r21
+GR_Exp = r22
+GR_PureExp = r23
+GR_ad_C43 = r24
+GR_StirlBound = r25
+GR_ad_T = r25
+GR_IndX8 = r25
+GR_Neg2 = r25
+GR_2xDx = r25
+GR_SingBound = r26
+GR_IndX2 = r26
+GR_Neg4 = r26
+GR_ad_RootCe = r26
+GR_Arg = r27
+GR_ExpOf2 = r28
+GR_fff7 = r28
+GR_Root = r28
+GR_ReqBound = r28
+GR_N = r29
+GR_ad_Root = r30
+GR_ad_OvfBound = r30
+GR_SignOfGamma = r31
+
+// stacked registers r33-r36; by their names these hold saved b0, ar.pfs,
+// gp and sp around the error-handler call (usage not visible here)
+GR_SAVE_B0 = r33
+GR_SAVE_PFS = r34
+GR_SAVE_GP = r35
+GR_SAVE_SP = r36
+
+// r37-r40: used to pass arguments to the error handling routine
+GR_Parameter_X = r37
+GR_Parameter_Y = r38
+GR_Parameter_RESULT = r39
+GR_Parameter_TAG = r40
+
+//*********************************************************************
+
+// Symbolic names for the floating-point registers used by this file.
+// As with the general registers, some names share one physical register
+// (e.g. FR_X and FR_x4 are both f10, FR_xm2 and FR_w are both f11, and
+// the FR_S* / FR_R* / FR_rx* names overlap on f78-f84), so such names
+// cannot hold distinct live values at the same time.
+FR_X = f10
+FR_Y = f1 // lgammaf is single argument function
+FR_RESULT = f8
+
+FR_x = f6
+FR_x2 = f7
+
+FR_x3 = f9
+FR_x4 = f10
+FR_xm2 = f11
+FR_w = f11
+FR_w2 = f12
+FR_Q32 = f13
+FR_Q10 = f14
+FR_InvX = f15
+
+FR_NormX = f32
+
+// polynomial coefficients A0..A10 (labelled A0-A10 in the data tables)
+FR_A0 = f33
+FR_A1 = f34
+FR_A2 = f35
+FR_A3 = f36
+FR_A4 = f37
+FR_A5 = f38
+FR_A6 = f39
+FR_A7 = f40
+FR_A8 = f41
+FR_A9 = f42
+FR_A10 = f43
+
+FR_int_N = f44
+FR_P3 = f45
+FR_P2 = f46
+FR_P1 = f47
+FR_LocalMin = f48
+FR_Ln2 = f49
+FR_05 = f50
+FR_LnSqrt2Pi = f51
+FR_3 = f52
+FR_r = f53
+FR_r2 = f54
+FR_T = f55
+FR_N = f56
+FR_xm05 = f57
+FR_int_Ln = f58
+FR_P32 = f59
+FR_P10 = f60
+
+FR_Xf = f61
+FR_InvXf = f62
+FR_rf = f63
+FR_rf2 = f64
+FR_Tf = f65
+FR_Nf = f66
+FR_xm05f = f67
+FR_P32f = f68
+FR_P10f = f69
+FR_Lnf = f70
+FR_Xf2 = f71
+FR_Xf4 = f72
+FR_Xf8 = f73
+FR_Ln = f74
+FR_xx = f75
+FR_Root = f75
+FR_Req = f76
+FR_1pXf = f77
+
+// S16..S2 and R3..R0 match the coefficient labels in the data tables;
+// the two families share f78-f81
+FR_S16 = f78
+FR_R3 = f78
+FR_S14 = f79
+FR_R2 = f79
+FR_S12 = f80
+FR_R1 = f80
+FR_S10 = f81
+FR_R0 = f81
+FR_S8 = f82
+FR_rx = f82
+FR_S6 = f83
+FR_rx2 = f84
+FR_S4 = f84
+FR_S2 = f85
+
+FR_Xp1 = f86
+FR_Xp2 = f87
+FR_Xp3 = f88
+FR_Xp4 = f89
+FR_Xp5 = f90
+FR_Xp6 = f91
+FR_Xp7 = f92
+FR_Xp8 = f93
+FR_OverflowBound = f93
+
+FR_2 = f94
+FR_tmp = f95
+FR_int_Ntrunc = f96
+FR_Ntrunc = f97
+
+//*********************************************************************
+
+RODATA
+.align 32
+LOCAL_OBJECT_START(lgammaf_data)
+log_table_1:
+data8 0xbfd0001008f39d59 // P3
+data8 0x3fd5556073e0c45a // P2
+data8 0x3fe62e42fefa39ef // ln(2)
+data8 0x3fe0000000000000 // 0.5
+//
+data8 0x3F60040155D5889E //ln(1/frcpa(1+ 0/256)
+data8 0x3F78121214586B54 //ln(1/frcpa(1+ 1/256)
+data8 0x3F841929F96832F0 //ln(1/frcpa(1+ 2/256)
+data8 0x3F8C317384C75F06 //ln(1/frcpa(1+ 3/256)
+data8 0x3F91A6B91AC73386 //ln(1/frcpa(1+ 4/256)
+data8 0x3F95BA9A5D9AC039 //ln(1/frcpa(1+ 5/256)
+data8 0x3F99D2A8074325F4 //ln(1/frcpa(1+ 6/256)
+data8 0x3F9D6B2725979802 //ln(1/frcpa(1+ 7/256)
+data8 0x3FA0C58FA19DFAAA //ln(1/frcpa(1+ 8/256)
+data8 0x3FA2954C78CBCE1B //ln(1/frcpa(1+ 9/256)
+data8 0x3FA4A94D2DA96C56 //ln(1/frcpa(1+ 10/256)
+data8 0x3FA67C94F2D4BB58 //ln(1/frcpa(1+ 11/256)
+data8 0x3FA85188B630F068 //ln(1/frcpa(1+ 12/256)
+data8 0x3FAA6B8ABE73AF4C //ln(1/frcpa(1+ 13/256)
+data8 0x3FAC441E06F72A9E //ln(1/frcpa(1+ 14/256)
+data8 0x3FAE1E6713606D07 //ln(1/frcpa(1+ 15/256)
+data8 0x3FAFFA6911AB9301 //ln(1/frcpa(1+ 16/256)
+data8 0x3FB0EC139C5DA601 //ln(1/frcpa(1+ 17/256)
+data8 0x3FB1DBD2643D190B //ln(1/frcpa(1+ 18/256)
+data8 0x3FB2CC7284FE5F1C //ln(1/frcpa(1+ 19/256)
+data8 0x3FB3BDF5A7D1EE64 //ln(1/frcpa(1+ 20/256)
+data8 0x3FB4B05D7AA012E0 //ln(1/frcpa(1+ 21/256)
+data8 0x3FB580DB7CEB5702 //ln(1/frcpa(1+ 22/256)
+data8 0x3FB674F089365A7A //ln(1/frcpa(1+ 23/256)
+data8 0x3FB769EF2C6B568D //ln(1/frcpa(1+ 24/256)
+data8 0x3FB85FD927506A48 //ln(1/frcpa(1+ 25/256)
+data8 0x3FB9335E5D594989 //ln(1/frcpa(1+ 26/256)
+data8 0x3FBA2B0220C8E5F5 //ln(1/frcpa(1+ 27/256)
+data8 0x3FBB0004AC1A86AC //ln(1/frcpa(1+ 28/256)
+data8 0x3FBBF968769FCA11 //ln(1/frcpa(1+ 29/256)
+data8 0x3FBCCFEDBFEE13A8 //ln(1/frcpa(1+ 30/256)
+data8 0x3FBDA727638446A2 //ln(1/frcpa(1+ 31/256)
+data8 0x3FBEA3257FE10F7A //ln(1/frcpa(1+ 32/256)
+data8 0x3FBF7BE9FEDBFDE6 //ln(1/frcpa(1+ 33/256)
+data8 0x3FC02AB352FF25F4 //ln(1/frcpa(1+ 34/256)
+data8 0x3FC097CE579D204D //ln(1/frcpa(1+ 35/256)
+data8 0x3FC1178E8227E47C //ln(1/frcpa(1+ 36/256)
+data8 0x3FC185747DBECF34 //ln(1/frcpa(1+ 37/256)
+data8 0x3FC1F3B925F25D41 //ln(1/frcpa(1+ 38/256)
+data8 0x3FC2625D1E6DDF57 //ln(1/frcpa(1+ 39/256)
+data8 0x3FC2D1610C86813A //ln(1/frcpa(1+ 40/256)
+data8 0x3FC340C59741142E //ln(1/frcpa(1+ 41/256)
+data8 0x3FC3B08B6757F2A9 //ln(1/frcpa(1+ 42/256)
+data8 0x3FC40DFB08378003 //ln(1/frcpa(1+ 43/256)
+data8 0x3FC47E74E8CA5F7C //ln(1/frcpa(1+ 44/256)
+data8 0x3FC4EF51F6466DE4 //ln(1/frcpa(1+ 45/256)
+data8 0x3FC56092E02BA516 //ln(1/frcpa(1+ 46/256)
+data8 0x3FC5D23857CD74D5 //ln(1/frcpa(1+ 47/256)
+data8 0x3FC6313A37335D76 //ln(1/frcpa(1+ 48/256)
+data8 0x3FC6A399DABBD383 //ln(1/frcpa(1+ 49/256)
+data8 0x3FC70337DD3CE41B //ln(1/frcpa(1+ 50/256)
+data8 0x3FC77654128F6127 //ln(1/frcpa(1+ 51/256)
+data8 0x3FC7E9D82A0B022D //ln(1/frcpa(1+ 52/256)
+data8 0x3FC84A6B759F512F //ln(1/frcpa(1+ 53/256)
+data8 0x3FC8AB47D5F5A310 //ln(1/frcpa(1+ 54/256)
+data8 0x3FC91FE49096581B //ln(1/frcpa(1+ 55/256)
+data8 0x3FC981634011AA75 //ln(1/frcpa(1+ 56/256)
+data8 0x3FC9F6C407089664 //ln(1/frcpa(1+ 57/256)
+data8 0x3FCA58E729348F43 //ln(1/frcpa(1+ 58/256)
+data8 0x3FCABB55C31693AD //ln(1/frcpa(1+ 59/256)
+data8 0x3FCB1E104919EFD0 //ln(1/frcpa(1+ 60/256)
+data8 0x3FCB94EE93E367CB //ln(1/frcpa(1+ 61/256)
+data8 0x3FCBF851C067555F //ln(1/frcpa(1+ 62/256)
+data8 0x3FCC5C0254BF23A6 //ln(1/frcpa(1+ 63/256)
+data8 0x3FCCC000C9DB3C52 //ln(1/frcpa(1+ 64/256)
+data8 0x3FCD244D99C85674 //ln(1/frcpa(1+ 65/256)
+data8 0x3FCD88E93FB2F450 //ln(1/frcpa(1+ 66/256)
+data8 0x3FCDEDD437EAEF01 //ln(1/frcpa(1+ 67/256)
+data8 0x3FCE530EFFE71012 //ln(1/frcpa(1+ 68/256)
+data8 0x3FCEB89A1648B971 //ln(1/frcpa(1+ 69/256)
+data8 0x3FCF1E75FADF9BDE //ln(1/frcpa(1+ 70/256)
+data8 0x3FCF84A32EAD7C35 //ln(1/frcpa(1+ 71/256)
+data8 0x3FCFEB2233EA07CD //ln(1/frcpa(1+ 72/256)
+data8 0x3FD028F9C7035C1C //ln(1/frcpa(1+ 73/256)
+data8 0x3FD05C8BE0D9635A //ln(1/frcpa(1+ 74/256)
+data8 0x3FD085EB8F8AE797 //ln(1/frcpa(1+ 75/256)
+data8 0x3FD0B9C8E32D1911 //ln(1/frcpa(1+ 76/256)
+data8 0x3FD0EDD060B78081 //ln(1/frcpa(1+ 77/256)
+data8 0x3FD122024CF0063F //ln(1/frcpa(1+ 78/256)
+data8 0x3FD14BE2927AECD4 //ln(1/frcpa(1+ 79/256)
+data8 0x3FD180618EF18ADF //ln(1/frcpa(1+ 80/256)
+data8 0x3FD1B50BBE2FC63B //ln(1/frcpa(1+ 81/256)
+data8 0x3FD1DF4CC7CF242D //ln(1/frcpa(1+ 82/256)
+data8 0x3FD214456D0EB8D4 //ln(1/frcpa(1+ 83/256)
+data8 0x3FD23EC5991EBA49 //ln(1/frcpa(1+ 84/256)
+data8 0x3FD2740D9F870AFB //ln(1/frcpa(1+ 85/256)
+data8 0x3FD29ECDABCDFA04 //ln(1/frcpa(1+ 86/256)
+data8 0x3FD2D46602ADCCEE //ln(1/frcpa(1+ 87/256)
+data8 0x3FD2FF66B04EA9D4 //ln(1/frcpa(1+ 88/256)
+data8 0x3FD335504B355A37 //ln(1/frcpa(1+ 89/256)
+data8 0x3FD360925EC44F5D //ln(1/frcpa(1+ 90/256)
+data8 0x3FD38BF1C3337E75 //ln(1/frcpa(1+ 91/256)
+data8 0x3FD3C25277333184 //ln(1/frcpa(1+ 92/256)
+data8 0x3FD3EDF463C1683E //ln(1/frcpa(1+ 93/256)
+data8 0x3FD419B423D5E8C7 //ln(1/frcpa(1+ 94/256)
+data8 0x3FD44591E0539F49 //ln(1/frcpa(1+ 95/256)
+data8 0x3FD47C9175B6F0AD //ln(1/frcpa(1+ 96/256)
+data8 0x3FD4A8B341552B09 //ln(1/frcpa(1+ 97/256)
+data8 0x3FD4D4F3908901A0 //ln(1/frcpa(1+ 98/256)
+data8 0x3FD501528DA1F968 //ln(1/frcpa(1+ 99/256)
+data8 0x3FD52DD06347D4F6 //ln(1/frcpa(1+ 100/256)
+data8 0x3FD55A6D3C7B8A8A //ln(1/frcpa(1+ 101/256)
+data8 0x3FD5925D2B112A59 //ln(1/frcpa(1+ 102/256)
+data8 0x3FD5BF406B543DB2 //ln(1/frcpa(1+ 103/256)
+data8 0x3FD5EC433D5C35AE //ln(1/frcpa(1+ 104/256)
+data8 0x3FD61965CDB02C1F //ln(1/frcpa(1+ 105/256)
+data8 0x3FD646A84935B2A2 //ln(1/frcpa(1+ 106/256)
+data8 0x3FD6740ADD31DE94 //ln(1/frcpa(1+ 107/256)
+data8 0x3FD6A18DB74A58C5 //ln(1/frcpa(1+ 108/256)
+data8 0x3FD6CF31058670EC //ln(1/frcpa(1+ 109/256)
+data8 0x3FD6F180E852F0BA //ln(1/frcpa(1+ 110/256)
+data8 0x3FD71F5D71B894F0 //ln(1/frcpa(1+ 111/256)
+data8 0x3FD74D5AEFD66D5C //ln(1/frcpa(1+ 112/256)
+data8 0x3FD77B79922BD37E //ln(1/frcpa(1+ 113/256)
+data8 0x3FD7A9B9889F19E2 //ln(1/frcpa(1+ 114/256)
+data8 0x3FD7D81B037EB6A6 //ln(1/frcpa(1+ 115/256)
+data8 0x3FD8069E33827231 //ln(1/frcpa(1+ 116/256)
+data8 0x3FD82996D3EF8BCB //ln(1/frcpa(1+ 117/256)
+data8 0x3FD85855776DCBFB //ln(1/frcpa(1+ 118/256)
+data8 0x3FD8873658327CCF //ln(1/frcpa(1+ 119/256)
+data8 0x3FD8AA75973AB8CF //ln(1/frcpa(1+ 120/256)
+data8 0x3FD8D992DC8824E5 //ln(1/frcpa(1+ 121/256)
+data8 0x3FD908D2EA7D9512 //ln(1/frcpa(1+ 122/256)
+data8 0x3FD92C59E79C0E56 //ln(1/frcpa(1+ 123/256)
+data8 0x3FD95BD750EE3ED3 //ln(1/frcpa(1+ 124/256)
+data8 0x3FD98B7811A3EE5B //ln(1/frcpa(1+ 125/256)
+data8 0x3FD9AF47F33D406C //ln(1/frcpa(1+ 126/256)
+data8 0x3FD9DF270C1914A8 //ln(1/frcpa(1+ 127/256)
+data8 0x3FDA0325ED14FDA4 //ln(1/frcpa(1+ 128/256)
+data8 0x3FDA33440224FA79 //ln(1/frcpa(1+ 129/256)
+data8 0x3FDA57725E80C383 //ln(1/frcpa(1+ 130/256)
+data8 0x3FDA87D0165DD199 //ln(1/frcpa(1+ 131/256)
+data8 0x3FDAAC2E6C03F896 //ln(1/frcpa(1+ 132/256)
+data8 0x3FDADCCC6FDF6A81 //ln(1/frcpa(1+ 133/256)
+data8 0x3FDB015B3EB1E790 //ln(1/frcpa(1+ 134/256)
+data8 0x3FDB323A3A635948 //ln(1/frcpa(1+ 135/256)
+data8 0x3FDB56FA04462909 //ln(1/frcpa(1+ 136/256)
+data8 0x3FDB881AA659BC93 //ln(1/frcpa(1+ 137/256)
+data8 0x3FDBAD0BEF3DB165 //ln(1/frcpa(1+ 138/256)
+data8 0x3FDBD21297781C2F //ln(1/frcpa(1+ 139/256)
+data8 0x3FDC039236F08819 //ln(1/frcpa(1+ 140/256)
+data8 0x3FDC28CB1E4D32FD //ln(1/frcpa(1+ 141/256)
+data8 0x3FDC4E19B84723C2 //ln(1/frcpa(1+ 142/256)
+data8 0x3FDC7FF9C74554C9 //ln(1/frcpa(1+ 143/256)
+data8 0x3FDCA57B64E9DB05 //ln(1/frcpa(1+ 144/256)
+data8 0x3FDCCB130A5CEBB0 //ln(1/frcpa(1+ 145/256)
+data8 0x3FDCF0C0D18F326F //ln(1/frcpa(1+ 146/256)
+data8 0x3FDD232075B5A201 //ln(1/frcpa(1+ 147/256)
+data8 0x3FDD490246DEFA6B //ln(1/frcpa(1+ 148/256)
+data8 0x3FDD6EFA918D25CD //ln(1/frcpa(1+ 149/256)
+data8 0x3FDD9509707AE52F //ln(1/frcpa(1+ 150/256)
+data8 0x3FDDBB2EFE92C554 //ln(1/frcpa(1+ 151/256)
+data8 0x3FDDEE2F3445E4AF //ln(1/frcpa(1+ 152/256)
+data8 0x3FDE148A1A2726CE //ln(1/frcpa(1+ 153/256)
+data8 0x3FDE3AFC0A49FF40 //ln(1/frcpa(1+ 154/256)
+data8 0x3FDE6185206D516E //ln(1/frcpa(1+ 155/256)
+data8 0x3FDE882578823D52 //ln(1/frcpa(1+ 156/256)
+data8 0x3FDEAEDD2EAC990C //ln(1/frcpa(1+ 157/256)
+data8 0x3FDED5AC5F436BE3 //ln(1/frcpa(1+ 158/256)
+data8 0x3FDEFC9326D16AB9 //ln(1/frcpa(1+ 159/256)
+data8 0x3FDF2391A2157600 //ln(1/frcpa(1+ 160/256)
+data8 0x3FDF4AA7EE03192D //ln(1/frcpa(1+ 161/256)
+data8 0x3FDF71D627C30BB0 //ln(1/frcpa(1+ 162/256)
+data8 0x3FDF991C6CB3B379 //ln(1/frcpa(1+ 163/256)
+data8 0x3FDFC07ADA69A910 //ln(1/frcpa(1+ 164/256)
+data8 0x3FDFE7F18EB03D3E //ln(1/frcpa(1+ 165/256)
+data8 0x3FE007C053C5002E //ln(1/frcpa(1+ 166/256)
+data8 0x3FE01B942198A5A1 //ln(1/frcpa(1+ 167/256)
+data8 0x3FE02F74400C64EB //ln(1/frcpa(1+ 168/256)
+data8 0x3FE04360BE7603AD //ln(1/frcpa(1+ 169/256)
+data8 0x3FE05759AC47FE34 //ln(1/frcpa(1+ 170/256)
+data8 0x3FE06B5F1911CF52 //ln(1/frcpa(1+ 171/256)
+data8 0x3FE078BF0533C568 //ln(1/frcpa(1+ 172/256)
+data8 0x3FE08CD9687E7B0E //ln(1/frcpa(1+ 173/256)
+data8 0x3FE0A10074CF9019 //ln(1/frcpa(1+ 174/256)
+data8 0x3FE0B5343A234477 //ln(1/frcpa(1+ 175/256)
+data8 0x3FE0C974C89431CE //ln(1/frcpa(1+ 176/256)
+data8 0x3FE0DDC2305B9886 //ln(1/frcpa(1+ 177/256)
+data8 0x3FE0EB524BAFC918 //ln(1/frcpa(1+ 178/256)
+data8 0x3FE0FFB54213A476 //ln(1/frcpa(1+ 179/256)
+data8 0x3FE114253DA97D9F //ln(1/frcpa(1+ 180/256)
+data8 0x3FE128A24F1D9AFF //ln(1/frcpa(1+ 181/256)
+data8 0x3FE1365252BF0865 //ln(1/frcpa(1+ 182/256)
+data8 0x3FE14AE558B4A92D //ln(1/frcpa(1+ 183/256)
+data8 0x3FE15F85A19C765B //ln(1/frcpa(1+ 184/256)
+data8 0x3FE16D4D38C119FA //ln(1/frcpa(1+ 185/256)
+data8 0x3FE18203C20DD133 //ln(1/frcpa(1+ 186/256)
+data8 0x3FE196C7BC4B1F3B //ln(1/frcpa(1+ 187/256)
+data8 0x3FE1A4A738B7A33C //ln(1/frcpa(1+ 188/256)
+data8 0x3FE1B981C0C9653D //ln(1/frcpa(1+ 189/256)
+data8 0x3FE1CE69E8BB106B //ln(1/frcpa(1+ 190/256)
+data8 0x3FE1DC619DE06944 //ln(1/frcpa(1+ 191/256)
+data8 0x3FE1F160A2AD0DA4 //ln(1/frcpa(1+ 192/256)
+data8 0x3FE2066D7740737E //ln(1/frcpa(1+ 193/256)
+data8 0x3FE2147DBA47A394 //ln(1/frcpa(1+ 194/256)
+data8 0x3FE229A1BC5EBAC3 //ln(1/frcpa(1+ 195/256)
+data8 0x3FE237C1841A502E //ln(1/frcpa(1+ 196/256)
+data8 0x3FE24CFCE6F80D9A //ln(1/frcpa(1+ 197/256)
+data8 0x3FE25B2C55CD5762 //ln(1/frcpa(1+ 198/256)
+data8 0x3FE2707F4D5F7C41 //ln(1/frcpa(1+ 199/256)
+data8 0x3FE285E0842CA384 //ln(1/frcpa(1+ 200/256)
+data8 0x3FE294294708B773 //ln(1/frcpa(1+ 201/256)
+data8 0x3FE2A9A2670AFF0C //ln(1/frcpa(1+ 202/256)
+data8 0x3FE2B7FB2C8D1CC1 //ln(1/frcpa(1+ 203/256)
+data8 0x3FE2C65A6395F5F5 //ln(1/frcpa(1+ 204/256)
+data8 0x3FE2DBF557B0DF43 //ln(1/frcpa(1+ 205/256)
+data8 0x3FE2EA64C3F97655 //ln(1/frcpa(1+ 206/256)
+data8 0x3FE3001823684D73 //ln(1/frcpa(1+ 207/256)
+data8 0x3FE30E97E9A8B5CD //ln(1/frcpa(1+ 208/256)
+data8 0x3FE32463EBDD34EA //ln(1/frcpa(1+ 209/256)
+data8 0x3FE332F4314AD796 //ln(1/frcpa(1+ 210/256)
+data8 0x3FE348D90E7464D0 //ln(1/frcpa(1+ 211/256)
+data8 0x3FE35779F8C43D6E //ln(1/frcpa(1+ 212/256)
+data8 0x3FE36621961A6A99 //ln(1/frcpa(1+ 213/256)
+data8 0x3FE37C299F3C366A //ln(1/frcpa(1+ 214/256)
+data8 0x3FE38AE2171976E7 //ln(1/frcpa(1+ 215/256)
+data8 0x3FE399A157A603E7 //ln(1/frcpa(1+ 216/256)
+data8 0x3FE3AFCCFE77B9D1 //ln(1/frcpa(1+ 217/256)
+data8 0x3FE3BE9D503533B5 //ln(1/frcpa(1+ 218/256)
+data8 0x3FE3CD7480B4A8A3 //ln(1/frcpa(1+ 219/256)
+data8 0x3FE3E3C43918F76C //ln(1/frcpa(1+ 220/256)
+data8 0x3FE3F2ACB27ED6C7 //ln(1/frcpa(1+ 221/256)
+data8 0x3FE4019C2125CA93 //ln(1/frcpa(1+ 222/256)
+data8 0x3FE4181061389722 //ln(1/frcpa(1+ 223/256)
+data8 0x3FE42711518DF545 //ln(1/frcpa(1+ 224/256)
+data8 0x3FE436194E12B6BF //ln(1/frcpa(1+ 225/256)
+data8 0x3FE445285D68EA69 //ln(1/frcpa(1+ 226/256)
+data8 0x3FE45BCC464C893A //ln(1/frcpa(1+ 227/256)
+data8 0x3FE46AED21F117FC //ln(1/frcpa(1+ 228/256)
+data8 0x3FE47A1527E8A2D3 //ln(1/frcpa(1+ 229/256)
+data8 0x3FE489445EFFFCCC //ln(1/frcpa(1+ 230/256)
+data8 0x3FE4A018BCB69835 //ln(1/frcpa(1+ 231/256)
+data8 0x3FE4AF5A0C9D65D7 //ln(1/frcpa(1+ 232/256)
+data8 0x3FE4BEA2A5BDBE87 //ln(1/frcpa(1+ 233/256)
+data8 0x3FE4CDF28F10AC46 //ln(1/frcpa(1+ 234/256)
+data8 0x3FE4DD49CF994058 //ln(1/frcpa(1+ 235/256)
+data8 0x3FE4ECA86E64A684 //ln(1/frcpa(1+ 236/256)
+data8 0x3FE503C43CD8EB68 //ln(1/frcpa(1+ 237/256)
+data8 0x3FE513356667FC57 //ln(1/frcpa(1+ 238/256)
+data8 0x3FE522AE0738A3D8 //ln(1/frcpa(1+ 239/256)
+data8 0x3FE5322E26867857 //ln(1/frcpa(1+ 240/256)
+data8 0x3FE541B5CB979809 //ln(1/frcpa(1+ 241/256)
+data8 0x3FE55144FDBCBD62 //ln(1/frcpa(1+ 242/256)
+data8 0x3FE560DBC45153C7 //ln(1/frcpa(1+ 243/256)
+data8 0x3FE5707A26BB8C66 //ln(1/frcpa(1+ 244/256)
+data8 0x3FE587F60ED5B900 //ln(1/frcpa(1+ 245/256)
+data8 0x3FE597A7977C8F31 //ln(1/frcpa(1+ 246/256)
+data8 0x3FE5A760D634BB8B //ln(1/frcpa(1+ 247/256)
+data8 0x3FE5B721D295F10F //ln(1/frcpa(1+ 248/256)
+data8 0x3FE5C6EA94431EF9 //ln(1/frcpa(1+ 249/256)
+data8 0x3FE5D6BB22EA86F6 //ln(1/frcpa(1+ 250/256)
+data8 0x3FE5E6938645D390 //ln(1/frcpa(1+ 251/256)
+data8 0x3FE5F673C61A2ED2 //ln(1/frcpa(1+ 252/256)
+data8 0x3FE6065BEA385926 //ln(1/frcpa(1+ 253/256)
+data8 0x3FE6164BFA7CC06B //ln(1/frcpa(1+ 254/256)
+data8 0x3FE62643FECF9743 //ln(1/frcpa(1+ 255/256)
+//
+// [2;4)
+data8 0xBEB2CC7A38B9355F,0x3F035F2D1833BF4C // A10,A9
+data8 0xBFF51BAA7FD27785,0x3FFC9D5D5B6CDEFF // A2,A1
+data8 0xBF421676F9CB46C7,0x3F7437F2FA1436C6 // A8,A7
+data8 0xBFD7A7041DE592FE,0x3FE9F107FEE8BD29 // A4,A3
+// [4;8)
+data8 0x3F6BBBD68451C0CD,0xBF966EC3272A16F7 // A10,A9
+data8 0x40022A24A39AD769,0x4014190EDF49C8C5 // A2,A1
+data8 0x3FB130FD016EE241,0xBFC151B46E635248 // A8,A7
+data8 0x3FDE8F611965B5FE,0xBFEB5110EB265E3D // A4,A3
+// [8;16)
+data8 0x3F736EF93508626A,0xBF9FE5DBADF58AF1 // A10,A9
+data8 0x40110A9FC5192058,0x40302008A6F96B29 // A2,A1
+data8 0x3FB8E74E0CE1E4B5,0xBFC9B5DA78873656 // A8,A7
+data8 0x3FE99D0DF10022DC,0xBFF829C0388F9484 // A4,A3
+// [16;32)
+data8 0x3F7FFF9D6D7E9269,0xBFAA780A249AEDB1 // A10,A9
+data8 0x402082A807AEA080,0x4045ED9868408013 // A2,A1
+data8 0x3FC4E1E54C2F99B7,0xBFD5DE2D6FFF1490 // A8,A7
+data8 0x3FF75FC89584AE87,0xC006B4BADD886CAE // A4,A3
+// [32;64)
+data8 0x3F8CE54375841A5F,0xBFB801ABCFFA1BE2 // A10,A9
+data8 0x403040A8B1815BDA,0x405B99A917D24B7A // A2,A1
+data8 0x3FD30CAB81BFFA03,0xBFE41AEF61ECF48B // A8,A7
+data8 0x400650CC136BEC43,0xC016022046E8292B // A4,A3
+// [64;128)
+data8 0x3F9B69BD22CAA8B8,0xBFC6D48875B7A213 // A10,A9
+data8 0x40402028CCAA2F6D,0x40709AACEB3CBE0F // A2,A1
+data8 0x3FE22C6A5924761E,0xBFF342F5F224523D // A8,A7
+data8 0x4015CD405CCA331F,0xC025AAD10482C769 // A4,A3
+// [128;256)
+data8 0x3FAAAD9CD0E40D06,0xBFD63FC8505D80CB // A10,A9
+data8 0x40501008D56C2648,0x408364794B0F4376 // A2,A1
+data8 0x3FF1BE0126E00284,0xC002D8E3F6F7F7CA // A8,A7
+data8 0x40258C757E95D860,0xC0357FA8FD398011 // A4,A3
+// [256;512)
+data8 0x3FBA4DAC59D49FEB,0xBFE5F476D1C43A77 // A10,A9
+data8 0x40600800D890C7C6,0x40962C42AAEC8EF0 // A2,A1
+data8 0x40018680ECF19B89,0xC012A3EB96FB7BA4 // A8,A7
+data8 0x40356C4CDD3B60F9,0xC0456A34BF18F440 // A4,A3
+// [512;1024)
+data8 0x3FCA1B54F6225A5A,0xBFF5CD67BA10E048 // A10,A9
+data8 0x407003FED94C58C2,0x40A8F30B4ACBCD22 // A2,A1
+data8 0x40116A135EB66D8C,0xC022891B1CED527E // A8,A7
+data8 0x40455C4617FDD8BC,0xC0555F82729E59C4 // A4,A3
+// [1024;2048)
+data8 0x3FD9FFF9095C6EC9,0xC005B88CB25D76C9 // A10,A9
+data8 0x408001FE58FA734D,0x40BBB953BAABB0F3 // A2,A1
+data8 0x40215B2F9FEB5D87,0xC0327B539DEA5058 // A8,A7
+data8 0x40555444B3E8D64D,0xC0655A2B26F9FC8A // A4,A3
+// [2048;4096)
+data8 0x3FE9F065A1C3D6B1,0xC015ACF6FAE8D78D // A10,A9
+data8 0x409000FE383DD2B7,0x40CE7F5C1E8BCB8B // A2,A1
+data8 0x40315324E5DB2EBE,0xC04274194EF70D18 // A8,A7
+data8 0x4065504353FF2207,0xC075577FE1BFE7B6 // A4,A3
+// [4096;8192)
+data8 0x3FF9E6FBC6B1C70D,0xC025A62DAF76F85D // A10,A9
+data8 0x40A0007E2F61EBE8,0x40E0A2A23FB5F6C3 // A2,A1
+data8 0x40414E9BC0A0141A,0xC0527030F2B69D43 // A8,A7
+data8 0x40754E417717B45B,0xC085562A447258E5 // A4,A3
+//
+data8 0xbfdffffffffaea15 // P1
+data8 0x3FDD8B618D5AF8FE // point of local minimum on [1;2]
+data8 0x3FED67F1C864BEB5 // ln(sqrt(2*Pi))
+data8 0x4008000000000000 // 3.0
+//
+data8 0xBF9E1C289FB224AB,0x3FBF7422445C9460 // A6,A5
+data8 0xBFF01E76D66F8D8A // A0
+data8 0xBFE2788CFC6F91DA // A1 [1.0;1.25)
+data8 0x3FCB8CC69000EB5C,0xBFD41997A0C2C641 // A6,A5
+data8 0x3FFCAB0BFA0EA462 // A0
+data8 0xBFBF19B9BCC38A42 // A0 [1.25;1.5)
+data8 0x3FD51EE4DE0A364C,0xBFE00D7F98A16E4B // A6,A5
+data8 0x40210CE1F327E9E4 // A0
+data8 0x4001DB08F9DFA0CC // A0 [1.5;1.75)
+data8 0x3FE24F606742D252,0xBFEC81D7D12574EC // A6,A5
+data8 0x403BE636A63A9C27 // A0
+data8 0x4000A0CB38D6CF0A // A0 [1.75;2.0)
+data8 0x3FF1029A9DD542B4,0xBFFAD37C209D3B25 // A6,A5
+data8 0x405385E6FD9BE7EA // A0
+data8 0x478895F1C0000000 // Overflow boundary
+data8 0x400062D97D26B523,0xC00A03E1529FF023 // A6,A5
+data8 0x4069204C51E566CE,0 // A0
+data8 0x40101476B38FD501,0xC0199DE7B387C0FC // A6,A5
+data8 0x407EB8DAEC83D759,0 // A0
+data8 0x401FDB008D65125A,0xC0296B506E665581 // A6,A5
+data8 0x409226D93107EF66,0 // A0
+data8 0x402FB3EAAF3E7B2D,0xC039521142AD8E0D // A6,A5
+data8 0x40A4EFA4F072792E,0 // A0
+data8 0x403FA024C66B2563,0xC0494569F250E691 // A6,A5
+data8 0x40B7B747C9235BB8,0 // A0
+data8 0x404F9607D6DA512C,0xC0593F0B2EDDB4BC // A6,A5
+data8 0x40CA7E29C5F16DE2,0 // A0
+data8 0x405F90C5F613D98D,0xC0693BD130E50AAF // A6,A5
+data8 0x40DD4495238B190C,0 // A0
+//
+// polynomial approximation of ln(sin(Pi*x)/(Pi*x)), |x| <= 0.5
+data8 0xBFD58731A486E820,0xBFA4452CC28E15A9 // S16,S14
+data8 0xBFD013F6E1B86C4F,0xBFD5B3F19F7A341F // S8,S6
+data8 0xBFC86A0D5252E778,0xBFC93E08C9EE284B // S12,S10
+data8 0xBFE15132555C9EDD,0xBFFA51A662480E35 // S4,S2
+//
+// [1.0;1.25)
+data8 0xBFA697D6775F48EA,0x3FB9894B682A98E7 // A9,A8
+data8 0xBFCA8969253CFF55,0x3FD15124EFB35D9D // A5,A4
+data8 0xBFC1B00158AB719D,0x3FC5997D04E7F1C1 // A7,A6
+data8 0xBFD9A4D50BAFF989,0x3FEA51A661F5176A // A3,A2
+// [1.25;1.5)
+data8 0x3F838E0D35A6171A,0xBF831BBBD61313B7 // A8,A7
+data8 0x3FB08B40196425D0,0xBFC2E427A53EB830 // A4,A3
+data8 0x3F9285DDDC20D6C3,0xBFA0C90C9C223044 // A6,A5
+data8 0x3FDEF72BC8F5287C,0x3D890B3DAEBC1DFC // A2,A1
+// [1.5;1.75)
+data8 0x3F65D5A7EB31047F,0xBFA44EAC9BFA7FDE // A8,A7
+data8 0x40051FEFE7A663D8,0xC012A5CFE00A2522 // A4,A3
+data8 0x3FD0E1583AB00E08,0xBFF084AF95883BA5 // A6,A5
+data8 0x40185982877AE0A2,0xC015F83DB73B57B7 // A2,A1
+// [1.75;2.0)
+data8 0x3F4A9222032EB39A,0xBF8CBC9587EEA5A3 // A8,A7
+data8 0x3FF795400783BE49,0xC00851BC418B8A25 // A4,A3
+data8 0x3FBBC992783E8C5B,0xBFDFA67E65E89B29 // A6,A5
+data8 0x4012B408F02FAF88,0xC013284CE7CB0C39 // A2,A1
+//
+// roots
+data8 0xC003A7FC9600F86C // -2.4570247382208005860
+data8 0xC009260DBC9E59AF // -3.1435808883499798405
+data8 0xC005FB410A1BD901 // -2.7476826467274126919
+data8 0xC00FA471547C2FE5 // -3.9552942848585979085
+//
+// polynomial approximation of ln(GAMMA(x)) near roots
+// near -2.4570247382208005860
+data8 0x3FF694A6058D9592,0x40136EEBB003A92B // R3,R2
+data8 0x3FF83FE966AF5360,0x3C90323B6D1FE86D // R1,R0
+// near -3.1435808883499798405
+data8 0x405C11371268DA38,0x4039D4D2977D2C23 // R3,R2
+data8 0x401F20A65F2FAC62,0x3CDE9605E3AE7A62 // R1,R0
+// near -2.7476826467274126919
+data8 0xC034185AC31314FF,0x4023267F3C28DFE3 // R3,R2
+data8 0xBFFEA12DA904B194,0x3CA8FB8530BA7689 // R1,R0
+// near -3.9552942848585979085
+data8 0xC0AD25359E70C888,0x406F76DEAEA1B8C6 // R3,R2
+data8 0xC034B99D966C5644,0xBCBDDC0336980B58 // R1,R0
+LOCAL_OBJECT_END(lgammaf_data)
+
+//*********************************************************************
+
+.section .text
+GLOBAL_LIBM_ENTRY(__libm_lgammaf)
+{ .mfi // entry: x is in f8; sign of gamma(x) is stored through r33, r34 selects its width
+ getf.exp GR_SignExp = f8
+ frcpa.s1 FR_InvX,p0 = f1,f8 // initial approximation of 1/x
+ mov GR_ExpOf2 = 0x10000 // biased exponent of 2.0 (bias is 0xffff)
+}
+{ .mfi
+ addl GR_ad_Data = @ltoff(lgammaf_data),gp
+ fcvt.fx.s1 FR_int_N = f8
+ mov GR_ExpMask = 0x1ffff
+};;
+{ .mfi
+ getf.sig GR_Sig = f8
+ fclass.m p13,p0 = f8,0x1EF // is x NaTVal, NaN,
+ // +/-0, +/-INF or +/-denormal?
+ mov GR_ExpBias = 0xffff
+}
+{ .mfi
+ ld8 GR_ad_Data = [GR_ad_Data]
+ fma.s1 FR_Xp1 = f8,f1,f1
+ mov GR_StirlBound = 0x1000C // biased exponent of 2^13
+};;
+{ .mfi
+ setf.exp FR_2 = GR_ExpOf2
+ fmerge.se FR_x = f1,f8
+ dep.z GR_Ind = GR_SignExp,3,4 // GR_Ind = (GR_SignExp & 0xF) << 3
+}
+{ .mfi
+ cmp.eq p8,p0 = GR_SignExp,GR_ExpBias
+ fcvt.fx.trunc.s1 FR_int_Ntrunc = f8
+ and GR_Exp = GR_ExpMask,GR_SignExp
+};;
+{ .mfi
+ add GR_ad_C650 = 0xB20,GR_ad_Data
+ fcmp.lt.s1 p14,p15 = f8,f0 // p14: x < 0, p15: otherwise
+ extr.u GR_Ind4T = GR_Sig,55,8 // significand bits 55..62, used below to index table T
+}
+{ .mfb
+ sub GR_PureExp = GR_Exp,GR_ExpBias
+ fnorm.s1 FR_NormX = f8
+ // jump if x is NaTVal, NaN, +/-0, +/-INF or +/-denormal
+(p13) br.cond.spnt lgammaf_spec
+};;
+lgammaf_core: // also entered from lgammaf_spec when x is denormal
+{ .mfi
+ ldfpd FR_P1,FR_LocalMin = [GR_ad_C650],16
+ fms.s1 FR_xm2 = f8,f1,f1
+ add GR_ad_Co = 0x820,GR_ad_Data
+}
+{ .mib
+ ldfpd FR_P3,FR_P2 = [GR_ad_Data],16
+ cmp.ltu p9,p0 = GR_SignExp,GR_ExpBias
+ // jump if x is from the interval [1; 2)
+(p8) br.cond.spnt lgammaf_1_2
+};;
+{ .mfi
+ setf.sig FR_int_Ln = GR_PureExp
+ fms.s1 FR_r = FR_InvX,f8,f1
+ shladd GR_ad_Co = GR_Ind,3,GR_ad_Co
+}
+{ .mib
+ ldfpd FR_LnSqrt2Pi,FR_3 = [GR_ad_C650],16
+ cmp.lt p13,p12 = GR_Exp,GR_StirlBound // p13: |x| < 2^13, p12: |x| >= 2^13
+ // jump if x is from the interval (0; 1)
+(p9) br.cond.spnt lgammaf_0_1
+};;
+{ .mfi
+ ldfpd FR_Ln2,FR_05 = [GR_ad_Data],16
+ fma.s1 FR_Xp2 = f1,f1,FR_Xp1 // (x+2)
+ shladd GR_ad_C650 = GR_Ind,2,GR_ad_C650
+}
+{ .mfi
+ add GR_ad_Ce = 0x20,GR_ad_Co
+ nop.f 0
+ add GR_ad_C43 = 0x30,GR_ad_Co
+};;
+{ .mfi
+ // load coefficients of polynomial approximation
+ // of ln(GAMMA(x)), 2 <= x < 2^13
+(p13) ldfpd FR_A10,FR_A9 = [GR_ad_Co],16
+ fcvt.xf FR_N = FR_int_N
+ cmp.eq.unc p6,p7 = GR_ExpOf2,GR_SignExp
+}
+{ .mib
+(p13) ldfpd FR_A8,FR_A7 = [GR_ad_Ce]
+(p14) cmp.le.unc p9,p0 = GR_StirlBound,GR_Exp
+ // jump if x is less than or equal to -2^13
+(p9) br.cond.spnt lgammaf_negstirling
+};;
+.pred.rel "mutex",p6,p7
+{ .mfi
+(p13) ldfpd FR_A6,FR_A5 = [GR_ad_C650],16
+(p6) fma.s1 FR_x = f0,f0,FR_NormX
+ shladd GR_ad_T = GR_Ind4T,3,GR_ad_Data
+}
+{ .mfi
+(p13) ldfpd FR_A4,FR_A3 = [GR_ad_C43]
+(p7) fms.s1 FR_x = FR_x,f1,f1
+(p14) mov GR_ReqBound = 0x20005
+};;
+{ .mfi
+(p13) ldfpd FR_A2,FR_A1 = [GR_ad_Co],16
+ fms.s1 FR_xm2 = FR_xm2,f1,f1
+(p14) extr.u GR_Arg = GR_Sig,60,4
+}
+{ .mfi
+ mov GR_SignOfGamma = 1 // set sign of gamma(x) to 1
+ fcvt.xf FR_Ntrunc = FR_int_Ntrunc
+ nop.i 0
+};;
+{ .mfi
+ ldfd FR_T = [GR_ad_T]
+ fma.s1 FR_r2 = FR_r,FR_r,f0
+ shl GR_ReqBound = GR_ReqBound,3
+}
+{ .mfi
+ add GR_ad_Co = 0xCA0,GR_ad_Data
+ fnma.s1 FR_Req = FR_Xp1,FR_NormX,f0 // -x*(x+1)
+(p14) shladd GR_Arg = GR_Exp,4,GR_Arg
+};;
+{ .mfi
+(p13) ldfd FR_A0 = [GR_ad_C650]
+ fma.s1 FR_Xp3 = FR_2,f1,FR_Xp1 // (x+3)
+(p14) cmp.le.unc p9,p0 = GR_Arg,GR_ReqBound
+}
+{ .mfi
+(p14) add GR_ad_Ce = 0x20,GR_ad_Co
+ fma.s1 FR_Xp4 = FR_2,FR_2,FR_NormX // (x+4)
+(p15) add GR_ad_OvfBound = 0xBB8,GR_ad_Data
+};;
+{ .mfi
+ // load coefficients of polynomial approximation
+ // of ln(sin(Pi*xf)/(Pi*xf)), |xf| <= 0.5
+(p14) ldfpd FR_S16,FR_S14 = [GR_ad_Co],16
+(p14) fms.s1 FR_Xf = FR_NormX,f1,FR_N // xf = x - [x]
+(p14) sub GR_SignOfGamma = r0,GR_SignOfGamma // set sign of
+ // gamma(x) to -1
+}
+{ .mfb
+(p14) ldfpd FR_S12,FR_S10 = [GR_ad_Ce],16
+ fma.s1 FR_Xp5 = FR_2,FR_2,FR_Xp1 // (x+5)
+ // jump if x is from the interval (-9; 0)
+(p9) br.cond.spnt lgammaf_negrecursion
+};;
+{ .mfi
+(p14) ldfpd FR_S8,FR_S6 = [GR_ad_Co],16
+ fma.s1 FR_P32 = FR_P3,FR_r,FR_P2
+ nop.i 0
+}
+{ .mfb
+(p14) ldfpd FR_S4,FR_S2 = [GR_ad_Ce],16
+ fma.s1 FR_x2 = FR_x,FR_x,f0
+ // jump if x is from the interval (-2^13; -9)
+(p14) br.cond.spnt lgammaf_negpoly
+};;
+{ .mfi
+ ldfd FR_OverflowBound = [GR_ad_OvfBound]
+(p12) fcvt.xf FR_N = FR_int_Ln
+ // set p9 if signgam is 32-bit int
+ // set p10 if signgam is 64-bit int (r34 == 8)
+ cmp.eq p10,p9 = 8,r34
+}
+{ .mfi
+ nop.m 0
+(p12) fma.s1 FR_P10 = FR_P1,FR_r,f1
+ nop.i 0
+};;
+.pred.rel "mutex",p6,p7
+.pred.rel "mutex",p9,p10
+{ .mfi
+ // store sign of gamma(x) as 32-bit int
+(p9) st4 [r33] = GR_SignOfGamma
+(p6) fma.s1 FR_xx = FR_x,FR_xm2,f0
+ nop.i 0
+}
+{ .mfi
+ // store sign of gamma(x) as 64-bit int
+(p10) st8 [r33] = GR_SignOfGamma
+(p7) fma.s1 FR_xx = f0,f0,FR_x
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_A9 = FR_A10,FR_x,FR_A9
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_A7 = FR_A8,FR_x,FR_A7
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_A5 = FR_A6,FR_x,FR_A5
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_A3 = FR_A4,FR_x,FR_A3
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p15) fcmp.eq.unc.s1 p8,p0 = FR_NormX,FR_2 // is input argument 2.0?
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_A1 = FR_A2,FR_x,FR_A1
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p12) fma.s1 FR_T = FR_N,FR_Ln2,FR_T
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p12) fma.s1 FR_P32 = FR_P32,FR_r2,FR_P10
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_x4 = FR_x2,FR_x2,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_x3 = FR_x2,FR_xx,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_A7 = FR_A9,FR_x2,FR_A7
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p8) fma.s.s0 f8 = f0,f0,f0
+(p8) br.ret.spnt b0 // fast exit for 2.0
+};;
+{ .mfi
+ nop.m 0
+(p6) fma.s1 FR_A0 = FR_A0,FR_xm2,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_A3 = FR_A5,FR_x2,FR_A3
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p15) fcmp.le.unc.s1 p8,p0 = FR_OverflowBound,FR_NormX // overflow test
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p12) fms.s1 FR_xm05 = FR_NormX,f1,FR_05
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p12) fma.s1 FR_Ln = FR_P32,FR_r,FR_T
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p12) fms.s1 FR_LnSqrt2Pi = FR_LnSqrt2Pi,f1,FR_NormX
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_A0 = FR_A1,FR_xx,FR_A0
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p13) fma.s1 FR_A3 = FR_A7,FR_x4,FR_A3
+ // jump if result overflows
+(p8) br.cond.spnt lgammaf_overflow
+};;
+.pred.rel "mutex",p12,p13
+{ .mfi
+ nop.m 0
+(p12) fma.s.s0 f8 = FR_Ln,FR_xm05,FR_LnSqrt2Pi
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p13) fma.s.s0 f8 = FR_A3,FR_x3,FR_A0
+ br.ret.sptk b0
+};;
+// branch computing ln(GAMMA(x)) for 0 < x < 1; joins lgamma_0_2_core
+//---------------------------------------------------------------------
+.align 32
+lgammaf_0_1:
+{ .mfi
+ getf.sig GR_Ind = FR_Xp1
+ fma.s1 FR_r2 = FR_r,FR_r,f0
+ mov GR_fff7 = 0xFFF7
+}
+{ .mfi
+ ldfpd FR_Ln2,FR_05 = [GR_ad_Data],16
+ fma.s1 FR_P32 = FR_P3,FR_r,FR_P2
+ // input argument cannot be equal to 1.0, so clear p14 (fast-exit flag for 1.0)
+ cmp.eq p0,p14 = r0,r0
+};;
+{ .mfi
+ getf.exp GR_Exp = FR_w // NOTE(review): FR_w is not written on this path in the visible code; presumably it aliases a register already holding x-1 - confirm
+ fcvt.xf FR_N = FR_int_Ln
+ add GR_ad_Co = 0xCE0,GR_ad_Data
+}
+{ .mfi
+ shladd GR_ad_T = GR_Ind4T,3,GR_ad_Data
+ fma.s1 FR_P10 = FR_P1,FR_r,f1
+ add GR_ad_Ce = 0xD00,GR_ad_Data
+};;
+{ .mfi
+ ldfd FR_T = [GR_ad_T]
+ fma.s1 FR_w2 = FR_w,FR_w,f0
+ extr.u GR_Ind = GR_Ind,61,2 // two significand bits below the leading one of (x+1): sub-interval index 0..3
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Q32 = FR_P3,FR_w,FR_P2
+//// add GR_ad_C0 = 0xB30,GR_ad_Data
+ add GR_ad_C0 = 0xB38,GR_ad_Data
+};;
+{ .mfi
+ and GR_Exp = GR_Exp,GR_ExpMask
+ nop.f 0
+ shladd GR_IndX8 = GR_Ind,3,r0
+}
+{ .mfi
+ shladd GR_IndX2 = GR_Ind,1,r0
+ fma.s1 FR_Q10 = FR_P1,FR_w,f1
+ cmp.eq p6,p15 = 0,GR_Ind
+};;
+{ .mfi
+ shladd GR_ad_Co = GR_IndX8,3,GR_ad_Co
+(p6) fma.s1 FR_x = f0,f0,FR_NormX
+ shladd GR_ad_C0 = GR_IndX2,4,GR_ad_C0
+}
+{ .mfi
+ shladd GR_ad_Ce = GR_IndX8,3,GR_ad_Ce
+ nop.f 0
+(p15) cmp.eq.unc p7,p8 = 1,GR_Ind
+};;
+.pred.rel "mutex",p7,p8
+{ .mfi
+ ldfpd FR_A8,FR_A7 = [GR_ad_Co],16
+(p7) fms.s1 FR_x = FR_NormX,f1,FR_LocalMin
+ cmp.ge p10,p11 = GR_Exp,GR_fff7 // p10 if exponent of w >= 0xFFF7 (2^-8), else p11
+}
+{ .mfb
+ ldfpd FR_A6,FR_A5 = [GR_ad_Ce],16
+(p8) fma.s1 FR_x = f1,f1,FR_NormX
+ br.cond.sptk lgamma_0_2_core
+};;
+// branch computing ln(GAMMA(x)) for 1 <= x < 2; lgamma_0_2_core is shared with lgammaf_0_1
+//---------------------------------------------------------------------
+.align 32
+lgammaf_1_2:
+{ .mfi
+ add GR_ad_Co = 0xCF0,GR_ad_Data
+ fcmp.eq.s1 p14,p0 = f1,FR_NormX // is input argument 1.0?
+ extr.u GR_Ind = GR_Sig,61,2
+}
+{ .mfi
+ add GR_ad_Ce = 0xD10,GR_ad_Data
+ nop.f 0
+//// add GR_ad_C0 = 0xB40,GR_ad_Data
+ add GR_ad_C0 = 0xB48,GR_ad_Data
+};;
+{ .mfi
+ shladd GR_IndX8 = GR_Ind,3,r0
+ nop.f 0
+ shladd GR_IndX2 = GR_Ind,1,r0
+}
+{ .mfi
+ cmp.eq p6,p15 = 0,GR_Ind // p6 <- x from [1;1.25)
+ nop.f 0
+ cmp.ne p9,p0 = r0,r0 // clear p9 (p9 is set only when coming from lgammaf_0_1)
+};;
+{ .mfi
+ shladd GR_ad_Co = GR_IndX8,3,GR_ad_Co
+(p6) fms.s1 FR_x = FR_NormX,f1,f1 // reduced x for [1;1.25)
+ shladd GR_ad_C0 = GR_IndX2,4,GR_ad_C0
+}
+{ .mfi
+ shladd GR_ad_Ce = GR_IndX8,3,GR_ad_Ce
+(p14) fma.s.s0 f8 = f0,f0,f0
+(p15) cmp.eq.unc p7,p8 = 1,GR_Ind // p7 <- x from [1.25;1.5)
+};;
+.pred.rel "mutex",p7,p8
+{ .mfi
+ ldfpd FR_A8,FR_A7 = [GR_ad_Co],16
+(p7) fms.s1 FR_x = FR_xm2,f1,FR_LocalMin
+ nop.i 0
+}
+{ .mfi
+ ldfpd FR_A6,FR_A5 = [GR_ad_Ce],16
+(p8) fma.s1 FR_x = f0,f0,FR_NormX
+(p9) cmp.eq.unc p10,p11 = r0,r0 // p9 is 0 here, so .unc clears both p10 and p11 (no log term on [1;2))
+};;
+lgamma_0_2_core:
+{ .mmi
+ ldfpd FR_A4,FR_A3 = [GR_ad_Co],16
+ ldfpd FR_A2,FR_A1 = [GR_ad_Ce],16
+ mov GR_SignOfGamma = 1 // set sign of gamma(x) to 1
+};;
+{ .mfi
+// add GR_ad_C0 = 8,GR_ad_C0
+ ldfd FR_A0 = [GR_ad_C0]
+ nop.f 0
+ // set p13 if signgam is 32-bit int
+ // set p15 if signgam is 64-bit int (r34 == 8)
+ cmp.eq p15,p13 = 8,r34
+};;
+.pred.rel "mutex",p13,p15
+{ .mmf
+ // store sign of gamma(x)
+(p13) st4 [r33] = GR_SignOfGamma // as 32-bit int
+(p15) st8 [r33] = GR_SignOfGamma // as 64-bit int
+(p11) fma.s1 FR_Q32 = FR_Q32,FR_w2,FR_Q10
+};;
+{ .mfb
+ nop.m 0
+(p10) fma.s1 FR_P32 = FR_P32,FR_r2,FR_P10
+(p14) br.ret.spnt b0 // fast exit for 1.0
+};;
+{ .mfi
+ nop.m 0
+(p10) fma.s1 FR_T = FR_N,FR_Ln2,FR_T
+ cmp.eq p6,p7 = 0,GR_Ind // p6 <- x from [1;1.25)
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_x2 = FR_x,FR_x,f0
+ cmp.eq p8,p0 = r0,r0 // set p8 to 1, meaning we are on [1;2]
+};;
+{ .mfi
+ nop.m 0
+(p11) fma.s1 FR_Ln = FR_Q32,FR_w,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ nop.f 0
+ nop.i 0
+};;
+.pred.rel "mutex",p6,p7
+{ .mfi
+ nop.m 0
+(p6) fma.s1 FR_xx = f0,f0,FR_x
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p7) fma.s1 FR_xx = f0,f0,f1
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A7 = FR_A8,FR_x,FR_A7
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A5 = FR_A6,FR_x,FR_A5
+(p9) cmp.ne p8,p0 = r0,r0 // set p8 to 0, meaning we are on [0;1]
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A3 = FR_A4,FR_x,FR_A3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A1 = FR_A2,FR_x,FR_A1
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_x4 = FR_x2,FR_x2,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p10) fma.s1 FR_Ln = FR_P32,FR_r,FR_T
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A5 = FR_A7,FR_x2,FR_A5
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A1 = FR_A3,FR_x2,FR_A1
+ nop.i 0
+};;
+.pred.rel "mutex",p9,p8
+{ .mfi
+ nop.m 0
+(p9) fms.d.s1 FR_A0 = FR_A0,FR_xx,FR_Ln // (0;1) path: A0*xx - Ln
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p8) fms.s1 FR_A0 = FR_A0,FR_xx,f0 // [1;2) path: A0*xx, no log term
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.d.s1 FR_A1 = FR_A5,FR_x4,FR_A1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ nop.f 0
+ nop.i 0
+};;
+.pred.rel "mutex",p6,p7
+{ .mfi
+ nop.m 0
+(p6) fma.s.s0 f8 = FR_A1,FR_x2,FR_A0
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p7) fma.s.s0 f8 = FR_A1,FR_x,FR_A0
+ br.ret.sptk b0
+};;
+// branch computing ln(GAMMA(x)) for -9 < x < 0 (reached only when x < 0)
+//---------------------------------------------------------------------
+.align 32
+lgammaf_negrecursion:
+{ .mfi
+ getf.sig GR_N = FR_int_Ntrunc
+ fms.s1 FR_1pXf = FR_Xp2,f1,FR_Ntrunc // 1 + (x+1) - [x]
+ mov GR_Neg2 = 2
+}
+{ .mfi
+ add GR_ad_Co = 0xCE0,GR_ad_Data
+ fms.s1 FR_Xf = FR_Xp1,f1,FR_Ntrunc // (x+1) - [x]
+ mov GR_Neg4 = 4
+};;
+{ .mfi
+ add GR_ad_Ce = 0xD00,GR_ad_Data
+ fma.s1 FR_Xp6 = FR_2,FR_2,FR_Xp2 // (x+6)
+ add GR_ad_C0 = 0xB30,GR_ad_Data
+}
+{ .mfi
+ sub GR_Neg2 = r0,GR_Neg2 // GR_Neg2 = -2
+ fma.s1 FR_Xp7 = FR_2,FR_3,FR_Xp1 // (x+7)
+ sub GR_Neg4 = r0,GR_Neg4 // GR_Neg4 = -4
+};;
+{ .mfi
+ cmp.ne p8,p0 = r0,GR_N
+ fcmp.eq.s1 p13,p0 = FR_NormX,FR_Ntrunc // p13: x is an integer
+ and GR_IntNum = 0xF,GR_N
+}
+{ .mfi
+ cmp.lt p6,p0 = GR_N,GR_Neg2
+ fma.s1 FR_Xp8 = FR_2,FR_3,FR_Xp2 // (x+8)
+ cmp.lt p7,p0 = GR_N,GR_Neg4
+};;
+{ .mfi
+ getf.d GR_Arg = FR_NormX
+(p6) fma.s1 FR_Xp2 = FR_Xp2,FR_Xp3,f0
+(p8) tbit.z.unc p14,p15 = GR_IntNum,0
+}
+{ .mfi
+ sub GR_RootInd = 0xE,GR_IntNum
+(p7) fma.s1 FR_Xp4 = FR_Xp4,FR_Xp5,f0
+ add GR_ad_Root = 0xDE0,GR_ad_Data
+};;
+{ .mfi
+ shladd GR_ad_Root = GR_RootInd,3,GR_ad_Root
+ fms.s1 FR_x = FR_Xp1,f1,FR_Ntrunc // (x+1) - [x]
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ nop.f 0
+(p13) br.cond.spnt lgammaf_singularity
+};;
+.pred.rel "mutex",p14,p15
+{ .mfi
+ cmp.gt p6,p0 = 0xA,GR_IntNum
+(p14) fma.s1 FR_Req = FR_Req,FR_Xf,f0
+ cmp.gt p7,p0 = 0xD,GR_IntNum
+}
+{ .mfi
+(p15) mov GR_SignOfGamma = 1 // set sign of gamma(x) to 1
+(p15) fnma.s1 FR_Req = FR_Req,FR_Xf,f0
+ cmp.leu p0,p13 = 2,GR_RootInd // p13: GR_RootInd < 2 (unsigned)
+};;
+{ .mfi
+ nop.m 0
+(p6) fma.s1 FR_Xp6 = FR_Xp6,FR_Xp7,f0
+(p13) add GR_ad_RootCo = 0xE00,GR_ad_Data
+};;
+{ .mfi
+ nop.m 0
+ fcmp.eq.s1 p12,p11 = FR_1pXf,FR_2
+ nop.i 0
+};;
+{ .mfi
+ getf.sig GR_Sig = FR_1pXf
+ fcmp.le.s1 p9,p0 = FR_05,FR_Xf
+ nop.i 0
+}
+{ .mfi
+(p13) shladd GR_RootInd = GR_RootInd,4,r0
+(p7) fma.s1 FR_Xp2 = FR_Xp2,FR_Xp4,f0
+(p8) cmp.gt.unc p10,p0 = 0x9,GR_IntNum
+};;
+.pred.rel "mutex",p11,p12
+{ .mfi
+ nop.m 0
+(p10) fma.s1 FR_Req = FR_Req,FR_Xp8,f0
+(p11) extr.u GR_Ind = GR_Sig,61,2
+}
+{ .mfi
+(p13) add GR_RootInd = GR_RootInd,GR_RootInd
+ nop.f 0
+(p12) mov GR_Ind = 3
+};;
+{ .mfi
+ shladd GR_IndX2 = GR_Ind,1,r0
+ nop.f 0
+ cmp.gt p14,p0 = 2,GR_Ind
+}
+{ .mfi
+ shladd GR_IndX8 = GR_Ind,3,r0
+ nop.f 0
+ cmp.eq p6,p0 = 1,GR_Ind
+};;
+.pred.rel "mutex",p6,p9
+{ .mfi
+ shladd GR_ad_Co = GR_IndX8,3,GR_ad_Co
+(p6) fms.s1 FR_x = FR_Xf,f1,FR_LocalMin
+ cmp.gt p10,p0 = 0xB,GR_IntNum
+}
+{ .mfi
+ shladd GR_ad_Ce = GR_IndX8,3,GR_ad_Ce
+(p9) fma.s1 FR_x = f0,f0,FR_1pXf
+ shladd GR_ad_C0 = GR_IndX2,4,GR_ad_C0
+};;
+{ .mfi
+ // load coefficients of polynomial approximation
+ // of ln(GAMMA(x)), 1 <= x < 2
+ ldfpd FR_A8,FR_A7 = [GR_ad_Co],16
+(p10) fma.s1 FR_Xp2 = FR_Xp2,FR_Xp6,f0
+ add GR_ad_C0 = 8,GR_ad_C0
+}
+{ .mfi
+ ldfpd FR_A6,FR_A5 = [GR_ad_Ce],16
+ nop.f 0
+(p14) add GR_ad_Root = 0x10,GR_ad_Root
+};;
+{ .mfi
+ ldfpd FR_A4,FR_A3 = [GR_ad_Co],16
+ nop.f 0
+ add GR_ad_RootCe = 0xE10,GR_ad_Data
+}
+{ .mfi
+ ldfpd FR_A2,FR_A1 = [GR_ad_Ce],16
+ nop.f 0
+(p14) add GR_RootInd = 0x40,GR_RootInd
+};;
+{ .mmi
+ ldfd FR_A0 = [GR_ad_C0]
+(p13) add GR_ad_RootCo = GR_ad_RootCo,GR_RootInd
+(p13) add GR_ad_RootCe = GR_ad_RootCe,GR_RootInd
+};;
+{ .mmi
+(p13) ld8 GR_Root = [GR_ad_Root] // same 8 bytes loaded twice: as integer bits ...
+(p13) ldfd FR_Root = [GR_ad_Root] // ... and as a floating-point value
+ mov GR_ExpBias = 0xffff
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_x2 = FR_x,FR_x,f0
+ nop.i 0
+}
+{ .mlx
+(p8) cmp.gt.unc p10,p0 = 0xF,GR_IntNum
+ movl GR_Dx = 0x000000014F8B588E
+};;
+{ .mfi
+ // load coefficients of polynomial approximation
+ // of ln(GAMMA(x)), x is close to one of negative roots
+(p13) ldfpd FR_R3,FR_R2 = [GR_ad_RootCo]
+ // argument for logarithm
+(p10) fma.s1 FR_Req = FR_Req,FR_Xp2,f0
+ mov GR_ExpMask = 0x1ffff
+}
+{ .mfi
+(p13) ldfpd FR_R1,FR_R0 = [GR_ad_RootCe]
+ nop.f 0
+ // set p9 if signgam is 32-bit int
+ // set p8 if signgam is 64-bit int (r34 == 8)
+ cmp.eq p8,p9 = 8,r34
+};;
+.pred.rel "mutex",p9,p8
+{ .mfi
+(p9) st4 [r33] = GR_SignOfGamma // as 32-bit int
+ fma.s1 FR_A7 = FR_A8,FR_x,FR_A7
+(p13) sub GR_Root = GR_Arg,GR_Root
+}
+{ .mfi
+(p8) st8 [r33] = GR_SignOfGamma // as 64-bit int
+ fma.s1 FR_A5 = FR_A6,FR_x,FR_A5
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fms.s1 FR_w = FR_Req,f1,f1
+(p13) add GR_Root = GR_Root,GR_Dx
+}
+{ .mfi
+ nop.m 0
+ nop.f 0
+(p13) add GR_2xDx = GR_Dx,GR_Dx
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A3 = FR_A4,FR_x,FR_A3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A1 = FR_A2,FR_x,FR_A1
+(p13) cmp.leu.unc p10,p0 = GR_Root,GR_2xDx // p10: bit pattern of x is within GR_Dx of the root's
+};;
+{ .mfi
+ nop.m 0
+ frcpa.s1 FR_InvX,p0 = f1,FR_Req
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p10) fms.s1 FR_rx = FR_NormX,f1,FR_Root
+ nop.i 0
+};;
+{ .mfi
+ getf.exp GR_SignExp = FR_Req
+ fma.s1 FR_x4 = FR_x2,FR_x2,f0
+ nop.i 0
+};;
+{ .mfi
+ getf.sig GR_Sig = FR_Req
+ fma.s1 FR_A5 = FR_A7,FR_x2,FR_A5
+ nop.i 0
+};;
+{ .mfi
+ sub GR_PureExp = GR_SignExp,GR_ExpBias
+ fma.s1 FR_w2 = FR_w,FR_w,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Q32 = FR_P3,FR_w,FR_P2
+ nop.i 0
+};;
+{ .mfi
+ setf.sig FR_int_Ln = GR_PureExp
+ fma.s1 FR_A1 = FR_A3,FR_x2,FR_A1
+ extr.u GR_Ind4T = GR_Sig,55,8
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Q10 = FR_P1,FR_w,f1
+ nop.i 0
+};;
+{ .mfi
+ shladd GR_ad_T = GR_Ind4T,3,GR_ad_Data
+ fms.s1 FR_r = FR_InvX,FR_Req,f1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p10) fms.s1 FR_rx2 = FR_rx,FR_rx,f0
+ nop.i 0
+};;
+{ .mfi
+ ldfd FR_T = [GR_ad_T]
+(p10) fma.s1 FR_R2 = FR_R3,FR_rx,FR_R2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p10) fma.s1 FR_R0 = FR_R1,FR_rx,FR_R0
+ nop.i 0
+};;
+{ .mfi
+ getf.exp GR_Exp = FR_w
+ fma.s1 FR_A1 = FR_A5,FR_x4,FR_A1
+ mov GR_ExpMask = 0x1ffff
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Q32 = FR_Q32, FR_w2,FR_Q10
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r2 = FR_r,FR_r,f0
+ mov GR_fff7 = 0xFFF7
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P32 = FR_P3,FR_r,FR_P2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P10 = FR_P1,FR_r,f1
+ and GR_Exp = GR_ExpMask,GR_Exp
+}
+{ .mfb
+ nop.m 0
+(p10) fma.s.s0 f8 = FR_R2,FR_rx2,FR_R0
+(p10) br.ret.spnt b0 // exit for arguments close to negative roots
+};;
+{ .mfi
+ nop.m 0
+ fcvt.xf FR_N = FR_int_Ln
+ nop.i 0
+}
+{ .mfi
+ cmp.ge p14,p15 = GR_Exp,GR_fff7 // p14 if exponent of w >= 0xFFF7 (2^-8), else p15
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A0 = FR_A1,FR_x,FR_A0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_Ln = FR_Q32,FR_w,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_P32 = FR_P32,FR_r2,FR_P10
+ cmp.eq p6,p7 = 0,GR_Ind
+};;
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_T = FR_N,FR_Ln2,FR_T
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_Ln = FR_P32,FR_r,FR_T
+ nop.i 0
+};;
+.pred.rel "mutex",p6,p7
+{ .mfi
+ nop.m 0
+(p6) fms.s.s0 f8 = FR_A0,FR_x,FR_Ln
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p7) fms.s.s0 f8 = FR_A0,f1,FR_Ln
+ br.ret.sptk b0
+};;
+
+// branch computing ln(GAMMA(x)) for x <= -2^13
+//---------------------------------------------------------------------
+.align 32
+lgammaf_negstirling:
+{ .mfi
+ shladd GR_ad_T = GR_Ind4T,3,GR_ad_Data
+ fms.s1 FR_Xf = FR_NormX,f1,FR_N // xf = x - [x]
+ mov GR_SingBound = 0x10016 // biased exponent of 2^23
+}
+{ .mfi
+ add GR_ad_Co = 0xCA0,GR_ad_Data
+ fma.s1 FR_P32 = FR_P3,FR_r,FR_P2
+ nop.i 0
+};;
+{ .mfi
+ ldfd FR_T = [GR_ad_T]
+ fcvt.xf FR_int_Ln = FR_int_Ln
+ cmp.le p6,p0 = GR_SingBound,GR_Exp // p6: |x| >= 2^23, handled as a singularity
+}
+{ .mfb
+ add GR_ad_Ce = 0x20,GR_ad_Co
+ fma.s1 FR_r2 = FR_r,FR_r,f0
+(p6) br.cond.spnt lgammaf_singularity
+};;
+{ .mfi
+ // load coefficients of polynomial approximation
+ // of ln(sin(Pi*xf)/(Pi*xf)), |xf| <= 0.5
+ ldfpd FR_S16,FR_S14 = [GR_ad_Co],16
+ fma.s1 FR_P10 = FR_P1,FR_r,f1
+ nop.i 0
+}
+{ .mfi
+ ldfpd FR_S12,FR_S10 = [GR_ad_Ce],16
+ fms.s1 FR_xm05 = FR_NormX,f1,FR_05
+ nop.i 0
+};;
+{ .mmi
+ ldfpd FR_S8,FR_S6 = [GR_ad_Co],16
+ ldfpd FR_S4,FR_S2 = [GR_ad_Ce],16
+ nop.i 0
+};;
+{ .mfi
+ getf.sig GR_N = FR_int_Ntrunc // signgam calculation
+ fma.s1 FR_Xf2 = FR_Xf,FR_Xf,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ frcpa.s1 FR_InvXf,p0 = f1,FR_Xf
+ nop.i 0
+};;
+{ .mfi
+ getf.d GR_Arg = FR_Xf
+ fcmp.eq.s1 p6,p0 = FR_NormX,FR_N // p6: x is an integer
+ mov GR_ExpBias = 0x3FF
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_T = FR_int_Ln,FR_Ln2,FR_T
+ extr.u GR_Exp = GR_Arg,52,11
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P32 = FR_P32,FR_r2,FR_P10
+ nop.i 0
+};;
+{ .mfi
+ sub GR_PureExp = GR_Exp,GR_ExpBias
+ fma.s1 FR_S14 = FR_S16,FR_Xf2,FR_S14
+ extr.u GR_Ind4T = GR_Arg,44,8
+}
+{ .mfb
+ mov GR_SignOfGamma = 1 // set signgam to 1 (negated below when GR_N is even)
+ fma.s1 FR_S10 = FR_S12,FR_Xf2,FR_S10
+(p6) br.cond.spnt lgammaf_singularity
+};;
+{ .mfi
+ setf.sig FR_int_Ln = GR_PureExp
+ fms.s1 FR_rf = FR_InvXf,FR_Xf,f1
+ // set p14 if GR_N is even
+ tbit.z p14,p0 = GR_N,0
+}
+{ .mfi
+ shladd GR_ad_T = GR_Ind4T,3,GR_ad_Data
+ fma.s1 FR_Xf4 = FR_Xf2,FR_Xf2,f0
+ nop.i 0
+};;
+{ .mfi
+(p14) sub GR_SignOfGamma = r0,GR_SignOfGamma // set signgam to -1
+ fma.s1 FR_S6 = FR_S8,FR_Xf2,FR_S6
+ nop.i 0
+}
+{ .mfi
+ // set p9 if signgam is 32-bit int
+ // set p10 if signgam is 64-bit int (r34 == 8)
+ cmp.eq p10,p9 = 8,r34
+ fma.s1 FR_S2 = FR_S4,FR_Xf2,FR_S2
+ nop.i 0
+};;
+{ .mfi
+ ldfd FR_Tf = [GR_ad_T]
+ fma.s1 FR_Ln = FR_P32,FR_r,FR_T
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_LnSqrt2Pi = FR_LnSqrt2Pi,f1,FR_NormX
+ nop.i 0
+};;
+.pred.rel "mutex",p9,p10
+{ .mfi
+(p9) st4 [r33] = GR_SignOfGamma // as 32-bit int
+ fma.s1 FR_rf2 = FR_rf,FR_rf,f0
+ nop.i 0
+}
+{ .mfi
+(p10) st8 [r33] = GR_SignOfGamma // as 64-bit int
+ fma.s1 FR_S10 = FR_S14,FR_Xf4,FR_S10
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P32f = FR_P3,FR_rf,FR_P2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Xf8 = FR_Xf4,FR_Xf4,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P10f = FR_P1,FR_rf,f1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S2 = FR_S6,FR_Xf4,FR_S2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fms.s1 FR_Ln = FR_Ln,FR_xm05,FR_LnSqrt2Pi
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fcvt.xf FR_Nf = FR_int_Ln
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S2 = FR_S10,FR_Xf8,FR_S2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Tf = FR_Nf,FR_Ln2,FR_Tf
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P32f = FR_P32f,FR_rf2,FR_P10f // ??????
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_Ln = FR_S2,FR_Xf2,FR_Ln
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Lnf = FR_P32f,FR_rf,FR_Tf
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+ fms.s.s0 f8 = FR_Ln,f1,FR_Lnf
+ br.ret.sptk b0
+};;
+// branch computing ln(GAMMA(x)) for -2^13 < x < -9
+//---------------------------------------------------------------------
+.align 32
+lgammaf_negpoly:
+{ .mfi
+ getf.d GR_Arg = FR_Xf
+ frcpa.s1 FR_InvXf,p0 = f1,FR_Xf
+ mov GR_ExpBias = 0x3FF
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Xf2 = FR_Xf,FR_Xf,f0
+ nop.i 0
+};;
+{ .mfi
+ getf.sig GR_N = FR_int_Ntrunc
+ fcvt.xf FR_N = FR_int_Ln
+ mov GR_SignOfGamma = 1 // signgam starts at 1 (negated below when GR_N is even)
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A9 = FR_A10,FR_x,FR_A9
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P10 = FR_P1,FR_r,f1
+ extr.u GR_Exp = GR_Arg,52,11
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_x4 = FR_x2,FR_x2,f0
+ nop.i 0
+};;
+{ .mfi
+ sub GR_PureExp = GR_Exp,GR_ExpBias
+ fma.s1 FR_A7 = FR_A8,FR_x,FR_A7
+ tbit.z p14,p0 = GR_N,0 // set p14 if GR_N is even
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A5 = FR_A6,FR_x,FR_A5
+ nop.i 0
+};;
+{ .mfi
+ setf.sig FR_int_Ln = GR_PureExp
+ fma.s1 FR_A3 = FR_A4,FR_x,FR_A3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A1 = FR_A2,FR_x,FR_A1
+(p14) sub GR_SignOfGamma = r0,GR_SignOfGamma // set signgam to -1
+};;
+{ .mfi
+ nop.m 0
+ fms.s1 FR_rf = FR_InvXf,FR_Xf,f1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Xf4 = FR_Xf2,FR_Xf2,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S14 = FR_S16,FR_Xf2,FR_S14
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S10 = FR_S12,FR_Xf2,FR_S10
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_T = FR_N,FR_Ln2,FR_T
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P32 = FR_P32,FR_r2,FR_P10
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S6 = FR_S8,FR_Xf2,FR_S6
+ extr.u GR_Ind4T = GR_Arg,44,8
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S2 = FR_S4,FR_Xf2,FR_S2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A7 = FR_A9,FR_x2,FR_A7
+ nop.i 0
+}
+{ .mfi
+ shladd GR_ad_T = GR_Ind4T,3,GR_ad_Data
+ fma.s1 FR_A3 = FR_A5,FR_x2,FR_A3
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Xf8 = FR_Xf4,FR_Xf4,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rf2 = FR_rf,FR_rf,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P32f = FR_P3,FR_rf,FR_P2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P10f = FR_P1,FR_rf,f1
+ nop.i 0
+};;
+{ .mfi
+ ldfd FR_Tf = [GR_ad_T]
+ fma.s1 FR_Ln = FR_P32,FR_r,FR_T
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A0 = FR_A1,FR_x,FR_A0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S10 = FR_S14,FR_Xf4,FR_S10
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S2 = FR_S6,FR_Xf4,FR_S2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fcvt.xf FR_Nf = FR_int_Ln
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A3 = FR_A7,FR_x4,FR_A3
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fcmp.eq.s1 p13,p0 = FR_NormX,FR_Ntrunc // p13: x is an integer
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_x3 = FR_x2,FR_x,f0 // -x^3
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P32f = FR_P32f,FR_rf2,FR_P10f
+ nop.i 0
+};;
+{ .mfb
+ // set p9 if signgam is 32-bit int
+ // set p10 if signgam is 64-bit int (r34 == 8)
+ cmp.eq p10,p9 = 8,r34
+ fma.s1 FR_S2 = FR_S10,FR_Xf8,FR_S2
+(p13) br.cond.spnt lgammaf_singularity
+};;
+.pred.rel "mutex",p9,p10
+{ .mmf
+(p9) st4 [r33] = GR_SignOfGamma // as 32-bit int
+(p10) st8 [r33] = GR_SignOfGamma // as 64-bit int
+ fms.s1 FR_A0 = FR_A3,FR_x3,FR_A0 // -A3*x^3-A0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Tf = FR_Nf,FR_Ln2,FR_Tf
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Ln = FR_S2,FR_Xf2,FR_Ln // S2*Xf^2+Ln
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Lnf = FR_P32f,FR_rf,FR_Tf
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fms.s1 FR_Ln = FR_A0,f1,FR_Ln
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+ fms.s.s0 f8 = FR_Ln,f1,FR_Lnf
+ br.ret.sptk b0
+};;
+// branch for handling +/-0, NaT, NaN, +/-INF and denormalised numbers
+//---------------------------------------------------------------------
+.align 32
+lgammaf_spec:
+{ .mfi
+ getf.exp GR_SignExp = FR_NormX
+ fclass.m p6,p0 = f8,0x21 // is arg +INF?
+ mov GR_SignOfGamma = 1 // set signgam to 1
+};;
+{ .mfi
+ getf.sig GR_Sig = FR_NormX
+ fclass.m p7,p0 = f8,0xB // is x denormal?
+ // set p11 if signgam is 32-bit int
+ // set p12 if signgam is 64-bit int (r34 == 8)
+ cmp.eq p12,p11 = 8,r34
+};;
+.pred.rel "mutex",p11,p12
+{ .mfi
+ // store sign of gamma(x) as 32-bit int
+(p11) st4 [r33] = GR_SignOfGamma
+ fclass.m p8,p0 = f8,0x1C0 // is arg NaT or NaN?
+ dep.z GR_Ind = GR_SignExp,3,4
+}
+{ .mib
+ // store sign of gamma(x) as 64-bit int
+(p12) st8 [r33] = GR_SignOfGamma
+ and GR_Exp = GR_ExpMask,GR_SignExp
+(p6) br.ret.spnt b0 // exit for +INF
+};;
+{ .mfi
+ sub GR_PureExp = GR_Exp,GR_ExpBias
+ fclass.m p9,p0 = f8,0x22 // is arg -INF?
+ extr.u GR_Ind4T = GR_Sig,55,8
+}
+{ .mfb
+ nop.m 0
+(p7) fma.s0 FR_tmp = f1,f1,f8
+(p7) br.cond.sptk lgammaf_core // denormal x rejoins the main path with values recomputed from FR_NormX
+};;
+{ .mfb
+ nop.m 0
+(p8) fms.s.s0 f8 = f8,f1,f8
+(p8) br.ret.spnt b0 // exit for NaT and NaN
+};;
+{ .mfb
+ nop.m 0
+(p9) fmerge.s f8 = f1,f8 // take the sign of 1.0: result is +INF
+(p9) br.ret.spnt b0 // exit for -INF
+};; // remaining case (+/-0) falls through into lgammaf_singularity
+// branch for handling negative integers and +/-0
+//---------------------------------------------------------------------
+.align 32
+lgammaf_singularity:
+{ .mfi
+ mov GR_SignOfGamma = 1 // set signgam to 1
+ fclass.m p6,p0 = f8,0x6 // is x -0?
+ mov GR_TAG = 109 // negative
+}
+{ .mfi
+ mov GR_ad_SignGam = r33
+ fma.s1 FR_X = f0,f0,f8 // keep the original argument for the error handler
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ frcpa.s0 f8,p0 = f1,f0 // result is produced as 1/0 in status field s0
+ // set p9 if signgam is 32-bit int
+ // set p10 if signgam is 64-bit int (r34 == 8)
+ cmp.eq p10,p9 = 8,r34
+}
+{ .mib
+ nop.m 0
+(p6) sub GR_SignOfGamma = r0,GR_SignOfGamma // signgam = -1 for x = -0
+ br.cond.sptk lgammaf_libm_err
+};;
+// overflow (x >= OVERFLOW_BOUNDARY)
+//---------------------------------------------------------------------
+.align 32
+lgammaf_overflow:
+{ .mfi
+ nop.m 0
+ nop.f 0
+ mov r8 = 0x1FFFE
+};;
+{ .mfi
+ setf.exp f9 = r8
+ fmerge.s FR_X = f8,f8 // keep the original argument for the error handler
+ mov GR_TAG = 108 // overflow
+};;
+{ .mfi
+ mov GR_ad_SignGam = r33
+ nop.f 0
+ // set p9 if signgam is 32-bit int
+ // set p10 if signgam is 64-bit int (r34 == 8)
+ cmp.eq p10,p9 = 8,r34
+}
+{ .mfi
+ nop.m 0
+ fma.s.s0 f8 = f9,f9,f0 // Set I,O and +INF result
+ nop.i 0
+};; // falls through into lgammaf_libm_err
+// gate to __libm_error_support#: stores signgam, then runs on into __libm_error_region
+//---------------------------------------------------------------------
+.align 32
+lgammaf_libm_err:
+{ .mmi
+ alloc r32 = ar.pfs,1,4,4,0
+ mov GR_Parameter_TAG = GR_TAG
+ nop.i 0
+};;
+.pred.rel "mutex",p9,p10
+{ .mmi
+ // store sign of gamma(x) as 32-bit int
+(p9) st4 [GR_ad_SignGam] = GR_SignOfGamma
+ // store sign of gamma(x) as 64-bit int
+(p10) st8 [GR_ad_SignGam] = GR_SignOfGamma
+ nop.i 0
+};; // no branch here: execution continues with the code that follows
+GLOBAL_LIBM_END(__libm_lgammaf)
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+.prologue
+{ .mfi // builds a 64-byte frame, passes X, Y and RESULT by address to __libm_error_support#
+ add GR_Parameter_Y=-32,sp // Parameter 2 address
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+{ .mmi
+ stfs [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+.body
+{ .mib
+ stfs [GR_Parameter_X] = FR_X // STORE Parameter 1
+ // on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
+}
+{ .mib
+ stfs [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3
+ // on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y
+ br.call.sptk b0=__libm_error_support# // Call error handling
+ // function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp // address of the RESULT slot after the call
+};;
+{ .mmi
+ ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+.type __libm_error_support#,@function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/libm_lgammal.S b/sysdeps/ia64/fpu/libm_lgammal.S
new file mode 100644
index 0000000000..056171b7d2
--- /dev/null
+++ b/sysdeps/ia64/fpu/libm_lgammal.S
@@ -0,0 +1,7676 @@
+.file "libm_lgammal.s"
+
+
+// Copyright (c) 2002 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2002 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
+// LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,
+// EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code,and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//*********************************************************************
+//
+// History:
+// 03/28/02 Original version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 08/21/02 Added support of SIGN(GAMMA(x)) calculation
+// 09/26/02 Algorithm description improved
+// 10/21/02 Now it returns SIGN(GAMMA(x))=-1 for negative zero
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+//*********************************************************************
+//
+// Function: __libm_lgammal(long double x, int* signgam, int szsigngam)
+// computes the principal value of the logarithm of the GAMMA function
+// of x. Signum of GAMMA(x) is stored to memory starting at the address
+// specified by the signgam.
+//
+//*********************************************************************
+//
+// Resources Used:
+//
+// Floating-Point Registers: f8 (Input and Return Value)
+// f9-f15
+// f32-f127
+//
+// General Purpose Registers:
+// r2, r3, r8-r11, r14-r31
+// r32-r65
+// r66-r69 (Used to pass arguments to error handling routine)
+//
+// Predicate Registers: p6-p15
+//
+//*********************************************************************
+//
+// IEEE Special Conditions:
+//
+// __libm_lgammal(+inf) = +inf
+// __libm_lgammal(-inf) = QNaN
+// __libm_lgammal(+/-0) = +inf
+// __libm_lgammal(x<0, x - integer) = QNaN
+// __libm_lgammal(SNaN) = QNaN
+// __libm_lgammal(QNaN) = QNaN
+//
+//*********************************************************************
+//
+// ALGORITHM DESCRIPTION
+//
+// Below we suppose that there is a log(z) function which takes a long
+// double argument and returns the result as a pair of long double numbers
+// lnHi and lnLo (such that the sum lnHi + lnLo provides ~80 correct bits
+// of significand). For the algorithm description of such a log(z) function
+// see below.
+// Also, in this algorithm description we use the following notational
+// conventions:
+// a) pair A = (Ahi, Alo) means number A represented as sum of Ahi and Alo
+// b) C = A + B = (Ahi, Alo) + (Bhi, Blo) means multi-precision addition.
+// The result is C = (Chi, Clo). Notice that Clo is, in general, not
+// equal to Alo + Blo.
+// c) D = A*B = (Ahi, Alo)*(Bhi, Blo) = (Dhi, Dlo) means multi-precision
+// multiplication.
+//
+// So, lgammal has the following computational paths:
+// 1) |x| < 0.5
+// P = A1*|x| + A2*|x|^2 + ... + A22*|x|^22
+// A1, A2, A3 represented as a sum of two double precision
+// numbers and multi-precision computations are used for 3 higher
+// terms of the polynomial. We get polynomial as a sum of two
+// double extended numbers: P = (Phi, Plo)
+// 1.1) x > 0
+// lgammal(x) = P - log(|x|) = (Phi, Plo) - (lnHi(|x|), lnLo(|x|))
+// 1.2) x < 0
+// lgammal(x) = -P - log(|x|) - log(sin(Pi*x)/(Pi*x))
+// P and log(|x|) are computed by the same way as in 1.1;
+// - log(sin(Pi*x)/(Pi*x)) is approximated by a polynomial Plnsin.
+// Plnsin:= fLnSin2*|x|^2 + fLnSin4*|x|^4 + ... + fLnSin36*|x|^36
+// The first coefficient of Plnsin is represented as sum of two
+// double precision numbers (fLnSin2, fLnSin2L). Multi-precision
+// computations for higher two terms of Plnsin are used.
+// So, the final result is reconstructed by the following formula
+// lgammal(x) = (-(Phi, Plo) - (lnHi(|x|), lnLo(|x|))) -
+// - (PlnsinHi,PlnsinLo)
+//
+// 2) 0.5 <= x < 0.75 -> t = x - 0.625
+// -0.75 < x <= -0.5 -> t = x + 0.625
+// 2.25 <= x < 4.0 -> t = x/2 - 1.5
+// 4.0 <= x < 8.0 -> t = x/4 - 1.5
+// -0.5 < x <= -0.40625 -> t = x + 0.5
+// -2.6005859375 < x <= -2.5 -> t = x + 2.5
+// 1.3125 <= x < 1.5625 -> t = x - LOC_MIN, where LOC_MIN is point in
+// which lgammal has local minimum. Exact
+// value can be found in the table below,
+// approximate value is ~1.46
+//
+// lgammal(x) is approximated by the polynomial of 25th degree: P25(t)
+// P25(t) = A0 + A1*t + ... + A25*t^25 = (Phi, Plo) + t^4*P21(t),
+// where
+// (Phi, Plo) is sum of four highest terms of the polynomial P25(t):
+// (Phi, Plo) = ((A0, A0L) + (A1, A1L)*t) + t^2 *((A2, A2L) + (A3, A3L)*t),
+// (Ai, AiL) - coefficients represented as pairs of DP numbers.
+//
+// P21(t) = (PolC(t)*t^8 + PolD(t))*t^8 + PolE(t),
+// where
+// PolC(t) = C21*t^5 + C20*t^4 + ... + C16,
+// C21 = A25, C20 = A24, ..., C16 = A20
+//
+// PolD(t) = D7*t^7 + D6*t^6 + ... + D0,
+// D7 = A19, D6 = A18, ..., D0 = A12
+//
+// PolE(t) = E7*t^7 + E6*t^6 + ... + E0,
+// E7 = A11, E6 = A10, ..., E0 = A4
+//
+// Cis and Dis are represented as double precision numbers,
+// Eis are represented as double extended numbers.
+//
+// 3) 0.75 <= x < 1.3125 -> t = x - 1.0
+// 1.5625 <= x < 2.25 -> t = x - 2.0
+// lgammal(x) is approximated by the polynomial of 25th degree: P25(t)
+// P25(t) = A1*t + ... + A25*t^25, and computations are carried out
+// by similar way as in the previous case
+//
+// 4) 10.0 < x <= Overflow Bound ("positive Stirling" range)
+// lgammal(x) is approximated using Stirling's formula:
+// lgammal(x) ~ ((x*(lnHi(x) - 1, lnLo(x))) - 0.5*(lnHi(x), lnLo(x))) +
+// + ((Chi, Clo) + S(1/x))
+// where
+// C = (Chi, Clo) - pair of double precision numbers representing constant
+// 0.5*ln(2*Pi);
+// S(1/x) = 1/x * (B2 + B4*(1/x)^2 + ... + B20*(1/x)^18), B2, ..., B20 are
+// Bernoulli numbers. S is computed in native precision and then added to
+// Clo;
+// lnHi(x) - 1 is computed in native precision and the multiprecision
+// multiplication (x, 0) *(lnHi(x) - 1, lnLo(x)) is used.
+//
+// 5) -INF < x <= -2^63, any negative integer < 0
+// All numbers in this range are integers -> error handler is called
+//
+// 6) -2^63 < x <= -0.75 ("negative Stirling" range), x is "far" from root,
+// lgammal(-t) for positive t is approximated using the following formula:
+// lgammal(-t) = -lgammal(t)-log(t)-log(|dT|)+log(sin(Pi*|dT|)/(Pi*|dT|))
+// where dT = -t -round_to_nearest_integer(-t)
+// Last item is approximated by the same polynomial as described in 1.2.
+// We split the whole range into three subranges due to different ways of
+// approximation of the first terms.
+// 6.1) -2^63 < x < -6.0 ("negative Stirling" range)
+// lgammal(t) is approximated exactly as in #4. The only difference is that
+// for the -13.0 < x < -6.0 subrange, instead of Bernoulli numbers we use their
+// minimax approximation on this range.
+// log(t), log(|dT|) are approximated by the log routine mentioned above.
+// 6.2) -6.0 < x <= -0.75, |x + 1|> 2^(-7)
+// log(t), log(|dT|) are approximated by the log routine mentioned above,
+// lgammal(t) is approximated by polynomials of the 25th degree similar
+// to ones from #2. Arguments z of the polynomials are as follows
+// a) 0.75 <= t < 1.0 - 2^(-7), z = 2*t - 1.5
+// b) 1.0 - 2^(-7) < t < 2.0, z = t - 1.5
+// c) 2.0 < t < 3.0, z = t/2 - 1.5
+// d) 3.0 < t < 4.0, z = t/2 - 1.5. Notice, that range reduction is
+// the same as in case c) but the set of coefficients is different
+// e) 4.0 < t < 6.0, z = t/4 - 1.5
+// 6.3) |x + 1| <= 2^(-7)
+// log(1 + (x-1)) is approximated by Taylor series,
+// log(sin(Pi*|dT|)/(Pi*|dT|)) is still approximated by polynomial but
+// it has just 4th degree.
+// log(|dT|) is approximated by the log routine mentioned above.
+// lgammal(-x) is approximated by polynomial of 8th degree from (-x + 1).
+//
+// 7) -20.0 < x < -2.0, x falls in root "neighbourhood".
+// "Neighbourhood" means that |lgammal(x)| < epsilon, where epsilon is
+// different for every root (and it is stored in the table), but typically
+// it is ~ 0.15. There are 35 roots significant from "double extended"
+// point of view. We split all the roots into two subsets: "left" and "right"
+// roots. Considering [-(N+1), -N] range we call root as "left" one if it
+// lies closer to -(N+1) and "right" otherwise. There is no "left" root in
+// the [-20, -19] range (it exists, but is insignificant for double extended
+// precision). To determine if x falls in root "neighbourhood" we store
+// significands of all the 35 roots as well as epsilon values (expressed
+// by the left and right bound).
+// In these ranges we approximate lgammal(x) by polynomial series of 19th
+// degree:
+// lgammal(x) = P19(t) = A0 + A1*t + ...+ A19*t^19, where t = x - EDP_Root,
+// EDP_Root is the exact value of the corresponding root rounded to double
+// extended precision. So, we have 35 different polynomials which make our
+// table rather big. We may hope that x falls in root "neighbourhood"
+// quite rarely -> there might be no need for frequent use of different
+// polynomials.
+// A0, A1, A2, A3 are represented as pairs of double precision numbers,
+// A4, A5 are long doubles, and to decrease the size of the table we
+// keep the rest of coefficients in just double precision
+//
+//*********************************************************************
+// Algorithm for log(X) = (lnHi(X), lnLo(X))
+//
+// ALGORITHM
+//
+// Here we use a table lookup method. The basic idea is that in
+// order to compute logl(Arg) for an argument Arg in [1,2), we
+// construct a value G such that G*Arg is close to 1 and that
+// logl(1/G) is obtainable easily from a table of values calculated
+// beforehand. Thus
+//
+// logl(Arg) = logl(1/G) + logl(G*Arg)
+// = logl(1/G) + logl(1 + (G*Arg - 1))
+//
+// Because |G*Arg - 1| is small, the second term on the right hand
+// side can be approximated by a short polynomial. We elaborate
+// this method in four steps.
+//
+// Step 0: Initialization
+//
+// We need to calculate logl( X ). Obtain N, S_hi such that
+//
+// X = 2^N * S_hi exactly
+//
+// where S_hi in [1,2)
+//
+// Step 1: Argument Reduction
+//
+// Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
+//
+// G := G_1 * G_2 * G_3
+// r := (G * S_hi - 1)
+//
+// These G_j's have the property that the product is exactly
+// representable and that |r| < 2^(-12) as a result.
+//
+// Step 2: Approximation
+//
+//
+// logl(1 + r) is approximated by a short polynomial poly(r).
+//
+// Step 3: Reconstruction
+//
+//
+// Finally, logl( X ) is given by
+//
+// logl( X ) = logl( 2^N * S_hi )
+// ~=~ N*logl(2) + logl(1/G) + logl(1 + r)
+// ~=~ N*logl(2) + logl(1/G) + poly(r).
+//
+// IMPLEMENTATION
+//
+// Step 0. Initialization
+// ----------------------
+//
+// Z := X
+// N := unbiased exponent of Z
+// S_hi := 2^(-N) * Z
+//
+// Step 1. Argument Reduction
+// --------------------------
+//
+// Let
+//
+// Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63
+//
+// We obtain G_1, G_2, G_3 by the following steps.
+//
+//
+// Define X_0 := 1.d_1 d_2 ... d_14. This is extracted
+// from S_hi.
+//
+// Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated
+// to lsb = 2^(-4).
+//
+// Define index_1 := [ d_1 d_2 d_3 d_4 ].
+//
+// Fetch Z_1 := (1/A_1) rounded UP in fixed point with
+// fixed point lsb = 2^(-15).
+// Z_1 looks like z_0.z_1 z_2 ... z_15
+// Note that the fetching is done using index_1.
+// A_1 is actually not needed in the implementation
+// and is used here only to explain how is the value
+// Z_1 defined.
+//
+// Fetch G_1 := (1/A_1) truncated to 21 sig. bits.
+// floating pt. Again, fetching is done using index_1. A_1
+// explains how G_1 is defined.
+//
+// Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14)
+// = 1.0 0 0 0 d_5 ... d_14
+// This is accomplished by integer multiplication.
+// It is proved that X_1 indeed always begins
+// with 1.0000 in fixed point.
+//
+//
+// Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1
+// truncated to lsb = 2^(-8). Similar to A_1,
+// A_2 is not needed in actual implementation. It
+// helps explain how some of the values are defined.
+//
+// Define index_2 := [ d_5 d_6 d_7 d_8 ].
+//
+// Fetch Z_2 := (1/A_2) rounded UP in fixed point with
+// fixed point lsb = 2^(-15). Fetch done using index_2.
+// Z_2 looks like z_0.z_1 z_2 ... z_15
+//
+// Fetch G_2 := (1/A_2) truncated to 21 sig. bits.
+// floating pt.
+//
+// Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14)
+// = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14
+// This is accomplished by integer multiplication.
+// It is proved that X_2 indeed always begins
+// with 1.00000000 in fixed point.
+//
+//
+// Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1.
+// This is 2^(-14) + X_2 truncated to lsb = 2^(-13).
+//
+// Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ].
+//
+// Fetch G_3 := (1/A_3) truncated to 21 sig. bits.
+// floating pt. Fetch is done using index_3.
+//
+// Compute G := G_1 * G_2 * G_3.
+//
+// This is done exactly since each of G_j only has 21 sig. bits.
+//
+// Compute
+//
+// r := (G*S_hi - 1)
+//
+//
+// Step 2. Approximation
+// ---------------------
+//
+// This step computes an approximation to logl( 1 + r ) where r is the
+// reduced argument just obtained. It is proved that |r| <= 1.9*2^(-13);
+// thus logl(1+r) can be approximated by a short polynomial:
+//
+// logl(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5
+//
+//
+// Step 3. Reconstruction
+// ----------------------
+//
+// This step computes the desired result of logl(X):
+//
+// logl(X) = logl( 2^N * S_hi )
+// = N*logl(2) + logl( S_hi )
+// = N*logl(2) + logl(1/G) +
+// logl(1 + G*S_hi - 1 )
+//
+// logl(2), logl(1/G_j) are stored as pairs of (single,double) numbers:
+// log2_hi, log2_lo, log1byGj_hi, log1byGj_lo. The high parts are
+// single-precision numbers and the low parts are double precision
+// numbers. These have the property that
+//
+// N*log2_hi + SUM ( log1byGj_hi )
+//
+// is computable exactly in double-extended precision (64 sig. bits).
+// Finally
+//
+// lnHi(X) := N*log2_hi + SUM ( log1byGj_hi )
+// lnLo(X) := poly_hi + [ poly_lo +
+// ( SUM ( log1byGj_lo ) + N*log2_lo ) ]
+//
+//
+//*********************************************************************
+// General Purpose Registers (symbolic names; some names below share one register)
+// scratch registers
+rPolDataPtr = r2
+rLnSinDataPtr = r3
+rExpX = r8
+rSignifX = r9
+rDelta = r10
+rSignExpX = r11
+GR_ad_z_1 = r14 // NOTE(review): GR_ad_z_1 is assigned again below (r30); presumably the later assignment is the effective one -- confirm
+r17Ones = r15
+GR_Index1 = r16
+rSignif1andQ = r17
+GR_X_0 = r18
+GR_X_1 = r19
+GR_X_2 = r20
+GR_Z_1 = r21
+GR_Z_2 = r22
+GR_N = r23
+rExpHalf = r24
+rExp8 = r25
+rX0Dx = r25 // shares r25 with rExp8
+GR_ad_tbl_1 = r26
+GR_ad_tbl_2 = r27
+GR_ad_tbl_3 = r28
+GR_ad_q = r29
+GR_ad_z_1 = r30 // second assignment of GR_ad_z_1 (the first one above names r14)
+GR_ad_z_2 = r31
+// stacked registers
+rPFS_SAVED = r32
+GR_ad_z_3 = r33
+rSgnGamAddr = r34
+rSgnGamSize = r35
+rLogDataPtr = r36
+rZ1offsett = r37
+rTmpPtr = r38
+rTmpPtr2 = r39
+rTmpPtr3 = r40
+rExp2 = r41
+rExp2tom7 = r42
+rZ625 = r42 // shares r42 with rExp2tom7
+rExpOne = r43
+rNegSingularity = r44
+rXint = r45
+rTbl1Addr = r46
+rTbl2Addr = r47
+rTbl3Addr = r48
+rZ2Addr = r49
+rRootsAddr = r50
+rRootsBndAddr = r51
+rRoot = r52
+rRightBound = r53
+rLeftBound = r54
+rSignifDx = r55
+rBernulliPtr = r56
+rLnSinTmpPtr = r56 // shares r56 with rBernulliPtr
+rIndex1Dx = r57
+rIndexPol = r58
+GR_Index3 = r59
+GR_Index2 = r60
+rSgnGam = r61
+rXRnd = r62
+
+GR_SAVE_B0 = r63
+GR_SAVE_GP = r64
+GR_SAVE_PFS = r65
+// output parameters when calling error handling routine (r66-r69)
+GR_Parameter_X = r66
+GR_Parameter_Y = r67
+GR_Parameter_RESULT = r68
+GR_Parameter_TAG = r69
+
+//********************************************************************
+// Floating Point Registers
+// CAUTION: due to the lack of registers there exist (below in the code)
+// sometimes "unconventional" use of declared registers; several names
+// defined below are assigned to the same register
+fAbsX = f6
+fDelX4 = f6
+fSignifX = f7
+// macros for error handling routine
+FR_X = f10 // first argument (f10 is also named fB4 and fX4 below)
+FR_Y = f1 // second argument (lgammal has just one)
+FR_RESULT = f8 // result
+
+// First 7 Bernoulli numbers (fB2 ... fB14); each register has further names
+fB2 = f9
+fLnDeltaL = f9
+fXSqr = f9
+fB4 = f10
+fX4 = f10
+fB6 = f11
+fX6 = f11
+fB8 = f12
+fXSqrL = f12
+fB10 = f13
+fRes7H = f13
+fB12 = f14
+fRes7L = f14
+fB14 = f15
+
+// stack registers (f32-f127); many of them carry more than one name
+// Polynomial coefficients: A0, ..., A25
+fA0 = f32
+fA0L = f33
+fInvXL = f33
+fA1 = f34
+fA1L = f35
+fA2 = f36
+fA2L = f37
+fA3 = f38
+fA3L = f39
+fA4 = f40
+fA4L = f41
+fRes6H = f41
+fA5 = f42
+fB2L = f42
+fA5L = f43
+fMinNegStir = f43
+fRes6L = f43
+fA6 = f44
+fMaxNegStir = f44
+fA7 = f45
+fLnDeltaH = f45
+fA8 = f46
+fBrnL = f46
+fA9 = f47
+fBrnH = f47
+fA10 = f48
+fRes5L = f48
+fA11 = f49
+fRes5H = f49
+fA12 = f50
+fDx6 = f50
+fA13 = f51
+fDx8 = f51
+fA14 = f52
+fDx4 = f52
+fA15 = f53
+fYL = f53
+fh3Dx = f53
+fA16 = f54
+fYH = f54
+fH3Dx = f54
+fA17 = f55
+fResLnDxL = f55
+fG3Dx = f55
+fA18 = f56
+fResLnDxH = f56
+fh2Dx = f56
+fA19 = f57
+fFloatNDx = f57
+fA20 = f58
+fPolyHiDx = f58
+fhDx = f58
+fA21 = f59
+fRDxCub = f59
+fHDx = f59
+fA22 = f60
+fRDxSq = f60
+fGDx = f60
+fA23 = f61
+fPolyLoDx = f61
+fInvX3 = f61
+fA24 = f62
+fRDx = f62
+fInvX8 = f62
+fA25 = f63
+fInvX4 = f63
+fPol = f64
+fPolL = f65
+// Coefficients of ln(sin(Pi*x)/(Pi*x))
+fLnSin2 = f66
+fLnSin2L = f67
+fLnSin4 = f68
+fLnSin6 = f69
+fLnSin8 = f70
+fLnSin10 = f71
+fLnSin12 = f72
+fLnSin14 = f73
+fLnSin16 = f74
+fLnSin18 = f75
+fDelX8 = f75
+fLnSin20 = f76
+fLnSin22 = f77
+fDelX6 = f77
+fLnSin24 = f78
+fLnSin26 = f79
+fLnSin28 = f80
+fLnSin30 = f81
+fhDelX = f81
+fLnSin32 = f82
+fLnSin34 = f83
+fLnSin36 = f84
+fXint = f85
+fDxSqr = f85
+fRes3L = f86
+fRes3H = f87
+fRes4H = f88
+fRes4L = f89
+fResH = f90
+fResL = f91
+fDx = f92
+FR_MHalf = f93
+fRes1H = f94
+fRes1L = f95
+fRes2H = f96
+fRes2L = f97
+FR_FracX = f98
+fRcpX = f99
+fLnSinH = f99
+fTwo = f100
+fMOne = f100
+FR_G = f101
+FR_H = f102
+FR_h = f103
+FR_G2 = f104
+FR_H2 = f105
+FR_poly_lo = f106
+FR_poly_hi = f107
+FR_h2 = f108
+FR_rsq = f109
+FR_r = f110
+FR_log2_hi = f111
+FR_log2_lo = f112
+fFloatN = f113
+FR_Q4 = f114
+FR_G3 = f115
+FR_H3 = f116
+FR_h3 = f117
+FR_Q3 = f118
+FR_Q2 = f119
+FR_Q1 = f120
+fThirteen = f121
+fSix = f121
+FR_rcub = f121
+// Last three Bernoulli numbers
+fB16 = f122
+fB18 = f123
+fB20 = f124
+fInvX = f125
+fLnSinL = f125
+fDxSqrL = f126
+fFltIntX = f126
+fRoot = f127
+fNormDx = f127
+
+// Data tables
+//==============================================================
+RODATA
+// ************* DO NOT CHANGE THE ORDER OF THESE TABLES *************
+.align 16
+LOCAL_OBJECT_START(lgammal_right_roots_data)
+// List of all right roots themselves (18 entries: 64-bit significand, then sign/exponent word)
+data8 0x9D3FE4B007C360AB, 0x0000C000 // Range [-3, -2]
+data8 0xC9306DE4F2CD7BEE, 0x0000C000 // Range [-4, -3]
+data8 0x814273C2CCAC0618, 0x0000C001 // Range [-5, -4]
+data8 0xA04352BF85B6C865, 0x0000C001 // Range [-6, -5]
+data8 0xC00B592C4BE4676C, 0x0000C001 // Range [-7, -6]
+data8 0xE0019FEF6FF0F5BF, 0x0000C001 // Range [-8, -7]
+data8 0x80001A01459FC9F6, 0x0000C002 // Range [-9, -8]
+data8 0x900002E3BB47D86D, 0x0000C002 // Range [-10, -9]
+data8 0xA0000049F93BB992, 0x0000C002 // Range [-11, -10]
+data8 0xB0000006B9915316, 0x0000C002 // Range [-12, -11]
+data8 0xC00000008F76C773, 0x0000C002 // Range [-13, -12]
+data8 0xD00000000B09230A, 0x0000C002 // Range [-14, -13]
+data8 0xE000000000C9CBA5, 0x0000C002 // Range [-15, -14]
+data8 0xF0000000000D73FA, 0x0000C002 // Range [-16, -15]
+data8 0x8000000000006BA0, 0x0000C003 // Range [-17, -16]
+data8 0x8800000000000655, 0x0000C003 // Range [-18, -17]
+data8 0x900000000000005A, 0x0000C003 // Range [-19, -18]
+data8 0x9800000000000005, 0x0000C003 // Range [-20, -19]
+// List of bounds of ranges with special polynomial approximation near root
+// Only significands of bounds are actually stored (larger significand first in each pair)
+data8 0xA000000000000000, 0x9800000000000000 // Bounds for root on [-3, -2]
+data8 0xCAB88035C5EFBB41, 0xC7E05E31F4B02115 // Bounds for root on [-4, -3]
+data8 0x817831B899735C72, 0x8114633941B8053A // Bounds for root on [-5, -4]
+data8 0xA04E8B34C6AA9476, 0xA039B4A42978197B // Bounds for root on [-6, -5]
+data8 0xC00D3D5E588A78A9, 0xC009BA25F7E858A6 // Bounds for root on [-7, -6]
+data8 0xE001E54202991EB4, 0xE001648416CE897F // Bounds for root on [-8, -7]
+data8 0x80001E56D13A6B9F, 0x8000164A3BAD888A // Bounds for root on [-9, -8]
+data8 0x9000035F0529272A, 0x9000027A0E3D94F0 // Bounds for root on [-10, -9]
+data8 0xA00000564D705880, 0xA000003F67EA0CC7 // Bounds for root on [-11, -10]
+data8 0xB0000007D87EE0EF, 0xB0000005C3A122A5 // Bounds for root on [-12, -11]
+data8 0xC0000000A75FE8B1, 0xC00000007AF818AC // Bounds for root on [-13, -12]
+data8 0xD00000000CDFFE36, 0xD000000009758BBF // Bounds for root on [-14, -13]
+data8 0xE000000000EB6D96, 0xE000000000ACF7B2 // Bounds for root on [-15, -14]
+data8 0xF0000000000FB1F9, 0xF0000000000B87FB // Bounds for root on [-16, -15]
+data8 0x8000000000007D90, 0x8000000000005C40 // Bounds for root on [-17, -16]
+data8 0x8800000000000763, 0x880000000000056D // Bounds for root on [-18, -17]
+data8 0x9000000000000069, 0x900000000000004D // Bounds for root on [-19, -18]
+data8 0x9800000000000006, 0x9800000000000005 // Bounds for root on [-20, -19]
+// List of all left roots themselves (17 entries plus one pad entry; same layout as the right roots)
+data8 0xAFDA0850DEC8065E, 0x0000C000 // Range [-3, -2]
+data8 0xFD238AA3E17F285C, 0x0000C000 // Range [-4, -3]
+data8 0x9FBABBD37757E6A2, 0x0000C001 // Range [-5, -4]
+data8 0xBFF497AC8FA06AFC, 0x0000C001 // Range [-6, -5]
+data8 0xDFFE5FBB5C377FE8, 0x0000C001 // Range [-7, -6]
+data8 0xFFFFCBFC0ACE7879, 0x0000C001 // Range [-8, -7]
+data8 0x8FFFFD1C425E8100, 0x0000C002 // Range [-9, -8]
+data8 0x9FFFFFB606BDFDCD, 0x0000C002 // Range [-10, -9]
+data8 0xAFFFFFF9466E9F1B, 0x0000C002 // Range [-11, -10]
+data8 0xBFFFFFFF70893874, 0x0000C002 // Range [-12, -11]
+data8 0xCFFFFFFFF4F6DCF6, 0x0000C002 // Range [-13, -12]
+data8 0xDFFFFFFFFF36345B, 0x0000C002 // Range [-14, -13]
+data8 0xEFFFFFFFFFF28C06, 0x0000C002 // Range [-15, -14]
+data8 0xFFFFFFFFFFFF28C0, 0x0000C002 // Range [-16, -15]
+data8 0x87FFFFFFFFFFF9AB, 0x0000C003 // Range [-17, -16]
+data8 0x8FFFFFFFFFFFFFA6, 0x0000C003 // Range [-18, -17]
+data8 0x97FFFFFFFFFFFFFB, 0x0000C003 // Range [-19, -18]
+data8 0x0000000000000000, 0x00000000 // pad to keep logic in the main path
+// List of bounds of ranges with special polynomial approximation near root
+// Only significands of bounds are actually stored (larger significand first in each pair)
+data8 0xB235880944CC758E, 0xADD2F1A9FBE76C8B // Bounds for root on [-3, -2]
+data8 0xFD8E7844F307B07C, 0xFCA655C2152BDE4D // Bounds for root on [-4, -3]
+data8 0x9FC4D876EE546967, 0x9FAEE4AF68BC4292 // Bounds for root on [-5, -4]
+data8 0xBFF641FFBFCC44F1, 0xBFF2A47919F4BA89 // Bounds for root on [-6, -5]
+data8 0xDFFE9C803DEFDD59, 0xDFFE18932EB723FE // Bounds for root on [-7, -6]
+data8 0xFFFFD393FA47AFC3, 0xFFFFC317CF638AE1 // Bounds for root on [-8, -7]
+data8 0x8FFFFD8840279925, 0x8FFFFC9DCECEEE92 // Bounds for root on [-9, -8]
+data8 0x9FFFFFC0D34E2AF8, 0x9FFFFFA9619AA3B7 // Bounds for root on [-10, -9]
+data8 0xAFFFFFFA41C18246, 0xAFFFFFF82025A23C // Bounds for root on [-11, -10]
+data8 0xBFFFFFFF857ACB4E, 0xBFFFFFFF58032378 // Bounds for root on [-12, -11]
+data8 0xCFFFFFFFF6934AB8, 0xCFFFFFFFF313EF0A // Bounds for root on [-13, -12]
+data8 0xDFFFFFFFFF53A9E9, 0xDFFFFFFFFF13B5A5 // Bounds for root on [-14, -13]
+data8 0xEFFFFFFFFFF482CB, 0xEFFFFFFFFFF03F4F // Bounds for root on [-15, -14]
+data8 0xFFFFFFFFFFFF482D, 0xFFFFFFFFFFFF03F5 // Bounds for root on [-16, -15]
+data8 0x87FFFFFFFFFFFA98, 0x87FFFFFFFFFFF896 // Bounds for root on [-17, -16]
+data8 0x8FFFFFFFFFFFFFB3, 0x8FFFFFFFFFFFFF97 // Bounds for root on [-18, -17]
+data8 0x97FFFFFFFFFFFFFC, 0x97FFFFFFFFFFFFFB // Bounds for root on [-19, -18]
+LOCAL_OBJECT_END(lgammal_right_roots_data)
+
+LOCAL_OBJECT_START(lgammal_0_Half_data)
+// Polynomial coefficients for the lgammal(x), 0.0 < |x| < 0.5; A3, A2, A1 are pairs of doubles, A4..A22 are significand + sign/exponent
+data8 0xBFD9A4D55BEAB2D6, 0xBC8AA3C097746D1F //A3
+data8 0x3FEA51A6625307D3, 0x3C7180E7BD2D0DCC //A2
+data8 0xBFE2788CFC6FB618, 0xBC9E9346C4692BCC //A1
+data8 0x8A8991563EC1BD13, 0x00003FFD //A4
+data8 0xD45CE0BD52C27EF2, 0x0000BFFC //A5
+data8 0xADA06587FA2BBD47, 0x00003FFC //A6
+data8 0x9381D0ED2194902A, 0x0000BFFC //A7
+data8 0x80859B3CF92D4192, 0x00003FFC //A8
+data8 0xE4033517C622A946, 0x0000BFFB //A9
+data8 0xCD00CE67A51FC82A, 0x00003FFB //A10
+data8 0xBA44E2A96C3B5700, 0x0000BFFB //A11
+data8 0xAAAD008FA46DBD99, 0x00003FFB //A12
+data8 0x9D604AC65A41153D, 0x0000BFFB //A13
+data8 0x917CECB864B5A861, 0x00003FFB //A14
+data8 0x85A4810EB730FDE4, 0x0000BFFB //A15
+data8 0xEF2761C38BD21F77, 0x00003FFA //A16
+data8 0xC913043A128367DA, 0x0000BFFA //A17
+data8 0x96A29B71FF7AFFAA, 0x00003FFA //A18
+data8 0xBB9FFA1A5FE649BB, 0x0000BFF9 //A19
+data8 0xB17982CD2DAA0EE3, 0x00003FF8 //A20
+data8 0xDE1DDCBFFB9453F0, 0x0000BFF6 //A21
+data8 0x87FBF5D7ACD9FA9D, 0x00003FF4 //A22
+LOCAL_OBJECT_END(lgammal_0_Half_data)
+
+LOCAL_OBJECT_START(Constants_Q)
+// log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1 (one 16-byte row each: two significand words, sign/exponent word, zero pad)
+data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
+data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
+data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
+data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
+data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
+data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
+LOCAL_OBJECT_END(Constants_Q)
+
+LOCAL_OBJECT_START(Constants_Z_1)
+// Z1 - 16 bit fixed point (lsb = 2^(-15), so 0x8000 is 1.0); 16 entries, one per value of index_1
+data4 0x00008000
+data4 0x00007879
+data4 0x000071C8
+data4 0x00006BCB
+data4 0x00006667
+data4 0x00006187
+data4 0x00005D18
+data4 0x0000590C
+data4 0x00005556
+data4 0x000051EC
+data4 0x00004EC5
+data4 0x00004BDB
+data4 0x00004925
+data4 0x0000469F
+data4 0x00004445
+data4 0x00004211
+LOCAL_OBJECT_END(Constants_Z_1)
+
+LOCAL_OBJECT_START(Constants_G_H_h1)
+// G1 and H1 - IEEE single and h1 - IEEE double (16 rows; each row is G1, H1, then the two 32-bit words of h1)
+data4 0x3F800000,0x00000000,0x00000000,0x00000000
+data4 0x3F70F0F0,0x3D785196,0x617D741C,0x3DA163A6
+data4 0x3F638E38,0x3DF13843,0xCBD3D5BB,0x3E2C55E6
+data4 0x3F579430,0x3E2FF9A0,0xD86EA5E7,0xBE3EB0BF
+data4 0x3F4CCCC8,0x3E647FD6,0x86B12760,0x3E2E6A8C
+data4 0x3F430C30,0x3E8B3AE7,0x5C0739BA,0x3E47574C
+data4 0x3F3A2E88,0x3EA30C68,0x13E8AF2F,0x3E20E30F
+data4 0x3F321640,0x3EB9CEC8,0xF2C630BD,0xBE42885B
+data4 0x3F2AAAA8,0x3ECF9927,0x97E577C6,0x3E497F34
+data4 0x3F23D708,0x3EE47FC5,0xA6B0A5AB,0x3E3E6A6E
+data4 0x3F1D89D8,0x3EF8947D,0xD328D9BE,0xBDF43E3C
+data4 0x3F17B420,0x3F05F3A1,0x0ADB090A,0x3E4094C3
+data4 0x3F124920,0x3F0F4303,0xFC1FE510,0xBE28FBB2
+data4 0x3F0D3DC8,0x3F183EBF,0x10FDE3FA,0x3E3A7895
+data4 0x3F088888,0x3F20EC80,0x7CC8C98F,0x3E508CE5
+data4 0x3F042108,0x3F29516A,0xA223106C,0xBE534874
+LOCAL_OBJECT_END(Constants_G_H_h1)
+
+LOCAL_OBJECT_START(Constants_Z_2)
+// Z2 - 16 bit fixed point (lsb = 2^(-15), so 0x8000 is 1.0); 16 entries, one per value of index_2
+data4 0x00008000
+data4 0x00007F81
+data4 0x00007F02
+data4 0x00007E85
+data4 0x00007E08
+data4 0x00007D8D
+data4 0x00007D12
+data4 0x00007C98
+data4 0x00007C20
+data4 0x00007BA8
+data4 0x00007B31
+data4 0x00007ABB
+data4 0x00007A45
+data4 0x000079D1
+data4 0x0000795D
+data4 0x000078EB
+LOCAL_OBJECT_END(Constants_Z_2)
+
+LOCAL_OBJECT_START(Constants_G_H_h2)
+// G2 and H2 - IEEE single and h2 - IEEE double (16 rows; each row is G2, H2, then the two 32-bit words of h2)
+data4 0x3F800000,0x00000000,0x00000000,0x00000000
+data4 0x3F7F00F8,0x3B7F875D,0x22C42273,0x3DB5A116
+data4 0x3F7E03F8,0x3BFF015B,0x21F86ED3,0x3DE620CF
+data4 0x3F7D08E0,0x3C3EE393,0x484F34ED,0xBDAFA07E
+data4 0x3F7C0FC0,0x3C7E0586,0x3860BCF6,0xBDFE07F0
+data4 0x3F7B1880,0x3C9E75D2,0xA78093D6,0x3DEA370F
+data4 0x3F7A2328,0x3CBDC97A,0x72A753D0,0x3DFF5791
+data4 0x3F792FB0,0x3CDCFE47,0xA7EF896B,0x3DFEBE6C
+data4 0x3F783E08,0x3CFC15D0,0x409ECB43,0x3E0CF156
+data4 0x3F774E38,0x3D0D874D,0xFFEF71DF,0xBE0B6F97
+data4 0x3F766038,0x3D1CF49B,0x5D59EEE8,0xBE080483
+data4 0x3F757400,0x3D2C531D,0xA9192A74,0x3E1F91E9
+data4 0x3F748988,0x3D3BA322,0xBF72A8CD,0xBE139A06
+data4 0x3F73A0D0,0x3D4AE46F,0xF8FBA6CF,0x3E1D9202
+data4 0x3F72B9D0,0x3D5A1756,0xBA796223,0xBE1DCCC4
+data4 0x3F71D488,0x3D693B9D,0xB6B7C239,0xBE049391
+LOCAL_OBJECT_END(Constants_G_H_h2)
+
+LOCAL_OBJECT_START(Constants_G_H_h3)
+// G3 and H3 - IEEE single and h3 - IEEE double (32 rows; each row is G3, H3, then the two 32-bit words of h3)
+data4 0x3F7FFC00,0x38800100,0x562224CD,0x3D355595
+data4 0x3F7FF400,0x39400480,0x06136FF6,0x3D8200A2
+data4 0x3F7FEC00,0x39A00640,0xE8DE9AF0,0x3DA4D68D
+data4 0x3F7FE400,0x39E00C41,0xB10238DC,0xBD8B4291
+data4 0x3F7FDC00,0x3A100A21,0x3B1952CA,0xBD89CCB8
+data4 0x3F7FD400,0x3A300F22,0x1DC46826,0xBDB10707
+data4 0x3F7FCC08,0x3A4FF51C,0xF43307DB,0x3DB6FCB9
+data4 0x3F7FC408,0x3A6FFC1D,0x62DC7872,0xBD9B7C47
+data4 0x3F7FBC10,0x3A87F20B,0x3F89154A,0xBDC3725E
+data4 0x3F7FB410,0x3A97F68B,0x62B9D392,0xBD93519D
+data4 0x3F7FAC18,0x3AA7EB86,0x0F21BD9D,0x3DC18441
+data4 0x3F7FA420,0x3AB7E101,0x2245E0A6,0xBDA64B95
+data4 0x3F7F9C20,0x3AC7E701,0xAABB34B8,0x3DB4B0EC
+data4 0x3F7F9428,0x3AD7DD7B,0x6DC40A7E,0x3D992337
+data4 0x3F7F8C30,0x3AE7D474,0x4F2083D3,0x3DC6E17B
+data4 0x3F7F8438,0x3AF7CBED,0x811D4394,0x3DAE314B
+data4 0x3F7F7C40,0x3B03E1F3,0xB08F2DB1,0xBDD46F21
+data4 0x3F7F7448,0x3B0BDE2F,0x6D34522B,0xBDDC30A4
+data4 0x3F7F6C50,0x3B13DAAA,0xB1F473DB,0x3DCB0070
+data4 0x3F7F6458,0x3B1BD766,0x6AD282FD,0xBDD65DDC
+data4 0x3F7F5C68,0x3B23CC5C,0xF153761A,0xBDCDAB83
+data4 0x3F7F5470,0x3B2BC997,0x341D0F8F,0xBDDADA40
+data4 0x3F7F4C78,0x3B33C711,0xEBC394E8,0x3DCD1BD7
+data4 0x3F7F4488,0x3B3BBCC6,0x52E3E695,0xBDC3532B
+data4 0x3F7F3C90,0x3B43BAC0,0xE846B3DE,0xBDA3961E
+data4 0x3F7F34A0,0x3B4BB0F4,0x785778D4,0xBDDADF06
+data4 0x3F7F2CA8,0x3B53AF6D,0xE55CE212,0x3DCC3ED1
+data4 0x3F7F24B8,0x3B5BA620,0x9E382C15,0xBDBA3103
+data4 0x3F7F1CC8,0x3B639D12,0x5C5AF197,0x3D635A0B
+data4 0x3F7F14D8,0x3B6B9444,0x71D34EFC,0xBDDCCB19
+data4 0x3F7F0CE0,0x3B7393BC,0x52CD7ADA,0x3DC74502
+data4 0x3F7F04F0,0x3B7B8B6D,0x7D7F2A42,0xBDB68F17
+LOCAL_OBJECT_END(Constants_G_H_h3)
+
+LOCAL_OBJECT_START(lgammal_data)
+// Positive overflow value (64-bit significand, then sign/exponent word); presumably the "Overflow Bound" on x from the algorithm description -- confirm
+data8 0xB8D54C8BFFFDEBF4, 0x00007FF1
+LOCAL_OBJECT_END(lgammal_data)
+
+LOCAL_OBJECT_START(lgammal_Stirling)
+// Coefficients needed for Stirling's formula
+data8 0x3FED67F1C864BEB4 // High part of 0.5*ln(2*Pi)
+data8 0x3C94D252F2400510 // Low part of 0.5*ln(2*Pi)
+//
+// Bernoulli numbers used in Stirling's formula for -2^63 < x < -13.0
+//(B1H, B1L) = 8.3333333333333333333262747254e-02
+data8 0x3FB5555555555555, 0x3C55555555555555
+data8 0xB60B60B60B60B60B, 0x0000BFF6 //B2 = -2.7777777777777777777777777778e-03
+data8 0xD00D00D00D00D00D, 0x00003FF4 //B3 = 7.9365079365079365079365079365e-04
+data8 0x9C09C09C09C09C0A, 0x0000BFF4 //B4 = -5.9523809523809523809523809524e-04
+data8 0xDCA8F158C7F91AB8, 0x00003FF4 //B5 = 8.4175084175084175084175084175e-04
+data8 0xFB5586CCC9E3E410, 0x0000BFF5 //B6 = -1.9175269175269175269175269175e-03
+data8 0xD20D20D20D20D20D, 0x00003FF7 //B7 = 6.4102564102564102564102564103e-03
+data8 0xF21436587A9CBEE1, 0x0000BFF9 //B8 = -2.9550653594771241830065359477e-02
+data8 0xB7F4B1C0F033FFD1, 0x00003FFC //B9 = 1.7964437236883057316493849002e-01
+data8 0xB23B3808C0F9CF6E, 0x0000BFFF //B10 = -1.3924322169059011164274322169e+00
+// Polynomial coefficients for Stirling's formula, -13.0 < x < -6.0
+data8 0x3FB5555555555555, 0x3C4D75060289C58B //A0
+data8 0xB60B60B60B0F0876, 0x0000BFF6 //A1
+data8 0xD00D00CE54B1256C, 0x00003FF4 //A2
+data8 0x9C09BF46B58F75E1, 0x0000BFF4 //A3
+data8 0xDCA8483BC91ACC6D, 0x00003FF4 //A4
+data8 0xFB3965C939CC9FEE, 0x0000BFF5 //A5
+data8 0xD0723ADE3F0BC401, 0x00003FF7 //A6
+data8 0xE1ED7434E81F0B73, 0x0000BFF9 //A7
+data8 0x8069C6982F993283, 0x00003FFC //A8
+data8 0xC271F65BFA5BEE3F, 0x0000BFFD //A9
+LOCAL_OBJECT_END(lgammal_Stirling)
+
+LOCAL_OBJECT_START(lgammal_lnsin_data)
+// polynomial approximation of -ln(sin(Pi*x)/(Pi*x)), 0 < x <= 0.5
+data8 0x3FFA51A6625307D3, 0x3C81873332FAF94C //A2
+data8 0x8A8991563EC241C3, 0x00003FFE //A4
+data8 0xADA06588061805DF, 0x00003FFD //A6
+data8 0x80859B57C338D0F7, 0x00003FFD //A8
+data8 0xCD00F1C2D78754BD, 0x00003FFC //A10
+data8 0xAAB56B1D3A1F4655, 0x00003FFC //A12
+data8 0x924B6F2FBBED12B1, 0x00003FFC //A14
+data8 0x80008E58765F43FC, 0x00003FFC //A16
+data8 0x3FBC718EC115E429//A18
+data8 0x3FB99CE544FE183E//A20
+data8 0x3FB7251C09EAAD89//A22
+data8 0x3FB64A970733628C//A24
+data8 0x3FAC92D6802A3498//A26
+data8 0x3FC47E1165261586//A28
+data8 0xBFCA1BAA434750D4//A30
+data8 0x3FE460001C4D5961//A32
+data8 0xBFE6F06A3E4908AD//A34
+data8 0x3FE300889EBB203A//A36
+LOCAL_OBJECT_END(lgammal_lnsin_data)
+
+LOCAL_OBJECT_START(lgammal_half_3Q_data)
+// Polynomial coefficients for the lgammal(x), 0.5 <= x < 0.75
+data8 0xBFF7A648EE90C62E, 0x3C713F326857E066 // A3, A0L
+data8 0xBFF73E4B8BA780AE, 0xBCA953BC788877EF // A1, A1L
+data8 0x403774DCD58D0291, 0xC0415254D5AE6623 // D0, D1
+data8 0x40B07213855CBFB0, 0xC0B8855E25D2D229 // C20, C21
+data8 0x3FFB359F85FF5000, 0x3C9BAECE6EF9EF3A // A2, A2L
+data8 0x3FD717D498A3A8CC, 0xBC9088E101CFEDFA // A0, A3L
+data8 0xAFEF36CC5AEC3FF0, 0x00004002 // E6
+data8 0xABE2054E1C34E791, 0x00004001 // E4
+data8 0xB39343637B2900D1, 0x00004000 // E2
+data8 0xD74FB710D53F58F6, 0x00003FFF // E0
+data8 0x4070655963BA4256, 0xC078DA9D263C4EA3 // D6, D7
+data8 0x405CD2B6A9B90978, 0xC065B3B9F4F4F171 // D4, D5
+data8 0x4049BC2204CF61FF, 0xC05337227E0BA152 // D2, D3
+data8 0x4095509A50C07A96, 0xC0A0747949D2FB45 // C18, C19
+data8 0x4082ECCBAD709414, 0xC08CD02FB088A702 // C16, C17
+data8 0xFFE4B2A61B508DD5, 0x0000C002 // E7
+data8 0xF461ADB8AE17E0A5, 0x0000C001 // E5
+data8 0xF5BE8B0B90325F20, 0x0000C000 // E3
+data8 0x877B275F3FB78DCA, 0x0000C000 // E1
+LOCAL_OBJECT_END(lgammal_half_3Q_data)
+
+LOCAL_OBJECT_START(lgammal_half_3Q_neg_data)
+// Polynomial coefficients for the lgammal(x), -0.75 < x <= -0.5
+data8 0xC014836EFD94899C, 0x3C9835679663B44F // A3, A0L
+data8 0xBFF276C7B4FB1875, 0xBC92D3D9FA29A1C0 // A1, A1L
+data8 0x40C5178F24E1A435, 0xC0D9DE84FBC5D76A // D0, D1
+data8 0x41D4D1B236BF6E93, 0xC1EBB0445CE58550 // C20, C21
+data8 0x4015718CD67F63D3, 0x3CC5354B6F04B59C // A2, A2L
+data8 0x3FF554493087E1ED, 0xBCB72715E37B02B9 // A0, A3L
+data8 0xE4AC7E915FA72229, 0x00004009 // E6
+data8 0xA28244206395FCC6, 0x00004007 // E4
+data8 0xFB045F19C07B2544, 0x00004004 // E2
+data8 0xE5C8A6E6A9BA7D7B, 0x00004002 // E0
+data8 0x4143943B55BF5118, 0xC158AC05EA675406 // D6, D7
+data8 0x4118F6833D19717C, 0xC12F51A6F375CC80 // D4, D5
+data8 0x40F00C209483481C, 0xC103F1DABF750259 // D2, D3
+data8 0x4191038F2D8F9E40, 0xC1A413066DA8AE4A // C18, C19
+data8 0x4170B537EDD833DE, 0xC1857E79424C61CE // C16, C17
+data8 0x8941D8AB4855DB73, 0x0000C00B // E7
+data8 0xBB822B131BD2E813, 0x0000C008 // E5
+data8 0x852B4C03B83D2D4F, 0x0000C006 // E3
+data8 0xC754CA7E2DDC0F1F, 0x0000C003 // E1
+LOCAL_OBJECT_END(lgammal_half_3Q_neg_data)
+
+LOCAL_OBJECT_START(lgammal_2Q_4_data)
+// Polynomial coefficients for the lgammal(x), 2.25 <= |x| < 4.0
+data8 0xBFCA4D55BEAB2D6F, 0x3C7ABC9DA14141F5 // A3, A0L
+data8 0x3FFD8773039049E7, 0x3C66CB7957A95BA4 // A1, A1L
+data8 0x3F45C3CC79E91E7D, 0xBF3A8E5005937E97 // D0, D1
+data8 0x3EC951E35E1C9203, 0xBEB030A90026C5DF // C20, C21
+data8 0x3FE94699894C1F4C, 0x3C91884D21D123F1 // A2, A2L
+data8 0x3FE62E42FEFA39EF, 0xBC66480CEB70870F // A0, A3L
+data8 0xF1C2EAFF0B3A7579, 0x00003FF5 // E6
+data8 0xB36AF863926B55A3, 0x00003FF7 // E4
+data8 0x9620656185BB44CA, 0x00003FF9 // E2
+data8 0xA264558FB0906AFF, 0x00003FFB // E0
+data8 0x3F03D59E9666C961, 0xBEF91115893D84A6 // D6, D7
+data8 0x3F19333611C46225, 0xBF0F89EB7D029870 // D4, D5
+data8 0x3F3055A96B347AFE, 0xBF243B5153E178A8 // D2, D3
+data8 0x3ED9A4AEF30C4BB2, 0xBED388138B1CEFF2 // C18, C19
+data8 0x3EEF7945A3C3A254, 0xBEE36F32A938EF11 // C16, C17
+data8 0x9028923F47C82118, 0x0000BFF5 // E7
+data8 0xCE0DAAFB6DC93B22, 0x0000BFF6 // E5
+data8 0xA0D0983B34AC4C8D, 0x0000BFF8 // E3
+data8 0x94D6C50FEB8B0CE7, 0x0000BFFA // E1
+LOCAL_OBJECT_END(lgammal_2Q_4_data)
+
+LOCAL_OBJECT_START(lgammal_4_8_data)
+// Polynomial coefficients for the lgammal(x), 4.0 <= |x| < 8.0
+data8 0xBFD6626BC9B31B54, 0x3CAA53C82493A92B // A3, A0L
+data8 0x401B4C420A50AD7C, 0x3C8C6E9929F789A3 // A1, A1L
+data8 0x3F49410427E928C2, 0xBF3E312678F8C146 // D0, D1
+data8 0x3ED51065F7CD5848, 0xBED052782A03312F // C20, C21
+data8 0x3FF735973273D5EC, 0x3C831DFC65BF8CCF // A2, A2L
+data8 0x401326643C4479C9, 0xBC6FA0498C5548A6 // A0, A3L
+data8 0x9382D8B3CD4EB7E3, 0x00003FF6 // E6
+data8 0xE9F92CAD8A85CBCD, 0x00003FF7 // E4
+data8 0xD58389FE38258CEC, 0x00003FF9 // E2
+data8 0x81310136363AE8AA, 0x00003FFC // E0
+data8 0x3F04F0AE38E78570, 0xBEF9E2144BB8F03C // D6, D7
+data8 0x3F1B5E992A6CBC2A, 0xBF10F3F400113911 // D4, D5
+data8 0x3F323EE00AAB7DEE, 0xBF2640FDFA9FB637 // D2, D3
+data8 0x3ED2143EBAFF067A, 0xBEBBDEB92D6FF35D // C18, C19
+data8 0x3EF173A42B69AAA4, 0xBEE78B9951A2EAA5 // C16, C17
+data8 0xAB3CCAC6344E52AA, 0x0000BFF5 // E7
+data8 0x81ACCB8915B16508, 0x0000BFF7 // E5
+data8 0xDA62C7221102C426, 0x0000BFF8 // E3
+data8 0xDF1BD44C4083580A, 0x0000BFFA // E1
+LOCAL_OBJECT_END(lgammal_4_8_data)
+
+LOCAL_OBJECT_START(lgammal_loc_min_data)
+// Polynomial coefficients for the lgammal(x), 1.3125 <= x < 1.5625
+data8 0xBB16C31AB5F1FB71, 0x00003FFF // xMin - point of local minimum
+data8 0xBFC2E4278DC6BC23, 0xBC683DA8DDCA9650 // A3, A0L
+data8 0x3BD4DB7D0CA61D5F, 0x386E719EDD01D801 // A1, A1L
+data8 0x3F4CC72638E1D93F, 0xBF4228EC9953CCB9 // D0, D1
+data8 0x3ED222F97A04613E,0xBED3DDD58095CB6C // C20, C21
+data8 0x3FDEF72BC8EE38AB, 0x3C863AFF3FC48940 // A2, A2L
+data8 0xBFBF19B9BCC38A41, 0xBC7425F1BFFC1442// A0, A3L
+data8 0x941890032BEB34C3, 0x00003FF6 // E6
+data8 0xC7E701591CE534BC, 0x00003FF7 // E4
+data8 0x93373CBD05138DD4, 0x00003FF9 // E2
+data8 0x845A14A6A81C05D6, 0x00003FFB // E0
+data8 0x3F0F6C4DF6D47A13, 0xBF045DCDB5B49E19 // D6, D7
+data8 0x3F22E23345DDE59C, 0xBF1851159AFB1735 // D4, D5
+data8 0x3F37101EA4022B78, 0xBF2D721E6323AF13 // D2, D3
+data8 0x3EE691EBE82DF09D, 0xBEDD42550961F730 // C18, C19
+data8 0x3EFA793EDE99AD85, 0xBEF14000108E70BE // C16, C17
+data8 0xB7CBC033ACE0C99C, 0x0000BFF5 // E7
+data8 0xF178D1F7B1A45E27, 0x0000BFF6 // E5
+data8 0xA8FCFCA8106F471C, 0x0000BFF8 // E3
+data8 0x864D46FA898A9AD2, 0x0000BFFA // E1
+LOCAL_OBJECT_END(lgammal_loc_min_data)
+
+LOCAL_OBJECT_START(lgammal_03Q_1Q_data)
+// Polynomial coefficients for the lgammal(x), 0.75 <= |x| < 1.3125
+data8 0x3FD151322AC7D848, 0x3C7184DE0DB7B4EE // A4, A2L
+data8 0x3FD9A4D55BEAB2D6, 0x3C9E934AAB10845F // A3, A1L
+data8 0x3FB111289C381259, 0x3FAFFFCFB32AE18D // D2, D3
+data8 0x3FB3B1D9E0E3E00D, 0x3FB2496F0D3768DF // D0, D1
+data8 0xBA461972C057D439, 0x00003FFB // E6
+data8 0x3FEA51A6625307D3, 0x3C76ABC886A72DA2 // A2, A4L
+data8 0x3FA8EFE46B32A70E, 0x3F8F31B3559576B6 // C17, C20
+data8 0xE403383700387D85, 0x00003FFB // E4
+data8 0x9381D0EE74BF7251, 0x00003FFC // E2
+data8 0x3FAA2177A6D28177, 0x3FA4895E65FBD995 // C18, C19
+data8 0x3FAAED2C77DBEE5D, 0x3FA94CA59385512C // D6, D7
+data8 0x3FAE1F522E8A5941, 0x3FAC785EF56DD87E // D4, D5
+data8 0x3FB556AD5FA56F0A, 0x3FA81F416E87C783 // E7, C16
+data8 0xCD00F1C2DC2C9F1E, 0x00003FFB // E5
+data8 0x3FE2788CFC6FB618, 0x3C8E52519B5B17CB // A1, A3L
+data8 0x80859B57C3E7F241, 0x00003FFC // E3
+data8 0xADA065880615F401, 0x00003FFC // E1
+data8 0xD45CE0BD530AB50E, 0x00003FFC // E0
+LOCAL_OBJECT_END(lgammal_03Q_1Q_data)
+
+LOCAL_OBJECT_START(lgammal_13Q_2Q_data)
+// Polynomial coefficients for the lgammal(x), 1.5625 <= |x| < 2.25
+data8 0x3F951322AC7D8483, 0x3C71873D88C6539D // A4, A2L
+data8 0xBFB13E001A557606, 0x3C56CB907018A101 // A3, A1L
+data8 0xBEC11B2EC1E7F6FC, 0x3EB0064ED9824CC7 // D2, D3
+data8 0xBEE3CBC963EC103A, 0x3ED2597A330C107D // D0, D1
+data8 0xBC6F2DEBDFE66F38, 0x0000BFF0 // E6
+data8 0x3FD4A34CC4A60FA6, 0x3C3AFC9BF775E8A0 // A2, A4L
+data8 0x3E48B0C542F85B32, 0xBE347F12EAF787AB // C17, C20
+data8 0xE9FEA63B6984FA1E, 0x0000BFF2 // E4
+data8 0x9C562E15FC703BBF, 0x0000BFF5 // E2
+data8 0xBE3C12A50AB0355E, 0xBE1C941626AE4717 // C18, C19
+data8 0xBE7AFA8714342BC4,0x3E69A12D2B7761CB // D6, D7
+data8 0xBE9E25EF1D526730, 0x3E8C762291889B99 // D4, D5
+data8 0x3EF580DCEE754733, 0xBE57C811D070549C // E7, C16
+data8 0xD093D878BE209C98, 0x00003FF1 // E5
+data8 0x3FDB0EE6072093CE, 0xBC6024B9E81281C4 // A1, A3L
+data8 0x859B57C31CB77D96, 0x00003FF4 // E3
+data8 0xBD6EB756DB617E8D, 0x00003FF6 // E1
+data8 0xF2027E10C7AF8C38, 0x0000BFF7 // E0
+LOCAL_OBJECT_END(lgammal_13Q_2Q_data)
+
+LOCAL_OBJECT_START(lgammal_8_10_data)
+// Polynomial coefficients for the lgammal(x), 8.0 <= |x| < 10.0
+// Multi Precision terms
+data8 0x40312008A3A23E5C, 0x3CE020B4F2E4083A //A1
+data8 0x4025358E82FCB70C, 0x3CD4A5A74AF7B99C //A0
+// Native precision terms
+data8 0xF0AA239FFBC616D2, 0x00004000 //A2
+data8 0x96A8EA798FE57D66, 0x0000BFFF //A3
+data8 0x8D501B7E3B9B9BDB, 0x00003FFE //A4
+data8 0x9EE062401F4B1DC2, 0x0000BFFD //A5
+data8 0xC63FD8CD31E93431, 0x00003FFC //A6
+data8 0x8461101709C23C30, 0x0000BFFC //A7
+data8 0xB96D7EA7EF3648B2, 0x00003FFB //A8
+data8 0x86886759D2ACC906, 0x0000BFFB //A9
+data8 0xC894B6E28265B183, 0x00003FFA //A10
+data8 0x98C4348CAD821662, 0x0000BFFA //A11
+data8 0xEC9B092226A94DF2, 0x00003FF9 //A12
+data8 0xB9F169FF9B98CDDC, 0x0000BFF9 //A13
+data8 0x9A3A32BB040894D3, 0x00003FF9 //A14
+data8 0xF9504CCC1003B3C3, 0x0000BFF8 //A15
+LOCAL_OBJECT_END(lgammal_8_10_data)
+
+LOCAL_OBJECT_START(lgammal_03Q_6_data)
+// Polynomial coefficients for the lgammal(x), 0.75 <= |x| < 1.0
+data8 0xBFBC47DCA479E295, 0xBC607E6C1A379D55 //A3
+data8 0x3FCA051C372609ED, 0x3C7B02D73EB7D831 //A0
+data8 0xBFE15FAFA86B04DB, 0xBC3F52EE4A8945B5 //A1
+data8 0x3FD455C4FF28F0BF, 0x3C75F8C6C99F30BB //A2
+data8 0xD2CF04CD934F03E1, 0x00003FFA //A4
+data8 0xDB4ED667E29256E1, 0x0000BFF9 //A5
+data8 0xF155A33A5B6021BF, 0x00003FF8 //A6
+data8 0x895E9B9D386E0338, 0x0000BFF8 //A7
+data8 0xA001BE94B937112E, 0x00003FF7 //A8
+data8 0xBD82846E490ED048, 0x0000BFF6 //A9
+data8 0xE358D24EC30DBB5D, 0x00003FF5 //A10
+data8 0x89C4F3652446B78B, 0x0000BFF5 //A11
+data8 0xA86043E10280193D, 0x00003FF4 //A12
+data8 0xCF3A2FBA61EB7682, 0x0000BFF3 //A13
+data8 0x3F300900CC9200EC //A14
+data8 0xBF23F42264B94AE8 //A15
+data8 0x3F18EEF29895FE73 //A16
+data8 0xBF0F3C4563E3EDFB //A17
+data8 0x3F0387DBBC385056 //A18
+data8 0xBEF81B4004F92900 //A19
+data8 0x3EECA6692A9A5B81 //A20
+data8 0xBEDF61A0059C15D3 //A21
+data8 0x3ECDA9F40DCA0111 //A22
+data8 0xBEB60FE788217BAF //A23
+data8 0x3E9661D795DFC8C6 //A24
+data8 0xBE66C7756A4EDEE5 //A25
+// Polynomial coefficients for the lgammal(x), 1.0 <= |x| < 2.0
+data8 0xBFC1AE55B180726B, 0xBC7DE1BC478453F5 //A3
+data8 0xBFBEEB95B094C191, 0xBC53456FF6F1C9D9 //A0
+data8 0x3FA2AED059BD608A, 0x3C0B65CC647D557F //A1
+data8 0x3FDDE9E64DF22EF2, 0x3C8993939A8BA8E4 //A2
+data8 0xF07C206D6B100CFF, 0x00003FFA //A4
+data8 0xED2CEA9BA52FE7FB, 0x0000BFF9 //A5
+data8 0xFCE51CED52DF3602, 0x00003FF8 //A6
+data8 0x8D45D27872326619, 0x0000BFF8 //A7
+data8 0xA2B78D6BCEBE27F7, 0x00003FF7 //A8
+data8 0xBF6DC0996A895B6F, 0x0000BFF6 //A9
+data8 0xE4B9AD335AF82D79, 0x00003FF5 //A10
+data8 0x8A451880195362A1, 0x0000BFF5 //A11
+data8 0xA8BE35E63089A7A9, 0x00003FF4 //A12
+data8 0xCF7FA175FA11C40C, 0x0000BFF3 //A13
+data8 0x3F300C282FAA3B02 //A14
+data8 0xBF23F6AEBDA68B80 //A15
+data8 0x3F18F6860E2224DD //A16
+data8 0xBF0F542B3CE32F28 //A17
+data8 0x3F039436218C9BF8 //A18
+data8 0xBEF8AE6307677AEC //A19
+data8 0x3EF0B55527B3A211 //A20
+data8 0xBEE576AC995E7605 //A21
+data8 0x3ED102DDC1365D2D //A22
+data8 0xBEC442184F97EA54 //A23
+data8 0x3ED4D2283DFE5FC6 //A24
+data8 0xBECB9219A9B46787 //A25
+// Polynomial coefficients for the lgammal(x), 2.0 <= |x| < 3.0
+data8 0xBFCA4D55BEAB2D6F, 0xBC66F80E5BFD5AF5 //A3
+data8 0x3FE62E42FEFA39EF, 0x3C7ABC9E3B347E3D //A0
+data8 0x3FFD8773039049E7, 0x3C66CB9007C426EA //A1
+data8 0x3FE94699894C1F4C, 0x3C918726EB111663 //A2
+data8 0xA264558FB0906209, 0x00003FFB //A4
+data8 0x94D6C50FEB902ADC, 0x0000BFFA //A5
+data8 0x9620656184243D17, 0x00003FF9 //A6
+data8 0xA0D0983B8BCA910B, 0x0000BFF8 //A7
+data8 0xB36AF8559B222BD3, 0x00003FF7 //A8
+data8 0xCE0DACB3260AE6E5, 0x0000BFF6 //A9
+data8 0xF1C2C0BF0437C7DB, 0x00003FF5 //A10
+data8 0x902A2F2F3AB74A92, 0x0000BFF5 //A11
+data8 0xAE05009B1B2C6E4C, 0x00003FF4 //A12
+data8 0xD5B71F6456D7D4CB, 0x0000BFF3 //A13
+data8 0x3F2F0351D71BC9C6 //A14
+data8 0xBF2B53BC56A3B793 //A15
+data8 0xBF18B12DC6F6B861 //A16
+data8 0xBF43EE6EB5215C2F //A17
+data8 0xBF5474787CDD455E //A18
+data8 0xBF642B503C9C060A //A19
+data8 0xBF6E07D1AA254AA3 //A20
+data8 0xBF71C785443AAEE8 //A21
+data8 0xBF6F67BF81B71052 //A22
+data8 0xBF63E4BCCF4FFABF //A23
+data8 0xBF50067F8C671D5A //A24
+data8 0xBF29C770D680A5AC //A25
+// Polynomial coefficients for the lgammal(x), 4.0 <= |x| < 6.0
+data8 0xBFD6626BC9B31B54, 0xBC85AABE08680902 //A3
+data8 0x401326643C4479C9, 0x3CAA53C26F31E364 //A0
+data8 0x401B4C420A50AD7C, 0x3C8C76D55E57DD8D //A1
+data8 0x3FF735973273D5EC, 0x3C83A0B78E09188A //A2
+data8 0x81310136363AAB6D, 0x00003FFC //A4
+data8 0xDF1BD44C4075C0E6, 0x0000BFFA //A5
+data8 0xD58389FE38D8D664, 0x00003FF9 //A6
+data8 0xDA62C7221D5B5F87, 0x0000BFF8 //A7
+data8 0xE9F92CAD0263E157, 0x00003FF7 //A8
+data8 0x81ACCB8606C165FE, 0x0000BFF7 //A9
+data8 0x9382D8D263D1C2A3, 0x00003FF6 //A10
+data8 0xAB3CCBA4C853B12C, 0x0000BFF5 //A11
+data8 0xCA0818BBCCC59296, 0x00003FF4 //A12
+data8 0xF18912691CBB5BD0, 0x0000BFF3 //A13
+data8 0x3F323EF5D8330339 //A14
+data8 0xBF2641132EA571F7 //A15
+data8 0x3F1B5D9576175CA9 //A16
+data8 0xBF10F56A689C623D //A17
+data8 0x3F04CACA9141A18D //A18
+data8 0xBEFA307AC9B4E85D //A19
+data8 0x3EF4B625939FBE32 //A20
+data8 0xBECEE6AC1420F86F //A21
+data8 0xBE9A95AE2E485964 //A22
+data8 0xBF039EF47F8C09BB //A23
+data8 0xBF05345957F7B7A9 //A24
+data8 0xBEF85AE6385D4CCC //A25
+// Polynomial coefficients for the lgammal(x), 3.0 <= |x| < 4.0
+data8 0xBFCA4D55BEAB2D6F, 0xBC667B20FF46C6A8 //A3
+data8 0x3FE62E42FEFA39EF, 0x3C7ABC9E3B398012 //A0
+data8 0x3FFD8773039049E7, 0x3C66CB9070238D77 //A1
+data8 0x3FE94699894C1F4C, 0x3C91873D8839B1CD //A2
+data8 0xA264558FB0906D7E, 0x00003FFB //A4
+data8 0x94D6C50FEB8AFD72, 0x0000BFFA //A5
+data8 0x9620656185B68F14, 0x00003FF9 //A6
+data8 0xA0D0983B34B7088A, 0x0000BFF8 //A7
+data8 0xB36AF863964AA440, 0x00003FF7 //A8
+data8 0xCE0DAAFB5497AFB8, 0x0000BFF6 //A9
+data8 0xF1C2EAFA79CC2864, 0x00003FF5 //A10
+data8 0x9028922A839572B8, 0x0000BFF5 //A11
+data8 0xAE1E62F870BA0278, 0x00003FF4 //A12
+data8 0xD4726F681E2ABA29, 0x0000BFF3 //A13
+data8 0x3F30559B9A02FADF //A14
+data8 0xBF243ADEB1266CAE //A15
+data8 0x3F19303B6F552603 //A16
+data8 0xBF0F768C288EC643 //A17
+data8 0x3F039D5356C21DE1 //A18
+data8 0xBEF81BCA8168E6BE //A19
+data8 0x3EEC74A53A06AD54 //A20
+data8 0xBEDED52D1A5DACDF //A21
+data8 0x3ECCB4C2C7087342 //A22
+data8 0xBEB4F1FAFDFF5C2F //A23
+data8 0x3E94C80B52D58904 //A24
+data8 0xBE64A328CBE92A27 //A25
+LOCAL_OBJECT_END(lgammal_03Q_6_data)
+
+LOCAL_OBJECT_START(lgammal_1pEps_data)
+// Polynomial coefficients for the lgammal(x), 1 - 2^(-7) <= |x| < 1 + 2^(-7)
+data8 0x93C467E37DB0C7A5, 0x00003FFE //A1
+data8 0xD28D3312983E9919, 0x00003FFE //A2
+data8 0xCD26AADF559A47E3, 0x00003FFD //A3
+data8 0x8A8991563EC22E81, 0x00003FFD //A4
+data8 0x3FCA8B9C168D52FE //A5
+data8 0x3FC5B40CB0696370 //A6
+data8 0x3FC270AC2229A65D //A7
+data8 0x3FC0110AF10FCBFC //A8
+// Polynomial coefficients for the log1p(x), - 2^(-7) <= |x| < 2^(-7)
+data8 0x3FBC71C71C71C71C //P8
+data8 0xBFC0000000000000 //P7
+data8 0x3FC2492492492492 //P6
+data8 0xBFC5555555555555 //P5
+data8 0x3FC999999999999A //P4
+data8 0xBFD0000000000000 //P3
+data8 0x3FD5555555555555 //P2
+data8 0xBFE0000000000000 //P1
+// short version of "lnsin" polynomial
+data8 0xD28D3312983E9918, 0x00003FFF //A2
+data8 0x8A8991563EC241B6, 0x00003FFE //A4
+data8 0xADA06588061830A5, 0x00003FFD //A6
+data8 0x80859B57C31CB746, 0x00003FFD //A8
+LOCAL_OBJECT_END(lgammal_1pEps_data)
+
+LOCAL_OBJECT_START(lgammal_neg2andHalf_data)
+// Polynomial coefficients for the lgammal(x), -2.005859375 <= x < -2.5
+data8 0xBF927781D4BB093A, 0xBC511D86D85B7045 // A3, A0L
+data8 0x3FF1A68793DEFC15, 0x3C9852AE2DA7DEEF // A1, A1L
+data8 0x408555562D45FAFD, 0xBF972CDAFE5FEFAD // D0, D1
+data8 0xC18682331EF492A5, 0xC1845E3E0D29606B // C20, C21
+data8 0x4013141822E16979, 0x3CCF8718B6E75F6C // A2, A2L
+data8 0xBFACCBF9F5ED0F15, 0xBBDD1AEB73297401 // A0, A3L
+data8 0xCCCDB17423046445, 0x00004006 // E6
+data8 0x800514E230A3A452, 0x00004005 // E4
+data8 0xAAE9A48EC162E76F, 0x00004003 // E2
+data8 0x81D4F88B3F3EA0FC, 0x00004002 // E0
+data8 0x40CF3F3E35238DA0, 0xC0F8B340945F1A7E // D6, D7
+data8 0x40BF89EC0BD609C6, 0xC095897242AEFEE2 // D4, D5
+data8 0x40A2482FF01DBC5C, 0xC02095E275FDCF62 // D2, D3
+data8 0xC1641354F2312A6A, 0xC17B3657F85258E9 // C18, C19
+data8 0xC11F964E9ECBE2C9, 0xC146D7A90F70696C // C16, C17
+data8 0xE7AECDE6AF8EA816, 0x0000BFEF // E7
+data8 0xD711252FEBBE1091, 0x0000BFEB // E5
+data8 0xE648BD10F8C43391, 0x0000BFEF // E3
+data8 0x948A1E78AA00A98D, 0x0000BFF4 // E1
+LOCAL_OBJECT_END(lgammal_neg2andHalf_data)
+
+LOCAL_OBJECT_START(lgammal_near_neg_half_data)
+// Polynomial coefficients for the lgammal(x), -0.5 < x < -0.40625
+data8 0xBFC1AE55B180726C, 0x3C8053CD734E6A1D // A3, A0L
+data8 0x3FA2AED059BD608A, 0x3C0CD3D2CDBA17F4 // A1, A1L
+data8 0x40855554DBCD1E1E, 0x3F96C51AC2BEE9E1 // D0, D1
+data8 0xC18682331EF4927D, 0x41845E3E0D295DFC // C20, C21
+data8 0x4011DE9E64DF22EF, 0x3CA692B70DAD6B7B // A2, A2L
+data8 0x3FF43F89A3F0EDD6, 0xBC4955AED0FA087D // A0, A3L
+data8 0xCCCD3F1DF4A2C1DD, 0x00004006 // E6
+data8 0x80028ADE33C7FCD9, 0x00004005 // E4
+data8 0xAACA474E485507EF, 0x00004003 // E2
+data8 0x80F07C206D6B0ECD, 0x00004002 // E0
+data8 0x40CF3F3E33E83056, 0x40F8B340944633D9 // D6, D7
+data8 0x40BF89EC059931F0, 0x409589723307AD20 // D4, D5
+data8 0x40A2482FD0054824, 0x402095CE7F19D011 // D2, D3
+data8 0xC1641354F2313614, 0x417B3657F8525354 // C18, C19
+data8 0xC11F964E9ECFD21C, 0x4146D7A90F701836 // C16, C17
+data8 0x86A9C01F0EA11E5A, 0x0000BFF5 // E7
+data8 0xBF6D8469142881C0, 0x0000BFF6 // E5
+data8 0x8D45D277BA8255F1, 0x0000BFF8 // E3
+data8 0xED2CEA9BA528BCC3, 0x0000BFF9 // E1
+LOCAL_OBJECT_END(lgammal_near_neg_half_data)
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+////////////// POLYNOMIAL COEFFICIENTS FOR "NEAR ROOTS" RANGES /////////////
+////////////// THIS PART OF THE TABLE SHOULD BE ADDRESSED RARELY //////////////
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+LOCAL_OBJECT_START(lgammal_right_roots_polynomial_data)
+// Polynomial coefficients for right root on [-3, -2]
+// Lgammal is approximated by a polynomial within [-.056244 ; .158208 ] range
+data8 0xBBBD5E9DCD11030B, 0xB867411D9FF87DD4 //A0
+data8 0x3FF83FE966AF535E, 0x3CAA21235B8A769A //A1
+data8 0x40136EEBB002F55C, 0x3CC3959A6029838E //A2
+data8 0xB4A5302C53C2BEDD, 0x00003FFF //A3
+data8 0x8B8C6BE504F2DA1C, 0x00004002 //A4
+data8 0xB99CFF02593B4D98, 0x00004001 //A5
+data8 0x4038D32F682AA1CF //A6
+data8 0x403809F04EE6C5B5 //A7
+data8 0x40548EAA81634CEE //A8
+data8 0x4059297ADB6BC03D //A9
+data8 0x407286FB8EC5C9DA //A10
+data8 0x407A92E05B744CFB //A11
+data8 0x4091A9D4144258CD //A12
+data8 0x409C4D01D24F367E //A13
+data8 0x40B1871B9A426A83 //A14
+data8 0x40BE51C48BD9A583 //A15
+data8 0x40D2140D0C6153E7 //A16
+data8 0x40E0FB2C989CE4A3 //A17
+data8 0x40E52739AB005641 //A18
+data8 0x41161E3E6DDF503A //A19
+// Polynomial coefficients for right root on [-4, -3]
+// Lgammal is approximated by a polynomial within [-.172797 ; .171573 ] range
+data8 0x3C172712B248E42E, 0x38CB8D17801A5D67 //A0
+data8 0x401F20A65F2FAC54, 0x3CCB9EA1817A824E //A1
+data8 0x4039D4D2977150EF, 0x3CDA42E149B6276A //A2
+data8 0xE089B8926AE2D9CB, 0x00004005 //A3
+data8 0x933901EBBB586C37, 0x00004008 //A4
+data8 0xCCD319BED1CFA1CD, 0x0000400A //A5
+data8 0x40D293C3F78D3C37 //A6
+data8 0x40FBB97AA0B6DD02 //A7
+data8 0x41251EA3345E5EB9 //A8
+data8 0x415057F65C92E7B0 //A9
+data8 0x41799C865241B505 //A10
+data8 0x41A445209EFE896B //A11
+data8 0x41D02D21880C953B //A12
+data8 0x41F9FFDE8C63E16D //A13
+data8 0x422504DC8302D2BE //A14
+data8 0x425111BF18C95414 //A15
+data8 0x427BCBE74A2B8EF7 //A16
+data8 0x42A7256F59B286F7 //A17
+data8 0x42D462D1586DE61F //A18
+data8 0x42FBB1228D6C5118 //A19
+// Polynomial coefficients for right root on [-5, -4]
+// Lgammal is approximated by a polynomial within [-.163171 ; .161988 ] range
+data8 0x3C5840FBAFDEE5BB, 0x38CAC0336E8C490A //A0
+data8 0x403ACA5CF4921642, 0x3CCEDCDDA5491E56 //A1
+data8 0x40744415CD813F8E, 0x3CFBFEBC17E39146 //A2
+data8 0xAACD88D954E3E1BD, 0x0000400B //A3
+data8 0xCB68C710D75ED802, 0x0000400F //A4
+data8 0x8130F5AB997277AC, 0x00004014 //A5
+data8 0x41855E3DBF99EBA7 //A6
+data8 0x41CD14FE49C49FC2 //A7
+data8 0x421433DCE281F07D //A8
+data8 0x425C8399C7A92B6F //A9
+data8 0x42A45FBE67840F1A //A10
+data8 0x42ED68D75F9E6C98 //A11
+data8 0x433567291C27E5BE //A12
+data8 0x437F5ED7A9D9FD28 //A13
+data8 0x43C720A65C8AB711 //A14
+data8 0x441120A6C1D40B9B //A15
+data8 0x44596F561F2D1CBE //A16
+data8 0x44A3507DA81D5C01 //A17
+data8 0x44EF06A31E39EEDF //A18
+data8 0x45333774C99F523F //A19
+// Polynomial coefficients for right root on [-6, -5]
+// Lgammal is approximated by a polynomial within [-.156450 ; .156126 ] range
+data8 0x3C71B82D6B2B3304, 0x3917186E3C0DC231 //A0
+data8 0x405ED72E0829AE02, 0x3C960C25157980EB //A1
+data8 0x40BCECC32EC22F9B, 0x3D5D8335A32F019C //A2
+data8 0x929EC2B1FB931F17, 0x00004012 //A3
+data8 0xD112EF96D37316DE, 0x00004018 //A4
+data8 0x9F00BB9BB13416AB, 0x0000401F //A5
+data8 0x425F7D8D5BDCB223 //A6
+data8 0x42C9A8D00C776CC6 //A7
+data8 0x433557FD8C481424 //A8
+data8 0x43A209221A953EF0 //A9
+data8 0x440EDC98D5618AB7 //A10
+data8 0x447AABD25E367378 //A11
+data8 0x44E73DE20CC3B288 //A12
+data8 0x455465257B4E0BD8 //A13
+data8 0x45C2011532085353 //A14
+data8 0x462FEE4CC191945B //A15
+data8 0x469C63AEEFEF0A7F //A16
+data8 0x4709D045390A3810 //A17
+data8 0x4778D360873C9F64 //A18
+data8 0x47E26965BE9A682A //A19
+// Polynomial coefficients for right root on [-7, -6]
+// Lgammal is approximated by a polynomial within [-.154582 ; .154521 ] range
+data8 0x3C75F103A1B00A48, 0x391C041C190C726D //A0
+data8 0x40869DE49E3AF2AA, 0x3D1C17E1F813063B //A1
+data8 0x410FCE23484CFD10, 0x3DB6F38C2F11DAB9 //A2
+data8 0xEF281D1E1BE2055A, 0x00004019 //A3
+data8 0xFCE3DA92AC55DFF8, 0x00004022 //A4
+data8 0x8E9EA838A20BD58E, 0x0000402C //A5
+data8 0x4354F21E2FB9E0C9 //A6
+data8 0x43E9500994CD4F09 //A7
+data8 0x447F3A2C23C033DF //A8
+data8 0x45139152656606D8 //A9
+data8 0x45A8D45F8D3BF2E8 //A10
+data8 0x463FD32110E5BFE5 //A11
+data8 0x46D490B3BDBAE0BE //A12
+data8 0x476AC3CAD905DD23 //A13
+data8 0x48018558217AD473 //A14
+data8 0x48970AF371D30585 //A15
+data8 0x492E6273A8BEFFE3 //A16
+data8 0x49C47CC9AE3F1073 //A17
+data8 0x4A5D38E8C35EFF45 //A18
+data8 0x4AF0123E89694CD8 //A19
+// Polynomial coefficients for right root on [-8, -7]
+// Lgammal is approximated by a polynomial within [-.154217 ; .154208 ] range
+data8 0xBCD2507D818DDD68, 0xB97F6940EA2871A0 //A0
+data8 0x40B3B407AA387BCB, 0x3D6320238F2C43D1 //A1
+data8 0x41683E85DAAFBAC7, 0x3E148D085958EA3A //A2
+data8 0x9F2A95AF1E10A548, 0x00004022 //A3
+data8 0x92F21522F482300E, 0x0000402E //A4
+data8 0x90B51AB03A1F244D, 0x0000403A //A5
+data8 0x44628E1C70EF534F //A6
+data8 0x452393E2BC32D244 //A7
+data8 0x45E5164141F4BA0B //A8
+data8 0x46A712B3A8AF5808 //A9
+data8 0x47698FD36CEDD0F2 //A10
+data8 0x482C9AE6BBAA3637 //A11
+data8 0x48F023821857C8E9 //A12
+data8 0x49B2569053FC106F //A13
+data8 0x4A74F646D5C1604B //A14
+data8 0x4B3811CF5ABA4934 //A15
+data8 0x4BFBB5DD6C84E233 //A16
+data8 0x4CC05021086F637B //A17
+data8 0x4D8450A345B0FB49 //A18
+data8 0x4E43825848865DB2 //A19
+// Polynomial coefficients for right root on [-9, -8]
+// Lgammal is approximated by a polynomial within [-.154160 ; .154158 ] range
+data8 0x3CDF4358564F2B46, 0x397969BEE6042F81 //A0
+data8 0x40E3B088FED67721, 0x3D82787BA937EE85 //A1
+data8 0x41C83A3893550EF4, 0x3E542ED57E244DA8 //A2
+data8 0x9F003C6DC56E0B8E, 0x0000402B //A3
+data8 0x92BDF64A3213A699, 0x0000403A //A4
+data8 0x9074F503AAD417AF, 0x00004049 //A5
+data8 0x4582843E1313C8CD //A6
+data8 0x467387BD6A7826C1 //A7
+data8 0x4765074E788CF440 //A8
+data8 0x4857004DD9D1E09D //A9
+data8 0x4949792ED7530EAF //A10
+data8 0x4A3C7F089A292ED3 //A11
+data8 0x4B30125BF0AABB86 //A12
+data8 0x4C224175195E307E //A13
+data8 0x4D14DC4C8B32C08D //A14
+data8 0x4E07F1DB2786197E //A15
+data8 0x4EFB8EA1C336DACB //A16
+data8 0x4FF03797EACD0F23 //A17
+data8 0x50E4304A8E68A730 //A18
+data8 0x51D3618FB2EC9F93 //A19
+// Polynomial coefficients for right root on [-10, -9]
+// Lgammal is approximated by a polynomial within [-.154152 ; .154152 ] range
+data8 0x3D42F34DA97ECF0C, 0x39FD1256F345B0D0 //A0
+data8 0x4116261203919787, 0x3DC12D44055588EB //A1
+data8 0x422EA8F32FB7FE99, 0x3ED849CE4E7B2D77 //A2
+data8 0xE25BAF73477A57B5, 0x00004034 //A3
+data8 0xEB021FD10060504A, 0x00004046 //A4
+data8 0x8220A208EE206C5F, 0x00004059 //A5
+data8 0x46B2C3903EC9DA14 //A6
+data8 0x47D64393744B9C67 //A7
+data8 0x48FAF79CCDC604DD //A8
+data8 0x4A20975DB8061EBA //A9
+data8 0x4B44AB9CBB38DB21 //A10
+data8 0x4C6A032F60094FE9 //A11
+data8 0x4D908103927634B4 //A12
+data8 0x4EB516CA21D30861 //A13
+data8 0x4FDB1BF12C58D318 //A14
+data8 0x510180AAE094A553 //A15
+data8 0x5226A8F2A2D45D57 //A16
+data8 0x534E00B6B0C8B809 //A17
+data8 0x5475022FE21215B2 //A18
+data8 0x5596B02BF6C5E19B //A19
+// Polynomial coefficients for right root on [-11, -10]
+// Lgammal is approximated by a polynomial within [-.154151 ; .154151 ] range
+data8 0x3D7AA9C2E2B1029C, 0x3A15FB37578544DB //A0
+data8 0x414BAF825A0C91D4, 0x3DFB9DA2CE398747 //A1
+data8 0x4297F3EC8AE0AF03, 0x3F34208B55FB8781 //A2
+data8 0xDD0C97D3197F56DE, 0x0000403E //A3
+data8 0x8F6F3AF7A5499674, 0x00004054 //A4
+data8 0xC68DA1AF6D878EEB, 0x00004069 //A5
+data8 0x47F1E4E1E2197CE0 //A6
+data8 0x494A8A28E597C3EB //A7
+data8 0x4AA4175D0D35D705 //A8
+data8 0x4BFEE6F0AF69E814 //A9
+data8 0x4D580FE7B3DBB3C6 //A10
+data8 0x4EB2ECE60E4608AF //A11
+data8 0x500E04BE3E2B4F24 //A12
+data8 0x5167F9450F0FB8FD //A13
+data8 0x52C342BDE747603F //A14
+data8 0x541F1699D557268C //A15
+data8 0x557927C5F079864E //A16
+data8 0x56D4D10FEEDB030C //A17
+data8 0x5832385DF86AD28A //A18
+data8 0x598898914B4D6523 //A19
+// Polynomial coefficients for right root on [-12, -11]
+// Lgammal is approximated by a polynomial within [-.154151 ; .154151 ] range
+data8 0xBD96F61647C58B03, 0xBA3ABB0C2A6C755B //A0
+data8 0x418308A82714B70D, 0x3E1088FC6A104C39 //A1
+data8 0x4306A493DD613C39, 0x3FB2341ECBF85741 //A2
+data8 0x8FA8FE98339474AB, 0x00004049 //A3
+data8 0x802CCDF570BA7942, 0x00004062 //A4
+data8 0xF3F748AF11A32890, 0x0000407A //A5
+data8 0x493E3B567EF178CF //A6
+data8 0x4ACED38F651BA362 //A7
+data8 0x4C600B357337F946 //A8
+data8 0x4DF0F71A52B54CCF //A9
+data8 0x4F8229F3B9FA2C70 //A10
+data8 0x5113A4C4979B770E //A11
+data8 0x52A56BC367F298D5 //A12
+data8 0x543785CF31842DC0 //A13
+data8 0x55C9FC37E3E40896 //A14
+data8 0x575CD5D1BA556C82 //A15
+data8 0x58F00A7AD99A9E08 //A16
+data8 0x5A824088688B008D //A17
+data8 0x5C15F75EF7E08EBD //A18
+data8 0x5DA462EA902F0C90 //A19
+// Polynomial coefficients for right root on [-13, -12]
+// Lgammal is approximated by a polynomial within [-.154151 ; .154151 ] range
+data8 0x3DC3191752ACFC9D, 0x3A26CB6629532DBF //A0
+data8 0x41BC8CFC051191BD, 0x3E68A84DA4E62AF2 //A1
+data8 0x43797926294A0148, 0x400F345FF3723CFF //A2
+data8 0xF26D2AF700B82625, 0x00004053 //A3
+data8 0xA238B24A4B1F7B15, 0x00004070 //A4
+data8 0xE793B5C0A41A264F, 0x0000408C //A5
+data8 0x4A9585BDDACE863D //A6
+data8 0x4C6075953448088A //A7
+data8 0x4E29B2F38D1FC670 //A8
+data8 0x4FF4619B079C440F //A9
+data8 0x51C05DAE118D8AD9 //A10
+data8 0x538A8C7F87326AD4 //A11
+data8 0x5555B6937588DAB3 //A12
+data8 0x5721E1F8B6E6A7DB //A13
+data8 0x58EDA1D7A77DD6E5 //A14
+data8 0x5AB8A9616B7DC9ED //A15
+data8 0x5C84942AA209ED17 //A16
+data8 0x5E518FC34C6F54EF //A17
+data8 0x601FB3F17BCCD9A0 //A18
+data8 0x61E61128D512FE97 //A19
+// Polynomial coefficients for right root on [-14, -13]
+// Lgammal is approximated by a polynomial within [-.154151 ; .154151 ] range
+data8 0xBE170D646421B3F5, 0xBAAD95F79FCB5097 //A0
+data8 0x41F7328CBFCD9AC7, 0x3E743B8B1E8AEDB1 //A1
+data8 0x43F0D0FA2DBDA237, 0x40A0422D6A227B55 //A2
+data8 0x82082DF2D32686CC, 0x0000405F //A3
+data8 0x8D64EE9B42E68B43, 0x0000407F //A4
+data8 0xA3FFD82E08C5F1F1, 0x0000409F //A5
+data8 0x4BF8C49D99123454 //A6
+data8 0x4DFEC79DDF11342F //A7
+data8 0x50038615A892F6BD //A8
+data8 0x520929453DB32EF1 //A9
+data8 0x54106A7808189A7F //A10
+data8 0x5615A302D03C207B //A11
+data8 0x581CC175AA736F5E //A12
+data8 0x5A233E071147C017 //A13
+data8 0x5C29E81917243F22 //A14
+data8 0x5E3184B0B5AC4707 //A15
+data8 0x6037C11DE62D8388 //A16
+data8 0x6240787C4B1C9D6C //A17
+data8 0x6448289235E80977 //A18
+data8 0x664B5352C6C3449E //A19
+// Polynomial coefficients for right root on [-15, -14]
+// Lgammal is approximated by a polynomial within [-.154151 ; .154151 ] range
+data8 0x3E562C2E34A9207D, 0x3ADC00DA3DFF7A83 //A0
+data8 0x42344C3B2F0D90AB, 0x3EB8A2E979F24536 //A1
+data8 0x4469BFFF28B50D07, 0x41181E3D05C1C294 //A2
+data8 0xAE38F64DCB24D9F8, 0x0000406A //A3
+data8 0xA5C3F52C1B350702, 0x0000408E //A4
+data8 0xA83BC857BCD67A1B, 0x000040B2 //A5
+data8 0x4D663B4727B4D80A //A6
+data8 0x4FA82C965B0F7788 //A7
+data8 0x51EAD58C02908D95 //A8
+data8 0x542E427970E073D8 //A9
+data8 0x56714644C558A818 //A10
+data8 0x58B3EC2040C77BAE //A11
+data8 0x5AF72AE6A83D45B1 //A12
+data8 0x5D3B214F611F5D12 //A13
+data8 0x5F7FF5E49C54E92A //A14
+data8 0x61C2E917AB765FB2 //A15
+data8 0x64066FD70907B4C1 //A16
+data8 0x664B3998D60D0F9B //A17
+data8 0x689178710782FA8B //A18
+data8 0x6AD14A66C1C7BEC3 //A19
+// Polynomial coefficients for right root on [-16, -15]
+// Lgammal is approximated by a polynomial within [-.154151 ; .154151 ] range
+data8 0xBE6D7E7192615BAE, 0xBB0137677D7CC719 //A0
+data8 0x4273077763F6628C, 0x3F09250FB8FC8EC9 //A1
+data8 0x44E6A1BF095B1AB3, 0x4178D5A74F6CB3B3 //A2
+data8 0x8F8E0D5060FCC76E, 0x00004076 //A3
+data8 0x800CC1DCFF092A63, 0x0000409E //A4
+data8 0xF3AB0BA9D14CDA72, 0x000040C5 //A5
+data8 0x4EDE3000A2F6D54F //A6
+data8 0x515EC613B9C8E241 //A7
+data8 0x53E003309FEEEA96 //A8
+data8 0x5660ED908D7C9A90 //A9
+data8 0x58E21E9B517B1A50 //A10
+data8 0x5B639745E4374EE2 //A11
+data8 0x5DE55BB626B2075D //A12
+data8 0x606772B7506BA747 //A13
+data8 0x62E9E581AB2E057B //A14
+data8 0x656CBAD1CF85D396 //A15
+data8 0x67EFF4EBD7989872 //A16
+data8 0x6A722D2B19B7E2F9 //A17
+data8 0x6CF5DEB3073B0743 //A18
+data8 0x6F744AC11550B93A //A19
+// Polynomial coefficients for right root on [-17, -16]
+// Lgammal is approximated by a polynomial within [-.154151 ; .154151 ] range
+data8 0xBEDCC6291188207E, 0xBB872E3FDD48F5B7 //A0
+data8 0x42B3076EE7525EF9, 0x3F6687A5038CA81C //A1
+data8 0x4566A1AAD96EBCB5, 0x421F0FEDFBF548D2 //A2
+data8 0x8F8D4D3DE9850DBA, 0x00004082 //A3
+data8 0x800BDD6DA2CE1859, 0x000040AE //A4
+data8 0xF3A8EC4C9CDC1CE5, 0x000040D9 //A5
+data8 0x505E2FAFDB812628 //A6
+data8 0x531EC5B3A7508719 //A7
+data8 0x55E002F77E99B628 //A8
+data8 0x58A0ED4C9B4DAE54 //A9
+data8 0x5B621E4A8240F90C //A10
+data8 0x5E2396E5C8849814 //A11
+data8 0x60E55B43D8C5CE71 //A12
+data8 0x63A7722F5D45D01D //A13
+data8 0x6669E4E010DCE45A //A14
+data8 0x692CBA120D5E78F6 //A15
+data8 0x6BEFF4045350B22E //A16
+data8 0x6EB22C9807C21819 //A17
+data8 0x7175DE20D04617C4 //A18
+data8 0x74344AB87C6D655F //A19
+// Polynomial coefficients for right root on [-18, -17]
+// Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
+data8 0xBF28AEEE7B61D77C, 0xBBDBBB5FC57ABF79 //A0
+data8 0x42F436F56B3B8A0C, 0x3FA43EE3C5C576E9 //A1
+data8 0x45E98A22535D115D, 0x42984678BE78CC48 //A2
+data8 0xAC176F3775E6FCFC, 0x0000408E //A3
+data8 0xA3114F53A9FEB922, 0x000040BE //A4
+data8 0xA4D168A8334ABF41, 0x000040EE //A5
+data8 0x51E5B0E7EC7182BB //A6
+data8 0x54E77D67B876EAB6 //A7
+data8 0x57E9F7C30C09C4B6 //A8
+data8 0x5AED29B0488614CA //A9
+data8 0x5DF09486F87E79F9 //A10
+data8 0x60F30B199979654E //A11
+data8 0x63F60E02C7DCCC5F //A12
+data8 0x66F9B8A00EB01684 //A13
+data8 0x69FE2D3ED0700044 //A14
+data8 0x6D01C8363C7DCC84 //A15
+data8 0x700502B29C2F06E3 //A16
+data8 0x730962B4500F4A61 //A17
+data8 0x76103C6ED099192A //A18
+data8 0x79100C7132CFD6E3 //A19
+// Polynomial coefficients for right root on [-19, -18]
+// Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
+data8 0x3F3C19A53328A0C3, 0x3BE04ADC3FBE1458 //A0
+data8 0x4336C16C16C16C19, 0x3FE58CE3AC4A7C28 //A1
+data8 0x46702E85C0898B70, 0x432C922E412CEC6E //A2
+data8 0xF57B99A1C034335D, 0x0000409A //A3
+data8 0x82EC9634223DF909, 0x000040CF //A4
+data8 0x94F66D7557E2EA60, 0x00004103 //A5
+data8 0x5376118B79AE34D0 //A6
+data8 0x56BAE7106D52E548 //A7
+data8 0x5A00BD48CC8E25AB //A8
+data8 0x5D4529722821B493 //A9
+data8 0x608B1654AF31BBC1 //A10
+data8 0x63D182CC98AEA859 //A11
+data8 0x6716D43D5EEB05E8 //A12
+data8 0x6A5DF884FC172E1C //A13
+data8 0x6DA3CA7EBB97976B //A14
+data8 0x70EA416D0BE6D2EF //A15
+data8 0x743176C31EBB65F2 //A16
+data8 0x7777C401A8715CF9 //A17
+data8 0x7AC1110C6D350440 //A18
+data8 0x7E02D0971CF84865 //A19
+// Polynomial coefficients for right root on [-20, -19]
+// Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
+data8 0xBFAB767F9BE21803, 0xBC5ACEF5BB1BD8B5 //A0
+data8 0x4379999999999999, 0x4029241C7F5914C8 //A1
+data8 0x46F47AE147AE147A, 0x43AC2979B64B9D7E //A2
+data8 0xAEC33E1F67152993, 0x000040A7 //A3
+data8 0xD1B71758E219616F, 0x000040DF //A4
+data8 0x8637BD05AF6CF468, 0x00004118 //A5
+data8 0x55065E9F80F293DE //A6
+data8 0x588EADA78C44EE66 //A7
+data8 0x5C15798EE22DEF09 //A8
+data8 0x5F9E8ABFD644FA63 //A9
+data8 0x6325FD7FE29BD7CD //A10
+data8 0x66AFFC5C57E1F802 //A11
+data8 0x6A3774CD7D5C0181 //A12
+data8 0x6DC152724DE2A6FE //A13
+data8 0x7149BB138EB3D0C2 //A14
+data8 0x74D32FF8A70896C2 //A15
+data8 0x785D3749F9C72BD7 //A16
+data8 0x7BE5CCF65EBC4E40 //A17
+data8 0x7F641A891B5FC652 //A18
+data8 0x7FEFFFFFFFFFFFFF //A19
+LOCAL_OBJECT_END(lgammal_right_roots_polynomial_data)
+
+// Coefficient tables for the "left root" polynomials: one set of 20
+// coefficients (A0..A19) per unit interval, from [-3, -2] down to [-19, -18].
+// Layout of every set, as stored below:
+//   A0..A2  - two data8 words each
+//   A3..A5  - one data8 word followed by a second word (e.g. 0x0000C003)
+//   A6..A19 - one data8 word each
+// NOTE(review): presumably A0..A2 are double-precision high/low pairs,
+// A3..A5 are significand + sign/exponent of an extended-precision value and
+// A6..A19 are plain doubles - confirm against the loads that read this table.
+LOCAL_OBJECT_START(lgammal_left_roots_polynomial_data)
+// Polynomial coefficients for left root on [-3, -2]
+// Lgammal is approximated by polynomial within [.084641 ; -.059553 ] range
+data8 0xBC0844590979B82E, 0xB8BC7CE8CE2ECC3B //A0
+data8 0xBFFEA12DA904B18C, 0xBC91A6B2BAD5EF6E //A1
+data8 0x4023267F3C265A51, 0x3CD7055481D03AED //A2
+data8 0xA0C2D618645F8E00, 0x0000C003 //A3
+data8 0xFA8256664F8CD2BE, 0x00004004 //A4
+data8 0xC2C422C103F57158, 0x0000C006 //A5
+data8 0x4084373F7CC70AF5 //A6
+data8 0xC0A12239BDD6BB95 //A7
+data8 0x40BDBA65E2709397 //A8
+data8 0xC0DA2D2504DFB085 //A9
+data8 0x40F758173CA5BF3C //A10
+data8 0xC11506C65C267E72 //A11
+data8 0x413318EE3A6B05FC //A12
+data8 0xC1517767F247DA98 //A13
+data8 0x41701237B4754D73 //A14
+data8 0xC18DB8A03BC5C3D8 //A15
+data8 0x41AB80953AC14A07 //A16
+data8 0xC1C9B7B76638D0A4 //A17
+data8 0x41EA727E3033E2D9 //A18
+data8 0xC20812C297729142 //A19
+//
+// Polynomial coefficients for left root on [-4, -3]
+// Lgammal is approximated by polynomial within [.147147 ; -.145158 ] range
+data8 0xBC3130AE5C4F54DB, 0xB8ED23294C13398A //A0
+data8 0xC034B99D966C5646, 0xBCE2E5FE3BC3DBB9 //A1
+data8 0x406F76DEAE0436BD, 0x3D14974DDEC057BD //A2
+data8 0xE929ACEA5979BE96, 0x0000C00A //A3
+data8 0xF47C14F8A0D52771, 0x0000400E //A4
+data8 0x88B7BC036937481C, 0x0000C013 //A5
+data8 0x4173E8F3AB9FC266 //A6
+data8 0xC1B7DBBE062FB11B //A7
+data8 0x41FD2F76DE7A47A7 //A8
+data8 0xC242225FE53B124D //A9
+data8 0x4286D12AE2FBFA30 //A10
+data8 0xC2CCFFC267A3C4C0 //A11
+data8 0x431294E10008E014 //A12
+data8 0xC357FAC8C9A2DF6A //A13
+data8 0x439F2190AB9FAE01 //A14
+data8 0xC3E44C1D8E8C67C3 //A15
+data8 0x442A8901105D5A38 //A16
+data8 0xC471C4421E908C3A //A17
+data8 0x44B92CD4D59D6D17 //A18
+data8 0xC4FB3A078B5247FA //A19
+// Polynomial coefficients for left root on [-5, -4]
+// Lgammal is approximated by polynomial within [.155671 ; -.155300 ] range
+data8 0xBC57BF3C6E8A94C1, 0xB902FB666934AC9E //A0
+data8 0xC05D224A3EF9E41F, 0xBCF6F5713913E440 //A1
+data8 0x40BB533C678A3955, 0x3D688E53E3C72538 //A2
+data8 0x869FBFF732E99B84, 0x0000C012 //A3
+data8 0xBA9537AD61392DEC, 0x00004018 //A4
+data8 0x89EAE8B1DEA06B05, 0x0000C01F //A5
+data8 0x425A8C5C53458D3C //A6
+data8 0xC2C5068B3ED6509B //A7
+data8 0x4330FFA575E99B4E //A8
+data8 0xC39BEC12DDDF7669 //A9
+data8 0x44073825725F74F9 //A10
+data8 0xC47380EBCA299047 //A11
+data8 0x44E084DD9B666437 //A12
+data8 0xC54C2DA6BF787ACF //A13
+data8 0x45B82D65C8D6FA42 //A14
+data8 0xC624D62113FE950A //A15
+data8 0x469200CC19B45016 //A16
+data8 0xC6FFDDC6DD938E2E //A17
+data8 0x476DD7C07184B9F9 //A18
+data8 0xC7D554A30085C052 //A19
+// Polynomial coefficients for left root on [-6, -5]
+// Lgammal is approximated by polynomial within [.157425 ; -.157360 ] range
+data8 0x3C9E20A87C8B79F1, 0x39488BE34B2427DB //A0
+data8 0xC08661F6A43A5E12, 0xBD3D912526D759CC //A1
+data8 0x410F79DCB794F270, 0x3DB9BEE7CD3C1BF5 //A2
+data8 0xEB7404450D0005DB, 0x0000C019 //A3
+data8 0xF7AE9846DFE4D4AB, 0x00004022 //A4
+data8 0x8AF535855A95B6DA, 0x0000C02C //A5
+data8 0x43544D54E9FE240E //A6
+data8 0xC3E8684E40CE6CFC //A7
+data8 0x447DF44C1D803454 //A8
+data8 0xC512AC305439B2BA //A9
+data8 0x45A79226AF79211A //A10
+data8 0xC63E0DFF7244893A //A11
+data8 0x46D35216C3A83AF3 //A12
+data8 0xC76903BE0C390E28 //A13
+data8 0x48004A4DECFA4FD5 //A14
+data8 0xC8954FBD243DB8BE //A15
+data8 0x492BF3A31EB18DDA //A16
+data8 0xC9C2C6A864521F3A //A17
+data8 0x4A5AB127C62E8DA1 //A18
+data8 0xCAECF60EF3183C57 //A19
+// Polynomial coefficients for left root on [-7, -6]
+// Lgammal is approximated by polynomial within [.157749 ; -.157739 ] range
+data8 0x3CC9B9E8B8D551D6, 0x3961813C8E1E10DB //A0
+data8 0xC0B3ABF7A5CEA91F, 0xBD55638D4BCB4CC4 //A1
+data8 0x4168349A25504236, 0x3E0287ECE50CCF76 //A2
+data8 0x9EC8ED6E4C219E67, 0x0000C022 //A3
+data8 0x9279EB1B799A3FF3, 0x0000402E //A4
+data8 0x90213EF8D9A5DBCF, 0x0000C03A //A5
+data8 0x4462775E857FB71C //A6
+data8 0xC52377E70B45FDBF //A7
+data8 0x45E4F3D28EDA8C28 //A8
+data8 0xC6A6E85571BD2D0B //A9
+data8 0x47695BB17E74DF74 //A10
+data8 0xC82C5AC0ED6A662F //A11
+data8 0x48EFF8159441C2E3 //A12
+data8 0xC9B22602C1B68AE5 //A13
+data8 0x4A74BA8CE7B34100 //A14
+data8 0xCB37C7E208482E4B //A15
+data8 0x4BFB5A1D57352265 //A16
+data8 0xCCC01CB3021212FF //A17
+data8 0x4D841613AC3431D1 //A18
+data8 0xCE431C9E9EE43AD9 //A19
+// Polynomial coefficients for left root on [-8, -7]
+// Lgammal is approximated by polynomial within [.157799 ; -.157798 ] range
+data8 0xBCF9C7A33AD9478C, 0xB995B0470F11E5ED //A0
+data8 0xC0E3AF76FE4C2F8B, 0xBD8DBCD503250511 //A1
+data8 0x41C838E76CAAF0D5, 0x3E5D79F5E2E069C3 //A2
+data8 0x9EF345992B262CE0, 0x0000C02B //A3
+data8 0x92AE0292985FD559, 0x0000403A //A4
+data8 0x90615420C08F7D8C, 0x0000C049 //A5
+data8 0x45828139342CEEB7 //A6
+data8 0xC67384066C31E2D3 //A7
+data8 0x476502BC4DAC2C35 //A8
+data8 0xC856FAADFF22ADC6 //A9
+data8 0x49497243255AB3CE //A10
+data8 0xCA3C768489520F6B //A11
+data8 0x4B300D1EA47AF838 //A12
+data8 0xCC223B0508AC620E //A13
+data8 0x4D14D46583338CD8 //A14
+data8 0xCE07E7A87AA068E4 //A15
+data8 0x4EFB811AD2F8BEAB //A16
+data8 0xCFF0351B51508523 //A17
+data8 0x50E4364CCBF53100 //A18
+data8 0xD1D33CFD0BF96FA6 //A19
+// Polynomial coefficients for left root on [-9, -8]
+// Lgammal is approximated by polynomial within [.157806 ; -.157806 ] range
+data8 0x3D333E4438B1B9D4, 0x39E7B956B83964C1 //A0
+data8 0xC11625EDFC63DCD8, 0xBDCF39625709EFAC //A1
+data8 0x422EA8C150480F16, 0x3EC16ED908AB7EDD //A2
+data8 0xE2598725E2E11646, 0x0000C034 //A3
+data8 0xEAFF2346DE3EBC98, 0x00004046 //A4
+data8 0x821E90DE12A0F05F, 0x0000C059 //A5
+data8 0x46B2C334AE5366FE //A6
+data8 0xC7D64314B43191B6 //A7
+data8 0x48FAF6ED5899E01B //A8
+data8 0xCA2096E4472AF37D //A9
+data8 0x4B44AAF49FB7E4C8 //A10
+data8 0xCC6A02469F2BD920 //A11
+data8 0x4D9080626D2EFC07 //A12
+data8 0xCEB515EDCF0695F7 //A13
+data8 0x4FDB1AC69BF36960 //A14
+data8 0xD1017F8274339270 //A15
+data8 0x5226A684961BAE2F //A16
+data8 0xD34E085C088404A5 //A17
+data8 0x547511892FF8960E //A18
+data8 0xD5968FA3B1ED67A9 //A19
+// Polynomial coefficients for left root on [-10, -9]
+// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
+data8 0xBD355818A2B42BA2, 0xB9B7320B6A0D61EA //A0
+data8 0xC14BAF7DA5F3770E, 0xBDE64AF9A868F719 //A1
+data8 0x4297F3E8791F9CD3, 0x3F2A553E59B4835E //A2
+data8 0xDD0C5F7E551BD13C, 0x0000C03E //A3
+data8 0x8F6F0A3B2EB08BBB, 0x00004054 //A4
+data8 0xC68D4D5AD230BA08, 0x0000C069 //A5
+data8 0x47F1E4D8C35D1A3E //A6
+data8 0xC94A8A191DB0A466 //A7
+data8 0x4AA4174F65FE6AE8 //A8
+data8 0xCBFEE6D90F94E9DD //A9
+data8 0x4D580FD3438BE16C //A10
+data8 0xCEB2ECD456D50224 //A11
+data8 0x500E049F7FE64546 //A12
+data8 0xD167F92D9600F378 //A13
+data8 0x52C342AE2B43261A //A14
+data8 0xD41F15DEEDA4B67E //A15
+data8 0x55792638748AFB7D //A16
+data8 0xD6D4D760074F6E6B //A17
+data8 0x5832469D58ED3FA9 //A18
+data8 0xD988769F3DC76642 //A19
+// Polynomial coefficients for left root on [-11, -10]
+// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
+data8 0xBDA050601F39778A, 0xBA0D4D1CE53E8241 //A0
+data8 0xC18308A7D8EA4039, 0xBE370C379D3EAD41 //A1
+data8 0x4306A49380644E6C, 0x3FBBB143C0E7B5C8 //A2
+data8 0x8FA8FB233E4AA6D2, 0x0000C049 //A3
+data8 0x802CC9D8AEAC207D, 0x00004062 //A4
+data8 0xF3F73EE651A37A13, 0x0000C07A //A5
+data8 0x493E3B550A7B9568 //A6
+data8 0xCACED38DAA060929 //A7
+data8 0x4C600B346BAB3BC6 //A8
+data8 0xCDF0F719193E3D26 //A9
+data8 0x4F8229F24528B151 //A10
+data8 0xD113A4C2D32FBBE2 //A11
+data8 0x52A56BC13DC4474D //A12
+data8 0xD43785CFAF5E3CE3 //A13
+data8 0x55C9FC3EA5941202 //A14
+data8 0xD75CD545A3341AF5 //A15
+data8 0x58F009911F77C282 //A16
+data8 0xDA8246294D210BEC //A17
+data8 0x5C1608AAC32C3A8E //A18
+data8 0xDDA446E570A397D5 //A19
+// Polynomial coefficients for left root on [-12, -11]
+// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
+data8 0x3DEACBB3081C502E, 0x3A8AA6F01DEDF745 //A0
+data8 0xC1BC8CFBFB0A9912, 0xBE6556B6504A2AE6 //A1
+data8 0x43797926206941D7, 0x40289A9644C2A216 //A2
+data8 0xF26D2A78446D0839, 0x0000C053 //A3
+data8 0xA238B1D937FFED38, 0x00004070 //A4
+data8 0xE793B4F6DE470538, 0x0000C08C //A5
+data8 0x4A9585BDC44DC45D //A6
+data8 0xCC60759520342C47 //A7
+data8 0x4E29B2F3694C0404 //A8
+data8 0xCFF4619AE7B6BBAB //A9
+data8 0x51C05DADF52B89E8 //A10
+data8 0xD38A8C7F48819A4A //A11
+data8 0x5555B6932D687860 //A12
+data8 0xD721E1FACB6C1B5B //A13
+data8 0x58EDA1E2677C8F91 //A14
+data8 0xDAB8A8EC523C1F71 //A15
+data8 0x5C84930133F30411 //A16
+data8 0xDE51952FDFD1EC49 //A17
+data8 0x601FCCEC1BBD25F1 //A18
+data8 0xE1E5F2D76B610920 //A19
+// Polynomial coefficients for left root on [-13, -12]
+// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
+data8 0xBE01612F373268ED, 0xBA97B7A18CDF103B //A0
+data8 0xC1F7328CBF7A4FAC, 0xBE89A25A6952F481 //A1
+data8 0x43F0D0FA2DBDA237, 0x40A0422EC1CE6084 //A2
+data8 0x82082DF2D32686C5, 0x0000C05F //A3
+data8 0x8D64EE9B42E68B36, 0x0000407F //A4
+data8 0xA3FFD82E08C630C9, 0x0000C09F //A5
+data8 0x4BF8C49D99123466 //A6
+data8 0xCDFEC79DDF1119ED //A7
+data8 0x50038615A892D242 //A8
+data8 0xD20929453DC8B537 //A9
+data8 0x54106A78083BA1EE //A10
+data8 0xD615A302C69E27B2 //A11
+data8 0x581CC175870FF16F //A12
+data8 0xDA233E0979E12B74 //A13
+data8 0x5C29E822BC568C80 //A14
+data8 0xDE31845DB5340FBC //A15
+data8 0x6037BFC6D498D5F9 //A16
+data8 0xE2407D92CD613E82 //A17
+data8 0x64483B9B62367EB7 //A18
+data8 0xE64B2DC830E8A799 //A19
+// Polynomial coefficients for left root on [-14, -13]
+// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
+data8 0x3E563D0B930B371F, 0x3AE779957E14F012 //A0
+data8 0xC2344C3B2F083767, 0xBEC0B7769AA3DD66 //A1
+data8 0x4469BFFF28B50D07, 0x41181E3F13ED2401 //A2
+data8 0xAE38F64DCB24D9EE, 0x0000C06A //A3
+data8 0xA5C3F52C1B3506F2, 0x0000408E //A4
+data8 0xA83BC857BCD6BA92, 0x0000C0B2 //A5
+data8 0x4D663B4727B4D81A //A6
+data8 0xCFA82C965B0F62E9 //A7
+data8 0x51EAD58C02905B71 //A8
+data8 0xD42E427970FA56AD //A9
+data8 0x56714644C57D8476 //A10
+data8 0xD8B3EC2037EC95F2 //A11
+data8 0x5AF72AE68BBA5B3D //A12
+data8 0xDD3B2152C67AA6B7 //A13
+data8 0x5F7FF5F082861B8B //A14
+data8 0xE1C2E8BE125A5B7A //A15
+data8 0x64066E92FE9EBE7D //A16
+data8 0xE64B4201CDF9F138 //A17
+data8 0x689186351E58AA88 //A18
+data8 0xEAD132A585DFC60A //A19
+// Polynomial coefficients for left root on [-15, -14]
+// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
+data8 0xBE6D7DDE12700AC1, 0xBB1E025BF1667FB5 //A0
+data8 0xC273077763F60AD5, 0xBF2A1698184C7A9A //A1
+data8 0x44E6A1BF095B1AB3, 0x4178D5AE8A4A2874 //A2
+data8 0x8F8E0D5060FCC767, 0x0000C076 //A3
+data8 0x800CC1DCFF092A57, 0x0000409E //A4
+data8 0xF3AB0BA9D14D37D1, 0x0000C0C5 //A5
+data8 0x4EDE3000A2F6D565 //A6
+data8 0xD15EC613B9C8C800 //A7
+data8 0x53E003309FEECCAA //A8
+data8 0xD660ED908D8B15C4 //A9
+data8 0x58E21E9B51A1C4AE //A10
+data8 0xDB639745DB82210D //A11
+data8 0x5DE55BB60C68FCF6 //A12
+data8 0xE06772BA3FCA23C6 //A13
+data8 0x62E9E58B4F702C31 //A14
+data8 0xE56CBA49B071ABE2 //A15
+data8 0x67EFF31E4F2BA36A //A16
+data8 0xEA7232C8804F32C3 //A17
+data8 0x6CF5EFEE929A0928 //A18
+data8 0xEF742EE03EC3E8FF //A19
+// Polynomial coefficients for left root on [-16, -15]
+// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
+data8 0xBEDCC628FEAC7A1B, 0xBB80582C8BEBB198 //A0
+data8 0xC2B3076EE752595E, 0xBF5388F55AFAE53E //A1
+data8 0x4566A1AAD96EBCB5, 0x421F0FEFE2444293 //A2
+data8 0x8F8D4D3DE9850DB2, 0x0000C082 //A3
+data8 0x800BDD6DA2CE184C, 0x000040AE //A4
+data8 0xF3A8EC4C9CDC7A43, 0x0000C0D9 //A5
+data8 0x505E2FAFDB81263F //A6
+data8 0xD31EC5B3A7506CD9 //A7
+data8 0x55E002F77E999810 //A8
+data8 0xD8A0ED4C9B5C2900 //A9
+data8 0x5B621E4A8267C401 //A10
+data8 0xDE2396E5BFCFDA7A //A11
+data8 0x60E55B43BE6F9A79 //A12
+data8 0xE3A772324C7405FA //A13
+data8 0x6669E4E9B7E57A2D //A14
+data8 0xE92CB989F8A8FB37 //A15
+data8 0x6BEFF2368849A36E //A16
+data8 0xEEB23234FE191D55 //A17
+data8 0x7175EF5D1080B105 //A18
+data8 0xF4342ED7B1B7BE31 //A19
+// Polynomial coefficients for left root on [-17, -16]
+// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
+data8 0xBF28AEEE7B58C790, 0xBBC4448DE371FA0A //A0
+data8 0xC2F436F56B3B89B1, 0xBF636755245AC63A //A1
+data8 0x45E98A22535D115D, 0x4298467DA93DB784 //A2
+data8 0xAC176F3775E6FCF2, 0x0000C08E //A3
+data8 0xA3114F53A9FEB908, 0x000040BE //A4
+data8 0xA4D168A8334AFE5A, 0x0000C0EE //A5
+data8 0x51E5B0E7EC7182CF //A6
+data8 0xD4E77D67B876D6B4 //A7
+data8 0x57E9F7C30C098C83 //A8
+data8 0xDAED29B0489EF7A7 //A9
+data8 0x5DF09486F8A524B8 //A10
+data8 0xE0F30B19910A2393 //A11
+data8 0x63F60E02AB3109F4 //A12
+data8 0xE6F9B8A3431854D5 //A13
+data8 0x69FE2D4A6D94218E //A14
+data8 0xED01C7E272A73560 //A15
+data8 0x7005017D82B186B6 //A16
+data8 0xF3096A81A69BD8AE //A17
+data8 0x76104951BAD67D5C //A18
+data8 0xF90FECC99786FD5B //A19
+// Polynomial coefficients for left root on [-18, -17]
+// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
+data8 0x3F3C19A53328E26A, 0x3BE238D7BA036B3B //A0
+data8 0xC336C16C16C16C13, 0xBFEACE245DEC56F3 //A1
+data8 0x46702E85C0898B70, 0x432C922B64FD1DA4 //A2
+data8 0xF57B99A1C0343350, 0x0000C09A //A3
+data8 0x82EC9634223DF90D, 0x000040CF //A4
+data8 0x94F66D7557E3237D, 0x0000C103 //A5
+data8 0x5376118B79AE34D6 //A6
+data8 0xD6BAE7106D52CE49 //A7
+data8 0x5A00BD48CC8E11AB //A8
+data8 0xDD4529722833E2DF //A9
+data8 0x608B1654AF5F46AF //A10
+data8 0xE3D182CC90D8723F //A11
+data8 0x6716D43D46706AA0 //A12
+data8 0xEA5DF888C5B428D3 //A13
+data8 0x6DA3CA85888931A6 //A14
+data8 0xF0EA40EF2AC7E070 //A15
+data8 0x743175D1A251AFCD //A16
+data8 0xF777CB6E2B550D73 //A17
+data8 0x7AC11E468A134A51 //A18
+data8 0xFE02B6BDD0FC40AA //A19
+// Polynomial coefficients for left root on [-19, -18]
+// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
+data8 0xBFAB767F9BE217FC, 0xBC4A5541CE0D8D0D //A0
+data8 0xC379999999999999, 0xC01A84981B490BE8 //A1
+data8 0x46F47AE147AE147A, 0x43AC2987BBC466EB //A2
+data8 0xAEC33E1F67152987, 0x0000C0A7 //A3
+data8 0xD1B71758E2196153, 0x000040DF //A4
+data8 0x8637BD05AF6D420E, 0x0000C118 //A5
+data8 0x55065E9F80F293B2 //A6
+data8 0xD88EADA78C44BFA7 //A7
+data8 0x5C15798EE22EC6CD //A8
+data8 0xDF9E8ABFD67895CF //A9
+data8 0x6325FD7FE13B0DE0 //A10
+data8 0xE6AFFC5C3DE70858 //A11
+data8 0x6A3774CE81C70D43 //A12
+data8 0xEDC1527412D8129F //A13
+data8 0x7149BABCDA8B7A72 //A14
+data8 0xF4D330AD49071BB5 //A15
+data8 0x785D4046F4C5F1FD //A16
+data8 0xFBE59BFEDBA73FAF //A17
+data8 0x7F64BEF2B2EC8DA1 //A18
+data8 0xFFEFFFFFFFFFFFFF //A19
+LOCAL_OBJECT_END(lgammal_left_roots_polynomial_data)
+
+
+//==============================================================
+// Code
+//==============================================================
+
+.section .text
+GLOBAL_LIBM_ENTRY(__libm_lgammal)
+{ .mfi
+ getf.exp rSignExpX = f8
+ // Test x for NaTVal, NaN, +/-0, +/-INF, denormals
+ fclass.m p6,p0 = f8,0x1EF
+ addl r17Ones = 0x1FFFF, r0 // exponent mask
+}
+{ .mfi
+ addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp
+ fcvt.fx.s1 fXint = f8 // Convert arg to int (int repres. in FR)
+ adds rDelta = 0x3FC, r0
+}
+;;
+{ .mfi
+ getf.sig rSignifX = f8
+ fcmp.lt.s1 p15, p14 = f8, f0
+ shl rDelta = rDelta, 20 // single precision 1.5
+}
+{ .mfi
+ ld8 GR_ad_z_1 = [GR_ad_z_1]// get pointer to Constants_Z_1
+ fma.s1 fTwo = f1, f1, f1 // 2.0
+ addl rExp8 = 0x10002, r0 // exponent of 8.0
+}
+;;
+{ .mfi
+ alloc rPFS_SAVED = ar.pfs, 0, 34, 4, 0 // get some registers
+ fmerge.s fAbsX = f1, f8 // |x|
+ and rExpX = rSignExpX, r17Ones // mask sign bit
+}
+{ .mib
+ addl rExpHalf = 0xFFFE, r0 // exponent of 0.5
+ addl rExp2 = 0x10000, r0 // exponent of 2.0
+ // branch out if x is NaTVal, NaN, +/-0, +/-INF, or denormalized number
+(p6) br.cond.spnt lgammal_spec
+}
+;;
+_deno_back_to_main_path:
+{ .mfi
+ // Point to Constants_G_H_h1
+ add rTbl1Addr = 0x040, GR_ad_z_1
+ frcpa.s1 fRcpX, p0 = f1, f8 // initial approximation of 1/x
+ extr.u GR_Index1 = rSignifX, 59, 4
+}
+{ .mib
+(p14) cmp.ge.unc p8, p0 = rExpX, rExp8 // p8 = 1 if x >= 8.0
+ adds rZ625 = 0x3F2, r0
+(p8) br.cond.spnt lgammal_big_positive // branch out if x >= 8.0
+}
+;;
+{ .mfi
+ shladd rZ1offsett = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
+ fmerge.se fSignifX = f1, f8 // sifnificand of x
+ // Get high 15 bits of significand
+ extr.u GR_X_0 = rSignifX, 49, 15
+}
+{ .mib
+ cmp.lt.unc p9, p0 = rExpX, rExpHalf // p9 = 1 if |x| < 0.5
+ // set p11 if 2 <= x < 4
+(p14) cmp.eq.unc p11, p0 = rExpX, rExp2
+(p9) br.cond.spnt lgammal_0_half // branch out if |x| < 0.5
+}
+;;
+{ .mfi
+ ld4 GR_Z_1 = [rZ1offsett] // Load Z_1
+ fms.s1 fA5L = f1, f1, f8 // for 0.75 <= x < 1.3125 path
+ shl rZ625 = rZ625, 20 // sinfle precision 0.625
+}
+{ .mib
+ setf.s FR_MHalf = rDelta
+ // set p10 if x >= 4.0
+(p14) cmp.gt.unc p10, p0 = rExpX, rExp2
+ // branch to special path for 4.0 <= x < 8
+(p10) br.cond.spnt lgammal_4_8
+}
+;;
+{ .mfi
+ // for 1.3125 <= x < 1.5625 path
+ addl rPolDataPtr= @ltoff(lgammal_loc_min_data),gp
+ // argument of polynomial approximation for 1.5625 <= x < 2.25
+ fms.s1 fB4 = f8, f1, fTwo
+ cmp.eq p12, p0 = rExpX, rExpHalf
+}
+{ .mib
+ addl rExpOne = 0xFFFF, r0 // exponent of 1.0
+ // set p11 if significand of x >= 1.125
+(p11) cmp.le p11, p0 = 2, GR_Index1
+(p11) br.cond.spnt lgammal_2Q_4
+}
+;;
+{ .mfi
+ // point to xMin for 1.3125 <= x < 1.5625 path
+ ld8 rPolDataPtr = [rPolDataPtr]
+ fcvt.xf fFltIntX = fXint // RTN(x)
+(p14) cmp.eq.unc p13, p7 = rExpX, rExpOne // p13 set if 1.0 <= x < 2.0
+}
+{ .mib
+ setf.s FR_FracX = rZ625
+ // set p12 if |x| < 0.75
+(p12) cmp.gt.unc p12, p0 = 8, GR_Index1
+ // branch out to special path for |x| < 0.75
+(p12) br.cond.spnt lgammal_half_3Q
+}
+;;
+.pred.rel "mutex", p7, p13
+{ .mfi
+ getf.sig rXRnd = fXint // integer part of the input value
+ fnma.s1 fInvX = f8, fRcpX, f1 // start of 1st NR iteration
+ // Get bits 30-15 of X_0 * Z_1
+ pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15
+}
+{ .mib
+(p7) cmp.eq p6, p0 = rExpX, rExp2 // p6 set if 2.0 <= x < 2.25
+(p13) cmp.le p6, p0 = 9, GR_Index1
+ // branch to special path 1.5625 <= x < 2.25
+(p6) br.cond.spnt lgammal_13Q_2Q
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ shladd GR_ad_tbl_1 = GR_Index1, 4, rTbl1Addr // Point to G_1
+ fma.s1 fSix = fTwo, fTwo, fTwo // 6.0
+ add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_Q
+}
+{ .mib
+ add rTmpPtr3 = -0x50, GR_ad_z_1
+(p13) cmp.gt p7, p0 = 5, GR_Index1
+ // branch to special path 0.75 <= x < 1.3125
+(p7) br.cond.spnt lgammal_03Q_1Q
+}
+;;
+{ .mfi
+ add rTmpPtr = 8, GR_ad_tbl_1
+ fma.s1 fRoot = f8, f1, f1 // x + 1
+ // Absolute value of int arg. Will be used as index in table with roots
+ sub rXRnd = r0, rXRnd
+}
+{ .mib
+ ldfe fA5L = [rPolDataPtr], 16 // xMin
+ addl rNegSingularity = 0x3003E, r0
+(p14) br.cond.spnt lgammal_loc_min
+}
+;;
+{ .mfi
+ ldfps FR_G, FR_H = [GR_ad_tbl_1], 8 // Load G_1, H_1
+ nop.f 0
+ add rZ2Addr = 0x140, GR_ad_z_1 // Point to Constants_Z_2
+}
+{ .mib
+ ldfd FR_h = [rTmpPtr] // Load h_1
+ // If arg is less or equal to -2^63
+ cmp.geu.unc p8,p0 = rSignExpX, rNegSingularity
+ // Singularity for x < -2^63 since all such arguments are integers
+ // branch to special code which deals with singularity
+(p8) br.cond.spnt lgammal_singularity
+}
+;;
+{ .mfi
+ ldfe FR_log2_hi = [GR_ad_q], 32 // Load log2_hi
+ nop.f 0
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+}
+{ .mfi
+ ldfe FR_log2_lo = [rTmpPtr3], 32 // Load log2_lo
+ fms.s1 fDx = f8, f1, fFltIntX // x - RTN(x)
+ // index in table with roots and bounds
+ adds rXint = -2, rXRnd
+}
+;;
+{ .mfi
+ ldfe FR_Q4 = [GR_ad_q], 32 // Load Q4
+ nop.f 0
+ // set p12 if x may be close to negative root: -19.5 < x < -2.0
+ cmp.gtu p12, p0 = 18, rXint
+}
+{ .mfi
+ shladd GR_ad_z_2 = GR_Index2, 2, rZ2Addr // Point to Z_2
+ fma.s1 fRcpX = fInvX, fRcpX, fRcpX // end of 1st NR iteration
+ // Point to Constants_G_H_h2
+ add rTbl2Addr = 0x180, GR_ad_z_1
+}
+;;
+{ .mfi
+ shladd GR_ad_tbl_2 = GR_Index2, 4, rTbl2Addr // Point to G_2
+ // set p9 if x is integer and negative
+ fcmp.eq.s1 p9, p0 = f8,fFltIntX
+ // Point to Constants_G_H_h3
+ add rTbl3Addr = 0x280, GR_ad_z_1
+}
+{ .mfi
+ ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
+ nop.f 0
+ sub GR_N = rExpX, rExpHalf, 1
+}
+;;
+{ .mfi
+ ldfe FR_Q3 = [rTmpPtr3], 32 // Load Q3
+ nop.f 0
+ // Point to lnsin polynomial coefficients
+ adds rLnSinDataPtr = 864, rTbl3Addr
+}
+{ .mfi
+ ldfe FR_Q2 = [GR_ad_q],32 // Load Q2
+ nop.f 0
+ add rTmpPtr = 8, GR_ad_tbl_2
+}
+;;
+{ .mfi
+ ldfe FR_Q1 = [rTmpPtr3] // Load Q1
+ fcmp.lt.s1 p0, p15 = fAbsX, fSix // p15 is set when x < -6.0
+ // point to table with roots and bounds
+ adds rRootsBndAddr = -1296, GR_ad_z_1
+}
+{ .mfb
+ // Put integer N into rightmost significand
+ setf.sig fFloatN = GR_N
+ fma.s1 fThirteen = fSix, fTwo, f1 // 13.0
+ // Singularity if -2^63 < x < 0 and x is integer
+ // branch to special code which deals with singularity
+(p9) br.cond.spnt lgammal_singularity
+}
+;;
+{ .mfi
+ ldfps FR_G2, FR_H2 = [GR_ad_tbl_2] // Load G_2, H_2
+ // y = |x|/2^(exponent(x)) - 1.5
+ fms.s1 FR_FracX = fSignifX, f1, FR_MHalf
+ // Get bits 30-15 of X_1 * Z_2
+ pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15
+}
+{ .mfi
+ ldfd FR_h2 = [rTmpPtr] // Load h_2
+ fma.s1 fDxSqr = fDx, fDx, f0 // deltaX^2
+ adds rTmpPtr3 = 128, rLnSinDataPtr
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ getf.exp rRoot = fRoot // sign and biased exponent of (x + 1)
+ nop.f 0
+ // set p6 if -4 < x <= -2
+ cmp.eq p6, p0 = rExpX, rExp2
+}
+{ .mfi
+ ldfpd fLnSin2, fLnSin2L = [rLnSinDataPtr], 16
+ fnma.s1 fInvX = f8, fRcpX, f1 // start of 2nd NR iteration
+ sub rIndexPol = rExpX, rExpHalf // index of polynom
+}
+;;
+{ .mfi
+ ldfe fLnSin4 = [rLnSinDataPtr], 96
+ // p10 is set if x is potential "right" root
+ // p11 set for possible "left" root
+ fcmp.lt.s1 p10, p11 = fDx, f0
+ shl rIndexPol = rIndexPol, 6 // (i*16)*4
+}
+{ .mfi
+ ldfpd fLnSin18, fLnSin20 = [rTmpPtr3], 16
+ nop.f 0
+ mov rExp2tom7 = 0x0fff8 // Exponent of 2^-7
+}
+;;
+{ .mfi
+ getf.sig rSignifDx = fDx // Get significand of RTN(x)
+ nop.f 0
+ // set p6 if -4 < x <= -3.0
+(p6) cmp.le.unc p6, p0 = 0x8, GR_Index1
+}
+{ .mfi
+ ldfpd fLnSin22, fLnSin24 = [rTmpPtr3], 16
+ nop.f 0
+ // mask sign bit in the exponent of (x + 1)
+ and rRoot = rRoot, r17Ones
+}
+;;
+{ .mfi
+ ldfe fLnSin16 = [rLnSinDataPtr], -80
+ nop.f 0
+ extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
+}
+{ .mfi
+ ldfpd fLnSin26, fLnSin28 = [rTmpPtr3], 16
+ nop.f 0
+ and rXRnd = 1, rXRnd
+}
+;;
+{ .mfi
+ shladd GR_ad_tbl_3 = GR_Index3, 4, rTbl3Addr // Point to G_3
+ fms.s1 fDxSqrL = fDx, fDx, fDxSqr // low part of deltaX^2
+ // potential "left" root
+(p11) adds rRootsBndAddr = 560, rRootsBndAddr
+}
+{ .mib
+ ldfpd fLnSin30, fLnSin32 = [rTmpPtr3], 16
+ // set p7 if |x+1| < 2^-7
+ cmp.lt p7, p0 = rRoot, rExp2tom7
+ // branch to special path for |x+1| < 2^-7
+(p7) br.cond.spnt _closeToNegOne
+}
+;;
+{ .mfi
+ ldfps FR_G3, FR_H3 = [GR_ad_tbl_3], 8 // Load G_3, H_3
+ fcmp.lt.s1 p14, p0 = fAbsX, fThirteen // set p14 if x > -13.0
+ // base address of polynomial on range [-6.0, -0.75]
+ adds rPolDataPtr = 3440, rTbl3Addr
+}
+{ .mfi
+ // (i*16)*4 + (i*16)*16 - offset of polynomial on range [-6.0, -0.75]
+ shladd rTmpPtr = rIndexPol, 2, rIndexPol
+ fma.s1 fXSqr = FR_FracX, FR_FracX, f0 // y^2
+ // point to left "near root" bound
+(p12) shladd rRootsBndAddr = rXint, 4, rRootsBndAddr
+}
+;;
+{ .mfi
+ ldfpd fLnSin34, fLnSin36 = [rTmpPtr3], 16
+ fma.s1 fRcpX = fInvX, fRcpX, fRcpX // end of 2nd NR iteration
+ // add special offset if -4 < x <= -3.0
+(p6) adds rPolDataPtr = 640, rPolDataPtr
+}
+{ .mfi
+ // point to right "near root" bound
+ adds rTmpPtr2 = 8, rRootsBndAddr
+ fnma.s1 fMOne = f1, f1, f0 // -1.0
+ // Point to Bernoulli numbers
+ adds rBernulliPtr = 544, rTbl3Addr
+}
+;;
+{ .mfi
+ // left bound of "near root" range
+(p12) ld8 rLeftBound = [rRootsBndAddr]
+ fmerge.se fNormDx = f1, fDx // significand of DeltaX
+ // base address + offset for polynomial coeff. on range [-6.0, -0.75]
+ add rPolDataPtr = rPolDataPtr, rTmpPtr
+}
+{ .mfi
+ // right bound of "near root" range
+(p12) ld8 rRightBound = [rTmpPtr2]
+ fcvt.xf fFloatN = fFloatN
+ // special "Bernoulli" numbers for Stirling's formula for -13 < x < -6
+(p14) adds rBernulliPtr = 160, rBernulliPtr
+}
+;;
+{ .mfi
+ ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
+ fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
+ adds rTmpPtr3 = -160, rTmpPtr3
+}
+{ .mfb
+ adds rTmpPtr = 80, rPolDataPtr
+ fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
+ // p15 is set if -2^63 < x < -6.0 and x is not an integer
+ // branch to path with implementation using Stirling's formula for neg. x
+(p15) br.cond.spnt _negStirling
+}
+;;
+{ .mfi
+ ldfpd fA3, fA3L = [rPolDataPtr], 16 // A3
+ fma.s1 fDelX4 = fDxSqr, fDxSqr, f0 // deltaX^4
+ // Get high 4 bits of signif
+ extr.u rIndex1Dx = rSignifDx, 59, 4
+}
+{ .mfi
+ ldfe fA5 = [rTmpPtr], -16 // A5
+ fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
+ adds rLnSinTmpPtr = 16, rLnSinDataPtr
+}
+;;
+{ .mfi
+ ldfpd fA0, fA0L = [rPolDataPtr], 16 // A0
+ fma.s1 fLnSin20 = fLnSin20, fDxSqr, fLnSin18
+ // Get high 15 bits of significand
+ extr.u rX0Dx = rSignifDx, 49, 15
+}
+{ .mfi
+ ldfe fA4 = [rTmpPtr], 192 // A4
+ fms.s1 fXSqrL = FR_FracX, FR_FracX, fXSqr // low part of y^2
+ shladd GR_ad_z_1 = rIndex1Dx, 2, GR_ad_z_1 // Point to Z_1
+}
+;;
+{ .mfi
+ ldfpd fA1, fA1L = [rPolDataPtr], 16 // A1
+ fma.s1 fX4 = fXSqr, fXSqr, f0 // y^4
+ adds rTmpPtr2 = 32, rTmpPtr
+}
+{ .mfi
+ ldfpd fA18, fA19 = [rTmpPtr], 16 // A18, A19
+ fma.s1 fLnSin24 = fLnSin24, fDxSqr, fLnSin22
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fLnSin6 = [rLnSinDataPtr], 32
+ fma.s1 fLnSin28 = fLnSin28, fDxSqr, fLnSin26
+ nop.i 0
+}
+{ .mfi
+ ldfe fLnSin8 = [rLnSinTmpPtr], 32
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA20, fA21 = [rTmpPtr], 16 // A20, A21
+ fma.s1 fLnSin32 = fLnSin32, fDxSqr, fLnSin30
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA22, fA23 = [rTmpPtr2], 16 // A22, A23
+ fma.s1 fB20 = f1, f1, FR_MHalf // 2.5
+(p12) cmp.ltu.unc p6, p0 = rSignifX, rLeftBound
+}
+;;
+{ .mfi
+ ldfpd fA2, fA2L = [rPolDataPtr], 16 // A2
+ fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
+ // set p6 if x falls in "near root" range
+(p6) cmp.geu.unc p6, p0 = rSignifX, rRightBound
+}
+{ .mfb
+ adds rTmpPtr3 = -64, rTmpPtr
+ fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
+ // branch to special path if x falls in "near root" range
+(p6) br.cond.spnt _negRoots
+}
+;;
+{ .mfi
+ ldfpd fA24, fA25 = [rTmpPtr2], 16 // A24, A25
+ fma.s1 fLnSin36 = fLnSin36, fDxSqr, fLnSin34
+(p11) cmp.eq.unc p7, p0 = 1,rXint // p7 set if -3.0 < x < -2.5
+}
+{ .mfi
+ adds rTmpPtr = -48, rTmpPtr
+ fma.s1 fLnSin20 = fLnSin20, fDxSqr, fLnSin16
+ addl rDelta = 0x5338, r0 // significand of -2.605859375
+}
+;;
+{ .mfi
+ getf.exp GR_N = fDx // Get N = exponent of DeltaX
+ fma.s1 fX6 = fX4, fXSqr, f0 // y^6
+ // p7 set if -2.605859375 <= x < -2.5
+(p7) cmp.gt.unc p7, p0 = rDelta, GR_X_0
+}
+{ .mfb
+ ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
+ fma.s1 fDelX8 = fDelX4, fDelX4, f0 // deltaX^8
+ // branch to special path for -2.605859375 <= x < -2.5
+(p7) br.cond.spnt _neg2andHalf
+}
+;;
+{ .mfi
+ ldfpd fA14, fA15 = [rTmpPtr3], 16 // A14, A15
+ fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
+ adds rTmpPtr2 = 128 , rPolDataPtr
+}
+{ .mfi
+ ldfpd fA16, fA17 = [rTmpPtr], 16 // A16, A17
+ fma.s1 fLnSin28 = fLnSin28, fDelX4, fLnSin24
+ adds rPolDataPtr = 144 , rPolDataPtr
+}
+;;
+{ .mfi
+ ldfe fLnSin10 = [rLnSinDataPtr], 32
+ fma.s1 fRes1H = fA3, FR_FracX, f0 // (A3*y)hi
+ and GR_N = GR_N, r17Ones // mask sign bit
+}
+{ .mfi
+ ldfe fLnSin12 = [rLnSinTmpPtr]
+ fma.s1 fDelX6 = fDxSqr, fDelX4, f0 // DeltaX^6
+ shladd GR_ad_tbl_1 = rIndex1Dx, 4, rTbl1Addr // Point to G_1
+}
+;;
+{ .mfi
+ ldfe fA13 = [rPolDataPtr], -32 // A13
+ fma.s1 fA4 = fA5, FR_FracX, fA4 // A5*y + A4
+ // Get bits 30-15 of X_0 * Z_1
+ pmpyshr2.u GR_X_1 = rX0Dx, GR_Z_1, 15
+}
+{ .mfi
+ ldfe fA12 = [rTmpPtr2], -32 // A12
+ fms.s1 FR_r = FR_G, fSignifX, f1 // r = G * S_hi - 1
+ sub GR_N = GR_N, rExpHalf, 1 // unbisaed exponent of DeltaX
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+.pred.rel "mutex",p10,p11
+{ .mfi
+ ldfe fA11 = [rPolDataPtr], -32 // A11
+ // High part of log(|x|) = Y_hi = N * log2_hi + H
+ fma.s1 fResH = fFloatN, FR_log2_hi, FR_H
+(p10) cmp.eq p8, p9 = rXRnd, r0
+}
+{ .mfi
+ ldfe fA10 = [rTmpPtr2], -32 // A10
+ fma.s1 fRes6H = fA1, FR_FracX, f0 // (A1*y)hi
+(p11) cmp.eq p9, p8 = rXRnd, r0
+}
+;;
+{ .mfi
+ ldfe fA9 = [rPolDataPtr], -32 // A9
+ fma.s1 fB14 = fLnSin6, fDxSqr, f0 // (LnSin6*deltaX^2)hi
+ cmp.eq p6, p7 = 4, rSgnGamSize
+}
+{ .mfi
+ ldfe fA8 = [rTmpPtr2], -32 // A8
+ fma.s1 fA18 = fA19, FR_FracX, fA18
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fA7 = [rPolDataPtr] // A7
+ fma.s1 fA23 = fA23, FR_FracX, fA22
+ nop.i 0
+}
+{ .mfi
+ ldfe fA6 = [rTmpPtr2] // A6
+ fma.s1 fA21 = fA21, FR_FracX, fA20
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fLnSin14 = [rLnSinDataPtr]
+ fms.s1 fRes1L = fA3, FR_FracX, fRes1H // delta((A3*y)hi)
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+}
+{ .mfi
+ setf.sig fFloatNDx = GR_N
+ fadd.s1 fPol = fRes1H, fA2 // (A3*y + A2)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfps FR_G, FR_H = [GR_ad_tbl_1], 8 // Load G_1, H_1
+ fma.s1 fRes2H = fA4, fXSqr, f0 // ((A5 + A4*y)*y^2)hi
+ nop.i 0
+}
+{ .mfi
+ shladd GR_ad_z_2 = GR_Index2, 2, rZ2Addr // Point to Z_2
+ fma.s1 fA25 = fA25, FR_FracX, fA24
+ shladd GR_ad_tbl_2 = GR_Index2, 4, rTbl2Addr // Point to G_2
+}
+;;
+.pred.rel "mutex",p8,p9
+{ .mfi
+ ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
+ fms.s1 fRes6L = fA1, FR_FracX, fRes6H // delta((A1*y)hi)
+ // sign of GAMMA(x) is negative
+(p8) adds rSgnGam = -1, r0
+}
+{ .mfi
+ adds rTmpPtr = 8, GR_ad_tbl_2
+ fadd.s1 fRes3H = fRes6H, fA0 // (A1*y + A0)hi
+ // sign of GAMMA(x) is positive
+(p9) adds rSgnGam = 1, r0
+}
+;;
+{ .mfi
+ ldfps FR_G2, FR_H2 = [GR_ad_tbl_2] // Load G_2, H_2
+ // (LnSin6*deltaX^2 + LnSin4)hi
+ fadd.s1 fLnSinH = fB14, fLnSin4
+ nop.i 0
+}
+{ .mfi
+ ldfd FR_h2 = [rTmpPtr] // Load h_2
+ fms.s1 fB16 = fLnSin6, fDxSqr, fB14 // delta(LnSin6*deltaX^2)
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfd fhDelX = [GR_ad_tbl_1] // Load h_1
+ fma.s1 fA21 = fA21, fXSqr, fA18
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin36 = fLnSin36, fDelX4, fLnSin32
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1L = fA3L, FR_FracX, fRes1L // (A3*y)lo
+ // Get bits 30-15 of X_1 * Z_2
+ pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fPolL = fA2, fPol
+ nop.i 0
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ nop.m 0
+ // delta(((A5 + A4*y)*y^2)hi)
+ fms.s1 fRes2L = fA4, fXSqr, fRes2H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // (((A5 + A4*y)*y^2) + A3*y + A2)hi
+ fadd.s1 fRes4H = fRes2H, fPol
+ nop.i 0
+}
+;;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ fma.s1 fRes6L = fA1L, FR_FracX, fRes6L // (A1*y)lo
+ nop.i 0
+}
+{ .mfi
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ fsub.s1 fRes3L = fA0, fRes3H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fLnSinL = fLnSin4, fLnSinH
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2)hi
+ fma.s1 fB18 = fLnSinH, fDxSqr, f0
+ nop.i 0
+}
+;;
+{ .mfi
+ adds rTmpPtr = 8, rTbl3Addr
+ fma.s1 fB16 = fLnSin6, fDxSqrL, fB16 // (LnSin6*deltaX^2)lo
+ extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA25 = fA25, fXSqr, fA23
+ nop.i 0
+}
+;;
+{ .mfi
+ shladd GR_ad_tbl_3 = GR_Index3, 4, rTbl3Addr // Point to G_3
+ fadd.s1 fPolL = fPolL, fRes1H
+ nop.i 0
+}
+{ .mfi
+ shladd rTmpPtr = GR_Index3, 4, rTmpPtr // Point to G_3
+ fadd.s1 fRes1L = fRes1L, fA2L // (A3*y)lo + A2lo
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfps FR_G3, FR_H3 = [GR_ad_tbl_3] // Load G_3, H_3
+ fma.s1 fRes2L = fA4, fXSqrL, fRes2L // ((A5 + A4*y)*y^2)lo
+ nop.i 0
+}
+{ .mfi
+ ldfd FR_h3 = [rTmpPtr] // Load h_3
+ fsub.s1 fRes4L = fPol, fRes4H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // ((((A5 + A4*y)*y^2) + A3*y + A2)*y^2)hi
+ fma.s1 fRes7H = fRes4H, fXSqr, f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, FR_FracX, fA14
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes3L = fRes3L, fRes6H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes6L = fRes6L, fA0L // (A1*y)lo + A0lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fLnSinL = fLnSinL, fB14
+
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // delta((LnSin6*deltaX^2 + LnSin4)*deltaX^2)
+ fms.s1 fB20 = fLnSinH, fDxSqr, fB18
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fPolL = fPolL, fRes1L // (A3*y + A2)lo
+
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2 + LnSin2)hi
+ fadd.s1 fLnSin6 = fB18, fLnSin2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes4L = fRes4L, fRes2H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA17 = fA17, FR_FracX, fA16
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // delta(((((A5 + A4*y)*y^2) + A3*y + A2)*y^2)
+ fms.s1 fRes7L = fRes4H, fXSqr, fRes7H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fPol = fRes7H, fRes3H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes3L = fRes3L, fRes6L // (A1*y + A0)lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA25 = fA25, fX4, fA21
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // (LnSin6*deltaX^2 + LnSin4)lo
+ fadd.s1 fLnSinL = fLnSinL, fB16
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB20 = fLnSinH, fDxSqrL, fB20
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fLnSin4 = fLnSin2, fLnSin6
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // (((LnSin6*deltaX^2 + LnSin4)*deltaX^2 + LnSin2)*DeltaX^2)hi
+ fma.s1 fLnSinH = fLnSin6, fDxSqr, f0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // ((A5 + A4*y)*y^2)lo + (A3*y + A2)lo
+ fadd.s1 fRes2L = fRes2L, fPolL
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA17 = fA17, fXSqr, fA15
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // ((((A5 + A4*y)*y^2) + A3*y + A2)*y^2)lo
+ fma.s1 fRes7L = fRes4H, fXSqrL, fRes7L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fPolL = fRes3H, fPol
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA13 = fA13, FR_FracX, fA12
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, FR_FracX, fA10
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2)lo
+ fma.s1 fB20 = fLnSinL, fDxSqr, fB20
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fLnSin4 = fLnSin4, fB18
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fLnSinL = fLnSin6, fDxSqr, fLnSinH
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // (((A5 + A4*y)*y^2) + A3*y + A2)lo
+ fadd.s1 fRes4L = fRes4L, fRes2L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fhDelX = fhDelX, FR_h2 // h = h_1 + h_2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes7L = fRes7L, fRes3L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fPolL = fPolL, fRes7H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fcvt.xf fFloatNDx = fFloatNDx
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2)lo + (LnSin2)lo
+ fadd.s1 fLnSin2L = fLnSin2L, fB20
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA25 = fA25, fX4, fA17
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA13 = fA13, fXSqr, fA11
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, FR_FracX, fA8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA7 = fA7, FR_FracX, fA6
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin36 = fLnSin36, fDelX8, fLnSin28
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin14 = fLnSin14, fDxSqr, fLnSin12
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin10 = fLnSin10, fDxSqr, fLnSin8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fRDx = FR_G, fNormDx, f1 // r = G * S_hi - 1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // poly_lo = r * Q4 + Q3
+ fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // ((((A5 + A4*y)*y^2) + A3*y + A2)*y^2)lo + (A1*y + A0)lo
+ fma.s1 fRes7L = fRes4L, fXSqr, fRes7L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA25 = fA25, fX4, fA13
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, fXSqr, fA7
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // h = N * log2_lo + h
+ fma.s1 FR_h = fFloatN, FR_log2_lo, FR_h
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fhDelX = fhDelX, FR_h3 // h = (h_1 + h_2) + h_3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin36 = fLnSin36, fDelX6, fLnSin20
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin14 = fLnSin14, fDelX4, fLnSin10
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = r * Q4 + Q3
+ fma.s1 fPolyLoDx = fRDx, FR_Q4, FR_Q3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 fRDxSq = fRDx, fRDx // rsq = r * r
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // Y_hi = N * log2_hi + H
+ fma.s1 fResLnDxH = fFloatNDx, FR_log2_hi, FR_H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA25, fX4, fA9
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fPolL = fPolL, fRes7L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fLnSin4 = fLnSin4, fLnSin2L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // h = N * log2_lo + h
+ fma.s1 fhDelX = fFloatNDx, FR_log2_lo, fhDelX
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin36 = fLnSin36, fDelX8, fLnSin14
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2 + LnSin2)lo
+ fma.s1 fLnSinL = fLnSin6, fDxSqrL, fLnSinL
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo * r + Q2
+ fma.s1 fPolyLoDx = fPolyLoDx, fRDx, FR_Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fRDxCub = fRDxSq, fRDx, f0 // rcub = r^3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ famax.s0 fRes5H = fPol, fResH
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // High part of (lgammal(|x|) + log(|x|))
+ fadd.s1 fRes1H = fPol, fResH
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo * r + Q2
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPolL = fA9, fX6, fPolL // P25lo
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ famin.s0 fRes5L = fPol, fResH
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // High part of -(LnSin + log(|DeltaX|))
+ fnma.s1 fRes2H = fResLnDxH, f1, fLnSinH
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // (((LnSin6*deltaX^2 + LnSin4)*deltaX^2 + LnSin2)*DeltaX^2)lo
+ fma.s1 fLnSinL = fLnSin4, fDxSqr, fLnSinL
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin36 = fLnSin36, fDelX6, f0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_hi = Q1 * rsq + r
+ fma.s1 fPolyHiDx = FR_Q1, fRDxSq, fRDx
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo*r^3 + h
+ fma.s1 fPolyLoDx = fPolyLoDx, fRDxCub, fhDelX
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes1L = fRes5H, fRes1H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // -(lgammal(|x|) + log(|x|))hi
+ fnma.s1 fRes1H = fRes1H, f1, f0
+
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_hi = Q1 * rsq + r
+ fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo*r^3 + h
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fRes2L = fResLnDxH, fMOne, fRes2H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSinL = fLnSin36, fDxSqr, fLnSinL
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // Y_lo = poly_hi + poly_lo
+ fadd.s1 fResLnDxL = fPolyHiDx, fPolyLoDx
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, fRes5L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // high part of the final result
+ fadd.s1 fYH = fRes2H, fRes1H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // Y_lo = poly_hi + poly_lo
+ fadd.s1 fResL = FR_poly_hi, FR_poly_lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ famax.s0 fRes4H = fRes2H, fRes1H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ famin.s0 fRes4L = fRes2H, fRes1H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // (LnSin)lo + (log(|DeltaX|))lo
+ fsub.s1 fLnSinL = fLnSinL, fResLnDxL
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fLnSinH
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ //(lgammal(|x|))lo + (log(|x|))lo
+ fadd.s1 fPolL = fResL, fPolL
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fYL = fRes4H, fYH
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // Low part of -(LnSin + log(|DeltaX|))
+ fadd.s1 fRes2L = fRes2L, fLnSinL
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // Low part of (lgammal(|x|) + log(|x|))
+ fadd.s1 fRes1L = fRes1L, fPolL
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fYL = fYL, fRes4L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes2L = fRes2L, fRes1L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // low part of the final result
+ fadd.s1 fYL = fYL, fRes2L
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ // final result for -6.0 < x <= -0.75, non-integer, "far" from roots
+ fma.s0 f8 = fYH, f1, fYL
+ // exit here for -6.0 < x <= -0.75, non-integer, "far" from roots
+ br.ret.sptk b0
+}
+;;
+
+// here if |x+1| < 2^(-7)
+// deltaX (fDx) is used as the working argument on this path.
+// The routine:
+//   - sets rSgnGam from p10/p11 and stores it through rSgnGamAddr as a
+//     4- or 8-byte value depending on rSgnGamSize
+//   - evaluates short polynomials for log(|x|) (in -(x+1)), for
+//     lgammal(|x|) (in deltaX) and for lnsin (in deltaX^2)
+//   - computes log(|deltaX|) as a hi/lo pair with the table-driven
+//     G/H/h scheme (Z_1, Z_2 lookups, then G_3/H_3/h_3)
+//   - returns f8 = (lnSin - log(|x|) - lgammal(|x|)) - log(|deltaX|)
+.align 32
+_closeToNegOne:
+{ .mfi
+ getf.exp GR_N = fDx // Get N = exponent of deltaX
+ fmerge.se fAbsX = f1, fDx // Form |deltaX|
+ // Get high 4 bits of significand of deltaX
+ extr.u rIndex1Dx = rSignifDx, 59, 4
+}
+{ .mfi
+ addl rPolDataPtr= @ltoff(lgammal_1pEps_data),gp
+ fma.s1 fA0L = fDxSqr, fDxSqr, f0 // deltaX^4
+ // sign of GAMMA is positive if p10 is set to 1
+(p10) adds rSgnGam = 1, r0
+}
+;;
+{ .mfi
+ shladd GR_ad_z_1 = rIndex1Dx, 2, GR_ad_z_1 // Point to Z_1
+ fnma.s1 fResL = fDx, f1, f0 // -(x+1)
+ // Get high 15 bits of significand
+ extr.u GR_X_0 = rSignifDx, 49, 15
+}
+{ .mfi
+ ld8 rPolDataPtr = [rPolDataPtr]
+ nop.f 0
+ shladd GR_ad_tbl_1 = rIndex1Dx, 4, rTbl1Addr // Point to G_1
+}
+;;
+{ .mfi
+ ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
+ nop.f 0
+ and GR_N = GR_N, r17Ones // mask sign bit
+}
+{ .mfi
+ adds rTmpPtr = 8, GR_ad_tbl_1
+ nop.f 0
+ // p6: signgam is stored as 4 bytes, p7: as 8 bytes
+ cmp.eq p6, p7 = 4, rSgnGamSize
+}
+;;
+{ .mfi
+ ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
+ nop.f 0
+ adds rTmpPtr2 = 96, rPolDataPtr
+}
+{ .mfi
+ ldfd FR_h = [rTmpPtr] // Load h_1
+ nop.f 0
+ // unbiased exponent of deltaX
+ sub GR_N = GR_N, rExpHalf, 1
+}
+;;
+{ .mfi
+ adds rTmpPtr3 = 192, rPolDataPtr
+ nop.f 0
+ // sign of GAMMA is negative if p11 is set to 1
+(p11) adds rSgnGam = -1, r0
+}
+{ .mfi
+ ldfe fA1 = [rPolDataPtr], 16 // A1
+ nop.f 0
+ nop.i 0
+}
+;;
+{.mfi
+ ldfe fA2 = [rPolDataPtr], 16 // A2
+ nop.f 0
+ // Get bits 30-15 of X_0 * Z_1
+ pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15
+}
+{ .mfi
+ ldfpd fA20, fA19 = [rTmpPtr2], 16 // P8, P7
+ nop.f 0
+ nop.i 0
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ ldfe fA3 = [rPolDataPtr], 16 // A3
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA18, fA17 = [rTmpPtr2], 16 // P6, P5
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fA4 = [rPolDataPtr], 16 // A4
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA16, fA15 = [rTmpPtr2], 16 // P4, P3
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA5L, fA6 = [rPolDataPtr], 16 // A5, A6
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA14, fA13 = [rTmpPtr2], 16 // P2, P1
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA7, fA8 = [rPolDataPtr], 16 // A7, A8
+ nop.f 0
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+}
+{ .mfi
+ ldfe fLnSin2 = [rTmpPtr2], 16
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ shladd GR_ad_z_2 = GR_Index2, 2, rZ2Addr // Point to Z_2
+ nop.f 0
+ shladd GR_ad_tbl_2 = GR_Index2, 4, rTbl2Addr // Point to G_2
+}
+{ .mfi
+ ldfe fLnSin4 = [rTmpPtr2], 32
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
+ nop.f 0
+ adds rTmpPtr = 8, GR_ad_tbl_2
+}
+{ .mfi
+ // Put integer N into rightmost significand
+ setf.sig fFloatN = GR_N
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fLnSin6 = [rTmpPtr3]
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe fLnSin8 = [rTmpPtr2]
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfd FR_h2 = [rTmpPtr] // Load h_2
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ fma.s1 fResH = fA20, fResL, fA19 //polynomial for log(|x|)
+ // Get bits 30-15 of X_1 * Z_2
+ pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15
+}
+{ .mfi
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ fma.s1 fA2 = fA2, fDx, fA1 // polynomial for lgammal(|x|)
+ nop.i 0
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ nop.m 0
+ fma.s1 fA18 = fA18, fResL, fA17 //polynomial for log(|x|)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA16 = fA16, fResL, fA15 //polynomial for log(|x|)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA4 = fA4, fDx, fA3 // polynomial for lgammal(|x|)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA14 = fA14, fResL, fA13 //polynomial for log(|x|)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA6 = fA6, fDx, fA5L // polynomial for lgammal(|x|)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fA8, fDx, fA7 // polynomial for lgammal(|x|)
+ extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
+}
+;;
+{ .mfi
+ shladd GR_ad_tbl_3 = GR_Index3, 4, rTbl3Addr // Point to G_3
+ // low part of lnsin polynomial
+ fma.s1 fRes3L = fLnSin4, fDxSqr, fLnSin2
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfps FR_G3, FR_H3 = [GR_ad_tbl_3], 8 // Load G_3, H_3
+ fcvt.xf fFloatN = fFloatN // N as FP number
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fResH = fResH, fDxSqr, fA18 // High part of log(|x|)
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
+ fma.s1 fA4 = fA4, fDxSqr, fA2 // Low part of lgammal(|x|)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // high part of lnsin polynomial
+ fma.s1 fRes3H = fLnSin8, fDxSqr, fLnSin6
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA16 = fA16, fDxSqr, fA14 // Low part of log(|x|)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fPol, fDxSqr, fA6 // High part of lgammal(|x|)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fResH = fResH, fA0L, fA16 // log(|x|)/deltaX^2 - deltaX
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fResH = fResH, fDxSqr, fResL // log(|x|)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fPol, fA0L, fA4 // lgammal(|x|)/|x|
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r = FR_G, fAbsX, f1 // r = G * S_hi - 1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // high part of log(deltaX)= Y_hi = N * log2_hi + H
+ fma.s1 fRes4H = fFloatN, FR_log2_hi, FR_H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // h = N * log2_lo + h
+ fma.s1 FR_h = fFloatN, FR_log2_lo, FR_h
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fResH = fPol, fDx, fResH // lgammal(|x|) + log(|x|)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // lnsin/deltaX^2
+ fma.s1 fRes3H = fRes3H, fA0L, fRes3L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = r * Q4 + Q3
+ fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // lnSin - log(|x|) - lgammal(|x|)
+ fms.s1 fResH = fRes3H, fDxSqr, fResH
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo * r + Q2
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // poly_hi = Q1 * rsq + r
+ fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo*r^3 + h
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // low part of log(|deltaX|) = Y_lo = poly_hi + poly_lo
+ fadd.s1 fRes4L = FR_poly_hi, FR_poly_lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // subtract the low part of log(|deltaX|) first ...
+ fsub.s1 fResH = fResH, fRes4L
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ // ... then the high part, in the final .s0 operation
+ // final result for |x+1|< 2^(-7) path
+ fsub.s0 f8 = fResH, fRes4H
+ // exit for |x+1|< 2^(-7) path
+ br.ret.sptk b0
+}
+;;
+
+
+// here if -2^63 < x < -6.0 and x is not an integer
+// Also we are going to filter out cases when x falls in
+// range which is "close enough" to negative root. This case
+// may occur only for -19.5 < x since other roots of lgamma are
+// insignificant from double extended point of view (they are closer
+// to RTN(x) than one ulp(x)).
+//
+// Outline of this path, as described by the comments below:
+//   - branch to _negRoots when p6 ends up set by the bound comparisons
+//   - set rSgnGam from p8/p9 and store it (4 or 8 bytes per rSgnGamSize)
+//   - form (-|x|*(ln(|x|)-1) - 0.5ln(|x|) - C - S(1/x)) as a hi/lo pair
+//   - form (-ln(|DeltaX|) + LnSin) as a hi/lo pair
+//   - return f8 = sum of the two hi parts plus the accumulated lo parts
+.align 32
+_negStirling:
+{ .mfi
+ ldfe fLnSin6 = [rLnSinDataPtr], 32
+ fnma.s1 fInvX = f8, fRcpX, f1 // start of 3rd NR iteration
+ // Get high 4 bits of significand of deltaX
+ extr.u rIndex1Dx = rSignifDx, 59, 4
+}
+{ .mfi
+ ldfe fLnSin8 = [rTmpPtr3], 32
+ fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
+(p12) cmp.ltu.unc p6, p0 = rSignifX, rLeftBound
+}
+;;
+{ .mfi
+ ldfe fLnSin10 = [rLnSinDataPtr], 32
+ fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
+ // Get high 15 bits of significand
+ extr.u GR_X_0 = rSignifDx, 49, 15
+}
+{ .mfi
+ shladd GR_ad_z_1 = rIndex1Dx, 2, GR_ad_z_1 // Point to Z_1
+ fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
+ // set p6 if x falls in "near root" range
+(p6) cmp.geu.unc p6, p0 = rSignifX, rRightBound
+}
+;;
+{ .mfi
+ getf.exp GR_N = fDx // Get N = exponent of deltaX
+ fma.s1 fDx4 = fDxSqr, fDxSqr, f0 // deltaX^4
+ adds rTmpPtr = 96, rBernulliPtr
+}
+{ .mfb
+ ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
+ fma.s1 fLnSin34 = fLnSin34, fDxSqr, fLnSin32
+ // branch to special path if x falls in "near root" range
+(p6) br.cond.spnt _negRoots
+}
+;;
+.pred.rel "mutex",p10,p11
+{ .mfi
+ ldfe fLnSin12 = [rTmpPtr3]
+ fma.s1 fLnSin26 = fLnSin26, fDxSqr, fLnSin24
+(p10) cmp.eq p8, p9 = rXRnd, r0
+}
+{ .mfi
+ ldfe fLnSin14 = [rLnSinDataPtr]
+ fma.s1 fLnSin30 = fLnSin30, fDxSqr, fLnSin28
+(p11) cmp.eq p9, p8 = rXRnd, r0
+}
+;;
+{ .mfi
+ ldfpd fB2, fB2L = [rBernulliPtr], 16
+ fma.s1 fLnSin18 = fLnSin18, fDxSqr, fLnSin16
+ shladd GR_ad_tbl_1 = rIndex1Dx, 4, rTbl1Addr // Point to G_1
+
+}
+{ .mfi
+ ldfe fB14 = [rTmpPtr], 16
+ fma.s1 fLnSin22 = fLnSin22, fDxSqr, fLnSin20
+ and GR_N = GR_N, r17Ones // mask sign bit
+}
+;;
+{ .mfi
+ ldfe fB4 = [rBernulliPtr], 16
+ fma.s1 fInvX = fInvX, fRcpX, fRcpX // end of 3rd NR iteration
+ // Get bits 30-15 of X_0 * Z_1
+ pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15
+}
+{ .mfi
+ ldfe fB16 = [rTmpPtr], 16
+ fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
+ adds rTmpPtr2 = 8, GR_ad_tbl_1
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ ldfe fB6 = [rBernulliPtr], 16
+ fms.s1 FR_r = FR_G, fSignifX, f1 // r = G * S_hi - 1
+ adds rTmpPtr3 = -48, rTmpPtr
+}
+{ .mfi
+ ldfe fB18 = [rTmpPtr], 16
+ // High part of the log(|x|) = Y_hi = N * log2_hi + H
+ fma.s1 fResH = fFloatN, FR_log2_hi, FR_H
+ sub GR_N = GR_N, rExpHalf, 1 // unbiased exponent of deltaX
+}
+;;
+.pred.rel "mutex",p8,p9
+{ .mfi
+ ldfe fB8 = [rBernulliPtr], 16
+ fma.s1 fLnSin36 = fLnSin36, fDx4, fLnSin34
+ // sign of GAMMA(x) is negative
+(p8) adds rSgnGam = -1, r0
+}
+{ .mfi
+ ldfe fB20 = [rTmpPtr], -160
+ fma.s1 fRes5H = fLnSin4, fDxSqr, f0
+ // sign of GAMMA(x) is positive
+(p9) adds rSgnGam = 1, r0
+
+}
+;;
+{ .mfi
+ ldfe fB10 = [rBernulliPtr], 16
+ fma.s1 fLnSin30 = fLnSin30, fDx4, fLnSin26
+(p14) adds rTmpPtr = -160, rTmpPtr
+}
+{ .mfi
+ ldfe fB12 = [rTmpPtr3], 16
+ fma.s1 fDx8 = fDx4, fDx4, f0 // deltaX^8
+ // p6: signgam is stored as 4 bytes, p7: as 8 bytes
+ cmp.eq p6, p7 = 4, rSgnGamSize
+}
+;;
+{ .mfi
+ ldfps fGDx, fHDx = [GR_ad_tbl_1], 8 // Load G_1, H_1
+ fma.s1 fDx6 = fDx4, fDxSqr, f0 // deltaX^6
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+}
+{ .mfi
+ ldfd fhDx = [rTmpPtr2] // Load h_1
+ fma.s1 fLnSin22 = fLnSin22, fDx4, fLnSin18
+ nop.i 0
+}
+;;
+{ .mfi
+ // Load two parts of C
+ ldfpd fRes1H, fRes1L = [rTmpPtr], 16
+ fma.s1 fRcpX = fInvX, fInvX, f0 // (1/x)^2
+ shladd GR_ad_tbl_2 = GR_Index2, 4, rTbl2Addr // Point to G_2
+}
+{ .mfi
+ shladd GR_ad_z_2 = GR_Index2, 2, rZ2Addr // Point to Z_2
+ fma.s1 FR_h = fFloatN, FR_log2_lo, FR_h// h = N * log2_lo + h
+ nop.i 0
+}
+;;
+{ .mfi
+ ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
+ fnma.s1 fInvXL = f8, fInvX, f1 // relative error of 1/x
+ nop.i 0
+}
+{ .mfi
+ adds rTmpPtr2 = 8, GR_ad_tbl_2
+ fma.s1 fLnSin8 = fLnSin8, fDxSqr, fLnSin6
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
+ // poly_lo = r * Q4 + Q3
+ fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
+ nop.i 0
+}
+{ .mfi
+ ldfd fh2Dx = [rTmpPtr2] // Load h_2
+ fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA1L = fB2, fInvX, f0 // (B2*(1/x))hi
+ nop.i 0
+}
+{ .mfi
+ // Put integer N into rightmost significand
+ setf.sig fFloatNDx = GR_N
+ fms.s1 fRes4H = fResH, f1, f1 // ln(|x|)hi - 1
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2H = fRes5H, fLnSin2//(lnSin4*DeltaX^2 + lnSin2)hi
+ // Get bits 30-15 of X_1 * Z_2
+ pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fRes5L = fLnSin4, fDxSqr, fRes5H
+ nop.i 0
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ nop.m 0
+ fma.s1 fInvX4 = fRcpX, fRcpX, f0 // (1/x)^4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB6 = fB6, fRcpX, fB4
+ nop.i 0
+}
+;;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ fma.s1 fB18 = fB18, fRcpX, fB16
+ nop.i 0
+}
+{ .mfi
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ fma.s1 fInvXL = fInvXL, fInvX, f0 // low part of 1/x
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo * r + Q2
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes3H = fRes4H, f8, f0 // (-|x|*(ln(|x|)-1))hi
+ extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
+}
+{ .mfi
+ nop.m 0
+ // poly_hi = Q1 * rsq + r
+ fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
+ nop.i 0
+}
+;;
+{ .mfi
+ shladd GR_ad_tbl_3 = GR_Index3, 4, rTbl3Addr // Point to G_3
+ fms.s1 fA2L = fB2, fInvX, fA1L // delta(B2*(1/x))
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 fBrnH = fRes1H, f1, fA1L // (-C - S(1/x))hi
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfps fG3Dx, fH3Dx = [GR_ad_tbl_3],8 // Load G_3, H_3
+ fma.s1 fInvX8 = fInvX4, fInvX4, f0 // (1/x)^8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB10 = fB10, fRcpX, fB8
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfd fh3Dx = [GR_ad_tbl_3] // Load h_3
+ fma.s1 fB20 = fB20, fInvX4, fB18
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB14 = fB14, fRcpX, fB12
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin36 = fLnSin36, fDx8, fLnSin30
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin12 = fLnSin12, fDxSqr, fLnSin10
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes2L = fLnSin2, fRes2H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fRes2H, fDxSqr, f0 // high part of LnSin
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fnma.s1 fResH = fResH, FR_MHalf, fResH // -0.5*ln(|x|)hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 fGDx = fGDx, FR_G2 // G = G_1 * G_2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo*r^3 + h
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // B2lo*(1/x)hi+ delta(B2*(1/x))
+ fma.s1 fA2L = fB2L, fInvX, fA2L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB20 = fB20, fInvX4, fB14
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB10 = fB10, fInvX4, fB6
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fcvt.xf fFloatNDx = fFloatNDx
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin14 = fLnSin14, fDx4, fLnSin12
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin36 = fLnSin36, fDx8, fLnSin22
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fRes3L = fRes4H, f8, fRes3H // delta(-|x|*(ln(|x|)-1))
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fmpy.s1 fGDx = fGDx, fG3Dx // G = (G_1 * G_2) * G_3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // (-|x|*(ln(|x|)-1) - 0.5ln(|x|))hi
+ fadd.s1 fRes4H = fRes3H, fResH
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA2L = fInvXL, fB2, fA2L //(B2*(1/x))lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // low part of log(|x|) = Y_lo = poly_hi + poly_lo
+ fadd.s1 fResL = FR_poly_hi, FR_poly_lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB20 = fB20, fInvX8, fB10
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fInvX3 = fInvX, fRcpX, f0 // (1/x)^3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fHDx = fHDx, FR_H2 // H = H_1 + H_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes5L = fRes5L, fLnSin2L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fRes5H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fhDx = fhDx, fh2Dx // h = h_1 + h_2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fBrnL = fRes1H, fMOne, fBrnH
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r = fGDx, fNormDx, f1 // r = G * S_hi - 1
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes3L = fResL, f8 , fRes3L // (-|x|*(ln(|x|)-1))lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes4L = fRes3H, fRes4H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // low part of "Bernulli" polynomial
+ fma.s1 fB20 = fB20, fInvX3, fA2L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 fResL = fResL, FR_MHalf, fResL // -0.5*ln(|x|)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fHDx = fHDx, fH3Dx // H = (H_1 + H_2) + H_3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fPolL = fRes2H, fDxSqr, fPol
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fhDx = fhDx, fh3Dx // h = (h_1 + h_2) + h_3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // (-|x|*(ln(|x|)-1) - 0.5ln(|x|) - C - S(1/x))hi
+ fadd.s1 fB14 = fRes4H, fBrnH
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = r * Q4 + Q3
+ fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes4L = fRes4L, fResH
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fBrnL = fBrnL, fA1L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // (-|x|*(ln(|x|)-1))lo + (-0.5ln(|x|))lo
+ fadd.s1 fRes3L = fRes3L, fResL
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 fB20 = fRes1L, f1, fB20 // -Clo - S(1/x)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fRes5L // (lnSin4*DeltaX^2 + lnSin2)lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPolL = fDxSqrL, fRes2H, fPolL
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin14 = fLnSin14, fDx4, fLnSin8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin36 = fLnSin36, fDx8, f0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo * r + Q2
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_hi = Q1 * rsq + r
+ fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fB12 = fRes4H, fB14
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // (-|x|*(ln(|x|)-1) - 0.5ln(|x|))lo
+ fadd.s1 fRes4L = fRes4L, fRes3L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fBrnL = fBrnL, fB20 // (-C - S(1/x))lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // high part of log(|DeltaX|) = Y_hi = N * log2_hi + H
+ fma.s1 fLnDeltaH = fFloatNDx, FR_log2_hi, fHDx
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // h = N * log2_lo + h
+ fma.s1 fhDx = fFloatNDx, FR_log2_lo, fhDx
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fPolL = fRes2L, fDxSqr, fPolL
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin14 = fLnSin36, fDxSqr, fLnSin14
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // (-|x|*(ln(|x|)-1) - 0.5ln(|x|))lo + (- C - S(1/x))lo
+ fadd.s1 fBrnL = fBrnL, fRes4L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fB12 = fB12, fBrnH
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo*r^3 + h
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, fhDx
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 fRes1H = fLnDeltaH, f1, fPol//(-ln(|DeltaX|) + LnSin)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fPolL = fDxSqrL, fRes2L, fPolL
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin36 = fLnSin14, fDx6, f0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // (-|x|*(ln(|x|)-1) - 0.5ln(|x|) - C - S(1/x))lo
+ fadd.s1 fB12 = fB12, fBrnL
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // low part of log(|DeltaX|) = Y_lo = poly_hi + poly_lo
+ fadd.s1 fLnDeltaL= FR_poly_hi, FR_poly_lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fRes1L = fLnDeltaH, fMOne, fRes1H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fPolL = fPolL, fLnSin36
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ //(-|x|*(ln(|x|)-1)-0.5ln(|x|) - C - S(1/x))hi + (-ln(|DeltaX|) + LnSin)hi
+ fadd.s1 f8 = fRes1H, fB14
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ //max((-|x|*(ln(|x|)-1)-0.5ln(|x|) - C - S(1/x))hi,
+ // (-ln(|DeltaX|) + LnSin)hi)
+ famax.s1 fMaxNegStir = fRes1H, fB14
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ //min((-|x|*(ln(|x|)-1)-0.5ln(|x|) - C - S(1/x))hi,
+ // (-ln(|DeltaX|) + LnSin)hi)
+ famin.s1 fMinNegStir = fRes1H, fB14
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, fPol
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // (-ln(|DeltaX|))lo + (LnSin)lo
+ fnma.s1 fPolL = fLnDeltaL, f1, fPolL
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 f9 = fMaxNegStir, f8 // delta1
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, fPolL // (-ln(|DeltaX|) + LnSin)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // f9 = (max - sum) + min: rounding error of the hi + hi addition
+ fadd.s1 f9 = f9, fMinNegStir
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // combine the low parts of both terms
+ fadd.s1 fRes1L = fRes1L, fB12
+ nop.i 0
+}
+;;
+{ .mfi
+ // NOTE(review): unlike the other bundles in this routine, no explicit
+ // nop.m is listed for this .mfi template -- confirm this is intentional
+ // low part of the result
+ fadd.s1 f9 = f9, fRes1L
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ // final result for -2^63 < x < -6.0 path
+ fma.s0 f8 = f8, f1, f9
+ // exit here for -2^63 < x < -6.0 path
+ br.ret.sptk b0
+}
+;;
+
+// here if x falls in neighbourhood of any negative root
+// "neighbourhood" typically means that |lgammal(x)| < 0.17
+// on the [-3.0,-2.0] range |lgammal(x)| has even less
+// magnitude
+// rXint contains index of the root
+// p10 is set if root belongs to "right" ones
+// p11 is set if root belongs to "left" ones
+// lgammal(x) is approximated by polynomial of
+// 19th degree from (x - root) argument; A0..A2 are carried as hi/lo pairs, A3..A19 in working precision
+.align 32
+_negRoots:
+{ .mfi
+ addl rPolDataPtr= @ltoff(lgammal_right_roots_polynomial_data),gp
+ nop.f 0
+ shl rTmpPtr2 = rXint, 7 // (i*16)*8
+}
+{ .mfi
+ adds rRootsAddr = -288, rRootsBndAddr // root value is read from 288 bytes before rRootsBndAddr
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fRoot = [rRootsAddr] // FP representation of root
+ nop.f 0
+ shl rTmpPtr = rXint, 6 // (i*16)*4
+}
+{ .mfi
+(p11) adds rTmpPtr2 = 3536, rTmpPtr2 // "left" roots: extra 3536-byte offset (3536 = 17 records of 208 bytes)
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ld8 rPolDataPtr = [rPolDataPtr]
+ nop.f 0
+ shladd rTmpPtr = rXint, 4, rTmpPtr // (i*16) + (i*16)*4
+}
+{ .mfi
+ adds rTmpPtr3 = 32, rTmpPtr2 // +32: offset of A2 inside a record
+ nop.f 0
+ nop.i 0
+}
+;;
+.pred.rel "mutex",p10,p11
+{ .mfi
+ add rTmpPtr3 = rTmpPtr, rTmpPtr3 // record offset + 32
+ nop.f 0
+(p10) cmp.eq p8, p9 = rXRnd, r0 // right root: p8 (negative sign) if rXRnd == 0, else p9
+}
+{ .mfi
+ // (i*16) + (i*16)*4 + (i*16)*8 = i*208, the record offset
+ add rTmpPtr = rTmpPtr, rTmpPtr2
+ nop.f 0
+(p11) cmp.eq p9, p8 = rXRnd, r0 // left root: p9 (positive sign) if rXRnd == 0, else p8
+}
+;;
+{ .mfi
+ add rTmpPtr2 = rPolDataPtr, rTmpPtr3 // address of A2
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ add rPolDataPtr = rPolDataPtr, rTmpPtr // begin + offset
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA0, fA0L = [rPolDataPtr], 16 // A0
+ nop.f 0
+ adds rTmpPtr = 112, rTmpPtr2 // address of A12 (A2 address + 112)
+}
+{ .mfi
+ ldfpd fA2, fA2L = [rTmpPtr2], 16 // A2
+ nop.f 0
+ cmp.eq p12, p13 = 4, rSgnGamSize // p12: signgam is 4 bytes, p13: otherwise
+}
+;;
+{ .mfi
+ ldfpd fA1, fA1L = [rPolDataPtr], 16 // A1
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe fA3 = [rTmpPtr2], 128 // A3
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA12, fA13 = [rTmpPtr], 16 // A12, A13
+ nop.f 0
+ adds rTmpPtr3 = 64, rPolDataPtr // address of A6
+}
+{ .mfi
+ ldfpd fA16, fA17 = [rTmpPtr2], 16 // A16, A17
+ nop.f 0
+ adds rPolDataPtr = 32, rPolDataPtr // address of A4
+}
+;;
+.pred.rel "mutex",p8,p9
+{ .mfi
+ ldfpd fA14, fA15 = [rTmpPtr], 16 // A14, A15
+ nop.f 0
+ // sign of GAMMA(x) is negative
+(p8) adds rSgnGam = -1, r0
+}
+{ .mfi
+ ldfpd fA18, fA19 = [rTmpPtr2], 16 // A18, A19
+ nop.f 0
+ // sign of GAMMA(x) is positive
+(p9) adds rSgnGam = 1, r0
+}
+;;
+{ .mfi
+ ldfe fA4 = [rPolDataPtr], 16 // A4
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA6, fA7 = [rTmpPtr3], 16 // A6, A7
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fA5 = [rPolDataPtr], 16 // A5
+ // if x equals to (rounded) root exactly
+ fcmp.eq.s1 p6, p0 = f8, fRoot
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA8, fA9 = [rTmpPtr3], 16 // A8, A9
+ fms.s1 FR_FracX = f8, f1, fRoot // x - root, the polynomial argument
+ nop.i 0
+}
+;;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p12) st4 [rSgnGamAddr] = rSgnGam
+ nop.f 0
+ nop.i 0
+}
+{ .mfb
+ // store signgam if size of variable is 8 bytes
+(p13) st8 [rSgnGamAddr] = rSgnGam
+ // answer if x equals to (rounded) root exactly
+(p6) fadd.s0 f8 = fA0, fA0L
+ // exit if x equals to (rounded) root exactly
+(p6) br.ret.spnt b0
+}
+;;
+{ .mmf
+ ldfpd fA10, fA11 = [rTmpPtr3], 16 // A10, A11
+ nop.m 0
+ nop.f 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fResH = fA2, FR_FracX, f0 // (A2*x)hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA4L = FR_FracX, FR_FracX, f0 // x^2 (from here on "x" in comments means x - root)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA17 = fA17, FR_FracX, fA16 // A17*x + A16
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fma.s1 fA13 = fA13, FR_FracX, fA12 // A13*x + A12
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA19 = fA19, FR_FracX, fA18 // A19*x + A18
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, FR_FracX, fA14 // A15*x + A14
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fPol = fA7, FR_FracX, fA6 // A7*x + A6
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, FR_FracX, fA8 // A9*x + A8
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fResL = fA2, FR_FracX, fResH // delta(A2*x)
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fadd.s1 fRes1H = fResH, fA1 // (A2*x + A1)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, FR_FracX, fA10 // A11*x + A10
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fma.s1 fA5L = fA4L, fA4L, f0 // x^4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA19 = fA19, fA4L, fA17 // A16..A19 combined
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, fA4L, fA13 // A12..A15 combined
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fPol, FR_FracX, fA5 // (A7*x + A6)*x + A5
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fma.s1 fA3L = fA4L, FR_FracX, f0 // x^3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // delta(A2*x) + A2L*x = (A2*x)lo
+ fma.s1 fResL = fA2L, FR_FracX, fResL
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fsub.s1 fRes1L = fA1, fRes1H // start of the rounding error of (A2*x + A1)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, fA4L, fA9 // A8..A11 combined
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fma.s1 fA19 = fA19, fA5L, fA15 // A12..A19 combined
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fPol = fPol, FR_FracX, fA4 // Horner step down to A4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fResL = fResL, fA1L // (A2*x)lo + A1L
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, fResH // (A1 - (A2*x + A1)hi) + (A2*x)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes2H = fRes1H, FR_FracX, f0 // ((A2*x + A1)*x)hi
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fA19 = fA19, fA5L, fA11 // A8..A19 combined
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fPol = fPol, FR_FracX, fA3 // A3 + A4*x + ... + A7*x^4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, fResL // (A2*x + A1)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // delta((A2*x + A1)*x)
+ fms.s1 fRes2L = fRes1H, FR_FracX, fRes2H
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fadd.s1 fRes3H = fRes2H, fA0 // ((A2*x + A1)*x + A0)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA19 = fA19, fA5L, f0 // (A8..A19 part) * x^4
+ nop.i 0
+}
+
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes2L = fRes1L, FR_FracX, fRes2L // ((A2*x + A1)*x)lo
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fsub.s1 fRes3L = fRes2H, fRes3H // start of the rounding error of fRes3H
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fPol = fA19, FR_FracX, fPol // full A3..A19 polynomial (to be scaled by x^3)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes3L = fRes3L, fA0
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fA0L // ((A2*x + A1)*x)lo + A0L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes3L = fRes3L, fRes2L // (((A2*x + A1)*x) + A0)lo
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fRes3L = fPol, fA3L, fRes3L // add x^3 * (A3..A19 polynomial) to the low part
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ // final result for arguments which are close to negative roots
+ fma.s0 f8 = fRes3H, f1, fRes3L
+ // exit here for arguments which are close to negative roots
+ br.ret.sptk b0
+}
+;;
+
+// here if |x| < 0.5: result is sign(x)*Pol(|x|) - log(|x|), plus extra fLnSin* terms added in _O_Half_neg for negative x
+.align 32
+lgammal_0_half:
+{ .mfi
+ ld4 GR_Z_1 = [rZ1offsett] // Load Z_1
+ fma.s1 fA4L = f8, f8, f0 // x^2
+ addl rPolDataPtr = @ltoff(lgammal_0_Half_data), gp
+}
+{ .mfi
+ shladd GR_ad_tbl_1 = GR_Index1, 4, rTbl1Addr// Point to G_1
+ nop.f 0
+ addl rLnSinDataPtr = @ltoff(lgammal_lnsin_data), gp
+}
+;;
+{ .mfi
+ ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
+ nop.f 0
+ // Point to Constants_Z_2
+ add GR_ad_z_2 = 0x140, GR_ad_z_1
+}
+{ .mfi
+ add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_Q
+ nop.f 0
+ // Point to Constants_G_H_h2
+ add GR_ad_tbl_2 = 0x180, GR_ad_z_1
+}
+;;
+{ .mfi
+ ld8 rPolDataPtr = [rPolDataPtr]
+ nop.f 0
+ // Point to Constants_G_H_h3
+ add GR_ad_tbl_3 = 0x280, GR_ad_z_1
+}
+{ .mfi
+ ldfd FR_h = [GR_ad_tbl_1] // Load h_1
+ nop.f 0
+ sub GR_N = rExpX, rExpHalf, 1 // rExpX - rExpHalf - 1
+}
+;;
+{ .mfi
+ ld8 rLnSinDataPtr = [rLnSinDataPtr]
+ nop.f 0
+ // Get bits 30-15 of X_0 * Z_1
+ pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15
+}
+{ .mfi
+ ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
+ nop.f 0
+ sub GR_N = r0, GR_N // negate N: this path accumulates -log(|x|)
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ ldfe FR_log2_lo = [GR_ad_q], 16 // Load log2_lo
+ nop.f 0
+ add rTmpPtr2 = 320, rPolDataPtr // address of A21; walks down through the even coefficients
+}
+{ .mfi
+ add rTmpPtr = 32, rPolDataPtr // address of A1
+ nop.f 0
+ // exponent of 0.25
+ adds rExp2 = -1, rExpHalf
+}
+;;
+{ .mfi
+ ldfpd fA3, fA3L = [rPolDataPtr], 16 // A3
+ fma.s1 fA5L = fA4L, fA4L, f0 // x^4
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA1, fA1L = [rTmpPtr], 16 // A1
+ fms.s1 fB8 = f8, f8, fA4L // x^2 - <x^2>
+ // set p6 if -0.5 < x <= -0.25
+(p15) cmp.eq.unc p6, p0 = rExpX, rExp2
+}
+;;
+{ .mfi
+ ldfpd fA2, fA2L = [rPolDataPtr], 16 // A2
+ nop.f 0
+ // set p6 if -0.5 < x <= -0.40625
+(p6) cmp.le.unc p6, p0 = 10, GR_Index1
+}
+{ .mfi
+ ldfe fA21 = [rTmpPtr2], -16 // A21
+ // (N is put into the rightmost significand by setf.sig in the next bundle)
+ nop.f 0
+ adds rTmpPtr = 240, rTmpPtr // address of A19; walks down through the odd coefficients
+}
+;;
+{ .mfi
+ setf.sig fFloatN = GR_N // put integer (negated) N into rightmost significand
+ nop.f 0
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+}
+{ .mfi
+ ldfe FR_Q4 = [GR_ad_q], 16 // Load Q4
+ nop.f 0
+ adds rPolDataPtr = 304, rPolDataPtr // address of A22
+}
+;;
+{ .mfi
+ ldfe fA20 = [rTmpPtr2], -32 // A20
+ nop.f 0
+ shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
+}
+{ .mfi
+ ldfe fA19 = [rTmpPtr], -32 // A19
+ nop.f 0
+ shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2// Point to G_2
+}
+;;
+{ .mfi
+ ldfe fA17 = [rTmpPtr], -32 // A17
+ nop.f 0
+ adds rTmpPtr3 = 8, GR_ad_tbl_2 // address of h_2
+}
+{ .mfb
+ ldfe fA18 = [rTmpPtr2], -32 // A18
+ nop.f 0
+ // branch to special path for -0.5 < x <= -0.40625
+(p6) br.cond.spnt lgammal_near_neg_half
+}
+;;
+{ .mmf
+ ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
+ ldfe fA15 = [rTmpPtr], -32 // A15
+ fma.s1 fB20 = fA5L, fA5L, f0 // x^8
+}
+;;
+{ .mmf
+ ldfe fA16 = [rTmpPtr2], -32 // A16
+ ldfe fA13 = [rTmpPtr], -32 // A13
+ fms.s1 fB16 = fA4L, fA4L, fA5L // rounding error of x^4: <x^2>*<x^2> - <x^4>
+}
+;;
+{ .mmf
+ ldfps FR_G2, FR_H2 = [GR_ad_tbl_2], 8 // Load G_2, H_2
+ ldfd FR_h2 = [rTmpPtr3] // Load h_2
+ fmerge.s fB10 = f8, fA5L // sign(x) * x^4
+}
+;;
+{ .mmi
+ ldfe fA14 = [rTmpPtr2], -32 // A14
+ ldfe fA11 = [rTmpPtr], -32 // A11
+ // Get bits 30-15 of X_1 * Z_2
+ pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ ldfe fA12 = [rTmpPtr2], -32 // A12
+ fma.s1 fRes4H = fA3, fAbsX, f0 // (A3*|x|)hi
+ adds rTmpPtr3 = 16, GR_ad_q // address of Q2
+}
+{ .mfi
+ ldfe fA9 = [rTmpPtr], -32 // A9
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mmf
+ ldfe fA10 = [rTmpPtr2], -32 // A10
+ ldfe fA7 = [rTmpPtr], -32 // A7
+ fma.s1 fB18 = fB20, fB20, f0 // x^16
+}
+;;
+{ .mmf
+ ldfe fA8 = [rTmpPtr2], -32 // A8
+ ldfe fA22 = [rPolDataPtr], 16 // A22
+ fcvt.xf fFloatN = fFloatN // (negated) N as a floating-point value
+}
+;;
+{ .mfi
+ ldfe fA5 = [rTmpPtr], -32 // A5
+ fma.s1 fA21 = fA21, fAbsX, fA20 // v16
+ extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
+}
+{ .mfi
+ ldfe fA6 = [rTmpPtr2], -32 // A6
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mmf
+ // Point to G_3
+ shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3
+ ldfe fA4 = [rTmpPtr2], -32 // A4
+ fma.s1 fA19 = fA19, fAbsX, fA18 // v13
+}
+;;
+.pred.rel "mutex",p14,p15
+{ .mfi
+ ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
+ fms.s1 fRes4L = fA3, fAbsX, fRes4H // delta(A3*|x|)
+(p14) adds rSgnGam = 1, r0 // p14: sign of GAMMA(x) is +1
+}
+{ .mfi
+ cmp.eq p6, p7 = 4, rSgnGamSize // p6: signgam is 4 bytes, p7: otherwise
+ fadd.s1 fRes2H = fRes4H, fA2 // (A3*|x| + A2)hi
+(p15) adds rSgnGam = -1, r0 // p15 (negative x): sign of GAMMA(x) is -1
+}
+;;
+
+{ .mfi
+ ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
+ fma.s1 fA17 = fA17, fAbsX, fA16 // v12
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe FR_Q3 = [GR_ad_q], 32 // Load Q3
+ fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_Q2 = [rTmpPtr3], 16 // Load Q2
+ fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe FR_Q1 = [GR_ad_q] // Load Q1
+ fma.s1 fA15 = fA15, fAbsX, fA14 // v8
+ nop.i 0
+}
+{ .mfi
+ adds rTmpPtr3 = 32, rLnSinDataPtr // second pointer into lgammal_lnsin_data
+ fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
+ nop.i 0
+}
+;;
+{ .mmf
+ ldfpd fLnSin2, fLnSin2L = [rLnSinDataPtr], 16
+ ldfe fLnSin6 = [rTmpPtr3], 32
+ fma.s1 fA13 = fA13, fAbsX, fA12 // v7
+
+}
+;;
+{ .mfi
+ ldfe fLnSin4 = [rLnSinDataPtr], 32
+ fma.s1 fRes4L = fA3L, fAbsX, fRes4L // (A3*|x|)lo
+ nop.i 0
+}
+{ .mfi
+ ldfe fLnSin10 = [rTmpPtr3], 32
+ fsub.s1 fRes2L = fA2, fRes2H // start of the rounding error of (A3*|x| + A2)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fLnSin8 = [rLnSinDataPtr], 32
+ fma.s1 fResH = fRes2H, fAbsX, f0 // ((A3*|x| + A2)*|x|)hi
+ nop.i 0
+}
+{ .mfi
+ ldfe fLnSin14 = [rTmpPtr3], 32
+ fma.s1 fA22 = fA22, fA4L, fA21 // v15
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fLnSin12 = [rLnSinDataPtr], 32
+ fma.s1 fA9 = fA9, fAbsX, fA8 // v4
+ nop.i 0
+}
+{ .mfi
+ ldfd fLnSin18 = [rTmpPtr3], 16
+ fma.s1 fA11 = fA11, fAbsX, fA10 // v5
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fLnSin16 = [rLnSinDataPtr], 24
+ fma.s1 fA19 = fA19, fA4L, fA17 // v11
+ nop.i 0
+}
+{ .mfi
+ ldfd fLnSin22 = [rTmpPtr3], 16
+ fma.s1 fPolL = fA7, fAbsX, fA6 // A7*|x| + A6
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfd fLnSin20 = [rLnSinDataPtr], 16
+ fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
+ nop.i 0
+}
+{ .mfi
+ ldfd fLnSin26 = [rTmpPtr3], 16
+ fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfd fLnSin24 = [rLnSinDataPtr], 16
+ fadd.s1 fRes2L = fRes2L, fRes4H // (A2 - (A3*|x| + A2)hi) + (A3*|x|)hi
+ nop.i 0
+}
+{ .mfi
+ ldfd fLnSin30 = [rTmpPtr3], 16
+ fadd.s1 fA2L = fA2L, fRes4L // A2L + (A3*|x|)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfd fLnSin28 = [rLnSinDataPtr], 16
+ fms.s1 fResL = fRes2H, fAbsX, fResH // delta((A3*|x| + A2)*|x|)
+ nop.i 0
+}
+{ .mfi
+ ldfd fLnSin34 = [rTmpPtr3], 8
+ fadd.s1 fRes2H = fResH, fA1 // ((A3*|x| + A2)*|x| + A1)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfd fLnSin32 = [rLnSinDataPtr]
+ fma.s1 fA11 = fA11, fA4L, fA9 // v3
+ nop.i 0
+}
+{ .mfi
+ ldfd fLnSin36 = [rTmpPtr3]
+ fma.s1 fA15 = fA15, fA4L, fA13 // v6
+ nop.i 0
+}
+;;
+
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
+ nop.i 0
+}
+{ .mfi
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ fma.s1 fA5 = fA5, fAbsX, fA4 // A5*|x| + A4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r = FR_G, fSignifX, f1 // r = G * S_hi - 1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // Negated high part of the log(|x|): -Y_hi = (-N) * log2_hi - H (fFloatN holds -N)
+ fms.s1 FR_log2_hi = fFloatN, FR_log2_hi, FR_H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fA3L = fRes2L, fA2L // (A3*|x| + A2)lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA22 = fA22, fA5L, fA19
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes2L = fA1, fRes2H // start of the rounding error of the "+ A1" sum
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fRes3H = fRes2H, f8, f0 // (((A3*|x| + A2)*|x| + A1)*x)hi, x taken with its sign
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, fA5L, fA11 // v2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin18 = fLnSin18, fA4L, fLnSin16
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // negated h: -h = (-N) * log2_lo - h
+ fms.s1 FR_h = fFloatN, FR_log2_lo, FR_h
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPolL = fPolL, fA4L, fA5
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = r * Q4 + Q3
+ fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fResL = fA3L, fAbsX, fResL // ((A3*|x| + A2)*|x|)lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin30 = fLnSin30, fA4L, fLnSin28
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fResH
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fRes3L = fRes2H, f8, fRes3H // delta of the product with x
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1H = fRes3H, FR_log2_hi // polynomial hi + (negated log)hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fB20, fA22, fA15 // x^8 * (upper coefficients) + (lower coefficients)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin34 = fLnSin34, fA4L, fLnSin32
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin14 = fLnSin14, fA4L, fLnSin12
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo * r + Q2
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = -(r^3): fnma negates the product
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_hi = Q1 * rsq + r
+ fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fA1L = fA1L, fResL
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin22 = fLnSin22, fA4L, fLnSin20
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin26 = fLnSin26, fA4L, fLnSin24
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes1L = FR_log2_hi, fRes1H // start of the rounding error of fRes1H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fPol, fA5L, fPolL
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin34 = fLnSin36, fA5L, fLnSin34
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin18 = fLnSin18, fA5L, fLnSin14
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin6 = fLnSin6, fA4L, fLnSin4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin10 = fLnSin10, fA4L, fLnSin8
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_hi = Q1 * rsq + r (same operands as the earlier poly_hi fma, so the value is unchanged)
+ fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fA1L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo*(-r^3) + (-h) = -(poly_lo*r^3 + h)
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB2 = fLnSin2, fA4L, f0 // (LnSin2 * x^2)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, fRes3H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fPol, fB10, f0 // scale by sign(x) * x^4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin26 = fLnSin26, fA5L, fLnSin22
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin34 = fLnSin34, fA5L, fLnSin30
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin10 = fLnSin10, fA5L, fLnSin6
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin2L = fLnSin2L, fA4L, f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes3L = fRes2L, f8, fRes3L // low part of the product with x
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // -Y_lo = -(poly_hi + poly_lo): FR_poly_lo is already negated, so subtract poly_hi
+ fsub.s1 FR_log2_lo = FR_poly_lo, FR_poly_hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fB4 = fLnSin2, fA4L, fB2 // delta(LnSin2 * x^2)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2H = fRes1H, fPol // high part of the result
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin34 = fLnSin34, fB20, fLnSin26
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin18 = fLnSin18, fB20, fLnSin10
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin2L = fB8, fLnSin2, fLnSin2L // add LnSin2 * (x^2 - <x^2>)
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_log2_lo = FR_log2_lo, fRes3L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes2L = fRes1H, fRes2H // start of the rounding error of fRes2H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB6 = fLnSin34, fB18, fLnSin18 // x^16 * (upper LnSin terms) + (lower LnSin terms)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fB4 = fLnSin2L, fB4 // (LnSin2 * x^2)lo
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, FR_log2_lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fPol
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB12 = fB6, fA5L, f0 // (fB6 * x^4)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fRes1L // low part of the result
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fms.s1 fB14 = fB6, fA5L, fB12 // delta(fB6 * x^4)
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ fadd.s1 fLnSin30 = fB2, fB12 // high part of the fLnSin* sum, consumed in _O_Half_neg
+ // branch out if x is negative
+(p15) br.cond.spnt _O_Half_neg
+}
+;;
+{ .mfb
+ nop.m 0
+ // sign(x)*Pol(|x|) - log(|x|)
+ fma.s0 f8 = fRes2H, f1, fRes2L
+ // it's an answer already for positive x
+ // exit if 0 < x < 0.5
+ br.ret.sptk b0
+}
+
+// here if x is negative and |x| < 0.5: adds the fLnSin* sum (fLnSin30 hi, fB4/fB14/fB12 lo pieces) to fRes2H/fRes2L from lgammal_0_half
+.align 32
+_O_Half_neg:
+{ .mfi
+ nop.m 0
+ fma.s1 fB14 = fB16, fB6, fB14 // add fB6 * (rounding error of x^4) to delta(fB6 * x^4)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fLnSin16 = fB2, fLnSin30 // start of the rounding error of fLnSin30 = fB2 + fB12
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fResH = fLnSin30, fRes2H // high part of the final sum
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fLnSin16 = fLnSin16, fB12 // rounding error of fLnSin30
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fB4 = fB14, fB4 // collect the low-order pieces
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fLnSin16 = fB4, fLnSin16
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fResL = fRes2H, fResH // start of the rounding error of fResH
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fResL = fResL, fLnSin30 // rounding error of fResH
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fLnSin16 = fLnSin16, fRes2L // add the low part computed in lgammal_0_half
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fResL = fResL, fLnSin16 // low part of the final sum
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ // final result for -0.5 < x < 0
+ fma.s0 f8 = fResH, f1, fResL
+ // exit for -0.5 < x < 0
+ br.ret.sptk b0
+}
+;;
+
+// here if x >= 8.0
+// there are two computational paths:
+// 1) For x >10.0 Stirling's formula is used: x*(ln(x)-1) - 0.5*ln(x) + C + S(1/x)
+// 2) Polynomial approximation for 8.0 <= x <= 10.0 (branch to lgammal_8_10)
+.align 32
+lgammal_big_positive:
+{ .mfi
+ addl rPolDataPtr = @ltoff(lgammal_data), gp
+ fmerge.se fSignifX = f1, f8 // significand of x with the sign and exponent of 1.0
+ // Get high 15 bits of significand
+ extr.u GR_X_0 = rSignifX, 49, 15
+}
+{.mfi
+ shladd rZ1offsett = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
+ fnma.s1 fInvX = f8, fRcpX, f1 // start of 1st NR iteration
+ adds rSignif1andQ = 0x5, r0
+}
+;;
+{.mfi
+ ld4 GR_Z_1 = [rZ1offsett] // Load Z_1
+ nop.f 0
+ shl rSignif1andQ = rSignif1andQ, 61 // significand of 1.25
+}
+{ .mfi
+ cmp.eq p8, p0 = rExpX, rExp8 // p8 = 1 if 8.0 <= x < 16
+ nop.f 0
+ adds rSgnGam = 1, r0 // gamma is positive at this range
+}
+;;
+{ .mfi
+ shladd GR_ad_tbl_1 = GR_Index1, 4, rTbl1Addr// Point to G_1
+ nop.f 0
+ add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_Q
+}
+{ .mlx
+ ld8 rPolDataPtr = [rPolDataPtr]
+ movl rDelta = 0x3FF2000000000000 // 1.125 in DP, used only by the lgammal_8_10 path
+}
+;;
+{ .mfi
+ ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
+ nop.f 0
+ add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
+}
+{ .mfi
+ // Point to Constants_G_H_h2
+ add GR_ad_tbl_2 = 0x180, GR_ad_z_1
+ nop.f 0
+ // p8 = 1 if 8.0 <= x <= 10.0
+(p8) cmp.leu.unc p8, p0 = rSignifX, rSignif1andQ
+}
+;;
+{ .mfi
+ ldfd FR_h = [GR_ad_tbl_1] // Load h_1
+ nop.f 0
+ // Get bits 30-15 of X_0 * Z_1
+ pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15
+}
+{ .mfb
+(p8) setf.d FR_MHalf = rDelta // FR_MHalf = 1.125 for the 8..10 path
+ nop.f 0
+(p8) br.cond.spnt lgammal_8_10 // branch out if 8.0 <= x <= 10.0
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ ldfe fA1 = [rPolDataPtr], 16 // Load overflow threshold
+ fma.s1 fRcpX = fInvX, fRcpX, fRcpX // end of 1st NR iteration
+ // Point to Constants_G_H_h3
+ add GR_ad_tbl_3 = 0x280, GR_ad_z_1
+}
+{ .mlx
+ nop.m 0
+ movl rDelta = 0xBFE0000000000000 // -0.5 in DP
+}
+;;
+{ .mfi
+ ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
+ nop.f 0
+ sub GR_N = rExpX, rExpHalf, 1 // unbiased exponent of x
+}
+;;
+{ .mfi
+ ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ setf.d FR_MHalf = rDelta // FR_MHalf = -0.5 for the Stirling path
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ // Put integer N into rightmost significand
+ setf.sig fFloatN = GR_N
+ nop.f 0
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+}
+{ .mfi
+ ldfe FR_Q4 = [GR_ad_q], 16 // Load Q4
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
+ nop.f 0
+ shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2// Point to G_2
+}
+{ .mfi
+ ldfe FR_Q3 = [GR_ad_q], 16 // Load Q3
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
+ fnma.s1 fInvX = f8, fRcpX, f1 // start of 2nd NR iteration
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfps FR_G2, FR_H2 = [GR_ad_tbl_2], 8 // Load G_2, H_2
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
+ nop.f 0
+ // Get bits 30-15 of X_1 * Z_2
+ pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ ldfe FR_Q1 = [GR_ad_q] // Load Q1
+ fcmp.gt.s1 p7,p0 = f8, fA1 // check if x > overflow threshold
+ nop.i 0
+}
+;;
+{.mfi
+ ldfpd fA0, fA0L = [rPolDataPtr], 16 // Load two parts of C
+ fma.s1 fRcpX = fInvX, fRcpX, fRcpX // end of 2nd NR iteration
+ nop.i 0
+}
+;;
+{ .mfb
+ ldfpd fB2, fA1 = [rPolDataPtr], 16 // first series coefficient as two doubles (summed into fB2 below)
+ nop.f 0
+(p7) br.cond.spnt lgammal_overflow // branch if x > overflow threshold
+}
+;;
+{.mfi
+ ldfe fB4 = [rPolDataPtr], 16
+ fcvt.xf fFloatN = fFloatN // N as a floating-point value
+ extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
+}
+;;
+{ .mfi
+ shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3// Point to G_3
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe fB6 = [rPolDataPtr], 16
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfps FR_G3, FR_H3 = [GR_ad_tbl_3], 8 // Load G_3, H_3
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
+ fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe fB8 = [rPolDataPtr], 16
+ fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 fInvX = f8, fRcpX, f1 // start of 3rd NR iteration
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fB10 = [rPolDataPtr], 16
+ nop.f 0
+ cmp.eq p6, p7 = 4, rSgnGamSize // p6: signgam is 4 bytes, p7: otherwise
+}
+;;
+{ .mfi
+ ldfe fB12 = [rPolDataPtr], 16
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fB14 = [rPolDataPtr], 16
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe fB16 = [rPolDataPtr], 16
+ // get double extended coefficients from two doubles
+ // two doubles are needed in Stirling's formula for negative x
+ fadd.s1 fB2 = fB2, fA1
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fB18 = [rPolDataPtr], 16
+ fma.s1 fInvX = fInvX, fRcpX, fRcpX // end of 3rd NR iteration; refined 1/x is now in fInvX
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fB20 = [rPolDataPtr], 16
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
+ nop.i 0
+}
+{ .mfi
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRcpX = fInvX, fInvX, f0 // 1/x^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA0L = fB2, fInvX, fA0L // Clo + B2/x, first term of S(1/x)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r = fSignifX, FR_G, f1 // r = G * S_hi - 1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // High part of the log(x): Y_hi = N * log2_hi + H
+ fma.s1 fRes2H = fFloatN, FR_log2_hi, FR_H
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // h = N * log2_lo + h
+ fma.s1 FR_h = fFloatN, FR_log2_lo, FR_h
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // High part of the log(x): Y_hi = N * log2_hi + H (second copy, scaled by -0.5 below)
+ fma.s1 fRes1H = fFloatN, FR_log2_hi, FR_H
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fPol = fB18, fRcpX, fB16 // v9
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA2L = fRcpX, fRcpX, f0 // v10
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fA3 = fB6, fRcpX, fB4 // v3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA4 = fB10, fRcpX, fB8 // v4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fRes2H =fRes2H, f1, f1 // log_Hi(x) -1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // poly_lo = r * Q4 + Q3
+ fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1H = fRes1H, FR_MHalf, f0 // -0.5*log_Hi(x)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA7 = fB14, fRcpX, fB12 // v7
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA8 = fA2L, fB20, fPol // v8
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA2 = fA4, fA2L, fA3 // v2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA4L = fA2L, fA2L, f0 // v5
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fResH = fRes2H, f8, f0 // (x*(ln(x)-1))hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo * r + Q2
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // poly_hi = Q1 * rsq + r
+ fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fRcpX, fInvX, f0 // 1/x^3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA6 = fA8, fA2L, fA7 // v6
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fResL = fRes2H, f8, fResH // d(x*(ln(x)-1))
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes3H = fResH, fRes1H // (x*(ln(x)-1) -0.5ln(x))hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo*r^3 + h
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fA4L, fA6, fA2 // v1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // raise inexact exception (.s0 op; its result in FR_log2_lo is not read again on this path)
+ fma.s0 FR_log2_lo = FR_log2_lo, FR_log2_lo, f0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes4H = fRes3H, fA0 // (x*(ln(x)-1) -0.5ln(x))hi + Chi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes3L = fResH, fRes3H // start of the rounding error of fRes3H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // Y_lo = poly_hi + poly_lo
+ fadd.s1 fRes2L = FR_poly_hi, FR_poly_lo
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA0L = fPol, fA11, fA0L // S(1/x) + Clo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes3L = fRes3L, fRes1H // rounding error of fRes3H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes4L = fRes3H, fRes4H // start of the rounding error of fRes4H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fResL = fRes2L, f8 , fResL // lo part of x*(ln(x)-1)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // Clo + S(1/x) - 0.5*logLo(x)
+ fma.s1 fA0L = fRes2L, FR_MHalf, fA0L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes4L = fRes4L, fA0 // rounding error of fRes4H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // Clo + S(1/x) - 0.5*logLo(x) + (x*(ln(x)-1))lo
+ fadd.s1 fA0L = fA0L, fResL
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes4L = fRes4L, fRes3L // combine both rounding errors
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes4L = fRes4L, fA0L // low part of the result
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ fma.s0 f8 = fRes4H, f1, fRes4L // final result: hi + lo
+ // exit for x > 10.0
+ br.ret.sptk b0
+}
+;;
+// here if 8.0 <= x <= 10.0
+// Result = P15(y), where y = x/8.0 - 1.125 (FR_MHalf is loaded with 0x3FF2000000000000 = 1.125 in lgammal_big_positive)
+.align 32
+lgammal_8_10:
+{ .mfi
+ addl rPolDataPtr = @ltoff(lgammal_8_10_data), gp
+ fms.s1 FR_FracX = fSignifX, f1, FR_MHalf // y = x/8.0 - 1.125
+ cmp.eq p6, p7 = 4, rSgnGamSize // p6: signgam is 4 bytes, p7: otherwise
+}
+;;
+{ .mfi
+ ld8 rLnSinDataPtr = [rPolDataPtr] // second copy of the table address, reused as a load pointer
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ld8 rPolDataPtr = [rPolDataPtr]
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ adds rZ1offsett = 32, rLnSinDataPtr // pointer to even coefficients A2, A4, ... (stride 32)
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ adds rLnSinDataPtr = 48, rLnSinDataPtr // pointer to odd coefficients A3, A5, ... (stride 32)
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA1, fA1L = [rPolDataPtr], 16 // A1
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe fA2 = [rZ1offsett], 32 // A2
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA0, fA0L = [rPolDataPtr], 16 // A0
+ fma.s1 FR_rsq = FR_FracX, FR_FracX, f0 // y^2
+ nop.i 0
+}
+{ .mfi
+ ldfe fA3 = [rLnSinDataPtr],32 // A3
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mmf
+ ldfe fA4 = [rZ1offsett], 32 // A4
+ ldfe fA5 = [rLnSinDataPtr], 32 // A5
+ nop.f 0
+}
+;;
+{ .mmf
+ ldfe fA6 = [rZ1offsett], 32 // A6
+ ldfe fA7 = [rLnSinDataPtr], 32 // A7
+ nop.f 0
+}
+;;
+{ .mmf
+ ldfe fA8 = [rZ1offsett], 32 // A8
+ ldfe fA9 = [rLnSinDataPtr], 32 // A9
+ nop.f 0
+}
+;;
+{ .mmf
+ ldfe fA10 = [rZ1offsett], 32 // A10
+ ldfe fA11 = [rLnSinDataPtr], 32 // A11
+ nop.f 0
+}
+;;
+{ .mmf
+ ldfe fA12 = [rZ1offsett], 32 // A12
+ ldfe fA13 = [rLnSinDataPtr], 32 // A13
+ fma.s1 FR_Q4 = FR_rsq, FR_rsq, f0 // y^4
+}
+;;
+{ .mmf
+ ldfe fA14 = [rZ1offsett], 32 // A14
+ ldfe fA15 = [rLnSinDataPtr], 32 // A15
+ nop.f 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1H = FR_FracX, fA1, f0 // (A1*y)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA3 = fA3, FR_FracX, fA2 // v4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA5 = fA5, FR_FracX, fA4 // v5
+ nop.i 0
+}
+;;
+{ .mfi
+ // store sign of GAMMA(x) if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ fma.s1 fA3L = FR_Q4, FR_Q4, f0 // v9 = y^8
+ nop.i 0
+}
+{ .mfi
+ // store sign of GAMMA(x) if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ fma.s1 fA7 = fA7, FR_FracX, fA6 // v7
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, FR_FracX, fA8 // v8
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fRes1L = FR_FracX, fA1, fRes1H // delta(A1*y)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, FR_FracX, fA10 // v12
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA13 = fA13, FR_FracX, fA12 // v13
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fRes2H = fRes1H, f1, fA0 // (A1*y + A0)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, FR_FracX, fA14 // v16
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA5 = fA5, FR_rsq, fA3 // v3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, FR_rsq, fA7 // v6
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1L = FR_FracX, fA1L, fRes1L // (A1*y)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fRes2L = fA0, f1, fRes2H // start of the rounding error of (A1*y + A0)hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA13 = fA13, FR_rsq, fA11 // v11
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, FR_Q4, fA5 // v2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1L = fRes1L, f1, fA0L // (A1*y)lo + A0L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes2L = fRes2L, f1, fRes1H // rounding error of (A1*y + A0)hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, FR_Q4, fA13 // v10
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes2L = fRes1L, f1, fRes2L // (A1*y + A0)lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fA3L, fA15, fA9 // A2 + A3*y + ... + A15*y^13
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 f8 = FR_rsq , fPol, fRes2H // high part of the result: y^2*fPol + (A1*y + A0)hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fPol, FR_rsq, f0 // y^2 * fPol
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fRes1L = fRes2H, f1, f8 // start of the rounding error of the high part
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1L = fRes1L, f1, fPol // rounding error of the high part
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fRes1L = fRes1L, f1, fRes2L // low part of the result
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ fma.s0 f8 = f8, f1, fRes1L // final result: hi + lo
+ // exit for 8.0 <= x <= 10.0
+ br.ret.sptk b0
+}
+;;
+
+// here if 4.0 <= x < 8.0: set up argument, sign and coefficient pointers, then evaluate via lgamma_polynom25
+.align 32
+lgammal_4_8:
+{ .mfi
+ addl rPolDataPtr= @ltoff(lgammal_4_8_data),gp
+ fms.s1 FR_FracX = fSignifX, f1, FR_MHalf // polynomial argument: fSignifX - FR_MHalf (FR_MHalf is set by the caller, outside this block)
+ adds rSgnGam = 1, r0 // sign of GAMMA(x) is +1
+}
+;;
+{ .mfi
+ ld8 rPolDataPtr = [rPolDataPtr]
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfb
+ adds rTmpPtr = 160, rPolDataPtr // second coefficient pointer, 160 bytes into the table
+ nop.f 0
+ // branch to special path which computes polynomial of 25th degree
+ br.sptk lgamma_polynom25
+}
+;;
+
+// here if 2.25 <= x < 4.0: set up argument, sign and coefficient pointers, then evaluate via lgamma_polynom25
+.align 32
+lgammal_2Q_4:
+{ .mfi
+ addl rPolDataPtr= @ltoff(lgammal_2Q_4_data),gp
+ fms.s1 FR_FracX = fSignifX, f1, FR_MHalf // polynomial argument: fSignifX - FR_MHalf (FR_MHalf is set by the caller, outside this block)
+ adds rSgnGam = 1, r0 // sign of GAMMA(x) is +1
+}
+;;
+{ .mfi
+ ld8 rPolDataPtr = [rPolDataPtr]
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfb
+ adds rTmpPtr = 160, rPolDataPtr // second coefficient pointer, 160 bytes into the table
+ nop.f 0
+ // branch to special path which computes polynomial of 25th degree
+ br.sptk lgamma_polynom25
+}
+;;
+
+// here if 0.5 <= |x| < 0.75: p14 selects the positive-x table and sign, p15 the negative-x ones
+.align 32
+lgammal_half_3Q:
+.pred.rel "mutex", p14, p15
+{ .mfi
+(p14) addl rPolDataPtr= @ltoff(lgammal_half_3Q_data),gp
+ // FR_FracX = x - 0.625 for positive x (FR_FracX presumably holds 0.625 on entry; it is set outside this block)
+(p14) fms.s1 FR_FracX = f8, f1, FR_FracX
+(p14) adds rSgnGam = 1, r0 // sign of GAMMA(x) is +1
+}
+{ .mfi
+(p15) addl rPolDataPtr= @ltoff(lgammal_half_3Q_neg_data),gp
+ // FR_FracX = x + 0.625 for negative x
+(p15) fma.s1 FR_FracX = f8, f1, FR_FracX
+(p15) adds rSgnGam = -1, r0 // sign of GAMMA(x) is -1
+}
+;;
+{ .mfi
+ ld8 rPolDataPtr = [rPolDataPtr]
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfb
+ adds rTmpPtr = 160, rPolDataPtr // second coefficient pointer, 160 bytes into the table
+ nop.f 0
+ // branch to special path which computes polynomial of 25th degree
+ br.sptk lgamma_polynom25
+}
+;;
+// here if 1.3125 <= x < 1.5625: set signgam = +1 and use the shared P25 path
+.align 32
+lgammal_loc_min:
+{ .mfi
+ adds rSgnGam = 1, r0 // signgam value to be stored: +1
+ nop.f 0
+ nop.i 0
+}
+{ .mfb
+ adds rTmpPtr = 160, rPolDataPtr // NOTE(review): rPolDataPtr is not loaded in this block; presumably set up by the code branching here - confirm
+ fms.s1 FR_FracX = f8, f1, fA5L // polynomial argument: x - fA5L (fA5L is prepared outside this block)
+ br.sptk lgamma_polynom25
+}
+;;
+// here if -2.605859375 <= x < -2.5
+// special polynomial approximation used since neither "near root"
+// approximation nor reflection formula give satisfactory accuracy on
+// this range; signgam is set to -1 and the shared P25 path is used
+.align 32
+_neg2andHalf:
+{ .mfi
+ addl rPolDataPtr= @ltoff(lgammal_neg2andHalf_data),gp // linkage-table slot of the coefficient table
+ fma.s1 FR_FracX = fB20, f1, f8 // 2.5 + x
+ adds rSgnGam = -1, r0 // signgam value to be stored: -1
+}
+;;
+{.mfi
+ ld8 rPolDataPtr = [rPolDataPtr] // actual address of lgammal_neg2andHalf_data
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfb
+ adds rTmpPtr = 160, rPolDataPtr // second coefficient pointer, 160 bytes into the table
+ nop.f 0
+ // branch to special path which computes polynomial of 25th degree
+ br.sptk lgamma_polynom25
+}
+;;
+
+// here if -0.5 < x <= -0.40625: set signgam = -1 and use the shared P25 path
+.align 32
+lgammal_near_neg_half:
+{ .mmf
+ addl rPolDataPtr= @ltoff(lgammal_near_neg_half_data),gp // linkage-table slot of the coefficient table
+ setf.exp FR_FracX = rExpHalf // presumably 0.5, judging by the register name - confirm rExpHalf
+ nop.f 0
+}
+;;
+{ .mfi
+ ld8 rPolDataPtr = [rPolDataPtr] // actual address of lgammal_near_neg_half_data
+ nop.f 0
+ adds rSgnGam = -1, r0 // signgam value to be stored: -1
+}
+;;
+{ .mfb
+ adds rTmpPtr = 160, rPolDataPtr // second coefficient pointer, 160 bytes into the table
+ fma.s1 FR_FracX = FR_FracX, f1, f8 // polynomial argument: FR_FracX + x
+ // branch to special path which computes polynomial of 25th degree
+ br.sptk lgamma_polynom25
+}
+;;
+
+// here if the answer is P25(x); stores signgam and returns the result in f8
+// rPolDataPtr, rTmpPtr point to coefficients (the entry stubs set rTmpPtr = rPolDataPtr + 160)
+// x is in FR_FracX register
+.align 32
+lgamma_polynom25:
+{ .mfi
+ ldfpd fA3, fA0L = [rPolDataPtr], 16 // A3
+ nop.f 0
+ cmp.eq p6, p7 = 4, rSgnGamSize // p6: signgam is 4 bytes wide, p7: otherwise
+}
+{ .mfi
+ ldfpd fA18, fA19 = [rTmpPtr], 16 // D7, D6
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA1, fA1L = [rPolDataPtr], 16 // A1
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA16, fA17 = [rTmpPtr], 16 // D4, D5
+ nop.f 0 // NOTE(review): no third-slot instruction is written for this .mfi bundle (the other bundles spell out nop.i 0)
+}
+;;
+{ .mfi
+ ldfpd fA12, fA13 = [rPolDataPtr], 16 // D0, D1
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA14, fA15 = [rTmpPtr], 16 // D2, D3
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA24, fA25 = [rPolDataPtr], 16 // C21, C20
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA22, fA23 = [rTmpPtr], 16 // C19, C18
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA2, fA2L = [rPolDataPtr], 16 // A2
+ fma.s1 fA4L = FR_FracX, FR_FracX, f0 // x^2
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA20, fA21 = [rTmpPtr], 16 // C17, C16
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fA11 = [rTmpPtr], 16 // E7
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA0, fA3L = [rPolDataPtr], 16 // A0
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ ldfe fA10 = [rPolDataPtr], 16 // E6
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe fA9 = [rTmpPtr], 16 // E5
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mmf
+ ldfe fA8 = [rPolDataPtr], 16 // E4
+ ldfe fA7 = [rTmpPtr], 16 // E3
+ nop.f 0
+}
+;;
+{ .mmf
+ ldfe fA6 = [rPolDataPtr], 16 // E2
+ ldfe fA5 = [rTmpPtr], 16 // E1
+ nop.f 0
+}
+;;
+{ .mfi
+ ldfe fA4 = [rPolDataPtr], 16 // E0
+ fma.s1 fA5L = fA4L, fA4L, f0 // x^4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fB2 = FR_FracX, FR_FracX, fA4L // x^2 - <x^2>
+ nop.i 0
+}
+;;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ fma.s1 fRes4H = fA3, FR_FracX, f0 // (A3*x)hi
+ nop.i 0
+}
+{ .mfi
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ fma.s1 fA19 = fA19, FR_FracX, fA18 // D7*x + D6
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fResH = fA1, FR_FracX, f0 // (A1*x)hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB6 = fA1L, FR_FracX, fA0L // A1L*x + A0L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA17 = fA17, FR_FracX, fA16 // D5*x + D4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, FR_FracX, fA14 // D3*x + D2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA25 = fA25, FR_FracX, fA24 // C21*x + C20
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA13 = fA13, FR_FracX, fA12 // D1*x + D0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA23 = fA23, FR_FracX, fA22 // C19*x + C18
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA21 = fA21, FR_FracX, fA20 // C17*x + C16
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fRes4L = fA3, FR_FracX, fRes4H // delta((A3*x)hi)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2H = fRes4H, fA2 // (A3*x + A2)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fResL = fA1, FR_FracX, fResH // d(A1*x)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1H = fResH, fA0 // (A1*x + A0)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA19 = fA19, fA4L, fA17 // Dhi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, FR_FracX, fA10 // E7*x + E6
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // Doing this to raise inexact flag
+ fma.s0 fA10 = fA0, fA0, f0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, fA4L, fA13 // Dlo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // (C21*x + C20)*x^2 + C19*x + C18
+ fma.s1 fA25 = fA25, fA4L, fA23
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, FR_FracX, fA8 // E5*x + E4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA7 = fA7, FR_FracX, fA6 // E3*x + E2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes4L = fA3L, FR_FracX, fRes4L // (A3*x)lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes2L = fA2, fRes2H // A2 - (A3*x + A2)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fResL = fResL, fB6 // (A1L*x + A0L) + d(A1*x)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes1L = fA0, fRes1H // A0 - (A1*x + A0)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA5 = fA5, FR_FracX, fA4 // E1*x + E0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB8 = fA5L, fA5L, f0 // x^8
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // ((C21*x + C20)*x^2 + C19*x + C18)*x^2 + C17*x + C16
+ fma.s1 fA25 = fA25, fA4L, fA21
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA19 = fA19, fA5L, fA15 // D
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, fA4L, fA9 // Ehi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fRes4H // A2 - (A3*x + A2)hi + (A3*x)hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes4L = fRes4L, fA2L // (A3*x)lo + A2L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes3H = fRes2H, fA4L, f0 // ((A3*x + A2)*x^2)hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, fResH // A0 - (A1*x + A0)hi + (A1*x)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes3L = fRes2H, fB2, f0 // (A3*x + A2)hi*d(x^2)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA7 = fA7, fA4L, fA5 // Elo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA25 = fA25, fB8, fA19 // C*x^8 + D
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fRes4L // (A3*x + A2)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fB4 = fRes2H, fA4L, fRes3H // d((A3*x + A2)*x^2))
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, fResL // (A1*x + A0)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fB20 = fRes3H, fRes1H // Phi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, fA5L, fA7 // E
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // ( (A3*x + A2)lo*<x^2> + (A3*x + A2)hi*d(x^2))
+ fma.s1 fRes3L = fRes2L, fA4L, fRes3L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // d((A3*x + A2)*x^2)) + (A1*x + A0)lo
+ fadd.s1 fRes1L = fRes1L, fB4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fB18 = fRes1H, fB20 // (A1*x + A0)hi - Phi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fA25, fB8, fA11 // (C*x^8 + D)*x^8 + E
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, fRes3L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fB18 = fB18, fRes3H // (A1*x + A0)hi - Phi + ((A3*x + A2)*x^2)hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fRes4H = fPol, fA5L, fB20 // fPol*x^4 + Phi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fPolL = fPol, fA5L, f0 // fPol*x^4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fB18 = fB18, fRes1L // Plo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes4L = fB20, fRes4H // Phi - (fPol*x^4 + Phi)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fB18 = fB18, fPolL // Plo + fPol*x^4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes4L = fRes4L, fB18 // low part of the final sum
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ fma.s0 f8 = fRes4H, f1, fRes4L // result = high part + low part, returned in f8
+ // P25(x) computed, exit here
+ br.ret.sptk b0
+}
+;;
+
+
+// here if 0.75 <= x < 1.3125: set signgam = +1, fetch lgammal_03Q_1Q_data and use the shared Pol24 path
+.align 32
+lgammal_03Q_1Q:
+{ .mfi
+ addl rPolDataPtr= @ltoff(lgammal_03Q_1Q_data),gp // linkage-table slot of the coefficient table
+ fma.s1 FR_FracX = fA5L, f1, f0 // x
+ adds rSgnGam = 1, r0 // signgam value to be stored: +1
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB4 = fA5L, fA5L, f0 // x^2
+ nop.i 0
+}
+;;
+{ .mfi
+ ld8 rPolDataPtr = [rPolDataPtr] // actual address of lgammal_03Q_1Q_data
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfb
+ adds rTmpPtr = 144, rPolDataPtr // second coefficient pointer, 144 bytes into the table
+ nop.f 0
+ br.sptk lgamma_polynom24x
+}
+;;
+
+// here if 1.5625 <= x < 2.25: set signgam = +1, fetch lgammal_13Q_2Q_data and use the shared Pol24 path
+.align 32
+lgammal_13Q_2Q:
+{ .mfi
+ addl rPolDataPtr= @ltoff(lgammal_13Q_2Q_data),gp // linkage-table slot of the coefficient table
+ fma.s1 FR_FracX = fB4, f1, f0 // x
+ adds rSgnGam = 1, r0 // signgam value to be stored: +1
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB4 = fB4, fB4, f0 // x^2
+ nop.i 0
+}
+;;
+{ .mfi
+ ld8 rPolDataPtr = [rPolDataPtr] // actual address of lgammal_13Q_2Q_data
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfb
+ adds rTmpPtr = 144, rPolDataPtr // second coefficient pointer, 144 bytes into the table
+ nop.f 0
+ br.sptk lgamma_polynom24x
+}
+;;
+
+// here if result is Pol24(x); stores signgam and returns the result in f8
+// x is in FR_FracX, <x^2> is in fB4 (both set by the entry stubs),
+// rPolDataPtr, rTmpPtr point to coefficients
+.align 32
+lgamma_polynom24x:
+{ .mfi
+ ldfpd fA4, fA2L = [rPolDataPtr], 16
+ nop.f 0
+ cmp.eq p6, p7 = 4, rSgnGamSize // p6: signgam is 4 bytes wide, p7: otherwise
+}
+{ .mfi
+ ldfpd fA23, fA24 = [rTmpPtr], 16 // C18, C19
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA3, fA1L = [rPolDataPtr], 16
+ fma.s1 fA5L = fB4, fB4, f0 // x^4
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA19, fA20 = [rTmpPtr], 16 // D6, D7
+ fms.s1 fB2 = FR_FracX, FR_FracX, fB4 // x^2 - <x^2>
+ nop.i 0
+}
+;;
+{ .mmf
+ ldfpd fA15, fA16 = [rPolDataPtr], 16 // D2, D3
+ ldfpd fA17, fA18 = [rTmpPtr], 16 // D4, D5
+ nop.f 0
+}
+;;
+{ .mmf
+ ldfpd fA13, fA14 = [rPolDataPtr], 16 // D0, D1
+ ldfpd fA12, fA21 = [rTmpPtr], 16 // E7, C16
+ nop.f 0
+}
+;;
+{ .mfi
+ ldfe fA11 = [rPolDataPtr], 16 // E6
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe fA10 = [rTmpPtr], 16 // E5
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA2, fA4L = [rPolDataPtr], 16
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA1, fA3L = [rTmpPtr], 16
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA22, fA25 = [rPolDataPtr], 16 // C17, C20
+ fma.s1 fA0 = fA5L, fA5L, f0 // x^8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA0L = fA5L, FR_FracX, f0 // x^5
+ nop.i 0
+}
+;;
+{ .mmf
+ ldfe fA9 = [rPolDataPtr], 16 // E4
+ ldfe fA8 = [rTmpPtr], 16 // E3
+ nop.f 0
+}
+;;
+{ .mmf
+ ldfe fA7 = [rPolDataPtr], 16 // E2
+ ldfe fA6 = [rTmpPtr], 16 // E1
+ nop.f 0
+}
+;;
+{ .mfi
+ ldfe fA5 = [rTmpPtr], 16 // E0
+ fma.s1 fRes4H = fA4, fB4, f0 // A4*<x^2>
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fA24, FR_FracX, fA23 // C19*x + C18
+ nop.i 0
+}
+;;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ fma.s1 fRes1H = fA3, fB4, f0 // A3*<x^2>
+ nop.i 0
+}
+{ .mfi
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ fma.s1 fA1L = fA3, fB2,fA1L // A3*d(x^2) + A1L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA20 = fA20, FR_FracX, fA19 // D7*x + D6
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA18 = fA18, FR_FracX, fA17 // D5*x + D4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA16 = fA16, FR_FracX, fA15 // D3*x + D2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA14 = fA14, FR_FracX, fA13 // D1*x + D0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA2L = fA4, fB2,fA2L // A4*d(x^2) + A2L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA12 = fA12, FR_FracX, fA11 // E7*x + E6
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fRes2L = fA4, fB4, fRes4H // delta(A4*<x^2>)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2H = fRes4H, fA2 // A4*<x^2> + A2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fRes3L = fA3, fB4, fRes1H // delta(A3*<x^2>)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes3H = fRes1H, fA1 // A3*<x^2> + A1
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA20 = fA20, fB4, fA18 // (D7*x + D6)*x^2 + D5*x + D4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA22 = fA22, FR_FracX, fA21 // C17*x + C16
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA16 = fA16, fB4, fA14 // (D3*x + D2)*x^2 + D1*x + D0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fA25, fB4, fPol // C20*x^2 + C19*x + C18
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA2L = fA4L, fB4, fA2L // A4L*<x^2> + A4*d(x^2) + A2L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA1L = fA3L, fB4, fA1L // A3L*<x^2> + A3*d(x^2) + A1L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes4L = fA2, fRes2H // d1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fResH = fRes2H, fB4, f0 // (A4*<x^2> + A2)*x^2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes1L = fA1, fRes3H // d1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB6 = fRes3H, FR_FracX, f0 // (A3*<x^2> + A1)*x
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA10 = fA10, FR_FracX, fA9 // E5*x + E4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA8 = fA8, FR_FracX, fA7 // E3*x + E2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // (C20*x^2 + C19*x + C18)*x^2 + C17*x + C16
+ fma.s1 fPol = fPol, fB4, fA22
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA6 = fA6, FR_FracX, fA5 // E1*x + E0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // A4L*<x^2> + A4*d(x^2) + A2L + delta(A4*<x^2>)
+ fadd.s1 fRes2L = fA2L, fRes2L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // A3L*<x^2> + A3*d(x^2) + A1L + delta(A3*<x^2>)
+ fadd.s1 fRes3L = fA1L, fRes3L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes4L = fRes4L, fRes4H // d2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fResL = fRes2H, fB4, fResH // d(A4*<x^2> + A2)*x^2)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, fRes1H // d2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fB8 = fRes3H, FR_FracX, fB6 // d((A3*<x^2> + A1)*x)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fB10 = fResH, fB6 // (A4*x^4 + .. + A1*x)hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA12 = fA12, fB4, fA10 // Ehi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // ((D7*x + D6)*x^2 + D5*x + D4)*x^4 + (D3*x + D2)*x^2 + D1*x + D0
+ fma.s1 fA20 = fA20, fA5L, fA16
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA8 = fA8, fB4, fA6 // Elo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fRes4L // (A4*<x^2> + A2)lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // d(A4*<x^2> + A2)*x^2) + A4*<x^2> + A2)*d(x^2)
+ fma.s1 fResL = fRes2H, fB2, fResL
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes3L = fRes3L, fRes1L // (A3*<x^2> + A1)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fB12 = fB6, fB10 // (A3*<x^2> + A1)*x - (A4*x^4 + .. + A1*x)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fPol, fA0, fA20 // PolC*x^8 + PolD
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPolL = fA12, fA5L, fA8 // E
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fResL = fB4, fRes2L, fResL // ((A4*<x^2> + A2)*x^2)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes3L = fRes3L, FR_FracX, fB8 // ((A3*<x^2> + A1)*x)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fB12 = fB12, fResH // ... + (A4*<x^2> + A2)*x^2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fPol, fA0, fPolL // (PolC*x^8 + PolD)*x^8 + E
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes3L = fRes3L, fResL // sum of the two low parts
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes2H = fPol, fA0L, fB10 // fPol*x^5 + (A4*x^4 + .. + A1*x)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes3L = fB12, fRes3L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes4L = fB10, fRes2H // (A4*x^4 + .. + A1*x)hi - fRes2H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes4L = fPol, fA0L, fRes4L // ... + fPol*x^5
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes4L = fRes4L, fRes3L // low part of the final sum
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ // final result for all paths for which the result is Pol24(x)
+ fma.s0 f8 = fRes2H, f1, fRes4L
+ // here is the exit for all paths for which the result is Pol24(x)
+ br.ret.sptk b0
+}
+;;
+
+
+// here if x is natval, nan, +/-inf, +/-0, or denormal
+// dispatches to the matching handler; -inf is handled inline at the end
+.align 32
+lgammal_spec:
+{ .mfi
+ nop.m 0
+ fclass.m p9, p0 = f8, 0xB // +/-denormals
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fclass.m p6, p0 = f8, 0x1E1 // Test x for natval, nan, +inf
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+ fclass.m p7, p0 = f8, 0x7 // +/-0
+(p9) br.cond.sptk lgammal_denormal_input
+};;
+{ .mfb
+ nop.m 0
+ nop.f 0
+ // branch out if x is natval, nan, +inf
+(p6) br.cond.spnt lgammal_nan_pinf
+};;
+{ .mfb
+ nop.m 0
+ nop.f 0
+(p7) br.cond.spnt lgammal_singularity // +/-0 is treated as a singularity
+};;
+// if we are still here then x = -inf
+{ .mfi
+ cmp.eq p6, p7 = 4, rSgnGamSize // p6: signgam is 4 bytes wide, p7: otherwise
+ nop.f 0
+ adds rSgnGam = 1, r0 // signgam value to be stored: +1
+};;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ nop.f 0
+ nop.i 0
+}
+{ .mfb
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ fma.s0 f8 = f8,f8,f0 // return +inf, no call to error support
+ br.ret.spnt b0
+};;
+
+// here if x is NaN, NatVal or +INF: store signgam = +1 and return x+x
+.align 32
+lgammal_nan_pinf:
+{ .mfi
+ cmp.eq p6, p7 = 4, rSgnGamSize // p6: signgam is 4 bytes wide, p7: otherwise
+ nop.f 0
+ adds rSgnGam = 1, r0 // signgam value to be stored: +1
+}
+;;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ fma.s0 f8 = f8,f1,f8 // return x+x if x is natval, nan, +inf
+ nop.i 0
+}
+{ .mfb
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ nop.f 0
+ br.ret.sptk b0
+}
+;;
+
+// here if x denormal or unnormal: normalize x, recompute the values derived from it, rejoin the main path
+.align 32
+lgammal_denormal_input:
+{ .mfi
+ nop.m 0
+ fma.s0 fResH = f1, f1, f8 // raise denormal exception
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnorm.s1 f8 = f8 // normalize input value
+ nop.i 0
+}
+;;
+{ .mfi
+ getf.sig rSignifX = f8 // significand bits of normalized x
+ fmerge.se fSignifX = f1, f8 // sign and exponent of 1.0 with the significand of x
+ nop.i 0
+}
+{ .mfi
+ getf.exp rSignExpX = f8 // sign and exponent field of normalized x
+ fcvt.fx.s1 fXint = f8 // Convert arg to int (int repres. in FR)
+ nop.i 0
+}
+;;
+{ .mfi
+ getf.exp rSignExpX = f8 // NOTE(review): same read as in the previous instruction group; looks redundant
+ fcmp.lt.s1 p15, p14 = f8, f0 // p15: x < 0, p14: otherwise
+ nop.i 0
+}
+;;
+{ .mfb
+ and rExpX = rSignExpX, r17Ones // presumably strips the sign bit, keeping the exponent - confirm r17Ones
+ fmerge.s fAbsX = f1, f8 // |x|
+ br.cond.sptk _deno_back_to_main_path
+}
+;;
+
+
+// here if overflow (x > overflow_bound): store signgam = +1, build +INF and go to error handling
+.align 32
+lgammal_overflow:
+{ .mfi
+ addl r8 = 0x1FFFE, r0 // exponent field for a huge value (used to force overflow below)
+ nop.f 0
+ cmp.eq p6, p7 = 4, rSgnGamSize // p6: signgam is 4 bytes wide, p7: otherwise
+}
+{ .mfi
+ adds rSgnGam = 1, r0 // signgam value to be stored: +1
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ setf.exp f9 = r8 // f9 = huge value with exponent field 0x1FFFE
+ fmerge.s FR_X = f8,f8 // keep a copy of x for the error handler
+ mov GR_Parameter_TAG = 102 // overflow
+};;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ nop.f 0
+ nop.i 0
+}
+{ .mfb
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ fma.s0 FR_RESULT = f9,f9,f0 // Set I,O and +INF result
+ br.cond.sptk __libm_error_region
+};;
+
+// here if x is negative integer or +/-0 (SINGULARITY)
+// stores signgam (+1, or -1 when x is -0), builds the result with 1/0 and goes to error handling
+.align 32
+lgammal_singularity:
+{ .mfi
+ adds rSgnGam = 1, r0 // default signgam value: +1
+ fclass.m p8,p0 = f8,0x6 // is x -0?
+ mov GR_Parameter_TAG = 103 // negative
+}
+{ .mfi
+ cmp.eq p6, p7 = 4, rSgnGamSize // p6: signgam is 4 bytes wide, p7: otherwise
+ fma.s1 FR_X = f0,f0,f8 // FR_X = x, kept for the error handler
+ nop.i 0
+};;
+{ .mfi
+(p8) sub rSgnGam = r0, rSgnGam // signgam = -1 when x is -0
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ nop.f 0
+ nop.i 0
+}
+{ .mfb
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ frcpa.s0 FR_RESULT, p0 = f1, f0 // result built from 1/0
+ br.cond.sptk __libm_error_region
+};;
+
+GLOBAL_LIBM_END(__libm_lgammal)
+
+
+LOCAL_LIBM_ENTRY(__libm_error_region) // spills FR_X, FR_Y, FR_RESULT to a 64-byte frame, calls __libm_error_support, reloads the result into f8
+.prologue
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+{ .mmi
+ stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+.body
+{ .mib
+ stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
+}
+{ .mib
+ stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Point back at Parameter 2
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ add GR_Parameter_RESULT = 48,sp // Address of the result slot (Parameter 3)
+ nop.m 999
+ nop.i 999
+};;
+{ .mmi
+ ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region#)
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/libm_reduce.S b/sysdeps/ia64/fpu/libm_reduce.S
index 1c7f4e1e88..8bdf91d6de 100644
--- a/sysdeps/ia64/fpu/libm_reduce.S
+++ b/sysdeps/ia64/fpu/libm_reduce.S
@@ -1,10 +1,10 @@
.file "libm_reduce.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,304 +20,310 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-// History: 02/02/00 Initial Version
+// History:
+// 02/02/00 Initial Version
+// 05/13/02 Rescheduled for speed, changed interface to pass
+// parameters in fp registers
+// 02/10/03 Reordered header: .section, .global, .proc, .align;
+// used data8 for long double data storage
//
-// *********************************************************************
-// *********************************************************************
+//*********************************************************************
+//*********************************************************************
//
// Function: __libm_pi_by_two_reduce(x) return r, c, and N where
// x = N * pi/4 + (r+c) , where |r+c| <= pi/4.
// This function is not designed to be used by the
// general user.
//
-// *********************************************************************
+//*********************************************************************
//
// Accuracy: Returns double-precision values
//
-// *********************************************************************
+//*********************************************************************
//
// Resources Used:
//
-// Floating-Point Registers: f32-f70
+// Floating-Point Registers:
+// f8 = Input x, return value r
+// f9 = return value c
+// f32-f70
//
// General Purpose Registers:
// r8 = return value N
-// r32 = Address of x
-// r33 = Address of where to place r and then c
// r34-r64
//
// Predicate Registers: p6-p14
//
-// *********************************************************************
+//*********************************************************************
//
// IEEE Special Conditions:
//
-// No condions should be raised.
+// No conditions should be raised.
//
-// *********************************************************************
+//*********************************************************************
//
// I. Introduction
// ===============
//
// For the forward trigonometric functions sin, cos, sincos, and
-// tan, the original algorithms for IA 64 handle arguments up to
+// tan, the original algorithms for IA 64 handle arguments up to
// 1 ulp less than 2^63 in magnitude. For double-extended arguments x,
-// |x| >= 2^63, this routine returns CASE, N and r_hi, r_lo where
-//
+// |x| >= 2^63, this routine returns N and r_hi, r_lo where
+//
// x is accurately approximated by
// 2*K*pi + N * pi/2 + r_hi + r_lo, |r_hi+r_lo| <= pi/4.
// CASE = 1 or 2.
// CASE is 1 unless |r_hi + r_lo| < 2^(-33).
-//
+//
// The exact value of K is not determined, but that information is
// not required in trigonometric function computations.
-//
-// We first assume the argument x in question satisfies x >= 2^(63).
+//
+// We first assume the argument x in question satisfies x >= 2^(63).
// In particular, it is positive. Negative x can be handled by symmetry:
-//
+//
// -x is accurately approximated by
// -2*K*pi + (-N) * pi/2 - (r_hi + r_lo), |r_hi+r_lo| <= pi/4.
-//
+//
// The idea of the reduction is that
-//
-// x * 2/pi = N_big + N + f, |f| <= 1/2
-//
+//
+// x * 2/pi = N_big + N + f, |f| <= 1/2
+//
// Moreover, for double extended x, |f| >= 2^(-75). (This is an
// non-obvious fact found by enumeration using a special algorithm
-// involving continued fraction.) The algorithm described below
+// involving continued fraction.) The algorithm described below
// calculates N and an accurate approximation of f.
-//
-// Roughly speaking, an appropriate 256-bit (4 X 64) portion of
+//
+// Roughly speaking, an appropriate 256-bit (4 X 64) portion of
// 2/pi is multiplied with x to give the desired information.
-//
+//
// II. Representation of 2/PI
// ==========================
-//
+//
// The value of 2/pi in binary fixed-point is
-//
+//
// .101000101111100110......
-//
+//
// We store 2/pi in a table, starting at the position corresponding
-// to bit position 63
-//
+// to bit position 63
+//
// bit position 63 62 ... 0 -1 -2 -3 -4 -5 -6 -7 .... -16576
-//
-// 0 0 ... 0 . 1 0 1 0 1 0 1 .... X
-//
+//
+// 0 0 ... 0 . 1 0 1 0 1 0 1 .... X
+//
// ^
-// |__ implied binary pt
-//
+// |__ implied binary pt
+//
// III. Algorithm
// ==============
-//
+//
// This describes the algorithm in the most natural way using
-// unsigned interger multiplication. The implementation section
+// unsigned integer multiplication. The implementation section
// describes how the integer arithmetic is simulated.
-//
+//
// STEP 0. Initialization
// ----------------------
-//
-// Let the input argument x be
-//
+//
+// Let the input argument x be
+//
// x = 2^m * ( 1. b_1 b_2 b_3 ... b_63 ), 63 <= m <= 16383.
-//
-// The first crucial step is to fetch four 64-bit portions of 2/pi.
+//
+// The first crucial step is to fetch four 64-bit portions of 2/pi.
// To fulfill this goal, we calculate the bit position L of the
// beginning of these 256-bit quantity by
-//
+//
// L := 62 - m.
-//
-// Note that -16321 <= L <= -1 because 63 <= m <= 16383; and that
+//
+// Note that -16321 <= L <= -1 because 63 <= m <= 16383; and that
// the storage of 2/pi is adequate.
-//
+//
// Fetch P_1, P_2, P_3, P_4 beginning at bit position L thus:
-//
+//
// bit position L L-1 L-2 ... L-63
-//
+//
// P_1 = b b b ... b
-//
+//
// each b can be 0 or 1. Also, let P_0 be the two bits correspoding to
// bit positions L+2 and L+1. So, when each of the P_j is interpreted
// with appropriate scaling, we have
//
// 2/pi = P_big + P_0 + (P_1 + P_2 + P_3 + P_4) + P_small
-//
+//
// Note that P_big and P_small can be ignored. The reasons are as follow.
// First, consider P_big. If P_big = 0, we can certainly ignore it.
-// Otherwise, P_big >= 2^(L+3). Now,
-//
+// Otherwise, P_big >= 2^(L+3). Now,
+//
// P_big * ulp(x) >= 2^(L+3) * 2^(m-63)
-// >= 2^(65-m + m-63 )
-// >= 2^2
-//
+// >= 2^(65-m + m-63 )
+// >= 2^2
+//
// Thus, P_big * x is an integer of the form 4*K. So
-//
-// x = 4*K * (pi/2) + x*(P_0 + P_1 + P_2 + P_3 + P_4)*(pi/2)
+//
+// x = 4*K * (pi/2) + x*(P_0 + P_1 + P_2 + P_3 + P_4)*(pi/2)
// + x*P_small*(pi/2).
-//
+//
// Hence, P_big*x corresponds to information that can be ignored for
// trigonometic function evaluation.
-//
+//
// Next, we must estimate the effect of ignoring P_small. The absolute
// error made by ignoring P_small is bounded by
-//
+//
// |P_small * x| <= ulp(P_4) * x
-// <= 2^(L-255) * 2^(m+1)
-// <= 2^(62-m-255 + m + 1)
-// <= 2^(-192)
-//
-// Since for double-extended precision, x * 2/pi = integer + f,
+// <= 2^(L-255) * 2^(m+1)
+// <= 2^(62-m-255 + m + 1)
+// <= 2^(-192)
+//
+// Since for double-extended precision, x * 2/pi = integer + f,
// 0.5 >= |f| >= 2^(-75), the relative error introduced by ignoring
// P_small is bounded by 2^(-192+75) <= 2^(-117), which is acceptable.
-//
+//
// Further note that if x is split into x_hi + x_lo where x_lo is the
// two bits corresponding to bit positions 2^(m-62) and 2^(m-63); then
-//
-// P_0 * x_hi
-//
+//
+// P_0 * x_hi
+//
// is also an integer of the form 4*K; and thus can also be ignored.
// Let M := P_0 * x_lo which is a small integer. The main part of the
// calculation is really the multiplication of x with the four pieces
// P_1, P_2, P_3, and P_4.
-//
+//
// Unless the reduced argument is extremely small in magnitude, it
// suffices to carry out the multiplication of x with P_1, P_2, and
-// P_3. x*P_4 will be carried out and added on as a correction only
+// P_3. x*P_4 will be carried out and added on as a correction only
// when it is found to be needed. Note also that x*P_4 need not be
// computed exactly. A straightforward multiplication suffices since
// the rounding error thus produced would be bounded by 2^(-3*64),
// that is 2^(-192) which is small enough as the reduced argument
// is bounded from below by 2^(-75).
-//
+//
// Now that we have four 64-bit data representing 2/pi and a
// 64-bit x. We first need to calculate a highly accurate product
// of x and P_1, P_2, P_3. This is best understood as integer
// multiplication.
-//
-//
+//
+//
// STEP 1. Multiplication
// ----------------------
-//
-//
+//
+//
// --------- --------- ---------
-// | P_1 | | P_2 | | P_3 |
-// --------- --------- ---------
-//
+// | P_1 | | P_2 | | P_3 |
+// --------- --------- ---------
+//
+// ---------
+// X | X |
// ---------
-// X | X |
-// ---------
// ----------------------------------------------------
//
// --------- ---------
-// | A_hi | | A_lo |
-// --------- ---------
+// | A_hi | | A_lo |
+// --------- ---------
//
//
// --------- ---------
-// | B_hi | | B_lo |
-// --------- ---------
+// | B_hi | | B_lo |
+// --------- ---------
//
//
-// --------- ---------
-// | C_hi | | C_lo |
-// --------- ---------
+// --------- ---------
+// | C_hi | | C_lo |
+// --------- ---------
//
// ====================================================
// --------- --------- --------- ---------
-// | S_0 | | S_1 | | S_2 | | S_3 |
-// --------- --------- --------- ---------
+// | S_0 | | S_1 | | S_2 | | S_3 |
+// --------- --------- --------- ---------
//
//
//
// STEP 2. Get N and f
// -------------------
-//
+//
// Conceptually, after the individual pieces S_0, S_1, ..., are obtained,
// we have to sum them and obtain an integer part, N, and a fraction, f.
// Here, |f| <= 1/2, and N is an integer. Note also that N need only to
// be known to module 2^k, k >= 2. In the case when |f| is small enough,
// we would need to add in the value x*P_4.
-//
-//
+//
+//
// STEP 3. Get reduced argument
// ----------------------------
-//
+//
// The value f is not yet the reduced argument that we seek. The
// equation
-//
-// x * 2/pi = 4K + N + f
-//
+//
+// x * 2/pi = 4K + N + f
+//
// says that
-//
+//
// x = 2*K*pi + N * pi/2 + f * (pi/2).
-//
+//
// Thus, the reduced argument is given by
-//
-// reduced argument = f * pi/2.
-//
+//
+// reduced argument = f * pi/2.
+//
// This multiplication must be performed to extra precision.
-//
+//
// IV. Implementation
// ==================
-//
+//
// Step 0. Initialization
// ----------------------
-//
+//
// Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x.
-//
+//
// In memory, 2/pi is stored contiguously as
-//
+//
// 0x00000000 0x00000000 0xA2F....
// ^
// |__ implied binary bit
-//
+//
// Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m. Thus
// -16321 <= L <= -1. We fetch from memory 5 integer pieces of data.
-//
+//
// P_0 is the two bits corresponding to bit positions L+2 and L+1
// P_1 is the 64-bit starting at bit position L
// P_2 is the 64-bit starting at bit position L-64
// P_3 is the 64-bit starting at bit position L-128
// P_4 is the 64-bit starting at bit position L-192
-//
+//
// For example, if m = 63, P_0 would be 0 and P_1 would look like
// 0xA2F...
-//
+//
// If m = 65, P_0 would be the two msb of 0xA, thus, P_0 is 10 in binary.
-// P_1 in binary would be 1 0 0 0 1 0 1 1 1 1 ....
-//
+// P_1 in binary would be 1 0 0 0 1 0 1 1 1 1 ....
+//
// Step 1. Multiplication
// ----------------------
-//
+//
// At this point, P_1, P_2, P_3, P_4 are integers. They are
// supposed to be interpreted as
-//
+//
// 2^(L-63) * P_1;
// 2^(L-63-64) * P_2;
// 2^(L-63-128) * P_3;
// 2^(L-63-192) * P_4;
-//
+//
// Since each of them need to be multiplied to x, we would scale
// both x and the P_j's by some convenient factors: scale each
// of P_j's up by 2^(63-L), and scale x down by 2^(L-63).
-//
+//
// p_1 := fcvt.xf ( P_1 )
// p_2 := fcvt.xf ( P_2 ) * 2^(-64)
// p_3 := fcvt.xf ( P_3 ) * 2^(-128)
@@ -325,30 +331,30 @@
// x := replace exponent of x by -1
// because 2^m * 1.xxxx...xxx * 2^(L-63)
// is 2^(-1) * 1.xxxx...xxx
-//
+//
// We are now faced with the task of computing the following
-//
+//
// --------- --------- ---------
-// | P_1 | | P_2 | | P_3 |
-// --------- --------- ---------
-//
+// | P_1 | | P_2 | | P_3 |
+// --------- --------- ---------
+//
// ---------
-// X | X |
-// ---------
+// X | X |
+// ---------
// ----------------------------------------------------
-//
+//
// --------- ---------
-// | A_hi | | A_lo |
-// --------- ---------
-//
+// | A_hi | | A_lo |
+// --------- ---------
+//
// --------- ---------
-// | B_hi | | B_lo |
-// --------- ---------
-//
-// --------- ---------
-// | C_hi | | C_lo |
-// --------- ---------
-//
+// | B_hi | | B_lo |
+// --------- ---------
+//
+// --------- ---------
+// | C_hi | | C_lo |
+// --------- ---------
+//
// ====================================================
// ----------- --------- --------- ---------
// | S_0 | | S_1 | | S_2 | | S_3 |
@@ -357,108 +363,108 @@
// | |___ binary point
// |
// |___ possibly one more bit
-//
+//
// Let FPSR3 be set to round towards zero with widest precision
-// and exponent range. Unless an explicit FPSR is given,
+// and exponent range. Unless an explicit FPSR is given,
// round-to-nearest with widest precision and exponent range is
// used.
-//
+//
// Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_A := 2^(-65).
-//
+//
// Tmp_C := fmpy.fpsr3( x, p_1 );
// If Tmp_C >= sigma_C then
// C_hi := Tmp_C;
// C_lo := x*p_1 - C_hi ...fma, exact
// Else
// C_hi := fadd.fpsr3(sigma_C, Tmp_C) - sigma_C
-// ...subtraction is exact, regardless
-// ...of rounding direction
+// ...subtraction is exact, regardless
+// ...of rounding direction
// C_lo := x*p_1 - C_hi ...fma, exact
// End If
-//
+//
// Tmp_B := fmpy.fpsr3( x, p_2 );
// If Tmp_B >= sigma_B then
// B_hi := Tmp_B;
// B_lo := x*p_2 - B_hi ...fma, exact
// Else
// B_hi := fadd.fpsr3(sigma_B, Tmp_B) - sigma_B
-// ...subtraction is exact, regardless
-// ...of rounding direction
+// ...subtraction is exact, regardless
+// ...of rounding direction
// B_lo := x*p_2 - B_hi ...fma, exact
// End If
-//
+//
// Tmp_A := fmpy.fpsr3( x, p_3 );
// If Tmp_A >= sigma_A then
// A_hi := Tmp_A;
// A_lo := x*p_3 - A_hi ...fma, exact
// Else
// A_hi := fadd.fpsr3(sigma_A, Tmp_A) - sigma_A
-// ...subtraction is exact, regardless
-// ...of rounding direction
+// ...subtraction is exact, regardless
+// ...of rounding direction
// A_lo := x*p_3 - A_hi ...fma, exact
// End If
-//
+//
// ...Note that C_hi is of integer value. We need only the
-// ...last few bits. Thus we can ensure C_hi is never a big
+// ...last few bits. Thus we can ensure C_hi is never a big
// ...integer, freeing us from overflow worry.
-//
+//
// Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70);
// ...Tmp_C is the upper portion of C_hi
// C_hi := C_hi - Tmp_C
// ...0 <= C_hi < 2^7
-//
+//
// Step 2. Get N and f
// -------------------
-//
-// At this point, we have all the components to obtain
+//
+// At this point, we have all the components to obtain
// S_0, S_1, S_2, S_3 and thus N and f. We start by adding
// C_lo and B_hi. This sum together with C_hi gives a good
-// estimation of N and f.
-//
+// estimation of N and f.
+//
// A := fadd.fpsr3( B_hi, C_lo )
// B := max( B_hi, C_lo )
// b := min( B_hi, C_lo )
-//
-// a := (B - A) + b ...exact. Note that a is either 0
-// ...or 2^(-64).
-//
+//
+// a := (B - A) + b ...exact. Note that a is either 0
+// ...or 2^(-64).
+//
// N := round_to_nearest_integer_value( A );
-// f := A - N; ...exact because lsb(A) >= 2^(-64)
-// ...and |f| <= 1/2.
-//
-// f := f + a ...exact because a is 0 or 2^(-64);
-// ...the msb of the sum is <= 1/2
-// ...lsb >= 2^(-64).
-//
+// f := A - N; ...exact because lsb(A) >= 2^(-64)
+// ...and |f| <= 1/2.
+//
+// f := f + a ...exact because a is 0 or 2^(-64);
+// ...the msb of the sum is <= 1/2
+// ...lsb >= 2^(-64).
+//
// N := convert to integer format( C_hi + N );
// M := P_0 * x_lo;
// N := N + M;
-//
+//
// If sgn_x == 1 (that is original x was negative)
// N := 2^10 - N
// ...this maintains N to be non-negative, but still
// ...equivalent to the (negated N) mod 4.
// End If
-//
+//
// If |f| >= 2^(-33)
-//
+//
// ...Case 1
// CASE := 1
// g := A_hi + B_lo;
// s_hi := f + g;
// s_lo := (f - s_hi) + g;
-//
+//
// Else
-//
+//
// ...Case 2
// CASE := 2
// A := fadd.fpsr3( A_hi, B_lo )
// B := max( A_hi, B_lo )
// b := min( A_hi, B_lo )
-//
-// a := (B - A) + b ...exact. Note that a is either 0
-// ...or 2^(-128).
-//
+//
+// a := (B - A) + b ...exact. Note that a is either 0
+// ...or 2^(-128).
+//
// f_hi := A + f;
// f_lo := (f - f_hi) + A;
// ...this is exact.
@@ -468,9 +474,9 @@
// ...If f = 2^(-64), f-f_hi involves cancellation and is
// ...exact. If f = -2^(-64), then A + f is exact. Hence
// ...f-f_hi is -A exactly, giving f_lo = 0.
-//
+//
// f_lo := f_lo + a;
-//
+//
// If |f| >= 2^(-50) then
// s_hi := f_hi;
// s_lo := f_lo;
@@ -479,117 +485,111 @@
// s_hi := f_hi + f_lo
// s_lo := (f_hi - s_hi) + f_lo
// End If
-//
+//
// End If
-//
+//
// Step 3. Get reduced argument
// ----------------------------
-//
+//
// If sgn_x == 0 (that is original x is positive)
-//
+//
// D_hi := Pi_by_2_hi
// D_lo := Pi_by_2_lo
// ...load from table
-//
+//
// Else
-//
+//
// D_hi := neg_Pi_by_2_hi
// D_lo := neg_Pi_by_2_lo
// ...load from table
// End If
-//
+//
// r_hi := s_hi*D_hi
-// r_lo := s_hi*D_hi - r_hi ...fma
+// r_lo := s_hi*D_hi - r_hi ...fma
// r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi
-//
-// Return CASE, N, r_hi, r_lo
-//
-
-#include "libm_support.h"
-
-FR_X = f32
-FR_N = f33
-FR_p_1 = f34
-FR_TWOM33 = f35
-FR_TWOM50 = f36
-FR_g = f37
-FR_p_2 = f38
-FR_f = f39
-FR_s_lo = f40
-FR_p_3 = f41
-FR_f_abs = f42
-FR_D_lo = f43
-FR_p_4 = f44
-FR_D_hi = f45
-FR_Tmp2_C = f46
-FR_s_hi = f47
-FR_sigma_A = f48
-FR_A = f49
-FR_sigma_B = f50
-FR_B = f51
-FR_sigma_C = f52
-FR_b = f53
-FR_ScaleP2 = f54
-FR_ScaleP3 = f55
-FR_ScaleP4 = f56
-FR_Tmp_A = f57
-FR_Tmp_B = f58
-FR_Tmp_C = f59
-FR_A_hi = f60
-FR_f_hi = f61
-FR_r_hi = f62
-FR_A_lo = f63
-FR_B_hi = f64
-FR_a = f65
-FR_B_lo = f66
+//
+// Return N, r_hi, r_lo
+//
+FR_input_X = f8
+FR_r_hi = f8
+FR_r_lo = f9
+
+FR_X = f32
+FR_N = f33
+FR_p_1 = f34
+FR_TWOM33 = f35
+FR_TWOM50 = f36
+FR_g = f37
+FR_p_2 = f38
+FR_f = f39
+FR_s_lo = f40
+FR_p_3 = f41
+FR_f_abs = f42
+FR_D_lo = f43
+FR_p_4 = f44
+FR_D_hi = f45
+FR_Tmp2_C = f46
+FR_s_hi = f47
+FR_sigma_A = f48
+FR_A = f49
+FR_sigma_B = f50
+FR_B = f51
+FR_sigma_C = f52
+FR_b = f53
+FR_ScaleP2 = f54
+FR_ScaleP3 = f55
+FR_ScaleP4 = f56
+FR_Tmp_A = f57
+FR_Tmp_B = f58
+FR_Tmp_C = f59
+FR_A_hi = f60
+FR_f_hi = f61
+FR_RSHF = f62
+FR_A_lo = f63
+FR_B_hi = f64
+FR_a = f65
+FR_B_lo = f66
FR_f_lo = f67
-FR_r_lo = f68
-FR_C_hi = f69
-FR_C_lo = f70
+FR_N_fix = f68
+FR_C_hi = f69
+FR_C_lo = f70
GR_N = r8
-GR_Address_of_Input = r32
-GR_Address_of_Outputs = r33
-GR_Exp_x = r36
-GR_Temp = r37
-GR_BIASL63 = r38
+GR_Exp_x = r36
+GR_Temp = r37
+GR_BIASL63 = r38
GR_CASE = r39
-GR_x_lo = r40
-GR_sgn_x = r41
+GR_x_lo = r40
+GR_sgn_x = r41
GR_M = r42
GR_BASE = r43
GR_LENGTH1 = r44
GR_LENGTH2 = r45
GR_ASUB = r46
GR_P_0 = r47
-GR_P_1 = r48
-GR_P_2 = r49
-GR_P_3 = r50
-GR_P_4 = r51
+GR_P_1 = r48
+GR_P_2 = r49
+GR_P_3 = r50
+GR_P_4 = r51
GR_START = r52
GR_SEGMENT = r53
GR_A = r54
-GR_B = r55
+GR_B = r55
GR_C = r56
GR_D = r57
GR_E = r58
-GR_TEMP1 = r59
-GR_TEMP2 = r60
-GR_TEMP3 = r61
-GR_TEMP4 = r62
+GR_TEMP1 = r59
+GR_TEMP2 = r60
+GR_TEMP3 = r61
+GR_TEMP4 = r62
GR_TEMP5 = r63
GR_TEMP6 = r64
+GR_rshf = r64
+RODATA
.align 64
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
-
-Constants_Bits_of_2_by_pi:
-ASM_TYPE_DIRECTIVE(Constants_Bits_of_2_by_pi,@object)
+LOCAL_OBJECT_START(Constants_Bits_of_2_by_pi)
data8 0x0000000000000000,0xA2F9836E4E441529
data8 0xFC2757D1F534DDC0,0xDB6295993C439041
data8 0xFE5163ABDEBBC561,0xB7246E3A424DD2E0
@@ -721,34 +721,33 @@ data8 0xB5D6DF8261DD9602,0x36169F3AC4A1A283
data8 0x6DED727A8D39A9B8,0x825C326B5B2746ED
data8 0x34007700D255F4FC,0x4D59018071E0E13F
data8 0x89B295F364A8F1AE,0xA74B38FC4CEAB2BB
-ASM_SIZE_DIRECTIVE(Constants_Bits_of_2_by_pi)
+LOCAL_OBJECT_END(Constants_Bits_of_2_by_pi)
-Constants_Bits_of_pi_by_2:
-ASM_TYPE_DIRECTIVE(Constants_Bits_of_pi_by_2,@object)
-data4 0x2168C234,0xC90FDAA2,0x00003FFF,0x00000000
-data4 0x80DC1CD1,0xC4C6628B,0x00003FBF,0x00000000
-ASM_SIZE_DIRECTIVE(Constants_Bits_of_pi_by_2)
+LOCAL_OBJECT_START(Constants_Bits_of_pi_by_2)
+data8 0xC90FDAA22168C234,0x00003FFF
+data8 0xC4C6628B80DC1CD1,0x00003FBF
+LOCAL_OBJECT_END(Constants_Bits_of_pi_by_2)
.section .text
-.proc __libm_pi_by_2_reduce#
.global __libm_pi_by_2_reduce#
-.align 64
+.proc __libm_pi_by_2_reduce#
+.align 32
-__libm_pi_by_2_reduce:
+__libm_pi_by_2_reduce:
-// X is at the address in Address_of_Input
-// Place the two-piece result at the address in Address_of_Outputs
-// r followed by c
-// N is returned
+// X is in f8
+// Place the two-piece result r (r_hi) in f8 and c (r_lo) in f9
+// N is returned in r8
-{ .mmf
-alloc r34 = ar.pfs,2,34,0,0
-(p0) ldfe FR_X = [GR_Address_of_Input]
-(p0) fsetc.s3 0x00,0x7F ;;
+{ .mfi
+ alloc r34 = ar.pfs,2,34,0,0
+ fsetc.s3 0x00,0x7F // Set sf3 to round to zero, 82-bit prec, td, ftz
+ nop.i 999
}
-{ .mlx
- nop.m 999
-(p0) movl GR_BIASL63 = 0x1003E
+{ .mfi
+ addl GR_BASE = @ltoff(Constants_Bits_of_2_by_pi#), gp
+ nop.f 999
+ mov GR_BIASL63 = 0x1003E
}
;;
@@ -765,73 +764,61 @@ alloc r34 = ar.pfs,2,34,0,0
// Address_BASE = shladd(SEGMENT,3) + BASE
-
{ .mmi
- nop.m 999
-(p0) addl GR_BASE = @ltoff(Constants_Bits_of_2_by_pi#), gp
- nop.i 999
+ getf.exp GR_Exp_x = FR_input_X
+ ld8 GR_BASE = [GR_BASE]
+ mov GR_TEMP5 = 0x0FFFE
}
;;
+// Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_A := 2^(-65).
{ .mmi
- ld8 GR_BASE = [GR_BASE]
- nop.m 999
+ getf.sig GR_x_lo = FR_input_X
+ mov GR_TEMP6 = 0x0FFBE
nop.i 999
}
;;
-
-{ .mlx
- nop.m 999
-(p0) movl GR_TEMP5 = 0x000000000000FFFE
-}
-{ .mmi
- nop.m 999 ;;
-(p0) setf.exp FR_sigma_B = GR_TEMP5
- nop.i 999
-}
-{ .mlx
- nop.m 999
-(p0) movl GR_TEMP6 = 0x000000000000FFBE ;;
-}
-// Define sigma_C := 2^63; sigma_B := 2^(-1); sigma_A := 2^(-65).
-{ .mfi
-(p0) setf.exp FR_sigma_A = GR_TEMP6
- nop.f 999
- nop.i 999 ;;
-}
-// Special Code for testing DE arguments
-// (p0) movl GR_BIASL63 = 0x0000000000013FFE
-// (p0) movl GR_x_lo = 0xFFFFFFFFFFFFFFFF
-// (p0) setf.exp FR_X = GR_BIASL63
-// (p0) setf.sig FR_ScaleP3 = GR_x_lo
-// (p0) fmerge.se FR_X = FR_X,FR_ScaleP3
+// Special Code for testing DE arguments
+// movl GR_BIASL63 = 0x0000000000013FFE
+// movl GR_x_lo = 0xFFFFFFFFFFFFFFFF
+// setf.exp FR_X = GR_BIASL63
+// setf.sig FR_ScaleP3 = GR_x_lo
+// fmerge.se FR_X = FR_X,FR_ScaleP3
// Set sgn_x := sign(x); x := |x|; x_lo := 2 lsb of x.
// 2/pi is stored contiguously as
// 0x00000000 0x00000000.0xA2F....
// M = EXP - BIAS ( M >= 63)
// Given x = 2^m * 1.xxxx...xxx; we calculate L := 62 - m.
// Thus -16321 <= L <= -1.
-{ .mmf
-(p0) getf.exp GR_Exp_x = FR_X
-(p0) getf.sig GR_x_lo = FR_X
-(p0) fabs FR_X = FR_X ;;
+{ .mmi
+ setf.exp FR_sigma_B = GR_TEMP5
+ setf.exp FR_sigma_A = GR_TEMP6
+ extr.u GR_M = GR_Exp_x,0,17
}
+;;
+
{ .mii
-(p0) and GR_x_lo = 0x03,GR_x_lo
-(p0) extr.u GR_M = GR_Exp_x,0,17 ;;
-(p0) sub GR_START = GR_M,GR_BIASL63
+ and GR_x_lo = 0x03,GR_x_lo
+ sub GR_START = GR_M,GR_BIASL63
+ add GR_BASE = 8,GR_BASE // To effectively add 1 to SEGMENT
}
-{ .mmi
- nop.m 999 ;;
-(p0) and GR_LENGTH1 = 0x3F,GR_START
-(p0) shr.u GR_SEGMENT = GR_START,6
+;;
+
+{ .mii
+ and GR_LENGTH1 = 0x3F,GR_START
+ shr.u GR_SEGMENT = GR_START,6
+ nop.i 999
}
+;;
+
{ .mmi
- nop.m 999 ;;
-(p0) add GR_SEGMENT = 0x1,GR_SEGMENT
-(p0) sub GR_LENGTH2 = 0x40,GR_LENGTH1
+ shladd GR_BASE = GR_SEGMENT,3,GR_BASE
+ sub GR_LENGTH2 = 0x40,GR_LENGTH1
+ cmp.le p6,p7 = 0x2,GR_LENGTH1
}
+;;
+
// P_0 is the two bits corresponding to bit positions L+2 and L+1
// P_1 is the 64-bit starting at bit position L
// P_2 is the 64-bit starting at bit position L-64
@@ -849,13 +836,13 @@ alloc r34 = ar.pfs,2,34,0,0
// P_4 is made up of Clo and Dhi
// P_4 = deposit Dlo, position 0, length2 into P_4, position length1
// deposit Ehi, position length2, length1 into P_4, position 0
-{ .mmi
-(p0) cmp.le.unc p6,p7 = 0x2,GR_LENGTH1 ;;
-(p0) shladd GR_BASE = GR_SEGMENT,3,GR_BASE
-(p7) cmp.eq.unc p8,p9 = 0x1,GR_LENGTH1 ;;
+{ .mfi
+ ld8 GR_A = [GR_BASE],8
+ fabs FR_X = FR_input_X
+(p7) cmp.eq.unc p8,p9 = 0x1,GR_LENGTH1
}
-{ .mmi
- nop.m 999
+;;
+
// ld_64 A at Base and increment Base by 8
// ld_64 B at Base and increment Base by 8
// ld_64 C at Base and increment Base by 8
@@ -866,31 +853,35 @@ alloc r34 = ar.pfs,2,34,0,0
// A, B, C, D, and E look like | length1 | length2 |
// ---------------------
// hi lo
-(p0) ld8 GR_A = [GR_BASE],8
-(p0) extr.u GR_sgn_x = GR_Exp_x,17,1 ;;
-}
-{ .mmf
- nop.m 999
-(p0) ld8 GR_B = [GR_BASE],8
-(p0) fmerge.se FR_X = FR_sigma_B,FR_X ;;
+{ .mlx
+ ld8 GR_B = [GR_BASE],8
+ movl GR_rshf = 0x43e8000000000000 // 1.10000 2^63 for right shift N_fix
}
-{ .mii
-(p0) ld8 GR_C = [GR_BASE],8
-(p8) extr.u GR_Temp = GR_A,63,1 ;;
-(p0) shl GR_TEMP1 = GR_A,GR_LENGTH1
+;;
+
+{ .mmi
+ ld8 GR_C = [GR_BASE],8
+ nop.m 999
+(p8) extr.u GR_Temp = GR_A,63,1
}
-{ .mii
-(p0) ld8 GR_D = [GR_BASE],8
+;;
+
// If length1 >= 2,
// P_0 = deposit Ahi, position length2, 2 bit into P_0 at position 0.
-(p6) shr.u GR_P_0 = GR_A,GR_LENGTH2 ;;
-(p0) shl GR_TEMP2 = GR_B,GR_LENGTH1
+{ .mii
+ ld8 GR_D = [GR_BASE],8
+ shl GR_TEMP1 = GR_A,GR_LENGTH1 // MM instruction
+(p6) shr.u GR_P_0 = GR_A,GR_LENGTH2 // MM instruction
}
+;;
+
{ .mii
-(p0) ld8 GR_E = [GR_BASE],-40
-(p0) shr.u GR_P_1 = GR_B,GR_LENGTH2 ;;
-(p0) shr.u GR_P_2 = GR_C,GR_LENGTH2
+ ld8 GR_E = [GR_BASE],-40
+ shl GR_TEMP2 = GR_B,GR_LENGTH1 // MM instruction
+ shr.u GR_P_1 = GR_B,GR_LENGTH2 // MM instruction
}
+;;
+
// Else
// Load 16 bit of ASUB from (Base_Address_of_A - 2)
// P_0 = ASUB & 0x3
@@ -900,43 +891,56 @@ alloc r34 = ar.pfs,2,34,0,0
// Deposit element 63 from Ahi and place in element 0 of P_0.
// Endif
// Endif
+
{ .mii
(p7) ld2 GR_ASUB = [GR_BASE],8
-(p0) shl GR_TEMP3 = GR_C,GR_LENGTH1 ;;
-(p0) shl GR_TEMP4 = GR_D,GR_LENGTH1
+ shl GR_TEMP3 = GR_C,GR_LENGTH1 // MM instruction
+ shr.u GR_P_2 = GR_C,GR_LENGTH2 // MM instruction
}
+;;
+
{ .mii
- nop.m 999
-(p0) shr.u GR_P_3 = GR_D,GR_LENGTH2 ;;
-(p0) shr.u GR_P_4 = GR_E,GR_LENGTH2
+ setf.d FR_RSHF = GR_rshf // Form right shift const 1.100 * 2^63
+ shl GR_TEMP4 = GR_D,GR_LENGTH1 // MM instruction
+ shr.u GR_P_3 = GR_D,GR_LENGTH2 // MM instruction
}
-{ .mii
+;;
+
+{ .mmi
(p7) and GR_P_0 = 0x03,GR_ASUB
-(p6) and GR_P_0 = 0x03,GR_P_0 ;;
-(p0) or GR_P_1 = GR_P_1,GR_TEMP1
+(p6) and GR_P_0 = 0x03,GR_P_0
+ shr.u GR_P_4 = GR_E,GR_LENGTH2 // MM instruction
}
+;;
+
{ .mmi
-(p8) and GR_P_0 = 0x1,GR_P_0 ;;
-(p0) or GR_P_2 = GR_P_2,GR_TEMP2
-(p8) shl GR_P_0 = GR_P_0,0x1 ;;
-}
-{ .mii
- nop.m 999
-(p0) or GR_P_3 = GR_P_3,GR_TEMP3
-(p8) or GR_P_0 = GR_P_0,GR_Temp
+ nop.m 999
+ or GR_P_1 = GR_P_1,GR_TEMP1
+(p8) and GR_P_0 = 0x1,GR_P_0
}
+;;
+
{ .mmi
-(p0) setf.sig FR_p_1 = GR_P_1 ;;
-(p0) setf.sig FR_p_2 = GR_P_2
-(p0) or GR_P_4 = GR_P_4,GR_TEMP4 ;;
+ setf.sig FR_p_1 = GR_P_1
+ or GR_P_2 = GR_P_2,GR_TEMP2
+(p8) shladd GR_P_0 = GR_P_0,1,GR_Temp
}
+;;
+
+{ .mmf
+ setf.sig FR_p_2 = GR_P_2
+ or GR_P_3 = GR_P_3,GR_TEMP3
+ fmerge.se FR_X = FR_sigma_B,FR_X
+}
+;;
+
{ .mmi
- nop.m 999 ;;
-(p0) setf.sig FR_p_3 = GR_P_3
-(p0) pmpy2.r GR_M = GR_P_0,GR_x_lo
+ setf.sig FR_p_3 = GR_P_3
+ or GR_P_4 = GR_P_4,GR_TEMP4
+ pmpy2.r GR_M = GR_P_0,GR_x_lo
}
-{ .mlx
-(p0) setf.sig FR_p_4 = GR_P_4
+;;
+
// P_1, P_2, P_3, P_4 are integers. They should be
// 2^(L-63) * P_1;
// 2^(L-63-64) * P_2;
@@ -954,18 +958,18 @@ alloc r34 = ar.pfs,2,34,0,0
// | P_1 | | P_2 | | P_3 |
// --------- --------- ---------
// ---------
-// X | X |
-// ---------
+// X | X |
+// ---------
// ----------------------------------------------------
// --------- ---------
-// | A_hi | | A_lo |
-// --------- ---------
+// | A_hi | | A_lo |
+// --------- ---------
// --------- ---------
-// | B_hi | | B_lo |
-// --------- ---------
+// | B_hi | | B_lo |
+// --------- ---------
+// --------- ---------
+// | C_hi | | C_lo |
// --------- ---------
-// | C_hi | | C_lo |
-// --------- ---------
// ====================================================
// ----------- --------- --------- ---------
// | S_0 | | S_1 | | S_2 | | S_3 |
@@ -977,52 +981,55 @@ alloc r34 = ar.pfs,2,34,0,0
// and exponent range. Unless an explicit FPSR is given,
// round-to-nearest with widest precision and exponent range is
// used.
-(p0) movl GR_TEMP1 = 0x000000000000FFBF
-}
{ .mmi
- nop.m 999 ;;
-(p0) setf.exp FR_ScaleP2 = GR_TEMP1
- nop.i 999
-}
-{ .mlx
- nop.m 999
-(p0) movl GR_TEMP4 = 0x000000000001003E
+ setf.sig FR_p_4 = GR_P_4
+ mov GR_TEMP1 = 0x0FFBF
+ nop.i 999
}
+;;
+
{ .mmi
- nop.m 999 ;;
-(p0) setf.exp FR_sigma_C = GR_TEMP4
- nop.i 999
+ setf.exp FR_ScaleP2 = GR_TEMP1
+ mov GR_TEMP2 = 0x0FF7F
+ nop.i 999
}
-{ .mlx
- nop.m 999
-(p0) movl GR_TEMP2 = 0x000000000000FF7F ;;
+;;
+
+{ .mmi
+ setf.exp FR_ScaleP3 = GR_TEMP2
+ mov GR_TEMP4 = 0x1003E
+ nop.i 999
}
+;;
+
{ .mmf
- nop.m 999
-(p0) setf.exp FR_ScaleP3 = GR_TEMP2
-(p0) fcvt.xuf.s1 FR_p_1 = FR_p_1 ;;
+ setf.exp FR_sigma_C = GR_TEMP4
+ mov GR_Temp = 0x0FFDE
+ fcvt.xuf.s1 FR_p_1 = FR_p_1
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fcvt.xuf.s1 FR_p_2 = FR_p_2
- nop.i 999
-}
-{ .mlx
- nop.m 999
-(p0) movl GR_Temp = 0x000000000000FFDE ;;
-}
-{ .mmf
- nop.m 999
-(p0) setf.exp FR_TWOM33 = GR_Temp
-(p0) fcvt.xuf.s1 FR_p_3 = FR_p_3 ;;
+ setf.exp FR_TWOM33 = GR_Temp
+ fcvt.xuf.s1 FR_p_2 = FR_p_2
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fcvt.xuf.s1 FR_p_4 = FR_p_4
- nop.i 999 ;;
+ nop.m 999
+ fcvt.xuf.s1 FR_p_3 = FR_p_3
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
+ nop.m 999
+ fcvt.xuf.s1 FR_p_4 = FR_p_4
+ nop.i 999
+}
+;;
+
// Tmp_C := fmpy.fpsr3( x, p_1 );
// Tmp_B := fmpy.fpsr3( x, p_2 );
// Tmp_A := fmpy.fpsr3( x, p_3 );
@@ -1048,55 +1055,62 @@ alloc r34 = ar.pfs,2,34,0,0
// Exact, regardless ...of rounding direction
// A_lo := x*p_3 - A_hi ...fma, exact
// Endif
-(p0) fmpy.s3 FR_Tmp_C = FR_X,FR_p_1
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-(p0) fmpy.s1 FR_p_2 = FR_p_2,FR_ScaleP2
- nop.i 999
-}
-{ .mlx
- nop.m 999
-(p0) movl GR_Temp = 0x0000000000000400
+ nop.m 999
+ fmpy.s3 FR_Tmp_C = FR_X,FR_p_1
+ nop.i 999
}
-{ .mlx
- nop.m 999
-(p0) movl GR_TEMP3 = 0x000000000000FF3F ;;
+;;
+
+{ .mfi
+ mov GR_TEMP3 = 0x0FF3F
+ fmpy.s1 FR_p_2 = FR_p_2,FR_ScaleP2
+ nop.i 999
}
+;;
+
{ .mmf
- nop.m 999
-(p0) setf.exp FR_ScaleP4 = GR_TEMP3
-(p0) fmpy.s1 FR_p_3 = FR_p_3,FR_ScaleP3 ;;
+ setf.exp FR_ScaleP4 = GR_TEMP3
+ mov GR_TEMP4 = 0x10045
+ fmpy.s1 FR_p_3 = FR_p_3,FR_ScaleP3
}
-{ .mlx
- nop.m 999
-(p0) movl GR_TEMP4 = 0x0000000000010045 ;;
+;;
+
+{ .mfi
+ nop.m 999
+ fadd.s3 FR_C_hi = FR_sigma_C,FR_Tmp_C // For Tmp_C < sigma_C case
+ nop.i 999
}
+;;
+
{ .mmf
- nop.m 999
-(p0) setf.exp FR_Tmp2_C = GR_TEMP4
-(p0) fmpy.s3 FR_Tmp_B = FR_X,FR_p_2 ;;
+ setf.exp FR_Tmp2_C = GR_TEMP4
+ nop.m 999
+ fmpy.s3 FR_Tmp_B = FR_X,FR_p_2
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fcmp.ge.unc.s1 p12, p9 = FR_Tmp_C,FR_sigma_C
- nop.i 999 ;;
+ addl GR_BASE = @ltoff(Constants_Bits_of_pi_by_2#), gp
+ fcmp.ge.s1 p12, p9 = FR_Tmp_C,FR_sigma_C
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fmpy.s3 FR_Tmp_A = FR_X,FR_p_3
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s3 FR_Tmp_A = FR_X,FR_p_3
+ nop.i 99
}
+;;
+
{ .mfi
- nop.m 999
+ ld8 GR_BASE = [GR_BASE]
(p12) mov FR_C_hi = FR_Tmp_C
- nop.i 999 ;;
+ nop.i 999
}
{ .mfi
-(p0) addl GR_BASE = @ltoff(Constants_Bits_of_pi_by_2#), gp
-(p9) fadd.s3 FR_C_hi = FR_sigma_C,FR_Tmp_C
- nop.i 999
+ nop.m 999
+(p9) fsub.s1 FR_C_hi = FR_C_hi,FR_sigma_C
+ nop.i 999
}
;;
@@ -1114,97 +1128,106 @@ alloc r34 = ar.pfs,2,34,0,0
// Load from table
// End If
-
-{ .mmi
- ld8 GR_BASE = [GR_BASE]
+{ .mfi
nop.m 999
+ fmpy.s1 FR_p_4 = FR_p_4,FR_ScaleP4
nop.i 999
}
-;;
-
-
{ .mfi
-(p0) ldfe FR_D_hi = [GR_BASE],16
-(p0) fmpy.s1 FR_p_4 = FR_p_4,FR_ScaleP4
- nop.i 999 ;;
+ nop.m 999
+ fadd.s3 FR_B_hi = FR_sigma_B,FR_Tmp_B // For Tmp_B < sigma_B case
+ nop.i 999
}
+;;
+
{ .mfi
-(p0) ldfe FR_D_lo = [GR_BASE],0
-(p0) fcmp.ge.unc.s1 p13, p10 = FR_Tmp_B,FR_sigma_B
- nop.i 999 ;;
+ nop.m 999
+ fadd.s3 FR_A_hi = FR_sigma_A,FR_Tmp_A // For Tmp_A < sigma_A case
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p13) mov FR_B_hi = FR_Tmp_B
- nop.i 999
+ nop.m 999
+ fcmp.ge.s1 p13, p10 = FR_Tmp_B,FR_sigma_B
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p12) fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi
- nop.i 999 ;;
+ nop.m 999
+ fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p10) fadd.s3 FR_B_hi = FR_sigma_B,FR_Tmp_B
- nop.i 999
+ ldfe FR_D_hi = [GR_BASE],16
+ fcmp.ge.s1 p14, p11 = FR_Tmp_A,FR_sigma_A
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p9) fsub.s1 FR_C_hi = FR_C_hi,FR_sigma_C
- nop.i 999 ;;
+ ldfe FR_D_lo = [GR_BASE]
+(p13) mov FR_B_hi = FR_Tmp_B
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fcmp.ge.unc.s1 p14, p11 = FR_Tmp_A,FR_sigma_A
- nop.i 999 ;;
+ nop.m 999
+(p10) fsub.s1 FR_B_hi = FR_B_hi,FR_sigma_B
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
+ nop.m 999
(p14) mov FR_A_hi = FR_Tmp_A
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p11) fadd.s3 FR_A_hi = FR_sigma_A,FR_Tmp_A
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p9) fms.s1 FR_C_lo = FR_X,FR_p_1,FR_C_hi
-(p0) cmp.eq.unc p12,p9 = 0x1,GR_sgn_x
-}
-{ .mfi
- nop.m 999
-(p13) fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi
- nop.i 999 ;;
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p10) fsub.s1 FR_B_hi = FR_B_hi,FR_sigma_B
- nop.i 999
+ nop.m 999
+(p11) fsub.s1 FR_A_hi = FR_A_hi,FR_sigma_A
+ nop.i 999
}
-{ .mfi
- nop.m 999
+;;
+
// Note that C_hi is of integer value. We need only the
// last few bits. Thus we can ensure C_hi is never a big
// integer, freeing us from overflow worry.
// Tmp_C := fadd.fpsr3( C_hi, 2^(70) ) - 2^(70);
// Tmp_C is the upper portion of C_hi
-(p0) fadd.s3 FR_Tmp_C = FR_C_hi,FR_Tmp2_C
- nop.i 999 ;;
+{ .mfi
+ nop.m 999
+ fadd.s3 FR_Tmp_C = FR_C_hi,FR_Tmp2_C
+ tbit.z p12,p9 = GR_Exp_x, 17
}
+;;
+
{ .mfi
- nop.m 999
-(p14) fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi
- nop.i 999
+ nop.m 999
+ fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p11) fsub.s1 FR_A_hi = FR_A_hi,FR_sigma_A
- nop.i 999 ;;
+ nop.m 999
+ fadd.s3 FR_A = FR_B_hi,FR_C_lo
+ nop.i 999
}
+;;
+
+{ .mfi
+ nop.m 999
+ fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi
+ nop.i 999
+}
+;;
+
{ .mfi
- nop.m 999
+ nop.m 999
+ fsub.s1 FR_Tmp_C = FR_Tmp_C,FR_Tmp2_C
+ nop.i 999
+}
+;;
+
// *******************
// Step 2. Get N and f
// *******************
@@ -1215,168 +1238,213 @@ alloc r34 = ar.pfs,2,34,0,0
// A := fadd.fpsr3( B_hi, C_lo )
// B := max( B_hi, C_lo )
// b := min( B_hi, C_lo )
-(p0) fadd.s3 FR_A = FR_B_hi,FR_C_lo
- nop.i 999
-}
{ .mfi
- nop.m 999
-(p10) fms.s1 FR_B_lo = FR_X,FR_p_2,FR_B_hi
- nop.i 999 ;;
+ nop.m 999
+ fmax.s1 FR_B = FR_B_hi,FR_C_lo
+ nop.i 999
}
+;;
+
+// We use a right-shift trick to get the integer part of A into the rightmost
+// bits of the significand by adding 1.1000..00 * 2^63. This operation is good
+// if |A| < 2^61, which it is in this case. We are doing this to save a few
+// cycles over using fcvt.fx followed by fnorm. The second step of the trick
+// is to subtract the same constant to float the rounded integer into a fp reg.
+
{ .mfi
- nop.m 999
-(p0) fsub.s1 FR_Tmp_C = FR_Tmp_C,FR_Tmp2_C
- nop.i 999 ;;
+ nop.m 999
+// N := round_to_nearest_integer_value( A );
+ fma.s1 FR_N_fix = FR_A, f1, FR_RSHF
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fmax.s1 FR_B = FR_B_hi,FR_C_lo
- nop.i 999 ;;
+ nop.m 999
+ fmin.s1 FR_b = FR_B_hi,FR_C_lo
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fmin.s1 FR_b = FR_B_hi,FR_C_lo
- nop.i 999
+ nop.m 999
+// C_hi := C_hi - Tmp_C ...0 <= C_hi < 2^7
+ fsub.s1 FR_C_hi = FR_C_hi,FR_Tmp_C
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p11) fms.s1 FR_A_lo = FR_X,FR_p_3,FR_A_hi
- nop.i 999 ;;
+ nop.m 999
+// a := (B - A) + b: Exact - note that a is either 0 or 2^(-64).
+ fsub.s1 FR_a = FR_B,FR_A
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-// N := round_to_nearest_integer_value( A );
-(p0) fcvt.fx.s1 FR_N = FR_A
- nop.i 999 ;;
+ nop.m 999
+ fms.s1 FR_N = FR_N_fix, f1, FR_RSHF
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-// C_hi := C_hi - Tmp_C ...0 <= C_hi < 2^7
-(p0) fsub.s1 FR_C_hi = FR_C_hi,FR_Tmp_C
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 FR_a = FR_a,FR_b
+ nop.i 999
}
+;;
+
+// f := A - N; Exact because lsb(A) >= 2^(-64) and |f| <= 1/2.
+// N := convert to integer format( C_hi + N );
+// M := P_0 * x_lo;
+// N := N + M;
{ .mfi
- nop.m 999
-// a := (B - A) + b: Exact - note that a is either 0 or 2^(-64).
-(p0) fsub.s1 FR_a = FR_B,FR_A
- nop.i 999 ;;
+ nop.m 999
+ fsub.s1 FR_f = FR_A,FR_N
+ nop.i 999
}
{ .mfi
- nop.m 999
-// f := A - N; Exact because lsb(A) >= 2^(-64) and |f| <= 1/2.
-(p0) fnorm.s1 FR_N = FR_N
- nop.i 999
+ nop.m 999
+ fadd.s1 FR_N = FR_N,FR_C_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fadd.s1 FR_a = FR_a,FR_b
- nop.i 999 ;;
+ nop.m 999
+(p9) fsub.s1 FR_D_hi = f0, FR_D_hi
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fsub.s1 FR_f = FR_A,FR_N
- nop.i 999
+ nop.m 999
+(p9) fsub.s1 FR_D_lo = f0, FR_D_lo
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-// N := convert to integer format( C_hi + N );
-// M := P_0 * x_lo;
-// N := N + M;
-(p0) fadd.s1 FR_N = FR_N,FR_C_hi
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 FR_g = FR_A_hi,FR_B_lo // For Case 1, g=A_hi+B_lo
+ nop.i 999
}
{ .mfi
- nop.m 999
-// f = f + a Exact because a is 0 or 2^(-64);
-// the msb of the sum is <= 1/2 and lsb >= 2^(-64).
-(p0) fadd.s1 FR_f = FR_f,FR_a
- nop.i 999
+ nop.m 999
+ fadd.s3 FR_A = FR_A_hi,FR_B_lo // For Case 2, A=A_hi+B_lo w/ sf3
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Create 2**(-33)
-//
-(p0) fcvt.fx.s1 FR_N = FR_N
- nop.i 999 ;;
+ mov GR_Temp = 0x0FFCD // For Case 2, exponent of 2^-50
+ fmax.s1 FR_B = FR_A_hi,FR_B_lo // For Case 2, B=max(A_hi,B_lo)
+ nop.i 999
}
+;;
+
+// f = f + a Exact because a is 0 or 2^(-64);
+// the msb of the sum is <= 1/2 and lsb >= 2^(-64).
{ .mfi
- nop.m 999
-(p0) fabs FR_f_abs = FR_f
- nop.i 999 ;;
+ setf.exp FR_TWOM50 = GR_Temp // For Case 2, form 2^-50
+ fcvt.fx.s1 FR_N = FR_N
+ nop.i 999
}
{ .mfi
-(p0) getf.sig GR_N = FR_N
- nop.f 999
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 FR_f = FR_f,FR_a
+ nop.i 999
}
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p0) add GR_N = GR_N,GR_M ;;
+;;
+
+{ .mfi
+ nop.m 999
+ fmin.s1 FR_b = FR_A_hi,FR_B_lo // For Case 2, b=min(A_hi,B_lo)
+ nop.i 999
}
-// If sgn_x == 1 (that is original x was negative)
-// N := 2^10 - N
-// this maintains N to be non-negative, but still
-// equivalent to the (negated N) mod 4.
-// End If
-{ .mii
-(p12) sub GR_N = GR_Temp,GR_N
-(p0) cmp.eq.unc p12,p9 = 0x0,GR_sgn_x ;;
- nop.i 999
+;;
+
+{ .mfi
+ nop.m 999
+ fsub.s1 FR_a = FR_B,FR_A // For Case 2, a=B-A
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fcmp.ge.unc.s1 p13, p10 = FR_f_abs,FR_TWOM33
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 FR_s_hi = FR_f,FR_g // For Case 1, s_hi=f+g
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p9) fsub.s1 FR_D_hi = f0, FR_D_hi
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 FR_f_hi = FR_A,FR_f // For Case 2, f_hi=A+f
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p10) fadd.s3 FR_A = FR_A_hi,FR_B_lo
- nop.i 999
+ nop.m 999
+ fabs FR_f_abs = FR_f
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p13) fadd.s1 FR_g = FR_A_hi,FR_B_lo
- nop.i 999 ;;
+ getf.sig GR_N = FR_N
+ fsetc.s3 0x7F,0x40 // Reset sf3 to user settings + td
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p10) fmax.s1 FR_B = FR_A_hi,FR_B_lo
- nop.i 999
+ nop.m 999
+ fsub.s1 FR_s_lo = FR_f,FR_s_hi // For Case 1, s_lo=f-s_hi
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p9) fsub.s1 FR_D_lo = f0, FR_D_lo
- nop.i 999 ;;
+ nop.m 999
+ fsub.s1 FR_f_lo = FR_f,FR_f_hi // For Case 2, f_lo=f-f_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p10) fmin.s1 FR_b = FR_A_hi,FR_B_lo
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi // For Case 1, r_hi=s_hi*D_hi
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fsetc.s3 0x7F,0x40
- nop.i 999
+ nop.m 999
+ fadd.s1 FR_a = FR_a,FR_b // For Case 2, a=a+b
+ nop.i 999
}
-{ .mlx
- nop.m 999
-(p10) movl GR_Temp = 0x000000000000FFCD ;;
+;;
+
+
+// If sgn_x == 1 (that is original x was negative)
+// N := 2^10 - N
+// this maintains N to be non-negative, but still
+// equivalent to the (negated N) mod 4.
+// End If
+{ .mfi
+ add GR_N = GR_N,GR_M
+ fcmp.ge.s1 p13, p10 = FR_f_abs,FR_TWOM33
+ mov GR_Temp = 0x00400
}
-{ .mmf
- nop.m 999
-(p10) setf.exp FR_TWOM50 = GR_Temp
-(p10) fadd.s1 FR_f_hi = FR_A,FR_f ;;
+;;
+
+{ .mfi
+(p9) sub GR_N = GR_Temp,GR_N
+ fadd.s1 FR_s_lo = FR_s_lo,FR_g // For Case 1, s_lo=s_lo+g
+ nop.i 999
}
{ .mfi
- nop.m 999
-// a := (B - A) + b Exact.
+ nop.m 999
+ fadd.s1 FR_f_lo = FR_f_lo,FR_A // For Case 2, f_lo=f_lo+A
+ nop.i 999
+}
+;;
+
+// a := (B - A) + b Exact.
// Note that a is either 0 or 2^(-128).
// f_hi := A + f;
// f_lo := (f - f_hi) + A
@@ -1387,68 +1455,32 @@ alloc r34 = ar.pfs,2,34,0,0
// exact. If f = -2^(-64), then A + f is exact. Hence
// f-f_hi is -A exactly, giving f_lo = 0.
// f_lo := f_lo + a;
-(p10) fsub.s1 FR_a = FR_B,FR_A
- nop.i 999
-}
-{ .mfi
- nop.m 999
-(p13) fadd.s1 FR_s_hi = FR_f,FR_g
- nop.i 999 ;;
-}
-{ .mlx
- nop.m 999
+
// If |f| >= 2^(-33)
// Case 1
// CASE := 1
// g := A_hi + B_lo;
// s_hi := f + g;
// s_lo := (f - s_hi) + g;
-(p13) movl GR_CASE = 0x1 ;;
-}
-{ .mlx
- nop.m 999
// Else
// Case 2
// CASE := 2
// A := fadd.fpsr3( A_hi, B_lo )
// B := max( A_hi, B_lo )
// b := min( A_hi, B_lo )
-(p10) movl GR_CASE = 0x2
-}
-{ .mfi
- nop.m 999
-(p10) fsub.s1 FR_f_lo = FR_f,FR_f_hi
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p10) fadd.s1 FR_a = FR_a,FR_b
- nop.i 999
-}
-{ .mfi
- nop.m 999
-(p13) fsub.s1 FR_s_lo = FR_f,FR_s_hi
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p13) fadd.s1 FR_s_lo = FR_s_lo,FR_g
- nop.i 999 ;;
-}
+
{ .mfi
- nop.m 999
-(p10) fcmp.ge.unc.s1 p14, p11 = FR_f_abs,FR_TWOM50
- nop.i 999 ;;
+ nop.m 999
+(p10) fcmp.ge.unc.s1 p14, p11 = FR_f_abs,FR_TWOM50
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Create 2**(-50)
-(p10) fadd.s1 FR_f_lo = FR_f_lo,FR_A
- nop.i 999 ;;
+ nop.m 999
+(p13) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi //For Case 1, r_lo=s_hi*D_hi-r_hi
+ nop.i 999
}
-{ .mfi
- nop.m 999
+;;
+
// If |f| >= 2^(-50) then
// s_hi := f_hi;
// s_lo := f_lo;
@@ -1457,84 +1489,90 @@ alloc r34 = ar.pfs,2,34,0,0
// s_hi := f_hi + f_lo
// s_lo := (f_hi - s_hi) + f_lo
// End If
-(p14) mov FR_s_hi = FR_f_hi
- nop.i 999 ;;
+{ .mfi
+ nop.m 999
+(p14) mov FR_s_hi = FR_f_hi
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p10) fadd.s1 FR_f_lo = FR_f_lo,FR_a
- nop.i 999 ;;
+ nop.m 999
+(p10) fadd.s1 FR_f_lo = FR_f_lo,FR_a
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p14) mov FR_s_lo = FR_f_lo
- nop.i 999
+ nop.m 999
+(p14) mov FR_s_lo = FR_f_lo
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p11) fadd.s1 FR_f_lo = FR_f_lo,FR_A_lo
- nop.i 999 ;;
+ nop.m 999
+(p11) fadd.s1 FR_f_lo = FR_f_lo,FR_A_lo
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p11) fma.s1 FR_f_lo = FR_X,FR_p_4,FR_f_lo
- nop.i 999 ;;
+ nop.m 999
+(p11) fma.s1 FR_f_lo = FR_X,FR_p_4,FR_f_lo
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p11) fadd.s1 FR_s_hi = FR_f_hi,FR_f_lo
- nop.i 999 ;;
+ nop.m 999
+(p13) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo //For Case 1, r_lo=s_hi*D_lo+r_lo
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
+(p11) fadd.s1 FR_s_hi = FR_f_hi,FR_f_lo
+ nop.i 999
+}
+;;
+
// r_hi := s_hi*D_hi
// r_lo := s_hi*D_hi - r_hi with fma
// r_lo := (s_hi*D_lo + r_lo) + s_lo*D_hi
-(p0) fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi
- nop.i 999
-}
{ .mfi
- nop.m 999
-(p11) fsub.s1 FR_s_lo = FR_f_hi,FR_s_hi
- nop.i 999 ;;
+ nop.m 999
+(p10) fmpy.s1 FR_r_hi = FR_s_hi,FR_D_hi
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi
- nop.i 999
+ nop.m 999
+(p11) fsub.s1 FR_s_lo = FR_f_hi,FR_s_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p11) fadd.s1 FR_s_lo = FR_s_lo,FR_f_lo
- nop.i 999 ;;
-}
-{ .mmi
- nop.m 999 ;;
-// Return N, r_hi, r_lo
-// We do not return CASE
-(p0) stfe [GR_Address_of_Outputs] = FR_r_hi,16
- nop.i 999 ;;
+ nop.m 999
+(p10) fms.s1 FR_r_lo = FR_s_hi,FR_D_hi,FR_r_hi
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo
- nop.i 999 ;;
+ nop.m 999
+(p11) fadd.s1 FR_s_lo = FR_s_lo,FR_f_lo
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_r_lo = FR_s_lo,FR_D_hi,FR_r_lo
- nop.i 999 ;;
-}
-{ .mmi
- nop.m 999 ;;
-(p0) stfe [GR_Address_of_Outputs] = FR_r_lo,-16
- nop.i 999
+ nop.m 999
+(p10) fma.s1 FR_r_lo = FR_s_hi,FR_D_lo,FR_r_lo
+ nop.i 999
}
-{ .mib
- nop.m 999
- nop.i 999
-(p0) br.ret.sptk b0 ;;
+;;
+
+// Return N, r_hi, r_lo
+// We do not return CASE
+{ .mfb
+ nop.m 999
+ fma.s1 FR_r_lo = FR_s_lo,FR_D_hi,FR_r_lo
+ br.ret.sptk b0
}
+;;
-.endp __libm_pi_by_2_reduce
-ASM_SIZE_DIRECTIVE(__libm_pi_by_2_reduce)
+.endp __libm_pi_by_2_reduce#
diff --git a/sysdeps/ia64/fpu/s_scalbnf.S b/sysdeps/ia64/fpu/libm_scalblnf.S
index ff7d1ca637..362e68b641 100644
--- a/sysdeps/ia64/fpu/s_scalbnf.S
+++ b/sysdeps/ia64/fpu/libm_scalblnf.S
@@ -1,10 +1,10 @@
-//.file "scalbnf.s"
+.file "libm_scalblnf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2001 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,26 +35,30 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 1/26/01 scalbnf completely reworked and now standalone version
+// 08/03/01 Initial version
+// 08/23/01 Corrected error tag number
+// 02/06/02 Corrected to handle 32- or 64-bit integers
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
-// float = scalbnf (float x, int n)
-// input floating point f8 and int n (r33)
+// float = __libm_scalblnf (float x, long int n, int long_int_type)
+// input floating point f8 and long int n (r33)
+// input long_int_type = 0 if long int defined as 32 bits, = 1 if 64 bits
+//
// output floating point f8
//
+
// Returns x* 2**n using an fma and detects overflow
// and underflow.
//
//
-#include "libm_support.h"
-
FR_Big = f6
FR_NBig = f7
FR_Floating_X = f8
@@ -81,34 +85,36 @@ GR_Parameter_Y = r36
GR_Parameter_RESULT = r37
GR_Tag = r38
-.align 32
-.global scalbnf
-
.section .text
-.proc scalbnf
-.align 32
-
-scalbnf:
+GLOBAL_LIBM_ENTRY(__libm_scalblnf)
//
// Is x NAN, INF, ZERO, +-?
// Build the exponent Bias
//
{ .mfi
- alloc r32=ar.pfs,1,2,4,0
+ alloc r32=ar.pfs,3,0,4,0
fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero
addl GR_Bias = 0x0FFFF,r0
}
+
//
-// Sign extend input
// Is N zero?
// Normalize x
+// Do we need to sign extend input (long_int_type = 0)?
//
{ .mfi
cmp.eq.unc p6,p0 = r33,r0
fnorm.s1 FR_Norm_X = FR_Floating_X
- sxt4 GR_N_as_int = r33
+ cmp.eq.unc p8,p9 = r34,r0
+}
+;;
+
+{ .mii
+(p9) mov GR_N_as_int = r33 // Get n directly if long int 64 bits
+(p8) sxt4 GR_N_as_int = r33 // Sign extend n if long int 32 bits
+ nop.i 0
}
;;
@@ -173,7 +179,7 @@ scalbnf:
}
{ .mlx
nop.m 999
-(p0) movl GR_Scratch = 0x000000000003007F
+ movl GR_Scratch = 0x000000000003007F
};;
@@ -184,7 +190,7 @@ scalbnf:
}
{ .mlx
nop.m 999
-(p0) movl GR_Scratch1= 0x000000000001007F
+ movl GR_Scratch1= 0x000000000001007F
};;
// Set up necessary status fields
@@ -195,12 +201,12 @@ scalbnf:
//
{ .mfi
nop.m 999
-(p0) fsetc.s3 0x7F,0x41
+ fsetc.s3 0x7F,0x41
nop.i 999
}
{ .mfi
nop.m 999
-(p0) fsetc.s2 0x7F,0x42
+ fsetc.s2 0x7F,0x42
nop.i 999
};;
@@ -247,7 +253,7 @@ scalbnf:
nop.i 999
}
{ .mfi
- addl GR_Tag = 178, r0
+ addl GR_Tag = 205, r0
fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big
nop.i 0
};;
@@ -266,9 +272,9 @@ scalbnf:
// Branch out for underflow
//
{ .mfb
-(p6) addl GR_Tag = 179, r0
+(p6) addl GR_Tag = 206, r0
(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig
-(p6) br.cond.spnt L(scalbnf_UNDERFLOW)
+(p6) br.cond.spnt scalbnf_UNDERFLOW
};;
//
@@ -276,8 +282,8 @@ scalbnf:
//
{ .mbb
nop.m 0
-(p7) br.cond.spnt L(scalbnf_OVERFLOW)
-(p9) br.cond.spnt L(scalbnf_OVERFLOW)
+(p7) br.cond.spnt scalbnf_OVERFLOW
+(p9) br.cond.spnt scalbnf_OVERFLOW
};;
//
@@ -289,13 +295,11 @@ scalbnf:
br.ret.sptk b0;;
}
-.endp scalbnf
-ASM_SIZE_DIRECTIVE(scalbnf)
-.proc __libm_error_region
+GLOBAL_LIBM_END(__libm_scalblnf)
__libm_error_region:
-L(scalbnf_OVERFLOW):
-L(scalbnf_UNDERFLOW):
+scalbnf_OVERFLOW:
+scalbnf_UNDERFLOW:
//
// Get stack address of N
@@ -372,8 +376,7 @@ L(scalbnf_UNDERFLOW):
br.ret.sptk b0
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/libm_sincos.S b/sysdeps/ia64/fpu/libm_sincos.S
new file mode 100644
index 0000000000..a3f4c72743
--- /dev/null
+++ b/sysdeps/ia64/fpu/libm_sincos.S
@@ -0,0 +1,782 @@
+.file "libm_sincos.s"
+
+
+// Copyright (c) 2002 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2002 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 02/01/02 Initial version
+// 02/18/02 Large arguments processing routine is excluded.
+// External interface entry points are added
+// 03/13/02 Corrected restore of predicate registers
+// 03/19/02 Added stack unwind around call to __libm_cis_large
+// 09/05/02 Work range widened by strengthening the reduction (3 parts of Pi/16)
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// 1) double _Complex cis(double)
+// 2) void sincos(double, double*s, double*c)
+// 3) __libm_sincos - internal LIBM function, that accepts
+// argument in f8 and returns cosine through f8, sine through f9
+//
+// Overview of operation
+//==============================================================
+//
+// Step 1
+// ======
+// Reduce x to region -1/2*pi/2^k ===== 0 ===== +1/2*pi/2^k where k=4
+// divide x by pi/2^k.
+// Multiply by 2^k/pi.
+// nfloat = Round result to integer (round-to-nearest)
+//
+// r = x - nfloat * pi/2^k
+// Do this as ((((x - nfloat * HIGH(pi/2^k))) -
+// nfloat * LOW(pi/2^k)) -
+// nfloat * LOWEST(pi/2^k) for increased accuracy.
+// pi/2^k is stored as three numbers that when added make pi/2^k.
+// pi/2^k = HIGH(pi/2^k) + LOW(pi/2^k) + LOWEST(pi/2^k)
+// HIGH and LOW parts are rounded toward zero,
+// and LOWEST is rounded to nearest.
+//
+// x = (nfloat * pi/2^k) + r
+// r is small enough that we can use a polynomial approximation
+// and is referred to as the reduced argument.
+//
+// Step 3
+// ======
+// Take the unreduced part and remove the multiples of 2pi.
+// So nfloat = nfloat (with lower k+1 bits cleared) + lower k+1 bits
+//
+// nfloat (with lower k+1 bits cleared) is a multiple of 2^(k+1)
+// N * 2^(k+1)
+// nfloat * pi/2^k = N * 2^(k+1) * pi/2^k + (lower k+1 bits) * pi/2^k
+// nfloat * pi/2^k = N * 2 * pi + (lower k+1 bits) * pi/2^k
+// nfloat * pi/2^k = N2pi + M * pi/2^k
+//
+//
+// Sin(x) = Sin((nfloat * pi/2^k) + r)
+// = Sin(nfloat * pi/2^k) * Cos(r) + Cos(nfloat * pi/2^k) * Sin(r)
+//
+// Sin(nfloat * pi/2^k) = Sin(N2pi + Mpi/2^k)
+// = Sin(N2pi)Cos(Mpi/2^k) + Cos(N2pi)Sin(Mpi/2^k)
+// = Sin(Mpi/2^k)
+//
+// Cos(nfloat * pi/2^k) = Cos(N2pi + Mpi/2^k)
+// = Cos(N2pi)Cos(Mpi/2^k) + Sin(N2pi)Sin(Mpi/2^k)
+// = Cos(Mpi/2^k)
+//
+// Sin(x) = Sin(Mpi/2^k) Cos(r) + Cos(Mpi/2^k) Sin(r)
+//
+//
+// Step 4
+// ======
+// 0 <= M < 2^(k+1)
+// There are 2^(k+1) Sin entries in a table.
+// There are 2^(k+1) Cos entries in a table.
+//
+// Get Sin(Mpi/2^k) and Cos(Mpi/2^k) by table lookup.
+//
+//
+// Step 5
+// ======
+// Calculate Cos(r) and Sin(r) by polynomial approximation.
+//
+// Cos(r) = 1 + r^2 q1 + r^4 q2 + r^6 q3 + ... = Series for Cos
+// Sin(r) = r + r^3 p1 + r^5 p2 + r^7 p3 + ... = Series for Sin
+//
+// and the coefficients q1, q2, ... and p1, p2, ... are stored in a table
+//
+//
+// Calculate
+// Sin(x) = Sin(Mpi/2^k) Cos(r) + Cos(Mpi/2^k) Sin(r)
+//
+// as follows
+//
+// S[m] = Sin(Mpi/2^k) and C[m] = Cos(Mpi/2^k)
+// rsq = r*r
+//
+//
+// P = p1 + r^2p2 + r^4p3 + r^6p4
+// Q = q1 + r^2q2 + r^4q3 + r^6q4
+//
+// rcub = r * rsq
+// Sin(r) = r + rcub * P
+// = r + r^3p1 + r^5p2 + r^7p3 + r^9p4 + ... = Sin(r)
+//
+// The coefficients are not exactly these values, but almost.
+//
+// p1 = -1/6 = -1/3!
+// p2 = 1/120 = 1/5!
+// p3 = -1/5040 = -1/7!
+// p4 = 1/362880 = 1/9!
+//
+// P = r + rcub * P
+//
+// Answer = S[m] Cos(r) + C[m] P
+//
+// Cos(r) = 1 + rsq Q
+// Cos(r) = 1 + r^2 Q
+// Cos(r) = 1 + r^2 (q1 + r^2q2 + r^4q3 + r^6q4)
+// Cos(r) = 1 + r^2q1 + r^4q2 + r^6q3 + r^8q4 + ...
+//
+// S[m] Cos(r) = S[m](1 + rsq Q)
+// S[m] Cos(r) = S[m] + S[m] rsq Q
+// S[m] Cos(r) = S[m] + s_rsq Q
+// Q = S[m] + s_rsq Q
+//
+// Then,
+//
+// Answer = Q + C[m] P
+
+// Registers used
+//==============================================================
+// general input registers:
+// r14 -> r19
+// r32 -> r52
+
+// predicate registers used:
+// p6 -> p14
+
+// floating-point registers used
+// f9 -> f15
+// f32 -> f100
+
+// Assembly macros
+//==============================================================
+
+cis_Arg = f8
+
+cis_Sin_res = f9
+cis_Cos_res = f8
+
+cis_NORM_f8 = f10
+cis_W = f11
+cis_int_Nfloat = f12
+cis_Nfloat = f13
+
+cis_r = f14
+cis_rsq = f15
+cis_rcub = f32
+
+cis_Inv_Pi_by_16 = f33
+cis_Pi_by_16_hi = f34
+cis_Pi_by_16_lo = f35
+
+cis_Inv_Pi_by_64 = f36
+cis_Pi_by_16_lowest = f37
+cis_r_exact = f38
+
+
+cis_P1 = f39
+cis_Q1 = f40
+cis_P2 = f41
+cis_Q2 = f42
+cis_P3 = f43
+cis_Q3 = f44
+cis_P4 = f45
+cis_Q4 = f46
+
+cis_P_temp1 = f47
+cis_P_temp2 = f48
+
+cis_Q_temp1 = f49
+cis_Q_temp2 = f50
+
+cis_P = f51
+
+cis_SIG_INV_PI_BY_16_2TO61 = f52
+cis_RSHF_2TO61 = f53
+cis_RSHF = f54
+cis_2TOM61 = f55
+cis_NFLOAT = f56
+cis_W_2TO61_RSH = f57
+
+cis_tmp = f58
+
+cis_Sm_sin = f59
+cis_Cm_sin = f60
+
+cis_Sm_cos = f61
+cis_Cm_cos = f62
+
+cis_srsq_sin = f63
+cis_srsq_cos = f64
+
+cis_Q_sin = f65
+cis_Q_cos = f66
+cis_Q = f67
+
+/////////////////////////////////////////////////////////////
+
+cis_pResSin = r33
+cis_pResCos = r34
+
+cis_exp_limit = r35
+cis_r_signexp = r36
+cis_AD_beta_table = r37
+cis_r_sincos = r38
+
+cis_r_exp = r39
+cis_r_17_ones = r40
+
+cis_GR_sig_inv_pi_by_16 = r14
+cis_GR_rshf_2to61 = r15
+cis_GR_rshf = r16
+cis_GR_exp_2tom61 = r17
+cis_GR_n = r18
+
+cis_GR_n_sin = r19
+cis_GR_m_sin = r41
+cis_GR_32m_sin = r41
+
+cis_GR_n_cos = r42
+cis_GR_m_cos = r43
+cis_GR_32m_cos = r43
+
+cis_AD_2_sin = r44
+cis_AD_2_cos = r45
+
+cis_gr_tmp = r46
+GR_SAVE_B0 = r47
+GR_SAVE_GP = r48
+rB0_SAVED = r49
+GR_SAVE_PFS = r50
+GR_SAVE_PR = r51
+cis_AD_1 = r52
+
+RODATA
+
+.align 16
+// Pi/16 parts
+LOCAL_OBJECT_START(double_cis_pi)
+ data8 0xC90FDAA22168C234, 0x00003FFC // pi/16 1st part
+ data8 0xC4C6628B80DC1CD1, 0x00003FBC // pi/16 2nd part
+ data8 0xA4093822299F31D0, 0x00003F7A // pi/16 3rd part
+LOCAL_OBJECT_END(double_cis_pi)
+
+// Coefficients for polynomials
+LOCAL_OBJECT_START(double_cis_pq_k4)
+ data8 0x3EC71C963717C63A // P4
+ data8 0x3EF9FFBA8F191AE6 // Q4
+ data8 0xBF2A01A00F4E11A8 // P3
+ data8 0xBF56C16C05AC77BF // Q3
+ data8 0x3F8111111110F167 // P2
+ data8 0x3FA555555554DD45 // Q2
+ data8 0xBFC5555555555555 // P1
+ data8 0xBFDFFFFFFFFFFFFC // Q1
+LOCAL_OBJECT_END(double_cis_pq_k4)
+
+// Sincos table (S[m], C[m])
+LOCAL_OBJECT_START(double_sin_cos_beta_k4)
+data8 0x0000000000000000 , 0x00000000 // sin( 0 pi/16) S0
+data8 0x8000000000000000 , 0x00003fff // cos( 0 pi/16) C0
+//
+data8 0xc7c5c1e34d3055b3 , 0x00003ffc // sin( 1 pi/16) S1
+data8 0xfb14be7fbae58157 , 0x00003ffe // cos( 1 pi/16) C1
+//
+data8 0xc3ef1535754b168e , 0x00003ffd // sin( 2 pi/16) S2
+data8 0xec835e79946a3146 , 0x00003ffe // cos( 2 pi/16) C2
+//
+data8 0x8e39d9cd73464364 , 0x00003ffe // sin( 3 pi/16) S3
+data8 0xd4db3148750d181a , 0x00003ffe // cos( 3 pi/16) C3
+//
+data8 0xb504f333f9de6484 , 0x00003ffe // sin( 4 pi/16) S4
+data8 0xb504f333f9de6484 , 0x00003ffe // cos( 4 pi/16) C4
+//
+data8 0xd4db3148750d181a , 0x00003ffe // sin( 5 pi/16) C3
+data8 0x8e39d9cd73464364 , 0x00003ffe // cos( 5 pi/16) S3
+//
+data8 0xec835e79946a3146 , 0x00003ffe // sin( 6 pi/16) C2
+data8 0xc3ef1535754b168e , 0x00003ffd // cos( 6 pi/16) S2
+//
+data8 0xfb14be7fbae58157 , 0x00003ffe // sin( 7 pi/16) C1
+data8 0xc7c5c1e34d3055b3 , 0x00003ffc // cos( 7 pi/16) S1
+//
+data8 0x8000000000000000 , 0x00003fff // sin( 8 pi/16) C0
+data8 0x0000000000000000 , 0x00000000 // cos( 8 pi/16) S0
+//
+data8 0xfb14be7fbae58157 , 0x00003ffe // sin( 9 pi/16) C1
+data8 0xc7c5c1e34d3055b3 , 0x0000bffc // cos( 9 pi/16) -S1
+//
+data8 0xec835e79946a3146 , 0x00003ffe // sin(10 pi/16) C2
+data8 0xc3ef1535754b168e , 0x0000bffd // cos(10 pi/16) -S2
+//
+data8 0xd4db3148750d181a , 0x00003ffe // sin(11 pi/16) C3
+data8 0x8e39d9cd73464364 , 0x0000bffe // cos(11 pi/16) -S3
+//
+data8 0xb504f333f9de6484 , 0x00003ffe // sin(12 pi/16) S4
+data8 0xb504f333f9de6484 , 0x0000bffe // cos(12 pi/16) -S4
+//
+data8 0x8e39d9cd73464364 , 0x00003ffe // sin(13 pi/16) S3
+data8 0xd4db3148750d181a , 0x0000bffe // cos(13 pi/16) -C3
+//
+data8 0xc3ef1535754b168e , 0x00003ffd // sin(14 pi/16) S2
+data8 0xec835e79946a3146 , 0x0000bffe // cos(14 pi/16) -C2
+//
+data8 0xc7c5c1e34d3055b3 , 0x00003ffc // sin(15 pi/16) S1
+data8 0xfb14be7fbae58157 , 0x0000bffe // cos(15 pi/16) -C1
+//
+data8 0x0000000000000000 , 0x00000000 // sin(16 pi/16) S0
+data8 0x8000000000000000 , 0x0000bfff // cos(16 pi/16) -C0
+//
+data8 0xc7c5c1e34d3055b3 , 0x0000bffc // sin(17 pi/16) -S1
+data8 0xfb14be7fbae58157 , 0x0000bffe // cos(17 pi/16) -C1
+//
+data8 0xc3ef1535754b168e , 0x0000bffd // sin(18 pi/16) -S2
+data8 0xec835e79946a3146 , 0x0000bffe // cos(18 pi/16) -C2
+//
+data8 0x8e39d9cd73464364 , 0x0000bffe // sin(19 pi/16) -S3
+data8 0xd4db3148750d181a , 0x0000bffe // cos(19 pi/16) -C3
+//
+data8 0xb504f333f9de6484 , 0x0000bffe // sin(20 pi/16) -S4
+data8 0xb504f333f9de6484 , 0x0000bffe // cos(20 pi/16) -S4
+//
+data8 0xd4db3148750d181a , 0x0000bffe // sin(21 pi/16) -C3
+data8 0x8e39d9cd73464364 , 0x0000bffe // cos(21 pi/16) -S3
+//
+data8 0xec835e79946a3146 , 0x0000bffe // sin(22 pi/16) -C2
+data8 0xc3ef1535754b168e , 0x0000bffd // cos(22 pi/16) -S2
+//
+data8 0xfb14be7fbae58157 , 0x0000bffe // sin(23 pi/16) -C1
+data8 0xc7c5c1e34d3055b3 , 0x0000bffc // cos(23 pi/16) -S1
+//
+data8 0x8000000000000000 , 0x0000bfff // sin(24 pi/16) -C0
+data8 0x0000000000000000 , 0x00000000 // cos(24 pi/16) S0
+//
+data8 0xfb14be7fbae58157 , 0x0000bffe // sin(25 pi/16) -C1
+data8 0xc7c5c1e34d3055b3 , 0x00003ffc // cos(25 pi/16) S1
+//
+data8 0xec835e79946a3146 , 0x0000bffe // sin(26 pi/16) -C2
+data8 0xc3ef1535754b168e , 0x00003ffd // cos(26 pi/16) S2
+//
+data8 0xd4db3148750d181a , 0x0000bffe // sin(27 pi/16) -C3
+data8 0x8e39d9cd73464364 , 0x00003ffe // cos(27 pi/16) S3
+//
+data8 0xb504f333f9de6484 , 0x0000bffe // sin(28 pi/16) -S4
+data8 0xb504f333f9de6484 , 0x00003ffe // cos(28 pi/16) S4
+//
+data8 0x8e39d9cd73464364 , 0x0000bffe // sin(29 pi/16) -S3
+data8 0xd4db3148750d181a , 0x00003ffe // cos(29 pi/16) C3
+//
+data8 0xc3ef1535754b168e , 0x0000bffd // sin(30 pi/16) -S2
+data8 0xec835e79946a3146 , 0x00003ffe // cos(30 pi/16) C2
+//
+data8 0xc7c5c1e34d3055b3 , 0x0000bffc // sin(31 pi/16) -S1
+data8 0xfb14be7fbae58157 , 0x00003ffe // cos(31 pi/16) C1
+//
+data8 0x0000000000000000 , 0x00000000 // sin(32 pi/16) S0
+data8 0x8000000000000000 , 0x00003fff // cos(32 pi/16) C0
+LOCAL_OBJECT_END(double_sin_cos_beta_k4)
+
+.section .text
+
+GLOBAL_IEEE754_ENTRY(sincos)
+// cis_GR_sig_inv_pi_by_16 = significand of 16/pi
+{ .mlx
+ alloc GR_SAVE_PFS = ar.pfs, 0, 21, 0, 0
+ movl cis_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A
+
+}
+// cis_GR_rshf_2to61 = 1.1000 2^(63+63-2)
+{ .mlx
+ addl cis_AD_1 = @ltoff(double_cis_pi), gp
+ movl cis_GR_rshf_2to61 = 0x47b8000000000000
+};;
+
+{ .mfi
+ ld8 cis_AD_1 = [cis_AD_1]
+ fnorm.s1 cis_NORM_f8 = cis_Arg
+ cmp.eq p13, p14 = r0, r0 // p13 set for sincos
+}
+// cis_GR_exp_2tom61 = exponent of scaling factor 2^-61
+{ .mib
+ mov cis_GR_exp_2tom61 = 0xffff-61
+ nop.i 0
+ br.cond.sptk _CIS_COMMON
+};;
+GLOBAL_IEEE754_END(sincos)
+LOCAL_LIBM_ENTRY(cis)
+LOCAL_LIBM_END(cis)
+GLOBAL_LIBM_ENTRY(__libm_sincos)
+// cis_GR_sig_inv_pi_by_16 = significand of 16/pi
+{ .mlx
+ alloc GR_SAVE_PFS = ar.pfs,0,21,0,0
+ movl cis_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A
+}
+// cis_GR_rshf_2to61 = 1.1000 2^(63+63-2)
+{ .mlx
+ addl cis_AD_1 = @ltoff(double_cis_pi), gp
+ movl cis_GR_rshf_2to61 = 0x47b8000000000000
+};;
+// p14 set for __libm_sincos and cis
+{ .mfi
+ ld8 cis_AD_1 = [cis_AD_1]
+ fnorm.s1 cis_NORM_f8 = cis_Arg
+ cmp.eq p14, p13 = r0, r0
+}
+// cis_GR_exp_2tom61 = exponent of scaling factor 2^-61
+{ .mib
+ mov cis_GR_exp_2tom61 = 0xffff-61
+ nop.i 0
+ nop.b 0
+};;
+
+_CIS_COMMON:
+// Form two constants we need
+// 16/pi * 2^-2 * 2^63, scaled by 2^61 since we just loaded the significand
+// 1.1000...000 * 2^(63+63-2) to right shift int(W) into the low significand
+// fcmp used to set denormal, and invalid on snans
+{ .mfi
+ setf.sig cis_SIG_INV_PI_BY_16_2TO61 = cis_GR_sig_inv_pi_by_16
+ fclass.m p6,p0 = cis_Arg, 0xe7 // if x=0,inf,nan
+ addl cis_gr_tmp = -1, r0
+}
+// 1.1000 2^63 for right shift
+{ .mlx
+ setf.d cis_RSHF_2TO61 = cis_GR_rshf_2to61
+ movl cis_GR_rshf = 0x43e8000000000000
+};;
+
+// Form another constant
+// 2^-61 for scaling Nfloat
+// 0x1001a is register_bias + 27.
+// So if f8 >= 2^27, go to large arguments routine
+{ .mmi
+ getf.exp cis_r_signexp = cis_Arg
+ setf.exp cis_2TOM61 = cis_GR_exp_2tom61
+ mov cis_exp_limit = 0x1001a
+};;
+
+// Load the three pieces of pi/16
+// Form another constant
+// 1.1000...000 * 2^63, the right shift constant
+{ .mmb
+ ldfe cis_Pi_by_16_hi = [cis_AD_1],16
+ setf.d cis_RSHF = cis_GR_rshf
+(p6) br.cond.spnt _CIS_SPECIAL_ARGS
+};;
+
+// Create constant inexact set
+{ .mmi
+ ldfe cis_Pi_by_16_lo = [cis_AD_1],16
+ setf.sig cis_tmp = cis_gr_tmp
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe cis_Pi_by_16_lowest = [cis_AD_1],16
+ nop.f 0
+ nop.i 0
+};;
+
+// Start loading P, Q coefficients
+{ .mib
+ ldfpd cis_P4,cis_Q4 = [cis_AD_1],16
+ dep.z cis_r_exp = cis_r_signexp, 0, 17
+ nop.b 0
+};;
+
+// p10 is true if we must call routines to handle larger arguments
+// p10 is true if f8 exp is >= 0x1001a
+{ .mmb
+ ldfpd cis_P3,cis_Q3 = [cis_AD_1],16
+ cmp.ge p10, p0 = cis_r_exp, cis_exp_limit
+(p10) br.cond.spnt _CIS_LARGE_ARGS // go to |x| >= 2^27 path
+};;
+
+// cis_W = x * cis_Inv_Pi_by_16
+// Multiply x by scaled 16/pi and add large const to shift integer part of W to
+// rightmost bits of significand
+{ .mfi
+ ldfpd cis_P2,cis_Q2 = [cis_AD_1],16
+ fma.s1 cis_W_2TO61_RSH = cis_NORM_f8,cis_SIG_INV_PI_BY_16_2TO61,cis_RSHF_2TO61
+ nop.i 0
+};;
+
+// cis_NFLOAT = Round_Int_Nearest(cis_W)
+{ .mfi
+ ldfpd cis_P1,cis_Q1 = [cis_AD_1], 16
+ fms.s1 cis_NFLOAT = cis_W_2TO61_RSH,cis_2TOM61,cis_RSHF
+ nop.i 0
+};;
+
+// get N = (int)cis_int_Nfloat
+{ .mfi
+ getf.sig cis_GR_n = cis_W_2TO61_RSH
+ nop.f 0
+ nop.i 0
+};;
+
+// Add 2^(k-1) = 8 to N to form the index used for the cosine result
+// cis_r = -cis_Nfloat * cis_Pi_by_16_hi + x
+// cis_r = cis_r -cis_Nfloat * cis_Pi_by_16_lo
+{ .mfi
+ add cis_GR_n_cos = 0x8, cis_GR_n
+ fnma.s1 cis_r = cis_NFLOAT,cis_Pi_by_16_hi,cis_NORM_f8
+ nop.i 0
+};;
+
+//Get M (least k+1 bits of N)
+{ .mmi
+ and cis_GR_m_sin = 0x1f,cis_GR_n
+ and cis_GR_m_cos = 0x1f,cis_GR_n_cos
+ nop.i 0
+};;
+
+{ .mmi
+ nop.m 0
+ nop.m 0
+ shl cis_GR_32m_sin = cis_GR_m_sin,5
+};;
+
+// Add 32*M to address of sin_cos_beta table
+{ .mmi
+ add cis_AD_2_sin = cis_GR_32m_sin, cis_AD_1
+ nop.m 0
+ shl cis_GR_32m_cos = cis_GR_m_cos,5
+};;
+
+// Add 32*M to address of sin_cos_beta table
+{ .mmf
+ ldfe cis_Sm_sin = [cis_AD_2_sin],16
+ add cis_AD_2_cos = cis_GR_32m_cos, cis_AD_1
+ fclass.m.unc p10,p0 = cis_Arg,0x0b // den. input - uflow
+};;
+
+{ .mfi
+ ldfe cis_Sm_cos = [cis_AD_2_cos], 16
+ fnma.s1 cis_r = cis_NFLOAT, cis_Pi_by_16_lo, cis_r
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe cis_Cm_sin = [cis_AD_2_sin]
+ fma.s1 cis_rsq = cis_r, cis_r, f0 // get r^2
+ nop.i 0
+}
+// fmpy forces inexact flag
+{ .mfi
+ nop.m 0
+ fmpy.s0 cis_tmp = cis_tmp,cis_tmp
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 cis_r_exact = cis_NFLOAT, cis_Pi_by_16_lowest, cis_r
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe cis_Cm_cos = [cis_AD_2_cos]
+ fma.s1 cis_P_temp1 = cis_rsq, cis_P4, cis_P3
+ nop.i 0
+}
+
+{ .mfi
+ nop.m 0
+ fma.s1 cis_Q_temp1 = cis_rsq, cis_Q4, cis_Q3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fmpy.s1 cis_srsq_sin = cis_Sm_sin, cis_rsq
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 cis_srsq_cos = cis_Sm_cos,cis_rsq
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 cis_Q_temp2 = cis_rsq, cis_Q_temp1, cis_Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 cis_P_temp2 = cis_rsq, cis_P_temp1, cis_P2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 cis_Q = cis_rsq, cis_Q_temp2, cis_Q1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 cis_P = cis_rsq, cis_P_temp2, cis_P1
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fmpy.s1 cis_rcub = cis_r_exact, cis_rsq // get r^3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 cis_Q_sin = cis_srsq_sin,cis_Q, cis_Sm_sin
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 cis_Q_cos = cis_srsq_cos,cis_Q, cis_Sm_cos
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 cis_P = cis_rcub,cis_P, cis_r_exact // final P
+ nop.i 0
+};;
+
+// If den. arg, force underflow to be set
+{ .mfi
+ nop.m 0
+(p10) fmpy.d.s0 cis_tmp = cis_Arg,cis_Arg
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.d.s0 cis_Sin_res = cis_Cm_sin,cis_P,cis_Q_sin//Final sin
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ fma.d.s0 cis_Cos_res = cis_Cm_cos,cis_P,cis_Q_cos//Final cos
+(p14) br.ret.sptk b0 // common exit for __libm_sincos and cis main path
+};;
+
+{ .mmb
+ stfd [cis_pResSin] = cis_Sin_res
+ stfd [cis_pResCos] = cis_Cos_res
+ br.ret.sptk b0 // common exit for sincos main path
+};;
+
+_CIS_SPECIAL_ARGS:
+// sin(+/-0) = +/-0
+// sin(Inf) = NaN
+// sin(NaN) = NaN
+{ .mfi
+ nop.m 999
+ fma.d.s0 cis_Sin_res = cis_Arg, f0, f0 // sin(+/-0,NaN,Inf)
+ nop.i 999
+};;
+// cos(+/-0) = 1.0
+// cos(Inf) = NaN
+// cos(NaN) = NaN
+{ .mfb
+ nop.m 999
+ fma.d.s0 cis_Cos_res = cis_Arg, f0, f1 // cos(+/-0,NaN,Inf)
+(p14) br.ret.sptk b0 //spec exit for __libm_sincos and cis main path
+};;
+
+{ .mmb
+ stfd [cis_pResSin] = cis_Sin_res
+ stfd [cis_pResCos] = cis_Cos_res
+ br.ret.sptk b0 // common exit for sincos main path
+};;
+GLOBAL_LIBM_END(__libm_sincos)
+//// |x| > 2^27 path ///////
+.proc _CIS_LARGE_ARGS
+_CIS_LARGE_ARGS:
+.prologue
+{ .mfi
+ nop.m 0
+ nop.f 0
+.save ar.pfs, GR_SAVE_PFS
+ mov GR_SAVE_PFS = ar.pfs
+}
+;;
+
+{ .mfi
+ mov GR_SAVE_GP = gp
+ nop.f 0
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0 = b0
+};;
+
+.body
+// Call of huge arguments sincos
+{ .mib
+ nop.m 0
+ mov GR_SAVE_PR = pr
+ br.call.sptk b0 = __libm_sincos_large
+};;
+
+{ .mfi
+ mov gp = GR_SAVE_GP
+ nop.f 0
+ mov pr = GR_SAVE_PR, 0x1fffe
+}
+;;
+
+{ .mfi
+ nop.m 0
+ nop.f 0
+ mov b0 = GR_SAVE_B0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.d.s0 cis_Cos_res = cis_Cos_res, f1, f0
+ mov ar.pfs = GR_SAVE_PFS
+}
+{ .mfb
+ nop.m 0
+ fma.d.s0 cis_Sin_res = cis_Sin_res, f1, f0
+(p14) br.ret.sptk b0 // exit for |x| > 2^27 path (__libm_sincos and cis)
+};;
+
+{ .mmb
+ stfd [cis_pResSin] = cis_Sin_res
+ stfd [cis_pResCos] = cis_Cos_res
+ br.ret.sptk b0 // exit for sincos |x| > 2^27 path
+};;
+.endp _CIS_LARGE_ARGS
+
+.type __libm_sincos_large#,@function
+.global __libm_sincos_large#
+
diff --git a/sysdeps/ia64/fpu/libm_sincos_large.S b/sysdeps/ia64/fpu/libm_sincos_large.S
new file mode 100644
index 0000000000..42cf0940f0
--- /dev/null
+++ b/sysdeps/ia64/fpu/libm_sincos_large.S
@@ -0,0 +1,2754 @@
+.file "libm_sincos_large.s"
+
+
+// Copyright (c) 2002 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2002 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 02/15/02 Initial version
+// 05/13/02 Changed interface to __libm_pi_by_2_reduce
+// 02/10/03 Reordered header: .section, .global, .proc, .align;
+// used data8 for long double table values
+// 05/15/03 Reformatted data tables
+//
+//
+// Overview of operation
+//==============================================================
+//
+// These functions calculate the sin and cos for inputs
+// greater than 2^10
+//
+// __libm_sin_large#
+// __libm_cos_large#
+// They accept argument in f8
+// and return result in f8 without final rounding
+//
+// __libm_sincos_large#
+// It accepts argument in f8
+// and returns cos in f8 and sin in f9 without final rounding
+//
+//
+//*********************************************************************
+//
+// Accuracy: Within .7 ulps for 80-bit floating point values
+// Very accurate for double precision values
+//
+//*********************************************************************
+//
+// Resources Used:
+//
+// Floating-Point Registers: f8 as Input Value, f8 and f9 as Return Values
+// f32-f103
+//
+// General Purpose Registers:
+// r32-r43
+// r44-r45 (Used to pass arguments to pi_by_2 reduce routine)
+//
+// Predicate Registers: p6-p13
+//
+//*********************************************************************
+//
+// IEEE Special Conditions:
+//
+// Denormal fault raised on denormal inputs
+// Overflow exceptions do not occur
+// Underflow exceptions raised when appropriate for sin
+// (No specialized error handling for this routine)
+// Inexact raised when appropriate by algorithm
+//
+// sin(SNaN) = QNaN
+// sin(QNaN) = QNaN
+// sin(inf) = QNaN
+// sin(+/-0) = +/-0
+// cos(inf) = QNaN
+// cos(SNaN) = QNaN
+// cos(QNaN) = QNaN
+// cos(0) = 1
+//
+//*********************************************************************
+//
+// Mathematical Description
+// ========================
+//
+// The computation of FSIN and FCOS is best handled in one piece of
+// code. The main reason is that given any argument Arg, computation
+// of trigonometric functions first calculate N and an approximation
+// to alpha where
+//
+// Arg = N pi/2 + alpha, |alpha| <= pi/4.
+//
+// Since
+//
+// cos( Arg ) = sin( (N+1) pi/2 + alpha ),
+//
+// therefore, the code for computing sine will produce cosine as long
+// as 1 is added to N immediately after the argument reduction
+// process.
+//
+// Let M = N if sine
+// N+1 if cosine.
+//
+// Now, given
+//
+// Arg = M pi/2 + alpha, |alpha| <= pi/4,
+//
+// let I = M mod 4, or I be the two lsb of M when M is represented
+// as 2's complement. I = [i_0 i_1]. Then
+//
+// sin( Arg ) = (-1)^i_0 sin( alpha ) if i_1 = 0,
+// = (-1)^i_0 cos( alpha ) if i_1 = 1.
+//
+// For example:
+// if M = -1, I = 11
+// sin (-pi/2 + alpha) = (-1) cos (alpha)
+// if M = 0, I = 00
+// sin (alpha) = sin (alpha)
+// if M = 1, I = 01
+// sin (pi/2 + alpha) = cos (alpha)
+// if M = 2, I = 10
+// sin (pi + alpha) = (-1) sin (alpha)
+// if M = 3, I = 11
+// sin ((3/2)pi + alpha) = (-1) cos (alpha)
+//
+// The value of alpha is obtained by argument reduction and
+// represented by two working precision numbers r and c where
+//
+// alpha = r + c accurately.
+//
+// The reduction method is described in a previous write up.
+// The argument reduction scheme identifies 4 cases. For Cases 2
+// and 4, because |alpha| is small, sin(r+c) and cos(r+c) can be
+// computed very easily by 2 or 3 terms of the Taylor series
+// expansion as follows:
+//
+// Case 2:
+// -------
+//
+// sin(r + c) = r + c - r^3/6 accurately
+// cos(r + c) = 1 - 2^(-67) accurately
+//
+// Case 4:
+// -------
+//
+// sin(r + c) = r + c - r^3/6 + r^5/120 accurately
+// cos(r + c) = 1 - r^2/2 + r^4/24 accurately
+//
+// The only cases left are Cases 1 and 3 of the argument reduction
+// procedure. These two cases will be merged since after the
+// argument is reduced in either cases, we have the reduced argument
+// represented as r + c and that the magnitude |r + c| is not small
+// enough to allow the usage of a very short approximation.
+//
+// The required calculation is either
+//
+// sin(r + c) = sin(r) + correction, or
+// cos(r + c) = cos(r) + correction.
+//
+// Specifically,
+//
+// sin(r + c) = sin(r) + c sin'(r) + O(c^2)
+// = sin(r) + c cos (r) + O(c^2)
+// = sin(r) + c(1 - r^2/2) accurately.
+// Similarly,
+//
+// cos(r + c) = cos(r) - c sin(r) + O(c^2)
+// = cos(r) - c(r - r^3/6) accurately.
+//
+// We therefore concentrate on accurately calculating sin(r) and
+// cos(r) for a working-precision number r, |r| <= pi/4 to within
+// 0.1% or so.
+//
+// The greatest challenge of this task is that the second terms of
+// the Taylor series
+//
+// r - r^3/3! + r^5/5! - ...
+//
+// and
+//
+// 1 - r^2/2! + r^4/4! - ...
+//
+// are not very small when |r| is close to pi/4 and the rounding
+// errors will be a concern if simple polynomial accumulation is
+// used. When |r| < 2^-3, however, the second terms will be small
+// enough (6 bits or so of right shift) that a normal Horner
+// recurrence suffices. Hence there are two cases that we consider
+// in the accurate computation of sin(r) and cos(r), |r| <= pi/4.
+//
+// Case small_r: |r| < 2^(-3)
+// --------------------------
+//
+// Since Arg = M pi/2 + r + c accurately, and M mod 4 is [i_0 i_1],
+// we have
+//
+// sin(Arg) = (-1)^i_0 * sin(r + c) if i_1 = 0
+// = (-1)^i_0 * cos(r + c) if i_1 = 1
+//
+// can be accurately approximated by
+//
+// sin(Arg) = (-1)^i_0 * [sin(r) + c] if i_1 = 0
+// = (-1)^i_0 * [cos(r) - c*r] if i_1 = 1
+//
+// because |r| is small and thus the second terms in the correction
+// are unnecessary.
+//
+// Finally, sin(r) and cos(r) are approximated by polynomials of
+// moderate lengths.
+//
+// sin(r) = r + S_1 r^3 + S_2 r^5 + ... + S_5 r^11
+// cos(r) = 1 + C_1 r^2 + C_2 r^4 + ... + C_5 r^10
+//
+// We can make use of predicates to selectively calculate
+// sin(r) or cos(r) based on i_1.
+//
+// Case normal_r: 2^(-3) <= |r| <= pi/4
+// ------------------------------------
+//
+// This case is more likely than the previous one if one considers
+// r to be uniformly distributed in [-pi/4 pi/4]. Again,
+//
+// sin(Arg) = (-1)^i_0 * sin(r + c) if i_1 = 0
+// = (-1)^i_0 * cos(r + c) if i_1 = 1.
+//
+// Because |r| is now larger, we need one extra term in the
+// correction. sin(Arg) can be accurately approximated by
+//
+// sin(Arg) = (-1)^i_0 * [sin(r) + c(1-r^2/2)] if i_1 = 0
+// = (-1)^i_0 * [cos(r) - c*r*(1 - r^2/6)] i_1 = 1.
+//
+// Finally, sin(r) and cos(r) are approximated by polynomials of
+// moderate lengths.
+//
+// sin(r) = r + PP_1_hi r^3 + PP_1_lo r^3 +
+// PP_2 r^5 + ... + PP_8 r^17
+//
+// cos(r) = 1 + QQ_1 r^2 + QQ_2 r^4 + ... + QQ_8 r^16
+//
+// where PP_1_hi is only about 16 bits long and QQ_1 is -1/2.
+// The crux in accurate computation is to calculate
+//
+// r + PP_1_hi r^3 or 1 + QQ_1 r^2
+//
+// accurately as two pieces: U_hi and U_lo. The way to achieve this
+// is to obtain r_hi as a 10 sig. bit number that approximates r to
+// roughly 8 bits or so of accuracy. (One convenient way is
+//
+// r_hi := frcpa( frcpa( r ) ).)
+//
+// This way,
+//
+// r + PP_1_hi r^3 = r + PP_1_hi r_hi^3 +
+// PP_1_hi (r^3 - r_hi^3)
+// = [r + PP_1_hi r_hi^3] +
+// [PP_1_hi (r - r_hi)
+// (r^2 + r_hi r + r_hi^2) ]
+// = U_hi + U_lo
+//
+// Since r_hi is only 10 bit long and PP_1_hi is only 16 bit long,
+// PP_1_hi * r_hi^3 is only at most 46 bit long and thus computed
+// exactly. Furthermore, r and PP_1_hi r_hi^3 are of opposite sign
+// and that there is no more than 8 bit shift off between r and
+// PP_1_hi * r_hi^3. Hence the sum, U_hi, is representable and thus
+// calculated without any error. Finally, the fact that
+//
+// |U_lo| <= 2^(-8) |U_hi|
+//
+// says that U_hi + U_lo is approximating r + PP_1_hi r^3 to roughly
+// 8 extra bits of accuracy.
+//
+// Similarly,
+//
+// 1 + QQ_1 r^2 = [1 + QQ_1 r_hi^2] +
+// [QQ_1 (r - r_hi)(r + r_hi)]
+// = U_hi + U_lo.
+//
+// Summarizing, we calculate r_hi = frcpa( frcpa( r ) ).
+//
+// If i_1 = 0, then
+//
+// U_hi := r + PP_1_hi * r_hi^3
+// U_lo := PP_1_hi * (r - r_hi) * (r^2 + r*r_hi + r_hi^2)
+// poly := PP_1_lo r^3 + PP_2 r^5 + ... + PP_8 r^17
+// correction := c * ( 1 + C_1 r^2 )
+//
+// Else ...i_1 = 1
+//
+// U_hi := 1 + QQ_1 * r_hi * r_hi
+// U_lo := QQ_1 * (r - r_hi) * (r + r_hi)
+// poly := QQ_2 * r^4 + QQ_3 * r^6 + ... + QQ_8 r^16
+// correction := -c * r * (1 + S_1 * r^2)
+//
+// End
+//
+// Finally,
+//
+// V := poly + ( U_lo + correction )
+//
+// / U_hi + V if i_0 = 0
+// result := |
+// \ (-U_hi) - V if i_0 = 1
+//
+// It is important that in the last step, negation of U_hi is
+// performed prior to the subtraction which is to be performed in
+// the user-set rounding mode.
+//
+//
+// Algorithmic Description
+// =======================
+//
+// The argument reduction algorithm is tightly integrated into FSIN
+// and FCOS which share the same code. The following is complete and
+// self-contained. The argument reduction description given
+// previously is repeated below.
+//
+//
+// Step 0. Initialization.
+//
+// If FSIN is invoked, set N_inc := 0; else if FCOS is invoked,
+// set N_inc := 1.
+//
+// Step 1. Check for exceptional and special cases.
+//
+// * If Arg is +-0, +-inf, NaN, NaT, go to Step 10 for special
+// handling.
+// * If |Arg| < 2^24, go to Step 2 for reduction of moderate
+// arguments. This is the most likely case.
+// * If |Arg| < 2^63, go to Step 8 for pre-reduction of large
+// arguments.
+// * If |Arg| >= 2^63, go to Step 10 for special handling.
+//
+// Step 2. Reduction of moderate arguments.
+//
+// If |Arg| < pi/4 ...quick branch
+// N_fix := N_inc (integer)
+// r := Arg
+// c := 0.0
+// Branch to Step 4, Case_1_complete
+// Else ...cf. argument reduction
+// N := Arg * two_by_PI (fp)
+// N_fix := fcvt.fx( N ) (int)
+// N := fcvt.xf( N_fix )
+// N_fix := N_fix + N_inc
+// s := Arg - N * P_1 (first piece of pi/2)
+// w := -N * P_2 (second piece of pi/2)
+//
+// If |s| >= 2^(-33)
+// go to Step 3, Case_1_reduce
+// Else
+// go to Step 7, Case_2_reduce
+// Endif
+// Endif
+//
+// Step 3. Case_1_reduce.
+//
+// r := s + w
+// c := (s - r) + w ...observe order
+//
+// Step 4. Case_1_complete
+//
+// ...At this point, the reduced argument alpha is
+// ...accurately represented as r + c.
+// If |r| < 2^(-3), go to Step 6, small_r.
+//
+// Step 5. Normal_r.
+//
+// Let [i_0 i_1] be the 2 lsb of N_fix.
+// FR_rsq := r * r
+// r_hi := frcpa( frcpa( r ) )
+// r_lo := r - r_hi
+//
+// If i_1 = 0, then
+// poly := r*FR_rsq*(PP_1_lo + FR_rsq*(PP_2 + ... FR_rsq*PP_8))
+// U_hi := r + PP_1_hi*r_hi*r_hi*r_hi ...any order
+// U_lo := PP_1_hi*r_lo*(r*r + r*r_hi + r_hi*r_hi)
+// correction := c + c*C_1*FR_rsq ...any order
+// Else
+// poly := FR_rsq*FR_rsq*(QQ_2 + FR_rsq*(QQ_3 + ... + FR_rsq*QQ_8))
+// U_hi := 1 + QQ_1 * r_hi * r_hi ...any order
+// U_lo := QQ_1 * r_lo * (r + r_hi)
+// correction := -c*(r + S_1*FR_rsq*r) ...any order
+// Endif
+//
+// V := poly + (U_lo + correction) ...observe order
+//
+// result := (i_0 == 0? 1.0 : -1.0)
+//
+// Last instruction in user-set rounding mode
+//
+// result := (i_0 == 0? result*U_hi + V :
+// result*U_hi - V)
+//
+// Return
+//
+// Step 6. Small_r.
+//
+// ...Use flush to zero mode without causing exception
+// Let [i_0 i_1] be the two lsb of N_fix.
+//
+// FR_rsq := r * r
+//
+// If i_1 = 0 then
+// z := FR_rsq*FR_rsq; z := FR_rsq*z *r
+// poly_lo := S_3 + FR_rsq*(S_4 + FR_rsq*S_5)
+// poly_hi := r*FR_rsq*(S_1 + FR_rsq*S_2)
+// correction := c
+// result := r
+// Else
+// z := FR_rsq*FR_rsq; z := FR_rsq*z
+// poly_lo := C_3 + FR_rsq*(C_4 + FR_rsq*C_5)
+// poly_hi := FR_rsq*(C_1 + FR_rsq*C_2)
+// correction := -c*r
+// result := 1
+// Endif
+//
+// poly := poly_hi + (z * poly_lo + correction)
+//
+// If i_0 = 1, result := -result
+//
+// Last operation. Perform in user-set rounding mode
+//
+// result := (i_0 == 0? result + poly :
+// result - poly )
+// Return
+//
+// Step 7. Case_2_reduce.
+//
+// ...Refer to the write up for argument reduction for
+// ...rationale. The reduction algorithm below is taken from
+// ...argument reduction description and integrated here.
+//
+// w := N*P_3
+// U_1 := N*P_2 + w ...FMA
+// U_2 := (N*P_2 - U_1) + w ...2 FMA
+// ...U_1 + U_2 is N*(P_2+P_3) accurately
+//
+// r := s - U_1
+// c := ( (s - r) - U_1 ) - U_2
+//
+// ...The mathematical sum r + c approximates the reduced
+// ...argument accurately. Note that although compared to
+// ...Case 1, this case requires much more work to reduce
+// ...the argument, the subsequent calculation needed for
+// ...any of the trigonometric function is very little because
+// ...|alpha| < 1.01*2^(-33) and thus two terms of the
+// ...Taylor series expansion suffices.
+//
+// If i_1 = 0 then
+// poly := c + S_1 * r * r * r ...any order
+// result := r
+// Else
+// poly := -2^(-67)
+// result := 1.0
+// Endif
+//
+// If i_0 = 1, result := -result
+//
+// Last operation. Perform in user-set rounding mode
+//
+// result := (i_0 == 0? result + poly :
+// result - poly )
+//
+// Return
+//
+//
+// Step 8. Pre-reduction of large arguments.
+//
+// ...Again, the following reduction procedure was described
+// ...in the separate write up for argument reduction, which
+// ...is tightly integrated here.
+
+// N_0 := Arg * Inv_P_0
+// N_0_fix := fcvt.fx( N_0 )
+// N_0 := fcvt.xf( N_0_fix)
+
+// Arg' := Arg - N_0 * P_0
+// w := N_0 * d_1
+// N := Arg' * two_by_PI
+// N_fix := fcvt.fx( N )
+// N := fcvt.xf( N_fix )
+// N_fix := N_fix + N_inc
+//
+// s := Arg' - N * P_1
+// w := w - N * P_2
+//
+// If |s| >= 2^(-14)
+// go to Step 3
+// Else
+// go to Step 9
+// Endif
+//
+// Step 9. Case_4_reduce.
+//
+// ...first obtain N_0*d_1 and -N*P_2 accurately
+// U_hi := N_0 * d_1 V_hi := -N*P_2
+// U_lo := N_0 * d_1 - U_hi V_lo := -N*P_2 - V_hi ...FMAs
+//
+// ...compute the contribution from N_0*d_2 and -N*P_3
+// w := -N*P_3
+// w := w + N_0*d_2
+// t := U_lo + V_lo + w ...any order
+//
+// ...at this point, the mathematical value
+// ...s + U_hi + V_hi + t approximates the true reduced argument
+// ...accurately. Just need to compute this accurately.
+//
+// ...Calculate U_hi + V_hi accurately:
+// A := U_hi + V_hi
+// if |U_hi| >= |V_hi| then
+// a := (U_hi - A) + V_hi
+// else
+// a := (V_hi - A) + U_hi
+// endif
+// ...order in computing "a" must be observed. This branch is
+// ...best implemented by predicates.
+// ...A + a is U_hi + V_hi accurately. Moreover, "a" is
+// ...much smaller than A: |a| <= (1/2)ulp(A).
+//
+// ...Just need to calculate s + A + a + t
+// C_hi := s + A t := t + a
+// C_lo := (s - C_hi) + A
+// C_lo := C_lo + t
+//
+// ...Final steps for reduction
+// r := C_hi + C_lo
+// c := (C_hi - r) + C_lo
+//
+// ...At this point, we have r and c
+// ...And all we need is a couple of terms of the corresponding
+// ...Taylor series.
+//
+// If i_1 = 0
+// poly := c + r*FR_rsq*(S_1 + FR_rsq*S_2)
+// result := r
+// Else
+// poly := FR_rsq*(C_1 + FR_rsq*C_2)
+// result := 1
+// Endif
+//
+// If i_0 = 1, result := -result
+//
+// Last operation. Perform in user-set rounding mode
+//
+// result := (i_0 == 0? result + poly :
+// result - poly )
+// Return
+//
+// Large Arguments: For arguments above 2**63, a Payne-Hanek
+// style argument reduction is used and pi_by_2 reduce is called.
+//
+
+
+RODATA
+.align 16
+
+LOCAL_OBJECT_START(FSINCOS_CONSTANTS) // thresholds, pi/2 pieces and polynomial coefficients; code walks it by byte offset
+
+data4 0x4B800000 // two**24 (byte offset 0; data4 entries are read with ldfs)
+data4 0xCB800000 // -two**24
+data4 0x00000000 // pad
+data4 0x00000000 // pad
+data8 0xA2F9836E4E44152A, 0x00003FFE // Inv_pi_by_2 (byte offset 16; data8 pairs are read with ldfe)
+data8 0xC84D32B0CE81B9F1, 0x00004016 // P_0
+data8 0xC90FDAA22168C235, 0x00003FFF // P_1
+data8 0xECE675D1FC8F8CBB, 0x0000BFBD // P_2
+data8 0xB7ED8FBBACC19C60, 0x0000BF7C // P_3
+data4 0x5F000000 // two**63 (byte offset 96: initial value of GR_Table_Base1)
+data4 0xDF000000 // -two**63
+data4 0x00000000 // pad
+data4 0x00000000 // pad
+data8 0xA397E5046EC6B45A, 0x00003FE7 // Inv_P_0 (byte offset 112)
+data8 0x8D848E89DBD171A1, 0x0000BFBF // d_1
+data8 0xD5394C3618A66F8E, 0x0000BF7C // d_2
+data8 0xC90FDAA22168C234, 0x00003FFE // pi_by_4
+data8 0xC90FDAA22168C234, 0x0000BFFE // neg_pi_by_4
+data4 0x3E000000 // two**-3 (byte offset 192)
+data4 0xBE000000 // -two**-3
+data4 0x00000000 // pad
+data4 0x00000000 // pad
+data4 0x2F000000 // two**-33 (byte offset 208)
+data4 0xAF000000 // -two**-33
+data4 0x9E000000 // -two**-67
+data4 0x00000000 // pad
+data8 0xCC8ABEBCA21C0BC9, 0x00003FCE // PP_8 (byte offset 224: start of the PP group)
+data8 0xD7468A05720221DA, 0x0000BFD6 // PP_7
+data8 0xB092382F640AD517, 0x00003FDE // PP_6
+data8 0xD7322B47D1EB75A4, 0x0000BFE5 // PP_5
+data8 0xFFFFFFFFFFFFFFFE, 0x0000BFFD // C_1 (copy stored inside the PP group; same bits as the C_1 in the C group)
+data8 0xAAAA000000000000, 0x0000BFFC // PP_1_hi
+data8 0xB8EF1D2ABAF69EEA, 0x00003FEC // PP_4
+data8 0xD00D00D00D03BB69, 0x0000BFF2 // PP_3
+data8 0x8888888888888962, 0x00003FF8 // PP_2
+data8 0xAAAAAAAAAAAB0000, 0x0000BFEC // PP_1_lo
+data8 0xD56232EFC2B0FE52, 0x00003FD2 // QQ_8 (byte offset 384: start of the QQ group)
+data8 0xC9C99ABA2B48DCA6, 0x0000BFDA // QQ_7
+data8 0x8F76C6509C716658, 0x00003FE2 // QQ_6
+data8 0x93F27DBAFDA8D0FC, 0x0000BFE9 // QQ_5
+data8 0xAAAAAAAAAAAAAAAA, 0x0000BFFC // S_1 (byte offset 448; copy stored inside the QQ group, reached by the +224 adjustment of GR_Table_Base1)
+data8 0x8000000000000000, 0x0000BFFE // QQ_1
+data8 0xD00D00D00C6E5041, 0x00003FEF // QQ_4
+data8 0xB60B60B60B607F60, 0x0000BFF5 // QQ_3
+data8 0xAAAAAAAAAAAAAA9B, 0x00003FFA // QQ_2
+data8 0xFFFFFFFFFFFFFFFE, 0x0000BFFD // C_1 (byte offset 528: start of the C group)
+data8 0xAAAAAAAAAAAA719F, 0x00003FFA // C_2
+data8 0xB60B60B60356F994, 0x0000BFF5 // C_3
+data8 0xD00CFFD5B2385EA9, 0x00003FEF // C_4
+data8 0x93E4BD18292A14CD, 0x0000BFE9 // C_5
+data8 0xAAAAAAAAAAAAAAAA, 0x0000BFFC // S_1 (byte offset 608: start of the S group)
+data8 0x88888888888868DB, 0x00003FF8 // S_2
+data8 0xD00D00D0055EFD4B, 0x0000BFF2 // S_3
+data8 0xB8EF1C5D839730B9, 0x00003FEC // S_4
+data8 0xD71EA3A4E5B3F492, 0x0000BFE5 // S_5
+data4 0x38800000 // two**-14 (byte offset 688: target of "add GR_Table_Base = 688" in SINCOS_LARGER_ARG)
+data4 0xB8800000 // -two**-14
+LOCAL_OBJECT_END(FSINCOS_CONSTANTS)
+
+// sin and cos registers (symbolic names; several names deliberately share one register)
+
+// FR (floating-point registers)
+FR_Input_X = f8
+
+FR_r = f8 // same register as FR_Input_X
+FR_c = f9
+
+FR_Two_to_63 = f32
+FR_Two_to_24 = f33
+FR_Pi_by_4 = f33 // shares f33 with FR_Two_to_24
+FR_Two_to_M14 = f34
+FR_Two_to_M33 = f35
+FR_Neg_Two_to_24 = f36
+FR_Neg_Pi_by_4 = f36 // shares f36 with FR_Neg_Two_to_24
+FR_Neg_Two_to_M14 = f37
+FR_Neg_Two_to_M33 = f38
+FR_Neg_Two_to_M67 = f39
+FR_Inv_pi_by_2 = f40
+FR_N_float = f41
+FR_N_fix = f42
+FR_P_1 = f43
+FR_P_2 = f44
+FR_P_3 = f45
+FR_s = f46
+FR_w = f47
+FR_d_2 = f48
+FR_prelim = f49
+FR_Z = f50
+FR_A = f51
+FR_a = f52
+FR_t = f53
+FR_U_1 = f54
+FR_U_2 = f55
+FR_C_1 = f56
+FR_C_2 = f57
+FR_C_3 = f58
+FR_C_4 = f59
+FR_C_5 = f60
+FR_S_1 = f61
+FR_S_2 = f62
+FR_S_3 = f63
+FR_S_4 = f64
+FR_S_5 = f65
+FR_poly_hi = f66
+FR_poly_lo = f67
+FR_r_hi = f68
+FR_r_lo = f69
+FR_rsq = f70
+FR_r_cubed = f71
+FR_C_hi = f72
+FR_N_0 = f73
+FR_d_1 = f74
+FR_V = f75
+FR_V_hi = f75 // shares f75 with FR_V
+FR_V_lo = f76
+FR_U_hi = f77
+FR_U_lo = f78
+FR_U_hiabs = f79
+FR_V_hiabs = f80
+FR_PP_8 = f81 // each PP_n / QQ_n pair below shares one register
+FR_QQ_8 = f81
+FR_PP_7 = f82
+FR_QQ_7 = f82
+FR_PP_6 = f83
+FR_QQ_6 = f83
+FR_PP_5 = f84
+FR_QQ_5 = f84
+FR_PP_4 = f85
+FR_QQ_4 = f85
+FR_PP_3 = f86
+FR_QQ_3 = f86
+FR_PP_2 = f87
+FR_QQ_2 = f87
+FR_QQ_1 = f88
+FR_N_0_fix = f89
+FR_Inv_P_0 = f90
+FR_corr = f91
+FR_poly = f92
+FR_Neg_Two_to_M3 = f93
+FR_Two_to_M3 = f94
+FR_Neg_Two_to_63 = f94 // shares f94 with FR_Two_to_M3
+FR_P_0 = f95
+FR_C_lo = f96
+FR_PP_1 = f97
+FR_PP_1_lo = f98
+FR_ArgPrime = f99
+
+// GR (general registers)
+GR_Table_Base = r32
+GR_Table_Base1 = r33
+GR_i_0 = r34
+GR_i_1 = r35
+GR_N_Inc = r36
+GR_Sin_or_Cos = r37 // set to 0x0 by __libm_sin_large and 0x1 by __libm_cos_large
+
+GR_SAVE_B0 = r39 // SINCOS_CONTINUE writes this as literal r39 (mov r39 = b0)
+GR_SAVE_GP = r40 // SINCOS_CONTINUE writes this as literal r40 (mov r40 = gp)
+GR_SAVE_PFS = r41 // SINCOS_CONTINUE writes this as literal r41 (mov r41 = ar.pfs)
+
+// sincos combined routine registers (used by __libm_sincos_large, which does its own alloc)
+
+// GR (same register numbers as GR_Table_Base, GR_Table_Base1 and GR_i_0 above)
+GR_SINCOS_SAVE_PFS = r32
+GR_SINCOS_SAVE_B0 = r33
+GR_SINCOS_SAVE_GP = r34
+
+// FR (numbered above the f32-f99 range assigned by the sin/cos equates)
+FR_SINCOS_ARG = f100
+FR_SINCOS_RES_SIN = f101
+
+
+.section .text
+
+
+GLOBAL_LIBM_ENTRY(__libm_sincos_large) // in: f8 = argument; out: f8 = cos, f9 = sin
+
+{ .mfi
+ alloc GR_SINCOS_SAVE_PFS = ar.pfs,0,3,0,0 // 0 inputs, 3 locals, 0 outputs; previous ar.pfs kept in the first local
+ fma.s1 FR_SINCOS_ARG = f8, f1, f0 // Save argument for sin and cos
+ mov GR_SINCOS_SAVE_B0 = b0 // keep the return address; the br.call instructions overwrite b0
+};;
+
+{ .mfb
+ mov GR_SINCOS_SAVE_GP = gp // keep gp; it is reloaded after the second call
+ nop.f 0
+ br.call.sptk b0 = __libm_sin_large // Call sin
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_SINCOS_RES_SIN = f8, f1, f0 // Save sin result
+ nop.i 0
+};;
+
+{ .mfb
+ nop.m 0
+ fma.s1 f8 = FR_SINCOS_ARG, f1, f0 // Arg for cos
+ br.call.sptk b0 = __libm_cos_large // Call cos
+};;
+
+{ .mfi
+ mov gp = GR_SINCOS_SAVE_GP
+ fma.s1 f9 = FR_SINCOS_RES_SIN, f1, f0 // Out sin result
+ mov b0 = GR_SINCOS_SAVE_B0
+};;
+
+{ .mib
+ nop.m 0
+ mov ar.pfs = GR_SINCOS_SAVE_PFS
+ br.ret.sptk b0 // sincos_large exit
+};;
+
+GLOBAL_LIBM_END(__libm_sincos_large)
+
+
+
+GLOBAL_LIBM_ENTRY(__libm_sin_large) // sine entry: select sine, load the table address, join SINCOS_CONTINUE
+
+{ .mlx
+alloc GR_Table_Base = ar.pfs,0,12,2,0 // 12 locals, 2 outputs; the ar.pfs copy in GR_Table_Base is overwritten by the addl below
+ movl GR_Sin_or_Cos = 0x0 ;; // 0 selects sine (the cosine entry uses 0x1)
+}
+
+{ .mmi
+ nop.m 999
+ addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp // gp-relative address of the slot holding the table address
+ nop.i 999
+}
+;;
+
+{ .mmi
+ ld8 GR_Table_Base = [GR_Table_Base] // GR_Table_Base = address of FSINCOS_CONSTANTS
+ nop.m 999
+ nop.i 999
+}
+;;
+
+
+{ .mib
+ nop.m 999
+ nop.i 999
+ br.cond.sptk SINCOS_CONTINUE ;; // shared sin/cos code, located after the __libm_cos_large entry sequence
+}
+
+GLOBAL_LIBM_END(__libm_sin_large)
+GLOBAL_LIBM_ENTRY(__libm_cos_large)
+
+{ .mlx
+alloc GR_Table_Base= ar.pfs,0,12,2,0
+ movl GR_Sin_or_Cos = 0x1 ;;
+}
+
+{ .mmi
+ nop.m 999
+ addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp
+ nop.i 999
+}
+;;
+
+{ .mmi
+ ld8 GR_Table_Base = [GR_Table_Base]
+ nop.m 999
+ nop.i 999
+}
+;;
+
+//
+// Load Table Address
+//
+SINCOS_CONTINUE:
+
+{ .mmi
+ add GR_Table_Base1 = 96, GR_Table_Base
+ ldfs FR_Two_to_24 = [GR_Table_Base], 4
+ nop.i 999
+}
+;;
+
+{ .mmi
+ nop.m 999
+//
+// Load 2**24, load 2**63.
+//
+ ldfs FR_Neg_Two_to_24 = [GR_Table_Base], 12
+ mov r41 = ar.pfs ;;
+}
+
+{ .mfi
+ ldfs FR_Two_to_63 = [GR_Table_Base1], 4
+//
+// Check for unnormals - unsupported operands. We do not want
+// to generate denormal exception
+// Check for NatVals, QNaNs, SNaNs, +/-Infs
+// Check for EM unsupporteds
+// Check for Zero
+//
+ fclass.m.unc p6, p8 = FR_Input_X, 0x1E3
+ mov r40 = gp ;;
+}
+
+{ .mfi
+ nop.m 999
+ fclass.nm.unc p8, p0 = FR_Input_X, 0x1FF
+// GR_Sin_or_Cos denotes
+ mov r39 = b0
+}
+
+{ .mfb
+ ldfs FR_Neg_Two_to_63 = [GR_Table_Base1], 12
+ fclass.m.unc p10, p0 = FR_Input_X, 0x007
+(p6) br.cond.spnt SINCOS_SPECIAL ;;
+}
+
+{ .mib
+ nop.m 999
+ nop.i 999
+(p8) br.cond.spnt SINCOS_SPECIAL ;;
+}
+
+{ .mib
+ nop.m 999
+ nop.i 999
+//
+// Branch if +/- NaN, Inf.
+// Load -2**24, load -2**63.
+//
+(p10) br.cond.spnt SINCOS_ZERO ;;
+}
+
+{ .mmb
+ ldfe FR_Inv_pi_by_2 = [GR_Table_Base], 16
+ ldfe FR_Inv_P_0 = [GR_Table_Base1], 16
+ nop.b 999 ;;
+}
+
+{ .mmb
+ nop.m 999
+ ldfe FR_d_1 = [GR_Table_Base1], 16
+ nop.b 999 ;;
+}
+//
+// Raise possible denormal operand flag with useful fcmp
+// Is x <= -2**63
+// Load Inv_P_0 for pre-reduction
+// Load Inv_pi_by_2
+//
+
+{ .mmb
+ ldfe FR_P_0 = [GR_Table_Base], 16
+ ldfe FR_d_2 = [GR_Table_Base1], 16
+ nop.b 999 ;;
+}
+//
+// Load P_0
+// Load d_1
+// Is x >= 2**63
+// Is x <= -2**24?
+//
+
+{ .mmi
+ ldfe FR_P_1 = [GR_Table_Base], 16 ;;
+//
+// Load P_1
+// Load d_2
+// Is x >= 2**24?
+//
+ ldfe FR_P_2 = [GR_Table_Base], 16
+ nop.i 999 ;;
+}
+
+{ .mmf
+ nop.m 999
+ ldfe FR_P_3 = [GR_Table_Base], 16
+ fcmp.le.unc.s1 p7, p8 = FR_Input_X, FR_Neg_Two_to_24
+}
+
+{ .mfi
+ nop.m 999
+//
+// Branch if +/- zero.
+// Decide about the paths to take:
+// If -2**24 < FR_Input_X < 2**24 - CASE 1 OR 2
+// OTHERWISE - CASE 3 OR 4
+//
+ fcmp.le.unc.s1 p10, p11 = FR_Input_X, FR_Neg_Two_to_63
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p8) fcmp.ge.s1 p7, p0 = FR_Input_X, FR_Two_to_24
+ nop.i 999
+}
+
+{ .mfi
+ ldfe FR_Pi_by_4 = [GR_Table_Base1], 16
+(p11) fcmp.ge.s1 p10, p0 = FR_Input_X, FR_Two_to_63
+ nop.i 999 ;;
+}
+
+{ .mmi
+ ldfe FR_Neg_Pi_by_4 = [GR_Table_Base1], 16 ;;
+ ldfs FR_Two_to_M3 = [GR_Table_Base1], 4
+ nop.i 999 ;;
+}
+
+{ .mib
+ ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1], 12
+ nop.i 999
+//
+// Load P_2
+// Load P_3
+// Load pi_by_4
+// Load neg_pi_by_4
+// Load 2**(-3)
+// Load -2**(-3).
+//
+(p10) br.cond.spnt SINCOS_ARG_TOO_LARGE ;;
+}
+
+{ .mib
+ nop.m 999
+ nop.i 999
+//
+// Branch out if x >= 2**63. Use Payne-Hanek Reduction
+//
+(p7) br.cond.spnt SINCOS_LARGER_ARG ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// Branch if Arg <= -2**24 or Arg >= 2**24 and use pre-reduction.
+//
+ fma.s1 FR_N_float = FR_Input_X, FR_Inv_pi_by_2, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+ fcmp.lt.unc.s1 p6, p7 = FR_Input_X, FR_Pi_by_4
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// Select the case when |Arg| < pi/4
+// Else Select the case when |Arg| >= pi/4
+//
+ fcvt.fx.s1 FR_N_fix = FR_N_float
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// N = Arg * 2/pi
+// Check if Arg < pi/4
+//
+(p6) fcmp.gt.s1 p6, p7 = FR_Input_X, FR_Neg_Pi_by_4
+ nop.i 999 ;;
+}
+//
+// Case 2: Convert integer N_fix back to normalized floating-point value.
+// Case 1: p8 is only affected when p6 is set
+//
+
+{ .mfi
+(p7) ldfs FR_Two_to_M33 = [GR_Table_Base1], 4
+//
+// Grab the integer part of N and call it N_fix
+//
+(p6) fmerge.se FR_r = FR_Input_X, FR_Input_X
+// If |x| < pi/4, r = x and c = 0
+// If |x| < pi/4, is x < 2**(-3).
+// r = Arg
+// c = 0
+(p6) mov GR_N_Inc = GR_Sin_or_Cos ;;
+}
+
+{ .mmf
+ nop.m 999
+(p7) ldfs FR_Neg_Two_to_M33 = [GR_Table_Base1], 4
+(p6) fmerge.se FR_c = f0, f0
+}
+
+{ .mfi
+ nop.m 999
+(p6) fcmp.lt.unc.s1 p8, p9 = FR_Input_X, FR_Two_to_M3
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// If |x| < pi/4, is -2**(-3) < x < 2**(-3) - set p8.
+// If |x| >= pi/4,
+// Create the right N for |x| < pi/4 and otherwise
+// Case 2: Place integer part of N in GP register
+//
+(p7) fcvt.xf FR_N_float = FR_N_fix
+ nop.i 999 ;;
+}
+
+{ .mmf
+ nop.m 999
+(p7) getf.sig GR_N_Inc = FR_N_fix
+(p8) fcmp.gt.s1 p8, p0 = FR_Input_X, FR_Neg_Two_to_M3 ;;
+}
+
+{ .mib
+ nop.m 999
+ nop.i 999
+//
+// Load 2**(-33), -2**(-33)
+//
+(p8) br.cond.spnt SINCOS_SMALL_R ;;
+}
+
+{ .mib
+ nop.m 999
+ nop.i 999
+(p6) br.cond.sptk SINCOS_NORMAL_R ;;
+}
+//
+// if |x| < pi/4, branch based on |x| < 2**(-3) or otherwise.
+//
+//
+// In this branch, |x| >= pi/4.
+//
+
+{ .mfi
+ ldfs FR_Neg_Two_to_M67 = [GR_Table_Base1], 8
+//
+// Load -2**(-67)
+//
+ fnma.s1 FR_s = FR_N_float, FR_P_1, FR_Input_X
+//
+// w = N * P_2
+// s = -N * P_1 + Arg
+//
+ add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos
+}
+
+{ .mfi
+ nop.m 999
+ fma.s1 FR_w = FR_N_float, FR_P_2, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// Adjust N_fix by N_inc to determine whether sine or
+// cosine is being calculated
+//
+ fcmp.lt.unc.s1 p7, p6 = FR_s, FR_Two_to_M33
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p7) fcmp.gt.s1 p7, p6 = FR_s, FR_Neg_Two_to_M33
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+// Remember x >= pi/4.
+// Is s <= -2**(-33) or s >= 2**(-33) (p6)
+// or -2**(-33) < s < 2**(-33) (p7)
+(p6) fms.s1 FR_r = FR_s, f1, FR_w
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+(p7) fma.s1 FR_w = FR_N_float, FR_P_3, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p7) fma.s1 FR_U_1 = FR_N_float, FR_P_2, FR_w
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+(p6) fms.s1 FR_c = FR_s, f1, FR_r
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// For big s: r = s - w: No further reduction is necessary
+// For small s: w = N * P_3 (change sign) More reduction
+//
+(p6) fcmp.lt.unc.s1 p8, p9 = FR_r, FR_Two_to_M3
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p8) fcmp.gt.s1 p8, p9 = FR_r, FR_Neg_Two_to_M3
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p7) fms.s1 FR_r = FR_s, f1, FR_U_1
+ nop.i 999
+}
+
+{ .mfb
+ nop.m 999
+//
+// For big s: Is |r| < 2**(-3)?
+// For big s: c = S - r
+// For small s: U_1 = N * P_2 + w
+//
+// If p8 is set, prepare to branch to Small_R.
+// If p9 is set, prepare to branch to Normal_R.
+// For big s, r is complete here.
+//
+(p6) fms.s1 FR_c = FR_c, f1, FR_w
+//
+// For big s: c = c + w (w has not been negated.)
+// For small s: r = S - U_1
+//
+(p8) br.cond.spnt SINCOS_SMALL_R ;;
+}
+
+{ .mib
+ nop.m 999
+ nop.i 999
+(p9) br.cond.sptk SINCOS_NORMAL_R ;;
+}
+
+{ .mfi
+(p7) add GR_Table_Base1 = 224, GR_Table_Base1
+//
+// Branch to SINCOS_SMALL_R or SINCOS_NORMAL_R
+//
+(p7) fms.s1 FR_U_2 = FR_N_float, FR_P_2, FR_U_1
+//
+// c = S - U_1
+// r = S_1 * r
+//
+//
+(p7) extr.u GR_i_1 = GR_N_Inc, 0, 1
+}
+
+{ .mmi
+ nop.m 999 ;;
+//
+// Get [i_0,i_1] - two lsb of N_fix_gr.
+// Do dummy fmpy so inexact is always set.
+//
+(p7) cmp.eq.unc p9, p10 = 0x0, GR_i_1
+(p7) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;;
+}
+//
+// For small s: U_2 = N * P_2 - U_1
+// S_1 stored constant - grab the one stored with the
+// coefficients.
+//
+
+{ .mfi
+(p7) ldfe FR_S_1 = [GR_Table_Base1], 16
+//
+// Check if i_1 and i_0 != 0
+//
+(p10) fma.s1 FR_poly = f0, f1, FR_Neg_Two_to_M67
+(p7) cmp.eq.unc p11, p12 = 0x0, GR_i_0 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p7) fms.s1 FR_s = FR_s, f1, FR_r
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+//
+// S = S - r
+// U_2 = U_2 + w
+// load S_1
+//
+(p7) fma.s1 FR_rsq = FR_r, FR_r, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p7) fma.s1 FR_U_2 = FR_U_2, f1, FR_w
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+//(p7) fmerge.se FR_Input_X = FR_r, FR_r
+(p7) fmerge.se FR_prelim = FR_r, FR_r
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//(p10) fma.s1 FR_Input_X = f0, f1, f1
+(p10) fma.s1 FR_prelim = f0, f1, f1
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// FR_rsq = r * r
+// Save r as the result.
+//
+(p7) fms.s1 FR_c = FR_s, f1, FR_U_1
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// if ( i_1 ==0) poly = c + S_1*r*r*r
+// else Result = 1
+//
+//(p12) fnma.s1 FR_Input_X = FR_Input_X, f1, f0
+(p12) fnma.s1 FR_prelim = FR_prelim, f1, f0
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+(p7) fma.s1 FR_r = FR_S_1, FR_r, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p7) fma.d.s1 FR_S_1 = FR_S_1, FR_S_1, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// If i_1 != 0, poly = 2**(-67)
+//
+(p7) fms.s1 FR_c = FR_c, f1, FR_U_2
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// c = c - U_2
+//
+(p9) fma.s1 FR_poly = FR_r, FR_rsq, FR_c
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// i_0 != 0, so Result = -Result
+//
+(p11) fma.s1 FR_Input_X = FR_prelim, f1, FR_poly
+ nop.i 999 ;;
+}
+
+{ .mfb
+ nop.m 999
+(p12) fms.s1 FR_Input_X = FR_prelim, f1, FR_poly
+//
+// if (i_0 == 0), Result = Result + poly
+// else Result = Result - poly
+//
+ br.ret.sptk b0 ;;
+}
+SINCOS_LARGER_ARG:
+
+{ .mfi
+ nop.m 999
+ fma.s1 FR_N_0 = FR_Input_X, FR_Inv_P_0, f0
+ nop.i 999
+}
+;;
+
+// This path for argument > 2**24
+// Adjust table_ptr1 to beginning of table.
+//
+
+{ .mmi
+ nop.m 999
+ addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp
+ nop.i 999
+}
+;;
+
+{ .mmi
+ ld8 GR_Table_Base = [GR_Table_Base]
+ nop.m 999
+ nop.i 999
+}
+;;
+
+
+//
+// Point to 2**(-14)
+// N_0 = Arg * Inv_P_0
+//
+
+{ .mmi
+ add GR_Table_Base = 688, GR_Table_Base ;;
+ ldfs FR_Two_to_M14 = [GR_Table_Base], 4
+ nop.i 999 ;;
+}
+
+{ .mfi
+ ldfs FR_Neg_Two_to_M14 = [GR_Table_Base], 0
+ nop.f 999
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// Load values 2**(-14) and -2**(-14)
+//
+ fcvt.fx.s1 FR_N_0_fix = FR_N_0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// N_0_fix = integer part of N_0
+//
+ fcvt.xf FR_N_0 = FR_N_0_fix
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// Make N_0 the integer part
+//
+ fnma.s1 FR_ArgPrime = FR_N_0, FR_P_0, FR_Input_X
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+ fma.s1 FR_w = FR_N_0, FR_d_1, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// Arg' = -N_0 * P_0 + Arg
+// w = N_0 * d_1
+//
+ fma.s1 FR_N_float = FR_ArgPrime, FR_Inv_pi_by_2, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// N = A' * 2/pi
+//
+ fcvt.fx.s1 FR_N_fix = FR_N_float
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// N_fix is the integer part
+//
+ fcvt.xf FR_N_float = FR_N_fix
+ nop.i 999 ;;
+}
+
+{ .mfi
+ getf.sig GR_N_Inc = FR_N_fix
+ nop.f 999
+ nop.i 999 ;;
+}
+
+{ .mii
+ nop.m 999
+ nop.i 999 ;;
+ add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// N is the integer part of the reduced-reduced argument.
+// Put the integer in a GP register
+//
+ fnma.s1 FR_s = FR_N_float, FR_P_1, FR_ArgPrime
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+ fnma.s1 FR_w = FR_N_float, FR_P_2, FR_w
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// s = -N*P_1 + Arg'
+// w = -N*P_2 + w
+// N_fix_gr = N_fix_gr + N_inc
+//
+ fcmp.lt.unc.s1 p9, p8 = FR_s, FR_Two_to_M14
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p9) fcmp.gt.s1 p9, p8 = FR_s, FR_Neg_Two_to_M14
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// For |s| > 2**(-14) r = S + w (r complete)
+// Else U_hi = N_0 * d_1
+//
+(p9) fma.s1 FR_V_hi = FR_N_float, FR_P_2, f0
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+(p9) fma.s1 FR_U_hi = FR_N_0, FR_d_1, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// Either S <= -2**(-14) or S >= 2**(-14)
+// or -2**(-14) < s < 2**(-14)
+//
+(p8) fma.s1 FR_r = FR_s, f1, FR_w
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+(p9) fma.s1 FR_w = FR_N_float, FR_P_3, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// We need abs of both U_hi and V_hi - don't
+// worry about switched sign of V_hi.
+//
+(p9) fms.s1 FR_A = FR_U_hi, f1, FR_V_hi
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+//
+// Big s: finish up c = (S - r) + w (c complete)
+// Case 4: A = U_hi + V_hi
+// Note: Worry about switched sign of V_hi, so subtract instead of add.
+//
+(p9) fnma.s1 FR_V_lo = FR_N_float, FR_P_2, FR_V_hi
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p9) fms.s1 FR_U_lo = FR_N_0, FR_d_1, FR_U_hi
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p9) fmerge.s FR_V_hiabs = f0, FR_V_hi
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+// For big s: c = S - r
+// For small s do more work: U_lo = N_0 * d_1 - U_hi
+//
+(p9) fmerge.s FR_U_hiabs = f0, FR_U_hi
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// For big s: Is |r| < 2**(-3)
+// For big s: if p12 set, prepare to branch to Small_R.
+// For big s: If p13 set, prepare to branch to Normal_R.
+//
+(p8) fms.s1 FR_c = FR_s, f1, FR_r
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+//
+// For small S: V_hi = N * P_2
+// w = N * P_3
+// Note the product does not include the (-) as in the writeup
+// so (-) missing for V_hi and w.
+//
+(p8) fcmp.lt.unc.s1 p12, p13 = FR_r, FR_Two_to_M3
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p12) fcmp.gt.s1 p12, p13 = FR_r, FR_Neg_Two_to_M3
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p8) fma.s1 FR_c = FR_c, f1, FR_w
+ nop.i 999
+}
+
+{ .mfb
+ nop.m 999
+(p9) fms.s1 FR_w = FR_N_0, FR_d_2, FR_w
+(p12) br.cond.spnt SINCOS_SMALL_R ;;
+}
+
+{ .mib
+ nop.m 999
+ nop.i 999
+(p13) br.cond.sptk SINCOS_NORMAL_R ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// Big s: Vector off when |r| < 2**(-3). Recall that p8 will be true.
+// The remaining stuff is for Case 4.
+// Small s: V_lo = N * P_2 + U_hi (U_hi is in place of V_hi in writeup)
+// Note: the (-) is still missing for V_lo.
+// Small s: w = w + N_0 * d_2
+// Note: the (-) is now incorporated in w.
+//
+(p9) fcmp.ge.unc.s1 p10, p11 = FR_U_hiabs, FR_V_hiabs
+ extr.u GR_i_1 = GR_N_Inc, 0, 1 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// C_hi = S + A
+//
+(p9) fma.s1 FR_t = FR_U_lo, f1, FR_V_lo
+ extr.u GR_i_0 = GR_N_Inc, 1, 1 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// t = U_lo + V_lo
+//
+//
+(p10) fms.s1 FR_a = FR_U_hi, f1, FR_A
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p11) fma.s1 FR_a = FR_V_hi, f1, FR_A
+ nop.i 999
+}
+;;
+
+{ .mmi
+ nop.m 999
+ addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp
+ nop.i 999
+}
+;;
+
+{ .mmi
+ ld8 GR_Table_Base = [GR_Table_Base]
+ nop.m 999
+ nop.i 999
+}
+;;
+
+
+{ .mfi
+ add GR_Table_Base = 528, GR_Table_Base
+//
+// Is U_hiabs >= V_hiabs?
+//
+(p9) fma.s1 FR_C_hi = FR_s, f1, FR_A
+ nop.i 999 ;;
+}
+
+{ .mmi
+ ldfe FR_C_1 = [GR_Table_Base], 16 ;;
+ ldfe FR_C_2 = [GR_Table_Base], 64
+ nop.i 999 ;;
+}
+
+{ .mmf
+ nop.m 999
+//
+// c = c + C_lo finished.
+// Load C_2
+//
+ ldfe FR_S_1 = [GR_Table_Base], 16
+//
+// C_lo = S - C_hi
+//
+ fma.s1 FR_t = FR_t, f1, FR_w ;;
+}
+//
+// r and c have been computed.
+// Make sure ftz mode is set - should be automatic when using wre
+// |r| < 2**(-3)
+// Get [i_0,i_1] - two lsb of N_fix.
+// Load S_1
+//
+
+{ .mfi
+ ldfe FR_S_2 = [GR_Table_Base], 64
+//
+// t = t + w
+//
+(p10) fms.s1 FR_a = FR_a, f1, FR_V_hi
+ cmp.eq.unc p9, p10 = 0x0, GR_i_0
+}
+
+{ .mfi
+ nop.m 999
+//
+// For larger u than v: a = U_hi - A
+// Else a = V_hi - A (do an add to account for missing (-) on V_hi
+//
+ fms.s1 FR_C_lo = FR_s, f1, FR_C_hi
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p11) fms.s1 FR_a = FR_U_hi, f1, FR_a
+ cmp.eq.unc p11, p12 = 0x0, GR_i_1
+}
+
+{ .mfi
+ nop.m 999
+//
+// If u > v: a = (U_hi - A) + V_hi
+// Else a = (V_hi - A) + U_hi
+// In each case account for negative missing from V_hi.
+//
+ fma.s1 FR_C_lo = FR_C_lo, f1, FR_A
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// C_lo = (S - C_hi) + A
+//
+ fma.s1 FR_t = FR_t, f1, FR_a
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// t = t + a
+//
+ fma.s1 FR_C_lo = FR_C_lo, f1, FR_t
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// C_lo = C_lo + t
+// Adjust Table_Base to beginning of table
+//
+ fma.s1 FR_r = FR_C_hi, f1, FR_C_lo
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// Load S_2
+//
+ fma.s1 FR_rsq = FR_r, FR_r, f0
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+//
+// Table_Base points to C_1
+// r = C_hi + C_lo
+//
+ fms.s1 FR_c = FR_C_hi, f1, FR_r
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// if i_1 ==0: poly = S_2 * FR_rsq + S_1
+// else poly = C_2 * FR_rsq + C_1
+//
+//(p11) fma.s1 FR_Input_X = f0, f1, FR_r
+(p11) fma.s1 FR_prelim = f0, f1, FR_r
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//(p12) fma.s1 FR_Input_X = f0, f1, f1
+(p12) fma.s1 FR_prelim = f0, f1, f1
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// Compute r_cube = FR_rsq * r
+//
+(p11) fma.s1 FR_poly = FR_rsq, FR_S_2, FR_S_1
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p12) fma.s1 FR_poly = FR_rsq, FR_C_2, FR_C_1
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+//
+// Compute FR_rsq = r * r
+// Is i_1 == 0 ?
+//
+ fma.s1 FR_r_cubed = FR_rsq, FR_r, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// c = C_hi - r
+// Load C_1
+//
+ fma.s1 FR_c = FR_c, f1, FR_C_lo
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+//
+// if i_1 ==0: poly = r_cube * poly + c
+// else poly = FR_rsq * poly
+//
+//(p10) fms.s1 FR_Input_X = f0, f1, FR_Input_X
+(p10) fms.s1 FR_prelim = f0, f1, FR_prelim
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// if i_1 ==0: Result = r
+// else Result = 1.0
+//
+(p11) fma.s1 FR_poly = FR_r_cubed, FR_poly, FR_c
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p12) fma.s1 FR_poly = FR_rsq, FR_poly, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// if i_0 !=0: Result = -Result
+//
+(p9) fma.s1 FR_Input_X = FR_prelim, f1, FR_poly
+ nop.i 999 ;;
+}
+
+{ .mfb
+ nop.m 999
+(p10) fms.s1 FR_Input_X = FR_prelim, f1, FR_poly
+//
+// if i_0 == 0: Result = Result + poly
+// else Result = Result - poly
+//
+ br.ret.sptk b0 ;;
+}
+SINCOS_SMALL_R:
+
+{ .mii
+ nop.m 999
+ extr.u GR_i_1 = GR_N_Inc, 0, 1 ;;
+//
+//
+// Compare both i_1 and i_0 with 0.
+// if i_1 == 0, set p9.
+// if i_0 == 0, set p11.
+//
+ cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;;
+}
+
+{ .mfi
+ nop.m 999
+ fma.s1 FR_rsq = FR_r, FR_r, f0
+ extr.u GR_i_0 = GR_N_Inc, 1, 1 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// Z = Z * FR_rsq
+//
+(p10) fnma.s1 FR_c = FR_c, FR_r, f0
+ cmp.eq.unc p11, p12 = 0x0, GR_i_0
+}
+;;
+
+// ******************************************************************
+// ******************************************************************
+// ******************************************************************
+// r and c have been computed.
+// We know whether this is the sine or cosine routine.
+// Make sure ftz mode is set - should be automatic when using wre
+// |r| < 2**(-3)
+//
+// Set table_ptr1 to beginning of constant table.
+// Get [i_0,i_1] - two lsb of N_fix_gr.
+//
+
+{ .mmi
+ nop.m 999
+ addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp
+ nop.i 999
+}
+;;
+
+{ .mmi
+ ld8 GR_Table_Base = [GR_Table_Base]
+ nop.m 999
+ nop.i 999
+}
+;;
+
+
+//
+// Set table_ptr1 to point to S_5.
+// Set table_ptr1 to point to C_5.
+// Compute FR_rsq = r * r
+//
+
+{ .mfi
+(p9) add GR_Table_Base = 672, GR_Table_Base
+(p10) fmerge.s FR_r = f1, f1
+(p10) add GR_Table_Base = 592, GR_Table_Base ;;
+}
+//
+// Set table_ptr1 to point to S_5.
+// Set table_ptr1 to point to C_5.
+//
+
+{ .mmi
+(p9) ldfe FR_S_5 = [GR_Table_Base], -16 ;;
+//
+// if (i_1 == 0) load S_5
+// if (i_1 != 0) load C_5
+//
+(p9) ldfe FR_S_4 = [GR_Table_Base], -16
+ nop.i 999 ;;
+}
+
+{ .mmf
+(p10) ldfe FR_C_5 = [GR_Table_Base], -16
+//
+// Z = FR_rsq * FR_rsq
+//
+(p9) ldfe FR_S_3 = [GR_Table_Base], -16
+//
+// Compute FR_rsq = r * r
+// if (i_1 == 0) load S_4
+// if (i_1 != 0) load C_4
+//
+ fma.s1 FR_Z = FR_rsq, FR_rsq, f0 ;;
+}
+//
+// if (i_1 == 0) load S_3
+// if (i_1 != 0) load C_3
+//
+
+{ .mmi
+(p9) ldfe FR_S_2 = [GR_Table_Base], -16 ;;
+//
+// if (i_1 == 0) load S_2
+// if (i_1 != 0) load C_2
+//
+(p9) ldfe FR_S_1 = [GR_Table_Base], -16
+ nop.i 999
+}
+
+{ .mmi
+(p10) ldfe FR_C_4 = [GR_Table_Base], -16 ;;
+(p10) ldfe FR_C_3 = [GR_Table_Base], -16
+ nop.i 999 ;;
+}
+
+{ .mmi
+(p10) ldfe FR_C_2 = [GR_Table_Base], -16 ;;
+(p10) ldfe FR_C_1 = [GR_Table_Base], -16
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+//
+// if (i_1 != 0):
+// poly_lo = FR_rsq * C_5 + C_4
+// poly_hi = FR_rsq * C_2 + C_1
+//
+(p9) fma.s1 FR_Z = FR_Z, FR_r, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// if (i_1 == 0) load S_1
+// if (i_1 != 0) load C_1
+//
+(p9) fma.s1 FR_poly_lo = FR_rsq, FR_S_5, FR_S_4
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+//
+// c = -c * r
+// dummy fmpy's to flag inexact.
+//
+(p9) fma.d.s1 FR_S_4 = FR_S_4, FR_S_4, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// poly_lo = FR_rsq * poly_lo + C_3
+// poly_hi = FR_rsq * poly_hi
+//
+ fma.s1 FR_Z = FR_Z, FR_rsq, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p9) fma.s1 FR_poly_hi = FR_rsq, FR_S_2, FR_S_1
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+//
+// if (i_1 == 0):
+// poly_lo = FR_rsq * S_5 + S_4
+// poly_hi = FR_rsq * S_2 + S_1
+//
+(p10) fma.s1 FR_poly_lo = FR_rsq, FR_C_5, FR_C_4
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// if (i_1 == 0):
+// Z = Z * r for only one of the small r cases - not there
+// in original implementation notes.
+//
+(p9) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_S_3
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p10) fma.s1 FR_poly_hi = FR_rsq, FR_C_2, FR_C_1
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+(p10) fma.d.s1 FR_C_1 = FR_C_1, FR_C_1, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p9) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+//
+// poly_lo = FR_rsq * poly_lo + S_3
+// poly_hi = FR_rsq * poly_hi
+//
+(p10) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_C_3
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p10) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// if (i_1 == 0): dummy fmpy's to flag inexact
+// r = 1
+//
+(p9) fma.s1 FR_poly_hi = FR_r, FR_poly_hi, f0
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+//
+// poly_hi = r * poly_hi
+//
+ fma.s1 FR_poly = FR_Z, FR_poly_lo, FR_c
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p12) fms.s1 FR_r = f0, f1, FR_r
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// poly_hi = Z * poly_lo + c
+// if i_0 == 1: r = -r
+//
+ fma.s1 FR_poly = FR_poly, f1, FR_poly_hi
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p12) fms.s1 FR_Input_X = FR_r, f1, FR_poly
+ nop.i 999
+}
+
+{ .mfb
+ nop.m 999
+//
+// poly = poly + poly_hi
+//
+(p11) fma.s1 FR_Input_X = FR_r, f1, FR_poly
+//
+// if (i_0 == 0) Result = r + poly
+// if (i_0 != 0) Result = r - poly
+//
+ br.ret.sptk b0 ;;
+}
+SINCOS_NORMAL_R:
+
+{ .mii
+ nop.m 999
+ extr.u GR_i_1 = GR_N_Inc, 0, 1 ;;
+//
+// Set table_ptr1 and table_ptr2 to base address of
+// constant table.
+ cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;;
+}
+
+{ .mfi
+ nop.m 999
+ fma.s1 FR_rsq = FR_r, FR_r, f0
+ extr.u GR_i_0 = GR_N_Inc, 1, 1 ;;
+}
+
+{ .mfi
+ nop.m 999
+ frcpa.s1 FR_r_hi, p6 = f1, FR_r
+ cmp.eq.unc p11, p12 = 0x0, GR_i_0
+}
+;;
+
+// ******************************************************************
+// ******************************************************************
+// ******************************************************************
+//
+// r and c have been computed.
+// We know whether this is the sine or cosine routine.
+// Make sure ftz mode is set - should be automatic when using wre
+// Get [i_0,i_1] - two lsb of N_fix_gr alone.
+//
+
+{ .mmi
+ nop.m 999
+ addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp
+ nop.i 999
+}
+;;
+
+{ .mmi
+ ld8 GR_Table_Base = [GR_Table_Base]
+ nop.m 999
+ nop.i 999
+}
+;;
+
+
+{ .mfi
+(p10) add GR_Table_Base = 384, GR_Table_Base
+//(p12) fms.s1 FR_Input_X = f0, f1, f1
+(p12) fms.s1 FR_prelim = f0, f1, f1
+(p9) add GR_Table_Base = 224, GR_Table_Base ;;
+}
+
+{ .mmf
+ nop.m 999
+(p10) ldfe FR_QQ_8 = [GR_Table_Base], 16
+//
+// if (i_1==0) poly = poly * FR_rsq + PP_1_lo
+// else poly = FR_rsq * poly
+//
+//(p11) fma.s1 FR_Input_X = f0, f1, f1 ;;
+(p11) fma.s1 FR_prelim = f0, f1, f1 ;;
+}
+
+{ .mmf
+(p10) ldfe FR_QQ_7 = [GR_Table_Base], 16
+//
+// Adjust table pointers based on i_0
+// Compute rsq = r * r
+//
+(p9) ldfe FR_PP_8 = [GR_Table_Base], 16
+ fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 ;;
+}
+
+{ .mmf
+(p9) ldfe FR_PP_7 = [GR_Table_Base], 16
+(p10) ldfe FR_QQ_6 = [GR_Table_Base], 16
+//
+// Load PP_8 and QQ_8; PP_7 and QQ_7
+//
+ frcpa.s1 FR_r_hi, p6 = f1, FR_r_hi ;;
+}
+//
+// if (i_1==0) poly = PP_7 + FR_rsq * PP_8.
+// else poly = QQ_7 + FR_rsq * QQ_8.
+//
+
+{ .mmb
+(p9) ldfe FR_PP_6 = [GR_Table_Base], 16
+(p10) ldfe FR_QQ_5 = [GR_Table_Base], 16
+ nop.b 999 ;;
+}
+
+{ .mmb
+(p9) ldfe FR_PP_5 = [GR_Table_Base], 16
+(p10) ldfe FR_S_1 = [GR_Table_Base], 16
+ nop.b 999 ;;
+}
+
+{ .mmb
+(p10) ldfe FR_QQ_1 = [GR_Table_Base], 16
+(p9) ldfe FR_C_1 = [GR_Table_Base], 16
+ nop.b 999 ;;
+}
+
+{ .mmi
+(p10) ldfe FR_QQ_4 = [GR_Table_Base], 16 ;;
+(p9) ldfe FR_PP_1 = [GR_Table_Base], 16
+ nop.i 999 ;;
+}
+
+{ .mmf
+(p10) ldfe FR_QQ_3 = [GR_Table_Base], 16
+//
+// if (i_1=0) corr = corr + c*c
+// else corr = corr * c
+//
+(p9) ldfe FR_PP_4 = [GR_Table_Base], 16
+(p10) fma.s1 FR_poly = FR_rsq, FR_QQ_8, FR_QQ_7 ;;
+}
+//
+// if (i_1=0) poly = rsq * poly + PP_5
+// else poly = rsq * poly + QQ_5
+// Load PP_4 or QQ_4
+//
+
+{ .mmf
+(p9) ldfe FR_PP_3 = [GR_Table_Base], 16
+(p10) ldfe FR_QQ_2 = [GR_Table_Base], 16
+//
+// r_hi = frcpa(frcpa(r)).
+// r_cube = r * FR_rsq.
+//
+(p9) fma.s1 FR_poly = FR_rsq, FR_PP_8, FR_PP_7 ;;
+}
+//
+// Do dummy multiplies so inexact is always set.
+//
+
+{ .mfi
+(p9) ldfe FR_PP_2 = [GR_Table_Base], 16
+//
+// r_lo = r - r_hi
+//
+(p9) fma.s1 FR_U_lo = FR_r_hi, FR_r_hi, f0
+ nop.i 999 ;;
+}
+
+{ .mmf
+ nop.m 999
+(p9) ldfe FR_PP_1_lo = [GR_Table_Base], 16
+(p10) fma.s1 FR_corr = FR_S_1, FR_r_cubed, FR_r
+}
+
+{ .mfi
+ nop.m 999
+(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_6
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// if (i_1=0) U_lo = r_hi * r_hi
+// else U_lo = r_hi + r
+//
+(p9) fma.s1 FR_corr = FR_C_1, FR_rsq, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// if (i_1=0) corr = C_1 * rsq
+// else corr = S_1 * r_cubed + r
+//
+(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_6
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+(p10) fma.s1 FR_U_lo = FR_r_hi, f1, FR_r
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// if (i_1=0) U_hi = r_hi + U_hi
+// else U_hi = QQ_1 * U_hi + 1
+//
+(p9) fma.s1 FR_U_lo = FR_r, FR_r_hi, FR_U_lo
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+//
+// U_hi = r_hi * r_hi
+//
+ fms.s1 FR_r_lo = FR_r, f1, FR_r_hi
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// Load PP_1, PP_6, PP_5, and C_1
+// Load QQ_1, QQ_6, QQ_5, and S_1
+//
+ fma.s1 FR_U_hi = FR_r_hi, FR_r_hi, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_5
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+(p10) fnma.s1 FR_corr = FR_corr, FR_c, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// if (i_1=0) U_lo = r * r_hi + U_lo
+// else U_lo = r_lo * U_lo
+//
+(p9) fma.s1 FR_corr = FR_corr, FR_c, FR_c
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_5
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+//
+// if (i_1 =0) U_hi = r + U_hi
+// if (i_1 =0) U_lo = r_lo * U_lo
+//
+//
+(p9) fma.d.s1 FR_PP_5 = FR_PP_5, FR_PP_4, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p9) fma.s1 FR_U_lo = FR_r, FR_r, FR_U_lo
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+(p10) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// if (i_1=0) poly = poly * rsq + PP_6
+// else poly = poly * rsq + QQ_6
+//
+(p9) fma.s1 FR_U_hi = FR_r_hi, FR_U_hi, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_4
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+(p10) fma.s1 FR_U_hi = FR_QQ_1, FR_U_hi, f1
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p10) fma.d.s1 FR_QQ_5 = FR_QQ_5, FR_QQ_5, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// if (i_1!=0) U_hi = PP_1 * U_hi
+// if (i_1!=0) U_lo = r * r + U_lo
+// Load PP_3 or QQ_3
+//
+(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_4
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p9) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0
+ nop.i 999
+}
+
+{ .mfi
+ nop.m 999
+(p10) fma.s1 FR_U_lo = FR_QQ_1,FR_U_lo, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p9) fma.s1 FR_U_hi = FR_PP_1, FR_U_hi, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_3
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// Load PP_2, QQ_2
+//
+(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_3
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// if (i_1==0) poly = FR_rsq * poly + PP_3
+// else poly = FR_rsq * poly + QQ_3
+// Load PP_1_lo
+//
+(p9) fma.s1 FR_U_lo = FR_PP_1, FR_U_lo, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// if (i_1 =0) poly = poly * rsq + pp_r4
+// else poly = poly * rsq + qq_r4
+//
+(p9) fma.s1 FR_U_hi = FR_r, f1, FR_U_hi
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_2
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// if (i_1==0) U_lo = PP_1_hi * U_lo
+// else U_lo = QQ_1 * U_lo
+//
+(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_2
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// if (i_0==0) Result = 1
+// else Result = -1
+//
+ fma.s1 FR_V = FR_U_lo, f1, FR_corr
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// if (i_1==0) poly = FR_rsq * poly + PP_2
+// else poly = FR_rsq * poly + QQ_2
+//
+(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_1_lo
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// V = U_lo + corr
+//
+(p9) fma.s1 FR_poly = FR_r_cubed, FR_poly, f0
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//
+// if (i_1==0) poly = r_cube * poly
+// else poly = FR_rsq * poly
+//
+ fma.s1 FR_V = FR_poly, f1, FR_V
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+//(p12) fms.s1 FR_Input_X = FR_Input_X, FR_U_hi, FR_V
+(p12) fms.s1 FR_Input_X = FR_prelim, FR_U_hi, FR_V
+ nop.i 999
+}
+
+{ .mfb
+ nop.m 999
+//
+// V = V + poly
+//
+//(p11) fma.s1 FR_Input_X = FR_Input_X, FR_U_hi, FR_V
+(p11) fma.s1 FR_Input_X = FR_prelim, FR_U_hi, FR_V
+//
+// if (i_0==0) Result = Result * U_hi + V
+// else Result = Result * U_hi - V
+//
+ br.ret.sptk b0 ;;
+}
+
+//
+// If cosine, FR_Input_X = 1
+// If sine, FR_Input_X = +/-Zero (Input FR_Input_X)
+// Results are exact, no exceptions
+//
+SINCOS_ZERO:
+
+{ .mmb
+ cmp.eq.unc p6, p7 = 0x1, GR_Sin_or_Cos
+ nop.m 999
+ nop.b 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+(p7) fmerge.s FR_Input_X = FR_Input_X, FR_Input_X
+ nop.i 999
+}
+
+{ .mfb
+ nop.m 999
+(p6) fmerge.s FR_Input_X = f1, f1
+ br.ret.sptk b0 ;;
+}
+
+SINCOS_SPECIAL:
+
+//
+// Path for Arg = +/- QNaN, SNaN, Inf
+// Invalid can be raised. SNaNs
+// become QNaNs
+//
+
+{ .mfb
+ nop.m 999
+ fmpy.s1 FR_Input_X = FR_Input_X, f0
+ br.ret.sptk b0 ;;
+}
+GLOBAL_LIBM_END(__libm_cos_large)
+
+// *******************************************************************
+// *******************************************************************
+// *******************************************************************
+//
+// Special Code to handle very large argument case.
+// Call int __libm_pi_by_2_reduce(x,r,c) for |arguments| >= 2**63
+// The interface is custom:
+// On input:
+// (Arg or x) is in f8
+// On output:
+// r is in f8
+// c is in f9
+// N is in r8
+// Be sure to allocate at least 2 GP registers as output registers for
+// __libm_pi_by_2_reduce. This routine uses r49-50. These are used as
+// scratch registers within the __libm_pi_by_2_reduce routine (for speed).
+//
+// We know also that __libm_pi_by_2_reduce preserves f10-15, f71-127. We
+// use this to eliminate save/restore of key fp registers in this calling
+// function.
+//
+// *******************************************************************
+// *******************************************************************
+// *******************************************************************
+
+LOCAL_LIBM_ENTRY(__libm_callout_2)
+SINCOS_ARG_TOO_LARGE:
+// Out-of-line path: save ar.pfs/gp/b0, call __libm_pi_by_2_reduce, then dispatch on the size of r.
+.prologue
+// Step the table pointer back 16 bytes so the ldfs below reads FR_Two_to_M3 from it
+{ .mfi
+ adds GR_Table_Base1 = -16, GR_Table_Base1
+ nop.f 999
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+};;
+
+{ .mmi
+ ldfs FR_Two_to_M3 = [GR_Table_Base1],4 // Load single, then post-increment the pointer by 4
+ mov GR_SAVE_GP=gp // Save gp
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+
+.body
+//
+// Call argument reduction with x in f8
+// Returns with N in r8, r in f8, c in f9
+// Assumes f71-127 are preserved across the call, so no fp registers are saved here
+//
+{ .mib
+ ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1],0 // Load single, pointer left unchanged (post-increment 0)
+ nop.i 0
+ br.call.sptk b0=__libm_pi_by_2_reduce#
+};;
+
+{ .mfi
+ add GR_N_Inc = GR_Sin_or_Cos,r8 // N_Inc = N returned in r8 + GR_Sin_or_Cos
+ fcmp.lt.unc.s1 p6, p0 = FR_r, FR_Two_to_M3 // p6 = (r < FR_Two_to_M3)
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+
+{ .mfi
+ mov gp = GR_SAVE_GP // Restore gp
+(p6) fcmp.gt.unc.s1 p6, p0 = FR_r, FR_Neg_Two_to_M3 // p6 stays set only if also r > FR_Neg_Two_to_M3
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+};;
+
+{ .mbb
+ nop.m 999
+(p6) br.cond.spnt SINCOS_SMALL_R // Branch if |r| < 2**(-3) (per the FR_Two_to_M3 naming)
+ br.cond.sptk SINCOS_NORMAL_R ;; // Branch if 2**(-3) <= |r| < pi/4
+}
+
+LOCAL_LIBM_END(__libm_callout_2)
+
+.type __libm_pi_by_2_reduce#,@function
+.global __libm_pi_by_2_reduce#
+
diff --git a/sysdeps/ia64/fpu/libm_sincosf.S b/sysdeps/ia64/fpu/libm_sincosf.S
new file mode 100644
index 0000000000..c4783aca3a
--- /dev/null
+++ b/sysdeps/ia64/fpu/libm_sincosf.S
@@ -0,0 +1,744 @@
+.file "libm_sincosf.s"
+
+
+// Copyright (c) 2002 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2002 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 02/01/02 Initial version
+// 02/18/02 Large arguments processing routine is excluded.
+// External interface entry points are added
+// 02/26/02 Added temporary return of results in r8, r9
+// 03/13/02 Corrected restore of predicate registers
+// 03/19/02 Added stack unwind around call to __libm_cisf_large
+// 09/05/02 Work range is widened by reduction strengthen (2 parts of Pi/16)
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+
+// API
+//==============================================================
+// 1) float _Complex cisf(float)
+// 2) void sincosf(float, float*s, float*c)
+// 3) __libm_sincosf - internal LIBM function, that accepts
+// argument in f8 and returns cosine through f8, sine through f9
+
+//
+// Overview of operation
+//==============================================================
+//
+// Step 1
+// ======
+// Reduce x to region -1/2*pi/2^k ===== 0 ===== +1/2*pi/2^k where k=4
+// divide x by pi/2^k.
+// Multiply by 2^k/pi.
+// nfloat = Round result to integer (round-to-nearest)
+//
+// r = x - nfloat * pi/2^k
+// Do this as (x - nfloat * HIGH(pi/2^k)) - nfloat * LOW(pi/2^k) for increased accuracy.
+// pi/2^k is stored as two numbers that when added make pi/2^k.
+// pi/2^k = HIGH(pi/2^k) + LOW(pi/2^k)
+// HIGH part is rounded to zero, LOW - to nearest
+//
+// x = (nfloat * pi/2^k) + r
+// r is small enough that we can use a polynomial approximation
+// and is referred to as the reduced argument.
+//
+// Step 3
+// ======
+// Take the unreduced part and remove the multiples of 2pi.
+// So nfloat = nfloat (with lower k+1 bits cleared) + lower k+1 bits
+//
+// nfloat (with lower k+1 bits cleared) is a multiple of 2^(k+1)
+// N * 2^(k+1)
+// nfloat * pi/2^k = N * 2^(k+1) * pi/2^k + (lower k+1 bits) * pi/2^k
+// nfloat * pi/2^k = N * 2 * pi + (lower k+1 bits) * pi/2^k
+// nfloat * pi/2^k = N2pi + M * pi/2^k
+//
+//
+// Sin(x) = Sin((nfloat * pi/2^k) + r)
+// = Sin(nfloat * pi/2^k) * Cos(r) + Cos(nfloat * pi/2^k) * Sin(r)
+//
+// Sin(nfloat * pi/2^k) = Sin(N2pi + Mpi/2^k)
+// = Sin(N2pi)Cos(Mpi/2^k) + Cos(N2pi)Sin(Mpi/2^k)
+// = Sin(Mpi/2^k)
+//
+// Cos(nfloat * pi/2^k) = Cos(N2pi + Mpi/2^k)
+// = Cos(N2pi)Cos(Mpi/2^k) - Sin(N2pi)Sin(Mpi/2^k)
+// = Cos(Mpi/2^k)
+//
+// Sin(x) = Sin(Mpi/2^k) Cos(r) + Cos(Mpi/2^k) Sin(r)
+//
+//
+// Step 4
+// ======
+// 0 <= M < 2^(k+1)
+// There are 2^(k+1) Sin entries in a table.
+// There are 2^(k+1) Cos entries in a table.
+//
+// Get Sin(Mpi/2^k) and Cos(Mpi/2^k) by table lookup.
+//
+//
+// Step 5
+// ======
+// Calculate Cos(r) and Sin(r) by polynomial approximation.
+//
+// Cos(r) = 1 + r^2 q1 + r^4 q2 = Series for Cos
+// Sin(r) = r + r^3 p1 + r^5 p2 = Series for Sin
+//
+// and the coefficients q1, q2 and p1, p2 are stored in a table
+//
+//
+// Calculate
+// Sin(x) = Sin(Mpi/2^k) Cos(r) + Cos(Mpi/2^k) Sin(r)
+//
+// as follows
+//
+// S[m] = Sin(Mpi/2^k) and C[m] = Cos(Mpi/2^k)
+// rsq = r*r
+//
+//
+// P = p1 + r^2p2
+// Q = q1 + r^2q2
+//
+// rcub = r * rsq
+// Sin(r) = r + rcub * P
+// = r + r^3p1 + r^5p2 = Sin(r)
+//
+// P = r + rcub * P
+//
+// Answer = S[m] Cos(r) + C[m] P
+//
+// Cos(r) = 1 + rsq Q
+// Cos(r) = 1 + r^2 Q
+// Cos(r) = 1 + r^2 (q1 + r^2q2)
+// Cos(r) = 1 + r^2q1 + r^4q2
+//
+// S[m] Cos(r) = S[m](1 + rsq Q)
+// S[m] Cos(r) = S[m] + S[m] rsq Q
+// S[m] Cos(r) = S[m] + s_rsq Q
+// Q = S[m] + s_rsq Q
+//
+// Then,
+//
+// Answer = Q + C[m] P
+
+
+// Registers used
+//==============================================================
+// general input registers:
+// r14 -> r19
+// r32 -> r52
+
+// predicate registers used:
+// p6 -> p14
+
+// floating-point registers used
+// f9 -> f15
+// f32 -> f100
+
+// Assembly macros
+//==============================================================
+
+cisf_Arg = f8 // input argument (API note above: argument arrives in f8)
+
+cisf_Sin_res = f9 // sine result (API note above: sine returned through f9)
+cisf_Cos_res = f8 // cosine result; shares f8 with cisf_Arg
+
+
+cisf_NORM_f8 = f10
+cisf_W = f11
+cisf_int_Nfloat = f12
+cisf_Nfloat = f13 // presumably "nfloat" of Step 1 in the overview - confirm against the code
+
+cisf_r = f14 // presumably the reduced argument r of Step 1 - confirm against the code
+cisf_r_exact = f68
+cisf_rsq = f15 // overview: rsq = r*r
+cisf_rcub = f32 // overview: rcub = r * rsq
+
+cisf_Inv_Pi_by_16 = f33
+cisf_Pi_by_16_hi = f34 // NOTE(review): likely holds the 1st entry of double_cisf_pi - confirm at the load site
+cisf_Pi_by_16_lo = f35 // NOTE(review): likely holds the 2nd entry of double_cisf_pi - confirm at the load site
+
+cisf_Inv_Pi_by_64 = f36
+cisf_Pi_by_64_hi = f37
+cisf_Pi_by_64_lo = f38
+
+
+cisf_P1 = f39 // P1/P2/Q1/Q2 have matching entries in double_cisf_pq_k4
+cisf_Q1 = f40
+cisf_P2 = f41
+cisf_Q2 = f42
+cisf_P3 = f43 // P3/Q3/P4/Q4 have no entry in double_cisf_pq_k4
+cisf_Q3 = f44
+cisf_P4 = f45
+cisf_Q4 = f46
+
+cisf_P_temp1 = f47
+cisf_P_temp2 = f48
+
+cisf_Q_temp1 = f49
+cisf_Q_temp2 = f50
+
+cisf_P = f51 // overview: P = r + rcub * P
+
+cisf_SIG_INV_PI_BY_16_2TO61 = f52
+cisf_RSHF_2TO61 = f53
+cisf_RSHF = f54
+cisf_2TOM61 = f55
+cisf_NFLOAT = f56
+cisf_W_2TO61_RSH = f57
+
+cisf_tmp = f58
+
+cisf_Sm_sin = f59 // overview: S[m] = Sin(Mpi/2^k), C[m] = Cos(Mpi/2^k); one S/C pair per result
+cisf_Cm_sin = f60
+
+cisf_Sm_cos = f61
+cisf_Cm_cos = f62
+
+cisf_srsq_sin = f63 // overview: s_rsq = S[m] * rsq
+cisf_srsq_cos = f64
+
+cisf_Q_sin = f65 // overview: Q = S[m] + s_rsq Q
+cisf_Q_cos = f66
+cisf_Q = f67
+
+/////////////////////////////////////////////////////////////
+
+cisf_pResSin = r33 // presumably the float *s argument of sincosf - confirm against the entry code
+cisf_pResCos = r34 // presumably the float *c argument of sincosf - confirm against the entry code
+
+cisf_exp_limit = r35
+cisf_r_signexp = r36
+cisf_AD_beta_table = r37
+cisf_r_sincos = r38
+
+cisf_r_exp = r39
+cisf_r_17_ones = r40
+
+cisf_GR_sig_inv_pi_by_16 = r14
+cisf_GR_rshf_2to61 = r15
+cisf_GR_rshf = r16
+cisf_GR_exp_2tom61 = r17
+cisf_GR_n = r18
+
+cisf_GR_n_sin = r19
+cisf_GR_m_sin = r41 // shares r41 with cisf_GR_32m_sin
+cisf_GR_32m_sin = r41
+
+cisf_GR_n_cos = r42
+cisf_GR_m_cos = r43 // shares r43 with cisf_GR_32m_cos
+cisf_GR_32m_cos = r43
+
+cisf_AD_2_sin = r44
+cisf_AD_2_cos = r45
+
+cisf_gr_tmp = r46
+GR_SAVE_B0 = r47 // save slots for b0, gp, ar.pfs and the predicates, per their names
+GR_SAVE_GP = r48
+rB0_SAVED = r49
+GR_SAVE_PFS = r50
+GR_SAVE_PR = r51
+cisf_AD_1 = r52
+
+RODATA
+
+.align 16
+// pi/16 stored as two parts that sum to pi/16 (HIGH + LOW, see Step 1 of the overview)
+LOCAL_OBJECT_START(double_cisf_pi)
+ data8 0xC90FDAA22168C234, 0x00003FFC // pi/16 1st (high) part; each entry is two data8 words = 16 bytes
+ data8 0xC4C6628B80DC1CD1, 0x00003FBC // pi/16 2nd (low) part
+LOCAL_OBJECT_END(double_cisf_pi)
+
+// Coefficients for Sin(r) = r + r^3 p1 + r^5 p2 and Cos(r) = 1 + r^2 q1 + r^4 q2 (8 bytes each)
+LOCAL_OBJECT_START(double_cisf_pq_k4)
+ data8 0x3F810FABB668E9A2 // P2 - read as an IEEE double, close to 1/120
+ data8 0x3FA552E3D6DE75C9 // Q2 - read as an IEEE double, close to 1/24
+ data8 0xBFC555554447BC7F // P1 - read as an IEEE double, close to -1/6
+ data8 0xBFDFFFFFC447610A // Q1 - read as an IEEE double, close to -1/2
+LOCAL_OBJECT_END(double_cisf_pq_k4)
+
+// Sincos table (S[m], C[m])
+LOCAL_OBJECT_START(double_sin_cos_beta_k4)
+ data8 0x0000000000000000 // sin ( 0 Pi / 16 )
+ data8 0x3FF0000000000000 // cos ( 0 Pi / 16 )
+//
+ data8 0x3FC8F8B83C69A60B // sin ( 1 Pi / 16 )
+ data8 0x3FEF6297CFF75CB0 // cos ( 1 Pi / 16 )
+//
+ data8 0x3FD87DE2A6AEA963 // sin ( 2 Pi / 16 )
+ data8 0x3FED906BCF328D46 // cos ( 2 Pi / 16 )
+//
+ data8 0x3FE1C73B39AE68C8 // sin ( 3 Pi / 16 )
+ data8 0x3FEA9B66290EA1A3 // cos ( 3 Pi / 16 )
+//
+ data8 0x3FE6A09E667F3BCD // sin ( 4 Pi / 16 )
+ data8 0x3FE6A09E667F3BCD // cos ( 4 Pi / 16 )
+//
+ data8 0x3FEA9B66290EA1A3 // sin ( 5 Pi / 16 )
+ data8 0x3FE1C73B39AE68C8 // cos ( 5 Pi / 16 )
+//
+ data8 0x3FED906BCF328D46 // sin ( 6 Pi / 16 )
+ data8 0x3FD87DE2A6AEA963 // cos ( 6 Pi / 16 )
+//
+ data8 0x3FEF6297CFF75CB0 // sin ( 7 Pi / 16 )
+ data8 0x3FC8F8B83C69A60B // cos ( 7 Pi / 16 )
+//
+ data8 0x3FF0000000000000 // sin ( 8 Pi / 16 )
+ data8 0x0000000000000000 // cos ( 8 Pi / 16 )
+//
+ data8 0x3FEF6297CFF75CB0 // sin ( 9 Pi / 16 )
+ data8 0xBFC8F8B83C69A60B // cos ( 9 Pi / 16 )
+//
+ data8 0x3FED906BCF328D46 // sin ( 10 Pi / 16 )
+ data8 0xBFD87DE2A6AEA963 // cos ( 10 Pi / 16 )
+//
+ data8 0x3FEA9B66290EA1A3 // sin ( 11 Pi / 16 )
+ data8 0xBFE1C73B39AE68C8 // cos ( 11 Pi / 16 )
+//
+ data8 0x3FE6A09E667F3BCD // sin ( 12 Pi / 16 )
+ data8 0xBFE6A09E667F3BCD // cos ( 12 Pi / 16 )
+//
+ data8 0x3FE1C73B39AE68C8 // sin ( 13 Pi / 16 )
+ data8 0xBFEA9B66290EA1A3 // cos ( 13 Pi / 16 )
+//
+ data8 0x3FD87DE2A6AEA963 // sin ( 14 Pi / 16 )
+ data8 0xBFED906BCF328D46 // cos ( 14 Pi / 16 )
+//
+ data8 0x3FC8F8B83C69A60B // sin ( 15 Pi / 16 )
+ data8 0xBFEF6297CFF75CB0 // cos ( 15 Pi / 16 )
+//
+ data8 0x0000000000000000 // sin ( 16 Pi / 16 )
+ data8 0xBFF0000000000000 // cos ( 16 Pi / 16 )
+//
+ data8 0xBFC8F8B83C69A60B // sin ( 17 Pi / 16 )
+ data8 0xBFEF6297CFF75CB0 // cos ( 17 Pi / 16 )
+//
+ data8 0xBFD87DE2A6AEA963 // sin ( 18 Pi / 16 )
+ data8 0xBFED906BCF328D46 // cos ( 18 Pi / 16 )
+//
+ data8 0xBFE1C73B39AE68C8 // sin ( 19 Pi / 16 )
+ data8 0xBFEA9B66290EA1A3 // cos ( 19 Pi / 16 )
+//
+ data8 0xBFE6A09E667F3BCD // sin ( 20 Pi / 16 )
+ data8 0xBFE6A09E667F3BCD // cos ( 20 Pi / 16 )
+//
+ data8 0xBFEA9B66290EA1A3 // sin ( 21 Pi / 16 )
+ data8 0xBFE1C73B39AE68C8 // cos ( 21 Pi / 16 )
+//
+ data8 0xBFED906BCF328D46 // sin ( 22 Pi / 16 )
+ data8 0xBFD87DE2A6AEA963 // cos ( 22 Pi / 16 )
+//
+ data8 0xBFEF6297CFF75CB0 // sin ( 23 Pi / 16 )
+ data8 0xBFC8F8B83C69A60B // cos ( 23 Pi / 16 )
+//
+ data8 0xBFF0000000000000 // sin ( 24 Pi / 16 )
+ data8 0x0000000000000000 // cos ( 24 Pi / 16 )
+//
+ data8 0xBFEF6297CFF75CB0 // sin ( 25 Pi / 16 )
+ data8 0x3FC8F8B83C69A60B // cos ( 25 Pi / 16 )
+//
+ data8 0xBFED906BCF328D46 // sin ( 26 Pi / 16 )
+ data8 0x3FD87DE2A6AEA963 // cos ( 26 Pi / 16 )
+//
+ data8 0xBFEA9B66290EA1A3 // sin ( 27 Pi / 16 )
+ data8 0x3FE1C73B39AE68C8 // cos ( 27 Pi / 16 )
+//
+ data8 0xBFE6A09E667F3BCD // sin ( 28 Pi / 16 )
+ data8 0x3FE6A09E667F3BCD // cos ( 28 Pi / 16 )
+//
+ data8 0xBFE1C73B39AE68C8 // sin ( 29 Pi / 16 )
+ data8 0x3FEA9B66290EA1A3 // cos ( 29 Pi / 16 )
+//
+ data8 0xBFD87DE2A6AEA963 // sin ( 30 Pi / 16 )
+ data8 0x3FED906BCF328D46 // cos ( 30 Pi / 16 )
+//
+ data8 0xBFC8F8B83C69A60B // sin ( 31 Pi / 16 )
+ data8 0x3FEF6297CFF75CB0 // cos ( 31 Pi / 16 )
+//
+ data8 0x0000000000000000 // sin ( 32 Pi / 16 )
+ data8 0x3FF0000000000000 // cos ( 32 Pi / 16 )
+LOCAL_OBJECT_END(double_sin_cos_beta_k4)
+
+.section .text
+
+GLOBAL_IEEE754_ENTRY(sincosf)
+// cisf_GR_sig_inv_pi_by_16 = significand of 16/pi
+{ .mlx
+ alloc GR_SAVE_PFS = ar.pfs, 0, 21, 0, 0
+ movl cisf_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A // 16/pi signd
+
+}
+// cisf_GR_rshf_2to61 = 1.1000 2^(63+63-2)
+{ .mlx
+ addl cisf_AD_1 = @ltoff(double_cisf_pi), gp
+ movl cisf_GR_rshf_2to61 = 0x47b8000000000000 // 1.1 2^(63+63-2)
+};;
+
+{ .mfi
+ ld8 cisf_AD_1 = [cisf_AD_1]
+ fnorm.s1 cisf_NORM_f8 = cisf_Arg
+ cmp.eq p13, p14 = r0, r0 // p13 set, p14 clear for sincos: results go out via stfs
+}
+// cisf_GR_exp_2tom61 = exponent of scaling factor 2^-61
+{ .mib
+ mov cisf_GR_exp_2tom61 = 0xffff-61
+ nop.i 0
+ br.cond.sptk _CISF_COMMON // join the code shared with __libm_sincosf
+};;
+GLOBAL_IEEE754_END(sincosf)
+LOCAL_LIBM_ENTRY(cisf)
+LOCAL_LIBM_END(cisf)
+GLOBAL_LIBM_ENTRY(__libm_sincosf)
+{ .mlx
+// cisf_GR_sig_inv_pi_by_16 = significand of 16/pi
+ alloc GR_SAVE_PFS = ar.pfs,0,21,0,0
+ movl cisf_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A
+}
+// cisf_GR_rshf_2to61 = 1.1000 2^(63+63-2)
+{ .mlx
+ addl cisf_AD_1 = @ltoff(double_cisf_pi), gp
+ movl cisf_GR_rshf_2to61 = 0x47b8000000000000
+};;
+
+// p14 set (p13 clear) for __libm_sincosf and cisf
+{ .mfi
+ ld8 cisf_AD_1 = [cisf_AD_1]
+ fnorm.s1 cisf_NORM_f8 = cisf_Arg
+ cmp.eq p14, p13 = r0, r0
+}
+// cisf_GR_exp_2tom61 = exponent of scaling factor 2^-61
+{ .mib
+ mov cisf_GR_exp_2tom61 = 0xffff-61
+ nop.i 0
+ nop.b 0
+};;
+
+
+_CISF_COMMON:
+// Form two constants we need
+// 16/pi * 2^-2 * 2^63, scaled by 2^61 since we just loaded the significand
+// 1.1000...000 * 2^(63+63-2) to right shift int(W) into the low significand
+// fcmp used to set denormal, and invalid on snans (NOTE(review): no fcmp is visible in this path - confirm)
+{ .mfi
+ setf.sig cisf_SIG_INV_PI_BY_16_2TO61 = cisf_GR_sig_inv_pi_by_16
+ fclass.m p6,p0 = cisf_Arg, 0xe7//p6 set if x=0,inf,nan
+ addl cisf_gr_tmp = -1, r0
+}
+// cisf_GR_rshf = 1.1000 2^63 for right shift
+{ .mlx
+ setf.d cisf_RSHF_2TO61 = cisf_GR_rshf_2to61
+ movl cisf_GR_rshf = 0x43e8000000000000
+};;
+
+// Form another constant
+// 2^-61 for scaling Nfloat
+// 0x10017 is register_bias (0xffff) + 24.
+// So if |f8| >= 2^24, go to large args routine
+{ .mmi
+ getf.exp cisf_r_signexp = cisf_Arg
+ setf.exp cisf_2TOM61 = cisf_GR_exp_2tom61
+ mov cisf_exp_limit = 0x10017
+};;
+
+// Load the two pieces of pi/16
+// Form another constant
+// 1.1000...000 * 2^63, the right shift constant
+{ .mmb
+ ldfe cisf_Pi_by_16_hi = [cisf_AD_1],16
+ setf.d cisf_RSHF = cisf_GR_rshf
+(p6) br.cond.spnt _CISF_SPECIAL_ARGS
+};;
+
+{ .mmi
+ ldfe cisf_Pi_by_16_lo = [cisf_AD_1],16
+ setf.sig cisf_tmp = cisf_gr_tmp //constant for inexact set
+ nop.i 0
+};;
+
+// Start loading P, Q coefficients
+{ .mmi
+ ldfpd cisf_P2,cisf_Q2 = [cisf_AD_1],16
+ nop.m 0
+ dep.z cisf_r_exp = cisf_r_signexp, 0, 17
+};;
+
+// p10 is true if we must call routines to handle larger arguments
+// p10 is true if f8 exp is >= 0x10017
+{ .mmb
+ ldfpd cisf_P1,cisf_Q1 = [cisf_AD_1], 16
+ cmp.ge p10, p0 = cisf_r_exp, cisf_exp_limit
+(p10) br.cond.spnt _CISF_LARGE_ARGS // go to |x| >= 2^24 path
+};;
+
+// cisf_W = x * cisf_Inv_Pi_by_16
+// Multiply x by scaled 16/pi and add large const to shift integer part of W to
+// rightmost bits of significand
+{ .mfi
+ nop.m 0
+ fma.s1 cisf_W_2TO61_RSH = cisf_NORM_f8,cisf_SIG_INV_PI_BY_16_2TO61,cisf_RSHF_2TO61
+ nop.i 0
+};;
+
+// cisf_NFLOAT = Round_Int_Nearest(cisf_W)
+{ .mfi
+ nop.m 0
+ fms.s1 cisf_NFLOAT = cisf_W_2TO61_RSH,cisf_2TOM61,cisf_RSHF
+ nop.i 0
+};;
+
+// N = integer part of W, read from the significand of cisf_W_2TO61_RSH
+{ .mfi
+ getf.sig cisf_GR_n = cisf_W_2TO61_RSH
+ nop.f 0
+ nop.i 0
+};;
+
+// Add 2^(k-1) = 8 (k = 4) to N to form the cosine index
+// cisf_r = -cisf_Nfloat * cisf_Pi_by_16_hi + x
+// cisf_r = cisf_r -cisf_Nfloat * cisf_Pi_by_16_lo
+{ .mfi
+ add cisf_GR_n_cos = 0x8, cisf_GR_n
+ fnma.s1 cisf_r = cisf_NFLOAT, cisf_Pi_by_16_hi, cisf_NORM_f8
+ nop.i 0
+};;
+
+//Get M (least k+1 = 5 bits of N) for sine and for cosine
+{ .mmi
+ and cisf_GR_m_sin = 0x1f,cisf_GR_n
+ and cisf_GR_m_cos = 0x1f,cisf_GR_n_cos
+ nop.i 0
+};;
+
+{ .mmi
+ shladd cisf_AD_2_cos = cisf_GR_m_cos,4, cisf_AD_1
+ shladd cisf_AD_2_sin = cisf_GR_m_sin,4, cisf_AD_1
+ nop.i 0
+};;
+
+// p10 = den. input, used below to set uflow
+{ .mmf
+ ldfpd cisf_Sm_sin, cisf_Cm_sin = [cisf_AD_2_sin]
+ ldfpd cisf_Sm_cos, cisf_Cm_cos = [cisf_AD_2_cos]
+ fclass.m.unc p10,p0 = cisf_Arg,0x0b
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 cisf_rsq = cisf_r, cisf_r, f0 // get r^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s0 cisf_tmp = cisf_tmp,cisf_tmp // inexact flag
+ nop.i 0
+};;
+
+{ .mmf
+ nop.m 0
+ nop.m 0
+ fnma.s1 cisf_r_exact = cisf_NFLOAT, cisf_Pi_by_16_lo, cisf_r
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 cisf_P = cisf_rsq, cisf_P2, cisf_P1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 cisf_Q = cisf_rsq, cisf_Q2, cisf_Q1
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fmpy.s1 cisf_rcub = cisf_r_exact, cisf_rsq // get r^3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fmpy.s1 cisf_srsq_sin = cisf_Sm_sin,cisf_rsq
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 cisf_srsq_cos = cisf_Sm_cos,cisf_rsq
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 cisf_P = cisf_rcub,cisf_P,cisf_r_exact
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 cisf_Q_sin = cisf_srsq_sin,cisf_Q, cisf_Sm_sin
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 cisf_Q_cos = cisf_srsq_cos,cisf_Q, cisf_Sm_cos
+ nop.i 0
+};;
+
+// If den. arg, force underflow to be set
+{ .mfi
+ nop.m 0
+(p10) fmpy.s.s0 cisf_tmp = cisf_Arg,cisf_Arg
+ nop.i 0
+};;
+
+//Final sin = Cm_sin * P + Q_sin
+{ .mfi
+ nop.m 0
+ fma.s.s0 cisf_Sin_res = cisf_Cm_sin, cisf_P, cisf_Q_sin
+ nop.i 0
+}
+//Final cos = Cm_cos * P + Q_cos
+{ .mfb
+ nop.m 0
+ fma.s.s0 cisf_Cos_res = cisf_Cm_cos, cisf_P, cisf_Q_cos
+(p14) br.cond.sptk _CISF_RETURN //com. exit for __libm_sincosf and cisf main path
+};;
+
+{ .mmb
+ stfs [cisf_pResSin] = cisf_Sin_res
+ stfs [cisf_pResCos] = cisf_Cos_res
+ br.ret.sptk b0 // common exit for sincosf main path
+};;
+
+_CISF_SPECIAL_ARGS:
+// sinf(+/-0) = +/-0
+// sinf(Inf) = NaN
+// sinf(NaN) = NaN
+{ .mfi
+ nop.m 999
+ fma.s.s0 cisf_Sin_res = cisf_Arg, f0, f0 // sinf(+/-0,NaN,Inf) = x*0 + 0
+ nop.i 999
+};;
+
+// cosf(+/-0) = 1.0
+// cosf(Inf) = NaN
+// cosf(NaN) = NaN
+{ .mfb
+ nop.m 999
+ fma.s.s0 cisf_Cos_res = cisf_Arg, f0, f1 // cosf(+/-0,NaN,Inf) = x*0 + 1
+(p14) br.cond.sptk _CISF_RETURN //spec exit for __libm_sincosf and cisf
+};;
+
+{ .mmb
+ stfs [cisf_pResSin] = cisf_Sin_res
+ stfs [cisf_pResCos] = cisf_Cos_res
+ br.ret.sptk b0 // special exit for sincosf
+};;
+
+ // exit for __libm_sincosf and cisf (every branch here is predicated on p14)
+ // NOTE! r8 and r9 are used only because of a compiler issue
+ // connected with passing floating-point complex function arguments.
+ // After that issue is fixed these operations can be deleted
+_CISF_RETURN:
+{ .mmb
+ getf.s r8 = cisf_Cos_res
+ getf.s r9 = cisf_Sin_res
+ br.ret.sptk b0 // exit for __libm_sincosf and cisf
+};;
+GLOBAL_LIBM_END(__libm_sincosf)
+//// |x| >= 2^24 path ///////
+.proc _CISF_LARGE_ARGS
+_CISF_LARGE_ARGS:
+.prologue
+{ .mfi
+ nop.m 0
+ nop.f 0
+.save ar.pfs, GR_SAVE_PFS
+ mov GR_SAVE_PFS = ar.pfs
+};;
+
+{ .mfi
+ mov GR_SAVE_GP = gp
+ nop.f 0
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0 = b0
+};;
+
+.body
+// Call the huge-argument routine __libm_sincos_large
+{ .mib
+ nop.m 0
+ mov GR_SAVE_PR = pr // keep predicates: p14 is tested again after the call
+ br.call.sptk b0 = __libm_sincos_large
+};;
+
+{ .mfi
+ mov gp = GR_SAVE_GP
+ nop.f 0
+ mov pr = GR_SAVE_PR, 0x1fffe
+}
+;;
+
+{ .mfi
+ nop.m 0
+ nop.f 0
+ mov b0 = GR_SAVE_B0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s.s0 cisf_Cos_res = cisf_Cos_res, f1, f0
+ mov ar.pfs = GR_SAVE_PFS
+}
+// exit for |x| >= 2^24 path (__libm_sincosf and cisf)
+{ .mfb
+ nop.m 0
+ fma.s.s0 cisf_Sin_res = cisf_Sin_res, f1, f0
+(p14) br.cond.sptk _CISF_RETURN
+};;
+
+{ .mmb
+ stfs [cisf_pResSin] = cisf_Sin_res
+ stfs [cisf_pResCos] = cisf_Cos_res
+ br.ret.sptk b0 // exit for sincosf |x| >= 2^24 path
+};;
+
+.endp _CISF_LARGE_ARGS
+
+.type __libm_sincos_large#,@function
+.global __libm_sincos_large#
+
diff --git a/sysdeps/ia64/fpu/libm_sincosl.S b/sysdeps/ia64/fpu/libm_sincosl.S
new file mode 100644
index 0000000000..2a03a23e7e
--- /dev/null
+++ b/sysdeps/ia64/fpu/libm_sincosl.S
@@ -0,0 +1,2527 @@
+.file "libm_sincosl.asm"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//*********************************************************************
+//
+// History:
+// 05/13/02 Initial version of sincosl (based on libm's sinl and cosl)
+// 02/10/03 Reordered header: .section, .global, .proc, .align;
+// used data8 for long double table values
+//
+//*********************************************************************
+//
+// Function: Combined sincosl routine with 3 different API's
+//
+// API's
+//==============================================================
+// 1) long double _Complex cisl(long double)
+// 2) void sincosl(long double, long double*s, long double*c)
+// 3) __libm_sincosl - internal LIBM function, that accepts
+// argument in f8 and returns cosine through f8, sine through f9
+//
+//
+//*********************************************************************
+//
+// Resources Used:
+//
+// Floating-Point Registers: f8 (Input x and cosl return value),
+// f9 (sinl returned)
+// f32-f121
+//
+// General Purpose Registers:
+// r32-r47
+//
+// Predicate Registers: p6-p15
+//
+//*********************************************************************
+//
+// IEEE Special Conditions:
+//
+// Denormal fault raised on denormal inputs
+// Overflow exceptions do not occur
+// Underflow exceptions raised when appropriate for sincosl
+// (No specialized error handling for this routine)
+// Inexact raised when appropriate by algorithm
+//
+// sincosl(SNaN) = QNaN, QNaN
+// sincosl(QNaN) = QNaN, QNaN
+// sincosl(inf) = QNaN, QNaN
+// sincosl(+/-0) = +/-0, 1
+//
+//*********************************************************************
+//
+// Mathematical Description
+// ========================
+//
+// The computation of FSIN and FCOS is performed in parallel.
+//
+// Arg = N pi/2 + alpha, |alpha| <= pi/4.
+//
+// cosl( Arg ) = sinl( (N+1) pi/2 + alpha ),
+//
+// therefore, the code for computing sine will produce cosine as long
+// as 1 is added to N immediately after the argument reduction
+// process.
+//
+// Let M = N if sine
+// N+1 if cosine.
+//
+// Now, given
+//
+// Arg = M pi/2 + alpha, |alpha| <= pi/4,
+//
+// let I = M mod 4, or I be the two lsb of M when M is represented
+// as 2's complement. I = [i_0 i_1]. Then
+//
+// sinl( Arg ) = (-1)^i_0 sinl( alpha ) if i_1 = 0,
+// = (-1)^i_0 cosl( alpha ) if i_1 = 1.
+//
+// For example:
+// if M = -1, I = 11
+// sin (-pi/2 + alpha) = (-1) cos (alpha)
+// if M = 0, I = 00
+// sin (alpha) = sin (alpha)
+// if M = 1, I = 01
+// sin (pi/2 + alpha) = cos (alpha)
+// if M = 2, I = 10
+// sin (pi + alpha) = (-1) sin (alpha)
+// if M = 3, I = 11
+// sin ((3/2)pi + alpha) = (-1) cos (alpha)
+//
+// The value of alpha is obtained by argument reduction and
+// represented by two working precision numbers r and c where
+//
+// alpha = r + c accurately.
+//
+// The reduction method is described in a previous write up.
+// The argument reduction scheme identifies 4 cases. For Cases 2
+// and 4, because |alpha| is small, sinl(r+c) and cosl(r+c) can be
+// computed very easily by 2 or 3 terms of the Taylor series
+// expansion as follows:
+//
+// Case 2:
+// -------
+//
+// sinl(r + c) = r + c - r^3/6 accurately
+// cosl(r + c) = 1 - 2^(-67) accurately
+//
+// Case 4:
+// -------
+//
+// sinl(r + c) = r + c - r^3/6 + r^5/120 accurately
+// cosl(r + c) = 1 - r^2/2 + r^4/24 accurately
+//
+// The only cases left are Cases 1 and 3 of the argument reduction
+// procedure. These two cases will be merged since after the
+// argument is reduced in either case, we have the reduced argument
+// represented as r + c and that the magnitude |r + c| is not small
+// enough to allow the usage of a very short approximation.
+//
+// The required calculation is either
+//
+// sinl(r + c) = sinl(r) + correction, or
+// cosl(r + c) = cosl(r) + correction.
+//
+// Specifically,
+//
+// sinl(r + c) = sinl(r) + c sin'(r) + O(c^2)
+// = sinl(r) + c cos (r) + O(c^2)
+// = sinl(r) + c(1 - r^2/2) accurately.
+// Similarly,
+//
+// cosl(r + c) = cosl(r) - c sinl(r) + O(c^2)
+// = cosl(r) - c(r - r^3/6) accurately.
+//
+// We therefore concentrate on accurately calculating sinl(r) and
+// cosl(r) for a working-precision number r, |r| <= pi/4 to within
+// 0.1% or so.
+//
+// The greatest challenge of this task is that the second terms of
+// the Taylor series
+//
+// r - r^3/3! + r^5/5! - ...
+//
+// and
+//
+// 1 - r^2/2! + r^4/4! - ...
+//
+// are not very small when |r| is close to pi/4 and the rounding
+// errors will be a concern if simple polynomial accumulation is
+// used. When |r| < 2^-3, however, the second terms will be small
+// enough (6 bits or so of right shift) that a normal Horner
+// recurrence suffices. Hence there are two cases that we consider
+// in the accurate computation of sinl(r) and cosl(r), |r| <= pi/4.
+//
+// Case small_r: |r| < 2^(-3)
+// --------------------------
+//
+// Since Arg = M pi/2 + r + c accurately, and M mod 4 is [i_0 i_1],
+// we have
+//
+// sinl(Arg) = (-1)^i_0 * sinl(r + c) if i_1 = 0
+// = (-1)^i_0 * cosl(r + c) if i_1 = 1
+//
+// can be accurately approximated by
+//
+// sinl(Arg) = (-1)^i_0 * [sinl(r) + c] if i_1 = 0
+// = (-1)^i_0 * [cosl(r) - c*r] if i_1 = 1
+//
+// because |r| is small and thus the second terms in the correction
+// are unnecessary.
+//
+// Finally, sinl(r) and cosl(r) are approximated by polynomials of
+// moderate lengths.
+//
+// sinl(r) = r + S_1 r^3 + S_2 r^5 + ... + S_5 r^11
+// cosl(r) = 1 + C_1 r^2 + C_2 r^4 + ... + C_5 r^10
+//
+// We can make use of predicates to selectively calculate
+// sinl(r) or cosl(r) based on i_1.
+//
+// Case normal_r: 2^(-3) <= |r| <= pi/4
+// ------------------------------------
+//
+// This case is more likely than the previous one if one considers
+// r to be uniformly distributed in [-pi/4 pi/4]. Again,
+//
+// sinl(Arg) = (-1)^i_0 * sinl(r + c) if i_1 = 0
+// = (-1)^i_0 * cosl(r + c) if i_1 = 1.
+//
+// Because |r| is now larger, we need one extra term in the
+// correction. sinl(Arg) can be accurately approximated by
+//
+// sinl(Arg) = (-1)^i_0 * [sinl(r) + c(1-r^2/2)] if i_1 = 0
+// = (-1)^i_0 * [cosl(r) - c*r*(1 - r^2/6)] if i_1 = 1.
+//
+// Finally, sinl(r) and cosl(r) are approximated by polynomials of
+// moderate lengths.
+//
+// sinl(r) = r + PP_1_hi r^3 + PP_1_lo r^3 +
+// PP_2 r^5 + ... + PP_8 r^17
+//
+// cosl(r) = 1 + QQ_1 r^2 + QQ_2 r^4 + ... + QQ_8 r^16
+//
+// where PP_1_hi is only about 16 bits long and QQ_1 is -1/2.
+// The crux in accurate computation is to calculate
+//
+// r + PP_1_hi r^3 or 1 + QQ_1 r^2
+//
+// accurately as two pieces: U_hi and U_lo. The way to achieve this
+// is to obtain r_hi as a 10 sig. bit number that approximates r to
+// roughly 8 bits or so of accuracy. (One convenient way is
+//
+// r_hi := frcpa( frcpa( r ) ).)
+//
+// This way,
+//
+// r + PP_1_hi r^3 = r + PP_1_hi r_hi^3 +
+// PP_1_hi (r^3 - r_hi^3)
+// = [r + PP_1_hi r_hi^3] +
+// [PP_1_hi (r - r_hi)
+// (r^2 + r_hi r + r_hi^2) ]
+// = U_hi + U_lo
+//
+// Since r_hi is only 10 bit long and PP_1_hi is only 16 bit long,
+// PP_1_hi * r_hi^3 is only at most 46 bit long and thus computed
+// exactly. Furthermore, r and PP_1_hi r_hi^3 are of opposite sign
+// and that there is no more than 8 bit shift off between r and
+// PP_1_hi * r_hi^3. Hence the sum, U_hi, is representable and thus
+// calculated without any error. Finally, the fact that
+//
+// |U_lo| <= 2^(-8) |U_hi|
+//
+// says that U_hi + U_lo is approximating r + PP_1_hi r^3 to roughly
+// 8 extra bits of accuracy.
+//
+// Similarly,
+//
+// 1 + QQ_1 r^2 = [1 + QQ_1 r_hi^2] +
+// [QQ_1 (r - r_hi)(r + r_hi)]
+// = U_hi + U_lo.
+//
+// Summarizing, we calculate r_hi = frcpa( frcpa( r ) ).
+//
+// If i_1 = 0, then
+//
+// U_hi := r + PP_1_hi * r_hi^3
+// U_lo := PP_1_hi * (r - r_hi) * (r^2 + r*r_hi + r_hi^2)
+// poly := PP_1_lo r^3 + PP_2 r^5 + ... + PP_8 r^17
+// correction := c * ( 1 + C_1 r^2 )
+//
+// Else ...i_1 = 1
+//
+// U_hi := 1 + QQ_1 * r_hi * r_hi
+// U_lo := QQ_1 * (r - r_hi) * (r + r_hi)
+// poly := QQ_2 * r^4 + QQ_3 * r^6 + ... + QQ_8 r^16
+// correction := -c * r * (1 + S_1 * r^2)
+//
+// End
+//
+// Finally,
+//
+// V := poly + ( U_lo + correction )
+//
+// / U_hi + V if i_0 = 0
+// result := |
+// \ (-U_hi) - V if i_0 = 1
+//
+// It is important that in the last step, negation of U_hi is
+// performed prior to the subtraction which is to be performed in
+// the user-set rounding mode.
+//
+//
+// Algorithmic Description
+// =======================
+//
+// The argument reduction algorithm shares the same code between FSIN and FCOS.
+// The argument reduction description given
+// previously is repeated below.
+//
+//
+// Step 0. Initialization.
+//
+// Step 1. Check for exceptional and special cases.
+//
+// * If Arg is +-0, +-inf, NaN, NaT, go to Step 10 for special
+// handling.
+// * If |Arg| < 2^24, go to Step 2 for reduction of moderate
+// arguments. This is the most likely case.
+// * If |Arg| < 2^63, go to Step 8 for pre-reduction of large
+// arguments.
+// * If |Arg| >= 2^63, go to Step 10 for special handling.
+//
+// Step 2. Reduction of moderate arguments.
+//
+// If |Arg| < pi/4 ...quick branch
+// N_fix := N_inc (integer)
+// r := Arg
+// c := 0.0
+// Branch to Step 4, Case_1_complete
+// Else ...cf. argument reduction
+// N := Arg * two_by_PI (fp)
+// N_fix := fcvt.fx( N ) (int)
+// N := fcvt.xf( N_fix )
+// N_fix := N_fix + N_inc
+// s := Arg - N * P_1 (first piece of pi/2)
+// w := -N * P_2 (second piece of pi/2)
+//
+// If |s| >= 2^(-33)
+// go to Step 3, Case_1_reduce
+// Else
+// go to Step 7, Case_2_reduce
+// Endif
+// Endif
+//
+// Step 3. Case_1_reduce.
+//
+// r := s + w
+// c := (s - r) + w ...observe order
+//
+// Step 4. Case_1_complete
+//
+// ...At this point, the reduced argument alpha is
+// ...accurately represented as r + c.
+// If |r| < 2^(-3), go to Step 6, small_r.
+//
+// Step 5. Normal_r.
+//
+// Let [i_0 i_1] be the 2 lsb of N_fix.
+// FR_rsq := r * r
+// r_hi := frcpa( frcpa( r ) )
+// r_lo := r - r_hi
+//
+// If i_1 = 0, then
+// poly := r*FR_rsq*(PP_1_lo + FR_rsq*(PP_2 + ... FR_rsq*PP_8))
+// U_hi := r + PP_1_hi*r_hi*r_hi*r_hi ...any order
+// U_lo := PP_1_hi*r_lo*(r*r + r*r_hi + r_hi*r_hi)
+// correction := c + c*C_1*FR_rsq ...any order
+// Else
+// poly := FR_rsq*FR_rsq*(QQ_2 + FR_rsq*(QQ_3 + ... + FR_rsq*QQ_8))
+// U_hi := 1 + QQ_1 * r_hi * r_hi ...any order
+// U_lo := QQ_1 * r_lo * (r + r_hi)
+// correction := -c*(r + S_1*FR_rsq*r) ...any order
+// Endif
+//
+// V := poly + (U_lo + correction) ...observe order
+//
+// result := (i_0 == 0? 1.0 : -1.0)
+//
+// Last instruction in user-set rounding mode
+//
+// result := (i_0 == 0? result*U_hi + V :
+// result*U_hi - V)
+//
+// Return
+//
+// Step 6. Small_r.
+//
+// ...Use flush to zero mode without causing exception
+// Let [i_0 i_1] be the two lsb of N_fix.
+//
+// FR_rsq := r * r
+//
+// If i_1 = 0 then
+// z := FR_rsq*FR_rsq; z := FR_rsq*z *r
+// poly_lo := S_3 + FR_rsq*(S_4 + FR_rsq*S_5)
+// poly_hi := r*FR_rsq*(S_1 + FR_rsq*S_2)
+// correction := c
+// result := r
+// Else
+// z := FR_rsq*FR_rsq; z := FR_rsq*z
+// poly_lo := C_3 + FR_rsq*(C_4 + FR_rsq*C_5)
+// poly_hi := FR_rsq*(C_1 + FR_rsq*C_2)
+// correction := -c*r
+// result := 1
+// Endif
+//
+// poly := poly_hi + (z * poly_lo + correction)
+//
+// If i_0 = 1, result := -result
+//
+// Last operation. Perform in user-set rounding mode
+//
+// result := (i_0 == 0? result + poly :
+// result - poly )
+// Return
+//
+// Step 7. Case_2_reduce.
+//
+// ...Refer to the write up for argument reduction for
+// ...rationale. The reduction algorithm below is taken from
+// ...argument reduction description and integrated here.
+//
+// w := N*P_3
+// U_1 := N*P_2 + w ...FMA
+// U_2 := (N*P_2 - U_1) + w ...2 FMA
+// ...U_1 + U_2 is N*(P_2+P_3) accurately
+//
+// r := s - U_1
+// c := ( (s - r) - U_1 ) - U_2
+//
+// ...The mathematical sum r + c approximates the reduced
+// ...argument accurately. Note that although compared to
+// ...Case 1, this case requires much more work to reduce
+// ...the argument, the subsequent calculation needed for
+// ...any of the trigonometric function is very little because
+// ...|alpha| < 1.01*2^(-33) and thus two terms of the
+// ...Taylor series expansion suffices.
+//
+// If i_1 = 0 then
+// poly := c + S_1 * r * r * r ...any order
+// result := r
+// Else
+// poly := -2^(-67)
+// result := 1.0
+// Endif
+//
+// If i_0 = 1, result := -result
+//
+// Last operation. Perform in user-set rounding mode
+//
+// result := (i_0 == 0? result + poly :
+// result - poly )
+//
+// Return
+//
+//
+// Step 8. Pre-reduction of large arguments.
+//
+// ...Again, the following reduction procedure was described
+// ...in the separate write up for argument reduction, which
+// ...is tightly integrated here.
+
+// N_0 := Arg * Inv_P_0
+// N_0_fix := fcvt.fx( N_0 )
+// N_0 := fcvt.xf( N_0_fix)
+
+// Arg' := Arg - N_0 * P_0
+// w := N_0 * d_1
+// N := Arg' * two_by_PI
+// N_fix := fcvt.fx( N )
+// N := fcvt.xf( N_fix )
+// N_fix := N_fix + N_inc
+//
+// s := Arg' - N * P_1
+// w := w - N * P_2
+//
+// If |s| >= 2^(-14)
+// go to Step 3
+// Else
+// go to Step 9
+// Endif
+//
+// Step 9. Case_4_reduce.
+//
+// ...first obtain N_0*d_1 and -N*P_2 accurately
+// U_hi := N_0 * d_1 V_hi := -N*P_2
+// U_lo := N_0 * d_1 - U_hi V_lo := -N*P_2 - V_hi ...FMAs
+//
+// ...compute the contribution from N_0*d_2 and -N*P_3
+// w := -N*P_3
+// w := w + N_0*d_2
+// t := U_lo + V_lo + w ...any order
+//
+// ...at this point, the mathematical value
+// ...s + U_hi + V_hi + t approximates the true reduced argument
+// ...accurately. Just need to compute this accurately.
+//
+// ...Calculate U_hi + V_hi accurately:
+// A := U_hi + V_hi
+// if |U_hi| >= |V_hi| then
+// a := (U_hi - A) + V_hi
+// else
+// a := (V_hi - A) + U_hi
+// endif
+// ...order in computing "a" must be observed. This branch is
+// ...best implemented by predicates.
+// ...A + a is U_hi + V_hi accurately. Moreover, "a" is
+// ...much smaller than A: |a| <= (1/2)ulp(A).
+//
+// ...Just need to calculate s + A + a + t
+// C_hi := s + A t := t + a
+// C_lo := (s - C_hi) + A
+// C_lo := C_lo + t
+//
+// ...Final steps for reduction
+// r := C_hi + C_lo
+// c := (C_hi - r) + C_lo
+//
+// ...At this point, we have r and c
+// ...And all we need is a couple of terms of the corresponding
+// ...Taylor series.
+//
+// If i_1 = 0
+// poly := c + r*FR_rsq*(S_1 + FR_rsq*S_2)
+// result := r
+// Else
+// poly := FR_rsq*(C_1 + FR_rsq*C_2)
+// result := 1
+// Endif
+//
+// If i_0 = 1, result := -result
+//
+// Last operation. Perform in user-set rounding mode
+//
+// result := (i_0 == 0? result + poly :
+// result - poly )
+// Return
+//
+// Large Arguments: For arguments above 2**63, a Payne-Hanek
+// style argument reduction is used and pi_by_2 reduce is called.
+//
+
+
+RODATA
+.align 64
+
+LOCAL_OBJECT_START(FSINCOSL_CONSTANTS)
+
+sincosl_table_p:
+//data4 0x4E44152A, 0xA2F9836E, 0x00003FFE,0x00000000 // Inv_pi_by_2
+//data4 0xCE81B9F1, 0xC84D32B0, 0x00004016,0x00000000 // P_0
+//data4 0x2168C235, 0xC90FDAA2, 0x00003FFF,0x00000000 // P_1
+//data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD,0x00000000 // P_2
+//data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C,0x00000000 // P_3
+//data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF,0x00000000 // d_1
+//data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C,0x00000000 // d_2
+data8 0xA2F9836E4E44152A, 0x00003FFE // Inv_pi_by_2
+data8 0xC84D32B0CE81B9F1, 0x00004016 // P_0
+data8 0xC90FDAA22168C235, 0x00003FFF // P_1
+data8 0xECE675D1FC8F8CBB, 0x0000BFBD // P_2
+data8 0xB7ED8FBBACC19C60, 0x0000BF7C // P_3
+data8 0x8D848E89DBD171A1, 0x0000BFBF // d_1
+data8 0xD5394C3618A66F8E, 0x0000BF7C // d_2
+LOCAL_OBJECT_END(FSINCOSL_CONSTANTS)
+
+LOCAL_OBJECT_START(sincosl_table_d)
+//data4 0x2168C234, 0xC90FDAA2, 0x00003FFE,0x00000000 // pi_by_4
+//data4 0x6EC6B45A, 0xA397E504, 0x00003FE7,0x00000000 // Inv_P_0
+data8 0xC90FDAA22168C234, 0x00003FFE // pi_by_4
+data8 0xA397E5046EC6B45A, 0x00003FE7 // Inv_P_0
+data4 0x3E000000, 0xBE000000 // 2^-3 and -2^-3
+data4 0x2F000000, 0xAF000000 // 2^-33 and -2^-33
+data4 0x9E000000, 0x00000000 // -2^-67
+data4 0x00000000, 0x00000000 // pad
+LOCAL_OBJECT_END(sincosl_table_d)
+
+LOCAL_OBJECT_START(sincosl_table_pp)
+//data4 0xA21C0BC9, 0xCC8ABEBC, 0x00003FCE,0x00000000 // PP_8
+//data4 0x720221DA, 0xD7468A05, 0x0000BFD6,0x00000000 // PP_7
+//data4 0x640AD517, 0xB092382F, 0x00003FDE,0x00000000 // PP_6
+//data4 0xD1EB75A4, 0xD7322B47, 0x0000BFE5,0x00000000 // PP_5
+//data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1
+//data4 0x00000000, 0xAAAA0000, 0x0000BFFC,0x00000000 // PP_1_hi
+//data4 0xBAF69EEA, 0xB8EF1D2A, 0x00003FEC,0x00000000 // PP_4
+//data4 0x0D03BB69, 0xD00D00D0, 0x0000BFF2,0x00000000 // PP_3
+//data4 0x88888962, 0x88888888, 0x00003FF8,0x00000000 // PP_2
+//data4 0xAAAB0000, 0xAAAAAAAA, 0x0000BFEC,0x00000000 // PP_1_lo
+data8 0xCC8ABEBCA21C0BC9, 0x00003FCE // PP_8
+data8 0xD7468A05720221DA, 0x0000BFD6 // PP_7
+data8 0xB092382F640AD517, 0x00003FDE // PP_6
+data8 0xD7322B47D1EB75A4, 0x0000BFE5 // PP_5
+data8 0xFFFFFFFFFFFFFFFE, 0x0000BFFD // C_1
+data8 0xAAAA000000000000, 0x0000BFFC // PP_1_hi
+data8 0xB8EF1D2ABAF69EEA, 0x00003FEC // PP_4
+data8 0xD00D00D00D03BB69, 0x0000BFF2 // PP_3
+data8 0x8888888888888962, 0x00003FF8 // PP_2
+data8 0xAAAAAAAAAAAB0000, 0x0000BFEC // PP_1_lo
+LOCAL_OBJECT_END(sincosl_table_pp)
+
+LOCAL_OBJECT_START(sincosl_table_qq)
+//data4 0xC2B0FE52, 0xD56232EF, 0x00003FD2 // QQ_8
+//data4 0x2B48DCA6, 0xC9C99ABA, 0x0000BFDA // QQ_7
+//data4 0x9C716658, 0x8F76C650, 0x00003FE2 // QQ_6
+//data4 0xFDA8D0FC, 0x93F27DBA, 0x0000BFE9 // QQ_5
+//data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC // S_1
+//data4 0x00000000, 0x80000000, 0x0000BFFE,0x00000000 // QQ_1
+//data4 0x0C6E5041, 0xD00D00D0, 0x00003FEF,0x00000000 // QQ_4
+//data4 0x0B607F60, 0xB60B60B6, 0x0000BFF5,0x00000000 // QQ_3
+//data4 0xAAAAAA9B, 0xAAAAAAAA, 0x00003FFA,0x00000000 // QQ_2
+data8 0xD56232EFC2B0FE52, 0x00003FD2 // QQ_8
+data8 0xC9C99ABA2B48DCA6, 0x0000BFDA // QQ_7
+data8 0x8F76C6509C716658, 0x00003FE2 // QQ_6
+data8 0x93F27DBAFDA8D0FC, 0x0000BFE9 // QQ_5
+data8 0xAAAAAAAAAAAAAAAA, 0x0000BFFC // S_1
+data8 0x8000000000000000, 0x0000BFFE // QQ_1
+data8 0xD00D00D00C6E5041, 0x00003FEF // QQ_4
+data8 0xB60B60B60B607F60, 0x0000BFF5 // QQ_3
+data8 0xAAAAAAAAAAAAAA9B, 0x00003FFA // QQ_2
+LOCAL_OBJECT_END(sincosl_table_qq)
+
+LOCAL_OBJECT_START(sincosl_table_c)
+//data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1
+//data4 0xAAAA719F, 0xAAAAAAAA, 0x00003FFA,0x00000000 // C_2
+//data4 0x0356F994, 0xB60B60B6, 0x0000BFF5,0x00000000 // C_3
+//data4 0xB2385EA9, 0xD00CFFD5, 0x00003FEF,0x00000000 // C_4
+//data4 0x292A14CD, 0x93E4BD18, 0x0000BFE9,0x00000000 // C_5
+data8 0xFFFFFFFFFFFFFFFE, 0x0000BFFD // C_1
+data8 0xAAAAAAAAAAAA719F, 0x00003FFA // C_2
+data8 0xB60B60B60356F994, 0x0000BFF5 // C_3
+data8 0xD00CFFD5B2385EA9, 0x00003FEF // C_4
+data8 0x93E4BD18292A14CD, 0x0000BFE9 // C_5
+LOCAL_OBJECT_END(sincosl_table_c)
+
+LOCAL_OBJECT_START(sincosl_table_s)
+//data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC,0x00000000 // S_1
+//data4 0x888868DB, 0x88888888, 0x00003FF8,0x00000000 // S_2
+//data4 0x055EFD4B, 0xD00D00D0, 0x0000BFF2,0x00000000 // S_3
+//data4 0x839730B9, 0xB8EF1C5D, 0x00003FEC,0x00000000 // S_4
+//data4 0xE5B3F492, 0xD71EA3A4, 0x0000BFE5,0x00000000 // S_5
+data8 0xAAAAAAAAAAAAAAAA, 0x0000BFFC // S_1
+data8 0x88888888888868DB, 0x00003FF8 // S_2
+data8 0xD00D00D0055EFD4B, 0x0000BFF2 // S_3
+data8 0xB8EF1C5D839730B9, 0x00003FEC // S_4
+data8 0xD71EA3A4E5B3F492, 0x0000BFE5 // S_5
+data4 0x38800000, 0xB8800000 // two**-14 and -two**-14
+LOCAL_OBJECT_END(sincosl_table_s)
+
+FR_Input_X = f8 // f8 carries four names: FR_Input_X, FR_Result, FR_ResultC and FR_r
+FR_Result = f8
+FR_ResultS = f9 // f9 carries FR_ResultS, FR_c (and FR_cS) and FR_norm_x
+FR_ResultC = f8
+FR_r = f8 // writing r overwrites the input register (same f8)
+FR_c = f9 // writing c overwrites norm_x (same f9)
+
+FR_norm_x = f9 // normalized x (fnorm.s1 of FR_Input_X)
+FR_inv_pi_2to63 = f10 // set with setf.sig from GR_sig_inv_pi
+FR_rshf_2to64 = f11 // set with setf.d from GR_rshf_2to64
+FR_2tom64 = f12 // set with setf.exp from GR_exp_2tom64
+FR_rshf = f13 // set with setf.d from GR_rshf
+FR_N_float_signif = f14 // x * inv_pi_2to63 + rshf_2to64
+FR_abs_x = f15 // |x|, used for the pi/4 range test
+
+FR_r6 = f32 // rsq*rsq, later multiplied by rsq again (small-r path)
+FR_r7 = f33 // r6*r, later multiplied by rsq again (small-r path)
+FR_Pi_by_4 = f34
+FR_Two_to_M14 = f35
+FR_Neg_Two_to_M14 = f36
+FR_Two_to_M33 = f37
+FR_Neg_Two_to_M33 = f38
+FR_Neg_Two_to_M67 = f39
+FR_Inv_pi_by_2 = f40
+FR_N_float = f41
+FR_N_fix = f42
+FR_P_1 = f43
+FR_P_2 = f44
+FR_P_3 = f45
+FR_s = f46
+FR_w = f47
+FR_Z = f50
+FR_A = f51
+FR_a = f52
+FR_t = f53
+FR_U_1 = f54
+FR_U_2 = f55
+FR_C_1 = f56
+FR_C_2 = f57
+FR_C_3 = f58
+FR_C_4 = f59
+FR_C_5 = f60
+FR_S_1 = f61
+FR_S_2 = f62
+FR_S_3 = f63
+FR_S_4 = f64
+FR_S_5 = f65
+FR_r_hi = f68
+FR_r_lo = f69
+FR_rsq = f70
+FR_r_cubed = f71
+FR_C_hi = f72
+FR_N_0 = f73
+FR_d_1 = f74
+FR_V_hi = f75 // f75 is also FR_VS
+FR_V_lo = f76
+FR_U_hi = f77 // f77 is also FR_U_hiS
+FR_U_lo = f78 // f78 is also FR_U_loS
+FR_U_hiabs = f79
+FR_V_hiabs = f80
+FR_PP_8 = f81
+FR_QQ_8 = f101
+FR_PP_7 = f82
+FR_QQ_7 = f102
+FR_PP_6 = f83
+FR_QQ_6 = f103
+FR_PP_5 = f84
+FR_QQ_5 = f104
+FR_PP_4 = f85
+FR_QQ_4 = f105
+FR_PP_3 = f86
+FR_QQ_3 = f106
+FR_PP_2 = f87
+FR_QQ_2 = f107
+FR_QQ_1 = f108
+FR_r_hi_sq = f88
+FR_N_0_fix = f89
+FR_Inv_P_0 = f90
+FR_d_2 = f93
+FR_P_0 = f95
+FR_C_lo = f96
+FR_PP_1 = f97
+FR_PP_1_lo = f98
+FR_ArgPrime = f99
+FR_inexact = f100 // destination of the dummy fma.s0 ops that set inexact
+
+FR_Neg_Two_to_M3 = f109
+FR_Two_to_M3 = f110
+
+FR_poly_hiS = f66 // names ending in S / C are the sine-side / cosine-side copies of a quantity
+FR_poly_hiC = f112
+
+FR_poly_loS = f67
+FR_poly_loC = f113
+
+FR_polyS = f92
+FR_polyC = f114
+
+FR_cS = FR_c // alias of FR_c (f9)
+FR_cC = f115
+
+FR_corrS = f91
+FR_corrC = f116
+
+FR_U_hiC = f117
+FR_U_loC = f118
+
+FR_VS = f75 // same register as FR_V_hi
+FR_VC = f119
+
+FR_FirstS = f120
+FR_FirstC = f121
+
+FR_U_hiS = FR_U_hi // alias of FR_U_hi (f77)
+FR_U_loS = FR_U_lo // alias of FR_U_lo (f78)
+
+FR_Tmp = f94
+
+
+
+
+sincos_pResSin = r34 // address used by stfe to store FR_ResultS at the sincosl exits
+sincos_pResCos = r35 // address used by stfe to store FR_ResultC at the sincosl exits
+
+GR_sig_inv_pi = r14
+GR_rshf_2to64 = r15
+GR_exp_2tom64 = r16
+GR_rshf = r17
+GR_ad_p = r18
+GR_ad_d = r19
+GR_ad_pp = r20
+GR_ad_qq = r21
+GR_ad_c = r22
+GR_ad_s = r23
+GR_ad_ce = r24 // GR_ad_c + 0x40
+GR_ad_se = r25 // GR_ad_s + 0x40
+GR_ad_m14 = r26 // GR_ad_s + 0x50
+GR_ad_s1 = r27 // copy of GR_ad_s (pointer to S_1)
+GR_exp_m2_to_m3= r36
+GR_N_Inc = r37
+GR_Cis = r38 // 0 from the sincosl entry, 1 from __libm_sincosl; 1 => return without storing results
+GR_signexp_x = r40 // r40 carries three names: GR_signexp_x, GR_exp_x and GR_SAVE_GP
+GR_exp_x = r40 // produced in place by masking GR_signexp_x with GR_exp_mask
+GR_exp_mask = r41 // r41 is also GR_SAVE_PFS
+GR_exp_2_to_63 = r42
+GR_exp_2_to_m3 = r43
+GR_exp_2_to_24 = r44
+
+GR_N_SignS = r45 // N_Inc - (N_Inc & 1); bit 1 is tested to pick the sign
+GR_N_SignC = r46 // N_Inc + (N_Inc & 1); bit 1 is tested to pick the sign
+GR_N_SinCos = r47 // N_Inc & 1
+
+
+// For unwind support
+GR_SAVE_B0 = r39
+GR_SAVE_GP = r40 // NOTE(review): same register as GR_signexp_x/GR_exp_x; use sites are outside this view
+GR_SAVE_PFS = r41 // NOTE(review): same register as GR_exp_mask; use sites are outside this view
+
+
+.section .text
+
+GLOBAL_IEEE754_ENTRY(sincosl) // entry whose exits store both results through sincos_pResSin / sincos_pResCos
+{ .mlx ///////////////////////////// 1 /////////////////
+ alloc r32 = ar.pfs,3,13,2,0 // 3 inputs, 13 locals, 2 outputs, 0 rotating
+ movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi
+}
+{ .mlx
+ mov GR_N_Inc = 0x0
+ movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64)
+};;
+
+{ .mfi ///////////////////////////// 2 /////////////////
+ addl GR_ad_p = @ltoff(FSINCOSL_CONSTANTS#), gp
+ fclass.m p6, p0 = FR_Input_X, 0x1E3 // Test x natval, nan, inf
+ mov GR_exp_2_to_m3 = 0xffff - 3 // Exponent of 2^-3
+}
+{ .mfb
+ mov GR_Cis = 0x0 // 0: the common exits fall through to the stfe stores instead of returning early
+ fnorm.s1 FR_norm_x = FR_Input_X // Normalize x
+ br.cond.sptk _COMMON_SINCOSL // Continue in the code shared with __libm_sincosl
+};;
+GLOBAL_IEEE754_END(sincosl)
+
+LOCAL_LIBM_ENTRY(cisl) // no instructions between ENTRY and END
+LOCAL_LIBM_END(cisl) // NOTE(review): presumably only defines a local symbol just ahead of __libm_sincosl - confirm against the macro definitions
+GLOBAL_LIBM_ENTRY(__libm_sincosl)
+{ .mlx ///////////////////////////// 1 /////////////////
+ alloc r32 = ar.pfs,3,14,2,0
+ movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi
+}
+{ .mlx
+ mov GR_N_Inc = 0x0
+ movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64)
+};;
+
+{ .mfi ///////////////////////////// 2 /////////////////
+ addl GR_ad_p = @ltoff(FSINCOSL_CONSTANTS#), gp
+ fclass.m p6, p0 = FR_Input_X, 0x1E3 // Test x natval, nan, inf
+ mov GR_exp_2_to_m3 = 0xffff - 3 // Exponent of 2^-3
+}
+{ .mfb
+ mov GR_Cis = 0x1
+ fnorm.s1 FR_norm_x = FR_Input_X // Normalize x
+ nop.b 0
+};;
+
+_COMMON_SINCOSL:
+{ .mfi ///////////////////////////// 3 /////////////////
+ setf.sig FR_inv_pi_2to63 = GR_sig_inv_pi // Form 1/pi * 2^63
+ nop.f 0
+ mov GR_exp_2tom64 = 0xffff - 64 // Scaling constant to compute N
+}
+{ .mlx
+ setf.d FR_rshf_2to64 = GR_rshf_2to64 // Form const 1.1000 * 2^(63+64)
+ movl GR_rshf = 0x43e8000000000000 // Form const 1.1000 * 2^63
+};;
+
+{ .mfi ///////////////////////////// 4 /////////////////
+ ld8 GR_ad_p = [GR_ad_p] // Point to Inv_pi_by_2
+ fclass.m p7, p0 = FR_Input_X, 0x0b // Test x denormal
+ nop.i 0
+};;
+
+{ .mfi ///////////////////////////// 5 /////////////////
+ getf.exp GR_signexp_x = FR_Input_X // Get sign and exponent of x
+ fclass.m p10, p0 = FR_Input_X, 0x007 // Test x zero
+ nop.i 0
+}
+{ .mib
+ mov GR_exp_mask = 0x1ffff // Exponent mask
+ nop.i 0
+(p6) br.cond.spnt SINCOSL_SPECIAL // Branch if x natval, nan, inf
+};;
+
+{ .mfi ///////////////////////////// 6 /////////////////
+ setf.exp FR_2tom64 = GR_exp_2tom64 // Form 2^-64 for scaling N_float
+ nop.f 0
+ add GR_ad_d = 0x70, GR_ad_p // Point to constant table d
+}
+{ .mib
+ setf.d FR_rshf = GR_rshf // Form right shift const 1.1000 * 2^63
+ mov GR_exp_m2_to_m3 = 0x2fffc // Form -(2^-3)
+(p7) br.cond.spnt SINCOSL_DENORMAL // Branch if x denormal
+};;
+
+SINCOSL_COMMON2:
+{ .mfi ///////////////////////////// 7 /////////////////
+ and GR_exp_x = GR_exp_mask, GR_signexp_x // Get exponent of x
+ fclass.nm p8, p0 = FR_Input_X, 0x1FF // Test x unsupported type
+ mov GR_exp_2_to_63 = 0xffff + 63 // Exponent of 2^63
+}
+{ .mib
+ add GR_ad_pp = 0x40, GR_ad_d // Point to constant table pp
+ mov GR_exp_2_to_24 = 0xffff + 24 // Exponent of 2^24
+(p10) br.cond.spnt SINCOSL_ZERO // Branch if x zero
+};;
+
+{ .mfi ///////////////////////////// 8 /////////////////
+ ldfe FR_Inv_pi_by_2 = [GR_ad_p], 16 // Load 2/pi
+ fcmp.eq.s0 p15, p0 = FR_Input_X, f0 // Dummy to set denormal
+ add GR_ad_qq = 0xa0, GR_ad_pp // Point to constant table qq
+}
+{ .mfi
+ ldfe FR_Pi_by_4 = [GR_ad_d], 16 // Load pi/4 for range test
+ nop.f 0
+ cmp.ge p10,p0 = GR_exp_x, GR_exp_2_to_63 // Is |x| >= 2^63
+};;
+
+{ .mfi ///////////////////////////// 9 /////////////////
+ ldfe FR_P_0 = [GR_ad_p], 16 // Load P_0 for pi/4 <= |x| < 2^63
+ fmerge.s FR_abs_x = f1, FR_norm_x // |x|
+ add GR_ad_c = 0x90, GR_ad_qq // Point to constant table c
+}
+{ .mfi
+ ldfe FR_Inv_P_0 = [GR_ad_d], 16 // Load 1/P_0 for pi/4 <= |x| < 2^63
+ nop.f 0
+ cmp.ge p7,p0 = GR_exp_x, GR_exp_2_to_24 // Is |x| >= 2^24
+};;
+
+{ .mfi ///////////////////////////// 10 /////////////////
+ ldfe FR_P_1 = [GR_ad_p], 16 // Load P_1 for pi/4 <= |x| < 2^63
+ nop.f 0
+ add GR_ad_s = 0x50, GR_ad_c // Point to constant table s
+}
+{ .mfi
+ ldfe FR_PP_8 = [GR_ad_pp], 16 // Load PP_8 for 2^-3 < |r| < pi/4
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi ///////////////////////////// 11 /////////////////
+ ldfe FR_P_2 = [GR_ad_p], 16 // Load P_2 for pi/4 <= |x| < 2^63
+ nop.f 0
+ add GR_ad_ce = 0x40, GR_ad_c // Point to end of constant table c
+}
+{ .mfi
+ ldfe FR_QQ_8 = [GR_ad_qq], 16 // Load QQ_8 for 2^-3 < |r| < pi/4
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi ///////////////////////////// 12 /////////////////
+ ldfe FR_QQ_7 = [GR_ad_qq], 16 // Load QQ_7 for 2^-3 < |r| < pi/4
+ fma.s1 FR_N_float_signif = FR_Input_X, FR_inv_pi_2to63, FR_rshf_2to64
+ add GR_ad_se = 0x40, GR_ad_s // Point to end of constant table s
+}
+{ .mib
+ ldfe FR_PP_7 = [GR_ad_pp], 16 // Load PP_7 for 2^-3 < |r| < pi/4
+ mov GR_ad_s1 = GR_ad_s // Save pointer to S_1
+(p10) br.cond.spnt SINCOSL_ARG_TOO_LARGE // Branch if |x| >= 2^63
+ // Use Payne-Hanek Reduction
+};;
+
+{ .mfi ///////////////////////////// 13 /////////////////
+ ldfe FR_P_3 = [GR_ad_p], 16 // Load P_3 for pi/4 <= |x| < 2^63
+ fmerge.se FR_r = FR_norm_x, FR_norm_x // r = x, in case |x| < pi/4
+ add GR_ad_m14 = 0x50, GR_ad_s // Point to constant table m14
+}
+{ .mfb
+ ldfps FR_Two_to_M3, FR_Neg_Two_to_M3 = [GR_ad_d], 8
+ fma.s1 FR_rsq = FR_norm_x, FR_norm_x, f0 // rsq = x*x, in case |x| < pi/4
+(p7) br.cond.spnt SINCOSL_LARGER_ARG // Branch if 2^24 <= |x| < 2^63
+ // Use pre-reduction
+};;
+
+{ .mmf ///////////////////////////// 14 /////////////////
+ ldfe FR_PP_6 = [GR_ad_pp], 16 // Load PP_6 for normal path
+ ldfe FR_QQ_6 = [GR_ad_qq], 16 // Load QQ_6 for normal path
+ fmerge.se FR_c = f0, f0 // c = 0 in case |x| < pi/4
+};;
+
+{ .mmf ///////////////////////////// 15 /////////////////
+ ldfe FR_PP_5 = [GR_ad_pp], 16 // Load PP_5 for normal path
+ ldfe FR_QQ_5 = [GR_ad_qq], 16 // Load QQ_5 for normal path
+ nop.f 0
+};;
+
+// Here if 0 < |x| < 2^24
+{ .mfi ///////////////////////////// 17 /////////////////
+ ldfe FR_S_5 = [GR_ad_se], -16 // Load S_5 if i_1=0
+ fcmp.lt.s1 p6, p7 = FR_abs_x, FR_Pi_by_4 // Test |x| < pi/4
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_C_5 = [GR_ad_ce], -16 // Load C_5 if i_1=1
+ fms.s1 FR_N_float = FR_N_float_signif, FR_2tom64, FR_rshf
+ nop.i 0
+};;
+
+{ .mmi ///////////////////////////// 18 /////////////////
+ ldfe FR_S_4 = [GR_ad_se], -16 // Load S_4 if i_1=0
+ ldfe FR_C_4 = [GR_ad_ce], -16 // Load C_4 if i_1=1
+ nop.i 0
+};;
+
+//
+// N = Arg * 2/pi
+// Check if Arg < pi/4
+//
+//
+// Case 2: Convert integer N_fix back to normalized floating-point value.
+// Case 1: p8 is only affected when p6 is set
+//
+//
+// Grab the integer part of N and call it N_fix
+//
+{ .mfi ///////////////////////////// 19 /////////////////
+(p7) ldfps FR_Two_to_M33, FR_Neg_Two_to_M33 = [GR_ad_d], 8
+(p6) fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 // r^3 if |x| < pi/4
+(p6) mov GR_N_Inc = 0x0 // N_IncS if |x| < pi/4
+};;
+
+// If |x| < pi/4, r = x and c = 0
+// If |x| < pi/4, is |x| < 2**(-3)?
+// r = Arg
+// c = 0
+{ .mmi ///////////////////////////// 20 /////////////////
+(p7) getf.sig GR_N_Inc = FR_N_float_signif
+ nop.m 0
+(p6) cmp.lt.unc p8,p0 = GR_exp_x, GR_exp_2_to_m3 // Is |x| < 2^-3
+};;
+
+//
+// If |x| < pi/4, is -2**(-3) < x < 2**(-3)? If so, set p8.
+// If |x| >= pi/4,
+// Create the right N for |x| < pi/4 and otherwise
+// Case 2: Place integer part of N in GP register
+//
+
+{ .mbb ///////////////////////////// 21 /////////////////
+ nop.m 0
+(p8) br.cond.spnt SINCOSL_SMALL_R_0 // Branch if 0 < |x| < 2^-3
+(p6) br.cond.spnt SINCOSL_NORMAL_R_0 // Branch if 2^-3 <= |x| < pi/4
+};;
+
+// Here if pi/4 <= |x| < 2^24
+{ .mfi
+ ldfs FR_Neg_Two_to_M67 = [GR_ad_d], 8 // Load -2^-67
+ fnma.s1 FR_s = FR_N_float, FR_P_1, FR_Input_X // s = -N * P_1 + Arg
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_w = FR_N_float, FR_P_2, f0 // w = N * P_2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r = FR_s, f1, FR_w // r = s - w, assume |s| >= 2^-33
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fcmp.lt.s1 p7, p6 = FR_s, FR_Two_to_M33
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p7) fcmp.gt.s1 p7, p6 = FR_s, FR_Neg_Two_to_M33 // p6 if |s| >= 2^-33, else p7
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_c = FR_s, f1, FR_r // c = s - r, for |s| >= 2^-33
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rsq = FR_r, FR_r, f0 // rsq = r * r, for |s| >= 2^-33
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p7) fma.s1 FR_w = FR_N_float, FR_P_3, f0
+ nop.i 0
+};;
+
+{ .mmf
+ ldfe FR_C_1 = [GR_ad_pp], 16 // Load C_1 if i_1=0
+ ldfe FR_S_1 = [GR_ad_qq], 16 // Load S_1 if i_1=1
+ frcpa.s1 FR_r_hi, p15 = f1, FR_r // r_hi = frcpa(r)
+};;
+
+{ .mfi
+ nop.m 0
+(p6) fcmp.lt.unc.s1 p8, p13 = FR_r, FR_Two_to_M3 // If big s, test r with 2^-3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p7) fma.s1 FR_U_1 = FR_N_float, FR_P_2, FR_w
+ nop.i 0
+};;
+
+//
+// For big s: r = s - w: No further reduction is necessary
+// For small s: w = N * P_3 (change sign) More reduction
+//
+{ .mfi
+ nop.m 0
+(p8) fcmp.gt.s1 p8, p13 = FR_r, FR_Neg_Two_to_M3 // If big s, p8 if |r| < 2^-3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyS = FR_rsq, FR_PP_8, FR_PP_7 // poly = rsq*PP_8+PP_7
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyC = FR_rsq, FR_QQ_8, FR_QQ_7 // poly = rsq*QQ_8+QQ_7
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p7) fms.s1 FR_r = FR_s, f1, FR_U_1
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p6) fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 // rcubed = r * rsq
+ nop.i 0
+};;
+
+{ .mfi
+//
+// For big s: Is |r| < 2**(-3)?
+// For big s: c = S - r
+// For small s: U_1 = N * P_2 + w
+//
+// If p8 is set, prepare to branch to Small_R.
+// If p13 is set, prepare to branch to Normal_R.
+// For big s, r is complete here.
+//
+//
+// For big s: c = c + w (w has not been negated.)
+// For small s: r = S - U_1
+//
+ nop.m 0
+(p6) fms.s1 FR_c = FR_c, f1, FR_w
+ nop.i 0
+}
+{ .mbb
+ nop.m 0
+(p8) br.cond.spnt SINCOSL_SMALL_R_1 // Branch if |s|>=2^-33, |r| < 2^-3,
+ // and pi/4 <= |x| < 2^24
+(p13) br.cond.sptk SINCOSL_NORMAL_R_1 // Branch if |s|>=2^-33, |r| >= 2^-3,
+ // and pi/4 <= |x| < 2^24
+};;
+
+SINCOSL_S_TINY:
+//
+// Here if |s| < 2^-33, and pi/4 <= |x| < 2^24
+//
+{ .mfi
+ and GR_N_SinCos = 0x1, GR_N_Inc
+ fms.s1 FR_U_2 = FR_N_float, FR_P_2, FR_U_1
+ tbit.z p8,p12 = GR_N_Inc, 0
+};;
+
+
+//
+// For small s: U_2 = N * P_2 - U_1
+// S_1 stored constant - grab the one stored with the
+// coefficients.
+//
+{ .mfi
+ ldfe FR_S_1 = [GR_ad_s1], 16
+ fma.s1 FR_polyC = f0, f1, FR_Neg_Two_to_M67
+ sub GR_N_SignS = GR_N_Inc, GR_N_SinCos
+}
+{ .mfi
+ add GR_N_SignC = GR_N_Inc, GR_N_SinCos
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_s = FR_s, f1, FR_r
+(p8) tbit.z.unc p10,p11 = GR_N_SignC, 1
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rsq = FR_r, FR_r, f0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_U_2 = FR_U_2, f1, FR_w
+(p8) tbit.z.unc p8,p9 = GR_N_SignS, 1
+};;
+
+{ .mfi
+ nop.m 0
+ fmerge.se FR_FirstS = FR_r, FR_r
+(p12) tbit.z.unc p14,p15 = GR_N_SignC, 1
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_FirstC = f0, f1, f1
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_c = FR_s, f1, FR_U_1
+(p12) tbit.z.unc p12,p13 = GR_N_SignS, 1
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r = FR_S_1, FR_r, f0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s0 FR_S_1 = FR_S_1, FR_S_1, f0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_c = FR_c, f1, FR_U_2
+ nop.i 0
+};;
+
+.pred.rel "mutex",p9,p15
+{ .mfi
+ nop.m 0
+(p9) fms.s0 FR_FirstS = f1, f0, FR_FirstS
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fms.s0 FR_FirstS = f1, f0, FR_FirstS
+ nop.i 0
+};;
+
+.pred.rel "mutex",p11,p13
+{ .mfi
+ nop.m 0
+(p11) fms.s0 FR_FirstC = f1, f0, FR_FirstC
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p13) fms.s0 FR_FirstC = f1, f0, FR_FirstC
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyS = FR_r, FR_rsq, FR_c
+ nop.i 0
+};;
+
+
+.pred.rel "mutex",p8,p9
+{ .mfi
+ nop.m 0
+(p8) fma.s0 FR_ResultS = FR_FirstS, f1, FR_polyS
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p9) fms.s0 FR_ResultS = FR_FirstS, f1, FR_polyS
+ nop.i 0
+};;
+
+.pred.rel "mutex",p10,p11
+{ .mfi
+ nop.m 0
+(p10) fma.s0 FR_ResultC = FR_FirstC, f1, FR_polyC
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p11) fms.s0 FR_ResultC = FR_FirstC, f1, FR_polyC
+ nop.i 0
+};;
+
+
+
+.pred.rel "mutex",p12,p13
+{ .mfi
+ nop.m 0
+(p12) fma.s0 FR_ResultS = FR_FirstC, f1, FR_polyC
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p13) fms.s0 FR_ResultS = FR_FirstC, f1, FR_polyC
+ nop.i 0
+};;
+
+.pred.rel "mutex",p14,p15
+{ .mfi
+ nop.m 0
+(p14) fma.s0 FR_ResultC = FR_FirstS, f1, FR_polyS
+ nop.i 0
+}
+{ .mfb
+ cmp.eq p10, p0 = 0x1, GR_Cis
+(p15) fms.s0 FR_ResultC = FR_FirstS, f1, FR_polyS
+(p10) br.ret.sptk b0
+};;
+
+{ .mmb // exit for sincosl
+ stfe [sincos_pResSin] = FR_ResultS
+ stfe [sincos_pResCos] = FR_ResultC
+ br.ret.sptk b0
+};;
+
+
+
+
+
+
+SINCOSL_LARGER_ARG:
+//
+// Here if 2^24 <= |x| < 2^63
+//
+{ .mfi
+ ldfe FR_d_1 = [GR_ad_p], 16 // Load d_1 for |x| >= 2^24 path
+ fma.s1 FR_N_0 = FR_Input_X, FR_Inv_P_0, f0 // N_0 = Arg * Inv_P_0
+ nop.i 0
+};;
+
+{ .mmi
+ ldfps FR_Two_to_M14, FR_Neg_Two_to_M14 = [GR_ad_m14]
+ nop.m 0
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_d_2 = [GR_ad_p], 16 // Load d_2 for |x| >= 2^24 path
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fcvt.fx.s1 FR_N_0_fix = FR_N_0 // N_0_fix = integer part of N_0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fcvt.xf FR_N_0 = FR_N_0_fix // Make N_0 the integer part
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_ArgPrime = FR_N_0, FR_P_0, FR_Input_X // Arg'=-N_0*P_0+Arg
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_w = FR_N_0, FR_d_1, f0 // w = N_0 * d_1
+ nop.i 0
+};;
+
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_N_float = FR_ArgPrime, FR_Inv_pi_by_2, f0 // N = A' * 2/pi
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fcvt.fx.s1 FR_N_fix = FR_N_float // N_fix is the integer part
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fcvt.xf FR_N_float = FR_N_fix
+ nop.i 0
+};;
+
+{ .mfi
+ getf.sig GR_N_Inc = FR_N_fix // N is the integer part of
+ // the reduced-reduced argument
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_s = FR_N_float, FR_P_1, FR_ArgPrime // s = -N*P_1 + Arg'
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_w = FR_N_float, FR_P_2, FR_w // w = -N*P_2 + w
+ nop.i 0
+};;
+
+//
+// For |s| > 2**(-14) r = S + w (r complete)
+// Else U_hi = N_0 * d_1
+//
+{ .mfi
+ nop.m 0
+ fcmp.lt.unc.s1 p9, p8 = FR_s, FR_Two_to_M14
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p9) fcmp.gt.s1 p9, p8 = FR_s, FR_Neg_Two_to_M14 // p9 if |s| < 2^-14
+ nop.i 0
+};;
+
+//
+// Either S <= -2**(-14) or S >= 2**(-14)
+// or -2**(-14) < s < 2**(-14)
+//
+{ .mfi
+ nop.m 0
+(p9) fma.s1 FR_V_hi = FR_N_float, FR_P_2, f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p9) fma.s1 FR_U_hi = FR_N_0, FR_d_1, f0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p8) fma.s1 FR_r = FR_s, f1, FR_w
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p9) fma.s1 FR_w = FR_N_float, FR_P_3, f0
+ nop.i 0
+};;
+
+//
+// We need abs of both U_hi and V_hi - don't
+// worry about switched sign of V_hi.
+//
+// Big s: finish up c = (S - r) + w (c complete)
+// Case 4: A = U_hi + V_hi
+// Note: Worry about switched sign of V_hi, so subtract instead of add.
+//
+{ .mfi
+ nop.m 0
+(p9) fms.s1 FR_A = FR_U_hi, f1, FR_V_hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p9) fnma.s1 FR_V_lo = FR_N_float, FR_P_2, FR_V_hi
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p9) fmerge.s FR_V_hiabs = f0, FR_V_hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p9) fms.s1 FR_U_lo = FR_N_0, FR_d_1, FR_U_hi // For small s: U_lo=N_0*d_1-U_hi
+ nop.i 0
+};;
+
+//
+// For big s: Is |r| < 2**(-3)
+// For big s: if p12 set, prepare to branch to Small_R.
+// For big s: If p13 set, prepare to branch to Normal_R.
+//
+{ .mfi
+ nop.m 0
+(p9) fmerge.s FR_U_hiabs = f0, FR_U_hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p8) fms.s1 FR_c = FR_s, f1, FR_r // For big s: c = S - r
+ nop.i 0
+};;
+
+//
+// For small S: V_hi = N * P_2
+// w = N * P_3
+// Note the product does not include the (-) as in the writeup
+// so (-) missing for V_hi and w.
+//
+{ .mfi
+ nop.m 0
+(p8) fcmp.lt.unc.s1 p12, p13 = FR_r, FR_Two_to_M3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p12) fcmp.gt.s1 p12, p13 = FR_r, FR_Neg_Two_to_M3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p8) fma.s1 FR_c = FR_c, f1, FR_w
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p9) fms.s1 FR_w = FR_N_0, FR_d_2, FR_w
+(p12) br.cond.spnt SINCOSL_SMALL_R // Branch if |r| < 2^-3
+ // and 2^24 <= |x| < 2^63
+};;
+
+{ .mib
+ nop.m 0
+ nop.i 0
+(p13) br.cond.sptk SINCOSL_NORMAL_R // Branch if |r| >= 2^-3
+ // and 2^24 <= |x| < 2^63
+};;
+
+SINCOSL_LARGER_S_TINY:
+// Here if |s| < 2^-14, and 2^24 <= |x| < 2^63
+//
+// Big s: Vector off when |r| < 2**(-3). Recall that p8 will be true.
+// The remaining stuff is for Case 4.
+// Small s: V_lo = N * P_2 + U_hi (U_hi is in place of V_hi in writeup)
+// Note: the (-) is still missing for V_lo.
+// Small s: w = w + N_0 * d_2
+// Note: the (-) is now incorporated in w.
+//
+{ .mfi
+ and GR_N_SinCos = 0x1, GR_N_Inc
+ fcmp.ge.unc.s1 p6, p7 = FR_U_hiabs, FR_V_hiabs
+ tbit.z p8,p12 = GR_N_Inc, 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_t = FR_U_lo, f1, FR_V_lo // C_hi = S + A
+ nop.i 0
+};;
+
+{ .mfi
+ sub GR_N_SignS = GR_N_Inc, GR_N_SinCos
+(p6) fms.s1 FR_a = FR_U_hi, f1, FR_A
+ add GR_N_SignC = GR_N_Inc, GR_N_SinCos
+}
+{ .mfi
+ nop.m 0
+(p7) fma.s1 FR_a = FR_V_hi, f1, FR_A
+ nop.i 0
+};;
+
+{ .mmf
+ ldfe FR_C_1 = [GR_ad_c], 16
+ ldfe FR_S_1 = [GR_ad_s], 16
+ fma.s1 FR_C_hi = FR_s, f1, FR_A
+};;
+
+{ .mmi
+ ldfe FR_C_2 = [GR_ad_c], 64
+ ldfe FR_S_2 = [GR_ad_s], 64
+(p8) tbit.z.unc p10,p11 = GR_N_SignC, 1
+};;
+
+//
+// r and c have been computed.
+// Make sure ftz mode is set - should be automatic when using wre
+// |r| < 2**(-3)
+// Get [i_0,i_1] - two lsb of N_fix.
+//
+// For larger u than v: a = U_hi - A
+// Else a = V_hi - A (do an add to account for missing (-) on V_hi
+//
+{ .mfi
+ nop.m 0
+ fma.s1 FR_t = FR_t, f1, FR_w // t = t + w
+(p8) tbit.z.unc p8,p9 = GR_N_SignS, 1
+}
+{ .mfi
+ nop.m 0
+(p6) fms.s1 FR_a = FR_a, f1, FR_V_hi
+ nop.i 0
+};;
+
+//
+// If u > v: a = (U_hi - A) + V_hi
+// Else a = (V_hi - A) + U_hi
+// In each case account for negative missing from V_hi.
+//
+{ .mfi
+ nop.m 0
+ fms.s1 FR_C_lo = FR_s, f1, FR_C_hi
+(p12) tbit.z.unc p14,p15 = GR_N_SignC, 1
+}
+{ .mfi
+ nop.m 0
+(p7) fms.s1 FR_a = FR_U_hi, f1, FR_a
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C_lo = FR_C_lo, f1, FR_A // C_lo = (S - C_hi) + A
+(p12) tbit.z.unc p12,p13 = GR_N_SignS, 1
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_t = FR_t, f1, FR_a // t = t + a
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r = FR_C_hi, f1, FR_C_lo
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C_lo = FR_C_lo, f1, FR_t // C_lo = C_lo + t
+ nop.i 0
+};;
+
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rsq = FR_r, FR_r, f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 FR_c = FR_C_hi, f1, FR_r
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_FirstS = f0, f1, FR_r
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_FirstC = f0, f1, f1
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyS = FR_rsq, FR_S_2, FR_S_1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyC = FR_rsq, FR_C_2, FR_C_1
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_cubed = FR_rsq, FR_r, f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_c = FR_c, f1, FR_C_lo
+ nop.i 0
+};;
+
+.pred.rel "mutex",p9,p15
+{ .mfi
+ nop.m 0
+(p9) fms.s0 FR_FirstS = f1, f0, FR_FirstS
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fms.s0 FR_FirstS = f1, f0, FR_FirstS
+ nop.i 0
+};;
+
+.pred.rel "mutex",p11,p13
+{ .mfi
+ nop.m 0
+(p11) fms.s0 FR_FirstC = f1, f0, FR_FirstC
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p13) fms.s0 FR_FirstC = f1, f0, FR_FirstC
+ nop.i 0
+};;
+
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyS = FR_r_cubed, FR_polyS, FR_c
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyC = FR_rsq, FR_polyC, f0
+ nop.i 0
+};;
+
+
+
+.pred.rel "mutex",p8,p9
+{ .mfi
+ nop.m 0
+(p8) fma.s0 FR_ResultS = FR_FirstS, f1, FR_polyS
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p9) fms.s0 FR_ResultS = FR_FirstS, f1, FR_polyS
+ nop.i 0
+};;
+
+.pred.rel "mutex",p10,p11
+{ .mfi
+ nop.m 0
+(p10) fma.s0 FR_ResultC = FR_FirstC, f1, FR_polyC
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p11) fms.s0 FR_ResultC = FR_FirstC, f1, FR_polyC
+ nop.i 0
+};;
+
+
+
+.pred.rel "mutex",p12,p13
+{ .mfi
+ nop.m 0
+(p12) fma.s0 FR_ResultS = FR_FirstC, f1, FR_polyC
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p13) fms.s0 FR_ResultS = FR_FirstC, f1, FR_polyC
+ nop.i 0
+};;
+
+.pred.rel "mutex",p14,p15
+{ .mfi
+ nop.m 0
+(p14) fma.s0 FR_ResultC = FR_FirstS, f1, FR_polyS
+ nop.i 0
+}
+{ .mfb
+ cmp.eq p10, p0 = 0x1, GR_Cis
+(p15) fms.s0 FR_ResultC = FR_FirstS, f1, FR_polyS
+(p10) br.ret.sptk b0
+};;
+
+
+{ .mmb // exit for sincosl
+ stfe [sincos_pResSin] = FR_ResultS
+ stfe [sincos_pResCos] = FR_ResultC
+ br.ret.sptk b0
+};;
+
+
+
+SINCOSL_SMALL_R:
+//
+// Here if |r| < 2^-3
+//
+// Enter with r, c, and N_Inc computed
+//
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rsq = FR_r, FR_r, f0 // rsq = r * r
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe FR_S_5 = [GR_ad_se], -16 // Load S_5
+ ldfe FR_C_5 = [GR_ad_ce], -16 // Load C_5
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe FR_S_4 = [GR_ad_se], -16 // Load S_4
+ ldfe FR_C_4 = [GR_ad_ce], -16 // Load C_4
+ nop.i 0
+};;
+
+SINCOSL_SMALL_R_0:
+// Entry point for 0 < |x| < 2^-3
+SINCOSL_SMALL_R_1:
+// Entry point for pi/4 <= |x| < 2^24, |s| >= 2^-33 and |r| < 2^-3
+{ .mfi
+ ldfe FR_S_3 = [GR_ad_se], -16 // Load S_3
+ fma.s1 FR_r6 = FR_rsq, FR_rsq, f0 // Z = rsq * rsq
+ tbit.z p7,p11 = GR_N_Inc, 0
+}
+{ .mfi
+ ldfe FR_C_3 = [GR_ad_ce], -16 // Load C_3
+ nop.f 0
+ and GR_N_SinCos = 0x1, GR_N_Inc
+};;
+
+{ .mfi
+ ldfe FR_S_2 = [GR_ad_se], -16 // Load S_2
+ fnma.s1 FR_cC = FR_c, FR_r, f0 // c = -c * r
+ sub GR_N_SignS = GR_N_Inc, GR_N_SinCos
+}
+{ .mfi
+ ldfe FR_C_2 = [GR_ad_ce], -16 // Load C_2
+ nop.f 0
+ add GR_N_SignC = GR_N_Inc, GR_N_SinCos
+};;
+
+{ .mmi
+ ldfe FR_S_1 = [GR_ad_se], -16 // Load S_1
+ ldfe FR_C_1 = [GR_ad_ce], -16 // Load C_1
+(p7) tbit.z.unc p9,p10 = GR_N_SignC, 1
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r7 = FR_r6, FR_r, f0 // Z = Z * r
+(p7) tbit.z.unc p7,p8 = GR_N_SignS, 1
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_loS = FR_rsq, FR_S_5, FR_S_4 // poly_lo=rsq*S_5+S_4
+(p11) tbit.z.unc p13,p14 = GR_N_SignC, 1
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_loC = FR_rsq, FR_C_5, FR_C_4 // poly_lo=rsq*C_5+C_4
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_hiS = FR_rsq, FR_S_2, FR_S_1 // poly_hi=rsq*S_2+S_1
+(p11) tbit.z.unc p11,p12 = GR_N_SignS, 1
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_hiC = FR_rsq, FR_C_2, FR_C_1 // poly_hi=rsq*C_2+C_1
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s0 FR_FirstS = FR_r, f1, f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s0 FR_FirstC = f1, f1, f0
+ nop.i 0
+};;
+
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r6 = FR_r6, FR_rsq, f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r7 = FR_r7, FR_rsq, f0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_loS = FR_rsq, FR_poly_loS, FR_S_3 // p_lo=p_lo*rsq+S_3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_loC = FR_rsq, FR_poly_loC, FR_C_3 // p_lo=p_lo*rsq+C_3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s0 FR_inexact = FR_S_4, FR_S_4, f0 // Dummy op to set inexact
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_hiS = FR_poly_hiS, FR_rsq, f0 // p_hi=p_hi*rsq
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_hiC = FR_poly_hiC, FR_rsq, f0 // p_hi=p_hi*rsq
+ nop.i 0
+};;
+
+.pred.rel "mutex",p8,p14
+{ .mfi
+ nop.m 0
+(p8) fms.s0 FR_FirstS = f1, f0, FR_FirstS
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p14) fms.s0 FR_FirstS = f1, f0, FR_FirstS
+ nop.i 0
+};;
+
+.pred.rel "mutex",p10,p12
+{ .mfi
+ nop.m 0
+(p10) fms.s0 FR_FirstC = f1, f0, FR_FirstC
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p12) fms.s0 FR_FirstC = f1, f0, FR_FirstC
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyS = FR_r7, FR_poly_loS, FR_cS // poly=Z*poly_lo+c
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyC = FR_r6, FR_poly_loC, FR_cC // poly=Z*poly_lo+c
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_hiS = FR_r, FR_poly_hiS, f0 // p_hi=r*p_hi
+ nop.i 0
+};;
+
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyS = FR_polyS, f1, FR_poly_hiS
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyC = FR_polyC, f1, FR_poly_hiC
+ nop.i 0
+};;
+
+.pred.rel "mutex",p7,p8
+{ .mfi
+ nop.m 0
+(p7) fma.s0 FR_ResultS = FR_FirstS, f1, FR_polyS
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p8) fms.s0 FR_ResultS = FR_FirstS, f1, FR_polyS
+ nop.i 0
+};;
+
+.pred.rel "mutex",p9,p10
+{ .mfi
+ nop.m 0
+(p9) fma.s0 FR_ResultC = FR_FirstC, f1, FR_polyC
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p10) fms.s0 FR_ResultC = FR_FirstC, f1, FR_polyC
+ nop.i 0
+};;
+
+.pred.rel "mutex",p11,p12
+{ .mfi
+ nop.m 0
+(p11) fma.s0 FR_ResultS = FR_FirstC, f1, FR_polyC
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p12) fms.s0 FR_ResultS = FR_FirstC, f1, FR_polyC
+ nop.i 0
+};;
+
+.pred.rel "mutex",p13,p14
+{ .mfi
+ nop.m 0
+(p13) fma.s0 FR_ResultC = FR_FirstS, f1, FR_polyS
+ nop.i 0
+}
+{ .mfb
+ cmp.eq p15, p0 = 0x1, GR_Cis
+(p14) fms.s0 FR_ResultC = FR_FirstS, f1, FR_polyS
+(p15) br.ret.sptk b0
+};;
+
+
+{ .mmb // exit for sincosl
+ stfe [sincos_pResSin] = FR_ResultS
+ stfe [sincos_pResCos] = FR_ResultC
+ br.ret.sptk b0
+};;
+
+
+
+
+
+
+SINCOSL_NORMAL_R:
+//
+// Here if 2^-3 <= |r| < pi/4
+// THIS IS THE MAIN PATH
+//
+// Enter with r, c, and N_Inc having been computed
+//
+{ .mfi
+ ldfe FR_PP_6 = [GR_ad_pp], 16 // Load PP_6
+ fma.s1 FR_rsq = FR_r, FR_r, f0 // rsq = r * r
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_QQ_6 = [GR_ad_qq], 16 // Load QQ_6
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe FR_PP_5 = [GR_ad_pp], 16 // Load PP_5
+ ldfe FR_QQ_5 = [GR_ad_qq], 16 // Load QQ_5
+ nop.i 0
+};;
+
+
+
+SINCOSL_NORMAL_R_0:
+// Entry for 2^-3 <= |x| < pi/4
+.pred.rel "mutex",p9,p10
+{ .mmf
+ ldfe FR_C_1 = [GR_ad_pp], 16 // Load C_1
+ ldfe FR_S_1 = [GR_ad_qq], 16 // Load S_1
+ frcpa.s1 FR_r_hi, p6 = f1, FR_r // r_hi = frcpa(r)
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyS = FR_rsq, FR_PP_8, FR_PP_7 // poly = rsq*PP_8+PP_7
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyC = FR_rsq, FR_QQ_8, FR_QQ_7 // poly = rsq*QQ_8+QQ_7
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 // rcubed = r * rsq
+ nop.i 0
+};;
+
+
+SINCOSL_NORMAL_R_1:
+// Entry for pi/4 <= |x| < 2^24, |s| >= 2^-33 and |r| >= 2^-3
+.pred.rel "mutex",p9,p10
+{ .mmf
+ ldfe FR_PP_1 = [GR_ad_pp], 16 // Load PP_1_hi
+ ldfe FR_QQ_1 = [GR_ad_qq], 16 // Load QQ_1
+ frcpa.s1 FR_r_hi, p6 = f1, FR_r_hi // r_hi = frpca(frcpa(r))
+};;
+
+{ .mfi
+ ldfe FR_PP_4 = [GR_ad_pp], 16 // Load PP_4
+ fma.s1 FR_polyS = FR_rsq, FR_polyS, FR_PP_6 // poly = rsq*poly+PP_6
+ and GR_N_SinCos = 0x1, GR_N_Inc
+}
+{ .mfi
+ ldfe FR_QQ_4 = [GR_ad_qq], 16 // Load QQ_4
+ fma.s1 FR_polyC = FR_rsq, FR_polyC, FR_QQ_6 // poly = rsq*poly+QQ_6
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_corrS = FR_C_1, FR_rsq, f0 // corr = C_1 * rsq
+ sub GR_N_SignS = GR_N_Inc, GR_N_SinCos
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_corrC = FR_S_1, FR_r_cubed, FR_r // corr = S_1 * r^3 + r
+ add GR_N_SignC = GR_N_Inc, GR_N_SinCos
+};;
+
+{ .mfi
+ ldfe FR_PP_3 = [GR_ad_pp], 16 // Load PP_3
+ fma.s1 FR_r_hi_sq = FR_r_hi, FR_r_hi, f0 // r_hi_sq = r_hi * r_hi
+ tbit.z p7,p11 = GR_N_Inc, 0
+}
+{ .mfi
+ ldfe FR_QQ_3 = [GR_ad_qq], 16 // Load QQ_3
+ fms.s1 FR_r_lo = FR_r, f1, FR_r_hi // r_lo = r - r_hi
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_PP_2 = [GR_ad_pp], 16 // Load PP_2
+ fma.s1 FR_polyS = FR_rsq, FR_polyS, FR_PP_5 // poly = rsq*poly+PP_5
+(p7) tbit.z.unc p9,p10 = GR_N_SignC, 1
+}
+{ .mfi
+ ldfe FR_QQ_2 = [GR_ad_qq], 16 // Load QQ_2
+ fma.s1 FR_polyC = FR_rsq, FR_polyC, FR_QQ_5 // poly = rsq*poly+QQ_5
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_PP_1_lo = [GR_ad_pp], 16 // Load PP_1_lo
+ fma.s1 FR_corrS = FR_corrS, FR_c, FR_c // corr = corr * c + c
+(p7) tbit.z.unc p7,p8 = GR_N_SignS, 1
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_corrC = FR_corrC, FR_c, f0 // corr = -corr * c
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_U_loS = FR_r, FR_r_hi, FR_r_hi_sq // U_lo = r*r_hi+r_hi_sq
+(p11) tbit.z.unc p13,p14 = GR_N_SignC, 1
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_U_loC = FR_r_hi, f1, FR_r // U_lo = r_hi + r
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_U_hiS = FR_r_hi, FR_r_hi_sq, f0 // U_hi = r_hi*r_hi_sq
+(p11) tbit.z.unc p11,p12 = GR_N_SignS, 1
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_U_hiC = FR_QQ_1, FR_r_hi_sq, f1 // U_hi = QQ_1*r_hi_sq+1
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyS = FR_rsq, FR_polyS, FR_PP_4 // poly = poly*rsq+PP_4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyC = FR_rsq, FR_polyC, FR_QQ_4 // poly = poly*rsq+QQ_4
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_U_loS = FR_r, FR_r, FR_U_loS // U_lo = r * r + U_lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_U_loC = FR_r_lo, FR_U_loC, f0 // U_lo = r_lo * U_lo
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_U_hiS = FR_PP_1, FR_U_hiS, f0 // U_hi = PP_1 * U_hi
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyS = FR_rsq, FR_polyS, FR_PP_3 // poly = poly*rsq+PP_3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyC = FR_rsq, FR_polyC, FR_QQ_3 // poly = poly*rsq+QQ_3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_U_loS = FR_r_lo, FR_U_loS, f0 // U_lo = r_lo * U_lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_U_loC = FR_QQ_1,FR_U_loC, f0 // U_lo = QQ_1 * U_lo
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_U_hiS = FR_r, f1, FR_U_hiS // U_hi = r + U_hi
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyS = FR_rsq, FR_polyS, FR_PP_2 // poly = poly*rsq+PP_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyC = FR_rsq, FR_polyC, FR_QQ_2 // poly = poly*rsq+QQ_2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_U_loS = FR_PP_1, FR_U_loS, f0 // U_lo = PP_1 * U_lo
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyS = FR_rsq, FR_polyS, FR_PP_1_lo // poly =poly*rsq+PP1lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyC = FR_rsq, FR_polyC, f0 // poly = poly*rsq
+ nop.i 0
+};;
+
+
+.pred.rel "mutex",p8,p14
+{ .mfi
+ nop.m 0
+(p8) fms.s0 FR_U_hiS = f1, f0, FR_U_hiS
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p14) fms.s0 FR_U_hiS = f1, f0, FR_U_hiS
+ nop.i 0
+};;
+
+.pred.rel "mutex",p10,p12
+{ .mfi
+ nop.m 0
+(p10) fms.s0 FR_U_hiC = f1, f0, FR_U_hiC
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p12) fms.s0 FR_U_hiC = f1, f0, FR_U_hiC
+ nop.i 0
+};;
+
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_VS = FR_U_loS, f1, FR_corrS // V = U_lo + corr
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_VC = FR_U_loC, f1, FR_corrC // V = U_lo + corr
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s0 FR_inexact = FR_PP_5, FR_PP_4, f0 // Dummy op to set inexact
+ nop.i 0
+};;
+
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyS = FR_r_cubed, FR_polyS, f0 // poly = poly*r^3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_polyC = FR_rsq, FR_polyC, f0 // poly = poly*rsq
+ nop.i 0
+};;
+
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_VS = FR_polyS, f1, FR_VS // V = poly + V
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_VC = FR_polyC, f1, FR_VC // V = poly + V
+ nop.i 0
+};;
+
+
+
+.pred.rel "mutex",p7,p8
+{ .mfi
+ nop.m 0
+(p7) fma.s0 FR_ResultS = FR_U_hiS, f1, FR_VS
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p8) fms.s0 FR_ResultS = FR_U_hiS, f1, FR_VS
+ nop.i 0
+};;
+
+.pred.rel "mutex",p9,p10
+{ .mfi
+ nop.m 0
+(p9) fma.s0 FR_ResultC = FR_U_hiC, f1, FR_VC
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p10) fms.s0 FR_ResultC = FR_U_hiC, f1, FR_VC
+ nop.i 0
+};;
+
+
+
+.pred.rel "mutex",p11,p12
+{ .mfi
+ nop.m 0
+(p11) fma.s0 FR_ResultS = FR_U_hiC, f1, FR_VC
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p12) fms.s0 FR_ResultS = FR_U_hiC, f1, FR_VC
+ nop.i 0
+};;
+
+.pred.rel "mutex",p13,p14
+{ .mfi
+ nop.m 0
+(p13) fma.s0 FR_ResultC = FR_U_hiS, f1, FR_VS
+ nop.i 0
+}
+{ .mfb
+ cmp.eq p15, p0 = 0x1, GR_Cis
+(p14) fms.s0 FR_ResultC = FR_U_hiS, f1, FR_VS
+(p15) br.ret.sptk b0
+};;
+
+{ .mmb // exit for sincosl
+ stfe [sincos_pResSin] = FR_ResultS
+ stfe [sincos_pResCos] = FR_ResultC
+ br.ret.sptk b0
+};;
+
+
+
+
+
+SINCOSL_ZERO:
+
+{ .mfi
+ nop.m 0
+ fmerge.s FR_ResultS = FR_Input_X, FR_Input_X // If sin, result = input
+ nop.i 0
+}
+{ .mfb
+ cmp.eq p15, p0 = 0x1, GR_Cis
+ fma.s0 FR_ResultC = f1, f1, f0 // If cos, result=1.0
+(p15) br.ret.sptk b0
+};;
+
+{ .mmb // exit for sincosl
+ stfe [sincos_pResSin] = FR_ResultS
+ stfe [sincos_pResCos] = FR_ResultC
+ br.ret.sptk b0
+};;
+
+
+SINCOSL_DENORMAL:
+{ .mmb
+ getf.exp GR_signexp_x = FR_norm_x // Get sign and exponent of x
+ nop.m 999
+ br.cond.sptk SINCOSL_COMMON2 // Return to common code
+}
+;;
+
+
+SINCOSL_SPECIAL:
+//
+// Path for Arg = +/- QNaN, SNaN, Inf
+// Invalid can be raised. SNaNs
+// become QNaNs
+//
+{ .mfi
+ cmp.eq p15, p0 = 0x1, GR_Cis
+ fmpy.s0 FR_ResultS = FR_Input_X, f0
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ fmpy.s0 FR_ResultC = FR_Input_X, f0
+(p15) br.ret.sptk b0
+};;
+
+{ .mmb // exit for sincosl
+ stfe [sincos_pResSin] = FR_ResultS
+ stfe [sincos_pResCos] = FR_ResultC
+ br.ret.sptk b0
+};;
+
+GLOBAL_LIBM_END(__libm_sincosl)
+
+// *******************************************************************
+// *******************************************************************
+// *******************************************************************
+//
+// Special Code to handle very large argument case.
+// Call int __libm_pi_by_2_reduce(x,r,c) for |arguments| >= 2**63
+// The interface is custom:
+// On input:
+// (Arg or x) is in f8
+// On output:
+// r is in f8
+// c is in f9
+// N is in r8
+// Be sure to allocate at least 2 GP registers as output registers for
+// __libm_pi_by_2_reduce. This routine uses r49-50. These are used as
+// scratch registers within the __libm_pi_by_2_reduce routine (for speed).
+//
+// We know also that __libm_pi_by_2_reduce preserves f10-15, f71-127. We
+// use this to eliminate save/restore of key fp registers in this calling
+// function.
+//
+// *******************************************************************
+// *******************************************************************
+// *******************************************************************
+
+LOCAL_LIBM_ENTRY(__libm_callout)
+SINCOSL_ARG_TOO_LARGE: // Large-argument path (|x| >= 2^63): reduce via __libm_pi_by_2_reduce
+.prologue
+{ .mfi
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+};;
+
+{ .mmi
+ setf.exp FR_Two_to_M3 = GR_exp_2_to_m3 // Form 2^-3
+ mov GR_SAVE_GP=gp // Save gp
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+
+.body
+//
+// Call argument reduction with x in f8
+// Returns with N in r8, r in f8, c in f9
+// Assumes f71-127 are preserved across the call
+//
+{ .mib
+ setf.exp FR_Neg_Two_to_M3 = GR_exp_m2_to_m3 // Form -(2^-3)
+ nop.i 0
+ br.call.sptk b0=__libm_pi_by_2_reduce# // Call reduction; b0 receives the return link
+};;
+
+{ .mfi
+ mov GR_N_Inc = r8 // Pick up N returned in r8
+ fcmp.lt.unc.s1 p6, p0 = FR_r, FR_Two_to_M3 // p6 = (r < 2^-3)
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+
+{ .mfi
+ mov gp = GR_SAVE_GP // Restore gp
+(p6) fcmp.gt.unc.s1 p6, p0 = FR_r, FR_Neg_Two_to_M3 // p6 = (-(2^-3) < r < 2^-3)
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+};;
+
+{ .mbb
+ nop.m 0
+(p6) br.cond.spnt SINCOSL_SMALL_R // Branch if |r|< 2^-3 for |x| >= 2^63
+ br.cond.sptk SINCOSL_NORMAL_R // Branch if |r|>=2^-3 for |x| >= 2^63
+};;
+
+LOCAL_LIBM_END(__libm_callout)
+
+.type __libm_pi_by_2_reduce#,@function
+.global __libm_pi_by_2_reduce#
+
+
+
diff --git a/sysdeps/ia64/fpu/libm_support.h b/sysdeps/ia64/fpu/libm_support.h
index 5d3498dfc9..50dac33133 100644
--- a/sysdeps/ia64/fpu/libm_support.h
+++ b/sysdeps/ia64/fpu/libm_support.h
@@ -1,9 +1,10 @@
-//
-// Copyright (C) 2000, 2001, Intel Corporation
+/* file: libm_support.h */
+
+
+// Copyright (c) 2000 - 2002, Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -19,14 +20,14 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
@@ -34,45 +35,51 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-// History: 02/02/2000 Initial version
+// History: 02/02/2000 Initial version
// 2/28/2000 added tags for logb and nextafter
-// 3/22/2000 Changes to support _LIB_VERSION variable
-// and filled some enum gaps. Added support for C99.
+// 3/22/2000 Changes to support _LIB_VERSIONIMF variable
+// and filled some enum gaps. Added support for C99.
// 5/31/2000 added prototypes for __libm_frexp_4l/8l
-// 8/10/2000 Changed declaration of _LIB_VERSION to work for library
+// 8/10/2000 Changed declaration of _LIB_VERSIONIMF to work for library
// builds and other application builds (precompiler directives).
// 8/11/2000 Added pointers-to-matherr-functions declarations to allow
// for user-defined matherr functions in the dll build.
// 12/07/2000 Added scalbn error_types values.
+// 5/01/2001 Added error_types values for C99 nearest integer
+// functions.
+// 6/07/2001 Added error_types values for fdim.
+// 6/18/2001 Added include of complex_support.h.
+// 8/03/2001 Added error_types values for nexttoward, scalbln.
+// 8/23/2001 Corrected tag numbers from 186 and higher.
+// 8/27/2001 Added check for long int and long long int definitions.
+// 12/10/2001 Added error_types for erfc.
+// 12/27/2001 Added error_types for degree argument functions.
+// 01/02/2002 Added error_types for tand, cotd.
+// 01/04/2002 Delete include of complex_support.h
+// 01/23/2002 Deleted prototypes for __libm_frexp*. Added check for
+// multiple int, long int, and long long int definitions.
+// 05/20/2002 Added error_types for cot.
+// 06/27/2002 Added error_types for sinhcosh.
+// 12/05/2002 Added error_types for annuity and compound
+// 04/10/2003 Added error_types for tgammal/tgamma/tgammaf
//
-#ifndef __ASSEMBLER__
-#include <math.h>
-
-float __libm_frexp_4f( float x, int* exp);
-float _GI___libm_frexp_4f( float x, int* exp);
-float __libm_frexp_8f( float x, int* exp);
-double __libm_frexp_4( double x, int* exp);
-double _GI___libm_frexp_4( double x, int* exp);
-double __libm_frexp_8( double x, int* exp);
-long double __libm_frexp_4l( long double x, int* exp);
-long double _GI___libm_frexp_4l( long double x, int* exp);
-long double __libm_frexp_8l( long double x, int* exp);
void __libm_sincos_pi4(double,double*,double*,int);
void __libm_y0y1(double , double *, double *);
void __libm_j0j1(double , double *, double *);
-double __libm_lgamma_kernel(double,int*,int,int);
double __libm_j0(double);
double __libm_j1(double);
double __libm_jn(int,double);
double __libm_y0(double);
double __libm_y1(double);
double __libm_yn(int,double);
+double __libm_copysign (double, double);
+float __libm_copysignf (float, float);
+long double __libm_copysignl (long double, long double);
-extern double rint(double);
extern double sqrt(double);
extern double fabs(double);
extern double log(double);
@@ -112,24 +119,31 @@ extern long double log1pl(long double);
extern long double logl(long double);
extern long double sqrtl(long double);
extern long double expl(long double);
-
-extern long lround(double);
-extern long lroundf(float);
-extern long lroundl(long double);
+extern long double fabsl(long double);
#if !(defined(SIZE_INT_32) || defined(SIZE_INT_64))
- #error integer size not established; define SIZE_INT_32 or SIZE_INT_64
+#error integer size not established; define SIZE_INT_32 or SIZE_INT_64
#endif
-struct fp64 { /*/ sign:1 exponent:11 significand:52 (implied leading 1)*/
- unsigned lo_significand:32;
- unsigned hi_significand:20;
- unsigned exponent:11;
- unsigned sign:1;
-};
+#if (defined(SIZE_INT_32) && defined(SIZE_INT_64))
+#error multiple integer size definitions; define SIZE_INT_32 or SIZE_INT_64
+#endif
-#define HI_SIGNIFICAND_LESS(X, HI) ((X)->hi_significand < 0x ## HI)
-#define f64abs(x) ((x) < 0.0 ? -(x) : (x))
+#if !(defined(SIZE_LONG_INT_32) || defined(SIZE_LONG_INT_64))
+#error long int size not established; define SIZE_LONG_INT_32 or SIZE_LONG_INT_64
+#endif
+
+#if (defined(SIZE_LONG_INT_32) && defined(SIZE_LONG_INT_64))
+#error multiple long int size definitions; define SIZE_LONG_INT_32 or SIZE_LONG_INT_64
+#endif
+
+#if !(defined(SIZE_LONG_LONG_INT_32) || defined(SIZE_LONG_LONG_INT_64))
+#error long long int size not established; define SIZE_LONG_LONG_INT_32 or SIZE_LONG_LONG_INT_64
+#endif
+
+#if (defined(SIZE_LONG_LONG_INT_32) && defined(SIZE_LONG_LONG_INT_64))
+#error multiple long long int size definitions; define SIZE_LONG_LONG_INT_32 or SIZE_LONG_LONG_INT_64
+#endif
typedef enum
{
@@ -148,14 +162,14 @@ typedef enum
powl_neg_to_non_integer, /* 22 */
powl_nan_to_zero, /* 23 */
pow_overflow, pow_underflow, /* 24, 25 */
- pow_zero_to_zero, /* 26 */
+ pow_zero_to_zero, /* 26 */
pow_zero_to_negative, /* 27 */
pow_neg_to_non_integer, /* 28 */
pow_nan_to_zero, /* 29 */
powf_overflow, powf_underflow, /* 30, 31 */
powf_zero_to_zero, /* 32 */
- powf_zero_to_negative, /* 33 */
- powf_neg_to_non_integer, /* 34 */
+ powf_zero_to_negative, /* 33 */
+ powf_neg_to_non_integer, /* 34 */
powf_nan_to_zero, /* 35 */
atan2l_zero, /* 36 */
atan2_zero, /* 37 */
@@ -181,13 +195,13 @@ typedef enum
y0l_zero, y0l_negative,y0l_gt_loss, /* 66, 67, 68 */
y0_zero, y0_negative,y0_gt_loss, /* 69, 70, 71 */
y0f_zero, y0f_negative,y0f_gt_loss, /* 72, 73, 74 */
- y1l_zero, y1l_negative,y1l_gt_loss, /* 75, 76, 77 */
- y1_zero, y1_negative,y1_gt_loss, /* 78, 79, 80 */
- y1f_zero, y1f_negative,y1f_gt_loss, /* 81, 82, 83 */
+ y1l_zero, y1l_negative,y1l_gt_loss, /* 75, 76, 77 */
+ y1_zero, y1_negative,y1_gt_loss, /* 78, 79, 80 */
+ y1f_zero, y1f_negative,y1f_gt_loss, /* 81, 82, 83 */
ynl_zero, ynl_negative,ynl_gt_loss, /* 84, 85, 86 */
yn_zero, yn_negative,yn_gt_loss, /* 87, 88, 89 */
ynf_zero, ynf_negative,ynf_gt_loss, /* 90, 91, 92 */
- j0l_gt_loss, /* 93 */
+ j0l_gt_loss, /* 93 */
j0_gt_loss, /* 94 */
j0f_gt_loss, /* 95 */
j1l_gt_loss, /* 96 */
@@ -201,7 +215,7 @@ typedef enum
lgammaf_overflow, lgammaf_negative, lgammaf_reserve,/* 108, 109, 110 */
gammal_overflow,gammal_negative, gammal_reserve, /* 111, 112, 113 */
gamma_overflow, gamma_negative, gamma_reserve, /* 114, 115, 116 */
- gammaf_overflow,gammaf_negative,gammaf_reserve, /* 117, 118, 119 */
+ gammaf_overflow,gammaf_negative,gammaf_reserve, /* 117, 118, 119 */
fmodl_by_zero, /* 120 */
fmod_by_zero, /* 121 */
fmodf_by_zero, /* 122 */
@@ -222,7 +236,7 @@ typedef enum
ldexp_overflow, ldexp_underflow, /* 146, 147 */
ldexpf_overflow, ldexpf_underflow, /* 148, 149 */
logbl_zero, logb_zero, logbf_zero, /* 150, 151, 152 */
- nextafterl_overflow, nextafter_overflow,
+ nextafterl_overflow, nextafter_overflow,
nextafterf_overflow, /* 153, 154, 155 */
ilogbl_zero, ilogb_zero, ilogbf_zero, /* 156, 157, 158 */
exp2l_overflow, exp2l_underflow, /* 159, 160 */
@@ -235,18 +249,406 @@ typedef enum
log2f_zero, log2f_negative, /* 172, 173 */
scalbnl_overflow, scalbnl_underflow, /* 174, 175 */
scalbn_overflow, scalbn_underflow, /* 176, 177 */
- scalbnf_overflow, scalbnf_underflow /* 178, 179 */
+ scalbnf_overflow, scalbnf_underflow, /* 178, 179 */
+ remquol_by_zero, /* 180 */
+ remquo_by_zero, /* 181 */
+ remquof_by_zero, /* 182 */
+ lrintl_large, lrint_large, lrintf_large, /* 183, 184, 185 */
+ llrintl_large, llrint_large, llrintf_large, /* 186, 187, 188 */
+ lroundl_large, lround_large, lroundf_large, /* 189, 190, 191 */
+ llroundl_large, llround_large, llroundf_large, /* 192, 193, 194 */
+ fdiml_overflow, fdim_overflow, fdimf_overflow, /* 195, 196, 197 */
+ nexttowardl_overflow, nexttoward_overflow,
+ nexttowardf_overflow, /* 198, 199, 200 */
+ scalblnl_overflow, scalblnl_underflow, /* 201, 202 */
+ scalbln_overflow, scalbln_underflow, /* 203, 204 */
+ scalblnf_overflow, scalblnf_underflow, /* 205, 206 */
+ erfcl_underflow, erfc_underflow, erfcf_underflow, /* 207, 208, 209 */
+ acosdl_gt_one, acosd_gt_one, acosdf_gt_one, /* 210, 211, 212 */
+ asindl_gt_one, asind_gt_one, asindf_gt_one, /* 213, 214, 215 */
+ atan2dl_zero, atan2d_zero, atan2df_zero, /* 216, 217, 218 */
+ tandl_overflow, tand_overflow, tandf_overflow, /* 219, 220, 221 */
+ cotdl_overflow, cotd_overflow, cotdf_overflow, /* 222, 223, 224 */
+ cotl_overflow, cot_overflow, cotf_overflow, /* 225, 226, 227 */
+ sinhcoshl_overflow, sinhcosh_overflow, sinhcoshf_overflow, /* 228, 229, 230 */
+ annuityl_by_zero, annuity_by_zero, annuityf_by_zero, /* 231, 232, 233 */
+ annuityl_less_m1, annuity_less_m1, annuityf_less_m1, /* 234, 235, 236 */
+ annuityl_overflow, annuity_overflow, annuityf_overflow, /* 237, 238, 239 */
+ annuityl_underflow, annuity_underflow, annuityf_underflow, /* 240, 241, 242 */
+ compoundl_by_zero, compound_by_zero, compoundf_by_zero, /* 243, 244, 245 */
+ compoundl_less_m1, compound_less_m1, compoundf_less_m1, /* 246, 247, 248 */
+ compoundl_overflow, compound_overflow, compoundf_overflow, /* 249, 250, 251 */
+ compoundl_underflow, compound_underflow, compoundf_underflow, /* 252, 253, 254 */
+ tgammal_overflow, tgammal_negative, tgammal_reserve, /* 255, 256, 257 */
+ tgamma_overflow, tgamma_negative, tgamma_reserve, /* 258, 259, 260 */
+ tgammaf_overflow, tgammaf_negative, tgammaf_reserve, /* 261, 262, 263 */
} error_types;
void __libm_error_support(void*,void*,void*,error_types);
+#ifdef _LIBC
libc_hidden_proto(__libm_error_support)
+#endif
+
+#define HI_SIGNIFICAND_LESS(X, HI) ((X)->hi_significand < 0x ## HI)
+#define f64abs(x) ((x) < 0.0 ? -(x) : (x))
+
+#if !defined(__USE_EXTERNAL_FPMEMTYP_H__)
+
+#define BIAS_32 0x007F
+#define BIAS_64 0x03FF
+#define BIAS_80 0x3FFF
+
+#define MAXEXP_32 0x00FE
+#define MAXEXP_64 0x07FE
+#define MAXEXP_80 0x7FFE
+
+#define EXPINF_32 0x00FF
+#define EXPINF_64 0x07FF
+#define EXPINF_80 0x7FFF
+
+struct fp32 { /* bit-field view of a 32-bit value: sign:1 exponent:8 significand:23 (implied leading 1) */
+#if defined(SIZE_INT_32)
+ unsigned significand:23; /* stored significand bits (the implied leading 1 is not stored) */
+ unsigned exponent:8; /* exponent field */
+ unsigned sign:1; /* sign bit */
+#elif defined(SIZE_INT_64) /* same field list as the SIZE_INT_32 branch */
+ unsigned significand:23;
+ unsigned exponent:8;
+ unsigned sign:1;
+#endif
+};
+
+struct fp64 { /* bit-field view of a 64-bit value: sign:1 exponent:11 significand:52 (implied leading 1) */
+#if defined(SIZE_INT_32)
+ unsigned lo_significand:32; /* low 32 bits of the 52-bit significand */
+ unsigned hi_significand:20; /* high 20 bits of the 52-bit significand */
+ unsigned exponent:11; /* exponent field */
+ unsigned sign:1; /* sign bit */
+#elif defined(SIZE_INT_64)
+ unsigned significand:52; /* NOTE(review): needs 'unsigned' to be at least 52 bits wide -- presumably what SIZE_INT_64 asserts; confirm */
+ unsigned exponent:11; /* exponent field */
+ unsigned sign:1; /* sign bit */
+#endif
+};
+
+struct fp80 { /* bit-field view of an 80-bit value: sign:1 exponent:15 significand:64 (NO implied bits) */
+#if defined(SIZE_INT_32)
+ unsigned lo_significand; /* low word of the significand */
+ unsigned hi_significand; /* high word of the significand; its top bit is the explicit leading bit */
+ unsigned exponent:15; /* exponent field */
+ unsigned sign:1; /* sign bit */
+#elif defined(SIZE_INT_64)
+ unsigned significand; /* NOTE(review): must hold all 64 significand bits -- presumably 'unsigned' is 64 bits wide when SIZE_INT_64 is defined; confirm */
+ unsigned exponent:15; /* exponent field */
+ unsigned sign:1; /* sign bit */
+#endif
+};
+
+#endif /*__USE_EXTERNAL_FPMEMTYP_H__*/
+
+/* macros to form a double value in hex representation (unsigned int type) */
+
+#define DOUBLE_HEX(hi,lo) 0x##lo,0x##hi /*LITTLE_ENDIAN*/
+
+/* macros to form a long double value in hex representation (unsigned short type) */
+
+#if defined(_WIN32) || defined(_WIN64)
+#define LDOUBLE_ALIGN 16
+#else
+#define LDOUBLE_ALIGN 12
+#endif
+
+#if (LDOUBLE_ALIGN == 16)
+#define _XPD_ ,0x0000,0x0000,0x0000
+#else /*12*/
+#define _XPD_ ,0x0000
+#endif
+
+#define LDOUBLE_HEX(w4,w3,w2,w1,w0) 0x##w0,0x##w1,0x##w2,0x##w3,0x##w4 _XPD_ /*LITTLE_ENDIAN*/
+
+/* macros to sign-expand low 'num' bits of 'val' to native integer */
-#define BIAS_64 1023
-#define EXPINF_64 2047
+#if defined(SIZE_INT_32)
+# define SIGN_EXPAND(val,num) (((int)(val) << (32-(num))) >> (32-(num))) /* sign-extend the 'num' LSBs of 'val'; fully parenthesized so it composes inside larger expressions */
+#elif defined(SIZE_INT_64)
+# define SIGN_EXPAND(val,num) (((int)(val) << (64-(num))) >> (64-(num))) /* sign-extend the 'num' LSBs of 'val'; fully parenthesized so it composes inside larger expressions */
+#endif
+
+/* macros to form pointers to FP number on-the-fly */
+
+#define FP32(f) ((struct fp32 *)&(f)) /* 'f' must be an lvalue; yields a struct fp32 pointer to its storage */
+#define FP64(d) ((struct fp64 *)&(d)) /* 'd' must be an lvalue; yields a struct fp64 pointer to its storage */
+#define FP80(ld) ((struct fp80 *)&(ld)) /* 'ld' must be an lvalue; yields a struct fp80 pointer to its storage */
+
+/* macros to extract signed low and high doubleword of long double */
+
+#if defined(SIZE_INT_32)
+# define HI_DWORD_80(ld) ((((FP80(ld)->sign << 15) | FP80(ld)->exponent) << 16) | \
+ ((FP80(ld)->hi_significand >> 16) & 0xFFFF))
+# define LO_DWORD_80(ld) SIGN_EXPAND(FP80(ld)->lo_significand, 32)
+#elif defined(SIZE_INT_64)
+# define HI_DWORD_80(ld) ((((FP80(ld)->sign << 15) | FP80(ld)->exponent) << 16) | \
+ ((FP80(ld)->significand >> 48) & 0xFFFF))
+# define LO_DWORD_80(ld) SIGN_EXPAND(FP80(ld)->significand, 32)
+#endif
+
+/* macros to extract the high bits of the significand.
+ * note that the explicit high bit does not count toward NBITS (it is returned as is)
+ */
+
+#if defined(SIZE_INT_32)
+# define HI_SIGNIFICAND_80(X,NBITS) ((X)->hi_significand >> (31 - (NBITS)))
+#elif defined(SIZE_INT_64)
+# define HI_SIGNIFICAND_80(X,NBITS) ((X)->significand >> (63 - (NBITS)))
+#endif
+
+/* macros to check whether the significand bits are all zero, or whether some of them are non-zero.
+ * note that SIGNIFICAND_ZERO_80 tests the high bit also, but SIGNIFICAND_NONZERO_80 does not
+ */
+
+#define SIGNIFICAND_ZERO_32(X) ((X)->significand == 0)
+#define SIGNIFICAND_NONZERO_32(X) ((X)->significand != 0)
-#define DOUBLE_HEX(HI, LO) 0x ## LO, 0x ## HI
+#if defined(SIZE_INT_32)
+# define SIGNIFICAND_ZERO_64(X) (((X)->hi_significand == 0) && ((X)->lo_significand == 0))
+# define SIGNIFICAND_NONZERO_64(X) (((X)->hi_significand != 0) || ((X)->lo_significand != 0))
+#elif defined(SIZE_INT_64)
+# define SIGNIFICAND_ZERO_64(X) ((X)->significand == 0)
+# define SIGNIFICAND_NONZERO_64(X) ((X)->significand != 0)
+#endif
+
+#if defined(SIZE_INT_32)
+# define SIGNIFICAND_ZERO_80(X) (((X)->hi_significand == 0x00000000) && ((X)->lo_significand == 0))
+# define SIGNIFICAND_NONZERO_80(X) (((X)->hi_significand != 0x80000000) || ((X)->lo_significand != 0))
+#elif defined(SIZE_INT_64)
+# define SIGNIFICAND_ZERO_80(X) ((X)->significand == 0x0000000000000000)
+# define SIGNIFICAND_NONZERO_80(X) ((X)->significand != 0x8000000000000000)
+#endif
+
+/* macros to compare a value (fp32/fp64/fp80) with a constant, represented as hex; the sign field is not examined */
+
+#define SIGNIFICAND_EQ_HEX_32(X,BITS) ((X)->significand == 0x ## BITS)
+#define SIGNIFICAND_GT_HEX_32(X,BITS) ((X)->significand > 0x ## BITS)
+#define SIGNIFICAND_GE_HEX_32(X,BITS) ((X)->significand >= 0x ## BITS)
+#define SIGNIFICAND_LT_HEX_32(X,BITS) ((X)->significand < 0x ## BITS)
+#define SIGNIFICAND_LE_HEX_32(X,BITS) ((X)->significand <= 0x ## BITS)
+
+#if defined(SIZE_INT_32)
+# define SIGNIFICAND_EQ_HEX_64(X,HI,LO) \
+ (((X)->hi_significand == 0x ## HI) && ((X)->lo_significand == 0x ## LO))
+# define SIGNIFICAND_GT_HEX_64(X,HI,LO) (((X)->hi_significand > 0x ## HI) || \
+ (((X)->hi_significand == 0x ## HI) && ((X)->lo_significand > 0x ## LO)))
+# define SIGNIFICAND_GE_HEX_64(X,HI,LO) (((X)->hi_significand > 0x ## HI) || \
+ (((X)->hi_significand == 0x ## HI) && ((X)->lo_significand >= 0x ## LO)))
+# define SIGNIFICAND_LT_HEX_64(X,HI,LO) (((X)->hi_significand < 0x ## HI) || \
+ (((X)->hi_significand == 0x ## HI) && ((X)->lo_significand < 0x ## LO)))
+# define SIGNIFICAND_LE_HEX_64(X,HI,LO) (((X)->hi_significand < 0x ## HI) || \
+ (((X)->hi_significand == 0x ## HI) && ((X)->lo_significand <= 0x ## LO)))
+#elif defined(SIZE_INT_64)
+# define SIGNIFICAND_EQ_HEX_64(X,HI,LO) ((X)->significand == 0x ## HI ## LO)
+# define SIGNIFICAND_GT_HEX_64(X,HI,LO) ((X)->significand > 0x ## HI ## LO)
+# define SIGNIFICAND_GE_HEX_64(X,HI,LO) ((X)->significand >= 0x ## HI ## LO)
+# define SIGNIFICAND_LT_HEX_64(X,HI,LO) ((X)->significand < 0x ## HI ## LO)
+# define SIGNIFICAND_LE_HEX_64(X,HI,LO) ((X)->significand <= 0x ## HI ## LO)
+#endif
+
+#if defined(SIZE_INT_32)
+# define SIGNIFICAND_EQ_HEX_80(X,HI,LO) \
+ (((X)->hi_significand == 0x ## HI) && ((X)->lo_significand == 0x ## LO))
+# define SIGNIFICAND_GT_HEX_80(X,HI,LO) (((X)->hi_significand > 0x ## HI) || \
+ (((X)->hi_significand == 0x ## HI) && ((X)->lo_significand > 0x ## LO)))
+# define SIGNIFICAND_GE_HEX_80(X,HI,LO) (((X)->hi_significand > 0x ## HI) || \
+ (((X)->hi_significand == 0x ## HI) && ((X)->lo_significand >= 0x ## LO)))
+# define SIGNIFICAND_LT_HEX_80(X,HI,LO) (((X)->hi_significand < 0x ## HI) || \
+ (((X)->hi_significand == 0x ## HI) && ((X)->lo_significand < 0x ## LO)))
+# define SIGNIFICAND_LE_HEX_80(X,HI,LO) (((X)->hi_significand < 0x ## HI) || \
+ (((X)->hi_significand == 0x ## HI) && ((X)->lo_significand <= 0x ## LO)))
+#elif defined(SIZE_INT_64)
+# define SIGNIFICAND_EQ_HEX_80(X,HI,LO) ((X)->significand == 0x ## HI ## LO)
+# define SIGNIFICAND_GT_HEX_80(X,HI,LO) ((X)->significand > 0x ## HI ## LO)
+# define SIGNIFICAND_GE_HEX_80(X,HI,LO) ((X)->significand >= 0x ## HI ## LO)
+# define SIGNIFICAND_LT_HEX_80(X,HI,LO) ((X)->significand < 0x ## HI ## LO)
+# define SIGNIFICAND_LE_HEX_80(X,HI,LO) ((X)->significand <= 0x ## HI ## LO)
+#endif
+
+#define VALUE_EQ_HEX_32(X,EXP,BITS) \
+ (((X)->exponent == (EXP)) && (SIGNIFICAND_EQ_HEX_32(X, BITS)))
+#define VALUE_GT_HEX_32(X,EXP,BITS) (((X)->exponent > (EXP)) || \
+ (((X)->exponent == (EXP)) && (SIGNIFICAND_GT_HEX_32(X, BITS))))
+#define VALUE_GE_HEX_32(X,EXP,BITS) (((X)->exponent > (EXP)) || \
+ (((X)->exponent == (EXP)) && (SIGNIFICAND_GE_HEX_32(X, BITS))))
+#define VALUE_LT_HEX_32(X,EXP,BITS) (((X)->exponent < (EXP)) || \
+ (((X)->exponent == (EXP)) && (SIGNIFICAND_LT_HEX_32(X, BITS))))
+#define VALUE_LE_HEX_32(X,EXP,BITS) (((X)->exponent < (EXP)) || \
+ (((X)->exponent == (EXP)) && (SIGNIFICAND_LE_HEX_32(X, BITS))))
+
+#define VALUE_EQ_HEX_64(X,EXP,HI,LO) \
+ (((X)->exponent == (EXP)) && (SIGNIFICAND_EQ_HEX_64(X, HI, LO)))
+#define VALUE_GT_HEX_64(X,EXP,HI,LO) (((X)->exponent > (EXP)) || \
+ (((X)->exponent == (EXP)) && (SIGNIFICAND_GT_HEX_64(X, HI, LO))))
+#define VALUE_GE_HEX_64(X,EXP,HI,LO) (((X)->exponent > (EXP)) || \
+ (((X)->exponent == (EXP)) && (SIGNIFICAND_GE_HEX_64(X, HI, LO))))
+#define VALUE_LT_HEX_64(X,EXP,HI,LO) (((X)->exponent < (EXP)) || \
+ (((X)->exponent == (EXP)) && (SIGNIFICAND_LT_HEX_64(X, HI, LO))))
+#define VALUE_LE_HEX_64(X,EXP,HI,LO) (((X)->exponent < (EXP)) || \
+ (((X)->exponent == (EXP)) && (SIGNIFICAND_LE_HEX_64(X, HI, LO))))
+
+#define VALUE_EQ_HEX_80(X,EXP,HI,LO) \
+ (((X)->exponent == (EXP)) && (SIGNIFICAND_EQ_HEX_80(X, HI, LO)))
+#define VALUE_GT_HEX_80(X,EXP,HI,LO) (((X)->exponent > (EXP)) || \
+ (((X)->exponent == (EXP)) && (SIGNIFICAND_GT_HEX_80(X, HI, LO))))
+#define VALUE_GE_HEX_80(X,EXP,HI,LO) (((X)->exponent > (EXP)) || \
+ (((X)->exponent == (EXP)) && (SIGNIFICAND_GE_HEX_80(X, HI, LO))))
+#define VALUE_LT_HEX_80(X,EXP,HI,LO) (((X)->exponent < (EXP)) || \
+ (((X)->exponent == (EXP)) && (SIGNIFICAND_LT_HEX_80(X, HI, LO))))
+#define VALUE_LE_HEX_80(X,EXP,HI,LO) (((X)->exponent < (EXP)) || \
+ (((X)->exponent == (EXP)) && (SIGNIFICAND_LE_HEX_80(X, HI, LO))))
+
+/* macros to compare two values of the same format (fp32/fp64/fp80); the sign field is not examined */
+
+#define SIGNIFICAND_EQ_32(X,Y) ((X)->significand == (Y)->significand)
+#define SIGNIFICAND_GT_32(X,Y) ((X)->significand > (Y)->significand)
+#define SIGNIFICAND_GE_32(X,Y) ((X)->significand >= (Y)->significand)
+#define SIGNIFICAND_LT_32(X,Y) ((X)->significand < (Y)->significand)
+#define SIGNIFICAND_LE_32(X,Y) ((X)->significand <= (Y)->significand)
+
+#if defined(SIZE_INT_32)
+# define SIGNIFICAND_EQ_64(X,Y) \
+ (((X)->hi_significand == (Y)->hi_significand) && ((X)->lo_significand == (Y)->lo_significand))
+# define SIGNIFICAND_GT_64(X,Y) (((X)->hi_significand > (Y)->hi_significand) || \
+ (((X)->hi_significand == (Y)->hi_significand) && ((X)->lo_significand > (Y)->lo_significand)))
+# define SIGNIFICAND_GE_64(X,Y) (((X)->hi_significand > (Y)->hi_significand) || \
+ (((X)->hi_significand == (Y)->hi_significand) && ((X)->lo_significand >= (Y)->lo_significand)))
+# define SIGNIFICAND_LT_64(X,Y) (((X)->hi_significand < (Y)->hi_significand) || \
+ (((X)->hi_significand == (Y)->hi_significand) && ((X)->lo_significand < (Y)->lo_significand)))
+# define SIGNIFICAND_LE_64(X,Y) (((X)->hi_significand < (Y)->hi_significand) || \
+ (((X)->hi_significand == (Y)->hi_significand) && ((X)->lo_significand <= (Y)->lo_significand)))
+#elif defined(SIZE_INT_64)
+# define SIGNIFICAND_EQ_64(X,Y) ((X)->significand == (Y)->significand)
+# define SIGNIFICAND_GT_64(X,Y) ((X)->significand > (Y)->significand)
+# define SIGNIFICAND_GE_64(X,Y) ((X)->significand >= (Y)->significand)
+# define SIGNIFICAND_LT_64(X,Y) ((X)->significand < (Y)->significand)
+# define SIGNIFICAND_LE_64(X,Y) ((X)->significand <= (Y)->significand)
+#endif
+
+#if defined(SIZE_INT_32)
+# define SIGNIFICAND_EQ_80(X,Y) \
+ (((X)->hi_significand == (Y)->hi_significand) && ((X)->lo_significand == (Y)->lo_significand))
+# define SIGNIFICAND_GT_80(X,Y) (((X)->hi_significand > (Y)->hi_significand) || \
+ (((X)->hi_significand == (Y)->hi_significand) && ((X)->lo_significand > (Y)->lo_significand)))
+# define SIGNIFICAND_GE_80(X,Y) (((X)->hi_significand > (Y)->hi_significand) || \
+ (((X)->hi_significand == (Y)->hi_significand) && ((X)->lo_significand >= (Y)->lo_significand)))
+# define SIGNIFICAND_LT_80(X,Y) (((X)->hi_significand < (Y)->hi_significand) || \
+ (((X)->hi_significand == (Y)->hi_significand) && ((X)->lo_significand < (Y)->lo_significand)))
+# define SIGNIFICAND_LE_80(X,Y) (((X)->hi_significand < (Y)->hi_significand) || \
+ (((X)->hi_significand == (Y)->hi_significand) && ((X)->lo_significand <= (Y)->lo_significand)))
+#elif defined(SIZE_INT_64)
+# define SIGNIFICAND_EQ_80(X,Y) ((X)->significand == (Y)->significand)
+# define SIGNIFICAND_GT_80(X,Y) ((X)->significand > (Y)->significand)
+# define SIGNIFICAND_GE_80(X,Y) ((X)->significand >= (Y)->significand)
+# define SIGNIFICAND_LT_80(X,Y) ((X)->significand < (Y)->significand)
+# define SIGNIFICAND_LE_80(X,Y) ((X)->significand <= (Y)->significand)
+#endif
+
+#define VALUE_EQ_32(X,Y) \
+ (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_EQ_32(X, Y)))
+#define VALUE_GT_32(X,Y) (((X)->exponent > (Y)->exponent) || \
+ (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_GT_32(X, Y))))
+#define VALUE_GE_32(X,Y) (((X)->exponent > (Y)->exponent) || \
+ (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_GE_32(X, Y))))
+#define VALUE_LT_32(X,Y) (((X)->exponent < (Y)->exponent) || \
+ (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_LT_32(X, Y))))
+#define VALUE_LE_32(X,Y) (((X)->exponent < (Y)->exponent) || \
+ (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_LE_32(X, Y))))
+
+#define VALUE_EQ_64(X,Y) \
+ (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_EQ_64(X, Y)))
+#define VALUE_GT_64(X,Y) (((X)->exponent > (Y)->exponent) || \
+ (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_GT_64(X, Y))))
+#define VALUE_GE_64(X,Y) (((X)->exponent > (Y)->exponent) || \
+ (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_GE_64(X, Y))))
+#define VALUE_LT_64(X,Y) (((X)->exponent < (Y)->exponent) || \
+ (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_LT_64(X, Y))))
+#define VALUE_LE_64(X,Y) (((X)->exponent < (Y)->exponent) || \
+ (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_LE_64(X, Y))))
+
+#define VALUE_EQ_80(X,Y) \
+ (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_EQ_80(X, Y)))
+#define VALUE_GT_80(X,Y) (((X)->exponent > (Y)->exponent) || \
+ (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_GT_80(X, Y))))
+#define VALUE_GE_80(X,Y) (((X)->exponent > (Y)->exponent) || \
+ (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_GE_80(X, Y))))
+#define VALUE_LT_80(X,Y) (((X)->exponent < (Y)->exponent) || \
+ (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_LT_80(X, Y))))
+#define VALUE_LE_80(X,Y) (((X)->exponent < (Y)->exponent) || \
+ (((X)->exponent == (Y)->exponent) && (SIGNIFICAND_LE_80(X, Y))))
+
+/* add/subtract 1 ulp macros */
+
+#if defined(SIZE_INT_32) /* significand is split into lo_significand / hi_significand */
+# define ADD_ULP_80(X) /* X: struct fp80 pointer, evaluated several times. Step the significand up by one; when the carry reaches the top bit, set that bit and bump the exponent */ \
+ if ((++(X)->lo_significand == 0) && \
+ (++(X)->hi_significand == (((X)->exponent == 0) ? 0x80000000 : 0))) \
+ { \
+ (X)->hi_significand |= 0x80000000; \
+ ++(X)->exponent; \
+ }
+# define SUB_ULP_80(X) /* X: struct fp80 pointer, evaluated several times. Step the significand down by one; when the borrow clears the top bit, drop the exponent and restore the bit unless the exponent reached 0 */ \
+ if (--(X)->lo_significand == 0xFFFFFFFF) { \
+ --(X)->hi_significand; \
+ if (((X)->exponent != 0) && \
+ ((X)->hi_significand == 0x7FFFFFFF) && \
+ (--(X)->exponent != 0)) \
+ { \
+ (X)->hi_significand |= 0x80000000; \
+ } \
+ }
+#elif defined(SIZE_INT_64) /* significand is held in a single field */
+# define ADD_ULP_80(X) /* same contract as the SIZE_INT_32 variant; the condition is now balanced (a surplus ')' used to break expansion) */ \
+ if (++(X)->significand == (((X)->exponent == 0) ? 0x8000000000000000 : 0)) { \
+ (X)->significand |= 0x8000000000000000; \
+ ++(X)->exponent; \
+ }
+# define SUB_ULP_80(X) /* same contract as the SIZE_INT_32 variant */ \
+ { \
+ --(X)->significand; \
+ if (((X)->exponent != 0) && \
+ ((X)->significand == 0x7FFFFFFFFFFFFFFF) && \
+ (--(X)->exponent != 0)) \
+ { \
+ (X)->significand |= 0x8000000000000000; \
+ } \
+ }
+#endif
+
+
+
+#if (defined(_WIN32) && !defined(_WIN64))
+
+#define FP80_DECLARE()
+#define _FPC_64 0x0300
+static unsigned short __wControlWord, __wNewControlWord;
+#define FP80_SET() { \
+ __asm { fnstcw word ptr [__wControlWord] } \
+ __wNewControlWord = __wControlWord | _FPC_64; \
+ __asm { fldcw word ptr [__wNewControlWord] } \
+ }
+#define FP80_RESET() { \
+ __asm { fldcw word ptr [__wControlWord] } \
+ }
+#else /* defined(_WIN32) && !defined(_WIN64) */
+
+#define FP80_DECLARE()
+#define FP80_SET()
+#define FP80_RESET()
+
+#endif /* defined(_WIN32) && !defined(_WIN64) */
+
+
+#ifdef _LIBC
+# include <math.h>
+#else
-#if 0
static const unsigned INF[] = {
DOUBLE_HEX(7ff00000, 00000000),
DOUBLE_HEX(fff00000, 00000000)
@@ -255,12 +657,12 @@ static const unsigned INF[] = {
static const double _zeroo = 0.0;
static const double _bigg = 1.0e300;
static const double _ponee = 1.0;
-static const double _nonee = -1.0;
+static const double _nonee = -1.0;
#define INVALID (_zeroo * *((double*)&INF[0]))
-#define PINF *((double*)&INF[0])
-#define NINF -PINF
-#define PINF_DZ (_ponee/_zeroo)
+#define PINF *((double*)&INF[0])
+#define NINF -PINF
+#define PINF_DZ (_ponee/_zeroo)
#define X_TLOSS 1.41484755040568800000e+16
#endif
@@ -278,7 +680,7 @@ struct __exception
char *name;
double arg1, arg2, retval;
};
-# else
+# else
# ifndef _LIBC
struct exception
@@ -300,18 +702,18 @@ struct exceptionl
};
#ifdef _MS_
-#define MATHERR_F _matherrf
-#define MATHERR_D _matherr
+#define MATHERR_F _matherrf
+#define MATHERR_D _matherr
#else
-#define MATHERR_F matherrf
-#define MATHERR_D matherr
+#define MATHERR_F matherrf
+#define MATHERR_D matherr
#endif
# ifdef __cplusplus
-#define EXC_DECL_D __exception
+#define EXC_DECL_D __exception
#else
// exception is a reserved name in C++
-#define EXC_DECL_D exception
+#define EXC_DECL_D exception
#endif
extern int MATHERR_F(struct exceptionf*);
@@ -324,7 +726,7 @@ extern int matherrl(struct exceptionl*);
#define ERRNO_DOMAIN errno = EDOM
-// Add code to support _LIB_VERSION
+// Add code to support _LIB_VERSIONIMF
#ifndef _LIBC
typedef enum
{
@@ -335,29 +737,19 @@ typedef enum
_ISOC_ // ISO C9X
} _LIB_VERSION_TYPE;
-extern _LIB_VERSION_TYPE _LIB_VERSION;
-#endif
-// This is a run-time variable and may effect
-// floating point behavior of the libm functions
-
-#elif defined _LIBC
-
-# if !defined NOT_IN_libc && defined SHARED && defined DO_VERSIONING \
- && !defined HAVE_BROKEN_ALIAS_ATTRIBUTE && !defined NO_HIDDEN
-# define __libm_error_support __GI___libm_error_support
-# endif
-
-#endif /* __ASSEMBLER__ */
-
-/* Support for compatible assembler handling. */
-#if !defined L && defined _LIBC
-#define L(name) .L##name
-#endif
-#ifdef __ELF__
-#define ASM_SIZE_DIRECTIVE(name) .size name,.-name
-#define ASM_TYPE_DIRECTIVE(name,T) .type name,T
+#if !defined( LIBM_BUILD )
+#if defined( _DLL )
+extern _LIB_VERSION_TYPE __declspec(dllimport) _LIB_VERSIONIMF;
+#else
+extern _LIB_VERSION_TYPE _LIB_VERSIONIMF;
+#endif /* _DLL */
#else
-#define ASM_SIZE_DIRECTIVE(name)
-#define ASM_TYPE_DIRECTIVE(name,T)
+extern int (*pmatherrf)(struct exceptionf*);
+extern int (*pmatherr)(struct EXC_DECL_D*);
+extern int (*pmatherrl)(struct exceptionl*);
+#endif /* LIBM_BUILD */
+
+// This is a run-time variable and may affect
+// floating point behavior of the libm functions
#endif
diff --git a/sysdeps/ia64/fpu/s_asinh.S b/sysdeps/ia64/fpu/s_asinh.S
new file mode 100644
index 0000000000..a9ef4e1143
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_asinh.S
@@ -0,0 +1,1136 @@
+.file "asinh.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// ==============================================================
+// History
+// ==============================================================
+// 04/02/01 Initial version
+// 04/19/01 Improved speed of the paths #1,2,3,4,5
+// 10/18/01 Improved accuracy
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/06/03 Reordered header: .section, .global, .proc, .align
+// 05/21/03 Improved performance, fixed to handle unorms
+//
+// API
+// ==============================================================
+// double asinh(double)
+//
+// Overview of operation
+// ==============================================================
+//
+// There are 7 paths:
+// 1. x = 0.0
+// Return asinh(x) = 0.0
+//
+// 2. 0.0 <|x| < 2^(-3)
+// Return asinh(x) = POL13(x),
+// where POL13(x) = (((x^2*C13 + ...)*x^2 + C5)*x^2 + C3)*x^3 + x
+//
+// 3. 2^(-3) <= |x| < 2^63
+// Return asinh(x) = sign(x)*(log(|x| + sqrt(x^2 + 1.0)))
+// To compute x + sqrt(x^2 + 1.0) modified Newton Raphson method is used
+// (3 iterations)
+// Algorithm description for log function see below.
+//
+// 4. 2^63 <= |x| < +INF
+// Return asinh(x) = sign(x)*log(2*|x|)
+// Algorithm description for log function see below.
+//
+// 5. x = INF
+// Return asinh(x) = INF
+//
+// 6. x = [S,Q]NaN
+// Return asinh(x) = QNaN
+//
+// 7. x = denormal
+// Return asinh(x) = x correctly rounded
+//
+//==============================================================
+// Algorithm Description for log(x) function
+// Below we are using the fact that inequality x - 1.0 > 2^(-6) is always
+// true for this asinh implementation
+//
+// Consider x = 2^N 1.f1 f2 f3 f4...f63
+// Log(x) = log(frcpa(x) x/frcpa(x))
+// = log(1/frcpa(x)) + log(frcpa(x) x)
+// = -log(frcpa(x)) + log(frcpa(x) x)
+//
+// frcpa(x) = 2^-N frcpa(1.f1 f2 ... f63)
+//
+// -log(frcpa(x)) = -log(C)
+// = -log(2^-N) - log(frcpa(1.f1 f2 ... f63))
+//
+// -log(frcpa(x)) = -log(C)
+// = +Nlog2 - log(frcpa(1.f1 f2 ... f63))
+//
+// -log(frcpa(x)) = -log(C)
+// = +Nlog2 + log(1/frcpa(1.f1 f2 ... f63))
+//
+// Log(x) = log(1/frcpa(x)) + log(frcpa(x) x)
+//
+// Log(x) = +Nlog2 + log(1./frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x)
+// Log(x) = +Nlog2 - log(frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x)
+// Log(x) = +Nlog2 + T + log(frcpa(x) x)
+//
+// Log(x) = +Nlog2 + T + log(C x)
+//
+// Cx = 1 + r
+//
+// Log(x) = +Nlog2 + T + log(1+r)
+// Log(x) = +Nlog2 + T + Series( r - r^2/2 + r^3/3 - r^4/4 ....)
+//
+// 1.f1 f2 ... f8 has 256 entries.
+// They are 1 + k/2^8, k = 0 ... 255
+// These 256 values are the table entries.
+//
+// Implementation
+//==============================================================
+// C = frcpa(x)
+// r = C * x - 1
+//
+// Form rseries = r + P1*r^2 + P2*r^3 + P3*r^4 + P4*r^5 + P5*r^6
+//
+// x = f * 2^n where f is 1.f_1f_2f_3....f_63
+// Nfloat = float(n) where n is the true unbiased exponent
+// pre-index = f_1f_2....f_8
+// index = pre_index * 16
+// get the dxt table entry at index + offset = T
+//
+// result = (T + Nfloat * log(2)) + rseries
+//
+// The T table is calculated as follows
+// Form x_k = 1 + k/2^8 where k goes from 0... 255
+// y_k = frcpa(x_k)
+// log(1/y_k) in quad and round to double-extended
+//
+//
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input
+// f9 -> f15, f32 -> f68
+
+// General registers used:
+// r14 -> r27
+
+// Predicate registers used:
+// p6 -> p14
+
+// p6 to filter out case when x = [Q,S]NaN or INF or zero
+// p7 to filter out case when x < 0.0
+// p8 to select path #2
+// p9 used in the frcpa from path #3
+// p11 to filter out case when x >= 0
+// p12 to filter out case when x = unorm
+// p13 to select path #4
+// Assembly macros
+//==============================================================
+log_GR_exp_17_ones = r14 // 0x1ffff, masks the sign off a getf.exp result
+log_GR_signexp_f8 = r15 // getf.exp of the log argument
+log_table_address2 = r16 // NR_table_address + 0x40
+log_GR_exp_16_ones = r17 // 0xffff, the exponent bias
+log_GR_exp_f8 = r18 // biased exponent of the log argument
+log_GR_true_exp_f8 = r19 // unbiased exponent N
+log_GR_significand_f8 = r20 // getf.sig of the log argument
+log_GR_index = r21 // 8 significand bits used to index the T table
+log_GR_comp2 = r22 // 0x1003e, exponent threshold for path 4 (|x| >= 2^63)
+asinh_GR_f8 = r23 // NOTE(review): assigned again (r25) two lines below; presumably the later value is the one in effect - confirm, then drop one
+asinh_GR_comp = r24 // 0xfffc, exponent threshold for path 2 (|x| < 2^-3)
+asinh_GR_f8 = r25 // getf.exp of x, later masked down to the exponent
+log_table_address3 = r26 // address of the selected T table entry
+NR_table_address = r27 // address of log_table_1, post-incremented by the loads
+
+//==============================================================
+log_y = f9 // y = x^2 + 1
+NR1 = f10 // 0.5, Newton-Raphson constant from log_table_2
+NR2 = f11 // 3.0, Newton-Raphson constant from log_table_2
+log_y_rs = f12 // z, the running approximation of 1/sqrt(y)
+log_y_rs_iter = f13
+log_y_rs_iter1 = f14
+fNormX = f15 // normalized x
+asinh_w_sq = f32 // x^2
+log_C13 = f33
+log_C11 = f34
+log_P3 = f35
+log_P2 = f36
+log_P1 = f37
+log_P5 = f38
+log_P4 = f39
+log_C3 = f40
+log_C5 = f41
+log_C7 = f42
+log2 = f43 // log(2) from log_table_1, negated on the x < 0 path
+asinh_f8 = f44 // |x|
+log_C = f45 // C, the frcpa result for the log argument
+log_arg = f46 // argument handed to the log computation
+log_C9 = f47
+asinh_w_four = f48
+log_int_Nfloat = f49 // N as an integer, moved in with setf.sig
+log_r = f50 // r = C * log_arg - 1
+log_rsq = f51 // r^2
+log_rp_p4 = f52 // P5*r + P4
+log_rp_p32 = f53 // P3*r + P2
+log_rcube = f54 // r^3
+log_rp_p10 = f55 // P1*r^2 + r
+log_rp_p2 = f56 // (P5*r + P4)*r^2 + P3*r + P2
+log_Nfloat = f57 // N converted to floating point
+log_T = f58 // T table entry
+log_r2P_r = f59 // complete series in r
+log_T_plus_Nlog2 = f60 // N*log2 combined with T
+asinh_w_3 = f61
+asinh_w_5 = f62
+asinh_w_cube = f63
+asinh_w_7 = f64
+log_arg_early = f65 // estimate of the log argument formed before the last NR step
+asinh_w_9 = f66
+asinh_w_13 = f67
+asinh_w_seven = f68
+
+// Data tables
+//==============================================================
+
+RODATA
+.align 16
+
+LOCAL_OBJECT_START(log_table_1)
+data8 0xBFC5555DA7212371 // P5, coefficient of r^6 in rseries
+data8 0x3FC999A19EEF5826 // P4, coefficient of r^5 in rseries
+data8 0xBFCFFFFFFFFEF009 // P3, coefficient of r^4 in rseries
+data8 0x3FD555555554ECB2 // P2, coefficient of r^3 in rseries
+data8 0xBFE0000000000000 // P1 = -0.5, coefficient of r^2 in rseries
+data8 0x0000000000000000 // pad, keeps the 16-byte log2 entry at offset 0x30
+data8 0xb17217f7d1cf79ac, 0x00003ffe // log2 = log(2), loaded with ldfe
+LOCAL_OBJECT_END(log_table_1)
+
+LOCAL_OBJECT_START(log_table_2)
+data8 0x3FE0000000000000 // 0.5, loaded into NR1
+data8 0x4008000000000000 // 3.0, loaded into NR2
+//
+data8 0x8824BE4D74BC4F00, 0x00003FF9 // C13, coefficient of x^13 in POL13
+data8 0xB725A2CD9556CC57, 0x0000BFF9 // C11, coefficient of x^11 in POL13
+data8 0xF8E339127FBFF49D, 0x00003FF9 // C9, coefficient of x^9 in POL13
+data8 0xB6DB6D7DCE17CB78, 0x0000BFFA // C7, coefficient of x^7 in POL13
+data8 0x999999998802CCEF, 0x00003FFB // C5, coefficient of x^5 in POL13
+data8 0xAAAAAAAAAAA8DC40, 0x0000BFFC // C3, coefficient of x^3 in POL13
+LOCAL_OBJECT_END(log_table_2)
+
+
+LOCAL_OBJECT_START(log_table_3)
+data8 0x80200aaeac44ef38 , 0x00003ff6 // log(1/frcpa(1+ 0/2^-8))
+//
+data8 0xc09090a2c35aa070 , 0x00003ff7 // log(1/frcpa(1+ 1/2^-8))
+data8 0xa0c94fcb41977c75 , 0x00003ff8 // log(1/frcpa(1+ 2/2^-8))
+data8 0xe18b9c263af83301 , 0x00003ff8 // log(1/frcpa(1+ 3/2^-8))
+data8 0x8d35c8d6399c30ea , 0x00003ff9 // log(1/frcpa(1+ 4/2^-8))
+data8 0xadd4d2ecd601cbb8 , 0x00003ff9 // log(1/frcpa(1+ 5/2^-8))
+//
+data8 0xce95403a192f9f01 , 0x00003ff9 // log(1/frcpa(1+ 6/2^-8))
+data8 0xeb59392cbcc01096 , 0x00003ff9 // log(1/frcpa(1+ 7/2^-8))
+data8 0x862c7d0cefd54c5d , 0x00003ffa // log(1/frcpa(1+ 8/2^-8))
+data8 0x94aa63c65e70d499 , 0x00003ffa // log(1/frcpa(1+ 9/2^-8))
+data8 0xa54a696d4b62b382 , 0x00003ffa // log(1/frcpa(1+ 10/2^-8))
+//
+data8 0xb3e4a796a5dac208 , 0x00003ffa // log(1/frcpa(1+ 11/2^-8))
+data8 0xc28c45b1878340a9 , 0x00003ffa // log(1/frcpa(1+ 12/2^-8))
+data8 0xd35c55f39d7a6235 , 0x00003ffa // log(1/frcpa(1+ 13/2^-8))
+data8 0xe220f037b954f1f5 , 0x00003ffa // log(1/frcpa(1+ 14/2^-8))
+data8 0xf0f3389b036834f3 , 0x00003ffa // log(1/frcpa(1+ 15/2^-8))
+//
+data8 0xffd3488d5c980465 , 0x00003ffa // log(1/frcpa(1+ 16/2^-8))
+data8 0x87609ce2ed300490 , 0x00003ffb // log(1/frcpa(1+ 17/2^-8))
+data8 0x8ede9321e8c85927 , 0x00003ffb // log(1/frcpa(1+ 18/2^-8))
+data8 0x96639427f2f8e2f4 , 0x00003ffb // log(1/frcpa(1+ 19/2^-8))
+data8 0x9defad3e8f73217b , 0x00003ffb // log(1/frcpa(1+ 20/2^-8))
+//
+data8 0xa582ebd50097029c , 0x00003ffb // log(1/frcpa(1+ 21/2^-8))
+data8 0xac06dbe75ab80fee , 0x00003ffb // log(1/frcpa(1+ 22/2^-8))
+data8 0xb3a78449b2d3ccca , 0x00003ffb // log(1/frcpa(1+ 23/2^-8))
+data8 0xbb4f79635ab46bb2 , 0x00003ffb // log(1/frcpa(1+ 24/2^-8))
+data8 0xc2fec93a83523f3f , 0x00003ffb // log(1/frcpa(1+ 25/2^-8))
+//
+data8 0xc99af2eaca4c4571 , 0x00003ffb // log(1/frcpa(1+ 26/2^-8))
+data8 0xd1581106472fa653 , 0x00003ffb // log(1/frcpa(1+ 27/2^-8))
+data8 0xd8002560d4355f2e , 0x00003ffb // log(1/frcpa(1+ 28/2^-8))
+data8 0xdfcb43b4fe508632 , 0x00003ffb // log(1/frcpa(1+ 29/2^-8))
+data8 0xe67f6dff709d4119 , 0x00003ffb // log(1/frcpa(1+ 30/2^-8))
+//
+data8 0xed393b1c22351280 , 0x00003ffb // log(1/frcpa(1+ 31/2^-8))
+data8 0xf5192bff087bcc35 , 0x00003ffb // log(1/frcpa(1+ 32/2^-8))
+data8 0xfbdf4ff6dfef2fa3 , 0x00003ffb // log(1/frcpa(1+ 33/2^-8))
+data8 0x81559a97f92f9cc7 , 0x00003ffc // log(1/frcpa(1+ 34/2^-8))
+data8 0x84be72bce90266e8 , 0x00003ffc // log(1/frcpa(1+ 35/2^-8))
+//
+data8 0x88bc74113f23def2 , 0x00003ffc // log(1/frcpa(1+ 36/2^-8))
+data8 0x8c2ba3edf6799d11 , 0x00003ffc // log(1/frcpa(1+ 37/2^-8))
+data8 0x8f9dc92f92ea08b1 , 0x00003ffc // log(1/frcpa(1+ 38/2^-8))
+data8 0x9312e8f36efab5a7 , 0x00003ffc // log(1/frcpa(1+ 39/2^-8))
+data8 0x968b08643409ceb6 , 0x00003ffc // log(1/frcpa(1+ 40/2^-8))
+//
+data8 0x9a062cba08a1708c , 0x00003ffc // log(1/frcpa(1+ 41/2^-8))
+data8 0x9d845b3abf95485c , 0x00003ffc // log(1/frcpa(1+ 42/2^-8))
+data8 0xa06fd841bc001bb4 , 0x00003ffc // log(1/frcpa(1+ 43/2^-8))
+data8 0xa3f3a74652fbe0db , 0x00003ffc // log(1/frcpa(1+ 44/2^-8))
+data8 0xa77a8fb2336f20f5 , 0x00003ffc // log(1/frcpa(1+ 45/2^-8))
+//
+data8 0xab0497015d28b0a0 , 0x00003ffc // log(1/frcpa(1+ 46/2^-8))
+data8 0xae91c2be6ba6a615 , 0x00003ffc // log(1/frcpa(1+ 47/2^-8))
+data8 0xb189d1b99aebb20b , 0x00003ffc // log(1/frcpa(1+ 48/2^-8))
+data8 0xb51cced5de9c1b2c , 0x00003ffc // log(1/frcpa(1+ 49/2^-8))
+data8 0xb819bee9e720d42f , 0x00003ffc // log(1/frcpa(1+ 50/2^-8))
+//
+data8 0xbbb2a0947b093a5d , 0x00003ffc // log(1/frcpa(1+ 51/2^-8))
+data8 0xbf4ec1505811684a , 0x00003ffc // log(1/frcpa(1+ 52/2^-8))
+data8 0xc2535bacfa8975ff , 0x00003ffc // log(1/frcpa(1+ 53/2^-8))
+data8 0xc55a3eafad187eb8 , 0x00003ffc // log(1/frcpa(1+ 54/2^-8))
+data8 0xc8ff2484b2c0da74 , 0x00003ffc // log(1/frcpa(1+ 55/2^-8))
+//
+data8 0xcc0b1a008d53ab76 , 0x00003ffc // log(1/frcpa(1+ 56/2^-8))
+data8 0xcfb6203844b3209b , 0x00003ffc // log(1/frcpa(1+ 57/2^-8))
+data8 0xd2c73949a47a19f5 , 0x00003ffc // log(1/frcpa(1+ 58/2^-8))
+data8 0xd5daae18b49d6695 , 0x00003ffc // log(1/frcpa(1+ 59/2^-8))
+data8 0xd8f08248cf7e8019 , 0x00003ffc // log(1/frcpa(1+ 60/2^-8))
+//
+data8 0xdca7749f1b3e540e , 0x00003ffc // log(1/frcpa(1+ 61/2^-8))
+data8 0xdfc28e033aaaf7c7 , 0x00003ffc // log(1/frcpa(1+ 62/2^-8))
+data8 0xe2e012a5f91d2f55 , 0x00003ffc // log(1/frcpa(1+ 63/2^-8))
+data8 0xe600064ed9e292a8 , 0x00003ffc // log(1/frcpa(1+ 64/2^-8))
+data8 0xe9226cce42b39f60 , 0x00003ffc // log(1/frcpa(1+ 65/2^-8))
+//
+data8 0xec4749fd97a28360 , 0x00003ffc // log(1/frcpa(1+ 66/2^-8))
+data8 0xef6ea1bf57780495 , 0x00003ffc // log(1/frcpa(1+ 67/2^-8))
+data8 0xf29877ff38809091 , 0x00003ffc // log(1/frcpa(1+ 68/2^-8))
+data8 0xf5c4d0b245cb89be , 0x00003ffc // log(1/frcpa(1+ 69/2^-8))
+data8 0xf8f3afd6fcdef3aa , 0x00003ffc // log(1/frcpa(1+ 70/2^-8))
+//
+data8 0xfc2519756be1abc7 , 0x00003ffc // log(1/frcpa(1+ 71/2^-8))
+data8 0xff59119f503e6832 , 0x00003ffc // log(1/frcpa(1+ 72/2^-8))
+data8 0x8147ce381ae0e146 , 0x00003ffd // log(1/frcpa(1+ 73/2^-8))
+data8 0x82e45f06cb1ad0f2 , 0x00003ffd // log(1/frcpa(1+ 74/2^-8))
+data8 0x842f5c7c573cbaa2 , 0x00003ffd // log(1/frcpa(1+ 75/2^-8))
+//
+data8 0x85ce471968c8893a , 0x00003ffd // log(1/frcpa(1+ 76/2^-8))
+data8 0x876e8305bc04066d , 0x00003ffd // log(1/frcpa(1+ 77/2^-8))
+data8 0x891012678031fbb3 , 0x00003ffd // log(1/frcpa(1+ 78/2^-8))
+data8 0x8a5f1493d766a05f , 0x00003ffd // log(1/frcpa(1+ 79/2^-8))
+data8 0x8c030c778c56fa00 , 0x00003ffd // log(1/frcpa(1+ 80/2^-8))
+//
+data8 0x8da85df17e31d9ae , 0x00003ffd // log(1/frcpa(1+ 81/2^-8))
+data8 0x8efa663e7921687e , 0x00003ffd // log(1/frcpa(1+ 82/2^-8))
+data8 0x90a22b6875c6a1f8 , 0x00003ffd // log(1/frcpa(1+ 83/2^-8))
+data8 0x91f62cc8f5d24837 , 0x00003ffd // log(1/frcpa(1+ 84/2^-8))
+data8 0x93a06cfc3857d980 , 0x00003ffd // log(1/frcpa(1+ 85/2^-8))
+//
+data8 0x94f66d5e6fd01ced , 0x00003ffd // log(1/frcpa(1+ 86/2^-8))
+data8 0x96a330156e6772f2 , 0x00003ffd // log(1/frcpa(1+ 87/2^-8))
+data8 0x97fb3582754ea25b , 0x00003ffd // log(1/frcpa(1+ 88/2^-8))
+data8 0x99aa8259aad1bbf2 , 0x00003ffd // log(1/frcpa(1+ 89/2^-8))
+data8 0x9b0492f6227ae4a8 , 0x00003ffd // log(1/frcpa(1+ 90/2^-8))
+//
+data8 0x9c5f8e199bf3a7a5 , 0x00003ffd // log(1/frcpa(1+ 91/2^-8))
+data8 0x9e1293b9998c1daa , 0x00003ffd // log(1/frcpa(1+ 92/2^-8))
+data8 0x9f6fa31e0b41f308 , 0x00003ffd // log(1/frcpa(1+ 93/2^-8))
+data8 0xa0cda11eaf46390e , 0x00003ffd // log(1/frcpa(1+ 94/2^-8))
+data8 0xa22c8f029cfa45aa , 0x00003ffd // log(1/frcpa(1+ 95/2^-8))
+//
+data8 0xa3e48badb7856b34 , 0x00003ffd // log(1/frcpa(1+ 96/2^-8))
+data8 0xa5459a0aa95849f9 , 0x00003ffd // log(1/frcpa(1+ 97/2^-8))
+data8 0xa6a79c84480cfebd , 0x00003ffd // log(1/frcpa(1+ 98/2^-8))
+data8 0xa80a946d0fcb3eb2 , 0x00003ffd // log(1/frcpa(1+ 99/2^-8))
+data8 0xa96e831a3ea7b314 , 0x00003ffd // log(1/frcpa(1+100/2^-8))
+//
+data8 0xaad369e3dc544e3b , 0x00003ffd // log(1/frcpa(1+101/2^-8))
+data8 0xac92e9588952c815 , 0x00003ffd // log(1/frcpa(1+102/2^-8))
+data8 0xadfa035aa1ed8fdc , 0x00003ffd // log(1/frcpa(1+103/2^-8))
+data8 0xaf6219eae1ad6e34 , 0x00003ffd // log(1/frcpa(1+104/2^-8))
+data8 0xb0cb2e6d8160f753 , 0x00003ffd // log(1/frcpa(1+105/2^-8))
+//
+data8 0xb2354249ad950f72 , 0x00003ffd // log(1/frcpa(1+106/2^-8))
+data8 0xb3a056e98ef4a3b4 , 0x00003ffd // log(1/frcpa(1+107/2^-8))
+data8 0xb50c6dba52c6292a , 0x00003ffd // log(1/frcpa(1+108/2^-8))
+data8 0xb679882c33876165 , 0x00003ffd // log(1/frcpa(1+109/2^-8))
+data8 0xb78c07429785cedc , 0x00003ffd // log(1/frcpa(1+110/2^-8))
+//
+data8 0xb8faeb8dc4a77d24 , 0x00003ffd // log(1/frcpa(1+111/2^-8))
+data8 0xba6ad77eb36ae0d6 , 0x00003ffd // log(1/frcpa(1+112/2^-8))
+data8 0xbbdbcc915e9bee50 , 0x00003ffd // log(1/frcpa(1+113/2^-8))
+data8 0xbd4dcc44f8cf12ef , 0x00003ffd // log(1/frcpa(1+114/2^-8))
+data8 0xbec0d81bf5b531fa , 0x00003ffd // log(1/frcpa(1+115/2^-8))
+//
+data8 0xc034f19c139186f4 , 0x00003ffd // log(1/frcpa(1+116/2^-8))
+data8 0xc14cb69f7c5e55ab , 0x00003ffd // log(1/frcpa(1+117/2^-8))
+data8 0xc2c2abbb6e5fd56f , 0x00003ffd // log(1/frcpa(1+118/2^-8))
+data8 0xc439b2c193e6771e , 0x00003ffd // log(1/frcpa(1+119/2^-8))
+data8 0xc553acb9d5c67733 , 0x00003ffd // log(1/frcpa(1+120/2^-8))
+//
+data8 0xc6cc96e441272441 , 0x00003ffd // log(1/frcpa(1+121/2^-8))
+data8 0xc8469753eca88c30 , 0x00003ffd // log(1/frcpa(1+122/2^-8))
+data8 0xc962cf3ce072b05c , 0x00003ffd // log(1/frcpa(1+123/2^-8))
+data8 0xcadeba8771f694aa , 0x00003ffd // log(1/frcpa(1+124/2^-8))
+data8 0xcc5bc08d1f72da94 , 0x00003ffd // log(1/frcpa(1+125/2^-8))
+//
+data8 0xcd7a3f99ea035c29 , 0x00003ffd // log(1/frcpa(1+126/2^-8))
+data8 0xcef93860c8a53c35 , 0x00003ffd // log(1/frcpa(1+127/2^-8))
+data8 0xd0192f68a7ed23df , 0x00003ffd // log(1/frcpa(1+128/2^-8))
+data8 0xd19a201127d3c645 , 0x00003ffd // log(1/frcpa(1+129/2^-8))
+data8 0xd2bb92f4061c172c , 0x00003ffd // log(1/frcpa(1+130/2^-8))
+//
+data8 0xd43e80b2ee8cc8fc , 0x00003ffd // log(1/frcpa(1+131/2^-8))
+data8 0xd56173601fc4ade4 , 0x00003ffd // log(1/frcpa(1+132/2^-8))
+data8 0xd6e6637efb54086f , 0x00003ffd // log(1/frcpa(1+133/2^-8))
+data8 0xd80ad9f58f3c8193 , 0x00003ffd // log(1/frcpa(1+134/2^-8))
+data8 0xd991d1d31aca41f8 , 0x00003ffd // log(1/frcpa(1+135/2^-8))
+//
+data8 0xdab7d02231484a93 , 0x00003ffd // log(1/frcpa(1+136/2^-8))
+data8 0xdc40d532cde49a54 , 0x00003ffd // log(1/frcpa(1+137/2^-8))
+data8 0xdd685f79ed8b265e , 0x00003ffd // log(1/frcpa(1+138/2^-8))
+data8 0xde9094bbc0e17b1d , 0x00003ffd // log(1/frcpa(1+139/2^-8))
+data8 0xe01c91b78440c425 , 0x00003ffd // log(1/frcpa(1+140/2^-8))
+//
+data8 0xe14658f26997e729 , 0x00003ffd // log(1/frcpa(1+141/2^-8))
+data8 0xe270cdc2391e0d23 , 0x00003ffd // log(1/frcpa(1+142/2^-8))
+data8 0xe3ffce3a2aa64922 , 0x00003ffd // log(1/frcpa(1+143/2^-8))
+data8 0xe52bdb274ed82887 , 0x00003ffd // log(1/frcpa(1+144/2^-8))
+data8 0xe6589852e75d7df6 , 0x00003ffd // log(1/frcpa(1+145/2^-8))
+//
+data8 0xe786068c79937a7d , 0x00003ffd // log(1/frcpa(1+146/2^-8))
+data8 0xe91903adad100911 , 0x00003ffd // log(1/frcpa(1+147/2^-8))
+data8 0xea481236f7d35bb0 , 0x00003ffd // log(1/frcpa(1+148/2^-8))
+data8 0xeb77d48c692e6b14 , 0x00003ffd // log(1/frcpa(1+149/2^-8))
+data8 0xeca84b83d7297b87 , 0x00003ffd // log(1/frcpa(1+150/2^-8))
+//
+data8 0xedd977f4962aa158 , 0x00003ffd // log(1/frcpa(1+151/2^-8))
+data8 0xef7179a22f257754 , 0x00003ffd // log(1/frcpa(1+152/2^-8))
+data8 0xf0a450d139366ca7 , 0x00003ffd // log(1/frcpa(1+153/2^-8))
+data8 0xf1d7e0524ff9ffdb , 0x00003ffd // log(1/frcpa(1+154/2^-8))
+data8 0xf30c29036a8b6cae , 0x00003ffd // log(1/frcpa(1+155/2^-8))
+//
+data8 0xf4412bc411ea8d92 , 0x00003ffd // log(1/frcpa(1+156/2^-8))
+data8 0xf576e97564c8619d , 0x00003ffd // log(1/frcpa(1+157/2^-8))
+data8 0xf6ad62fa1b5f172f , 0x00003ffd // log(1/frcpa(1+158/2^-8))
+data8 0xf7e499368b55c542 , 0x00003ffd // log(1/frcpa(1+159/2^-8))
+data8 0xf91c8d10abaffe22 , 0x00003ffd // log(1/frcpa(1+160/2^-8))
+//
+data8 0xfa553f7018c966f3 , 0x00003ffd // log(1/frcpa(1+161/2^-8))
+data8 0xfb8eb13e185d802c , 0x00003ffd // log(1/frcpa(1+162/2^-8))
+data8 0xfcc8e3659d9bcbed , 0x00003ffd // log(1/frcpa(1+163/2^-8))
+data8 0xfe03d6d34d487fd2 , 0x00003ffd // log(1/frcpa(1+164/2^-8))
+data8 0xff3f8c7581e9f0ae , 0x00003ffd // log(1/frcpa(1+165/2^-8))
+//
+data8 0x803e029e280173ae , 0x00003ffe // log(1/frcpa(1+166/2^-8))
+data8 0x80dca10cc52d0757 , 0x00003ffe // log(1/frcpa(1+167/2^-8))
+data8 0x817ba200632755a1 , 0x00003ffe // log(1/frcpa(1+168/2^-8))
+data8 0x821b05f3b01d6774 , 0x00003ffe // log(1/frcpa(1+169/2^-8))
+data8 0x82bacd623ff19d06 , 0x00003ffe // log(1/frcpa(1+170/2^-8))
+//
+data8 0x835af8c88e7a8f47 , 0x00003ffe // log(1/frcpa(1+171/2^-8))
+data8 0x83c5f8299e2b4091 , 0x00003ffe // log(1/frcpa(1+172/2^-8))
+data8 0x8466cb43f3d87300 , 0x00003ffe // log(1/frcpa(1+173/2^-8))
+data8 0x850803a67c80ca4b , 0x00003ffe // log(1/frcpa(1+174/2^-8))
+data8 0x85a9a1d11a23b461 , 0x00003ffe // log(1/frcpa(1+175/2^-8))
+//
+data8 0x864ba644a18e6e05 , 0x00003ffe // log(1/frcpa(1+176/2^-8))
+data8 0x86ee1182dcc432f7 , 0x00003ffe // log(1/frcpa(1+177/2^-8))
+data8 0x875a925d7e48c316 , 0x00003ffe // log(1/frcpa(1+178/2^-8))
+data8 0x87fdaa109d23aef7 , 0x00003ffe // log(1/frcpa(1+179/2^-8))
+data8 0x88a129ed4becfaf2 , 0x00003ffe // log(1/frcpa(1+180/2^-8))
+//
+data8 0x89451278ecd7f9cf , 0x00003ffe // log(1/frcpa(1+181/2^-8))
+data8 0x89b29295f8432617 , 0x00003ffe // log(1/frcpa(1+182/2^-8))
+data8 0x8a572ac5a5496882 , 0x00003ffe // log(1/frcpa(1+183/2^-8))
+data8 0x8afc2d0ce3b2dadf , 0x00003ffe // log(1/frcpa(1+184/2^-8))
+data8 0x8b6a69c608cfd3af , 0x00003ffe // log(1/frcpa(1+185/2^-8))
+//
+data8 0x8c101e106e899a83 , 0x00003ffe // log(1/frcpa(1+186/2^-8))
+data8 0x8cb63de258f9d626 , 0x00003ffe // log(1/frcpa(1+187/2^-8))
+data8 0x8d2539c5bd19e2b1 , 0x00003ffe // log(1/frcpa(1+188/2^-8))
+data8 0x8dcc0e064b29e6f1 , 0x00003ffe // log(1/frcpa(1+189/2^-8))
+data8 0x8e734f45d88357ae , 0x00003ffe // log(1/frcpa(1+190/2^-8))
+//
+data8 0x8ee30cef034a20db , 0x00003ffe // log(1/frcpa(1+191/2^-8))
+data8 0x8f8b0515686d1d06 , 0x00003ffe // log(1/frcpa(1+192/2^-8))
+data8 0x90336bba039bf32f , 0x00003ffe // log(1/frcpa(1+193/2^-8))
+data8 0x90a3edd23d1c9d58 , 0x00003ffe // log(1/frcpa(1+194/2^-8))
+data8 0x914d0de2f5d61b32 , 0x00003ffe // log(1/frcpa(1+195/2^-8))
+//
+data8 0x91be0c20d28173b5 , 0x00003ffe // log(1/frcpa(1+196/2^-8))
+data8 0x9267e737c06cd34a , 0x00003ffe // log(1/frcpa(1+197/2^-8))
+data8 0x92d962ae6abb1237 , 0x00003ffe // log(1/frcpa(1+198/2^-8))
+data8 0x9383fa6afbe2074c , 0x00003ffe // log(1/frcpa(1+199/2^-8))
+data8 0x942f0421651c1c4e , 0x00003ffe // log(1/frcpa(1+200/2^-8))
+//
+data8 0x94a14a3845bb985e , 0x00003ffe // log(1/frcpa(1+201/2^-8))
+data8 0x954d133857f861e7 , 0x00003ffe // log(1/frcpa(1+202/2^-8))
+data8 0x95bfd96468e604c4 , 0x00003ffe // log(1/frcpa(1+203/2^-8))
+data8 0x9632d31cafafa858 , 0x00003ffe // log(1/frcpa(1+204/2^-8))
+data8 0x96dfaabd86fa1647 , 0x00003ffe // log(1/frcpa(1+205/2^-8))
+//
+data8 0x9753261fcbb2a594 , 0x00003ffe // log(1/frcpa(1+206/2^-8))
+data8 0x9800c11b426b996d , 0x00003ffe // log(1/frcpa(1+207/2^-8))
+data8 0x9874bf4d45ae663c , 0x00003ffe // log(1/frcpa(1+208/2^-8))
+data8 0x99231f5ee9a74f79 , 0x00003ffe // log(1/frcpa(1+209/2^-8))
+data8 0x9997a18a56bcad28 , 0x00003ffe // log(1/frcpa(1+210/2^-8))
+//
+data8 0x9a46c873a3267e79 , 0x00003ffe // log(1/frcpa(1+211/2^-8))
+data8 0x9abbcfc621eb6cb6 , 0x00003ffe // log(1/frcpa(1+212/2^-8))
+data8 0x9b310cb0d354c990 , 0x00003ffe // log(1/frcpa(1+213/2^-8))
+data8 0x9be14cf9e1b3515c , 0x00003ffe // log(1/frcpa(1+214/2^-8))
+data8 0x9c5710b8cbb73a43 , 0x00003ffe // log(1/frcpa(1+215/2^-8))
+//
+data8 0x9ccd0abd301f399c , 0x00003ffe // log(1/frcpa(1+216/2^-8))
+data8 0x9d7e67f3bdce8888 , 0x00003ffe // log(1/frcpa(1+217/2^-8))
+data8 0x9df4ea81a99daa01 , 0x00003ffe // log(1/frcpa(1+218/2^-8))
+data8 0x9e6ba405a54514ba , 0x00003ffe // log(1/frcpa(1+219/2^-8))
+data8 0x9f1e21c8c7bb62b3 , 0x00003ffe // log(1/frcpa(1+220/2^-8))
+//
+data8 0x9f956593f6b6355c , 0x00003ffe // log(1/frcpa(1+221/2^-8))
+data8 0xa00ce1092e5498c3 , 0x00003ffe // log(1/frcpa(1+222/2^-8))
+data8 0xa0c08309c4b912c1 , 0x00003ffe // log(1/frcpa(1+223/2^-8))
+data8 0xa1388a8c6faa2afa , 0x00003ffe // log(1/frcpa(1+224/2^-8))
+data8 0xa1b0ca7095b5f985 , 0x00003ffe // log(1/frcpa(1+225/2^-8))
+//
+data8 0xa22942eb47534a00 , 0x00003ffe // log(1/frcpa(1+226/2^-8))
+data8 0xa2de62326449d0a3 , 0x00003ffe // log(1/frcpa(1+227/2^-8))
+data8 0xa357690f88bfe345 , 0x00003ffe // log(1/frcpa(1+228/2^-8))
+data8 0xa3d0a93f45169a4b , 0x00003ffe // log(1/frcpa(1+229/2^-8))
+data8 0xa44a22f7ffe65f30 , 0x00003ffe // log(1/frcpa(1+230/2^-8))
+//
+data8 0xa500c5e5b4c1aa36 , 0x00003ffe // log(1/frcpa(1+231/2^-8))
+data8 0xa57ad064eb2ebbc2 , 0x00003ffe // log(1/frcpa(1+232/2^-8))
+data8 0xa5f5152dedf4384e , 0x00003ffe // log(1/frcpa(1+233/2^-8))
+data8 0xa66f9478856233ec , 0x00003ffe // log(1/frcpa(1+234/2^-8))
+data8 0xa6ea4e7cca02c32e , 0x00003ffe // log(1/frcpa(1+235/2^-8))
+//
+data8 0xa765437325341ccf , 0x00003ffe // log(1/frcpa(1+236/2^-8))
+data8 0xa81e21e6c75b4020 , 0x00003ffe // log(1/frcpa(1+237/2^-8))
+data8 0xa899ab333fe2b9ca , 0x00003ffe // log(1/frcpa(1+238/2^-8))
+data8 0xa9157039c51ebe71 , 0x00003ffe // log(1/frcpa(1+239/2^-8))
+data8 0xa991713433c2b999 , 0x00003ffe // log(1/frcpa(1+240/2^-8))
+//
+data8 0xaa0dae5cbcc048b3 , 0x00003ffe // log(1/frcpa(1+241/2^-8))
+data8 0xaa8a27ede5eb13ad , 0x00003ffe // log(1/frcpa(1+242/2^-8))
+data8 0xab06de228a9e3499 , 0x00003ffe // log(1/frcpa(1+243/2^-8))
+data8 0xab83d135dc633301 , 0x00003ffe // log(1/frcpa(1+244/2^-8))
+data8 0xac3fb076adc7fe7a , 0x00003ffe // log(1/frcpa(1+245/2^-8))
+//
+data8 0xacbd3cbbe47988f1 , 0x00003ffe // log(1/frcpa(1+246/2^-8))
+data8 0xad3b06b1a5dc57c3 , 0x00003ffe // log(1/frcpa(1+247/2^-8))
+data8 0xadb90e94af887717 , 0x00003ffe // log(1/frcpa(1+248/2^-8))
+data8 0xae3754a218f7c816 , 0x00003ffe // log(1/frcpa(1+249/2^-8))
+data8 0xaeb5d9175437afa2 , 0x00003ffe // log(1/frcpa(1+250/2^-8))
+//
+data8 0xaf349c322e9c7cee , 0x00003ffe // log(1/frcpa(1+251/2^-8))
+data8 0xafb39e30d1768d1c , 0x00003ffe // log(1/frcpa(1+252/2^-8))
+data8 0xb032df51c2c93116 , 0x00003ffe // log(1/frcpa(1+253/2^-8))
+data8 0xb0b25fd3e6035ad9 , 0x00003ffe // log(1/frcpa(1+254/2^-8))
+data8 0xb1321ff67cba178c , 0x00003ffe // log(1/frcpa(1+255/2^-8))
+LOCAL_OBJECT_END(log_table_3)
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(asinh)
+
+{ .mfi
+ getf.exp asinh_GR_f8 = f8 // Must recompute later if x unorm
+ fclass.m p12,p0 = f8, 0x0b // Test x unorm
+ mov log_GR_exp_17_ones = 0x1ffff
+}
+{ .mfi
+ addl NR_table_address = @ltoff(log_table_1), gp
+ fma.s1 log_y = f8, f8, f1 // y = x^2 + 1
+ mov asinh_GR_comp = 0xfffc
+}
+;;
+
+{ .mfi
+ mov log_GR_exp_16_ones = 0xffff //BIAS
+ fclass.m p6,p0 = f8, 0xe7 // Test for x = NaN and inf and zero
+ mov log_GR_comp2 = 0x1003e
+}
+{ .mfi
+ ld8 NR_table_address = [NR_table_address]
+ fma.s1 asinh_w_sq = f8,f8,f0 // x^2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fcmp.lt.s1 p7,p11 = f8,f0 // if x<0
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ fnorm.s1 fNormX = f8 // Normalize x
+(p12) br.cond.spnt ASINH_UNORM // Branch if x=unorm
+}
+;;
+
+ASINH_COMMON:
+// Return here if x=unorm and not denorm
+{ .mfi
+ //to get second table address
+ adds log_table_address2 = 0x40, NR_table_address
+ fma.s1 log_arg = f8,f1,f8
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p6) fma.d.s0 f8 = f8,f1,f8 // quietize nan result if x=nan
+(p6) br.ret.spnt b0 // Exit for x=nan and inf and zero
+}
+;;
+
+{ .mfi
+ ldfpd NR1,NR2 = [log_table_address2],16
+ frsqrta.s1 log_y_rs,p0 = log_y // z=1/sqrt(y)
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe log_C13 = [log_table_address2],16
+ nop.f 0
+ and asinh_GR_f8 = asinh_GR_f8,log_GR_exp_17_ones
+}
+;;
+
+{ .mib
+ ldfe log_C11 = [log_table_address2],16
+ cmp.le p13,p0 = log_GR_comp2,asinh_GR_f8
+(p13) br.cond.spnt LOG_COMMON1 // Branch if path 4, |x| >= 2^63
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs_iter = log_y_rs,log_y,f0 // y*z
+ nop.i 0
+}
+;;
+
+.pred.rel "mutex",p7,p11
+{ .mfi
+ nop.m 0
+(p11) mov asinh_f8 = fNormX
+ nop.i 0
+}
+{ .mfb
+ cmp.gt p8,p0 = asinh_GR_comp,asinh_GR_f8
+(p7) fnma.s1 asinh_f8 = fNormX,f1,f0
+(p8) br.cond.spnt ASINH_NEAR_ZERO // Branch if path 2, 0 < |x| < 2^-3
+}
+;;
+
+// Here if main path, 2^-3 <= |x| < 2^63
+///////////////////////////////// The first iteration /////////////////////////
+{ .mfi
+ ldfpd log_P5,log_P4 = [NR_table_address],16
+ fnma.s1 log_y_rs_iter = log_y_rs_iter,log_y_rs,NR2 // 3-(y*z)*z
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs_iter1 = log_y_rs,NR1,f0 // 0.5*z
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfpd log_P3,log_P2 = [NR_table_address],16
+ // (0.5*z)*(3-(y*z)*z)
+ fma.s1 log_y_rs_iter = log_y_rs_iter1,log_y_rs_iter,f0
+ nop.i 0
+}
+;;
+
+/////////////////////////// The second iteration /////////////////////////////
+{ .mfi
+ ldfd log_P1 = [NR_table_address],16
+ fma.s1 log_y_rs = log_y_rs_iter,log_y,f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 log_y_rs = log_y_rs,log_y_rs_iter,NR2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs_iter1 = log_y_rs_iter,NR1,f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe log2 = [NR_table_address],16
+ // (0.5*z)*(3-(y*z)*z)
+ fma.s1 log_y_rs_iter = log_y_rs_iter1,log_y_rs,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // (0.5*z)*(3-(y*z)*z)
+ fma.s1 log_arg_early = log_y_rs_iter1,log_y_rs,f0
+ nop.i 0
+}
+;;
+
+////////////////////////////////// The third iteration ////////////////////////
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs = log_y_rs_iter,log_y,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs_iter1 = log_y_rs_iter,NR1,f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_arg_early = log_arg_early,log_y,asinh_f8
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 log_y_rs = log_y_rs,log_y_rs_iter,NR2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs_iter1 = log_y_rs_iter1,log_y,f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ frcpa.s1 log_C,p0 = f1,log_arg_early
+ nop.i 0
+}
+;;
+
+{ .mfi
+ getf.exp log_GR_signexp_f8 = log_arg_early
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ getf.sig log_GR_significand_f8 = log_arg_early
+ // (0.5*z)*(3-(y*z)*z)*y + |x|
+ fma.s1 log_arg = log_y_rs_iter1,log_y_rs,asinh_f8
+ //to get third table address
+ adds log_table_address3 = 0x70, NR_table_address
+}
+;;
+
+///////////////////////////////// The end NR iterations /////////////////////
+{ .mfi
+ nop.m 0
+ nop.f 0
+ //sign bit destruction (keep only the 17 exponent bits)
+ and log_GR_exp_f8 = log_GR_signexp_f8, log_GR_exp_17_ones
+}
+;;
+
+{ .mfi
+ //BIAS subtraction
+ sub log_GR_true_exp_f8 = log_GR_exp_f8, log_GR_exp_16_ones
+(p7) fnma.s1 log2 = log2,f1,f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ setf.sig log_int_Nfloat = log_GR_true_exp_f8
+ fms.s1 log_r = log_C,log_arg,f1 // C = frcpa(x); r = C * x - 1
+ extr.u log_GR_index = log_GR_significand_f8,55,8 //Extract 8 bits
+}
+;;
+
+{ .mmi
+ //pre-index*16 + index
+ shladd log_table_address3 = log_GR_index,4,log_table_address3
+;;
+ ldfe log_T = [log_table_address3]
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rsq = log_r, log_r, f0 //r^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p4 = log_P5, log_r, log_P4 //P5*r + P4
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p32 = log_P3, log_r, log_P2 //P3*r + P2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ //convert N to the floating-point format
+ fcvt.xf log_Nfloat = log_int_Nfloat
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rcube = log_rsq, log_r, f0 //r^3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p10 = log_rsq, log_P1, log_r //P1*r^2 + r
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ //(P5*r + P4)*r^2 + P3*r + P2
+ fma.s1 log_rp_p2 = log_rp_p4, log_rsq, log_rp_p32
+ nop.i 0
+}
+;;
+
+.pred.rel "mutex",p7,p11
+{ .mfi
+ nop.m 0
+(p11) fma.s1 log_T_plus_Nlog2 = log_Nfloat,log2,log_T //N*log2 + T if x>0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p7) fms.s1 log_T_plus_Nlog2 = log_Nfloat,log2,log_T //N*log2 - T if x<0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ //((P5*r + P4)*r^2 + P3*r + P2)*w^3 + P1*r^2 + r
+ fma.s1 log_r2P_r = log_rp_p2, log_rcube, log_rp_p10
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // N*log2 + T + ((P5*r + P4)*r^2 + P3*r + P2)*r^3 + P1*r^2 + r
+(p11) fadd.d.s0 f8 = log_T_plus_Nlog2,log_r2P_r
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ // -N*log2 - T - ((P5*r + P4)*r^2 + P3*r + P2)*r^3 + P1*r^2 + r
+(p7) fsub.d.s0 f8 = log_T_plus_Nlog2,log_r2P_r
+ br.ret.sptk b0 // Exit main path, path 3: 2^-3 <= |x| < 2^63
+}
+;;
+
+// Here if path 4, |x| >= 2^63
+LOG_COMMON1:
+{ .mfi
+ ldfpd log_P5,log_P4 = [NR_table_address],16
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfpd log_P3,log_P2 = [NR_table_address],16
+ frcpa.s1 log_C,p0 = f1,log_arg
+ nop.i 0
+}
+;;
+
+{ .mmi
+ getf.exp log_GR_signexp_f8 = log_arg
+ ldfd log_P1 = [NR_table_address],16
+ nop.i 0
+}
+;;
+
+{ .mmi
+ getf.sig log_GR_significand_f8 = log_arg
+ ldfe log2 = [NR_table_address],16
+ nop.i 0
+}
+;;
+
+{ .mfi
+ adds log_table_address3 = 0x70, NR_table_address
+ nop.f 0
+ //significant bit destruction
+ and log_GR_exp_f8 = log_GR_signexp_f8, log_GR_exp_17_ones
+}
+;;
+
+{ .mmf
+ nop.m 0
+ //BIAS subtraction
+ sub log_GR_true_exp_f8 = log_GR_exp_f8, log_GR_exp_16_ones
+ fms.s1 log_r = log_C,log_arg,f1 //C = frcpa(x); r = C * x - 1
+}
+;;
+
+{ .mfi
+ setf.sig log_int_Nfloat = log_GR_true_exp_f8
+ nop.f 0
+ extr.u log_GR_index = log_GR_significand_f8,55,8 //Extract 8 bits
+}
+;;
+
+{ .mmi
+ //pre-index*16 + index
+ shladd log_table_address3 = log_GR_index,4,log_table_address3
+;;
+ ldfe log_T = [log_table_address3]
+ nop.i 0
+
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rsq = log_r, log_r, f0 //r^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p4 = log_P5, log_r, log_P4 //P5*r + P4
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p32 = log_P3, log_r, log_P2 //P3*r + P2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p7) fnma.s1 log2 = log2,f1,f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rcube = log_rsq, log_r, f0 //r^3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p10 = log_rsq, log_P1, log_r //P1*r^2 + r
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ //convert N to the floating-point format
+ fcvt.xf log_Nfloat = log_int_Nfloat
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ //(P5*r + P4)*r^2 + P3*r + P2
+ fma.s1 log_rp_p2 = log_rp_p4, log_rsq, log_rp_p32
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+(p7) fnma.s1 log_T = log_T,f1,f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_T_plus_Nlog2 = log_Nfloat,log2,log_T //N*log2 + T
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ //((P5*r + P4)*r^2 + P3*r + P2)*w^3 + P1*r^2 + r
+ fma.s1 log_r2P_r = log_rp_p2, log_rcube, log_rp_p10
+ nop.i 0
+}
+;;
+
+.pred.rel "mutex",p7,p11
+{ .mfi
+ nop.m 0
+ // N*log2 + T + ((P5*r + P4)*r^2 + P3*r + P2)*r^3 + P1*r^2 + r
+(p11) fadd.d.s0 f8 = log_T_plus_Nlog2,log_r2P_r
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ // -N*log2 - T - ((P5*r + P4)*r^2 + P3*r + P2)*r^3 + P1*r^2 + r
+(p7) fsub.d.s0 f8 = log_T_plus_Nlog2,log_r2P_r
+ br.ret.sptk b0 // Exit path 4, |x| >= 2^63
+}
+;;
+
+// Here is path 2, 0 < |x| < 2^-3
+ASINH_NEAR_ZERO:
+{ .mfi
+ ldfe log_C9 = [log_table_address2],16
+ fma.s1 asinh_w_cube = asinh_w_sq,fNormX,f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe log_C7 = [log_table_address2],16
+ fma.s1 asinh_w_four = asinh_w_sq,asinh_w_sq,f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe log_C5 = [log_table_address2],16
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe log_C3 = [log_table_address2],16
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 asinh_w_13 = log_C13,asinh_w_sq,log_C11
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 asinh_w_9 = log_C9,asinh_w_sq,log_C7
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 asinh_w_3 = log_C5,asinh_w_sq,log_C3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 asinh_w_seven = asinh_w_four,asinh_w_cube,f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 asinh_w_7 = asinh_w_13,asinh_w_four,asinh_w_9
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 asinh_w_5 = asinh_w_3,asinh_w_cube,fNormX
+ nop.i 0
+}
+;;
+
+{ .mfb
+ nop.m 0
+ fma.d.s0 f8 = asinh_w_7,asinh_w_seven,asinh_w_5
+ br.ret.sptk b0 // Exit path 2 (0.0 <|x| < 2^(-3))
+}
+;;
+
+ASINH_UNORM:
+// Here if x=unorm
+{ .mfi
+ getf.exp asinh_GR_f8 = fNormX // Recompute if x unorm
+ fclass.m p0,p13 = fNormX, 0x0b // Test x denorm
+ nop.i 0
+}
+;;
+
+{ .mfb
+ nop.m 0
+ fcmp.eq.s0 p14,p0 = f8, f0 // Dummy to set denormal flag
+(p13) br.cond.sptk ASINH_COMMON // Continue if x unorm and not denorm
+}
+;;
+
+.pred.rel "mutex",p7,p11
+{ .mfi
+ nop.m 0
+(p7) fma.d.s0 f8 = f8,f8,f8 // Result x+x^2 if x=-denorm
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p11) fnma.d.s0 f8 = f8,f8,f8 // Result x-x^2 if x=+denorm
+ br.ret.spnt b0 // Exit if denorm
+}
+;;
+
+GLOBAL_LIBM_END(asinh)
diff --git a/sysdeps/ia64/fpu/s_asinhf.S b/sysdeps/ia64/fpu/s_asinhf.S
new file mode 100644
index 0000000000..df616deae0
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_asinhf.S
@@ -0,0 +1,937 @@
+.file "asinhf.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// ==============================================================
+// History
+// ==============================================================
+// 04/02/01 Initial version
+// 04/19/01 Improved speed of the paths #1,2,3,4,5
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/06/03 Reordered header: .section, .global, .proc, .align
+// 05/21/03 Improved performance, fixed to handle unorms
+//
+// API
+// ==============================================================
+// float asinhf(float)
+//
+// Overview of operation
+// ==============================================================
+//
+// There are 7 paths:
+// 1. x = 0.0
+// Return asinhf(x) = 0.0
+// 2. 0.0 <|x| < 2^(-5)
+// Return asinhf(x) = Pol5(x), where Pol5(x) = ((x^2)*C1 + C0)*x^3 + x
+
+// 3. 2^(-5) <= |x| < 2^51
+// Return asinhf(x) = sign(x)*(log(|x| + sqrt(x^2 + 1.0)))
+// To compute x + sqrt(x^2 + 1.0) modified Newton Raphson method is used
+// (2 iterations)
+// Algorithm description for log function see below.
+//
+// 4. 2^51 <= |x| < +INF
+// Return asinhf(x) = sign(x)*log(2*|x|)
+// Algorithm description for log function see below.
+//
+// 5. x = INF
+// Return asinhf(x) = INF
+//
+// 6. x = [S,Q]NaN
+// Return asinhf(x) = QNaN
+//
+// 7. x = denormal
+// Return asinhf(x) = x - x^2 for x > 0, x + x^2 for x < 0
+//
+//==============================================================
+// Algorithm Description for log(x) function
+// Below we are using the fact that inequality x - 1.0 > 2^(-6) is always
+// true for this asinh implementation
+//
+// Consider x = 2^N 1.f1 f2 f3 f4...f63
+// Log(x) = log(frcpa(x) x/frcpa(x))
+// = log(1/frcpa(x)) + log(frcpa(x) x)
+// = -log(frcpa(x)) + log(frcpa(x) x)
+//
+// frcpa(x) = 2^-N frcpa(1.f1 f2 ... f63)
+//
+// -log(frcpa(x)) = -log(C)
+// = -log(2^-N) - log(frcpa(1.f1 f2 ... f63))
+//
+// -log(frcpa(x)) = -log(C)
+// = +Nlog2 - log(frcpa(1.f1 f2 ... f63))
+//
+// -log(frcpa(x)) = -log(C)
+// = +Nlog2 + log(1/frcpa(1.f1 f2 ... f63))
+//
+// Log(x) = log(1/frcpa(x)) + log(frcpa(x) x)
+//
+// Log(x) = +Nlog2 + log(1./frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x)
+// Log(x) = +Nlog2 - log(frcpa(1.f1 f2 ... f63)) + log(frcpa(x) x)
+// Log(x) = +Nlog2 + T + log(frcpa(x) x)
+//
+// Log(x) = +Nlog2 + T + log(C x)
+//
+// Cx = 1 + r
+//
+// Log(x) = +Nlog2 + T + log(1+r)
+// Log(x) = +Nlog2 + T + Series( r - r^2/2 + r^3/3 - r^4/4 ....)
+//
+// 1.f1 f2 ... f8 has 256 entries.
+// They are 1 + k/2^8, k = 0 ... 255
+// These 256 values are the table entries.
+//
+// Implementation
+//==============================================================
+// C = frcpa(x)
+// r = C * x - 1
+//
+// Form rseries = r + P1*r^2 + P2*r^3 + P3*r^4
+//
+// x = f * 2^n where f is 1.f_1f_2f_3....f_63
+// Nfloat = float(n) where n is the true unbiased exponent
+// pre-index = f_1f_2....f_8
+// index = pre_index * 8
+// get the dxt table entry at index + offset = T
+//
+// result = (T + Nfloat * log(2)) + rseries
+//
+// The T table is calculated as follows
+// Form x_k = 1 + k/2^8 where k goes from 0... 255
+// y_k = frcpa(x_k)
+// log(1/y_k) in quad and round to double (the table holds 8-byte entries)
+//
+//
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input
+// f9 -> f15, f32 -> f55
+
+// General registers used:
+// r14 -> r27
+
+// Predicate registers used:
+// p6 -> p14
+
+// p6 to filter out case when x = [Q,S]NaN or INF or zero
+// p7 to filter out case when x < 0.0
+// p8 to select path #2
+
+// p11 to filter out case when x >= 0
+// p12 to filter out case when x = unnormal (including denormal)
+// p13 to select path #4; reused in ASINH_UNORM for x = unorm but not denorm
+// p14 dummy target of the compare that sets the denormal flag
+// Assembly macros
+//==============================================================
+log_GR_exp_17_ones = r14 // 0x1ffff: mask that keeps the exponent field of a getf.exp result
+log_GR_signexp_f8 = r15 // raw getf.exp result (sign + exponent) of the log argument
+log_table_address2 = r16 // walks log_table_2 (0.5, 3.0, C1, C0)
+log_GR_exp_16_ones = r17 // 0xffff: exponent bias
+log_GR_exp_f8 = r18 // biased exponent of the log argument (sign masked off)
+log_GR_true_exp_f8 = r19 // unbiased exponent N of the log argument
+log_GR_significand_f8 = r20 // significand of the log argument
+log_GR_index = r21 // 8-bit index into log_table_3
+log_GR_comp2 = r22 // 0x10032: biased exponent of 2^51 (path 4 threshold)
+asinh_GR_f8 = r23 // NOTE(review): rebound to r25 two lines below; this binding looks dead -- confirm
+asinh_GR_comp = r24 // 0xfffa: biased exponent of 2^-5 (path 2 threshold)
+asinh_GR_f8 = r25 // exponent field of x, compared against the path thresholds
+log_table_address3 = r26 // address of the selected log_table_3 entry
+NR_table_address = r27 // walks log_table_1, then is the base for the log_table_3 offset
+
+//==============================================================
+log_y = f9 // y = x^2 + 1
+NR1 = f10 // 0.5, Newton-Raphson constant
+NR2 = f11 // 3.0, Newton-Raphson constant
+log_y_rs = f12 // z ~ 1/sqrt(y); reused as a temporary in the second iteration
+log_y_rs_iter = f13 // y*z, then the refined z
+log_y_rs_iter1 = f14 // 0.5*z terms
+fNormX = f15 // normalized x
+asinh_w_sq = f32 // x^2
+log_arg_early = f33 // early estimate of |x| + sqrt(y); feeds frcpa, exponent and table index
+log_y_rs_iter2 = f34 // 3 - (y*z)*z of the first iteration
+log_P3 = f35 // log(1+r) series coefficient P3
+log_P2 = f36 // log(1+r) series coefficient P2
+log_P1 = f37 // log(1+r) series coefficient P1
+log2 = f38 // log(2); negated when x < 0
+log_C0 = f39 // path 2 polynomial coefficient C0
+log_C1 = f40 // path 2 polynomial coefficient C1
+asinh_f8 = f41 // |x|
+log_C = f42 // C = frcpa(log argument)
+log_arg = f43 // log argument: 2*x on path 4, |x| + sqrt(y) on path 3
+asinh_w_cube = f44 // x^3
+log_int_Nfloat = f45 // N as an integer (set with setf.sig)
+log_r = f46 // r = C*arg - 1
+log_rsq = f47 // r^2
+asinh_w_1 = f48 // C1*x^2 + C0
+log_rp_p32 = f49 // P3*r + P2
+log_rcube = f50 // NOTE(review): not referenced by the asinhf code in this file -- confirm
+log_rp_p10 = f51 // P1*r + 1
+log_rp_p2 = f52 // (P3*r + P2)*r^2 + P1*r + 1
+log_Nfloat = f53 // N converted to floating point
+log_T = f54 // table value T
+log_T_plus_Nlog2 = f55 // N*log2 + T (sign-adjusted when x < 0)
+
+// Data tables
+//==============================================================
+
+RODATA
+.align 16
+
+LOCAL_OBJECT_START(log_table_1) // doubles: series coefficients for log(1+r), then log(2)
+
+data8 0xbfd0001008f39d59 // P3, coefficient of r^4 (about -1/4)
+data8 0x3fd5556073e0c45a // P2, coefficient of r^3 (about +1/3)
+data8 0xbfdffffffffaea15 // P1, coefficient of r^2 (about -1/2)
+data8 0x3fe62e42fefa39ef // log(2)
+LOCAL_OBJECT_END(log_table_1)
+
+LOCAL_OBJECT_START(log_table_2) // Newton-Raphson constants, then the path 2 (Pol5) coefficients
+data8 0x3FE0000000000000 // 0.5 (loaded into NR1)
+data8 0x4008000000000000 // 3.0 (loaded into NR2)
+data8 0x9979C79685A5EB16, 0x00003FFB // C1 3FFB9979C79685A5EB16, multiplies x^5 in Pol5
+data8 0xAAAAA96F80786D62, 0x0000BFFC // C0 BFFCAAAAA96F80786D62, multiplies x^3 in Pol5
+LOCAL_OBJECT_END(log_table_2)
+
+LOCAL_OBJECT_START(log_table_3) // T table: 256 doubles, entry k = log(1/frcpa(1 + k/256)), selected by 8 significand bits
+data8 0x3F60040155D5889E //log(1/frcpa(1+ 0/256))
+data8 0x3F78121214586B54 //log(1/frcpa(1+ 1/256)
+data8 0x3F841929F96832F0 //log(1/frcpa(1+ 2/256)
+data8 0x3F8C317384C75F06 //log(1/frcpa(1+ 3/256)
+data8 0x3F91A6B91AC73386 //log(1/frcpa(1+ 4/256)
+data8 0x3F95BA9A5D9AC039 //log(1/frcpa(1+ 5/256)
+data8 0x3F99D2A8074325F4 //log(1/frcpa(1+ 6/256)
+data8 0x3F9D6B2725979802 //log(1/frcpa(1+ 7/256)
+data8 0x3FA0C58FA19DFAAA //log(1/frcpa(1+ 8/256)
+data8 0x3FA2954C78CBCE1B //log(1/frcpa(1+ 9/256)
+data8 0x3FA4A94D2DA96C56 //log(1/frcpa(1+ 10/256)
+data8 0x3FA67C94F2D4BB58 //log(1/frcpa(1+ 11/256)
+data8 0x3FA85188B630F068 //log(1/frcpa(1+ 12/256)
+data8 0x3FAA6B8ABE73AF4C //log(1/frcpa(1+ 13/256)
+data8 0x3FAC441E06F72A9E //log(1/frcpa(1+ 14/256)
+data8 0x3FAE1E6713606D07 //log(1/frcpa(1+ 15/256)
+data8 0x3FAFFA6911AB9301 //log(1/frcpa(1+ 16/256)
+data8 0x3FB0EC139C5DA601 //log(1/frcpa(1+ 17/256)
+data8 0x3FB1DBD2643D190B //log(1/frcpa(1+ 18/256)
+data8 0x3FB2CC7284FE5F1C //log(1/frcpa(1+ 19/256)
+data8 0x3FB3BDF5A7D1EE64 //log(1/frcpa(1+ 20/256)
+data8 0x3FB4B05D7AA012E0 //log(1/frcpa(1+ 21/256)
+data8 0x3FB580DB7CEB5702 //log(1/frcpa(1+ 22/256)
+data8 0x3FB674F089365A7A //log(1/frcpa(1+ 23/256)
+data8 0x3FB769EF2C6B568D //log(1/frcpa(1+ 24/256)
+data8 0x3FB85FD927506A48 //log(1/frcpa(1+ 25/256)
+data8 0x3FB9335E5D594989 //log(1/frcpa(1+ 26/256)
+data8 0x3FBA2B0220C8E5F5 //log(1/frcpa(1+ 27/256)
+data8 0x3FBB0004AC1A86AC //log(1/frcpa(1+ 28/256)
+data8 0x3FBBF968769FCA11 //log(1/frcpa(1+ 29/256)
+data8 0x3FBCCFEDBFEE13A8 //log(1/frcpa(1+ 30/256)
+data8 0x3FBDA727638446A2 //log(1/frcpa(1+ 31/256)
+data8 0x3FBEA3257FE10F7A //log(1/frcpa(1+ 32/256)
+data8 0x3FBF7BE9FEDBFDE6 //log(1/frcpa(1+ 33/256)
+data8 0x3FC02AB352FF25F4 //log(1/frcpa(1+ 34/256)
+data8 0x3FC097CE579D204D //log(1/frcpa(1+ 35/256)
+data8 0x3FC1178E8227E47C //log(1/frcpa(1+ 36/256)
+data8 0x3FC185747DBECF34 //log(1/frcpa(1+ 37/256)
+data8 0x3FC1F3B925F25D41 //log(1/frcpa(1+ 38/256)
+data8 0x3FC2625D1E6DDF57 //log(1/frcpa(1+ 39/256)
+data8 0x3FC2D1610C86813A //log(1/frcpa(1+ 40/256)
+data8 0x3FC340C59741142E //log(1/frcpa(1+ 41/256)
+data8 0x3FC3B08B6757F2A9 //log(1/frcpa(1+ 42/256)
+data8 0x3FC40DFB08378003 //log(1/frcpa(1+ 43/256)
+data8 0x3FC47E74E8CA5F7C //log(1/frcpa(1+ 44/256)
+data8 0x3FC4EF51F6466DE4 //log(1/frcpa(1+ 45/256)
+data8 0x3FC56092E02BA516 //log(1/frcpa(1+ 46/256)
+data8 0x3FC5D23857CD74D5 //log(1/frcpa(1+ 47/256)
+data8 0x3FC6313A37335D76 //log(1/frcpa(1+ 48/256)
+data8 0x3FC6A399DABBD383 //log(1/frcpa(1+ 49/256)
+data8 0x3FC70337DD3CE41B //log(1/frcpa(1+ 50/256)
+data8 0x3FC77654128F6127 //log(1/frcpa(1+ 51/256)
+data8 0x3FC7E9D82A0B022D //log(1/frcpa(1+ 52/256)
+data8 0x3FC84A6B759F512F //log(1/frcpa(1+ 53/256)
+data8 0x3FC8AB47D5F5A310 //log(1/frcpa(1+ 54/256)
+data8 0x3FC91FE49096581B //log(1/frcpa(1+ 55/256)
+data8 0x3FC981634011AA75 //log(1/frcpa(1+ 56/256)
+data8 0x3FC9F6C407089664 //log(1/frcpa(1+ 57/256)
+data8 0x3FCA58E729348F43 //log(1/frcpa(1+ 58/256)
+data8 0x3FCABB55C31693AD //log(1/frcpa(1+ 59/256)
+data8 0x3FCB1E104919EFD0 //log(1/frcpa(1+ 60/256)
+data8 0x3FCB94EE93E367CB //log(1/frcpa(1+ 61/256)
+data8 0x3FCBF851C067555F //log(1/frcpa(1+ 62/256)
+data8 0x3FCC5C0254BF23A6 //log(1/frcpa(1+ 63/256)
+data8 0x3FCCC000C9DB3C52 //log(1/frcpa(1+ 64/256)
+data8 0x3FCD244D99C85674 //log(1/frcpa(1+ 65/256)
+data8 0x3FCD88E93FB2F450 //log(1/frcpa(1+ 66/256)
+data8 0x3FCDEDD437EAEF01 //log(1/frcpa(1+ 67/256)
+data8 0x3FCE530EFFE71012 //log(1/frcpa(1+ 68/256)
+data8 0x3FCEB89A1648B971 //log(1/frcpa(1+ 69/256)
+data8 0x3FCF1E75FADF9BDE //log(1/frcpa(1+ 70/256)
+data8 0x3FCF84A32EAD7C35 //log(1/frcpa(1+ 71/256)
+data8 0x3FCFEB2233EA07CD //log(1/frcpa(1+ 72/256)
+data8 0x3FD028F9C7035C1C //log(1/frcpa(1+ 73/256)
+data8 0x3FD05C8BE0D9635A //log(1/frcpa(1+ 74/256)
+data8 0x3FD085EB8F8AE797 //log(1/frcpa(1+ 75/256)
+data8 0x3FD0B9C8E32D1911 //log(1/frcpa(1+ 76/256)
+data8 0x3FD0EDD060B78081 //log(1/frcpa(1+ 77/256)
+data8 0x3FD122024CF0063F //log(1/frcpa(1+ 78/256)
+data8 0x3FD14BE2927AECD4 //log(1/frcpa(1+ 79/256)
+data8 0x3FD180618EF18ADF //log(1/frcpa(1+ 80/256)
+data8 0x3FD1B50BBE2FC63B //log(1/frcpa(1+ 81/256)
+data8 0x3FD1DF4CC7CF242D //log(1/frcpa(1+ 82/256)
+data8 0x3FD214456D0EB8D4 //log(1/frcpa(1+ 83/256)
+data8 0x3FD23EC5991EBA49 //log(1/frcpa(1+ 84/256)
+data8 0x3FD2740D9F870AFB //log(1/frcpa(1+ 85/256)
+data8 0x3FD29ECDABCDFA04 //log(1/frcpa(1+ 86/256)
+data8 0x3FD2D46602ADCCEE //log(1/frcpa(1+ 87/256)
+data8 0x3FD2FF66B04EA9D4 //log(1/frcpa(1+ 88/256)
+data8 0x3FD335504B355A37 //log(1/frcpa(1+ 89/256)
+data8 0x3FD360925EC44F5D //log(1/frcpa(1+ 90/256)
+data8 0x3FD38BF1C3337E75 //log(1/frcpa(1+ 91/256)
+data8 0x3FD3C25277333184 //log(1/frcpa(1+ 92/256)
+data8 0x3FD3EDF463C1683E //log(1/frcpa(1+ 93/256)
+data8 0x3FD419B423D5E8C7 //log(1/frcpa(1+ 94/256)
+data8 0x3FD44591E0539F49 //log(1/frcpa(1+ 95/256)
+data8 0x3FD47C9175B6F0AD //log(1/frcpa(1+ 96/256)
+data8 0x3FD4A8B341552B09 //log(1/frcpa(1+ 97/256)
+data8 0x3FD4D4F3908901A0 //log(1/frcpa(1+ 98/256)
+data8 0x3FD501528DA1F968 //log(1/frcpa(1+ 99/256)
+data8 0x3FD52DD06347D4F6 //log(1/frcpa(1+ 100/256)
+data8 0x3FD55A6D3C7B8A8A //log(1/frcpa(1+ 101/256)
+data8 0x3FD5925D2B112A59 //log(1/frcpa(1+ 102/256)
+data8 0x3FD5BF406B543DB2 //log(1/frcpa(1+ 103/256)
+data8 0x3FD5EC433D5C35AE //log(1/frcpa(1+ 104/256)
+data8 0x3FD61965CDB02C1F //log(1/frcpa(1+ 105/256)
+data8 0x3FD646A84935B2A2 //log(1/frcpa(1+ 106/256)
+data8 0x3FD6740ADD31DE94 //log(1/frcpa(1+ 107/256)
+data8 0x3FD6A18DB74A58C5 //log(1/frcpa(1+ 108/256)
+data8 0x3FD6CF31058670EC //log(1/frcpa(1+ 109/256)
+data8 0x3FD6F180E852F0BA //log(1/frcpa(1+ 110/256)
+data8 0x3FD71F5D71B894F0 //log(1/frcpa(1+ 111/256)
+data8 0x3FD74D5AEFD66D5C //log(1/frcpa(1+ 112/256)
+data8 0x3FD77B79922BD37E //log(1/frcpa(1+ 113/256)
+data8 0x3FD7A9B9889F19E2 //log(1/frcpa(1+ 114/256)
+data8 0x3FD7D81B037EB6A6 //log(1/frcpa(1+ 115/256)
+data8 0x3FD8069E33827231 //log(1/frcpa(1+ 116/256)
+data8 0x3FD82996D3EF8BCB //log(1/frcpa(1+ 117/256)
+data8 0x3FD85855776DCBFB //log(1/frcpa(1+ 118/256)
+data8 0x3FD8873658327CCF //log(1/frcpa(1+ 119/256)
+data8 0x3FD8AA75973AB8CF //log(1/frcpa(1+ 120/256)
+data8 0x3FD8D992DC8824E5 //log(1/frcpa(1+ 121/256)
+data8 0x3FD908D2EA7D9512 //log(1/frcpa(1+ 122/256)
+data8 0x3FD92C59E79C0E56 //log(1/frcpa(1+ 123/256)
+data8 0x3FD95BD750EE3ED3 //log(1/frcpa(1+ 124/256)
+data8 0x3FD98B7811A3EE5B //log(1/frcpa(1+ 125/256)
+data8 0x3FD9AF47F33D406C //log(1/frcpa(1+ 126/256)
+data8 0x3FD9DF270C1914A8 //log(1/frcpa(1+ 127/256)
+data8 0x3FDA0325ED14FDA4 //log(1/frcpa(1+ 128/256)
+data8 0x3FDA33440224FA79 //log(1/frcpa(1+ 129/256)
+data8 0x3FDA57725E80C383 //log(1/frcpa(1+ 130/256)
+data8 0x3FDA87D0165DD199 //log(1/frcpa(1+ 131/256)
+data8 0x3FDAAC2E6C03F896 //log(1/frcpa(1+ 132/256)
+data8 0x3FDADCCC6FDF6A81 //log(1/frcpa(1+ 133/256)
+data8 0x3FDB015B3EB1E790 //log(1/frcpa(1+ 134/256)
+data8 0x3FDB323A3A635948 //log(1/frcpa(1+ 135/256)
+data8 0x3FDB56FA04462909 //log(1/frcpa(1+ 136/256)
+data8 0x3FDB881AA659BC93 //log(1/frcpa(1+ 137/256)
+data8 0x3FDBAD0BEF3DB165 //log(1/frcpa(1+ 138/256)
+data8 0x3FDBD21297781C2F //log(1/frcpa(1+ 139/256)
+data8 0x3FDC039236F08819 //log(1/frcpa(1+ 140/256)
+data8 0x3FDC28CB1E4D32FD //log(1/frcpa(1+ 141/256)
+data8 0x3FDC4E19B84723C2 //log(1/frcpa(1+ 142/256)
+data8 0x3FDC7FF9C74554C9 //log(1/frcpa(1+ 143/256)
+data8 0x3FDCA57B64E9DB05 //log(1/frcpa(1+ 144/256)
+data8 0x3FDCCB130A5CEBB0 //log(1/frcpa(1+ 145/256)
+data8 0x3FDCF0C0D18F326F //log(1/frcpa(1+ 146/256)
+data8 0x3FDD232075B5A201 //log(1/frcpa(1+ 147/256)
+data8 0x3FDD490246DEFA6B //log(1/frcpa(1+ 148/256)
+data8 0x3FDD6EFA918D25CD //log(1/frcpa(1+ 149/256)
+data8 0x3FDD9509707AE52F //log(1/frcpa(1+ 150/256)
+data8 0x3FDDBB2EFE92C554 //log(1/frcpa(1+ 151/256)
+data8 0x3FDDEE2F3445E4AF //log(1/frcpa(1+ 152/256)
+data8 0x3FDE148A1A2726CE //log(1/frcpa(1+ 153/256)
+data8 0x3FDE3AFC0A49FF40 //log(1/frcpa(1+ 154/256)
+data8 0x3FDE6185206D516E //log(1/frcpa(1+ 155/256)
+data8 0x3FDE882578823D52 //log(1/frcpa(1+ 156/256)
+data8 0x3FDEAEDD2EAC990C //log(1/frcpa(1+ 157/256)
+data8 0x3FDED5AC5F436BE3 //log(1/frcpa(1+ 158/256)
+data8 0x3FDEFC9326D16AB9 //log(1/frcpa(1+ 159/256)
+data8 0x3FDF2391A2157600 //log(1/frcpa(1+ 160/256)
+data8 0x3FDF4AA7EE03192D //log(1/frcpa(1+ 161/256)
+data8 0x3FDF71D627C30BB0 //log(1/frcpa(1+ 162/256)
+data8 0x3FDF991C6CB3B379 //log(1/frcpa(1+ 163/256)
+data8 0x3FDFC07ADA69A910 //log(1/frcpa(1+ 164/256)
+data8 0x3FDFE7F18EB03D3E //log(1/frcpa(1+ 165/256)
+data8 0x3FE007C053C5002E //log(1/frcpa(1+ 166/256)
+data8 0x3FE01B942198A5A1 //log(1/frcpa(1+ 167/256)
+data8 0x3FE02F74400C64EB //log(1/frcpa(1+ 168/256)
+data8 0x3FE04360BE7603AD //log(1/frcpa(1+ 169/256)
+data8 0x3FE05759AC47FE34 //log(1/frcpa(1+ 170/256)
+data8 0x3FE06B5F1911CF52 //log(1/frcpa(1+ 171/256)
+data8 0x3FE078BF0533C568 //log(1/frcpa(1+ 172/256)
+data8 0x3FE08CD9687E7B0E //log(1/frcpa(1+ 173/256)
+data8 0x3FE0A10074CF9019 //log(1/frcpa(1+ 174/256)
+data8 0x3FE0B5343A234477 //log(1/frcpa(1+ 175/256)
+data8 0x3FE0C974C89431CE //log(1/frcpa(1+ 176/256)
+data8 0x3FE0DDC2305B9886 //log(1/frcpa(1+ 177/256)
+data8 0x3FE0EB524BAFC918 //log(1/frcpa(1+ 178/256)
+data8 0x3FE0FFB54213A476 //log(1/frcpa(1+ 179/256)
+data8 0x3FE114253DA97D9F //log(1/frcpa(1+ 180/256)
+data8 0x3FE128A24F1D9AFF //log(1/frcpa(1+ 181/256)
+data8 0x3FE1365252BF0865 //log(1/frcpa(1+ 182/256)
+data8 0x3FE14AE558B4A92D //log(1/frcpa(1+ 183/256)
+data8 0x3FE15F85A19C765B //log(1/frcpa(1+ 184/256)
+data8 0x3FE16D4D38C119FA //log(1/frcpa(1+ 185/256)
+data8 0x3FE18203C20DD133 //log(1/frcpa(1+ 186/256)
+data8 0x3FE196C7BC4B1F3B //log(1/frcpa(1+ 187/256)
+data8 0x3FE1A4A738B7A33C //log(1/frcpa(1+ 188/256)
+data8 0x3FE1B981C0C9653D //log(1/frcpa(1+ 189/256)
+data8 0x3FE1CE69E8BB106B //log(1/frcpa(1+ 190/256)
+data8 0x3FE1DC619DE06944 //log(1/frcpa(1+ 191/256)
+data8 0x3FE1F160A2AD0DA4 //log(1/frcpa(1+ 192/256)
+data8 0x3FE2066D7740737E //log(1/frcpa(1+ 193/256)
+data8 0x3FE2147DBA47A394 //log(1/frcpa(1+ 194/256)
+data8 0x3FE229A1BC5EBAC3 //log(1/frcpa(1+ 195/256)
+data8 0x3FE237C1841A502E //log(1/frcpa(1+ 196/256)
+data8 0x3FE24CFCE6F80D9A //log(1/frcpa(1+ 197/256)
+data8 0x3FE25B2C55CD5762 //log(1/frcpa(1+ 198/256)
+data8 0x3FE2707F4D5F7C41 //log(1/frcpa(1+ 199/256)
+data8 0x3FE285E0842CA384 //log(1/frcpa(1+ 200/256)
+data8 0x3FE294294708B773 //log(1/frcpa(1+ 201/256)
+data8 0x3FE2A9A2670AFF0C //log(1/frcpa(1+ 202/256)
+data8 0x3FE2B7FB2C8D1CC1 //log(1/frcpa(1+ 203/256)
+data8 0x3FE2C65A6395F5F5 //log(1/frcpa(1+ 204/256)
+data8 0x3FE2DBF557B0DF43 //log(1/frcpa(1+ 205/256)
+data8 0x3FE2EA64C3F97655 //log(1/frcpa(1+ 206/256)
+data8 0x3FE3001823684D73 //log(1/frcpa(1+ 207/256)
+data8 0x3FE30E97E9A8B5CD //log(1/frcpa(1+ 208/256)
+data8 0x3FE32463EBDD34EA //log(1/frcpa(1+ 209/256)
+data8 0x3FE332F4314AD796 //log(1/frcpa(1+ 210/256)
+data8 0x3FE348D90E7464D0 //log(1/frcpa(1+ 211/256)
+data8 0x3FE35779F8C43D6E //log(1/frcpa(1+ 212/256)
+data8 0x3FE36621961A6A99 //log(1/frcpa(1+ 213/256)
+data8 0x3FE37C299F3C366A //log(1/frcpa(1+ 214/256)
+data8 0x3FE38AE2171976E7 //log(1/frcpa(1+ 215/256)
+data8 0x3FE399A157A603E7 //log(1/frcpa(1+ 216/256)
+data8 0x3FE3AFCCFE77B9D1 //log(1/frcpa(1+ 217/256)
+data8 0x3FE3BE9D503533B5 //log(1/frcpa(1+ 218/256)
+data8 0x3FE3CD7480B4A8A3 //log(1/frcpa(1+ 219/256)
+data8 0x3FE3E3C43918F76C //log(1/frcpa(1+ 220/256)
+data8 0x3FE3F2ACB27ED6C7 //log(1/frcpa(1+ 221/256)
+data8 0x3FE4019C2125CA93 //log(1/frcpa(1+ 222/256)
+data8 0x3FE4181061389722 //log(1/frcpa(1+ 223/256)
+data8 0x3FE42711518DF545 //log(1/frcpa(1+ 224/256)
+data8 0x3FE436194E12B6BF //log(1/frcpa(1+ 225/256)
+data8 0x3FE445285D68EA69 //log(1/frcpa(1+ 226/256)
+data8 0x3FE45BCC464C893A //log(1/frcpa(1+ 227/256)
+data8 0x3FE46AED21F117FC //log(1/frcpa(1+ 228/256)
+data8 0x3FE47A1527E8A2D3 //log(1/frcpa(1+ 229/256)
+data8 0x3FE489445EFFFCCC //log(1/frcpa(1+ 230/256)
+data8 0x3FE4A018BCB69835 //log(1/frcpa(1+ 231/256)
+data8 0x3FE4AF5A0C9D65D7 //log(1/frcpa(1+ 232/256)
+data8 0x3FE4BEA2A5BDBE87 //log(1/frcpa(1+ 233/256)
+data8 0x3FE4CDF28F10AC46 //log(1/frcpa(1+ 234/256)
+data8 0x3FE4DD49CF994058 //log(1/frcpa(1+ 235/256)
+data8 0x3FE4ECA86E64A684 //log(1/frcpa(1+ 236/256)
+data8 0x3FE503C43CD8EB68 //log(1/frcpa(1+ 237/256)
+data8 0x3FE513356667FC57 //log(1/frcpa(1+ 238/256)
+data8 0x3FE522AE0738A3D8 //log(1/frcpa(1+ 239/256)
+data8 0x3FE5322E26867857 //log(1/frcpa(1+ 240/256)
+data8 0x3FE541B5CB979809 //log(1/frcpa(1+ 241/256)
+data8 0x3FE55144FDBCBD62 //log(1/frcpa(1+ 242/256)
+data8 0x3FE560DBC45153C7 //log(1/frcpa(1+ 243/256)
+data8 0x3FE5707A26BB8C66 //log(1/frcpa(1+ 244/256)
+data8 0x3FE587F60ED5B900 //log(1/frcpa(1+ 245/256)
+data8 0x3FE597A7977C8F31 //log(1/frcpa(1+ 246/256)
+data8 0x3FE5A760D634BB8B //log(1/frcpa(1+ 247/256)
+data8 0x3FE5B721D295F10F //log(1/frcpa(1+ 248/256)
+data8 0x3FE5C6EA94431EF9 //log(1/frcpa(1+ 249/256)
+data8 0x3FE5D6BB22EA86F6 //log(1/frcpa(1+ 250/256)
+data8 0x3FE5E6938645D390 //log(1/frcpa(1+ 251/256)
+data8 0x3FE5F673C61A2ED2 //log(1/frcpa(1+ 252/256)
+data8 0x3FE6065BEA385926 //log(1/frcpa(1+ 253/256)
+data8 0x3FE6164BFA7CC06B //log(1/frcpa(1+ 254/256)
+data8 0x3FE62643FECF9743 //log(1/frcpa(1+ 255/256)
+LOCAL_OBJECT_END(log_table_3)
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(asinhf) // float asinhf(float): x is read from f8 and the result is written to f8
+
+{ .mfi
+ getf.exp asinh_GR_f8 = f8 // Must recompute later if x unorm
+ fclass.m p12,p0 = f8, 0x0b // Test x unorm
+ mov log_GR_exp_17_ones = 0x1ffff // exponent-field mask
+}
+{ .mfi
+ addl NR_table_address = @ltoff(log_table_1), gp // address of the linkage-table slot for log_table_1
+ fma.s1 log_y = f8, f8, f1 // y = x^2 + 1
+ mov asinh_GR_comp = 0xfffa // biased exponent of 2^-5 (path 2 bound)
+}
+;;
+
+{ .mfi
+ mov log_GR_exp_16_ones = 0xffff //BIAS
+ fclass.m p6,p0 = f8, 0xe7 // Test for x = NaN and inf and zero
+ mov log_GR_comp2 = 0x10032 // biased exponent of 2^51 (path 4 bound)
+}
+{ .mfi
+ ld8 NR_table_address = [NR_table_address] // fetch the real address of log_table_1
+ fma.s1 asinh_w_sq = f8,f8,f0 // x^2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fcmp.lt.s1 p7,p11 = f8,f0 // p7 = (x < 0), p11 = complement
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ fnorm.s1 fNormX = f8 // Normalize x
+(p12) br.cond.spnt ASINH_UNORM // Branch if x=unorm
+}
+;;
+
+ASINH_COMMON:
+// Common path (x=unorm but not denorm re-enters here): exit for NaN/inf/0, seed 1/sqrt(y), select path 2, 3 or 4
+{ .mfi
+ adds log_table_address2 = 0x20, NR_table_address // second table (log_table_2) address
+ fma.s1 log_arg = f8,f1,f8 // 2*x, the log argument used on path 4
+ nop.i 0 // fill the third slot explicitly, as every other bundle does
+}
+{ .mfb
+ nop.m 0
+(p6) fma.s.s0 f8 = f8,f1,f8 // quietize nan result if x=nan
+(p6) br.ret.spnt b0 // Exit for x=nan and inf and zero
+}
+;;
+
+{ .mfi
+ ldfpd NR1,NR2 = [log_table_address2],16 // NR1 = 0.5, NR2 = 3.0
+ frsqrta.s1 log_y_rs,p0 = log_y // z=1/sqrt(y)
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe log_C1 = [log_table_address2],16 // path 2 coefficient C1
+ nop.f 0
+ and asinh_GR_f8 = asinh_GR_f8,log_GR_exp_17_ones // keep only the exponent field of x
+}
+;;
+
+{ .mib
+ ldfe log_C0 = [log_table_address2],16 // path 2 coefficient C0
+ cmp.le p13,p0 = log_GR_comp2,asinh_GR_f8 // exponent(x) >= exponent(2^51) ?
+(p13) br.cond.spnt LOG_COMMON1 // Branch if path 4: |x| >= 2^51
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs_iter = log_y_rs,log_y,f0 // y*z
+ nop.i 0
+}
+;;
+
+.pred.rel "mutex",p7,p11
+{ .mfi
+ nop.m 0
+(p11) mov asinh_f8 = fNormX // |x| = x when x >= 0
+ nop.i 0
+}
+{ .mfb
+ cmp.gt p8,p0 = asinh_GR_comp,asinh_GR_f8 // exponent(x) < exponent(2^-5) ?
+(p7) fnma.s1 asinh_f8 = fNormX,f1,f0 // |x| = -x when x < 0
+(p8) br.cond.spnt ASINH_NEAR_ZERO // Branch if path 2: 0 < |x| < 2^-5
+}
+;;
+
+// Here if main path (path 3), 2^-5 <= |x| < 2^51: result = sign(x)*log(|x| + sqrt(x^2+1))
+///////////////////////////////// The first iteration /////////////////////////
+{ .mfi
+ ldfpd log_P3,log_P2 = [NR_table_address],16
+ fnma.s1 log_y_rs_iter2 = log_y_rs_iter,log_y_rs,NR2 // 3-(y*z)*z
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs_iter1 = log_y_rs,NR1,f0 // 0.5*z
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfpd log_P1,log2 = [NR_table_address],16
+ // refined z = (0.5*z)*(3-(y*z)*z)
+ fma.s1 log_y_rs_iter = log_y_rs_iter1,log_y_rs_iter2,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // same refined z, kept separately for the early log argument
+ fma.s1 log_arg_early = log_y_rs_iter1,log_y_rs_iter2,f0
+ nop.i 0
+}
+;;
+
+////////////////////////////////// The second iteration ////////////////////////
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs = log_y_rs_iter,log_y,f0 // y*z
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs_iter1 = log_y_rs_iter,NR1,f0 // 0.5*z
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_arg_early = log_arg_early,log_y,asinh_f8 // z*y + |x|, early estimate of sqrt(y) + |x|
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 log_y_rs = log_y_rs,log_y_rs_iter,NR2 // 3-(y*z)*z
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_y_rs_iter1 = log_y_rs_iter1,log_y,f0 // (0.5*z)*y
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ frcpa.s1 log_C,p0 = f1,log_arg_early // C = frcpa(early log argument)
+ nop.i 0
+}
+;;
+
+{ .mfi
+ getf.exp log_GR_signexp_f8 = log_arg_early
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ getf.sig log_GR_significand_f8 = log_arg_early
+ // (0.5*z)*(3-(y*z)*z)*y + |x|
+ fma.s1 log_arg = log_y_rs_iter1,log_y_rs,asinh_f8
+ //to get third table address
+ adds log_table_address3 = 0x30, NR_table_address
+}
+;;
+
+/////////////////////////////////////////// The end NR iterations /////////////
+
+{ .mfi
+ nop.m 0
+ nop.f 0
+ //mask with 0x1ffff to keep only the exponent field
+ and log_GR_exp_f8 = log_GR_signexp_f8, log_GR_exp_17_ones
+}
+;;
+
+{ .mfi
+ //BIAS subtraction
+ sub log_GR_true_exp_f8 = log_GR_exp_f8, log_GR_exp_16_ones
+(p7) fnma.s1 log2 = log2,f1,f0 // negate log2 when x < 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ setf.sig log_int_Nfloat = log_GR_true_exp_f8
+ fms.s1 log_r = log_C,log_arg,f1 //C = frcpa(x); r = C * x - 1
+ extr.u log_GR_index = log_GR_significand_f8,55,8 //Extract 8 bits
+}
+;;
+
+{ .mmi
+ //table base + index*8 (entries are 8-byte doubles)
+ shladd log_table_address3 = log_GR_index,3,log_table_address3
+;;
+ ldfd log_T = [log_table_address3]
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rsq = log_r, log_r, f0 //r^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p32 = log_P3, log_r, log_P2 //P3*r + P2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p10 = log_P1, log_r, f1 //P1*r + 1
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ //convert N to the floating-point format
+ fcvt.xf log_Nfloat = log_int_Nfloat
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p2 = log_rp_p32, log_rsq, log_rp_p10 //(P3*r + P2)*r^2 + P1*r + 1
+ nop.i 0
+}
+;;
+
+.pred.rel "mutex",p7,p11
+{ .mfi
+ nop.m 0
+(p11) fma.s1 log_T_plus_Nlog2 = log_Nfloat,log2,log_T //N*log2 + T if x>0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p7) fms.s1 log_T_plus_Nlog2 = log_Nfloat,log2,log_T //N*log2 - T if x<0 (log2 already negated)
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+(p11) fma.s.s0 f8 = log_rp_p2,log_r,log_T_plus_Nlog2 // rseries + (N*log2 + T)
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p7) fnma.s.s0 f8 = log_rp_p2,log_r,log_T_plus_Nlog2 // -rseries + (-N*log2 - T)
+ br.ret.sptk b0 // Exit main path, path 3: 2^-5 <= |x| < 2^51
+}
+;;
+
+
+// Here if path 4, |x| >= 2^51: result = sign(x)*log(2*|x|), log_arg already holds 2*x
+LOG_COMMON1:
+{ .mfi
+ ldfpd log_P3,log_P2 = [NR_table_address],16
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfpd log_P1,log2 = [NR_table_address],16
+ frcpa.s1 log_C,p0 = f1,log_arg // C = frcpa(2*x)
+ nop.i 0
+}
+;;
+
+{ .mfi
+ getf.exp log_GR_signexp_f8 = log_arg
+ nop.f 0
+ //to get third table address
+ adds log_table_address3 = 0x30, NR_table_address
+}
+;;
+
+{ .mfi
+ getf.sig log_GR_significand_f8 = log_arg
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ nop.f 0
+ //mask with 0x1ffff to keep only the exponent field
+ and log_GR_exp_f8 = log_GR_signexp_f8, log_GR_exp_17_ones
+}
+;;
+
+{ .mmf
+ nop.m 0
+ //BIAS subtraction
+ sub log_GR_true_exp_f8 = log_GR_exp_f8, log_GR_exp_16_ones
+ fms.s1 log_r = log_C,log_arg,f1 //C = frcpa(x); r = C * x - 1
+}
+;;
+
+{ .mfi
+ setf.sig log_int_Nfloat = log_GR_true_exp_f8
+ nop.f 0
+ extr.u log_GR_index = log_GR_significand_f8,55,8 //Extract 8 bits
+}
+;;
+
+{ .mmi
+ //table base + index*8 (entries are 8-byte doubles)
+ shladd log_table_address3 = log_GR_index,3,log_table_address3
+;;
+ ldfd log_T = [log_table_address3]
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rsq = log_r, log_r, f0 //r^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p32 = log_P3, log_r, log_P2 //P3*r + P2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p10 = log_P1, log_r, f1 //P1*r + 1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p7) fnma.s1 log2 = log2,f1,f0 // negate log2 when x < 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ //convert N to the floating-point format
+ fcvt.xf log_Nfloat = log_int_Nfloat
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 log_rp_p2 = log_rp_p32, log_rsq, log_rp_p10 //(P3*r + P2)*r^2 + P1*r + 1
+ nop.i 0
+}
+;;
+
+.pred.rel "mutex",p7,p11
+{ .mfi
+ nop.m 0
+(p11) fma.s1 log_T_plus_Nlog2 = log_Nfloat,log2,log_T //N*log2 + T if x>0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p7) fms.s1 log_T_plus_Nlog2 = log_Nfloat,log2,log_T //N*log2 - T if x<0 (log2 already negated)
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+(p11) fma.s.s0 f8 = log_rp_p2,log_r,log_T_plus_Nlog2 // rseries + (N*log2 + T)
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p7) fnma.s.s0 f8 = log_rp_p2,log_r,log_T_plus_Nlog2 // -rseries + (-N*log2 - T)
+ br.ret.sptk b0 // Exit path 4, |x| >= 2^51
+}
+;;
+
+// Here if path 2, 0 < |x| < 2^-5: result = Pol5(x) = (C1*x^2 + C0)*x^3 + x
+ASINH_NEAR_ZERO:
+{ .mfi
+ nop.m 0
+ fma.s1 asinh_w_1 = asinh_w_sq,log_C1,log_C0 // C1*x^2 + C0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 asinh_w_cube = asinh_w_sq,fNormX,f0 // x^3
+ nop.i 0
+}
+;;
+
+{ .mfb
+ nop.m 0
+ fma.s.s0 f8 = asinh_w_1,asinh_w_cube,fNormX // (C1*x^2 + C0)*x^3 + x
+ br.ret.sptk b0 // Exit path 2, 0 < |x| < 2^-5
+}
+;;
+
+ASINH_UNORM:
+// Here if x=unorm: rejoin ASINH_COMMON unless the normalized x still tests as class 0x0b, else return x -/+ x^2
+{ .mfi
+ getf.exp asinh_GR_f8 = fNormX // Recompute if x unorm
+ fclass.m p0,p13 = fNormX, 0x0b // Test x denorm; p13 receives the complement of the match
+ nop.i 0
+}
+;;
+
+{ .mfb
+ nop.m 0
+ fcmp.eq.s0 p14,p0 = f8, f0 // Dummy to set denormal flag; p14 is not read afterwards
+(p13) br.cond.sptk ASINH_COMMON // Continue if x unorm and not denorm
+}
+;;
+
+.pred.rel "mutex",p7,p11
+{ .mfi
+ nop.m 0
+(p7) fma.s.s0 f8 = f8,f8,f8 // Result x+x^2 if x=-denorm
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p11) fnma.s.s0 f8 = f8,f8,f8 // Result x-x^2 if x=+denorm
+ br.ret.spnt b0 // Exit if denorm
+}
+;;
+
+GLOBAL_LIBM_END(asinhf)
diff --git a/sysdeps/ia64/fpu/s_asinhl.S b/sysdeps/ia64/fpu/s_asinhl.S
new file mode 100644
index 0000000000..fcb4e6e7e0
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_asinhl.S
@@ -0,0 +1,1346 @@
+.file "asinhl.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//*********************************************************************
+//
+// History:
+// 09/04/01 Initial version
+// 09/13/01 Performance improved, symmetry problems fixed
+// 10/10/01 Performance improved, split issues removed
+// 12/11/01 Changed huges_logp to not be global
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align;
+// used data8 for long double table values
+//
+//*********************************************************************
+//
+// API
+//==============================================================
+// long double asinhl(long double);
+//
+// Overview of operation
+//==============================================================
+//
+// There are 6 paths:
+// 1. x = 0, [S,Q]Nan or +/-INF
+// Return asinhl(x) = x + x;
+//
+// 2. x = + denormal
+// Return asinhl(x) = x - x^2;
+//
+// 3. x = - denormal
+// Return asinhl(x) = x + x^2;
+//
+// 4. 'Near 0': max denormal < |x| < 1/128
+// Return asinhl(x) = sign(x)*(x+x^3*(c3+x^2*(c5+x^2*(c7+x^2*(c9)))));
+//
+// 5. 'Huges': |x| > 2^63
+// Return asinhl(x) = sign(x)*(logl(2*x));
+//
+// 6. 'Main path': 1/128 < |x| < 2^63
+// b_hi + b_lo = x + sqrt(x^2 + 1);
+// asinhl(x) = sign(x)*(log_special(b_hi, b_lo));
+//
+// Algorithm description
+//==============================================================
+//
+// Main path algorithm
+// ( thanks to Peter Markstein for the idea of sqrt(x^2+1) computation! )
+// *************************************************************************
+//
+// There are 3 parts of x+sqrt(x^2+1) computation:
+//
+// 1) p2 = (p2_hi+p2_lo) = x^2+1 obtaining
+// ------------------------------------
+// p2_hi = x2_hi + 1, where x2_hi = x * x;
+// p2_lo = x2_lo + p1_lo, where
+// x2_lo = FMS(x*x-x2_hi),
+// p1_lo = (1 - p2_hi) + x2_hi;
+//
+// 2) g = (g_hi+g_lo) = sqrt(p2) = sqrt(p2_hi+p2_lo)
+// ----------------------------------------------
+// r = invsqrt(p2_hi) (8-bit reciprocal square root approximation);
+// g = p2_hi * r (first 8 bit-approximation of sqrt);
+//
+// h = 0.5 * r;
+// e = 0.5 - g * h;
+// g = g * e + g (second 16 bit-approximation of sqrt);
+//
+// h = h * e + h;
+// e = 0.5 - g * h;
+// g = g * e + g (third 32 bit-approximation of sqrt);
+//
+// h = h * e + h;
+// e = 0.5 - g * h;
+// g_hi = g * e + g (fourth 64 bit-approximation of sqrt);
+//
+// Remainder computation:
+// h = h * e + h;
+// d = (p2_hi - g_hi * g_hi) + p2_lo;
+// g_lo = d * h;
+//
+// 3) b = (b_hi + b_lo) = x + g, where g = (g_hi + g_lo) = sqrt(x^2+1)
+// -------------------------------------------------------------------
+// b_hi = (g_hi + x) + g_lo;
+// b_lo = (g_hi - b_hi) + x + g_lo;
+//
+// Now we pass b, represented as the sum b_hi + b_lo, to a special version
+// of the logl function which accepts a pair of arguments as a
+// 'multiprecision' value.
+//
+// Special log algorithm overview
+// ================================
+// Here we use a table lookup method. The basic idea is that in
+// order to compute logl(Arg) = logl(1 + (Arg-1)) for an argument Arg in [1,2),
+// we construct a value G such that G*Arg is close to 1 and that
+// logl(1/G) is obtainable easily from a table of values calculated
+// beforehand. Thus
+//
+// logl(Arg) = logl(1/G) + logl(1 + (G*Arg - 1))
+//
+// Because |G*Arg - 1| is small, the second term on the right hand
+// side can be approximated by a short polynomial. We elaborate
+// this method in four steps.
+//
+// Step 0: Initialization
+//
+// We need to calculate logl( X ). Obtain N, S_hi such that
+//
+// X = 2^N * ( S_hi + S_lo ) exactly
+//
+// where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
+// that |S_lo| <= ulp(S_hi).
+//
+// For the special version of logl: S_lo = b_lo
+// !-----------------------------------------------!
+//
+// Step 1: Argument Reduction
+//
+// Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
+//
+// G := G_1 * G_2 * G_3
+// r := (G * S_hi - 1) + G * S_lo
+//
+// These G_j's have the property that the product is exactly
+// representable and that |r| < 2^(-12) as a result.
+//
+// Step 2: Approximation
+//
+// logl(1 + r) is approximated by a short polynomial poly(r).
+//
+// Step 3: Reconstruction
+//
+// Finally,
+//
+// logl( X ) = logl( 2^N * (S_hi + S_lo) )
+// ~=~ N*logl(2) + logl(1/G) + logl(1 + r)
+// ~=~ N*logl(2) + logl(1/G) + poly(r).
+//
+// For detailed description see logl or log1pl function, regular path.
+//
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input
+// f32 -> f101 (70 registers)
+
+// General registers used:
+// r32 -> r57 (26 registers)
+
+// Predicate registers used:
+// p6 -> p12
+// p6 for '0, NaNs, Inf' path
+// p7 for '+ denormals' path
+// p8 for 'near 0' path
+// p9 for 'huges' path
+// p10 for '- denormals' path
+// p11 for negative values, p12 for non-negative values
+//
+// Data tables
+//==============================================================
+
+RODATA
+.align 64
+
+// C9, C7 'near 0' polynomial coefficients (stored in load order: C9, C7)
+LOCAL_OBJECT_START(Poly_C_near_0_79)
+data8 0xF8DC939BBEDD5A54, 0x00003FF9 // C9
+data8 0xB6DB6DAB21565AC5, 0x0000BFFA // C7
+LOCAL_OBJECT_END(Poly_C_near_0_79)
+
+// C5, C3 'near 0' polynomial coefficients (stored in load order: C5, C3)
+LOCAL_OBJECT_START(Poly_C_near_0_35)
+data8 0x999999999991D582, 0x00003FFB // C5
+data8 0xAAAAAAAAAAAAAAA9, 0x0000BFFC // C3
+LOCAL_OBJECT_END(Poly_C_near_0_35)
+
+// log2_hi, log2_lo and Q4..Q1 coeffs for logl, 16 bytes each, in load order
+LOCAL_OBJECT_START(Constants_Q)
+data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 // log2_hi
+data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 // log2_lo
+data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000 // Q4
+data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000 // Q3
+data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000 // Q2
+data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000 // Q1
+LOCAL_OBJECT_END(Constants_Q)
+
+// Z1 - 16 bit fixed; 16 entries of 4 bytes, selected by GR_Index1
+LOCAL_OBJECT_START(Constants_Z_1)
+data4 0x00008000
+data4 0x00007879
+data4 0x000071C8
+data4 0x00006BCB
+data4 0x00006667
+data4 0x00006187
+data4 0x00005D18
+data4 0x0000590C
+data4 0x00005556
+data4 0x000051EC
+data4 0x00004EC5
+data4 0x00004BDB
+data4 0x00004925
+data4 0x0000469F
+data4 0x00004445
+data4 0x00004211
+LOCAL_OBJECT_END(Constants_Z_1)
+
+// G1, H1 - IEEE single, h1 - IEEE double; 16-byte entries, by GR_Index1
+LOCAL_OBJECT_START(Constants_G_H_h1)
+data4 0x3F800000,0x00000000
+data8 0x0000000000000000
+data4 0x3F70F0F0,0x3D785196
+data8 0x3DA163A6617D741C
+data4 0x3F638E38,0x3DF13843
+data8 0x3E2C55E6CBD3D5BB
+data4 0x3F579430,0x3E2FF9A0
+data8 0xBE3EB0BFD86EA5E7
+data4 0x3F4CCCC8,0x3E647FD6
+data8 0x3E2E6A8C86B12760
+data4 0x3F430C30,0x3E8B3AE7
+data8 0x3E47574C5C0739BA
+data4 0x3F3A2E88,0x3EA30C68
+data8 0x3E20E30F13E8AF2F
+data4 0x3F321640,0x3EB9CEC8
+data8 0xBE42885BF2C630BD
+data4 0x3F2AAAA8,0x3ECF9927
+data8 0x3E497F3497E577C6
+data4 0x3F23D708,0x3EE47FC5
+data8 0x3E3E6A6EA6B0A5AB
+data4 0x3F1D89D8,0x3EF8947D
+data8 0xBDF43E3CD328D9BE
+data4 0x3F17B420,0x3F05F3A1
+data8 0x3E4094C30ADB090A
+data4 0x3F124920,0x3F0F4303
+data8 0xBE28FBB2FC1FE510
+data4 0x3F0D3DC8,0x3F183EBF
+data8 0x3E3A789510FDE3FA
+data4 0x3F088888,0x3F20EC80
+data8 0x3E508CE57CC8C98F
+data4 0x3F042108,0x3F29516A
+data8 0xBE534874A223106C
+LOCAL_OBJECT_END(Constants_G_H_h1)
+
+// Z2 - 16 bit fixed; 16 entries of 4 bytes, selected by GR_Index2
+LOCAL_OBJECT_START(Constants_Z_2)
+data4 0x00008000
+data4 0x00007F81
+data4 0x00007F02
+data4 0x00007E85
+data4 0x00007E08
+data4 0x00007D8D
+data4 0x00007D12
+data4 0x00007C98
+data4 0x00007C20
+data4 0x00007BA8
+data4 0x00007B31
+data4 0x00007ABB
+data4 0x00007A45
+data4 0x000079D1
+data4 0x0000795D
+data4 0x000078EB
+LOCAL_OBJECT_END(Constants_Z_2)
+
+// G2, H2 - IEEE single, h2 - IEEE double; 16-byte entries, by GR_Index2
+LOCAL_OBJECT_START(Constants_G_H_h2)
+data4 0x3F800000,0x00000000
+data8 0x0000000000000000
+data4 0x3F7F00F8,0x3B7F875D
+data8 0x3DB5A11622C42273
+data4 0x3F7E03F8,0x3BFF015B
+data8 0x3DE620CF21F86ED3
+data4 0x3F7D08E0,0x3C3EE393
+data8 0xBDAFA07E484F34ED
+data4 0x3F7C0FC0,0x3C7E0586
+data8 0xBDFE07F03860BCF6
+data4 0x3F7B1880,0x3C9E75D2
+data8 0x3DEA370FA78093D6
+data4 0x3F7A2328,0x3CBDC97A
+data8 0x3DFF579172A753D0
+data4 0x3F792FB0,0x3CDCFE47
+data8 0x3DFEBE6CA7EF896B
+data4 0x3F783E08,0x3CFC15D0
+data8 0x3E0CF156409ECB43
+data4 0x3F774E38,0x3D0D874D
+data8 0xBE0B6F97FFEF71DF
+data4 0x3F766038,0x3D1CF49B
+data8 0xBE0804835D59EEE8
+data4 0x3F757400,0x3D2C531D
+data8 0x3E1F91E9A9192A74
+data4 0x3F748988,0x3D3BA322
+data8 0xBE139A06BF72A8CD
+data4 0x3F73A0D0,0x3D4AE46F
+data8 0x3E1D9202F8FBA6CF
+data4 0x3F72B9D0,0x3D5A1756
+data8 0xBE1DCCC4BA796223
+data4 0x3F71D488,0x3D693B9D
+data8 0xBE049391B6B7C239
+LOCAL_OBJECT_END(Constants_G_H_h2)
+
+// G3, H3 - IEEE single, h3 - IEEE double; 32 16-byte entries, by GR_Index3
+LOCAL_OBJECT_START(Constants_G_H_h3)
+data4 0x3F7FFC00,0x38800100
+data8 0x3D355595562224CD
+data4 0x3F7FF400,0x39400480
+data8 0x3D8200A206136FF6
+data4 0x3F7FEC00,0x39A00640
+data8 0x3DA4D68DE8DE9AF0
+data4 0x3F7FE400,0x39E00C41
+data8 0xBD8B4291B10238DC
+data4 0x3F7FDC00,0x3A100A21
+data8 0xBD89CCB83B1952CA
+data4 0x3F7FD400,0x3A300F22
+data8 0xBDB107071DC46826
+data4 0x3F7FCC08,0x3A4FF51C
+data8 0x3DB6FCB9F43307DB
+data4 0x3F7FC408,0x3A6FFC1D
+data8 0xBD9B7C4762DC7872
+data4 0x3F7FBC10,0x3A87F20B
+data8 0xBDC3725E3F89154A
+data4 0x3F7FB410,0x3A97F68B
+data8 0xBD93519D62B9D392
+data4 0x3F7FAC18,0x3AA7EB86
+data8 0x3DC184410F21BD9D
+data4 0x3F7FA420,0x3AB7E101
+data8 0xBDA64B952245E0A6
+data4 0x3F7F9C20,0x3AC7E701
+data8 0x3DB4B0ECAABB34B8
+data4 0x3F7F9428,0x3AD7DD7B
+data8 0x3D9923376DC40A7E
+data4 0x3F7F8C30,0x3AE7D474
+data8 0x3DC6E17B4F2083D3
+data4 0x3F7F8438,0x3AF7CBED
+data8 0x3DAE314B811D4394
+data4 0x3F7F7C40,0x3B03E1F3
+data8 0xBDD46F21B08F2DB1
+data4 0x3F7F7448,0x3B0BDE2F
+data8 0xBDDC30A46D34522B
+data4 0x3F7F6C50,0x3B13DAAA
+data8 0x3DCB0070B1F473DB
+data4 0x3F7F6458,0x3B1BD766
+data8 0xBDD65DDC6AD282FD
+data4 0x3F7F5C68,0x3B23CC5C
+data8 0xBDCDAB83F153761A
+data4 0x3F7F5470,0x3B2BC997
+data8 0xBDDADA40341D0F8F
+data4 0x3F7F4C78,0x3B33C711
+data8 0x3DCD1BD7EBC394E8
+data4 0x3F7F4488,0x3B3BBCC6
+data8 0xBDC3532B52E3E695
+data4 0x3F7F3C90,0x3B43BAC0
+data8 0xBDA3961EE846B3DE
+data4 0x3F7F34A0,0x3B4BB0F4
+data8 0xBDDADF06785778D4
+data4 0x3F7F2CA8,0x3B53AF6D
+data8 0x3DCC3ED1E55CE212
+data4 0x3F7F24B8,0x3B5BA620
+data8 0xBDBA31039E382C15
+data4 0x3F7F1CC8,0x3B639D12
+data8 0x3D635A0B5C5AF197
+data4 0x3F7F14D8,0x3B6B9444
+data8 0xBDDCCB1971D34EFC
+data4 0x3F7F0CE0,0x3B7393BC
+data8 0x3DC7450252CD7ADA
+data4 0x3F7F04F0,0x3B7B8B6D
+data8 0xBDB68F177D7F2A42
+LOCAL_OBJECT_END(Constants_G_H_h3)
+
+// Assembly macros
+//==============================================================
+
+// Floating Point Registers (symbolic names for f8 and f32..f101)
+
+FR_Arg = f8
+FR_Res = f8
+FR_AX = f32
+FR_XLog_Hi = f33
+FR_XLog_Lo = f34
+
+ // Special logl registers
+FR_Y_hi = f35
+FR_Y_lo = f36
+
+FR_Scale = f37
+FR_X_Prime = f38
+FR_S_hi = f39
+FR_W = f40
+FR_G = f41
+
+FR_H = f42
+FR_wsq = f43
+FR_w4 = f44
+FR_h = f45
+FR_w6 = f46
+
+FR_G2 = f47
+FR_H2 = f48
+FR_poly_lo = f49
+FR_P8 = f50
+FR_poly_hi = f51
+
+FR_P7 = f52
+FR_h2 = f53
+FR_rsq = f54
+FR_P6 = f55
+FR_r = f56
+
+FR_log2_hi = f57
+FR_log2_lo = f58
+
+FR_float_N = f59
+FR_Q4 = f60
+
+FR_G3 = f61
+FR_H3 = f62
+FR_h3 = f63
+
+FR_Q3 = f64
+FR_Q2 = f65
+FR_1LN10_hi = f66
+
+FR_Q1 = f67
+FR_1LN10_lo = f68
+FR_P5 = f69 // NOTE: FR_P5 is assigned a second time (f100) further below
+FR_rcub = f70
+
+FR_Neg_One = f71
+FR_Z = f72
+FR_AA = f73
+FR_BB = f74
+FR_S_lo = f75
+FR_2_to_minus_N = f76
+
+
+ // Huge & Main path prolog registers
+FR_Half = f77
+FR_Two = f78
+FR_X2 = f79
+FR_P2 = f80
+FR_P2L = f81
+FR_Rcp = f82
+FR_GG = f83
+FR_HH = f84
+FR_EE = f85
+FR_DD = f86
+FR_GL = f87
+FR_A = f88
+FR_AL = f89
+FR_B = f90
+FR_BL = f91
+FR_Tmp = f92
+
+ // Near 0 & Huges path prolog registers
+FR_C3 = f93
+FR_C5 = f94
+FR_C7 = f95
+FR_C9 = f96
+
+FR_X3 = f97
+FR_X4 = f98
+FR_P9 = f99
+FR_P5 = f100 // NOTE: second assignment of FR_P5 (f69 above); near_0 uses FR_P5
+FR_P3 = f101
+
+
+// General Purpose Registers (symbolic names; several registers carry two names)
+
+ // General prolog registers
+GR_PFS = r32
+GR_TwoN7 = r40
+GR_TwoP63 = r41
+GR_ExpMask = r42
+GR_ArgExp = r43
+GR_Half = r44
+
+ // Near 0 path prolog registers
+GR_Poly_C_35 = r45
+GR_Poly_C_79 = r46
+
+ // Special logl registers (r40-r45 reuse the prolog registers named above)
+GR_Index1 = r34
+GR_Index2 = r35
+GR_signif = r36
+GR_X_0 = r37
+GR_X_1 = r38
+GR_X_2 = r39
+GR_Z_1 = r40
+GR_Z_2 = r41
+GR_N = r42
+GR_Bias = r43
+GR_M = r44
+GR_Index3 = r45 // same register as GR_exp_2tom80 and GR_Poly_C_35
+GR_exp_2tom80 = r45 // same register as GR_Index3 and GR_Poly_C_35
+GR_exp_mask = r47
+GR_exp_2tom7 = r48
+GR_ad_ln10 = r49
+GR_ad_tbl_1 = r50
+GR_ad_tbl_2 = r51
+GR_ad_tbl_3 = r52
+GR_ad_q = r53
+GR_ad_z_1 = r54
+GR_ad_z_2 = r55
+GR_ad_z_3 = r56
+GR_minus_N = r57
+
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(asinhl)
+// Prolog: start x^2+1, classify the argument and dispatch the special paths
+{ .mfi
+ alloc GR_PFS = ar.pfs,0,27,0,0
+ fma.s1 FR_P2 = FR_Arg, FR_Arg, f1 // p2 = x^2 + 1
+ mov GR_Half = 0xfffe // 0.5's exp
+}
+{ .mfi
+ addl GR_Poly_C_79 = @ltoff(Poly_C_near_0_79), gp // C7, C9 coeffs
+ fma.s1 FR_X2 = FR_Arg, FR_Arg, f0 // Obtain x^2
+ addl GR_Poly_C_35 = @ltoff(Poly_C_near_0_35), gp // C3, C5 coeffs
+};;
+
+{ .mfi
+ getf.exp GR_ArgExp = FR_Arg // get argument's exponent
+ fabs FR_AX = FR_Arg // absolute value of argument
+ mov GR_TwoN7 = 0xfff8 // 2^-7 exp
+}
+{ .mfi
+ ld8 GR_Poly_C_79 = [GR_Poly_C_79] // get actual coeff table address
+ fma.s0 FR_Two = f1, f1, f1 // construct 2.0
+ mov GR_ExpMask = 0x1ffff // mask for exp
+};;
+
+{ .mfi
+ ld8 GR_Poly_C_35 = [GR_Poly_C_35] // get actual coeff table address
+ fclass.m p6,p0 = FR_Arg, 0xe7 // if arg NaN inf zero
+ mov GR_TwoP63 = 0x1003e // 2^63 exp
+}
+{ .mfi
+ addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ setf.exp FR_Half = GR_Half // construct 0.5
+ fclass.m p7,p0 = FR_Arg, 0x09 // if arg + denorm
+ and GR_ArgExp = GR_ExpMask, GR_ArgExp // select exp
+}
+{ .mfb
+ ld8 GR_ad_z_1 = [GR_ad_z_1] // Get pointer to Constants_Z_1
+ nop.f 0
+ nop.b 0
+};;
+{ .mfi
+ ldfe FR_C9 = [GR_Poly_C_79],16 // load C9
+ fclass.m p10,p0 = FR_Arg, 0x0a // if arg - denorm
+ cmp.gt p8, p0 = GR_TwoN7, GR_ArgExp // if |arg| < 2^-7 ('near 0')
+}
+{ .mfb
+ cmp.le p9, p0 = GR_TwoP63, GR_ArgExp // if |arg| >= 2^63 ('huges')
+(p6) fma.s0 FR_Res = FR_Arg,f1,FR_Arg // r = a + a
+(p6) br.ret.spnt b0 // return
+};;
+// (X^2 + 1) computation
+{ .mfi
+(p8) ldfe FR_C5 = [GR_Poly_C_35],16 // load C5
+ fms.s1 FR_Tmp = f1, f1, FR_P2 // Tmp = 1 - p2
+ add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1
+}
+{ .mfb
+(p8) ldfe FR_C7 = [GR_Poly_C_79],16 // load C7
+(p7) fnma.s0 FR_Res = FR_Arg,FR_Arg,FR_Arg // r = a - a*a
+(p7) br.ret.spnt b0 // return
+};;
+
+{ .mfi
+(p8) ldfe FR_C3 = [GR_Poly_C_35],16 // load C3
+ fcmp.lt.s1 p11, p12 = FR_Arg, f0 // if arg is negative
+ add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_Q
+}
+{ .mfb
+ add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
+(p10) fma.s0 FR_Res = FR_Arg,FR_Arg,FR_Arg // r = a + a*a
+(p10) br.ret.spnt b0 // return
+};;
+
+{ .mfi
+ add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2
+ frsqrta.s1 FR_Rcp, p0 = FR_P2 // Rcp ~ 1/sqrt(p2), rsqrt approx.
+ add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3
+}
+{ .mfi
+ nop.m 0
+ fms.s1 FR_P2L = FR_AX, FR_AX, FR_X2 //x2_lo = x*x - x2_hi (via fms)
+ mov GR_Bias = 0x0FFFF // Create exponent bias
+};;
+
+{ .mfb
+ nop.m 0
+(p9) fms.s1 FR_XLog_Hi = FR_Two, FR_AX, f0 // Hi of logl arg = 2*|x|
+(p9) br.cond.spnt huges_logl // special version of logl
+};;
+
+{ .mfb
+ ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
+(p8) fma.s1 FR_X3 = FR_X2, FR_Arg, f0 // x^3 = x^2 * x
+(p8) br.cond.spnt near_0 // Go to near 0 branch
+};;
+// Main path, 2^-7 <= |x| < 2^63: b = |x| + sqrt(x^2+1), then special logl(b)
+{ .mfi
+ ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_Q4 = [GR_ad_q],16 // Load Q4
+ fma.s1 FR_Tmp = FR_Tmp, f1, FR_X2 // Tmp = Tmp + x^2
+ mov GR_exp_mask = 0x1FFFF // Create exponent mask
+};;
+
+{ .mfi
+ ldfe FR_Q3 = [GR_ad_q],16 // Load Q3
+ fma.s1 FR_GG = FR_Rcp, FR_P2, f0 // g = Rcp * p2
+ // 8 bit Newton Raphson iteration
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_HH = FR_Half, FR_Rcp, f0 // h = 0.5 * Rcp
+ nop.i 0
+};;
+{ .mfi
+ ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
+ fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P2L = FR_Tmp, f1, FR_P2L // low part of p2 = Tmp + p2l
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_Q1 = [GR_ad_q] // Load Q1
+ fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
+ // 16 bit Newton Raphson iteration
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
+ // 32 bit Newton Raphson iteration
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
+ // 64 bit Newton Raphson iteration
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_DD = FR_GG, FR_GG, FR_P2 // Remainder d = p2 - g * g
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_XLog_Hi = FR_AX, f1, FR_GG // bh = |x| + gh
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_DD = FR_DD, f1, FR_P2L // add p2l: d = d + p2l
+ nop.i 0
+};;
+
+
+{ .mfi
+ getf.sig GR_signif = FR_XLog_Hi // Get significand of bh
+ fmerge.ns FR_Neg_One = f1, f1 // Form -1.0
+ mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_GL = FR_DD, FR_HH, f0 // gl = d * h
+ extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_XLog_Hi = FR_DD, FR_HH, FR_XLog_Hi // bh = bh + gl
+ nop.i 0
+};;
+
+{ .mmi
+ shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
+ shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1
+ extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif.
+};;
+
+{ .mmi
+ ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
+ nop.m 0
+ nop.i 0
+};;
+
+{ .mmi
+ ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
+ nop.m 0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_XLog_Lo = FR_GG, f1, FR_XLog_Hi // bl = gh - bh
+ pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1
+};;
+
+// WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
+// "DEAD" ZONE!
+
+{ .mfi
+ nop.m 0
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fmerge.se FR_S_hi = f1,FR_XLog_Hi // S_hi = significand of bh in [1,2)
+ nop.i 0
+};;
+
+{ .mmi
+ getf.exp GR_N = FR_XLog_Hi // Get N = exponent of bh
+ ldfd FR_h = [GR_ad_tbl_1] // Load h_1
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ nop.f 0
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+};;
+
+
+{ .mfi
+ shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2
+ fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_AX // bl = bl + |x|
+ mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80
+}
+{ .mfi
+ shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
+ nop.f 0
+ sub GR_N = GR_N, GR_Bias // sub bias from exp
+};;
+
+{ .mmi
+ ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
+ ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
+ sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N)
+};;
+
+{ .mmi
+ ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
+ nop.m 0
+ nop.i 0
+};;
+
+{ .mmi
+ setf.sig FR_float_N = GR_N // Put integer N into significand
+ setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N)
+ pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
+};;
+
+// WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
+// BECAUSE OF POSSIBLE 10 CLOCKS STALL!
+// So we can negate Q coefficients there for negative values
+
+{ .mfi
+ nop.m 0
+(p11) fma.s1 FR_Q1 = FR_Q1, FR_Neg_One, f0 // Negate Q1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_GL // bl = bl + gl
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p11) fma.s1 FR_Q2 = FR_Q2, FR_Neg_One, f0 // Negate Q2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p11) fma.s1 FR_Q3 = FR_Q3, FR_Neg_One, f0 // Negate Q3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p11) fma.s1 FR_Q4 = FR_Q4, FR_Neg_One, f0 // Negate Q4
+ extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
+};;
+
+{ .mfi
+ shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
+ fcvt.xf FR_float_N = FR_float_N
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S_lo = FR_XLog_Lo, FR_2_to_minus_N, f0 //S_lo=bl*2^-N
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h=N*log2_lo+h
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r = FR_G, FR_S_lo, FR_r // r=G*S_lo+(G*S_hi-1)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
+ nop.i 0
+};;
+
+.pred.rel "mutex",p12,p11
+{ .mfi
+ nop.m 0
+(p12) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p11) fms.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // Q1 negated: -(Q1*rsq + r)
+ nop.i 0
+};;
+
+
+.pred.rel "mutex",p12,p11
+{ .mfi
+ nop.m 0
+(p12) fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p11) fms.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3-h
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo
+ // Y_lo=poly_hi+poly_lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p11) fma.s0 FR_Y_hi = FR_Y_hi, FR_Neg_One, f0 // FR_Y_hi sign for neg
+ nop.i 0
+};;
+
+{ .mfb
+ nop.m 0
+ fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi
+ br.ret.sptk b0 // Main path exit, 2^-7 <= |x| < 2^63
+};;
+
+// * SPECIAL VERSION OF LOGL FOR HUGE ARGUMENTS: logl(2*|x|), |x| >= 2^63 *
+
+huges_logl:
+{ .mfi
+ getf.sig GR_signif = FR_XLog_Hi // Get significand of 2*|x|
+ fmerge.ns FR_Neg_One = f1, f1 // Form -1.0
+ mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7
+};;
+
+{ .mfi
+ add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1
+ nop.f 0
+ add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_Q
+}
+{ .mfi
+ add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
+ nop.f 0
+ add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2
+};;
+
+{ .mfi
+ nop.m 0
+ nop.f 0
+ extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif
+}
+{ .mfi
+ add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
+ nop.f 0
+ extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif.
+};;
+
+{ .mfi
+ ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
+ nop.f 0
+ mov GR_exp_mask = 0x1FFFF // Create exponent mask
+}
+{ .mfi
+ shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1
+ nop.f 0
+ mov GR_Bias = 0x0FFFF // Create exponent bias
+};;
+
+{ .mfi
+ ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
+ fmerge.se FR_S_hi = f1,FR_XLog_Hi // S_hi = significand in [1,2)
+ nop.i 0
+};;
+
+{ .mmi
+ getf.exp GR_N = FR_XLog_Hi // Get N = exponent of 2*|x|
+ ldfd FR_h = [GR_ad_tbl_1] // Load h_1
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
+ nop.f 0
+ pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1
+};;
+
+// WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
+// "DEAD" ZONE!
+
+{ .mmi
+ ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
+ sub GR_N = GR_N, GR_Bias
+ mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80
+};;
+
+{ .mfi
+ ldfe FR_Q4 = [GR_ad_q],16 // Load Q4
+ nop.f 0
+ sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N)
+};;
+
+{ .mmf
+ ldfe FR_Q3 = [GR_ad_q],16 // Load Q3
+ setf.sig FR_float_N = GR_N // Put integer N into significand
+ nop.f 0
+};;
+
+{ .mmi
+ nop.m 0
+ ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+};;
+
+{ .mmi
+ ldfe FR_Q1 = [GR_ad_q] // Load Q1
+ shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
+ nop.i 0
+};;
+
+{ .mmi
+ ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
+ shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2
+ nop.i 0
+};;
+
+{ .mmi
+ ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
+ nop.m 0
+ nop.i 0
+};;
+
+{ .mfi
+ ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N)
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ nop.f 0
+ pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
+};;
+
+// WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
+// "DEAD" ZONE!
+// JUST HAVE TO INSERT 3 NOP CYCLES (nothing to do here)
+
+{ .mfi
+ nop.m 0
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p11) fma.s1 FR_Q4 = FR_Q4, FR_Neg_One, f0 // Negate Q4
+ extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
+ };;
+
+{ .mfi
+ shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3
+ fcvt.xf FR_float_N = FR_float_N
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p11) fma.s1 FR_Q3 = FR_Q3, FR_Neg_One, f0 // Negate Q3
+ nop.i 0
+};;
+
+{ .mfi
+ ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
+(p11) fma.s1 FR_Q2 = FR_Q2, FR_Neg_One, f0 // Negate Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p11) fma.s1 FR_Q1 = FR_Q1, FR_Neg_One, f0 // Negate Q1
+ nop.i 0
+};;
+
+{ .mfi
+ ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
+ fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
+ nop.i 0
+};;
+
+{ .mmf
+ nop.m 0
+ nop.m 0
+ fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
+};;
+
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h=N*log2_lo+h
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
+ nop.i 0
+};;
+
+.pred.rel "mutex",p12,p11
+{ .mfi
+ nop.m 0
+(p12) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p11) fms.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // Q1 negated: -(Q1*rsq + r)
+ nop.i 0
+};;
+
+
+.pred.rel "mutex",p12,p11
+{ .mfi
+ nop.m 0
+(p12) fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p11) fms.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3-h
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo=poly_hi+poly_lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p11) fma.s0 FR_Y_hi = FR_Y_hi, FR_Neg_One, f0 // FR_Y_hi sign for neg
+ nop.i 0
+};;
+
+{ .mfb
+ nop.m 0
+ fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi
+ br.ret.sptk b0 // Huges path exit, |x| >= 2^63
+};;
+
+// NEAR ZERO POLYNOMIAL INTERVAL: x + x^3*(C3 + C5*x^2 + x^4*(C7 + C9*x^2))
+near_0:
+{ .mfi
+ nop.m 0
+ fma.s1 FR_X4 = FR_X2, FR_X2, f0 // x^4 = x^2 * x^2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P9 = FR_C9,FR_X2,FR_C7 // p9 = C9*x^2 + C7
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P5 = FR_C5,FR_X2,FR_C3 // p5 = C5*x^2 + C3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P3 = FR_P9,FR_X4,FR_P5 // p3 = p9*x^4 + p5
+ nop.i 0
+};;
+
+{ .mfb
+ nop.m 0
+ fma.s0 FR_Res = FR_P3,FR_X3,FR_Arg // res = p3*x^3 + x
+ br.ret.sptk b0 // Near 0 path return
+};;
+
+GLOBAL_LIBM_END(asinhl)
+
+
diff --git a/sysdeps/ia64/fpu/s_atan.S b/sysdeps/ia64/fpu/s_atan.S
index c0daabd3d7..720ecad28a 100644
--- a/sysdeps/ia64/fpu/s_atan.S
+++ b/sysdeps/ia64/fpu/s_atan.S
@@ -1,10 +1,10 @@
.file "atan.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,947 +20,734 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00: Initial version
-// 4/13/00: Improved speed
-// 4/19/00: Removed the qualifying predicate from the fmerge.s that
-// takes the absolute value.
-// 6/16/00: Reassigned FP registers to eliminate stalls on loads
-// 8/30/00: Saved 5 cycles in main path by rearranging large argument logic
-// and delaying use of result of fcmp in load by 1 group
+// 02/02/00 Initial version
+// 04/13/00 Improved speed
+// 04/19/00 Removed the qualifying predicate from the fmerge.s that
+// takes the absolute value.
+// 06/16/00 Reassigned FP registers to eliminate stalls on loads
+// 08/30/00 Saved 5 cycles in main path by rearranging large argument logic
+// and delaying use of result of fcmp in load by 1 group
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 08/20/02 Use atan2 algorithm with x=1 for better accuracy
+// 02/06/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
-// double atan( double x);
+// double atan(double Y)
//
// Overview of operation
//==============================================================
-// atan(x) = sign(X)pi/2 - atan(1/x)
//
-// We have two paths: |x| > 1 and |x| <= 1
+// The atan function returns values in the interval [-pi/2,+pi/2].
//
-// |x| > 1
-// ==========================================
+// The algorithm used is the atan2(Y,X) algorithm where we fix X=1.0.
//
-// c = frcpa(x) which is approximately 1/x
+// There are two basic paths: swap true and swap false.
+// atan2(Y,X) ==> atan(V/U) where |U| >= |V|. If |Y| > |X|, we must swap.
//
-// xc = 1- B
-// B = 1-xc
+// p6 swap True |Y| > |X|
+// p7 swap False |Y| <= |X|
//
-// Approximate 1/(1-B)^k by a polynomial in B, poly(B)
-// k is 45.
//
-// poly(B) = 1 + r1 B + r2 B^2 + ...+ r10 B^10
+// Simple trigonometric identities show
+// Region 1
+// |Y|<=1.0, V=Y, U=1.0 atan2(Y,X) = sgnY * (0 + atan(V/U))
//
-// c^k = (1-B)^k/x^k
-// c^k/(1-B)^k = 1/x^k
-// c^k poly(B) = 1/x^k
-
-// poly(x) = series(atan(1/x)) = 1/x - 1/3x^3 + 1/5x^5 - 1/7x^7 .... + 1/45 x^45
-// = 1/x^45 ( x^44 - x^42/3 + x^40/5 - x^38/7 ... +1)
-// = 1/x^45 ( y^22 - y^21/3 + y^20/5 - y^19/7 ... +1)
-//
-// = c^45 poly(B) poly(x)
-// = c^45 r(B) q(y)
-
-// q(y) = q0 + q1 y + q2 y^2 + ... + q22 y^22
-// where q22 is 1.0
-
-// atan(x) = sign(X)pi/2 - c^45 r(B) q(y)
-
-// |x| <= 1
-// ==========================================
-// poly(x) = series(atan(x)) = x - x^3/3 + x^5/5 + .....
-// poly(x) = series(atan(x)) = x + x^3(- 1/3 + x^2/5 + ..... +x^47/47)
-// poly(x) = series(atan(x)) = x + x^3(p0 + x^2/5 + ..... + x^44/47)
-// poly(x) = series(atan(x)) = x + x^3(p0 + y/5 + ..... + y^22/47)
-
-// where p0 is about -1/3.
-
-// atan(x) = poly(x)
-
-#include "libm_support.h"
+// Region 2
+// |Y|>1.0, V=1.0, U=Y atan2(Y,X) = sgnY * (pi/2 - atan(V/U))
+//
+//
+// We compute atan(V/U) from the identity
+// atan(V/U) = atan(z) + atan([(V/U)-z] / [1+(V/U)z])
+// where z is a limited precision approximation (16 bits) to V/U
+//
+// z is calculated with the assistance of the frcpa instruction.
+//
+// atan(z) is calculated by a polynomial z + z^3 * p(w), w=z^2
+// where p(w) = P0+P1*w+...+P22*w^22
+//
+// Let d = [(V/U)-z] / [1+(V/U)z] = (V-U*z)/(U+V*z)
+//
+// Approximate atan(d) by d + P0*d^3
+// Let F = 1/(U+V*z) * (1-a), where |a|< 2^-8.8.
+// Compute q(a) = 1 + a + ... + a^5.
+// Then F*q(a) approximates the reciprocal to more than 50 bits.
-// Special Values
+// Special values
//==============================================================
// atan(QNAN) = QNAN
// atan(SNAN) = quieted SNAN
-// atan(+-inf) = +- pi/2
+// atan(+-inf) = +- pi/2
// atan(+-0) = +-0
-
-
// Registers used
//==============================================================
-// predicate registers used:
-// p6 -> p11
+// predicate registers used:
+// p6 -> p15
-// floating-point registers used:
-// f32 -> f127
+// floating-point registers used:
+// f8, input
+// f32 -> f116
// general registers used
-// r32 -> r37
+// r14 -> r16
// Assembly macros
//==============================================================
-atan_Pi_by_2 = f32
-atan_S_PI = f33
-atan_ABS_f8 = f34
-
-atan_R0 = f35
-atan_R1 = f36
-atan_R2 = f37
-atan_R3 = f38
-atan_R4 = f39
-atan_R5 = f40
-atan_R6 = f41
-atan_R7 = f42
-atan_R8 = f43
-atan_R9 = f44
-atan_R10 = f45
-
-atan_Q0 = f46
-
-atan_Q1 = f47
-atan_Q2 = f48
-atan_Q3 = f49
-atan_Q4 = f50
-atan_Q5 = f51
-atan_Q6 = f52
-atan_Q7 = f53
-atan_Q8 = f54
-atan_Q9 = f55
-atan_Q10 = f56
-
-atan_Q11 = f57
-atan_Q12 = f58
-atan_Q13 = f59
-atan_Q14 = f60
-atan_Q15 = f61
-atan_Q16 = f62
-atan_Q17 = f63
-atan_Q18 = f64
-atan_Q19 = f65
-atan_Q20 = f66
-atan_Q21 = f67
-atan_Q22 = f68
-
-// P and Q constants are mutually exclusive
-// so they can share macro definitions
-atan_P0 = f46
-
-atan_P1 = f47
-atan_P2 = f48
-atan_P3 = f49
-atan_P4 = f10
-atan_P5 = f11
-atan_P6 = f12
-atan_P7 = f13
-atan_P10 = f103
-
-atan_P11 = f114
-atan_P12 = f58
-atan_P13 = f59
-atan_P14 = f60
-atan_P15 = f61
-atan_P16 = f62
-atan_P17 = f63
-atan_P18 = f64
-atan_P19 = f65
-atan_P20 = f14
-atan_P21 = f99
-atan_P22 = f68
-// end of P constant macros
-
-atan_C = f69
-atan_Y = f70
-atan_B = f71
-atan_Z = f72
-atan_V11 = f73
-atan_V12 = f74
-
-atan_V7 = f75
-atan_V8 = f76
-
-atan_W13 = f77
-atan_W11 = f78
-
-atan_V3 = f79
-atan_V4 = f80
-
-atan_G11 = f81
-atan_G12 = f82
-atan_G7 = f83
-atan_G8 = f84
-
-atan_Z1 = f85
-atan_W7 = f86
-
-atan_G3 = f87
-atan_W8 = f88
-atan_V9 = f89
-atan_V10 = f90
-
-atan_G10 = f91
-atan_W3 = f92
-atan_G4 = f93
-atan_G9 = f94
-
-atan_G6 = f95
-atan_W4 = f96
-atan_Z2 = f97
-atan_V6 = f98
-
-atan_V2 = f99
-atan_W6 = f100
-atan_W10 = f101
-atan_Y3 = f102
-
-atan_G2 = f103
-
-atan_Y8 = f104
-
-atan_G5 = f105
-atan_Z3 = f106
-atan_Z4 = f107
-atan_W2 = f108
-atan_V5 = f109
-
-atan_W5 = f110
-atan_G1 = f111
-atan_Y11 = f112
-
-atan_Z5 = f113
-atan_Z6 = f114
-atan_V1 = f115
-atan_W1 = f116
-
-atan_Z7 = f117
-atan_Q = f118
-atan_Z = f119
-atan_abs_f8 = f120
-
-atan_V13 = f121
-atan_Xcub = f122
-atan_Y12 = f123
-atan_P = f124
-
-atan_NORM_f8 = f125
-
-atan_P8 = f126
-atan_P9 = f127
-
-
-
-
-atan_GR_AD_R = r14
-atan_GR_AD_Q = r15
-atan_GR_AD_P = r16
-atan_GR_10172 = r17
-atan_GR_exp_f8 = r18
-atan_GR_signexp_f8 = r19
-atan_GR_exp_mask = r20
-
-
+EXP_AD_P1 = r14
+EXP_AD_P2 = r15
+rsig_near_one = r16
+
+atan2_Y = f8
+atan2_X = f1
+
+atan2_u1_X = f32
+atan2_u1_Y = f33
+atan2_z2_X = f34
+
+atan2_two = f36
+atan2_B1sq_Y = f37
+atan2_z1_X = f38
+atan2_B1X = f40
+
+atan2_B1Y = f41
+atan2_wp_X = f42
+atan2_B1sq_X = f43
+atan2_z = f44
+atan2_w = f45
+
+atan2_P0 = f46
+atan2_P1 = f47
+atan2_P2 = f48
+atan2_P3 = f49
+atan2_P4 = f50
+
+atan2_P5 = f51
+atan2_P6 = f52
+atan2_P7 = f53
+atan2_P8 = f54
+atan2_P9 = f55
+
+atan2_P10 = f56
+atan2_P11 = f57
+atan2_P12 = f58
+atan2_P13 = f59
+atan2_P14 = f60
+
+atan2_P15 = f61
+atan2_P16 = f62
+atan2_P17 = f63
+atan2_P18 = f64
+atan2_P19 = f65
+
+atan2_P20 = f66
+atan2_P21 = f67
+atan2_P22 = f68
+atan2_pi_by_2 = f69
+atan2_sgn_pi_by_2 = f69
+atan2_V13 = f70
+
+atan2_W11 = f71
+atan2_E = f72
+atan2_wp_Y = f73
+atan2_V11 = f74
+atan2_V12 = f75
+
+atan2_V7 = f76
+atan2_V8 = f77
+atan2_W7 = f78
+atan2_W8 = f79
+atan2_W3 = f80
+
+atan2_W4 = f81
+atan2_V3 = f82
+atan2_V4 = f83
+atan2_F = f84
+atan2_gV = f85
+
+atan2_V10 = f86
+atan2_zcub = f87
+atan2_V6 = f88
+atan2_V9 = f89
+atan2_W10 = f90
+
+atan2_W6 = f91
+atan2_W2 = f92
+atan2_V2 = f93
+atan2_alpha = f94
+atan2_alpha_1 = f95
+
+atan2_gVF = f96
+atan2_V5 = f97
+atan2_W12 = f98
+atan2_W5 = f99
+atan2_alpha_sq = f100
+
+atan2_Cp = f101
+atan2_V1 = f102
+atan2_ysq = f103
+atan2_W1 = f104
+atan2_alpha_cub = f105
+
+atan2_C = f106
+atan2_d = f108
+atan2_A_hi = f109
+atan2_dsq = f110
+
+atan2_pd = f111
+atan2_A_lo = f112
+atan2_A = f113
+atan2_Pp = f114
+atan2_sgnY = f115
+
+atan2_sig_near_one = f116
+atan2_near_one = f116
/////////////////////////////////////////////////////////////
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+RODATA
.align 16
-double_atan_constants_R:
-ASM_TYPE_DIRECTIVE(double_atan_constants_R,@object)
- data8 0xB36B46B9C5443CED, 0x0000401C //R8
- data8 0x842633E0D126261F, 0x0000401F //R9
- data8 0xBE04FFFFFFFF46E0, 0x00004010 //R4
- data8 0xE8C62000244D66E2, 0x00004013 //R5
- data8 0xF2790C001E3789B3, 0x00004016 //R6
- data8 0xDCD2CCF97D7C764F, 0x00004019 //R7
- data8 0xB40000000000000B, 0x00004004 //R1
- data8 0xB265F3D38F5EE28F, 0x00004021 //R10
- data8 0x8160000000000001, 0x00004009 //R2
- data8 0xFD5BFFFFFFFE55CD, 0x0000400C //R3
- data8 0xC90FDAA22168C235, 0x00003FFF // pi/2
-ASM_SIZE_DIRECTIVE(double_atan_constants_R)
-
-double_atan_constants_Q:
-ASM_TYPE_DIRECTIVE(double_atan_constants_Q,@object)
- data8 0xEBD602FA7761BC33, 0x00003FF9 //Q8
- data8 0x8CB1CABD6A91913C, 0x0000BFFA //Q9
- data8 0x84C665C37D623CD2, 0x00003FF7 //Q4
- data8 0x8DE0D1673DAEA9BC, 0x0000BFF8 //Q5
- data8 0xF658ADBE2C6E6FCC, 0x00003FF8 //Q6
-
- data8 0xB56307BE1DD3FFB6, 0x0000BFF9 //Q7
- data8 0xAAAAAAAAAAAA8000, 0x0000BFFD //Q21
- data8 0x8000000000000000, 0x00003FFF //Q22
- data8 0x924924923A9D710C, 0x0000BFFC //Q19
- data8 0xCCCCCCCCCC9380E7, 0x00003FFC //Q20
-
- data8 0xA644DC250EFA2800, 0x00003FED //Q0
- data8 0x83DEAE24EEBF5E44, 0x0000BFF1 //Q1
- data8 0xC758CCC64793D4EC, 0x00003FF3 //Q2
- data8 0xBFDC0B54E7C89DCE, 0x0000BFF5 //Q3
- data8 0x888855199D1290AF, 0x0000BFFB //Q15
-
- data8 0x9D89D3BE514B0178, 0x00003FFB //Q16
- data8 0xBA2E8B4DEC70282A, 0x0000BFFB //Q17
- data8 0xE38E38DF9E9FC83B, 0x00003FFB //Q18
- data8 0x9F8781CC990029D9, 0x00003FFA //Q10
- data8 0xB0B39472DEBA3C79, 0x0000BFFA //Q11
-
- data8 0xC2AFAEF8C85B0BC6, 0x00003FFA //Q12
- data8 0xD780E539797525DD, 0x0000BFFA //Q13
- data8 0xF0EDC449AC786DF9, 0x00003FFA //Q14
-ASM_SIZE_DIRECTIVE(double_atan_constants_Q)
-
-
-
-double_atan_constants_P:
-ASM_TYPE_DIRECTIVE(double_atan_constants_P,@object)
- data8 0xB1899EC590CDB8DF, 0x0000BFFA //P10
- data8 0xA1E79850A67D59B0, 0x00003FFA //P11
- data8 0x911D8B30C2A96E6D, 0x0000BFF3 //P20
- data8 0xB87233C68A640706, 0x00003FF0 //P21
- data8 0xD78E4B82F3C29D7A, 0x0000BFFA //P8
-
- data8 0xC2EBE37AF932C14F, 0x00003FFA //P9
- data8 0xBA2E8B94AA104DD6, 0x0000BFFB //P4
- data8 0x9D89D7A640B71D38, 0x00003FFB //P5
- data8 0x88887CA2CE9B2A40, 0x0000BFFB //P6
- data8 0xF0F017D57A919C1E, 0x00003FFA //P7
-
- data8 0xD0D635F230C80E06, 0x0000BFF8 //P16
- data8 0xE847BECA7209B479, 0x00003FF7 //P17
- data8 0xD14C6A2AAE0D5B07, 0x0000BFF6 //P18
- data8 0x915F612A5C469117, 0x00003FF5 //P19
- data8 0x921EDE5FD0DBBBE2, 0x0000BFFA //P12
-
- data8 0xFFD303C2C8535445, 0x00003FF9 //P13
- data8 0xD30DF50E295386F7, 0x0000BFF9 //P14
- data8 0x9E81F2B1BBD210A8, 0x00003FF9 //P15
- data8 0xAAAAAAAAAAAAA800, 0x0000BFFD //P0
- data8 0xCCCCCCCCCCC7D476, 0x00003FFC //P1
-
- data8 0x9249249247838066, 0x0000BFFC //P2
- data8 0xE38E38E302290D68, 0x00003FFB //P3
- data8 0xDF7F0A816F7E5025, 0x0000BFEC //P22
-ASM_SIZE_DIRECTIVE(double_atan_constants_P)
-
-
-.align 32
-.global atan#
-
-////////////////////////////////////////////////////////
-
+LOCAL_OBJECT_START(atan2_tb1)
+data8 0xA21922DC45605EA1 , 0x00003FFA // P11
+data8 0xB199DD6D2675C40F , 0x0000BFFA // P10
+data8 0xC2F01E5DDD100DBE , 0x00003FFA // P9
+data8 0xD78F28FC2A592781 , 0x0000BFFA // P8
+data8 0xF0F03ADB3FC930D3 , 0x00003FFA // P7
+data8 0x88887EBB209E3543 , 0x0000BFFB // P6
+data8 0x9D89D7D55C3287A5 , 0x00003FFB // P5
+data8 0xBA2E8B9793955C77 , 0x0000BFFB // P4
+data8 0xE38E38E320A8A098 , 0x00003FFB // P3
+data8 0x9249249247E37913 , 0x0000BFFC // P2
+data8 0xCCCCCCCCCCC906CD , 0x00003FFC // P1
+data8 0xAAAAAAAAAAAAA8A9 , 0x0000BFFD // P0
+data8 0x0000000000000000 , 0x00000000 // pad to avoid bank conflict
+LOCAL_OBJECT_END(atan2_tb1)
+
+LOCAL_OBJECT_START(atan2_tb2)
+data8 0xCE585A259BD8374C , 0x00003FF0 // P21
+data8 0x9F90FB984D8E39D0 , 0x0000BFF3 // P20
+data8 0x9D3436AABE218776 , 0x00003FF5 // P19
+data8 0xDEC343E068A6D2A8 , 0x0000BFF6 // P18
+data8 0xF396268151CFB11C , 0x00003FF7 // P17
+data8 0xD818B4BB43D84BF2 , 0x0000BFF8 // P16
+data8 0xA2270D30A90AA220 , 0x00003FF9 // P15
+data8 0xD5F4F2182E7A8725 , 0x0000BFF9 // P14
+data8 0x80D601879218B53A , 0x00003FFA // P13
+data8 0x9297B23CCFFB291F , 0x0000BFFA // P12
+data8 0xFE7E52D2A89995B3 , 0x0000BFEC // P22
+data8 0xC90FDAA22168C235 , 0x00003FFF // pi/2
+LOCAL_OBJECT_END(atan2_tb2)
-.section .text
-.proc atan#
-.align 32
-atan:
-
-{ .mmf
-(p0) addl atan_GR_AD_P = @ltoff(double_atan_constants_P), gp
-(p0) addl atan_GR_AD_Q = @ltoff(double_atan_constants_Q), gp
-(p0) fmerge.s atan_ABS_f8 = f0,f8
-}
-;;
-{ .mmf
- ld8 atan_GR_AD_P = [atan_GR_AD_P]
- ld8 atan_GR_AD_Q = [atan_GR_AD_Q]
-(p0) frcpa.s1 atan_C,p8 = f1,f8
-}
-;;
+.section .text
+GLOBAL_LIBM_ENTRY(atan)
-{ .mmf
-(p0) addl atan_GR_AD_R = @ltoff(double_atan_constants_R), gp
-(p0) addl atan_GR_exp_mask = 0x1ffff, r0
-(p0) fma.s1 atan_Y = f8,f8,f0
+{ .mfi
+ nop.m 999
+ frcpa.s1 atan2_u1_Y,p7 = f1,atan2_Y
+ nop.i 999
}
+{ .mfi
+ addl EXP_AD_P1 = @ltoff(atan2_tb1), gp
+ fma.s1 atan2_two = f1,f1,f1
+ nop.i 999
;;
-
-// This fnorm takes faults or sets fault flags
-{ .mmf
-(p0) mov atan_GR_10172 = 0x10172
- ld8 atan_GR_AD_R = [atan_GR_AD_R]
-(p0) fnorm atan_NORM_f8 = f8
}
-;;
-
-
-// qnan snan inf norm unorm 0 -+
-// 1 1 0 0 0 1 11
-// c 7
-
-// p9 set if we have a NAN or +-0
-{ .mmf
-(p0) ldfe atan_Q8 = [atan_GR_AD_Q],16
-(p0) ldfe atan_P10 = [atan_GR_AD_P],16
-(p0) fclass.m.unc p9, p0 = f8, 0xc7
+{ .mfi
+ ld8 EXP_AD_P1 = [EXP_AD_P1]
+ frcpa.s1 atan2_u1_X,p6 = f1,atan2_X
+ nop.i 999
}
-;;
-
-
-{ .mmi
-(p0) ldfe atan_Q9 = [atan_GR_AD_Q],16
-(p0) ldfe atan_P11 = [atan_GR_AD_P],16
- nop.i 999
+{ .mfi
+ nop.m 999
+ fma.s1 atan2_ysq = atan2_Y,atan2_Y,f0
+ nop.i 999
}
;;
-
-{ .mmf
-(p0) ldfe atan_Q4 = [atan_GR_AD_Q],16
-(p0) ldfe atan_P20 = [atan_GR_AD_P],16
-(p9) fma.d.s0 f8 = f8,f1,f0
-;;
-}
-
-// Exit if we have a NAN or +-0
-{ .mmb
-(p0) ldfe atan_Q5 = [atan_GR_AD_Q],16
-(p0) ldfe atan_P21 = [atan_GR_AD_P],16
-(p9) br.ret.spnt b0
-;;
+{ .mfi
+ add EXP_AD_P2 = 0xd0,EXP_AD_P1
+ fmerge.s atan2_sgnY = atan2_Y,f1
+ nop.i 999
}
-
-
-// p6 is TRUE if |x| <= 1
-// p7 is TRUE if |x| > 1
-{ .mmf
-(p0) ldfe atan_Q6 = [atan_GR_AD_Q],16
-(p0) ldfe atan_P8 = [atan_GR_AD_P],16
-(p0) fcmp.le.unc p6,p7 = atan_ABS_f8, f1
;;
-}
{ .mfi
-(p0) ldfe atan_Q7 = [atan_GR_AD_Q],16
-(p0) fma.s1 atan_Z = atan_C, atan_C, f0
- nop.i 999
+ ldfe atan2_P11 = [EXP_AD_P1],16
+ fclass.m p10,p0 = atan2_Y, 0xc3 // Test for y=nan
+ nop.i 999
}
{ .mfi
-(p0) ldfe atan_P9 = [atan_GR_AD_P],16
-(p0) fnma.s1 atan_B = atan_C,f8, f1
- nop.i 999 ;;
+ ldfe atan2_P21 = [EXP_AD_P2],16
+ nop.f 999
+ nop.i 999
+;;
}
{ .mfi
-(p0) ldfe atan_Q21 = [atan_GR_AD_Q],16
-(p0) fma.s1 atan_V12 = atan_Y, atan_Y, f0
- nop.i 999
+ ldfe atan2_P10 = [EXP_AD_P1],16
+ fnma.s1 atan2_B1Y = atan2_u1_Y, atan2_Y, atan2_two
+ nop.i 999
}
{ .mfi
-(p0) ldfe atan_P4 = [atan_GR_AD_P],16
-(p0) fma.s1 atan_Xcub = f8, atan_Y , f0
- nop.i 999
-;;
-}
-
-
-{ .mmi
-(p7) ldfe atan_Q22 = [atan_GR_AD_Q],16
-(p6) ldfe atan_P5 = [atan_GR_AD_P],16
-(p6) cmp.eq.unc p8,p0 = r0,r0
-;;
-}
-
-
-{ .mmi
-(p7) ldfe atan_Q19 = [atan_GR_AD_Q],16
-(p6) ldfe atan_P6 = [atan_GR_AD_P],16
-(p7) cmp.eq.unc p9,p0 = r0,r0
-;;
-}
-
-
-{ .mmi
-(p7) ldfe atan_Q20 = [atan_GR_AD_Q],16
-(p6) ldfe atan_P7 = [atan_GR_AD_P],16
- nop.i 999
+ ldfe atan2_P20 = [EXP_AD_P2],16
+ fma.s1 atan2_wp_Y = atan2_u1_Y, atan2_u1_Y, f0
+ nop.i 999
;;
}
{ .mfi
-(p7) ldfe atan_Q0 = [atan_GR_AD_Q],16
-(p6) fma.s1 atan_V13 = atan_Y, atan_P11, atan_P10
- nop.i 999
+ ldfe atan2_P9 = [EXP_AD_P1],16
+ fma.s1 atan2_z1_X = atan2_u1_X, atan2_Y, f0
+ nop.i 999
}
{ .mfi
-(p6) ldfe atan_P16 = [atan_GR_AD_P],16
-(p7) fma.s1 atan_V11 = atan_Y, atan_Q9, atan_Q8
- nop.i 999 ;;
+ ldfe atan2_P19 = [EXP_AD_P2],16
+ fnma.s1 atan2_B1X = atan2_u1_X, atan2_X, atan2_two
+ nop.i 999
}
-
+;;
{ .mfi
-(p7) ldfe atan_Q1 = [atan_GR_AD_Q],16
-(p7) fma.s1 atan_G12 = atan_B, atan_B, f0
- nop.i 999
+ ldfe atan2_P8 = [EXP_AD_P1],16
+ fma.s1 atan2_z2_X = atan2_u1_X, atan2_ysq, f0
+ nop.i 999
}
-{ .mfi
-(p6) ldfe atan_P17 = [atan_GR_AD_P],16
-(p0) fma.s1 atan_V9 = atan_V12, atan_V12, f0
- nop.i 999 ;;
+{ .mfb
+ ldfe atan2_P18 = [EXP_AD_P2],16
+(p10) fma.d.s0 f8 = atan2_Y,atan2_X,f0 // If y=nan, result quietized y
+(p10) br.ret.spnt b0 // Exit if y=nan
}
+;;
-
+// p6 true if swap, means |y| > 1.0 or ysq > 1.0
+// p7 true if no swap, means 1.0 >= |y| or 1.0 >= ysq
{ .mfi
-(p7) ldfe atan_Q2 = [atan_GR_AD_Q],16
-(p6) fma.s1 atan_W11 = atan_Y, atan_P21, atan_P20
- nop.i 999
+ ldfe atan2_P7 = [EXP_AD_P1],16
+ fcmp.ge.s1 p7,p6 = f1, atan2_ysq
+ nop.i 999
}
-{ .mfi
-(p6) ldfe atan_P18 = [atan_GR_AD_P],16
-(p7) fma.s1 atan_V7 = atan_Y, atan_Q5, atan_Q4
- nop.i 999 ;;
+{ .mmf
+ ldfe atan2_P17 = [EXP_AD_P2],16
+ nop.m 999
+ nop.f 999
}
+;;
{ .mfi
-(p7) ldfe atan_Q3 = [atan_GR_AD_Q],16
-(p7) fma.s1 atan_Z1 = atan_Z, atan_Z, f0
- nop.i 999
+ ldfe atan2_P6 = [EXP_AD_P1],16
+ fma.s1 atan2_E = atan2_u1_Y, atan2_B1Y, atan2_Y
+ nop.i 999
}
{ .mfi
-(p6) ldfe atan_P19 = [atan_GR_AD_P],16
-(p7) fma.s1 atan_Y3 = atan_Y , atan_V12, f0
- nop.i 999 ;;
+ ldfe atan2_P16 = [EXP_AD_P2],16
+ fma.s1 atan2_B1sq_Y = atan2_B1Y, atan2_B1Y, f0
+ nop.i 999
+;;
}
{ .mfi
-(p7) ldfe atan_R8 = [atan_GR_AD_R],16
-(p6) fma.s1 atan_V11 = atan_Y, atan_P9, atan_P8
- nop.i 999
+ ldfe atan2_P5 = [EXP_AD_P1],16
+(p7) fma.s1 atan2_wp_X = atan2_z1_X, atan2_z1_X, f0
+ nop.i 999
}
{ .mfi
-(p6) ldfe atan_P12 = [atan_GR_AD_P],16
-(p7) fma.s1 atan_V8 = atan_Y, atan_Q7, atan_Q6
- nop.i 999 ;;
-}
-
-{ .mmi
-(p7) ldfe atan_R9 = [atan_GR_AD_R],16
-(p6) ldfe atan_P13 = [atan_GR_AD_P],16
- nop.i 999
+ ldfe atan2_P15 = [EXP_AD_P2],16
+(p7) fma.s1 atan2_B1sq_X = atan2_B1X, atan2_B1X, f0
+ nop.i 999
;;
}
{ .mfi
-(p7) ldfe atan_R4 = [atan_GR_AD_R],16
-(p6) fma.s1 atan_V7 = atan_Y, atan_P5, atan_P4
- nop.i 999
+ ldfe atan2_P4 = [EXP_AD_P1],16
+(p6) fma.s1 atan2_z = atan2_u1_Y, atan2_B1Y, f0
+ nop.i 999
}
{ .mfi
-(p6) ldfe atan_P14 = [atan_GR_AD_P],16
-(p7) fma.s1 atan_W13 = atan_Y, atan_Q22, atan_Q21
- nop.i 999 ;;
+ ldfe atan2_P14 = [EXP_AD_P2],16
+(p7) fma.s1 atan2_E = atan2_z2_X, atan2_B1X, atan2_X
+ nop.i 999
+;;
}
{ .mfi
-(p7) ldfe atan_R5 = [atan_GR_AD_R],16
-(p6) fma.s1 atan_Y12 = atan_V9 , atan_V9 , f0
- nop.i 999
+ ldfe atan2_P3 = [EXP_AD_P1],16
+ fcmp.eq.s0 p14,p15=atan2_X,atan2_Y // Dummy for denorm and invalid
+ nop.i 999
}
-{ .mfi
-(p6) ldfe atan_P15 = [atan_GR_AD_P],16
-(p7) fma.s1 atan_Y8 = atan_V9 , atan_V9 , f0
- nop.i 999 ;;
+{ .mmf
+ ldfe atan2_P13 = [EXP_AD_P2],16
+ nop.m 999
+(p7) fma.s1 atan2_z = atan2_z1_X, atan2_B1X, f0
+;;
}
-
{ .mfi
-(p7) ldfe atan_R6 = [atan_GR_AD_R],16
-(p6) fma.s1 atan_V8 = atan_Y, atan_P7, atan_P6
- nop.i 999
+ ldfe atan2_P2 = [EXP_AD_P1],16
+(p6) fma.s1 atan2_w = atan2_wp_Y, atan2_B1sq_Y,f0
+ nop.i 999
}
-{ .mfi
-(p6) ldfe atan_P0 = [atan_GR_AD_P],16
-(p7) fma.s1 atan_W11 = atan_Y, atan_Q20, atan_Q19
- nop.i 999 ;;
+{ .mlx
+ ldfe atan2_P12 = [EXP_AD_P2],16
+ movl rsig_near_one = 0x8000000000000001 // signif near 1.0
+;;
}
-
{ .mfi
-(p7) ldfe atan_R7 = [atan_GR_AD_R],16
-(p7) fma.s1 atan_Z2 = atan_Z1 , atan_Z1, f0
- nop.i 999
+ ldfe atan2_P1 = [EXP_AD_P1],16
+ fclass.m p9,p0 = atan2_Y, 0x23 // test if y inf
+ nop.i 999
}
{ .mfi
-(p6) ldfe atan_P1 = [atan_GR_AD_P],16
-(p6) fma.s1 atan_V10 = atan_V12, atan_V13, atan_V11
- nop.i 999 ;;
+ ldfe atan2_P22 = [EXP_AD_P2],16
+(p7) fma.s1 atan2_w = atan2_wp_X, atan2_B1sq_X,f0
+ nop.i 999
+;;
}
{ .mfi
-(p7) ldfe atan_Q15 = [atan_GR_AD_Q],16
-(p6) fma.s1 atan_W7 = atan_Y, atan_P17, atan_P16
- nop.i 999
+ ldfe atan2_P0 = [EXP_AD_P1],16
+ frcpa.s1 atan2_F,p0 = f1, atan2_E
+ nop.i 999
}
{ .mfi
-(p6) ldfe atan_P2 = [atan_GR_AD_P],16
-(p7) fma.s1 atan_V3 = atan_Y, atan_Q1 , atan_Q0
- nop.i 999 ;;
+ ldfe atan2_pi_by_2 = [EXP_AD_P2],16
+(p6) fnma.s1 atan2_gV = atan2_Y, atan2_z, atan2_X
+ nop.i 999
+;;
}
{ .mfi
-(p7) ldfe atan_Q16 = [atan_GR_AD_Q],16
-(p7) fma.s1 atan_G9 = atan_G12, atan_G12, f0
- nop.i 999
+ setf.sig atan2_sig_near_one = rsig_near_one
+(p7) fnma.s1 atan2_gV = atan2_X, atan2_z, atan2_Y
+ nop.i 999
}
-{ .mfi
-(p6) ldfe atan_P3 = [atan_GR_AD_P],16
-(p7) fma.s1 atan_V6 = atan_V12, atan_V8, atan_V7
- nop.i 999 ;;
+{ .mfb
+ nop.m 999
+(p9) fma.d.s0 f8 = atan2_sgnY, atan2_pi_by_2, f0 // +-pi/2 if y inf
+(p9) br.ret.spnt b0 // exit if y inf, result is +-pi/2
+;;
}
-
{ .mfi
-(p7) ldfe atan_R1 = [atan_GR_AD_R],16
-(p6) fma.s1 atan_W8 = atan_Y, atan_P19, atan_P18
- nop.i 999
+ nop.m 999
+ fma.s1 atan2_V13 = atan2_w, atan2_P11, atan2_P10
+ nop.i 999
}
{ .mfi
-(p6) ldfe atan_P22 = [atan_GR_AD_P],16
-(p7) fma.s1 atan_V4 = atan_Y, atan_Q3 , atan_Q2
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 atan2_W11 = atan2_w, atan2_P21, atan2_P20
+ nop.i 999
+;;
}
-
{ .mfi
- getf.exp atan_GR_signexp_f8 = atan_NORM_f8
-(p7) fma.s1 atan_Y11 = atan_Y3 , atan_Y8 , f0
- nop.i 999
+ nop.m 999
+ fma.s1 atan2_V11 = atan2_w, atan2_P9, atan2_P8
+ nop.i 999
}
{ .mfi
-(p7) ldfe atan_Q17 = [atan_GR_AD_Q],16
-(p6) fma.s1 atan_V6 = atan_V12, atan_V8, atan_V7
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 atan2_V12 = atan2_w, atan2_w, f0
+ nop.i 999
+;;
}
-
{ .mfi
-(p7) ldfe atan_Q18 = [atan_GR_AD_Q],16
-(p6) fma.s1 atan_W3 = atan_Y, atan_P13, atan_P12
- nop.i 999
+ nop.m 999
+ fma.s1 atan2_V8 = atan2_w, atan2_P7 , atan2_P6
+ nop.i 999
}
{ .mfi
-(p7) ldfe atan_R10 = [atan_GR_AD_R],16
-(p7) fma.s1 atan_G11 = atan_B, atan_R9 , atan_R8
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 atan2_W8 = atan2_w, atan2_P19, atan2_P18
+ nop.i 999
+;;
}
-
{ .mfi
-(p7) ldfe atan_Q10 = [atan_GR_AD_Q],16
-(p7) fma.s1 atan_Z3 = atan_Z1 , atan_Z2 , f0
-(p0) and atan_GR_exp_f8 = atan_GR_signexp_f8,atan_GR_exp_mask
+ nop.m 999
+ fnma.s1 atan2_alpha = atan2_E, atan2_F, f1
+ nop.i 999
}
{ .mfi
-(p7) ldfe atan_R2 = [atan_GR_AD_R],16
-(p7) fma.s1 atan_Z4 = atan_Z2 , atan_Z2 , f0
- nop.i 999 ;;
+ nop.m 999
+ fnma.s1 atan2_alpha_1 = atan2_E, atan2_F, atan2_two
+ nop.i 999
+;;
}
{ .mfi
-(p7) ldfe atan_Q11 = [atan_GR_AD_Q],16
-(p6) fma.s1 atan_W4 = atan_Y, atan_P15, atan_P14
- nop.i 999
+ nop.m 999
+ fma.s1 atan2_V7 = atan2_w, atan2_P5 , atan2_P4
+ nop.i 999
}
{ .mfi
-(p7) ldfe atan_R3 = [atan_GR_AD_R],16
-(p7) fma.s1 atan_G7 = atan_B, atan_R5 , atan_R4
-(p0) cmp.le.unc p11,p0 = atan_GR_10172,atan_GR_exp_f8
-;;
-}
-
-
-{ .mmf
-(p9) ldfe atan_Q12 = [atan_GR_AD_Q],16
-(p0) ldfe atan_S_PI = [atan_GR_AD_R],16
-(p8) fma.s1 atan_W6 = atan_V12, atan_W8, atan_W7
+ nop.m 999
+ fma.s1 atan2_W7 = atan2_w, atan2_P17, atan2_P16
+ nop.i 999
;;
}
-
-
{ .mfi
-(p9) ldfe atan_Q13 = [atan_GR_AD_Q],16
-(p8) fma.s1 atan_V3 = atan_Y, atan_P1 , atan_P0
-(p11) cmp.ne.and p6,p7 = r0,r0
+ nop.m 999
+ fma.s1 atan2_V4 = atan2_w, atan2_P3 , atan2_P2
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p8) fma.s1 atan_V5 = atan_V9 , atan_V10, atan_V6
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 atan2_W4 = atan2_w, atan2_P15, atan2_P14
+ nop.i 999
+;;
}
-
-.pred.rel "mutex",p6,p7,p11
{ .mfi
-(p7) ldfe atan_Q14 = [atan_GR_AD_Q],16
-(p6) fma.s1 atan_Y12 = atan_V9 , atan_Y12, f0
- nop.i 999
+ nop.m 999
+ fma.s1 atan2_V3 = atan2_w, atan2_P1 , atan2_P0
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_G8 = atan_B, atan_R7 , atan_R6
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 atan2_W3 = atan2_w, atan2_P13, atan2_P12
+ nop.i 999
+;;
}
-
{ .mfi
- nop.m 999
-(p6) fma.s1 atan_V4 = atan_Y, atan_P3 , atan_P2
- nop.i 999
+ nop.m 999
+ fma.s1 atan2_V10 = atan2_V12, atan2_V13, atan2_V11
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_W7 = atan_Y, atan_Q16, atan_Q15
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 atan2_gVF = atan2_gV, atan2_F, f0
+ nop.i 999
+;;
}
-
{ .mfi
- nop.m 999
-(p6) fma.s1 atan_W10 = atan_V12, atan_P22, atan_W11
- nop.i 999
+ nop.m 999
+ fma.s1 atan2_alpha_sq = atan2_alpha, atan2_alpha, f0
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_G3 = atan_B, atan_R1 , f1
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 atan2_Cp = atan2_alpha, atan2_alpha_1, f1
+ nop.i 999
+;;
}
-
{ .mfi
- nop.m 999
-(p6) fma.s1 atan_W2 = atan_V12, atan_W4 , atan_W3
- nop.i 999
+ nop.m 999
+ fma.s1 atan2_V9 = atan2_V12, atan2_V12, f0
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_V2 = atan_V12, atan_V4 , atan_V3
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 atan2_W10 = atan2_V12, atan2_P22 , atan2_W11
+ nop.i 999
+;;
}
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_W8 = atan_Y, atan_Q18, atan_Q17
- nop.i 999
+ nop.m 999
+ fma.s1 atan2_V6 = atan2_V12, atan2_V8 , atan2_V7
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_G10 = atan_G12, atan_R10, atan_G11
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 atan2_W6 = atan2_V12, atan2_W8 , atan2_W7
+ nop.i 999
+;;
}
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_V10 = atan_V12, atan_Q10, atan_V11
- nop.i 999
+ nop.m 999
+ fma.s1 atan2_V2 = atan2_V12, atan2_V4 , atan2_V3
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_G6 = atan_G12, atan_G8 , atan_G7
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 atan2_W2 = atan2_V12, atan2_W4 , atan2_W3
+ nop.i 999
+;;
}
-
{ .mfi
- nop.m 999
-(p6) fma.s1 atan_V2 = atan_V12, atan_V4, atan_V3
- nop.i 999
+ nop.m 999
+ fma.s1 atan2_alpha_cub = atan2_alpha, atan2_alpha_sq, f0
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_G4 = atan_B , atan_R3 , atan_R2
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 atan2_C = atan2_gVF, atan2_Cp, f0
+ nop.i 999
+;;
}
-
{ .mfi
- nop.m 999
-(p6) fma.s1 atan_W5 = atan_V9 , atan_W10, atan_W6
- nop.i 999
-}
-{ .mfi
- nop.m 999
-(p7) fma.s1 atan_W3 = atan_Y , atan_Q12, atan_Q11
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 atan2_W12 = atan2_V9, atan2_V9, f0
+ nop.i 999
+;;
}
-
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_Z5 = atan_Z3 , atan_Z4 , f0
- nop.i 999
+ nop.m 999
+ fma.s1 atan2_V5 = atan2_V9, atan2_V10, atan2_V6
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_W10 = atan_V12, atan_W13, atan_W11
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 atan2_W5 = atan2_V9, atan2_W10, atan2_W6
+ nop.i 999
+;;
}
-
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_W4 = atan_Y , atan_Q14, atan_Q13
- nop.i 999
+ nop.m 999
+ fclass.m p8,p0 = atan2_Y, 0x07 // Test for y=0
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_W6 = atan_V12, atan_W8, atan_W7
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 atan2_d = atan2_alpha_cub, atan2_C, atan2_C
+ nop.i 999
}
+;;
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_V5 = atan_V9 , atan_V10, atan_V6
- nop.i 999
+ nop.m 999
+ fma.s1 atan2_W12 = atan2_V9, atan2_W12, f0
+ nop.i 999
}
-{ .mfi
- nop.m 999
-(p7) fma.s1 atan_G5 = atan_G9 , atan_G10, atan_G6
- nop.i 999 ;;
-}
-
+;;
{ .mfi
- nop.m 999
-(p6) fma.s1 atan_V1 = atan_V9 , atan_V5 , atan_V2
- nop.i 999
+ nop.m 999
+ fma.s1 atan2_V1 = atan2_V9, atan2_V5, atan2_V2
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_G2 = atan_G12, atan_G4 , atan_G3
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 atan2_W1 = atan2_V9, atan2_W5, atan2_W2
+ nop.i 999
+;;
}
-
-{ .mfi
- nop.m 999
-(p6) fma.s1 atan_W1 = atan_V9 , atan_W5 , atan_W2
- nop.i 999
-}
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_Z6 = atan_Z4 , atan_C , f0
- nop.i 999 ;;
+ nop.m 999
+(p8) fmerge.s f8 = atan2_sgnY, f0 // +-0 if y=0
+ nop.i 999
}
-
-{ .mfi
- nop.m 999
-(p0) fmerge.s atan_S_PI = f8, atan_S_PI
- nop.i 999 ;;
+{ .mfb
+ nop.m 999
+ fma.s1 atan2_zcub = atan2_z, atan2_w, f0
+(p8) br.ret.spnt b0 // Exit if y=0
+;;
}
-
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_W5 = atan_V9 , atan_W10, atan_W6
- nop.i 999
+ nop.m 999
+ fma.s1 atan2_pd = atan2_P0, atan2_d, f0
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_W2 = atan_V12, atan_W4 , atan_W3
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 atan2_dsq = atan2_d, atan2_d, f0
+ nop.i 999
+;;
}
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_G1 = atan_G9 , atan_G5 , atan_G2
- nop.i 999
+ nop.m 999
+ fmerge.se atan2_near_one = f1, atan2_sig_near_one // Const ~1.0
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_V1 = atan_V9 , atan_V5 , atan_V2
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 atan2_Pp = atan2_W12, atan2_W1, atan2_V1
+ nop.i 999
+;;
}
-
{ .mfi
- nop.m 999
-(p6) fma.s1 atan_P = atan_Y12, atan_W1 , atan_V1
- nop.i 999
+ nop.m 999
+ fma.s1 atan2_sgn_pi_by_2 = atan2_pi_by_2, atan2_sgnY, f0
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_Z7 = atan_Z5 , atan_Z6 , f0
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 atan2_A_lo = atan2_pd, atan2_dsq, atan2_d
+ nop.i 999
+;;
}
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_W1 = atan_V9 , atan_W5 , atan_W2
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 atan2_A_hi = atan2_zcub, atan2_Pp, atan2_z
+ nop.i 999
+;;
}
{ .mfi
- nop.m 999
-(p11) fma.d.s0 f8 = atan_S_PI,f1,f0
- nop.i 999
+ nop.m 999
+(p6) fma.s1 atan2_A = atan2_A_hi, f1, atan2_A_lo
+ nop.i 999
}
+// For |Y| <= |X| and X > 0, result is A_hi + A_lo
{ .mfi
- nop.m 999
-(p7) fma.s1 atan_Z = atan_G1 , atan_Z7 , f0
- nop.i 999 ;;
-}
-
-
-{ .mfi
- nop.m 999
-(p7) fma.s1 atan_Q = atan_Y11, atan_W1 , atan_V1
- nop.i 999 ;;
+ nop.m 999
+(p7) fma.d.s0 f8 = atan2_A_hi, f1, atan2_A_lo
+ nop.i 999
+;;
}
-
-{ .mfi
- nop.m 999
-(p6) fma.d.s0 f8 = atan_P , atan_Xcub , f8
- nop.i 999
-}
+// For |Y| > |X|, result is +- pi/2 - (A_hi + A_lo)
+// We perturb A by multiplying by 1.0+1ulp as we produce the result
+// in order to get symmetrically rounded results in directed rounding modes.
+// If we don't do this, there are a few cases where the trailing 11 bits of
+// the significand of the result, before converting to double, are zero. These
+// cases do not round symmetrically in round to +infinity or round to -infinity.
{ .mfb
- nop.m 999
-(p7) fnma.d.s0 f8 = atan_Z , atan_Q , atan_S_PI
-(p0) br.ret.sptk b0 ;;
+ nop.m 999
+(p6) fnma.d.s0 f8 = atan2_A, atan2_near_one, atan2_sgn_pi_by_2
+ br.ret.sptk b0
+;;
}
-.endp atan
-ASM_SIZE_DIRECTIVE(atan)
+GLOBAL_LIBM_END(atan)
diff --git a/sysdeps/ia64/fpu/s_atanf.S b/sysdeps/ia64/fpu/s_atanf.S
index b0a68737aa..fb7f4a307e 100644
--- a/sysdeps/ia64/fpu/s_atanf.S
+++ b/sysdeps/ia64/fpu/s_atanf.S
@@ -1,12 +1,10 @@
.file "atanf.s"
-// THIS IS NOT OPTIMIZED AND NOT OFFICIAL
-// Copyright (C) 2000, 2001, Intel Corporation
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -22,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -37,16 +35,18 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
// History
//==============================================================
-// ?/??/00 Initial revision
-// 8/17/00 Changed predicate register macro-usage to direct predicate
+// 02/20/00 Initial version
+// 08/17/00 Changed predicate register macro-usage to direct predicate
// names due to an assembler bug.
-
-#include "libm_support.h"
+// 02/06/02 Corrected .section statement
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/06/03 Reordered header: .section, .global, .proc, .align;
+// added missing bundling
//
// Assembly macros
@@ -140,16 +140,11 @@ atanf_answer = f8
//atanf_pred_GT1 = p7
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+RODATA
.align 16
-atanf_coeff_1_table:
-ASM_TYPE_DIRECTIVE(atanf_coeff_1_table,@object)
+LOCAL_OBJECT_START(atanf_coeff_1_table)
data8 0x40c4c241be751ff2 // r4
data8 0x40e9f300c2f3070b // r5
data8 0x409babffef772075 // r3
@@ -164,12 +159,11 @@ data8 0xbfc2473c5145ee38 // p3
data8 0x3fbc4f512b1865f5 // p4
data8 0x3fc9997e7afbff4e // p2
data8 0x3ff921fb54442d18 // pi/2
-ASM_SIZE_DIRECTIVE(atanf_coeff_1_table)
+LOCAL_OBJECT_END(atanf_coeff_1_table)
-atanf_coeff_2_table:
-ASM_TYPE_DIRECTIVE(atanf_coeff_2_table,@object)
+LOCAL_OBJECT_START(atanf_coeff_2_table)
data8 0x4035000000004284 // r1
data8 0x406cdffff336a59b // r2
data8 0x3fbc4f512b1865f5 // p4 = q6
@@ -182,18 +176,12 @@ data8 0xbfa6e10ba401393f // p7
data8 0x3f97105b4160f86b // p8
data8 0xbf7deaadaa336451 // p9
data8 0x3f522e5d33bc9baa // p10
-ASM_SIZE_DIRECTIVE(atanf_coeff_2_table)
-
-
+LOCAL_OBJECT_END(atanf_coeff_2_table)
-.global atanf
-.text
-.proc atanf
-
-.align 32
-atanf:
+.section .text
+GLOBAL_LIBM_ENTRY(atanf)
{ .mfi
alloc r32 = ar.pfs,1,2,0,0
@@ -325,7 +313,7 @@ atanf:
{ .mfb
nop.m 999
fma.s1 atanf_x5 = atanf_t,atanf_xcub,f0
-(p8) br.cond.spnt L(ATANF_X_INF_NAN_ZERO)
+(p8) br.cond.spnt ATANF_X_INF_NAN_ZERO
}
;;
@@ -487,7 +475,7 @@ atanf:
{ .mfi
nop.m 999
- fma atanf_sgnx_piby2 = atanf_sgn_x,atanf_piby2,f0
+ fma.s0 atanf_sgnx_piby2 = atanf_sgn_x,atanf_piby2,f0
nop.i 999
}
{ .mfi
@@ -530,27 +518,38 @@ atanf:
{ .mfi
nop.m 999
//(atanf_pred_GT1) fnma.s atanf_answer = atanf_poly_q,atanf_z21_poly_r,atanf_sgnx_piby2
-(p7) fnma.s atanf_answer = atanf_poly_q,atanf_z21_poly_r,atanf_sgnx_piby2
+(p7) fnma.s.s0 atanf_answer = atanf_poly_q,atanf_z21_poly_r,atanf_sgnx_piby2
nop.i 999;;
}
{ .mfb
nop.m 999
//(atanf_pred_LE1) fma.s atanf_answer = atanf_x11,atanf_poly_p1,atanf_poly_p4
-(p6) fma.s atanf_answer = atanf_x11,atanf_poly_p1,atanf_poly_p4
+(p6) fma.s.s0 atanf_answer = atanf_x11,atanf_poly_p1,atanf_poly_p4
br.ret.sptk b0
}
-L(ATANF_X_INF_NAN_ZERO):
+ATANF_X_INF_NAN_ZERO:
- fclass.m p8,p9 = f8,0x23 // @inf
+{ .mfi
+ nop.m 0
+ fclass.m p8,p9 = f8,0x23 // @inf
+ nop.i 0
+}
;;
+{ .mfi
+ nop.m 0
(p8) fmerge.s f8 = f8, atanf_piby2
+ nop.i 0
+}
;;
- fnorm.s f8 = f8
+{ .mfb
+ nop.m 0
+ fnorm.s.s0 f8 = f8
br.ret.sptk b0
+}
+;;
-.endp atanf
-ASM_SIZE_DIRECTIVE(atanf)
+GLOBAL_LIBM_END(atanf)
diff --git a/sysdeps/ia64/fpu/s_atanl.S b/sysdeps/ia64/fpu/s_atanl.S
index 28d44c1850..bfd9f458f4 100644
--- a/sysdeps/ia64/fpu/s_atanl.S
+++ b/sysdeps/ia64/fpu/s_atanl.S
@@ -1,10 +1,10 @@
.file "atanl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,41 +35,52 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//
-// *********************************************************************
+//*********************************************************************
//
// History
-// 2/02/00 (hand-optimized)
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 (hand-optimized)
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
+// 03/13/01 Fixed flags when denormal raised on intermediate result
+// 01/08/02 Improved speed.
+// 02/06/02 Corrected .section statement
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align;
+// used data8 for long double table values
//
-// *********************************************************************
+//*********************************************************************
//
// Function: atanl(x) = inverse tangent(x), for double extended x values
-// Function: atan2l(y,x) = atan(y/x), for double extended x values
+// Function: atan2l(y,x) = atan(y/x), for double extended y, x values
+//
+// API
+//
+// long double atanl (long double x)
+// long double atan2l (long double y, long double x)
//
-// *********************************************************************
+//*********************************************************************
//
// Resources Used:
//
// Floating-Point Registers: f8 (Input and Return Value)
-// f9-f15
-// f32-f79
+// f9 (Input for atan2l)
+// f10-f15, f32-f83
//
// General Purpose Registers:
-// r32-r48
-// r49,r50,r51,r52 (Arguments to error support for 0,0 case)
+// r32-r51
+// r49-r52 (Arguments to error support for 0,0 case)
//
// Predicate Registers: p6-p15
//
-// *********************************************************************
+//*********************************************************************
//
// IEEE Special Conditions:
//
-// Denormal fault raised on denormal inputs
+// Denormal fault raised on denormal inputs
// Underflow exceptions may occur
// Special error handling for the y=0 and x=0 case
// Inexact raised when appropriate by algorithm
@@ -92,7 +103,7 @@
// atan2l(+/-Inf, Inf) = +/-pi/4
// atan2l(+/-Inf, -Inf) = +/-3pi/4
//
-// *********************************************************************
+//*********************************************************************
//
// Mathematical Description
// ---------------------------
@@ -108,16 +119,16 @@
//
//
// (Arg_X, Arg_Y) x
-// \
-// \
-// \
-// \
+// \
+// \
+// \
+// \
// \ angle between is ATANL(Arg_Y,Arg_X)
-// \
+// \
// ------------------> X-axis
// Origin
@@ -232,14 +243,14 @@
// z_hi = 2^k * 1.b_1 b_2 b_3 b_4 1
//
// then
-// / \
+// / \
// | (V/U) - z_hi |
// arctan(V/U) = arctan(z_hi) + arctan| -------------- |
// | 1 + (V/U)*z_hi |
// \ /
//
-// / \
+// / \
// | V - z_hi*U |
// = arctan(z_hi) + arctan| -------------- |
@@ -295,7 +306,7 @@
// U := max( |Arg_X|, |Arg_Y| )
// V := min( |Arg_X|, |Arg_Y| )
//
-// execute: frcap E, pred, V, U
+// execute: frcpa E, pred, V, U
// If pred is 0, go to Step 5 for special cases handling.
//
// Step 2. Decide on branch.
@@ -399,7 +410,7 @@
//
// z := V * E ...z approximates V/U to roughly working precision
// zsq := z * z
-// z8 := zsq * zsq; z8 := z8 * z8
+// z4 := zsq * zsq; z8 := z4 * z4
//
// poly1 := P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
// poly2 := zsq*(P_1 + zsq*(P_2 + zsq*P_3))
@@ -438,12 +449,11 @@
//
// Step 5. Special Cases
//
-// If pred is 0 where pred is obtained in
-// frcap E, pred, V, U
+// These are detected early in the function by fclass instructions.
//
-// we are in one of those special cases of 0,+-inf or NaN
+// We are in one of those special cases when X or Y is 0,+-inf or NaN
//
-// If one of U and V is NaN, return U+V (which will generate
+// If one of X and Y is NaN, return X+Y (which will generate
// invalid in case one is a signaling NaN). Otherwise,
// return the Result as described in the table
//
@@ -469,8 +479,6 @@
//
//
-#include "libm_support.h"
-
ArgY_orig = f8
Result = f8
FR_RESULT = f8
@@ -504,6 +512,7 @@ Res_hi = f49
Res_lo = f50
Z = f52
zsq = f53
+z4 = f54
z8 = f54
poly1 = f55
poly2 = f56
@@ -521,8 +530,8 @@ P_5 = f67
P_6 = f68
P_7 = f69
P_8 = f70
-TWO_TO_NEG3 = f71
-U_hold = f72
+U_hold = f71
+TWO_TO_NEG3 = f72
C_hi_hold = f73
E_hold = f74
M = f75
@@ -530,6 +539,11 @@ ArgX_abs = f76
ArgY_abs = f77
Result_lo = f78
A_temp = f79
+FR_temp = f80
+Xsq = f81
+Ysq = f82
+tmp_small = f83
+
GR_SAVE_PFS = r33
GR_SAVE_B0 = r34
GR_SAVE_GP = r35
@@ -545,1415 +559,1399 @@ exp_ArgY = r44
exponent_Q = r45
significand_Q = r46
special = r47
-special1 = r48
+sp_exp_Q = r48
+sp_exp_4sig_Q = r49
+table_base = r50
+int_temp = r51
+
GR_Parameter_X = r49
GR_Parameter_Y = r50
GR_Parameter_RESULT = r51
GR_Parameter_TAG = r52
-int_temp = r52
-
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
-.align 64
-
-Constants_atan:
-ASM_TYPE_DIRECTIVE(Constants_atan,@object)
-data4 0x54442D18, 0x3FF921FB, 0x248D3132, 0x3E000000
-// double pi/2, single lo_pi/2, two**(-3)
-data4 0xAAAAAAA3, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // P_1
-data4 0xCCCC54B2, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // P_2
-data4 0x47E4D0C2, 0x92492492, 0x0000BFFC, 0x00000000 // P_3
-data4 0x58870889, 0xE38E38E0, 0x00003FFB, 0x00000000 // P_4
-data4 0x290149F8, 0xBA2E895B, 0x0000BFFB, 0x00000000 // P_5
-data4 0x250F733D, 0x9D88E6D4, 0x00003FFB, 0x00000000 // P_6
-data4 0xFB8745A0, 0x884E51FF, 0x0000BFFB, 0x00000000 // P_7
-data4 0x394396BD, 0xE1C7412B, 0x00003FFA, 0x00000000 // P_8
-data4 0xAAAAA52F, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // Q_1
-data4 0xC75B60D3, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // Q_2
-data4 0x011F1940, 0x924923AD, 0x0000BFFC, 0x00000000 // Q_3
-data4 0x2A5F89BD, 0xE36F716D, 0x00003FFB, 0x00000000 // Q_4
+GR_temp = r52
+
+RODATA
+.align 16
+
+LOCAL_OBJECT_START(Constants_atan)
+// double pi/2
+data8 0x3FF921FB54442D18
+// single lo_pi/2, two**(-3)
+data4 0x248D3132, 0x3E000000
+data8 0xAAAAAAAAAAAAAAA3, 0xBFFD // P_1
+data8 0xCCCCCCCCCCCC54B2, 0x3FFC // P_2
+data8 0x9249249247E4D0C2, 0xBFFC // P_3
+data8 0xE38E38E058870889, 0x3FFB // P_4
+data8 0xBA2E895B290149F8, 0xBFFB // P_5
+data8 0x9D88E6D4250F733D, 0x3FFB // P_6
+data8 0x884E51FFFB8745A0, 0xBFFB // P_7
+data8 0xE1C7412B394396BD, 0x3FFA // P_8
+data8 0xAAAAAAAAAAAAA52F, 0xBFFD // Q_1
+data8 0xCCCCCCCCC75B60D3, 0x3FFC // Q_2
+data8 0x924923AD011F1940, 0xBFFC // Q_3
+data8 0xE36F716D2A5F89BD, 0x3FFB // Q_4
//
// Entries Tbl_hi (double precision)
// B = 1+Index/16+1/32 Index = 0
// Entries Tbl_lo (single precision)
// B = 1+Index/16+1/32 Index = 0
//
-data4 0xA935BD8E, 0x3FE9A000, 0x23ACA08F, 0x00000000
+data8 0x3FE9A000A935BD8E
+data4 0x23ACA08F, 0x00000000
//
// Entries Tbl_hi (double precision) Index = 0,1,...,15
// B = 2^(-1)*(1+Index/16+1/32)
// Entries Tbl_lo (single precision)
// Index = 0,1,...,15 B = 2^(-1)*(1+Index/16+1/32)
//
-data4 0x7F175A34, 0x3FDE77EB, 0x238729EE, 0x00000000
-data4 0x73C1A40B, 0x3FE0039C, 0x249334DB, 0x00000000
-data4 0x5B5B43DA, 0x3FE0C614, 0x22CBA7D1, 0x00000000
-data4 0x88BE7C13, 0x3FE1835A, 0x246310E7, 0x00000000
-data4 0xE2CC9E6A, 0x3FE23B71, 0x236210E5, 0x00000000
-data4 0x8406CBCA, 0x3FE2EE62, 0x2462EAF5, 0x00000000
-data4 0x1CD41719, 0x3FE39C39, 0x24B73EF3, 0x00000000
-data4 0x5B795B55, 0x3FE44506, 0x24C11260, 0x00000000
-data4 0x5BB6EC04, 0x3FE4E8DE, 0x242519EE, 0x00000000
-data4 0x1F732FBA, 0x3FE587D8, 0x24D4346C, 0x00000000
-data4 0x115D7B8D, 0x3FE6220D, 0x24ED487B, 0x00000000
-data4 0x920B3D98, 0x3FE6B798, 0x2495FF1E, 0x00000000
-data4 0x8FBA8E0F, 0x3FE74897, 0x223D9531, 0x00000000
-data4 0x289FA093, 0x3FE7D528, 0x242B0411, 0x00000000
-data4 0x576CC2C5, 0x3FE85D69, 0x2335B374, 0x00000000
-data4 0xA99CC05D, 0x3FE8E17A, 0x24C27CFB, 0x00000000
+data8 0x3FDE77EB7F175A34
+data4 0x238729EE, 0x00000000
+data8 0x3FE0039C73C1A40B
+data4 0x249334DB, 0x00000000
+data8 0x3FE0C6145B5B43DA
+data4 0x22CBA7D1, 0x00000000
+data8 0x3FE1835A88BE7C13
+data4 0x246310E7, 0x00000000
+data8 0x3FE23B71E2CC9E6A
+data4 0x236210E5, 0x00000000
+data8 0x3FE2EE628406CBCA
+data4 0x2462EAF5, 0x00000000
+data8 0x3FE39C391CD41719
+data4 0x24B73EF3, 0x00000000
+data8 0x3FE445065B795B55
+data4 0x24C11260, 0x00000000
+data8 0x3FE4E8DE5BB6EC04
+data4 0x242519EE, 0x00000000
+data8 0x3FE587D81F732FBA
+data4 0x24D4346C, 0x00000000
+data8 0x3FE6220D115D7B8D
+data4 0x24ED487B, 0x00000000
+data8 0x3FE6B798920B3D98
+data4 0x2495FF1E, 0x00000000
+data8 0x3FE748978FBA8E0F
+data4 0x223D9531, 0x00000000
+data8 0x3FE7D528289FA093
+data4 0x242B0411, 0x00000000
+data8 0x3FE85D69576CC2C5
+data4 0x2335B374, 0x00000000
+data8 0x3FE8E17AA99CC05D
+data4 0x24C27CFB, 0x00000000
//
// Entries Tbl_hi (double precision) Index = 0,1,...,15
// B = 2^(-2)*(1+Index/16+1/32)
// Entries Tbl_lo (single precision)
// Index = 0,1,...,15 B = 2^(-2)*(1+Index/16+1/32)
//
-data4 0x510665B5, 0x3FD025FA, 0x24263482, 0x00000000
-data4 0x362431C9, 0x3FD1151A, 0x242C8DC9, 0x00000000
-data4 0x67E47C95, 0x3FD20255, 0x245CF9BA, 0x00000000
-data4 0x7A823CFE, 0x3FD2ED98, 0x235C892C, 0x00000000
-data4 0x29271134, 0x3FD3D6D1, 0x2389BE52, 0x00000000
-data4 0x586890E6, 0x3FD4BDEE, 0x24436471, 0x00000000
-data4 0x175E0F4E, 0x3FD5A2E0, 0x2389DBD4, 0x00000000
-data4 0x9F5FA6FD, 0x3FD68597, 0x2476D43F, 0x00000000
-data4 0x52817501, 0x3FD76607, 0x24711774, 0x00000000
-data4 0xB8DF95D7, 0x3FD84422, 0x23EBB501, 0x00000000
-data4 0x7CD0C662, 0x3FD91FDE, 0x23883A0C, 0x00000000
-data4 0x66168001, 0x3FD9F930, 0x240DF63F, 0x00000000
-data4 0x5422058B, 0x3FDAD00F, 0x23FE261A, 0x00000000
-data4 0x378624A5, 0x3FDBA473, 0x23A8CD0E, 0x00000000
-data4 0x0AAD71F8, 0x3FDC7655, 0x2422D1D0, 0x00000000
-data4 0xC9EC862B, 0x3FDD45AE, 0x2344A109, 0x00000000
+data8 0x3FD025FA510665B5
+data4 0x24263482, 0x00000000
+data8 0x3FD1151A362431C9
+data4 0x242C8DC9, 0x00000000
+data8 0x3FD2025567E47C95
+data4 0x245CF9BA, 0x00000000
+data8 0x3FD2ED987A823CFE
+data4 0x235C892C, 0x00000000
+data8 0x3FD3D6D129271134
+data4 0x2389BE52, 0x00000000
+data8 0x3FD4BDEE586890E6
+data4 0x24436471, 0x00000000
+data8 0x3FD5A2E0175E0F4E
+data4 0x2389DBD4, 0x00000000
+data8 0x3FD685979F5FA6FD
+data4 0x2476D43F, 0x00000000
+data8 0x3FD7660752817501
+data4 0x24711774, 0x00000000
+data8 0x3FD84422B8DF95D7
+data4 0x23EBB501, 0x00000000
+data8 0x3FD91FDE7CD0C662
+data4 0x23883A0C, 0x00000000
+data8 0x3FD9F93066168001
+data4 0x240DF63F, 0x00000000
+data8 0x3FDAD00F5422058B
+data4 0x23FE261A, 0x00000000
+data8 0x3FDBA473378624A5
+data4 0x23A8CD0E, 0x00000000
+data8 0x3FDC76550AAD71F8
+data4 0x2422D1D0, 0x00000000
+data8 0x3FDD45AEC9EC862B
+data4 0x2344A109, 0x00000000
//
// Entries Tbl_hi (double precision) Index = 0,1,...,15
// B = 2^(-3)*(1+Index/16+1/32)
// Entries Tbl_lo (single precision)
// Index = 0,1,...,15 B = 2^(-3)*(1+Index/16+1/32)
//
-data4 0x84212B3D, 0x3FC068D5, 0x239874B6, 0x00000000
-data4 0x41060850, 0x3FC16465, 0x2335E774, 0x00000000
-data4 0x171A535C, 0x3FC25F6E, 0x233E36BE, 0x00000000
-data4 0xEDEB99A3, 0x3FC359E8, 0x239680A3, 0x00000000
-data4 0xC6092A9E, 0x3FC453CE, 0x230FB29E, 0x00000000
-data4 0xBA11570A, 0x3FC54D18, 0x230C1418, 0x00000000
-data4 0xFFB3AA73, 0x3FC645BF, 0x23F0564A, 0x00000000
-data4 0xE8A7D201, 0x3FC73DBD, 0x23D4A5E1, 0x00000000
-data4 0xE398EBC7, 0x3FC8350B, 0x23D4ADDA, 0x00000000
-data4 0x7D050271, 0x3FC92BA3, 0x23BCB085, 0x00000000
-data4 0x601081A5, 0x3FCA217E, 0x23BC841D, 0x00000000
-data4 0x574D780B, 0x3FCB1696, 0x23CF4A8E, 0x00000000
-data4 0x4D768466, 0x3FCC0AE5, 0x23BECC90, 0x00000000
-data4 0x4E1D5395, 0x3FCCFE65, 0x2323DCD2, 0x00000000
-data4 0x864C9D9D, 0x3FCDF110, 0x23F53F3A, 0x00000000
-data4 0x451D980C, 0x3FCEE2E1, 0x23CCB11F, 0x00000000
-
-data4 0x54442D18, 0x400921FB, 0x33145C07, 0x3CA1A626 // PI two doubles
-data4 0x54442D18, 0x3FF921FB, 0x33145C07, 0x3C91A626 // PI_by_2 two dbles
-data4 0x54442D18, 0x3FE921FB, 0x33145C07, 0x3C81A626 // PI_by_4 two dbles
-data4 0x7F3321D2, 0x4002D97C, 0x4C9E8A0A, 0x3C9A7939 // 3PI_by_4 two dbles
-ASM_SIZE_DIRECTIVE(Constants_atan)
-
-
-.text
-.proc atanl#
-.global atanl#
-.align 64
-
-atanl:
-{ .mfb
- nop.m 999
-(p0) mov ArgX_orig = f1
-(p0) br.cond.sptk atan2l ;;
-}
-.endp atanl
-ASM_SIZE_DIRECTIVE(atanl)
-
-.text
-.proc atan2l#
-.global atan2l#
-#ifdef _LIBC
-.proc __atan2l#
-.global __atan2l#
-.proc __ieee754_atan2l#
-.global __ieee754_atan2l#
-#endif
-.align 64
-
-
-atan2l:
-#ifdef _LIBC
-__atan2l:
-__ieee754_atan2l:
-#endif
-{ .mfi
-alloc r32 = ar.pfs, 0, 17 , 4, 0
-(p0) mov ArgY = ArgY_orig
-}
-{ .mfi
- nop.m 999
-(p0) mov ArgX = ArgX_orig
- nop.i 999
-};;
+data8 0x3FC068D584212B3D
+data4 0x239874B6, 0x00000000
+data8 0x3FC1646541060850
+data4 0x2335E774, 0x00000000
+data8 0x3FC25F6E171A535C
+data4 0x233E36BE, 0x00000000
+data8 0x3FC359E8EDEB99A3
+data4 0x239680A3, 0x00000000
+data8 0x3FC453CEC6092A9E
+data4 0x230FB29E, 0x00000000
+data8 0x3FC54D18BA11570A
+data4 0x230C1418, 0x00000000
+data8 0x3FC645BFFFB3AA73
+data4 0x23F0564A, 0x00000000
+data8 0x3FC73DBDE8A7D201
+data4 0x23D4A5E1, 0x00000000
+data8 0x3FC8350BE398EBC7
+data4 0x23D4ADDA, 0x00000000
+data8 0x3FC92BA37D050271
+data4 0x23BCB085, 0x00000000
+data8 0x3FCA217E601081A5
+data4 0x23BC841D, 0x00000000
+data8 0x3FCB1696574D780B
+data4 0x23CF4A8E, 0x00000000
+data8 0x3FCC0AE54D768466
+data4 0x23BECC90, 0x00000000
+data8 0x3FCCFE654E1D5395
+data4 0x2323DCD2, 0x00000000
+data8 0x3FCDF110864C9D9D
+data4 0x23F53F3A, 0x00000000
+data8 0x3FCEE2E1451D980C
+data4 0x23CCB11F, 0x00000000
+//
+data8 0x400921FB54442D18, 0x3CA1A62633145C07 // PI two doubles
+data8 0x3FF921FB54442D18, 0x3C91A62633145C07 // PI_by_2 two dbles
+data8 0x3FE921FB54442D18, 0x3C81A62633145C07 // PI_by_4 two dbles
+data8 0x4002D97C7F3321D2, 0x3C9A79394C9E8A0A // 3PI_by_4 two dbles
+LOCAL_OBJECT_END(Constants_atan)
+
+
+.section .text
+GLOBAL_IEEE754_ENTRY(atanl)
+
+// Use common code with atan2l after setting x=1.0
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p7,p0 = ArgY_orig, 0x103
- nop.i 999
+ alloc r32 = ar.pfs, 0, 17, 4, 0
+ fma.s1 Ysq = ArgY_orig, ArgY_orig, f0 // Form y*y
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-//
-// Save original input args and load table ptr.
-//
-(p0) fclass.m.unc p6,p0 = ArgX_orig, 0x103
- nop.i 999
-};;
+ addl table_ptr1 = @ltoff(Constants_atan#), gp // Address of table pointer
+ fma.s1 Xsq = f1, f1, f0 // Form x*x
+ nop.i 999
+}
+;;
+
{ .mfi
-(p0) addl table_ptr1 = @ltoff(Constants_atan#), gp
-(p0) fclass.m.unc p0,p9 = ArgY_orig, 0x1FF
- nop.i 999 ;;
+ ld8 table_ptr1 = [table_ptr1] // Get table pointer
+ fnorm.s1 ArgY = ArgY_orig
+ nop.i 999
}
{ .mfi
- ld8 table_ptr1 = [table_ptr1]
-(p0) fclass.m.unc p0,p8 = ArgX_orig, 0x1FF
- nop.i 999
+ nop.m 999
+ fnorm.s1 ArgX = f1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p13,p0 = ArgY_orig, 0x0C3
- nop.i 999 ;;
+ getf.exp sign_X = f1 // Get signexp of x
+ fmerge.s ArgX_abs = f0, f1 // Form |x|
+ nop.i 999
}
{ .mfi
-(p0) fclass.m.unc p12,p0 = ArgX_orig, 0x0C3
- nop.i 999
+ nop.m 999
+ fnorm.s1 ArgX_orig = f1
+ nop.i 999
}
+;;
+{ .mfi
+ getf.exp sign_Y = ArgY_orig // Get signexp of y
+ fmerge.s ArgY_abs = f0, ArgY_orig // Form |y|
+ mov table_base = table_ptr1 // Save base pointer to tables
+}
+;;
-//
-// Check for NatVals.
-// Check for everything - if false, then must be pseudo-zero
-// or pseudo-nan (IA unsupporteds).
-//
-{ .mib
- nop.m 999
- nop.i 999
-(p6) br.cond.spnt L(ATANL_NATVAL) ;;
+{ .mfi
+ ldfd P_hi = [table_ptr1],8 // Load double precision hi part of pi
+ fclass.m p8,p0 = ArgY_orig, 0x1e7 // Test y natval, nan, inf, zero
+ nop.i 999
}
+;;
-{ .mib
- nop.m 999
- nop.i 999
-(p7) br.cond.spnt L(ATANL_NATVAL) ;;
+{ .mfi
+ ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3
+ nop.f 999
+ nop.i 999
}
-{ .mib
-(p0) ldfd P_hi = [table_ptr1],8
- nop.i 999
-(p8) br.cond.spnt L(ATANL_UNSUPPORTED) ;;
+{ .mfi
+ nop.m 999
+ fma.s1 M = f1, f1, f0 // Set M = 1.0
+ nop.i 999
}
-{ .mbb
-(p0) add table_ptr2 = 96, table_ptr1
-(p9) br.cond.spnt L(ATANL_UNSUPPORTED)
+;;
+
//
-// Load double precision high-order part of pi
+// Check for everything - if false, then must be pseudo-zero
+// or pseudo-nan (IA unsupporteds).
//
-(p12) br.cond.spnt L(ATANL_NAN) ;;
-}
{ .mfb
- nop.m 999
-(p0) fnorm.s1 ArgX = ArgX
-(p13) br.cond.spnt L(ATANL_NAN) ;;
-}
-//
-// Normalize the input argument.
-// Branch out if NaN inputs
-//
-{ .mmf
-(p0) ldfs P_lo = [table_ptr1], 4
- nop.m 999
-(p0) fnorm.s1 ArgY = ArgY ;;
+ nop.m 999
+ fclass.m p0,p12 = f1, 0x1FF // Test x unsupported
+(p8) br.cond.spnt ATANL_Y_SPECIAL // Branch if y natval, nan, inf, zero
}
-{ .mmf
- nop.m 999
-(p0) ldfs TWO_TO_NEG3 = [table_ptr1], 180
-//
+;;
+
// U = max(ArgX_abs,ArgY_abs)
// V = min(ArgX_abs,ArgY_abs)
-// if PR1, swap = 0
-// if PR2, swap = 1
-//
-(p0) mov M = f1 ;;
-}
{ .mfi
- nop.m 999
-//
-// Get exp and sign of ArgX
-// Get exp and sign of ArgY
-// Load 2**(-3) and increment ptr to Q_4.
-//
-(p0) fmerge.s ArgX_abs = f1, ArgX
- nop.i 999 ;;
+ nop.m 999
+ fcmp.ge.s1 p6,p7 = Xsq, Ysq // Test for |x| >= |y| using squares
+ nop.i 999
}
-//
-// load single precision low-order part of pi = P_lo
-//
+{ .mfb
+ nop.m 999
+ fma.s1 V = ArgX_abs, f1, f0 // Set V assuming |x| < |y|
+ br.cond.sptk ATANL_COMMON // Branch to common code
+}
+;;
+
+GLOBAL_IEEE754_END(atanl)
+GLOBAL_IEEE754_ENTRY(atan2l)
+
{ .mfi
-(p0) getf.exp sign_X = ArgX
-(p0) fmerge.s ArgY_abs = f1, ArgY
- nop.i 999 ;;
+ alloc r32 = ar.pfs, 0, 17, 4, 0
+ fma.s1 Ysq = ArgY_orig, ArgY_orig, f0 // Form y*y
+ nop.i 999
}
-{ .mii
-(p0) getf.exp sign_Y = ArgY
- nop.i 999 ;;
-(p0) shr sign_X = sign_X, 17 ;;
+{ .mfi
+ addl table_ptr1 = @ltoff(Constants_atan#), gp // Address of table pointer
+ fma.s1 Xsq = ArgX_orig, ArgX_orig, f0 // Form x*x
+ nop.i 999
}
-{ .mii
- nop.m 999
-(p0) shr sign_Y = sign_Y, 17 ;;
-(p0) cmp.eq.unc p8, p9 = 0x00000, sign_Y ;;
+;;
+
+{ .mfi
+ ld8 table_ptr1 = [table_ptr1] // Get table pointer
+ fnorm.s1 ArgY = ArgY_orig
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Is ArgX_abs >= ArgY_abs
-// Is sign_Y == 0?
-//
-(p0) fmax.s1 U = ArgX_abs, ArgY_abs
- nop.i 999
+ nop.m 999
+ fnorm.s1 ArgX = ArgX_orig
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// ArgX_abs = |ArgX|
-// ArgY_abs = |ArgY|
-// sign_X is sign bit of ArgX
-// sign_Y is sign bit of ArgY
-//
-(p0) fcmp.ge.s1 p6, p7 = ArgX_abs, ArgY_abs
- nop.i 999 ;;
+ getf.exp sign_X = ArgX_orig // Get signexp of x
+ fmerge.s ArgX_abs = f0, ArgX_orig // Form |x|
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fmin.s1 V = ArgX_abs, ArgY_abs
- nop.i 999 ;;
+ getf.exp sign_Y = ArgY_orig // Get signexp of y
+ fmerge.s ArgY_abs = f0, ArgY_orig // Form |y|
+ mov table_base = table_ptr1 // Save base pointer to tables
}
+;;
+
{ .mfi
- nop.m 999
-(p8) fadd.s1 s_Y = f0, f1
-(p6) cmp.eq.unc p10, p11 = 0x00000, sign_X
+ ldfd P_hi = [table_ptr1],8 // Load double precision hi part of pi
+ fclass.m p8,p0 = ArgY_orig, 0x1e7 // Test y natval, nan, inf, zero
+ nop.i 999
}
-{ .mii
-(p6) add swap = r0, r0
- nop.i 999 ;;
-(p7) add swap = 1, r0
+;;
+
+{ .mfi
+ ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3
+ fclass.m p9,p0 = ArgX_orig, 0x1e7 // Test x natval, nan, inf, zero
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
+ fma.s1 M = f1, f1, f0 // Set M = 1.0
+ nop.i 999
+}
+;;
+
//
-// Let M = 1.0
-// if p8, s_Y = 1.0
-// if p9, s_Y = -1.0
+// Check for everything - if false, then must be pseudo-zero
+// or pseudo-nan (IA unsupporteds).
//
-(p10) fsub.s1 M = M, f1
- nop.i 999 ;;
+{ .mfb
+ nop.m 999
+ fclass.m p0,p12 = ArgX_orig, 0x1FF // Test x unsupported
+(p8) br.cond.spnt ATANL_Y_SPECIAL // Branch if y natval, nan, inf, zero
}
+;;
+
+// U = max(ArgX_abs,ArgY_abs)
+// V = min(ArgX_abs,ArgY_abs)
{ .mfi
- nop.m 999
-(p9) fsub.s1 s_Y = f0, f1
- nop.i 999 ;;
+ nop.m 999
+ fcmp.ge.s1 p6,p7 = Xsq, Ysq // Test for |x| >= |y| using squares
+ nop.i 999
}
+{ .mfb
+ nop.m 999
+ fma.s1 V = ArgX_abs, f1, f0 // Set V assuming |x| < |y|
+(p9) br.cond.spnt ATANL_X_SPECIAL // Branch if x natval, nan, inf, zero
+}
+;;
+
+// Now common code for atanl and atan2l
+ATANL_COMMON:
{ .mfi
- nop.m 999
-(p0) frcpa.s1 E, p6 = V, U
- nop.i 999 ;;
+ nop.m 999
+ fclass.m p0,p13 = ArgY_orig, 0x1FF // Test y unsupported
+ shr sign_X = sign_X, 17 // Get sign bit of x
+}
+{ .mfi
+ nop.m 999
+ fma.s1 U = ArgY_abs, f1, f0 // Set U assuming |x| < |y|
+ adds table_ptr1 = 176, table_ptr1 // Point to Q4
}
-{ .mbb
- nop.m 999
+;;
+
+{ .mfi
+(p6) add swap = r0, r0 // Set swap=0 if |x| >= |y|
+(p6) frcpa.s1 E, p0 = ArgY_abs, ArgX_abs // Compute E if |x| >= |y|
+ shr sign_Y = sign_Y, 17 // Get sign bit of y
+}
+{ .mfb
+ nop.m 999
+(p6) fma.s1 V = ArgY_abs, f1, f0 // Set V if |x| >= |y|
+(p12) br.cond.spnt ATANL_UNSUPPORTED // Branch if x unsupported
+}
+;;
+
+// Set p8 if y >=0
+// Set p9 if y < 0
+// Set p10 if |x| >= |y| and x >=0
+// Set p11 if |x| >= |y| and x < 0
+{ .mfi
+ cmp.eq p8, p9 = 0, sign_Y // Test for y >= 0
+(p7) frcpa.s1 E, p0 = ArgX_abs, ArgY_abs // Compute E if |x| < |y|
+(p7) add swap = 1, r0 // Set swap=1 if |x| < |y|
+}
+{ .mfb
+(p6) cmp.eq.unc p10, p11 = 0, sign_X // If |x| >= |y|, test for x >= 0
+(p6) fma.s1 U = ArgX_abs, f1, f0 // Set U if |x| >= |y|
+(p13) br.cond.spnt ATANL_UNSUPPORTED // Branch if y unsupported
+}
+;;
+
//
-// E = frcpa(V,U)
+// if p8, s_Y = 1.0
+// if p9, s_Y = -1.0
//
-(p6) br.cond.sptk L(ATANL_STEP2)
-(p0) br.cond.spnt L(ATANL_SPECIAL_HANDLING) ;;
+.pred.rel "mutex",p8,p9
+{ .mfi
+ nop.m 999
+(p8) fadd.s1 s_Y = f0, f1 // If y >= 0 set s_Y = 1.0
+ nop.i 999
}
-L(ATANL_STEP2):
{ .mfi
- nop.m 999
-(p0) fmpy.s1 Q = E, V
- nop.i 999
+ nop.m 999
+(p9) fsub.s1 s_Y = f0, f1 // If y < 0 set s_Y = -1.0
+ nop.i 999
}
+;;
+
+.pred.rel "mutex",p10,p11
{ .mfi
- nop.m 999
-(p0) fcmp.eq.s0 p0, p9 = f1, ArgY_orig
- nop.i 999 ;;
+ nop.m 999
+(p10) fsub.s1 M = M, f1 // If |x| >= |y| and x >=0, set M=0
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Is Q < 2**(-3)?
-//
-(p0) fcmp.eq.s0 p0, p8 = f1, ArgX_orig
- nop.i 999
+ nop.m 999
+(p11) fadd.s1 M = M, f1 // If |x| >= |y| and x < 0, set M=2.0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p11) fadd.s1 M = M, f1
- nop.i 999 ;;
+ nop.m 999
+ fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig // Dummy to set denormal flag
+ nop.i 999
}
-{ .mlx
- nop.m 999
// *************************************************
// ********************* STEP2 *********************
// *************************************************
-(p0) movl special = 0x8400000000000000
-}
-{ .mlx
- nop.m 999
//
-// lookup = b_1 b_2 b_3 B_4
+// Q = E * V
//
-(p0) movl special1 = 0x0000000000000100 ;;
+{ .mfi
+ nop.m 999
+ fmpy.s1 Q = E, V
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Do fnorms to raise any denormal operand
-// exceptions.
-//
-(p0) fmpy.s1 P_hi = M, P_hi
- nop.i 999
+ nop.m 999
+ fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (1) if POLY path
+ nop.i 999
}
+;;
+
+// Create a single precision representation of the signexp of Q with the
+// 4 most significant bits of the significand followed by a 1 and then 18 0's
{ .mfi
- nop.m 999
-(p0) fmpy.s1 P_lo = M, P_lo
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 P_hi = M, P_hi
+ dep.z special = 0x1, 18, 1 // Form 0x0000000000040000
}
{ .mfi
- nop.m 999
-//
-// Q = E * V
-//
-(p0) fcmp.lt.unc.s1 p6, p7 = Q, TWO_TO_NEG3
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 P_lo = M, P_lo
+ add table_ptr2 = 32, table_ptr1
}
-{ .mmb
-(p0) getf.sig significand_Q = Q
-(p0) getf.exp exponent_Q = Q
- nop.b 999 ;;
+;;
+
+{ .mfi
+ nop.m 999
+ fma.s1 A_temp = Q, f1, f0 // Set A_temp if POLY path
+ nop.i 999
}
-{ .mmi
- nop.m 999 ;;
-(p0) andcm k = 0x0003, exponent_Q
-(p0) extr.u lookup = significand_Q, 59, 4 ;;
+{ .mfi
+ nop.m 999
+ fma.s1 E = E, E_hold, E // E = E + E*E_hold (1) if POLY path
+ nop.i 999
}
-{ .mib
- nop.m 999
-(p0) dep special = lookup, special, 59, 4
+;;
+
//
-// Generate 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
+// Is Q < 2**(-3)?
+// swap = xor(swap,sign_X)
//
-(p6) br.cond.spnt L(ATANL_POLY) ;;
-}
{ .mfi
-(p0) cmp.eq.unc p8, p9 = 0x0000, k
-(p0) fmpy.s1 P_hi = s_Y, P_hi
+ nop.m 999
+ fcmp.lt.s1 p9, p0 = Q, TWO_TO_NEG3 // Test Q < 2^-3
+ xor swap = sign_X, swap
+}
+;;
+
+// P_hi = s_Y * P_hi
+{ .mmf
+ getf.exp exponent_Q = Q // Get signexp of Q
+ cmp.eq.unc p7, p6 = 0x00000, swap
+ fmpy.s1 P_hi = s_Y, P_hi
+}
+;;
+
//
-// We waited a few extra cycles so P_lo and P_hi could be calculated.
-// Load the constant 256 for loading up table entries.
+// if (p6) sigma = -1.0 (swap != 0)
+// if (p7) sigma = 1.0 (swap == 0)
+//
+{ .mfi
+ getf.sig significand_Q = Q // Get significand of Q
+(p6) fsub.s1 sigma = f0, f1
+ nop.i 999
+}
+{ .mfb
+(p9) add table_ptr1 = 128, table_base // Point to P8 if POLY path
+(p7) fadd.s1 sigma = f0, f1
+(p9) br.cond.spnt ATANL_POLY // Branch to POLY if 0 < Q < 2^-3
+}
+;;
+
//
// *************************************************
// ******************** STEP3 **********************
// *************************************************
-(p0) add table_ptr2 = 16, table_ptr1
-}
//
-// Let z_hi have exponent and sign of original Q
-// Load the Tbl_hi(0) else, increment pointer.
+// lookup = b_1 b_2 b_3 b_4
//
-{ .mii
-(p0) ldfe Q_4 = [table_ptr1], -16
-(p0) xor swap = sign_X, swap ;;
-(p9) sub k = k, r0, 1
-}
{ .mmi
-(p0) setf.sig z_hi = special
-(p0) ldfe Q_3 = [table_ptr1], -16
-(p9) add table_ptr2 = 16, table_ptr2 ;;
+ nop.m 999
+ nop.m 999
+ andcm k = 0x0003, exponent_Q // k=0,1,2,3 for exp_Q=0,-1,-2,-3
}
+;;
+
//
-// U_hold = U - U_prime_hi
-// k = k * 256 - Result can be 0, 256, or 512.
+// Generate sign_exp_Q b_1 b_2 b_3 b_4 1 0 0 0 ... 0 in single precision
+// representation. Note sign of Q is always 0.
//
-{ .mmb
-(p0) ldfe Q_2 = [table_ptr1], -16
-(p8) ldfd Tbl_hi = [table_ptr2], 8
- nop.b 999 ;;
+{ .mfi
+ cmp.eq p8, p9 = 0x0000, k // Test k=0
+ nop.f 999
+ extr.u lookup = significand_Q, 59, 4 // Extract b_1 b_2 b_3 b_4 for index
}
-//
-// U_prime_lo = U_hold + V * z_hi
-// lookup -> lookup * 16 + k
-//
-{ .mmi
-(p0) ldfe Q_1 = [table_ptr1], -16 ;;
-(p8) ldfs Tbl_lo = [table_ptr2], 8
-//
-// U_prime_hi = U + V * z_hi
-// Load the Tbl_lo(0)
-//
-(p9) pmpy2.r k = k, special1 ;;
+{ .mfi
+ sub sp_exp_Q = 0x7f, k // Form single prec biased exp of Q
+ nop.f 999
+ sub k = k, r0, 1 // Decrement k
}
-{ .mii
- nop.m 999
- nop.i 999
- nop.i 999 ;;
+;;
+
+// Form pointer to B index table
+{ .mfi
+ ldfe Q_4 = [table_ptr1], -16 // Load Q_4
+ nop.f 999
+(p9) shl k = k, 8 // k = 0, 256, or 512
}
-{ .mii
- nop.m 999
- nop.i 999
- nop.i 999 ;;
+{ .mfi
+(p9) shladd table_ptr2 = lookup, 4, table_ptr2
+ nop.f 999
+ shladd sp_exp_4sig_Q = sp_exp_Q, 4, lookup // Shift and add in 4 high bits
}
-{ .mii
- nop.m 999
- nop.i 999
- nop.i 999 ;;
+;;
+
+{ .mmi
+(p8) add table_ptr2 = -16, table_ptr2 // Pointer if original k was 0
+(p9) add table_ptr2 = k, table_ptr2 // Pointer if k was 1, 2, 3
+ dep special = sp_exp_4sig_Q, special, 19, 13 // Form z_hi as single prec
}
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p9) shladd lookup = lookup, 0x0004, k ;;
+;;
+
+// z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
+{ .mmi
+ ldfd Tbl_hi = [table_ptr2], 8 // Load Tbl_hi from index table
+;;
+ setf.s z_hi = special // Form z_hi
+ nop.i 999
}
{ .mmi
-(p9) add table_ptr2 = table_ptr2, lookup ;;
-//
-// V_prime = V - U * z_hi
-//
-(p9) ldfd Tbl_hi = [table_ptr2], 8
- nop.i 999 ;;
+ ldfs Tbl_lo = [table_ptr2], 8 // Load Tbl_lo from index table
+;;
+ ldfe Q_3 = [table_ptr1], -16 // Load Q_3
+ nop.i 999
}
+;;
+
+{ .mmi
+ ldfe Q_2 = [table_ptr1], -16 // Load Q_2
+ nop.m 999
+ nop.i 999
+}
+;;
+
{ .mmf
- nop.m 999
-//
-// C_hi = frcpa(1,U_prime_hi)
-//
-(p9) ldfs Tbl_lo = [table_ptr2], 8
-//
-// z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
-// Point to beginning of Tbl_hi entries - k = 0.
-//
-(p0) fmerge.se z_hi = Q, z_hi ;;
+ ldfe Q_1 = [table_ptr1], -16 // Load Q_1
+ nop.m 999
+ nop.f 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fma.s1 U_prime_hi = V, z_hi, U
- nop.i 999
+ nop.m 999
+ fma.s1 U_prime_hi = V, z_hi, U // U_prime_hi = U + V * z_hi
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fnma.s1 V_prime = U, z_hi, V
- nop.i 999 ;;
+ nop.m 999
+ fnma.s1 V_prime = U, z_hi, V // V_prime = V - U * z_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) mov A_hi = Tbl_hi
- nop.i 999 ;;
+ nop.m 999
+ mov A_hi = Tbl_hi // Start with A_hi = Tbl_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fsub.s1 U_hold = U, U_prime_hi
- nop.i 999 ;;
+ nop.m 999
+ fsub.s1 U_hold = U, U_prime_hi // U_hold = U - U_prime_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) frcpa.s1 C_hi, p6 = f1, U_prime_hi
- nop.i 999 ;;
+ nop.m 999
+ frcpa.s1 C_hi, p0 = f1, U_prime_hi // C_hi = frcpa(1,U_prime_hi)
+ nop.i 999
}
+;;
+
{ .mfi
-(p0) cmp.eq.unc p7, p6 = 0x00000, swap
-(p0) fmpy.s1 A_hi = s_Y, A_hi
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 A_hi = s_Y, A_hi // A_hi = s_Y * A_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// poly = wsq * poly
-//
-(p7) fadd.s1 sigma = f0, f1
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 U_prime_lo = z_hi, V, U_hold // U_prime_lo = U_hold + V * z_hi
+ nop.i 999
}
+;;
+
+// C_hi_hold = 1 - C_hi * U_prime_hi (1)
{ .mfi
- nop.m 999
-(p0) fma.s1 U_prime_lo = z_hi, V, U_hold
- nop.i 999
+ nop.m 999
+ fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p6) fsub.s1 sigma = f0, f1
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 Res_hi = sigma, A_hi, P_hi // Res_hi = P_hi + sigma * A_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (1)
+ nop.i 999
}
+;;
+
+// C_hi_hold = 1 - C_hi * U_prime_hi (2)
{ .mfi
- nop.m 999
-//
-// A_lo = A_lo + w_hi
-// A_hi = s_Y * A_hi
-//
-(p0) fma.s1 Res_hi = sigma, A_hi, P_hi
- nop.i 999 ;;
+ nop.m 999
+ fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// C_hi_hold = 1 - C_hi * U_prime_hi (1)
-//
-(p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (2)
+ nop.i 999
}
+;;
+
+// C_hi_hold = 1 - C_hi * U_prime_hi (3)
{ .mfi
- nop.m 999
-//
-// C_hi = C_hi + C_hi * C_hi_hold (1)
-//
-(p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
- nop.i 999 ;;
+ nop.m 999
+ fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// C_hi_hold = 1 - C_hi * U_prime_hi (2)
-//
-(p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (3)
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// C_hi = C_hi + C_hi * C_hi_hold (2)
-//
-(p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 w_hi = V_prime, C_hi // w_hi = V_prime * C_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// C_hi_hold = 1 - C_hi * U_prime_hi (3)
-//
-(p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 wsq = w_hi, w_hi // wsq = w_hi * w_hi
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// C_hi = C_hi + C_hi * C_hi_hold (3)
-//
-(p0) fmpy.s1 w_hi = V_prime, C_hi
- nop.i 999 ;;
+ nop.m 999
+ fnma.s1 w_lo = w_hi, U_prime_hi, V_prime // w_lo = V_prime-w_hi*U_prime_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// w_hi = V_prime * C_hi
-//
-(p0) fmpy.s1 wsq = w_hi, w_hi
- nop.i 999
+ nop.m 999
+ fma.s1 poly = wsq, Q_4, Q_3 // poly = Q_3 + wsq * Q_4
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fnma.s1 w_lo = w_hi, U_prime_hi, V_prime
- nop.i 999 ;;
+ nop.m 999
+ fnma.s1 w_lo = w_hi, U_prime_lo, w_lo // w_lo = w_lo - w_hi * U_prime_lo
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// wsq = w_hi * w_hi
-// w_lo = = V_prime - w_hi * U_prime_hi
-//
-(p0) fma.s1 poly = wsq, Q_4, Q_3
- nop.i 999
+ nop.m 999
+ fma.s1 poly = wsq, poly, Q_2 // poly = Q_2 + wsq * poly
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fnma.s1 w_lo = w_hi, U_prime_lo, w_lo
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 w_lo = C_hi, w_lo // w_lo = w_lo * C_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// poly = Q_3 + wsq * Q_4
-// w_lo = = w_lo - w_hi * U_prime_lo
-//
-(p0) fma.s1 poly = wsq, poly, Q_2
- nop.i 999
+ nop.m 999
+ fma.s1 poly = wsq, poly, Q_1 // poly = Q_1 + wsq * poly
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fmpy.s1 w_lo = C_hi, w_lo
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 A_lo = Tbl_lo, w_lo // A_lo = Tbl_lo + w_lo
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// poly = Q_2 + wsq * poly
-// w_lo = = w_lo * C_hi
-//
-(p0) fma.s1 poly = wsq, poly, Q_1
- nop.i 999
+ nop.m 999
+ fmpy.s0 Q_1 = Q_1, Q_1 // Dummy operation to raise inexact
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fadd.s1 A_lo = Tbl_lo, w_lo
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 poly = wsq, poly // poly = wsq * poly
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Result = Res_hi + Res_lo * s_Y (User Supplied Rounding Mode)
-//
-(p0) fmpy.s0 Q_1 = Q_1, Q_1
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 poly = w_hi, poly // poly = w_hi * poly
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// poly = Q_1 + wsq * poly
-// A_lo = Tbl_lo + w_lo
-// swap = xor(swap,sign_X)
-//
-(p0) fmpy.s1 poly = wsq, poly
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 A_lo = A_lo, poly // A_lo = A_lo + poly
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Is (swap) != 0 ?
-// poly = wsq * poly
-// A_hi = Tbl_hi
-//
-(p0) fmpy.s1 poly = w_hi, poly
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 A_lo = A_lo, w_hi // A_lo = A_lo + w_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// if (PR_1) sigma = -1.0
-// if (PR_2) sigma = 1.0
-//
-(p0) fadd.s1 A_lo = A_lo, poly
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 Res_lo = sigma, A_lo, P_lo // Res_lo = P_lo + sigma * A_lo
+ nop.i 999
}
-{ .mfi
- nop.m 999
+;;
+
//
-// P_hi = s_Y * P_hi
-// A_lo = A_lo + poly
+// Result = Res_hi + Res_lo * s_Y (User Supplied Rounding Mode)
//
-(p0) fadd.s1 A_lo = A_lo, w_hi
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p0) fma.s1 Res_lo = sigma, A_lo, P_lo
- nop.i 999 ;;
-}
{ .mfb
- nop.m 999
-//
-// Res_hi = P_hi + sigma * A_hi
-// Res_lo = P_lo + sigma * A_lo
-//
-(p0) fma.s0 Result = Res_lo, s_Y, Res_hi
-//
-// Raise inexact.
-//
-br.ret.sptk b0 ;;
-}
-//
-// poly1 = P_5 + zsq * poly1
-// poly2 = zsq * poly2
-//
-L(ATANL_POLY):
-{ .mmf
-(p0) xor swap = sign_X, swap
- nop.m 999
-(p0) fnma.s1 E_hold = E, U, f1 ;;
+ nop.m 999
+ fma.s0 Result = Res_lo, s_Y, Res_hi
+ br.ret.sptk b0 // Exit table path 2^-3 <= V/U < 1
}
-{ .mfi
- nop.m 999
-(p0) mov A_temp = Q
+;;
+
+
+ATANL_POLY:
+// Here if 0 < V/U < 2^-3
//
-// poly1 = P_4 + zsq * poly1
-// swap = xor(swap,sign_X)
+// ***********************************************
+// ******************** STEP4 ********************
+// ***********************************************
+
//
-// sign_X gr_002
-// swap gr_004
-// poly1 = poly1 <== Done with poly1
-// poly1 = P_4 + zsq * poly1
-// swap = xor(swap,sign_X)
+// Following:
+// Iterate 3 times E = E + E*(1.0 - E*U)
+// Also load P_8, P_7, P_6, P_5, P_4
//
-(p0) cmp.eq.unc p7, p6 = 0x00000, swap
-}
-{ .mfi
- nop.m 999
-(p0) fmpy.s1 P_hi = s_Y, P_hi
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-(p6) fsub.s1 sigma = f0, f1
- nop.i 999
+ ldfe P_8 = [table_ptr1], -16 // Load P_8
+ fnma.s1 z_lo = A_temp, U, V // z_lo = V - A_temp * U
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p7) fadd.s1 sigma = f0, f1
- nop.i 999 ;;
-}
-
-// ***********************************************
-// ******************** STEP4 ********************
-// ***********************************************
-
-{ .mmi
nop.m 999
-(p0) addl table_ptr1 = @ltoff(Constants_atan#), gp
+ fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (2)
nop.i 999
}
;;
{ .mmi
- ld8 table_ptr1 = [table_ptr1]
- nop.m 999
+ ldfe P_7 = [table_ptr1], -16 // Load P_7
+;;
+ ldfe P_6 = [table_ptr1], -16 // Load P_6
nop.i 999
}
;;
-
{ .mfi
- nop.m 999
-(p0) fma.s1 E = E, E_hold, E
-//
-// Following:
-// Iterate 3 times E = E + E*(1.0 - E*U)
-// Also load P_8, P_7, P_6, P_5, P_4
-// E_hold = 1.0 - E * U (1)
-// A_temp = Q
-//
-(p0) add table_ptr1 = 128, table_ptr1 ;;
-}
-{ .mmf
- nop.m 999
-//
-// E = E + E_hold*E (1)
-// Point to P_8.
-//
-(p0) ldfe P_8 = [table_ptr1], -16
-//
-// poly = z8*poly1 + poly2 (Typo in writeup)
-// Is (swap) != 0 ?
-//
-(p0) fnma.s1 z_lo = A_temp, U, V ;;
+ ldfe P_5 = [table_ptr1], -16 // Load P_5
+ fma.s1 E = E, E_hold, E // E = E + E_hold*E (2)
+ nop.i 999
}
-{ .mmb
- nop.m 999
-//
-// E_hold = 1.0 - E * U (2)
-//
-(p0) ldfe P_7 = [table_ptr1], -16
- nop.b 999 ;;
+;;
+
+{ .mmi
+ ldfe P_4 = [table_ptr1], -16 // Load P_4
+;;
+ ldfe P_3 = [table_ptr1], -16 // Load P_3
+ nop.i 999
}
-{ .mmb
- nop.m 999
-//
-// E = E + E_hold*E (2)
-//
-(p0) ldfe P_6 = [table_ptr1], -16
- nop.b 999 ;;
+;;
+
+{ .mfi
+ ldfe P_2 = [table_ptr1], -16 // Load P_2
+ fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (3)
+ nop.i 999
}
-{ .mmb
- nop.m 999
-//
-// E_hold = 1.0 - E * U (3)
-//
-(p0) ldfe P_5 = [table_ptr1], -16
- nop.b 999 ;;
+{ .mlx
+ nop.m 999
+ movl int_temp = 0x24005 // Signexp for small neg number
}
+;;
+
{ .mmf
- nop.m 999
-//
-// E = E + E_hold*E (3)
+ ldfe P_1 = [table_ptr1], -16 // Load P_1
+ setf.exp tmp_small = int_temp // Form small neg number
+ fma.s1 E = E, E_hold, E // E = E + E_hold*E (3)
+}
+;;
+
//
//
// At this point E approximates 1/U to roughly working precision
-// z = V*E approximates V/U
+// Z = V*E approximates V/U
//
-(p0) ldfe P_4 = [table_ptr1], -16
-(p0) fnma.s1 E_hold = E, U, f1 ;;
+{ .mfi
+ nop.m 999
+ fmpy.s1 Z = V, E // Z = V * E
+ nop.i 999
}
-{ .mmb
- nop.m 999
-//
-// Z = V * E
-//
-(p0) ldfe P_3 = [table_ptr1], -16
- nop.b 999 ;;
+{ .mfi
+ nop.m 999
+ fmpy.s1 z_lo = z_lo, E // z_lo = z_lo * E
+ nop.i 999
}
-{ .mmb
- nop.m 999
+;;
+
//
-// zsq = Z * Z
+// Now what we want to do is
+// poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
+// poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3))
//
-(p0) ldfe P_2 = [table_ptr1], -16
- nop.b 999 ;;
-}
-{ .mmb
- nop.m 999
//
-// z8 = zsq * zsq
+// Fixup added to force inexact later -
+// A_hi = A_temp + z_lo
+// z_lo = (A_temp - A_hi) + z_lo
//
-(p0) ldfe P_1 = [table_ptr1], -16
- nop.b 999 ;;
-}
-{ .mlx
- nop.m 999
-(p0) movl int_temp = 0x24005
-}
{ .mfi
- nop.m 999
-(p0) fma.s1 E = E, E_hold, E
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 zsq = Z, Z // zsq = Z * Z
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fnma.s1 E_hold = E, U, f1
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 A_hi = A_temp, z_lo // A_hi = A_temp + z_lo
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fma.s1 E = E, E_hold, E
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 poly1 = zsq, P_8, P_7 // poly1 = P_7 + zsq * P_8
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fmpy.s1 Z = V, E
- nop.i 999
+ nop.m 999
+ fma.s1 poly2 = zsq, P_3, P_2 // poly2 = P_2 + zsq * P_3
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// z_lo = V - A_temp * U
-// if (PR_2) sigma = 1.0
-//
-(p0) fmpy.s1 z_lo = z_lo, E
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 z4 = zsq, zsq // z4 = zsq * zsq
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fmpy.s1 zsq = Z, Z
- nop.i 999
+ nop.m 999
+ fsub.s1 A_temp = A_temp, A_hi // A_temp = A_temp - A_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// z_lo = z_lo * E
-// if (PR_1) sigma = -1.0
-//
-(p0) fadd.s1 A_hi = A_temp, z_lo
- nop.i 999 ;;
+ nop.m 999
+ fmerge.s tmp = A_hi, A_hi // Copy tmp = A_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// z8 = z8 * z8
-//
-//
-// Now what we want to do is
-// poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
-// poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3))
-//
-(p0) fma.s1 poly1 = zsq, P_8, P_7
- nop.i 999
+ nop.m 999
+ fma.s1 poly1 = zsq, poly1, P_6 // poly1 = P_6 + zsq * poly1
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fma.s1 poly2 = zsq, P_3, P_2
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 poly2 = zsq, poly2, P_1 // poly2 = P_1 + zsq * poly2
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fmpy.s1 z8 = zsq, zsq
- nop.i 999
+ nop.m 999
+ fmpy.s1 z8 = z4, z4 // z8 = z4 * z4
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fsub.s1 A_temp = A_temp, A_hi
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 z_lo = A_temp, z_lo // z_lo = (A_temp - A_hi) + z_lo
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// A_lo = Z * poly + z_lo
-//
-(p0) fmerge.s tmp = A_hi, A_hi
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 poly1 = zsq, poly1, P_5 // poly1 = P_5 + zsq * poly1
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// poly1 = P_7 + zsq * P_8
-// poly2 = P_2 + zsq * P_3
-//
-(p0) fma.s1 poly1 = zsq, poly1, P_6
- nop.i 999
+ nop.m 999
+ fmpy.s1 poly2 = poly2, zsq // poly2 = zsq * poly2
+ nop.i 999
}
+;;
+
+// Create small GR double in case need to raise underflow
{ .mfi
- nop.m 999
-(p0) fma.s1 poly2 = zsq, poly2, P_1
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 poly1 = zsq, poly1, P_4 // poly1 = P_4 + zsq * poly1
+ dep GR_temp = -1,r0,0,53
}
+;;
+
+// Create small double in case need to raise underflow
{ .mfi
- nop.m 999
-(p0) fmpy.s1 z8 = z8, z8
- nop.i 999
+ setf.d FR_temp = GR_temp
+ fma.s1 poly = z8, poly1, poly2 // poly = poly2 + z8 * poly1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fadd.s1 z_lo = A_temp, z_lo
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 A_lo = Z, poly, z_lo // A_lo = z_lo + Z * poly
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// poly1 = P_6 + zsq * poly1
-// poly2 = P_2 + zsq * poly2
-//
-(p0) fma.s1 poly1 = zsq, poly1, P_5
- nop.i 999
+ nop.m 999
+ fadd.s1 A_hi = tmp, A_lo // A_hi = tmp + A_lo
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fmpy.s1 poly2 = poly2, zsq
- nop.i 999 ;;
+ nop.m 999
+ fsub.s1 tmp = tmp, A_hi // tmp = tmp - A_hi
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Result = Res_hi + Res_lo (User Supplied Rounding Mode)
-//
-(p0) fmpy.s1 P_5 = P_5, P_5
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 A_hi = s_Y, A_hi // A_hi = s_Y * A_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fma.s1 poly1 = zsq, poly1, P_4
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 A_lo = tmp, A_lo // A_lo = tmp + A_lo
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fma.s1 poly = z8, poly1, poly2
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 Res_hi = sigma, A_hi, P_hi // Res_hi = P_hi + sigma * A_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
+ nop.m 999
+ fsub.s1 tmp = P_hi, Res_hi // tmp = P_hi - Res_hi
+ nop.i 999
+}
+;;
+
//
-// Fixup added to force inexact later -
-// A_hi = A_temp + z_lo
-// z_lo = (A_temp - A_hi) + z_lo
+// Test if A_lo is zero
//
-(p0) fma.s1 A_lo = Z, poly, z_lo
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-(p0) fadd.s1 A_hi = tmp, A_lo
- nop.i 999 ;;
+ nop.m 999
+ fclass.m p6,p0 = A_lo, 0x007 // Test A_lo = 0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fsub.s1 tmp = tmp, A_hi
- nop.i 999
+ nop.m 999
+(p6) mov A_lo = tmp_small // If A_lo zero, make very small
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fmpy.s1 A_hi = s_Y, A_hi
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 tmp = A_hi, sigma, tmp // tmp = sigma * A_hi + tmp
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fadd.s1 A_lo = tmp, A_lo
- nop.i 999
+ nop.m 999
+ fma.s1 sigma = A_lo, sigma, P_lo // sigma = A_lo * sigma + P_lo
+ nop.i 999
}
+;;
+
{ .mfi
-(p0) setf.exp tmp = int_temp
+ nop.m 999
+ fma.s1 Res_lo = s_Y, sigma, tmp // Res_lo = s_Y * sigma + tmp
+ nop.i 999
+}
+;;
+
//
-// P_hi = s_Y * P_hi
-// A_hi = s_Y * A_hi
+// Test if Res_lo is denormal
//
-(p0) fma.s1 Res_hi = sigma, A_hi, P_hi
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p6,p0 = A_lo, 0x007
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-(p6) mov A_lo = tmp
- nop.i 999
+ nop.m 999
+ fclass.m p14, p15 = Res_lo, 0x0b
+ nop.i 999
}
-{ .mfi
- nop.m 999
+;;
+
//
-// Res_hi = P_hi + sigma * A_hi
+// Compute Result = Res_lo + Res_hi. Use s3 if Res_lo is denormal.
//
-(p0) fsub.s1 tmp = P_hi, Res_hi
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-//
-// tmp = P_hi - Res_hi
-//
-(p0) fma.s1 tmp = A_hi, sigma, tmp
- nop.i 999
+ nop.m 999
+(p14) fadd.s3 Result = Res_lo, Res_hi // Result for Res_lo denormal
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fma.s1 sigma = A_lo, sigma, P_lo
- nop.i 999 ;;
+ nop.m 999
+(p15) fadd.s0 Result = Res_lo, Res_hi // Result for Res_lo normal
+ nop.i 999
}
+;;
+
+//
+// If Res_lo is denormal test if Result equals zero
+//
{ .mfi
- nop.m 999
-//
-// tmp = sigma * A_hi + tmp
-// sigma = A_lo * sigma + P_lo
-//
-(p0) fma.s1 Res_lo = s_Y, sigma, tmp
- nop.i 999 ;;
+ nop.m 999
+(p14) fclass.m.unc p14, p0 = Result, 0x07
+ nop.i 999
}
-{ .mfb
- nop.m 999
+;;
+
//
-// Res_lo = s_Y * sigma + tmp
+// If Res_lo is denormal and Result equals zero, raise inexact, underflow
+// by squaring small double
//
-(p0) fadd.s0 Result = Res_lo, Res_hi
-br.ret.sptk b0 ;;
+{ .mfb
+ nop.m 999
+(p14) fmpy.d.s0 FR_temp = FR_temp, FR_temp
+ br.ret.sptk b0 // Exit POLY path, 0 < Q < 2^-3
}
-L(ATANL_NATVAL):
-L(ATANL_UNSUPPORTED):
-L(ATANL_NAN):
+;;
+
+
+ATANL_UNSUPPORTED:
{ .mfb
- nop.m 999
-(p0) fmpy.s0 Result = ArgX,ArgY
-(p0) br.ret.sptk b0 ;;
+ nop.m 999
+ fmpy.s0 Result = ArgX,ArgY
+ br.ret.sptk b0
}
-L(ATANL_SPECIAL_HANDLING):
+;;
+
+// Here if y natval, nan, inf, zero
+ATANL_Y_SPECIAL:
+// Here if x natval, nan, inf, zero
+ATANL_X_SPECIAL:
{ .mfi
- nop.m 999
-(p0) fcmp.eq.s0 p0, p6 = f1, ArgY_orig
- nop.i 999
+ nop.m 999
+ fclass.m p13,p12 = ArgY_orig, 0x0c3 // Test y nan
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fcmp.eq.s0 p0, p5 = f1, ArgX_orig
- nop.i 999 ;;
+ nop.m 999
+ fclass.m p15,p14 = ArgY_orig, 0x103 // Test y natval
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p6, p7 = ArgY, 0x007
- nop.i 999
-}
-{ .mlx
- nop.m 999
-(p0) movl special = 992
+ nop.m 999
+(p12) fclass.m p13,p0 = ArgX_orig, 0x0c3 // Test x nan
+ nop.i 999
}
;;
-
-{ .mmi
+{ .mfi
nop.m 999
-(p0) addl table_ptr1 = @ltoff(Constants_atan#), gp
+(p14) fclass.m p15,p0 = ArgX_orig, 0x103 // Test x natval
nop.i 999
}
;;
-{ .mmi
- ld8 table_ptr1 = [table_ptr1]
+{ .mfb
nop.m 999
- nop.i 999
+(p13) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result nan if x or y nan
+(p13) br.ret.spnt b0 // Exit if x or y nan
+}
+;;
+
+{ .mfb
+ nop.m 999
+(p15) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result natval if x or y natval
+(p15) br.ret.spnt b0 // Exit if x or y natval
}
;;
-{ .mib
-(p0) add table_ptr1 = table_ptr1, special
- nop.i 999
-(p7) br.cond.spnt L(ATANL_ArgY_Not_ZERO) ;;
+// Here if x or y inf or zero
+ATANL_SPECIAL_HANDLING:
+{ .mfi
+ nop.m 999
+ fclass.m p6, p7 = ArgY_orig, 0x007 // Test y zero
+ mov special = 992 // Offset to table
}
+;;
+
+{ .mfb
+ add table_ptr1 = table_base, special // Point to 3pi/4
+ fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig // Dummy to set denormal flag
+(p7) br.cond.spnt ATANL_ArgY_Not_ZERO // Branch if y not zero
+}
+;;
+
+// Here if y zero
{ .mmf
-(p0) ldfd Result = [table_ptr1], 8
- nop.m 999
-(p6) fclass.m.unc p14, p0 = ArgX, 0x035 ;;
+ ldfd Result = [table_ptr1], 8 // Get pi high
+ nop.m 999
+ fclass.m p14, p0 = ArgX, 0x035 // Test for x>=+0
}
+;;
+
{ .mmf
- nop.m 999
-(p0) ldfd Result_lo = [table_ptr1], -8
-(p6) fclass.m.unc p15, p0 = ArgX, 0x036 ;;
+ nop.m 999
+ ldfd Result_lo = [table_ptr1], -8 // Get pi lo
+ fclass.m p15, p0 = ArgX, 0x036 // Test for x<=-0
}
+;;
+
+//
+// Return sign_Y * 0 when ArgX > +0
+//
{ .mfi
- nop.m 999
-(p14) fmerge.s Result = ArgY, f0
- nop.i 999
+ nop.m 999
+(p14) fmerge.s Result = ArgY, f0 // If x>=+0, y=0, hi sgn(y)*0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p6) fclass.m.unc p13, p0 = ArgX, 0x007
- nop.i 999 ;;
+ nop.m 999
+ fclass.m p13, p0 = ArgX, 0x007 // Test for x=0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p14) fmerge.s Result_lo = ArgY, f0
- nop.i 999 ;;
+ nop.m 999
+(p14) fmerge.s Result_lo = ArgY, f0 // If x>=+0, y=0, lo sgn(y)*0
+ nop.i 999
}
+;;
+
{ .mfi
-(p13) mov GR_Parameter_TAG = 36
- nop.f 999
- nop.i 999 ;;
+(p13) mov GR_Parameter_TAG = 36 // Error tag for x=0, y=0
+ nop.f 999
+ nop.i 999
}
-{ .mfi
- nop.m 999
+;;
+
//
-// Return sign_Y * 0 when ArgX > +0
+// Return sign_Y * pi when ArgX < -0
//
-(p15) fmerge.s Result = ArgY, Result
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-(p15) fmerge.s Result_lo = ArgY, Result_lo
- nop.i 999 ;;
+ nop.m 999
+(p15) fmerge.s Result = ArgY, Result // If x<0, y=0, hi=sgn(y)*pi
+ nop.i 999
}
-{ .mfb
- nop.m 999
-//
-// Return sign_Y * 0 when ArgX < -0
-//
-(p0) fadd.s0 Result = Result, Result_lo
-(p13) br.cond.spnt __libm_error_region ;;
+;;
+
+{ .mfi
+ nop.m 999
+(p15) fmerge.s Result_lo = ArgY, Result_lo // If x<0, y=0, lo=sgn(y)*pi
+ nop.i 999
}
-{ .mib
- nop.m 999
- nop.i 999
+;;
+
//
-// Call error support funciton for atan(0,0)
+// Call error support function for atan(0,0)
//
-(p0) br.ret.sptk b0 ;;
-}
-L(ATANL_ArgY_Not_ZERO):
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p9, p10 = ArgY, 0x023
- nop.i 999 ;;
+{ .mfb
+ nop.m 999
+ fadd.s0 Result = Result, Result_lo
+(p13) br.cond.spnt __libm_error_region // Branch if atan(0,0)
}
+;;
+
{ .mib
- nop.m 999
- nop.i 999
-(p10) br.cond.spnt L(ATANL_ArgY_Not_INF) ;;
-}
-{ .mfi
- nop.m 999
-(p9) fclass.m.unc p6, p0 = ArgX, 0x017
- nop.i 999
-}
-{ .mfi
- nop.m 999
-(p9) fclass.m.unc p7, p0 = ArgX, 0x021
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p9) fclass.m.unc p8, p0 = ArgX, 0x022
- nop.i 999 ;;
-}
-{ .mmi
-(p6) add table_ptr1 = 16, table_ptr1 ;;
-(p0) ldfd Result = [table_ptr1], 8
- nop.i 999 ;;
-}
-{ .mfi
-(p0) ldfd Result_lo = [table_ptr1], -8
- nop.f 999
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p6) fmerge.s Result = ArgY, Result
- nop.i 999 ;;
+ nop.m 999
+ nop.i 999
+ br.ret.sptk b0 // Exit for y=0, x not 0
}
+;;
+
+// Here if y not zero
+ATANL_ArgY_Not_ZERO:
{ .mfi
- nop.m 999
-(p6) fmerge.s Result_lo = ArgY, Result_lo
- nop.i 999 ;;
+ nop.m 999
+ fclass.m p0, p10 = ArgY, 0x023 // Test y inf
+ nop.i 999
}
+;;
+
{ .mfb
- nop.m 999
-(p6) fadd.s0 Result = Result, Result_lo
-(p6) br.ret.sptk b0 ;;
+ nop.m 999
+ fclass.m p6, p0 = ArgX, 0x017 // Test for 0 <= |x| < inf
+(p10) br.cond.spnt ATANL_ArgY_Not_INF // Branch if 0 < |y| < inf
}
+;;
+
+// Here if y=inf
//
-// Load PI/2 and adjust its sign.
// Return +PI/2 when ArgY = +Inf and ArgX = +/-0 or normal
// Return -PI/2 when ArgY = -Inf and ArgX = +/-0 or normal
+// Return +PI/4 when ArgY = +Inf and ArgX = +Inf
+// Return -PI/4 when ArgY = -Inf and ArgX = +Inf
+// Return +3PI/4 when ArgY = +Inf and ArgX = -Inf
+// Return -3PI/4 when ArgY = -Inf and ArgX = -Inf
//
-{ .mmi
-(p7) add table_ptr1 = 32, table_ptr1 ;;
-(p7) ldfd Result = [table_ptr1], 8
- nop.i 999 ;;
-}
{ .mfi
-(p7) ldfd Result_lo = [table_ptr1], -8
- nop.f 999
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p7) fmerge.s Result = ArgY, Result
- nop.i 999 ;;
+ nop.m 999
+ fclass.m p7, p0 = ArgX, 0x021 // Test for x=+inf
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p7) fmerge.s Result_lo = ArgY, Result_lo
- nop.i 999 ;;
-}
-{ .mfb
- nop.m 999
-(p7) fadd.s0 Result = Result, Result_lo
-(p7) br.ret.sptk b0 ;;
+(p6) add table_ptr1 = 16, table_ptr1 // Point to pi/2, if x finite
+ fclass.m p8, p0 = ArgX, 0x022 // Test for x=-inf
+ nop.i 999
}
-//
-// Load PI/4 and adjust its sign.
-// Return +PI/4 when ArgY = +Inf and ArgX = +Inf
-// Return -PI/4 when ArgY = -Inf and ArgX = +Inf
-//
+;;
+
{ .mmi
-(p8) add table_ptr1 = 48, table_ptr1 ;;
-(p8) ldfd Result = [table_ptr1], 8
- nop.i 999 ;;
+(p7) add table_ptr1 = 32, table_ptr1 // Point to pi/4 if x=+inf
+;;
+(p8) add table_ptr1 = 48, table_ptr1 // Point to 3pi/4 if x=-inf
+
+ nop.i 999
}
-{ .mfi
-(p8) ldfd Result_lo = [table_ptr1], -8
- nop.f 999
- nop.i 999 ;;
+;;
+
+{ .mmi
+ ldfd Result = [table_ptr1], 8 // Load pi/2, pi/4, or 3pi/4 hi
+;;
+ ldfd Result_lo = [table_ptr1], -8 // Load pi/2, pi/4, or 3pi/4 lo
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p8) fmerge.s Result = ArgY, Result
- nop.i 999 ;;
+ nop.m 999
+ fmerge.s Result = ArgY, Result // Merge sgn(y) in hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p8) fmerge.s Result_lo = ArgY, Result_lo
- nop.i 999 ;;
+ nop.m 999
+ fmerge.s Result_lo = ArgY, Result_lo // Merge sgn(y) in lo
+ nop.i 999
}
+;;
+
{ .mfb
- nop.m 999
-(p8) fadd.s0 Result = Result, Result_lo
-(p8) br.ret.sptk b0 ;;
+ nop.m 999
+ fadd.s0 Result = Result, Result_lo // Compute complete result
+ br.ret.sptk b0 // Exit for y=inf
}
-L(ATANL_ArgY_Not_INF):
-{ .mfi
- nop.m 999
+;;
+
+// Here if y not INF, and x=0 or INF
+ATANL_ArgY_Not_INF:
//
-// Load PI/4 and adjust its sign.
-// Return +3PI/4 when ArgY = +Inf and ArgX = -Inf
-// Return -3PI/4 when ArgY = -Inf and ArgX = -Inf
+// Return +PI/2 when ArgY NOT Inf, ArgY > 0 and ArgX = +/-0
+// Return -PI/2 when ArgY NOT Inf, ArgY < 0 and ArgX = +/-0
+// Return +0 when ArgY NOT Inf, ArgY > 0 and ArgX = +Inf
+// Return -0 when ArgY NOT Inf, ArgY < 0 and ArgX = +Inf
+// Return +PI when ArgY NOT Inf, ArgY > 0 and ArgX = -Inf
+// Return -PI when ArgY NOT Inf, ArgY < 0 and ArgX = -Inf
//
-(p0) fclass.m.unc p6, p0 = ArgX, 0x007
- nop.i 999
-}
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p7, p0 = ArgX, 0x021
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p8, p0 = ArgX, 0x022
- nop.i 999 ;;
-}
-{ .mmi
-(p6) add table_ptr1 = 16, table_ptr1 ;;
-(p6) ldfd Result = [table_ptr1], 8
- nop.i 999 ;;
+ nop.m 999
+ fclass.m p7, p9 = ArgX, 0x021 // Test for x=+inf
+ nop.i 999
}
+;;
+
{ .mfi
-(p6) ldfd Result_lo = [table_ptr1], -8
- nop.f 999
- nop.i 999 ;;
+ nop.m 999
+ fclass.m p6, p0 = ArgX, 0x007 // Test for x=0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p6) fmerge.s Result = ArgY, Result
- nop.i 999 ;;
+(p6) add table_ptr1 = 16, table_ptr1 // Point to pi/2
+ fclass.m p8, p0 = ArgX, 0x022 // Test for x=-inf
+ nop.i 999
}
+;;
+
+.pred.rel "mutex",p7,p9
{ .mfi
- nop.m 999
-(p6) fmerge.s Result_lo = ArgY, Result_lo
- nop.i 999 ;;
-}
-{ .mfb
- nop.m 999
-(p6) fadd.s0 Result = Result, Result_lo
-(p6) br.ret.spnt b0 ;;
+(p9) ldfd Result = [table_ptr1], 8 // Load pi or pi/2 hi
+(p7) fmerge.s Result = ArgY, f0 // If y not inf, x=+inf, sgn(y)*0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// return = sign_Y * PI/2 when ArgX = 0
-//
-(p7) fmerge.s Result = ArgY, f0
- nop.i 999 ;;
-}
-{ .mfb
- nop.m 999
-(p7) fnorm.s0 Result = Result
-(p7) br.ret.spnt b0 ;;
-}
-//
-// return = sign_Y * 0 when ArgX = Inf
-//
-{ .mmi
-(p8) ldfd Result = [table_ptr1], 8 ;;
-(p8) ldfd Result_lo = [table_ptr1], -8
- nop.i 999 ;;
+(p9) ldfd Result_lo = [table_ptr1], -8 // Load pi or pi/2 lo
+(p7) fnorm.s0 Result = Result // If y not inf, x=+inf normalize
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p8) fmerge.s Result = ArgY, Result
- nop.i 999 ;;
+ nop.m 999
+(p9) fmerge.s Result = ArgY, Result // Merge sgn(y) in hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p8) fmerge.s Result_lo = ArgY, Result_lo
- nop.i 999 ;;
+ nop.m 999
+(p9) fmerge.s Result_lo = ArgY, Result_lo // Merge sgn(y) in lo
+ nop.i 999
}
+;;
+
{ .mfb
- nop.m 999
-(p8) fadd.s0 Result = Result, Result_lo
-(p8) br.ret.sptk b0 ;;
+ nop.m 999
+(p9) fadd.s0 Result = Result, Result_lo // Compute complete result
+ br.ret.spnt b0 // Exit for y not inf, x=0,inf
}
-//
-// return = sign_Y * PI when ArgX = -Inf
-//
-.endp atan2l
-ASM_SIZE_DIRECTIVE(atan2l)
-ASM_SIZE_DIRECTIVE(__atan2l)
-ASM_SIZE_DIRECTIVE(__ieee754_atan2l)
-
-.proc __libm_error_region
-__libm_error_region:
+;;
+
+GLOBAL_IEEE754_END(atan2l)
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
@@ -2001,7 +1999,6 @@ __libm_error_region:
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region#)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/s_cbrt.S b/sysdeps/ia64/fpu/s_cbrt.S
index 1e23b6024d..b7a827d1da 100644
--- a/sysdeps/ia64/fpu/s_cbrt.S
+++ b/sysdeps/ia64/fpu/s_cbrt.S
@@ -1,11 +1,10 @@
-.file "cbrt.asm"
+.file "cbrt.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
-// Bob Norin, Shane Story, and Ping Tak Peter Tang
-// of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -21,27 +20,30 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00: Initial version
-// 5/19/00: New version (modified algorithm)
+// 02/02/00 Initial version
+// 05/19/00 New version (modified algorithm)
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/28/03 Updated polynomial coefficients (changed to Remez coefficients),
+// to slightly improve accuracy
//
// API
//==============================================================
@@ -53,637 +55,713 @@
//
// Implementation
//
-// cbrt(a) = cbrt(a y) / cbrt(y)
-// = cbrt(1 - (1 - a y)) * 1/cbrt(y)
+// Let y= frcpa(a), where a is the argument
+//
+// cbrt(a)= cbrt(a*y)/cbrt(y) = cbrt(1 - (1-a*y)) * (1/cbrt(y))
+//
+// For all values of y, the 3 possible significands of 1/cbrt(y)
+// are stored in a table (T0) to 64 bits of accuracy. (There are
+// 3 possible significands because the exponent of y modulo 3
+// can be 0, 1, or 2.)
//
-// where y = frcpa(a).
//
-// * cbrt(1 - (1 - a y)) is approximated by a degree-5 polynomial
-//
-// 1 - (1/3)*r - (1/9)*r^2 - (5/81)*r^3 - (10/243)*r^4 - (22/729)*r^5
-//
-// in r = 1 - a y.
+// * cbrt(1 - (1-a*y)) is approximated by a degree-5 polynomial ~
+//
+// ~ 1 - (1/3)*r - (1/9)*r^2 - (5/81)*r^3 - (10/243)*r^4 - (22/729)*r^5
+//
+// in r = 1-a*y.
//
-// * The values 1/cbrt(y) are stored in a table of constants T0
-// to 64 bits of accuracy
//
// The table values are stored for three exponent values and are
// then multiplied by e/3 where e is the exponent of the input number.
// This computation is carried out in parallel with the polynomial
// evaluation:
//
-// T = 2^(e/3) * T0
+// T= 2^(e/3) * T0
//===============
-// input = x
-// C = frcpa(x)
-// r = 1 - C * x
+// input= x
+// C= frcpa(x)
+// r= 1 - C * x
//
-// Special values
+// Special values
//==============================================================
// Registers used
//==============================================================
-// f6-f15
-// r2, r23-r26, r28-r30
-// p6,p7,p8,p12
+// f6-f15, f32-f37
+// GR_GP (r2), r3, r8-r11, r15-r31
+// p6, p7, p8, p12
+
+ FR_R = f6
+ FR_COEFF1 = f7
+ FR_COEFF2 = f9
+ FR_COEFF3 = f10
+ FR_COEFF4 = f11
+ FR_COEFF5 = f12
+ FR_R2 = f13
+ FR_ARG = f14
+ FR_P23 = f15
+ FR_P25 = f32
+ FR_P15 = f33
+ FR_P1 = f34
+ FR_P45 = f35
+ FR_2EXP = f36
+ FR_TMP63 = f37
+
+ GR_GP = r2
+ GR_ADDR = r2
+ GR_CONST1 = r3
+ GR_I1 = r8
+ GR_EXP = r9
+ GR_ADDR2 = r10
+ GR_IT1 = r11
+ GR_TMP2 = r11
+ GR_EXPON = r15
+ GR_TMP1 = r16
+ GR_TMP6 = r16
+ GR_ITB1 = r17
+ GR_TMP3 = r18
+ GR_TMP4 = r19
+ GR_TMP63 = r19
+ GR_TMP5 = r20
+ GR_EXP_BY_3 = r20
+ GR_CONST4 = r21
+ GR_TMP6 = r22
+ GR_INDEX = r23
+ GR_EBIAS = r24
+ GR_SIGNIF = r25
+ GR_SIGNIF2 = r25
+ GR_TEST = r25
+ GR_ARGEXP = r26
+ GR_CONST2 = r27
+ GR_SIGN = r28
+ GR_REM = r29
+ GR_CONST3 = r30
+ GR_SEXP = r31
+
+
+
-#include "libm_support.h"
// Data tables
//==============================================================
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+RODATA
.align 16
-poly_coeffs:
-ASM_TYPE_DIRECTIVE(poly_coeffs,@object)
-data8 0xaaaaaaaaaaaaaaab, 0x00003ffd // 1/3
-data8 0x3fbc71c71c71c71d, 0x3faf9add3c0ca459
-data8 0x3fa511e8d2b3183b, 0x3f9ee7113506ac13
-ASM_SIZE_DIRECTIVE(poly_coeffs)
-
-T_table:
-ASM_TYPE_DIRECTIVE(T_table,@object)
-
-data8 0x80155c748c374836, 0xa160019ed37fb4ae
-data8 0xcb51ddcb9e93095e, 0x8040404b0879f7f9
-data8 0xa1960b5966da4608, 0xcb95f333968ad59b
-data8 0x806b5dce4b405c10, 0xa1cc5dbe6dc2aab4
-data8 0xcbda64292d3ffd97, 0x8096b586974669b1
-data8 0xa202f97995b69c0d, 0xcc1f3184af961596
-data8 0x80bcd273d952a028, 0xa232fe6eb0c0577d
-data8 0xcc5bb1ac954d33e2, 0x80e898c52813f2f3
-data8 0xa26a2582012f6e17, 0xcca12e9831fc6402
-data8 0x81149add67c2d208, 0xa2a197e5d10465cb
-data8 0xcce70a67b64f24ad, 0x813b4e2c856b6e9a
-data8 0xa2d25a532efefbc8, 0xcd24794726477ea5
-data8 0x8167c1dde03de7aa, 0xa30a5bd6e49e4ab8
-data8 0xcd6b096a0b70ee87, 0x818ed973b811135e
-data8 0xa33b9c9b59879e24, 0xcda9177738b15a90
-data8 0x81bbc0c33e13ec98, 0xa3742fca6a3c1f21
-data8 0xcdf05f2247dffab9, 0x81e33e69fbe7504a
-data8 0xa3a5f1273887bf22, 0xce2f0f347f96f906
-data8 0x820aec524e3c23e9, 0xa3d7ef508ff11574
-data8 0xce6e0be0cd551a61, 0x823880f78e70b805
-data8 0xa4115ce30548bc15, 0xceb666b2c347d1de
-data8 0x826097a62a8e5200, 0xa443df0e53df577a
-data8 0xcef609b0cb874f00, 0x8288dfe00e9b5eaf
-data8 0xa4769fa5913c0ec3, 0xcf35fb5447e5c765
-data8 0x82b15a10c5371624, 0xa4a99f303bc7def5
-data8 0xcf763c47ee869f00, 0x82da06a527b18937
-data8 0xa4dcde37779adf4b, 0xcfb6cd3888d71785
-data8 0x8302e60b635ab394, 0xa5105d46152c938a
-data8 0xcff7aed4fbfbb447, 0x832bf8b2feec2f0e
-data8 0xa5441ce89825cb8d, 0xd038e1ce5167e3c6
-data8 0x83553f0ce00e276b, 0xa5781dad3e54d899
-data8 0xd07a66d7bfa0ebba, 0x837eb98b50f8322a
-data8 0xa5ac602406c4e68c, 0xd0bc3ea6b32d1b21
-data8 0x83a270f44c84f699, 0xa5d9601d95c2c0bc
-data8 0xd0f4f0e8f36c1bf8, 0x83cc4d7cfcfac5ca
-data8 0xa60e1e1a2de14745, 0xd1376458e34b037e
-data8 0x83f65f78a8872b4c, 0xa6431f6e3fbd9658
-data8 0xd17a2ca133f78572, 0x8420a75f2f7b53c8
-data8 0xa67864b0d432fda4, 0xd1bd4a80301c5715
-data8 0x844510461ff14209, 0xa6a6444aa0243c0b
-data8 0xd1f71682b2fa4575, 0x846fbd91b930bed2
-data8 0xa6dc094d10f25792, 0xd23ad555f773f059
-data8 0x84947e18234f3294, 0xa70a574cc02bba69
-data8 0xd2752c7039a5bf73, 0x84bf92755825045a
-data8 0xa7409e2af9549084, 0xd2b98ee008c06b59
-data8 0x84e4ac0ee112ba51, 0xa76f5c64ca2cf13b
-data8 0xd2f4735ffd700280, 0x8509ef44b86f20be
-data8 0xa79e4f0babab5dc0, 0xd32f99ed6d9ac0e1
-data8 0x85359d5d91768427, 0xa7d5579ae5164b85
-data8 0xd374f0666c75d51c, 0x855b3bd5b7384357
-data8 0xa804bd3c6fe61cc8, 0xd3b0a7d13618e4a1
-data8 0x858104f0c415f79a, 0xa8345895e5250a5a
-data8 0xd3eca2ea53bcec0c, 0x85a6f90390d29864
-data8 0xa8642a122b44ef0b, 0xd428e23874f13a17
-data8 0x85d3772fcd56a1dd, 0xa89c38ca18f6108b
-data8 0xd46f82fe293bc6d3, 0x85f9c982fcc002f3
-data8 0xa8cc81063b6e87ca, 0xd4ac57e9b7186420
-data8 0x862047e0e7ea554b, 0xa8fd00bfa409285e
-data8 0xd4e972becb04e8b8, 0x8646f2a26f7f5852
-data8 0xa92db8664d5516da, 0xd526d40a7a9b43a3
-data8 0x866dca21754096b5, 0xa95ea86b75cc2c20
-data8 0xd5647c5b73917370, 0x8694ceb8dfd17a37
-data8 0xa98fd141a4992deb, 0xd5a26c4201bd6d13
-data8 0x86bc00c49e9307e8, 0xa9c1335cae7446ba
-data8 0xd5e0a45015350a7e, 0x86dccd74fce79610
-data8 0xa9ea8686f556f645, 0xd614b539c6194104
-data8 0x870453c845acf90f, 0xaa1c52d17906bb19
-data8 0xd6537310e224283f, 0x872c089a1e90342c
-data8 0xaa4e59b046dab887, 0xd6927ab62244c917
-data8 0x8753ec4a92d16c5e, 0xaa809b9c60d1890b
-data8 0xd6d1ccc1fc4ef4b7, 0x877bff3aca19f6b4
-data8 0xaab319102f3f9b33, 0xd71169cea98fdded
-data8 0x879d88b6fe1c324c, 0xaadd5a18c1e21274
-data8 0xd746a66a5bc9f6d9, 0x87c5f346dbf98c3a
-data8 0xab1045f2ac31bdf5, 0xd786ce8f0fae5317
-data8 0x87e7c653efacef2c, 0xab3ae3ab2df7231e
-data8 0xd7bc7ff214c4e75a, 0x881089d4e73ffefc
-data8 0xab6e3f945d1e96fc, 0xd7fd35467a517ed1
-data8 0x88397e6a366f2a8a, 0xaba1d953a08fa94e
-data8 0xd83e38838648d815, 0x885bc559e5e1c081
-data8 0xabcd090db7ef4c3f, 0xd874a1db598b8951
-data8 0x887e2ee392bb7a93, 0xabf864602d7c323d
-data8 0xd8ab42205b80edaf, 0x88a7a8587e404257
-data8 0xac2ca5886ccf9b57, 0xd8ed1849d202f965
-data8 0x88ca5eda67594784, 0xac5861d4aa441f0f
-data8 0xd92432bd5a173685, 0x88f4356166bd590e
-data8 0xac8d183fe3a2fbed, 0xd9669ca45b03c23e
-data8 0x89173a0acf5ce026, 0xacb93703ff51571e
-data8 0xd99e3327cf89574e, 0x893a62a098b6a57b
-data8 0xace5830ad0c3f14b, 0xd9d602b19b100466
-data8 0x895daf637236ae2c, 0xad11fca5d78b3ff2
-data8 0xda0e0ba86c096841, 0x89883b9d1c2fa9c5
-data8 0xad4797fddf91a798, 0xda5195fcdb1c3dce
-data8 0x89abd8dd374a5d7b, 0xad747701e559ebcb
-data8 0xda8a1eb87a491f6c, 0x89cf9b1dcd197fa0
-data8 0xada184a47e9c7613, 0xdac2e230b91c3f84
-data8 0x89f382a258ea79de, 0xadcec13ab0dda8ff
-data8 0xdafbe0d0b66aea30, 0x8a178faf06648f29
-data8 0xadfc2d1a5fd21ba8, 0xdb351b04a8fafced
-data8 0x8a3bc288b3e1d18a, 0xae29c89a5053c33a
-data8 0xdb6e9139e33cdd8e, 0x8a601b74f4d1f835
-data8 0xae5794122b638df9, 0xdba843ded7151ea1
-data8 0x8a849aba14274764, 0xae858fda8137ae0a
-data8 0xdbe2336319b61fc8, 0x8aa9409f16cdbc9b
-data8 0xaeb3bc4ccc56d3d1, 0xdc1c60376789fa68
-data8 0x8ace0d6bbe2cb316, 0xaee219c374c09920
-data8 0xdc56cacda82d0cd5, 0x8af301688ab33558
-data8 0xaf10a899d3235fe7, 0xdc917398f2797814
-data8 0x8b181cdebe6f3206, 0xaf3f692c341fe8b4
-data8 0xdccc5b0d90a3e628, 0x8b3d60185fafcb7c
-data8 0xaf6e5bd7db9ae6c2, 0xdd0781a10469f0f2
-data8 0x8b62cb603bb2fad0, 0xaf9d80fb081cd91b
-data8 0xdd42e7ca0b52838f, 0x8b80d7d6bc4104de
-data8 0xafc35ce063eb3787, 0xdd729ad01c69114d
-data8 0x8ba68bf73ac74f39, 0xaff2ddcb5f28f03d
-data8 0xddae749c001fbf5e, 0x8bcc68fb9f9f7335
-data8 0xb022923b148e05c5, 0xddea8f50a51c69b1
-data8 0x8bf26f31c534fca2, 0xb0527a919adbf58b
-data8 0xde26eb69a0f0f111, 0x8c10f86e13a1a1f9
-data8 0xb078f3ab1d701c65, 0xde576480262399bc
-data8 0x8c3749916cc6abb5, 0xb0a93a6870649f31
-data8 0xde943789645933c8, 0x8c5dc4c4f7706032
-data8 0xb0d9b624d62ec856, 0xded14d58139a28af
-data8 0x8c7cac3a8c42e3e0, 0xb100a5f53fb3c8e1
-data8 0xdf025c00bbf2b5c7, 0x8ca373f1b7bf2716
-data8 0xb131821882f5540a, 0xdf3feb44d723a713
-data8 0x8cc29907fb951294, 0xb158bf8e4cb04055
-data8 0xdf715bc16c159be0, 0x8ce9ae4e9492aac8
-data8 0xb189fd69d56b238f, 0xdfaf66240e29cda8
-data8 0x8d0911dddbfdad0e, 0xb1b189958e8108e4
-data8 0xdfe139cbf6e19bdc, 0x8d3075c4f20f04ee
-data8 0xb1e32a8165b09832, 0xe01fc0fe94d9fc52
-data8 0x8d5018a9d4de77d5, 0xb20b0678fc271eec
-data8 0xe051f92ffcc0bd60, 0x8d77cc47dd143515
-data8 0xb23d0bd3f7592b6e, 0xe090feec9c9a06ac
-data8 0x8d97af6352739cb7, 0xb26538b2db8420dc
-data8 0xe0c39d0c9ff862d6, 0x8db7af523167800f
-data8 0xb28d89e339ceca14, 0xe0f668eeb99f188d
-data8 0x8ddfd80bc68c32ff, 0xb2c022ca12e55a16
-data8 0xe1362890eb663139, 0x8e00197e1e7c88fe
-data8 0xb2e8c6852c6b03f1, 0xe1695c7212aecbaa
-data8 0x8e207859f77e20e7, 0xb3118f4eda9fe40f
-data8 0xe19cbf0391bbbbe9, 0x8e40f4ce60c9f8e2
-data8 0xb33a7d6268109ebe, 0xe1d050901c531e85
-data8 0x8e69ba46cf2fde4d, 0xb36ddbc5ea70ec55
-data8 0xe2110903b4f4047a, 0x8e8a7a00bd7ae63e
-data8 0xb3971e9b39264023, 0xe2450559b4d80b6d
-data8 0x8eab57ef1cf2f529, 0xb3c0877ecc18e24a
-data8 0xe27931a231554ef3, 0x8ecc5442cffb1dad
-data8 0xb3ea16ae3a6c905f, 0xe2ad8e2ac3c5b04b
-data8 0x8eed6f2d2a4acbfe, 0xb413cc67aa0e4d2d
-data8 0xe2e21b41b9694cce, 0x8f0ea8dff24441ff
-data8 0xb43da8e9d163e1af, 0xe316d93615862714
-data8 0x8f385c95d696b817, 0xb47233773b84d425
-data8 0xe3590bd86a0d30f9, 0x8f59dc43edd930f3
-data8 0xb49c6825430fe730, 0xe38e38e38e38e38e
-data8 0x8f7b7b5f5ffad1c4, 0xb4c6c46bcdb27dcf
-data8 0xe3c397d1e6db7839, 0x8f9d3a1bea165f38
-data8 0xb4f1488c0b35d26f, 0xe3f928f5953feb9e
-data8 0x8fbf18adc34b66da, 0xb51bf4c7c51f0168
-data8 0xe42eeca17c62886c, 0x8fe117499e356095
-data8 0xb546c9616087ab9c, 0xe464e32943446305
-data8 0x90033624aa685f8d, 0xb571c69bdffd9a70
-data8 0xe49b0ce15747a8a2, 0x9025757495f36b86
-data8 0xb59cecbae56984c3, 0xe4d16a1eee94e9d4
-data8 0x903f3a5dcc091203, 0xb5bd64512bb14bb7
-data8 0xe4fa52107353f67d, 0x9061b2fceb2bdbab
-data8 0xb5e8d2a4bf5ba416, 0xe5310a471f4d2dc3
-data8 0x90844ca7211032a7, 0xb6146a9a1bc47819
-data8 0xe567f6f1c2b9c224, 0x90a7079403e6a15d
-data8 0xb6402c7749d621c0, 0xe59f18689a9e4c9a
-data8 0x90c9e3fbafd63799, 0xb66c1882fb435ea2
-data8 0xe5d66f04b8a68ecf, 0x90ece216c8a16ee4
-data8 0xb6982f048c999a56, 0xe60dfb2005c192e9
-data8 0x9110021e7b516f0a, 0xb6c47044075b4142
-data8 0xe645bd1544c7ea51, 0x912a708a39be9075
-data8 0xb6e5bd6bfd02bafd, 0xe66fb21b505b20a0
-data8 0x914dcc7b31146370, 0xb7124a2736ff8ef2
-data8 0xe6a7d32af4a7c59a, 0x91714af8cfe984d5
-data8 0xb73f026a01e94177, 0xe6e02b129c6a5ae4
-data8 0x918c00a6f3795e97, 0xb760a959f1d0a7a7
-data8 0xe70a9136a7403039, 0x91afbc299ed0295d
-data8 0xb78dae7e06868ab0, 0xe74349fb2d92a589
-data8 0x91d39add3e958db0, 0xb7badff8ad9e4e02
-data8 0xe77c3a9c86ed7d42, 0x91ee9920a8974d92
-data8 0xb7dce25b8e17ae9f, 0xe7a713f88151518a
-data8 0x9212b5fcac537c19, 0xb80a6226904045e2
-data8 0xe7e067453317ed2b, 0x9236f6b256923fcf
-data8 0xb8380f1cafd73c1c, 0xe819f37a81871bb5
-data8 0x92523ee6f90dcfc3, 0xb85a6ea8e321b4d8
-data8 0xe8454236bfaeca14, 0x9276bef031e6eb79
-data8 0xb8886b684ae7d2fa, 0xe87f32f24c3fc90e
-data8 0x929236ec237a24ad, 0xb8ab0726fa00cf5d
-data8 0xe8aacd8688892ba6, 0x92b6f70b7efe9dc3
-data8 0xb8d954a4d13b7cb1, 0xe8e523fd32f606f7
-data8 0x92d29f61eec7dc2b, 0xb8fc2d4f6cd9f04a
-data8 0xe9110b5311407927, 0x92f7a05d5b8ba92f
-data8 0xb92acc851476b1ab, 0xe94bc8bf0c108fa3
-data8 0x931379a403be5c16, 0xb94de2d841a184c2
-data8 0xe977fdc439c2ca3c, 0x9338bc44de2e3f34
-data8 0xb97cd4c36c92693c, 0xe9b3236528fc349e
-data8 0x9354c71412c69486, 0xb9a0297f172665e3
-data8 0xe9dfa70b745ac1b4, 0x937a4c273907e262
-data8 0xb9cf6f21e36c3924, 0xea1b36268d0eaa38
-data8 0x93968919f6e7975d, 0xb9f3030951267208
-data8 0xea480963fd394197, 0x93bc516fdd4680c9
-data8 0xba229d6a618e7c59, 0xea84034425f27484
-data8 0x93d8c123d9be59b2, 0xba467144459f9855
-data8 0xeab12713138dd1cc, 0x93f546c955e60076
-data8 0xba6a60c3c48f1a4b, 0xeade6db73a5e503b
-data8 0x941b70a65879079f, 0xba9a76056b67ee7a
-data8 0xeb1b0268343b121b, 0x943829f337410591
-data8 0xbabea699563ada6e, 0xeb489b0b2bdb5f14
-data8 0x9454f995765bc4d2, 0xbae2f350b262cc4b
-data8 0xeb765721e85f03d0, 0x947b86b57f5842ed
-data8 0xbb1385a23be24e57, 0xebb389645f222f62
-data8 0x94988aeb23470f86, 0xbb3814975e17c680
-data8 0xebe198f090607e0c, 0x94b5a5dc9695f42a
-data8 0xbb5cc031009bf467, 0xec0fcc9321024509
-data8 0x94d2d7a9170d8b42, 0xbb81889680024764
-data8 0xec3e247da8b82f61, 0x94f9e87dd78bf019
-data8 0xbbb2c0d8703ae95d, 0xec7c27d21321c9f7
-data8 0x95175019a503d89e, 0xbbd7cd09ba3c5463
-data8 0xecaad5278824e453, 0x9534cefa625fcb3a
-data8 0xbbfcf68c4977718f, 0xecd9a76d097d4e77
-data8 0x955265405c491a25, 0xbc223d88cfc88eee
-data8 0xed089ed5dcd99446, 0x9570130c1f9bb857
-data8 0xbc47a2284fee4ff8, 0xed37bb95add09a1c
-data8 0x9597ca4119525184, 0xbc79ac0916ed7b8a
-data8 0xed76c70508f904b6, 0x95b5af6fb5aa4d3c
-data8 0xbc9f5670d1a13030, 0xeda63bb05e7f93c6
-data8 0x95d3ac9273aafd7a, 0xbcc51f068cb95c1d
-data8 0xedd5d661daed2dc4, 0x95f1c1cafdfd3684
-data8 0xbceb05f4b30a9bc0, 0xee05974eef86b903
-data8 0x960fef3b430b8d5f, 0xbd110b6604c7d306
-data8 0xee357ead791fc670, 0x962e350575b409c5
-data8 0xbd372f8598620f19, 0xee658cb3c134a463
-data8 0x964c934c0dfc1708, 0xbd5d727edb6b3c7e
-data8 0xee95c1987f080211, 0x966b0a31c9c6bc7d
-data8 0xbd83d47d937bbc6d, 0xeec61d92d8c4314f
-data8 0x968999d9ad8d264e, 0xbdaa55addf1ae47d
-data8 0xeef6a0da64a014ac, 0x96a8426705198795
-data8 0xbdd0f63c36aa73f0, 0xef274ba72a07c811
-data8 0x96c703fd64445ee5, 0xbdf7b6556d550a15
-data8 0xef581e31a2c91260, 0x96e5dec0a7b4268d
-data8 0xbe1e9626b1ffa96b, 0xef8918b2bc43aec6
-data8 0x9704d2d4f59f79f3, 0xbe4595dd903e5371
-data8 0xefba3b63d89d7cbf, 0x9723e05ebe91b9b0
-data8 0xbe6cb5a7f14bc935, 0xefeb867ecffaa607
-data8 0x97430782be323831, 0xbe93f5b41d047cf7
-data8 0xf01cfa3df1b9c9fa, 0x97624865fc0df8bf
-data8 0xbebb5630bae4c15f, 0xf04e96dc05b43e2d
-data8 0x9781a32dcc640b2a, 0xbee2d74cd30a430c
-data8 0xf0805c944d827454, 0x97a117ffd0f48e46
-data8 0xbf0a7937cf38d981, 0xf0b24ba285c495cb
-data8 0x97c0a701f9d263c9, 0xbf323c217be2bc8c
-data8 0xf0e46442e76f6569, 0x97e0505a8637a036
-data8 0xbf5a203a09342bbb, 0xf116a6b2291d7896
-data8 0x97f57a9fb0b08c6e, 0xbf74cad1c14ebfc4
-data8 0xf1383fa9e9b5b381, 0x9815503365914a9d
-data8 0xbf9ce6a497a89f78, 0xf16ac84f90083b9b
-data8 0x98354085054fd204, 0xbfc52428bec6e72f
-data8 0xf19d7b686dcb03d7, 0x98554bbbf8a77902
-data8 0xbfed838fddab024b, 0xf1d0593311db1757
-data8 0x987571fffb7f94f6, 0xc016050c0420981a
-data8 0xf20361ee8f1c711e, 0x9895b3791dd03c23
-data8 0xc03ea8cfabddc330, 0xf23695da7de51d3f
-data8 0x98ab43a5fc65d0c8, 0xc059d3cbd65ddbce
-data8 0xf258d095e465cc35, 0x98cbb2d196bd713d
-data8 0xc082b122a3c78c9d, 0xf28c4d0bfc982b34
-data8 0x98ec3d9ec7b6f21a, 0xc0abb1499ae736c4
-data8 0xf2bff55eb3f0ea71, 0x990ce436db5e8344
-data8 0xc0d4d474c3aedaaf, 0xf2f3c9cf9884636e
-data8 0x9922b8218160967a, 0xc0f054ca33eb3437
-data8 0xf31670135ab9cc0f, 0x99438d686f75779d
-data8 0xc119b2c67e600ed0, 0xf34a8e9f0b54cdfb
-data8 0x99647eea131fa20b, 0xc1433453de2033ff
-data8 0xf37ed9fa6b8add3f, 0x997a85045a47c6d0
-data8 0xc15ef3e44e10032d, 0xf3a1cfe884ef6bb6
-data8 0x999ba5f14f8add02, 0xc188b130431d80e6
-data8 0xf3d66689dcc8e8d3, 0x99bce38b5465ecae
-data8 0xc1b2929d6067730e, 0xf40b2ab069d5c96a
-data8 0x99d31ca0887f30f9, 0xc1ce9268f31cc734
-data8 0xf42e718b90c8bc16, 0x99f48a669c74c09e
-data8 0xc1f8b0877c1b0c08, 0xf463822a0a3b4b00
-data8 0x9a16154eb445c873, 0xc222f35a87b415ba
-data8 0xf498c1076015faf8, 0x9a2c822ec198d667
-data8 0xc23f3467349e5c88, 0xf4bc5a19a33990b5
-data8 0x9a4e3e080cd91b78, 0xc269b4e40e088c01
-data8 0xf4f1e6a7d6f5425f, 0x9a70177afe52322e
-data8 0xc2945aac24daaf6e, 0xf527a232cf6be334
-data8 0x9a86b8fa94eebe10, 0xc2b0de05e43c1d66
-data8 0xf54b8ecdcda90851, 0x9aa8c42866ae2958
-data8 0xc2dbc275e1229d09, 0xf5819949c7ad87b4
-data8 0x9abf86f9e12fc45e, 0xc2f86fca9d80eeff
-data8 0xf5a5bac9213b48a9, 0x9ae1c462fc05f49d
-data8 0xc323938449a2587e, 0xf5dc1501f324a812
-data8 0x9af8a8dc936b84d0, 0xc3406b40a538ed20
-data8 0xf6006bee86b5589e, 0x9b1b19033be35730
-data8 0xc36bcee8211d15e0, 0xf63716b2fa067fa4
-data8 0x9b3da7daf04c2892, 0xc397593adf2ba366
-data8 0xf66df22fb6132b9c, 0x9b54c2e4c8a9012b
-data8 0xc3b475b6206155d5, 0xf6929fb98225deb1
-data8 0x9b77854e6c661200, 0xc3e0410243b97383
-data8 0xf6c9cd13021e3fea, 0x9b8ec2e678d56d2f
-data8 0xc3fd890709833d37, 0xf6eeb177472cedae
-data8 0x9ba60e6a5ca133b6, 0xc41ae295f7e7fa06
-data8 0xf713abf4cb0b3afb, 0x9bc919ea66a151a4
-data8 0xc44709f7bb8a4dd2, 0xf74b4d5333684ef1
-data8 0x9be0887c09ef82bb, 0xc4648fb0e0bec4c1
-data8 0xf7707f75a72f8e94, 0x9c03c8d5fffc3503
-data8 0xc490f9a94695ba14, 0xf7a874b97927af44
-data8 0x9c1b5ad21a81cbb9, 0xc4aeac0173b7d390
-data8 0xf7cddf140aedf1d8, 0x9c3ed09216e9ca02
-data8 0xc4db5941007aa853, 0xf806291bacb7f7a9
-data8 0x9c568656c0423def, 0xc4f938aec206291a
-data8 0xf82bcc43b92eafef, 0x9c7a320af242ce60
-data8 0xc52629e899dfd622, 0xf8646bf0defb759e
-data8 0x9c920bf7a8c01dc2, 0xc54436e44043b965
-data8 0xf88a487dfc3ff5f7, 0x9ca9f475d98b159c
-data8 0xc562563abf9ea07f, 0xf8b03c2b46cdc17f
-data8 0x9ccdeca60e80b5f8, 0xc58fa7d1dc42921c
-data8 0xf8e95541c152ae7a, 0x9ce5f9d4653d4902
-data8 0xc5adf561b91e110a, 0xf90f832c2700c160
-data8 0x9cfe15cb38bfdd8e, 0xc5cc5591bdbd82fa
-data8 0xf935c88e0c7f419b, 0x9d225b983f6c1f96
-data8 0xc5fa08f1ff20593c, 0xf96f5cd84fd86873
-data8 0x9d3a9cca32261ed7, 0xc618980a79ce6862
-data8 0xf995dd53ebdd9d6d, 0x9d52ecfccebe1768
-data8 0xc6373a09e34b50fa, 0xf9bc75a034436a41
-data8 0x9d77818d95b82f86, 0xc66550a6e0baaf35
-data8 0xf9f686f26d5518de, 0x9d8ff7893fa4706c
-data8 0xc6842241926342c9, 0xfa1d5b39b910a8c5
-data8 0x9da87cbef36f2a5e, 0xc6a3070b7c93bb9e
-data8 0xfa4447acc4ecbfd2, 0x9dcd6140b4a35aeb
-data8 0xc6d18260bb84081b, 0xfa7ed7e51e6fdfb4
-data8 0x9de60cd06dc6e2d4, 0xc6f0977c9416828b
-data8 0xfaa601394d49a1a0, 0x9dfec7d4cc43b76f
-data8 0xc70fc0117c641630, 0xfacd431644ce0e40
-data8 0x9e17925ec9fccc4a, 0xc72efc34d7e615be
-data8 0xfaf49d96f7a75909, 0x9e3cdf6db57dc075
-data8 0xc75dfb441594141e, 0xfb2fd3c65e562fd5
-data8 0x9e55d110b63637a8, 0xc77d68aa019bda4c
-data8 0xfb576c5762024805, 0x9e6ed27594550d2e
-data8 0xc79ce9ea478dbc4f, 0xfb7f1debc22c4040
-data8 0x9e87e3adc385d393, 0xc7bc7f1ae453219d
-data8 0xfba6e89f32d0190a, 0x9ead9b54b37a1055
-data8 0xc7ec0476e15e141a, 0xfbe2c803a0894893
-data8 0x9ec6d46a3d7de215, 0xc80bcbe16f1d540f
-data8 0xfc0ad1ff0ed9ecf0, 0x9ee01d9108be3154
-data8 0xc82ba78a5d349735, 0xfc32f57bdfbcbe7f
-data8 0x9ef976db07288d04, 0xc84b978847a06b87
-data8 0xfc5b32968f99b21c, 0x9f12e05a4759ec25
-data8 0xc86b9bf1ee817bc6, 0xfc83896bc861ab08
-data8 0x9f2c5a20f4da6668, 0xc88bb4de3667cdf4
-data8 0xfcabfa1861ed4815, 0x9f52af78ed1733ca
-data8 0xc8bc00e7fe9e23a3, 0xfce8d3cea7d3163e
-data8 0x9f6c52426a39d003, 0xc8dc4d7ff2d25232
-data8 0xfd118595143ee273, 0x9f860593d42fd7f3
-data8 0xc8fcaeebcb40eb47, 0xfd3a519943d4865a
-data8 0x9f9fc97fdb96bd51, 0xc91d25431426a663
-data8 0xfd6337f8e1ae5a4b, 0x9fb99e194f4a7037
-data8 0xc93db09d7fdb2949, 0xfd8c38d1c8e927eb
-data8 0x9fd383731ca51db9, 0xc95e5112e721582a
-data8 0xfdb5544205095a53, 0x9fed79a04fbf9423
-data8 0xc97f06bb49787677, 0xfdde8a67d2613531
-data8 0xa00780b413b24ee8, 0xc99fd1aecd6e1b06
-data8 0xfe07db619e781611, 0xa02eab2c4474b0cd
-data8 0xc9d12a3e27bb1625, 0xfe460768d80bf758
-data8 0xa048dcd51ccfd142, 0xc9f22ad82ba3d5f0
-data8 0xfe6f9bfb06cd32f6, 0xa0631fa894b11b8d
-data8 0xca134113105e67b2, 0xfe994bcd3d14fcc2
-data8 0xa07d73ba65e680af, 0xca346d07b045a876
-data8 0xfec316fecaf3f2ab, 0xa097d91e6aaf71b0
-data8 0xca55aecf0e94bb88, 0xfeecfdaf33fadb80
-data8 0xa0b24fe89e02602f, 0xca77068257be9bab
-data8 0xff16fffe2fa8fad6, 0xa0ccd82d1bd2f68b
-data8 0xca98743ae1c693a8, 0xff411e0ba9db886d
-data8 0xa0e77200215909e6, 0xcab9f8122c99a101
-data8 0xff6b57f7c33e4e9a, 0xa1021d760d584855
-data8 0xcadb9221e268c3b5, 0xff95ade2d1bd7358
-data8 0xa11cdaa36068a57d, 0xcafd4283d8043dfd
-data8 0xffc01fed60f86fb5, 0xa137a99cbd3f880b
-data8 0xcb1f09520d37c6fb, 0xffeaae3832b63956
-ASM_SIZE_DIRECTIVE(T_table)
-
-
-
-
-
-
-.align 32
-.global cbrt#
+LOCAL_OBJECT_START(poly_coeffs)
+
+ data8 0xaaaaaaaaaaaaaab4, 0x0000bffd // ~ -1/3
+ data8 0xbfbc71c71c718e45, 0xbfaf9add3c0bbb43
+ data8 0xbfa511edb93dc98d, 0xbf9ee71c45f0dfbc
+LOCAL_OBJECT_END(poly_coeffs)
+
+
+// For every entry B in the frcpa table, this table contains
+// the significands of cbrt(1/B), cbrt(2/B), cbrt(4/B).
+// The index to this table is the same as the frcpa index.
+
+LOCAL_OBJECT_START(T_table)
+
+
+ data8 0x80155c748c374836, 0xa160019ed37fb4ae
+ data8 0xcb51ddcb9e93095e, 0x8040404b0879f7f9
+ data8 0xa1960b5966da4608, 0xcb95f333968ad59b
+ data8 0x806b5dce4b405c10, 0xa1cc5dbe6dc2aab4
+ data8 0xcbda64292d3ffd97, 0x8096b586974669b1
+ data8 0xa202f97995b69c0d, 0xcc1f3184af961596
+ data8 0x80bcd273d952a028, 0xa232fe6eb0c0577d
+ data8 0xcc5bb1ac954d33e2, 0x80e898c52813f2f3
+ data8 0xa26a2582012f6e17, 0xcca12e9831fc6402
+ data8 0x81149add67c2d208, 0xa2a197e5d10465cb
+ data8 0xcce70a67b64f24ad, 0x813b4e2c856b6e9a
+ data8 0xa2d25a532efefbc8, 0xcd24794726477ea5
+ data8 0x8167c1dde03de7aa, 0xa30a5bd6e49e4ab8
+ data8 0xcd6b096a0b70ee87, 0x818ed973b811135e
+ data8 0xa33b9c9b59879e24, 0xcda9177738b15a90
+ data8 0x81bbc0c33e13ec98, 0xa3742fca6a3c1f21
+ data8 0xcdf05f2247dffab9, 0x81e33e69fbe7504a
+ data8 0xa3a5f1273887bf22, 0xce2f0f347f96f906
+ data8 0x820aec524e3c23e9, 0xa3d7ef508ff11574
+ data8 0xce6e0be0cd551a61, 0x823880f78e70b805
+ data8 0xa4115ce30548bc15, 0xceb666b2c347d1de
+ data8 0x826097a62a8e5200, 0xa443df0e53df577a
+ data8 0xcef609b0cb874f00, 0x8288dfe00e9b5eaf
+ data8 0xa4769fa5913c0ec3, 0xcf35fb5447e5c765
+ data8 0x82b15a10c5371624, 0xa4a99f303bc7def5
+ data8 0xcf763c47ee869f00, 0x82da06a527b18937
+ data8 0xa4dcde37779adf4b, 0xcfb6cd3888d71785
+ data8 0x8302e60b635ab394, 0xa5105d46152c938a
+ data8 0xcff7aed4fbfbb447, 0x832bf8b2feec2f0e
+ data8 0xa5441ce89825cb8d, 0xd038e1ce5167e3c6
+ data8 0x83553f0ce00e276b, 0xa5781dad3e54d899
+ data8 0xd07a66d7bfa0ebba, 0x837eb98b50f8322a
+ data8 0xa5ac602406c4e68c, 0xd0bc3ea6b32d1b21
+ data8 0x83a270f44c84f699, 0xa5d9601d95c2c0bc
+ data8 0xd0f4f0e8f36c1bf8, 0x83cc4d7cfcfac5ca
+ data8 0xa60e1e1a2de14745, 0xd1376458e34b037e
+ data8 0x83f65f78a8872b4c, 0xa6431f6e3fbd9658
+ data8 0xd17a2ca133f78572, 0x8420a75f2f7b53c8
+ data8 0xa67864b0d432fda4, 0xd1bd4a80301c5715
+ data8 0x844510461ff14209, 0xa6a6444aa0243c0b
+ data8 0xd1f71682b2fa4575, 0x846fbd91b930bed2
+ data8 0xa6dc094d10f25792, 0xd23ad555f773f059
+ data8 0x84947e18234f3294, 0xa70a574cc02bba69
+ data8 0xd2752c7039a5bf73, 0x84bf92755825045a
+ data8 0xa7409e2af9549084, 0xd2b98ee008c06b59
+ data8 0x84e4ac0ee112ba51, 0xa76f5c64ca2cf13b
+ data8 0xd2f4735ffd700280, 0x8509ef44b86f20be
+ data8 0xa79e4f0babab5dc0, 0xd32f99ed6d9ac0e1
+ data8 0x85359d5d91768427, 0xa7d5579ae5164b85
+ data8 0xd374f0666c75d51c, 0x855b3bd5b7384357
+ data8 0xa804bd3c6fe61cc8, 0xd3b0a7d13618e4a1
+ data8 0x858104f0c415f79a, 0xa8345895e5250a5a
+ data8 0xd3eca2ea53bcec0c, 0x85a6f90390d29864
+ data8 0xa8642a122b44ef0b, 0xd428e23874f13a17
+ data8 0x85d3772fcd56a1dd, 0xa89c38ca18f6108b
+ data8 0xd46f82fe293bc6d3, 0x85f9c982fcc002f3
+ data8 0xa8cc81063b6e87ca, 0xd4ac57e9b7186420
+ data8 0x862047e0e7ea554b, 0xa8fd00bfa409285e
+ data8 0xd4e972becb04e8b8, 0x8646f2a26f7f5852
+ data8 0xa92db8664d5516da, 0xd526d40a7a9b43a3
+ data8 0x866dca21754096b5, 0xa95ea86b75cc2c20
+ data8 0xd5647c5b73917370, 0x8694ceb8dfd17a37
+ data8 0xa98fd141a4992deb, 0xd5a26c4201bd6d13
+ data8 0x86bc00c49e9307e8, 0xa9c1335cae7446ba
+ data8 0xd5e0a45015350a7e, 0x86dccd74fce79610
+ data8 0xa9ea8686f556f645, 0xd614b539c6194104
+ data8 0x870453c845acf90f, 0xaa1c52d17906bb19
+ data8 0xd6537310e224283f, 0x872c089a1e90342c
+ data8 0xaa4e59b046dab887, 0xd6927ab62244c917
+ data8 0x8753ec4a92d16c5e, 0xaa809b9c60d1890b
+ data8 0xd6d1ccc1fc4ef4b7, 0x877bff3aca19f6b4
+ data8 0xaab319102f3f9b33, 0xd71169cea98fdded
+ data8 0x879d88b6fe1c324c, 0xaadd5a18c1e21274
+ data8 0xd746a66a5bc9f6d9, 0x87c5f346dbf98c3a
+ data8 0xab1045f2ac31bdf5, 0xd786ce8f0fae5317
+ data8 0x87e7c653efacef2c, 0xab3ae3ab2df7231e
+ data8 0xd7bc7ff214c4e75a, 0x881089d4e73ffefc
+ data8 0xab6e3f945d1e96fc, 0xd7fd35467a517ed1
+ data8 0x88397e6a366f2a8a, 0xaba1d953a08fa94e
+ data8 0xd83e38838648d815, 0x885bc559e5e1c081
+ data8 0xabcd090db7ef4c3f, 0xd874a1db598b8951
+ data8 0x887e2ee392bb7a93, 0xabf864602d7c323d
+ data8 0xd8ab42205b80edaf, 0x88a7a8587e404257
+ data8 0xac2ca5886ccf9b57, 0xd8ed1849d202f965
+ data8 0x88ca5eda67594784, 0xac5861d4aa441f0f
+ data8 0xd92432bd5a173685, 0x88f4356166bd590e
+ data8 0xac8d183fe3a2fbed, 0xd9669ca45b03c23e
+ data8 0x89173a0acf5ce026, 0xacb93703ff51571e
+ data8 0xd99e3327cf89574e, 0x893a62a098b6a57b
+ data8 0xace5830ad0c3f14b, 0xd9d602b19b100466
+ data8 0x895daf637236ae2c, 0xad11fca5d78b3ff2
+ data8 0xda0e0ba86c096841, 0x89883b9d1c2fa9c5
+ data8 0xad4797fddf91a798, 0xda5195fcdb1c3dce
+ data8 0x89abd8dd374a5d7b, 0xad747701e559ebcb
+ data8 0xda8a1eb87a491f6c, 0x89cf9b1dcd197fa0
+ data8 0xada184a47e9c7613, 0xdac2e230b91c3f84
+ data8 0x89f382a258ea79de, 0xadcec13ab0dda8ff
+ data8 0xdafbe0d0b66aea30, 0x8a178faf06648f29
+ data8 0xadfc2d1a5fd21ba8, 0xdb351b04a8fafced
+ data8 0x8a3bc288b3e1d18a, 0xae29c89a5053c33a
+ data8 0xdb6e9139e33cdd8e, 0x8a601b74f4d1f835
+ data8 0xae5794122b638df9, 0xdba843ded7151ea1
+ data8 0x8a849aba14274764, 0xae858fda8137ae0a
+ data8 0xdbe2336319b61fc8, 0x8aa9409f16cdbc9b
+ data8 0xaeb3bc4ccc56d3d1, 0xdc1c60376789fa68
+ data8 0x8ace0d6bbe2cb316, 0xaee219c374c09920
+ data8 0xdc56cacda82d0cd5, 0x8af301688ab33558
+ data8 0xaf10a899d3235fe7, 0xdc917398f2797814
+ data8 0x8b181cdebe6f3206, 0xaf3f692c341fe8b4
+ data8 0xdccc5b0d90a3e628, 0x8b3d60185fafcb7c
+ data8 0xaf6e5bd7db9ae6c2, 0xdd0781a10469f0f2
+ data8 0x8b62cb603bb2fad0, 0xaf9d80fb081cd91b
+ data8 0xdd42e7ca0b52838f, 0x8b80d7d6bc4104de
+ data8 0xafc35ce063eb3787, 0xdd729ad01c69114d
+ data8 0x8ba68bf73ac74f39, 0xaff2ddcb5f28f03d
+ data8 0xddae749c001fbf5e, 0x8bcc68fb9f9f7335
+ data8 0xb022923b148e05c5, 0xddea8f50a51c69b1
+ data8 0x8bf26f31c534fca2, 0xb0527a919adbf58b
+ data8 0xde26eb69a0f0f111, 0x8c10f86e13a1a1f9
+ data8 0xb078f3ab1d701c65, 0xde576480262399bc
+ data8 0x8c3749916cc6abb5, 0xb0a93a6870649f31
+ data8 0xde943789645933c8, 0x8c5dc4c4f7706032
+ data8 0xb0d9b624d62ec856, 0xded14d58139a28af
+ data8 0x8c7cac3a8c42e3e0, 0xb100a5f53fb3c8e1
+ data8 0xdf025c00bbf2b5c7, 0x8ca373f1b7bf2716
+ data8 0xb131821882f5540a, 0xdf3feb44d723a713
+ data8 0x8cc29907fb951294, 0xb158bf8e4cb04055
+ data8 0xdf715bc16c159be0, 0x8ce9ae4e9492aac8
+ data8 0xb189fd69d56b238f, 0xdfaf66240e29cda8
+ data8 0x8d0911dddbfdad0e, 0xb1b189958e8108e4
+ data8 0xdfe139cbf6e19bdc, 0x8d3075c4f20f04ee
+ data8 0xb1e32a8165b09832, 0xe01fc0fe94d9fc52
+ data8 0x8d5018a9d4de77d5, 0xb20b0678fc271eec
+ data8 0xe051f92ffcc0bd60, 0x8d77cc47dd143515
+ data8 0xb23d0bd3f7592b6e, 0xe090feec9c9a06ac
+ data8 0x8d97af6352739cb7, 0xb26538b2db8420dc
+ data8 0xe0c39d0c9ff862d6, 0x8db7af523167800f
+ data8 0xb28d89e339ceca14, 0xe0f668eeb99f188d
+ data8 0x8ddfd80bc68c32ff, 0xb2c022ca12e55a16
+ data8 0xe1362890eb663139, 0x8e00197e1e7c88fe
+ data8 0xb2e8c6852c6b03f1, 0xe1695c7212aecbaa
+ data8 0x8e207859f77e20e7, 0xb3118f4eda9fe40f
+ data8 0xe19cbf0391bbbbe9, 0x8e40f4ce60c9f8e2
+ data8 0xb33a7d6268109ebe, 0xe1d050901c531e85
+ data8 0x8e69ba46cf2fde4d, 0xb36ddbc5ea70ec55
+ data8 0xe2110903b4f4047a, 0x8e8a7a00bd7ae63e
+ data8 0xb3971e9b39264023, 0xe2450559b4d80b6d
+ data8 0x8eab57ef1cf2f529, 0xb3c0877ecc18e24a
+ data8 0xe27931a231554ef3, 0x8ecc5442cffb1dad
+ data8 0xb3ea16ae3a6c905f, 0xe2ad8e2ac3c5b04b
+ data8 0x8eed6f2d2a4acbfe, 0xb413cc67aa0e4d2d
+ data8 0xe2e21b41b9694cce, 0x8f0ea8dff24441ff
+ data8 0xb43da8e9d163e1af, 0xe316d93615862714
+ data8 0x8f385c95d696b817, 0xb47233773b84d425
+ data8 0xe3590bd86a0d30f9, 0x8f59dc43edd930f3
+ data8 0xb49c6825430fe730, 0xe38e38e38e38e38e
+ data8 0x8f7b7b5f5ffad1c4, 0xb4c6c46bcdb27dcf
+ data8 0xe3c397d1e6db7839, 0x8f9d3a1bea165f38
+ data8 0xb4f1488c0b35d26f, 0xe3f928f5953feb9e
+ data8 0x8fbf18adc34b66da, 0xb51bf4c7c51f0168
+ data8 0xe42eeca17c62886c, 0x8fe117499e356095
+ data8 0xb546c9616087ab9c, 0xe464e32943446305
+ data8 0x90033624aa685f8d, 0xb571c69bdffd9a70
+ data8 0xe49b0ce15747a8a2, 0x9025757495f36b86
+ data8 0xb59cecbae56984c3, 0xe4d16a1eee94e9d4
+ data8 0x903f3a5dcc091203, 0xb5bd64512bb14bb7
+ data8 0xe4fa52107353f67d, 0x9061b2fceb2bdbab
+ data8 0xb5e8d2a4bf5ba416, 0xe5310a471f4d2dc3
+ data8 0x90844ca7211032a7, 0xb6146a9a1bc47819
+ data8 0xe567f6f1c2b9c224, 0x90a7079403e6a15d
+ data8 0xb6402c7749d621c0, 0xe59f18689a9e4c9a
+ data8 0x90c9e3fbafd63799, 0xb66c1882fb435ea2
+ data8 0xe5d66f04b8a68ecf, 0x90ece216c8a16ee4
+ data8 0xb6982f048c999a56, 0xe60dfb2005c192e9
+ data8 0x9110021e7b516f0a, 0xb6c47044075b4142
+ data8 0xe645bd1544c7ea51, 0x912a708a39be9075
+ data8 0xb6e5bd6bfd02bafd, 0xe66fb21b505b20a0
+ data8 0x914dcc7b31146370, 0xb7124a2736ff8ef2
+ data8 0xe6a7d32af4a7c59a, 0x91714af8cfe984d5
+ data8 0xb73f026a01e94177, 0xe6e02b129c6a5ae4
+ data8 0x918c00a6f3795e97, 0xb760a959f1d0a7a7
+ data8 0xe70a9136a7403039, 0x91afbc299ed0295d
+ data8 0xb78dae7e06868ab0, 0xe74349fb2d92a589
+ data8 0x91d39add3e958db0, 0xb7badff8ad9e4e02
+ data8 0xe77c3a9c86ed7d42, 0x91ee9920a8974d92
+ data8 0xb7dce25b8e17ae9f, 0xe7a713f88151518a
+ data8 0x9212b5fcac537c19, 0xb80a6226904045e2
+ data8 0xe7e067453317ed2b, 0x9236f6b256923fcf
+ data8 0xb8380f1cafd73c1c, 0xe819f37a81871bb5
+ data8 0x92523ee6f90dcfc3, 0xb85a6ea8e321b4d8
+ data8 0xe8454236bfaeca14, 0x9276bef031e6eb79
+ data8 0xb8886b684ae7d2fa, 0xe87f32f24c3fc90e
+ data8 0x929236ec237a24ad, 0xb8ab0726fa00cf5d
+ data8 0xe8aacd8688892ba6, 0x92b6f70b7efe9dc3
+ data8 0xb8d954a4d13b7cb1, 0xe8e523fd32f606f7
+ data8 0x92d29f61eec7dc2b, 0xb8fc2d4f6cd9f04a
+ data8 0xe9110b5311407927, 0x92f7a05d5b8ba92f
+ data8 0xb92acc851476b1ab, 0xe94bc8bf0c108fa3
+ data8 0x931379a403be5c16, 0xb94de2d841a184c2
+ data8 0xe977fdc439c2ca3c, 0x9338bc44de2e3f34
+ data8 0xb97cd4c36c92693c, 0xe9b3236528fc349e
+ data8 0x9354c71412c69486, 0xb9a0297f172665e3
+ data8 0xe9dfa70b745ac1b4, 0x937a4c273907e262
+ data8 0xb9cf6f21e36c3924, 0xea1b36268d0eaa38
+ data8 0x93968919f6e7975d, 0xb9f3030951267208
+ data8 0xea480963fd394197, 0x93bc516fdd4680c9
+ data8 0xba229d6a618e7c59, 0xea84034425f27484
+ data8 0x93d8c123d9be59b2, 0xba467144459f9855
+ data8 0xeab12713138dd1cc, 0x93f546c955e60076
+ data8 0xba6a60c3c48f1a4b, 0xeade6db73a5e503b
+ data8 0x941b70a65879079f, 0xba9a76056b67ee7a
+ data8 0xeb1b0268343b121b, 0x943829f337410591
+ data8 0xbabea699563ada6e, 0xeb489b0b2bdb5f14
+ data8 0x9454f995765bc4d2, 0xbae2f350b262cc4b
+ data8 0xeb765721e85f03d0, 0x947b86b57f5842ed
+ data8 0xbb1385a23be24e57, 0xebb389645f222f62
+ data8 0x94988aeb23470f86, 0xbb3814975e17c680
+ data8 0xebe198f090607e0c, 0x94b5a5dc9695f42a
+ data8 0xbb5cc031009bf467, 0xec0fcc9321024509
+ data8 0x94d2d7a9170d8b42, 0xbb81889680024764
+ data8 0xec3e247da8b82f61, 0x94f9e87dd78bf019
+ data8 0xbbb2c0d8703ae95d, 0xec7c27d21321c9f7
+ data8 0x95175019a503d89e, 0xbbd7cd09ba3c5463
+ data8 0xecaad5278824e453, 0x9534cefa625fcb3a
+ data8 0xbbfcf68c4977718f, 0xecd9a76d097d4e77
+ data8 0x955265405c491a25, 0xbc223d88cfc88eee
+ data8 0xed089ed5dcd99446, 0x9570130c1f9bb857
+ data8 0xbc47a2284fee4ff8, 0xed37bb95add09a1c
+ data8 0x9597ca4119525184, 0xbc79ac0916ed7b8a
+ data8 0xed76c70508f904b6, 0x95b5af6fb5aa4d3c
+ data8 0xbc9f5670d1a13030, 0xeda63bb05e7f93c6
+ data8 0x95d3ac9273aafd7a, 0xbcc51f068cb95c1d
+ data8 0xedd5d661daed2dc4, 0x95f1c1cafdfd3684
+ data8 0xbceb05f4b30a9bc0, 0xee05974eef86b903
+ data8 0x960fef3b430b8d5f, 0xbd110b6604c7d306
+ data8 0xee357ead791fc670, 0x962e350575b409c5
+ data8 0xbd372f8598620f19, 0xee658cb3c134a463
+ data8 0x964c934c0dfc1708, 0xbd5d727edb6b3c7e
+ data8 0xee95c1987f080211, 0x966b0a31c9c6bc7d
+ data8 0xbd83d47d937bbc6d, 0xeec61d92d8c4314f
+ data8 0x968999d9ad8d264e, 0xbdaa55addf1ae47d
+ data8 0xeef6a0da64a014ac, 0x96a8426705198795
+ data8 0xbdd0f63c36aa73f0, 0xef274ba72a07c811
+ data8 0x96c703fd64445ee5, 0xbdf7b6556d550a15
+ data8 0xef581e31a2c91260, 0x96e5dec0a7b4268d
+ data8 0xbe1e9626b1ffa96b, 0xef8918b2bc43aec6
+ data8 0x9704d2d4f59f79f3, 0xbe4595dd903e5371
+ data8 0xefba3b63d89d7cbf, 0x9723e05ebe91b9b0
+ data8 0xbe6cb5a7f14bc935, 0xefeb867ecffaa607
+ data8 0x97430782be323831, 0xbe93f5b41d047cf7
+ data8 0xf01cfa3df1b9c9fa, 0x97624865fc0df8bf
+ data8 0xbebb5630bae4c15f, 0xf04e96dc05b43e2d
+ data8 0x9781a32dcc640b2a, 0xbee2d74cd30a430c
+ data8 0xf0805c944d827454, 0x97a117ffd0f48e46
+ data8 0xbf0a7937cf38d981, 0xf0b24ba285c495cb
+ data8 0x97c0a701f9d263c9, 0xbf323c217be2bc8c
+ data8 0xf0e46442e76f6569, 0x97e0505a8637a036
+ data8 0xbf5a203a09342bbb, 0xf116a6b2291d7896
+ data8 0x97f57a9fb0b08c6e, 0xbf74cad1c14ebfc4
+ data8 0xf1383fa9e9b5b381, 0x9815503365914a9d
+ data8 0xbf9ce6a497a89f78, 0xf16ac84f90083b9b
+ data8 0x98354085054fd204, 0xbfc52428bec6e72f
+ data8 0xf19d7b686dcb03d7, 0x98554bbbf8a77902
+ data8 0xbfed838fddab024b, 0xf1d0593311db1757
+ data8 0x987571fffb7f94f6, 0xc016050c0420981a
+ data8 0xf20361ee8f1c711e, 0x9895b3791dd03c23
+ data8 0xc03ea8cfabddc330, 0xf23695da7de51d3f
+ data8 0x98ab43a5fc65d0c8, 0xc059d3cbd65ddbce
+ data8 0xf258d095e465cc35, 0x98cbb2d196bd713d
+ data8 0xc082b122a3c78c9d, 0xf28c4d0bfc982b34
+ data8 0x98ec3d9ec7b6f21a, 0xc0abb1499ae736c4
+ data8 0xf2bff55eb3f0ea71, 0x990ce436db5e8344
+ data8 0xc0d4d474c3aedaaf, 0xf2f3c9cf9884636e
+ data8 0x9922b8218160967a, 0xc0f054ca33eb3437
+ data8 0xf31670135ab9cc0f, 0x99438d686f75779d
+ data8 0xc119b2c67e600ed0, 0xf34a8e9f0b54cdfb
+ data8 0x99647eea131fa20b, 0xc1433453de2033ff
+ data8 0xf37ed9fa6b8add3f, 0x997a85045a47c6d0
+ data8 0xc15ef3e44e10032d, 0xf3a1cfe884ef6bb6
+ data8 0x999ba5f14f8add02, 0xc188b130431d80e6
+ data8 0xf3d66689dcc8e8d3, 0x99bce38b5465ecae
+ data8 0xc1b2929d6067730e, 0xf40b2ab069d5c96a
+ data8 0x99d31ca0887f30f9, 0xc1ce9268f31cc734
+ data8 0xf42e718b90c8bc16, 0x99f48a669c74c09e
+ data8 0xc1f8b0877c1b0c08, 0xf463822a0a3b4b00
+ data8 0x9a16154eb445c873, 0xc222f35a87b415ba
+ data8 0xf498c1076015faf8, 0x9a2c822ec198d667
+ data8 0xc23f3467349e5c88, 0xf4bc5a19a33990b5
+ data8 0x9a4e3e080cd91b78, 0xc269b4e40e088c01
+ data8 0xf4f1e6a7d6f5425f, 0x9a70177afe52322e
+ data8 0xc2945aac24daaf6e, 0xf527a232cf6be334
+ data8 0x9a86b8fa94eebe10, 0xc2b0de05e43c1d66
+ data8 0xf54b8ecdcda90851, 0x9aa8c42866ae2958
+ data8 0xc2dbc275e1229d09, 0xf5819949c7ad87b4
+ data8 0x9abf86f9e12fc45e, 0xc2f86fca9d80eeff
+ data8 0xf5a5bac9213b48a9, 0x9ae1c462fc05f49d
+ data8 0xc323938449a2587e, 0xf5dc1501f324a812
+ data8 0x9af8a8dc936b84d0, 0xc3406b40a538ed20
+ data8 0xf6006bee86b5589e, 0x9b1b19033be35730
+ data8 0xc36bcee8211d15e0, 0xf63716b2fa067fa4
+ data8 0x9b3da7daf04c2892, 0xc397593adf2ba366
+ data8 0xf66df22fb6132b9c, 0x9b54c2e4c8a9012b
+ data8 0xc3b475b6206155d5, 0xf6929fb98225deb1
+ data8 0x9b77854e6c661200, 0xc3e0410243b97383
+ data8 0xf6c9cd13021e3fea, 0x9b8ec2e678d56d2f
+ data8 0xc3fd890709833d37, 0xf6eeb177472cedae
+ data8 0x9ba60e6a5ca133b6, 0xc41ae295f7e7fa06
+ data8 0xf713abf4cb0b3afb, 0x9bc919ea66a151a4
+ data8 0xc44709f7bb8a4dd2, 0xf74b4d5333684ef1
+ data8 0x9be0887c09ef82bb, 0xc4648fb0e0bec4c1
+ data8 0xf7707f75a72f8e94, 0x9c03c8d5fffc3503
+ data8 0xc490f9a94695ba14, 0xf7a874b97927af44
+ data8 0x9c1b5ad21a81cbb9, 0xc4aeac0173b7d390
+ data8 0xf7cddf140aedf1d8, 0x9c3ed09216e9ca02
+ data8 0xc4db5941007aa853, 0xf806291bacb7f7a9
+ data8 0x9c568656c0423def, 0xc4f938aec206291a
+ data8 0xf82bcc43b92eafef, 0x9c7a320af242ce60
+ data8 0xc52629e899dfd622, 0xf8646bf0defb759e
+ data8 0x9c920bf7a8c01dc2, 0xc54436e44043b965
+ data8 0xf88a487dfc3ff5f7, 0x9ca9f475d98b159c
+ data8 0xc562563abf9ea07f, 0xf8b03c2b46cdc17f
+ data8 0x9ccdeca60e80b5f8, 0xc58fa7d1dc42921c
+ data8 0xf8e95541c152ae7a, 0x9ce5f9d4653d4902
+ data8 0xc5adf561b91e110a, 0xf90f832c2700c160
+ data8 0x9cfe15cb38bfdd8e, 0xc5cc5591bdbd82fa
+ data8 0xf935c88e0c7f419b, 0x9d225b983f6c1f96
+ data8 0xc5fa08f1ff20593c, 0xf96f5cd84fd86873
+ data8 0x9d3a9cca32261ed7, 0xc618980a79ce6862
+ data8 0xf995dd53ebdd9d6d, 0x9d52ecfccebe1768
+ data8 0xc6373a09e34b50fa, 0xf9bc75a034436a41
+ data8 0x9d77818d95b82f86, 0xc66550a6e0baaf35
+ data8 0xf9f686f26d5518de, 0x9d8ff7893fa4706c
+ data8 0xc6842241926342c9, 0xfa1d5b39b910a8c5
+ data8 0x9da87cbef36f2a5e, 0xc6a3070b7c93bb9e
+ data8 0xfa4447acc4ecbfd2, 0x9dcd6140b4a35aeb
+ data8 0xc6d18260bb84081b, 0xfa7ed7e51e6fdfb4
+ data8 0x9de60cd06dc6e2d4, 0xc6f0977c9416828b
+ data8 0xfaa601394d49a1a0, 0x9dfec7d4cc43b76f
+ data8 0xc70fc0117c641630, 0xfacd431644ce0e40
+ data8 0x9e17925ec9fccc4a, 0xc72efc34d7e615be
+ data8 0xfaf49d96f7a75909, 0x9e3cdf6db57dc075
+ data8 0xc75dfb441594141e, 0xfb2fd3c65e562fd5
+ data8 0x9e55d110b63637a8, 0xc77d68aa019bda4c
+ data8 0xfb576c5762024805, 0x9e6ed27594550d2e
+ data8 0xc79ce9ea478dbc4f, 0xfb7f1debc22c4040
+ data8 0x9e87e3adc385d393, 0xc7bc7f1ae453219d
+ data8 0xfba6e89f32d0190a, 0x9ead9b54b37a1055
+ data8 0xc7ec0476e15e141a, 0xfbe2c803a0894893
+ data8 0x9ec6d46a3d7de215, 0xc80bcbe16f1d540f
+ data8 0xfc0ad1ff0ed9ecf0, 0x9ee01d9108be3154
+ data8 0xc82ba78a5d349735, 0xfc32f57bdfbcbe7f
+ data8 0x9ef976db07288d04, 0xc84b978847a06b87
+ data8 0xfc5b32968f99b21c, 0x9f12e05a4759ec25
+ data8 0xc86b9bf1ee817bc6, 0xfc83896bc861ab08
+ data8 0x9f2c5a20f4da6668, 0xc88bb4de3667cdf4
+ data8 0xfcabfa1861ed4815, 0x9f52af78ed1733ca
+ data8 0xc8bc00e7fe9e23a3, 0xfce8d3cea7d3163e
+ data8 0x9f6c52426a39d003, 0xc8dc4d7ff2d25232
+ data8 0xfd118595143ee273, 0x9f860593d42fd7f3
+ data8 0xc8fcaeebcb40eb47, 0xfd3a519943d4865a
+ data8 0x9f9fc97fdb96bd51, 0xc91d25431426a663
+ data8 0xfd6337f8e1ae5a4b, 0x9fb99e194f4a7037
+ data8 0xc93db09d7fdb2949, 0xfd8c38d1c8e927eb
+ data8 0x9fd383731ca51db9, 0xc95e5112e721582a
+ data8 0xfdb5544205095a53, 0x9fed79a04fbf9423
+ data8 0xc97f06bb49787677, 0xfdde8a67d2613531
+ data8 0xa00780b413b24ee8, 0xc99fd1aecd6e1b06
+ data8 0xfe07db619e781611, 0xa02eab2c4474b0cd
+ data8 0xc9d12a3e27bb1625, 0xfe460768d80bf758
+ data8 0xa048dcd51ccfd142, 0xc9f22ad82ba3d5f0
+ data8 0xfe6f9bfb06cd32f6, 0xa0631fa894b11b8d
+ data8 0xca134113105e67b2, 0xfe994bcd3d14fcc2
+ data8 0xa07d73ba65e680af, 0xca346d07b045a876
+ data8 0xfec316fecaf3f2ab, 0xa097d91e6aaf71b0
+ data8 0xca55aecf0e94bb88, 0xfeecfdaf33fadb80
+ data8 0xa0b24fe89e02602f, 0xca77068257be9bab
+ data8 0xff16fffe2fa8fad6, 0xa0ccd82d1bd2f68b
+ data8 0xca98743ae1c693a8, 0xff411e0ba9db886d
+ data8 0xa0e77200215909e6, 0xcab9f8122c99a101
+ data8 0xff6b57f7c33e4e9a, 0xa1021d760d584855
+ data8 0xcadb9221e268c3b5, 0xff95ade2d1bd7358
+ data8 0xa11cdaa36068a57d, 0xcafd4283d8043dfd
+ data8 0xffc01fed60f86fb5, 0xa137a99cbd3f880b
+ data8 0xcb1f09520d37c6fb, 0xffeaae3832b63956
+LOCAL_OBJECT_END(T_table)
+
+
+
+
+
+
.section .text
-.proc cbrt#
-.align 32
-cbrt:
-
-
-{ .mfi
- // get significand
- getf.sig r23=f8
- // will continue only for normal/denormal numbers
- (p0) fclass.nm.unc p12,p0 = f8, 0x1b
- // r2 = pointer to C_1,...,C_5 followed by T_table
- addl r2 = @ltoff(poly_coeffs), gp
+GLOBAL_LIBM_ENTRY(cbrt)
+
+
+{.mfi
+ // get significand
+ getf.sig GR_SIGNIF = f8
+ // normalize a
+ fma.s1 FR_ARG = f8, f1, f0
+ // GR_GP = pointer to C_1,..., C_5 followed by T_table
+ addl GR_GP = @ltoff(poly_coeffs), gp ;;
}
+
{.mfi
- // get exponent
- getf.exp r24=f8
- // normalize a
- fma.s1 f14=f8,f1,f0
- // r29=bias-((2^{12}-1)/3) -63=0xffff-0x555-0x3f=0xfa6b
- mov r29=0xfa6b;;
+ // get exponent
+ getf.exp GR_ARGEXP = f8
+ // will continue only for normal/denormal numbers
+ fclass.m.unc p12, p13 = f8, 0x1e7
+ // GR_CONST4 = bias-((2^{12}-1)/3)-63 = 0xffff-0x555-0x3f = 0xfa6b
+ mov GR_CONST4 = 0xfa6b ;;
}
+
{.mlx
- mov r25=0x20000
- // r28=2^52
- movl r28=0x8000000000000000;;
-}
-{.mfb
- // load start address for C_1,...,C_5 followed by T_table
- ld8 r3=[r2]
- (p12) fma.d.s0 f8=f8,f1,f0
- (p12) br.ret.spnt b0
+ mov GR_CONST2 = 0x20000
+ // GR_CONST3 = 2^63
+ movl GR_CONST3 = 0x8000000000000000 ;;
}
+
+.pred.rel "mutex", p12, p13
{.mfi
- nop.m 0
- // y=frcpa(a)
- frcpa.s0 f8,p6=f1,f8
- // p7=1 if denormal input
- cmp.gtu p7,p0=r28,r23;;
+ // load start address for C_1,..., C_5 followed by T_table
+ ld8 GR_ADDR = [ GR_GP ]
+ // y = frcpa(a)
+ (p13) frcpa.s0 f8, p0 = f1, f8
+ // p7 = 1 if denormal input
+ cmp.gtu p7, p0 = GR_CONST3, GR_SIGNIF
+}
+{.mfb
+ nop.m 0
+ // if argument is 0, +/-Infinity, NaN, or NaTVal, then return
+ (p12) fma.d.s0 f8 = f8, f1, f0
+ (p12) br.ret.spnt b0 ;;
}
+
{.mmi
- // get exponent
- (p7) getf.exp r24=f14
- // get normalized significand
- (p7) getf.sig r23=f14
- // r28=bias-(2^{12}-1)
- mov r28=0xf000;;
+ // get exponent (for denormal input)
+ (p7) getf.exp GR_ARGEXP = FR_ARG
+ // get normalized significand (for denormal input)
+ (p7) getf.sig GR_SIGNIF = FR_ARG
+ // GR_CONST1 = bias-(2^{12}-1)
+ mov GR_CONST1 = 0xf000 ;;
}
+
{.mii
- // get r26=sign
- and r26=r24,r25
- // eliminate leading 1 from r23=1st table index
- shl r23=r23,1
- // eliminate sign from exponent (r25)
- andcm r25=r24,r25;;
+ // get GR_SIGN = sign
+ and GR_SIGN = GR_ARGEXP, GR_CONST2
+ // eliminate leading 1 from GR_I1 = 1st table index
+ shl GR_I1 = GR_SIGNIF, 1
+ // eliminate sign from exponent
+ andcm GR_EXP = GR_ARGEXP, GR_CONST2 ;;
}
+
{.mib
- add r2=32,r3
- // r23=1st table index (y_index,8 bits)
- shr.u r23=r23,56
- nop.b 0
+ add GR_ADDR2 = 32, GR_ADDR
+ // GR_IT1 = 1st table index (y_index, 8 bits)
+ shr.u GR_IT1 = GR_I1, 56
+ nop.b 0
}
{.mib
- // load C_1
- ldfe f7=[r3],16
- // subtract bias from r25=exponent
- sub r25=r25,r28
- nop.b 0;;
+ // load C_1
+ ldfe FR_COEFF1 = [ GR_ADDR ], 16
+ // subtract bias from GR_EXPON = exponent
+ sub GR_EXPON = GR_EXP, GR_CONST1
+ nop.b 0 ;;
}
+
{.mib
- // load C_2, C_3
- ldfpd f9,f10=[r3]
- // 1: exponent*=5; // (2^{16}-1)/3=0x5555
- shladd r24=r25,2,r25
- nop.b 0
+ // load C_2, C_3
+ ldfpd FR_COEFF2, FR_COEFF3 = [ GR_ADDR ]
+ // 1: exponent *= 5; // (2^{16}-1)/3 = 0x5555
+ shladd GR_TMP1 = GR_EXPON, 2, GR_EXPON
+ nop.b 0
}
{.mib
- // load C_4, C_5
- ldfpd f11,f12=[r2],16
- // r23=3*y_index
- shladd r23=r23,1,r23
- nop.b 0;;
+ // load C_4, C_5
+ ldfpd FR_COEFF4, FR_COEFF5 = [ GR_ADDR2 ], 16
+ // GR_TMP2 = 3*y_index
+ shladd GR_TMP2 = GR_IT1, 1, GR_IT1
+ nop.b 0 ;;
}
{.mfi
- // r30=(5*expon)*16+5*expon=(0x55)*expon
- shladd r30=r24,4,r24
- // r=1-a*y
- (p6) fnma.s1 f6=f8,f14,f1
- // adjust T_table pointer by 1st index
- shladd r2=r23,3,r2;;
+ // GR_TMP6 = (5*expon)*16+5*expon = (0x55)*expon
+ shladd GR_TMP6 = GR_TMP1, 4, GR_TMP1
+ // r = 1-a*y
+ fnma.s1 FR_R = f8, FR_ARG, f1
+ // adjust T_table pointer by 1st index
+ shladd GR_ITB1 = GR_TMP2, 3, GR_ADDR2 ;;
}
{.mii
- nop.m 0
- // r24=(0x5500)*expon
- shl r24=r30,8;;
- // r24=(0x5555)*expon
- add r24=r24,r30;;
+ // eliminate leading 1 from significand
+ add GR_SIGNIF2 = GR_SIGNIF, GR_SIGNIF
+ // GR_TMP3 = (0x5500)*expon
+ shl GR_TMP3 = GR_TMP6, 8 ;;
+ // GR_TMP4 = (0x5555)*expon
+ add GR_TMP4 = GR_TMP3, GR_TMP6 ;;
}
+
{.mii
- // r24=(0x5556)*expon // 0x5556=(2^{16}+2)/3
- add r24=r24,r25
- nop.i 0;;
- // r24=floor(expon/3)
- shr r24=r24,16;;
+ // GR_TMP5 = (0x5556)*expon // 0x5556 = (2^{16}+2)/3
+ add GR_TMP5 = GR_TMP4, GR_EXPON
+ nop.i 0 ;;
+ // GR_EXP_BY_3 = floor(expon/3)
+ shr GR_EXP_BY_3 = GR_TMP5, 16 ;;
}
+
{.mfi
- // r28=3*exponent
- shladd r28=r24,1,r24
- // r2=r*r
- (p6) fma.s1 f13=f6,f6,f0
- // bias exponent
- add r24=r29,r24;;
+ // GR_TMP6 = 3*floor(expon/3)
+ shladd GR_TMP6 = GR_EXP_BY_3, 1, GR_EXP_BY_3
+ // r*r
+ fma.s1 FR_R2 = FR_R, FR_R, f0
+ // bias exponent
+ add GR_EBIAS = GR_CONST4, GR_EXP_BY_3 ;;
}
+
{.mfi
- // get remainder of exponent/3 : r25-r28
- sub r25=r25,r28
- // c2+c3*r
- (p6) fma.s1 f9=f10,f6,f9
- // add sign to exponent
- or r24=r24,r26
+ // get remainder of exponent/3
+ sub GR_REM = GR_EXPON, GR_TMP6
+ // c2+c3*r
+ fma.s1 FR_P23 = FR_COEFF3, FR_R, FR_COEFF2
+ nop.i 0
}
{.mfi
- nop.m 0
- // c4+c5*r
- (p6) fma.s1 f11=f12,f6,f11
- nop.i 0;;
+ // add sign to exponent
+ or GR_SEXP = GR_EBIAS, GR_SIGN
+ // c4+c5*r
+ fma.s1 FR_P45 = FR_COEFF5, FR_R, FR_COEFF4
+ mov GR_TMP63 = 63+0xffff ;;
}
+
{.mmi
- // f14=sign*2^{exponent/3}
- (p6) setf.exp f14=r24
- // adjust T_table pointer by 2nd index
- shladd r2=r25,3,r2
- nop.i 0;;
+ // FR_2EXP = sign*2^{exponent/3}
+ setf.exp FR_2EXP = GR_SEXP
+ // adjust T_table pointer by 2nd index
+ shladd GR_INDEX = GR_REM, 3, GR_ITB1
+ // is the argument of the form 2^(3*k) ?
+ // get (significand - leading 1) | (exponent mod 3)
+ or GR_TEST = GR_REM, GR_SIGNIF2 ;;
}
+
{.mmi
- // load T
- (p6) ldf8 f8=[r2]
- nop.m 0
- nop.i 0;;
+ // 2^63
+ setf.exp FR_TMP63 = GR_TMP63
+ // load T
+ ldf8 f8 = [ GR_INDEX ]
+ // is the argument of the form 2^(3*k) ?
+ cmp.eq p14, p0 = GR_TEST, r0 ;;
}
{.mfi
- nop.m 0
- // (c2+c3*r)+r^2*(c4+c5*r)
- (p6) fma.s1 f9=f11,f13,f9
- nop.i 0
+ nop.m 0
+ // (c2+c3*r)+r^2*(c4+c5*r)
+ fma.s1 FR_P25 = FR_P45, FR_R2, FR_P23
+ nop.i 0
}
{.mfi
- nop.m 0
- // c1*r
- (p6) fma.s1 f7=f7,f6,f0
- nop.i 0;;
+ nop.m 0
+ // c1*r
+ fma.s1 FR_P1 = FR_COEFF1, FR_R, f0
+ nop.i 0 ;;
+}
+
+{.mfb
+ nop.m 0
+ (p14) fma.d.s0 f8 = FR_2EXP, FR_TMP63, f0
+ (p14) br.ret.spnt b0 ;;
}
{.mfi
- nop.m 0
- // P=c1*r+r^2*[(c2+c3*r)+r^2*(c4+c5*r)]
- (p6) fma.s1 f9=f9,f13,f7
- nop.i 0
+ nop.m 0
+ // P = c1*r+r^2* [ (c2+c3*r)+r^2*(c4+c5*r) ]
+ fma.s1 FR_P15 = FR_P25, FR_R2, FR_P1
+ nop.i 0
}
{.mfi
- nop.m 0
- // T'=T*(2^exp)
- (p6) fma.s1 f8=f8,f14,f0
- nop.i 0;;
+ nop.m 0
+ // T' = T*(2^exp)
+ fma.s1 f8 = f8, FR_2EXP, f0
+ nop.i 0 ;;
}
+
{.mfb
- nop.m 0
- // result = T'-T'*P
- (p6) fnma.d.s0 f8=f8,f9,f8
- br.ret.sptk b0;;
+ nop.m 0
+ // result = T'+T'*P
+ fma.d.s0 f8 = f8, FR_P15, f8
+ br.ret.sptk b0 ;;
}
-.endp cbrt
-ASM_SIZE_DIRECTIVE(cbrt)
+
+
+GLOBAL_LIBM_END(cbrt)
diff --git a/sysdeps/ia64/fpu/s_cbrtf.S b/sysdeps/ia64/fpu/s_cbrtf.S
index 20167797b8..c8c6500b25 100644
--- a/sysdeps/ia64/fpu/s_cbrtf.S
+++ b/sysdeps/ia64/fpu/s_cbrtf.S
@@ -1,11 +1,10 @@
-.file "cbrtf.asm"
+.file "cbrtf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
-// Bob Norin, Shane Story, and Ping Tak Peter Tang
-// of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -21,27 +20,30 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00: Initial version
-// 5/18/00: New version (modified algorithm)
+// 02/02/00 Initial version
+// 05/18/00 New version (modified algorithm)
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/28/03 Rescheduled some instructions for better performance
+// on Itanium 2, and reformatted
//
// API
//==============================================================
@@ -53,616 +55,710 @@
//
// Implementation
//
-// cbrt(a) = cbrt(a y) / cbrt(y)
-// = cbrt(1 - (1 - a y)) * 1/cbrt(y)
+// Let y= frcpa(a), where a is the argument
//
-// where y = frcpa(a).
+// cbrt(a)= cbrt(a*y)/cbrt(y) = cbrt(1 - (1-a*y)) * (1/cbrt(y))
//
-// * cbrt(1 - (1 - a y)) is approximated by a degree-2 polynomial
-//
-// 1 - (1/3)*r - (1/9)*r^2
-//
-// in r = 1 - a y.
+// For all values of y, the 3 possible significands of 1/cbrt(y)
+// are stored in a table (T0) to 64 bits of accuracy. (There are
+// 3 possible significands because the exponent of y modulo 3
+// can be 0, 1, or 2.)
//
-// * The values 1/cbrt(y) are stored in a table of constants T0
-// to 64 bits of accuracy
+//
+// * cbrt(1 - (1-a*y)) is approximated by a degree-2 polynomial
+//
+// 1 - (1/3)*r - (1/9)*r^2
+//
+// in r = 1-a*y.
//
// The table values are stored for three exponent values and are
-// then multiplied by e/3 where e is the exponent of the input number.
+// then multiplied by 2^(e/3) where e is the exponent of the input number.
// This computation is carried out in parallel with the polynomial
// evaluation:
//
-// T = 2^(e/3) * T0
+// T= 2^(e/3) * T0
//===============
-// input = x
-// C = frcpa(x)
-// r = 1 - C * x
+// input= x
+// C= frcpa(x)
+// r= 1 - C * x
//
-// Special values
+// Special values
//==============================================================
// Registers used
//==============================================================
-// f6-f15
-// r2, r23-r26, r28-r30
-// p6,p7,p8,p12
+// p6, p7, p8, p12
+
+ FR_R = f6
+ FR_COEFF1 = f7
+ FR_COEFF2 = f9
+ FR_T0 = f10
+ FR_T1 = f11
+ FR_T2 = f12
+ FR_2M63 = f13
+ FR_ARG = f14
+ FR_Y = f15
+
+ GR_GP = r2
+ GR_ADDR = r2
+ GR_TMP5 = r3
+ GR_CONST = r8
+ GR_TMP63 = r8
+ GR_SIGN = r9
+ GR_CT2 = r10
+ GR_CT3 = r11
+ GR_TMP4 = r14
+ GR_EBIAS3 = r15
+ GR_REM = r16
+ GR_SEXP = r17
+ GR_2P63 = r18
+ GR_SIGNIF = r19
+ GR_I1 = r20
+ GR_EBIAS = r21
+ GR_EXP = r22
+ GR_IT1 = r23
+ GR_E5 = r24
+ GR_IT1_3 = r25
+ GR_TP1 = r26
+ GR_TMP = r27
+ GR_TMP2 = r28
+ GR_TMP3 = r29
+ GR_EXP3 = r30
+ GR_ARGEXP = r31
+
+
-#include "libm_support.h"
// Data tables
//==============================================================
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+RODATA
.align 16
-poly_coeffs:
-ASM_TYPE_DIRECTIVE(poly_coeffs,@object)
-data8 0xaaaaaaaaaaaaaaab, 0x00003ffd // 1/3
-data8 0xe38e38e38e38e38e, 0x00003ffb // 1/9
-ASM_SIZE_DIRECTIVE(poly_coeffs)
-
-
-T_table:
-ASM_TYPE_DIRECTIVE(T_table,@object)
-
-data8 0x80155c748c374836, 0xa160019ed37fb4ae
-data8 0xcb51ddcb9e93095e, 0x8040404b0879f7f9
-data8 0xa1960b5966da4608, 0xcb95f333968ad59b
-data8 0x806b5dce4b405c10, 0xa1cc5dbe6dc2aab4
-data8 0xcbda64292d3ffd97, 0x8096b586974669b1
-data8 0xa202f97995b69c0d, 0xcc1f3184af961596
-data8 0x80bcd273d952a028, 0xa232fe6eb0c0577d
-data8 0xcc5bb1ac954d33e2, 0x80e898c52813f2f3
-data8 0xa26a2582012f6e17, 0xcca12e9831fc6402
-data8 0x81149add67c2d208, 0xa2a197e5d10465cb
-data8 0xcce70a67b64f24ad, 0x813b4e2c856b6e9a
-data8 0xa2d25a532efefbc8, 0xcd24794726477ea5
-data8 0x8167c1dde03de7aa, 0xa30a5bd6e49e4ab8
-data8 0xcd6b096a0b70ee87, 0x818ed973b811135e
-data8 0xa33b9c9b59879e24, 0xcda9177738b15a90
-data8 0x81bbc0c33e13ec98, 0xa3742fca6a3c1f21
-data8 0xcdf05f2247dffab9, 0x81e33e69fbe7504a
-data8 0xa3a5f1273887bf22, 0xce2f0f347f96f906
-data8 0x820aec524e3c23e9, 0xa3d7ef508ff11574
-data8 0xce6e0be0cd551a61, 0x823880f78e70b805
-data8 0xa4115ce30548bc15, 0xceb666b2c347d1de
-data8 0x826097a62a8e5200, 0xa443df0e53df577a
-data8 0xcef609b0cb874f00, 0x8288dfe00e9b5eaf
-data8 0xa4769fa5913c0ec3, 0xcf35fb5447e5c765
-data8 0x82b15a10c5371624, 0xa4a99f303bc7def5
-data8 0xcf763c47ee869f00, 0x82da06a527b18937
-data8 0xa4dcde37779adf4b, 0xcfb6cd3888d71785
-data8 0x8302e60b635ab394, 0xa5105d46152c938a
-data8 0xcff7aed4fbfbb447, 0x832bf8b2feec2f0e
-data8 0xa5441ce89825cb8d, 0xd038e1ce5167e3c6
-data8 0x83553f0ce00e276b, 0xa5781dad3e54d899
-data8 0xd07a66d7bfa0ebba, 0x837eb98b50f8322a
-data8 0xa5ac602406c4e68c, 0xd0bc3ea6b32d1b21
-data8 0x83a270f44c84f699, 0xa5d9601d95c2c0bc
-data8 0xd0f4f0e8f36c1bf8, 0x83cc4d7cfcfac5ca
-data8 0xa60e1e1a2de14745, 0xd1376458e34b037e
-data8 0x83f65f78a8872b4c, 0xa6431f6e3fbd9658
-data8 0xd17a2ca133f78572, 0x8420a75f2f7b53c8
-data8 0xa67864b0d432fda4, 0xd1bd4a80301c5715
-data8 0x844510461ff14209, 0xa6a6444aa0243c0b
-data8 0xd1f71682b2fa4575, 0x846fbd91b930bed2
-data8 0xa6dc094d10f25792, 0xd23ad555f773f059
-data8 0x84947e18234f3294, 0xa70a574cc02bba69
-data8 0xd2752c7039a5bf73, 0x84bf92755825045a
-data8 0xa7409e2af9549084, 0xd2b98ee008c06b59
-data8 0x84e4ac0ee112ba51, 0xa76f5c64ca2cf13b
-data8 0xd2f4735ffd700280, 0x8509ef44b86f20be
-data8 0xa79e4f0babab5dc0, 0xd32f99ed6d9ac0e1
-data8 0x85359d5d91768427, 0xa7d5579ae5164b85
-data8 0xd374f0666c75d51c, 0x855b3bd5b7384357
-data8 0xa804bd3c6fe61cc8, 0xd3b0a7d13618e4a1
-data8 0x858104f0c415f79a, 0xa8345895e5250a5a
-data8 0xd3eca2ea53bcec0c, 0x85a6f90390d29864
-data8 0xa8642a122b44ef0b, 0xd428e23874f13a17
-data8 0x85d3772fcd56a1dd, 0xa89c38ca18f6108b
-data8 0xd46f82fe293bc6d3, 0x85f9c982fcc002f3
-data8 0xa8cc81063b6e87ca, 0xd4ac57e9b7186420
-data8 0x862047e0e7ea554b, 0xa8fd00bfa409285e
-data8 0xd4e972becb04e8b8, 0x8646f2a26f7f5852
-data8 0xa92db8664d5516da, 0xd526d40a7a9b43a3
-data8 0x866dca21754096b5, 0xa95ea86b75cc2c20
-data8 0xd5647c5b73917370, 0x8694ceb8dfd17a37
-data8 0xa98fd141a4992deb, 0xd5a26c4201bd6d13
-data8 0x86bc00c49e9307e8, 0xa9c1335cae7446ba
-data8 0xd5e0a45015350a7e, 0x86dccd74fce79610
-data8 0xa9ea8686f556f645, 0xd614b539c6194104
-data8 0x870453c845acf90f, 0xaa1c52d17906bb19
-data8 0xd6537310e224283f, 0x872c089a1e90342c
-data8 0xaa4e59b046dab887, 0xd6927ab62244c917
-data8 0x8753ec4a92d16c5e, 0xaa809b9c60d1890b
-data8 0xd6d1ccc1fc4ef4b7, 0x877bff3aca19f6b4
-data8 0xaab319102f3f9b33, 0xd71169cea98fdded
-data8 0x879d88b6fe1c324c, 0xaadd5a18c1e21274
-data8 0xd746a66a5bc9f6d9, 0x87c5f346dbf98c3a
-data8 0xab1045f2ac31bdf5, 0xd786ce8f0fae5317
-data8 0x87e7c653efacef2c, 0xab3ae3ab2df7231e
-data8 0xd7bc7ff214c4e75a, 0x881089d4e73ffefc
-data8 0xab6e3f945d1e96fc, 0xd7fd35467a517ed1
-data8 0x88397e6a366f2a8a, 0xaba1d953a08fa94e
-data8 0xd83e38838648d815, 0x885bc559e5e1c081
-data8 0xabcd090db7ef4c3f, 0xd874a1db598b8951
-data8 0x887e2ee392bb7a93, 0xabf864602d7c323d
-data8 0xd8ab42205b80edaf, 0x88a7a8587e404257
-data8 0xac2ca5886ccf9b57, 0xd8ed1849d202f965
-data8 0x88ca5eda67594784, 0xac5861d4aa441f0f
-data8 0xd92432bd5a173685, 0x88f4356166bd590e
-data8 0xac8d183fe3a2fbed, 0xd9669ca45b03c23e
-data8 0x89173a0acf5ce026, 0xacb93703ff51571e
-data8 0xd99e3327cf89574e, 0x893a62a098b6a57b
-data8 0xace5830ad0c3f14b, 0xd9d602b19b100466
-data8 0x895daf637236ae2c, 0xad11fca5d78b3ff2
-data8 0xda0e0ba86c096841, 0x89883b9d1c2fa9c5
-data8 0xad4797fddf91a798, 0xda5195fcdb1c3dce
-data8 0x89abd8dd374a5d7b, 0xad747701e559ebcb
-data8 0xda8a1eb87a491f6c, 0x89cf9b1dcd197fa0
-data8 0xada184a47e9c7613, 0xdac2e230b91c3f84
-data8 0x89f382a258ea79de, 0xadcec13ab0dda8ff
-data8 0xdafbe0d0b66aea30, 0x8a178faf06648f29
-data8 0xadfc2d1a5fd21ba8, 0xdb351b04a8fafced
-data8 0x8a3bc288b3e1d18a, 0xae29c89a5053c33a
-data8 0xdb6e9139e33cdd8e, 0x8a601b74f4d1f835
-data8 0xae5794122b638df9, 0xdba843ded7151ea1
-data8 0x8a849aba14274764, 0xae858fda8137ae0a
-data8 0xdbe2336319b61fc8, 0x8aa9409f16cdbc9b
-data8 0xaeb3bc4ccc56d3d1, 0xdc1c60376789fa68
-data8 0x8ace0d6bbe2cb316, 0xaee219c374c09920
-data8 0xdc56cacda82d0cd5, 0x8af301688ab33558
-data8 0xaf10a899d3235fe7, 0xdc917398f2797814
-data8 0x8b181cdebe6f3206, 0xaf3f692c341fe8b4
-data8 0xdccc5b0d90a3e628, 0x8b3d60185fafcb7c
-data8 0xaf6e5bd7db9ae6c2, 0xdd0781a10469f0f2
-data8 0x8b62cb603bb2fad0, 0xaf9d80fb081cd91b
-data8 0xdd42e7ca0b52838f, 0x8b80d7d6bc4104de
-data8 0xafc35ce063eb3787, 0xdd729ad01c69114d
-data8 0x8ba68bf73ac74f39, 0xaff2ddcb5f28f03d
-data8 0xddae749c001fbf5e, 0x8bcc68fb9f9f7335
-data8 0xb022923b148e05c5, 0xddea8f50a51c69b1
-data8 0x8bf26f31c534fca2, 0xb0527a919adbf58b
-data8 0xde26eb69a0f0f111, 0x8c10f86e13a1a1f9
-data8 0xb078f3ab1d701c65, 0xde576480262399bc
-data8 0x8c3749916cc6abb5, 0xb0a93a6870649f31
-data8 0xde943789645933c8, 0x8c5dc4c4f7706032
-data8 0xb0d9b624d62ec856, 0xded14d58139a28af
-data8 0x8c7cac3a8c42e3e0, 0xb100a5f53fb3c8e1
-data8 0xdf025c00bbf2b5c7, 0x8ca373f1b7bf2716
-data8 0xb131821882f5540a, 0xdf3feb44d723a713
-data8 0x8cc29907fb951294, 0xb158bf8e4cb04055
-data8 0xdf715bc16c159be0, 0x8ce9ae4e9492aac8
-data8 0xb189fd69d56b238f, 0xdfaf66240e29cda8
-data8 0x8d0911dddbfdad0e, 0xb1b189958e8108e4
-data8 0xdfe139cbf6e19bdc, 0x8d3075c4f20f04ee
-data8 0xb1e32a8165b09832, 0xe01fc0fe94d9fc52
-data8 0x8d5018a9d4de77d5, 0xb20b0678fc271eec
-data8 0xe051f92ffcc0bd60, 0x8d77cc47dd143515
-data8 0xb23d0bd3f7592b6e, 0xe090feec9c9a06ac
-data8 0x8d97af6352739cb7, 0xb26538b2db8420dc
-data8 0xe0c39d0c9ff862d6, 0x8db7af523167800f
-data8 0xb28d89e339ceca14, 0xe0f668eeb99f188d
-data8 0x8ddfd80bc68c32ff, 0xb2c022ca12e55a16
-data8 0xe1362890eb663139, 0x8e00197e1e7c88fe
-data8 0xb2e8c6852c6b03f1, 0xe1695c7212aecbaa
-data8 0x8e207859f77e20e7, 0xb3118f4eda9fe40f
-data8 0xe19cbf0391bbbbe9, 0x8e40f4ce60c9f8e2
-data8 0xb33a7d6268109ebe, 0xe1d050901c531e85
-data8 0x8e69ba46cf2fde4d, 0xb36ddbc5ea70ec55
-data8 0xe2110903b4f4047a, 0x8e8a7a00bd7ae63e
-data8 0xb3971e9b39264023, 0xe2450559b4d80b6d
-data8 0x8eab57ef1cf2f529, 0xb3c0877ecc18e24a
-data8 0xe27931a231554ef3, 0x8ecc5442cffb1dad
-data8 0xb3ea16ae3a6c905f, 0xe2ad8e2ac3c5b04b
-data8 0x8eed6f2d2a4acbfe, 0xb413cc67aa0e4d2d
-data8 0xe2e21b41b9694cce, 0x8f0ea8dff24441ff
-data8 0xb43da8e9d163e1af, 0xe316d93615862714
-data8 0x8f385c95d696b817, 0xb47233773b84d425
-data8 0xe3590bd86a0d30f9, 0x8f59dc43edd930f3
-data8 0xb49c6825430fe730, 0xe38e38e38e38e38e
-data8 0x8f7b7b5f5ffad1c4, 0xb4c6c46bcdb27dcf
-data8 0xe3c397d1e6db7839, 0x8f9d3a1bea165f38
-data8 0xb4f1488c0b35d26f, 0xe3f928f5953feb9e
-data8 0x8fbf18adc34b66da, 0xb51bf4c7c51f0168
-data8 0xe42eeca17c62886c, 0x8fe117499e356095
-data8 0xb546c9616087ab9c, 0xe464e32943446305
-data8 0x90033624aa685f8d, 0xb571c69bdffd9a70
-data8 0xe49b0ce15747a8a2, 0x9025757495f36b86
-data8 0xb59cecbae56984c3, 0xe4d16a1eee94e9d4
-data8 0x903f3a5dcc091203, 0xb5bd64512bb14bb7
-data8 0xe4fa52107353f67d, 0x9061b2fceb2bdbab
-data8 0xb5e8d2a4bf5ba416, 0xe5310a471f4d2dc3
-data8 0x90844ca7211032a7, 0xb6146a9a1bc47819
-data8 0xe567f6f1c2b9c224, 0x90a7079403e6a15d
-data8 0xb6402c7749d621c0, 0xe59f18689a9e4c9a
-data8 0x90c9e3fbafd63799, 0xb66c1882fb435ea2
-data8 0xe5d66f04b8a68ecf, 0x90ece216c8a16ee4
-data8 0xb6982f048c999a56, 0xe60dfb2005c192e9
-data8 0x9110021e7b516f0a, 0xb6c47044075b4142
-data8 0xe645bd1544c7ea51, 0x912a708a39be9075
-data8 0xb6e5bd6bfd02bafd, 0xe66fb21b505b20a0
-data8 0x914dcc7b31146370, 0xb7124a2736ff8ef2
-data8 0xe6a7d32af4a7c59a, 0x91714af8cfe984d5
-data8 0xb73f026a01e94177, 0xe6e02b129c6a5ae4
-data8 0x918c00a6f3795e97, 0xb760a959f1d0a7a7
-data8 0xe70a9136a7403039, 0x91afbc299ed0295d
-data8 0xb78dae7e06868ab0, 0xe74349fb2d92a589
-data8 0x91d39add3e958db0, 0xb7badff8ad9e4e02
-data8 0xe77c3a9c86ed7d42, 0x91ee9920a8974d92
-data8 0xb7dce25b8e17ae9f, 0xe7a713f88151518a
-data8 0x9212b5fcac537c19, 0xb80a6226904045e2
-data8 0xe7e067453317ed2b, 0x9236f6b256923fcf
-data8 0xb8380f1cafd73c1c, 0xe819f37a81871bb5
-data8 0x92523ee6f90dcfc3, 0xb85a6ea8e321b4d8
-data8 0xe8454236bfaeca14, 0x9276bef031e6eb79
-data8 0xb8886b684ae7d2fa, 0xe87f32f24c3fc90e
-data8 0x929236ec237a24ad, 0xb8ab0726fa00cf5d
-data8 0xe8aacd8688892ba6, 0x92b6f70b7efe9dc3
-data8 0xb8d954a4d13b7cb1, 0xe8e523fd32f606f7
-data8 0x92d29f61eec7dc2b, 0xb8fc2d4f6cd9f04a
-data8 0xe9110b5311407927, 0x92f7a05d5b8ba92f
-data8 0xb92acc851476b1ab, 0xe94bc8bf0c108fa3
-data8 0x931379a403be5c16, 0xb94de2d841a184c2
-data8 0xe977fdc439c2ca3c, 0x9338bc44de2e3f34
-data8 0xb97cd4c36c92693c, 0xe9b3236528fc349e
-data8 0x9354c71412c69486, 0xb9a0297f172665e3
-data8 0xe9dfa70b745ac1b4, 0x937a4c273907e262
-data8 0xb9cf6f21e36c3924, 0xea1b36268d0eaa38
-data8 0x93968919f6e7975d, 0xb9f3030951267208
-data8 0xea480963fd394197, 0x93bc516fdd4680c9
-data8 0xba229d6a618e7c59, 0xea84034425f27484
-data8 0x93d8c123d9be59b2, 0xba467144459f9855
-data8 0xeab12713138dd1cc, 0x93f546c955e60076
-data8 0xba6a60c3c48f1a4b, 0xeade6db73a5e503b
-data8 0x941b70a65879079f, 0xba9a76056b67ee7a
-data8 0xeb1b0268343b121b, 0x943829f337410591
-data8 0xbabea699563ada6e, 0xeb489b0b2bdb5f14
-data8 0x9454f995765bc4d2, 0xbae2f350b262cc4b
-data8 0xeb765721e85f03d0, 0x947b86b57f5842ed
-data8 0xbb1385a23be24e57, 0xebb389645f222f62
-data8 0x94988aeb23470f86, 0xbb3814975e17c680
-data8 0xebe198f090607e0c, 0x94b5a5dc9695f42a
-data8 0xbb5cc031009bf467, 0xec0fcc9321024509
-data8 0x94d2d7a9170d8b42, 0xbb81889680024764
-data8 0xec3e247da8b82f61, 0x94f9e87dd78bf019
-data8 0xbbb2c0d8703ae95d, 0xec7c27d21321c9f7
-data8 0x95175019a503d89e, 0xbbd7cd09ba3c5463
-data8 0xecaad5278824e453, 0x9534cefa625fcb3a
-data8 0xbbfcf68c4977718f, 0xecd9a76d097d4e77
-data8 0x955265405c491a25, 0xbc223d88cfc88eee
-data8 0xed089ed5dcd99446, 0x9570130c1f9bb857
-data8 0xbc47a2284fee4ff8, 0xed37bb95add09a1c
-data8 0x9597ca4119525184, 0xbc79ac0916ed7b8a
-data8 0xed76c70508f904b6, 0x95b5af6fb5aa4d3c
-data8 0xbc9f5670d1a13030, 0xeda63bb05e7f93c6
-data8 0x95d3ac9273aafd7a, 0xbcc51f068cb95c1d
-data8 0xedd5d661daed2dc4, 0x95f1c1cafdfd3684
-data8 0xbceb05f4b30a9bc0, 0xee05974eef86b903
-data8 0x960fef3b430b8d5f, 0xbd110b6604c7d306
-data8 0xee357ead791fc670, 0x962e350575b409c5
-data8 0xbd372f8598620f19, 0xee658cb3c134a463
-data8 0x964c934c0dfc1708, 0xbd5d727edb6b3c7e
-data8 0xee95c1987f080211, 0x966b0a31c9c6bc7d
-data8 0xbd83d47d937bbc6d, 0xeec61d92d8c4314f
-data8 0x968999d9ad8d264e, 0xbdaa55addf1ae47d
-data8 0xeef6a0da64a014ac, 0x96a8426705198795
-data8 0xbdd0f63c36aa73f0, 0xef274ba72a07c811
-data8 0x96c703fd64445ee5, 0xbdf7b6556d550a15
-data8 0xef581e31a2c91260, 0x96e5dec0a7b4268d
-data8 0xbe1e9626b1ffa96b, 0xef8918b2bc43aec6
-data8 0x9704d2d4f59f79f3, 0xbe4595dd903e5371
-data8 0xefba3b63d89d7cbf, 0x9723e05ebe91b9b0
-data8 0xbe6cb5a7f14bc935, 0xefeb867ecffaa607
-data8 0x97430782be323831, 0xbe93f5b41d047cf7
-data8 0xf01cfa3df1b9c9fa, 0x97624865fc0df8bf
-data8 0xbebb5630bae4c15f, 0xf04e96dc05b43e2d
-data8 0x9781a32dcc640b2a, 0xbee2d74cd30a430c
-data8 0xf0805c944d827454, 0x97a117ffd0f48e46
-data8 0xbf0a7937cf38d981, 0xf0b24ba285c495cb
-data8 0x97c0a701f9d263c9, 0xbf323c217be2bc8c
-data8 0xf0e46442e76f6569, 0x97e0505a8637a036
-data8 0xbf5a203a09342bbb, 0xf116a6b2291d7896
-data8 0x97f57a9fb0b08c6e, 0xbf74cad1c14ebfc4
-data8 0xf1383fa9e9b5b381, 0x9815503365914a9d
-data8 0xbf9ce6a497a89f78, 0xf16ac84f90083b9b
-data8 0x98354085054fd204, 0xbfc52428bec6e72f
-data8 0xf19d7b686dcb03d7, 0x98554bbbf8a77902
-data8 0xbfed838fddab024b, 0xf1d0593311db1757
-data8 0x987571fffb7f94f6, 0xc016050c0420981a
-data8 0xf20361ee8f1c711e, 0x9895b3791dd03c23
-data8 0xc03ea8cfabddc330, 0xf23695da7de51d3f
-data8 0x98ab43a5fc65d0c8, 0xc059d3cbd65ddbce
-data8 0xf258d095e465cc35, 0x98cbb2d196bd713d
-data8 0xc082b122a3c78c9d, 0xf28c4d0bfc982b34
-data8 0x98ec3d9ec7b6f21a, 0xc0abb1499ae736c4
-data8 0xf2bff55eb3f0ea71, 0x990ce436db5e8344
-data8 0xc0d4d474c3aedaaf, 0xf2f3c9cf9884636e
-data8 0x9922b8218160967a, 0xc0f054ca33eb3437
-data8 0xf31670135ab9cc0f, 0x99438d686f75779d
-data8 0xc119b2c67e600ed0, 0xf34a8e9f0b54cdfb
-data8 0x99647eea131fa20b, 0xc1433453de2033ff
-data8 0xf37ed9fa6b8add3f, 0x997a85045a47c6d0
-data8 0xc15ef3e44e10032d, 0xf3a1cfe884ef6bb6
-data8 0x999ba5f14f8add02, 0xc188b130431d80e6
-data8 0xf3d66689dcc8e8d3, 0x99bce38b5465ecae
-data8 0xc1b2929d6067730e, 0xf40b2ab069d5c96a
-data8 0x99d31ca0887f30f9, 0xc1ce9268f31cc734
-data8 0xf42e718b90c8bc16, 0x99f48a669c74c09e
-data8 0xc1f8b0877c1b0c08, 0xf463822a0a3b4b00
-data8 0x9a16154eb445c873, 0xc222f35a87b415ba
-data8 0xf498c1076015faf8, 0x9a2c822ec198d667
-data8 0xc23f3467349e5c88, 0xf4bc5a19a33990b5
-data8 0x9a4e3e080cd91b78, 0xc269b4e40e088c01
-data8 0xf4f1e6a7d6f5425f, 0x9a70177afe52322e
-data8 0xc2945aac24daaf6e, 0xf527a232cf6be334
-data8 0x9a86b8fa94eebe10, 0xc2b0de05e43c1d66
-data8 0xf54b8ecdcda90851, 0x9aa8c42866ae2958
-data8 0xc2dbc275e1229d09, 0xf5819949c7ad87b4
-data8 0x9abf86f9e12fc45e, 0xc2f86fca9d80eeff
-data8 0xf5a5bac9213b48a9, 0x9ae1c462fc05f49d
-data8 0xc323938449a2587e, 0xf5dc1501f324a812
-data8 0x9af8a8dc936b84d0, 0xc3406b40a538ed20
-data8 0xf6006bee86b5589e, 0x9b1b19033be35730
-data8 0xc36bcee8211d15e0, 0xf63716b2fa067fa4
-data8 0x9b3da7daf04c2892, 0xc397593adf2ba366
-data8 0xf66df22fb6132b9c, 0x9b54c2e4c8a9012b
-data8 0xc3b475b6206155d5, 0xf6929fb98225deb1
-data8 0x9b77854e6c661200, 0xc3e0410243b97383
-data8 0xf6c9cd13021e3fea, 0x9b8ec2e678d56d2f
-data8 0xc3fd890709833d37, 0xf6eeb177472cedae
-data8 0x9ba60e6a5ca133b6, 0xc41ae295f7e7fa06
-data8 0xf713abf4cb0b3afb, 0x9bc919ea66a151a4
-data8 0xc44709f7bb8a4dd2, 0xf74b4d5333684ef1
-data8 0x9be0887c09ef82bb, 0xc4648fb0e0bec4c1
-data8 0xf7707f75a72f8e94, 0x9c03c8d5fffc3503
-data8 0xc490f9a94695ba14, 0xf7a874b97927af44
-data8 0x9c1b5ad21a81cbb9, 0xc4aeac0173b7d390
-data8 0xf7cddf140aedf1d8, 0x9c3ed09216e9ca02
-data8 0xc4db5941007aa853, 0xf806291bacb7f7a9
-data8 0x9c568656c0423def, 0xc4f938aec206291a
-data8 0xf82bcc43b92eafef, 0x9c7a320af242ce60
-data8 0xc52629e899dfd622, 0xf8646bf0defb759e
-data8 0x9c920bf7a8c01dc2, 0xc54436e44043b965
-data8 0xf88a487dfc3ff5f7, 0x9ca9f475d98b159c
-data8 0xc562563abf9ea07f, 0xf8b03c2b46cdc17f
-data8 0x9ccdeca60e80b5f8, 0xc58fa7d1dc42921c
-data8 0xf8e95541c152ae7a, 0x9ce5f9d4653d4902
-data8 0xc5adf561b91e110a, 0xf90f832c2700c160
-data8 0x9cfe15cb38bfdd8e, 0xc5cc5591bdbd82fa
-data8 0xf935c88e0c7f419b, 0x9d225b983f6c1f96
-data8 0xc5fa08f1ff20593c, 0xf96f5cd84fd86873
-data8 0x9d3a9cca32261ed7, 0xc618980a79ce6862
-data8 0xf995dd53ebdd9d6d, 0x9d52ecfccebe1768
-data8 0xc6373a09e34b50fa, 0xf9bc75a034436a41
-data8 0x9d77818d95b82f86, 0xc66550a6e0baaf35
-data8 0xf9f686f26d5518de, 0x9d8ff7893fa4706c
-data8 0xc6842241926342c9, 0xfa1d5b39b910a8c5
-data8 0x9da87cbef36f2a5e, 0xc6a3070b7c93bb9e
-data8 0xfa4447acc4ecbfd2, 0x9dcd6140b4a35aeb
-data8 0xc6d18260bb84081b, 0xfa7ed7e51e6fdfb4
-data8 0x9de60cd06dc6e2d4, 0xc6f0977c9416828b
-data8 0xfaa601394d49a1a0, 0x9dfec7d4cc43b76f
-data8 0xc70fc0117c641630, 0xfacd431644ce0e40
-data8 0x9e17925ec9fccc4a, 0xc72efc34d7e615be
-data8 0xfaf49d96f7a75909, 0x9e3cdf6db57dc075
-data8 0xc75dfb441594141e, 0xfb2fd3c65e562fd5
-data8 0x9e55d110b63637a8, 0xc77d68aa019bda4c
-data8 0xfb576c5762024805, 0x9e6ed27594550d2e
-data8 0xc79ce9ea478dbc4f, 0xfb7f1debc22c4040
-data8 0x9e87e3adc385d393, 0xc7bc7f1ae453219d
-data8 0xfba6e89f32d0190a, 0x9ead9b54b37a1055
-data8 0xc7ec0476e15e141a, 0xfbe2c803a0894893
-data8 0x9ec6d46a3d7de215, 0xc80bcbe16f1d540f
-data8 0xfc0ad1ff0ed9ecf0, 0x9ee01d9108be3154
-data8 0xc82ba78a5d349735, 0xfc32f57bdfbcbe7f
-data8 0x9ef976db07288d04, 0xc84b978847a06b87
-data8 0xfc5b32968f99b21c, 0x9f12e05a4759ec25
-data8 0xc86b9bf1ee817bc6, 0xfc83896bc861ab08
-data8 0x9f2c5a20f4da6668, 0xc88bb4de3667cdf4
-data8 0xfcabfa1861ed4815, 0x9f52af78ed1733ca
-data8 0xc8bc00e7fe9e23a3, 0xfce8d3cea7d3163e
-data8 0x9f6c52426a39d003, 0xc8dc4d7ff2d25232
-data8 0xfd118595143ee273, 0x9f860593d42fd7f3
-data8 0xc8fcaeebcb40eb47, 0xfd3a519943d4865a
-data8 0x9f9fc97fdb96bd51, 0xc91d25431426a663
-data8 0xfd6337f8e1ae5a4b, 0x9fb99e194f4a7037
-data8 0xc93db09d7fdb2949, 0xfd8c38d1c8e927eb
-data8 0x9fd383731ca51db9, 0xc95e5112e721582a
-data8 0xfdb5544205095a53, 0x9fed79a04fbf9423
-data8 0xc97f06bb49787677, 0xfdde8a67d2613531
-data8 0xa00780b413b24ee8, 0xc99fd1aecd6e1b06
-data8 0xfe07db619e781611, 0xa02eab2c4474b0cd
-data8 0xc9d12a3e27bb1625, 0xfe460768d80bf758
-data8 0xa048dcd51ccfd142, 0xc9f22ad82ba3d5f0
-data8 0xfe6f9bfb06cd32f6, 0xa0631fa894b11b8d
-data8 0xca134113105e67b2, 0xfe994bcd3d14fcc2
-data8 0xa07d73ba65e680af, 0xca346d07b045a876
-data8 0xfec316fecaf3f2ab, 0xa097d91e6aaf71b0
-data8 0xca55aecf0e94bb88, 0xfeecfdaf33fadb80
-data8 0xa0b24fe89e02602f, 0xca77068257be9bab
-data8 0xff16fffe2fa8fad6, 0xa0ccd82d1bd2f68b
-data8 0xca98743ae1c693a8, 0xff411e0ba9db886d
-data8 0xa0e77200215909e6, 0xcab9f8122c99a101
-data8 0xff6b57f7c33e4e9a, 0xa1021d760d584855
-data8 0xcadb9221e268c3b5, 0xff95ade2d1bd7358
-data8 0xa11cdaa36068a57d, 0xcafd4283d8043dfd
-data8 0xffc01fed60f86fb5, 0xa137a99cbd3f880b
-data8 0xcb1f09520d37c6fb, 0xffeaae3832b63956
-ASM_SIZE_DIRECTIVE(T_table)
-
-
-
-
-
-
-.align 32
-.global cbrtf#
+LOCAL_OBJECT_START(poly_coeffs)
+
+ data8 0xaaaab19b7e1f5ef9, 0x00003ffd // ~ 1/3
+ data8 0xe38e5192a5a8e56c, 0x00003ffb // ~ 1/9
+LOCAL_OBJECT_END(poly_coeffs)
+
+// For every entry B in the frcpa table, this table contains
+// the significands of cbrt(1/B), cbrt(2/B), cbrt(4/B).
+// The index to this table is the same as the frcpa index.
+
+LOCAL_OBJECT_START(T_table)
+
+ data8 0x80155c748c374836, 0xa160019ed37fb4ae
+ data8 0xcb51ddcb9e93095e, 0x8040404b0879f7f9
+ data8 0xa1960b5966da4608, 0xcb95f333968ad59b
+ data8 0x806b5dce4b405c10, 0xa1cc5dbe6dc2aab4
+ data8 0xcbda64292d3ffd97, 0x8096b586974669b1
+ data8 0xa202f97995b69c0d, 0xcc1f3184af961596
+ data8 0x80bcd273d952a028, 0xa232fe6eb0c0577d
+ data8 0xcc5bb1ac954d33e2, 0x80e898c52813f2f3
+ data8 0xa26a2582012f6e17, 0xcca12e9831fc6402
+ data8 0x81149add67c2d208, 0xa2a197e5d10465cb
+ data8 0xcce70a67b64f24ad, 0x813b4e2c856b6e9a
+ data8 0xa2d25a532efefbc8, 0xcd24794726477ea5
+ data8 0x8167c1dde03de7aa, 0xa30a5bd6e49e4ab8
+ data8 0xcd6b096a0b70ee87, 0x818ed973b811135e
+ data8 0xa33b9c9b59879e24, 0xcda9177738b15a90
+ data8 0x81bbc0c33e13ec98, 0xa3742fca6a3c1f21
+ data8 0xcdf05f2247dffab9, 0x81e33e69fbe7504a
+ data8 0xa3a5f1273887bf22, 0xce2f0f347f96f906
+ data8 0x820aec524e3c23e9, 0xa3d7ef508ff11574
+ data8 0xce6e0be0cd551a61, 0x823880f78e70b805
+ data8 0xa4115ce30548bc15, 0xceb666b2c347d1de
+ data8 0x826097a62a8e5200, 0xa443df0e53df577a
+ data8 0xcef609b0cb874f00, 0x8288dfe00e9b5eaf
+ data8 0xa4769fa5913c0ec3, 0xcf35fb5447e5c765
+ data8 0x82b15a10c5371624, 0xa4a99f303bc7def5
+ data8 0xcf763c47ee869f00, 0x82da06a527b18937
+ data8 0xa4dcde37779adf4b, 0xcfb6cd3888d71785
+ data8 0x8302e60b635ab394, 0xa5105d46152c938a
+ data8 0xcff7aed4fbfbb447, 0x832bf8b2feec2f0e
+ data8 0xa5441ce89825cb8d, 0xd038e1ce5167e3c6
+ data8 0x83553f0ce00e276b, 0xa5781dad3e54d899
+ data8 0xd07a66d7bfa0ebba, 0x837eb98b50f8322a
+ data8 0xa5ac602406c4e68c, 0xd0bc3ea6b32d1b21
+ data8 0x83a270f44c84f699, 0xa5d9601d95c2c0bc
+ data8 0xd0f4f0e8f36c1bf8, 0x83cc4d7cfcfac5ca
+ data8 0xa60e1e1a2de14745, 0xd1376458e34b037e
+ data8 0x83f65f78a8872b4c, 0xa6431f6e3fbd9658
+ data8 0xd17a2ca133f78572, 0x8420a75f2f7b53c8
+ data8 0xa67864b0d432fda4, 0xd1bd4a80301c5715
+ data8 0x844510461ff14209, 0xa6a6444aa0243c0b
+ data8 0xd1f71682b2fa4575, 0x846fbd91b930bed2
+ data8 0xa6dc094d10f25792, 0xd23ad555f773f059
+ data8 0x84947e18234f3294, 0xa70a574cc02bba69
+ data8 0xd2752c7039a5bf73, 0x84bf92755825045a
+ data8 0xa7409e2af9549084, 0xd2b98ee008c06b59
+ data8 0x84e4ac0ee112ba51, 0xa76f5c64ca2cf13b
+ data8 0xd2f4735ffd700280, 0x8509ef44b86f20be
+ data8 0xa79e4f0babab5dc0, 0xd32f99ed6d9ac0e1
+ data8 0x85359d5d91768427, 0xa7d5579ae5164b85
+ data8 0xd374f0666c75d51c, 0x855b3bd5b7384357
+ data8 0xa804bd3c6fe61cc8, 0xd3b0a7d13618e4a1
+ data8 0x858104f0c415f79a, 0xa8345895e5250a5a
+ data8 0xd3eca2ea53bcec0c, 0x85a6f90390d29864
+ data8 0xa8642a122b44ef0b, 0xd428e23874f13a17
+ data8 0x85d3772fcd56a1dd, 0xa89c38ca18f6108b
+ data8 0xd46f82fe293bc6d3, 0x85f9c982fcc002f3
+ data8 0xa8cc81063b6e87ca, 0xd4ac57e9b7186420
+ data8 0x862047e0e7ea554b, 0xa8fd00bfa409285e
+ data8 0xd4e972becb04e8b8, 0x8646f2a26f7f5852
+ data8 0xa92db8664d5516da, 0xd526d40a7a9b43a3
+ data8 0x866dca21754096b5, 0xa95ea86b75cc2c20
+ data8 0xd5647c5b73917370, 0x8694ceb8dfd17a37
+ data8 0xa98fd141a4992deb, 0xd5a26c4201bd6d13
+ data8 0x86bc00c49e9307e8, 0xa9c1335cae7446ba
+ data8 0xd5e0a45015350a7e, 0x86dccd74fce79610
+ data8 0xa9ea8686f556f645, 0xd614b539c6194104
+ data8 0x870453c845acf90f, 0xaa1c52d17906bb19
+ data8 0xd6537310e224283f, 0x872c089a1e90342c
+ data8 0xaa4e59b046dab887, 0xd6927ab62244c917
+ data8 0x8753ec4a92d16c5e, 0xaa809b9c60d1890b
+ data8 0xd6d1ccc1fc4ef4b7, 0x877bff3aca19f6b4
+ data8 0xaab319102f3f9b33, 0xd71169cea98fdded
+ data8 0x879d88b6fe1c324c, 0xaadd5a18c1e21274
+ data8 0xd746a66a5bc9f6d9, 0x87c5f346dbf98c3a
+ data8 0xab1045f2ac31bdf5, 0xd786ce8f0fae5317
+ data8 0x87e7c653efacef2c, 0xab3ae3ab2df7231e
+ data8 0xd7bc7ff214c4e75a, 0x881089d4e73ffefc
+ data8 0xab6e3f945d1e96fc, 0xd7fd35467a517ed1
+ data8 0x88397e6a366f2a8a, 0xaba1d953a08fa94e
+ data8 0xd83e38838648d815, 0x885bc559e5e1c081
+ data8 0xabcd090db7ef4c3f, 0xd874a1db598b8951
+ data8 0x887e2ee392bb7a93, 0xabf864602d7c323d
+ data8 0xd8ab42205b80edaf, 0x88a7a8587e404257
+ data8 0xac2ca5886ccf9b57, 0xd8ed1849d202f965
+ data8 0x88ca5eda67594784, 0xac5861d4aa441f0f
+ data8 0xd92432bd5a173685, 0x88f4356166bd590e
+ data8 0xac8d183fe3a2fbed, 0xd9669ca45b03c23e
+ data8 0x89173a0acf5ce026, 0xacb93703ff51571e
+ data8 0xd99e3327cf89574e, 0x893a62a098b6a57b
+ data8 0xace5830ad0c3f14b, 0xd9d602b19b100466
+ data8 0x895daf637236ae2c, 0xad11fca5d78b3ff2
+ data8 0xda0e0ba86c096841, 0x89883b9d1c2fa9c5
+ data8 0xad4797fddf91a798, 0xda5195fcdb1c3dce
+ data8 0x89abd8dd374a5d7b, 0xad747701e559ebcb
+ data8 0xda8a1eb87a491f6c, 0x89cf9b1dcd197fa0
+ data8 0xada184a47e9c7613, 0xdac2e230b91c3f84
+ data8 0x89f382a258ea79de, 0xadcec13ab0dda8ff
+ data8 0xdafbe0d0b66aea30, 0x8a178faf06648f29
+ data8 0xadfc2d1a5fd21ba8, 0xdb351b04a8fafced
+ data8 0x8a3bc288b3e1d18a, 0xae29c89a5053c33a
+ data8 0xdb6e9139e33cdd8e, 0x8a601b74f4d1f835
+ data8 0xae5794122b638df9, 0xdba843ded7151ea1
+ data8 0x8a849aba14274764, 0xae858fda8137ae0a
+ data8 0xdbe2336319b61fc8, 0x8aa9409f16cdbc9b
+ data8 0xaeb3bc4ccc56d3d1, 0xdc1c60376789fa68
+ data8 0x8ace0d6bbe2cb316, 0xaee219c374c09920
+ data8 0xdc56cacda82d0cd5, 0x8af301688ab33558
+ data8 0xaf10a899d3235fe7, 0xdc917398f2797814
+ data8 0x8b181cdebe6f3206, 0xaf3f692c341fe8b4
+ data8 0xdccc5b0d90a3e628, 0x8b3d60185fafcb7c
+ data8 0xaf6e5bd7db9ae6c2, 0xdd0781a10469f0f2
+ data8 0x8b62cb603bb2fad0, 0xaf9d80fb081cd91b
+ data8 0xdd42e7ca0b52838f, 0x8b80d7d6bc4104de
+ data8 0xafc35ce063eb3787, 0xdd729ad01c69114d
+ data8 0x8ba68bf73ac74f39, 0xaff2ddcb5f28f03d
+ data8 0xddae749c001fbf5e, 0x8bcc68fb9f9f7335
+ data8 0xb022923b148e05c5, 0xddea8f50a51c69b1
+ data8 0x8bf26f31c534fca2, 0xb0527a919adbf58b
+ data8 0xde26eb69a0f0f111, 0x8c10f86e13a1a1f9
+ data8 0xb078f3ab1d701c65, 0xde576480262399bc
+ data8 0x8c3749916cc6abb5, 0xb0a93a6870649f31
+ data8 0xde943789645933c8, 0x8c5dc4c4f7706032
+ data8 0xb0d9b624d62ec856, 0xded14d58139a28af
+ data8 0x8c7cac3a8c42e3e0, 0xb100a5f53fb3c8e1
+ data8 0xdf025c00bbf2b5c7, 0x8ca373f1b7bf2716
+ data8 0xb131821882f5540a, 0xdf3feb44d723a713
+ data8 0x8cc29907fb951294, 0xb158bf8e4cb04055
+ data8 0xdf715bc16c159be0, 0x8ce9ae4e9492aac8
+ data8 0xb189fd69d56b238f, 0xdfaf66240e29cda8
+ data8 0x8d0911dddbfdad0e, 0xb1b189958e8108e4
+ data8 0xdfe139cbf6e19bdc, 0x8d3075c4f20f04ee
+ data8 0xb1e32a8165b09832, 0xe01fc0fe94d9fc52
+ data8 0x8d5018a9d4de77d5, 0xb20b0678fc271eec
+ data8 0xe051f92ffcc0bd60, 0x8d77cc47dd143515
+ data8 0xb23d0bd3f7592b6e, 0xe090feec9c9a06ac
+ data8 0x8d97af6352739cb7, 0xb26538b2db8420dc
+ data8 0xe0c39d0c9ff862d6, 0x8db7af523167800f
+ data8 0xb28d89e339ceca14, 0xe0f668eeb99f188d
+ data8 0x8ddfd80bc68c32ff, 0xb2c022ca12e55a16
+ data8 0xe1362890eb663139, 0x8e00197e1e7c88fe
+ data8 0xb2e8c6852c6b03f1, 0xe1695c7212aecbaa
+ data8 0x8e207859f77e20e7, 0xb3118f4eda9fe40f
+ data8 0xe19cbf0391bbbbe9, 0x8e40f4ce60c9f8e2
+ data8 0xb33a7d6268109ebe, 0xe1d050901c531e85
+ data8 0x8e69ba46cf2fde4d, 0xb36ddbc5ea70ec55
+ data8 0xe2110903b4f4047a, 0x8e8a7a00bd7ae63e
+ data8 0xb3971e9b39264023, 0xe2450559b4d80b6d
+ data8 0x8eab57ef1cf2f529, 0xb3c0877ecc18e24a
+ data8 0xe27931a231554ef3, 0x8ecc5442cffb1dad
+ data8 0xb3ea16ae3a6c905f, 0xe2ad8e2ac3c5b04b
+ data8 0x8eed6f2d2a4acbfe, 0xb413cc67aa0e4d2d
+ data8 0xe2e21b41b9694cce, 0x8f0ea8dff24441ff
+ data8 0xb43da8e9d163e1af, 0xe316d93615862714
+ data8 0x8f385c95d696b817, 0xb47233773b84d425
+ data8 0xe3590bd86a0d30f9, 0x8f59dc43edd930f3
+ data8 0xb49c6825430fe730, 0xe38e38e38e38e38e
+ data8 0x8f7b7b5f5ffad1c4, 0xb4c6c46bcdb27dcf
+ data8 0xe3c397d1e6db7839, 0x8f9d3a1bea165f38
+ data8 0xb4f1488c0b35d26f, 0xe3f928f5953feb9e
+ data8 0x8fbf18adc34b66da, 0xb51bf4c7c51f0168
+ data8 0xe42eeca17c62886c, 0x8fe117499e356095
+ data8 0xb546c9616087ab9c, 0xe464e32943446305
+ data8 0x90033624aa685f8d, 0xb571c69bdffd9a70
+ data8 0xe49b0ce15747a8a2, 0x9025757495f36b86
+ data8 0xb59cecbae56984c3, 0xe4d16a1eee94e9d4
+ data8 0x903f3a5dcc091203, 0xb5bd64512bb14bb7
+ data8 0xe4fa52107353f67d, 0x9061b2fceb2bdbab
+ data8 0xb5e8d2a4bf5ba416, 0xe5310a471f4d2dc3
+ data8 0x90844ca7211032a7, 0xb6146a9a1bc47819
+ data8 0xe567f6f1c2b9c224, 0x90a7079403e6a15d
+ data8 0xb6402c7749d621c0, 0xe59f18689a9e4c9a
+ data8 0x90c9e3fbafd63799, 0xb66c1882fb435ea2
+ data8 0xe5d66f04b8a68ecf, 0x90ece216c8a16ee4
+ data8 0xb6982f048c999a56, 0xe60dfb2005c192e9
+ data8 0x9110021e7b516f0a, 0xb6c47044075b4142
+ data8 0xe645bd1544c7ea51, 0x912a708a39be9075
+ data8 0xb6e5bd6bfd02bafd, 0xe66fb21b505b20a0
+ data8 0x914dcc7b31146370, 0xb7124a2736ff8ef2
+ data8 0xe6a7d32af4a7c59a, 0x91714af8cfe984d5
+ data8 0xb73f026a01e94177, 0xe6e02b129c6a5ae4
+ data8 0x918c00a6f3795e97, 0xb760a959f1d0a7a7
+ data8 0xe70a9136a7403039, 0x91afbc299ed0295d
+ data8 0xb78dae7e06868ab0, 0xe74349fb2d92a589
+ data8 0x91d39add3e958db0, 0xb7badff8ad9e4e02
+ data8 0xe77c3a9c86ed7d42, 0x91ee9920a8974d92
+ data8 0xb7dce25b8e17ae9f, 0xe7a713f88151518a
+ data8 0x9212b5fcac537c19, 0xb80a6226904045e2
+ data8 0xe7e067453317ed2b, 0x9236f6b256923fcf
+ data8 0xb8380f1cafd73c1c, 0xe819f37a81871bb5
+ data8 0x92523ee6f90dcfc3, 0xb85a6ea8e321b4d8
+ data8 0xe8454236bfaeca14, 0x9276bef031e6eb79
+ data8 0xb8886b684ae7d2fa, 0xe87f32f24c3fc90e
+ data8 0x929236ec237a24ad, 0xb8ab0726fa00cf5d
+ data8 0xe8aacd8688892ba6, 0x92b6f70b7efe9dc3
+ data8 0xb8d954a4d13b7cb1, 0xe8e523fd32f606f7
+ data8 0x92d29f61eec7dc2b, 0xb8fc2d4f6cd9f04a
+ data8 0xe9110b5311407927, 0x92f7a05d5b8ba92f
+ data8 0xb92acc851476b1ab, 0xe94bc8bf0c108fa3
+ data8 0x931379a403be5c16, 0xb94de2d841a184c2
+ data8 0xe977fdc439c2ca3c, 0x9338bc44de2e3f34
+ data8 0xb97cd4c36c92693c, 0xe9b3236528fc349e
+ data8 0x9354c71412c69486, 0xb9a0297f172665e3
+ data8 0xe9dfa70b745ac1b4, 0x937a4c273907e262
+ data8 0xb9cf6f21e36c3924, 0xea1b36268d0eaa38
+ data8 0x93968919f6e7975d, 0xb9f3030951267208
+ data8 0xea480963fd394197, 0x93bc516fdd4680c9
+ data8 0xba229d6a618e7c59, 0xea84034425f27484
+ data8 0x93d8c123d9be59b2, 0xba467144459f9855
+ data8 0xeab12713138dd1cc, 0x93f546c955e60076
+ data8 0xba6a60c3c48f1a4b, 0xeade6db73a5e503b
+ data8 0x941b70a65879079f, 0xba9a76056b67ee7a
+ data8 0xeb1b0268343b121b, 0x943829f337410591
+ data8 0xbabea699563ada6e, 0xeb489b0b2bdb5f14
+ data8 0x9454f995765bc4d2, 0xbae2f350b262cc4b
+ data8 0xeb765721e85f03d0, 0x947b86b57f5842ed
+ data8 0xbb1385a23be24e57, 0xebb389645f222f62
+ data8 0x94988aeb23470f86, 0xbb3814975e17c680
+ data8 0xebe198f090607e0c, 0x94b5a5dc9695f42a
+ data8 0xbb5cc031009bf467, 0xec0fcc9321024509
+ data8 0x94d2d7a9170d8b42, 0xbb81889680024764
+ data8 0xec3e247da8b82f61, 0x94f9e87dd78bf019
+ data8 0xbbb2c0d8703ae95d, 0xec7c27d21321c9f7
+ data8 0x95175019a503d89e, 0xbbd7cd09ba3c5463
+ data8 0xecaad5278824e453, 0x9534cefa625fcb3a
+ data8 0xbbfcf68c4977718f, 0xecd9a76d097d4e77
+ data8 0x955265405c491a25, 0xbc223d88cfc88eee
+ data8 0xed089ed5dcd99446, 0x9570130c1f9bb857
+ data8 0xbc47a2284fee4ff8, 0xed37bb95add09a1c
+ data8 0x9597ca4119525184, 0xbc79ac0916ed7b8a
+ data8 0xed76c70508f904b6, 0x95b5af6fb5aa4d3c
+ data8 0xbc9f5670d1a13030, 0xeda63bb05e7f93c6
+ data8 0x95d3ac9273aafd7a, 0xbcc51f068cb95c1d
+ data8 0xedd5d661daed2dc4, 0x95f1c1cafdfd3684
+ data8 0xbceb05f4b30a9bc0, 0xee05974eef86b903
+ data8 0x960fef3b430b8d5f, 0xbd110b6604c7d306
+ data8 0xee357ead791fc670, 0x962e350575b409c5
+ data8 0xbd372f8598620f19, 0xee658cb3c134a463
+ data8 0x964c934c0dfc1708, 0xbd5d727edb6b3c7e
+ data8 0xee95c1987f080211, 0x966b0a31c9c6bc7d
+ data8 0xbd83d47d937bbc6d, 0xeec61d92d8c4314f
+ data8 0x968999d9ad8d264e, 0xbdaa55addf1ae47d
+ data8 0xeef6a0da64a014ac, 0x96a8426705198795
+ data8 0xbdd0f63c36aa73f0, 0xef274ba72a07c811
+ data8 0x96c703fd64445ee5, 0xbdf7b6556d550a15
+ data8 0xef581e31a2c91260, 0x96e5dec0a7b4268d
+ data8 0xbe1e9626b1ffa96b, 0xef8918b2bc43aec6
+ data8 0x9704d2d4f59f79f3, 0xbe4595dd903e5371
+ data8 0xefba3b63d89d7cbf, 0x9723e05ebe91b9b0
+ data8 0xbe6cb5a7f14bc935, 0xefeb867ecffaa607
+ data8 0x97430782be323831, 0xbe93f5b41d047cf7
+ data8 0xf01cfa3df1b9c9fa, 0x97624865fc0df8bf
+ data8 0xbebb5630bae4c15f, 0xf04e96dc05b43e2d
+ data8 0x9781a32dcc640b2a, 0xbee2d74cd30a430c
+ data8 0xf0805c944d827454, 0x97a117ffd0f48e46
+ data8 0xbf0a7937cf38d981, 0xf0b24ba285c495cb
+ data8 0x97c0a701f9d263c9, 0xbf323c217be2bc8c
+ data8 0xf0e46442e76f6569, 0x97e0505a8637a036
+ data8 0xbf5a203a09342bbb, 0xf116a6b2291d7896
+ data8 0x97f57a9fb0b08c6e, 0xbf74cad1c14ebfc4
+ data8 0xf1383fa9e9b5b381, 0x9815503365914a9d
+ data8 0xbf9ce6a497a89f78, 0xf16ac84f90083b9b
+ data8 0x98354085054fd204, 0xbfc52428bec6e72f
+ data8 0xf19d7b686dcb03d7, 0x98554bbbf8a77902
+ data8 0xbfed838fddab024b, 0xf1d0593311db1757
+ data8 0x987571fffb7f94f6, 0xc016050c0420981a
+ data8 0xf20361ee8f1c711e, 0x9895b3791dd03c23
+ data8 0xc03ea8cfabddc330, 0xf23695da7de51d3f
+ data8 0x98ab43a5fc65d0c8, 0xc059d3cbd65ddbce
+ data8 0xf258d095e465cc35, 0x98cbb2d196bd713d
+ data8 0xc082b122a3c78c9d, 0xf28c4d0bfc982b34
+ data8 0x98ec3d9ec7b6f21a, 0xc0abb1499ae736c4
+ data8 0xf2bff55eb3f0ea71, 0x990ce436db5e8344
+ data8 0xc0d4d474c3aedaaf, 0xf2f3c9cf9884636e
+ data8 0x9922b8218160967a, 0xc0f054ca33eb3437
+ data8 0xf31670135ab9cc0f, 0x99438d686f75779d
+ data8 0xc119b2c67e600ed0, 0xf34a8e9f0b54cdfb
+ data8 0x99647eea131fa20b, 0xc1433453de2033ff
+ data8 0xf37ed9fa6b8add3f, 0x997a85045a47c6d0
+ data8 0xc15ef3e44e10032d, 0xf3a1cfe884ef6bb6
+ data8 0x999ba5f14f8add02, 0xc188b130431d80e6
+ data8 0xf3d66689dcc8e8d3, 0x99bce38b5465ecae
+ data8 0xc1b2929d6067730e, 0xf40b2ab069d5c96a
+ data8 0x99d31ca0887f30f9, 0xc1ce9268f31cc734
+ data8 0xf42e718b90c8bc16, 0x99f48a669c74c09e
+ data8 0xc1f8b0877c1b0c08, 0xf463822a0a3b4b00
+ data8 0x9a16154eb445c873, 0xc222f35a87b415ba
+ data8 0xf498c1076015faf8, 0x9a2c822ec198d667
+ data8 0xc23f3467349e5c88, 0xf4bc5a19a33990b5
+ data8 0x9a4e3e080cd91b78, 0xc269b4e40e088c01
+ data8 0xf4f1e6a7d6f5425f, 0x9a70177afe52322e
+ data8 0xc2945aac24daaf6e, 0xf527a232cf6be334
+ data8 0x9a86b8fa94eebe10, 0xc2b0de05e43c1d66
+ data8 0xf54b8ecdcda90851, 0x9aa8c42866ae2958
+ data8 0xc2dbc275e1229d09, 0xf5819949c7ad87b4
+ data8 0x9abf86f9e12fc45e, 0xc2f86fca9d80eeff
+ data8 0xf5a5bac9213b48a9, 0x9ae1c462fc05f49d
+ data8 0xc323938449a2587e, 0xf5dc1501f324a812
+ data8 0x9af8a8dc936b84d0, 0xc3406b40a538ed20
+ data8 0xf6006bee86b5589e, 0x9b1b19033be35730
+ data8 0xc36bcee8211d15e0, 0xf63716b2fa067fa4
+ data8 0x9b3da7daf04c2892, 0xc397593adf2ba366
+ data8 0xf66df22fb6132b9c, 0x9b54c2e4c8a9012b
+ data8 0xc3b475b6206155d5, 0xf6929fb98225deb1
+ data8 0x9b77854e6c661200, 0xc3e0410243b97383
+ data8 0xf6c9cd13021e3fea, 0x9b8ec2e678d56d2f
+ data8 0xc3fd890709833d37, 0xf6eeb177472cedae
+ data8 0x9ba60e6a5ca133b6, 0xc41ae295f7e7fa06
+ data8 0xf713abf4cb0b3afb, 0x9bc919ea66a151a4
+ data8 0xc44709f7bb8a4dd2, 0xf74b4d5333684ef1
+ data8 0x9be0887c09ef82bb, 0xc4648fb0e0bec4c1
+ data8 0xf7707f75a72f8e94, 0x9c03c8d5fffc3503
+ data8 0xc490f9a94695ba14, 0xf7a874b97927af44
+ data8 0x9c1b5ad21a81cbb9, 0xc4aeac0173b7d390
+ data8 0xf7cddf140aedf1d8, 0x9c3ed09216e9ca02
+ data8 0xc4db5941007aa853, 0xf806291bacb7f7a9
+ data8 0x9c568656c0423def, 0xc4f938aec206291a
+ data8 0xf82bcc43b92eafef, 0x9c7a320af242ce60
+ data8 0xc52629e899dfd622, 0xf8646bf0defb759e
+ data8 0x9c920bf7a8c01dc2, 0xc54436e44043b965
+ data8 0xf88a487dfc3ff5f7, 0x9ca9f475d98b159c
+ data8 0xc562563abf9ea07f, 0xf8b03c2b46cdc17f
+ data8 0x9ccdeca60e80b5f8, 0xc58fa7d1dc42921c
+ data8 0xf8e95541c152ae7a, 0x9ce5f9d4653d4902
+ data8 0xc5adf561b91e110a, 0xf90f832c2700c160
+ data8 0x9cfe15cb38bfdd8e, 0xc5cc5591bdbd82fa
+ data8 0xf935c88e0c7f419b, 0x9d225b983f6c1f96
+ data8 0xc5fa08f1ff20593c, 0xf96f5cd84fd86873
+ data8 0x9d3a9cca32261ed7, 0xc618980a79ce6862
+ data8 0xf995dd53ebdd9d6d, 0x9d52ecfccebe1768
+ data8 0xc6373a09e34b50fa, 0xf9bc75a034436a41
+ data8 0x9d77818d95b82f86, 0xc66550a6e0baaf35
+ data8 0xf9f686f26d5518de, 0x9d8ff7893fa4706c
+ data8 0xc6842241926342c9, 0xfa1d5b39b910a8c5
+ data8 0x9da87cbef36f2a5e, 0xc6a3070b7c93bb9e
+ data8 0xfa4447acc4ecbfd2, 0x9dcd6140b4a35aeb
+ data8 0xc6d18260bb84081b, 0xfa7ed7e51e6fdfb4
+ data8 0x9de60cd06dc6e2d4, 0xc6f0977c9416828b
+ data8 0xfaa601394d49a1a0, 0x9dfec7d4cc43b76f
+ data8 0xc70fc0117c641630, 0xfacd431644ce0e40
+ data8 0x9e17925ec9fccc4a, 0xc72efc34d7e615be
+ data8 0xfaf49d96f7a75909, 0x9e3cdf6db57dc075
+ data8 0xc75dfb441594141e, 0xfb2fd3c65e562fd5
+ data8 0x9e55d110b63637a8, 0xc77d68aa019bda4c
+ data8 0xfb576c5762024805, 0x9e6ed27594550d2e
+ data8 0xc79ce9ea478dbc4f, 0xfb7f1debc22c4040
+ data8 0x9e87e3adc385d393, 0xc7bc7f1ae453219d
+ data8 0xfba6e89f32d0190a, 0x9ead9b54b37a1055
+ data8 0xc7ec0476e15e141a, 0xfbe2c803a0894893
+ data8 0x9ec6d46a3d7de215, 0xc80bcbe16f1d540f
+ data8 0xfc0ad1ff0ed9ecf0, 0x9ee01d9108be3154
+ data8 0xc82ba78a5d349735, 0xfc32f57bdfbcbe7f
+ data8 0x9ef976db07288d04, 0xc84b978847a06b87
+ data8 0xfc5b32968f99b21c, 0x9f12e05a4759ec25
+ data8 0xc86b9bf1ee817bc6, 0xfc83896bc861ab08
+ data8 0x9f2c5a20f4da6668, 0xc88bb4de3667cdf4
+ data8 0xfcabfa1861ed4815, 0x9f52af78ed1733ca
+ data8 0xc8bc00e7fe9e23a3, 0xfce8d3cea7d3163e
+ data8 0x9f6c52426a39d003, 0xc8dc4d7ff2d25232
+ data8 0xfd118595143ee273, 0x9f860593d42fd7f3
+ data8 0xc8fcaeebcb40eb47, 0xfd3a519943d4865a
+ data8 0x9f9fc97fdb96bd51, 0xc91d25431426a663
+ data8 0xfd6337f8e1ae5a4b, 0x9fb99e194f4a7037
+ data8 0xc93db09d7fdb2949, 0xfd8c38d1c8e927eb
+ data8 0x9fd383731ca51db9, 0xc95e5112e721582a
+ data8 0xfdb5544205095a53, 0x9fed79a04fbf9423
+ data8 0xc97f06bb49787677, 0xfdde8a67d2613531
+ data8 0xa00780b413b24ee8, 0xc99fd1aecd6e1b06
+ data8 0xfe07db619e781611, 0xa02eab2c4474b0cd
+ data8 0xc9d12a3e27bb1625, 0xfe460768d80bf758
+ data8 0xa048dcd51ccfd142, 0xc9f22ad82ba3d5f0
+ data8 0xfe6f9bfb06cd32f6, 0xa0631fa894b11b8d
+ data8 0xca134113105e67b2, 0xfe994bcd3d14fcc2
+ data8 0xa07d73ba65e680af, 0xca346d07b045a876
+ data8 0xfec316fecaf3f2ab, 0xa097d91e6aaf71b0
+ data8 0xca55aecf0e94bb88, 0xfeecfdaf33fadb80
+ data8 0xa0b24fe89e02602f, 0xca77068257be9bab
+ data8 0xff16fffe2fa8fad6, 0xa0ccd82d1bd2f68b
+ data8 0xca98743ae1c693a8, 0xff411e0ba9db886d
+ data8 0xa0e77200215909e6, 0xcab9f8122c99a101
+ data8 0xff6b57f7c33e4e9a, 0xa1021d760d584855
+ data8 0xcadb9221e268c3b5, 0xff95ade2d1bd7358
+ data8 0xa11cdaa36068a57d, 0xcafd4283d8043dfd
+ data8 0xffc01fed60f86fb5, 0xa137a99cbd3f880b
+ data8 0xcb1f09520d37c6fb, 0xffeaae3832b63956
+LOCAL_OBJECT_END(T_table)
+
+
+
+
+
+
.section .text
-.proc cbrtf#
-.align 32
-cbrtf:
+GLOBAL_LIBM_ENTRY(cbrtf)
-{ .mfi
- getf.sig r28=f8
- // will continue only for normal/denormal numbers
-(p0) fclass.nm.unc p12,p7 = f8, 0x1b
- // r2 = pointer to C_1,C_2 followed by T_table
- addl r2 = @ltoff(poly_coeffs), gp
+{.mfi
+ getf.sig GR_SIGNIF = f8
+ // will continue only for normal/denormal numbers
+ fclass.nm.unc p12, p7 = f8, 0x1b
+ // (next bundle) GR_GP = address of the slot holding the pointer to C_1, C_2 followed by T_table
+ nop.i 0
}
{.mfi
- // r29=bias-((2^8-1)/3) -63=0xffff-0x55-0x3f=0xff6b
- mov r29=0xff6b
- // normalize a
- fma.s1 f14=f8,f1,f0
- nop.i 0;;
+ addl GR_GP = @ltoff(poly_coeffs), gp
+ // normalize a
+ fma.s1 FR_ARG = f8, f1, f0
+ // GR_CT3 = bias-((2^8-1)/3) -63 = 0xffff-0x55-0x3f = 0xff6b
+ mov GR_CT3 = 0xff6b ;;
}
-{.mib
- nop.m 0
- (p7) cmp.eq p12,p0=r28,r0
- nop.b 0;;
+
+{.mmi
+ // get exponent
+ getf.exp GR_ARGEXP = f8
+ // load start address for C_1, C_2 followed by T_table
+ ld8 GR_ADDR = [ GR_GP ]
+ nop.i 0 ;;
}
-{.mfb
- // load start address for C_1,C_2 followed by T_table
- ld8 r2=[r2]
- (p12) fma.s.s0 f8=f8,f1,f0
- (p12) br.ret.spnt b0;;
+
+{.mlx
+ // check if input significand is 0
+ (p7) cmp.eq p12, p7 = GR_SIGNIF, r0
+ // GR_2P63 = 2^63
+ movl GR_2P63 = 0x8000000000000000 ;;
+}
+
+{.mfi
+ nop.m 0
+ // y = frcpa(a)
+ // p7 = 1 for normal and denormal (but non-zero) arguments
+ (p7) frcpa.s0 FR_Y, p0 = f1, f8
+ // p9 = 1 if denormal input
+ cmp.gtu p9, p0 = GR_2P63, GR_SIGNIF
}
-{.mmf
- // load C_1
- ldfe f7=[r2],16
- nop.m 0
- // y=frcpa(a)
- frcpa.s0 f8,p6=f1,f8;;
+{.mfb
+ // load C_1
+ ldfe FR_COEFF1 = [ GR_ADDR ], 16
+ // if argument is 0, +/-Infinity, or NaN, return
+ (p12) fma.s.s0 f8 = f8, f1, f0
+ (p12) br.ret.spnt b0 ;;
}
+
{.mmi
- // load C_2
- ldfe f9=[r2],16
- // r28=bias-(2^8-1)
- mov r28=0xff00
- nop.i 0;;
+ // get normalized significand (for denormal inputs only)
+ (p9) getf.sig GR_SIGNIF = FR_ARG
+ // load C_2
+ ldfe FR_COEFF2 = [ GR_ADDR ], 16
+ // GR_CT2 = bias-(2^8-1)
+ mov GR_CT2 = 0xff00
}
-{.mmi
- // get normalized significand
- getf.sig r23=f14
- // get exponent
- getf.exp r24=f14
- mov r25=0x20000;;
+
+{.mii
+ // get exponent (for denormal inputs only)
+ (p9) getf.exp GR_ARGEXP = FR_ARG
+ nop.i 0
+ mov GR_CONST = 0x20000 ;;
}
+
+
{.mii
- // get r26=sign
- and r26=r24,r25
- // eliminate leading 1 from r23=1st table index
- shl r23=r23,1
- // eliminate sign from exponent (r25)
- andcm r25=r24,r25;;
+ // get GR_SIGN = sign
+ and GR_SIGN = GR_ARGEXP, GR_CONST
+ // eliminate leading 1 from GR_I1 = 1st table index
+ shl GR_I1 = GR_SIGNIF, 1
+ // eliminate sign from exponent
+ andcm GR_EBIAS = GR_ARGEXP, GR_CONST ;;
}
+
+
{.mfi
- // subtract bias from r25=exponent
- sub r25=r25,r28
- // r=1-a*y
- (p6) fnma.s1 f6=f8,f14,f1
- // r23=1st table index (y_index8 bits)
- shr.u r23=r23,56;;
+ // subtract bias from GR_EXP = exponent
+ sub GR_EXP = GR_EBIAS, GR_CT2
+ // r = 1-a*y
+ fnma.s1 FR_R = FR_Y, FR_ARG, f1
+ // GR_IT1 = 1st table index (y_index8 bits)
+ shr.u GR_IT1 = GR_I1, 56 ;;
}
+
+
{.mii
- // 1: exponent*=5; // (2^{16}-1)/3=0x5555
- shladd r24=r25,2,r25
- // r23=3*y_index
- shladd r23=r23,1,r23;;
- // r30=(5*expon)*16+5*expon=(0x55)*expon
- shladd r30=r24,4,r24;;
+ // step 1: exponent *= 5 // (2^{16}-1)/3 = 0x5555
+ shladd GR_E5 = GR_EXP, 2, GR_EXP
+ // GR_IT1_3 = 3*y_index
+ shladd GR_IT1_3 = GR_IT1, 1, GR_IT1
+ nop.i 0 ;;
}
+
+
+{.mmi
+ // GR_TMP5 = (5*expon)*16+5*expon = (0x55)*expon
+ shladd GR_TMP5 = GR_E5, 4, GR_E5
+ // adjust T_table pointer by 1st index
+ shladd GR_TP1 = GR_IT1_3, 3, GR_ADDR
+ nop.i 0 ;;
+}
+
+
{.mmi
- // adjust T_table pointer by 1st index
- shladd r2=r23,3,r2;;
- // f10=T[0][y]
- (p6) ldf8 f10=[r2],8
- // r24=(0x5500)*expon
- shl r24=r30,8;;
+ // FR_T0 = T [ 0 ] [ y ]
+ ldf8 FR_T0 = [ GR_TP1 ], 8
+ // biased exponent of 2^{63} (0xffff is the bias; compensates the -63 folded into GR_CT3)
+ mov GR_TMP63 = 0xffff + 63
+ // GR_TMP = (0x5500)*expon
+ shl GR_TMP = GR_TMP5, 8 ;;
}
+
+
{.mfi
- // f11=T[1][y]
- (p6) ldf8 f11=[r2],8
- // P_1=C_1+C_2*r
- (p6) fma.s1 f7=f9,f6,f7
- // r24=(0x5555)*expon
- add r24=r24,r30;;
+ // FR_T1 = T [ 1 ] [ y ]
+ ldf8 FR_T1 = [ GR_TP1 ], 8
+ // P_1 = C_1+C_2*r
+ fma.s1 FR_COEFF1 = FR_COEFF2, FR_R, FR_COEFF1
+ // GR_TMP2 = (0x5555)*expon
+ add GR_TMP2 = GR_TMP, GR_TMP5 ;;
}
+
+
{.mmi
- // r24=(0x5556)*expon // 0x5556=(2^{16}+2)/3
- add r24=r24,r25;;
- // f8=T[2][y]
- (p6) ldf8 f8=[r2]
- // r24=floor(expon/3)
- shr r24=r24,16;;
+ // GR_TMP3 = (0x5556)*expon // 0x5556 = (2^{16}+2)/3
+ add GR_TMP3 = GR_TMP2, GR_EXP ;;
+ // FR_T2 = T [ 2 ] [ y ]
+ ldf8 FR_T2 = [ GR_TP1 ]
+ // GR_EXP3 = floor(expon/3)
+ shr GR_EXP3 = GR_TMP3, 16 ;;
}
+
+
{.mmi
- nop.m 0
- // r28=3*exponent
- shladd r28=r24,1,r24
- // bias exponent
- add r24=r29,r24;;
+ setf.exp FR_2M63 = GR_TMP63
+ // GR_TMP4 = 3*exponent
+ shladd GR_TMP4 = GR_EXP3, 1, GR_EXP3
+ // bias exponent
+ add GR_EBIAS3 = GR_CT3, GR_EXP3 ;;
+}
+
+
+{.mmf
+ // get remainder of exponent/3
+ sub GR_REM = GR_EXP, GR_TMP4
+ // add sign to exponent
+ or GR_SEXP = GR_EBIAS3, GR_SIGN
+ // P_2 = -r*P_1
+ fnma.s1 FR_R = FR_COEFF1, FR_R, f0 ;;
}
+
+
+
{.mmi
- // get remainder of exponent/3
- sub r25=r25,r28
- // add sign to exponent
- or r24=r24,r26
- nop.i 0;;
-}
-{.mfi
- nop.m 0
- // P_2=-r*P_1
- (p6) fnma.s1 f6=f7,f6,f0
- // remainder=0 ?
- (p6) cmp.eq.unc p7,p8=r0,r25;;
+ // FR_ARG = sign*2^{exponent/3}
+ setf.exp FR_ARG = GR_SEXP
+ nop.m 0
+ // remainder = 0 ?
+ // p7=1 if input exponent is 3*j (remainder is 0)
+ cmp.eq.unc p7, p8 = r0, GR_REM ;;
}
+
+
{.mfi
- // f14=sign*2^{exponent/3}
- (p6) setf.exp f14=r24
- nop.f 0
- // remainder = 1 ?
- (p8) cmp.eq.unc p8,p12=1,r25;;
+ // remainder = 1 ?
+ // p8=1 if input exponent is 3*j+1 (remainder is 1)
+ // p12=1 if input exponent is 3*j+2 (remainder is 2)
+ (p8) cmp.eq.unc p8, p12 = 1, GR_REM
+ // p7=1 -> remainder = 0 -> use T = FR_T0
+ (p7) fma.s1 f8 = FR_T0, FR_R, FR_T0
+ // argument is of the form 2^(3*k) ?
+ // ( GR_I1 holds significand bits, without the leading 1)
+ or GR_I1 = GR_I1, GR_REM ;;
}
-.pred.rel "mutex",p7,p8
+
+
+.pred.rel "mutex", p12, p8
{.mfi
- nop.m 0
- // remainder=0 -> use T=f10
- (p7) fma.s1 f8=f10,f6,f10
- nop.i 0
+ nop.m 0
+ // p8=1 -> remainder = 1 -> use FR_T1
+ (p8) fma.s1 f8 = FR_T1, FR_R, FR_T1
+ // argument is of the form 2^(3*k) ?
+ cmp.eq p14, p7 = GR_I1, r0
}
+
+
{.mfi
- nop.m 0
- // remainder =1 -> use f11
- (p8) fma.s1 f8=f11,f6,f11
- nop.i 0;;
+ nop.m 0
+ // p12=1 -> remainder=2 -> result = T+T*P_2
+ (p12) fma.s1 f8 = FR_T2, FR_R, FR_T2
+ nop.i 0 ;;
}
+
+
+.pred.rel "mutex", p14, p7
{.mfi
- nop.m 0
- // result=T+T*P_2
- (p12) fma.s.s0 f8=f8,f6,f8
- nop.i 0;;
+ nop.m 0
+ // if argument is sgn*2^{3*(expon/3)}
+ (p14) fma.s.s0 f8 = FR_2M63, FR_ARG, f0
+ nop.i 0
}
{.mfb
- nop.m 0
- // T*=sgn*2^{expon/3}
- (p6) fma.s.s0 f8=f8,f14,f0
- br.ret.sptk b0;;
+ nop.m 0
+ // T *= sgn*2^{expon/3}
+ (p7) fma.s.s0 f8 = f8, FR_ARG, f0
+ br.ret.sptk b0 ;;
}
-.endp cbrtf
-ASM_SIZE_DIRECTIVE(cbrtf)
+
+
+GLOBAL_LIBM_END(cbrtf)
+
+
+
diff --git a/sysdeps/ia64/fpu/s_cbrtl.S b/sysdeps/ia64/fpu/s_cbrtl.S
index d4bbf8fdbf..3e621e2c12 100644
--- a/sysdeps/ia64/fpu/s_cbrtl.S
+++ b/sysdeps/ia64/fpu/s_cbrtl.S
@@ -1,11 +1,10 @@
-.file "cbrtl.asm"
+.file "cbrtl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
-// Bob Norin, Shane Story, and Ping Tak Peter Tang
-// of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -21,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -36,11 +35,13 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 4/28/00: Initial version
+// 04/28/00 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/06/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
@@ -95,29 +96,26 @@
// r2-r3, r23-r30
// p6,p7,p12
-#include "libm_support.h"
+
// Data tables
//==============================================================
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+RODATA
.align 16
-poly_coeffs:
-ASM_TYPE_DIRECTIVE(poly_coeffs,@object)
+LOCAL_OBJECT_START(poly_coeffs)
+
data8 0xaaaaaaaaaaaaaab1, 0x00003ffd // C_1
data8 0xe38e38e38e38e3e0, 0x00003ffb // C_2
data8 0x3faf9add3c0be9a6, 0x3fa511e8d2b1f749 // C_3, C_4
data8 0x3f9ee71b2c6ebe99, 0x3f9809180fd0340c // C_5, C_6
-ASM_SIZE_DIRECTIVE(poly_coeffs)
+LOCAL_OBJECT_END(poly_coeffs)
+
+
+LOCAL_OBJECT_START(T_table)
-T_table:
-ASM_TYPE_DIRECTIVE(T_table,@object)
data8 0x80155c748c374836, 0x8040404b0879f7f9
data8 0x806b5dce4b405c10, 0x8096b586974669b1
@@ -503,14 +501,15 @@ data8 0xfec316fecaf3f2ab, 0xfeecfdaf33fadb80
data8 0xff16fffe2fa8fad6, 0xff411e0ba9db886d
data8 0xff6b57f7c33e4e9a, 0xff95ade2d1bd7358
data8 0xffc01fed60f86fb5, 0xffeaae3832b63956
-ASM_SIZE_DIRECTIVE(T_table)
+LOCAL_OBJECT_END(T_table)
-D_table:
-ASM_TYPE_DIRECTIVE(D_table,@object)
+
+LOCAL_OBJECT_START(D_table)
+
data4 0x1e50f488, 0x1ebdc559, 0x1e649ec1, 0x9eed9b2c
data4 0x9e511c44, 0x9ec6d551, 0x9eefe248, 0x9e313854
data4 0x9f54ff18, 0x9d231411, 0x1ee5d63c, 0x9edf6b95
@@ -703,25 +702,16 @@ data4 0x9eafd508, 0x9ef0e9fc, 0x1d1307ac, 0x1eecee20
data4 0x1cf60c6f, 0x9d556216, 0x9eaed175, 0x9ec919f4
data4 0x1ec2c988, 0x1cd82772, 0x9dc99456, 0x1eab0467
data4 0x1e89b36f, 0x1c757944, 0x1eef9abd, 0x9e98664d
-ASM_SIZE_DIRECTIVE(D_table)
-
-
+LOCAL_OBJECT_END(D_table)
-
-.align 32
-.global cbrtl#
-
.section .text
-.proc cbrtl#
-.align 32
-cbrtl:
-
+GLOBAL_LIBM_ENTRY(cbrtl)
{ .mfi
getf.sig r3=f8
// will continue only for normal/denormal numbers
-(p0) fclass.nm.unc p12,p7 = f8, 0x1b
+ fclass.nm.unc p12,p7 = f8, 0x1b
// r2 = pointer to C_1...C_6 followed by T_table
addl r2 = @ltoff(poly_coeffs), gp;;
}
@@ -898,5 +888,5 @@ cbrtl:
(p6) fma.s0 f8=f8,f6,f8
br.ret.sptk b0;;
}
-.endp cbrtl
-ASM_SIZE_DIRECTIVE(cbrtl)
+GLOBAL_LIBM_END(cbrtl)
+
diff --git a/sysdeps/ia64/fpu/s_ceil.S b/sysdeps/ia64/fpu/s_ceil.S
index f7e6d2cfa6..d1d2980618 100644
--- a/sysdeps/ia64/fpu/s_ceil.S
+++ b/sysdeps/ia64/fpu/s_ceil.S
@@ -1,10 +1,10 @@
.file "ceil.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,90 +20,67 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-
-#include "libm_support.h"
-
-.align 32
-.global ceil#
-
-.section .text
-.proc ceil#
-.align 32
-
// History
//==============================================================
-// 2/02/00: Initial version
-// 6/13/00: Improved speed
-// 6/27/00: Eliminated incorrect invalid flag setting
+// 02/02/00 Initial version
+// 06/13/00 Improved speed
+// 06/27/00 Eliminated incorrect invalid flag setting
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/28/03 Improved performance
+//==============================================================
// API
//==============================================================
// double ceil(double x)
+//==============================================================
-// general input registers:
-
-ceil_GR_FFFF = r14
-ceil_GR_signexp = r15
-ceil_GR_exponent = r16
-ceil_GR_expmask = r17
-ceil_GR_bigexp = r18
-
-
-// predicate registers used:
+// general input registers:
+// r14 - r19
-// p6 ==> Input is NaN, infinity, zero
-// p7 ==> Input is denormal
-// p8 ==> Input is <0
-// p9 ==> Input is >=0
-// p10 ==> Input is already an integer (bigger than largest integer)
-// p11 ==> Input is not a large integer
-// p12 ==> Input is a smaller integer
-// p13 ==> Input is not an even integer, so inexact must be set
-// p14 ==> Input is between -1 and 0, so result will be -0 and inexact
+rSignexp = r14
+rExp = r15
+rExpMask = r16
+rBigexp = r17
+rM1 = r18
+rSignexpM1 = r19
+// floating-point registers:
+// f8 - f13
-// floating-point registers used:
+fXInt = f9
+fNormX = f10
+fTmp = f11
+fAdj = f12
+fPreResult = f13
-CEIL_SIGNED_ZERO = f7
-CEIL_NORM_f8 = f9
-CEIL_FFFF = f10
-CEIL_INEXACT = f11
-CEIL_FLOAT_INT_f8 = f12
-CEIL_INT_f8 = f13
-CEIL_adj = f14
-CEIL_MINUS_ONE = f15
+// predicate registers used:
+// p6 - p10
// Overview of operation
//==============================================================
-
// double ceil(double x)
-// Return an integer value (represented as a double) that is the smallest
+// Return an integer value (represented as a double) that is the smallest
// value not less than x
// This is x rounded toward +infinity to an integral value.
// Inexact is set if x != ceil(x)
-// **************************************************************************
-
-// Set denormal flag for denormal input and
-// and take denormal fault if necessary.
-
-// Is the input an integer value already?
+//==============================================================
// double_extended
// if the exponent is > 1003e => 3F(true) = 63(decimal)
@@ -124,139 +101,124 @@ CEIL_MINUS_ONE = f15
// If we multiply by 2^23, we no longer have a fractional part
// So input is an integer value already.
-// If x is NAN, ZERO, or INFINITY, then return
-
-// qnan snan inf norm unorm 0 -+
-// 1 1 1 0 0 1 11 0xe7
-
-ceil:
+.section .text
+GLOBAL_LIBM_ENTRY(ceil)
{ .mfi
- getf.exp ceil_GR_signexp = f8
- fcvt.fx.trunc.s1 CEIL_INT_f8 = f8
- addl ceil_GR_bigexp = 0x10033, r0
+ getf.exp rSignexp = f8 // Get signexp, recompute if unorm
+ fclass.m p7,p0 = f8, 0x0b // Test x unorm
+ addl rBigexp = 0x10033, r0 // Set exponent at which x is an integer
}
{ .mfi
- addl ceil_GR_FFFF = -1,r0
- fcmp.lt.s1 p8,p9 = f8,f0
- mov ceil_GR_expmask = 0x1FFFF ;;
+ mov rM1 = -1 // Set all ones
+ fcvt.fx.trunc.s1 fXInt = f8 // Convert to int in significand
+ mov rExpMask = 0x1FFFF // Form exponent mask
}
+;;
-// p7 ==> denorm
{ .mfi
- setf.sig CEIL_FFFF = ceil_GR_FFFF
- fclass.m p7,p0 = f8, 0x0b
- nop.i 999
+ mov rSignexpM1 = 0x2FFFF // Form signexp of -1
+ fcmp.lt.s1 p8,p9 = f8, f0 // Test x < 0
+ nop.i 0
}
-{ .mfi
- nop.m 999
- fnorm CEIL_NORM_f8 = f8
- nop.i 999 ;;
+{ .mfb
+ setf.sig fTmp = rM1 // Make const for setting inexact
+ fnorm.s1 fNormX = f8 // Normalize input
+(p7) br.cond.spnt CEIL_UNORM // Branch if x unorm
}
+;;
-// Form 0 with sign of input in case negative zero is needed
-{ .mfi
- nop.m 999
- fmerge.s CEIL_SIGNED_ZERO = f8, f0
- nop.i 999
-}
+CEIL_COMMON:
+// Return here from CEIL_UNORM
{ .mfi
- nop.m 999
- fsub.s1 CEIL_MINUS_ONE = f0, f1
- nop.i 999 ;;
-}
-
-// p6 ==> NAN, INF, ZERO
-{ .mfb
- nop.m 999
- fclass.m p6,p10 = f8, 0xe7
-(p7) br.cond.spnt L(CEIL_DENORM) ;;
+ nop.m 0
+ fclass.m p6,p0 = f8, 0x1e7 // Test x natval, nan, inf, 0
+ nop.i 0
}
+;;
-L(CEIL_COMMON):
.pred.rel "mutex",p8,p9
-// Set adjustment to add to trunc(x) for result
-// If x>0, adjustment is 1.0
-// If x<=0, adjustment is 0.0
{ .mfi
- and ceil_GR_exponent = ceil_GR_signexp, ceil_GR_expmask
-(p9) fadd.s1 CEIL_adj = f1,f0
- nop.i 999
+ nop.m 0
+(p8) fma.s1 fAdj = f0, f0, f0 // If x < 0, adjustment is 0
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p8) fadd.s1 CEIL_adj = f0,f0
- nop.i 999 ;;
+ nop.m 0
+(p9) fma.s1 fAdj = f1, f1, f0 // If x > 0, adjustment is +1
+ nop.i 0
}
+;;
{ .mfi
-(p10) cmp.ge.unc p10,p11 = ceil_GR_exponent, ceil_GR_bigexp
-(p6) fnorm.d f8 = f8
- nop.i 999 ;;
+ nop.m 0
+ fcvt.xf fPreResult = fXInt // trunc(x)
+ nop.i 0
}
-
-{ .mfi
- nop.m 999
-(p11) fcvt.xf CEIL_FLOAT_INT_f8 = CEIL_INT_f8
- nop.i 999 ;;
+{ .mfb
+ nop.m 0
+(p6) fma.d.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf, 0
+(p6) br.ret.spnt b0 // Exit if x natval, nan, inf, 0
}
+;;
-{ .mfi
- nop.m 999
-(p10) fnorm.d f8 = CEIL_NORM_f8
- nop.i 999 ;;
+{ .mmi
+ and rExp = rSignexp, rExpMask // Get biased exponent
+;;
+ cmp.ge p7,p6 = rExp, rBigexp // Is |x| >= 2^52?
+(p8) cmp.lt.unc p10,p0 = rSignexp, rSignexpM1 // Is -1 < x < 0?
}
+;;
-// Is -1 < x < 0? If so, result will be -0. Special case it with p14 set.
+// If -1 < x < 0, we turn off p6 and compute result as -0
{ .mfi
- nop.m 999
-(p8) fcmp.gt.unc.s1 p14,p0 = CEIL_NORM_f8, CEIL_MINUS_ONE
- nop.i 999 ;;
+(p10) cmp.ne p6,p0 = r0,r0
+(p10) fmerge.s f8 = fNormX, f0
+ nop.i 0
}
+;;
+.pred.rel "mutex",p6,p7
{ .mfi
-(p14) cmp.ne p11,p0 = r0,r0
-(p14) fnorm.d f8 = CEIL_SIGNED_ZERO
- nop.i 999
+ nop.m 0
+(p6) fma.d.s0 f8 = fPreResult, f1, fAdj // Result if !int, |x| < 2^52
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p14) fmpy.s0 CEIL_INEXACT = CEIL_FFFF,CEIL_FFFF
- nop.i 999 ;;
+ nop.m 0
+(p7) fma.d.s0 f8 = fNormX, f1, f0 // Result, if |x| >= 2^52
+(p10) cmp.eq p6,p0 = r0,r0 // If -1 < x < 0, turn on p6 again
}
+;;
{ .mfi
- nop.m 999
-(p11) fadd.d f8 = CEIL_FLOAT_INT_f8,CEIL_adj
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p11) fcmp.eq.unc.s1 p12,p13 = CEIL_FLOAT_INT_f8, CEIL_NORM_f8
- nop.i 999 ;;
+ nop.m 0
+(p6) fcmp.eq.unc.s1 p8, p9 = fPreResult, fNormX // Is trunc(x) = x ?
+ nop.i 0
}
+;;
-// Set inexact if result not equal to input
{ .mfi
- nop.m 999
-(p13) fmpy.s0 CEIL_INEXACT = CEIL_FFFF,CEIL_FFFF
- nop.i 999
+ nop.m 0
+(p9) fmpy.s0 fTmp = fTmp, fTmp // Dummy to set inexact
+ nop.i 0
}
-// Set result to input if integer
{ .mfb
- nop.m 999
-(p12) fnorm.d f8 = CEIL_NORM_f8
- br.ret.sptk b0 ;;
+ nop.m 0
+(p8) fma.d.s0 f8 = fNormX, f1, f0 // If x int, result normalized x
+ br.ret.sptk b0 // Exit main path, 0 < |x| < 2^52
}
+;;
+
-// Here if input denorm
-L(CEIL_DENORM):
+CEIL_UNORM:
+// Here if x unorm
{ .mfb
- getf.exp ceil_GR_signexp = CEIL_NORM_f8
- fcvt.fx.trunc.s1 CEIL_INT_f8 = CEIL_NORM_f8
- br.cond.sptk L(CEIL_COMMON) ;;
+ getf.exp rSignexp = fNormX // Get signexp, recompute if unorm
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag
+ br.cond.sptk CEIL_COMMON // Return to main path
}
+;;
-.endp ceil
-ASM_SIZE_DIRECTIVE(ceil)
+GLOBAL_LIBM_END(ceil)
diff --git a/sysdeps/ia64/fpu/s_ceilf.S b/sysdeps/ia64/fpu/s_ceilf.S
index d1011052e8..051534a202 100644
--- a/sysdeps/ia64/fpu/s_ceilf.S
+++ b/sysdeps/ia64/fpu/s_ceilf.S
@@ -1,10 +1,10 @@
.file "ceilf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,90 +20,67 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-
-#include "libm_support.h"
-
-.align 32
-.global ceilf#
-
-.section .text
-.proc ceilf#
-.align 32
-
// History
//==============================================================
-// 2/02/00: Initial version
-// 6/13/00: Improved speed
-// 6/27/00: Eliminated incorrect invalid flag setting
+// 02/02/00 Initial version
+// 06/13/00 Improved speed
+// 06/27/00 Eliminated incorrect invalid flag setting
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/28/03 Improved performance
+//==============================================================
// API
//==============================================================
// float ceilf(float x)
+//==============================================================
-// general input registers:
-
-ceil_GR_FFFF = r14
-ceil_GR_signexp = r15
-ceil_GR_exponent = r16
-ceil_GR_expmask = r17
-ceil_GR_bigexp = r18
-
-
-// predicate registers used:
+// general input registers:
+// r14 - r19
-// p6 ==> Input is NaN, infinity, zero
-// p7 ==> Input is denormal
-// p8 ==> Input is <0
-// p9 ==> Input is >=0
-// p10 ==> Input is already an integer (bigger than largest integer)
-// p11 ==> Input is not a large integer
-// p12 ==> Input is a smaller integer
-// p13 ==> Input is not an even integer, so inexact must be set
-// p14 ==> Input is between -1 and 0, so result will be -0 and inexact
+rSignexp = r14
+rExp = r15
+rExpMask = r16
+rBigexp = r17
+rM1 = r18
+rSignexpM1 = r19
+// floating-point registers:
+// f8 - f13
-// floating-point registers used:
+fXInt = f9
+fNormX = f10
+fTmp = f11
+fAdj = f12
+fPreResult = f13
-CEIL_SIGNED_ZERO = f7
-CEIL_NORM_f8 = f9
-CEIL_FFFF = f10
-CEIL_INEXACT = f11
-CEIL_FLOAT_INT_f8 = f12
-CEIL_INT_f8 = f13
-CEIL_adj = f14
-CEIL_MINUS_ONE = f15
+// predicate registers used:
+// p6 - p10
// Overview of operation
//==============================================================
-
// float ceilf(float x)
-// Return an integer value (represented as a float) that is the smallest
+// Return an integer value (represented as a float) that is the smallest
// value not less than x
// This is x rounded toward +infinity to an integral value.
// Inexact is set if x != ceilf(x)
-// **************************************************************************
-
-// Set denormal flag for denormal input and
-// and take denormal fault if necessary.
-
-// Is the input an integer value already?
+//==============================================================
// double_extended
// if the exponent is > 1003e => 3F(true) = 63(decimal)
@@ -124,139 +101,124 @@ CEIL_MINUS_ONE = f15
// If we multiply by 2^23, we no longer have a fractional part
// So input is an integer value already.
-// If x is NAN, ZERO, or INFINITY, then return
-
-// qnan snan inf norm unorm 0 -+
-// 1 1 1 0 0 1 11 0xe7
-
-ceilf:
+.section .text
+GLOBAL_LIBM_ENTRY(ceilf)
{ .mfi
- getf.exp ceil_GR_signexp = f8
- fcvt.fx.trunc.s1 CEIL_INT_f8 = f8
- addl ceil_GR_bigexp = 0x10016, r0
+ getf.exp rSignexp = f8 // Get signexp, recompute if unorm
+ fclass.m p7,p0 = f8, 0x0b // Test x unorm
+ addl rBigexp = 0x10016, r0 // Set exponent at which x is an integer
}
{ .mfi
- addl ceil_GR_FFFF = -1,r0
- fcmp.lt.s1 p8,p9 = f8,f0
- mov ceil_GR_expmask = 0x1FFFF ;;
+ mov rM1 = -1 // Set all ones
+ fcvt.fx.trunc.s1 fXInt = f8 // Convert to int in significand
+ mov rExpMask = 0x1FFFF // Form exponent mask
}
+;;
-// p7 ==> denorm
{ .mfi
- setf.sig CEIL_FFFF = ceil_GR_FFFF
- fclass.m p7,p0 = f8, 0x0b
- nop.i 999
+ mov rSignexpM1 = 0x2FFFF // Form signexp of -1
+ fcmp.lt.s1 p8,p9 = f8, f0 // Test x < 0
+ nop.i 0
}
-{ .mfi
- nop.m 999
- fnorm CEIL_NORM_f8 = f8
- nop.i 999 ;;
+{ .mfb
+ setf.sig fTmp = rM1 // Make const for setting inexact
+ fnorm.s1 fNormX = f8 // Normalize input
+(p7) br.cond.spnt CEIL_UNORM // Branch if x unorm
}
+;;
-// Form 0 with sign of input in case negative zero is needed
-{ .mfi
- nop.m 999
- fmerge.s CEIL_SIGNED_ZERO = f8, f0
- nop.i 999
-}
+CEIL_COMMON:
+// Return here from CEIL_UNORM
{ .mfi
- nop.m 999
- fsub.s1 CEIL_MINUS_ONE = f0, f1
- nop.i 999 ;;
-}
-
-// p6 ==> NAN, INF, ZERO
-{ .mfb
- nop.m 999
- fclass.m p6,p10 = f8, 0xe7
-(p7) br.cond.spnt L(CEIL_DENORM) ;;
+ nop.m 0
+ fclass.m p6,p0 = f8, 0x1e7 // Test x natval, nan, inf, 0
+ nop.i 0
}
+;;
-L(CEIL_COMMON):
.pred.rel "mutex",p8,p9
-// Set adjustment to add to trunc(x) for result
-// If x>0, adjustment is 1.0
-// If x<=0, adjustment is 0.0
{ .mfi
- and ceil_GR_exponent = ceil_GR_signexp, ceil_GR_expmask
-(p9) fadd.s1 CEIL_adj = f1,f0
- nop.i 999
+ nop.m 0
+(p8) fma.s1 fAdj = f0, f0, f0 // If x < 0, adjustment is 0
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p8) fadd.s1 CEIL_adj = f0,f0
- nop.i 999 ;;
+ nop.m 0
+(p9) fma.s1 fAdj = f1, f1, f0 // If x > 0, adjustment is +1
+ nop.i 0
}
+;;
{ .mfi
-(p10) cmp.ge.unc p10,p11 = ceil_GR_exponent, ceil_GR_bigexp
-(p6) fnorm.s f8 = f8
- nop.i 999 ;;
+ nop.m 0
+ fcvt.xf fPreResult = fXInt // trunc(x)
+ nop.i 0
}
-
-{ .mfi
- nop.m 999
-(p11) fcvt.xf CEIL_FLOAT_INT_f8 = CEIL_INT_f8
- nop.i 999 ;;
+{ .mfb
+ nop.m 0
+(p6) fma.s.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf, 0
+(p6) br.ret.spnt b0 // Exit if x natval, nan, inf, 0
}
+;;
-{ .mfi
- nop.m 999
-(p10) fnorm.s f8 = CEIL_NORM_f8
- nop.i 999 ;;
+{ .mmi
+ and rExp = rSignexp, rExpMask // Get biased exponent
+;;
+ cmp.ge p7,p6 = rExp, rBigexp // Is |x| >= 2^23?
+(p8) cmp.lt.unc p10,p0 = rSignexp, rSignexpM1 // Is -1 < x < 0?
}
+;;
-// Is -1 < x < 0? If so, result will be -0. Special case it with p14 set.
+// If -1 < x < 0, we turn off p6 and compute result as -0
{ .mfi
- nop.m 999
-(p8) fcmp.gt.unc.s1 p14,p0 = CEIL_NORM_f8, CEIL_MINUS_ONE
- nop.i 999 ;;
+(p10) cmp.ne p6,p0 = r0,r0
+(p10) fmerge.s f8 = fNormX, f0
+ nop.i 0
}
+;;
+.pred.rel "mutex",p6,p7
{ .mfi
-(p14) cmp.ne p11,p0 = r0,r0
-(p14) fnorm.s f8 = CEIL_SIGNED_ZERO
- nop.i 999
+ nop.m 0
+(p6) fma.s.s0 f8 = fPreResult, f1, fAdj // Result if !int, |x| < 2^23
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p14) fmpy.s0 CEIL_INEXACT = CEIL_FFFF,CEIL_FFFF
- nop.i 999 ;;
+ nop.m 0
+(p7) fma.s.s0 f8 = fNormX, f1, f0 // Result, if |x| >= 2^23
+(p10) cmp.eq p6,p0 = r0,r0 // If -1 < x < 0, turn on p6 again
}
+;;
{ .mfi
- nop.m 999
-(p11) fadd.s f8 = CEIL_FLOAT_INT_f8,CEIL_adj
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p11) fcmp.eq.unc.s1 p12,p13 = CEIL_FLOAT_INT_f8, CEIL_NORM_f8
- nop.i 999 ;;
+ nop.m 0
+(p6) fcmp.eq.unc.s1 p8, p9 = fPreResult, fNormX // Is trunc(x) = x ?
+ nop.i 0
}
+;;
-// Set inexact if result not equal to input
{ .mfi
- nop.m 999
-(p13) fmpy.s0 CEIL_INEXACT = CEIL_FFFF,CEIL_FFFF
- nop.i 999
+ nop.m 0
+(p9) fmpy.s0 fTmp = fTmp, fTmp // Dummy to set inexact
+ nop.i 0
}
-// Set result to input if integer
{ .mfb
- nop.m 999
-(p12) fnorm.s f8 = CEIL_NORM_f8
- br.ret.sptk b0 ;;
+ nop.m 0
+(p8) fma.s.s0 f8 = fNormX, f1, f0 // If x int, result normalized x
+ br.ret.sptk b0 // Exit main path, 0 < |x| < 2^23
}
+;;
+
-// Here if input denorm
-L(CEIL_DENORM):
+CEIL_UNORM:
+// Here if x unorm
{ .mfb
- getf.exp ceil_GR_signexp = CEIL_NORM_f8
- fcvt.fx.trunc.s1 CEIL_INT_f8 = CEIL_NORM_f8
- br.cond.sptk L(CEIL_COMMON) ;;
+ getf.exp rSignexp = fNormX // Get signexp, recompute if unorm
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag
+ br.cond.sptk CEIL_COMMON // Return to main path
}
+;;
-.endp ceilf
-ASM_SIZE_DIRECTIVE(ceilf)
+GLOBAL_LIBM_END(ceilf)
diff --git a/sysdeps/ia64/fpu/s_ceill.S b/sysdeps/ia64/fpu/s_ceill.S
index d3d8719584..71cb01d3fa 100644
--- a/sysdeps/ia64/fpu/s_ceill.S
+++ b/sysdeps/ia64/fpu/s_ceill.S
@@ -1,10 +1,10 @@
.file "ceill.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,90 +20,67 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-
-#include "libm_support.h"
-
-.align 32
-.global ceill#
-
-.section .text
-.proc ceill#
-.align 32
-
// History
//==============================================================
-// 2/02/00: Initial version
-// 6/13/00: Improved speed
-// 6/27/00: Eliminated incorrect invalid flag setting
+// 02/02/00 Initial version
+// 06/13/00 Improved speed
+// 06/27/00 Eliminated incorrect invalid flag setting
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/28/03 Improved performance
+//==============================================================
// API
//==============================================================
-// double ceill(double x)
-
-// general input registers:
-
-ceil_GR_FFFF = r14
-ceil_GR_signexp = r15
-ceil_GR_exponent = r16
-ceil_GR_expmask = r17
-ceil_GR_bigexp = r18
-
+// long double ceill(long double x)
+//==============================================================
-// predicate registers used:
+// general input registers:
+// r14 - r19
-// p6 ==> Input is NaN, infinity, zero
-// p7 ==> Input is denormal
-// p8 ==> Input is <0
-// p9 ==> Input is >=0
-// p10 ==> Input is already an integer (bigger than largest integer)
-// p11 ==> Input is not a large integer
-// p12 ==> Input is a smaller integer
-// p13 ==> Input is not an even integer, so inexact must be set
-// p14 ==> Input is between -1 and 0, so result will be -0 and inexact
+rSignexp = r14
+rExp = r15
+rExpMask = r16
+rBigexp = r17
+rM1 = r18
+rSignexpM1 = r19
+// floating-point registers:
+// f8 - f13
-// floating-point registers used:
+fXInt = f9
+fNormX = f10
+fTmp = f11
+fAdj = f12
+fPreResult = f13
-CEIL_SIGNED_ZERO = f7
-CEIL_NORM_f8 = f9
-CEIL_FFFF = f10
-CEIL_INEXACT = f11
-CEIL_FLOAT_INT_f8 = f12
-CEIL_INT_f8 = f13
-CEIL_adj = f14
-CEIL_MINUS_ONE = f15
+// predicate registers used:
+// p6 - p10
// Overview of operation
//==============================================================
-
// long double ceill(long double x)
-// Return an integer value (represented as a long double) that is the smallest
+// Return an integer value (represented as a long double) that is the smallest
// value not less than x
// This is x rounded toward +infinity to an integral value.
// Inexact is set if x != ceill(x)
-// **************************************************************************
-
-// Set denormal flag for denormal input and
-// and take denormal fault if necessary.
-
-// Is the input an integer value already?
+//==============================================================
// double_extended
// if the exponent is > 1003e => 3F(true) = 63(decimal)
@@ -124,139 +101,124 @@ CEIL_MINUS_ONE = f15
// If we multiply by 2^23, we no longer have a fractional part
// So input is an integer value already.
-// If x is NAN, ZERO, or INFINITY, then return
-
-// qnan snan inf norm unorm 0 -+
-// 1 1 1 0 0 1 11 0xe7
-
-ceill:
+.section .text
+GLOBAL_LIBM_ENTRY(ceill)
{ .mfi
- getf.exp ceil_GR_signexp = f8
- fcvt.fx.trunc.s1 CEIL_INT_f8 = f8
- addl ceil_GR_bigexp = 0x1003e, r0
+ getf.exp rSignexp = f8 // Get signexp, recompute if unorm
+ fclass.m p7,p0 = f8, 0x0b // Test x unorm
+ addl rBigexp = 0x1003e, r0 // Set exponent at which x is an integer
}
{ .mfi
- addl ceil_GR_FFFF = -1,r0
- fcmp.lt.s1 p8,p9 = f8,f0
- mov ceil_GR_expmask = 0x1FFFF ;;
+ mov rM1 = -1 // Set all ones
+ fcvt.fx.trunc.s1 fXInt = f8 // Convert to int in significand
+ mov rExpMask = 0x1FFFF // Form exponent mask
}
+;;
-// p7 ==> denorm
{ .mfi
- setf.sig CEIL_FFFF = ceil_GR_FFFF
- fclass.m p7,p0 = f8, 0x0b
- nop.i 999
+ mov rSignexpM1 = 0x2FFFF // Form signexp of -1
+ fcmp.lt.s1 p8,p9 = f8, f0 // Test x < 0
+ nop.i 0
}
-{ .mfi
- nop.m 999
- fnorm CEIL_NORM_f8 = f8
- nop.i 999 ;;
+{ .mfb
+ setf.sig fTmp = rM1 // Make const for setting inexact
+ fnorm.s1 fNormX = f8 // Normalize input
+(p7) br.cond.spnt CEIL_UNORM // Branch if x unorm
}
+;;
-// Form 0 with sign of input in case negative zero is needed
-{ .mfi
- nop.m 999
- fmerge.s CEIL_SIGNED_ZERO = f8, f0
- nop.i 999
-}
+CEIL_COMMON:
+// Return here from CEIL_UNORM
{ .mfi
- nop.m 999
- fsub.s1 CEIL_MINUS_ONE = f0, f1
- nop.i 999 ;;
-}
-
-// p6 ==> NAN, INF, ZERO
-{ .mfb
- nop.m 999
- fclass.m p6,p10 = f8, 0xe7
-(p7) br.cond.spnt L(CEIL_DENORM) ;;
+ nop.m 0
+ fclass.m p6,p0 = f8, 0x1e7 // Test x natval, nan, inf, 0
+ nop.i 0
}
+;;
-L(CEIL_COMMON):
.pred.rel "mutex",p8,p9
-// Set adjustment to add to trunc(x) for result
-// If x>0, adjustment is 1.0
-// If x<=0, adjustment is 0.0
{ .mfi
- and ceil_GR_exponent = ceil_GR_signexp, ceil_GR_expmask
-(p9) fadd.s1 CEIL_adj = f1,f0
- nop.i 999
+ nop.m 0
+(p8) fma.s1 fAdj = f0, f0, f0 // If x < 0, adjustment is 0
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p8) fadd.s1 CEIL_adj = f0,f0
- nop.i 999 ;;
+ nop.m 0
+(p9) fma.s1 fAdj = f1, f1, f0 // If x > 0, adjustment is +1
+ nop.i 0
}
+;;
{ .mfi
-(p10) cmp.ge.unc p10,p11 = ceil_GR_exponent, ceil_GR_bigexp
-(p6) fnorm f8 = f8
- nop.i 999 ;;
+ nop.m 0
+ fcvt.xf fPreResult = fXInt // trunc(x)
+ nop.i 0
}
-
-{ .mfi
- nop.m 999
-(p11) fcvt.xf CEIL_FLOAT_INT_f8 = CEIL_INT_f8
- nop.i 999 ;;
+{ .mfb
+ nop.m 0
+(p6) fma.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf, 0
+(p6) br.ret.spnt b0 // Exit if x natval, nan, inf, 0
}
+;;
-{ .mfi
- nop.m 999
-(p10) fnorm f8 = CEIL_NORM_f8
- nop.i 999 ;;
+{ .mmi
+ and rExp = rSignexp, rExpMask // Get biased exponent
+;;
+ cmp.ge p7,p6 = rExp, rBigexp // Is |x| >= 2^63?
+(p8) cmp.lt.unc p10,p0 = rSignexp, rSignexpM1 // Is -1 < x < 0?
}
+;;
-// Is -1 < x < 0? If so, result will be -0. Special case it with p14 set.
+// If -1 < x < 0, we turn off p6 and compute result as -0
{ .mfi
- nop.m 999
-(p8) fcmp.gt.unc.s1 p14,p0 = CEIL_NORM_f8, CEIL_MINUS_ONE
- nop.i 999 ;;
+(p10) cmp.ne p6,p0 = r0,r0
+(p10) fmerge.s f8 = fNormX, f0
+ nop.i 0
}
+;;
+.pred.rel "mutex",p6,p7
{ .mfi
-(p14) cmp.ne p11,p0 = r0,r0
-(p14) fnorm f8 = CEIL_SIGNED_ZERO
- nop.i 999
+ nop.m 0
+(p6) fma.s0 f8 = fPreResult, f1, fAdj // Result if !int, |x| < 2^63
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p14) fmpy.s0 CEIL_INEXACT = CEIL_FFFF,CEIL_FFFF
- nop.i 999 ;;
+ nop.m 0
+(p7) fma.s0 f8 = fNormX, f1, f0 // Result, if |x| >= 2^63
+(p10) cmp.eq p6,p0 = r0,r0 // If -1 < x < 0, turn on p6 again
}
+;;
{ .mfi
- nop.m 999
-(p11) fadd f8 = CEIL_FLOAT_INT_f8,CEIL_adj
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p11) fcmp.eq.unc.s1 p12,p13 = CEIL_FLOAT_INT_f8, CEIL_NORM_f8
- nop.i 999 ;;
+ nop.m 0
+(p6) fcmp.eq.unc.s1 p8, p9 = fPreResult, fNormX // Is trunc(x) = x ?
+ nop.i 0
}
+;;
-// Set inexact if result not equal to input
{ .mfi
- nop.m 999
-(p13) fmpy.s0 CEIL_INEXACT = CEIL_FFFF,CEIL_FFFF
- nop.i 999
+ nop.m 0
+(p9) fmpy.s0 fTmp = fTmp, fTmp // Dummy to set inexact
+ nop.i 0
}
-// Set result to input if integer
{ .mfb
- nop.m 999
-(p12) fnorm f8 = CEIL_NORM_f8
- br.ret.sptk b0 ;;
+ nop.m 0
+(p8) fma.s0 f8 = fNormX, f1, f0 // If x int, result normalized x
+ br.ret.sptk b0 // Exit main path, 0 < |x| < 2^63
}
+;;
+
-// Here if input denorm
-L(CEIL_DENORM):
+CEIL_UNORM:
+// Here if x unorm
{ .mfb
- getf.exp ceil_GR_signexp = CEIL_NORM_f8
- fcvt.fx.trunc.s1 CEIL_INT_f8 = CEIL_NORM_f8
- br.cond.sptk L(CEIL_COMMON) ;;
+ getf.exp rSignexp = fNormX // Get signexp, recompute if unorm
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag
+ br.cond.sptk CEIL_COMMON // Return to main path
}
+;;
-.endp ceill
-ASM_SIZE_DIRECTIVE(ceill)
+GLOBAL_LIBM_END(ceill)
diff --git a/sysdeps/ia64/fpu/s_copysign.S b/sysdeps/ia64/fpu/s_copysign.S
index e0d08cb721..0903565ff3 100644
--- a/sysdeps/ia64/fpu/s_copysign.S
+++ b/sysdeps/ia64/fpu/s_copysign.S
@@ -23,12 +23,16 @@ ENTRY (__copysign)
{
fmerge.s fret0 = farg1, farg0
br.ret.sptk.many rp
-}
+}
END (__copysign)
strong_alias (__copysign, __copysignf)
strong_alias (__copysign, __copysignl)
+strong_alias (__copysign, __libm_copysign)
+strong_alias (__copysign, __libm_copysignf)
+strong_alias (__copysign, __libm_copysignl)
+
weak_alias (__copysign, copysign)
weak_alias (__copysignf, copysignf)
weak_alias (__copysignl, copysignl)
diff --git a/sysdeps/ia64/fpu/s_cos.S b/sysdeps/ia64/fpu/s_cos.S
index 6540aec724..84c177abab 100644
--- a/sysdeps/ia64/fpu/s_cos.S
+++ b/sysdeps/ia64/fpu/s_cos.S
@@ -1,10 +1,10 @@
.file "sincos.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,17 +35,22 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial revision
-// 4/02/00 Unwind support added.
-// 6/16/00 Updated tables to enforce symmetry
-// 8/31/00 Saved 2 cycles in main path, and 9 in other paths.
-// 9/20/00 The updated tables regressed to an old version, so reinstated them
+// 02/02/00 Initial version
+// 04/02/00 Unwind support added.
+// 06/16/00 Updated tables to enforce symmetry
+// 08/31/00 Saved 2 cycles in main path, and 9 in other paths.
+// 09/20/00 The updated tables regressed to an old version, so reinstated them
// 10/18/00 Changed one table entry to ensure symmetry
-// 1/03/01 Improved speed, fixed flag settings for small arguments.
+// 01/03/01 Improved speed, fixed flag settings for small arguments.
+// 02/18/02 Large arguments processing routine excluded
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 06/03/02 Insure inexact flag set for large arg result
+// 09/05/02 Work range widened by strengthening the reduction (3 parts of Pi/16)
+// 02/10/03 Reordered header: .section, .global, .proc, .align
// API
//==============================================================
@@ -63,9 +68,13 @@
// nfloat = Round result to integer (round-to-nearest)
//
// r = x - nfloat * pi/2^k
-// Do this as (x - nfloat * HIGH(pi/2^k)) - nfloat * LOW(pi/2^k) for increased accuracy.
+// Do this as ((x - nfloat * HIGH(pi/2^k)) -
+// nfloat * LOW(pi/2^k)) -
+// nfloat * LOWEST(pi/2^k) for increased accuracy.
// pi/2^k is stored as two numbers that when added make pi/2^k.
// pi/2^k = HIGH(pi/2^k) + LOW(pi/2^k)
+// HIGH and LOW parts are rounded toward zero,
+// and the LOWEST part is rounded to nearest.
//
// x = (nfloat * pi/2^k) + r
// r is small enough that we can use a polynomial approximation
@@ -121,7 +130,7 @@
//
// as follows
//
-// Sm = Sin(Mpi/2^k) and Cm = Cos(Mpi/2^k)
+// S[m] = Sin(Mpi/2^k) and C[m] = Cos(Mpi/2^k)
// rsq = r*r
//
//
@@ -141,23 +150,22 @@
//
// P = r + rcub * P
//
-// Answer = Sm Cos(r) + Cm P
+// Answer = S[m] Cos(r) + C[m] P
//
// Cos(r) = 1 + rsq Q
// Cos(r) = 1 + r^2 Q
// Cos(r) = 1 + r^2 (q1 + r^2q2 + r^4q3 + r^6q4)
// Cos(r) = 1 + r^2q1 + r^4q2 + r^6q3 + r^8q4 + ...
//
-// Sm Cos(r) = Sm(1 + rsq Q)
-// Sm Cos(r) = Sm + Sm rsq Q
-// Sm Cos(r) = Sm + s_rsq Q
-// Q = Sm + s_rsq Q
+// S[m] Cos(r) = S[m](1 + rsq Q)
+// S[m] Cos(r) = S[m] + S[m] rsq Q
+// S[m] Cos(r) = S[m] + s_rsq Q
+// Q = S[m] + s_rsq Q
//
// Then,
//
-// Answer = Q + Cm P
+// Answer = Q + C[m] P
-#include "libm_support.h"
// Registers used
//==============================================================
@@ -174,99 +182,97 @@
// Assembly macros
//==============================================================
-sind_NORM_f8 = f9
-sind_W = f10
-sind_int_Nfloat = f11
-sind_Nfloat = f12
+sincos_NORM_f8 = f9
+sincos_W = f10
+sincos_int_Nfloat = f11
+sincos_Nfloat = f12
-sind_r = f13
-sind_rsq = f14
-sind_rcub = f15
+sincos_r = f13
+sincos_rsq = f14
+sincos_rcub = f15
+sincos_save_tmp = f15
-sind_Inv_Pi_by_16 = f32
-sind_Pi_by_16_hi = f33
-sind_Pi_by_16_lo = f34
+sincos_Inv_Pi_by_16 = f32
+sincos_Pi_by_16_1 = f33
+sincos_Pi_by_16_2 = f34
-sind_Inv_Pi_by_64 = f35
-sind_Pi_by_64_hi = f36
-sind_Pi_by_64_lo = f37
+sincos_Inv_Pi_by_64 = f35
-sind_Sm = f38
-sind_Cm = f39
+sincos_Pi_by_16_3 = f36
-sind_P1 = f40
-sind_Q1 = f41
-sind_P2 = f42
-sind_Q2 = f43
-sind_P3 = f44
-sind_Q3 = f45
-sind_P4 = f46
-sind_Q4 = f47
+sincos_r_exact = f37
-sind_P_temp1 = f48
-sind_P_temp2 = f49
+sincos_Sm = f38
+sincos_Cm = f39
-sind_Q_temp1 = f50
-sind_Q_temp2 = f51
+sincos_P1 = f40
+sincos_Q1 = f41
+sincos_P2 = f42
+sincos_Q2 = f43
+sincos_P3 = f44
+sincos_Q3 = f45
+sincos_P4 = f46
+sincos_Q4 = f47
-sind_P = f52
-sind_Q = f53
+sincos_P_temp1 = f48
+sincos_P_temp2 = f49
-sind_srsq = f54
+sincos_Q_temp1 = f50
+sincos_Q_temp2 = f51
-sind_SIG_INV_PI_BY_16_2TO61 = f55
-sind_RSHF_2TO61 = f56
-sind_RSHF = f57
-sind_2TOM61 = f58
-sind_NFLOAT = f59
-sind_W_2TO61_RSH = f60
+sincos_P = f52
+sincos_Q = f53
-fp_tmp = f61
+sincos_srsq = f54
+
+sincos_SIG_INV_PI_BY_16_2TO61 = f55
+sincos_RSHF_2TO61 = f56
+sincos_RSHF = f57
+sincos_2TOM61 = f58
+sincos_NFLOAT = f59
+sincos_W_2TO61_RSH = f60
+
+fp_tmp = f61
/////////////////////////////////////////////////////////////
-sind_AD_1 = r33
-sind_AD_2 = r34
-sind_exp_limit = r35
-sind_r_signexp = r36
-sind_AD_beta_table = r37
-sind_r_sincos = r38
+sincos_AD_1 = r33
+sincos_AD_2 = r34
+sincos_exp_limit = r35
+sincos_r_signexp = r36
+sincos_AD_beta_table = r37
+sincos_r_sincos = r38
-sind_r_exp = r39
-sind_r_17_ones = r40
+sincos_r_exp = r39
+sincos_r_17_ones = r40
-sind_GR_sig_inv_pi_by_16 = r14
-sind_GR_rshf_2to61 = r15
-sind_GR_rshf = r16
-sind_GR_exp_2tom61 = r17
-sind_GR_n = r18
-sind_GR_m = r19
-sind_GR_32m = r19
+sincos_GR_sig_inv_pi_by_16 = r14
+sincos_GR_rshf_2to61 = r15
+sincos_GR_rshf = r16
+sincos_GR_exp_2tom61 = r17
+sincos_GR_n = r18
+sincos_GR_m = r19
+sincos_GR_32m = r19
+sincos_GR_all_ones = r19
-gr_tmp = r41
-GR_SAVE_PFS = r41
-GR_SAVE_B0 = r42
-GR_SAVE_GP = r43
+gr_tmp = r41
+GR_SAVE_PFS = r41
+GR_SAVE_B0 = r42
+GR_SAVE_GP = r43
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+RODATA
+// Pi/16 parts
.align 16
-double_sind_pi:
-ASM_TYPE_DIRECTIVE(double_sind_pi,@object)
-// data8 0xA2F9836E4E44152A, 0x00004001 // 16/pi (significand loaded w/ setf)
-// c90fdaa22168c234
- data8 0xC90FDAA22168C234, 0x00003FFC // pi/16 hi
-// c4c6628b80dc1cd1 29024e088a
- data8 0xC4C6628B80DC1CD1, 0x00003FBC // pi/16 lo
-ASM_SIZE_DIRECTIVE(double_sind_pi)
-
-double_sind_pq_k4:
-ASM_TYPE_DIRECTIVE(double_sind_pq_k4,@object)
+LOCAL_OBJECT_START(double_sincos_pi)
+ data8 0xC90FDAA22168C234, 0x00003FFC // pi/16 1st part
+ data8 0xC4C6628B80DC1CD1, 0x00003FBC // pi/16 2nd part
+ data8 0xA4093822299F31D0, 0x00003F7A // pi/16 3rd part
+LOCAL_OBJECT_END(double_sincos_pi)
+
+// Coefficients for polynomials
+LOCAL_OBJECT_START(double_sincos_pq_k4)
data8 0x3EC71C963717C63A // P4
data8 0x3EF9FFBA8F191AE6 // Q4
data8 0xBF2A01A00F4E11A8 // P3
@@ -275,125 +281,119 @@ ASM_TYPE_DIRECTIVE(double_sind_pq_k4,@object)
data8 0x3FA555555554DD45 // Q2
data8 0xBFC5555555555555 // P1
data8 0xBFDFFFFFFFFFFFFC // Q1
-ASM_SIZE_DIRECTIVE(double_sind_pq_k4)
+LOCAL_OBJECT_END(double_sincos_pq_k4)
+// Sincos table (S[m], C[m])
+LOCAL_OBJECT_START(double_sin_cos_beta_k4)
-double_sin_cos_beta_k4:
-ASM_TYPE_DIRECTIVE(double_sin_cos_beta_k4,@object)
data8 0x0000000000000000 , 0x00000000 // sin( 0 pi/16) S0
data8 0x8000000000000000 , 0x00003fff // cos( 0 pi/16) C0
-
+//
data8 0xc7c5c1e34d3055b3 , 0x00003ffc // sin( 1 pi/16) S1
data8 0xfb14be7fbae58157 , 0x00003ffe // cos( 1 pi/16) C1
-
+//
data8 0xc3ef1535754b168e , 0x00003ffd // sin( 2 pi/16) S2
data8 0xec835e79946a3146 , 0x00003ffe // cos( 2 pi/16) C2
-
+//
data8 0x8e39d9cd73464364 , 0x00003ffe // sin( 3 pi/16) S3
data8 0xd4db3148750d181a , 0x00003ffe // cos( 3 pi/16) C3
-
+//
data8 0xb504f333f9de6484 , 0x00003ffe // sin( 4 pi/16) S4
data8 0xb504f333f9de6484 , 0x00003ffe // cos( 4 pi/16) C4
-
-
+//
+//
data8 0xd4db3148750d181a , 0x00003ffe // sin( 5 pi/16) C3
data8 0x8e39d9cd73464364 , 0x00003ffe // cos( 5 pi/16) S3
-
+//
data8 0xec835e79946a3146 , 0x00003ffe // sin( 6 pi/16) C2
data8 0xc3ef1535754b168e , 0x00003ffd // cos( 6 pi/16) S2
-
+//
data8 0xfb14be7fbae58157 , 0x00003ffe // sin( 7 pi/16) C1
data8 0xc7c5c1e34d3055b3 , 0x00003ffc // cos( 7 pi/16) S1
-
+//
data8 0x8000000000000000 , 0x00003fff // sin( 8 pi/16) C0
data8 0x0000000000000000 , 0x00000000 // cos( 8 pi/16) S0
-
-
+//
+//
data8 0xfb14be7fbae58157 , 0x00003ffe // sin( 9 pi/16) C1
data8 0xc7c5c1e34d3055b3 , 0x0000bffc // cos( 9 pi/16) -S1
-
+//
data8 0xec835e79946a3146 , 0x00003ffe // sin(10 pi/16) C2
data8 0xc3ef1535754b168e , 0x0000bffd // cos(10 pi/16) -S2
-
+//
data8 0xd4db3148750d181a , 0x00003ffe // sin(11 pi/16) C3
data8 0x8e39d9cd73464364 , 0x0000bffe // cos(11 pi/16) -S3
-
+//
data8 0xb504f333f9de6484 , 0x00003ffe // sin(12 pi/16) S4
data8 0xb504f333f9de6484 , 0x0000bffe // cos(12 pi/16) -S4
-
-
+//
+//
data8 0x8e39d9cd73464364 , 0x00003ffe // sin(13 pi/16) S3
data8 0xd4db3148750d181a , 0x0000bffe // cos(13 pi/16) -C3
-
+//
data8 0xc3ef1535754b168e , 0x00003ffd // sin(14 pi/16) S2
data8 0xec835e79946a3146 , 0x0000bffe // cos(14 pi/16) -C2
-
+//
data8 0xc7c5c1e34d3055b3 , 0x00003ffc // sin(15 pi/16) S1
data8 0xfb14be7fbae58157 , 0x0000bffe // cos(15 pi/16) -C1
-
+//
data8 0x0000000000000000 , 0x00000000 // sin(16 pi/16) S0
data8 0x8000000000000000 , 0x0000bfff // cos(16 pi/16) -C0
-
-
+//
+//
data8 0xc7c5c1e34d3055b3 , 0x0000bffc // sin(17 pi/16) -S1
data8 0xfb14be7fbae58157 , 0x0000bffe // cos(17 pi/16) -C1
-
+//
data8 0xc3ef1535754b168e , 0x0000bffd // sin(18 pi/16) -S2
data8 0xec835e79946a3146 , 0x0000bffe // cos(18 pi/16) -C2
-
+//
data8 0x8e39d9cd73464364 , 0x0000bffe // sin(19 pi/16) -S3
data8 0xd4db3148750d181a , 0x0000bffe // cos(19 pi/16) -C3
-
+//
data8 0xb504f333f9de6484 , 0x0000bffe // sin(20 pi/16) -S4
data8 0xb504f333f9de6484 , 0x0000bffe // cos(20 pi/16) -S4
-
-
+//
+//
data8 0xd4db3148750d181a , 0x0000bffe // sin(21 pi/16) -C3
data8 0x8e39d9cd73464364 , 0x0000bffe // cos(21 pi/16) -S3
-
+//
data8 0xec835e79946a3146 , 0x0000bffe // sin(22 pi/16) -C2
data8 0xc3ef1535754b168e , 0x0000bffd // cos(22 pi/16) -S2
-
+//
data8 0xfb14be7fbae58157 , 0x0000bffe // sin(23 pi/16) -C1
data8 0xc7c5c1e34d3055b3 , 0x0000bffc // cos(23 pi/16) -S1
-
+//
data8 0x8000000000000000 , 0x0000bfff // sin(24 pi/16) -C0
data8 0x0000000000000000 , 0x00000000 // cos(24 pi/16) S0
-
-
+//
+//
data8 0xfb14be7fbae58157 , 0x0000bffe // sin(25 pi/16) -C1
data8 0xc7c5c1e34d3055b3 , 0x00003ffc // cos(25 pi/16) S1
-
+//
data8 0xec835e79946a3146 , 0x0000bffe // sin(26 pi/16) -C2
data8 0xc3ef1535754b168e , 0x00003ffd // cos(26 pi/16) S2
-
+//
data8 0xd4db3148750d181a , 0x0000bffe // sin(27 pi/16) -C3
data8 0x8e39d9cd73464364 , 0x00003ffe // cos(27 pi/16) S3
-
+//
data8 0xb504f333f9de6484 , 0x0000bffe // sin(28 pi/16) -S4
data8 0xb504f333f9de6484 , 0x00003ffe // cos(28 pi/16) S4
-
-
+//
+//
data8 0x8e39d9cd73464364 , 0x0000bffe // sin(29 pi/16) -S3
data8 0xd4db3148750d181a , 0x00003ffe // cos(29 pi/16) C3
-
+//
data8 0xc3ef1535754b168e , 0x0000bffd // sin(30 pi/16) -S2
data8 0xec835e79946a3146 , 0x00003ffe // cos(30 pi/16) C2
-
+//
data8 0xc7c5c1e34d3055b3 , 0x0000bffc // sin(31 pi/16) -S1
data8 0xfb14be7fbae58157 , 0x00003ffe // cos(31 pi/16) C1
-
+//
data8 0x0000000000000000 , 0x00000000 // sin(32 pi/16) S0
data8 0x8000000000000000 , 0x00003fff // cos(32 pi/16) C0
-ASM_SIZE_DIRECTIVE(double_sin_cos_beta_k4)
+LOCAL_OBJECT_END(double_sin_cos_beta_k4)
-.align 32
-.global sin#
-.global cos#
-#ifdef _LIBC
-.global __sin#
-.global __cos#
-#endif
+.section .text
////////////////////////////////////////////////////////
// There are two entry points: sin and cos
@@ -402,85 +402,63 @@ ASM_SIZE_DIRECTIVE(double_sin_cos_beta_k4)
// If from sin, p8 is true
// If from cos, p9 is true
-.section .text
-.proc sin#
-#ifdef _LIBC
-.proc __sin#
-#endif
-.align 32
-
-sin:
-#ifdef _LIBC
-__sin:
-#endif
+GLOBAL_IEEE754_ENTRY(sin)
{ .mlx
- alloc r32=ar.pfs,1,13,0,0
- movl sind_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A // significand of 16/pi
+ alloc r32 = ar.pfs, 1, 13, 0, 0
+ movl sincos_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A // signd of 16/pi
}
{ .mlx
- addl sind_AD_1 = @ltoff(double_sind_pi), gp
- movl sind_GR_rshf_2to61 = 0x47b8000000000000 // 1.1000 2^(63+63-2)
+ addl sincos_AD_1 = @ltoff(double_sincos_pi), gp
+ movl sincos_GR_rshf_2to61 = 0x47b8000000000000 // 1.1 2^(63+63-2)
}
;;
{ .mfi
- ld8 sind_AD_1 = [sind_AD_1]
- fnorm sind_NORM_f8 = f8
- cmp.eq p8,p9 = r0, r0
+ ld8 sincos_AD_1 = [sincos_AD_1]
+ fnorm.s0 sincos_NORM_f8 = f8 // Normalize argument
+ cmp.eq p8,p9 = r0, r0 // set p8 (clear p9) for sin
}
{ .mib
- mov sind_GR_exp_2tom61 = 0xffff-61 // exponent of scaling factor 2^-61
- mov sind_r_sincos = 0x0
- br.cond.sptk L(SIND_SINCOS)
+ mov sincos_GR_exp_2tom61 = 0xffff-61 // exponent of scale 2^-61
+ mov sincos_r_sincos = 0x0 // sincos_r_sincos = 0 for sin
+ br.cond.sptk _SINCOS_COMMON // go to common part
}
;;
-.endp sin
-ASM_SIZE_DIRECTIVE(sin)
-
-
-.section .text
-.proc cos#
-#ifdef _LIBC
-.proc __cos#
-#endif
-.align 32
-cos:
-#ifdef _LIBC
-__cos:
-#endif
+GLOBAL_IEEE754_END(sin)
+GLOBAL_IEEE754_ENTRY(cos)
{ .mlx
- alloc r32=ar.pfs,1,13,0,0
- movl sind_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A // significand of 16/pi
+ alloc r32 = ar.pfs, 1, 13, 0, 0
+ movl sincos_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A // signd of 16/pi
}
{ .mlx
- addl sind_AD_1 = @ltoff(double_sind_pi), gp
- movl sind_GR_rshf_2to61 = 0x47b8000000000000 // 1.1000 2^(63+63-2)
+ addl sincos_AD_1 = @ltoff(double_sincos_pi), gp
+ movl sincos_GR_rshf_2to61 = 0x47b8000000000000 // 1.1 2^(63+63-2)
}
;;
{ .mfi
- ld8 sind_AD_1 = [sind_AD_1]
- fnorm.s1 sind_NORM_f8 = f8
- cmp.eq p9,p8 = r0, r0
+ ld8 sincos_AD_1 = [sincos_AD_1]
+ fnorm.s1 sincos_NORM_f8 = f8 // Normalize argument
+ cmp.eq p9,p8 = r0, r0 // set p9 (clear p8) for cos
}
{ .mib
- mov sind_GR_exp_2tom61 = 0xffff-61 // exponent of scaling factor 2^-61
- mov sind_r_sincos = 0x8
- br.cond.sptk L(SIND_SINCOS)
+ mov sincos_GR_exp_2tom61 = 0xffff-61 // exp of scale 2^-61
+ mov sincos_r_sincos = 0x8 // sincos_r_sincos = 8 for cos
+ nop.b 999
}
;;
-
////////////////////////////////////////////////////////
// All entry points end up here.
-// If from sin, sind_r_sincos is 0 and p8 is true
-// If from cos, sind_r_sincos is 8 = 2^(k-1) and p9 is true
-// We add sind_r_sincos to N
+// If from sin, sincos_r_sincos is 0 and p8 is true
+// If from cos, sincos_r_sincos is 8 = 2^(k-1) and p9 is true
+// We add sincos_r_sincos to N
-L(SIND_SINCOS):
+///////////// Common sin and cos part //////////////////
+_SINCOS_COMMON:
// Form two constants we need
@@ -488,3014 +466,320 @@ L(SIND_SINCOS):
// 1.1000...000 * 2^(63+63-2) to right shift int(W) into the low significand
// fcmp used to set denormal, and invalid on snans
{ .mfi
- setf.sig sind_SIG_INV_PI_BY_16_2TO61 = sind_GR_sig_inv_pi_by_16
- fcmp.eq.s0 p12,p0=f8,f0
- mov sind_r_17_ones = 0x1ffff
+ setf.sig sincos_SIG_INV_PI_BY_16_2TO61 = sincos_GR_sig_inv_pi_by_16
+ fclass.m p6,p0 = f8, 0xe7 // if x = 0,inf,nan
+ mov sincos_exp_limit = 0x1001a
}
{ .mlx
- setf.d sind_RSHF_2TO61 = sind_GR_rshf_2to61
- movl sind_GR_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift
-}
+ setf.d sincos_RSHF_2TO61 = sincos_GR_rshf_2to61
+ movl sincos_GR_rshf = 0x43e8000000000000 // 1.1 2^63
+} // Right shift
;;
// Form another constant
// 2^-61 for scaling Nfloat
-// 0x10009 is register_bias + 10.
-// So if f8 > 2^10 = Gamma, go to DBX
-{ .mfi
- setf.exp sind_2TOM61 = sind_GR_exp_2tom61
- fclass.m p13,p0 = f8, 0x23 // Test for x inf
- mov sind_exp_limit = 0x10009
+// 0x1001a is register_bias + 27.
+// So if f8 >= 2^27, go to large argument routines
+{ .mmi
+ getf.exp sincos_r_signexp = f8
+ setf.exp sincos_2TOM61 = sincos_GR_exp_2tom61
+ addl gr_tmp = -1,r0 // Create constant used to set inexact
}
;;
// Load the two pieces of pi/16
// Form another constant
// 1.1000...000 * 2^63, the right shift constant
-{ .mmf
- ldfe sind_Pi_by_16_hi = [sind_AD_1],16
- setf.d sind_RSHF = sind_GR_rshf
- fclass.m p14,p0 = f8, 0xc3 // Test for x nan
-}
-;;
-
-{ .mfi
- ldfe sind_Pi_by_16_lo = [sind_AD_1],16
-(p13) frcpa.s0 f8,p12=f0,f0 // force qnan indef for x=inf
- addl gr_tmp = -1,r0
-}
-{ .mfb
- addl sind_AD_beta_table = @ltoff(double_sin_cos_beta_k4), gp
- nop.f 999
-(p13) br.ret.spnt b0 ;; // Exit for x=inf
-}
-
-// Start loading P, Q coefficients
-// SIN(0)
-{ .mfi
- ldfpd sind_P4,sind_Q4 = [sind_AD_1],16
-(p8) fclass.m.unc p6,p0 = f8, 0x07 // Test for sin(0)
- nop.i 999
-}
-{ .mfb
- addl sind_AD_beta_table = @ltoff(double_sin_cos_beta_k4), gp
-(p14) fma.d f8=f8,f1,f0 // qnan for x=nan
-(p14) br.ret.spnt b0 ;; // Exit for x=nan
-}
-
-
-// COS(0)
-{ .mfi
- getf.exp sind_r_signexp = f8
-(p9) fclass.m.unc p7,p0 = f8, 0x07 // Test for sin(0)
- nop.i 999
-}
-{ .mfi
- ld8 sind_AD_beta_table = [sind_AD_beta_table]
- nop.f 999
- nop.i 999 ;;
-}
-
{ .mmb
- ldfpd sind_P3,sind_Q3 = [sind_AD_1],16
- setf.sig fp_tmp = gr_tmp // Create constant such that fmpy sets inexact
-(p6) br.ret.spnt b0 ;;
-}
-
-{ .mfb
- and sind_r_exp = sind_r_17_ones, sind_r_signexp
-(p7) fmerge.s f8 = f1,f1
-(p7) br.ret.spnt b0 ;;
-}
-
-// p10 is true if we must call routines to handle larger arguments
-// p10 is true if f8 exp is > 0x10009
-
-{ .mfi
- ldfpd sind_P2,sind_Q2 = [sind_AD_1],16
- nop.f 999
- cmp.ge p10,p0 = sind_r_exp,sind_exp_limit
+ ldfe sincos_Pi_by_16_1 = [sincos_AD_1],16
+ setf.d sincos_RSHF = sincos_GR_rshf
+(p6) br.cond.spnt _SINCOS_SPECIAL_ARGS
}
;;
-// sind_W = x * sind_Inv_Pi_by_16
-// Multiply x by scaled 16/pi and add large const to shift integer part of W to
-// rightmost bits of significand
-{ .mfi
- ldfpd sind_P1,sind_Q1 = [sind_AD_1]
- fma.s1 sind_W_2TO61_RSH = sind_NORM_f8,sind_SIG_INV_PI_BY_16_2TO61,sind_RSHF_2TO61
- nop.i 999
-}
-{ .mbb
-(p10) cmp.ne.unc p11,p12=sind_r_sincos,r0 // p11 call __libm_cos_double_dbx
- // p12 call __libm_sin_double_dbx
-(p11) br.cond.spnt L(COSD_DBX)
-(p12) br.cond.spnt L(SIND_DBX)
-}
-;;
-
-
-// sind_NFLOAT = Round_Int_Nearest(sind_W)
-// This is done by scaling back by 2^-61 and subtracting the shift constant
-{ .mfi
- nop.m 999
- fms.s1 sind_NFLOAT = sind_W_2TO61_RSH,sind_2TOM61,sind_RSHF
- nop.i 999 ;;
-}
-
-
-// get N = (int)sind_int_Nfloat
-{ .mfi
- getf.sig sind_GR_n = sind_W_2TO61_RSH
- nop.f 999
- nop.i 999 ;;
-}
-
-// Add 2^(k-1) (which is in sind_r_sincos) to N
-// sind_r = -sind_Nfloat * sind_Pi_by_16_hi + x
-// sind_r = sind_r -sind_Nfloat * sind_Pi_by_16_lo
-{ .mfi
- add sind_GR_n = sind_GR_n, sind_r_sincos
- fnma.s1 sind_r = sind_NFLOAT, sind_Pi_by_16_hi, sind_NORM_f8
- nop.i 999 ;;
-}
-
-
-// Get M (least k+1 bits of N)
{ .mmi
- and sind_GR_m = 0x1f,sind_GR_n ;;
- nop.m 999
- shl sind_GR_32m = sind_GR_m,5 ;;
-}
-
-// Add 32*M to address of sin_cos_beta table
-{ .mmi
- add sind_AD_2 = sind_GR_32m, sind_AD_beta_table
- nop.m 999
- nop.i 999 ;;
-}
-
-{ .mfi
- ldfe sind_Sm = [sind_AD_2],16
-(p8) fclass.m.unc p10,p0=f8,0x0b // If sin, note denormal input to set uflow
- nop.i 999 ;;
-}
-
-{ .mfi
- ldfe sind_Cm = [sind_AD_2]
- fnma.s1 sind_r = sind_NFLOAT, sind_Pi_by_16_lo, sind_r
- nop.i 999 ;;
-}
-
-// get rsq
-{ .mfi
- nop.m 999
- fma.s1 sind_rsq = sind_r, sind_r, f0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fmpy.s0 fp_tmp = fp_tmp,fp_tmp // fmpy forces inexact flag
- nop.i 999 ;;
-}
-
-// form P and Q series
-{ .mfi
- nop.m 999
- fma.s1 sind_P_temp1 = sind_rsq, sind_P4, sind_P3
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
- fma.s1 sind_Q_temp1 = sind_rsq, sind_Q4, sind_Q3
- nop.i 999 ;;
-}
-
-// get rcube and sm*rsq
-{ .mfi
- nop.m 999
- fmpy.s1 sind_srsq = sind_Sm,sind_rsq
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
- fmpy.s1 sind_rcub = sind_r, sind_rsq
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
- fma.s1 sind_Q_temp2 = sind_rsq, sind_Q_temp1, sind_Q2
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
- fma.s1 sind_P_temp2 = sind_rsq, sind_P_temp1, sind_P2
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
- fma.s1 sind_Q = sind_rsq, sind_Q_temp2, sind_Q1
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
- fma.s1 sind_P = sind_rsq, sind_P_temp2, sind_P1
- nop.i 999 ;;
-}
-
-// Get final P and Q
-{ .mfi
- nop.m 999
- fma.s1 sind_Q = sind_srsq,sind_Q, sind_Sm
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
- fma.s1 sind_P = sind_rcub,sind_P, sind_r
- nop.i 999 ;;
-}
-
-// If sin(denormal), force inexact to be set
-{ .mfi
- nop.m 999
-(p10) fmpy.d.s0 fp_tmp = f8,f8
- nop.i 999 ;;
-}
-
-// Final calculation
-{ .mfb
- nop.m 999
- fma.d f8 = sind_Cm, sind_P, sind_Q
- br.ret.sptk b0 ;;
-}
-.endp cos#
-ASM_SIZE_DIRECTIVE(cos#)
-
-
-
-.proc __libm_callout_1s
-__libm_callout_1s:
-L(SIND_DBX):
-.prologue
-{ .mfi
- nop.m 0
- nop.f 0
-.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs
-}
-;;
-
-{ .mfi
- mov GR_SAVE_GP=gp
- nop.f 0
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0
-}
-
-.body
-{ .mib
- nop.m 999
- nop.i 999
- br.call.sptk.many b0=__libm_sin_double_dbx# ;;
-}
-;;
-
-
-{ .mfi
- mov gp = GR_SAVE_GP
- nop.f 999
- mov b0 = GR_SAVE_B0
-}
-;;
-
-{ .mib
- nop.m 999
- mov ar.pfs = GR_SAVE_PFS
- br.ret.sptk b0 ;;
-}
-.endp __libm_callout_1s
-ASM_SIZE_DIRECTIVE(__libm_callout_1s)
-
-
-.proc __libm_callout_1c
-__libm_callout_1c:
-L(COSD_DBX):
-.prologue
-{ .mfi
- nop.m 0
- nop.f 0
-.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs
-}
-;;
-
-{ .mfi
- mov GR_SAVE_GP=gp
- nop.f 0
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0
-}
-
-.body
-{ .mib
- nop.m 999
- nop.i 999
- br.call.sptk.many b0=__libm_cos_double_dbx# ;;
-}
-;;
-
+ ldfe sincos_Pi_by_16_2 = [sincos_AD_1],16
+ setf.sig fp_tmp = gr_tmp // constant for inexact set
+ nop.i 999
+};;
{ .mfi
- mov gp = GR_SAVE_GP
- nop.f 999
- mov b0 = GR_SAVE_B0
-}
-;;
-
-{ .mib
- nop.m 999
- mov ar.pfs = GR_SAVE_PFS
- br.ret.sptk b0 ;;
-}
-.endp __libm_callout_1c
-ASM_SIZE_DIRECTIVE(__libm_callout_1c)
-
-
-// ====================================================================
-// ====================================================================
-
-// These functions calculate the sin and cos for inputs
-// greater than 2^10
-// __libm_sin_double_dbx# and __libm_cos_double_dbx#
-
-// *********************************************************************
-// *********************************************************************
-//
-// Function: Combined sin(x) and cos(x), where
-//
-// sin(x) = sine(x), for double precision x values
-// cos(x) = cosine(x), for double precision x values
-//
-// *********************************************************************
-//
-// Accuracy: Within .7 ulps for 80-bit floating point values
-// Very accurate for double precision values
-//
-// *********************************************************************
-//
-// Resources Used:
-//
-// Floating-Point Registers: f8 (Input and Return Value)
-// f32-f99
-//
-// General Purpose Registers:
-// r32-r43
-// r44-r45 (Used to pass arguments to pi_by_2 reduce routine)
-//
-// Predicate Registers: p6-p13
-//
-// *********************************************************************
-//
-// IEEE Special Conditions:
-//
-// Denormal fault raised on denormal inputs
-// Overflow exceptions do not occur
-// Underflow exceptions raised when appropriate for sin
-// (No specialized error handling for this routine)
-// Inexact raised when appropriate by algorithm
-//
-// sin(SNaN) = QNaN
-// sin(QNaN) = QNaN
-// sin(inf) = QNaN
-// sin(+/-0) = +/-0
-// cos(inf) = QNaN
-// cos(SNaN) = QNaN
-// cos(QNaN) = QNaN
-// cos(0) = 1
-//
-// *********************************************************************
-//
-// Mathematical Description
-// ========================
-//
-// The computation of FSIN and FCOS is best handled in one piece of
-// code. The main reason is that given any argument Arg, computation
-// of trigonometric functions first calculate N and an approximation
-// to alpha where
-//
-// Arg = N pi/2 + alpha, |alpha| <= pi/4.
-//
-// Since
-//
-// cos( Arg ) = sin( (N+1) pi/2 + alpha ),
-//
-// therefore, the code for computing sine will produce cosine as long
-// as 1 is added to N immediately after the argument reduction
-// process.
-//
-// Let M = N if sine
-// N+1 if cosine.
-//
-// Now, given
-//
-// Arg = M pi/2 + alpha, |alpha| <= pi/4,
-//
-// let I = M mod 4, or I be the two lsb of M when M is represented
-// as 2's complement. I = [i_0 i_1]. Then
-//
-// sin( Arg ) = (-1)^i_0 sin( alpha ) if i_1 = 0,
-// = (-1)^i_0 cos( alpha ) if i_1 = 1.
-//
-// For example:
-// if M = -1, I = 11
-// sin ((-pi/2 + alpha) = (-1) cos (alpha)
-// if M = 0, I = 00
-// sin (alpha) = sin (alpha)
-// if M = 1, I = 01
-// sin (pi/2 + alpha) = cos (alpha)
-// if M = 2, I = 10
-// sin (pi + alpha) = (-1) sin (alpha)
-// if M = 3, I = 11
-// sin ((3/2)pi + alpha) = (-1) cos (alpha)
-//
-// The value of alpha is obtained by argument reduction and
-// represented by two working precision numbers r and c where
-//
-// alpha = r + c accurately.
-//
-// The reduction method is described in a previous write up.
-// The argument reduction scheme identifies 4 cases. For Cases 2
-// and 4, because |alpha| is small, sin(r+c) and cos(r+c) can be
-// computed very easily by 2 or 3 terms of the Taylor series
-// expansion as follows:
-//
-// Case 2:
-// -------
-//
-// sin(r + c) = r + c - r^3/6 accurately
-// cos(r + c) = 1 - 2^(-67) accurately
-//
-// Case 4:
-// -------
-//
-// sin(r + c) = r + c - r^3/6 + r^5/120 accurately
-// cos(r + c) = 1 - r^2/2 + r^4/24 accurately
-//
-// The only cases left are Cases 1 and 3 of the argument reduction
-// procedure. These two cases will be merged since after the
-// argument is reduced in either cases, we have the reduced argument
-// represented as r + c and that the magnitude |r + c| is not small
-// enough to allow the usage of a very short approximation.
-//
-// The required calculation is either
-//
-// sin(r + c) = sin(r) + correction, or
-// cos(r + c) = cos(r) + correction.
-//
-// Specifically,
-//
-// sin(r + c) = sin(r) + c sin'(r) + O(c^2)
-// = sin(r) + c cos (r) + O(c^2)
-// = sin(r) + c(1 - r^2/2) accurately.
-// Similarly,
-//
-// cos(r + c) = cos(r) - c sin(r) + O(c^2)
-// = cos(r) - c(r - r^3/6) accurately.
-//
-// We therefore concentrate on accurately calculating sin(r) and
-// cos(r) for a working-precision number r, |r| <= pi/4 to within
-// 0.1% or so.
-//
-// The greatest challenge of this task is that the second terms of
-// the Taylor series
-//
-// r - r^3/3! + r^r/5! - ...
-//
-// and
-//
-// 1 - r^2/2! + r^4/4! - ...
-//
-// are not very small when |r| is close to pi/4 and the rounding
-// errors will be a concern if simple polynomial accumulation is
-// used. When |r| < 2^-3, however, the second terms will be small
-// enough (6 bits or so of right shift) that a normal Horner
-// recurrence suffices. Hence there are two cases that we consider
-// in the accurate computation of sin(r) and cos(r), |r| <= pi/4.
-//
-// Case small_r: |r| < 2^(-3)
-// --------------------------
-//
-// Since Arg = M pi/4 + r + c accurately, and M mod 4 is [i_0 i_1],
-// we have
-//
-// sin(Arg) = (-1)^i_0 * sin(r + c) if i_1 = 0
-// = (-1)^i_0 * cos(r + c) if i_1 = 1
-//
-// can be accurately approximated by
-//
-// sin(Arg) = (-1)^i_0 * [sin(r) + c] if i_1 = 0
-// = (-1)^i_0 * [cos(r) - c*r] if i_1 = 1
-//
-// because |r| is small and thus the second terms in the correction
-// are unneccessary.
-//
-// Finally, sin(r) and cos(r) are approximated by polynomials of
-// moderate lengths.
-//
-// sin(r) = r + S_1 r^3 + S_2 r^5 + ... + S_5 r^11
-// cos(r) = 1 + C_1 r^2 + C_2 r^4 + ... + C_5 r^10
-//
-// We can make use of predicates to selectively calculate
-// sin(r) or cos(r) based on i_1.
-//
-// Case normal_r: 2^(-3) <= |r| <= pi/4
-// ------------------------------------
-//
-// This case is more likely than the previous one if one considers
-// r to be uniformly distributed in [-pi/4 pi/4]. Again,
-//
-// sin(Arg) = (-1)^i_0 * sin(r + c) if i_1 = 0
-// = (-1)^i_0 * cos(r + c) if i_1 = 1.
-//
-// Because |r| is now larger, we need one extra term in the
-// correction. sin(Arg) can be accurately approximated by
-//
-// sin(Arg) = (-1)^i_0 * [sin(r) + c(1-r^2/2)] if i_1 = 0
-// = (-1)^i_0 * [cos(r) - c*r*(1 - r^2/6)] i_1 = 1.
-//
-// Finally, sin(r) and cos(r) are approximated by polynomials of
-// moderate lengths.
-//
-// sin(r) = r + PP_1_hi r^3 + PP_1_lo r^3 +
-// PP_2 r^5 + ... + PP_8 r^17
-//
-// cos(r) = 1 + QQ_1 r^2 + QQ_2 r^4 + ... + QQ_8 r^16
-//
-// where PP_1_hi is only about 16 bits long and QQ_1 is -1/2.
-// The crux in accurate computation is to calculate
-//
-// r + PP_1_hi r^3 or 1 + QQ_1 r^2
-//
-// accurately as two pieces: U_hi and U_lo. The way to achieve this
-// is to obtain r_hi as a 10 sig. bit number that approximates r to
-// roughly 8 bits or so of accuracy. (One convenient way is
-//
-// r_hi := frcpa( frcpa( r ) ).)
-//
-// This way,
-//
-// r + PP_1_hi r^3 = r + PP_1_hi r_hi^3 +
-// PP_1_hi (r^3 - r_hi^3)
-// = [r + PP_1_hi r_hi^3] +
-// [PP_1_hi (r - r_hi)
-// (r^2 + r_hi r + r_hi^2) ]
-// = U_hi + U_lo
-//
-// Since r_hi is only 10 bit long and PP_1_hi is only 16 bit long,
-// PP_1_hi * r_hi^3 is only at most 46 bit long and thus computed
-// exactly. Furthermore, r and PP_1_hi r_hi^3 are of opposite sign
-// and that there is no more than 8 bit shift off between r and
-// PP_1_hi * r_hi^3. Hence the sum, U_hi, is representable and thus
-// calculated without any error. Finally, the fact that
-//
-// |U_lo| <= 2^(-8) |U_hi|
-//
-// says that U_hi + U_lo is approximating r + PP_1_hi r^3 to roughly
-// 8 extra bits of accuracy.
-//
-// Similarly,
-//
-// 1 + QQ_1 r^2 = [1 + QQ_1 r_hi^2] +
-// [QQ_1 (r - r_hi)(r + r_hi)]
-// = U_hi + U_lo.
-//
-// Summarizing, we calculate r_hi = frcpa( frcpa( r ) ).
-//
-// If i_1 = 0, then
-//
-// U_hi := r + PP_1_hi * r_hi^3
-// U_lo := PP_1_hi * (r - r_hi) * (r^2 + r*r_hi + r_hi^2)
-// poly := PP_1_lo r^3 + PP_2 r^5 + ... + PP_8 r^17
-// correction := c * ( 1 + C_1 r^2 )
-//
-// Else ...i_1 = 1
-//
-// U_hi := 1 + QQ_1 * r_hi * r_hi
-// U_lo := QQ_1 * (r - r_hi) * (r + r_hi)
-// poly := QQ_2 * r^4 + QQ_3 * r^6 + ... + QQ_8 r^16
-// correction := -c * r * (1 + S_1 * r^2)
-//
-// End
-//
-// Finally,
-//
-// V := poly + ( U_lo + correction )
-//
-// / U_hi + V if i_0 = 0
-// result := |
-// \ (-U_hi) - V if i_0 = 1
-//
-// It is important that in the last step, negation of U_hi is
-// performed prior to the subtraction which is to be performed in
-// the user-set rounding mode.
-//
-//
-// Algorithmic Description
-// =======================
-//
-// The argument reduction algorithm is tightly integrated into FSIN
-// and FCOS which share the same code. The following is complete and
-// self-contained. The argument reduction description given
-// previously is repeated below.
-//
-//
-// Step 0. Initialization.
-//
-// If FSIN is invoked, set N_inc := 0; else if FCOS is invoked,
-// set N_inc := 1.
-//
-// Step 1. Check for exceptional and special cases.
-//
-// * If Arg is +-0, +-inf, NaN, NaT, go to Step 10 for special
-// handling.
-// * If |Arg| < 2^24, go to Step 2 for reduction of moderate
-// arguments. This is the most likely case.
-// * If |Arg| < 2^63, go to Step 8 for pre-reduction of large
-// arguments.
-// * If |Arg| >= 2^63, go to Step 10 for special handling.
-//
-// Step 2. Reduction of moderate arguments.
-//
-// If |Arg| < pi/4 ...quick branch
-// N_fix := N_inc (integer)
-// r := Arg
-// c := 0.0
-// Branch to Step 4, Case_1_complete
-// Else ...cf. argument reduction
-// N := Arg * two_by_PI (fp)
-// N_fix := fcvt.fx( N ) (int)
-// N := fcvt.xf( N_fix )
-// N_fix := N_fix + N_inc
-// s := Arg - N * P_1 (first piece of pi/2)
-// w := -N * P_2 (second piece of pi/2)
-//
-// If |s| >= 2^(-33)
-// go to Step 3, Case_1_reduce
-// Else
-// go to Step 7, Case_2_reduce
-// Endif
-// Endif
-//
-// Step 3. Case_1_reduce.
-//
-// r := s + w
-// c := (s - r) + w ...observe order
-//
-// Step 4. Case_1_complete
-//
-// ...At this point, the reduced argument alpha is
-// ...accurately represented as r + c.
-// If |r| < 2^(-3), go to Step 6, small_r.
-//
-// Step 5. Normal_r.
-//
-// Let [i_0 i_1] be the 2 lsb of N_fix.
-// FR_rsq := r * r
-// r_hi := frcpa( frcpa( r ) )
-// r_lo := r - r_hi
-//
-// If i_1 = 0, then
-// poly := r*FR_rsq*(PP_1_lo + FR_rsq*(PP_2 + ... FR_rsq*PP_8))
-// U_hi := r + PP_1_hi*r_hi*r_hi*r_hi ...any order
-// U_lo := PP_1_hi*r_lo*(r*r + r*r_hi + r_hi*r_hi)
-// correction := c + c*C_1*FR_rsq ...any order
-// Else
-// poly := FR_rsq*FR_rsq*(QQ_2 + FR_rsq*(QQ_3 + ... + FR_rsq*QQ_8))
-// U_hi := 1 + QQ_1 * r_hi * r_hi ...any order
-// U_lo := QQ_1 * r_lo * (r + r_hi)
-// correction := -c*(r + S_1*FR_rsq*r) ...any order
-// Endif
-//
-// V := poly + (U_lo + correction) ...observe order
-//
-// result := (i_0 == 0? 1.0 : -1.0)
-//
-// Last instruction in user-set rounding mode
-//
-// result := (i_0 == 0? result*U_hi + V :
-// result*U_hi - V)
-//
-// Return
-//
-// Step 6. Small_r.
-//
-// ...Use flush to zero mode without causing exception
-// Let [i_0 i_1] be the two lsb of N_fix.
-//
-// FR_rsq := r * r
-//
-// If i_1 = 0 then
-// z := FR_rsq*FR_rsq; z := FR_rsq*z *r
-// poly_lo := S_3 + FR_rsq*(S_4 + FR_rsq*S_5)
-// poly_hi := r*FR_rsq*(S_1 + FR_rsq*S_2)
-// correction := c
-// result := r
-// Else
-// z := FR_rsq*FR_rsq; z := FR_rsq*z
-// poly_lo := C_3 + FR_rsq*(C_4 + FR_rsq*C_5)
-// poly_hi := FR_rsq*(C_1 + FR_rsq*C_2)
-// correction := -c*r
-// result := 1
-// Endif
-//
-// poly := poly_hi + (z * poly_lo + correction)
-//
-// If i_0 = 1, result := -result
-//
-// Last operation. Perform in user-set rounding mode
-//
-// result := (i_0 == 0? result + poly :
-// result - poly )
-// Return
-//
-// Step 7. Case_2_reduce.
-//
-// ...Refer to the write up for argument reduction for
-// ...rationale. The reduction algorithm below is taken from
-// ...the argument reduction description and integrated here.
-//
-// w := N*P_3
-// U_1 := N*P_2 + w ...FMA
-// U_2 := (N*P_2 - U_1) + w ...2 FMA
-// ...U_1 + U_2 is N*(P_2+P_3) accurately
-//
-// r := s - U_1
-// c := ( (s - r) - U_1 ) - U_2
-//
-// ...The mathematical sum r + c approximates the reduced
-// ...argument accurately. Note that although compared to
-// ...Case 1, this case requires much more work to reduce
-// ...the argument, the subsequent calculation needed for
-// ...any of the trigonometric function is very little because
-// ...|alpha| < 1.01*2^(-33) and thus two terms of the
-// ...Taylor series expansion suffices.
-//
-// If i_1 = 0 then
-// poly := c + S_1 * r * r * r ...any order
-// result := r
-// Else
-// poly := -2^(-67)
-// result := 1.0
-// Endif
-//
-// If i_0 = 1, result := -result
-//
-// Last operation. Perform in user-set rounding mode
-//
-// result := (i_0 == 0? result + poly :
-// result - poly )
-//
-// Return
-//
-//
-// Step 8. Pre-reduction of large arguments.
-//
-// ...Again, the following reduction procedure was described
-// ...in the separate write up for argument reduction, which
-// ...is tightly integrated here.
-
-// N_0 := Arg * Inv_P_0
-// N_0_fix := fcvt.fx( N_0 )
-// N_0 := fcvt.xf( N_0_fix)
-
-// Arg' := Arg - N_0 * P_0
-// w := N_0 * d_1
-// N := Arg' * two_by_PI
-// N_fix := fcvt.fx( N )
-// N := fcvt.xf( N_fix )
-// N_fix := N_fix + N_inc
-//
-// s := Arg' - N * P_1
-// w := w - N * P_2
-//
-// If |s| >= 2^(-14)
-// go to Step 3
-// Else
-// go to Step 9
-// Endif
-//
-// Step 9. Case_4_reduce.
-//
-// ...first obtain N_0*d_1 and -N*P_2 accurately
-// U_hi := N_0 * d_1 V_hi := -N*P_2
-// U_lo := N_0 * d_1 - U_hi V_lo := -N*P_2 - V_hi ...FMAs
-//
-// ...compute the contribution from N_0*d_2 and -N*P_3
-// w := -N*P_3
-// w := w + N_0*d_2
-// t := U_lo + V_lo + w ...any order
-//
-// ...at this point, the mathematical value
-// ...s + U_hi + V_hi + t approximates the true reduced argument
-// ...accurately. Just need to compute this accurately.
-//
-// ...Calculate U_hi + V_hi accurately:
-// A := U_hi + V_hi
-// if |U_hi| >= |V_hi| then
-// a := (U_hi - A) + V_hi
-// else
-// a := (V_hi - A) + U_hi
-// endif
-// ...order in computing "a" must be observed. This branch is
-// ...best implemented by predicates.
-// ...A + a is U_hi + V_hi accurately. Moreover, "a" is
-// ...much smaller than A: |a| <= (1/2)ulp(A).
-//
-// ...Just need to calculate s + A + a + t
-// C_hi := s + A t := t + a
-// C_lo := (s - C_hi) + A
-// C_lo := C_lo + t
-//
-// ...Final steps for reduction
-// r := C_hi + C_lo
-// c := (C_hi - r) + C_lo
-//
-// ...At this point, we have r and c
-// ...And all we need is a couple of terms of the corresponding
-// ...Taylor series.
-//
-// If i_1 = 0
-// poly := c + r*FR_rsq*(S_1 + FR_rsq*S_2)
-// result := r
-// Else
-// poly := FR_rsq*(C_1 + FR_rsq*C_2)
-// result := 1
-// Endif
-//
-// If i_0 = 1, result := -result
-//
-// Last operation. Perform in user-set rounding mode
-//
-// result := (i_0 == 0? result + poly :
-// result - poly )
-// Return
-//
-// Large Arguments: For arguments above 2**63, a Payne-Hanek
-// style argument reduction is used and pi_by_2 reduce is called.
-//
-
-
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
-.align 64
-
-FSINCOS_CONSTANTS:
-ASM_TYPE_DIRECTIVE(FSINCOS_CONSTANTS,@object)
-data4 0x4B800000, 0xCB800000, 0x00000000,0x00000000 // two**24, -two**24
-data4 0x4E44152A, 0xA2F9836E, 0x00003FFE,0x00000000 // Inv_pi_by_2
-data4 0xCE81B9F1, 0xC84D32B0, 0x00004016,0x00000000 // P_0
-data4 0x2168C235, 0xC90FDAA2, 0x00003FFF,0x00000000 // P_1
-data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD,0x00000000 // P_2
-data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C,0x00000000 // P_3
-data4 0x5F000000, 0xDF000000, 0x00000000,0x00000000 // two_to_63, -two_to_63
-data4 0x6EC6B45A, 0xA397E504, 0x00003FE7,0x00000000 // Inv_P_0
-data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF,0x00000000 // d_1
-data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C,0x00000000 // d_2
-data4 0x2168C234, 0xC90FDAA2, 0x00003FFE,0x00000000 // pi_by_4
-data4 0x2168C234, 0xC90FDAA2, 0x0000BFFE,0x00000000 // neg_pi_by_4
-data4 0x3E000000, 0xBE000000, 0x00000000,0x00000000 // two**-3, -two**-3
-data4 0x2F000000, 0xAF000000, 0x9E000000,0x00000000 // two**-33, -two**-33, -two**-67
-data4 0xA21C0BC9, 0xCC8ABEBC, 0x00003FCE,0x00000000 // PP_8
-data4 0x720221DA, 0xD7468A05, 0x0000BFD6,0x00000000 // PP_7
-data4 0x640AD517, 0xB092382F, 0x00003FDE,0x00000000 // PP_6
-data4 0xD1EB75A4, 0xD7322B47, 0x0000BFE5,0x00000000 // PP_5
-data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1
-data4 0x00000000, 0xAAAA0000, 0x0000BFFC,0x00000000 // PP_1_hi
-data4 0xBAF69EEA, 0xB8EF1D2A, 0x00003FEC,0x00000000 // PP_4
-data4 0x0D03BB69, 0xD00D00D0, 0x0000BFF2,0x00000000 // PP_3
-data4 0x88888962, 0x88888888, 0x00003FF8,0x00000000 // PP_2
-data4 0xAAAB0000, 0xAAAAAAAA, 0x0000BFEC,0x00000000 // PP_1_lo
-data4 0xC2B0FE52, 0xD56232EF, 0x00003FD2,0x00000000 // QQ_8
-data4 0x2B48DCA6, 0xC9C99ABA, 0x0000BFDA,0x00000000 // QQ_7
-data4 0x9C716658, 0x8F76C650, 0x00003FE2,0x00000000 // QQ_6
-data4 0xFDA8D0FC, 0x93F27DBA, 0x0000BFE9,0x00000000 // QQ_5
-data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC,0x00000000 // S_1
-data4 0x00000000, 0x80000000, 0x0000BFFE,0x00000000 // QQ_1
-data4 0x0C6E5041, 0xD00D00D0, 0x00003FEF,0x00000000 // QQ_4
-data4 0x0B607F60, 0xB60B60B6, 0x0000BFF5,0x00000000 // QQ_3
-data4 0xAAAAAA9B, 0xAAAAAAAA, 0x00003FFA,0x00000000 // QQ_2
-data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1
-data4 0xAAAA719F, 0xAAAAAAAA, 0x00003FFA,0x00000000 // C_2
-data4 0x0356F994, 0xB60B60B6, 0x0000BFF5,0x00000000 // C_3
-data4 0xB2385EA9, 0xD00CFFD5, 0x00003FEF,0x00000000 // C_4
-data4 0x292A14CD, 0x93E4BD18, 0x0000BFE9,0x00000000 // C_5
-data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC,0x00000000 // S_1
-data4 0x888868DB, 0x88888888, 0x00003FF8,0x00000000 // S_2
-data4 0x055EFD4B, 0xD00D00D0, 0x0000BFF2,0x00000000 // S_3
-data4 0x839730B9, 0xB8EF1C5D, 0x00003FEC,0x00000000 // S_4
-data4 0xE5B3F492, 0xD71EA3A4, 0x0000BFE5,0x00000000 // S_5
-data4 0x38800000, 0xB8800000, 0x00000000 // two**-14, -two**-14
-ASM_SIZE_DIRECTIVE(FSINCOS_CONSTANTS)
-
-FR_Input_X = f8
-FR_Neg_Two_to_M3 = f32
-FR_Two_to_63 = f32
-FR_Two_to_24 = f33
-FR_Pi_by_4 = f33
-FR_Two_to_M14 = f34
-FR_Two_to_M33 = f35
-FR_Neg_Two_to_24 = f36
-FR_Neg_Pi_by_4 = f36
-FR_Neg_Two_to_M14 = f37
-FR_Neg_Two_to_M33 = f38
-FR_Neg_Two_to_M67 = f39
-FR_Inv_pi_by_2 = f40
-FR_N_float = f41
-FR_N_fix = f42
-FR_P_1 = f43
-FR_P_2 = f44
-FR_P_3 = f45
-FR_s = f46
-FR_w = f47
-FR_c = f48
-FR_r = f49
-FR_Z = f50
-FR_A = f51
-FR_a = f52
-FR_t = f53
-FR_U_1 = f54
-FR_U_2 = f55
-FR_C_1 = f56
-FR_C_2 = f57
-FR_C_3 = f58
-FR_C_4 = f59
-FR_C_5 = f60
-FR_S_1 = f61
-FR_S_2 = f62
-FR_S_3 = f63
-FR_S_4 = f64
-FR_S_5 = f65
-FR_poly_hi = f66
-FR_poly_lo = f67
-FR_r_hi = f68
-FR_r_lo = f69
-FR_rsq = f70
-FR_r_cubed = f71
-FR_C_hi = f72
-FR_N_0 = f73
-FR_d_1 = f74
-FR_V = f75
-FR_V_hi = f75
-FR_V_lo = f76
-FR_U_hi = f77
-FR_U_lo = f78
-FR_U_hiabs = f79
-FR_V_hiabs = f80
-FR_PP_8 = f81
-FR_QQ_8 = f81
-FR_PP_7 = f82
-FR_QQ_7 = f82
-FR_PP_6 = f83
-FR_QQ_6 = f83
-FR_PP_5 = f84
-FR_QQ_5 = f84
-FR_PP_4 = f85
-FR_QQ_4 = f85
-FR_PP_3 = f86
-FR_QQ_3 = f86
-FR_PP_2 = f87
-FR_QQ_2 = f87
-FR_QQ_1 = f88
-FR_N_0_fix = f89
-FR_Inv_P_0 = f90
-FR_corr = f91
-FR_poly = f92
-FR_d_2 = f93
-FR_Two_to_M3 = f94
-FR_Neg_Two_to_63 = f94
-FR_P_0 = f95
-FR_C_lo = f96
-FR_PP_1 = f97
-FR_PP_1_lo = f98
-FR_ArgPrime = f99
-
-GR_Table_Base = r32
-GR_Table_Base1 = r33
-GR_i_0 = r34
-GR_i_1 = r35
-GR_N_Inc = r36
-GR_Sin_or_Cos = r37
-
-GR_SAVE_B0 = r39
-GR_SAVE_GP = r40
-GR_SAVE_PFS = r41
-
-.section .text
-.proc __libm_sin_double_dbx#
-.align 64
-__libm_sin_double_dbx:
-
-{ .mlx
-alloc GR_Table_Base = ar.pfs,0,12,2,0
- movl GR_Sin_or_Cos = 0x0 ;;
-}
-
-{ .mmi
- nop.m 999
- addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp
- nop.i 999
-}
-;;
-
-{ .mmi
- ld8 GR_Table_Base = [GR_Table_Base]
- nop.m 999
- nop.i 999
-}
-;;
-
-
-{ .mib
- nop.m 999
- nop.i 999
- br.cond.sptk L(SINCOS_CONTINUE) ;;
-}
-
-.endp __libm_sin_double_dbx#
-ASM_SIZE_DIRECTIVE(__libm_sin_double_dbx)
-
-.section .text
-.proc __libm_cos_double_dbx#
-__libm_cos_double_dbx:
-
-{ .mlx
-alloc GR_Table_Base= ar.pfs,0,12,2,0
- movl GR_Sin_or_Cos = 0x1 ;;
-}
-
-{ .mmi
- nop.m 999
- addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp
- nop.i 999
-}
-;;
+ ldfe sincos_Pi_by_16_3 = [sincos_AD_1],16
+ nop.f 999
+ nop.i 999
+};;
+// Polynomial coefficients (Q4, P4, Q3, P3, Q2, Q1, P2, P1) loading
{ .mmi
- ld8 GR_Table_Base = [GR_Table_Base]
- nop.m 999
- nop.i 999
-}
-;;
-
-//
-// Load Table Address
-//
-L(SINCOS_CONTINUE):
+ ldfpd sincos_P4,sincos_Q4 = [sincos_AD_1],16
+ nop.m 999
+ nop.i 999
+};;
+// Select exponent (17 lsb)
{ .mmi
- add GR_Table_Base1 = 96, GR_Table_Base
- ldfs FR_Two_to_24 = [GR_Table_Base], 4
- nop.i 999
+ ldfpd sincos_P3,sincos_Q3 = [sincos_AD_1],16
+ nop.m 999
+ dep.z sincos_r_exp = sincos_r_signexp, 0, 17
}
;;
-{ .mmi
- nop.m 999
-//
-// Load 2**24, load 2**63.
-//
- ldfs FR_Neg_Two_to_24 = [GR_Table_Base], 12
- mov r41 = ar.pfs ;;
-}
-
-{ .mfi
- ldfs FR_Two_to_63 = [GR_Table_Base1], 4
-//
-// Check for unnormals - unsupported operands. We do not want
-// to generate denormal exception
-// Check for NatVals, QNaNs, SNaNs, +/-Infs
-// Check for EM unsupporteds
-// Check for Zero
-//
- fclass.m.unc p6, p8 = FR_Input_X, 0x1E3
- mov r40 = gp ;;
-}
-
-{ .mfi
- nop.m 999
- fclass.nm.unc p8, p0 = FR_Input_X, 0x1FF
-// GR_Sin_or_Cos denotes
- mov r39 = b0
-}
-
-{ .mfb
- ldfs FR_Neg_Two_to_63 = [GR_Table_Base1], 12
- fclass.m.unc p10, p0 = FR_Input_X, 0x007
-(p6) br.cond.spnt L(SINCOS_SPECIAL) ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p8) br.cond.spnt L(SINCOS_SPECIAL) ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-//
-// Branch if +/- NaN, Inf.
-// Load -2**24, load -2**63.
-//
-(p10) br.cond.spnt L(SINCOS_ZERO) ;;
-}
-
-{ .mmb
- ldfe FR_Inv_pi_by_2 = [GR_Table_Base], 16
- ldfe FR_Inv_P_0 = [GR_Table_Base1], 16
- nop.b 999 ;;
-}
-
-{ .mmb
- nop.m 999
- ldfe FR_d_1 = [GR_Table_Base1], 16
- nop.b 999 ;;
-}
-//
-// Raise possible denormal operand flag with useful fcmp
-// Is x <= -2**63
-// Load Inv_P_0 for pre-reduction
-// Load Inv_pi_by_2
-//
-
+// p10 is true if we must call routines to handle larger arguments
+// p10 is true if f8 exp is >= 0x1001a (2^27)
{ .mmb
- ldfe FR_P_0 = [GR_Table_Base], 16
- ldfe FR_d_2 = [GR_Table_Base1], 16
- nop.b 999 ;;
-}
-//
-// Load P_0
-// Load d_1
-// Is x >= 2**63
-// Is x <= -2**24?
-//
-
-{ .mmi
- ldfe FR_P_1 = [GR_Table_Base], 16 ;;
-//
-// Load P_1
-// Load d_2
-// Is x >= 2**24?
-//
- ldfe FR_P_2 = [GR_Table_Base], 16
- nop.i 999 ;;
-}
-
-{ .mmf
- nop.m 999
- ldfe FR_P_3 = [GR_Table_Base], 16
- fcmp.le.unc.s1 p7, p8 = FR_Input_X, FR_Neg_Two_to_24
-}
-
-{ .mfi
- nop.m 999
-//
-// Branch if +/- zero.
-// Decide about the paths to take:
-// If -2**24 < FR_Input_X < 2**24 - CASE 1 OR 2
-// OTHERWISE - CASE 3 OR 4
-//
- fcmp.le.unc.s0 p10, p11 = FR_Input_X, FR_Neg_Two_to_63
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p8) fcmp.ge.s1 p7, p0 = FR_Input_X, FR_Two_to_24
- nop.i 999
-}
-
-{ .mfi
- ldfe FR_Pi_by_4 = [GR_Table_Base1], 16
-(p11) fcmp.ge.s1 p10, p0 = FR_Input_X, FR_Two_to_63
- nop.i 999 ;;
-}
-
-{ .mmi
- ldfe FR_Neg_Pi_by_4 = [GR_Table_Base1], 16 ;;
- ldfs FR_Two_to_M3 = [GR_Table_Base1], 4
- nop.i 999 ;;
-}
-
-{ .mib
- ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1], 12
- nop.i 999
-//
-// Load P_2
-// Load P_3
-// Load pi_by_4
-// Load neg_pi_by_4
-// Load 2**(-3)
-// Load -2**(-3).
-//
-(p10) br.cond.spnt L(SINCOS_ARG_TOO_LARGE) ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-//
-// Branch out if x >= 2**63. Use Payne-Hanek Reduction
-//
-(p7) br.cond.spnt L(SINCOS_LARGER_ARG) ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// Branch if Arg <= -2**24 or Arg >= 2**24 and use pre-reduction.
-//
- fma.s1 FR_N_float = FR_Input_X, FR_Inv_pi_by_2, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
- fcmp.lt.unc.s1 p6, p7 = FR_Input_X, FR_Pi_by_4
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// Select the case when |Arg| < pi/4
-// Else Select the case when |Arg| >= pi/4
-//
- fcvt.fx.s1 FR_N_fix = FR_N_float
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// N = Arg * 2/pi
-// Check if Arg < pi/4
-//
-(p6) fcmp.gt.s1 p6, p7 = FR_Input_X, FR_Neg_Pi_by_4
- nop.i 999 ;;
-}
-//
-// Case 2: Convert integer N_fix back to normalized floating-point value.
-// Case 1: p8 is only affected when p6 is set
-//
-
-{ .mfi
-(p7) ldfs FR_Two_to_M33 = [GR_Table_Base1], 4
-//
-// Grab the integer part of N and call it N_fix
-//
-(p6) fmerge.se FR_r = FR_Input_X, FR_Input_X
-// If |x| < pi/4, r = x and c = 0
-// If |x| < pi/4, is x < 2**(-3).
-// r = Arg
-// c = 0
-(p6) mov GR_N_Inc = GR_Sin_or_Cos ;;
-}
-
-{ .mmf
- nop.m 999
-(p7) ldfs FR_Neg_Two_to_M33 = [GR_Table_Base1], 4
-(p6) fmerge.se FR_c = f0, f0
-}
-
-{ .mfi
- nop.m 999
-(p6) fcmp.lt.unc.s1 p8, p9 = FR_Input_X, FR_Two_to_M3
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// If |x| < pi/4, is -2**(-3)< x < 2**(-3) - set p8.
-// If |x| >= pi/4,
-// Create the right N for |x| < pi/4 and otherwise
-// Case 2: Place integer part of N in GP register
-//
-(p7) fcvt.xf FR_N_float = FR_N_fix
- nop.i 999 ;;
-}
-
-{ .mmf
- nop.m 999
-(p7) getf.sig GR_N_Inc = FR_N_fix
-(p8) fcmp.gt.s1 p8, p0 = FR_Input_X, FR_Neg_Two_to_M3 ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-//
-// Load 2**(-33), -2**(-33)
-//
-(p8) br.cond.spnt L(SINCOS_SMALL_R) ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p6) br.cond.sptk L(SINCOS_NORMAL_R) ;;
-}
-//
-// if |x| < pi/4, branch based on |x| < 2**(-3) or otherwise.
-//
-//
-// In this branch, |x| >= pi/4.
-//
-
-{ .mfi
- ldfs FR_Neg_Two_to_M67 = [GR_Table_Base1], 8
-//
-// Load -2**(-67)
-//
- fnma.s1 FR_s = FR_N_float, FR_P_1, FR_Input_X
-//
-// w = N * P_2
-// s = -N * P_1 + Arg
-//
- add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos
-}
-
-{ .mfi
- nop.m 999
- fma.s1 FR_w = FR_N_float, FR_P_2, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// Adjust N_fix by N_inc to determine whether sine or
-// cosine is being calculated
-//
- fcmp.lt.unc.s1 p7, p6 = FR_s, FR_Two_to_M33
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p7) fcmp.gt.s1 p7, p6 = FR_s, FR_Neg_Two_to_M33
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-// Remember x >= pi/4.
-// Is s <= -2**(-33) or s >= 2**(-33) (p6)
-// or -2**(-33) < s < 2**(-33) (p7)
-(p6) fms.s1 FR_r = FR_s, f1, FR_w
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p7) fma.s1 FR_w = FR_N_float, FR_P_3, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p7) fma.s1 FR_U_1 = FR_N_float, FR_P_2, FR_w
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p6) fms.s1 FR_c = FR_s, f1, FR_r
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// For big s: r = s - w: No further reduction is necessary
-// For small s: w = N * P_3 (change sign) More reduction
-//
-(p6) fcmp.lt.unc.s1 p8, p9 = FR_r, FR_Two_to_M3
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p8) fcmp.gt.s1 p8, p9 = FR_r, FR_Neg_Two_to_M3
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p7) fms.s1 FR_r = FR_s, f1, FR_U_1
- nop.i 999
-}
-
-{ .mfb
- nop.m 999
-//
-// For big s: Is |r| < 2**(-3)?
-// For big s: c = S - r
-// For small s: U_1 = N * P_2 + w
-//
-// If p8 is set, prepare to branch to Small_R.
-// If p9 is set, prepare to branch to Normal_R.
-// For big s, r is complete here.
-//
-(p6) fms.s1 FR_c = FR_c, f1, FR_w
-//
-// For big s: c = c + w (w has not been negated.)
-// For small s: r = S - U_1
-//
-(p8) br.cond.spnt L(SINCOS_SMALL_R) ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p9) br.cond.sptk L(SINCOS_NORMAL_R) ;;
-}
-
-{ .mfi
-(p7) add GR_Table_Base1 = 224, GR_Table_Base1
-//
-// Branch to SINCOS_SMALL_R or SINCOS_NORMAL_R
-//
-(p7) fms.s1 FR_U_2 = FR_N_float, FR_P_2, FR_U_1
-//
-// c = S - U_1
-// r = S_1 * r
-//
-//
-(p7) extr.u GR_i_1 = GR_N_Inc, 0, 1
-}
-
-{ .mmi
- nop.m 999 ;;
-//
-// Get [i_0,i_1] - two lsb of N_fix_gr.
-// Do dummy fmpy so inexact is always set.
-//
-(p7) cmp.eq.unc p9, p10 = 0x0, GR_i_1
-(p7) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;;
-}
-//
-// For small s: U_2 = N * P_2 - U_1
-// S_1 stored constant - grab the one stored with the
-// coefficients.
-//
-
-{ .mfi
-(p7) ldfe FR_S_1 = [GR_Table_Base1], 16
-//
-// Check if i_1 and i_0 != 0
-//
-(p10) fma.s1 FR_poly = f0, f1, FR_Neg_Two_to_M67
-(p7) cmp.eq.unc p11, p12 = 0x0, GR_i_0 ;;
-}
-
-{ .mfi
- nop.m 999
-(p7) fms.s1 FR_s = FR_s, f1, FR_r
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-//
-// S = S - r
-// U_2 = U_2 + w
-// load S_1
-//
-(p7) fma.s1 FR_rsq = FR_r, FR_r, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p7) fma.s1 FR_U_2 = FR_U_2, f1, FR_w
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p7) fmerge.se FR_Input_X = FR_r, FR_r
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p10) fma.s1 FR_Input_X = f0, f1, f1
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// FR_rsq = r * r
-// Save r as the result.
-//
-(p7) fms.s1 FR_c = FR_s, f1, FR_U_1
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// if ( i_1 ==0) poly = c + S_1*r*r*r
-// else Result = 1
-//
-(p12) fnma.s1 FR_Input_X = FR_Input_X, f1, f0
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p7) fma.s1 FR_r = FR_S_1, FR_r, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p7) fma.d.s0 FR_S_1 = FR_S_1, FR_S_1, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// If i_1 != 0, poly = -2**(-67)
-//
-(p7) fms.s1 FR_c = FR_c, f1, FR_U_2
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// c = c - U_2
-//
-(p9) fma.s1 FR_poly = FR_r, FR_rsq, FR_c
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// i_0 != 0, so Result = -Result
-//
-(p11) fma.d.s0 FR_Input_X = FR_Input_X, f1, FR_poly
- nop.i 999 ;;
-}
-
-{ .mfb
- nop.m 999
-(p12) fms.d.s0 FR_Input_X = FR_Input_X, f1, FR_poly
-//
-// if (i_0 == 0), Result = Result + poly
-// else Result = Result - poly
-//
- br.ret.sptk b0 ;;
-}
-L(SINCOS_LARGER_ARG):
-
-{ .mfi
- nop.m 999
- fma.s1 FR_N_0 = FR_Input_X, FR_Inv_P_0, f0
- nop.i 999
-}
-;;
-
-// This path for argument > 2**24
-// Adjust table_ptr1 to beginning of table.
-//
-
-{ .mmi
- nop.m 999
- addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp
- nop.i 999
-}
-;;
-
-{ .mmi
- ld8 GR_Table_Base = [GR_Table_Base]
- nop.m 999
- nop.i 999
-}
-;;
-
-
-//
-// Point to 2**-14
-// N_0 = Arg * Inv_P_0
-//
-
-{ .mmi
- add GR_Table_Base = 688, GR_Table_Base ;;
- ldfs FR_Two_to_M14 = [GR_Table_Base], 4
- nop.i 999 ;;
-}
-
-{ .mfi
- ldfs FR_Neg_Two_to_M14 = [GR_Table_Base], 0
- nop.f 999
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// Load values 2**(-14) and -2**(-14)
-//
- fcvt.fx.s1 FR_N_0_fix = FR_N_0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// N_0_fix = integer part of N_0
-//
- fcvt.xf FR_N_0 = FR_N_0_fix
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// Make N_0 the integer part
-//
- fnma.s1 FR_ArgPrime = FR_N_0, FR_P_0, FR_Input_X
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
- fma.s1 FR_w = FR_N_0, FR_d_1, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// Arg' = -N_0 * P_0 + Arg
-// w = N_0 * d_1
-//
- fma.s1 FR_N_float = FR_ArgPrime, FR_Inv_pi_by_2, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// N = A' * 2/pi
-//
- fcvt.fx.s1 FR_N_fix = FR_N_float
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// N_fix is the integer part
-//
- fcvt.xf FR_N_float = FR_N_fix
- nop.i 999 ;;
-}
-
-{ .mfi
- getf.sig GR_N_Inc = FR_N_fix
- nop.f 999
- nop.i 999 ;;
-}
-
-{ .mii
- nop.m 999
- nop.i 999 ;;
- add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// N is the integer part of the reduced-reduced argument.
-// Put the integer in a GP register
-//
- fnma.s1 FR_s = FR_N_float, FR_P_1, FR_ArgPrime
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
- fnma.s1 FR_w = FR_N_float, FR_P_2, FR_w
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// s = -N*P_1 + Arg'
-// w = -N*P_2 + w
-// N_fix_gr = N_fix_gr + N_inc
-//
- fcmp.lt.unc.s1 p9, p8 = FR_s, FR_Two_to_M14
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p9) fcmp.gt.s1 p9, p8 = FR_s, FR_Neg_Two_to_M14
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// For |s| > 2**(-14) r = S + w (r complete)
-// Else U_hi = N_0 * d_1
-//
-(p9) fma.s1 FR_V_hi = FR_N_float, FR_P_2, f0
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p9) fma.s1 FR_U_hi = FR_N_0, FR_d_1, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// Either S <= -2**(-14) or S >= 2**(-14)
-// or -2**(-14) < s < 2**(-14)
-//
-(p8) fma.s1 FR_r = FR_s, f1, FR_w
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p9) fma.s1 FR_w = FR_N_float, FR_P_3, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// We need abs of both U_hi and V_hi - don't
-// worry about switched sign of V_hi.
-//
-(p9) fms.s1 FR_A = FR_U_hi, f1, FR_V_hi
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-//
-// Big s: finish up c = (S - r) + w (c complete)
-// Case 4: A = U_hi + V_hi
-// Note: Worry about switched sign of V_hi, so subtract instead of add.
-//
-(p9) fnma.s1 FR_V_lo = FR_N_float, FR_P_2, FR_V_hi
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p9) fms.s1 FR_U_lo = FR_N_0, FR_d_1, FR_U_hi
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p9) fmerge.s FR_V_hiabs = f0, FR_V_hi
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-// For big s: c = S - r
-// For small s do more work: U_lo = N_0 * d_1 - U_hi
-//
-(p9) fmerge.s FR_U_hiabs = f0, FR_U_hi
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// For big s: Is |r| < 2**(-3)
-// For big s: if p12 set, prepare to branch to Small_R.
-// For big s: If p13 set, prepare to branch to Normal_R.
-//
-(p8) fms.s1 FR_c = FR_s, f1, FR_r
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-//
-// For small S: V_hi = N * P_2
-// w = N * P_3
-// Note the product does not include the (-) as in the writeup
-// so (-) missing for V_hi and w.
-//
-(p8) fcmp.lt.unc.s1 p12, p13 = FR_r, FR_Two_to_M3
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p12) fcmp.gt.s1 p12, p13 = FR_r, FR_Neg_Two_to_M3
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p8) fma.s1 FR_c = FR_c, f1, FR_w
- nop.i 999
-}
-
-{ .mfb
- nop.m 999
-(p9) fms.s1 FR_w = FR_N_0, FR_d_2, FR_w
-(p12) br.cond.spnt L(SINCOS_SMALL_R) ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p13) br.cond.sptk L(SINCOS_NORMAL_R) ;;
-}
+ ldfpd sincos_P2,sincos_Q2 = [sincos_AD_1],16
+ cmp.ge p10,p0 = sincos_r_exp,sincos_exp_limit
+(p10) br.cond.spnt _SINCOS_LARGE_ARGS // Go to "large args" routine
+};;
+// sincos_W = x * sincos_Inv_Pi_by_16
+// Multiply x by scaled 16/pi and add large const to shift integer part of W to
+// rightmost bits of significand
{ .mfi
- nop.m 999
-//
-// Big s: Vector off when |r| < 2**(-3). Recall that p8 will be true.
-// The remaining stuff is for Case 4.
-// Small s: V_lo = N * P_2 + U_hi (U_hi is in place of V_hi in writeup)
-// Note: the (-) is still missing for V_lo.
-// Small s: w = w + N_0 * d_2
-// Note: the (-) is now incorporated in w.
-//
-(p9) fcmp.ge.unc.s1 p10, p11 = FR_U_hiabs, FR_V_hiabs
- extr.u GR_i_1 = GR_N_Inc, 0, 1 ;;
-}
+ ldfpd sincos_P1,sincos_Q1 = [sincos_AD_1],16
+ fma.s1 sincos_W_2TO61_RSH = sincos_NORM_f8,sincos_SIG_INV_PI_BY_16_2TO61,sincos_RSHF_2TO61
+ nop.i 999
+};;
+// sincos_NFLOAT = Round_Int_Nearest(sincos_W)
+// This is done by scaling back by 2^-61 and subtracting the shift constant
{ .mfi
- nop.m 999
-//
-// C_hi = S + A
-//
-(p9) fma.s1 FR_t = FR_U_lo, f1, FR_V_lo
- extr.u GR_i_0 = GR_N_Inc, 1, 1 ;;
-}
+ nop.m 999
+ fms.s1 sincos_NFLOAT = sincos_W_2TO61_RSH,sincos_2TOM61,sincos_RSHF
+ nop.i 999
+};;
-{ .mfi
- nop.m 999
-//
-// t = U_lo + V_lo
-//
-//
-(p10) fms.s1 FR_a = FR_U_hi, f1, FR_A
- nop.i 999 ;;
-}
+// get N = (int)sincos_int_Nfloat
{ .mfi
- nop.m 999
-(p11) fma.s1 FR_a = FR_V_hi, f1, FR_A
- nop.i 999
-}
-;;
-
-{ .mmi
- nop.m 999
- addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp
- nop.i 999
-}
-;;
-
-{ .mmi
- ld8 GR_Table_Base = [GR_Table_Base]
- nop.m 999
- nop.i 999
-}
-;;
-
+ getf.sig sincos_GR_n = sincos_W_2TO61_RSH
+ nop.f 999
+ nop.i 999
+};;
+// Add 2^(k-1) (which is in sincos_r_sincos) to N
+// sincos_r = -sincos_Nfloat * sincos_Pi_by_16_1 + x
{ .mfi
- add GR_Table_Base = 528, GR_Table_Base
-//
-// Is U_hiabs >= V_hiabs?
-//
-(p9) fma.s1 FR_C_hi = FR_s, f1, FR_A
- nop.i 999 ;;
-}
+ add sincos_GR_n = sincos_GR_n, sincos_r_sincos
+ fnma.s1 sincos_r = sincos_NFLOAT, sincos_Pi_by_16_1, sincos_NORM_f8
+ nop.i 999
+};;
+// Get M (least k+1 bits of N)
{ .mmi
- ldfe FR_C_1 = [GR_Table_Base], 16 ;;
- ldfe FR_C_2 = [GR_Table_Base], 64
- nop.i 999 ;;
-}
-
-{ .mmf
- nop.m 999
-//
-// c = c + C_lo finished.
-// Load C_2
-//
- ldfe FR_S_1 = [GR_Table_Base], 16
-//
-// C_lo = S - C_hi
-//
- fma.s1 FR_t = FR_t, f1, FR_w ;;
-}
-//
-// r and c have been computed.
-// Make sure ftz mode is set - should be automatic when using wre
-// |r| < 2**(-3)
-// Get [i_0,i_1] - two lsb of N_fix.
-// Load S_1
-//
+ and sincos_GR_m = 0x1f,sincos_GR_n;;
+ nop.m 999
+ shl sincos_GR_32m = sincos_GR_m,5
+};;
+// Add 32*M to address of sin_cos_beta table
{ .mfi
- ldfe FR_S_2 = [GR_Table_Base], 64
-//
-// t = t + w
-//
-(p10) fms.s1 FR_a = FR_a, f1, FR_V_hi
- cmp.eq.unc p9, p10 = 0x0, GR_i_0
-}
+ add sincos_AD_2 = sincos_GR_32m, sincos_AD_1
+(p8) fclass.m.unc p10,p0 = f8,0x0b // For sin denorm. - set uflow
+ nop.i 999
+};;
+// Load Sin and Cos table value using obtained index m (sincos_AD_2)
{ .mfi
- nop.m 999
-//
-// For larger u than v: a = U_hi - A
-// Else a = V_hi - A (do an add to account for missing (-) on V_hi
-//
- fms.s1 FR_C_lo = FR_s, f1, FR_C_hi
- nop.i 999 ;;
-}
+ ldfe sincos_Sm = [sincos_AD_2],16
+(p9) fclass.m.unc p11,p0 = f8,0x0b // For cos denorm - set denorm
+ nop.i 999
+};;
+// sincos_r = sincos_r -sincos_Nfloat * sincos_Pi_by_16_2
{ .mfi
- nop.m 999
-(p11) fms.s1 FR_a = FR_U_hi, f1, FR_a
- cmp.eq.unc p11, p12 = 0x0, GR_i_1
-}
+ ldfe sincos_Cm = [sincos_AD_2]
+ fnma.s1 sincos_r = sincos_NFLOAT, sincos_Pi_by_16_2, sincos_r
+ nop.i 999
+};;
+// get rsq = r*r
{ .mfi
- nop.m 999
-//
-// If u > v: a = (U_hi - A) + V_hi
-// Else a = (V_hi - A) + U_hi
-// In each case account for negative missing from V_hi.
-//
- fma.s1 FR_C_lo = FR_C_lo, f1, FR_A
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 sincos_rsq = sincos_r, sincos_r, f0 // r^2 = r*r
+ nop.i 999
}
-
{ .mfi
- nop.m 999
-//
-// C_lo = (S - C_hi) + A
-//
- fma.s1 FR_t = FR_t, f1, FR_a
- nop.i 999 ;;
-}
+ nop.m 999
+ fmpy.s0 fp_tmp = fp_tmp,fp_tmp // forces inexact flag
+ nop.i 999
+};;
+// sincos_r_exact = sincos_r -sincos_Nfloat * sincos_Pi_by_16_3
{ .mfi
- nop.m 999
-//
-// t = t + a
-//
- fma.s1 FR_C_lo = FR_C_lo, f1, FR_t
- nop.i 999 ;;
-}
+ nop.m 999
+ fnma.s1 sincos_r_exact = sincos_NFLOAT, sincos_Pi_by_16_3, sincos_r
+ nop.i 999
+};;
+// Polynomials calculation
+// P_1 = P4*r^2 + P3
+// Q_1 = Q4*r^2 + Q3
{ .mfi
- nop.m 999
-//
-// C_lo = C_lo + t
-// Adjust Table_Base to beginning of table
-//
- fma.s1 FR_r = FR_C_hi, f1, FR_C_lo
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 sincos_P_temp1 = sincos_rsq, sincos_P4, sincos_P3
+ nop.i 999
}
-
{ .mfi
- nop.m 999
-//
-// Load S_2
-//
- fma.s1 FR_rsq = FR_r, FR_r, f0
- nop.i 999
-}
+ nop.m 999
+ fma.s1 sincos_Q_temp1 = sincos_rsq, sincos_Q4, sincos_Q3
+ nop.i 999
+};;
+// get rcube = r^3 and S[m]*r^2
{ .mfi
- nop.m 999
-//
-// Table_Base points to C_1
-// r = C_hi + C_lo
-//
- fms.s1 FR_c = FR_C_hi, f1, FR_r
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 sincos_srsq = sincos_Sm,sincos_rsq
+ nop.i 999
}
-
{ .mfi
- nop.m 999
-//
-// if i_1 ==0: poly = S_2 * FR_rsq + S_1
-// else poly = C_2 * FR_rsq + C_1
-//
-(p11) fma.s1 FR_Input_X = f0, f1, FR_r
- nop.i 999 ;;
-}
+ nop.m 999
+ fmpy.s1 sincos_rcub = sincos_r_exact, sincos_rsq
+ nop.i 999
+};;
+// Polynomials calculation
+// Q_2 = Q_1*r^2 + Q2
+// P_2 = P_1*r^2 + P2
{ .mfi
- nop.m 999
-(p12) fma.s1 FR_Input_X = f0, f1, f1
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 sincos_Q_temp2 = sincos_rsq, sincos_Q_temp1, sincos_Q2
+ nop.i 999
}
-
{ .mfi
- nop.m 999
-//
-// Compute r_cube = FR_rsq * r
-//
-(p11) fma.s1 FR_poly = FR_rsq, FR_S_2, FR_S_1
- nop.i 999 ;;
-}
+ nop.m 999
+ fma.s1 sincos_P_temp2 = sincos_rsq, sincos_P_temp1, sincos_P2
+ nop.i 999
+};;
+// Polynomials calculation
+// Q = Q_2*r^2 + Q1
+// P = P_2*r^2 + P1
{ .mfi
- nop.m 999
-(p12) fma.s1 FR_poly = FR_rsq, FR_C_2, FR_C_1
- nop.i 999
+ nop.m 999
+ fma.s1 sincos_Q = sincos_rsq, sincos_Q_temp2, sincos_Q1
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Compute FR_rsq = r * r
-// Is i_1 == 0 ?
-//
- fma.s1 FR_r_cubed = FR_rsq, FR_r, f0
- nop.i 999 ;;
-}
+ nop.m 999
+ fma.s1 sincos_P = sincos_rsq, sincos_P_temp2, sincos_P1
+ nop.i 999
+};;
+// Get final P and Q
+// Q = Q*S[m]*r^2 + S[m]
+// P = P*r^3 + r
{ .mfi
- nop.m 999
-//
-// c = C_hi - r
-// Load C_1
-//
- fma.s1 FR_c = FR_c, f1, FR_C_lo
- nop.i 999
+ nop.m 999
+ fma.s1 sincos_Q = sincos_srsq,sincos_Q, sincos_Sm
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// if i_1 ==0: poly = r_cube * poly + c
-// else poly = FR_rsq * poly
-//
-(p10) fms.s1 FR_Input_X = f0, f1, FR_Input_X
- nop.i 999 ;;
-}
+ nop.m 999
+ fma.s1 sincos_P = sincos_rcub,sincos_P, sincos_r_exact
+ nop.i 999
+};;
+// If sin(denormal), force underflow to be set
+.pred.rel "mutex",p10,p11
{ .mfi
- nop.m 999
-//
-// if i_1 ==0: Result = r
-// else Result = 1.0
-//
-(p11) fma.s1 FR_poly = FR_r_cubed, FR_poly, FR_c
- nop.i 999 ;;
+ nop.m 999
+(p10) fmpy.d.s0 fp_tmp = f8,f8 // forces underflow flag
+ nop.i 999 // for denormal sine args
}
-
{ .mfi
- nop.m 999
-(p12) fma.s1 FR_poly = FR_rsq, FR_poly, f0
- nop.i 999 ;;
-}
+ nop.m 999
+(p11) fma.d.s0 fp_tmp = f8,f1, f8 // forces denormal flag
+ nop.i 999 // for denormal cosine args
+};;
-{ .mfi
- nop.m 999
-//
-// if i_0 !=0: Result = -Result
-//
-(p9) fma.d.s0 FR_Input_X = FR_Input_X, f1, FR_poly
- nop.i 999 ;;
-}
+// Final calculation
+// result = C[m]*P + Q
{ .mfb
- nop.m 999
-(p10) fms.d.s0 FR_Input_X = FR_Input_X, f1, FR_poly
-//
-// if i_0 == 0: Result = Result + poly
-// else Result = Result - poly
-//
- br.ret.sptk b0 ;;
-}
-L(SINCOS_SMALL_R):
-
-{ .mii
- nop.m 999
- extr.u GR_i_1 = GR_N_Inc, 0, 1 ;;
-//
-//
-// Compare both i_1 and i_0 with 0.
-// if i_1 == 0, set p9.
-// if i_0 == 0, set p11.
-//
- cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;;
-}
-
-{ .mfi
- nop.m 999
- fma.s1 FR_rsq = FR_r, FR_r, f0
- extr.u GR_i_0 = GR_N_Inc, 1, 1 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// Z = Z * FR_rsq
-//
-(p10) fnma.s1 FR_c = FR_c, FR_r, f0
- cmp.eq.unc p11, p12 = 0x0, GR_i_0
-}
-;;
-
-// ******************************************************************
-// ******************************************************************
-// ******************************************************************
-// r and c have been computed.
-// We know whether this is the sine or cosine routine.
-// Make sure ftz mode is set - should be automatic when using wre
-// |r| < 2**(-3)
-//
-// Set table_ptr1 to beginning of constant table.
-// Get [i_0,i_1] - two lsb of N_fix_gr.
-//
-
-{ .mmi
- nop.m 999
- addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp
- nop.i 999
-}
-;;
-
-{ .mmi
- ld8 GR_Table_Base = [GR_Table_Base]
- nop.m 999
- nop.i 999
-}
-;;
-
-
-//
-// Set table_ptr1 to point to S_5.
-// Set table_ptr1 to point to C_5.
-// Compute FR_rsq = r * r
-//
-
-{ .mfi
-(p9) add GR_Table_Base = 672, GR_Table_Base
-(p10) fmerge.s FR_r = f1, f1
-(p10) add GR_Table_Base = 592, GR_Table_Base ;;
-}
-//
-// Set table_ptr1 to point to S_5.
-// Set table_ptr1 to point to C_5.
-//
-
-{ .mmi
-(p9) ldfe FR_S_5 = [GR_Table_Base], -16 ;;
-//
-// if (i_1 == 0) load S_5
-// if (i_1 != 0) load C_5
-//
-(p9) ldfe FR_S_4 = [GR_Table_Base], -16
- nop.i 999 ;;
-}
-
-{ .mmf
-(p10) ldfe FR_C_5 = [GR_Table_Base], -16
-//
-// Z = FR_rsq * FR_rsq
-//
-(p9) ldfe FR_S_3 = [GR_Table_Base], -16
-//
-// Compute FR_rsq = r * r
-// if (i_1 == 0) load S_4
-// if (i_1 != 0) load C_4
-//
- fma.s1 FR_Z = FR_rsq, FR_rsq, f0 ;;
-}
-//
-// if (i_1 == 0) load S_3
-// if (i_1 != 0) load C_3
-//
-
-{ .mmi
-(p9) ldfe FR_S_2 = [GR_Table_Base], -16 ;;
-//
-// if (i_1 == 0) load S_2
-// if (i_1 != 0) load C_2
-//
-(p9) ldfe FR_S_1 = [GR_Table_Base], -16
- nop.i 999
-}
-
-{ .mmi
-(p10) ldfe FR_C_4 = [GR_Table_Base], -16 ;;
-(p10) ldfe FR_C_3 = [GR_Table_Base], -16
- nop.i 999 ;;
-}
-
-{ .mmi
-(p10) ldfe FR_C_2 = [GR_Table_Base], -16 ;;
-(p10) ldfe FR_C_1 = [GR_Table_Base], -16
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-//
-// if (i_1 != 0):
-// poly_lo = FR_rsq * C_5 + C_4
-// poly_hi = FR_rsq * C_2 + C_1
-//
-(p9) fma.s1 FR_Z = FR_Z, FR_r, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// if (i_1 == 0) load S_1
-// if (i_1 != 0) load C_1
-//
-(p9) fma.s1 FR_poly_lo = FR_rsq, FR_S_5, FR_S_4
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-//
-// c = -c * r
-// dummy fmpy's to flag inexact.
-//
-(p9) fma.d.s0 FR_S_4 = FR_S_4, FR_S_4, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// poly_lo = FR_rsq * poly_lo + C_3
-// poly_hi = FR_rsq * poly_hi
-//
- fma.s1 FR_Z = FR_Z, FR_rsq, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p9) fma.s1 FR_poly_hi = FR_rsq, FR_S_2, FR_S_1
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-//
-// if (i_1 == 0):
-// poly_lo = FR_rsq * S_5 + S_4
-// poly_hi = FR_rsq * S_2 + S_1
-//
-(p10) fma.s1 FR_poly_lo = FR_rsq, FR_C_5, FR_C_4
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// if (i_1 == 0):
-// Z = Z * r for only one of the small r cases - not there
-// in original implementation notes.
-//
-(p9) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_S_3
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p10) fma.s1 FR_poly_hi = FR_rsq, FR_C_2, FR_C_1
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p10) fma.d.s0 FR_C_1 = FR_C_1, FR_C_1, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p9) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-//
-// poly_lo = FR_rsq * poly_lo + S_3
-// poly_hi = FR_rsq * poly_hi
-//
-(p10) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_C_3
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p10) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// if (i_1 == 0): dummy fmpy's to flag inexact
-// r = 1
-//
-(p9) fma.s1 FR_poly_hi = FR_r, FR_poly_hi, f0
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-//
-// poly_hi = r * poly_hi
-//
- fma.s1 FR_poly = FR_Z, FR_poly_lo, FR_c
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p12) fms.s1 FR_r = f0, f1, FR_r
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// poly_hi = Z * poly_lo + c
-// if i_0 == 1: r = -r
-//
- fma.s1 FR_poly = FR_poly, f1, FR_poly_hi
- nop.i 999 ;;
-}
+ nop.m 999
+ fma.d.s0 f8 = sincos_Cm, sincos_P, sincos_Q
+ br.ret.sptk b0 // Exit for common path
+};;
+////////// x = 0/Inf/NaN path //////////////////
+_SINCOS_SPECIAL_ARGS:
+.pred.rel "mutex",p8,p9
+// sin(+/-0) = +/-0
+// sin(Inf) = NaN
+// sin(NaN) = NaN
{ .mfi
- nop.m 999
-(p12) fms.d.s0 FR_Input_X = FR_r, f1, FR_poly
- nop.i 999
+ nop.m 999
+(p8) fma.d.s0 f8 = f8, f0, f0 // sin(+/-0,NaN,Inf)
+ nop.i 999
}
-
+// cos(+/-0) = 1.0
+// cos(Inf) = NaN
+// cos(NaN) = NaN
{ .mfb
- nop.m 999
-//
-// poly = poly + poly_hi
-//
-(p11) fma.d.s0 FR_Input_X = FR_r, f1, FR_poly
-//
-// if (i_0 == 0) Result = r + poly
-// if (i_0 != 0) Result = r - poly
-//
- br.ret.sptk b0 ;;
-}
-L(SINCOS_NORMAL_R):
-
-{ .mii
- nop.m 999
- extr.u GR_i_1 = GR_N_Inc, 0, 1 ;;
-//
-// Set table_ptr1 and table_ptr2 to base address of
-// constant table.
- cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;;
-}
-
-{ .mfi
- nop.m 999
- fma.s1 FR_rsq = FR_r, FR_r, f0
- extr.u GR_i_0 = GR_N_Inc, 1, 1 ;;
-}
+ nop.m 999
+(p9) fma.d.s0 f8 = f8, f0, f1 // cos(+/-0,NaN,Inf)
+ br.ret.sptk b0 // Exit for x = 0/Inf/NaN path
+};;
+GLOBAL_IEEE754_END(cos)
+//////////// x >= 2^27 - large arguments routine call ////////////
+LOCAL_LIBM_ENTRY(__libm_callout_sincos)
+_SINCOS_LARGE_ARGS:
+.prologue
{ .mfi
- nop.m 999
- frcpa.s1 FR_r_hi, p6 = f1, FR_r
- cmp.eq.unc p11, p12 = 0x0, GR_i_0
-}
-;;
-
-// ******************************************************************
-// ******************************************************************
-// ******************************************************************
-//
-// r and c have been computed.
-// We known whether this is the sine or cosine routine.
-// Make sure ftz mode is set - should be automatic when using wre
-// Get [i_0,i_1] - two lsb of N_fix_gr alone.
-//
-
-{ .mmi
- nop.m 999
- addl GR_Table_Base = @ltoff(FSINCOS_CONSTANTS#), gp
- nop.i 999
+ mov sincos_GR_all_ones = -1 // all bits set (0xffffffffffffffff); used later to force inexact
+ nop.f 999
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS = ar.pfs
}
;;
-{ .mmi
- ld8 GR_Table_Base = [GR_Table_Base]
- nop.m 999
- nop.i 999
-}
-;;
-
-
-{ .mfi
-(p10) add GR_Table_Base = 384, GR_Table_Base
-(p12) fms.s1 FR_Input_X = f0, f1, f1
-(p9) add GR_Table_Base = 224, GR_Table_Base ;;
-}
-
-{ .mmf
- nop.m 999
-(p10) ldfe FR_QQ_8 = [GR_Table_Base], 16
-//
-// if (i_1==0) poly = poly * FR_rsq + PP_1_lo
-// else poly = FR_rsq * poly
-//
-(p11) fma.s1 FR_Input_X = f0, f1, f1 ;;
-}
-
-{ .mmf
-(p10) ldfe FR_QQ_7 = [GR_Table_Base], 16
-//
-// Adjust table pointers based on i_0
-// Compute rsq = r * r
-//
-(p9) ldfe FR_PP_8 = [GR_Table_Base], 16
- fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 ;;
-}
-
-{ .mmf
-(p9) ldfe FR_PP_7 = [GR_Table_Base], 16
-(p10) ldfe FR_QQ_6 = [GR_Table_Base], 16
-//
-// Load PP_8 and QQ_8; PP_7 and QQ_7
-//
- frcpa.s1 FR_r_hi, p6 = f1, FR_r_hi ;;
-}
-//
-// if (i_1==0) poly = PP_7 + FR_rsq * PP_8.
-// else poly = QQ_7 + FR_rsq * QQ_8.
-//
-
-{ .mmb
-(p9) ldfe FR_PP_6 = [GR_Table_Base], 16
-(p10) ldfe FR_QQ_5 = [GR_Table_Base], 16
- nop.b 999 ;;
-}
-
-{ .mmb
-(p9) ldfe FR_PP_5 = [GR_Table_Base], 16
-(p10) ldfe FR_S_1 = [GR_Table_Base], 16
- nop.b 999 ;;
-}
-
-{ .mmb
-(p10) ldfe FR_QQ_1 = [GR_Table_Base], 16
-(p9) ldfe FR_C_1 = [GR_Table_Base], 16
- nop.b 999 ;;
-}
-
-{ .mmi
-(p10) ldfe FR_QQ_4 = [GR_Table_Base], 16 ;;
-(p9) ldfe FR_PP_1 = [GR_Table_Base], 16
- nop.i 999 ;;
-}
-
-{ .mmf
-(p10) ldfe FR_QQ_3 = [GR_Table_Base], 16
-//
-// if (i_1=0) corr = corr + c*c
-// else corr = corr * c
-//
-(p9) ldfe FR_PP_4 = [GR_Table_Base], 16
-(p10) fma.s1 FR_poly = FR_rsq, FR_QQ_8, FR_QQ_7 ;;
-}
-//
-// if (i_1=0) poly = rsq * poly + PP_5
-// else poly = rsq * poly + QQ_5
-// Load PP_4 or QQ_4
-//
-
-{ .mmf
-(p9) ldfe FR_PP_3 = [GR_Table_Base], 16
-(p10) ldfe FR_QQ_2 = [GR_Table_Base], 16
-//
-// r_hi = frcpa(frcpa(r)).
-// r_cube = r * FR_rsq.
-//
-(p9) fma.s1 FR_poly = FR_rsq, FR_PP_8, FR_PP_7 ;;
-}
-//
-// Do dummy multiplies so inexact is always set.
-//
-
-{ .mfi
-(p9) ldfe FR_PP_2 = [GR_Table_Base], 16
-//
-// r_lo = r - r_hi
-//
-(p9) fma.s1 FR_U_lo = FR_r_hi, FR_r_hi, f0
- nop.i 999 ;;
-}
-
-{ .mmf
- nop.m 999
-(p9) ldfe FR_PP_1_lo = [GR_Table_Base], 16
-(p10) fma.s1 FR_corr = FR_S_1, FR_r_cubed, FR_r
-}
-
-{ .mfi
- nop.m 999
-(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_6
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// if (i_1=0) U_lo = r_hi * r_hi
-// else U_lo = r_hi + r
-//
-(p9) fma.s1 FR_corr = FR_C_1, FR_rsq, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// if (i_1=0) corr = C_1 * rsq
-// else corr = S_1 * r_cubed + r
-//
-(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_6
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p10) fma.s1 FR_U_lo = FR_r_hi, f1, FR_r
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// if (i_1=0) U_hi = r_hi + U_hi
-// else U_hi = QQ_1 * U_hi + 1
-//
-(p9) fma.s1 FR_U_lo = FR_r, FR_r_hi, FR_U_lo
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-//
-// U_hi = r_hi * r_hi
-//
- fms.s1 FR_r_lo = FR_r, f1, FR_r_hi
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// Load PP_1, PP_6, PP_5, and C_1
-// Load QQ_1, QQ_6, QQ_5, and S_1
-//
- fma.s1 FR_U_hi = FR_r_hi, FR_r_hi, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_5
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p10) fnma.s1 FR_corr = FR_corr, FR_c, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// if (i_1=0) U_lo = r * r_hi + U_lo
-// else U_lo = r_lo * U_lo
-//
-(p9) fma.s1 FR_corr = FR_corr, FR_c, FR_c
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_5
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-//
-// if (i_1 =0) U_hi = r + U_hi
-// if (i_1 =0) U_lo = r_lo * U_lo
-//
-//
-(p9) fma.d.s0 FR_PP_5 = FR_PP_5, FR_PP_4, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p9) fma.s1 FR_U_lo = FR_r, FR_r, FR_U_lo
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p10) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// if (i_1=0) poly = poly * rsq + PP_6
-// else poly = poly * rsq + QQ_6
-//
-(p9) fma.s1 FR_U_hi = FR_r_hi, FR_U_hi, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_4
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p10) fma.s1 FR_U_hi = FR_QQ_1, FR_U_hi, f1
- nop.i 999 ;;
-}
-
{ .mfi
- nop.m 999
-(p10) fma.d.s0 FR_QQ_5 = FR_QQ_5, FR_QQ_5, f0
- nop.i 999 ;;
+ mov GR_SAVE_GP = gp
+ nop.f 999
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0 = b0
}
-{ .mfi
- nop.m 999
-//
-// if (i_1!=0) U_hi = PP_1 * U_hi
-// if (i_1!=0) U_lo = r * r + U_lo
-// Load PP_3 or QQ_3
-//
-(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_4
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p9) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p10) fma.s1 FR_U_lo = FR_QQ_1,FR_U_lo, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p9) fma.s1 FR_U_hi = FR_PP_1, FR_U_hi, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_3
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// Load PP_2, QQ_2
-//
-(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_3
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// if (i_1==0) poly = FR_rsq * poly + PP_3
-// else poly = FR_rsq * poly + QQ_3
-// Load PP_1_lo
-//
-(p9) fma.s1 FR_U_lo = FR_PP_1, FR_U_lo, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// if (i_1 =0) poly = poly * rsq + pp_r4
-// else poly = poly * rsq + qq_r4
-//
-(p9) fma.s1 FR_U_hi = FR_r, f1, FR_U_hi
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_2
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// if (i_1==0) U_lo = PP_1_hi * U_lo
-// else U_lo = QQ_1 * U_lo
-//
-(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_2
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// if (i_0==0) Result = 1
-// else Result = -1
-//
- fma.s1 FR_V = FR_U_lo, f1, FR_corr
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// if (i_1==0) poly = FR_rsq * poly + PP_2
-// else poly = FR_rsq * poly + QQ_2
-//
-(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_1_lo
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// V = U_lo + corr
-//
-(p9) fma.s1 FR_poly = FR_r_cubed, FR_poly, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// if (i_1==0) poly = r_cube * poly
-// else poly = FR_rsq * poly
-//
- fma.s1 FR_V = FR_poly, f1, FR_V
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p12) fms.d.s0 FR_Input_X = FR_Input_X, FR_U_hi, FR_V
- nop.i 999
-}
-
-{ .mfb
- nop.m 999
-//
-// V = V + poly
-//
-(p11) fma.d.s0 FR_Input_X = FR_Input_X, FR_U_hi, FR_V
-//
-// if (i_0==0) Result = Result * U_hi + V
-// else Result = Result * U_hi - V
-//
- br.ret.sptk b0 ;;
-}
-
-//
-// If cosine, FR_Input_X = 1
-// If sine, FR_Input_X = +/-Zero (Input FR_Input_X)
-// Results are exact, no exceptions
-//
-L(SINCOS_ZERO):
-
-{ .mmb
- cmp.eq.unc p6, p7 = 0x1, GR_Sin_or_Cos
- nop.m 999
- nop.b 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p7) fmerge.s FR_Input_X = FR_Input_X, FR_Input_X
- nop.i 999
-}
-
-{ .mfb
- nop.m 999
-(p6) fmerge.s FR_Input_X = f1, f1
- br.ret.sptk b0 ;;
-}
-
-L(SINCOS_SPECIAL):
-
-//
-// Path for Arg = +/- QNaN, SNaN, Inf
-// Invalid can be raised. SNaNs
-// become QNaNs
-//
-
-{ .mfb
- nop.m 999
- fmpy.d.s0 FR_Input_X = FR_Input_X, f0
- br.ret.sptk b0 ;;
-}
-.endp __libm_cos_double_dbx#
-ASM_SIZE_DIRECTIVE(__libm_cos_double_dbx#)
-
-
-
-//
-// Call int pi_by_2_reduce(double* x, double *y)
-// for |arguments| >= 2**63
-// Address to save r and c as double
-//
-//
-// psp sp+64
-// sp+48 -> f0 c
-// r45 sp+32 -> f0 r
-// r44 -> sp+16 -> InputX
-// sp sp -> scratch provided to callee
-
+.body
+{ .mbb
+ setf.sig sincos_save_tmp = sincos_GR_all_ones// inexact set
+ nop.b 999
+(p8) br.call.sptk.many b0 = __libm_sin_large# // sin(large_X)
+};;
-.proc __libm_callout_2
-__libm_callout_2:
-L(SINCOS_ARG_TOO_LARGE):
+{ .mbb
+ cmp.ne p9,p0 = sincos_r_sincos, r0 // set p9 if cos
+ nop.b 999
+(p9) br.call.sptk.many b0 = __libm_cos_large# // cos(large_X)
+};;
-.prologue
{ .mfi
- add r45=-32,sp // Parameter: r address
- nop.f 0
-.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov gp = GR_SAVE_GP
+ fma.d.s0 f8 = f8, f1, f0 // Round result to double
+ mov b0 = GR_SAVE_B0
}
+// Force inexact set
{ .mfi
-.fframe 64
- add sp=-64,sp // Create new stack
- nop.f 0
- mov GR_SAVE_GP=gp // Save gp
-};;
-{ .mmi
- stfe [r45] = f0,16 // Clear Parameter r on stack
- add r44 = 16,sp // Parameter x address
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
-};;
-.body
-{ .mib
- stfe [r45] = f0,-16 // Clear Parameter c on stack
- nop.i 0
- nop.b 0
-}
-{ .mib
- stfe [r44] = FR_Input_X // Store Parameter x on stack
- nop.i 0
- br.call.sptk b0=__libm_pi_by_2_reduce# ;;
+ nop.m 999
+ fmpy.s0 sincos_save_tmp = sincos_save_tmp, sincos_save_tmp
+ nop.i 999
};;
-
-{ .mii
- ldfe FR_Input_X =[r44],16
-//
-// Get r and c off stack
-//
- adds GR_Table_Base1 = -16, GR_Table_Base1
-//
-// Get r and c off stack
-//
- add GR_N_Inc = GR_Sin_or_Cos,r8 ;;
-}
-{ .mmb
- ldfe FR_r =[r45],16
-//
-// Get X off the stack
-// Readjust Table ptr
-//
- ldfs FR_Two_to_M3 = [GR_Table_Base1],4
- nop.b 999 ;;
-}
-{ .mmb
- ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1],0
- ldfe FR_c =[r45]
- nop.b 999 ;;
-}
-
-{ .mfi
-.restore sp
- add sp = 64,sp // Restore stack pointer
- fcmp.lt.unc.s1 p6, p0 = FR_r, FR_Two_to_M3
- mov b0 = GR_SAVE_B0 // Restore return address
-};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
- mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
- nop.b 0
+ nop.m 999
+ mov ar.pfs = GR_SAVE_PFS
+ br.ret.sptk b0 // Exit for large arguments routine call
};;
+LOCAL_LIBM_END(__libm_callout_sincos)
-{ .mfi
- nop.m 999
-(p6) fcmp.gt.unc.s1 p6, p0 = FR_r, FR_Neg_Two_to_M3
- nop.i 999 ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p6) br.cond.spnt L(SINCOS_SMALL_R) ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
- br.cond.sptk L(SINCOS_NORMAL_R) ;;
-}
-
-.endp __libm_callout_2
-ASM_SIZE_DIRECTIVE(__libm_callout_2)
-
-.type __libm_pi_by_2_reduce#,@function
-.global __libm_pi_by_2_reduce#
-
+.type __libm_sin_large#,@function
+.global __libm_sin_large#
+.type __libm_cos_large#,@function
+.global __libm_cos_large#
-.type __libm_sin_double_dbx#,@function
-.global __libm_sin_double_dbx#
-.type __libm_cos_double_dbx#,@function
-.global __libm_cos_double_dbx#
diff --git a/sysdeps/ia64/fpu/s_cosf.S b/sysdeps/ia64/fpu/s_cosf.S
index 0e47255b3f..89cf82372d 100644
--- a/sysdeps/ia64/fpu/s_cosf.S
+++ b/sysdeps/ia64/fpu/s_cosf.S
@@ -1,12 +1,10 @@
-
.file "sincosf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -22,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -37,663 +35,680 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
-
-
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
// History
//==============================================================
-// 2/02/00 Initial revision
-// 4/02/00 Unwind support added.
-// 5/10/00 Improved speed with new algorithm.
-// 8/08/00 Improved speed by avoiding SIR flush.
-// 8/17/00 Changed predicate register macro-usage to direct predicate
-// names due to an assembler bug.
-// 8/30/00 Put sin_of_r before sin_tbl_S_cos_of_r to gain a cycle
-// 1/02/00 Fixed flag settings, improved speed.
+// 02/02/00 Initial version
+// 04/02/00 Unwind support added.
+// 06/16/00 Updated tables to enforce symmetry
+// 08/31/00 Saved 2 cycles in main path, and 9 in other paths.
+// 09/20/00 The updated tables regressed to an old version, so reinstated them
+// 10/18/00 Changed one table entry to ensure symmetry
+// 01/03/01 Improved speed, fixed flag settings for small arguments.
+// 02/18/02 Large arguments processing routine excluded
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 06/03/02 Ensure inexact flag set for large arg result
+// 09/05/02 Single precision version is made using double precision one as base
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
// float sinf( float x);
// float cosf( float x);
//
+// Overview of operation
+//==============================================================
+//
+// Step 1
+// ======
+// Reduce x to region -1/2*pi/2^k ===== 0 ===== +1/2*pi/2^k where k=4
+// divide x by pi/2^k.
+// Multiply by 2^k/pi.
+// nfloat = Round result to integer (round-to-nearest)
+//
+// r = x - nfloat * pi/2^k
+// Do this as (x - nfloat * HIGH(pi/2^k)) - nfloat * LOW(pi/2^k)
-#include "libm_support.h"
-
-// Assembly macros
+// for increased accuracy.
+// pi/2^k is stored as two numbers that when added make pi/2^k.
+// pi/2^k = HIGH(pi/2^k) + LOW(pi/2^k)
+// HIGH part is rounded to zero, LOW - to nearest
+//
+// x = (nfloat * pi/2^k) + r
+// r is small enough that we can use a polynomial approximation
+// and is referred to as the reduced argument.
+//
+// Step 3
+// ======
+// Take the unreduced part and remove the multiples of 2pi.
+// So nfloat = nfloat (with lower k+1 bits cleared) + lower k+1 bits
+//
+// nfloat (with lower k+1 bits cleared) is a multiple of 2^(k+1)
+// N * 2^(k+1)
+// nfloat * pi/2^k = N * 2^(k+1) * pi/2^k + (lower k+1 bits) * pi/2^k
+// nfloat * pi/2^k = N * 2 * pi + (lower k+1 bits) * pi/2^k
+// nfloat * pi/2^k = N2pi + M * pi/2^k
+//
+//
+// Sin(x) = Sin((nfloat * pi/2^k) + r)
+// = Sin(nfloat * pi/2^k) * Cos(r) + Cos(nfloat * pi/2^k) * Sin(r)
+//
+// Sin(nfloat * pi/2^k) = Sin(N2pi + Mpi/2^k)
+// = Sin(N2pi)Cos(Mpi/2^k) + Cos(N2pi)Sin(Mpi/2^k)
+// = Sin(Mpi/2^k)
+//
+// Cos(nfloat * pi/2^k) = Cos(N2pi + Mpi/2^k)
+// = Cos(N2pi)Cos(Mpi/2^k) + Sin(N2pi)Sin(Mpi/2^k)
+// = Cos(Mpi/2^k)
+//
+// Sin(x) = Sin(Mpi/2^k) Cos(r) + Cos(Mpi/2^k) Sin(r)
+//
+//
+// Step 4
+// ======
+// 0 <= M < 2^(k+1)
+// There are 2^(k+1) Sin entries in a table.
+// There are 2^(k+1) Cos entries in a table.
+//
+// Get Sin(Mpi/2^k) and Cos(Mpi/2^k) by table lookup.
+//
+//
+// Step 5
+// ======
+// Calculate Cos(r) and Sin(r) by polynomial approximation.
+//
+// Cos(r) = 1 + r^2 q1 + r^4 q2 = Series for Cos
+// Sin(r) = r + r^3 p1 + r^5 p2 = Series for Sin
+//
+// and the coefficients q1, q2 and p1, p2 are stored in a table
+//
+//
+// Calculate
+// Sin(x) = Sin(Mpi/2^k) Cos(r) + Cos(Mpi/2^k) Sin(r)
+//
+// as follows
+//
+// S[m] = Sin(Mpi/2^k) and C[m] = Cos(Mpi/2^k)
+// rsq = r*r
+//
+//
+// P = P1 + r^2*P2
+// Q = Q1 + r^2*Q2
+//
+// rcub = r * rsq
+// Sin(r) = r + rcub * P
+// = r + r^3p1 + r^5p2 = Sin(r)
+//
+// The coefficients are not exactly these values, but almost.
+//
+// p1 = -1/6 = -1/3!
+// p2 = 1/120 = 1/5!
+// p3 = -1/5040 = -1/7!
+// p4 = 1/362880 = 1/9!
+//
+// P = r + r^3 * P
+//
+// Answer = S[m] Cos(r) + C[m] P
+//
+// Cos(r) = 1 + rsq Q
+// Cos(r) = 1 + r^2 Q
+// Cos(r) = 1 + r^2 (q1 + r^2q2)
+// Cos(r) = 1 + r^2q1 + r^4q2
+//
+// S[m] Cos(r) = S[m](1 + rsq Q)
+// S[m] Cos(r) = S[m] + S[m] rsq Q
+// S[m] Cos(r) = S[m] + s_rsq Q
+// Q = S[m] + s_rsq Q
+//
+// Then,
+//
+// Answer = Q + C[m] P
+
+
+// Registers used
//==============================================================
+// general input registers:
+// r14 -> r19
+// r32 -> r45
-// SIN_Sin_Flag = p6
-// SIN_Cos_Flag = p7
-
-// integer registers used
-
- SIN_AD_PQ_1 = r33
- SIN_AD_PQ_2 = r33
- sin_GR_sincos_flag = r34
- sin_GR_Mint = r35
-
- sin_GR_index = r36
- gr_tmp = r37
-
- GR_SAVE_B0 = r37
- GR_SAVE_GP = r38
- GR_SAVE_PFS = r39
-
-
-// floating point registers used
-
- sin_coeff_P1 = f32
- sin_coeff_P2 = f33
- sin_coeff_Q1 = f34
- sin_coeff_Q2 = f35
- sin_coeff_P4 = f36
- sin_coeff_P5 = f37
- sin_coeff_Q3 = f38
- sin_coeff_Q4 = f39
- sin_Mx = f40
- sin_Mfloat = f41
- sin_tbl_S = f42
- sin_tbl_C = f43
- sin_r = f44
- sin_rcube = f45
- sin_tsq = f46
- sin_r7 = f47
- sin_t = f48
- sin_poly_p2 = f49
- sin_poly_p1 = f50
- fp_tmp = f51
- sin_poly_p3 = f52
- sin_poly_p4 = f53
- sin_of_r = f54
- sin_S_t = f55
- sin_poly_q2 = f56
- sin_poly_q1 = f57
- sin_S_tcube = f58
- sin_poly_q3 = f59
- sin_poly_q4 = f60
- sin_tbl_S_tcube = f61
- sin_tbl_S_cos_of_r = f62
-
- sin_coeff_Q5 = f63
- sin_coeff_Q6 = f64
- sin_coeff_P3 = f65
-
- sin_poly_q5 = f66
- sin_poly_q12 = f67
- sin_poly_q3456 = f68
- fp_tmp2 = f69
- SIN_NORM_f8 = f70
-
-
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+// predicate registers used:
+// p6 -> p14
-.align 16
+// floating-point registers used
+// f9 -> f15
+// f32 -> f61
-sin_coeff_1_table:
-ASM_TYPE_DIRECTIVE(sin_coeff_1_table,@object)
-data8 0xBF56C16C16BF6462 // q3
-data8 0x3EFA01A0128B9EBC // q4
-data8 0xBE927E42FDF33FFE // q5
-data8 0x3E21DA5C72A446F3 // q6
-data8 0x3EC71DD1D5E421A4 // p4
-data8 0xBE5AC5C9D0ACF95A // p5
-data8 0xBFC55555555554CA // p1
-data8 0x3F811111110F2395 // p2
-data8 0xBFE0000000000000 // q1
-data8 0x3FA55555555554EF // q2
-data8 0xBF2A01A011232913 // p3
-data8 0x0000000000000000 // pad
-
-
-/////////////////////////////////////////
-
-data8 0xBFE1A54991426566 //sin(-32)
-data8 0x3FEAB1F5305DE8E5 //cos(-32)
-data8 0x3FD9DBC0B640FC81 //sin(-31)
-data8 0x3FED4591C3E12A20 //cos(-31)
-data8 0x3FEF9DF47F1C903D //sin(-30)
-data8 0x3FC3BE82F2505A52 //cos(-30)
-data8 0x3FE53C7D20A6C9E7 //sin(-29)
-data8 0xBFE7F01658314E47 //cos(-29)
-data8 0xBFD156853B4514D6 //sin(-28)
-data8 0xBFEECDAAD1582500 //cos(-28)
-data8 0xBFEE9AA1B0E5BA30 //sin(-27)
-data8 0xBFD2B266F959DED5 //cos(-27)
-data8 0xBFE866E0FAC32583 //sin(-26)
-data8 0x3FE4B3902691A9ED //cos(-26)
-data8 0x3FC0F0E6F31E809D //sin(-25)
-data8 0x3FEFB7EEF59504FF //cos(-25)
-data8 0x3FECFA7F7919140F //sin(-24)
-data8 0x3FDB25BFB50A609A //cos(-24)
-data8 0x3FEB143CD0247D02 //sin(-23)
-data8 0xBFE10CF7D591F272 //cos(-23)
-data8 0x3F8220A29F6EB9F4 //sin(-22)
-data8 0xBFEFFFADD8D4ACDA //cos(-22)
-data8 0xBFEAC5E20BB0D7ED //sin(-21)
-data8 0xBFE186FF83773759 //cos(-21)
-data8 0xBFED36D8F55D3CE0 //sin(-20)
-data8 0x3FDA1E043964A83F //cos(-20)
-data8 0xBFC32F2D28F584CF //sin(-19)
-data8 0x3FEFA377DE108258 //cos(-19)
-data8 0x3FE8081668131E26 //sin(-18)
-data8 0x3FE52150815D2470 //cos(-18)
-data8 0x3FEEC3C4AC42882B //sin(-17)
-data8 0xBFD19C46B07F58E7 //cos(-17)
-data8 0x3FD26D02085F20F8 //sin(-16)
-data8 0xBFEEA5257E962F74 //cos(-16)
-data8 0xBFE4CF2871CEC2E8 //sin(-15)
-data8 0xBFE84F5D069CA4F3 //cos(-15)
-data8 0xBFEFB30E327C5E45 //sin(-14)
-data8 0x3FC1809AEC2CA0ED //cos(-14)
-data8 0xBFDAE4044881C506 //sin(-13)
-data8 0x3FED09CDD5260CB7 //cos(-13)
-data8 0x3FE12B9AF7D765A5 //sin(-12)
-data8 0x3FEB00DA046B65E3 //cos(-12)
-data8 0x3FEFFFEB762E93EB //sin(-11)
-data8 0x3F7220AE41EE2FDF //cos(-11)
-data8 0x3FE1689EF5F34F52 //sin(-10)
-data8 0xBFEAD9AC890C6B1F //cos(-10)
-data8 0xBFDA6026360C2F91 //sin( -9)
-data8 0xBFED27FAA6A6196B //cos( -9)
-data8 0xBFEFA8D2A028CF7B //sin( -8)
-data8 0xBFC29FBEBF632F94 //cos( -8)
-data8 0xBFE50608C26D0A08 //sin( -7)
-data8 0x3FE81FF79ED92017 //cos( -7)
-data8 0x3FD1E1F18AB0A2C0 //sin( -6)
-data8 0x3FEEB9B7097822F5 //cos( -6)
-data8 0x3FEEAF81F5E09933 //sin( -5)
-data8 0x3FD22785706B4AD9 //cos( -5)
-data8 0x3FE837B9DDDC1EAE //sin( -4)
-data8 0xBFE4EAA606DB24C1 //cos( -4)
-data8 0xBFC210386DB6D55B //sin( -3)
-data8 0xBFEFAE04BE85E5D2 //cos( -3)
-data8 0xBFED18F6EAD1B446 //sin( -2)
-data8 0xBFDAA22657537205 //cos( -2)
-data8 0xBFEAED548F090CEE //sin( -1)
-data8 0x3FE14A280FB5068C //cos( -1)
-data8 0x0000000000000000 //sin( 0)
-data8 0x3FF0000000000000 //cos( 0)
-data8 0x3FEAED548F090CEE //sin( 1)
-data8 0x3FE14A280FB5068C //cos( 1)
-data8 0x3FED18F6EAD1B446 //sin( 2)
-data8 0xBFDAA22657537205 //cos( 2)
-data8 0x3FC210386DB6D55B //sin( 3)
-data8 0xBFEFAE04BE85E5D2 //cos( 3)
-data8 0xBFE837B9DDDC1EAE //sin( 4)
-data8 0xBFE4EAA606DB24C1 //cos( 4)
-data8 0xBFEEAF81F5E09933 //sin( 5)
-data8 0x3FD22785706B4AD9 //cos( 5)
-data8 0xBFD1E1F18AB0A2C0 //sin( 6)
-data8 0x3FEEB9B7097822F5 //cos( 6)
-data8 0x3FE50608C26D0A08 //sin( 7)
-data8 0x3FE81FF79ED92017 //cos( 7)
-data8 0x3FEFA8D2A028CF7B //sin( 8)
-data8 0xBFC29FBEBF632F94 //cos( 8)
-data8 0x3FDA6026360C2F91 //sin( 9)
-data8 0xBFED27FAA6A6196B //cos( 9)
-data8 0xBFE1689EF5F34F52 //sin( 10)
-data8 0xBFEAD9AC890C6B1F //cos( 10)
-data8 0xBFEFFFEB762E93EB //sin( 11)
-data8 0x3F7220AE41EE2FDF //cos( 11)
-data8 0xBFE12B9AF7D765A5 //sin( 12)
-data8 0x3FEB00DA046B65E3 //cos( 12)
-data8 0x3FDAE4044881C506 //sin( 13)
-data8 0x3FED09CDD5260CB7 //cos( 13)
-data8 0x3FEFB30E327C5E45 //sin( 14)
-data8 0x3FC1809AEC2CA0ED //cos( 14)
-data8 0x3FE4CF2871CEC2E8 //sin( 15)
-data8 0xBFE84F5D069CA4F3 //cos( 15)
-data8 0xBFD26D02085F20F8 //sin( 16)
-data8 0xBFEEA5257E962F74 //cos( 16)
-data8 0xBFEEC3C4AC42882B //sin( 17)
-data8 0xBFD19C46B07F58E7 //cos( 17)
-data8 0xBFE8081668131E26 //sin( 18)
-data8 0x3FE52150815D2470 //cos( 18)
-data8 0x3FC32F2D28F584CF //sin( 19)
-data8 0x3FEFA377DE108258 //cos( 19)
-data8 0x3FED36D8F55D3CE0 //sin( 20)
-data8 0x3FDA1E043964A83F //cos( 20)
-data8 0x3FEAC5E20BB0D7ED //sin( 21)
-data8 0xBFE186FF83773759 //cos( 21)
-data8 0xBF8220A29F6EB9F4 //sin( 22)
-data8 0xBFEFFFADD8D4ACDA //cos( 22)
-data8 0xBFEB143CD0247D02 //sin( 23)
-data8 0xBFE10CF7D591F272 //cos( 23)
-data8 0xBFECFA7F7919140F //sin( 24)
-data8 0x3FDB25BFB50A609A //cos( 24)
-data8 0xBFC0F0E6F31E809D //sin( 25)
-data8 0x3FEFB7EEF59504FF //cos( 25)
-data8 0x3FE866E0FAC32583 //sin( 26)
-data8 0x3FE4B3902691A9ED //cos( 26)
-data8 0x3FEE9AA1B0E5BA30 //sin( 27)
-data8 0xBFD2B266F959DED5 //cos( 27)
-data8 0x3FD156853B4514D6 //sin( 28)
-data8 0xBFEECDAAD1582500 //cos( 28)
-data8 0xBFE53C7D20A6C9E7 //sin( 29)
-data8 0xBFE7F01658314E47 //cos( 29)
-data8 0xBFEF9DF47F1C903D //sin( 30)
-data8 0x3FC3BE82F2505A52 //cos( 30)
-data8 0xBFD9DBC0B640FC81 //sin( 31)
-data8 0x3FED4591C3E12A20 //cos( 31)
-data8 0x3FE1A54991426566 //sin( 32)
-data8 0x3FEAB1F5305DE8E5 //cos( 32)
-ASM_SIZE_DIRECTIVE(sin_coeff_1_table)
-
-//////////////////////////////////////////
-
-
-.global sinf
-.global cosf
-#ifdef _LIBC
-.global __sinf
-.global __cosf
-#endif
-
-.text
-.proc cosf
-#ifdef _LIBC
-.proc __cosf
-#endif
-.align 32
-
-
-cosf:
-#ifdef _LIBC
-__cosf:
-#endif
-{ .mfi
- alloc r32 = ar.pfs,1,7,0,0
- fcvt.fx.s1 sin_Mx = f8
- cmp.ne p6,p7 = r0,r0 // p7 set if cos
-}
-{ .mfi
- addl SIN_AD_PQ_1 = @ltoff(sin_coeff_1_table),gp
- fnorm.s0 SIN_NORM_f8 = f8 // Sets denormal or invalid
- mov sin_GR_sincos_flag = 0x0
-}
-;;
+// Assembly macros
+//==============================================================
+sincosf_NORM_f8 = f9
+sincosf_W = f10
+sincosf_int_Nfloat = f11
+sincosf_Nfloat = f12
-{ .mfi
- ld8 SIN_AD_PQ_1 = [SIN_AD_PQ_1]
- fclass.m.unc p9,p0 = f8, 0x07
- cmp.ne p8,p0 = r0,r0
-}
-{ .mfb
- nop.m 999
- nop.f 999
- br.sptk L(SINCOSF_COMMON)
-}
-;;
+sincosf_r = f13
+sincosf_rsq = f14
+sincosf_rcub = f15
+sincosf_save_tmp = f15
-.endp cosf
-ASM_SIZE_DIRECTIVE(cosf)
+sincosf_Inv_Pi_by_16 = f32
+sincosf_Pi_by_16_1 = f33
+sincosf_Pi_by_16_2 = f34
+sincosf_Inv_Pi_by_64 = f35
-.text
-.proc sinf
-#ifdef _LIBC
-.proc __sinf
-#endif
-.align 32
+sincosf_Pi_by_16_3 = f36
-sinf:
-#ifdef _LIBC
-__sinf:
-#endif
-{ .mfi
- alloc r32 = ar.pfs,1,7,0,0
- fcvt.fx.s1 sin_Mx = f8
- cmp.eq p6,p7 = r0,r0 // p6 set if sin
-}
-{ .mfi
- addl SIN_AD_PQ_1 = @ltoff(sin_coeff_1_table),gp
- fnorm.s0 SIN_NORM_f8 = f8 // Sets denormal or invalid
- mov sin_GR_sincos_flag = 0x1
-}
-;;
+sincosf_r_exact = f37
-{ .mfi
- ld8 SIN_AD_PQ_1 = [SIN_AD_PQ_1]
- fclass.m.unc p8,p0 = f8, 0x07
- cmp.ne p9,p0 = r0,r0
-}
-{ .mfb
- nop.m 999
- nop.f 999
- br.sptk L(SINCOSF_COMMON)
-}
-;;
+sincosf_Sm = f38
+sincosf_Cm = f39
+sincosf_P1 = f40
+sincosf_Q1 = f41
+sincosf_P2 = f42
+sincosf_Q2 = f43
+sincosf_P3 = f44
+sincosf_Q3 = f45
+sincosf_P4 = f46
+sincosf_Q4 = f47
-L(SINCOSF_COMMON):
+sincosf_P_temp1 = f48
+sincosf_P_temp2 = f49
-// Here with p6 if sin, p7 if cos, p8 if sin(0), p9 if cos(0)
+sincosf_Q_temp1 = f50
+sincosf_Q_temp2 = f51
+sincosf_P = f52
+sincosf_Q = f53
-{ .mmf
- ldfpd sin_coeff_Q3, sin_coeff_Q4 = [SIN_AD_PQ_1], 16
- nop.m 999
- fclass.m.unc p11,p0 = f8, 0x23 // Test for x=inf
-}
-;;
+sincosf_srsq = f54
-{ .mfb
- ldfpd sin_coeff_Q5, sin_coeff_Q6 = [SIN_AD_PQ_1], 16
- fclass.m.unc p10,p0 = f8, 0xc3 // Test for x=nan
-(p8) br.ret.spnt b0 // Exit for sin(0)
-}
-{ .mfb
- nop.m 999
-(p9) fma.s f8 = f1,f1,f0
-(p9) br.ret.spnt b0 // Exit for cos(0)
-}
-;;
+sincosf_SIG_INV_PI_BY_16_2TO61 = f55
+sincosf_RSHF_2TO61 = f56
+sincosf_RSHF = f57
+sincosf_2TOM61 = f58
+sincosf_NFLOAT = f59
+sincosf_W_2TO61_RSH = f60
-{ .mmf
- ldfpd sin_coeff_P4, sin_coeff_P5 = [SIN_AD_PQ_1], 16
- addl gr_tmp = -1,r0
- fcvt.xf sin_Mfloat = sin_Mx
-}
-;;
+fp_tmp = f61
-{ .mfi
- getf.sig sin_GR_Mint = sin_Mx
-(p11) frcpa.s0 f8,p13 = f0,f0 // qnan indef if x=inf
- nop.i 999
-}
-{ .mfb
- ldfpd sin_coeff_P1, sin_coeff_P2 = [SIN_AD_PQ_1], 16
- nop.f 999
-(p11) br.ret.spnt b0 // Exit for x=inf
-}
-;;
+/////////////////////////////////////////////////////////////
-{ .mfi
- ldfpd sin_coeff_Q1, sin_coeff_Q2 = [SIN_AD_PQ_1], 16
- nop.f 999
- cmp.ge p8,p9 = -33,sin_GR_Mint
-}
-{ .mfb
- add sin_GR_index = 32,sin_GR_Mint
-(p10) fma.s f8 = f8,f1,f0 // Force qnan if x=nan
-(p10) br.ret.spnt b0 // Exit for x=nan
-}
-;;
+sincosf_AD_1 = r33
+sincosf_AD_2 = r34
+sincosf_exp_limit = r35
+sincosf_r_signexp = r36
+sincosf_AD_beta_table = r37
+sincosf_r_sincos = r38
-{ .mmi
- ldfd sin_coeff_P3 = [SIN_AD_PQ_1], 16
-(p9) cmp.le p8,p0 = 33, sin_GR_Mint
- shl sin_GR_index = sin_GR_index,4
-}
-;;
+sincosf_r_exp = r39
+sincosf_r_17_ones = r40
+sincosf_GR_sig_inv_pi_by_16 = r14
+sincosf_GR_rshf_2to61 = r15
+sincosf_GR_rshf = r16
+sincosf_GR_exp_2tom61 = r17
+sincosf_GR_n = r18
+sincosf_GR_m = r19
+sincosf_GR_32m = r19
+sincosf_GR_all_ones = r19
-{ .mfi
- setf.sig fp_tmp = gr_tmp // Create constant such that fmpy sets inexact
- fnma.s1 sin_r = f1,sin_Mfloat,SIN_NORM_f8
-(p8) cmp.eq.unc p11,p12=sin_GR_sincos_flag,r0 // p11 if must call dbl cos
- // p12 if must call dbl sin
-}
-{ .mbb
- add SIN_AD_PQ_2 = sin_GR_index,SIN_AD_PQ_1
-(p11) br.cond.spnt COS_DOUBLE
-(p12) br.cond.spnt SIN_DOUBLE
-}
-;;
+gr_tmp = r41
+GR_SAVE_PFS = r41
+GR_SAVE_B0 = r42
+GR_SAVE_GP = r43
-.pred.rel "mutex",p6,p7 //SIN_Sin_Flag, SIN_Cos_Flag
-{ .mmi
-(p6) ldfpd sin_tbl_S,sin_tbl_C = [SIN_AD_PQ_2]
-(p7) ldfpd sin_tbl_C,sin_tbl_S = [SIN_AD_PQ_2]
- nop.i 999
-}
-;;
+RODATA
+.align 16
-{ .mfi
- nop.m 999
-(p6) fclass.m.unc p8,p0 = f8, 0x0b // If sin, note denormal input to set uflow
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 sin_t = sin_r,sin_r,f0
- nop.i 999
-}
-;;
+// Pi/16 parts
+LOCAL_OBJECT_START(double_sincosf_pi)
+ data8 0xC90FDAA22168C234, 0x00003FFC // pi/16 1st part
+ data8 0xC4C6628B80DC1CD1, 0x00003FBC // pi/16 2nd part
+LOCAL_OBJECT_END(double_sincosf_pi)
+
+// Coefficients for polynomials
+LOCAL_OBJECT_START(double_sincosf_pq_k4)
+ data8 0x3F810FABB668E9A2 // P2
+ data8 0x3FA552E3D6DE75C9 // Q2
+ data8 0xBFC555554447BC7F // P1
+ data8 0xBFDFFFFFC447610A // Q1
+LOCAL_OBJECT_END(double_sincosf_pq_k4)
+
+// Sincos table (S[m], C[m])
+LOCAL_OBJECT_START(double_sin_cos_beta_k4)
+ data8 0x0000000000000000 // sin ( 0 Pi / 16 )
+ data8 0x3FF0000000000000 // cos ( 0 Pi / 16 )
+//
+ data8 0x3FC8F8B83C69A60B // sin ( 1 Pi / 16 )
+ data8 0x3FEF6297CFF75CB0 // cos ( 1 Pi / 16 )
+//
+ data8 0x3FD87DE2A6AEA963 // sin ( 2 Pi / 16 )
+ data8 0x3FED906BCF328D46 // cos ( 2 Pi / 16 )
+//
+ data8 0x3FE1C73B39AE68C8 // sin ( 3 Pi / 16 )
+ data8 0x3FEA9B66290EA1A3 // cos ( 3 Pi / 16 )
+//
+ data8 0x3FE6A09E667F3BCD // sin ( 4 Pi / 16 )
+ data8 0x3FE6A09E667F3BCD // cos ( 4 Pi / 16 )
+//
+ data8 0x3FEA9B66290EA1A3 // sin ( 5 Pi / 16 )
+ data8 0x3FE1C73B39AE68C8 // cos ( 5 Pi / 16 )
+//
+ data8 0x3FED906BCF328D46 // sin ( 6 Pi / 16 )
+ data8 0x3FD87DE2A6AEA963 // cos ( 6 Pi / 16 )
+//
+ data8 0x3FEF6297CFF75CB0 // sin ( 7 Pi / 16 )
+ data8 0x3FC8F8B83C69A60B // cos ( 7 Pi / 16 )
+//
+ data8 0x3FF0000000000000 // sin ( 8 Pi / 16 )
+ data8 0x0000000000000000 // cos ( 8 Pi / 16 )
+//
+ data8 0x3FEF6297CFF75CB0 // sin ( 9 Pi / 16 )
+ data8 0xBFC8F8B83C69A60B // cos ( 9 Pi / 16 )
+//
+ data8 0x3FED906BCF328D46 // sin ( 10 Pi / 16 )
+ data8 0xBFD87DE2A6AEA963 // cos ( 10 Pi / 16 )
+//
+ data8 0x3FEA9B66290EA1A3 // sin ( 11 Pi / 16 )
+ data8 0xBFE1C73B39AE68C8 // cos ( 11 Pi / 16 )
+//
+ data8 0x3FE6A09E667F3BCD // sin ( 12 Pi / 16 )
+ data8 0xBFE6A09E667F3BCD // cos ( 12 Pi / 16 )
+//
+ data8 0x3FE1C73B39AE68C8 // sin ( 13 Pi / 16 )
+ data8 0xBFEA9B66290EA1A3 // cos ( 13 Pi / 16 )
+//
+ data8 0x3FD87DE2A6AEA963 // sin ( 14 Pi / 16 )
+ data8 0xBFED906BCF328D46 // cos ( 14 Pi / 16 )
+//
+ data8 0x3FC8F8B83C69A60B // sin ( 15 Pi / 16 )
+ data8 0xBFEF6297CFF75CB0 // cos ( 15 Pi / 16 )
+//
+ data8 0x0000000000000000 // sin ( 16 Pi / 16 )
+ data8 0xBFF0000000000000 // cos ( 16 Pi / 16 )
+//
+ data8 0xBFC8F8B83C69A60B // sin ( 17 Pi / 16 )
+ data8 0xBFEF6297CFF75CB0 // cos ( 17 Pi / 16 )
+//
+ data8 0xBFD87DE2A6AEA963 // sin ( 18 Pi / 16 )
+ data8 0xBFED906BCF328D46 // cos ( 18 Pi / 16 )
+//
+ data8 0xBFE1C73B39AE68C8 // sin ( 19 Pi / 16 )
+ data8 0xBFEA9B66290EA1A3 // cos ( 19 Pi / 16 )
+//
+ data8 0xBFE6A09E667F3BCD // sin ( 20 Pi / 16 )
+ data8 0xBFE6A09E667F3BCD // cos ( 20 Pi / 16 )
+//
+ data8 0xBFEA9B66290EA1A3 // sin ( 21 Pi / 16 )
+ data8 0xBFE1C73B39AE68C8 // cos ( 21 Pi / 16 )
+//
+ data8 0xBFED906BCF328D46 // sin ( 22 Pi / 16 )
+ data8 0xBFD87DE2A6AEA963 // cos ( 22 Pi / 16 )
+//
+ data8 0xBFEF6297CFF75CB0 // sin ( 23 Pi / 16 )
+ data8 0xBFC8F8B83C69A60B // cos ( 23 Pi / 16 )
+//
+ data8 0xBFF0000000000000 // sin ( 24 Pi / 16 )
+ data8 0x0000000000000000 // cos ( 24 Pi / 16 )
+//
+ data8 0xBFEF6297CFF75CB0 // sin ( 25 Pi / 16 )
+ data8 0x3FC8F8B83C69A60B // cos ( 25 Pi / 16 )
+//
+ data8 0xBFED906BCF328D46 // sin ( 26 Pi / 16 )
+ data8 0x3FD87DE2A6AEA963 // cos ( 26 Pi / 16 )
+//
+ data8 0xBFEA9B66290EA1A3 // sin ( 27 Pi / 16 )
+ data8 0x3FE1C73B39AE68C8 // cos ( 27 Pi / 16 )
+//
+ data8 0xBFE6A09E667F3BCD // sin ( 28 Pi / 16 )
+ data8 0x3FE6A09E667F3BCD // cos ( 28 Pi / 16 )
+//
+ data8 0xBFE1C73B39AE68C8 // sin ( 29 Pi / 16 )
+ data8 0x3FEA9B66290EA1A3 // cos ( 29 Pi / 16 )
+//
+ data8 0xBFD87DE2A6AEA963 // sin ( 30 Pi / 16 )
+ data8 0x3FED906BCF328D46 // cos ( 30 Pi / 16 )
+//
+ data8 0xBFC8F8B83C69A60B // sin ( 31 Pi / 16 )
+ data8 0x3FEF6297CFF75CB0 // cos ( 31 Pi / 16 )
+//
+ data8 0x0000000000000000 // sin ( 32 Pi / 16 )
+ data8 0x3FF0000000000000 // cos ( 32 Pi / 16 )
+LOCAL_OBJECT_END(double_sin_cos_beta_k4)
-{ .mfi
- nop.m 999
- fma.s1 sin_rcube = sin_t,sin_r,f0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 sin_tsq = sin_t,sin_t,f0
- nop.i 999
-}
-;;
+.section .text
-{ .mfi
- nop.m 999
- fma.s1 sin_poly_q3 = sin_t,sin_coeff_Q4,sin_coeff_Q3
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 sin_poly_q5 = sin_t,sin_coeff_Q6,sin_coeff_Q5
- nop.i 999
-}
-;;
+////////////////////////////////////////////////////////
+// There are two entry points: sin and cos
+// If from sin, p8 is true
+// If from cos, p9 is true
-{ .mfi
- nop.m 999
- fma.s1 sin_poly_p1 = sin_t,sin_coeff_P5,sin_coeff_P4
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 sin_poly_p2 = sin_t,sin_coeff_P2,sin_coeff_P1
- nop.i 999
-}
-;;
+GLOBAL_IEEE754_ENTRY(sinf)
-{ .mfi
- nop.m 999
- fma.s1 sin_poly_q1 = sin_t,sin_coeff_Q2,sin_coeff_Q1
- nop.i 999
+{ .mlx
+ alloc r32 = ar.pfs,1,13,0,0
+ movl sincosf_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A // significand of 16/pi
 }
-{ .mfi
- nop.m 999
- fma.s1 sin_S_t = sin_t,sin_tbl_S,f0
- nop.i 999
-}
-;;
+{ .mlx
+ addl sincosf_AD_1 = @ltoff(double_sincosf_pi), gp
+ movl sincosf_GR_rshf_2to61 = 0x47b8000000000000 // 1.1 2^(63+63-2)
+};;
-{ .mfi
- nop.m 999
-(p8) fmpy.s.s0 fp_tmp2 = f8,f8 // Dummy mult to set underflow if sin(denormal)
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 sin_r7 = sin_rcube,sin_tsq,f0
- nop.i 999
+{ .mfi
+ ld8 sincosf_AD_1 = [sincosf_AD_1]
+ fnorm.s1 sincosf_NORM_f8 = f8 // Normalize argument
+ cmp.eq p8,p9 = r0, r0 // set p8 (clear p9) for sin
}
-;;
+{ .mib
+ mov sincosf_GR_exp_2tom61 = 0xffff-61 // exponent of scale 2^-61
+ mov sincosf_r_sincos = 0x0 // 0 for sin
+ br.cond.sptk _SINCOSF_COMMON // go to common part
+};;
-{ .mfi
- nop.m 999
- fma.s1 sin_poly_q3456 = sin_tsq,sin_poly_q5,sin_poly_q3
- nop.i 999
-}
-;;
+GLOBAL_IEEE754_END(sinf)
+GLOBAL_IEEE754_ENTRY(cosf)
-{ .mfi
- nop.m 999
- fma.s1 sin_poly_p3 = sin_t,sin_poly_p1,sin_coeff_P3
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 sin_poly_p4 = sin_rcube,sin_poly_p2,sin_r
- nop.i 999
+{ .mlx
+ alloc r32 = ar.pfs,1,13,0,0
+ movl sincosf_GR_sig_inv_pi_by_16 = 0xA2F9836E4E44152A // significand of 16/pi
 }
-;;
+{ .mlx
+ addl sincosf_AD_1 = @ltoff(double_sincosf_pi), gp
+ movl sincosf_GR_rshf_2to61 = 0x47b8000000000000 // 1.1 2^(63+63-2)
+};;
-{ .mfi
- nop.m 999
- fma.s1 sin_tbl_S_tcube = sin_S_t,sin_tsq,f0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.s1 sin_poly_q12 = sin_S_t,sin_poly_q1,sin_tbl_S
- nop.i 999
+{ .mfi
+ ld8 sincosf_AD_1 = [sincosf_AD_1]
+ fnorm.s1 sincosf_NORM_f8 = f8 // Normalize argument
+ cmp.eq p9,p8 = r0, r0 // set p9 (clear p8) for cos
}
-;;
+{ .mib
+ mov sincosf_GR_exp_2tom61 = 0xffff-61 // exponent of scale 2^-61
+ mov sincosf_r_sincos = 0x8 // 8 for cos
+ nop.b 999
+};;
+
+////////////////////////////////////////////////////////
+// All entry points end up here.
+// If from sin, sincosf_r_sincos is 0 and p8 is true
+// If from cos, sincosf_r_sincos is 8 = 2^(k-1) and p9 is true
+// We add sincosf_r_sincos to N
+
+///////////// Common sin and cos part //////////////////
+_SINCOSF_COMMON:
+
+// Form two constants we need
+// 16/pi * 2^-2 * 2^63, scaled by 2^61 since we just loaded the significand
+// 1.1000...000 * 2^(63+63-2) to right shift int(W) into the low significand
+// fcmp used to set denormal, and invalid on snans
+{ .mfi
+ setf.sig sincosf_SIG_INV_PI_BY_16_2TO61 = sincosf_GR_sig_inv_pi_by_16
+ fclass.m p6,p0 = f8, 0xe7 // if x=0,inf,nan
+ mov sincosf_exp_limit = 0x10017
+}
+{ .mlx
+ setf.d sincosf_RSHF_2TO61 = sincosf_GR_rshf_2to61
+ movl sincosf_GR_rshf = 0x43e8000000000000 // 1.1000 2^63
+};; // Right shift
+
+// Form another constant
+// 2^-61 for scaling Nfloat
+// 0x10017 is register_bias + 24.
+// So if f8 >= 2^24, go to large argument routines
+{ .mmi
+ getf.exp sincosf_r_signexp = f8
+ setf.exp sincosf_2TOM61 = sincosf_GR_exp_2tom61
+ addl gr_tmp = -1,r0 // Create the constant used to set "inexact"
+};;
+
+// Load the two pieces of pi/16
+// Form another constant
+// 1.1000...000 * 2^63, the right shift constant
+{ .mmb
+ ldfe sincosf_Pi_by_16_1 = [sincosf_AD_1],16
+ setf.d sincosf_RSHF = sincosf_GR_rshf
+(p6) br.cond.spnt _SINCOSF_SPECIAL_ARGS
+};;
-{ .mfi
- nop.m 999
- fma.d.s1 sin_of_r = sin_r7,sin_poly_p3,sin_poly_p4
- nop.i 999
-}
-;;
+// Getting argument's exp for "large arguments" filtering
+{ .mmi
+ ldfe sincosf_Pi_by_16_2 = [sincosf_AD_1],16
+ setf.sig fp_tmp = gr_tmp // constant for inexact set
+ nop.i 999
+};;
-{ .mfi
- nop.m 999
- fma.d.s1 sin_tbl_S_cos_of_r = sin_tbl_S_tcube,sin_poly_q3456,sin_poly_q12
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
- nop.i 999
-}
-;;
+// Polynomial coefficients (Q2, Q1, P2, P1) loading
+{ .mmi
+ ldfpd sincosf_P2,sincosf_Q2 = [sincosf_AD_1],16
+ nop.m 999
+ nop.i 999
+};;
+// Select exponent (17 lsb)
+{ .mmi
+ ldfpd sincosf_P1,sincosf_Q1 = [sincosf_AD_1],16
+ nop.m 999
+ dep.z sincosf_r_exp = sincosf_r_signexp, 0, 17
+};;
-.pred.rel "mutex",p6,p7 //SIN_Sin_Flag, SIN_Cos_Flag
-{ .mfi
- nop.m 999
-//(SIN_Sin_Flag) fma.s f8 = sin_tbl_C,sin_of_r,sin_tbl_S_cos_of_r
-(p6) fma.s f8 = sin_tbl_C,sin_of_r,sin_tbl_S_cos_of_r
- nop.i 999
-}
-{ .mfb
- nop.m 999
-//(SIN_Cos_Flag) fnma.s f8 = sin_tbl_C,sin_of_r,sin_tbl_S_cos_of_r
-(p7) fnma.s f8 = sin_tbl_C,sin_of_r,sin_tbl_S_cos_of_r
- br.ret.sptk b0
-}
+// p10 is true if we must call routines to handle larger arguments
+// p10 is true if f8 exp is >= 0x10017 (2^24)
+{ .mfb
+ cmp.ge p10,p0 = sincosf_r_exp,sincosf_exp_limit
+ nop.f 999
+(p10) br.cond.spnt _SINCOSF_LARGE_ARGS // Go to "large args" routine
+};;
+
+// sincosf_W = x * sincosf_Inv_Pi_by_16
+// Multiply x by scaled 16/pi and add large const to shift integer part of W to
+// rightmost bits of significand
+{ .mfi
+ nop.m 999
+ fma.s1 sincosf_W_2TO61_RSH = sincosf_NORM_f8, sincosf_SIG_INV_PI_BY_16_2TO61, sincosf_RSHF_2TO61
+ nop.i 999
+};;
-.endp sinf
-ASM_SIZE_DIRECTIVE(sinf)
+// sincosf_NFLOAT = Round_Int_Nearest(sincosf_W)
+// This is done by scaling back by 2^-61 and subtracting the shift constant
+{ .mfi
+ nop.m 999
+ fms.s1 sincosf_NFLOAT = sincosf_W_2TO61_RSH,sincosf_2TOM61,sincosf_RSHF
+ nop.i 999
+};;
+// get N = (int)sincosf_int_Nfloat
+{ .mfi
+ getf.sig sincosf_GR_n = sincosf_W_2TO61_RSH // integer N value
+ nop.f 999
+ nop.i 999
+};;
-.proc SIN_DOUBLE
-SIN_DOUBLE:
-.prologue
+// Add 2^(k-1) (which is in sincosf_r_sincos=8) to N
+// sincosf_r = -sincosf_Nfloat * sincosf_Pi_by_16_1 + x
{ .mfi
- nop.m 0
- nop.f 0
-.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs
-}
-;;
+ add sincosf_GR_n = sincosf_GR_n, sincosf_r_sincos
+ fnma.s1 sincosf_r = sincosf_NFLOAT, sincosf_Pi_by_16_1, sincosf_NORM_f8
+ nop.i 999
+};;
+// Get M (least k+1 bits of N)
+{ .mmi
+ and sincosf_GR_m = 0x1f,sincosf_GR_n // Put mask 0x1F -
+ nop.m 999 // - select k+1 bits
+ nop.i 999
+};;
+
+// Add 16*M to address of sin_cos_beta table
{ .mfi
- mov GR_SAVE_GP=gp
- nop.f 0
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0
-}
+ shladd sincosf_AD_2 = sincosf_GR_32m, 4, sincosf_AD_1
+(p8) fclass.m.unc p10,p0 = f8,0x0b // If sin denormal input -
+ nop.i 999
+};;
-.body
-{ .mmb
- nop.m 999
- nop.m 999
- br.call.sptk.many b0=sin
+// Load Sin and Cos table value using obtained index m (sincosf_AD_2)
+{ .mfi
+ ldfd sincosf_Sm = [sincosf_AD_2],8 // Sin value S[m]
+(p9) fclass.m.unc p11,p0 = f8,0x0b // If cos denormal input -
+ nop.i 999 // - set denormal
+};;
+
+// sincosf_r = sincosf_r -sincosf_Nfloat * sincosf_Pi_by_16_2
+{ .mfi
+ ldfd sincosf_Cm = [sincosf_AD_2] // Cos table value C[m]
+ fnma.s1 sincosf_r_exact = sincosf_NFLOAT, sincosf_Pi_by_16_2, sincosf_r
+ nop.i 999
}
-;;
+// get rsq = r*r
+{ .mfi
+ nop.m 999
+ fma.s1 sincosf_rsq = sincosf_r, sincosf_r, f0 // r^2 = r*r
+ nop.i 999
+};;
{ .mfi
- mov gp = GR_SAVE_GP
- nop.f 999
- mov b0 = GR_SAVE_B0
+ nop.m 999
+ fmpy.s0 fp_tmp = fp_tmp, fp_tmp // forces inexact flag
+ nop.i 999
+};;
+
+// Polynomials calculation
+// Q = Q2*r^2 + Q1
+// P = P2*r^2 + P1
+{ .mfi
+ nop.m 999
+ fma.s1 sincosf_Q = sincosf_rsq, sincosf_Q2, sincosf_Q1
+ nop.i 999
}
-;;
+{ .mfi
+ nop.m 999
+ fma.s1 sincosf_P = sincosf_rsq, sincosf_P2, sincosf_P1
+ nop.i 999
+};;
+// get rcube and S[m]*r^2
{ .mfi
- nop.m 999
- fma.s f8 = f8,f1,f0
-(p0) mov ar.pfs = GR_SAVE_PFS
+ nop.m 999
+ fmpy.s1 sincosf_srsq = sincosf_Sm,sincosf_rsq // r^2*S[m]
+ nop.i 999
}
-{ .mib
- nop.m 999
- nop.i 999
-(p0) br.ret.sptk b0
+{ .mfi
+ nop.m 999
+ fmpy.s1 sincosf_rcub = sincosf_r_exact, sincosf_rsq
+ nop.i 999
+};;
+
+// Get final P and Q
+// Q = Q*S[m]*r^2 + S[m]
+// P = P*r^3 + r
+{ .mfi
+ nop.m 999
+ fma.s1 sincosf_Q = sincosf_srsq,sincosf_Q, sincosf_Sm
+ nop.i 999
}
-;;
+{ .mfi
+ nop.m 999
+ fma.s1 sincosf_P = sincosf_rcub,sincosf_P,sincosf_r_exact
+ nop.i 999
+};;
-.endp SIN_DOUBLE
-ASM_SIZE_DIRECTIVE(SIN_DOUBLE)
+// If sinf(denormal) - force underflow to be set
+.pred.rel "mutex",p10,p11
+{ .mfi
+ nop.m 999
+(p10) fmpy.s.s0 fp_tmp = f8,f8 // forces underflow flag
+ nop.i 999 // for denormal sine args
+}
+// If cosf(denormal) - force denormal to be set
+{ .mfi
+ nop.m 999
+(p11) fma.s.s0 fp_tmp = f8, f1, f8 // forces denormal flag
+ nop.i 999 // for denormal cosine args
+};;
-.proc COS_DOUBLE
-COS_DOUBLE:
+// Final calculation
+// result = C[m]*P + Q
+{ .mfb
+ nop.m 999
+ fma.s.s0 f8 = sincosf_Cm, sincosf_P, sincosf_Q
+ br.ret.sptk b0 // Exit for common path
+};;
+
+////////// x = 0/Inf/NaN path //////////////////
+_SINCOSF_SPECIAL_ARGS:
+.pred.rel "mutex",p8,p9
+// sinf(+/-0) = +/-0
+// sinf(Inf) = NaN
+// sinf(NaN) = NaN
+{ .mfi
+ nop.m 999
+(p8) fma.s.s0 f8 = f8, f0, f0 // sinf(+/-0,NaN,Inf)
+ nop.i 999
+}
+// cosf(+/-0) = 1.0
+// cosf(Inf) = NaN
+// cosf(NaN) = NaN
+{ .mfb
+ nop.m 999
+(p9) fma.s.s0 f8 = f8, f0, f1 // cosf(+/-0,NaN,Inf)
+ br.ret.sptk b0 // Exit for x = 0/Inf/NaN path
+};;
+
+GLOBAL_IEEE754_END(cosf)
+//////////// x >= 2^24 - large arguments routine call ////////////
+LOCAL_LIBM_ENTRY(__libm_callout_sincosf)
+_SINCOSF_LARGE_ARGS:
.prologue
 { .mfi
- nop.m 0
- nop.f 0
-.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs
+ mov sincosf_GR_all_ones = -1 // all ones (0xffffffffffffffff)
+ nop.f 999
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS = ar.pfs
 }
;;
{ .mfi
- mov GR_SAVE_GP=gp
- nop.f 0
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0
+ mov GR_SAVE_GP = gp
+ nop.f 999
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0 = b0
}
-
.body
-{ .mmb
- nop.m 999
- nop.m 999
- br.call.sptk.many b0=cos
-}
-;;
-{ .mfi
- mov gp = GR_SAVE_GP
- nop.f 999
- mov b0 = GR_SAVE_B0
-}
-;;
+{ .mbb
+ setf.sig sincosf_save_tmp = sincosf_GR_all_ones // inexact set
+ nop.b 999
+(p8) br.call.sptk.many b0 = __libm_sin_large# // sinf(large_X)
+};;
+
+{ .mbb
+ cmp.ne p9,p0 = sincosf_r_sincos, r0 // set p9 if cos
+ nop.b 999
+(p9) br.call.sptk.many b0 = __libm_cos_large# // cosf(large_X)
+};;
{ .mfi
- nop.m 999
- fma.s f8 = f8,f1,f0
-(p0) mov ar.pfs = GR_SAVE_PFS
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p0) br.ret.sptk b0
+ mov gp = GR_SAVE_GP
+ fma.s.s0 f8 = f8, f1, f0 // Round result to single
+ mov b0 = GR_SAVE_B0
}
-;;
-
-.endp COS_DOUBLE
-ASM_SIZE_DIRECTIVE(COS_DOUBLE)
+{ .mfi // force inexact set
+ nop.m 999
+ fmpy.s0 sincosf_save_tmp = sincosf_save_tmp, sincosf_save_tmp
+ nop.i 999
+};;
+{ .mib
+ nop.m 999
+ mov ar.pfs = GR_SAVE_PFS
+ br.ret.sptk b0 // Exit for large arguments routine call
+};;
+LOCAL_LIBM_END(__libm_callout_sincosf)
+.type __libm_sin_large#, @function
+.global __libm_sin_large#
+.type __libm_cos_large#, @function
+.global __libm_cos_large#
-.type sin,@function
-.global sin
-.type cos,@function
-.global cos
diff --git a/sysdeps/ia64/fpu/s_cosl.S b/sysdeps/ia64/fpu/s_cosl.S
index 2755580c0d..374e822256 100644
--- a/sysdeps/ia64/fpu/s_cosl.S
+++ b/sysdeps/ia64/fpu/s_cosl.S
@@ -1,10 +1,10 @@
.file "sincosl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,76 +20,81 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-// *********************************************************************
+//*********************************************************************
//
-// History:
-// 2/02/2000 (hand-optimized)
-// 4/04/00 Unwind support added
+// History:
+// 02/02/00 (hand-optimized)
+// 04/04/00 Unwind support added
+// 07/30/01 Improved speed on all paths
+// 08/20/01 Fixed bundling typo
+// 05/13/02 Changed interface to __libm_pi_by_2_reduce
+// 02/10/03 Reordered header: .section, .global, .proc, .align;
+// used data8 for long double table values
//
-// *********************************************************************
+//*********************************************************************
//
// Function: Combined sinl(x) and cosl(x), where
//
// sinl(x) = sine(x), for double-extended precision x values
// cosl(x) = cosine(x), for double-extended precision x values
//
-// *********************************************************************
+//*********************************************************************
//
// Resources Used:
//
-// Floating-Point Registers: f8 (Input and Return Value)
+// Floating-Point Registers: f8 (Input and Return Value)
// f32-f99
//
// General Purpose Registers:
-// r32-r43
+// r32-r43
// r44-r45 (Used to pass arguments to pi_by_2 reduce routine)
//
// Predicate Registers: p6-p13
//
-// *********************************************************************
+//*********************************************************************
//
// IEEE Special Conditions:
//
// Denormal fault raised on denormal inputs
// Overflow exceptions do not occur
-// Underflow exceptions raised when appropriate for sin
+// Underflow exceptions raised when appropriate for sin
// (No specialized error handling for this routine)
// Inexact raised when appropriate by algorithm
//
// sinl(SNaN) = QNaN
// sinl(QNaN) = QNaN
-// sinl(inf) = QNaN
+// sinl(inf) = QNaN
// sinl(+/-0) = +/-0
-// cosl(inf) = QNaN
+// cosl(inf) = QNaN
// cosl(SNaN) = QNaN
// cosl(QNaN) = QNaN
// cosl(0) = 1
-//
-// *********************************************************************
+//
+//*********************************************************************
//
// Mathematical Description
// ========================
//
-// The computation of FSIN and FCOS is best handled in one piece of
-// code. The main reason is that given any argument Arg, computation
-// of trigonometric functions first calculate N and an approximation
+// The computation of FSIN and FCOS is best handled in one piece of
+// code. The main reason is that given any argument Arg, computation
+// of trigonometric functions first calculate N and an approximation
// to alpha where
//
// Arg = N pi/2 + alpha, |alpha| <= pi/4.
@@ -98,62 +103,62 @@
//
// cosl( Arg ) = sinl( (N+1) pi/2 + alpha ),
//
-// therefore, the code for computing sine will produce cosine as long
-// as 1 is added to N immediately after the argument reduction
+// therefore, the code for computing sine will produce cosine as long
+// as 1 is added to N immediately after the argument reduction
// process.
//
// Let M = N if sine
-// N+1 if cosine.
+// N+1 if cosine.
//
// Now, given
//
// Arg = M pi/2 + alpha, |alpha| <= pi/4,
//
-// let I = M mod 4, or I be the two lsb of M when M is represented
+// let I = M mod 4, or I be the two lsb of M when M is represented
// as 2's complement. I = [i_0 i_1]. Then
//
-// sinl( Arg ) = (-1)^i_0 sinl( alpha ) if i_1 = 0,
+// sinl( Arg ) = (-1)^i_0 sinl( alpha ) if i_1 = 0,
// = (-1)^i_0 cosl( alpha ) if i_1 = 1.
//
// For example:
-// if M = -1, I = 11
+// if M = -1, I = 11
// sin ((-pi/2 + alpha) = (-1) cos (alpha)
-// if M = 0, I = 00
+// if M = 0, I = 00
// sin (alpha) = sin (alpha)
-// if M = 1, I = 01
+// if M = 1, I = 01
// sin (pi/2 + alpha) = cos (alpha)
-// if M = 2, I = 10
+// if M = 2, I = 10
// sin (pi + alpha) = (-1) sin (alpha)
-// if M = 3, I = 11
+// if M = 3, I = 11
// sin ((3/2)pi + alpha) = (-1) cos (alpha)
//
-// The value of alpha is obtained by argument reduction and
+// The value of alpha is obtained by argument reduction and
// represented by two working precision numbers r and c where
//
// alpha = r + c accurately.
//
// The reduction method is described in a previous write up.
-// The argument reduction scheme identifies 4 cases. For Cases 2
-// and 4, because |alpha| is small, sinl(r+c) and cosl(r+c) can be
-// computed very easily by 2 or 3 terms of the Taylor series
+// The argument reduction scheme identifies 4 cases. For Cases 2
+// and 4, because |alpha| is small, sinl(r+c) and cosl(r+c) can be
+// computed very easily by 2 or 3 terms of the Taylor series
// expansion as follows:
//
// Case 2:
// -------
//
-// sinl(r + c) = r + c - r^3/6 accurately
-// cosl(r + c) = 1 - 2^(-67) accurately
+// sinl(r + c) = r + c - r^3/6 accurately
+// cosl(r + c) = 1 - 2^(-67) accurately
//
// Case 4:
// -------
//
-// sinl(r + c) = r + c - r^3/6 + r^5/120 accurately
-// cosl(r + c) = 1 - r^2/2 + r^4/24 accurately
+// sinl(r + c) = r + c - r^3/6 + r^5/120 accurately
+// cosl(r + c) = 1 - r^2/2 + r^4/24 accurately
//
-// The only cases left are Cases 1 and 3 of the argument reduction
-// procedure. These two cases will be merged since after the
-// argument is reduced in either cases, we have the reduced argument
-// represented as r + c and that the magnitude |r + c| is not small
+// The only cases left are Cases 1 and 3 of the argument reduction
+// procedure. These two cases will be merged since after the
+// argument is reduced in either cases, we have the reduced argument
+// represented as r + c and that the magnitude |r + c| is not small
// enough to allow the usage of a very short approximation.
//
// The required calculation is either
@@ -163,32 +168,32 @@
//
// Specifically,
//
-// sinl(r + c) = sinl(r) + c sin'(r) + O(c^2)
-// = sinl(r) + c cos (r) + O(c^2)
-// = sinl(r) + c(1 - r^2/2) accurately.
+// sinl(r + c) = sinl(r) + c sin'(r) + O(c^2)
+// = sinl(r) + c cos (r) + O(c^2)
+// = sinl(r) + c(1 - r^2/2) accurately.
// Similarly,
//
-// cosl(r + c) = cosl(r) - c sinl(r) + O(c^2)
-// = cosl(r) - c(r - r^3/6) accurately.
+// cosl(r + c) = cosl(r) - c sinl(r) + O(c^2)
+// = cosl(r) - c(r - r^3/6) accurately.
//
-// We therefore concentrate on accurately calculating sinl(r) and
+// We therefore concentrate on accurately calculating sinl(r) and
// cosl(r) for a working-precision number r, |r| <= pi/4 to within
// 0.1% or so.
//
-// The greatest challenge of this task is that the second terms of
+// The greatest challenge of this task is that the second terms of
// the Taylor series
-//
-// r - r^3/3! + r^r/5! - ...
+//
+// r - r^3/3! + r^5/5! - ...
//
// and
//
-// 1 - r^2/2! + r^4/4! - ...
+// 1 - r^2/2! + r^4/4! - ...
//
-// are not very small when |r| is close to pi/4 and the rounding
-// errors will be a concern if simple polynomial accumulation is
-// used. When |r| < 2^-3, however, the second terms will be small
-// enough (6 bits or so of right shift) that a normal Horner
-// recurrence suffices. Hence there are two cases that we consider
+// are not very small when |r| is close to pi/4 and the rounding
+// errors will be a concern if simple polynomial accumulation is
+// used. When |r| < 2^-3, however, the second terms will be small
+// enough (6 bits or so of right shift) that a normal Horner
+// recurrence suffices. Hence there are two cases that we consider
// in the accurate computation of sinl(r) and cosl(r), |r| <= pi/4.
//
// Case small_r: |r| < 2^(-3)
@@ -197,88 +202,88 @@
// Since Arg = M pi/4 + r + c accurately, and M mod 4 is [i_0 i_1],
// we have
//
-// sinl(Arg) = (-1)^i_0 * sinl(r + c) if i_1 = 0
-// = (-1)^i_0 * cosl(r + c) if i_1 = 1
+// sinl(Arg) = (-1)^i_0 * sinl(r + c) if i_1 = 0
+// = (-1)^i_0 * cosl(r + c) if i_1 = 1
//
// can be accurately approximated by
//
-// sinl(Arg) = (-1)^i_0 * [sinl(r) + c] if i_1 = 0
+// sinl(Arg) = (-1)^i_0 * [sinl(r) + c] if i_1 = 0
// = (-1)^i_0 * [cosl(r) - c*r] if i_1 = 1
//
-// because |r| is small and thus the second terms in the correction
+// because |r| is small and thus the second terms in the correction
// are unneccessary.
//
-// Finally, sinl(r) and cosl(r) are approximated by polynomials of
+// Finally, sinl(r) and cosl(r) are approximated by polynomials of
// moderate lengths.
//
// sinl(r) = r + S_1 r^3 + S_2 r^5 + ... + S_5 r^11
// cosl(r) = 1 + C_1 r^2 + C_2 r^4 + ... + C_5 r^10
//
-// We can make use of predicates to selectively calculate
-// sinl(r) or cosl(r) based on i_1.
+// We can make use of predicates to selectively calculate
+// sinl(r) or cosl(r) based on i_1.
//
// Case normal_r: 2^(-3) <= |r| <= pi/4
// ------------------------------------
//
// This case is more likely than the previous one if one considers
// r to be uniformly distributed in [-pi/4 pi/4]. Again,
-//
-// sinl(Arg) = (-1)^i_0 * sinl(r + c) if i_1 = 0
-// = (-1)^i_0 * cosl(r + c) if i_1 = 1.
//
-// Because |r| is now larger, we need one extra term in the
+// sinl(Arg) = (-1)^i_0 * sinl(r + c) if i_1 = 0
+// = (-1)^i_0 * cosl(r + c) if i_1 = 1.
+//
+// Because |r| is now larger, we need one extra term in the
// correction. sinl(Arg) can be accurately approximated by
//
// sinl(Arg) = (-1)^i_0 * [sinl(r) + c(1-r^2/2)] if i_1 = 0
// = (-1)^i_0 * [cosl(r) - c*r*(1 - r^2/6)] i_1 = 1.
//
-// Finally, sinl(r) and cosl(r) are approximated by polynomials of
+// Finally, sinl(r) and cosl(r) are approximated by polynomials of
// moderate lengths.
//
-// sinl(r) = r + PP_1_hi r^3 + PP_1_lo r^3 +
-// PP_2 r^5 + ... + PP_8 r^17
+// sinl(r) = r + PP_1_hi r^3 + PP_1_lo r^3 +
+// PP_2 r^5 + ... + PP_8 r^17
//
-// cosl(r) = 1 + QQ_1 r^2 + QQ_2 r^4 + ... + QQ_8 r^16
+// cosl(r) = 1 + QQ_1 r^2 + QQ_2 r^4 + ... + QQ_8 r^16
//
-// where PP_1_hi is only about 16 bits long and QQ_1 is -1/2.
-// The crux in accurate computation is to calculate
+// where PP_1_hi is only about 16 bits long and QQ_1 is -1/2.
+// The crux in accurate computation is to calculate
//
// r + PP_1_hi r^3 or 1 + QQ_1 r^2
//
-// accurately as two pieces: U_hi and U_lo. The way to achieve this
-// is to obtain r_hi as a 10 sig. bit number that approximates r to
+// accurately as two pieces: U_hi and U_lo. The way to achieve this
+// is to obtain r_hi as a 10 sig. bit number that approximates r to
// roughly 8 bits or so of accuracy. (One convenient way is
//
// r_hi := frcpa( frcpa( r ) ).)
//
// This way,
//
-// r + PP_1_hi r^3 = r + PP_1_hi r_hi^3 +
-// PP_1_hi (r^3 - r_hi^3)
-// = [r + PP_1_hi r_hi^3] +
-// [PP_1_hi (r - r_hi)
-// (r^2 + r_hi r + r_hi^2) ]
-// = U_hi + U_lo
+// r + PP_1_hi r^3 = r + PP_1_hi r_hi^3 +
+// PP_1_hi (r^3 - r_hi^3)
+// = [r + PP_1_hi r_hi^3] +
+// [PP_1_hi (r - r_hi)
+// (r^2 + r_hi r + r_hi^2) ]
+// = U_hi + U_lo
//
// Since r_hi is only 10 bit long and PP_1_hi is only 16 bit long,
-// PP_1_hi * r_hi^3 is only at most 46 bit long and thus computed
-// exactly. Furthermore, r and PP_1_hi r_hi^3 are of opposite sign
-// and that there is no more than 8 bit shift off between r and
-// PP_1_hi * r_hi^3. Hence the sum, U_hi, is representable and thus
-// calculated without any error. Finally, the fact that
+// PP_1_hi * r_hi^3 is only at most 46 bit long and thus computed
+// exactly. Furthermore, r and PP_1_hi r_hi^3 are of opposite sign
+// and that there is no more than 8 bit shift off between r and
+// PP_1_hi * r_hi^3. Hence the sum, U_hi, is representable and thus
+// calculated without any error. Finally, the fact that
//
-// |U_lo| <= 2^(-8) |U_hi|
+// |U_lo| <= 2^(-8) |U_hi|
//
-// says that U_hi + U_lo is approximating r + PP_1_hi r^3 to roughly
+// says that U_hi + U_lo is approximating r + PP_1_hi r^3 to roughly
// 8 extra bits of accuracy.
//
// Similarly,
//
-// 1 + QQ_1 r^2 = [1 + QQ_1 r_hi^2] +
-// [QQ_1 (r - r_hi)(r + r_hi)]
-// = U_hi + U_lo.
-//
-// Summarizing, we calculate r_hi = frcpa( frcpa( r ) ).
+// 1 + QQ_1 r^2 = [1 + QQ_1 r_hi^2] +
+// [QQ_1 (r - r_hi)(r + r_hi)]
+// = U_hi + U_lo.
+//
+// Summarizing, we calculate r_hi = frcpa( frcpa( r ) ).
//
// If i_1 = 0, then
//
@@ -297,35 +302,35 @@
// End
//
// Finally,
-//
-// V := poly + ( U_lo + correction )
+//
+// V := poly + ( U_lo + correction )
//
// / U_hi + V if i_0 = 0
-// result := |
+// result := |
// \ (-U_hi) - V if i_0 = 1
//
-// It is important that in the last step, negation of U_hi is
-// performed prior to the subtraction which is to be performed in
-// the user-set rounding mode.
+// It is important that in the last step, negation of U_hi is
+// performed prior to the subtraction which is to be performed in
+// the user-set rounding mode.
//
//
// Algorithmic Description
// =======================
//
-// The argument reduction algorithm is tightly integrated into FSIN
-// and FCOS which share the same code. The following is complete and
-// self-contained. The argument reduction description given
+// The argument reduction algorithm is tightly integrated into FSIN
+// and FCOS which share the same code. The following is complete and
+// self-contained. The argument reduction description given
// previously is repeated below.
//
//
-// Step 0. Initialization.
+// Step 0. Initialization.
//
// If FSIN is invoked, set N_inc := 0; else if FCOS is invoked,
// set N_inc := 1.
//
// Step 1. Check for exceptional and special cases.
//
-// * If Arg is +-0, +-inf, NaN, NaT, go to Step 10 for special
+// * If Arg is +-0, +-inf, NaN, NaT, go to Step 10 for special
// handling.
// * If |Arg| < 2^24, go to Step 2 for reduction of moderate
// arguments. This is the most likely case.
@@ -335,18 +340,18 @@
//
// Step 2. Reduction of moderate arguments.
//
-// If |Arg| < pi/4 ...quick branch
-// N_fix := N_inc (integer)
+// If |Arg| < pi/4 ...quick branch
+// N_fix := N_inc (integer)
// r := Arg
// c := 0.0
// Branch to Step 4, Case_1_complete
-// Else ...cf. argument reduction
-// N := Arg * two_by_PI (fp)
-// N_fix := fcvt.fx( N ) (int)
+// Else ...cf. argument reduction
+// N := Arg * two_by_PI (fp)
+// N_fix := fcvt.fx( N ) (int)
// N := fcvt.xf( N_fix )
// N_fix := N_fix + N_inc
-// s := Arg - N * P_1 (first piece of pi/2)
-// w := -N * P_2 (second piece of pi/2)
+// s := Arg - N * P_1 (first piece of pi/2)
+// w := -N * P_2 (second piece of pi/2)
//
// If |s| >= 2^(-33)
// go to Step 3, Case_1_reduce
@@ -358,8 +363,8 @@
// Step 3. Case_1_reduce.
//
// r := s + w
-// c := (s - r) + w ...observe order
-//
+// c := (s - r) + w ...observe order
+//
// Step 4. Case_1_complete
//
// ...At this point, the reduced argument alpha is
@@ -375,17 +380,17 @@
//
// If i_1 = 0, then
// poly := r*FR_rsq*(PP_1_lo + FR_rsq*(PP_2 + ... FR_rsq*PP_8))
-// U_hi := r + PP_1_hi*r_hi*r_hi*r_hi ...any order
+// U_hi := r + PP_1_hi*r_hi*r_hi*r_hi ...any order
// U_lo := PP_1_hi*r_lo*(r*r + r*r_hi + r_hi*r_hi)
-// correction := c + c*C_1*FR_rsq ...any order
+// correction := c + c*C_1*FR_rsq ...any order
// Else
// poly := FR_rsq*FR_rsq*(QQ_2 + FR_rsq*(QQ_3 + ... + FR_rsq*QQ_8))
-// U_hi := 1 + QQ_1 * r_hi * r_hi ...any order
+// U_hi := 1 + QQ_1 * r_hi * r_hi ...any order
// U_lo := QQ_1 * r_lo * (r + r_hi)
-// correction := -c*(r + S_1*FR_rsq*r) ...any order
+// correction := -c*(r + S_1*FR_rsq*r) ...any order
// Endif
//
-// V := poly + (U_lo + correction) ...observe order
+// V := poly + (U_lo + correction) ...observe order
//
// result := (i_0 == 0? 1.0 : -1.0)
//
@@ -397,7 +402,7 @@
// Return
//
// Step 6. Small_r.
-//
+//
// ...Use flush to zero mode without causing exception
// Let [i_0 i_1] be the two lsb of N_fix.
//
@@ -412,7 +417,7 @@
// Else
// z := FR_rsq*FR_rsq; z := FR_rsq*z
// poly_lo := C_3 + FR_rsq*(C_4 + FR_rsq*C_5)
-// poly_hi := FR_rsq*(C_1 + FR_rsq*C_2)
+// poly_hi := FR_rsq*(C_1 + FR_rsq*C_2)
// correction := -c*r
// result := 1
// Endif
@@ -429,15 +434,15 @@
//
// Step 7. Case_2_reduce.
//
-// ...Refer to the write up for argument reduction for
+// ...Refer to the write up for argument reduction for
// ...rationale. The reduction algorithm below is taken from
// ...argument reduction description and integrated this.
//
// w := N*P_3
-// U_1 := N*P_2 + w ...FMA
-// U_2 := (N*P_2 - U_1) + w ...2 FMA
+// U_1 := N*P_2 + w ...FMA
+// U_2 := (N*P_2 - U_1) + w ...2 FMA
// ...U_1 + U_2 is N*(P_2+P_3) accurately
-//
+//
// r := s - U_1
// c := ( (s - r) - U_1 ) - U_2
//
@@ -446,29 +451,29 @@
// ...Case 1, this case requires much more work to reduce
// ...the argument, the subsequent calculation needed for
// ...any of the trigonometric function is very little because
-// ...|alpha| < 1.01*2^(-33) and thus two terms of the
+// ...|alpha| < 1.01*2^(-33) and thus two terms of the
// ...Taylor series expansion suffices.
//
// If i_1 = 0 then
-// poly := c + S_1 * r * r * r ...any order
+// poly := c + S_1 * r * r * r ...any order
// result := r
// Else
// poly := -2^(-67)
// result := 1.0
// Endif
-//
+//
// If i_0 = 1, result := -result
//
// Last operation. Perform in user-set rounding mode
//
// result := (i_0 == 0? result + poly :
// result - poly )
-//
+//
// Return
//
-//
+//
// Step 8. Pre-reduction of large arguments.
-//
+//
// ...Again, the following reduction procedure was described
// ...in the separate write up for argument reduction, which
// ...is tightly integrated here.
@@ -476,13 +481,13 @@
// N_0 := Arg * Inv_P_0
// N_0_fix := fcvt.fx( N_0 )
// N_0 := fcvt.xf( N_0_fix)
-
+
// Arg' := Arg - N_0 * P_0
// w := N_0 * d_1
// N := Arg' * two_by_PI
// N_fix := fcvt.fx( N )
// N := fcvt.xf( N_fix )
-// N_fix := N_fix + N_inc
+// N_fix := N_fix + N_inc
//
// s := Arg' - N * P_1
// w := w - N * P_2
@@ -494,15 +499,15 @@
// Endif
//
// Step 9. Case_4_reduce.
-//
+//
// ...first obtain N_0*d_1 and -N*P_2 accurately
-// U_hi := N_0 * d_1 V_hi := -N*P_2
-// U_lo := N_0 * d_1 - U_hi V_lo := -N*P_2 - U_hi ...FMAs
+// U_hi := N_0 * d_1 V_hi := -N*P_2
+// U_lo := N_0 * d_1 - U_hi V_lo := -N*P_2 - V_hi ...FMAs
//
// ...compute the contribution from N_0*d_1 and -N*P_3
// w := -N*P_3
// w := w + N_0*d_2
-// t := U_lo + V_lo + w ...any order
+// t := U_lo + V_lo + w ...any order
//
// ...at this point, the mathematical value
// ...s + U_hi + V_hi + t approximates the true reduced argument
@@ -517,12 +522,12 @@
// endif
// ...order in computing "a" must be observed. This branch is
// ...best implemented by predicates.
-// ...A + a is U_hi + V_hi accurately. Moreover, "a" is
+// ...A + a is U_hi + V_hi accurately. Moreover, "a" is
// ...much smaller than A: |a| <= (1/2)ulp(A).
//
// ...Just need to calculate s + A + a + t
-// C_hi := s + A t := t + a
-// C_lo := (s - C_hi) + A
+// C_hi := s + A t := t + a
+// C_lo := (s - C_hi) + A
// C_lo := C_lo + t
//
// ...Final steps for reduction
@@ -548,156 +553,191 @@
// result := (i_0 == 0? result + poly :
// result - poly )
// Return
-//
+//
// Large Arguments: For arguments above 2**63, a Payne-Hanek
// style argument reduction is used and pi_by_2 reduce is called.
//
-#include "libm_support.h"
-
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
-.align 64
-
-FSINCOSL_CONSTANTS:
-ASM_TYPE_DIRECTIVE(FSINCOSL_CONSTANTS,@object)
-data4 0x4B800000, 0xCB800000, 0x00000000,0x00000000 // two**24, -two**24
-data4 0x4E44152A, 0xA2F9836E, 0x00003FFE,0x00000000 // Inv_pi_by_2
-data4 0xCE81B9F1, 0xC84D32B0, 0x00004016,0x00000000 // P_0
-data4 0x2168C235, 0xC90FDAA2, 0x00003FFF,0x00000000 // P_1
-data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD,0x00000000 // P_2
-data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C,0x00000000 // P_3
-data4 0x5F000000, 0xDF000000, 0x00000000,0x00000000 // two_to_63, -two_to_63
-data4 0x6EC6B45A, 0xA397E504, 0x00003FE7,0x00000000 // Inv_P_0
-data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF,0x00000000 // d_1
-data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C,0x00000000 // d_2
-data4 0x2168C234, 0xC90FDAA2, 0x00003FFE,0x00000000 // pi_by_4
-data4 0x2168C234, 0xC90FDAA2, 0x0000BFFE,0x00000000 // neg_pi_by_4
-data4 0x3E000000, 0xBE000000, 0x00000000,0x00000000 // two**-3, -two**-3
-data4 0x2F000000, 0xAF000000, 0x9E000000,0x00000000 // two**-33, -two**-33, -two**-67
-data4 0xA21C0BC9, 0xCC8ABEBC, 0x00003FCE,0x00000000 // PP_8
-data4 0x720221DA, 0xD7468A05, 0x0000BFD6,0x00000000 // PP_7
-data4 0x640AD517, 0xB092382F, 0x00003FDE,0x00000000 // PP_6
-data4 0xD1EB75A4, 0xD7322B47, 0x0000BFE5,0x00000000 // PP_5
-data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1
-data4 0x00000000, 0xAAAA0000, 0x0000BFFC,0x00000000 // PP_1_hi
-data4 0xBAF69EEA, 0xB8EF1D2A, 0x00003FEC,0x00000000 // PP_4
-data4 0x0D03BB69, 0xD00D00D0, 0x0000BFF2,0x00000000 // PP_3
-data4 0x88888962, 0x88888888, 0x00003FF8,0x00000000 // PP_2
-data4 0xAAAB0000, 0xAAAAAAAA, 0x0000BFEC,0x00000000 // PP_1_lo
-data4 0xC2B0FE52, 0xD56232EF, 0x00003FD2,0x00000000 // QQ_8
-data4 0x2B48DCA6, 0xC9C99ABA, 0x0000BFDA,0x00000000 // QQ_7
-data4 0x9C716658, 0x8F76C650, 0x00003FE2,0x00000000 // QQ_6
-data4 0xFDA8D0FC, 0x93F27DBA, 0x0000BFE9,0x00000000 // QQ_5
-data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC,0x00000000 // S_1
-data4 0x00000000, 0x80000000, 0x0000BFFE,0x00000000 // QQ_1
-data4 0x0C6E5041, 0xD00D00D0, 0x00003FEF,0x00000000 // QQ_4
-data4 0x0B607F60, 0xB60B60B6, 0x0000BFF5,0x00000000 // QQ_3
-data4 0xAAAAAA9B, 0xAAAAAAAA, 0x00003FFA,0x00000000 // QQ_2
-data4 0xFFFFFFFE, 0xFFFFFFFF, 0x0000BFFD,0x00000000 // C_1
-data4 0xAAAA719F, 0xAAAAAAAA, 0x00003FFA,0x00000000 // C_2
-data4 0x0356F994, 0xB60B60B6, 0x0000BFF5,0x00000000 // C_3
-data4 0xB2385EA9, 0xD00CFFD5, 0x00003FEF,0x00000000 // C_4
-data4 0x292A14CD, 0x93E4BD18, 0x0000BFE9,0x00000000 // C_5
-data4 0xAAAAAAAA, 0xAAAAAAAA, 0x0000BFFC,0x00000000 // S_1
-data4 0x888868DB, 0x88888888, 0x00003FF8,0x00000000 // S_2
-data4 0x055EFD4B, 0xD00D00D0, 0x0000BFF2,0x00000000 // S_3
-data4 0x839730B9, 0xB8EF1C5D, 0x00003FEC,0x00000000 // S_4
-data4 0xE5B3F492, 0xD71EA3A4, 0x0000BFE5,0x00000000 // S_5
-data4 0x38800000, 0xB8800000, 0x00000000 // two**-14, -two**-14
-ASM_SIZE_DIRECTIVE(FSINCOSL_CONSTANTS)
-
-FR_Input_X = f8
-FR_Neg_Two_to_M3 = f32
-FR_Two_to_63 = f32
-FR_Two_to_24 = f33
-FR_Pi_by_4 = f33
-FR_Two_to_M14 = f34
-FR_Two_to_M33 = f35
-FR_Neg_Two_to_24 = f36
-FR_Neg_Pi_by_4 = f36
-FR_Neg_Two_to_M14 = f37
-FR_Neg_Two_to_M33 = f38
-FR_Neg_Two_to_M67 = f39
-FR_Inv_pi_by_2 = f40
-FR_N_float = f41
-FR_N_fix = f42
-FR_P_1 = f43
-FR_P_2 = f44
-FR_P_3 = f45
-FR_s = f46
-FR_w = f47
-FR_c = f48
-FR_r = f49
-FR_Z = f50
-FR_A = f51
-FR_a = f52
-FR_t = f53
-FR_U_1 = f54
-FR_U_2 = f55
-FR_C_1 = f56
-FR_C_2 = f57
-FR_C_3 = f58
-FR_C_4 = f59
-FR_C_5 = f60
-FR_S_1 = f61
-FR_S_2 = f62
-FR_S_3 = f63
-FR_S_4 = f64
-FR_S_5 = f65
-FR_poly_hi = f66
-FR_poly_lo = f67
-FR_r_hi = f68
-FR_r_lo = f69
-FR_rsq = f70
-FR_r_cubed = f71
-FR_C_hi = f72
-FR_N_0 = f73
-FR_d_1 = f74
-FR_V = f75
-FR_V_hi = f75
-FR_V_lo = f76
-FR_U_hi = f77
-FR_U_lo = f78
-FR_U_hiabs = f79
-FR_V_hiabs = f80
-FR_PP_8 = f81
-FR_QQ_8 = f81
-FR_PP_7 = f82
-FR_QQ_7 = f82
-FR_PP_6 = f83
-FR_QQ_6 = f83
-FR_PP_5 = f84
-FR_QQ_5 = f84
-FR_PP_4 = f85
-FR_QQ_4 = f85
-FR_PP_3 = f86
-FR_QQ_3 = f86
-FR_PP_2 = f87
-FR_QQ_2 = f87
-FR_QQ_1 = f88
-FR_N_0_fix = f89
-FR_Inv_P_0 = f90
-FR_corr = f91
-FR_poly = f92
-FR_d_2 = f93
-FR_Two_to_M3 = f94
-FR_Neg_Two_to_63 = f94
-FR_P_0 = f95
-FR_C_lo = f96
-FR_PP_1 = f97
-FR_PP_1_lo = f98
-FR_ArgPrime = f99
-
-GR_Table_Base = r32
-GR_Table_Base1 = r33
-GR_i_0 = r34
-GR_i_1 = r35
-GR_N_Inc = r36
-GR_Sin_or_Cos = r37
+
+RODATA
+.align 16
+
+LOCAL_OBJECT_START(FSINCOSL_CONSTANTS)
+
+sincosl_table_p:
+data8 0xA2F9836E4E44152A, 0x00003FFE // Inv_pi_by_2
+data8 0xC84D32B0CE81B9F1, 0x00004016 // P_0
+data8 0xC90FDAA22168C235, 0x00003FFF // P_1
+data8 0xECE675D1FC8F8CBB, 0x0000BFBD // P_2
+data8 0xB7ED8FBBACC19C60, 0x0000BF7C // P_3
+data8 0x8D848E89DBD171A1, 0x0000BFBF // d_1
+data8 0xD5394C3618A66F8E, 0x0000BF7C // d_2
+LOCAL_OBJECT_END(FSINCOSL_CONSTANTS)
+
+LOCAL_OBJECT_START(sincosl_table_d)
+data8 0xC90FDAA22168C234, 0x00003FFE // pi_by_4
+data8 0xA397E5046EC6B45A, 0x00003FE7 // Inv_P_0
+data4 0x3E000000, 0xBE000000 // 2^-3 and -2^-3
+data4 0x2F000000, 0xAF000000 // 2^-33 and -2^-33
+data4 0x9E000000, 0x00000000 // -2^-67
+data4 0x00000000, 0x00000000 // pad
+LOCAL_OBJECT_END(sincosl_table_d)
+
+LOCAL_OBJECT_START(sincosl_table_pp)
+data8 0xCC8ABEBCA21C0BC9, 0x00003FCE // PP_8
+data8 0xD7468A05720221DA, 0x0000BFD6 // PP_7
+data8 0xB092382F640AD517, 0x00003FDE // PP_6
+data8 0xD7322B47D1EB75A4, 0x0000BFE5 // PP_5
+data8 0xFFFFFFFFFFFFFFFE, 0x0000BFFD // C_1
+data8 0xAAAA000000000000, 0x0000BFFC // PP_1_hi
+data8 0xB8EF1D2ABAF69EEA, 0x00003FEC // PP_4
+data8 0xD00D00D00D03BB69, 0x0000BFF2 // PP_3
+data8 0x8888888888888962, 0x00003FF8 // PP_2
+data8 0xAAAAAAAAAAAB0000, 0x0000BFEC // PP_1_lo
+LOCAL_OBJECT_END(sincosl_table_pp)
+
+LOCAL_OBJECT_START(sincosl_table_qq)
+data8 0xD56232EFC2B0FE52, 0x00003FD2 // QQ_8
+data8 0xC9C99ABA2B48DCA6, 0x0000BFDA // QQ_7
+data8 0x8F76C6509C716658, 0x00003FE2 // QQ_6
+data8 0x93F27DBAFDA8D0FC, 0x0000BFE9 // QQ_5
+data8 0xAAAAAAAAAAAAAAAA, 0x0000BFFC // S_1
+data8 0x8000000000000000, 0x0000BFFE // QQ_1
+data8 0xD00D00D00C6E5041, 0x00003FEF // QQ_4
+data8 0xB60B60B60B607F60, 0x0000BFF5 // QQ_3
+data8 0xAAAAAAAAAAAAAA9B, 0x00003FFA // QQ_2
+LOCAL_OBJECT_END(sincosl_table_qq)
+
+LOCAL_OBJECT_START(sincosl_table_c)
+data8 0xFFFFFFFFFFFFFFFE, 0x0000BFFD // C_1
+data8 0xAAAAAAAAAAAA719F, 0x00003FFA // C_2
+data8 0xB60B60B60356F994, 0x0000BFF5 // C_3
+data8 0xD00CFFD5B2385EA9, 0x00003FEF // C_4
+data8 0x93E4BD18292A14CD, 0x0000BFE9 // C_5
+LOCAL_OBJECT_END(sincosl_table_c)
+
+LOCAL_OBJECT_START(sincosl_table_s)
+data8 0xAAAAAAAAAAAAAAAA, 0x0000BFFC // S_1
+data8 0x88888888888868DB, 0x00003FF8 // S_2
+data8 0xD00D00D0055EFD4B, 0x0000BFF2 // S_3
+data8 0xB8EF1C5D839730B9, 0x00003FEC // S_4
+data8 0xD71EA3A4E5B3F492, 0x0000BFE5 // S_5
+data4 0x38800000, 0xB8800000 // two**-14 and -two**-14
+LOCAL_OBJECT_END(sincosl_table_s)
+
+FR_Input_X = f8
+FR_Result = f8
+
+FR_r = f8
+FR_c = f9
+
+FR_norm_x = f9
+FR_inv_pi_2to63 = f10
+FR_rshf_2to64 = f11
+FR_2tom64 = f12
+FR_rshf = f13
+FR_N_float_signif = f14
+FR_abs_x = f15
+FR_Pi_by_4 = f34
+FR_Two_to_M14 = f35
+FR_Neg_Two_to_M14 = f36
+FR_Two_to_M33 = f37
+FR_Neg_Two_to_M33 = f38
+FR_Neg_Two_to_M67 = f39
+FR_Inv_pi_by_2 = f40
+FR_N_float = f41
+FR_N_fix = f42
+FR_P_1 = f43
+FR_P_2 = f44
+FR_P_3 = f45
+FR_s = f46
+FR_w = f47
+FR_d_2 = f48
+FR_tmp_result = f49
+FR_Z = f50
+FR_A = f51
+FR_a = f52
+FR_t = f53
+FR_U_1 = f54
+FR_U_2 = f55
+FR_C_1 = f56
+FR_C_2 = f57
+FR_C_3 = f58
+FR_C_4 = f59
+FR_C_5 = f60
+FR_S_1 = f61
+FR_S_2 = f62
+FR_S_3 = f63
+FR_S_4 = f64
+FR_S_5 = f65
+FR_poly_hi = f66
+FR_poly_lo = f67
+FR_r_hi = f68
+FR_r_lo = f69
+FR_rsq = f70
+FR_r_cubed = f71
+FR_C_hi = f72
+FR_N_0 = f73
+FR_d_1 = f74
+FR_V = f75
+FR_V_hi = f75
+FR_V_lo = f76
+FR_U_hi = f77
+FR_U_lo = f78
+FR_U_hiabs = f79
+FR_V_hiabs = f80
+FR_PP_8 = f81
+FR_QQ_8 = f101
+FR_PP_7 = f82
+FR_QQ_7 = f102
+FR_PP_6 = f83
+FR_QQ_6 = f103
+FR_PP_5 = f84
+FR_QQ_5 = f104
+FR_PP_4 = f85
+FR_QQ_4 = f105
+FR_PP_3 = f86
+FR_QQ_3 = f106
+FR_PP_2 = f87
+FR_QQ_2 = f107
+FR_QQ_1 = f108
+FR_r_hi_sq = f88
+FR_N_0_fix = f89
+FR_Inv_P_0 = f90
+FR_corr = f91
+FR_poly = f92
+FR_Neg_Two_to_M3 = f93
+FR_Two_to_M3 = f94
+FR_P_0 = f95
+FR_C_lo = f96
+FR_PP_1 = f97
+FR_PP_1_lo = f98
+FR_ArgPrime = f99
+FR_inexact = f100
+
+GR_sig_inv_pi = r14
+GR_rshf_2to64 = r15
+GR_exp_2tom64 = r16
+GR_rshf = r17
+GR_ad_p = r18
+GR_ad_d = r19
+GR_ad_pp = r20
+GR_ad_qq = r21
+GR_ad_c = r22
+GR_ad_s = r23
+GR_ad_ce = r24
+GR_ad_se = r25
+GR_ad_m14 = r26
+GR_ad_s1 = r27
+GR_exp_m2_to_m3= r36
+GR_N_Inc = r37
+GR_Sin_or_Cos = r38
+GR_signexp_x = r40
+GR_exp_x = r40
+GR_exp_mask = r41
+GR_exp_2_to_63 = r42
+GR_exp_2_to_m3 = r43
+GR_exp_2_to_24 = r44
// Added for unwind support
@@ -706,386 +746,376 @@ GR_SAVE_GP = r40
GR_SAVE_PFS = r41
-.global sinl#
-.global cosl#
-#ifdef _LIBC
-.global __sinl#
-.global __cosl#
-#endif
-
.section .text
-.proc sinl#
-#ifdef _LIBC
-.proc __sinl#
-#endif
-.align 64
-sinl:
-#ifdef _LIBC
-__sinl:
-#endif
+
+GLOBAL_IEEE754_ENTRY(sinl)
{ .mlx
-alloc GR_Table_Base = ar.pfs,0,12,2,0
-(p0) movl GR_Sin_or_Cos = 0x0 ;;
+ alloc r32 = ar.pfs,0,12,2,0
+ movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi
}
-
-{ .mmi
- nop.m 999
-(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp
- nop.i 999
+{ .mlx
+ mov GR_Sin_or_Cos = 0x0
+ movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64)
}
;;
-{ .mmb
- ld8 GR_Table_Base = [GR_Table_Base]
+{ .mfi
+ addl GR_ad_p = @ltoff(FSINCOSL_CONSTANTS#), gp
+ fclass.m p6, p0 = FR_Input_X, 0x1E3 // Test x natval, nan, inf
+ mov GR_exp_2_to_m3 = 0xffff - 3 // Exponent of 2^-3
+}
+{ .mfb
nop.m 999
-(p0) br.cond.sptk L(SINCOSL_CONTINUE) ;;
+ fnorm.s1 FR_norm_x = FR_Input_X // Normalize x
+ br.cond.sptk SINCOSL_CONTINUE
}
;;
-
-.endp sinl#
-ASM_SIZE_DIRECTIVE(sinl#)
-
-.section .text
-.proc cosl#
-cosl:
-#ifdef _LIBC
-.proc __cosl#
-__cosl:
-#endif
+GLOBAL_IEEE754_END(sinl)
+GLOBAL_IEEE754_ENTRY(cosl)
+{ .mlx
+ alloc r32 = ar.pfs,0,12,2,0
+ movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi
+}
{ .mlx
-alloc GR_Table_Base= ar.pfs,0,12,2,0
-(p0) movl GR_Sin_or_Cos = 0x1 ;;
+ mov GR_Sin_or_Cos = 0x1
+ movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64)
}
;;
-{ .mmi
+{ .mfi
+ addl GR_ad_p = @ltoff(FSINCOSL_CONSTANTS#), gp
+ fclass.m p6, p0 = FR_Input_X, 0x1E3 // Test x natval, nan, inf
+ mov GR_exp_2_to_m3 = 0xffff - 3 // Exponent of 2^-3
+}
+{ .mfi
nop.m 999
-(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp
+ fnorm.s1 FR_norm_x = FR_Input_X // Normalize x
nop.i 999
}
;;
-{ .mmb
- ld8 GR_Table_Base = [GR_Table_Base]
- nop.m 999
- nop.b 999
+SINCOSL_CONTINUE:
+{ .mfi
+ setf.sig FR_inv_pi_2to63 = GR_sig_inv_pi // Form 1/pi * 2^63
+ nop.f 999
+ mov GR_exp_2tom64 = 0xffff - 64 // Scaling constant to compute N
+}
+{ .mlx
+ setf.d FR_rshf_2to64 = GR_rshf_2to64 // Form const 1.1000 * 2^(63+64)
+ movl GR_rshf = 0x43e8000000000000 // Form const 1.1000 * 2^63
}
;;
+{ .mfi
+ ld8 GR_ad_p = [GR_ad_p] // Point to Inv_pi_by_2
+ fclass.m p7, p0 = FR_Input_X, 0x0b // Test x denormal
+ nop.i 999
+}
+;;
-
-//
-// Load Table Address
-//
-
-L(SINCOSL_CONTINUE):
-{ .mmi
-(p0) add GR_Table_Base1 = 96, GR_Table_Base
-(p0) ldfs FR_Two_to_24 = [GR_Table_Base], 4
-// GR_Sin_or_Cos denotes
-(p0) mov r39 = b0 ;;
+{ .mfi
+ getf.exp GR_signexp_x = FR_Input_X // Get sign and exponent of x
+ fclass.m p10, p0 = FR_Input_X, 0x007 // Test x zero
+ nop.i 999
}
-{ .mmi
- nop.m 0
-//
-// Load 2**24, load 2**63.
-//
-(p0) ldfs FR_Neg_Two_to_24 = [GR_Table_Base], 12
- nop.i 0
+{ .mib
+ mov GR_exp_mask = 0x1ffff // Exponent mask
+ nop.i 999
+(p6) br.cond.spnt SINCOSL_SPECIAL // Branch if x natval, nan, inf
}
+;;
+
{ .mfi
-(p0) ldfs FR_Two_to_63 = [GR_Table_Base1], 4
-//
-// Check for unnormals - unsupported operands. We do not want
-// to generate denormal exception
-// Check for NatVals, QNaNs, SNaNs, +/-Infs
-// Check for EM unsupporteds
-// Check for Zero
-//
-(p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3
- nop.i 0
-};;
-{ .mmf
- nop.m 999
-(p0) ldfs FR_Neg_Two_to_63 = [GR_Table_Base1], 12
-(p0) fclass.nm.unc p8, p0 = FR_Input_X, 0x1FF
-}
-{ .mfb
- nop.m 999
-(p0) fclass.m.unc p10, p0 = FR_Input_X, 0x007
-(p6) br.cond.spnt L(SINCOSL_SPECIAL) ;;
+ setf.exp FR_2tom64 = GR_exp_2tom64 // Form 2^-64 for scaling N_float
+ nop.f 0
+ add GR_ad_d = 0x70, GR_ad_p // Point to constant table d
}
{ .mib
- nop.m 999
- nop.i 999
-(p8) br.cond.spnt L(SINCOSL_SPECIAL) ;;
+ setf.d FR_rshf = GR_rshf // Form right shift const 1.1000 * 2^63
+ mov GR_exp_m2_to_m3 = 0x2fffc // Form -(2^-3)
+(p7) br.cond.spnt SINCOSL_DENORMAL // Branch if x denormal
}
-{ .mib
- nop.m 999
- nop.i 999
-//
-// Branch if +/- NaN, Inf.
-// Load -2**24, load -2**63.
-//
-(p10) br.cond.spnt L(SINCOSL_ZERO) ;;
+;;
+
+SINCOSL_COMMON:
+{ .mfi
+ and GR_exp_x = GR_exp_mask, GR_signexp_x // Get exponent of x
+ fclass.nm p8, p0 = FR_Input_X, 0x1FF // Test x unsupported type
+ mov GR_exp_2_to_63 = 0xffff + 63 // Exponent of 2^63
}
-{ .mmb
-(p0) ldfe FR_Inv_pi_by_2 = [GR_Table_Base], 16
-(p0) ldfe FR_Inv_P_0 = [GR_Table_Base1], 16
- nop.b 999 ;;
+{ .mib
+ add GR_ad_pp = 0x40, GR_ad_d // Point to constant table pp
+ mov GR_exp_2_to_24 = 0xffff + 24 // Exponent of 2^24
+(p10) br.cond.spnt SINCOSL_ZERO // Branch if x zero
}
-{ .mmb
-(p0) ldfe FR_d_1 = [GR_Table_Base1], 16
-//
-// Raise possible denormal operand flag with useful fcmp
-// Is x <= -2**63
-// Load Inv_P_0 for pre-reduction
-// Load Inv_pi_by_2
-//
-(p0) ldfe FR_P_0 = [GR_Table_Base], 16
- nop.b 999 ;;
+;;
+
+{ .mfi
+ ldfe FR_Inv_pi_by_2 = [GR_ad_p], 16 // Load 2/pi
+ fcmp.eq.s0 p15, p0 = FR_Input_X, f0 // Dummy to set denormal
+ add GR_ad_qq = 0xa0, GR_ad_pp // Point to constant table qq
}
-{ .mmb
-(p0) ldfe FR_d_2 = [GR_Table_Base1], 16
-//
-// Load P_0
-// Load d_1
-// Is x >= 2**63
-// Is x <= -2**24?
-//
-(p0) ldfe FR_P_1 = [GR_Table_Base], 16
- nop.b 999 ;;
+{ .mfi
+ ldfe FR_Pi_by_4 = [GR_ad_d], 16 // Load pi/4 for range test
+ nop.f 999
+ cmp.ge p10,p0 = GR_exp_x, GR_exp_2_to_63 // Is |x| >= 2^63
}
-//
-// Load P_1
-// Load d_2
-// Is x >= 2**24?
-//
+;;
+
{ .mfi
-(p0) ldfe FR_P_2 = [GR_Table_Base], 16
-(p0) fcmp.le.unc.s1 p7, p8 = FR_Input_X, FR_Neg_Two_to_24
- nop.i 999 ;;
+ ldfe FR_P_0 = [GR_ad_p], 16 // Load P_0 for pi/4 <= |x| < 2^63
+ fmerge.s FR_abs_x = f1, FR_norm_x // |x|
+ add GR_ad_c = 0x90, GR_ad_qq // Point to constant table c
}
-{ .mbb
-(p0) ldfe FR_P_3 = [GR_Table_Base], 16
- nop.b 999
- nop.b 999 ;;
+{ .mfi
+ ldfe FR_Inv_P_0 = [GR_ad_d], 16 // Load 1/P_0 for pi/4 <= |x| < 2^63
+ nop.f 999
+ cmp.ge p7,p0 = GR_exp_x, GR_exp_2_to_24 // Is |x| >= 2^24
}
+;;
+
{ .mfi
- nop.m 999
-(p8) fcmp.ge.s1 p7, p0 = FR_Input_X, FR_Two_to_24
- nop.i 999
+ ldfe FR_P_1 = [GR_ad_p], 16 // Load P_1 for pi/4 <= |x| < 2^63
+ nop.f 999
+ add GR_ad_s = 0x50, GR_ad_c // Point to constant table s
}
{ .mfi
-(p0) ldfe FR_Pi_by_4 = [GR_Table_Base1], 16
-//
-// Branch if +/- zero.
-// Decide about the paths to take:
-// If -2**24 < FR_Input_X < 2**24 - CASE 1 OR 2
-// OTHERWISE - CASE 3 OR 4
-//
-(p0) fcmp.le.unc.s0 p10, p11 = FR_Input_X, FR_Neg_Two_to_63
- nop.i 999 ;;
+ ldfe FR_PP_8 = [GR_ad_pp], 16 // Load PP_8 for 2^-3 < |r| < pi/4
+ nop.f 999
+ nop.i 999
}
-{ .mmi
-(p0) ldfe FR_Neg_Pi_by_4 = [GR_Table_Base1], 16 ;;
-(p0) ldfs FR_Two_to_M3 = [GR_Table_Base1], 4
- nop.i 999
+;;
+
+{ .mfi
+ ldfe FR_P_2 = [GR_ad_p], 16 // Load P_2 for pi/4 <= |x| < 2^63
+ nop.f 999
+ add GR_ad_ce = 0x40, GR_ad_c // Point to end of constant table c
}
{ .mfi
- nop.m 999
-(p11) fcmp.ge.s1 p10, p0 = FR_Input_X, FR_Two_to_63
- nop.i 999 ;;
+ ldfe FR_QQ_8 = [GR_ad_qq], 16 // Load QQ_8 for 2^-3 < |r| < pi/4
+ nop.f 999
+ nop.i 999
}
-{ .mib
-(p0) ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1], 12
- nop.i 999
-//
-// Load P_2
-// Load P_3
-// Load pi_by_4
-// Load neg_pi_by_4
-// Load 2**(-3)
-// Load -2**(-3).
-//
-(p10) br.cond.spnt L(SINCOSL_ARG_TOO_LARGE) ;;
+;;
+
+{ .mfi
+ ldfe FR_QQ_7 = [GR_ad_qq], 16 // Load QQ_7 for 2^-3 < |r| < pi/4
+ fma.s1 FR_N_float_signif = FR_Input_X, FR_inv_pi_2to63, FR_rshf_2to64
+ add GR_ad_se = 0x40, GR_ad_s // Point to end of constant table s
}
{ .mib
- nop.m 999
- nop.i 999
-//
-// Branch out if x >= 2**63. Use Payne-Hanek Reduction
-//
-(p7) br.cond.spnt L(SINCOSL_LARGER_ARG) ;;
+ ldfe FR_PP_7 = [GR_ad_pp], 16 // Load PP_7 for 2^-3 < |r| < pi/4
+ mov GR_ad_s1 = GR_ad_s // Save pointer to S_1
+(p10) br.cond.spnt SINCOSL_ARG_TOO_LARGE // Branch if |x| >= 2^63
+ // Use Payne-Hanek Reduction
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Branch if Arg <= -2**24 or Arg >= 2**24 and use pre-reduction.
-//
-(p0) fma.s1 FR_N_float = FR_Input_X, FR_Inv_pi_by_2, f0
- nop.i 999 ;;
+ ldfe FR_P_3 = [GR_ad_p], 16 // Load P_3 for pi/4 <= |x| < 2^63
+ fmerge.se FR_r = FR_norm_x, FR_norm_x // r = x, in case |x| < pi/4
+ add GR_ad_m14 = 0x50, GR_ad_s // Point to constant table m14
}
-{ .mfi
- nop.m 999
-(p0) fcmp.lt.unc.s1 p6, p7 = FR_Input_X, FR_Pi_by_4
- nop.i 999 ;;
+{ .mfb
+ ldfps FR_Two_to_M3, FR_Neg_Two_to_M3 = [GR_ad_d], 8
+ fma.s1 FR_rsq = FR_norm_x, FR_norm_x, f0 // rsq = x*x, in case |x| < pi/4
+(p7) br.cond.spnt SINCOSL_LARGER_ARG // Branch if 2^24 <= |x| < 2^63
+ // Use pre-reduction
+}
+;;
+
+{ .mmf
+ ldfe FR_PP_6 = [GR_ad_pp], 16 // Load PP_6 for normal path
+ ldfe FR_QQ_6 = [GR_ad_qq], 16 // Load QQ_6 for normal path
+ fmerge.se FR_c = f0, f0 // c = 0 in case |x| < pi/4
}
+;;
+
+{ .mmf
+ ldfe FR_PP_5 = [GR_ad_pp], 16 // Load PP_5 for normal path
+ ldfe FR_QQ_5 = [GR_ad_qq], 16 // Load QQ_5 for normal path
+ nop.f 999
+}
+;;
+
+// Here if 0 < |x| < 2^24
{ .mfi
- nop.m 999
-//
-// Select the case when |Arg| < pi/4
-// Else Select the case when |Arg| >= pi/4
-//
-(p0) fcvt.fx.s1 FR_N_fix = FR_N_float
- nop.i 999 ;;
+ ldfe FR_S_5 = [GR_ad_se], -16 // Load S_5 if i_1=0
+ fcmp.lt.s1 p6, p7 = FR_abs_x, FR_Pi_by_4 // Test |x| < pi/4
+ nop.i 999
}
{ .mfi
- nop.m 999
+ ldfe FR_C_5 = [GR_ad_ce], -16 // Load C_5 if i_1=1
+ fms.s1 FR_N_float = FR_N_float_signif, FR_2tom64, FR_rshf
+ nop.i 999
+}
+;;
+
+{ .mmi
+ ldfe FR_S_4 = [GR_ad_se], -16 // Load S_4 if i_1=0
+ ldfe FR_C_4 = [GR_ad_ce], -16 // Load C_4 if i_1=1
+ nop.i 999
+}
+;;
+
//
// N = Arg * 2/pi
// Check if Arg < pi/4
//
-(p6) fcmp.gt.s1 p6, p7 = FR_Input_X, FR_Neg_Pi_by_4
- nop.i 999 ;;
-}
//
// Case 2: Convert integer N_fix back to normalized floating-point value.
// Case 1: p8 is only affected when p6 is set
//
-{ .mfi
-(p7) ldfs FR_Two_to_M33 = [GR_Table_Base1], 4
//
// Grab the integer part of N and call it N_fix
//
-(p6) fmerge.se FR_r = FR_Input_X, FR_Input_X
-// If |x| < pi/4, r = x and c = 0
+{ .mfi
+(p7) ldfps FR_Two_to_M33, FR_Neg_Two_to_M33 = [GR_ad_d], 8
+(p6) fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 // r^3 if |x| < pi/4
+(p6) mov GR_N_Inc = GR_Sin_or_Cos // N_Inc if |x| < pi/4
+}
+;;
+
+// If |x| < pi/4, r = x and c = 0
// lf |x| < pi/4, is x < 2**(-3).
-// r = Arg
+// r = Arg
// c = 0
-(p6) mov GR_N_Inc = GR_Sin_or_Cos ;;
-}
-{ .mmf
- nop.m 999
-(p7) ldfs FR_Neg_Two_to_M33 = [GR_Table_Base1], 4
-(p6) fmerge.se FR_c = f0, f0
-}
-{ .mfi
- nop.m 999
-(p6) fcmp.lt.unc.s1 p8, p9 = FR_Input_X, FR_Two_to_M3
- nop.i 999 ;;
+{ .mmi
+(p7) getf.sig GR_N_Inc = FR_N_float_signif
+(p6) cmp.lt.unc p8,p0 = GR_exp_x, GR_exp_2_to_m3 // Is |x| < 2^-3
+(p6) tbit.z p9,p10 = GR_N_Inc, 0 // p9 if i_1=0, N mod 4 = 0,1
+ // p10 if i_1=1, N mod 4 = 2,3
}
-{ .mfi
- nop.m 999
+;;
+
//
// lf |x| < pi/4, is -2**(-3)< x < 2**(-3) - set p8.
-// If |x| >= pi/4,
-// Create the right N for |x| < pi/4 and otherwise
+// If |x| >= pi/4,
+// Create the right N for |x| < pi/4 and otherwise
// Case 2: Place integer part of N in GP register
//
-(p7) fcvt.xf FR_N_float = FR_N_fix
- nop.i 999 ;;
-}
-{ .mmf
- nop.m 999
-(p7) getf.sig GR_N_Inc = FR_N_fix
-(p8) fcmp.gt.s1 p8, p0 = FR_Input_X, FR_Neg_Two_to_M3 ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-//
-// Load 2**(-33), -2**(-33)
-//
-(p8) br.cond.spnt L(SINCOSL_SMALL_R) ;;
+
+
+{ .mbb
+ nop.m 999
+(p8) br.cond.spnt SINCOSL_SMALL_R_0 // Branch if 0 < |x| < 2^-3
+(p6) br.cond.spnt SINCOSL_NORMAL_R_0 // Branch if 2^-3 <= |x| < pi/4
}
-{ .mib
- nop.m 999
- nop.i 999
-(p6) br.cond.sptk L(SINCOSL_NORMAL_R) ;;
+;;
+
+// Here if pi/4 <= |x| < 2^24
+{ .mfi
+ ldfs FR_Neg_Two_to_M67 = [GR_ad_d], 8 // Load -2^-67
+ fnma.s1 FR_s = FR_N_float, FR_P_1, FR_Input_X // s = -N * P_1 + Arg
+ add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos // Adjust N_Inc for sin/cos
}
-//
-// if |x| < pi/4, branch based on |x| < 2**(-3) or otherwise.
-//
-//
-// In this branch, |x| >= pi/4.
-//
{ .mfi
-(p0) ldfs FR_Neg_Two_to_M67 = [GR_Table_Base1], 8
-//
-// Load -2**(-67)
-//
-(p0) fnma.s1 FR_s = FR_N_float, FR_P_1, FR_Input_X
-//
-// w = N * P_2
-// s = -N * P_1 + Arg
-//
-(p0) add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos
+ nop.m 999
+ fma.s1 FR_w = FR_N_float, FR_P_2, f0 // w = N * P_2
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_w = FR_N_float, FR_P_2, f0
- nop.i 999 ;;
+ nop.m 999
+ fms.s1 FR_r = FR_s, f1, FR_w // r = s - w, assume |s| >= 2^-33
+ tbit.z p9,p10 = GR_N_Inc, 0 // p9 if i_1=0, N mod 4 = 0,1
+ // p10 if i_1=1, N mod 4 = 2,3
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Adjust N_fix by N_inc to determine whether sine or
-// cosine is being calculated
-//
-(p0) fcmp.lt.unc.s1 p7, p6 = FR_s, FR_Two_to_M33
- nop.i 999 ;;
+ nop.m 999
+ fcmp.lt.s1 p7, p6 = FR_s, FR_Two_to_M33
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p7) fcmp.gt.s1 p7, p6 = FR_s, FR_Neg_Two_to_M33
- nop.i 999 ;;
+ nop.m 999
+(p7) fcmp.gt.s1 p7, p6 = FR_s, FR_Neg_Two_to_M33 // p6 if |s| >= 2^-33, else p7
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-// Remember x >= pi/4.
-// Is s <= -2**(-33) or s >= 2**(-33) (p6)
-// or -2**(-33) < s < 2**(-33) (p7)
-(p6) fms.s1 FR_r = FR_s, f1, FR_w
- nop.i 999
+ nop.m 999
+ fms.s1 FR_c = FR_s, f1, FR_r // c = s - r, for |s| >= 2^-33
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p7) fma.s1 FR_w = FR_N_float, FR_P_3, f0
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_rsq = FR_r, FR_r, f0 // rsq = r * r, for |s| >= 2^-33
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p7) fma.s1 FR_U_1 = FR_N_float, FR_P_2, FR_w
- nop.i 999
+ nop.m 999
+(p7) fma.s1 FR_w = FR_N_float, FR_P_3, f0
+ nop.i 999
}
+;;
+
+{ .mmf
+(p9) ldfe FR_C_1 = [GR_ad_pp], 16 // Load C_1 if i_1=0
+(p10) ldfe FR_S_1 = [GR_ad_qq], 16 // Load S_1 if i_1=1
+ frcpa.s1 FR_r_hi, p15 = f1, FR_r // r_hi = frcpa(r)
+}
+;;
+
{ .mfi
- nop.m 999
-(p6) fms.s1 FR_c = FR_s, f1, FR_r
- nop.i 999 ;;
+ nop.m 999
+(p6) fcmp.lt.unc.s1 p8, p13 = FR_r, FR_Two_to_M3 // If big s, test r with 2^-3
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// For big s: r = s - w: No futher reduction is necessary
+ nop.m 999
+(p7) fma.s1 FR_U_1 = FR_N_float, FR_P_2, FR_w
+ nop.i 999
+}
+;;
+
+//
+// For big s: r = s - w: No further reduction is necessary
// For small s: w = N * P_3 (change sign) More reduction
//
-(p6) fcmp.lt.unc.s1 p8, p9 = FR_r, FR_Two_to_M3
- nop.i 999 ;;
+{ .mfi
+ nop.m 999
+(p8) fcmp.gt.s1 p8, p13 = FR_r, FR_Neg_Two_to_M3 // If big s, p8 if |r| < 2^-3
+ nop.i 999 ;;
}
+
{ .mfi
- nop.m 999
-(p8) fcmp.gt.s1 p8, p9 = FR_r, FR_Neg_Two_to_M3
- nop.i 999 ;;
+ nop.m 999
+(p9) fma.s1 FR_poly = FR_rsq, FR_PP_8, FR_PP_7 // poly = rsq*PP_8+PP_7 if i_1=0
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
+(p10) fma.s1 FR_poly = FR_rsq, FR_QQ_8, FR_QQ_7 // poly = rsq*QQ_8+QQ_7 if i_1=1
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
(p7) fms.s1 FR_r = FR_s, f1, FR_U_1
- nop.i 999
+ nop.i 999
}
-{ .mfb
- nop.m 999
+;;
+
+{ .mfi
+ nop.m 999
+(p6) fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 // rcubed = r * rsq
+ nop.i 999
+}
+;;
+
+{ .mfi
//
// For big s: Is |r| < 2**(-3)?
// For big s: c = S - r
@@ -1095,355 +1125,356 @@ L(SINCOSL_CONTINUE):
// If p9 is set, prepare to branch to Normal_R.
// For big s, r is complete here.
//
-(p6) fms.s1 FR_c = FR_c, f1, FR_w
-//
+//
// For big s: c = c + w (w has not been negated.)
// For small s: r = S - U_1
//
-(p8) br.cond.spnt L(SINCOSL_SMALL_R) ;;
+ nop.m 999
+(p6) fms.s1 FR_c = FR_c, f1, FR_w
+ nop.i 999
}
-{ .mib
- nop.m 999
- nop.i 999
-(p9) br.cond.sptk L(SINCOSL_NORMAL_R) ;;
+{ .mbb
+ nop.m 999
+(p8) br.cond.spnt SINCOSL_SMALL_R_1 // Branch if |s|>=2^-33, |r| < 2^-3,
+ // and pi/4 <= |x| < 2^24
+(p13) br.cond.sptk SINCOSL_NORMAL_R_1 // Branch if |s|>=2^-33, |r| >= 2^-3,
+ // and pi/4 <= |x| < 2^24
}
-{ .mfi
-(p7) add GR_Table_Base1 = 224, GR_Table_Base1
+;;
+
+SINCOSL_S_TINY:
+//
+// Here if |s| < 2^-33, and pi/4 <= |x| < 2^24
//
-// Branch to SINCOSL_SMALL_R or SINCOSL_NORMAL_R
+{ .mfi
+ fms.s1 FR_U_2 = FR_N_float, FR_P_2, FR_U_1
//
-(p7) fms.s1 FR_U_2 = FR_N_float, FR_P_2, FR_U_1
-//
// c = S - U_1
// r = S_1 * r
//
//
-(p7) extr.u GR_i_1 = GR_N_Inc, 0, 1 ;;
}
+;;
+
{ .mmi
- nop.m 999
+ nop.m 999
//
// Get [i_0,i_1] - two lsb of N_fix_gr.
// Do dummy fmpy so inexact is always set.
//
-(p7) cmp.eq.unc p9, p10 = 0x0, GR_i_1
-(p7) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;;
+ tbit.z p9,p10 = GR_N_Inc, 0 // p9 if i_1=0, N mod 4 = 0,1
+ // p10 if i_1=1, N mod 4 = 2,3
}
-//
+;;
+
+//
// For small s: U_2 = N * P_2 - U_1
// S_1 stored constant - grab the one stored with the
// coefficients.
-//
+//
{ .mfi
-(p7) ldfe FR_S_1 = [GR_Table_Base1], 16
+ ldfe FR_S_1 = [GR_ad_s1], 16
//
// Check if i_1 and i_0 != 0
//
-(p10) fma.s1 FR_poly = f0, f1, FR_Neg_Two_to_M67
-(p7) cmp.eq.unc p11, p12 = 0x0, GR_i_0 ;;
+(p10) fma.s1 FR_poly = f0, f1, FR_Neg_Two_to_M67
+ tbit.z p11,p12 = GR_N_Inc, 1 // p11 if i_0=0, N mod 4 = 0,2
+ // p12 if i_0=1, N mod 4 = 1,3
}
+;;
+
{ .mfi
- nop.m 999
-(p7) fms.s1 FR_s = FR_s, f1, FR_r
- nop.i 999
+ nop.m 999
+ fms.s1 FR_s = FR_s, f1, FR_r
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
+ nop.m 999
+//
// S = S - r
// U_2 = U_2 + w
// load S_1
//
-(p7) fma.s1 FR_rsq = FR_r, FR_r, f0
- nop.i 999 ;;
+ fma.s1 FR_rsq = FR_r, FR_r, f0
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-(p7) fma.s1 FR_U_2 = FR_U_2, f1, FR_w
- nop.i 999
+ nop.m 999
+ fma.s1 FR_U_2 = FR_U_2, f1, FR_w
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p7) fmerge.se FR_Input_X = FR_r, FR_r
- nop.i 999 ;;
+ nop.m 999
+ fmerge.se FR_tmp_result = FR_r, FR_r
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-(p10) fma.s1 FR_Input_X = f0, f1, f1
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 FR_tmp_result = f0, f1, f1
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-//
+ nop.m 999
+//
// FR_rsq = r * r
// Save r as the result.
//
-(p7) fms.s1 FR_c = FR_s, f1, FR_U_1
- nop.i 999 ;;
+ fms.s1 FR_c = FR_s, f1, FR_U_1
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-//
+ nop.m 999
+//
// if ( i_1 ==0) poly = c + S_1*r*r*r
// else Result = 1
//
-(p12) fnma.s1 FR_Input_X = FR_Input_X, f1, f0
- nop.i 999
+(p12) fnma.s1 FR_tmp_result = FR_tmp_result, f1, f0
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p7) fma.s1 FR_r = FR_S_1, FR_r, f0
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_r = FR_S_1, FR_r, f0
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-(p7) fma.s0 FR_S_1 = FR_S_1, FR_S_1, f0
- nop.i 999 ;;
+ nop.m 999
+ fma.s0 FR_S_1 = FR_S_1, FR_S_1, f0
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// If i_1 != 0, poly = 2**(-67)
//
-(p7) fms.s1 FR_c = FR_c, f1, FR_U_2
- nop.i 999 ;;
+ fms.s1 FR_c = FR_c, f1, FR_U_2
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-//
+ nop.m 999
+//
// c = c - U_2
-//
+//
(p9) fma.s1 FR_poly = FR_r, FR_rsq, FR_c
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// i_0 != 0, so Result = -Result
//
-(p11) fma.s0 FR_Input_X = FR_Input_X, f1, FR_poly
- nop.i 999 ;;
+(p11) fma.s0 FR_Result = FR_tmp_result, f1, FR_poly
+ nop.i 999 ;;
}
{ .mfb
- nop.m 999
-(p12) fms.s0 FR_Input_X = FR_Input_X, f1, FR_poly
+ nop.m 999
+(p12) fms.s0 FR_Result = FR_tmp_result, f1, FR_poly
//
// if (i_0 == 0), Result = Result + poly
// else Result = Result - poly
//
-(p0) br.ret.sptk b0 ;;
-}
-L(SINCOSL_LARGER_ARG):
-{ .mfi
- nop.m 999
-(p0) fma.s1 FR_N_0 = FR_Input_X, FR_Inv_P_0, f0
- nop.i 999
+ br.ret.sptk b0 // Exit if |s| < 2^-33, and pi/4 <= |x| < 2^24
}
;;
-// This path for argument > 2*24
-// Adjust table_ptr1 to beginning of table.
+SINCOSL_LARGER_ARG:
//
-
-{ .mmi
- nop.m 999
-(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp
- nop.i 999
-}
-;;
-
-{ .mmi
- ld8 GR_Table_Base = [GR_Table_Base]
- nop.m 999
- nop.i 999
+// Here if 2^24 <= |x| < 2^63
+//
+{ .mfi
+ ldfe FR_d_1 = [GR_ad_p], 16 // Load d_1 for |x| >= 2^24 path
+ fma.s1 FR_N_0 = FR_Input_X, FR_Inv_P_0, f0
+ nop.i 999
}
;;
-
-//
-// Point to 2*-14
+//
// N_0 = Arg * Inv_P_0
//
+// Load values 2**(-14) and -2**(-14)
{ .mmi
-(p0) add GR_Table_Base = 688, GR_Table_Base ;;
-(p0) ldfs FR_Two_to_M14 = [GR_Table_Base], 4
- nop.i 999 ;;
+ ldfps FR_Two_to_M14, FR_Neg_Two_to_M14 = [GR_ad_m14]
+ nop.i 999 ;;
}
{ .mfi
-(p0) ldfs FR_Neg_Two_to_M14 = [GR_Table_Base], 0
- nop.f 999
- nop.i 999 ;;
+ ldfe FR_d_2 = [GR_ad_p], 16 // Load d_2 for |x| >= 2^24 path
+ nop.f 999
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
-// Load values 2**(-14) and -2**(-14)
//
-(p0) fcvt.fx.s1 FR_N_0_fix = FR_N_0
- nop.i 999 ;;
+ fcvt.fx.s1 FR_N_0_fix = FR_N_0
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N_0_fix = integer part of N_0
//
-(p0) fcvt.xf FR_N_0 = FR_N_0_fix
- nop.i 999 ;;
+ fcvt.xf FR_N_0 = FR_N_0_fix
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// Make N_0 the integer part
//
-(p0) fnma.s1 FR_ArgPrime = FR_N_0, FR_P_0, FR_Input_X
- nop.i 999
+ fnma.s1 FR_ArgPrime = FR_N_0, FR_P_0, FR_Input_X
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_w = FR_N_0, FR_d_1, f0
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_w = FR_N_0, FR_d_1, f0
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// Arg' = -N_0 * P_0 + Arg
// w = N_0 * d_1
//
-(p0) fma.s1 FR_N_float = FR_ArgPrime, FR_Inv_pi_by_2, f0
- nop.i 999 ;;
+ fma.s1 FR_N_float = FR_ArgPrime, FR_Inv_pi_by_2, f0
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
-// N = A' * 2/pi
+// N = A' * 2/pi
//
-(p0) fcvt.fx.s1 FR_N_fix = FR_N_float
- nop.i 999 ;;
+ fcvt.fx.s1 FR_N_fix = FR_N_float
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
-// N_fix is the integer part
+// N_fix is the integer part
//
-(p0) fcvt.xf FR_N_float = FR_N_fix
- nop.i 999 ;;
+ fcvt.xf FR_N_float = FR_N_fix
+ nop.i 999 ;;
}
{ .mfi
-(p0) getf.sig GR_N_Inc = FR_N_fix
- nop.f 999
- nop.i 999 ;;
+ getf.sig GR_N_Inc = FR_N_fix
+ nop.f 999
+ nop.i 999 ;;
}
{ .mii
- nop.m 999
- nop.i 999 ;;
-(p0) add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos ;;
+ nop.m 999
+ nop.i 999 ;;
+ add GR_N_Inc = GR_N_Inc, GR_Sin_or_Cos ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N is the integer part of the reduced-reduced argument.
// Put the integer in a GP register
//
-(p0) fnma.s1 FR_s = FR_N_float, FR_P_1, FR_ArgPrime
- nop.i 999
+ fnma.s1 FR_s = FR_N_float, FR_P_1, FR_ArgPrime
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fnma.s1 FR_w = FR_N_float, FR_P_2, FR_w
- nop.i 999 ;;
+ nop.m 999
+ fnma.s1 FR_w = FR_N_float, FR_P_2, FR_w
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// s = -N*P_1 + Arg'
// w = -N*P_2 + w
// N_fix_gr = N_fix_gr + N_inc
//
-(p0) fcmp.lt.unc.s1 p9, p8 = FR_s, FR_Two_to_M14
- nop.i 999 ;;
+ fcmp.lt.unc.s1 p9, p8 = FR_s, FR_Two_to_M14
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-(p9) fcmp.gt.s1 p9, p8 = FR_s, FR_Neg_Two_to_M14
- nop.i 999 ;;
+ nop.m 999
+(p9) fcmp.gt.s1 p9, p8 = FR_s, FR_Neg_Two_to_M14 // p9 if |s| < 2^-14
+ nop.i 999 ;;
}
+
{ .mfi
- nop.m 999
+ nop.m 999
//
// For |s| > 2**(-14) r = S + w (r complete)
// Else U_hi = N_0 * d_1
//
(p9) fma.s1 FR_V_hi = FR_N_float, FR_P_2, f0
- nop.i 999
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
(p9) fma.s1 FR_U_hi = FR_N_0, FR_d_1, f0
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// Either S <= -2**(-14) or S >= 2**(-14)
// or -2**(-14) < s < 2**(-14)
//
(p8) fma.s1 FR_r = FR_s, f1, FR_w
- nop.i 999
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
(p9) fma.s1 FR_w = FR_N_float, FR_P_3, f0
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// We need abs of both U_hi and V_hi - don't
// worry about switched sign of V_hi.
//
(p9) fms.s1 FR_A = FR_U_hi, f1, FR_V_hi
- nop.i 999
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
//
-// Big s: finish up c = (S - r) + w (c complete)
+// Big s: finish up c = (S - r) + w (c complete)
// Case 4: A = U_hi + V_hi
// Note: Worry about switched sign of V_hi, so subtract instead of add.
//
(p9) fnma.s1 FR_V_lo = FR_N_float, FR_P_2, FR_V_hi
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mmf
- nop.m 999
- nop.m 999
+ nop.m 999
+ nop.m 999
(p9) fms.s1 FR_U_lo = FR_N_0, FR_d_1, FR_U_hi
}
{ .mfi
- nop.m 999
+ nop.m 999
(p9) fmerge.s FR_V_hiabs = f0, FR_V_hi
- nop.i 999 ;;
+ nop.i 999 ;;
}
+//{ .mfb
+//(p9) fmerge.s f8= FR_V_lo,FR_V_lo
+//(p9) br.ret.sptk b0
+//}
+//;;
{ .mfi
- nop.m 999
+ nop.m 999
// For big s: c = S - r
// For small s do more work: U_lo = N_0 * d_1 - U_hi
//
(p9) fmerge.s FR_U_hiabs = f0, FR_U_hi
- nop.i 999
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
//
-// For big s: Is |r| < 2**(-3)
+// For big s: Is |r| < 2**(-3)
// For big s: if p12 set, prepare to branch to Small_R.
// For big s: If p13 set, prepare to branch to Normal_R.
//
-(p8) fms.s1 FR_c = FR_s, f1, FR_r
- nop.i 999 ;;
+(p8) fms.s1 FR_c = FR_s, f1, FR_r
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// For small S: V_hi = N * P_2
// w = N * P_3
@@ -1451,104 +1482,99 @@ L(SINCOSL_LARGER_ARG):
// so (-) missing for V_hi and w.
//
(p8) fcmp.lt.unc.s1 p12, p13 = FR_r, FR_Two_to_M3
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fcmp.gt.s1 p12, p13 = FR_r, FR_Neg_Two_to_M3
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
(p8) fma.s1 FR_c = FR_c, f1, FR_w
- nop.i 999
+ nop.i 999
}
{ .mfb
- nop.m 999
+ nop.m 999
(p9) fms.s1 FR_w = FR_N_0, FR_d_2, FR_w
-(p12) br.cond.spnt L(SINCOSL_SMALL_R) ;;
+(p12) br.cond.spnt SINCOSL_SMALL_R // Branch if |r| < 2^-3
+ // and 2^24 <= |x| < 2^63
}
+;;
+
{ .mib
- nop.m 999
- nop.i 999
-(p13) br.cond.sptk L(SINCOSL_NORMAL_R) ;;
+ nop.m 999
+ nop.i 999
+(p13) br.cond.sptk SINCOSL_NORMAL_R // Branch if |r| >= 2^-3
+ // and 2^24 <= |x| < 2^63
}
+;;
+
+SINCOSL_LARGER_S_TINY:
+//
+// Here if |s| < 2^-14, and 2^24 <= |x| < 2^63
+//
{ .mfi
- nop.m 999
-//
-// Big s: Vector off when |r| < 2**(-3). Recall that p8 will be true.
+ nop.m 999
+//
+// Big s: Vector off when |r| < 2**(-3). Recall that p8 will be true.
// The remaining stuff is for Case 4.
// Small s: V_lo = N * P_2 + U_hi (U_hi is in place of V_hi in writeup)
// Note: the (-) is still missing for V_lo.
// Small s: w = w + N_0 * d_2
// Note: the (-) is now incorporated in w.
//
-(p9) fcmp.ge.unc.s1 p10, p11 = FR_U_hiabs, FR_V_hiabs
-(p0) extr.u GR_i_1 = GR_N_Inc, 0, 1
+ fcmp.ge.unc.s1 p7, p8 = FR_U_hiabs, FR_V_hiabs
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// C_hi = S + A
//
-(p9) fma.s1 FR_t = FR_U_lo, f1, FR_V_lo
-(p0) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;;
+ fma.s1 FR_t = FR_U_lo, f1, FR_V_lo
}
+;;
+
{ .mfi
- nop.m 999
+ nop.m 999
//
-// t = U_lo + V_lo
+// t = U_lo + V_lo
//
//
-(p10) fms.s1 FR_a = FR_U_hi, f1, FR_A
- nop.i 999 ;;
+(p7) fms.s1 FR_a = FR_U_hi, f1, FR_A
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-(p11) fma.s1 FR_a = FR_V_hi, f1, FR_A
- nop.i 999
-}
-;;
-
-{ .mmi
- nop.m 999
-(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp
- nop.i 999
-}
-;;
-
-{ .mmi
- ld8 GR_Table_Base = [GR_Table_Base]
- nop.m 999
- nop.i 999
+ nop.m 999
+(p8) fma.s1 FR_a = FR_V_hi, f1, FR_A
+ nop.i 999
}
;;
-
{ .mfi
-(p0) add GR_Table_Base = 528, GR_Table_Base
//
// Is U_hiabs >= V_hiabs?
//
-(p9) fma.s1 FR_C_hi = FR_s, f1, FR_A
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_C_hi = FR_s, f1, FR_A
+ nop.i 999 ;;
}
{ .mmi
-(p0) ldfe FR_C_1 = [GR_Table_Base], 16 ;;
-(p0) ldfe FR_C_2 = [GR_Table_Base], 64
- nop.i 999 ;;
+ ldfe FR_C_1 = [GR_ad_c], 16 ;;
+ ldfe FR_C_2 = [GR_ad_c], 64
+ nop.i 999 ;;
}
//
// c = c + C_lo finished.
// Load C_2
//
{ .mfi
-(p0) ldfe FR_S_1 = [GR_Table_Base], 16
+ ldfe FR_S_1 = [GR_ad_s], 16
//
-// C_lo = S - C_hi
+// C_lo = S - C_hi
//
-(p0) fma.s1 FR_t = FR_t, f1, FR_w
- nop.i 999 ;;
+ fma.s1 FR_t = FR_t, f1, FR_w
+ nop.i 999 ;;
}
//
// r and c have been computed.
@@ -1558,855 +1584,695 @@ L(SINCOSL_LARGER_ARG):
// Load S_1
//
{ .mfi
-(p0) ldfe FR_S_2 = [GR_Table_Base], 64
+ ldfe FR_S_2 = [GR_ad_s], 64
//
-// t = t + w
+// t = t + w
//
-(p10) fms.s1 FR_a = FR_a, f1, FR_V_hi
-(p0) cmp.eq.unc p9, p10 = 0x0, GR_i_0 ;;
+(p7) fms.s1 FR_a = FR_a, f1, FR_V_hi
+ tbit.z p9,p10 = GR_N_Inc, 0 // p9 if i_1=0, N mod 4 = 0,2
+ // p10 if i_1=1, N mod 4 = 1,3
}
+;;
{ .mfi
- nop.m 999
+ nop.m 999
//
// For larger u than v: a = U_hi - A
// Else a = V_hi - A (do an add to account for missing (-) on V_hi
//
-(p0) fms.s1 FR_C_lo = FR_s, f1, FR_C_hi
- nop.i 999 ;;
+ fms.s1 FR_C_lo = FR_s, f1, FR_C_hi
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-(p11) fms.s1 FR_a = FR_U_hi, f1, FR_a
-(p0) cmp.eq.unc p11, p12 = 0x0, GR_i_1 ;;
+ nop.m 999
+(p8) fms.s1 FR_a = FR_U_hi, f1, FR_a
+ tbit.z p11,p12 = GR_N_Inc, 1 // p11 if i_0=0, N mod 4 = 0,1
+ // p12 if i_0=1, N mod 4 = 2,3
}
+;;
+
{ .mfi
- nop.m 999
+ nop.m 999
//
// If u > v: a = (U_hi - A) + V_hi
// Else a = (V_hi - A) + U_hi
// In each case account for negative missing from V_hi.
//
-(p0) fma.s1 FR_C_lo = FR_C_lo, f1, FR_A
- nop.i 999 ;;
+ fma.s1 FR_C_lo = FR_C_lo, f1, FR_A
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
-// C_lo = (S - C_hi) + A
+// C_lo = (S - C_hi) + A
//
-(p0) fma.s1 FR_t = FR_t, f1, FR_a
- nop.i 999 ;;
+ fma.s1 FR_t = FR_t, f1, FR_a
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
-// t = t + a
+// t = t + a
//
-(p0) fma.s1 FR_C_lo = FR_C_lo, f1, FR_t
- nop.i 999 ;;
+ fma.s1 FR_C_lo = FR_C_lo, f1, FR_t
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// C_lo = C_lo + t
-// Adjust Table_Base to beginning of table
//
-(p0) fma.s1 FR_r = FR_C_hi, f1, FR_C_lo
- nop.i 999 ;;
+ fma.s1 FR_r = FR_C_hi, f1, FR_C_lo
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// Load S_2
//
-(p0) fma.s1 FR_rsq = FR_r, FR_r, f0
- nop.i 999
+ fma.s1 FR_rsq = FR_r, FR_r, f0
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
//
-// Table_Base points to C_1
// r = C_hi + C_lo
//
-(p0) fms.s1 FR_c = FR_C_hi, f1, FR_r
- nop.i 999 ;;
+ fms.s1 FR_c = FR_C_hi, f1, FR_r
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// if i_1 ==0: poly = S_2 * FR_rsq + S_1
// else poly = C_2 * FR_rsq + C_1
//
-(p11) fma.s1 FR_Input_X = f0, f1, FR_r
- nop.i 999 ;;
+(p9) fma.s1 FR_tmp_result = f0, f1, FR_r
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-(p12) fma.s1 FR_Input_X = f0, f1, f1
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 FR_tmp_result = f0, f1, f1
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
-// Compute r_cube = FR_rsq * r
+// Compute r_cube = FR_rsq * r
//
-(p11) fma.s1 FR_poly = FR_rsq, FR_S_2, FR_S_1
- nop.i 999 ;;
+(p9) fma.s1 FR_poly = FR_rsq, FR_S_2, FR_S_1
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-(p12) fma.s1 FR_poly = FR_rsq, FR_C_2, FR_C_1
- nop.i 999
+ nop.m 999
+(p10) fma.s1 FR_poly = FR_rsq, FR_C_2, FR_C_1
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// Compute FR_rsq = r * r
// Is i_1 == 0 ?
//
-(p0) fma.s1 FR_r_cubed = FR_rsq, FR_r, f0
- nop.i 999 ;;
+ fma.s1 FR_r_cubed = FR_rsq, FR_r, f0
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// c = C_hi - r
// Load C_1
//
-(p0) fma.s1 FR_c = FR_c, f1, FR_C_lo
- nop.i 999
+ fma.s1 FR_c = FR_c, f1, FR_C_lo
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// if i_1 ==0: poly = r_cube * poly + c
// else poly = FR_rsq * poly
//
-(p10) fms.s1 FR_Input_X = f0, f1, FR_Input_X
- nop.i 999 ;;
+(p12) fms.s1 FR_tmp_result = f0, f1, FR_tmp_result
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// if i_1 ==0: Result = r
// else Result = 1.0
//
-(p11) fma.s1 FR_poly = FR_r_cubed, FR_poly, FR_c
- nop.i 999 ;;
+(p9) fma.s1 FR_poly = FR_r_cubed, FR_poly, FR_c
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-(p12) fma.s1 FR_poly = FR_rsq, FR_poly, f0
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
-// if i_0 !=0: Result = -Result
+// if i_0 !=0: Result = -Result
//
-(p9) fma.s0 FR_Input_X = FR_Input_X, f1, FR_poly
- nop.i 999 ;;
+(p11) fma.s0 FR_Result = FR_tmp_result, f1, FR_poly
+ nop.i 999 ;;
}
{ .mfb
- nop.m 999
-(p10) fms.s0 FR_Input_X = FR_Input_X, f1, FR_poly
+ nop.m 999
+(p12) fms.s0 FR_Result = FR_tmp_result, f1, FR_poly
//
// if i_0 == 0: Result = Result + poly
// else Result = Result - poly
//
-(p0) br.ret.sptk b0 ;;
+ br.ret.sptk b0 // Exit for |s| < 2^-14, and 2^24 <= |x| < 2^63
}
-L(SINCOSL_SMALL_R):
-{ .mii
- nop.m 999
-(p0) extr.u GR_i_1 = GR_N_Inc, 0, 1 ;;
+;;
+
+
+SINCOSL_SMALL_R:
//
+// Here if |r| < 2^-3
+//
+// Enter with r, c, and N_Inc computed
//
// Compare both i_1 and i_0 with 0.
// if i_1 == 0, set p9.
// if i_0 == 0, set p11.
//
-(p0) cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;;
-}
-{ .mfi
- nop.m 999
-(p0) fma.s1 FR_rsq = FR_r, FR_r, f0
-(p0) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;;
-}
+
{ .mfi
- nop.m 999
-//
-// Z = Z * FR_rsq
-//
-(p10) fnma.s1 FR_c = FR_c, FR_r, f0
-(p0) cmp.eq.unc p11, p12 = 0x0, GR_i_0
+ nop.m 999
+ fma.s1 FR_rsq = FR_r, FR_r, f0 // rsq = r * r
+ tbit.z p9,p10 = GR_N_Inc, 0 // p9 if i_1=0, N mod 4 = 0,2
+ // p10 if i_1=1, N mod 4 = 1,3
}
;;
-// ******************************************************************
-// ******************************************************************
-// ******************************************************************
-// r and c have been computed.
-// We know whether this is the sine or cosine routine.
-// Make sure ftz mode is set - should be automatic when using wre
-// |r| < 2**(-3)
-//
-// Set table_ptr1 to beginning of constant table.
-// Get [i_0,i_1] - two lsb of N_fix_gr.
-//
-
{ .mmi
- nop.m 999
-(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp
+(p9) ldfe FR_S_5 = [GR_ad_se], -16 // Load S_5 if i_1=0
+(p10) ldfe FR_C_5 = [GR_ad_ce], -16 // Load C_5 if i_1=1
nop.i 999
}
;;
{ .mmi
- ld8 GR_Table_Base = [GR_Table_Base]
- nop.m 999
+(p9) ldfe FR_S_4 = [GR_ad_se], -16 // Load S_4 if i_1=0
+(p10) ldfe FR_C_4 = [GR_ad_ce], -16 // Load C_4 if i_1=1
nop.i 999
}
;;
-
-//
-// Set table_ptr1 to point to S_5.
-// Set table_ptr1 to point to C_5.
-// Compute FR_rsq = r * r
-//
-{ .mfi
-(p9) add GR_Table_Base = 672, GR_Table_Base
-(p10) fmerge.s FR_r = f1, f1
-(p10) add GR_Table_Base = 592, GR_Table_Base ;;
+SINCOSL_SMALL_R_0:
+// Entry point for 2^-3 < |x| < pi/4
+.pred.rel "mutex",p9,p10
+SINCOSL_SMALL_R_1:
+// Entry point for pi/4 < |x| < 2^24 and |r| < 2^-3
+.pred.rel "mutex",p9,p10
+{ .mfi
+(p9) ldfe FR_S_3 = [GR_ad_se], -16 // Load S_3 if i_1=0
+ fma.s1 FR_Z = FR_rsq, FR_rsq, f0 // Z = rsq * rsq
+ nop.i 999
}
-//
-// Set table_ptr1 to point to S_5.
-// Set table_ptr1 to point to C_5.
-//
-{ .mmi
-(p9) ldfe FR_S_5 = [GR_Table_Base], -16 ;;
-//
-// if (i_1 == 0) load S_5
-// if (i_1 != 0) load C_5
-//
-(p9) ldfe FR_S_4 = [GR_Table_Base], -16
- nop.i 999 ;;
+{ .mfi
+(p10) ldfe FR_C_3 = [GR_ad_ce], -16 // Load C_3 if i_1=1
+(p10) fnma.s1 FR_c = FR_c, FR_r, f0 // c = -c * r if i_1=1
+ nop.i 999
}
+;;
+
{ .mmf
-(p10) ldfe FR_C_5 = [GR_Table_Base], -16
-//
-// Z = FR_rsq * FR_rsq
-//
-(p9) ldfe FR_S_3 = [GR_Table_Base], -16
-//
-// Compute FR_rsq = r * r
-// if (i_1 == 0) load S_4
-// if (i_1 != 0) load C_4
-//
-(p0) fma.s1 FR_Z = FR_rsq, FR_rsq, f0 ;;
-}
-//
-// if (i_1 == 0) load S_3
-// if (i_1 != 0) load C_3
-//
-{ .mmi
-(p9) ldfe FR_S_2 = [GR_Table_Base], -16 ;;
-//
-// if (i_1 == 0) load S_2
-// if (i_1 != 0) load C_2
-//
-(p9) ldfe FR_S_1 = [GR_Table_Base], -16
- nop.i 999
-}
-{ .mmi
-(p10) ldfe FR_C_4 = [GR_Table_Base], -16 ;;
-(p10) ldfe FR_C_3 = [GR_Table_Base], -16
- nop.i 999 ;;
+(p9) ldfe FR_S_2 = [GR_ad_se], -16 // Load S_2 if i_1=0
+(p10) ldfe FR_C_2 = [GR_ad_ce], -16 // Load C_2 if i_1=1
+(p10) fmerge.s FR_r = f1, f1
}
+;;
+
{ .mmi
-(p10) ldfe FR_C_2 = [GR_Table_Base], -16 ;;
-(p10) ldfe FR_C_1 = [GR_Table_Base], -16
- nop.i 999
-}
-{ .mfi
- nop.m 999
-//
-// if (i_1 != 0):
-// poly_lo = FR_rsq * C_5 + C_4
-// poly_hi = FR_rsq * C_2 + C_1
-//
-(p9) fma.s1 FR_Z = FR_Z, FR_r, f0
- nop.i 999 ;;
+(p9) ldfe FR_S_1 = [GR_ad_se], -16 // Load S_1 if i_1=0
+(p10) ldfe FR_C_1 = [GR_ad_ce], -16 // Load C_1 if i_1=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// if (i_1 == 0) load S_1
-// if (i_1 != 0) load C_1
-//
-(p9) fma.s1 FR_poly_lo = FR_rsq, FR_S_5, FR_S_4
- nop.i 999
+ nop.m 999
+(p9) fma.s1 FR_Z = FR_Z, FR_r, f0 // Z = Z * r if i_1=0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// c = -c * r
-// dummy fmpy's to flag inexact.
-//
-(p9) fma.s0 FR_S_4 = FR_S_4, FR_S_4, f0
- nop.i 999 ;;
+ nop.m 999
+(p9) fma.s1 FR_poly_lo = FR_rsq, FR_S_5, FR_S_4 // poly_lo=rsq*S_5+S_4 if i_1=0
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// poly_lo = FR_rsq * poly_lo + C_3
-// poly_hi = FR_rsq * poly_hi
-//
-(p0) fma.s1 FR_Z = FR_Z, FR_rsq, f0
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 FR_poly_lo = FR_rsq, FR_C_5, FR_C_4 // poly_lo=rsq*C_5+C_4 if i_1=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p9) fma.s1 FR_poly_hi = FR_rsq, FR_S_2, FR_S_1
- nop.i 999
+ nop.m 999
+(p9) fma.s1 FR_poly_hi = FR_rsq, FR_S_2, FR_S_1 // poly_hi=rsq*S_2+S_1 if i_1=0
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// if (i_1 == 0):
-// poly_lo = FR_rsq * S_5 + S_4
-// poly_hi = FR_rsq * S_2 + S_1
-//
-(p10) fma.s1 FR_poly_lo = FR_rsq, FR_C_5, FR_C_4
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 FR_poly_hi = FR_rsq, FR_C_2, FR_C_1 // poly_hi=rsq*C_2+C_1 if i_1=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// if (i_1 == 0):
-// Z = Z * r for only one of the small r cases - not there
-// in original implementation notes.
-//
-(p9) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_S_3
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_Z = FR_Z, FR_rsq, f0 // Z = Z * rsq
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p10) fma.s1 FR_poly_hi = FR_rsq, FR_C_2, FR_C_1
- nop.i 999
+ nop.m 999
+(p9) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_S_3 // p_lo=p_lo*rsq+S_3, i_1=0
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p10) fma.s0 FR_C_1 = FR_C_1, FR_C_1, f0
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_C_3 // p_lo=p_lo*rsq+C_3, i_1=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p9) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0
- nop.i 999
+ nop.m 999
+(p9) fma.s0 FR_inexact = FR_S_4, FR_S_4, f0 // Dummy op to set inexact
+ tbit.z p11,p12 = GR_N_Inc, 1 // p11 if i_0=0, N mod 4 = 0,1
+ // p12 if i_0=1, N mod 4 = 2,3
}
{ .mfi
- nop.m 999
-//
-// poly_lo = FR_rsq * poly_lo + S_3
-// poly_hi = FR_rsq * poly_hi
-//
-(p10) fma.s1 FR_poly_lo = FR_rsq, FR_poly_lo, FR_C_3
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s0 FR_inexact = FR_C_1, FR_C_1, f0 // Dummy op to set inexact
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p10) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0
- nop.i 999 ;;
+ nop.m 999
+(p9) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0 // p_hi=p_hi*rsq if i_1=0
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// if (i_1 == 0): dummy fmpy's to flag inexact
-// r = 1
-//
-(p9) fma.s1 FR_poly_hi = FR_r, FR_poly_hi, f0
- nop.i 999
+ nop.m 999
+(p10) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, f0 // p_hi=p_hi*rsq if i_1=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// poly_hi = r * poly_hi
-//
-(p0) fma.s1 FR_poly = FR_Z, FR_poly_lo, FR_c
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_poly = FR_Z, FR_poly_lo, FR_c // poly=Z*poly_lo+c
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p12) fms.s1 FR_r = f0, f1, FR_r
- nop.i 999 ;;
+ nop.m 999
+(p9) fma.s1 FR_poly_hi = FR_r, FR_poly_hi, f0 // p_hi=r*p_hi if i_1=0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// poly_hi = Z * poly_lo + c
-// if i_0 == 1: r = -r
-//
-(p0) fma.s1 FR_poly = FR_poly, f1, FR_poly_hi
- nop.i 999 ;;
+ nop.m 999
+(p12) fms.s1 FR_r = f0, f1, FR_r // r = -r if i_0=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p12) fms.s0 FR_Input_X = FR_r, f1, FR_poly
- nop.i 999
+ nop.m 999
+ fma.s1 FR_poly = FR_poly, f1, FR_poly_hi // poly=poly+poly_hi
+ nop.i 999
}
-{ .mfb
- nop.m 999
-//
-// poly = poly + poly_hi
-//
-(p11) fma.s0 FR_Input_X = FR_r, f1, FR_poly
+;;
+
//
// if (i_0 == 0) Result = r + poly
// if (i_0 != 0) Result = r - poly
//
-(p0) br.ret.sptk b0 ;;
-}
-L(SINCOSL_NORMAL_R):
-{ .mii
- nop.m 999
-(p0) extr.u GR_i_1 = GR_N_Inc, 0, 1 ;;
-//
-// Set table_ptr1 and table_ptr2 to base address of
-// constant table.
-(p0) cmp.eq.unc p9, p10 = 0x0, GR_i_1 ;;
-}
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_rsq = FR_r, FR_r, f0
-(p0) extr.u GR_i_0 = GR_N_Inc, 1, 1 ;;
+ nop.m 999
+(p11) fma.s0 FR_Result = FR_r, f1, FR_poly
+ nop.i 999
}
-{ .mfi
- nop.m 999
-(p0) frcpa.s1 FR_r_hi, p6 = f1, FR_r
-(p0) cmp.eq.unc p11, p12 = 0x0, GR_i_0
+{ .mfb
+ nop.m 999
+(p12) fms.s0 FR_Result = FR_r, f1, FR_poly
+ br.ret.sptk b0 // Exit for |r| < 2^-3
}
;;
-// ******************************************************************
-// ******************************************************************
-// ******************************************************************
+
+SINCOSL_NORMAL_R:
//
-// r and c have been computed.
-// We known whether this is the sine or cosine routine.
-// Make sure ftz mode is set - should be automatic when using wre
-// Get [i_0,i_1] - two lsb of N_fix_gr alone.
+// Here if 2^-3 <= |r| < pi/4
+// THIS IS THE MAIN PATH
//
-
-{ .mmi
- nop.m 999
-(p0) addl GR_Table_Base = @ltoff(FSINCOSL_CONSTANTS#), gp
+// Enter with r, c, and N_Inc having been computed
+//
+{ .mfi
+ ldfe FR_PP_6 = [GR_ad_pp], 16 // Load PP_6
+ fma.s1 FR_rsq = FR_r, FR_r, f0 // rsq = r * r
+ tbit.z p9,p10 = GR_N_Inc, 0 // p9 if i_1=0, N mod 4 = 0,2
+ // p10 if i_1=1, N mod 4 = 1,3
+}
+{ .mfi
+ ldfe FR_QQ_6 = [GR_ad_qq], 16 // Load QQ_6
+ nop.f 999
nop.i 999
}
;;
{ .mmi
- ld8 GR_Table_Base = [GR_Table_Base]
- nop.m 999
+(p9) ldfe FR_PP_5 = [GR_ad_pp], 16 // Load PP_5 if i_1=0
+(p10) ldfe FR_QQ_5 = [GR_ad_qq], 16 // Load QQ_5 if i_1=1
nop.i 999
}
;;
+SINCOSL_NORMAL_R_0:
+// Entry for 2^-3 < |x| < pi/4
+.pred.rel "mutex",p9,p10
+{ .mmf
+(p9) ldfe FR_C_1 = [GR_ad_pp], 16 // Load C_1 if i_1=0
+(p10) ldfe FR_S_1 = [GR_ad_qq], 16 // Load S_1 if i_1=1
+ frcpa.s1 FR_r_hi, p6 = f1, FR_r // r_hi = frcpa(r)
+}
+;;
{ .mfi
-(p10) add GR_Table_Base = 384, GR_Table_Base
-(p12) fms.s1 FR_Input_X = f0, f1, f1
-(p9) add GR_Table_Base = 224, GR_Table_Base ;;
+ nop.m 999
+(p9) fma.s1 FR_poly = FR_rsq, FR_PP_8, FR_PP_7 // poly = rsq*PP_8+PP_7 if i_1=0
+ nop.i 999
}
{ .mfi
-(p10) ldfe FR_QQ_8 = [GR_Table_Base], 16
-//
-// if (i_1==0) poly = poly * FR_rsq + PP_1_lo
-// else poly = FR_rsq * poly
-//
-(p11) fma.s1 FR_Input_X = f0, f1, f1
- nop.i 999 ;;
-}
-{ .mmb
-(p10) ldfe FR_QQ_7 = [GR_Table_Base], 16
-//
-// Adjust table pointers based on i_0
-// Compute rsq = r * r
-//
-(p9) ldfe FR_PP_8 = [GR_Table_Base], 16
- nop.b 999 ;;
+ nop.m 999
+(p10) fma.s1 FR_poly = FR_rsq, FR_QQ_8, FR_QQ_7 // poly = rsq*QQ_8+QQ_7 if i_1=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_r_cubed = FR_r, FR_rsq, f0
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_r_cubed = FR_r, FR_rsq, f0 // rcubed = r * rsq
+ nop.i 999
}
+;;
+
+
+SINCOSL_NORMAL_R_1:
+// Entry for pi/4 <= |x| < 2^24
+.pred.rel "mutex",p9,p10
{ .mmf
-(p9) ldfe FR_PP_7 = [GR_Table_Base], 16
-(p10) ldfe FR_QQ_6 = [GR_Table_Base], 16
-//
-// Load PP_8 and QQ_8; PP_7 and QQ_7
-//
-(p0) frcpa.s1 FR_r_hi, p6 = f1, FR_r_hi ;;
-}
-//
-// if (i_1==0) poly = PP_7 + FR_rsq * PP_8.
-// else poly = QQ_7 + FR_rsq * QQ_8.
-//
-{ .mmb
-(p9) ldfe FR_PP_6 = [GR_Table_Base], 16
-(p10) ldfe FR_QQ_5 = [GR_Table_Base], 16
- nop.b 999 ;;
-}
-{ .mmb
-(p9) ldfe FR_PP_5 = [GR_Table_Base], 16
-(p10) ldfe FR_S_1 = [GR_Table_Base], 16
- nop.b 999 ;;
-}
-{ .mmb
-(p10) ldfe FR_QQ_1 = [GR_Table_Base], 16
-(p9) ldfe FR_C_1 = [GR_Table_Base], 16
- nop.b 999 ;;
-}
-{ .mmb
-(p10) ldfe FR_QQ_4 = [GR_Table_Base], 16
-(p9) ldfe FR_PP_1 = [GR_Table_Base], 16
- nop.b 999 ;;
-}
-{ .mmb
-(p10) ldfe FR_QQ_3 = [GR_Table_Base], 16
-//
-// if (i_1=0) corr = corr + c*c
-// else corr = corr * c
-//
-(p9) ldfe FR_PP_4 = [GR_Table_Base], 16
- nop.b 999 ;;
-}
-{ .mfi
- nop.m 999
-(p10) fma.s1 FR_poly = FR_rsq, FR_QQ_8, FR_QQ_7
- nop.i 999 ;;
-}
-//
-// if (i_1=0) poly = rsq * poly + PP_5
-// else poly = rsq * poly + QQ_5
-// Load PP_4 or QQ_4
-//
-{ .mmi
-(p9) ldfe FR_PP_3 = [GR_Table_Base], 16 ;;
-(p10) ldfe FR_QQ_2 = [GR_Table_Base], 16
- nop.i 999
+(p9) ldfe FR_PP_1 = [GR_ad_pp], 16 // Load PP_1_hi if i_1=0
+(p10) ldfe FR_QQ_1 = [GR_ad_qq], 16 // Load QQ_1 if i_1=1
+ frcpa.s1 FR_r_hi, p6 = f1, FR_r_hi // r_hi = frcpa(frcpa(r))
}
+;;
+
{ .mfi
- nop.m 999
-//
-// r_hi = frcpa(frcpa(r)).
-// r_cube = r * FR_rsq.
-//
-(p9) fma.s1 FR_poly = FR_rsq, FR_PP_8, FR_PP_7
- nop.i 999 ;;
+(p9) ldfe FR_PP_4 = [GR_ad_pp], 16 // Load PP_4 if i_1=0
+(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_6 // poly = rsq*poly+PP_6 if i_1=0
+ nop.i 999
}
-//
-// Do dummy multiplies so inexact is always set.
-//
{ .mfi
-(p9) ldfe FR_PP_2 = [GR_Table_Base], 16
-//
-// r_lo = r - r_hi
-//
-(p9) fma.s1 FR_U_lo = FR_r_hi, FR_r_hi, f0
- nop.i 999 ;;
-}
-{ .mbb
-(p9) ldfe FR_PP_1_lo = [GR_Table_Base], 16
- nop.b 999
- nop.b 999 ;;
+(p10) ldfe FR_QQ_4 = [GR_ad_qq], 16 // Load QQ_4 if i_1=1
+(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_6 // poly = rsq*poly+QQ_6 if i_1=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p10) fma.s1 FR_corr = FR_S_1, FR_r_cubed, FR_r
- nop.i 999
+ nop.m 999
+(p9) fma.s1 FR_corr = FR_C_1, FR_rsq, f0 // corr = C_1 * rsq if i_1=0
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_6
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 FR_corr = FR_S_1, FR_r_cubed, FR_r // corr = S_1 * r^3 + r if i_1=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// if (i_1=0) U_lo = r_hi * r_hi
-// else U_lo = r_hi + r
-//
-(p9) fma.s1 FR_corr = FR_C_1, FR_rsq, f0
- nop.i 999 ;;
+(p9) ldfe FR_PP_3 = [GR_ad_pp], 16 // Load PP_3 if i_1=0
+ fma.s1 FR_r_hi_sq = FR_r_hi, FR_r_hi, f0 // r_hi_sq = r_hi * r_hi
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// if (i_1=0) corr = C_1 * rsq
-// else corr = S_1 * r_cubed + r
-//
-(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_6
- nop.i 999 ;;
+(p10) ldfe FR_QQ_3 = [GR_ad_qq], 16 // Load QQ_3 if i_1=1
+ fms.s1 FR_r_lo = FR_r, f1, FR_r_hi // r_lo = r - r_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p10) fma.s1 FR_U_lo = FR_r_hi, f1, FR_r
- nop.i 999
+(p9) ldfe FR_PP_2 = [GR_ad_pp], 16 // Load PP_2 if i_1=0
+(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_5 // poly = rsq*poly+PP_5 if i_1=0
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// if (i_1=0) U_hi = r_hi + U_hi
-// else U_hi = QQ_1 * U_hi + 1
-//
-(p9) fma.s1 FR_U_lo = FR_r, FR_r_hi, FR_U_lo
- nop.i 999 ;;
+(p10) ldfe FR_QQ_2 = [GR_ad_qq], 16 // Load QQ_2 if i_1=1
+(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_5 // poly = rsq*poly+QQ_5 if i_1=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// U_hi = r_hi * r_hi
-//
-(p0) fms.s1 FR_r_lo = FR_r, f1, FR_r_hi
- nop.i 999
+(p9) ldfe FR_PP_1_lo = [GR_ad_pp], 16 // Load PP_1_lo if i_1=0
+(p9) fma.s1 FR_corr = FR_corr, FR_c, FR_c // corr = corr * c + c if i_1=0
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Load PP_1, PP_6, PP_5, and C_1
-// Load QQ_1, QQ_6, QQ_5, and S_1
-//
-(p0) fma.s1 FR_U_hi = FR_r_hi, FR_r_hi, f0
- nop.i 999 ;;
+ nop.m 999
+(p10) fnma.s1 FR_corr = FR_corr, FR_c, f0 // corr = -corr * c if i_1=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_5
- nop.i 999
+ nop.m 999
+(p9) fma.s1 FR_U_lo = FR_r, FR_r_hi, FR_r_hi_sq // U_lo = r*r_hi+r_hi_sq, i_1=0
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p10) fnma.s1 FR_corr = FR_corr, FR_c, f0
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 FR_U_lo = FR_r_hi, f1, FR_r // U_lo = r_hi + r if i_1=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// if (i_1=0) U_lo = r * r_hi + U_lo
-// else U_lo = r_lo * U_lo
-//
-(p9) fma.s1 FR_corr = FR_corr, FR_c, FR_c
- nop.i 999 ;;
+ nop.m 999
+(p9) fma.s1 FR_U_hi = FR_r_hi, FR_r_hi_sq, f0 // U_hi = r_hi*r_hi_sq if i_1=0
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_5
- nop.i 999
+ nop.m 999
+(p10) fma.s1 FR_U_hi = FR_QQ_1, FR_r_hi_sq, f1 // U_hi = QQ_1*r_hi_sq+1, i_1=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// if (i_1 =0) U_hi = r + U_hi
-// if (i_1 =0) U_lo = r_lo * U_lo
-//
-//
-(p9) fma.s0 FR_PP_5 = FR_PP_5, FR_PP_4, f0
- nop.i 999 ;;
+ nop.m 999
+(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_4 // poly = poly*rsq+PP_4 if i_1=0
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p9) fma.s1 FR_U_lo = FR_r, FR_r, FR_U_lo
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_4 // poly = poly*rsq+QQ_4 if i_1=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p10) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0
- nop.i 999 ;;
+ nop.m 999
+(p9) fma.s1 FR_U_lo = FR_r, FR_r, FR_U_lo // U_lo = r * r + U_lo if i_1=0
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// if (i_1=0) poly = poly * rsq + PP_6
-// else poly = poly * rsq + QQ_6
-//
-(p9) fma.s1 FR_U_hi = FR_r_hi, FR_U_hi, f0
- nop.i 999
+ nop.m 999
+(p10) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0 // U_lo = r_lo * U_lo if i_1=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_4
- nop.i 999 ;;
+ nop.m 999
+(p9) fma.s1 FR_U_hi = FR_PP_1, FR_U_hi, f0 // U_hi = PP_1 * U_hi if i_1=0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p10) fma.s1 FR_U_hi = FR_QQ_1, FR_U_hi, f1
- nop.i 999
+ nop.m 999
+(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_3 // poly = poly*rsq+PP_3 if i_1=0
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p10) fma.s0 FR_QQ_5 = FR_QQ_5, FR_QQ_5, f0
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_3 // poly = poly*rsq+QQ_3 if i_1=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// if (i_1!=0) U_hi = PP_1 * U_hi
-// if (i_1!=0) U_lo = r * r + U_lo
-// Load PP_3 or QQ_3
-//
-(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_4
- nop.i 999 ;;
+ nop.m 999
+(p9) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0 // U_lo = r_lo * U_lo if i_1=0
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p9) fma.s1 FR_U_lo = FR_r_lo, FR_U_lo, f0
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 FR_U_lo = FR_QQ_1,FR_U_lo, f0 // U_lo = QQ_1 * U_lo if i_1=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p10) fma.s1 FR_U_lo = FR_QQ_1,FR_U_lo, f0
- nop.i 999 ;;
+ nop.m 999
+(p9) fma.s1 FR_U_hi = FR_r, f1, FR_U_hi // U_hi = r + U_hi if i_1=0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p9) fma.s1 FR_U_hi = FR_PP_1, FR_U_hi, f0
- nop.i 999
+ nop.m 999
+(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_2 // poly = poly*rsq+PP_2 if i_1=0
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_3
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_2 // poly = poly*rsq+QQ_2 if i_1=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Load PP_2, QQ_2
-//
-(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_3
- nop.i 999 ;;
+ nop.m 999
+(p9) fma.s1 FR_U_lo = FR_PP_1, FR_U_lo, f0 // U_lo = PP_1 * U_lo if i_1=0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// if (i_1==0) poly = FR_rsq * poly + PP_3
-// else poly = FR_rsq * poly + QQ_3
-// Load PP_1_lo
-//
-(p9) fma.s1 FR_U_lo = FR_PP_1, FR_U_lo, f0
- nop.i 999 ;;
+ nop.m 999
+(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_1_lo // poly =poly*rsq+PP1lo i_1=0
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// if (i_1 =0) poly = poly * rsq + pp_r4
-// else poly = poly * rsq + qq_r4
-//
-(p9) fma.s1 FR_U_hi = FR_r, f1, FR_U_hi
- nop.i 999
+ nop.m 999
+(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0 // poly = poly*rsq if i_1=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p10) fma.s1 FR_poly = FR_rsq, FR_poly, FR_QQ_2
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_V = FR_U_lo, f1, FR_corr // V = U_lo + corr
+ tbit.z p11,p12 = GR_N_Inc, 1 // p11 if i_0=0, N mod 4 = 0,1
+ // p12 if i_0=1, N mod 4 = 2,3
}
+;;
+
{ .mfi
- nop.m 999
-//
-// if (i_1==0) U_lo = PP_1_hi * U_lo
-// else U_lo = QQ_1 * U_lo
-//
-(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_2
- nop.i 999 ;;
+ nop.m 999
+(p9) fma.s0 FR_inexact = FR_PP_5, FR_PP_4, f0 // Dummy op to set inexact
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// if (i_0==0) Result = 1
-// else Result = -1
-//
-(p0) fma.s1 FR_V = FR_U_lo, f1, FR_corr
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s0 FR_inexact = FR_QQ_5, FR_QQ_5, f0 // Dummy op to set inexact
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0
- nop.i 999 ;;
+ nop.m 999
+(p9) fma.s1 FR_poly = FR_r_cubed, FR_poly, f0 // poly = poly*r^3 if i_1=0
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// if (i_1==0) poly = FR_rsq * poly + PP_2
-// else poly = FR_rsq * poly + QQ_2
-//
-(p9) fma.s1 FR_poly = FR_rsq, FR_poly, FR_PP_1_lo
- nop.i 999 ;;
+ nop.m 999
+(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0 // poly = poly*rsq if i_1=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p10) fma.s1 FR_poly = FR_rsq, FR_poly, f0
- nop.i 999 ;;
+ nop.m 999
+(p11) fma.s1 FR_tmp_result = f0, f1, f1// tmp_result=+1.0 if i_0=0
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// V = U_lo + corr
-//
-(p9) fma.s1 FR_poly = FR_r_cubed, FR_poly, f0
- nop.i 999 ;;
+ nop.m 999
+(p12) fms.s1 FR_tmp_result = f0, f1, f1// tmp_result=-1.0 if i_0=1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// if (i_1==0) poly = r_cube * poly
-// else poly = FR_rsq * poly
-//
-(p0) fma.s1 FR_V = FR_poly, f1, FR_V
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_V = FR_poly, f1, FR_V // V = poly + V
+ nop.i 999
}
+;;
+
+// If i_0 = 0 Result = U_hi + V
+// If i_0 = 1 Result = -U_hi - V
{ .mfi
- nop.m 999
-(p12) fms.s0 FR_Input_X = FR_Input_X, FR_U_hi, FR_V
- nop.i 999
+ nop.m 999
+(p11) fma.s0 FR_Result = FR_tmp_result, FR_U_hi, FR_V
+ nop.i 999
}
{ .mfb
- nop.m 999
-//
-// V = V + poly
-//
-(p11) fma.s0 FR_Input_X = FR_Input_X, FR_U_hi, FR_V
-//
-// if (i_0==0) Result = Result * U_hi + V
-// else Result = Result * U_hi - V
-//
-(p0) br.ret.sptk b0
-};;
-
-//
-// If cosine, FR_Input_X = 1
-// If sine, FR_Input_X = +/-Zero (Input FR_Input_X)
-// Results are exact, no exceptions
-//
+ nop.m 999
+(p12) fms.s0 FR_Result = FR_tmp_result, FR_U_hi, FR_V
+ br.ret.sptk b0 // Exit for 2^-3 <= |r| < pi/4
+}
+;;
-L(SINCOSL_ZERO):
-{ .mbb
-(p0) cmp.eq.unc p6, p7 = 0x1, GR_Sin_or_Cos
- nop.b 999
- nop.b 999 ;;
+SINCOSL_ZERO:
+// Here if x = 0
+{ .mfi
+ cmp.eq.unc p6, p7 = 0x1, GR_Sin_or_Cos
+ nop.f 999
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p7) fmerge.s FR_Input_X = FR_Input_X, FR_Input_X
- nop.i 999
+ nop.m 999
+(p7) fmerge.s FR_Result = FR_Input_X, FR_Input_X // If sin, result = input
+ nop.i 999
}
{ .mfb
- nop.m 999
-(p6) fmerge.s FR_Input_X = f1, f1
-(p0) br.ret.sptk b0 ;;
+ nop.m 999
+(p6) fma.s0 FR_Result = f1, f1, f0 // If cos, result=1.0
+ br.ret.sptk b0 // Exit for x=0
}
-L(SINCOSL_SPECIAL):
+;;
+
+
+SINCOSL_DENORMAL:
+{ .mmb
+ getf.exp GR_signexp_x = FR_norm_x // Get sign and exponent of x
+ nop.m 999
+ br.cond.sptk SINCOSL_COMMON // Return to common code
+}
+;;
+
+SINCOSL_SPECIAL:
{ .mfb
nop.m 999
//
@@ -2414,106 +2280,82 @@ L(SINCOSL_SPECIAL):
// Invalid can be raised. SNaNs
// become QNaNs
//
-(p0) fmpy.s0 FR_Input_X = FR_Input_X, f0
-(p0) br.ret.sptk b0 ;;
+ fmpy.s0 FR_Result = FR_Input_X, f0
+ br.ret.sptk b0 ;;
}
-.endp cosl#
-ASM_SIZE_DIRECTIVE(cosl#)
-// Call int pi_by_2_reduce(double* x, double *y)
-// for |arguments| >= 2**63
-// Address to save r and c as double
-//
-// sp+32 -> f0
-// r45 sp+16 -> f0
-// r44 -> sp -> InputX
-//
+GLOBAL_IEEE754_END(cosl)
+// *******************************************************************
+// *******************************************************************
+// *******************************************************************
+//
+// Special Code to handle very large argument case.
+// Call int __libm_pi_by_2_reduce(x,r,c) for |arguments| >= 2**63
+// The interface is custom:
+// On input:
+// (Arg or x) is in f8
+// On output:
+// r is in f8
+// c is in f9
+// N is in r8
+// Be sure to allocate at least 2 GP registers as output registers for
+// __libm_pi_by_2_reduce. This routine uses r49-50. These are used as
+// scratch registers within the __libm_pi_by_2_reduce routine (for speed).
+//
+// We know also that __libm_pi_by_2_reduce preserves f10-15, f71-127. We
+// use this to eliminate save/restore of key fp registers in this calling
+// function.
+//
+// *******************************************************************
+// *******************************************************************
+// *******************************************************************
-.proc __libm_callout
-__libm_callout:
-L(SINCOSL_ARG_TOO_LARGE):
+LOCAL_LIBM_ENTRY(__libm_callout)
+SINCOSL_ARG_TOO_LARGE:
.prologue
{ .mfi
- add r45=-32,sp // Parameter: r address
nop.f 0
.save ar.pfs,GR_SAVE_PFS
mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
-}
-{ .mfi
-.fframe 64
- add sp=-64,sp // Create new stack
- nop.f 0
- mov GR_SAVE_GP=gp // Save gp
};;
+
{ .mmi
- stfe [r45] = f0,16 // Clear Parameter r on stack
- add r44 = 16,sp // Parameter x address
+ setf.exp FR_Two_to_M3 = GR_exp_2_to_m3 // Form 2^-3
+ mov GR_SAVE_GP=gp // Save gp
.save b0, GR_SAVE_B0
mov GR_SAVE_B0=b0 // Save b0
};;
+
.body
+//
+// Call argument reduction with x in f8
+// Returns with N in r8, r in f8, c in f9
+// Assumes f71-127 are preserved across the call
+//
{ .mib
- stfe [r45] = f0,-16 // Clear Parameter c on stack
- nop.i 0
- nop.b 0
-}
-{ .mib
- stfe [r44] = FR_Input_X // Store Parameter x on stack
+ setf.exp FR_Neg_Two_to_M3 = GR_exp_m2_to_m3 // Form -(2^-3)
nop.i 0
-(p0) br.call.sptk b0=__libm_pi_by_2_reduce# ;;
+ br.call.sptk b0=__libm_pi_by_2_reduce#
};;
-{ .mii
-(p0) ldfe FR_Input_X =[r44],16
-//
-// Get r and c off stack
-//
-(p0) adds GR_Table_Base1 = -16, GR_Table_Base1
-//
-// Get r and c off stack
-//
-(p0) add GR_N_Inc = GR_Sin_or_Cos,r8 ;;
-}
-{ .mmb
-(p0) ldfe FR_r =[r45],16
-//
-// Get X off the stack
-// Readjust Table ptr
-//
-(p0) ldfs FR_Two_to_M3 = [GR_Table_Base1],4
- nop.b 999 ;;
-}
-{ .mmb
-(p0) ldfs FR_Neg_Two_to_M3 = [GR_Table_Base1],0
-(p0) ldfe FR_c =[r45]
- nop.b 999 ;;
-}
+
{ .mfi
-.restore sp
- add sp = 64,sp // Restore stack pointer
-(p0) fcmp.lt.unc.s1 p6, p0 = FR_r, FR_Two_to_M3
+ add GR_N_Inc = GR_Sin_or_Cos,r8
+ fcmp.lt.unc.s1 p6, p0 = FR_r, FR_Two_to_M3
mov b0 = GR_SAVE_B0 // Restore return address
};;
-{ .mib
+
+{ .mfi
mov gp = GR_SAVE_GP // Restore gp
+(p6) fcmp.gt.unc.s1 p6, p0 = FR_r, FR_Neg_Two_to_M3
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
- nop.b 0
};;
-{ .mfi
- nop.m 999
-(p6) fcmp.gt.unc.s1 p6, p0 = FR_r, FR_Neg_Two_to_M3
- nop.i 999 ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p6) br.cond.spnt L(SINCOSL_SMALL_R) ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p0) br.cond.sptk L(SINCOSL_NORMAL_R) ;;
-}
-.endp __libm_callout
-ASM_SIZE_DIRECTIVE(__libm_callout)
+
+{ .mbb
+ nop.m 999
+(p6) br.cond.spnt SINCOSL_SMALL_R // Branch if |r|< 2^-3 for |x| >= 2^63
+ br.cond.sptk SINCOSL_NORMAL_R // Branch if |r|>=2^-3 for |x| >= 2^63
+};;
+
+.endp
.type __libm_pi_by_2_reduce#,@function
.global __libm_pi_by_2_reduce#
diff --git a/sysdeps/ia64/fpu/s_erf.S b/sysdeps/ia64/fpu/s_erf.S
new file mode 100644
index 0000000000..8b8cc7ff83
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_erf.S
@@ -0,0 +1,924 @@
+.file "erf.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 08/15/01 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/06/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// double erf(double)
+//
+// Overview of operation
+//==============================================================
+// Background
+//
+//
+// There are 9 paths:
+// 1. x = +/-0.0
+// Return erf(x) = +/-0.0
+//
+// 2. 0.0 < |x| < 0.5
+// Return erf(x) = x *Pol9(x^2)
+//
+// 3. For several subranges of 0.5 <= |x| < 5.90625
+// Return erf(x) = sign(x)*Pol19(y),
+// where y = (|x|-b)/a, Pol19(y) = A0 + A1*y^1 + A2*y^2 + ... + A19*y^19
+//
+// For each subrange there is particular set of coefficients.
+// Below is the list of subranges:
+// 3.1 0.5 <= |x| < 1.0 b = a = 0.5
+// 3.2 1.0 <= |x| < 2.0, b = a = 1.0
+// 3.3 2.0 <= |x| < 3.25 b = a = 2.0
+// 3.4 4.0 <= |x| < 5.90625 b = 4.0, a = 2.0
+//
+// 4. 3.25 <= |x| < 4.0
+// Return erf(x) = sign(x)*Pol14(|x| - 3.25)
+//
+// 5. 5.90625 <= |x| < +INF
+// Return erf(x) = sign(x)*(1.0d - 2^(-63))
+//
+// 6. |x| = INF
+// Return erf(x) = sign(x) * 1.0
+//
+// 7. x = [S,Q]NaN
+// Return erf(x) = QNaN
+//
+// 8. x is positive denormal
+// Return erf(x) = A0*x - x^2,
+// where A0 = 2.0/sqrt(Pi)
+//
+// 9. x is negative denormal
+// Return erf(x) = A0*x + x^2,
+// where A0 = 2.0/sqrt(Pi)
+//
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input, output
+// f32 -> f63
+
+// General registers used:
+// r32 -> r48, r2, r3
+
+// Predicate registers used:
+// p0, p6 -> p15
+
+// p6 to filter out case when x = denormal
+// p7 to filter out case when x = [Q,S]NaN or +/-0,
+// used also to process denormals
+// p8 to filter out case when 3.25 <= |x| < 4.0,
+// used also to process denormals
+// p9 to filter out case when |x| = inf
+// p10 to filter out case when |x| < 0.5
+// p11 set when |x| <= 3.25 or |x| >= 4.0
+// p12 to filter out case when |x| >= 5.90625
+// p13 set if 4.0 <=|x| < 5.90625
+// p14 set to 1 for positive x
+// p15 set to 1 for negative x
+
+// Assembly macros
+//==============================================================
+rDataPtr = r2
+rDataPtr1 = r3
+
+rBias = r33
+rCoeffAddr3 = r34
+rThreeAndQ = r35
+rCoeffAddr2 = r36
+rMask = r37
+rArg = r38
+rSignBit = r39
+rAbsArg = r40
+rSaturation = r41
+rIndex = r42
+rCoeffAddr1 = r43
+rCoeffAddr4 = r44
+rShiftedArg = r45
+rShiftedArgMasked = r46
+rBiasedExpOf4 = r47
+rShiftedAbsArg = r48
+
+//==============================================================
+fA0 = f32
+fA1 = f33
+fA2 = f34
+fA3 = f35
+fA4 = f36
+fA5 = f37
+fA6 = f38
+fA7 = f39
+fA8 = f40
+fA9 = f41
+fA10 = f42
+fA11 = f43
+fA12 = f44
+fA13 = f45
+fA14 = f46
+fA15 = f47
+fA16 = f48
+fA17 = f49
+fA18 = f50
+fA19 = f51
+fArgSqr = f52
+fArgAbsNorm = f53
+fSignumX = f54
+fRes = f55
+fThreeAndQ = f56
+fArgAbs = f57
+fTSqr = f58
+fTQuadr = f59
+fTDeg3 = f60
+fTDeg7 = f61
+fArgAbsNormSgn = f62
+fTQuadrSgn = f63
+
+// Data tables
+//==============================================================
+RODATA
+
+.align 64
+
+LOCAL_OBJECT_START(erf_data)
+// Coefficients ##0..15
+// Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0
+data8 0xB69AC40646D1F6C1, 0x00003FD2 //A19
+data8 0x90AD48C0118FA10C, 0x00003FD7 //A18
+data8 0x826FBAD055EA4AB8, 0x0000BFDB //A17
+data8 0x8DAB171246CC2B89, 0x00003FDC //A16
+data8 0xC0B1D6662F8A7564, 0x00003FDF //A15
+data8 0xA46374AC35099BAF, 0x0000BFE1 //A14
+data8 0xB2F230996346EF27, 0x0000BFE4 //A13
+data8 0xCDEC50950FACE04A, 0x00003FE6 //A12
+data8 0x826014649396E9D2, 0x00003FE9 //A11
+data8 0xCDB787DC718B13F9, 0x0000BFEB //A10
+data8 0x8E0B23C24EE0C8EE, 0x0000BFED //A9
+data8 0xA49EA40A4E5A3F76, 0x00003FF0 //A8
+data8 0xB11E30BE912617D3, 0x00003FF0 //A7
+data8 0xCCF89D9351CE26E3, 0x0000BFF4 //A6
+data8 0xEFF75AD1F0F22809, 0x00003FF2 //A5
+data8 0xBB793EF404C09A22, 0x00003FF8 //A4
+// Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0
+data8 0xBAE93FF4174EA59B, 0x00003FE6 //A19
+data8 0x8A0FD46092F95D44, 0x0000BFEA //A18
+data8 0xA37B3242B7809E12, 0x00003FEC //A17
+data8 0xA0330A5CD2E91689, 0x0000BFED //A16
+data8 0x8E34A678F3497D17, 0x0000BFEC //A15
+data8 0xAC185D45A2772384, 0x00003FEF //A14
+data8 0xB0C11347CE7EEDE8, 0x00003FEF //A13
+data8 0xD3330DC14EA0E4EB, 0x0000BFF2 //A12
+data8 0xB4A6DFDE578A428F, 0x00003FF1 //A11
+data8 0xA0B4034310D2D9CB, 0x00003FF5 //A10
+data8 0xF71662D3132B7759, 0x0000BFF5 //A9
+data8 0x9C88BF157695E9EC, 0x0000BFF7 //A8
+data8 0xF84B80EFCA43895D, 0x00003FF8 //A7
+data8 0x9722D22DA628A17B, 0x00003FF7 //A6
+data8 0x8DB0A586F8F3381F, 0x0000BFFB //A5
+data8 0x8DB0A5879F87E5BE, 0x00003FFB //A4
+// Polynomial coefficients for the erf(x), 2.0 <= |x| < 3.25
+data8 0x9C4AF1F3A4B21AFC, 0x00003FF6 //A19
+data8 0x8D40D5D5DB741AB8, 0x0000BFF9 //A18
+data8 0xDEBE7099E0A75BA4, 0x00003FFA //A17
+data8 0xB99A33294D32429D, 0x0000BFFB //A16
+data8 0x8109D9C7197BC7C9, 0x00003FFB //A15
+data8 0xC30DE8E2EFC2D760, 0x00003FFA //A14
+data8 0x80DDA28C5B35DC73, 0x0000BFFC //A13
+data8 0x9BE4DE5095BACE0D, 0x00003FF9 //A12
+data8 0xDA4092509EE7D111, 0x00003FFC //A11
+data8 0x89D98C561B0C9040, 0x0000BFFD //A10
+data8 0xD20B26EB2F0881D4, 0x0000BFF9 //A9
+data8 0xD089C56948731561, 0x00003FFD //A8
+data8 0xDD704DEFFB21B7E7, 0x0000BFFD //A7
+data8 0xF0C9A6BBDE469115, 0x00003FF9 //A6
+data8 0xD673A02CB5766633, 0x00003FFD //A5
+data8 0x8D162CBAD8A12649, 0x0000BFFE //A4
+// Polynomial coefficients for the erf(x), 4.0 <= |x| < 6.0
+data8 0xD4428B75C6FE8FD1, 0x0000BFFC //A19
+data8 0xF76BE1935675D5C8, 0x00003FFE //A18
+data8 0xFD6BB3B14AA7A8E6, 0x0000BFFF //A17
+data8 0x8BE8F573D348DDA4, 0x00004000 //A16
+data8 0x81E91923A1030502, 0x0000BFFF //A15
+data8 0xCE7FE87B26CFD286, 0x0000BFFE //A14
+data8 0x84EF6B4E17404384, 0x00004000 //A13
+data8 0x91FEF33015404991, 0x0000C000 //A12
+data8 0xDEDF6A9370747E56, 0x00003FFF //A11
+data8 0x8397E6FF56CDFD9D, 0x0000BFFF //A10
+data8 0xFAD1CE912473937B, 0x00003FFD //A9
+data8 0xC48C1EA8AAA624EA, 0x0000BFFC //A8
+data8 0xFECAF0097ACF981B, 0x00003FFA //A7
+data8 0x8829A394065E4B95, 0x0000BFF9 //A6
+data8 0xED3003E477A53EE7, 0x00003FF6 //A5
+data8 0xA4C07E9BB3FCB0F3, 0x0000BFF4 //A4
+//
+// Coefficients ##16..19
+// Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0
+data8 0x95FA98C337005D13, 0x0000BFF9 //A3
+data8 0xE0F7E524D2808A97, 0x0000BFFB //A2
+data8 0xE0F7E524D2808A98, 0x00003FFD //A1
+data8 0x853F7AE0C76E915F, 0x00003FFE //A0
+// Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0
+data8 0x8DB0A587A96ABCF0, 0x00003FFC //A3
+data8 0xD488F84B7DE18DA8, 0x0000BFFD //A2
+data8 0xD488F84B7DE12E9C, 0x00003FFD //A1
+data8 0xD7BB3D3A08445636, 0x00003FFE //A0
+// Polynomial coefficients for the erf(x), 2.0 <= |x| < 3.25
+data8 0xC58571D23D5C4B3A, 0x00003FFD //A3
+data8 0xA94DCF467CD6AFF3, 0x0000BFFC //A2
+data8 0xA94DCF467CD10A16, 0x00003FFA //A1
+data8 0xFECD70A13CAF1997, 0x00003FFE //A0
+// Polynomial coefficients for the erf(x), 4.0 <= |x| < 6.0
+data8 0xB01D2B4F0D5AB8B0, 0x00003FF1 //A3
+data8 0x8858A465CE594BD1, 0x0000BFEE //A2
+data8 0x8858A447456DE61D, 0x00003FEA //A1
+data8 0xFFFFFFBDC88BB107, 0x00003FFE //A0
+// Polynomial coefficients for the erf(x), 0.0 <= |x| < 0.5
+data8 0xBE839EDBB36C7FCE //A9
+data8 0x3EBB7745A18DD242 //A8
+data8 0xBF4C02DB238F2AFC //A5
+data8 0x3F7565BCD0A9A3EA //A4
+data8 0xC093A3581BCF3333, 0x0000BFFD //A1
+data8 0xBEEF4BB82AD8AE22 //A7
+data8 0x3F1F9A2A57A218CD //A6
+data8 0xBF9B82CE3127F4E4 //A3
+data8 0x3FBCE2F21A042B25 //A2
+data8 0x906EBA8214DB688D, 0x00003FFF //A0
+// 1.0 - 2^(-63)
+data8 0xFFFFFFFFFFFFFFFF, 0x00003FFE
+// Polynomial coefficients for the erf(x), 3.25 <= |x| < 4.0
+data8 0x95E91576C7A12250, 0x00003FE7 //A14
+data8 0x8E5E0D0E1F5D3CB5, 0x0000BFEA //A13
+data8 0xED761DAFAF814DE9, 0x00003FEB //A12
+data8 0xB3A77D921D0ACFC7, 0x0000BFEC //A11
+data8 0xA662D27096B08D7C, 0x0000BFEC //A10
+data8 0xDA0F410AE6233EA5, 0x00003FEF //A9
+data8 0xAB4A8B16B3124327, 0x0000BFF1 //A8
+data8 0xB241E236A5EDCED3, 0x00003FF2 //A7
+data8 0x8A2A65BA1F551F77, 0x0000BFF3 //A6
+data8 0xA4852D0B1D87000A, 0x00003FF3 //A5
+data8 0x963EB00039489476, 0x0000BFF3 //A4
+data8 0xCD5244FF4F7313A5, 0x00003FF2 //A3
+data8 0xC6F1E695363BCB26, 0x0000BFF1 //A2
+data8 0xF4DAF4680DA54C02, 0x00003FEF //A1
+data8 0xFFFFB7CFB3F2ABBE, 0x00003FFE //A0
+// A = 2.0/sqrt(Pi)
+data8 0x906EBA8214DB688D, 0x00003FFF
+LOCAL_OBJECT_END(erf_data)
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(erf)
+// double erf(double): x arrives in f8 and the result is returned in f8
+{ .mfi
+ alloc r32 = ar.pfs, 0, 17, 0, 0
+ fmerge.se fArgAbsNorm = f1, f8 // normalized x: significand of x, sign and exponent of 1.0
+ adds rSignBit = 0x1, r0
+}
+{ .mfi
+ addl rDataPtr = @ltoff(erf_data), gp
+ fma.s1 fArgSqr = f8, f8, f0 // x^2
+ addl rThreeAndQ = 0x400A0, r0 // shifted bits of 3.25
+}
+;;
+{ .mfi
+ getf.d rArg = f8 // x in GR
+ fclass.m p6,p0 = f8, 0x0b // is x denormal ?
+ shl rThreeAndQ = rThreeAndQ, 44 // bits of 3.25
+}
+{ .mfi
+ ld8 rDataPtr = [rDataPtr] // base address used for all erf_data offsets below
+ nop.f 0
+ addl rBiasedExpOf4 = 0x40100, r0 // shifted bits of 4.0
+}
+;;
+{ .mfi
+ addl rSaturation = 0x4017A, r0 // shifted bits of 5.90625
+ fclass.m p7,p0 = f8, 0xc7 // is x [S,Q]NaN or +/-0 ?
+ shl rSignBit = rSignBit, 63 // mask for sign bit
+}
+{ .mfi
+ addl rMask = 0x7FF00, r0 // mask for the exponent field of (x bits >> 44)
+ nop.f 0
+ addl rBias = 0x3FE00, r0 // biased exponent of 0.5, << 8
+}
+;;
+{ .mfi
+ setf.d fThreeAndQ = rThreeAndQ // 3.25 in FP register
+ fclass.m p9,p0 = f8, 0x23 // is x +/- inf?
+ shr.u rShiftedArg = rArg, 44 // sign, exponent and top 8 mantissa bits of x
+}
+{ .mfb
+ andcm rAbsArg = rArg, rSignBit // |x| in GR
+ nop.f 0
+(p6) br.cond.spnt erf_denormal // branch out if x is denormal
+}
+;;
+{ .mfi
+ and rShiftedArgMasked = rShiftedArg, rMask // biased exponent of x, << 8
+ fmerge.s fArgAbs = f1, f8 // |x|
+ shr rShiftedAbsArg = rAbsArg, 44
+}
+{ .mfb
+ cmp.lt p8, p11 = rThreeAndQ, rAbsArg // p8 = 1 if |x| > 3.25, else p11 = 1
+(p7) fma.d.s0 f8 = f8,f1,f8 // NaN or +/-0: result is x*1.0 + x
+(p7) br.ret.spnt b0 // exit for x = NaN or +/-0
+}
+;;
+{ .mfi
+ sub rIndex = rShiftedArgMasked, rBias // index << 8
+ nop.f 0
+ cmp.lt p10, p0 = rShiftedArgMasked, rBias // p10 = 1 if |x| < 0.5
+}
+{ .mfb
+ // now p8 = 1 only if 3.25 < |x| < 4.0; p11 = 1 otherwise
+(p8) cmp.lt p8, p11 = rShiftedAbsArg, rBiasedExpOf4
+ fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f1 // y = (normalized x) - 1.0
+(p10) br.cond.spnt erf_near_zero // branch out if |x| < 0.5
+}
+;;
+.pred.rel "mutex", p8, p11
+{ .mfi
+(p8) adds rCoeffAddr1 = 1392, rDataPtr // coeff. for 3.25 <=|x|<4.0
+(p9) fmerge.s f8 = f8,f1 // +/- inf: sign of x, magnitude 1.0
+ nop.i 0
+}
+{ .mfb
+(p11) add rCoeffAddr1 = rDataPtr, rIndex// coeff. ##0,2,..14
+ nop.f 0
+(p9) br.ret.spnt b0 // exit for x = +/- inf
+}
+;;
+{ .mfi
+ adds rCoeffAddr2 = 16, rCoeffAddr1 // coeff. ##1,3,..15
+ fmerge.s fSignumX = f8, f1 // signum(x)
+ nop.i 0
+}
+{ .mfb
+ cmp.lt p12, p0 = rSaturation, rShiftedAbsArg // |x| > 5.90625?
+ nop.f 0
+(p12) br.cond.spnt erf_saturation // branch out to the saturation path
+}
+;;
+// Here if paths #3,4
+// if path #4 we'll branch out after loading of 14 necessary coefficients
+{.mfi
+ ldfe fA19 = [rCoeffAddr1], 32
+ nop.f 0
+ nop.i 0
+}
+{.mfi
+ ldfe fA18 = [rCoeffAddr2], 32
+ nop.f 0
+ adds rCoeffAddr3 = 1024, rDataPtr
+}
+;;
+{.mfi
+ ldfe fA17 = [rCoeffAddr1], 32
+ nop.f 0
+ nop.i 0
+}
+{.mfi
+ ldfe fA16 = [rCoeffAddr2], 32
+ nop.f 0
+ nop.i 0
+}
+;;
+{.mfi
+ ldfe fA15 = [rCoeffAddr1], 32
+ fma.s1 fTSqr = fArgAbsNorm, fArgAbsNorm, f0
+ shr.u rIndex = rIndex, 2
+}
+{.mfi
+ ldfe fA14 = [rCoeffAddr2], 32
+ nop.f 0
+ adds rCoeffAddr4 = 16, r0
+}
+;;
+{.mfi
+ ldfe fA13 = [rCoeffAddr1], 32
+ nop.f 0
+ // address of coefficients ##16..23
+ add rCoeffAddr3 = rCoeffAddr3, rIndex
+}
+{.mfi
+ ldfe fA12 = [rCoeffAddr2], 32
+ nop.f 0
+ cmp.lt p15, p14 = rArg, r0
+}
+;;
+{.mfi
+ ldfe fA11 = [rCoeffAddr1], 32
+ nop.f 0
+ add rCoeffAddr4 = rCoeffAddr3, rCoeffAddr4
+}
+{.mfi
+ ldfe fA10 = [rCoeffAddr2], 32
+ nop.f 0
+ nop.i 0
+}
+;;
+{.mfi
+ ldfe fA9 = [rCoeffAddr1], 32
+ nop.f 0
+ nop.i 0
+}
+{.mfi
+ ldfe fA8 = [rCoeffAddr2], 32
+ nop.f 0
+ nop.i 0
+}
+;;
+{.mfi
+ ldfe fA7 = [rCoeffAddr1], 32
+ fms.s1 fArgAbs = fArgAbs, f1, fThreeAndQ
+ nop.i 0
+}
+{.mfb
+ ldfe fA6 = [rCoeffAddr2], 32
+ nop.f 0
+(p8) br.cond.spnt erf_3q_4 // branch out if 3.25 < |x| < 4.0
+}
+;;
+{.mfi
+ ldfe fA5 = [rCoeffAddr1], 32
+ fma.s1 fTDeg3 = fArgAbsNorm, fTSqr, f0
+ nop.i 0
+}
+{.mfi
+ ldfe fA4 = [rCoeffAddr2], 32
+ fma.s1 fTQuadr = fTSqr, fTSqr, f0
+ nop.i 0
+}
+;;
+// Path #3 Polynomial Pol19(y) computation; y = fArgAbsNorm
+{.mfi
+ ldfe fA3 = [rCoeffAddr3], 32
+ fma.s1 fArgAbsNormSgn = fArgAbsNorm, fSignumX, f0
+ nop.i 0
+}
+{.mfi
+ ldfe fA2 = [rCoeffAddr4], 32
+ nop.f 0
+ nop.i 0
+}
+;;
+{.mfi
+ ldfe fA1 = [rCoeffAddr3], 32
+ fma.s1 fRes = fA19, fArgAbsNorm, fA18
+ nop.i 0
+}
+{.mfi
+ ldfe fA0 = [rCoeffAddr4], 32
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA17 = fA17, fArgAbsNorm, fA16
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, fArgAbsNorm, fA14
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fTDeg7 = fTDeg3, fTQuadr, f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA13 = fA13, fArgAbsNorm, fA12
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, fArgAbsNorm, fA10
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, fArgAbsNorm, fA8
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fRes, fTSqr, fA17
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA7 = fA7, fArgAbsNorm, fA6
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA5 = fA5, fArgAbsNorm, f0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, fTSqr, fA13
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA4 = fA4, fArgAbsNorm, fA3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA2 = fA2, fArgAbsNorm, fA1
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, fTSqr, fA9
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA7 = fA7, fTSqr, fA5
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fRes, fTQuadr, fA15
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA4 = fA4, fTSqr, fA2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fRes, fTQuadr, fA11
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA4 = fA7, fTDeg3, fA4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fRes, fTDeg7, fA4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // result for negative argument
+(p15) fms.d.s0 f8 = fRes, fArgAbsNormSgn, fA0
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ // result for positive argument
+(p14) fma.d.s0 f8 = fRes, fArgAbsNormSgn, fA0
+ br.ret.sptk b0
+}
+
+// Here if 3.25 < |x| < 4.0; t = fArgAbs = |x| - 3.25, and fA19..fA5 hold this path's table coefficients A14..A0
+.align 32
+erf_3q_4:
+.pred.rel "mutex", p14, p15
+{ .mfi
+ ldfe fA5 = [rCoeffAddr1], 32 // last table entry of this path (A0)
+ fma.s1 fTSqr = fArgAbs, fArgAbs, f0 // t^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fA19, fArgAbs, fA18 // A14*t + A13
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA17 = fA17, fArgAbs, fA16 // A12*t + A11
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, fArgAbs, fA14 // A10*t + A9
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA13 = fA13, fArgAbs, fA12 // A8*t + A7
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, fArgAbs, fA10 // A6*t + A5
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, fArgAbs, fA8 // A4*t + A3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fArgAbsNormSgn = fArgAbs, fSignumX, f0 // sign(x)*t
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fTQuadr = fTSqr, fTSqr, f0 // t^4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fRes, fTSqr, fA17 // terms A14..A11
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, fTSqr, fA13 // terms A10..A7
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, fTSqr, fA9 // terms A6..A3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA7 = fA7, fArgAbs, fA6 // A2*t + A1
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fTDeg7 = fTQuadr, fTSqr, f0 // t^4 * t^2 = t^6 (despite the register name)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fRes, fTQuadr, fA15 // terms A14..A7
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, fTSqr, fA7 // terms A6..A1
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fRes, fTDeg7, fA11 // terms A14..A1 (still to be multiplied by t)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // result for negative argument: -(fRes*t + A0)
+(p15) fms.d.s0 f8 = fRes, fArgAbsNormSgn, fA5
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ // result for positive argument: fRes*t + A0
+(p14) fma.d.s0 f8 = fRes, fArgAbsNormSgn, fA5
+ br.ret.sptk b0
+}
+;;
+
+// Here if |x| < 0.5: erf(x) = x*Pol9(x^2), evaluated as x*(A0..A3 part) + x^9*(A4..A9 part)
+.align 32
+erf_near_zero:
+{ .mfi
+ adds rCoeffAddr1 = 1280, rDataPtr // address of A9
+ fma.s1 fTSqr = fArgSqr, fArgSqr, f0 // x^4
+ nop.i 0
+}
+{ .mfi
+ adds rCoeffAddr2 = 1328, rDataPtr // address of A7
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA9, fA8 = [rCoeffAddr1], 16 // pair of doubles: A9, A8
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA7, fA6 = [rCoeffAddr2], 16 // pair of doubles: A7, A6
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA5, fA4 = [rCoeffAddr1], 16 // pair of doubles: A5, A4
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA3, fA2 = [rCoeffAddr2], 16 // pair of doubles: A3, A2
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fA1 = [rCoeffAddr1] // A1 (extended precision)
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe fA0 = [rCoeffAddr2] // A0 (extended precision)
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fTQuadr = fTSqr, fTSqr, f0 // x^8
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fA9, fArgSqr, fA8 // A9*x^2 + A8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA7 = fA7, fArgSqr, fA6 // A7*x^2 + A6
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA3 = fA3, fArgSqr, fA2 // A3*x^2 + A2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA5 = fA5, fArgSqr, fA4 // A5*x^2 + A4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA1 = fA1, fArgSqr, fA0 // A1*x^2 + A0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fTQuadrSgn = fTQuadr, f8, f0 // x^9 (carries the sign of x)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fRes, fTSqr, fA7 // terms A9..A6
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA1 = fA3, fTSqr, fA1 // terms A3..A0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fRes, fTSqr, fA5 // terms A9..A4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA1 = fA1, f8, f0 // x * (terms A3..A0)
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ fma.d.s0 f8 = fRes, fTQuadrSgn, fA1 // x*Pol9(x^2)
+ br.ret.sptk b0 // Exit for |x| < 0.5
+};;
+
+// Here if 5.90625 <= |x| < +inf
+.align 32
+erf_saturation:
+{ .mfi
+ adds rDataPtr = 1376, rDataPtr // address of the constant 1.0 - 2^(-63)
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fA0 = [rDataPtr] // fA0 = 1.0 - 2^(-63)
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ fma.d.s0 f8 = fA0, fSignumX, f0 // sign(x)*(1.0 - 2^(-63))
+ // fSignumX was set by the caller path to sign(x) with magnitude 1.0
+ br.ret.sptk b0 // Exit for 5.90625 <=|x|< +inf
+}
+;;
+
+// Here if x is double precision denormal: erf(x) = A0*x -/+ x^2 with A0 = 2.0/sqrt(Pi)
+.align 32
+erf_denormal:
+{ .mfi
+ adds rDataPtr = 1632, rDataPtr // address of A0 = 2.0/sqrt(Pi), last entry of erf_data
+ fclass.m p7,p8 = f8, 0x0a // is x -denormal ? (p7 = negative, p8 = positive)
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fA0 = [rDataPtr] // A0
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA0 = fA0,f8,f0 // A0*x
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+(p7) fma.d.s0 f8 = f8,f8,fA0 // -denormal: A0*x + x^2
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p8) fnma.d.s0 f8 = f8,f8,fA0 // +denormal: A0*x - x^2
+ br.ret.sptk b0 // Exit for denormal
+}
+;;
+
+GLOBAL_LIBM_END(erf)
+
diff --git a/sysdeps/ia64/fpu/s_erfc.S b/sysdeps/ia64/fpu/s_erfc.S
new file mode 100644
index 0000000000..8b223275c7
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_erfc.S
@@ -0,0 +1,1197 @@
+.file "erfc.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 11/12/01 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/06/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// double erfc(double)
+//
+// Overview of operation
+//==============================================================
+// 1. 0 <= x <= 28.0
+//
+// erfc(x) = P14(z) * exp( -x^2 ), z = x - x(i).
+//
+// Comment:
+//
+// Let x(i) = -1.0 + 2^(i/4),i=0,...19. So we have 20 unequal
+// argument intervals [x(i),x(i+1)] with length ratio q = 2^(1/4).
+// The values x(i) are stored in the table erfc_xb_table.
+//
+// Let x(i)<= x < x(i+1).
+// We can find i as exponent of number (x + 1)^4.
+//
+// Let P14(z) - polynomial approximation of degree 14 for function
+// erfc(z+x(i)) * exp( (z+x(i))^2) and 0 <= z <= x(i+1)-x(i).
+// The polynomial coefficients are stored in the table erfc_p_table.
+//
+// So we can find result for erfc(x) as above.
+// Algorithm description for exp function see below.
+//
+// 2. -6 <= x < 0
+//
+// erfc(x) = 2.0 - erfc(-x)
+//
+// 3. x > 28.0
+// erfc(x) ~=~ 0.0
+//
+// 4. x < -6.0
+// erfc(x) ~=~ 2.0
+
+// Special values
+//==============================================================
+// erfc(+0) = 1.0
+// erfc(-0) = 1.0
+
+// erfc(+qnan) = +qnan
+// erfc(-qnan) = -qnan
+// erfc(+snan) = +qnan
+// erfc(-snan) = -qnan
+
+// erfc(-inf) = 2.0
+// erfc(+inf) = +0
+
+//==============================================================
+// Take double exp(double) from libm_64.
+//
+// Overview of operation
+//==============================================================
+// Take the input x. w is "how many log2/128 in x?"
+// w = x * 128/log2
+// n = int(w)
+// x = n log2/128 + r + delta
+
+// n = 128M + index_1 + 2^4 index_2
+// x = M log2 + (log2/128) index_1 + (log2/8) index_2 + r + delta
+
+// exp(x) = 2^M 2^(index_1/128) 2^(index_2/8) exp(r) exp(delta)
+// Construct 2^M
+// Get 2^(index_1/128) from table_1;
+// Get 2^(index_2/8) from table_2;
+// Calculate exp(r) by series
+// r = x - n (log2/128)_high
+// delta = - n (log2/128)_low
+// Calculate exp(delta) as 1 + delta
+//==============================================================
+// Comment for exp for erfc:
+//
+// We use quad precision to calculate the input argument -x^2 and add
+// the low bits of the result to the value delta in exp.
+
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input
+// f9 -> f15, f32 -> f93
+
+// General registers used:
+// r32 -> r68
+
+// Predicate registers used:
+// p6 -> p15
+
+// Assembly macros
+//==============================================================
+
+exp_GR_rshf = r33
+EXP_AD_TB1 = r34
+EXP_AD_TB2 = r35
+EXP_AD_P = r36
+exp_GR_N = r37
+exp_GR_index_1 = r38
+exp_GR_index_2_16 = r39
+exp_GR_biased_M = r40
+EXP_AD_T1 = r41
+EXP_AD_T2 = r42
+exp_GR_sig_inv_ln2 = r43
+exp_GR_17ones = r44
+exp_TB1_size = r45
+exp_TB2_size = r46
+exp_GR_rshf_2to56 = r47
+exp_GR_exp_2tom56 = r48
+
+// GR for erfc(x)
+//==============================================================
+GR_POS_ARG_ASYMP = r49
+GR_NEG_ARG_ASYMP = r50
+GR_ARG_ASYMP = r51
+GR_ERFC_XB_TB = r52
+GR_ERFC_P_TB = r53
+GR_IndxPlusBias = r54
+GR_BIAS = r55
+GR_P_A12 = r56
+GR_P_A13 = r57
+GR_AbsArg = r58
+GR_ShftXBi = r59
+GR_ShftPi = r60
+GR_mBIAS = r61
+GR_ShftPi_bias = r62
+GR_ShftXBi_bias = r63
+GR_ShftA12 = r64
+GR_ShftA13 = r65
+GR_EpsNorm = r66
+GR_0x1 = r67
+GR_ShftPi_8 = r68
+
+// GR for __libm_support call
+
+//==============================================================
+
+GR_SAVE_B0 = r61
+GR_SAVE_PFS = r62
+GR_SAVE_GP = r63
+GR_SAVE_SP = r64
+
+GR_Parameter_X = r65
+GR_Parameter_Y = r66
+GR_Parameter_RESULT = r67
+GR_Parameter_TAG = r68
+
+
+// FR for exp(-x^2)
+//==============================================================
+FR_X = f10
+FR_Y = f1
+FR_RESULT = f8
+
+EXP_2TOM56 = f6
+EXP_INV_LN2_2TO63 = f7
+EXP_W_2TO56_RSH = f9
+EXP_RSHF_2TO56 = f10
+
+exp_P4 = f11
+exp_P3 = f12
+exp_P2 = f13
+exp_P1 = f14
+exp_ln2_by_128_hi = f15
+
+exp_ln2_by_128_lo = f32
+EXP_RSHF = f33
+EXP_Nfloat = f34
+exp_r = f35
+exp_f = f36
+exp_rsq = f37
+exp_rcube = f38
+EXP_2M = f39
+exp_S1 = f40
+exp_T1 = f41
+exp_rP4pP3 = f42
+exp_P_lo = f43
+exp_P_hi = f44
+exp_P = f45
+exp_S = f46
+EXP_NORM_f8 = f47
+exp_S2 = f48
+exp_T2 = f49
+
+// FR for erfc(x)
+//==============================================================
+FR_AbsArg = f50
+FR_Tmp = f51
+FR_Xb = f52
+FR_A0 = f53
+FR_A1 = f54
+FR_A2 = f55
+FR_A3 = f56
+FR_A4 = f57
+FR_A5 = f58
+FR_A6 = f59
+FR_A7 = f60
+FR_A8 = f61
+FR_A9 = f62
+FR_A10 = f63
+FR_A11 = f64
+FR_A12 = f65
+FR_A13 = f66
+FR_A14 = f67
+
+FR_P14_0_1 = f68
+FR_P14_0_2 = f69
+FR_P14_1_1 = f70
+FR_P14_1_2 = f71
+FR_P14_2_1 = f72
+FR_P14_2_2 = f73
+FR_P14_3_1 = f74
+FR_P14_3_2 = f75
+FR_P14_6_1 = f76
+
+FR_P14_7_1 = f77
+FR_P14_7_2 = f78
+FR_P14_8_1 = f79
+FR_P14_8_2 = f80
+FR_P14_12_1 = f81
+FR_P14_13_1 = f82
+FR_P14_13_2 = f83
+FR_Pol = f84
+FR_Exp = f85
+FR_2 = f86
+f8_sq_lo = f87
+FR_LocArg = f88
+FR_Tmpf = f89
+FR_Tmp1 = f90
+FR_EpsNorm = f91
+FR_UnfBound = f92
+FR_NormX = f93
+
+
+// Data tables
+//==============================================================
+RODATA
+.align 16
+
+// ************* DO NOT CHANGE ORDER OF THESE TABLES ********************
+
+LOCAL_OBJECT_START(exp_table_1)
+// Three 16-byte entries come first; the code reads them with post-incrementing loads (ldfd, ldfe, ldfe)
+data8 0x403a8b12fc6e4892 , 0 // underflow boundary
+data8 0xb17217f7d1cf79ab , 0x00003ff7 // ln2/128 hi
+data8 0xc9e3b39803f2f6af , 0x00003fb7 // ln2/128 lo
+// After those three loads the table pointer rests on the first entry below.
+// Table 1 is 2^(index_1/128) where
+// index_1 goes from 0 to 15
+// (16 entries * 16 bytes = 0x100 bytes, the value of exp_TB1_size)
+data8 0x8000000000000000 , 0x00003FFF
+data8 0x80B1ED4FD999AB6C , 0x00003FFF
+data8 0x8164D1F3BC030773 , 0x00003FFF
+data8 0x8218AF4373FC25EC , 0x00003FFF
+data8 0x82CD8698AC2BA1D7 , 0x00003FFF
+data8 0x8383594EEFB6EE37 , 0x00003FFF
+data8 0x843A28C3ACDE4046 , 0x00003FFF
+data8 0x84F1F656379C1A29 , 0x00003FFF
+data8 0x85AAC367CC487B15 , 0x00003FFF
+data8 0x8664915B923FBA04 , 0x00003FFF
+data8 0x871F61969E8D1010 , 0x00003FFF
+data8 0x87DB357FF698D792 , 0x00003FFF
+data8 0x88980E8092DA8527 , 0x00003FFF
+data8 0x8955EE03618E5FDD , 0x00003FFF
+data8 0x8A14D575496EFD9A , 0x00003FFF
+data8 0x8AD4C6452C728924 , 0x00003FFF
+LOCAL_OBJECT_END(exp_table_1)
+
+// Table 2 is 2^(index_2/8) where
+// index_2 goes from 0 to 7 (8 entries * 16 bytes = 0x80 bytes, the value of exp_TB2_size)
+LOCAL_OBJECT_START(exp_table_2)
+data8 0x8000000000000000 , 0x00003FFF
+data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF
+data8 0x9837F0518DB8A96F , 0x00003FFF
+data8 0xA5FED6A9B15138EA , 0x00003FFF
+data8 0xB504F333F9DE6484 , 0x00003FFF
+data8 0xC5672A115506DADD , 0x00003FFF
+data8 0xD744FCCAD69D6AF4 , 0x00003FFF
+data8 0xEAC0C6E7DD24392F , 0x00003FFF
+LOCAL_OBJECT_END(exp_table_2)
+
+LOCAL_OBJECT_START(exp_p_table)
+data8 0x3f8111116da21757 //P_4 (double, close to 1/120); read together with P_3 by one ldfpd
+data8 0x3fa55555d787761c //P_3 (double, close to 1/24)
+data8 0x3fc5555555555414 //P_2 (double, close to 1/6); read together with P_1 by one ldfpd
+data8 0x3fdffffffffffd6a //P_1 (double, close to 1/2)
+LOCAL_OBJECT_END(exp_p_table)
+
+LOCAL_OBJECT_START(erfc_xb_table)
+data8 0x0000000000000000, 0x00000000 //XB[0] = +0.00000000000000000000e-01L; the XB[i] listed here equal 2^(i/4) - 1
+data8 0xC1BF828C6DC54B7A, 0x00003FFC //XB[1] = +1.89207115002721066717e-01L
+data8 0xD413CCCFE7799211, 0x00003FFD //XB[2] = +4.14213562373095048802e-01L
+data8 0xAE89F995AD3AD5E8, 0x00003FFE //XB[3] = +6.81792830507429086062e-01L
+data8 0x8000000000000000, 0x00003FFF //XB[4] = +1.00000000000000000000e+00L
+data8 0xB06FE0A31B7152DF, 0x00003FFF //XB[5] = +1.37841423000544213343e+00L
+data8 0xEA09E667F3BCC909, 0x00003FFF //XB[6] = +1.82842712474619009760e+00L
+data8 0x9744FCCAD69D6AF4, 0x00004000 //XB[7] = +2.36358566101485817212e+00L
+data8 0xC000000000000000, 0x00004000 //XB[8] = +3.00000000000000000000e+00L
+data8 0xF06FE0A31B7152DF, 0x00004000 //XB[9] = +3.75682846001088426687e+00L
+data8 0x9504F333F9DE6484, 0x00004001 //XB[10] = +4.65685424949238019521e+00L
+data8 0xB744FCCAD69D6AF4, 0x00004001 //XB[11] = +5.72717132202971634425e+00L
+data8 0xE000000000000000, 0x00004001 //XB[12] = +7.00000000000000000000e+00L
+data8 0x8837F0518DB8A96F, 0x00004002 //XB[13] = +8.51365692002176853374e+00L
+data8 0xA504F333F9DE6484, 0x00004002 //XB[14] = +1.03137084989847603904e+01L
+data8 0xC744FCCAD69D6AF4, 0x00004002 //XB[15] = +1.24543426440594326885e+01L
+data8 0xF000000000000000, 0x00004002 //XB[16] = +1.50000000000000000000e+01L
+data8 0x9037F0518DB8A96F, 0x00004003 //XB[17] = +1.80273138400435370675e+01L
+data8 0xAD04F333F9DE6484, 0x00004003 //XB[18] = +2.16274169979695207808e+01L
+data8 0xCF44FCCAD69D6AF4, 0x00004003 //XB[19] = +2.59086852881188653770e+01L; 20 entries * 16 bytes = 0x140, the offset the code adds to reach the P table
+LOCAL_OBJECT_END(erfc_xb_table)
+
+LOCAL_OBJECT_START(erfc_p_table)
+// 20 coefficient sets Pol0..Pol19; each set is 15 entries (A0..A14) of 16 bytes = 240 bytes, the (256-16)*i stride the code uses to address set i
+// Pol0
+data8 0x8000000000000000, 0x00003FFF //A0 = +1.00000000000000000000e+00L
+data8 0x906EBA8214DB688D, 0x0000BFFF //A1 = -1.12837916709551257389e+00L
+data8 0xFFFFFFFFFFFFFFEB, 0x00003FFE //A2 = +9.99999999999999998841e-01L
+data8 0xC093A3581BCF2925, 0x0000BFFE //A3 = -7.52252778063674869885e-01L
+data8 0xFFFFFFFFFFF7CDBD, 0x00003FFD //A4 = +4.99999999999985440383e-01L
+data8 0x9A0FB5E014AE3EFB, 0x0000BFFD //A5 = -3.00901111224757482205e-01L
+data8 0xAAAAAAAA4672B0BD, 0x00003FFC //A6 = +1.66666666643879582201e-01L
+data8 0xB011F45C9F590FC0, 0x0000BFFB //A7 = -8.59717455640916223912e-02L
+data8 0xAAAAA89474161033, 0x00003FFA //A8 = +4.16666588928413935202e-02L
+data8 0x9C818E2CE37D4214, 0x0000BFF9 //A9 = -1.91047455656271634308e-02L
+data8 0x8885969315AB76A1, 0x00003FF8 //A10 = +8.33263115449753085659e-03L
+data8 0xE36112A686F5165B, 0x0000BFF6 //A11 = -3.46953111013788405745e-03L
+data8 0xB3DD6B2DB3307D2E, 0x00003FF5 //A12 = +1.37226041156280127011e-03L
+data8 0x8018A34267FED226, 0x0000BFF4 //A13 = -4.88648380816410282971e-04L
+data8 0xFBBA6A7AEBD3ABD9, 0x00003FF1 //A14 = +1.20033353451879025825e-04L
+// Pol1
+data8 0xD15A1EF03BB91E71, 0x00003FFE //A0 = +8.17781385088640600540e-01L
+data8 0xD1A4ADDAC3337118, 0x0000BFFE //A1 = -8.18919053944410683867e-01L
+data8 0xA9AF9FFA2AD18CB0, 0x00003FFE //A2 = +6.62836073471060947628e-01L
+data8 0xECB77514F0F151B3, 0x0000BFFD //A3 = -4.62337168508812580002e-01L
+data8 0x934AB35EA5CD5EEB, 0x00003FFD //A4 = +2.87679295829458273854e-01L
+data8 0xA71410A68C1EF21C, 0x0000BFFC //A5 = -1.63162479558223113104e-01L
+data8 0xAF506A335238094A, 0x00003FFB //A6 = +8.56025978958108330224e-02L
+data8 0xABFDF67F968765A7, 0x0000BFFA //A7 = -4.19902447551140139048e-02L
+data8 0x9F0B0165A6CDCA99, 0x00003FF9 //A8 = +1.94144274984979538382e-02L
+data8 0x8B8197BFC346CDEA, 0x0000BFF8 //A9 = -8.51478404279186775501e-03L
+data8 0xE950D763FE51AB1E, 0x00003FF6 //A10 = +3.56011637267641495904e-03L
+data8 0xBA821A59FC05FBAD, 0x0000BFF5 //A11 = -1.42294475244146555952e-03L
+data8 0x8D535042E11A0D89, 0x00003FF4 //A12 = +5.39113782651680545599e-04L
+data8 0xBE589447DB26564E, 0x0000BFF2 //A13 = -1.81528103431449706486e-04L
+data8 0xABC8C7EF636F5B0A, 0x00003FF0 //A14 = +4.09565689009869217620e-05L
+// Pol2
+data8 0xA9973ABB272898B2, 0x00003FFE //A0 = +6.62463827792779356910e-01L
+data8 0x945F1A7993F7AADD, 0x0000BFFE //A1 = -5.79576162988785154930e-01L
+data8 0xD84439C6609A8A62, 0x00003FFD //A2 = +4.22395520654665085222e-01L
+data8 0x8A1BBAA7E9CB8C52, 0x0000BFFD //A3 = -2.69742806431984313298e-01L
+data8 0x9F0F67364B466975, 0x00003FFC //A4 = +1.55332195938916594663e-01L
+data8 0xA843F180287DAF7F, 0x0000BFFB //A5 = -8.21608416782158837025e-02L
+data8 0xA59D71B7C690E545, 0x00003FFA //A6 = +4.04333536247671644540e-02L
+data8 0x991A733518C74874, 0x0000BFF9 //A7 = -1.86893701691354422481e-02L
+data8 0x85E7F91148F9F6D2, 0x00003FF8 //A8 = +8.17298243522623724858e-03L
+data8 0xDEE0607CC9D6777E, 0x0000BFF6 //A9 = -3.40082507754089306495e-03L
+data8 0xB145D2CC470B306B, 0x00003FF5 //A10 = +1.35248373238824318949e-03L
+data8 0x86FAEBB4438A20FA, 0x0000BFF4 //A11 = -5.14908443679775343409e-04L
+data8 0xC2503856CE48A657, 0x00003FF2 //A12 = +1.85311660448280465934e-04L
+data8 0xF52642F22A26965B, 0x0000BFF0 //A13 = -5.84481856856861454591e-05L
+data8 0xC98588E1A95FFDBD, 0x00003FEE //A14 = +1.20116245684500489648e-05L
+// Pol3
+data8 0x887CBA2C47B1E2B5, 0x00003FFE //A0 = +5.33153186617432643784e-01L
+data8 0xCD81909CF194328E, 0x0000BFFD //A1 = -4.01379126699602646289e-01L
+data8 0x84DCA15C52122372, 0x00003FFD //A2 = +2.59495775718310530164e-01L
+data8 0x993AA9C76AD28157, 0x0000BFFC //A3 = -1.49637844845261107836e-01L
+data8 0xA140CD8A96FADBA5, 0x00003FFB //A4 = +7.87368829650154013961e-02L
+data8 0x9D36B25E76E56EEA, 0x0000BFFA //A5 = -3.83822410143975630292e-02L
+data8 0x8F8BCC2C0536ECD3, 0x00003FF9 //A6 = +1.75227153523910189727e-02L
+data8 0xF77EDC644BA17AF9, 0x0000BFF7 //A7 = -7.55296479527793552675e-03L
+data8 0xCAB8AC76793C1151, 0x00003FF6 //A8 = +3.09328279988546711083e-03L
+data8 0x9E8FCBC793D555AF, 0x0000BFF5 //A9 = -1.20972979110659888616e-03L
+data8 0xEDC1328664A0CE79, 0x00003FF3 //A10 = +4.53481058502015766058e-04L
+data8 0xAAE3CAAB9D117591, 0x0000BFF2 //A11 = -1.62973223928790256249e-04L
+data8 0xE7704D06A3080C19, 0x00003FF0 //A12 = +5.51792801195012080688e-05L
+data8 0x875A5B53E510F305, 0x0000BFEF //A13 = -1.61353297293572230995e-05L
+data8 0xC8F10CDDB9CC9A42, 0x00003FEC //A14 = +2.99426321046583353559e-06L
+// Pol4
+data8 0xDAEC3C07CAB590C1, 0x00003FFD //A0 = +4.27583576155807004411e-01L
+data8 0x8BE271F8BE0280AC, 0x0000BFFD //A1 = -2.73212014783898564863e-01L
+data8 0x9E13941E19661429, 0x00003FFC //A2 = +1.54371561371908397882e-01L
+data8 0xA241BFC48377449D, 0x0000BFFB //A3 = -7.92269689413235358504e-02L
+data8 0x99E56877AD00D1AE, 0x00003FFA //A4 = +3.75722962151600767952e-02L
+data8 0x887E78DA3BA57C80, 0x0000BFF9 //A5 = -1.66618690872055148862e-02L
+data8 0xE465CAA9F4D54FD8, 0x00003FF7 //A6 = +6.97014232347351913821e-03L
+data8 0xB57930370208D4A7, 0x0000BFF6 //A7 = -2.76906420823065422653e-03L
+data8 0x89A90B5DF0C0C55E, 0x00003FF5 //A8 = +1.05026496655247749532e-03L
+data8 0xC83DB867F08D93C6, 0x0000BFF3 //A9 = -3.81929578900287685559e-04L
+data8 0x8C0C9113FC8061FA, 0x00003FF2 //A10 = +1.33561218944256209215e-04L
+data8 0xBC17A73E9CA51313, 0x0000BFF0 //A11 = -4.48447217225392170834e-05L
+data8 0xED10FE8FC0E44CAD, 0x00003FEE //A12 = +1.41302576244352578317e-05L
+data8 0xFE49912328516F81, 0x0000BFEC //A13 = -3.78917710289305330220e-06L
+data8 0xA8F6077E25DAFD33, 0x00003FEA //A14 = +6.29428967202166402369e-07L
+// Pol5
+data8 0xAF72220985BED710, 0x00003FFD //A0 = +3.42667640364081975844e-01L
+data8 0xBC1CB559042410AB, 0x0000BFFC //A1 = -1.83703263815036934677e-01L
+data8 0xB730BF62E0B63A3C, 0x00003FFB //A2 = +8.94484474229911741150e-02L
+data8 0xA4F307B1D1A1534E, 0x0000BFFA //A3 = -4.02708340235238993824e-02L
+data8 0x8B0327F5117861DB, 0x00003FF9 //A4 = +1.69692783752415790321e-02L
+data8 0xDD4059307B2B081C, 0x0000BFF7 //A5 = -6.75205569219747369303e-03L
+data8 0xA761D738974FECF6, 0x00003FF6 //A6 = +2.55404953403837072821e-03L
+data8 0xF208F6D704F4B487, 0x0000BFF4 //A7 = -9.23290315545127419886e-04L
+data8 0xA7F3658D34EC10B9, 0x00003FF3 //A8 = +3.20340668304962386053e-04L
+data8 0xE079C35CEFD4E6D6, 0x0000BFF1 //A9 = -1.07038324953715640850e-04L
+data8 0x90C5CDD19BB3DD2F, 0x00003FF0 //A10 = +3.45164947021915687751e-05L
+data8 0xB3911863705825F6, 0x0000BFEE //A11 = -1.07030140392753204852e-05L
+data8 0xD023CF5C3F915685, 0x00003FEC //A12 = +3.10152594473606007552e-06L
+data8 0xCA7016FADFF584F5, 0x0000BFEA //A13 = -7.54139761055503416594e-07L
+data8 0xEEBB5CC0901D2BB0, 0x00003FE7 //A14 = +1.11168196441717301549e-07L
+// Pol6
+data8 0x8CD1160326A754AF, 0x00003FFD //A0 = +2.75032699474947383325e-01L
+data8 0xFB22A4C657119388, 0x0000BFFB //A1 = -1.22624671271190511269e-01L
+data8 0xD02B2CA872A774E9, 0x00003FFA //A2 = +5.08224243596176920409e-02L
+data8 0xA23302E146E9E406, 0x0000BFF9 //A3 = -1.97997146844646077750e-02L
+data8 0xEF8918FEDE237C98, 0x00003FF7 //A4 = +7.31004448401605074486e-03L
+data8 0xA8A8B598FA20D881, 0x0000BFF6 //A5 = -2.57353242430059589053e-03L
+data8 0xE3964D9788BFF50F, 0x00003FF4 //A6 = +8.68175969920725727944e-04L
+data8 0x93B83C10B7210AC7, 0x0000BFF3 //A7 = -2.81752903983413936245e-04L
+data8 0xB913B752B0D56A42, 0x00003FF1 //A8 = +8.82515983758695613094e-05L
+data8 0xE0623EFA0B1E8DE9, 0x0000BFEF //A9 = -2.67486302195396417310e-05L
+data8 0x83C4D1A4019E1D2E, 0x00003FEE //A10 = +7.85403393879249335151e-06L
+data8 0x950CBA5D80D8125E, 0x0000BFEC //A11 = -2.22101388436550539151e-06L
+data8 0x9CE72C0409A3E800, 0x00003FEA //A12 = +5.84509280984781223375e-07L
+data8 0x88CCD7A000D1C213, 0x0000BFE8 //A13 = -1.27405082040077425019e-07L
+data8 0x8DF4EC84F093B1C0, 0x00003FE5 //A14 = +1.65259388738830506389e-08L
+// Pol7
+data8 0xE2BF82A153B1B82E, 0x00003FFC //A0 = +2.21433678719152843912e-01L
+data8 0xA72A9AE0BD7F29D5, 0x0000BFFB //A1 = -8.16242313227913578068e-02L
+data8 0xE98939292289EDBE, 0x00003FF9 //A2 = +2.85078159732432477516e-02L
+data8 0x9B93E5E0EEFF9516, 0x0000BFF8 //A3 = -9.49571084105114051468e-03L
+data8 0xC6B39897AABC47BC, 0x00003FF6 //A4 = +3.03194499398790451607e-03L
+data8 0xF442AC7D84DDF1E0, 0x0000BFF4 //A5 = -9.31779649708690069328e-04L
+data8 0x90FBD9F8B41DF23E, 0x00003FF3 //A6 = +2.76534642660360753287e-04L
+data8 0xA6AC59077C78B437, 0x0000BFF1 //A7 = -7.94759910003852154521e-05L
+data8 0xB9FC0BADD531E5E9, 0x00003FEF //A8 = +2.21710864553358009804e-05L
+data8 0xC9CFC8CD93648856, 0x0000BFED //A9 = -6.01445608619100503330e-06L
+data8 0xD4FA51B86A9B2494, 0x00003FEB //A10 = +1.58680833469323702924e-06L
+data8 0xD8D0ED030032926D, 0x0000BFE9 //A11 = -4.03851487695924456733e-07L
+data8 0xCCA1CA2AC3EB8973, 0x00003FE7 //A12 = +9.52891963880517988726e-08L
+data8 0x9E26A080F9DA39DE, 0x0000BFE5 //A13 = -1.84111863600343741644e-08L
+data8 0x8F3DC58F64A92C62, 0x00003FE2 //A14 = +2.08443519336792003049e-09L
+// Pol8
+data8 0xB74C13E914E9666F, 0x00003FFC //A0 = +1.79001151181389950418e-01L
+data8 0xDEB57268A58B763B, 0x0000BFFA //A1 = -5.43722600071728705200e-02L
+data8 0x821FF0D4C605A4CD, 0x00003FF9 //A2 = +1.58843711598712515609e-02L
+data8 0x92C830DD423DB924, 0x0000BFF7 //A3 = -4.47943101836927657394e-03L
+data8 0xA04E61767A095BB6, 0x00003FF5 //A4 = +1.22303905230942532198e-03L
+data8 0xA9EF64E0F6654358, 0x0000BFF3 //A5 = -3.24125543666296226957e-04L
+data8 0xAF39C8969BD163E8, 0x00003FF1 //A6 = +8.35541329311315562274e-05L
+data8 0xB01273B34197330C, 0x0000BFEF //A7 = -2.09894273215824495783e-05L
+data8 0xACAE4C820B99EBAC, 0x00003FED //A8 = +5.14629050848703676006e-06L
+data8 0xA57BF2AEA52B92DF, 0x0000BFEB //A9 = -1.23295315941138567172e-06L
+data8 0x9AD6FE7A852DA239, 0x00003FE9 //A10 = +2.88411640627675721042e-07L
+data8 0x8BFE95FCD7B92763, 0x0000BFE7 //A11 = -6.51900079707465044843e-08L
+data8 0xE9F15C8E7F58CF90, 0x00003FE4 //A12 = +1.36172642554216769522e-08L
+data8 0x9E90F22B11FAF8B5, 0x0000BFE2 //A13 = -2.30744183054978535129e-09L
+data8 0xF8CF74F1A138FBBA, 0x00003FDE //A14 = +2.26291720693360003233e-10L
+// Pol9
+data8 0x94D45274A831ED57, 0x00003FFC //A0 = +1.45341194505862183128e-01L
+data8 0x94D4518B699A4A68, 0x0000BFFA //A1 = -3.63352952323113355459e-02L
+data8 0x90C3B59FF403A916, 0x00003FF8 //A2 = +8.83572327421709216515e-03L
+data8 0x893B796D0E9B4867, 0x0000BFF6 //A3 = -2.09399904729894563201e-03L
+data8 0xFDFFA94903DCB8EA, 0x00003FF3 //A4 = +4.84464029001979577664e-04L
+data8 0xE5CE7C2E4B05CF16, 0x0000BFF1 //A5 = -1.09580317663729186599e-04L
+data8 0xCB88CC8F1146FDAE, 0x00003FEF //A6 = +2.42631878042764234194e-05L
+data8 0xB0AA52C6F44E47C8, 0x0000BFED //A7 = -5.26503698764159271674e-06L
+data8 0x966DD813170F8EBD, 0x00003FEB //A8 = +1.12078397189300511086e-06L
+data8 0xFB75782788A6E378, 0x0000BFE8 //A9 = -2.34189317246047219283e-07L
+data8 0xCDF787C4E5FDCF2A, 0x00003FE6 //A10 = +4.79554094892420966704e-08L
+data8 0xA34CD3DFAC12AA45, 0x0000BFE4 //A11 = -9.50531730989412282035e-09L
+data8 0xEEBB49645DE0E34C, 0x00003FE1 //A12 = +1.73700091999434388879e-09L
+data8 0x8C86D8677DEACFBA, 0x0000BFDF //A13 = -2.55616650187281815453e-10L
+data8 0xBDB223D0FE2A7D6B, 0x00003FDB //A14 = +2.15659223402509415592e-11L
+// Pol10
+data8 0xF2C1812715E4050A, 0x00003FFB //A0 = +1.18533143048567888157e-01L
+data8 0xC7DA2C565ADAEE57, 0x0000BFF9 //A1 = -2.43960252726894623056e-02L
+data8 0xA15CEFFD632F697D, 0x00003FF7 //A2 = +4.92440908672041077933e-03L
+data8 0xFFCFF4D3FB118F69, 0x0000BFF4 //A3 = -9.75846593969603576904e-04L
+data8 0xC73F437D2F226C56, 0x00003FF2 //A4 = +1.90016864347860462550e-04L
+data8 0x989D7E1F60845811, 0x0000BFF0 //A5 = -3.63863004988760879054e-05L
+data8 0xE615A5A669361BE1, 0x00003FED //A6 = +6.85705419984646959791e-06L
+data8 0xAACD08E0BE6270F8, 0x0000BFEB //A7 = -1.27256599602163049440e-06L
+data8 0xF9DEE9C1C02A3062, 0x00003FE8 //A8 = +2.32710274258898439253e-07L
+data8 0xB420E960508A3003, 0x0000BFE6 //A9 = -4.19394488070741280136e-08L
+data8 0xFF5E3ECA229CB0C7, 0x00003FE3 //A10 = +7.43219121339261970485e-09L
+data8 0xAF86504D78D35E89, 0x0000BFE1 //A11 = -1.27711000692808421573e-09L
+data8 0xDE1CE78ADB6DDF04, 0x00003FDE //A12 = +2.02010513073041015283e-10L
+data8 0xE124FFAA267301A5, 0x0000BFDB //A13 = -2.55959692063871343080e-11L
+data8 0x81F1BEBEFBE168D2, 0x00003FD8 //A14 = +1.84661980716000872722e-12L
+// Pol11
+data8 0xC6CE5D7D18203EAA, 0x00003FFB //A0 = +9.70732978630764996752e-02L
+data8 0x86E8A30A76923C88, 0x0000BFF9 //A1 = -1.64683517829920230086e-02L
+data8 0xB4A1CBB7576B4183, 0x00003FF6 //A2 = +2.75622581042760461528e-03L
+data8 0xEEB782FBC8BB352B, 0x0000BFF3 //A3 = -4.55316242981110299585e-04L
+data8 0x9BC489CC00C7E63A, 0x00003FF1 //A4 = +7.42758405750422020216e-05L
+data8 0xC8D418A9F2A78515, 0x0000BFEE //A5 = -1.19703114831817055481e-05L
+data8 0xFFE671DCEE8665A8, 0x00003FEB //A6 = +1.90660487794668853072e-06L
+data8 0xA1313247D3E35365, 0x0000BFE9 //A7 = -3.00243820009225833104e-07L
+data8 0xC8D5A87C970712B1, 0x00003FE6 //A8 = +4.67604496871825103188e-08L
+data8 0xF77258CEF4675E25, 0x0000BFE3 //A9 = -7.20164586117313631144e-09L
+data8 0x96549D79C0F33C27, 0x00003FE1 //A10 = +1.09379854902340983112e-09L
+data8 0xB16A6CC5A3AE6E01, 0x0000BFDE //A11 = -1.61358659378896671620e-10L
+data8 0xC0970F2551C52F96, 0x00003FDB //A12 = +2.18949565869759698947e-11L
+data8 0xA6E029ABB3BB500C, 0x0000BFD8 //A13 = -2.37144541649446501026e-12L
+data8 0xA3E43F3857D1B6A5, 0x00003FD4 //A14 = +1.45564973108152568130e-13L
+// Pol12
+data8 0xA36E35FC807B3E64, 0x00003FFB //A0 = +7.98000543291529334886e-02L
+data8 0xB725A29237C8F94F, 0x0000BFF8 //A1 = -1.11784064873715046550e-02L
+data8 0xCB51EF23EAD5F327, 0x00003FF5 //A2 = +1.55120891755237931425e-03L
+data8 0xDFA838770AE711A2, 0x0000BFF2 //A3 = -2.13296043002775850891e-04L
+data8 0xF3D7B777730B202D, 0x00003FEF //A4 = +2.90683082614108095819e-05L
+data8 0x83C5FF0D475796DD, 0x0000BFED //A5 = -3.92715403535014263671e-06L
+data8 0x8D37B41345244FD5, 0x00003FEA //A6 = +5.26076523514903487927e-07L
+data8 0x9616B7E9C40C1DCC, 0x0000BFE7 //A7 = -6.98905176445499510102e-08L
+data8 0x9E38FDF61B26699A, 0x00003FE4 //A8 = +9.20976891314475742405e-09L
+data8 0xA565DFE27AEA03A1, 0x0000BFE1 //A9 = -1.20342845518628622757e-09L
+data8 0xAAEB9EFB497EC812, 0x00003FDE //A10 = +1.55451193328690040046e-10L
+data8 0xABD305A38349EAEB, 0x0000BFDB //A11 = -1.95341618552982314342e-11L
+data8 0x9EDB00104DB66DD9, 0x00003FD8 //A12 = +2.25747200093121867690e-12L
+data8 0xE9F80AF513F2B8AB, 0x0000BFD4 //A13 = -2.07806143133802417637e-13L
+data8 0xC2B840C3859AB166, 0x00003FD0 //A14 = +1.08091168358477817812e-14L
+// Pol13
+data8 0x86CD0BF01914407A, 0x00003FFB //A0 = +6.58207829138836028568e-02L
+data8 0xF9F4A17FA70807C3, 0x0000BFF7 //A1 = -7.62803922344113067603e-03L
+data8 0xE63BF84EDE20EDAA, 0x00003FF4 //A2 = +8.78273993036530088653e-04L
+data8 0xD2B746011B39D879, 0x0000BFF1 //A3 = -1.00477176633442906101e-04L
+data8 0xBFA4F1F66023C975, 0x00003FEE //A4 = +1.14228914411837438985e-05L
+data8 0xAD3A05E1F1F0EA8F, 0x0000BFEB //A5 = -1.29063913420827451449e-06L
+data8 0x9BA1F2E56DBE1B49, 0x00003FE8 //A6 = +1.44944165416032280452e-07L
+data8 0x8AFE93AF627BAFA6, 0x0000BFE5 //A7 = -1.61810825806733824014e-08L
+data8 0xF6CEAB6E78304875, 0x00003FE1 //A8 = +1.79575947795401009493e-09L
+data8 0xD9BFD64FD9166ECF, 0x0000BFDE //A9 = -1.98041892772535870322e-10L
+data8 0xBE482C8AEA403737, 0x00003FDB //A10 = +2.16325508593741350803e-11L
+data8 0xA1FB98FA19E62A4F, 0x0000BFD8 //A11 = -2.30191407969654156362e-12L
+data8 0xFDB2E0599016AD1E, 0x00003FD4 //A12 = +2.25329742249079975388e-13L
+data8 0x9E179A99CDD4BF4B, 0x0000BFD1 //A13 = -1.75517603530017718494e-14L
+data8 0xDE4DE992A707C7BC, 0x00003FCC //A14 = +7.71273133169032472595e-16L
+// Pol14
+data8 0xDF0639E60CF6E96C, 0x00003FFA //A0 = +5.44492971101228988138e-02L
+data8 0xAB6737B6065BD1C2, 0x0000BFF7 //A1 = -5.23081035867078490333e-03L
+data8 0x8322CC0765FD9C27, 0x00003FF4 //A2 = +5.00243857322493802503e-04L
+data8 0xC7C37C447AABC9BE, 0x0000BFF0 //A3 = -4.76273572257807668623e-05L
+data8 0x977C068C67DD09B3, 0x00003FED //A4 = +4.51458915834329225528e-06L
+data8 0xE4C00648054CBD72, 0x0000BFE9 //A5 = -4.26080256412742187632e-07L
+data8 0xABF9032C426C0F54, 0x00003FE6 //A6 = +4.00405155179176153559e-08L
+data8 0x80BD82177111B70D, 0x0000BFE3 //A7 = -3.74683488305340664541e-09L
+data8 0xBFEFB2BBFC4AAE16, 0x00003FDF //A8 = +3.49130134089615132836e-10L
+data8 0x8E68BCEC2A2F6025, 0x0000BFDC //A9 = -3.23800879252444001040e-11L
+data8 0xD19FEF92B2157585, 0x00003FD8 //A10 = +2.97894685764287382560e-12L
+data8 0x967A0ECC142382D9, 0x0000BFD5 //A11 = -2.67300472044743953909e-13L
+data8 0xC6D8869855133985, 0x00003FD1 //A12 = +2.20763189681614758000e-14L
+data8 0xD10AC0B228ABCECC, 0x0000BFCD //A13 = -1.45052027893524847250e-15L
+data8 0xF7C6DEB4522487A3, 0x00003FC8 //A14 = +5.37280367113168366711e-17L
+// Pol15
+data8 0xB8F57DECFAC3B255, 0x00003FFA //A0 = +4.51559943173131409760e-02L
+data8 0xEC1B8A6C822C036F, 0x0000BFF6 //A1 = -3.60271577347565115947e-03L
+data8 0x963A6DD66951B72E, 0x00003FF3 //A2 = +2.86537625289770759336e-04L
+data8 0xBE93F9E80DF4AE0A, 0x0000BFEF //A3 = -2.27186718010906557773e-05L
+data8 0xF10589FC10D908E0, 0x00003FEB //A4 = +1.79575113004740124999e-06L
+data8 0x97F1A2435C7877EF, 0x0000BFE8 //A5 = -1.41508767557208714648e-07L
+data8 0xBEFF2FB5F00E9327, 0x00003FE4 //A6 = +1.11174782364058338591e-08L
+data8 0xEF5E09DC714DF198, 0x0000BFE0 //A7 = -8.70813302639377671664e-10L
+data8 0x958A6EB9408970A4, 0x00003FDD //A8 = +6.80032608255179732632e-11L
+data8 0xBA31F40954675710, 0x0000BFD9 //A9 = -5.29198388081297293593e-12L
+data8 0xE63B9CEEDC4CF0E6, 0x00003FD5 //A10 = +4.08975721481205179918e-13L
+data8 0x8AF8F1E3FED32CEC, 0x0000BFD2 //A11 = -3.08580807479307213059e-14L
+data8 0x9A88033A08842BEA, 0x00003FCE //A12 = +2.14455258045503137285e-15L
+data8 0x88BCF775B7B3A939, 0x0000BFCA //A13 = -1.18601440246395438386e-16L
+data8 0x88687B63A5B7135E, 0x00003FC5 //A14 = +3.69734984736162880476e-18L
+// Pol16
+data8 0x99B8A501204BF3E7, 0x00003FFA //A0 = +3.75296063885057657456e-02L
+data8 0xA33FA20D2867C79C, 0x0000BFF6 //A1 = -2.49097544033960143953e-03L
+data8 0xACFD14CA6AA55829, 0x00003FF2 //A2 = +1.64974783411741182991e-04L
+data8 0xB6E9B4ED9B378B09, 0x0000BFEE //A3 = -1.09024594422859744844e-05L
+data8 0xC0FD95D38ADCF301, 0x00003FEA //A4 = +7.18945888498730738040e-07L
+data8 0xCB302F7AAFFFA074, 0x0000BFE6 //A5 = -4.73084450875945514829e-08L
+data8 0xD578674188198402, 0x00003FE2 //A6 = +3.10640208133938026422e-09L
+data8 0xDFCC6ED4219E7FC4, 0x0000BFDE //A7 = -2.03543610142159316364e-10L
+data8 0xEA1F448AA373E4A9, 0x00003FDA //A8 = +1.33083028465054001215e-11L
+data8 0xF44780B8EACD37B5, 0x0000BFD6 //A9 = -8.67854438613319891312e-13L
+data8 0xFD55794492F53AEE, 0x00003FD2 //A10 = +5.62514216652784597182e-14L
+data8 0x805C040421E7A098, 0x0000BFCF //A11 = -3.56269003968981157635e-15L
+data8 0xEFCCD20DE93A138E, 0x00003FCA //A12 = +2.07993414310230172191e-16L
+data8 0xB259764466732080, 0x0000BFC6 //A13 = -9.66834364652262630640e-18L
+data8 0x9597C1DB6AF830E4, 0x00003FC1 //A14 = +2.53420063550355940811e-19L
+// Pol17
+data8 0xFFFCBD66BAA4368C, 0x00003FF9 //A0 = +3.12484454387527380657e-02L
+data8 0xE28174723762D197, 0x0000BFF5 //A1 = -1.72810121976742793952e-03L
+data8 0xC81D832836019EC4, 0x00003FF1 //A2 = +9.54224026432644399736e-05L
+data8 0xB0885530C7D7AB5B, 0x0000BFED //A3 = -5.26107996417947739207e-06L
+data8 0x9B7EA64F62F6FD06, 0x00003FE9 //A4 = +2.89631495607631932854e-07L
+data8 0x88C24ACAA9042166, 0x0000BFE5 //A5 = -1.59208376111789845204e-08L
+data8 0xF033E5CD9B7F2822, 0x00003FE0 //A6 = +8.73852423930118273815e-10L
+data8 0xD2A1B161FB4DFBFE, 0x0000BFDC //A7 = -4.78920839886600387264e-11L
+data8 0xB86B27FCBB5A1E9D, 0x00003FD8 //A8 = +2.62074563162805723295e-12L
+data8 0xA124E1303F08E508, 0x0000BFD4 //A9 = -1.43124677534734729453e-13L
+data8 0x8C0B270950D7C697, 0x00003FD0 //A10 = +7.77397948226387851915e-15L
+data8 0xEE034E350C65D2D9, 0x0000BFCB //A11 = -4.12886586201102092942e-16L
+data8 0xBA94473E52495304, 0x00003FC7 //A12 = +2.02289587087169937807e-17L
+data8 0xE913D34CBB853CEE, 0x0000BFC2 //A13 = -7.89697093687557412061e-19L
+data8 0xA44576A85E8CAB59, 0x00003FBD //A14 = +1.73929048516879172258e-20L
+// Pol18
+data8 0xD579A3FE4622DED2, 0x00003FF9 //A0 = +2.60589793198885278242e-02L
+data8 0x9D97EB84E7CD89C8, 0x0000BFF5 //A1 = -1.20234251012583627659e-03L
+data8 0xE86EFDC2CCA5C47B, 0x00003FF0 //A2 = +5.54164790116744315389e-05L
+data8 0xAB39FA5621E39B15, 0x0000BFEC //A3 = -2.55147332073979814633e-06L
+data8 0xFC0244F58F8D8097, 0x00003FE7 //A4 = +1.17350772365097747003e-07L
+data8 0xB941D44B71B14FE2, 0x0000BFE3 //A5 = -5.39169255673480031672e-09L
+data8 0x880B4A40B6F2C901, 0x00003FDF //A6 = +2.47462779512141204748e-10L
+data8 0xC7998AE5652CDCFC, 0x0000BFDA //A7 = -1.13459336509953900777e-11L
+data8 0x92438AA45915CD95, 0x00003FD6 //A8 = +5.19633524685027215673e-13L
+data8 0xD6067243AD3AEAE6, 0x0000BFD1 //A9 = -2.37615683835509918256e-14L
+data8 0x9BD0722A07669E4D, 0x00003FCD //A10 = +1.08117849400479298186e-15L
+data8 0xDDF6F1B79F50E3C4, 0x0000BFC8 //A11 = -4.81309059042573202592e-17L
+data8 0x91F283C0351A9ACA, 0x00003FC4 //A12 = +1.97795505638619048412e-18L
+data8 0x990BC4FAFA9C7542, 0x0000BFBF //A13 = -6.48174913943425248713e-20L
+data8 0xB536865B89676892, 0x00003FB9 //A14 = +1.19916696090758913485e-21L
+// Pol19
+data8 0xB241CEB1B7C953F1, 0x00003FF9 //A0 = +2.17598950382519671244e-02L
+data8 0xDBD6FBA9B11B85E1, 0x0000BFF4 //A1 = -8.38622198373701898430e-04L
+data8 0x877605B1AD082441, 0x00003FF0 //A2 = +3.22964249573360786077e-05L
+data8 0xA6D04DC067A5D310, 0x0000BFEB //A3 = -1.24285881515578912302e-06L
+data8 0xCD458A72BC161315, 0x00003FE6 //A4 = +4.77935289502172654216e-08L
+data8 0xFC6902CFB5DE90A2, 0x0000BFE1 //A5 = -1.83652591038905929358e-09L
+data8 0x9B12B0707DFE615C, 0x00003FDD //A6 = +7.05190381049444126079e-11L
+data8 0xBE67972F2C8EE5AE, 0x0000BFD8 //A7 = -2.70581282732878853626e-12L
+data8 0xE99D8CAF9A3FFE02, 0x00003FD3 //A8 = +1.03746090805854376435e-13L
+data8 0x8F35F5BBEF9E4299, 0x0000BFCF //A9 = -3.97489765699919189983e-15L
+data8 0xAF6E62C3C91B7178, 0x00003FCA //A10 = +1.52162305785839987182e-16L
+data8 0xD6636229C1646963, 0x0000BFC5 //A11 = -5.81100425482928485309e-18L
+data8 0x810331BF289E068F, 0x00003FC1 //A12 = +2.18555638648715837944e-19L
+data8 0x8E3D07CA59546B83, 0x0000BFBC //A13 = -7.53003820427900359431e-21L
+data8 0xD5970B291ED73560, 0x00003FB6 //A14 = +1.76677518655145552907e-22L
+LOCAL_OBJECT_END(erfc_p_table)
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(erfc)
+
+{ .mfi
+ alloc r32 = ar.pfs, 0, 33, 4, 0
+ fma.s1 FR_Tmp = f1, f1, f8 // |x|+1, if x >= 0
+ nop.i 0
+}
+{ .mfi
+ addl EXP_AD_TB1 = @ltoff(exp_table_1), gp
+ fms.s1 FR_Tmp1 = f1, f1, f8 // |x|+1, if x < 0
+ mov exp_GR_rshf_2to56 = 0x4768 // begin 1.1 2^(63+56)
+};;
+
+{ .mfi
+ ld8 EXP_AD_TB1 = [EXP_AD_TB1]
+ fcmp.ge.s1 p6,p7 = f8, f0 // p6: x >= 0 ,p7: x<0
+ mov exp_GR_rshf_2to56 = 0x4768 // begin 1.1 2^(63+56)
+}
+{ .mlx
+ mov exp_TB1_size = 0x100
+ movl exp_GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc //signif. of 1/ln2
+};;
+
+{ .mfi
+ nop.m 0
+ fclass.m p8,p0 = f8,0x07 // p8: x = 0
+ shl exp_GR_rshf_2to56 = exp_GR_rshf_2to56, 48 //end 1.1 2^(63+56)
+}
+{ .mfi
+ mov exp_GR_exp_2tom56 = 0xffff-56
+ fnma.s1 EXP_NORM_f8 = f8, f8, f0 // high bits for -x^2
+ nop.i 0
+};;
+
+
+.pred.rel "mutex",p6,p7
+{ .mfi
+ setf.sig EXP_INV_LN2_2TO63 = exp_GR_sig_inv_ln2 // form 1/ln2 * 2^63
+(p6) fma.s1 FR_AbsArg = f1, f0, f8 // |x|, if x >= 0
+ mov GR_POS_ARG_ASYMP = 0x403C
+}
+{ .mfi
+ mov GR_NEG_ARG_ASYMP = 0x4018
+(p7) fms.s1 FR_AbsArg = f1, f0, f8 // |x|, if x < 0
+ mov exp_GR_rshf = 0x43e8 // begin 1.1 2^63 for right shift
+};;
+
+{ .mfi
+ setf.exp EXP_2TOM56 = exp_GR_exp_2tom56 // 2^-56 for scaling Nfloat
+ fclass.m p10,p0 = f8, 0x21 // p10: x = +inf
+ mov exp_GR_17ones = 0x1FFFF
+}
+{ .mlx
+ setf.d EXP_RSHF_2TO56 = exp_GR_rshf_2to56 // const 1.10*2^(63+56)
+ movl GR_ERFC_XB_TB = 0x1A0
+};;
+
+
+.pred.rel "mutex",p6,p7
+{ .mfi
+ ldfd FR_UnfBound = [EXP_AD_TB1], 16
+(p6) fma.s1 FR_Tmp = FR_Tmp, FR_Tmp, f0 // (|x|+1)^2,x >=0
+ shl exp_GR_rshf = exp_GR_rshf, 48 //end 1.1 2^63 for right shift
+}
+{ .mfi
+ nop.m 0
+(p7) fma.s1 FR_Tmp = FR_Tmp1, FR_Tmp1, f0 // (|x|+1)^2, x<0
+ mov GR_0x1 = 0x1
+};;
+
+{ .mfi
+ mov GR_BIAS = 0x0FFFF
+ fclass.m p9,p0 = f8, 0x22 // p9: x = -inf
+ shl GR_EpsNorm = GR_0x1,53
+}
+{ .mfb
+ mov exp_TB2_size = 0x80
+(p8) fma.d.s0 f8 = f1, f1, f0 //p8: y = 1.0, x = 0
+(p8) br.ret.spnt b0 //p8: quick exit for x = 0
+};;
+
+{ .mfi
+ nop.m 0
+ fclass.m p11,p0 = f8, 0xc3 // p11: x = nan
+ nop.i 0
+}
+{ .mfi
+ setf.d EXP_RSHF = exp_GR_rshf //Form right shift const 1.100 * 2^63
+ fma.s1 FR_NormX = f8,f1,f0
+ nop.i 0
+};;
+
+{ .mfi
+ setf.d FR_EpsNorm = GR_EpsNorm
+ nop.f 0
+(p6) shl GR_ARG_ASYMP = GR_POS_ARG_ASYMP, 48//p6:ARG_ASYMP= 28.0,x>=0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_2 = f1, f1, f1
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe exp_ln2_by_128_hi = [EXP_AD_TB1],16
+ fma.s1 f8_sq_lo = f8, f8, EXP_NORM_f8 // low bits for -x^2
+(p7) shl GR_ARG_ASYMP = GR_NEG_ARG_ASYMP, 48//p7:ARG_ASYMP= 6.0,x < 0
+};;
+
+{ .mfi
+ sub GR_mBIAS = r0, GR_BIAS
+ fma.s1 FR_Tmp = FR_Tmp, FR_Tmp, f0 // (|x|+1)^4
+ nop.i 0
+}
+{ .mfi
+ ldfe exp_ln2_by_128_lo = [EXP_AD_TB1], 16
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ getf.d GR_AbsArg = FR_AbsArg
+ nop.f 0
+ add GR_ERFC_XB_TB = GR_ERFC_XB_TB, EXP_AD_TB1//pointer to XB_TBL
+}
+{ .mfb
+ shladd GR_ShftPi_bias = GR_BIAS, 4, GR_mBIAS // BIAS * 2^4 - BIAS
+(p9) fma.d.s0 f8 = f1, f1, f1 // p9: y = 2 for x = -inf
+(p9) br.ret.spnt b0 // p9: quick exit for x = -inf
+};;
+
+{ .mfi
+ add GR_ERFC_P_TB = 0x140, GR_ERFC_XB_TB // pointer to P_TBL
+ fma.s1 EXP_W_2TO56_RSH = EXP_NORM_f8,EXP_INV_LN2_2TO63,EXP_RSHF_2TO56
+ shladd GR_ShftPi_bias = GR_ShftPi_bias, 4, r0 // BIAS * 240
+}
+{ .mfb
+ nop.m 0
+(p10) fma.d.s0 f8 = f0, f1, f0 // p10: y = 0 for x = +inf
+(p10) br.ret.spnt b0 // p10: quick exit for x = +inf
+};;
+
+
+.pred.rel "mutex",p6,p7
+{ .mfi
+(p6) cmp.gt.unc p15,p0 = GR_AbsArg,GR_ARG_ASYMP //p15: x > 28.0,p6: x >= 0
+ nop.f 0
+(p7) cmp.gt.unc p14,p0 = GR_AbsArg, GR_ARG_ASYMP //p14: x < - 6.0,p7: x < 0
+}
+{ .mfb
+ add EXP_AD_TB2 = exp_TB1_size, EXP_AD_TB1
+(p11) fma.d.s0 f8 = f8, f1, f0 //p11: y = x for x = nan
+(p11) br.ret.spnt b0 //p11: quick exit for x = nan
+};;
+
+{ .mfi
+ add EXP_AD_P = exp_TB2_size, EXP_AD_TB2
+ fms.s1 f8_sq_lo = f1, f1, f8_sq_lo // 1 - low bits for -x^2
+ nop.i 0
+};;
+
+{ .mfi
+ ldfpd exp_P4, exp_P3 = [EXP_AD_P], 16
+ fmerge.s FR_X = f8,f8
+ shladd GR_ShftXBi_bias = GR_mBIAS, 4, r0
+}
+{ .mfb
+ nop.m 0
+(p14) fnma.d.s0 FR_RESULT = FR_EpsNorm,FR_EpsNorm,FR_2 //p14:y ~=~ 2,x< -6.0
+(p14) br.ret.spnt b0 //p14: quick exit for x < -6.0
+};;
+
+//p15: y ~=~ 0.0(result with underflow error), x > ARG_ASYMP = 28,
+{ .mfi
+ ldfpd exp_P2, exp_P1 = [EXP_AD_P]
+ fma.d.s0 FR_Tmpf = f1, f1, FR_EpsNorm // flag i
+ nop.i 0
+}
+{ .mfb
+(p15) mov GR_Parameter_TAG = 208
+(p15) fma.d.s0 FR_RESULT = FR_EpsNorm,FR_EpsNorm,f0
+(p15) br.cond.spnt __libm_error_region
+};;
+
+//p8: x < 27.0, result without underflow error
+{ .mfi
+ getf.exp GR_IndxPlusBias = FR_Tmp // exp + bias for (|x|+1)^4
+ fcmp.lt.s1 p8,p0 = FR_NormX,FR_UnfBound
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 EXP_Nfloat = EXP_W_2TO56_RSH, EXP_2TOM56, EXP_RSHF
+ nop.i 0
+};;
+
+{ .mmi
+ shladd GR_ShftXBi = GR_IndxPlusBias, 4, GR_ShftXBi_bias
+ shladd GR_ShftPi = GR_IndxPlusBias, 4, GR_ShftPi_bias
+ shl GR_ShftPi_8 = GR_IndxPlusBias, 8
+};;
+
+{ .mmi
+ getf.sig exp_GR_N = EXP_W_2TO56_RSH
+ add GR_ERFC_XB_TB = GR_ERFC_XB_TB, GR_ShftXBi// pointer to XB[i]
+ sub GR_ShftPi = GR_ShftPi_8, GR_ShftPi // (256-16)*i
+};;
+
+{ .mmi
+ ldfe FR_Xb = [GR_ERFC_XB_TB]
+ add GR_ShftA12 = 0xC0, GR_ShftPi // pointer shift for A12
+ add GR_ShftA13 = 0xD0, GR_ShftPi // pointer shift for A13
+};;
+
+{ .mfi
+ add GR_P_A13 = GR_ERFC_P_TB, GR_ShftA13 // pointer to A13
+ nop.f 0
+ and exp_GR_index_1 = 0x0f, exp_GR_N
+}
+{ .mfi
+ add GR_P_A12 = GR_ERFC_P_TB, GR_ShftA12 // pointer to A12
+ fnma.s1 exp_r = EXP_Nfloat, exp_ln2_by_128_hi, EXP_NORM_f8
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_A12 = [GR_P_A12], -64
+ nop.f 0
+ and exp_GR_index_2_16 = 0x70, exp_GR_N
+}
+{ .mfi
+ ldfe FR_A13 = [GR_P_A13], -64
+ nop.f 0
+ shladd EXP_AD_T1 = exp_GR_index_1, 4, EXP_AD_TB1
+};;
+
+{ .mmi
+ ldfe FR_A8 = [GR_P_A12], 32
+ ldfe FR_A9 = [GR_P_A13], 32
+ add EXP_AD_T2 = EXP_AD_TB2, exp_GR_index_2_16
+};;
+
+{ .mmi
+ ldfe FR_A10 = [GR_P_A12], -96
+ ldfe FR_A11 = [GR_P_A13], -96
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe FR_A4 = [GR_P_A12], 32
+ ldfe FR_A5 = [GR_P_A13], 32
+ shr r2 = exp_GR_N, 0x7
+};;
+
+{ .mfi
+ ldfe FR_A6 = [GR_P_A12], -64
+ fma.s1 exp_rP4pP3 = exp_r, exp_P4, exp_P3
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_A7 = [GR_P_A13], -64
+ fma.s1 exp_rsq = exp_r, exp_r, f0
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe FR_A2 = [GR_P_A12], -32
+ ldfe FR_A3 = [GR_P_A13], -32
+ addl exp_GR_biased_M = 0xffff, r2
+};;
+
+{ .mmi
+ ldfe FR_A0 = [GR_P_A12], 224
+ ldfe FR_A1 = [GR_P_A13]
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_A14 = [GR_P_A12]
+ fms.s1 FR_LocArg = FR_AbsArg, f1, FR_Xb // xloc = x - x[i]
+ nop.i 0
+};;
+
+{ .mmi
+ setf.exp EXP_2M = exp_GR_biased_M
+ ldfe exp_T1 = [EXP_AD_T1]
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe exp_T2 = [EXP_AD_T2]
+ fma.s1 exp_P_hi = exp_rsq, exp_P1, exp_r
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 exp_rcube = exp_r, exp_rsq, f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 exp_P_lo = exp_r, exp_rP4pP3, exp_P2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 exp_f = EXP_Nfloat, exp_ln2_by_128_lo, f8_sq_lo
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P14_0_1 = FR_LocArg, FR_LocArg, f0 // xloc ^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P14_0_2 = FR_A13, FR_LocArg, FR_A12
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P14_1_1 = FR_A9, FR_LocArg, FR_A8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P14_1_2 = FR_A11, FR_LocArg, FR_A10
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P14_2_1 = FR_A5, FR_LocArg, FR_A4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P14_2_2 = FR_A7, FR_LocArg, FR_A6
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P14_3_1 = FR_A1, FR_LocArg, FR_A0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P14_3_2 = FR_A3, FR_LocArg, FR_A2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P14_6_1 = FR_P14_0_1, FR_A14, FR_P14_0_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P14_7_2 = FR_P14_0_1, FR_P14_0_1, f0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P14_7_1 = FR_P14_0_1, FR_P14_1_2, FR_P14_1_1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 exp_S2 = exp_f, exp_T2, f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 exp_S1 = EXP_2M, exp_T1, f0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P14_8_1 = FR_P14_0_1, FR_P14_3_2, FR_P14_3_1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P14_8_2 = FR_P14_0_1, FR_P14_2_2, FR_P14_2_1
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P14_12_1 = FR_P14_7_2, FR_P14_6_1, FR_P14_7_1
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 exp_S = exp_S1, exp_S2, f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 exp_P = exp_rcube, exp_P_lo, exp_P_hi
+ nop.i 0
+};;
+
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P14_13_1 = FR_P14_7_2, FR_P14_8_2, FR_P14_8_1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P14_13_2 = FR_P14_7_2, FR_P14_7_2, f0 // xloc^8
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Exp = exp_S, exp_P, exp_S // exp(-x^2)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Pol = FR_P14_13_2, FR_P14_12_1, FR_P14_13_1
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.d.s0 FR_Tmpf = f8, f1, f0 // flag d
+ nop.i 0
+};;
+
+//p6: result for 0 < x < = 28.0,
+//p7: result for -6.0 <= x < 0,
+//p8: exit for - 6.0 <= x < UnfBound ~=~ 26.54..
+
+.pred.rel "mutex",p6,p7
+{ .mfi
+ nop.m 0
+(p6) fma.d.s0 f8 = FR_Exp, FR_Pol, f0
+ nop.i 0
+}
+{ .mfb
+ mov GR_Parameter_TAG = 208
+(p7) fnma.d.s0 f8 = FR_Exp, FR_Pol, FR_2
+(p8) br.ret.sptk b0
+};;
+
+GLOBAL_LIBM_END(erfc)
+// Error handling stub for erfc. It is reached in two ways:
+// 1. via (p15) br.cond.spnt __libm_error_region for x > ARG_ASYMP = 28.0
+// 2. by falling through past the end of erfc, when (p8) br.ret is not
+// taken, for UnfBound <= x <= ARG_ASYMP = 28.0
+// Stores Parameters 1, 2 and 3 on the stack and calls __libm_error_support.
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+.prologue
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address: old sp-32 (new sp+32)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new 64-byte stack frame
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+{ .mmi
+ stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack, then advance to sp+48
+ add GR_Parameter_X = 16,sp // Parameter 1 address (sp+16)
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+.body
+{ .mib
+ stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address (sp+48)
+ nop.b 0
+}
+{ .mib
+ stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Back to Parameter 2 address (sp+32)
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp // Recompute address of the result slot
+};;
+{ .mmi
+ ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+.type __libm_error_support#,@function
+.global __libm_error_support#
+
+
+
diff --git a/sysdeps/ia64/fpu/s_erfcf.S b/sysdeps/ia64/fpu/s_erfcf.S
new file mode 100644
index 0000000000..7d9e2a9fa8
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_erfcf.S
@@ -0,0 +1,981 @@
+.file "erfcf.s"
+
+
+// Copyright (c) 2002 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2002 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 01/17/02 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/06/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// float erfcf(float)
+//
+// Overview of operation
+//==============================================================
+// 1. 0 <= x <= 10.06
+//
+// erfcf(x) = P15(x) * exp( -x^2 )
+//
+// Comment:
+//
+// Let x(0)=0, x(i) = 2^(i), i=1,...3, x(4)= 10.06
+//
+// Let x(i)<= x < x(i+1).
+// We can find i as exponent of argument x (let i = 0 for 0<= x < 2 )
+//
+// Let P15(x) - polynomial approximation of degree 15 for function
+// erfcf(x) * exp( x^2) and x(i) <= x <= x(i+1), i = 0,1,2,3
+// The polynomial coefficients are stored in the table erfc_p_table.
+//
+// So we can find result for erfcf(x) as above.
+// Algorithm description for exp function see below.
+//
+// 2. -4.4 <= x < 0
+//
+// erfcf(x) = 2.0 - erfcf(-x)
+//
+// 3. x > 10.06
+//
+// erfcf(x) ~=~ 0.0
+//
+// 4. x < -4.4
+//
+// erfcf(x) ~=~ 2.0
+
+// Special values
+//==============================================================
+// erfcf(+0) = 1.0
+// erfcf(-0) = 1.0
+
+// erfcf(+qnan) = +qnan
+// erfcf(-qnan) = -qnan
+// erfcf(+snan) = +qnan
+// erfcf(-snan) = -qnan
+
+// erfcf(-inf) = 2.0
+// erfcf(+inf) = +0
+
+//==============================================================
+// Take double exp(double) from libm_64.
+//
+// Overview of operation
+//==============================================================
+// Take the input x. w is "how many log2/128 in x?"
+// w = x * 128/log2
+// n = int(w)
+// x = n log2/128 + r + delta
+
+// n = 128M + index_1 + 2^4 index_2
+// x = M log2 + (log2/128) index_1 + (log2/8) index_2 + r + delta
+
+// exp(x) = 2^M 2^(index_1/128) 2^(index_2/8) exp(r) exp(delta)
+// Construct 2^M
+// Get 2^(index_1/128) from table_1;
+// Get 2^(index_2/8) from table_2;
+// Calculate exp(r) by series
+// r = x - n (log2/128)_high
+// delta = - n (log2/128)_low
+// Calculate exp(delta) as 1 + delta
+//
+// Comment for erfcf:
+//
+// Let exp(r) = 1 + r + 0.5*r^2 + (1/6)*r^3
+// Let delta = 0.
+//==============================================================
+//
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input
+// f6,f7,f9 -> f11, f32 -> f92
+
+// General registers used:
+// r2, r14 -> r22, r32 -> r50
+
+// Predicate registers used:
+// p6 -> p15
+
+// Assembly macros: symbolic names for the general registers (exp part first)
+//==============================================================
+EXP_AD_TB1 = r14 // address of exp_table_1, post-incremented while its header is read
+exp_GR_sig_inv_ln2 = r15 // significand of 1/ln2
+exp_TB1_size = r16 // 0x100, byte size of table 1
+exp_GR_rshf_2to56 = r17 // bit pattern of the double 1.100 * 2^(63+56)
+exp_GR_exp_2tom56 = r18 // biased exponent of 2^-56
+
+exp_GR_rshf = r33 // bit pattern of the double 1.10 * 2^63
+EXP_AD_TB2 = r34 // address of exp_table_2
+EXP_AD_P = r35 // not referenced by the code in this file
+exp_GR_N = r36 // N = round_int(w), read from the significand
+exp_GR_index_1 = r37 // N & 0x0f
+exp_GR_index_2_16 = r38 // N & 0x70 (index_2 * 16)
+exp_GR_biased_M = r39 // (N >> 7) + 0xffff
+EXP_AD_T1 = r40 // address of the selected table 1 entry
+EXP_AD_T2 = r41 // address of the selected table 2 entry
+exp_TB2_size = r42 // 0x80; written but not read in this file
+
+// GR for erfcf(x)
+//==============================================================
+GR_IndxPlusBias = r19 // exponent field of x, used as i + bias
+GR_ExpMask = r20 // 0x1ffff, drops the sign bit delivered by getf.exp
+GR_BIAS = r21 // 0xffff
+GR_ShftPi_bias = r22 // GR_BIAS << 7
+
+GR_P_POINT_1 = r43 // r43 -> r46: pointers into erfc_p_table
+GR_P_POINT_2 = r44
+GR_P_POINT_3 = r45
+GR_P_POINT_4 = r46
+
+GR_ShftPi = r47 // byte offset of polynomial i (i * 128)
+GR_EpsNorm = r48 // not referenced by the code in this file
+
+GR_05 = r49 // bit pattern of the double 0.5
+GR_1_by_6 = r50 // bit pattern of the double 1/6
+
+// GR for __libm_error_support call (these names reuse r43 -> r50 from above)
+//==============================================================
+
+GR_SAVE_B0 = r43
+GR_SAVE_PFS = r44
+GR_SAVE_GP = r45
+GR_SAVE_SP = r46 // not referenced by the code in this file
+
+GR_Parameter_X = r47
+GR_Parameter_Y = r48
+GR_Parameter_RESULT = r49
+GR_Parameter_TAG = r50 // set to 209 for erfcf
+
+
+// FR for the error call (first three) and for exp(-x^2)
+//==============================================================
+FR_X = f10 // copy of the input, stored as Parameter 1 of the error call
+FR_Y = f1 // stored as Parameter 2 of the error call
+FR_RESULT = f8 // result, stored as Parameter 3 of the error call
+
+EXP_2TOM56 = f6 // 2^-56
+EXP_INV_LN2_2TO63 = f7 // 1/ln2 * 2^63
+EXP_W_2TO56_RSH = f9 // (-x^2) * EXP_INV_LN2_2TO63 + EXP_RSHF_2TO56
+exp_ln2_by_128_hi = f11
+
+EXP_RSHF_2TO56 = f32 // 1.10 * 2^(63+56)
+exp_ln2_by_128_lo = f33 // loaded but not used by erfcf (delta = 0)
+EXP_RSHF = f34 // 1.10 * 2^63
+EXP_Nfloat = f35 // N as a floating-point number
+exp_r = f36 // r = (-x^2) - Nfloat * ln2_by_128_hi
+exp_rsq = f37 // r^2
+EXP_2M = f38 // 2^M
+exp_S1 = f39 // 2^M * T2
+exp_T1 = f40
+exp_P = f41 // r + r^2/2 + r^3/6
+exp_S = f42 // 2^M * T2 * T1
+EXP_NORM_f8 = f43 // -x^2, the argument of exp
+exp_S2 = f44 // not referenced by the code in this file
+exp_T2 = f45
+
+// FR for erfcf(x)
+//==============================================================
+FR_AbsArg = f46 // |x|
+FR_Tmp = f47 // not referenced by the code in this file
+FR_Tmp1 = f48 // not referenced by the code in this file
+FR_Tmpf = f49
+FR_NormX = f50
+
+FR_A15 = f51 // f51 -> f66: polynomial coefficients A15 ... A0
+FR_A14 = f52
+
+FR_A13 = f53
+FR_A12 = f54
+
+FR_A11 = f55
+FR_A10 = f56
+
+FR_A9 = f57
+FR_A8 = f58
+
+FR_A7 = f59
+FR_A6 = f60
+
+FR_A5 = f61
+FR_A4 = f62
+
+FR_A3 = f63
+FR_A2 = f64
+
+FR_A1 = f65
+FR_A0 = f66
+
+FR_P15_0_1 = f67 // f67 -> f83: intermediate terms of P15
+FR_P15_1_1 = f68
+FR_P15_1_2 = f69
+FR_P15_2_1 = f70
+FR_P15_2_2 = f71
+FR_P15_3_1 = f72
+FR_P15_3_2 = f73
+FR_P15_4_1 = f74
+FR_P15_4_2 = f75
+FR_P15_7_1 = f76
+FR_P15_7_2 = f77
+FR_P15_8_1 = f78
+FR_P15_9_1 = f79
+FR_P15_9_2 = f80
+FR_P15_13_1 = f81
+FR_P15_14_1 = f82
+FR_P15_14_2 = f83
+
+FR_2 = f84 // 2.0
+FR_05 = f85 // 0.5
+FR_1_by_6 = f86 // 1/6
+FR_Pol = f87 // P15(|x|)
+FR_Exp = f88 // exp(-x^2)
+
+FR_POS_ARG_ASYMP = f89 // 10.06
+FR_NEG_ARG_ASYMP = f90 // 4.4
+
+FR_UnfBound = f91
+FR_EpsNorm = f92
+
+// Data tables
+//==============================================================
+RODATA
+.align 16
+
+// ************* DO NOT CHANGE ORDER OF THESE TABLES ********************
+
+// double-extended 1/ln(2)
+// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88
+// 3fff b8aa 3b29 5c17 f0bc
+// For speed the significand will be loaded directly with a movl and setf.sig
+// and the exponent will be bias+63 instead of bias+0. Thus subsequent
+// computations need to scale appropriately.
+// The constant 128/ln(2) is needed for the computation of w. This is also
+// obtained by scaling the computations.
+//
+// Two shifting constants are loaded directly with movl and setf.d.
+// 1. EXP_RSHF_2TO56 = 1.1000..00 * 2^(63-7)
+// This constant is added to x*1/ln2 to shift the integer part of
+// x*128/ln2 into the rightmost bits of the significand.
+// The result of this fma is EXP_W_2TO56_RSH.
+// 2. EXP_RSHF = 1.1000..00 * 2^(63)
+// This constant is subtracted from EXP_W_2TO56_RSH * 2^(-56) to give
+// the integer part of w, n, as a floating-point number.
+// The result of this fms is EXP_Nfloat.
+
+
+LOCAL_OBJECT_START(exp_table_1)
+// 16-byte header: four single precision constants, read with two ldfps
+data4 0x4120f5c3, 0x408ccccd //POS_ARG_ASYMP = 10.06, NEG_ARG_ASYMP = 4.4
+data4 0x41131Cdf, 0x00800000 //UnfBound ~=~ 9.19, EpsNorm ~=~ 1.1754944e-38
+// ln2/128 as hi and lo parts (significand, exponent pairs read with ldfe)
+data8 0xb17217f7d1cf79ab , 0x00003ff7 // ln2/128 hi
+data8 0xc9e3b39803f2f6af , 0x00003fb7 // ln2/128 lo
+//
+// Table 1 is 2^(index_1/128) where
+// index_1 goes from 0 to 15
+// (16 entries of 16 bytes = 0x100 bytes, see exp_TB1_size)
+data8 0x8000000000000000 , 0x00003FFF
+data8 0x80B1ED4FD999AB6C , 0x00003FFF
+data8 0x8164D1F3BC030773 , 0x00003FFF
+data8 0x8218AF4373FC25EC , 0x00003FFF
+data8 0x82CD8698AC2BA1D7 , 0x00003FFF
+data8 0x8383594EEFB6EE37 , 0x00003FFF
+data8 0x843A28C3ACDE4046 , 0x00003FFF
+data8 0x84F1F656379C1A29 , 0x00003FFF
+data8 0x85AAC367CC487B15 , 0x00003FFF
+data8 0x8664915B923FBA04 , 0x00003FFF
+data8 0x871F61969E8D1010 , 0x00003FFF
+data8 0x87DB357FF698D792 , 0x00003FFF
+data8 0x88980E8092DA8527 , 0x00003FFF
+data8 0x8955EE03618E5FDD , 0x00003FFF
+data8 0x8A14D575496EFD9A , 0x00003FFF
+data8 0x8AD4C6452C728924 , 0x00003FFF
+LOCAL_OBJECT_END(exp_table_1)
+
+// Table 2 is 2^(index_2/8) where
+// index_2 goes from 0 to 7
+
+LOCAL_OBJECT_START(exp_table_2)
+// 8 entries of 16 bytes each; the code addresses them as EXP_AD_TB2 + index_2 * 16
+data8 0x8000000000000000 , 0x00003FFF
+data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF
+data8 0x9837F0518DB8A96F , 0x00003FFF
+data8 0xA5FED6A9B15138EA , 0x00003FFF
+data8 0xB504F333F9DE6484 , 0x00003FFF
+data8 0xC5672A115506DADD , 0x00003FFF
+data8 0xD744FCCAD69D6AF4 , 0x00003FFF
+data8 0xEAC0C6E7DD24392F , 0x00003FFF
+LOCAL_OBJECT_END(exp_table_2)
+
+LOCAL_OBJECT_START(erfc_p_table)
+// Four polynomials of 16 doubles (128 bytes) each, stored from A15 down to A0
+// Pol_0: used for 0 <= |x| < 2 (i = 0)
+data8 0xBEA3260C63CB0446 //A15 = -5.70673541831883454676e-07
+data8 0x3EE63D6178077654 //A14 = +1.06047480138940182343e-05
+data8 0xBF18646BC5FC70A7 //A13 = -9.30491237309283694347e-05
+data8 0x3F40F92F909117FE //A12 = +5.17986512144075019133e-04
+data8 0xBF611344289DE1E6 //A11 = -2.08438217390159994419e-03
+data8 0x3F7AF9FE6AD16DC0 //A10 = +6.58606893292862351928e-03
+data8 0xBF91D219E196CBA7 //A9 = -1.74030345858217321001e-02
+data8 0x3FA4AFDDA355854C //A8 = +4.04042493708041968315e-02
+data8 0xBFB5D465BB7025AE //A7 = -8.52721769916999425445e-02
+data8 0x3FC54C15A95B717D //A6 = +1.66384418195672549029e-01
+data8 0xBFD340A75B4B1AB5 //A5 = -3.00821150926292166899e-01
+data8 0x3FDFFFC0BFCD247F //A4 = +4.99984919839853542841e-01
+data8 0xBFE81270C361852B //A3 = -7.52251035312075583309e-01
+data8 0x3FEFFFFFC67295FC //A2 = +9.99999892800303301771e-01
+data8 0xBFF20DD74F8CD2BF //A1 = -1.12837916445020868099e+00
+data8 0x3FEFFFFFFFFE7C1D //A0 = +9.99999999988975570714e-01
+// Pol_1: used for 2 <= |x| < 4 (i = 1)
+data8 0xBDE8EC4BDD953B56 //A15 = -1.81338928934942767144e-10
+data8 0x3E43607F269E2A1C //A14 = +9.02309090272196442358e-09
+data8 0xBE8C4D9E69C10E02 //A13 = -2.10875261143659275328e-07
+data8 0x3EC9CF2F84566725 //A12 = +3.07671055805877356583e-06
+data8 0xBF007980B1B46A4D //A11 = -3.14228438702169818945e-05
+data8 0x3F2F4C3AD6DEF24A //A10 = +2.38783056770846320260e-04
+data8 0xBF56F5129F8D30FA //A9 = -1.40120333363130546426e-03
+data8 0x3F7AA6C7ABFC38EE //A8 = +6.50671002200751820429e-03
+data8 0xBF98E7522CB84BEF //A7 = -2.43199195666185511109e-02
+data8 0x3FB2F68EB1C3D073 //A6 = +7.40746673580490638637e-02
+data8 0xBFC7C16055AC6385 //A5 = -1.85588876564704611769e-01
+data8 0x3FD8A707AEF5A440 //A4 = +3.85194702967570635211e-01
+data8 0xBFE547BFE39AE2EA //A3 = -6.65008492032112467310e-01
+data8 0x3FEE7C91BDF13578 //A2 = +9.52706213932898128515e-01
+data8 0xBFF1CB5B61F8C589 //A1 = -1.11214769621105541214e+00
+data8 0x3FEFEA56BC81FD37 //A0 = +9.97355812243688815239e-01
+// Pol_2: used for 4 <= |x| < 8 (i = 2)
+data8 0xBD302724A12F46E0 //A15 = -5.73866382814058809406e-14
+data8 0x3D98889B75D3102E //A14 = +5.57829983681360947356e-12
+data8 0xBDF16EA15074A1E9 //A13 = -2.53671153922423457844e-10
+data8 0x3E3EC6E688CFEE5F //A12 = +7.16581828336436419561e-09
+data8 0xBE82E5ED44C52609 //A11 = -1.40802202239825487803e-07
+data8 0x3EC120BE5CE42353 //A10 = +2.04180535157522081699e-06
+data8 0xBEF7B8B0311A1911 //A9 = -2.26225266204633600888e-05
+data8 0x3F29A281F43FC238 //A8 = +1.95577968156184077632e-04
+data8 0xBF55E19858B3B7A4 //A7 = -1.33552434527526534043e-03
+data8 0x3F7DAC8C3D12E5FD //A6 = +7.24463253680473816303e-03
+data8 0xBF9FF9C04613FB47 //A5 = -3.12261622211693854028e-02
+data8 0x3FBB3D5DBF9D9366 //A4 = +1.06405123978743883370e-01
+data8 0xBFD224DE9F62C258 //A3 = -2.83500342989133623476e-01
+data8 0x3FE28A95CB8C6D3E //A2 = +5.79417131000276437708e-01
+data8 0xBFEC21205D358672 //A1 = -8.79043752717008257224e-01
+data8 0x3FEDAE44D5EDFE5B //A0 = +9.27523057776805771830e-01
+// Pol_3: used for 8 <= x <= 10.06 (i = 3)
+data8 0xBCA3BCA734AC82F1 //A15 = -1.36952437983096410260e-16
+data8 0x3D16740DC3990612 //A14 = +1.99425676175410093285e-14
+data8 0xBD77F4353812C46A //A13 = -1.36162367755616790260e-12
+data8 0x3DCFD0BE13C73DB4 //A12 = +5.78718761040355136007e-11
+data8 0xBE1D728DF71189B4 //A11 = -1.71406885583934105120e-09
+data8 0x3E64252C8CB710B5 //A10 = +3.75233795940731111303e-08
+data8 0xBEA514B93180F33D //A9 = -6.28261292774310809962e-07
+data8 0x3EE1381118CC7151 //A8 = +8.21066421390821904504e-06
+data8 0xBF1634404FB0FA72 //A7 = -8.47019436358372148764e-05
+data8 0x3F46B2CBBCF0EB32 //A6 = +6.92700845213200923490e-04
+data8 0xBF725C2B445E6D81 //A5 = -4.48243046949004063741e-03
+data8 0x3F974E7CFA4D89D9 //A4 = +2.27603462002522228717e-02
+data8 0xBFB6D7BAC2E342D1 //A3 = -8.92292714882032736443e-02
+data8 0x3FD0D156AD9CE2A6 //A2 = +2.62777013343603696631e-01
+data8 0xBFE1C228572AADB0 //A1 = -5.54950876471982857725e-01
+data8 0x3FE8A739F48B9A3B //A0 = +7.70413377406675619766e-01
+LOCAL_OBJECT_END(erfc_p_table)
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(erfcf)
+// float erfcf(float): x arrives in f8 and the result is left in f8
+// Form index i for table erfc_p_table as exponent of x
+// We use i + bias in real calculations
+{ .mlx
+ getf.exp GR_IndxPlusBias = f8 // (sign + exp + bias) of x
+ movl exp_GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc //signif.of 1/ln2
+}
+{ .mlx
+ addl EXP_AD_TB1 = @ltoff(exp_table_1), gp // linkage table slot of exp_table_1
+ movl exp_GR_rshf_2to56 = 0x4768000000000000 // 1.100 2^(63+56)
+}
+;;
+
+// Form argument EXP_NORM_f8 for exp(-x^2)
+{ .mfi
+ ld8 EXP_AD_TB1 = [EXP_AD_TB1] // address of exp_table_1
+ fcmp.ge.s1 p6,p7 = f8, f0 // p6: x >= 0 ,p7: x<0
+ mov GR_BIAS = 0x0FFFF
+}
+{ .mfi
+ mov exp_GR_exp_2tom56 = 0xffff-56
+ fnma.s1 EXP_NORM_f8 = f8, f8, f0 // -x^2
+ mov GR_ExpMask = 0x1ffff
+}
+;;
+
+// Form two constants we need
+// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128
+// 1.1000..000 * 2^(63+63-7) to right shift int(w) into the significand
+
+// p9: x = 0,+inf,-inf,nan,unnorm.
+// p10: x!= 0,+inf,-inf,nan,unnorm.
+{ .mfi
+ setf.sig EXP_INV_LN2_2TO63 = exp_GR_sig_inv_ln2 // Form 1/ln2*2^63
+ fclass.m p9,p10 = f8,0xef
+ shl GR_ShftPi_bias = GR_BIAS, 7 // bias * 128, removed from the offset below
+}
+{ .mfi
+ setf.d EXP_RSHF_2TO56 = exp_GR_rshf_2to56 //Const 1.10*2^(63+56)
+ nop.f 0
+ and GR_IndxPlusBias = GR_IndxPlusBias, GR_ExpMask // i + bias
+}
+;;
+
+{ .mfi
+ alloc r32 = ar.pfs, 0, 15, 4, 0 // 15 locals + 4 outputs: r32 -> r50
+(p6) fma.s1 FR_AbsArg = f1, f0, f8 // |x| if x >= 0
+ cmp.lt p15,p0 = GR_IndxPlusBias, GR_BIAS//p15: i < 0 (for |x|<1)
+}
+{ .mlx
+ setf.exp EXP_2TOM56 = exp_GR_exp_2tom56 //2^-56 for scaling Nfloat
+ movl exp_GR_rshf = 0x43e8000000000000 //1.10 2^63,right shift.
+}
+;;
+
+{ .mfi
+ ldfps FR_POS_ARG_ASYMP, FR_NEG_ARG_ASYMP = [EXP_AD_TB1],8
+ nop.f 0
+(p15) mov GR_IndxPlusBias = GR_BIAS //Let i = 0 if i < 0
+}
+{ .mlx
+ mov GR_P_POINT_3 = 0x1A0 // distance from exp_table_1 + 0x10 to erfc_p_table
+ movl GR_05 = 0x3fe0000000000000 // 0.5 as a double
+}
+;;
+
+// Form shift GR_ShftPi from the beginning of erfc_p_table
+// to the polynomial with number i
+{ .mfi
+ ldfps FR_UnfBound, FR_EpsNorm = [EXP_AD_TB1],8
+ nop.f 0
+ shl GR_ShftPi = GR_IndxPlusBias, 7 // (i + bias) * 128
+}
+{ .mfi
+ setf.d EXP_RSHF = exp_GR_rshf // Form right shift 1.100 * 2^63
+(p7) fms.s1 FR_AbsArg = f1, f0, f8 // |x| if x < 0
+ mov exp_TB1_size = 0x100 // 16 entries * 16 bytes
+}
+;;
+
+// Form pointer GR_P_POINT_3 to the beginning of erfc_p_table
+{ .mfi
+ setf.d FR_05 = GR_05
+ nop.f 0
+ sub GR_ShftPi = GR_ShftPi,GR_ShftPi_bias // i * 128
+}
+{ .mfb
+ add GR_P_POINT_3 = GR_P_POINT_3, EXP_AD_TB1
+ nop.f 0
+(p9) br.cond.spnt SPECIAL // For x = 0,+inf,-inf,nan,unnorm
+}
+;;
+
+{ .mfi
+ add GR_P_POINT_1 = GR_P_POINT_3, GR_ShftPi
+ nop.f 0
+ add GR_P_POINT_2 = GR_P_POINT_3, GR_ShftPi
+}
+{ .mfi
+ ldfe exp_ln2_by_128_hi = [EXP_AD_TB1],16
+ fma.s1 FR_NormX = f8,f1,f0
+ add GR_P_POINT_3 = GR_P_POINT_3, GR_ShftPi
+}
+;;
+
+// Load coefficients for polynomial P15(x)
+{ .mfi
+ ldfpd FR_A15, FR_A14 = [GR_P_POINT_1], 16
+ nop.f 0
+ add GR_P_POINT_3 = 0x30, GR_P_POINT_3
+}
+{ .mfi
+ ldfe exp_ln2_by_128_lo = [EXP_AD_TB1], 16
+ nop.f 0
+ add GR_P_POINT_2 = 0x20, GR_P_POINT_2
+}
+;;
+
+// Now EXP_AD_TB1 points to the beginning of table 1
+{ .mlx
+ ldfpd FR_A13, FR_A12 = [GR_P_POINT_1]
+ movl GR_1_by_6 = 0x3FC5555555555555 // 1/6 as a double
+}
+{ .mfi
+ add GR_P_POINT_4 = 0x30, GR_P_POINT_2
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfpd FR_A11, FR_A10 = [GR_P_POINT_2]
+ fma.s1 FR_2 = f1, f1, f1 // 2.0
+ mov exp_TB2_size = 0x80
+}
+{ .mfi
+ ldfpd FR_A9, FR_A8 = [GR_P_POINT_3],16
+ nop.f 0
+ add GR_P_POINT_1 = 0x60 ,GR_P_POINT_1
+}
+;;
+
+// W = X * Inv_log2_by_128
+// By adding 1.10...0*2^63 we shift and get round_int(W) in significand.
+// We actually add 1.10...0*2^56 to X * Inv_log2 to do the same thing.
+{ .mfi
+ ldfpd FR_A7, FR_A6 = [GR_P_POINT_3]
+ fma.s1 EXP_W_2TO56_RSH = EXP_NORM_f8,EXP_INV_LN2_2TO63,EXP_RSHF_2TO56
+ add EXP_AD_TB2 = exp_TB1_size, EXP_AD_TB1 // table 2 follows table 1
+
+}
+{ .mfi
+ ldfpd FR_A5, FR_A4 = [GR_P_POINT_4], 16
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfpd FR_A3, FR_A2 = [GR_P_POINT_4]
+ fmerge.s FR_X = f8,f8 // keep a copy of x for the error call
+ nop.i 0
+}
+{ .mfi
+ ldfpd FR_A1, FR_A0 = [GR_P_POINT_1]
+ nop.f 0
+ nop.i 0
+}
+;;
+
+//p14: x < - NEG_ARG_ASYMP = -4.4 -> erfcf(x) ~=~ 2.0
+{ .mfi
+ setf.d FR_1_by_6 = GR_1_by_6
+(p7) fcmp.gt.unc.s1 p14,p0 = FR_AbsArg, FR_NEG_ARG_ASYMP //p7: x < 0
+ nop.i 0
+}
+;;
+
+//p15: x > POS_ARG_ASYMP = 10.06 -> erfcf(x) ~=~ 0.0
+{ .mfi
+ nop.m 0
+(p6) fcmp.gt.unc.s1 p15,p0 = FR_AbsArg, FR_POS_ARG_ASYMP //p6: x >= 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fcmp.le.s1 p8,p0 = FR_NormX, FR_UnfBound // p8: x <= UnfBound
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p14) fnma.s.s0 FR_RESULT = FR_EpsNorm, FR_EpsNorm, FR_2//y = 2 if x <-4.4
+(p14) br.ret.spnt b0
+}
+;;
+
+// Nfloat = round_int(W)
+// The significand of EXP_W_2TO56_RSH contains the rounded integer part of W,
+// as a twos complement number in the lower bits (that is, it may be negative).
+// That twos complement number (called N) is put into exp_GR_N.
+
+// Since EXP_W_2TO56_RSH is scaled by 2^56, it must be multiplied by 2^-56
+// before the shift constant 1.10000 * 2^63 is subtracted to yield EXP_Nfloat.
+// Thus, EXP_Nfloat contains the floating point version of N
+
+{ .mfi
+ nop.m 0
+ fms.s1 EXP_Nfloat = EXP_W_2TO56_RSH, EXP_2TOM56, EXP_RSHF
+ nop.i 0
+}
+{ .mfb
+(p15) mov GR_Parameter_TAG = 209
+(p15) fma.s.s0 FR_RESULT = FR_EpsNorm,FR_EpsNorm,f0 //Result.for x>10.06
+(p15) br.cond.spnt __libm_error_region
+}
+;;
+
+// Now we can calculate polynomial P15(x)
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_1_1 = FR_AbsArg, FR_AbsArg, f0 // x ^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_0_1 = FR_A15, FR_AbsArg, FR_A14
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_1_2 = FR_A13, FR_AbsArg, FR_A12
+ nop.i 0
+}
+;;
+
+{ .mfi
+ getf.sig exp_GR_N = EXP_W_2TO56_RSH
+ fma.s1 FR_P15_2_1 = FR_A9, FR_AbsArg, FR_A8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_2_2 = FR_A11, FR_AbsArg, FR_A10
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_3_1 = FR_A5, FR_AbsArg, FR_A4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_3_2 = FR_A7, FR_AbsArg, FR_A6
+ nop.i 0
+}
+;;
+
+// exp_GR_index_1 has index_1
+// exp_GR_index_2_16 has index_2 * 16
+// exp_GR_biased_M has M + bias (0xffff)
+// index_1 * 16 is formed by the shladd that computes EXP_AD_T1
+
+// r2 has true M
+{ .mfi
+ and exp_GR_index_1 = 0x0f, exp_GR_N
+ fma.s1 FR_P15_4_1 = FR_A1, FR_AbsArg, FR_A0
+ shr r2 = exp_GR_N, 0x7
+
+}
+{ .mfi
+ and exp_GR_index_2_16 = 0x70, exp_GR_N
+ fma.s1 FR_P15_4_2 = FR_A3, FR_AbsArg, FR_A2
+ nop.i 0
+}
+;;
+
+// EXP_AD_T1 has address of T1
+// EXP_AD_T2 has address of T2
+
+{ .mfi
+ add EXP_AD_T2 = EXP_AD_TB2, exp_GR_index_2_16
+ nop.f 0
+ shladd EXP_AD_T1 = exp_GR_index_1, 4, EXP_AD_TB1
+}
+{ .mfi
+ addl exp_GR_biased_M = 0xffff, r2
+ fnma.s1 exp_r = EXP_Nfloat, exp_ln2_by_128_hi, EXP_NORM_f8
+ nop.i 0
+}
+;;
+
+// Create Scale = 2^M
+// r = (-x^2) - Nfloat * ln2_by_128_hi
+
+{ .mfi
+ setf.exp EXP_2M = exp_GR_biased_M
+ fma.s1 FR_P15_7_1 = FR_P15_0_1, FR_P15_1_1, FR_P15_1_2
+ nop.i 0
+}
+{ .mfi
+ ldfe exp_T2 = [EXP_AD_T2]
+ nop.f 0
+ nop.i 0
+}
+;;
+
+// Load T1 and T2
+
+{ .mfi
+ ldfe exp_T1 = [EXP_AD_T1]
+ fma.s1 FR_P15_7_2 = FR_P15_1_1, FR_P15_1_1, f0 // x^4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_8_1 = FR_P15_1_1, FR_P15_2_2, FR_P15_2_1
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_9_1 = FR_P15_1_1, FR_P15_4_2, FR_P15_4_1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_9_2 = FR_P15_1_1, FR_P15_3_2, FR_P15_3_1
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 exp_P = FR_1_by_6, exp_r, FR_05 // r/6 + 1/2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 exp_rsq = exp_r, exp_r, f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_13_1 = FR_P15_7_2, FR_P15_7_1, FR_P15_8_1
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_14_1 = FR_P15_7_2, FR_P15_9_2, FR_P15_9_1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_14_2 = FR_P15_7_2, FR_P15_7_2, f0 // x^8
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 exp_P = exp_P, exp_rsq, exp_r // r + r^2/2 + r^3/6
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 exp_S1 = EXP_2M, exp_T2, f0 // 2^M * T2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Pol = FR_P15_14_2, FR_P15_13_1, FR_P15_14_1 // P15(x)
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 exp_S = exp_S1, exp_T1, f0 // 2^M * T2 * T1
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Exp = exp_S, exp_P, exp_S // exp(-x^2)
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s.s0 FR_Tmpf = f8, f1, f0 // Flag d
+ nop.i 0
+}
+;;
+
+//p6: result for 0 < x < = POS_ARG_ASYMP
+//p7: result for - NEG_ARG_ASYMP <= x < 0
+//p8: exit for - NEG_ARG_ASYMP <= x <= UnfBound, x!=0
+.pred.rel "mutex",p6,p7
+{ .mfi
+ nop.m 0
+(p6) fma.s.s0 f8 = FR_Exp, FR_Pol, f0
+ nop.i 0
+}
+{ .mfb
+ mov GR_Parameter_TAG = 209
+(p7) fnma.s.s0 f8 = FR_Exp, FR_Pol, FR_2
+(p8) br.ret.sptk b0
+}
+;;
+
+//p10: branch for UnfBound < x < = POS_ARG_ASYMP
+{ .mfb
+ nop.m 0
+ nop.f 0
+(p10) br.cond.spnt __libm_error_region
+}
+;;
+
+//Only via (p9) br.cond.spnt SPECIAL for x = 0,+inf,-inf,nan,unnorm
+SPECIAL:
+
+{ .mfi
+ nop.m 0
+ fclass.m.unc p10,p0 = f8,0x07 // p10: x = 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fclass.m.unc p11,p0 = f8,0x21 // p11: x = +inf
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fclass.m.unc p12,p0 = f8,0x22 // p12 x = -inf
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p10) fma.s.s0 f8 = f1, f1, f0 // erfcf(0) = 1.0
+(p10) br.ret.sptk b0 // Quick exit for x = 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fclass.m.unc p13,p0 = f8,0xc3 // p13: x = nan
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p11) fma.s.s0 f8 = f0, f1, f0 // erfcf(+inf) = +0
+(p11) br.ret.spnt b0 // Quick exit for x = +inf
+}
+;;
+{ .mfi
+ nop.m 0
+ fclass.m.unc p14,p0 = f8,0x0b // P14: x = unnormalized
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p12) fma.s.s0 f8 = f1, f1, f1 // erfcf(-inf) = 2.0
+(p12) br.ret.spnt b0 // Quick exit for x = -inf
+}
+;;
+
+{ .mfb
+ nop.m 0
+(p13) fma.s.s0 f8 = f8, f1, f0 // x * 1 + 0 for nan input
+(p13) br.ret.sptk b0 // Quick exit for x = nan
+}
+;;
+
+{ .mfb
+ nop.m 0
+(p14) fnma.s.s0 f8 = f8, f1, f1 // 1 - x
+(p14) br.ret.sptk b0 // Quick exit for x = unnormalized
+}
+;;
+
+GLOBAL_LIBM_END(erfcf)
+
+// Error handling stub for erfcf. It is reached in two ways:
+// 1. via (p10) br.cond.spnt __libm_error_region
+// for UnfBound < x <= POS_ARG_ASYMP
+// 2. via (p15) br.cond.spnt __libm_error_region
+// for x > POS_ARG_ASYMP
+// Stores X, Y and RESULT on the stack and calls __libm_error_support.
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+.prologue
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address: old sp-32 (new sp+32)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new 64-byte stack frame
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+{ .mmi
+ stfs [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack, then advance to sp+48
+ add GR_Parameter_X = 16,sp // Parameter 1 address (sp+16)
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+.body
+{ .mib
+ stfs [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address (sp+48)
+ nop.b 0
+}
+{ .mib
+ stfs [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Back to Parameter 2 address (sp+32)
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp // Recompute address of the result slot
+};;
+{ .mmi
+ ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+.type __libm_error_support#,@function
+.global __libm_error_support#
+
+
+
diff --git a/sysdeps/ia64/fpu/s_erfcl.S b/sysdeps/ia64/fpu/s_erfcl.S
new file mode 100644
index 0000000000..f06e26f59f
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_erfcl.S
@@ -0,0 +1,2064 @@
+.file "erfcl.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 11/12/01 Initial version
+// 02/08/02 Added missing }
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align;
+// used data8 for long double table values
+//
+// API
+//==============================================================
+// long double erfcl(long double)
+//
+// Implementation and Algorithm Notes:
+//==============================================================
+// 1. 0 <= x <= 107.0
+//
+// erfcl(x) ~=~ P15(z) * expl( -x^2 )/(dx + x), z = x - xc(i).
+//
+// Comment:
+//
+// Let x(i) = -1.0 + 2^(i/4),i=0,...27. So we have 28 unequal
+// argument intervals [x(i),x(i+1)] with length ratio q = 2^(1/4).
+// The values xc(i) are stored in the table erfc_xc_table: xc(i) = x(i) for
+// i = 0 and xc(i) = 0.5*( x(i)+x(i+1) ) for i > 0.
+//
+// Let x(i)<= x < x(i+1).
+// We can find i as exponent of number (x + 1)^4.
+//
+// Let P15(z)= a0+ a1*z +..+a15*z^15 - polynomial approximation of degree 15
+// for function erfcl(z+xc(i)) * expl( (z+xc(i))^2)* (dx+z+xc(i)) and
+// -0.5*[x(i+1)-x(i)] <= z <= 0.5*[x(i+1)-x(i)].
+//
+// Let Q(z)= (P(z)- S)/S, S = a0, rounded to 16 bits.
+// The polynomial coefficients of Q(z) are stored in the table erfc_Q_table
+// as long double values.
+//
+// We use multi precision to calculate input argument -x^2 for expl and
+// for u = 1/(dx + x).
+//
+// The algorithm used for the expl function is described below. In the
+// notation of that algorithm we have for expl:
+//
+// expl(X) ~=~ 2^K*T_1*(1+W_1)*T_2*(1+W_2)*(1+ poly(r)), X = -x^2.
+//
+// Final calculations for erfcl:
+//
+// erfcl(x) ~=~
+//
+// 2^K*T_1*(1+W_1)*T_2*(1+W_2)*(1+ poly(r))*(1-dy)*S*(1+Q(z))*u*(1+du),
+//
+// where dy - low bits of x^2 and u, u*du - hi and low bits of 1/(dx + x).
+//
+// The calculations are performed in the following order:
+//
+// 1) M = 2^K*T_1*T_2*S without rounding error,
+// 2) W = W_1 + (W_2 + W_1*W_2), where 1+W ~=~ (1+W_1)(1+W_2),
+// 3) H = W - dy, where 1+H ~=~ (1+W )(1-dy),
+// 4) R = poly(r)*H + poly(r),
+// 5) R = H + R , where 1+R ~=~ (1+H )(1+poly(r)),
+// 6) G = Q(z)*R + Q(z),
+// 7) R1 = R + du, where 1+R1 ~=~ (1+R)(1+du),
+// 8) G1 = R1 + G, where 1+G1 ~=~ (1+R1)(1+Q(z)),
+// 9) V = G1*M*u,
+// 10) erfcl(x) ~=~ M*u + V
+//
+// 2. -6.5 <= x < 0
+//
+// erfcl(x) = 2.0 - erfcl(-x)
+//
+// 3. x > 107.0
+// erfcl(x) ~=~ 0.0
+//
+// 4. x < -6.5
+// erfcl(x) ~=~ 2.0
+
+// Special values
+//==============================================================
+// erfcl(+0) = 1.0
+// erfcl(-0) = 1.0
+
+// erfcl(+qnan) = +qnan
+// erfcl(-qnan) = -qnan
+// erfcl(+snan) = +qnan
+// erfcl(-snan) = -qnan
+
+// erfcl(-inf) = 2.0
+// erfcl(+inf) = +0
+
+//==============================================================
+// Algorithm description of used expl function.
+//
+// Implementation and Algorithm Notes:
+//
+// ker_exp_64( in_FR : X,
+// out_FR : Y_hi,
+// out_FR : Y_lo,
+// out_FR : scale,
+// out_PR : Safe )
+//
+// On input, X is in register format
+//
+// On output,
+//
+// scale*(Y_hi + Y_lo) approximates exp(X)
+//
+// The accuracy is sufficient for a highly accurate 64 sig.
+// bit implementation. Safe is set if there is no danger of
+// overflow/underflow when the result is composed from scale,
+// Y_hi and Y_lo. Thus, we can have a fast return if Safe is set.
+// Otherwise, one must prepare to handle the possible exception
+// appropriately. Note that SAFE not set (false) does not mean
+// that overflow/underflow will occur; only the setting of SAFE
+// guarantees the opposite.
+//
+// **** High Level Overview ****
+//
+// The method consists of three cases.
+//
+// If |X| < Tiny use case exp_tiny;
+// else if |X| < 2^(-6) use case exp_small;
+// else use case exp_regular;
+//
+// Case exp_tiny:
+//
+// 1 + X can be used to approximate exp(X)
+// X + X^2/2 can be used to approximate exp(X) - 1
+//
+// Case exp_small:
+//
+// Here, exp(X) and exp(X) - 1 can both be
+// approximated by a relatively simple polynomial.
+//
+// This polynomial resembles the truncated Taylor series
+//
+// exp(w) = 1 + w + w^2/2! + w^3/3! + ... + w^n/n!
+//
+// Case exp_regular:
+//
+// Here we use a table lookup method. The basic idea is that in
+// order to compute exp(X), we accurately decompose X into
+//
+// X = N * log(2)/(2^12) + r, |r| <= log(2)/2^13.
+//
+// Hence
+//
+// exp(X) = 2^( N / 2^12 ) * exp(r).
+//
+// The value 2^( N / 2^12 ) is obtained by simple combinations
+// of values calculated beforehand and stored in table; exp(r)
+// is approximated by a short polynomial because |r| is small.
+//
+// We elaborate this method in 4 steps.
+//
+// Step 1: Reduction
+//
+// The value 2^12/log(2) is stored as a double-extended number
+// L_Inv.
+//
+// N := round_to_nearest_integer( X * L_Inv )
+//
+// The value log(2)/2^12 is stored as two numbers L_hi and L_lo so
+// that r can be computed accurately via
+//
+// r := (X - N*L_hi) - N*L_lo
+//
+// We pick L_hi such that N*L_hi is representable in 64 sig. bits
+// and thus the FMA X - N*L_hi is error free. So r is within
+// 1 rounding error of an exact reduction with respect to
+//
+// L_hi + L_lo.
+//
+// In particular, L_hi has 30 significant bits and can be stored
+// as a double-precision number; L_lo has 64 significant bits and
+// stored as a double-extended number.
+//
+// Step 2: Approximation
+//
+// exp(r) - 1 is approximated by a short polynomial of the form
+//
+// r + A_1 r^2 + A_2 r^3 + A_3 r^4 .
+//
+// Step 3: Composition from Table Values
+//
+// The value 2^( N / 2^12 ) can be composed from a couple of tables
+// of precalculated values. First, express N as three integers
+// K, M_1, and M_2 as
+//
+// N = K * 2^12 + M_1 * 2^6 + M_2
+//
+// Where 0 <= M_1, M_2 < 2^6; and K can be positive or negative.
+// When N is represented in 2's complement, M_2 is simply the 6
+// lsb's, M_1 is the next 6, and K is simply N shifted right
+// arithmetically (sign extended) by 12 bits.
+//
+// Now, 2^( N / 2^12 ) is simply
+//
+// 2^K * 2^( M_1 / 2^6 ) * 2^( M_2 / 2^12 )
+//
+// Clearly, 2^K needs no tabulation. The other two values are less
+// trivial because if we store each accurately to more than working
+// precision, then their product is too expensive to calculate. We
+// use the following method.
+//
+// Define two mathematical values, delta_1 and delta_2, implicitly
+// such that
+//
+// T_1 = exp( [M_1 log(2)/2^6] - delta_1 )
+// T_2 = exp( [M_2 log(2)/2^12] - delta_2 )
+//
+// are representable as 24 significant bits. To illustrate the idea,
+// we show how we define delta_1:
+//
+// T_1 := round_to_24_bits( exp( M_1 log(2)/2^6 ) )
+// delta_1 = (M_1 log(2)/2^6) - log( T_1 )
+//
+// The last equality means mathematical equality. We then tabulate
+//
+// W_1 := exp(delta_1) - 1
+// W_2 := exp(delta_2) - 1
+//
+// Both in double precision.
+//
+// From the tabulated values T_1, T_2, W_1, W_2, we compose the values
+// T and W via
+//
+// T := T_1 * T_2 ...exactly
+// W := W_1 + (1 + W_1)*W_2
+//
+// W approximates exp( delta ) - 1 where delta = delta_1 + delta_2.
+// The mathematical product of T and (W+1) is an accurate representation
+// of 2^(M_1/2^6) * 2^(M_2/2^12).
+//
+// Step 4. Reconstruction
+//
+// Finally, we can reconstruct exp(X), exp(X) - 1.
+// Because
+//
+// X = K * log(2) + (M_1*log(2)/2^6 - delta_1)
+// + (M_2*log(2)/2^12 - delta_2)
+// + delta_1 + delta_2 + r ...accurately
+// We have
+//
+// exp(X) ~=~ 2^K * ( T + T*[exp(delta_1+delta_2+r) - 1] )
+// ~=~ 2^K * ( T + T*[exp(delta + r) - 1] )
+// ~=~ 2^K * ( T + T*[(exp(delta)-1)
+// + exp(delta)*(exp(r)-1)] )
+// ~=~ 2^K * ( T + T*( W + (1+W)*poly(r) ) )
+// ~=~ 2^K * ( Y_hi + Y_lo )
+//
+// where Y_hi = T and Y_lo = T*(W + (1+W)*poly(r))
+//
+// For exp(X)-1, we have
+//
+// exp(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1
+// ~=~ 2^K * ( Y_hi + Y_lo - 2^(-K) )
+//
+// and we combine Y_hi + Y_lo - 2^(-K) into the form of two
+// numbers Y_hi + Y_lo carefully.
+//
+// **** Algorithm Details ****
+//
+// A careful algorithm must be used to realize the mathematical ideas
+// accurately. We describe each of the three cases. We assume SAFE
+// is preset to be TRUE.
+//
+// Case exp_tiny:
+//
+// The important points are to ensure an accurate result under
+// different rounding directions and a correct setting of the SAFE
+// flag.
+//
+// If expm1 is 1, then
+// SAFE := False ...possibility of underflow
+// Scale := 1.0
+// Y_hi := X
+// Y_lo := 2^(-17000)
+// Else
+// Scale := 1.0
+// Y_hi := 1.0
+// Y_lo := X ...for different rounding modes
+// Endif
+//
+// Case exp_small:
+//
+// Here we compute a simple polynomial. To exploit parallelism, we split
+// the polynomial into several portions.
+//
+// Let r = X
+//
+// If exp ...i.e. exp( argument )
+//
+// rsq := r * r;
+// r4 := rsq*rsq
+// poly_lo := P_3 + r*(P_4 + r*(P_5 + r*P_6))
+// poly_hi := r + rsq*(P_1 + r*P_2)
+// Y_lo := poly_hi + r4 * poly_lo
+// Y_hi := 1.0
+// Scale := 1.0
+//
+// Else ...i.e. exp( argument ) - 1
+//
+// rsq := r * r
+// r4 := rsq * rsq
+// r6 := rsq * r4
+// poly_lo := r6*(Q_5 + r*(Q_6 + r*Q_7))
+// poly_hi := Q_1 + r*(Q_2 + r*(Q_3 + r*Q_4))
+// Y_lo := rsq*poly_hi + poly_lo
+// Y_hi := X
+// Scale := 1.0
+//
+// Endif
+//
+// Case exp_regular:
+//
+// The previous description contains enough information except the
+// computation of poly and the final Y_hi and Y_lo in the case for
+// exp(X)-1.
+//
+// The computation of poly for Step 2:
+//
+// rsq := r*r
+// poly := r + rsq*(A_1 + r*(A_2 + r*A_3))
+//
+// For the case exp(X) - 1, we need to incorporate 2^(-K) into
+// Y_hi and Y_lo at the end of Step 4.
+//
+// If K > 10 then
+// Y_lo := Y_lo - 2^(-K)
+// Else
+// If K < -10 then
+// Y_lo := Y_hi + Y_lo
+// Y_hi := -2^(-K)
+// Else
+// Y_hi := Y_hi - 2^(-K)
+// End If
+// End If
+//
+
+// Overview of operation
+//==============================================================
+
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input
+// f9 -> f14, f36 -> f126
+
+// General registers used:
+// r32 -> r71
+
+// Predicate registers used:
+// p6 -> p15
+
+// Assembly macros
+//==============================================================
+// GR for exp(X): symbolic names for general registers r33-r53
+GR_ad_Arg = r33
+GR_ad_C = r34
+GR_ERFC_S_TB = r35
+GR_signexp_x = r36
+GR_exp_x = r36 // same register as GR_signexp_x
+GR_exp_mask = r37
+GR_ad_W1 = r38
+GR_ad_W2 = r39
+GR_M2 = r40
+GR_M1 = r41
+GR_K = r42
+GR_exp_2_k = r43
+GR_ad_T1 = r44
+GR_ad_T2 = r45
+GR_N_fix = r46
+GR_ad_P = r47
+GR_exp_bias = r48
+GR_BIAS = r48 // same register as GR_exp_bias
+GR_exp_half = r49
+GR_sig_inv_ln2 = r50
+GR_rshf_2to51 = r51
+GR_exp_2tom51 = r52
+GR_rshf = r53
+
+// GR for erfcl(x): symbolic names for general registers r54-r71
+//==============================================================
+
+GR_ERFC_XC_TB = r54
+GR_ERFC_P_TB = r55
+GR_IndxPlusBias = r56
+GR_P_POINT_1 = r57
+GR_P_POINT_2 = r58
+GR_AbsArg = r59
+GR_ShftXBi = r60
+GR_ShftPi = r61
+GR_mBIAS = r62
+GR_ShftPi_bias = r63
+GR_ShftXBi_bias = r64
+GR_ShftA14 = r65
+GR_ShftA15 = r66
+GR_EpsNorm = r67
+GR_0x1 = r68
+GR_ShftPi_8 = r69
+GR_26PlusBias = r70
+GR_27PlusBias = r71
+
+// GR for __libm_error_support call (r64-r71 are also given erfcl(x) names in this file)
+//==============================================================
+GR_SAVE_B0 = r64
+GR_SAVE_PFS = r65
+GR_SAVE_GP = r66
+GR_SAVE_SP = r67
+
+GR_Parameter_X = r68
+GR_Parameter_Y = r69
+GR_Parameter_RESULT = r70
+GR_Parameter_TAG = r71
+
+//==============================================================
+// Floating Point Registers: symbolic names (several names share one register)
+//
+FR_RSHF_2TO51 = f10
+FR_INV_LN2_2TO63 = f11
+FR_W_2TO51_RSH = f12
+FR_2TOM51 = f13
+FR_RSHF = f14
+
+FR_scale = f36
+FR_float_N = f37
+FR_N_signif = f38
+FR_L_hi = f39
+FR_L_lo = f40
+FR_r = f41
+FR_W1 = f42
+FR_T1 = f43
+FR_W2 = f44
+FR_T2 = f45
+FR_rsq = f46
+FR_C2 = f47
+FR_C3 = f48
+FR_poly = f49
+FR_P6 = f49 // same register as FR_poly
+FR_T = f50
+FR_P5 = f50 // same register as FR_T
+FR_P4 = f51
+FR_W = f51 // same register as FR_P4
+FR_P3 = f52
+FR_Wp1 = f52 // same register as FR_P3
+FR_P2 = f53
+FR_P1 = f54
+FR_Q7 = f56
+FR_Q6 = f57
+FR_Q5 = f58
+FR_Q4 = f59
+FR_Q3 = f60
+FR_Q2 = f61
+FR_Q1 = f62
+FR_C1 = f63
+FR_A15 = f64
+FR_ch_dx = f65
+FR_T_scale = f66
+FR_norm_x = f67
+FR_AbsArg = f68
+FR_POS_ARG_ASYMP = f69
+FR_NEG_ARG_ASYMP = f70
+FR_Tmp = f71
+FR_Xc = f72
+FR_A0 = f73
+FR_A1 = f74
+FR_A2 = f75
+FR_A3 = f76
+FR_A4 = f77
+FR_A5 = f78
+FR_A6 = f79
+FR_A7 = f80
+FR_A8 = f81
+FR_A9 = f82
+FR_A10 = f83
+FR_A11 = f84
+FR_A12 = f85
+FR_A13 = f86
+FR_A14 = f87
+FR_P15_0_1 = f88
+FR_P15_8_1 = f88 // same register as FR_P15_0_1
+FR_P15_1_1 = f89
+FR_P15_8_2 = f89 // same register as FR_P15_1_1
+FR_P15_1_2 = f90
+FR_P15_2_1 = f91
+FR_P15_2_2 = f92
+FR_P15_3_1 = f93
+FR_P15_3_2 = f94
+FR_P15_4_2 = f95
+FR_P15_7_1 = f96
+FR_P15_7_2 = f97
+FR_P15_9_1 = f98
+FR_P15_9_2 = f99
+FR_P15_13_1 = f100
+FR_P15_14_1 = f101
+FR_P15_14_2 = f102
+FR_Tmp2 = f103
+FR_Xpdx_lo = f104
+FR_2 = f105
+FR_xsq_lo = f106
+FR_LocArg = f107
+FR_Tmpf = f108
+FR_Tmp1 = f109
+FR_EpsNorm = f110
+FR_UnfBound = f111
+FR_NormX = f112
+FR_Xpdx_hi = f113
+FR_dU = f114
+FR_H = f115
+FR_G = f116
+FR_V = f117
+FR_M = f118
+FR_U = f119
+FR_Q = f120
+FR_S = f121
+FR_R = f122
+FR_res_pos_x_hi = f123
+FR_res_pos_x_lo = f124
+FR_dx = f125
+FR_dx1 = f126
+
+// for error handler routine (parameters passed to __libm_error_support)
+FR_X = f9
+FR_Y = f0
+FR_RESULT = f8 // f8 is also the input register listed under "Registers used"
+
+// Data tables
+//==============================================================
+RODATA
+.align 16
+
+// ************* DO NOT CHANGE ORDER OF THESE TABLES ********************
+LOCAL_OBJECT_START(exp_table_1) // Argument thresholds and small constants for erfcl
+
+data8 0xae89f995ad3ad5ea , 0x00003ffe // x = 0.681.. (long double), bound for dx = 0.875
+data8 0x405AC00000000000 , 0x401A000000000000 // ARG_ASYMP = 107.0, NEG_ARG_ASYMP = 6.5 (doubles)
+data8 0x3FE4000000000000 , 0x3FEC000000000000 // 0.625, 0.875 (doubles)
+data8 0xD5126065B720A4e9 , 0x00004005 // underflow boundary, approx. 106.5 (long double)
+data8 0x8000000000000000 , 0x00000001 // FR_EpsNorm: significand 1.0 with biased exponent 1 (long double)
+LOCAL_OBJECT_END(exp_table_1)
+
+LOCAL_OBJECT_START(Constants_exp_64_Arg) // log(2)/2^12 split into L_hi + L_lo for the argument reduction
+data8 0xB17217F400000000,0x00003FF2 // L_hi = high part of log(2)/2^12
+data8 0xF473DE6AF278ECE6,0x00003FD4 // L_lo = low part of log(2)/2^12
+LOCAL_OBJECT_END(Constants_exp_64_Arg)
+
+LOCAL_OBJECT_START(Constants_exp_64_C) // Three long double polynomial coefficients, stored C3 first
+data8 0xAAAAAAABB1B736A0,0x00003FFA // C3, approx. 1/24
+data8 0xAAAAAAAB90CD6327,0x00003FFC // C2, approx. 1/6
+data8 0xFFFFFFFFFFFFFFFF,0x00003FFD // C1, approx. 1/2
+LOCAL_OBJECT_END(Constants_exp_64_C)
+
+LOCAL_OBJECT_START(Constants_exp_64_T1) // T_1 values: 64 single-precision entries, 2^(M_1/2^6) rounded to 24 bits
+data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
+data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
+data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC
+data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D
+data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA
+data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516
+data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A
+data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4
+data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B
+data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD
+data4 0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15
+data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B
+data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5
+data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A
+data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177
+data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C
+LOCAL_OBJECT_END(Constants_exp_64_T1)
+
+LOCAL_OBJECT_START(Constants_exp_64_T2) // T_2 values: 64 single-precision entries, 2^(M_2/2^12) rounded to 24 bits
+data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
+data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
+data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
+data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
+data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
+data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
+data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
+data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
+data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
+data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
+data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
+data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
+data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
+data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
+data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
+data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37
+LOCAL_OBJECT_END(Constants_exp_64_T2)
+
+LOCAL_OBJECT_START(Constants_exp_64_W1) // W_1 = exp(delta_1) - 1: 64 double-precision entries, two per line
+data8 0x0000000000000000, 0xBE384454171EC4B4
+data8 0xBE6947414AA72766, 0xBE5D32B6D42518F8
+data8 0x3E68D96D3A319149, 0xBE68F4DA62415F36
+data8 0xBE6DDA2FC9C86A3B, 0x3E6B2E50F49228FE
+data8 0xBE49C0C21188B886, 0x3E64BFC21A4C2F1F
+data8 0xBE6A2FBB2CB98B54, 0x3E5DC5DE9A55D329
+data8 0x3E69649039A7AACE, 0x3E54728B5C66DBA5
+data8 0xBE62B0DBBA1C7D7D, 0x3E576E0409F1AF5F
+data8 0x3E6125001A0DD6A1, 0xBE66A419795FBDEF
+data8 0xBE5CDE8CE1BD41FC, 0xBE621376EA54964F
+data8 0x3E6370BE476E76EE, 0x3E390D1A3427EB92
+data8 0x3E1336DE2BF82BF8, 0xBE5FF1CBD0F7BD9E
+data8 0xBE60A3550CEB09DD, 0xBE5CA37E0980F30D
+data8 0xBE5C541B4C082D25, 0xBE5BBECA3B467D29
+data8 0xBE400D8AB9D946C5, 0xBE5E2A0807ED374A
+data8 0xBE66CB28365C8B0A, 0x3E3AAD5BD3403BCA
+data8 0x3E526055C7EA21E0, 0xBE442C75E72880D6
+data8 0x3E58B2BB85222A43, 0xBE5AAB79522C42BF
+data8 0xBE605CB4469DC2BC, 0xBE589FA7A48C40DC
+data8 0xBE51C2141AA42614, 0xBE48D087C37293F4
+data8 0x3E367A1CA2D673E0, 0xBE51BEBB114F7A38
+data8 0xBE6348E5661A4B48, 0xBDF526431D3B9962
+data8 0x3E3A3B5E35A78A53, 0xBE46C46C1CECD788
+data8 0xBE60B7EC7857D689, 0xBE594D3DD14F1AD7
+data8 0xBE4F9C304C9A8F60, 0xBE52187302DFF9D2
+data8 0xBE5E4C8855E6D68F, 0xBE62140F667F3DC4
+data8 0xBE36961B3BF88747, 0x3E602861C96EC6AA
+data8 0xBE3B5151D57FD718, 0x3E561CD0FC4A627B
+data8 0xBE3A5217CA913FEA, 0x3E40A3CC9A5D193A
+data8 0xBE5AB71310A9C312, 0x3E4FDADBC5F57719
+data8 0x3E361428DBDF59D5, 0x3E5DB5DB61B4180D
+data8 0xBE42AD5F7408D856, 0x3E2A314831B2B707
+LOCAL_OBJECT_END(Constants_exp_64_W1)
+
+LOCAL_OBJECT_START(Constants_exp_64_W2) // W_2 = exp(delta_2) - 1: 64 double-precision entries, two per line
+data8 0x0000000000000000, 0xBE641F2537A3D7A2
+data8 0xBE68DD57AD028C40, 0xBE5C77D8F212B1B6
+data8 0x3E57878F1BA5B070, 0xBE55A36A2ECAE6FE
+data8 0xBE620608569DFA3B, 0xBE53B50EA6D300A3
+data8 0x3E5B5EF2223F8F2C, 0xBE56A0D9D6DE0DF4
+data8 0xBE64EEF3EAE28F51, 0xBE5E5AE2367EA80B
+data8 0x3E47CB1A5FCBC02D, 0xBE656BA09BDAFEB7
+data8 0x3E6E70C6805AFEE7, 0xBE6E0509A3415EBA
+data8 0xBE56856B49BFF529, 0x3E66DD3300508651
+data8 0x3E51165FC114BC13, 0x3E53333DC453290F
+data8 0x3E6A072B05539FDA, 0xBE47CD877C0A7696
+data8 0xBE668BF4EB05C6D9, 0xBE67C3E36AE86C93
+data8 0xBE533904D0B3E84B, 0x3E63E8D9556B53CE
+data8 0x3E212C8963A98DC8, 0xBE33138F032A7A22
+data8 0x3E530FA9BC584008, 0xBE6ADF82CCB93C97
+data8 0x3E5F91138370EA39, 0x3E5443A4FB6A05D8
+data8 0x3E63DACD181FEE7A, 0xBE62B29DF0F67DEC
+data8 0x3E65C4833DDE6307, 0x3E5BF030D40A24C1
+data8 0x3E658B8F14E437BE, 0xBE631C29ED98B6C7
+data8 0x3E6335D204CF7C71, 0x3E529EEDE954A79D
+data8 0x3E5D9257F64A2FB8, 0xBE6BED1B854ED06C
+data8 0x3E5096F6D71405CB, 0xBE3D4893ACB9FDF5
+data8 0xBDFEB15801B68349, 0x3E628D35C6A463B9
+data8 0xBE559725ADE45917, 0xBE68C29C042FC476
+data8 0xBE67593B01E511FA, 0xBE4A4313398801ED
+data8 0x3E699571DA7C3300, 0x3E5349BE08062A9E
+data8 0x3E5229C4755BB28E, 0x3E67E42677A1F80D
+data8 0xBE52B33F6B69C352, 0xBE6B3550084DA57F
+data8 0xBE6DB03FD1D09A20, 0xBE60CBC42161B2C1
+data8 0x3E56ED9C78A2B771, 0xBE508E319D0FA795
+data8 0xBE59482AFD1A54E9, 0xBE2A17CEB07FD23E
+data8 0x3E68BF5C17365712, 0x3E3956F9B3785569
+LOCAL_OBJECT_END(Constants_exp_64_W2)
+
+
+LOCAL_OBJECT_START(erfc_xc_table) // xc(i), i = 0..27: 28 long double entries (significand, sign/exponent)
+
+data8 0x0000000000000000, 0x00000000 //XC[0] = +0.00000000000000000000e-01L
+data8 0x9A79C70000000000, 0x00003FFD //XC[1] = +3.01710337400436401367e-01L
+data8 0x8C49EF0000000000, 0x00003FFE //XC[2] = +5.48003137111663818359e-01L
+data8 0xD744FC0000000000, 0x00003FFE //XC[3] = +8.40896368026733398438e-01L
+data8 0x9837F00000000000, 0x00003FFF //XC[4] = +1.18920707702636718750e+00L
+data8 0xCD3CE30000000000, 0x00003FFF //XC[5] = +1.60342061519622802734e+00L
+data8 0x8624F70000000000, 0x00004000 //XC[6] = +2.09600615501403808594e+00L
+data8 0xABA27E0000000000, 0x00004000 //XC[7] = +2.68179273605346679688e+00L
+data8 0xD837F00000000000, 0x00004000 //XC[8] = +3.37841415405273437500e+00L
+data8 0x869E710000000000, 0x00004001 //XC[9] = +4.20684099197387695313e+00L
+data8 0xA624F70000000000, 0x00004001 //XC[10] = +5.19201231002807617188e+00L
+data8 0xCBA27E0000000000, 0x00004001 //XC[11] = +6.36358547210693359375e+00L
+data8 0xF837F00000000000, 0x00004001 //XC[12] = +7.75682830810546875000e+00L
+data8 0x969E710000000000, 0x00004002 //XC[13] = +9.41368198394775390625e+00L
+data8 0xB624F70000000000, 0x00004002 //XC[14] = +1.13840246200561523438e+01L
+data8 0xDBA27E0000000000, 0x00004002 //XC[15] = +1.37271709442138671875e+01L
+data8 0x841BF80000000000, 0x00004003 //XC[16] = +1.65136566162109375000e+01L
+data8 0x9E9E710000000000, 0x00004003 //XC[17] = +1.98273639678955078125e+01L
+data8 0xBE24F70000000000, 0x00004003 //XC[18] = +2.37680492401123046875e+01L
+data8 0xE3A27E0000000000, 0x00004003 //XC[19] = +2.84543418884277343750e+01L
+data8 0x881BF80000000000, 0x00004004 //XC[20] = +3.40273132324218750000e+01L
+data8 0xA29E710000000000, 0x00004004 //XC[21] = +4.06547279357910156250e+01L
+data8 0xC224F70000000000, 0x00004004 //XC[22] = +4.85360984802246093750e+01L
+data8 0xE7A27E0000000000, 0x00004004 //XC[23] = +5.79086837768554687500e+01L
+data8 0x8A1BF80000000000, 0x00004005 //XC[24] = +6.90546264648437500000e+01L
+data8 0xA49E710000000000, 0x00004005 //XC[25] = +8.23094558715820312500e+01L
+data8 0xC424F70000000000, 0x00004005 //XC[26] = +9.80721969604492187500e+01L
+data8 0xD5A27E0000000000, 0x00004005 //XC[27] = +1.06817367553710937500e+02L
+LOCAL_OBJECT_END(erfc_xc_table)
+
+LOCAL_OBJECT_START(erfc_s_table) // S values (a0 rounded to 16 bits), s[0..27]: 28 long double entries
+
+data8 0xE000000000000000, 0x00003FFE //s[0] = +8.75000000000000000000e-01L
+data8 0xDCEF000000000000, 0x00003FFE //s[1] = +8.63021850585937500000e-01L
+data8 0xD79D000000000000, 0x00003FFE //s[2] = +8.42239379882812500000e-01L
+data8 0xB25E000000000000, 0x00003FFE //s[3] = +6.96746826171875000000e-01L
+data8 0xB0EA000000000000, 0x00003FFE //s[4] = +6.91070556640625000000e-01L
+data8 0xAE3F000000000000, 0x00003FFE //s[5] = +6.80648803710937500000e-01L
+data8 0xAB05000000000000, 0x00003FFE //s[6] = +6.68045043945312500000e-01L
+data8 0xA7AC000000000000, 0x00003FFE //s[7] = +6.54968261718750000000e-01L
+data8 0xA478000000000000, 0x00003FFE //s[8] = +6.42456054687500000000e-01L
+data8 0xA18D000000000000, 0x00003FFE //s[9] = +6.31057739257812500000e-01L
+data8 0x9EF8000000000000, 0x00003FFE //s[10] = +6.20971679687500000000e-01L
+data8 0x9CBA000000000000, 0x00003FFE //s[11] = +6.12213134765625000000e-01L
+data8 0x9ACD000000000000, 0x00003FFE //s[12] = +6.04690551757812500000e-01L
+data8 0x992A000000000000, 0x00003FFE //s[13] = +5.98297119140625000000e-01L
+data8 0x97C7000000000000, 0x00003FFE //s[14] = +5.92880249023437500000e-01L
+data8 0x969C000000000000, 0x00003FFE //s[15] = +5.88317871093750000000e-01L
+data8 0x95A0000000000000, 0x00003FFE //s[16] = +5.84472656250000000000e-01L
+data8 0x94CB000000000000, 0x00003FFE //s[17] = +5.81222534179687500000e-01L
+data8 0x9419000000000000, 0x00003FFE //s[18] = +5.78506469726562500000e-01L
+data8 0x9383000000000000, 0x00003FFE //s[19] = +5.76217651367187500000e-01L
+data8 0x9305000000000000, 0x00003FFE //s[20] = +5.74295043945312500000e-01L
+data8 0x929B000000000000, 0x00003FFE //s[21] = +5.72677612304687500000e-01L
+data8 0x9242000000000000, 0x00003FFE //s[22] = +5.71319580078125000000e-01L
+data8 0x91F8000000000000, 0x00003FFE //s[23] = +5.70190429687500000000e-01L
+data8 0x91B9000000000000, 0x00003FFE //s[24] = +5.69229125976562500000e-01L
+data8 0x9184000000000000, 0x00003FFE //s[25] = +5.68420410156250000000e-01L
+data8 0x9158000000000000, 0x00003FFE //s[26] = +5.67749023437500000000e-01L
+data8 0x9145000000000000, 0x00003FFE //s[27] = +5.67459106445312500000e-01L
+LOCAL_OBJECT_END(erfc_s_table)
+
+LOCAL_OBJECT_START(erfc_Q_table)
+// Q(z)= (P(z)- S)/S
+
+// Pol0
+data8 0x98325D50F9DC3499, 0x0000BFAA //A0 = +3.07358861423101280650e-26L
+data8 0xED35081A2494DDD9, 0x00003FF8 //A1 = +1.44779757616302832466e-02L
+data8 0x9443549BCD0F94CE, 0x0000BFFD //A2 = -2.89576190966300084405e-01L
+data8 0xC7FD4B98ECF3DBBF, 0x00003FFD //A3 = +3.90604364793467799170e-01L
+data8 0xB82CE31288B49759, 0x0000BFFD //A4 = -3.59717460644199233866e-01L
+data8 0x8A8293447BEF69B5, 0x00003FFD //A5 = +2.70527460203054582368e-01L
+data8 0xB5793E30EE36766C, 0x0000BFFC //A6 = -1.77220317589265674647e-01L
+data8 0xD6066D16BBDECE17, 0x00003FFB //A7 = +1.04504444366724593714e-01L
+data8 0xE7C783CE3C997BD8, 0x0000BFFA //A8 = -5.65867565781331646771e-02L
+data8 0xE9969EBC2F5B2828, 0x00003FF9 //A9 = +2.85142040533900194955e-02L
+data8 0xDD31D619F29AD7BF, 0x0000BFF8 //A10 = -1.35006514390540367929e-02L
+data8 0xC63A20EB59768F3A, 0x00003FF7 //A11 = +6.04940993680332271481e-03L
+data8 0xA8DEC641AACEB600, 0x0000BFF6 //A12 = -2.57675495383156581601e-03L
+data8 0x87F0E77BA914FBEB, 0x00003FF5 //A13 = +1.03714776726541296794e-03L
+data8 0xC306C2894C5CEF2D, 0x0000BFF3 //A14 = -3.71983348634136412407e-04L
+data8 0xBDAB416A989D0697, 0x00003FF1 //A15 = +9.04412111877987292294e-05L
+// Pol1
+data8 0x82808893DA2DD83F, 0x00003FEE //A0 = +7.77853035974467145290e-06L
+data8 0xAE9CD9DCADC86113, 0x0000BFFB //A1 = -8.52601070853077921197e-02L
+data8 0x9D429743E312AD9F, 0x0000BFFB //A2 = -7.67871682732076080494e-02L
+data8 0x8637FC533AE805DC, 0x00003FFC //A3 = +1.31072943286859831330e-01L
+data8 0xF68DBE3639ABCB6E, 0x0000BFFB //A4 = -1.20387540845703264588e-01L
+data8 0xB168FFC3CFA71256, 0x00003FFB //A5 = +8.66260511047190247534e-02L
+data8 0xDBC5078A7EA89236, 0x0000BFFA //A6 = -5.36546988077281230848e-02L
+data8 0xF4331FEDB2CB838F, 0x00003FF9 //A7 = +2.98095344165515989564e-02L
+data8 0xF909173C0E61C25D, 0x0000BFF8 //A8 = -1.51999213123642373375e-02L
+data8 0xEC83560A2ACB23E9, 0x00003FF7 //A9 = +7.21780491979582106904e-03L
+data8 0xD350D62C4FEAD8F5, 0x0000BFF6 //A10 = -3.22442272982896360044e-03L
+data8 0xB2F44F4B3FD9B826, 0x00003FF5 //A11 = +1.36531322425499451283e-03L
+data8 0x9078BC61927671C6, 0x0000BFF4 //A12 = -5.51115510818844954547e-04L
+data8 0xDF67AC6287A63B03, 0x00003FF2 //A13 = +2.13055585989529858265e-04L
+data8 0xA719CFEE67FCE1CE, 0x0000BFF1 //A14 = -7.96798844477905965933e-05L
+data8 0xEF926367BABBB029, 0x00003FEF //A15 = +2.85591875675765038065e-05L
+// Pol2
+data8 0x82B5E5A93B059C50, 0x00003FEF //A0 = +1.55819100856330860049e-05L
+data8 0xDC856BC2542B1938, 0x0000BFFB //A1 = -1.07676355235999875911e-01L
+data8 0xDF225EF5694F14AE, 0x0000BFF8 //A2 = -1.36190345125628043277e-02L
+data8 0xDAF66A954ED22428, 0x00003FFA //A3 = +5.34576571853233908886e-02L
+data8 0xD28AE4F21A392EC6, 0x0000BFFA //A4 = -5.14019911949062230820e-02L
+data8 0x9441A95713F0DB5B, 0x00003FFA //A5 = +3.61954321717769771045e-02L
+data8 0xB0957B5C483C7A04, 0x0000BFF9 //A6 = -2.15556535133667988704e-02L
+data8 0xBB9260E812814F71, 0x00003FF8 //A7 = +1.14484735825400480057e-02L
+data8 0xB68AB17287ABAB04, 0x0000BFF7 //A8 = -5.57073273108465072470e-03L
+data8 0xA56A95E0BC0EF01B, 0x00003FF6 //A9 = +2.52405318381952650677e-03L
+data8 0x8D19C7D286839C00, 0x0000BFF5 //A10 = -1.07651294935087466892e-03L
+data8 0xE45DB3766711A0D3, 0x00003FF3 //A11 = +4.35573615323234291196e-04L
+data8 0xB05949F947FA7AEF, 0x0000BFF2 //A12 = -1.68179306983868501372e-04L
+data8 0x82901D055A0D5CB6, 0x00003FF1 //A13 = +6.22572626227726684168e-05L
+data8 0xBB957698542D6FD0, 0x0000BFEF //A14 = -2.23617364009159182821e-05L
+data8 0x810740E1DF572394, 0x00003FEE //A15 = +7.69068800065192940487e-06L
+// Pol3
+data8 0x9526D1C87655AFA8, 0x00003FEC //A0 = +2.22253260814242012255e-06L
+data8 0xA47E21EBFE73F72F, 0x0000BFF8 //A1 = -1.00398379581527733314e-02L
+data8 0xDE65685FCDF7A913, 0x0000BFFA //A2 = -5.42959286802879105148e-02L
+data8 0xED289CB8F97D4860, 0x00003FFA //A3 = +5.79000589346770417248e-02L
+data8 0xAA3100D5A7D870F1, 0x0000BFFA //A4 = -4.15506394006027604387e-02L
+data8 0xCA0567032C5308C0, 0x00003FF9 //A5 = +2.46607791863290331169e-02L
+data8 0xD3E1794A50F31BEB, 0x0000BFF8 //A6 = -1.29321751094401754013e-02L
+data8 0xCAA02CB4C87CC1F0, 0x00003FF7 //A7 = +6.18364508551740736863e-03L
+data8 0xB3F126AF16B121F2, 0x0000BFF6 //A8 = -2.74569696838501870748e-03L
+data8 0x962B2D64D3900510, 0x00003FF5 //A9 = +1.14569596409019883022e-03L
+data8 0xED8785714A9A00FB, 0x0000BFF3 //A10 = -4.53051338046340380512e-04L
+data8 0xB325DA4515D8B54C, 0x00003FF2 //A11 = +1.70848714622328427290e-04L
+data8 0x8179C36354571747, 0x0000BFF1 //A12 = -6.17387951061077132522e-05L
+data8 0xB40F241C01C907E9, 0x00003FEF //A13 = +2.14647227210702861416e-05L
+data8 0xF436D84AD7D4D316, 0x0000BFED //A14 = -7.27815144835213913238e-06L
+data8 0x9EB432503FB0B7BC, 0x00003FEC //A15 = +2.36487228755136968792e-06L
+// Pol4
+data8 0xE0BA539E4AFC4741, 0x00003FED //A0 = +6.69741148991838024429e-06L
+data8 0x8583BF71139452CF, 0x0000BFFA //A1 = -3.25963476363756051657e-02L
+data8 0x8384FEF6D08AD6CE, 0x0000BFF9 //A2 = -1.60546283500634200479e-02L
+data8 0xB1E67DFB84C97036, 0x00003FF9 //A3 = +2.17163525195697635702e-02L
+data8 0xFB6ACEE6899E360D, 0x0000BFF8 //A4 = -1.53452892792759316229e-02L
+data8 0x8D2B869EB9149905, 0x00003FF8 //A5 = +8.61633440480716870830e-03L
+data8 0x8A90BFE0FD869A41, 0x0000BFF7 //A6 = -4.22868126950622376530e-03L
+data8 0xF7536A76E59F54D2, 0x00003FF5 //A7 = +1.88694643606912107006e-03L
+data8 0xCCF6FE58C16E1CC7, 0x0000BFF4 //A8 = -7.81878732767742447339e-04L
+data8 0x9FCC6ED9914FAA24, 0x00003FF3 //A9 = +3.04791577214885118730e-04L
+data8 0xEC7F5AAACAE593E8, 0x0000BFF1 //A10 = -1.12770784960291779798e-04L
+data8 0xA72CE628A114C940, 0x00003FF0 //A11 = +3.98577182157456408782e-05L
+data8 0xE2DCC5750FD769BA, 0x0000BFEE //A12 = -1.35220520471857266339e-05L
+data8 0x9459160B1E6F1F8D, 0x00003FED //A13 = +4.42111470121432700283e-06L
+data8 0xBE0A05701BD0DD42, 0x0000BFEB //A14 = -1.41590196994052764542e-06L
+data8 0xE905D729105081BF, 0x00003FE9 //A15 = +4.34038814785401120999e-07L
+// Pol5
+data8 0xA33649C3AB459832, 0x00003FEE //A0 = +9.72819704141525206634e-06L
+data8 0x9E4EA2F44C9A24BD, 0x0000BFFA //A1 = -3.86492123987296806210e-02L
+data8 0xE80C0B1280F357BF, 0x0000BFF2 //A2 = -2.21297306012713370124e-04L
+data8 0xDAECCE90A4D45D9A, 0x00003FF7 //A3 = +6.68106161291482829670e-03L
+data8 0xA4006572071BDD4B, 0x0000BFF7 //A4 = -5.00493005170532147076e-03L
+data8 0xB07FD7EB1F4D8E8E, 0x00003FF6 //A5 = +2.69316693731732554959e-03L
+data8 0xA1F471D42ADD73A1, 0x0000BFF5 //A6 = -1.23561753760779610478e-03L
+data8 0x8611D0ED1B4C8176, 0x00003FF4 //A7 = +5.11434914439322741260e-04L
+data8 0xCDADB789B487A541, 0x0000BFF2 //A8 = -1.96150380913036018825e-04L
+data8 0x9470252731687FEE, 0x00003FF1 //A9 = +7.07807859951401721129e-05L
+data8 0xCB9399AD1C376D85, 0x0000BFEF //A10 = -2.42682175234436724152e-05L
+data8 0x858D815F9CA0A9F7, 0x00003FEE //A11 = +7.96036454038012144300e-06L
+data8 0xA878D338E6E6A079, 0x0000BFEC //A12 = -2.51042802626063073967e-06L
+data8 0xCD2C2F079D2FCB36, 0x00003FEA //A13 = +7.64327468786076941271e-07L
+data8 0xF5EF4A4B2EA426F2, 0x0000BFE8 //A14 = -2.29044563492386125272e-07L
+data8 0x8CE52181393820FC, 0x00003FE7 //A15 = +6.56093668622712763489e-08L
+// Pol6
+data8 0xB2015D7F1864B7CF, 0x00003FEC //A0 = +2.65248615880090351276e-06L
+data8 0x954EA7A861B4462A, 0x0000BFFA //A1 = -3.64519642954351295215e-02L
+data8 0x9E46F2A4D9157E69, 0x00003FF7 //A2 = +4.83023498390681965101e-03L
+data8 0xA0D12B422FFD5BAD, 0x00003FF5 //A3 = +1.22693684633643883352e-03L
+data8 0xB291D16A560A740E, 0x0000BFF5 //A4 = -1.36237794246703606647e-03L
+data8 0xC138941BC8AF4A9D, 0x00003FF4 //A5 = +7.37079658343628747256e-04L
+data8 0xA761669D61B405CF, 0x0000BFF3 //A6 = -3.19252914480518163396e-04L
+data8 0x8053680F1C84607E, 0x00003FF2 //A7 = +1.22381025852939439541e-04L
+data8 0xB518F4B6F25015F9, 0x0000BFF0 //A8 = -4.31770048258291369742e-05L
+data8 0xEFF526AC70B9411E, 0x00003FEE //A9 = +1.43025887824433324525e-05L
+data8 0x970B2A848DF5B5C2, 0x0000BFED //A10 = -4.50145058393497252604e-06L
+data8 0xB614D2E61DB86963, 0x00003FEB //A11 = +1.35661172167726780059e-06L
+data8 0xD34EA4D283EC33FA, 0x0000BFE9 //A12 = -3.93590335713880681528e-07L
+data8 0xED209EBD68E1145F, 0x00003FE7 //A13 = +1.10421060667544991323e-07L
+data8 0x83A126E22A17568D, 0x0000BFE6 //A14 = -3.06473811074239684132e-08L
+data8 0x8B778496EDE9F415, 0x00003FE4 //A15 = +8.11804009754249175736e-09L
+// Pol7
+data8 0x8E152F522501B7B9, 0x00003FEE //A0 = +8.46879203970927626532e-06L
+data8 0xFD22F92EE21F491E, 0x0000BFF9 //A1 = -3.09004656656418947425e-02L
+data8 0xAF0C41847D89EC14, 0x00003FF7 //A2 = +5.34203719233189217519e-03L
+data8 0xB7C539C400445956, 0x0000BFF3 //A3 = -3.50514245383356287965e-04L
+data8 0x8428C78B2B1E3622, 0x0000BFF3 //A4 = -2.52073850239006530978e-04L
+data8 0xAFC0CCC7D1A05F5B, 0x00003FF2 //A5 = +1.67611241057491801028e-04L
+data8 0x95DC7272C5695A5A, 0x0000BFF1 //A6 = -7.14593512262564106636e-05L
+data8 0xD6FCA68A61F0E835, 0x00003FEF //A7 = +2.56284375437771117850e-05L
+data8 0x8B71C74DEA936C66, 0x0000BFEE //A8 = -8.31153675277218441096e-06L
+data8 0xA8AC71E2A56AA2C9, 0x00003FEC //A9 = +2.51343269277107451413e-06L
+data8 0xC15DED6C44B46046, 0x0000BFEA //A10 = -7.20347851650066610771e-07L
+data8 0xD42BA1DFBD1277AC, 0x00003FE8 //A11 = +1.97599119274780745741e-07L
+data8 0xE03A81F2C976D11A, 0x0000BFE6 //A12 = -5.22072765405802337371e-08L
+data8 0xE56A19A67DD66100, 0x00003FE4 //A13 = +1.33536787408751203998e-08L
+data8 0xE964D255CB31DFFA, 0x0000BFE2 //A14 = -3.39632729387679010008e-09L
+data8 0xE22E62E932B704D4, 0x00003FE0 //A15 = +8.22842400379225526299e-10L
+// Pol8
+data8 0xB8B835882D46A6C8, 0x00003FEF //A0 = +2.20202883282415435401e-05L
+data8 0xC9D1F63F89B74E90, 0x0000BFF9 //A1 = -2.46362504515706189782e-02L
+data8 0x8E376748B1274F30, 0x00003FF7 //A2 = +4.34010070001387441657e-03L
+data8 0x98174C7EA49B5B37, 0x0000BFF4 //A3 = -5.80181163659971286762e-04L
+data8 0x8D2C40506AE9FF97, 0x00003FEF //A4 = +1.68291159100251734927e-05L
+data8 0xD9A580C115B9D150, 0x00003FEF //A5 = +2.59454841475194555896e-05L
+data8 0xDB35B21F1C3F99CE, 0x0000BFEE //A6 = -1.30659192305072674545e-05L
+data8 0x99FAADAE17A3050E, 0x00003FED //A7 = +4.58893813631592314881e-06L
+data8 0xBA1D259BCD6987A9, 0x0000BFEB //A8 = -1.38665627771423394637e-06L
+data8 0xCDD7FF5BEA0145C2, 0x00003FE9 //A9 = +3.83413844219813384124e-07L
+data8 0xD60857176CE6AB9D, 0x0000BFE7 //A10 = -9.96666862214499946343e-08L
+data8 0xD446A2402112DF4C, 0x00003FE5 //A11 = +2.47121687566658908126e-08L
+data8 0xCA87133235F1F495, 0x0000BFE3 //A12 = -5.89433000014933371980e-09L
+data8 0xBB15B0021581C8B6, 0x00003FE1 //A13 = +1.36122047057936849125e-09L
+data8 0xAC9D6585D4AF505E, 0x0000BFDF //A14 = -3.13984547328132268695e-10L
+data8 0x975A1439C3795183, 0x00003FDD //A15 = +6.88268624429648826457e-11L
+// Pol9
+data8 0x99A7676284CDC9FE, 0x00003FEF //A0 = +1.83169747921764176475e-05L
+data8 0x9AD0AE249A02896C, 0x0000BFF9 //A1 = -1.88983346204739151909e-02L
+data8 0xCB89B4AEC19898BE, 0x00003FF6 //A2 = +3.10574208447745576452e-03L
+data8 0xEBBC47E30E1AC2C2, 0x0000BFF3 //A3 = -4.49629730048297442064e-04L
+data8 0xD1E35B7FCE1CF859, 0x00003FF0 //A4 = +5.00412261289558493438e-05L
+data8 0xB40743664EF24552, 0x0000BFEB //A5 = -1.34131589671166307319e-06L
+data8 0xCAD2F5C596FFE1B4, 0x0000BFEB //A6 = -1.51115702599728593837e-06L
+data8 0xAE42B6D069DFDDF2, 0x00003FEA //A7 = +6.49171330116787223873e-07L
+data8 0xD0739A05BB43A714, 0x0000BFE8 //A8 = -1.94135651872623440782e-07L
+data8 0xD745B854AB601BD7, 0x00003FE6 //A9 = +5.01219983943456578062e-08L
+data8 0xCC4066E13E338B13, 0x0000BFE4 //A10 = -1.18890061172430768892e-08L
+data8 0xB6EAADB55A6C3CB4, 0x00003FE2 //A11 = +2.66178850259168707794e-09L
+data8 0x9CC6C178AD3F96AD, 0x0000BFE0 //A12 = -5.70349182959704086428e-10L
+data8 0x81D0E2AA27DEB74A, 0x00003FDE //A13 = +1.18066926578104076645e-10L
+data8 0xD75FB9049190BEFD, 0x0000BFDB //A14 = -2.44851795398843967972e-11L
+data8 0xA9384A51D48C8703, 0x00003FD9 //A15 = +4.80951837368635202609e-12L
+// Pol10
+data8 0xD2B3482EE449C535, 0x00003FEE //A0 = +1.25587177382575655080e-05L
+data8 0xE7939B2D0607DFCF, 0x0000BFF8 //A1 = -1.41343131436717436429e-02L
+data8 0x8810EB4AC5F0F1CE, 0x00003FF6 //A2 = +2.07620377002350121270e-03L
+data8 0x9546589602AEB955, 0x0000BFF3 //A3 = -2.84719065122144294949e-04L
+data8 0x9333434342229798, 0x00003FF0 //A4 = +3.50952732796136549298e-05L
+data8 0xEB36A98FD81D3DEB, 0x0000BFEC //A5 = -3.50495464815398722482e-06L
+data8 0xAC370EFA025D0477, 0x00003FE8 //A6 = +1.60387784498518639254e-07L
+data8 0xC8DF7F8ACA099426, 0x00003FE6 //A7 = +4.67693991699936842330e-08L
+data8 0xAC694AD4921C02CF, 0x0000BFE5 //A8 = -2.00713167514877937714e-08L
+data8 0xB6E29F2FDE2D8C1A, 0x00003FE3 //A9 = +5.32266106167252495164e-09L
+data8 0xA41F8EEA75474358, 0x0000BFE1 //A10 = -1.19415398856537468324e-09L
+data8 0x869D778A1C56D3D6, 0x00003FDF //A11 = +2.44863450057778470469e-10L
+data8 0xD02658BF31411F4C, 0x0000BFDC //A12 = -4.73277831746128372261e-11L
+data8 0x9A4A95EE59127779, 0x00003FDA //A13 = +8.77044784978207256260e-12L
+data8 0xE518330AF013C2F6, 0x0000BFD7 //A14 = -1.62781453276882333209e-12L
+data8 0xA036A9DF71BD108A, 0x00003FD5 //A15 = +2.84596398987114375607e-13L
+// Pol11
+data8 0x9191CFBF001F3BB3, 0x00003FEE //A0 = +8.67662287973472452343e-06L
+data8 0xAA47E0CF01AE9730, 0x0000BFF8 //A1 = -1.03931136509584404513e-02L
+data8 0xAEABE7F17B01D18F, 0x00003FF5 //A2 = +1.33263784731775399430e-03L
+data8 0xAC0D6A309D04E5DB, 0x0000BFF2 //A3 = -1.64081956462118568288e-04L
+data8 0xA08357DF458054D0, 0x00003FEF //A4 = +1.91346477952797715021e-05L
+data8 0x8A1596B557440FE0, 0x0000BFEC //A5 = -2.05761687274453412571e-06L
+data8 0xCDA0EAE0A5615E9A, 0x00003FE8 //A6 = +1.91506542215670149741e-07L
+data8 0xD36A08FB4E104F9A, 0x0000BFE4 //A7 = -1.23059260396551086769e-08L
+data8 0xD7433F91E78A7A11, 0x0000BFDF //A8 = -3.91560549815575091188e-10L
+data8 0xC2F5308FD4F5CE62, 0x00003FDF //A9 = +3.54626121852421163117e-10L
+data8 0xC83876915F49D630, 0x0000BFDD //A10 = -9.10497688901018285126e-11L
+data8 0xA11C605DEAE1FE9C, 0x00003FDB //A11 = +1.83161825409194847892e-11L
+data8 0xE7977BC1342D19BF, 0x0000BFD8 //A12 = -3.29111645807102123274e-12L
+data8 0x9BC3A7D6396C6756, 0x00003FD6 //A13 = +5.53385887288503961220e-13L
+data8 0xD0110D5683740B8C, 0x0000BFD3 //A14 = -9.24001363293241428519e-14L
+data8 0x81786D7856A5CC92, 0x00003FD1 //A15 = +1.43741041714595023996e-14L
+// Pol12
+data8 0xB85654F6033B3372, 0x00003FEF //A0 = +2.19747106911869287049e-05L
+data8 0xF78B40078736B406, 0x0000BFF7 //A1 = -7.55444170413862312647e-03L
+data8 0xDA8FDE84D88E5D5D, 0x00003FF4 //A2 = +8.33747822263358628569e-04L
+data8 0xBC2D3F3891721AA9, 0x0000BFF1 //A3 = -8.97296647669960333635e-05L
+data8 0x9D15ACFD3BF50064, 0x00003FEE //A4 = +9.36297600601039610762e-06L
+data8 0xFBED3D03F3C1B671, 0x0000BFEA //A5 = -9.38500137149172923985e-07L
+data8 0xBEE615E3B2FA16C8, 0x00003FE7 //A6 = +8.88941676851808958175e-08L
+data8 0x843D32692CF5662A, 0x0000BFE4 //A7 = -7.69732580860195238520e-09L
+data8 0x99E74472FD94E22B, 0x00003FE0 //A8 = +5.59897264617128952416e-10L
+data8 0xCEF63DABF4C32E15, 0x0000BFDB //A9 = -2.35288414996279313219e-11L
+data8 0xA2D86C25C0991123, 0x0000BFD8 //A10 = -2.31417232327307408235e-12L
+data8 0xF50C1B31D2E922BD, 0x00003FD6 //A11 = +8.70582858983364191159e-13L
+data8 0xC0F093DEC2B019A1, 0x0000BFD4 //A12 = -1.71364927865227509533e-13L
+data8 0xFC1441C4CD105981, 0x00003FD1 //A13 = +2.79864052545369490865e-14L
+data8 0x9CC959853267F026, 0x0000BFCF //A14 = -4.35170017302700609509e-15L
+data8 0xB06BA14016154F1E, 0x00003FCC //A15 = +6.12081320471295704631e-16L
+// Pol13
+data8 0xA59E74BF544F2422, 0x00003FEF //A0 = +1.97433196215210145261e-05L
+data8 0xB2814F4EDAE15330, 0x0000BFF7 //A1 = -5.44754383528015875700e-03L
+data8 0x867C249D378F0A23, 0x00003FF4 //A2 = +5.13019308804593120161e-04L
+data8 0xC76644393388AB68, 0x0000BFF0 //A3 = -4.75405403392600215101e-05L
+data8 0x91143AD5CCA229FE, 0x00003FED //A4 = +4.32369180778264703719e-06L
+data8 0xCE6A11FB6840A974, 0x0000BFE9 //A5 = -3.84476663329551178495e-07L
+data8 0x8EC29F66C59DE243, 0x00003FE6 //A6 = +3.32389596787155456596e-08L
+data8 0xBE3FCDDCA94CA24E, 0x0000BFE2 //A7 = -2.76849073931513325199e-09L
+data8 0xF06A84BDC70A0B0D, 0x00003FDE //A8 = +2.18657158231304988330e-10L
+data8 0x8B8E6969D056D124, 0x0000BFDB //A9 = -1.58657139740906811035e-11L
+data8 0x8984985AA29A0567, 0x00003FD7 //A10 = +9.77123802231106533829e-13L
+data8 0xA53ABA084300137C, 0x0000BFD2 //A11 = -3.66882970952892030306e-14L
+data8 0xA90EC851E91C3319, 0x0000BFCE //A12 = -2.34614750044359490986e-15L
+data8 0xEC9CAF64237B5060, 0x00003FCC //A13 = +8.20912960028437475035e-16L
+data8 0xA9156668FCF01479, 0x0000BFCA //A14 = -1.46656639874123613261e-16L
+data8 0xBAEF58D8118DD5D4, 0x00003FC7 //A15 = +2.02675278255254907493e-17L
+// Pol14
+data8 0xC698952E9CEAA800, 0x00003FEF //A0 = +2.36744912073515619263e-05L
+data8 0x800395F8C7B4FA00, 0x0000BFF7 //A1 = -3.90667746392883642897e-03L
+data8 0xA3B2467B6B391831, 0x00003FF3 //A2 = +3.12226081793919541155e-04L
+data8 0xCF2061122A69D72B, 0x0000BFEF //A3 = -2.46914006692526122176e-05L
+data8 0x817FAB6B5DEB9924, 0x00003FEC //A4 = +1.92968114320180123521e-06L
+data8 0x9FC190F5827740E7, 0x0000BFE8 //A5 = -1.48784479265231093475e-07L
+data8 0xC1FE5C1835C8AFCD, 0x00003FE4 //A6 = +1.12919132662720380018e-08L
+data8 0xE7216A9FBB204DA3, 0x0000BFE0 //A7 = -8.40847981461949000003e-10L
+data8 0x867566ED95C5C64F, 0x00003FDD //A8 = +6.11446929759298780795e-11L
+data8 0x97A8BFA723F0F014, 0x0000BFD9 //A9 = -4.31041298699752869577e-12L
+data8 0xA3D24B7034984522, 0x00003FD5 //A10 = +2.91005377301348717042e-13L
+data8 0xA5AAA371C22F3741, 0x0000BFD1 //A11 = -1.83926825395757259128e-14L
+data8 0x95352E5597EACC23, 0x00003FCD //A12 = +1.03533666540077850452e-15L
+data8 0xCCEBE3043B689428, 0x0000BFC8 //A13 = -4.44352525147076912166e-17L
+data8 0xA779DAB4BE1F80BB, 0x0000BFBC //A14 = -8.86610526981738255206e-21L
+data8 0xB171271F3517282C, 0x00003FC1 //A15 = +3.00598445879282370850e-19L
+// Pol15
+data8 0xB7AC727D1C3FEB05, 0x00003FEE //A0 = +1.09478009914822049780e-05L
+data8 0xB6E6274485C10B0A, 0x0000BFF6 //A1 = -2.79081782038927199588e-03L
+data8 0xC5CAE2122D009506, 0x00003FF2 //A2 = +1.88629638738336219173e-04L
+data8 0xD466E7957D0A3362, 0x0000BFEE //A3 = -1.26601440424012313479e-05L
+data8 0xE2593D798DA20E2E, 0x00003FEA //A4 = +8.43214222346512003230e-07L
+data8 0xEF2D2BBA7D2882CC, 0x0000BFE6 //A5 = -5.56876064495961858535e-08L
+data8 0xFA5819BB4AE974C2, 0x00003FE2 //A6 = +3.64298674151704370449e-09L
+data8 0x819BB0CE825FBB28, 0x0000BFDF //A7 = -2.35755881668932259913e-10L
+data8 0x84871099BF728B8F, 0x00003FDB //A8 = +1.50666434199945890414e-11L
+data8 0x858188962DFEBC9F, 0x0000BFD7 //A9 = -9.48617116568458677088e-13L
+data8 0x840F38FF2FBAE753, 0x00003FD3 //A10 = +5.86461827778372616657e-14L
+data8 0xFF47EAF69577B213, 0x0000BFCE //A11 = -3.54273456410181081472e-15L
+data8 0xEF402CCB4D29FAF8, 0x00003FCA //A12 = +2.07516888659313950588e-16L
+data8 0xD6B789E01141231B, 0x0000BFC6 //A13 = -1.16398290506765191078e-17L
+data8 0xB5EEE343E9CFE3EC, 0x00003FC2 //A14 = +6.16413506924643419723e-19L
+data8 0x859B41A39D600346, 0x0000BFBE //A15 = -2.82922705825870414438e-20L
+// Pol16
+data8 0x85708B69FD184E11, 0x00003FED //A0 = +3.97681079176353356199e-06L
+data8 0x824D92BC60A1F70A, 0x0000BFF6 //A1 = -1.98826630037499070532e-03L
+data8 0xEDCF7D3576BB5258, 0x00003FF1 //A2 = +1.13396885054265675352e-04L
+data8 0xD7FC59226A947CDF, 0x0000BFED //A3 = -6.43687650810478871875e-06L
+data8 0xC32C51B574E2651E, 0x00003FE9 //A4 = +3.63538268539251809118e-07L
+data8 0xAF67910F5681401F, 0x0000BFE5 //A5 = -2.04197779750247395258e-08L
+data8 0x9CB3E8D7DCD1EA9D, 0x00003FE1 //A6 = +1.14016272459029850306e-09L
+data8 0x8B14ECFBF7D4F114, 0x0000BFDD //A7 = -6.32470533185766848692e-11L
+data8 0xF518253AE4A3AE72, 0x00003FD8 //A8 = +3.48299974583453268369e-12L
+data8 0xD631A5699AA2F334, 0x0000BFD4 //A9 = -1.90242426474085078079e-13L
+data8 0xB971AD4C30C56E5D, 0x00003FD0 //A10 = +1.02942127356740047925e-14L
+data8 0x9ED0065A601F3160, 0x0000BFCC //A11 = -5.50991880383698965959e-16L
+data8 0x863A04008E12867C, 0x00003FC8 //A12 = +2.91057593756148904838e-17L
+data8 0xDF62F9F44F5C7170, 0x0000BFC3 //A13 = -1.51372666097522872780e-18L
+data8 0xBA4E118E88CFDD31, 0x00003FBF //A14 = +7.89032177282079635722e-20L
+data8 0x942AD897FC4D2F2A, 0x0000BFBB //A15 = -3.92195756076319409245e-21L
+// Pol17
+data8 0xCB8514540566C717, 0x00003FEF //A0 = +2.42614557068144130848e-05L
+data8 0xB94F08D6816E0CD4, 0x0000BFF5 //A1 = -1.41379340061829929314e-03L
+data8 0x8E7C342C2DABB51B, 0x00003FF1 //A2 = +6.79422240687700109911e-05L
+data8 0xDA69DAFF71E30D5B, 0x0000BFEC //A3 = -3.25461473899657142468e-06L
+data8 0xA6D5B2DB69B4B3F6, 0x00003FE8 //A4 = +1.55376978584082701045e-07L
+data8 0xFDF4F76BC1D1BD47, 0x0000BFE3 //A5 = -7.39111857092131684572e-09L
+data8 0xC08BC52C95B12C2D, 0x00003FDF //A6 = +3.50239092565793882444e-10L
+data8 0x91624BF6D3A3F6C9, 0x0000BFDB //A7 = -1.65282439890232458821e-11L
+data8 0xDA91F7A450DE4270, 0x00003FD6 //A8 = +7.76517285902715940501e-13L
+data8 0xA380ADF55416E624, 0x0000BFD2 //A9 = -3.63048822989374426852e-14L
+data8 0xF350FC0CEDEE0FD6, 0x00003FCD //A10 = +1.68834630987974622269e-15L
+data8 0xB3FA19FBDC8F023C, 0x0000BFC9 //A11 = -7.80525639701804380489e-17L
+data8 0x8435328C80940126, 0x00003FC5 //A12 = +3.58349966898667910204e-18L
+data8 0xC0D22F655BA5EF39, 0x0000BFC0 //A13 = -1.63325770165403860181e-19L
+data8 0x8F14B9EBD5A9AB25, 0x00003FBC //A14 = +7.57464305512080733773e-21L
+data8 0xCD4804BBF6DC1B6F, 0x0000BFB7 //A15 = -3.39609459750208886298e-22L
+// Pol18
+data8 0xE251DFE45AB0C22E, 0x00003FEE //A0 = +1.34897126299700418200e-05L
+data8 0x83943CC7D59D4215, 0x0000BFF5 //A1 = -1.00386850310061655307e-03L
+data8 0xAA57896951134BCA, 0x00003FF0 //A2 = +4.06126834109940757047e-05L
+data8 0xDC0A67051E1C4A2C, 0x0000BFEB //A3 = -1.63943048164477430317e-06L
+data8 0x8DCB3C0A8CD07BBE, 0x00003FE7 //A4 = +6.60279229777753829876e-08L
+data8 0xB64DE81C24F7F265, 0x0000BFE2 //A5 = -2.65287705357477481067e-09L
+data8 0xE9CBB7A990DBA8B5, 0x00003FDD //A6 = +1.06318007608620426224e-10L
+data8 0x9583D4B85C2ADC6F, 0x0000BFD9 //A7 = -4.24947087941505088222e-12L
+data8 0xBEB0EE8114EEDF77, 0x00003FD4 //A8 = +1.69367754741562774916e-13L
+data8 0xF2791BB8F06BDA93, 0x0000BFCF //A9 = -6.72997988617021128704e-15L
+data8 0x99A907F6A92195B4, 0x00003FCB //A10 = +2.66558091161711891239e-16L
+data8 0xC213E5E6F833BB93, 0x0000BFC6 //A11 = -1.05209746502719578617e-17L
+data8 0xF41FBBA6B343960F, 0x00003FC1 //A12 = +4.13562069721140021224e-19L
+data8 0x98F194AEE31D188D, 0x0000BFBD //A13 = -1.61935414722333263347e-20L
+data8 0xC42F5029BB622157, 0x00003FB8 //A14 = +6.49121108201931196678e-22L
+data8 0xF43BD08079E50E0F, 0x0000BFB3 //A15 = -2.52531675510242468317e-23L
+// Pol19
+data8 0x82557B149A04D08E, 0x00003FEF //A0 = +1.55370127331027842820e-05L
+data8 0xBAAB433307CE614B, 0x0000BFF4 //A1 = -7.12085701486669872724e-04L
+data8 0xCB52D9DBAC16FE82, 0x00003FEF //A2 = +2.42380662859334411743e-05L
+data8 0xDD214359DBBCE7D1, 0x0000BFEA //A3 = -8.23773197624244883859e-07L
+data8 0xF01E8E968139524C, 0x00003FE5 //A4 = +2.79535729459988509676e-08L
+data8 0x82286A057E0916CE, 0x0000BFE1 //A5 = -9.47023128967039348510e-10L
+data8 0x8CDDDC4E8D013365, 0x00003FDC //A6 = +3.20293663356974901319e-11L
+data8 0x982FEEE90D4E8751, 0x0000BFD7 //A7 = -1.08135537312234452657e-12L
+data8 0xA41D1E84083B8FD6, 0x00003FD2 //A8 = +3.64405720894915411836e-14L
+data8 0xB0A1B6111B72E159, 0x0000BFCD //A9 = -1.22562851790685744085e-15L
+data8 0xBDB77DE6B650FFA2, 0x00003FC8 //A10 = +4.11382657214908334175e-17L
+data8 0xCB54E95CDB66978A, 0x0000BFC3 //A11 = -1.37782909696752432371e-18L
+data8 0xD959E428A62B1B6C, 0x00003FBE //A12 = +4.60258936838597812582e-20L
+data8 0xE7D49EC23F1A16A0, 0x0000BFB9 //A13 = -1.53412587409583783059e-21L
+data8 0xFDE429BC9947B2BE, 0x00003FB4 //A14 = +5.25034823750902928092e-23L
+data8 0x872137A062C042EF, 0x0000BFB0 //A15 = -1.74651114923000080365e-24L
+// Pol20
+data8 0x8B9B185C6A2659AC, 0x00003FEF //A0 = +1.66423130594825442963e-05L
+data8 0x84503AD52588A1E8, 0x0000BFF4 //A1 = -5.04735556466270303549e-04L
+data8 0xF26C7C2B566388E1, 0x00003FEE //A2 = +1.44495826764677427386e-05L
+data8 0xDDDA15FEE262BB47, 0x0000BFE9 //A3 = -4.13231361893675488873e-07L
+data8 0xCACEBC73C90C2FE0, 0x00003FE4 //A4 = +1.18049538609157282958e-08L
+data8 0xB9314D00022B41DD, 0x0000BFDF //A5 = -3.36863342776746896664e-10L
+data8 0xA8E9FBDC714638B9, 0x00003FDA //A6 = +9.60164921624768038366e-12L
+data8 0x99E246C0CC8CA6F6, 0x0000BFD5 //A7 = -2.73352704217713596798e-13L
+data8 0x8C04E7B5DF372EA1, 0x00003FD0 //A8 = +7.77262480048865685174e-15L
+data8 0xFE7B90CAA0B6D5F7, 0x0000BFCA //A9 = -2.20728537958846147109e-16L
+data8 0xE6F40BAD4EC6CB4F, 0x00003FC5 //A10 = +6.26000182616999972048e-18L
+data8 0xD14F4E0538F0F992, 0x0000BFC0 //A11 = -1.77292283439752259258e-19L
+data8 0xBD5A7FAA548CC749, 0x00003FBB //A12 = +5.01214569023722089225e-21L
+data8 0xAB15D69425373A67, 0x0000BFB6 //A13 = -1.41518447770061562822e-22L
+data8 0x9EF95456F75B4DF4, 0x00003FB1 //A14 = +4.10938011540250142351e-24L
+data8 0x8FADCC45E81433E7, 0x0000BFAC //A15 = -1.16062889679749879834e-25L
+// Pol21
+data8 0xB47A917B0F7B50AE, 0x00003FEF //A0 = +2.15147474240529518138e-05L
+data8 0xBB77DC3BA0C937B3, 0x0000BFF3 //A1 = -3.57567223048598672970e-04L
+data8 0x90694DFF4EBF7370, 0x00003FEE //A2 = +8.60758700336677694536e-06L
+data8 0xDE5379AA90A98F3F, 0x0000BFE8 //A3 = -2.07057292787309736495e-07L
+data8 0xAB0322293F1F9CA0, 0x00003FE3 //A4 = +4.97711123919916694625e-09L
+data8 0x837119E59D3B7AC2, 0x0000BFDE //A5 = -1.19545621970063369582e-10L
+data8 0xC9E5B74A38ECF3FC, 0x00003FD8 //A6 = +2.86913359605586285967e-12L
+data8 0x9AEF5110C6885352, 0x0000BFD3 //A7 = -6.88048865490621757799e-14L
+data8 0xED988D52189CE6A3, 0x00003FCD //A8 = +1.64865278639132278935e-15L
+data8 0xB6063CECD8012B6D, 0x0000BFC8 //A9 = -3.94702428606368525374e-17L
+data8 0x8B541EB15E79CEEC, 0x00003FC3 //A10 = +9.44127272399408815784e-19L
+data8 0xD51A136D8C75BC25, 0x0000BFBD //A11 = -2.25630369561137931232e-20L
+data8 0xA2C1C5E19CC79E6F, 0x00003FB8 //A12 = +5.38517493921589837361e-22L
+data8 0xF86F9772306F56C1, 0x0000BFB2 //A13 = -1.28438352359240135735e-23L
+data8 0xC32F6FEEDE86528E, 0x00003FAD //A14 = +3.15338862172962186458e-25L
+data8 0x9534ED189744D7D4, 0x0000BFA8 //A15 = -7.53301543611470014315e-27L
+// Pol22
+data8 0xCBA0A2DB94A2C494, 0x00003FEF //A0 = +2.42742878212752702946e-05L
+data8 0x84C089154A49E0E8, 0x0000BFF3 //A1 = -2.53204520651046300034e-04L
+data8 0xABF5665BD0D8B0CD, 0x00003FED //A2 = +5.12476542947092361490e-06L
+data8 0xDEA1C518E3EEE872, 0x0000BFE7 //A3 = -1.03671063536324831083e-07L
+data8 0x900B77F271559AE8, 0x00003FE2 //A4 = +2.09612770408581408652e-09L
+data8 0xBA4C74A262BE3E4E, 0x0000BFDC //A5 = -4.23594098489216166935e-11L
+data8 0xF0D1680FCC1EAF97, 0x00003FD6 //A6 = +8.55557381760467917779e-13L
+data8 0x9B8F8E033BB83A24, 0x0000BFD1 //A7 = -1.72707138247091685914e-14L
+data8 0xC8DCA6A691DB8335, 0x00003FCB //A8 = +3.48439884388851942939e-16L
+data8 0x819A6CB9CEA5E9BD, 0x0000BFC6 //A9 = -7.02580471688245511753e-18L
+data8 0xA726B4F622585BEA, 0x00003FC0 //A10 = +1.41582572516648501043e-19L
+data8 0xD7727648A4095986, 0x0000BFBA //A11 = -2.85141885626054217632e-21L
+data8 0x8AB627E09CF45997, 0x00003FB5 //A12 = +5.73697507862703019314e-23L
+data8 0xB28C15C117CC604F, 0x0000BFAF //A13 = -1.15383428132352407085e-24L
+data8 0xECB8428626DA072C, 0x00003FA9 //A14 = +2.39025879246942839796e-26L
+data8 0x98B731BCFA2CE2B2, 0x0000BFA4 //A15 = -4.81885474332093262902e-28L
+// Pol23
+data8 0xC6D013811314D31B, 0x00003FED //A0 = +5.92508308918577687876e-06L
+data8 0xBBF3057B8DBACBCF, 0x0000BFF2 //A1 = -1.79242422493281965934e-04L
+data8 0xCCADECA501162313, 0x00003FEC //A2 = +3.04996061562356504918e-06L
+data8 0xDED1FDBE8CCAF3DB, 0x0000BFE6 //A3 = -5.18793887648024117154e-08L
+data8 0xF27B74EDDCA65859, 0x00003FE0 //A4 = +8.82145297317787820675e-10L
+data8 0x83E4415687F01A0C, 0x0000BFDB //A5 = -1.49943414247603665601e-11L
+data8 0x8F6CB350861CE446, 0x00003FD5 //A6 = +2.54773288906376920377e-13L
+data8 0x9BE8456A30CBFC02, 0x0000BFCF //A7 = -4.32729710913845745148e-15L
+data8 0xA9694F7E1033977D, 0x00003FC9 //A8 = +7.34704698157502347441e-17L
+data8 0xB8035A3D5AF82D85, 0x0000BFC3 //A9 = -1.24692123826025468001e-18L
+data8 0xC7CB4B3ACB905FDA, 0x00003FBD //A10 = +2.11540249352095943317e-20L
+data8 0xD8D70AEB2E58D729, 0x0000BFB7 //A11 = -3.58731705184186608576e-22L
+data8 0xEB27A61B1D5C7697, 0x00003FB1 //A12 = +6.07861113430709162243e-24L
+data8 0xFEF9ED74D4F4C9B0, 0x0000BFAB //A13 = -1.02984099170876754831e-25L
+data8 0x8E6F410068C12043, 0x00003FA6 //A14 = +1.79777721804459361762e-27L
+data8 0x9AE2F6705481630E, 0x0000BFA0 //A15 = -3.05459905177379058768e-29L
+// Pol24
+data8 0xD2D858D5B01C9434, 0x00003FEE //A0 = +1.25673476165670766128e-05L
+data8 0x8505330F8B4FDE49, 0x0000BFF2 //A1 = -1.26858053564784963985e-04L
+data8 0xF39171C8B1D418C2, 0x00003FEB //A2 = +1.81472407620770441249e-06L
+data8 0xDEF065C3D7BFD26E, 0x0000BFE5 //A3 = -2.59535215807652675043e-08L
+data8 0xCC0199EA6ACA630C, 0x00003FDF //A4 = +3.71085215769339916703e-10L
+data8 0xBAA25319F01ED248, 0x0000BFD9 //A5 = -5.30445960650683029105e-12L
+data8 0xAAB28A84F8CFE4D1, 0x00003FD3 //A6 = +7.58048850973457592162e-14L
+data8 0x9C14B931AEB311A8, 0x0000BFCD //A7 = -1.08302915828084288776e-15L
+data8 0x8EADA745715A0714, 0x00003FC7 //A8 = +1.54692159263197000533e-17L
+data8 0x82643F3F722CE6B5, 0x0000BFC1 //A9 = -2.20891945694400066611e-19L
+data8 0xEE42ECDE465A99E4, 0x00003FBA //A10 = +3.15336372779307614198e-21L
+data8 0xD99FC74326ACBFC0, 0x0000BFB4 //A11 = -4.50036161691276556269e-23L
+data8 0xC6A4DCACC554911E, 0x00003FAE //A12 = +6.41853356148678957077e-25L
+data8 0xB550CEA09DA96F44, 0x0000BFA8 //A13 = -9.15410112414783078242e-27L
+data8 0xAA9149317996F32F, 0x00003FA2 //A14 = +1.34554050666508391264e-28L
+data8 0x9C3008EFE3F52F19, 0x0000BF9C //A15 = -1.92516125328592532359e-30L
+// Pol25
+data8 0xA68E78218806283F, 0x00003FEF //A0 = +1.98550844852103406280e-05L
+data8 0xBC41423996DC8A37, 0x0000BFF1 //A1 = -8.97669395268764751516e-05L
+data8 0x90E55AE31A2F8271, 0x00003FEB //A2 = +1.07955871580069359702e-06L
+data8 0xDF022272DA4A3BEF, 0x0000BFE4 //A3 = -1.29807937275957214439e-08L
+data8 0xAB95DCBFFB0BAAB8, 0x00003FDE //A4 = +1.56056011861921437794e-10L
+data8 0x83FF2547BA9011FF, 0x0000BFD8 //A5 = -1.87578539510813332135e-12L
+data8 0xCB0C353560EEDC45, 0x00003FD1 //A6 = +2.25428217090412574481e-14L
+data8 0x9C24CEB86E76D2C5, 0x0000BFCB //A7 = -2.70866279585559299821e-16L
+data8 0xF01AFA23DDFDAE0E, 0x00003FC4 //A8 = +3.25403467375734083376e-18L
+data8 0xB892BDFBCF1D9740, 0x0000BFBE //A9 = -3.90848978133441513662e-20L
+data8 0x8DDBBF34415AAECA, 0x00003FB8 //A10 = +4.69370027479731756829e-22L
+data8 0xDA04170D07458C3B, 0x0000BFB1 //A11 = -5.63558091177482043435e-24L
+data8 0xA76F391095A9563A, 0x00003FAB //A12 = +6.76262416498584003290e-26L
+data8 0x8098FA125C18D8DB, 0x0000BFA5 //A13 = -8.11564737276592661642e-28L
+data8 0xCB9E4D5C08923227, 0x00003F9E //A14 = +1.00391606269366059664e-29L
+data8 0x9CEC3BF7A0BE2CAF, 0x0000BF98 //A15 = -1.20888920108938909316e-31L
+// Pol26
+data8 0xC17AB25E269272F7, 0x00003FEE //A0 = +1.15322640047234590651e-05L
+data8 0x85310509E633FEF2, 0x0000BFF1 //A1 = -6.35106483144690768696e-05L
+data8 0xAC5E4C4DCB2D940C, 0x00003FEA //A2 = +6.42122148740412561597e-07L
+data8 0xDF0AAD0571FFDD48, 0x0000BFE3 //A3 = -6.49136789710824396482e-09L
+data8 0x9049D8440AFD180F, 0x00003FDD //A4 = +6.56147932223174570008e-11L
+data8 0xBAA936477C5FA9D7, 0x0000BFD6 //A5 = -6.63153032879993841863e-13L
+data8 0xF17261294EAB1443, 0x00003FCF //A6 = +6.70149477756803680009e-15L
+data8 0x9C22F87C31DB007A, 0x0000BFC9 //A7 = -6.77134581402030645534e-17L
+data8 0xC9E98E633942AC12, 0x00003FC2 //A8 = +6.84105580182052870823e-19L
+data8 0x828998181309642C, 0x0000BFBC //A9 = -6.91059649300859944955e-21L
+data8 0xA8C3D4DCE1ECBAB6, 0x00003FB5 //A10 = +6.97995542988331257517e-23L
+data8 0xDA288D52CC4C351A, 0x0000BFAE //A11 = -7.04907829139578377009e-25L
+data8 0x8CEEACB790B5F374, 0x00003FA8 //A12 = +7.11526399101774993883e-27L
+data8 0xB61C8A29D98F24C0, 0x0000BFA1 //A13 = -7.18303147470398859453e-29L
+data8 0xF296F69FE45BDA7D, 0x00003F9A //A14 = +7.47537230021540031251e-31L
+data8 0x9D4B25BF6FB7234B, 0x0000BF94 //A15 = -7.57340869663212138051e-33L
+// Pol27
+data8 0xC7772CC326D6FBB8, 0x00003FEE //A0 = +1.18890718679826004395e-05L
+data8 0xE0F9D5410565D55D, 0x0000BFF0 //A1 = -5.36384368533203585378e-05L
+data8 0x85C0BE825680E148, 0x00003FEA //A2 = +4.98268406609692971520e-07L
+data8 0x9F058A389D7BA177, 0x0000BFE3 //A3 = -4.62813885933188677790e-09L
+data8 0xBD0B751F0A6BAC7A, 0x00003FDC //A4 = +4.29838009673609430305e-11L
+data8 0xE0B6823570502E9D, 0x0000BFD5 //A5 = -3.99170340031272728535e-13L
+data8 0x858A9C52FC426D86, 0x00003FCF //A6 = +3.70651975271664045723e-15L
+data8 0x9EB4438BFDF1928D, 0x0000BFC8 //A7 = -3.44134780748056488222e-17L
+data8 0xBC968DCD8C06D74E, 0x00003FC1 //A8 = +3.19480670422195579127e-19L
+data8 0xE0133A405F782125, 0x0000BFBA //A9 = -2.96560935615546392028e-21L
+data8 0x851AFEBB70D07E79, 0x00003FB4 //A10 = +2.75255617931932536111e-23L
+data8 0x9E1E21A841BF8738, 0x0000BFAD //A11 = -2.55452923487640676799e-25L
+data8 0xBBCF2EF1C6E72327, 0x00003FA6 //A12 = +2.37048675755308004410e-27L
+data8 0xDF0D320CF12B8BCB, 0x0000BF9F //A13 = -2.19945804585962185550e-29L
+data8 0x8470A76DE5FCADD8, 0x00003F99 //A14 = +2.04056213851532266258e-31L
+data8 0x9D41C15F6A6FBB04, 0x0000BF92 //A15 = -1.89291056020108587823e-33L
+LOCAL_OBJECT_END(erfc_Q_table)
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(erfcl)
+
+{ .mfi
+ alloc r32 = ar.pfs, 0, 36, 4, 0
+ fma.s1 FR_Tmp = f1, f1, f8 // |x|+1, if x >= 0
+ nop.i 0
+}
+{ .mfi
+ addl GR_ad_Arg = @ltoff(exp_table_1), gp
+ fms.s1 FR_Tmp1 = f1, f1, f8 // |x|+1, if x < 0
+ mov GR_rshf_2to51 = 0x4718 // begin 1.10000 2^(63+51)
+}
+;;
+
+{ .mfi
+ ld8 GR_ad_Arg = [GR_ad_Arg] // Point to Arg table
+ fcmp.ge.s1 p6,p7 = f8, f0 // p6: x >= 0 ,p7: x<0
+ shl GR_rshf_2to51 = GR_rshf_2to51,48 // end 1.10000 2^(63+51)
+}
+{ .mlx
+ mov GR_rshf = 0x43e8 // begin 1.1000 2^63 for right shift
+ movl GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc // signif. of 1/ln2
+}
+;;
+
+{ .mfi
+ mov GR_exp_2tom51 = 0xffff-51
+ fclass.m p8,p0 = f8,0x07 // p8: x = 0
+ shl GR_rshf = GR_rshf,48 // end 1.1000 2^63 for right shift
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_norm_x = f8, f8, f0 //high bits for -x^2
+ nop.i 0
+}
+;;
+
+.pred.rel "mutex",p6,p7
+{ .mfi
+ setf.sig FR_INV_LN2_2TO63 = GR_sig_inv_ln2 // form 1/ln2 * 2^63
+(p6) fma.s1 FR_AbsArg = f1, f0, f8 // |x|, if x >= 0
+ nop.i 0
+}
+{ .mfi
+ setf.d FR_RSHF_2TO51 = GR_rshf_2to51 //const 1.10 * 2^(63+51)
+(p7) fms.s1 FR_AbsArg = f1, f0, f8 // |x|, if x < 0
+ mov GR_exp_mask = 0x1FFFF // Form exponent mask
+}
+;;
+
+{ .mfi
+ ldfe FR_ch_dx = [GR_ad_Arg], 16
+ fclass.m p10,p0 = f8, 0x21 // p10: x = +inf
+ mov GR_exp_bias = 0x0FFFF // Set exponent bias
+}
+{ .mlx
+ setf.d FR_RSHF = GR_rshf // Right shift const 1.1000 * 2^63
+ movl GR_ERFC_XC_TB = 0x650
+}
+;;
+
+.pred.rel "mutex",p6,p7
+{ .mfi
+ setf.exp FR_2TOM51 = GR_exp_2tom51 // 2^-51 for scaling float_N
+(p6) fma.s1 FR_Tmp = FR_Tmp, FR_Tmp, f0 // (|x|+1)^2,x >=0
+ nop.i 0
+}
+{ .mfi
+ ldfpd FR_POS_ARG_ASYMP,FR_NEG_ARG_ASYMP = [GR_ad_Arg], 16
+(p7) fma.s1 FR_Tmp = FR_Tmp1, FR_Tmp1, f0 // (|x|+1)^2, x<0
+ mov GR_0x1 = 0x1
+}
+;;
+
+//p8: y = 1.0, x = 0.0,quick exit
+{ .mfi
+ ldfpd FR_dx,FR_dx1 = [GR_ad_Arg], 16
+ fclass.m p9,p0 = f8, 0x22 // p9: x = -inf
+ nop.i 0
+
+}
+{ .mfb
+ nop.m 0
+(p8) fma.s0 f8 = f1, f1, f0
+(p8) br.ret.spnt b0
+}
+;;
+
+{ .mfi
+ ldfe FR_UnfBound = [GR_ad_Arg], 16
+ fclass.m p11,p0 = f8, 0xc3 // p11: x = nan
+ mov GR_BIAS = 0x0FFFF
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_NormX = f8,f1,f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe FR_EpsNorm = [GR_ad_Arg], 16
+ fmerge.s FR_X = f8,f8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_xsq_lo = f8, f8, FR_norm_x // low bits for -x^2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ add GR_ad_C = 0x20, GR_ad_Arg // Point to C table
+ nop.f 0
+ add GR_ad_T1 = 0x50, GR_ad_Arg // Point to T1 table
+}
+{ .mfi
+ add GR_ad_T2 = 0x150, GR_ad_Arg // Point to T2 table
+ nop.f 0
+ add GR_ERFC_XC_TB = GR_ERFC_XC_TB, GR_ad_Arg //poin.to XB_TBL
+}
+;;
+
+{ .mfi
+ getf.exp GR_signexp_x = FR_norm_x // Extr. sign and exponent of -x^2 (high part)
+ fma.s1 FR_Tmp = FR_Tmp, FR_Tmp, f0 // (|x|+1)^4
+ add GR_ad_W1 = 0x100, GR_ad_T2 // Point to W1 table
+}
+{ .mfi
+ ldfe FR_L_hi = [GR_ad_Arg],16 // Get L_hi
+ nop.f 0
+ add GR_ad_W2 = 0x300, GR_ad_T2 // Point to W2 table
+}
+;;
+
+// p9: y = 2.0, x = -inf, quick exit
+{ .mfi
+ sub GR_mBIAS = r0, GR_BIAS
+ fma.s1 FR_2 = f1, f1, f1
+ nop.i 0
+}
+{ .mfb
+ ldfe FR_L_lo = [GR_ad_Arg],16 // Get L_lo
+(p9) fma.s0 f8 = f1, f1, f1
+(p9) br.ret.spnt b0
+}
+;;
+
+// p10: y = 0.0, x = +inf, quick exit
+{ .mfi
+ adds GR_ERFC_P_TB = 0x380, GR_ERFC_XC_TB // pointer to P_TBL
+ fma.s1 FR_N_signif = FR_norm_x, FR_INV_LN2_2TO63, FR_RSHF_2TO51
+ and GR_exp_x = GR_signexp_x, GR_exp_mask
+}
+{ .mfb
+ adds GR_ERFC_S_TB = 0x1C0, GR_ERFC_XC_TB // pointer to S_TBL
+(p10) fma.s0 f8 = f0, f1, f0
+(p10) br.ret.spnt b0
+}
+;;
+
+// p12: |x| < 0.681... -> dx = 0.875 (else dx = 0.625 )
+// p11: y = x, x = nan, quick exit
+{ .mfi
+ ldfe FR_C3 = [GR_ad_C],16 // Get C3 for normal path
+ fcmp.lt.s1 p12,p0 = FR_AbsArg, FR_ch_dx
+ shl GR_ShftPi_bias = GR_BIAS, 8 // BIAS * 256
+}
+{ .mfb
+ sub GR_exp_x = GR_exp_x, GR_exp_bias // Get exponent
+(p11) fma.s0 f8 = f8, f1, f0
+(p11) br.ret.spnt b0
+
+}
+;;
+
+{ .mfi
+ ldfe FR_C2 = [GR_ad_C],16 // Get C2 for main path
+ nop.f 0
+ nop.i 0
+}
+;;
+
+//p15: x > POS_ARG_ASYMP = 107.0 -> erfcl(x) ~=~ 0.0
+{ .mfi
+ ldfe FR_C1 = [GR_ad_C],16 // Get C1 for main path
+(p6) fcmp.gt.unc.s1 p15,p0 = FR_AbsArg, FR_POS_ARG_ASYMP // p6: x >= 0
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p12) fma.s1 FR_dx = FR_dx1, f1, f0 //p12: dx = 0.875 for x < 0.681
+ nop.b 0
+}
+;;
+
+//p14: x < - NEG_ARG_ASYMP = -6.5 -> erfcl(x) ~=~ 2.0
+{ .mfi
+ nop.m 0
+(p7) fcmp.gt.unc.s1 p14,p0 = FR_AbsArg,FR_NEG_ARG_ASYMP // p7: x < 0
+ shladd GR_ShftXBi_bias = GR_mBIAS, 4, r0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s0 FR_Tmpf = f1, f1, FR_EpsNorm // flag i (presumably: raise inexact via 1 + EpsNorm)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 FR_float_N = FR_N_signif, FR_2TOM51, FR_RSHF
+ nop.i 0
+}
+;;
+
+// p8: x < UnfBound ~=~ 106.53... -> result without underflow error
+// p14: y ~=~ 2, x < -6.5,quick exit
+{ .mfi
+ getf.exp GR_IndxPlusBias = FR_Tmp // exp + bias for (|x|+1)^4
+ fcmp.lt.s1 p8,p0 = FR_NormX,FR_UnfBound
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p14) fnma.s0 FR_RESULT = FR_EpsNorm,FR_EpsNorm,FR_2
+(p14) br.ret.spnt b0
+
+}
+;;
+
+// p15: y ~=~ 0.0 (result with underflow error), x > POS_ARG_ASYMP = 107.0,
+// call __libm_error_region
+{ .mfb
+(p15) mov GR_Parameter_TAG = 207
+(p15) fma.s0 FR_RESULT = FR_EpsNorm,FR_EpsNorm,f0
+(p15) br.cond.spnt __libm_error_region
+}
+;;
+
+{ .mfi
+ getf.sig GR_N_fix = FR_N_signif // Get N from significand
+ nop.f 0
+ shl GR_ShftPi = GR_IndxPlusBias, 8
+
+}
+{ .mfi
+ shladd GR_ShftXBi = GR_IndxPlusBias, 4, GR_ShftXBi_bias
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mmi
+ add GR_ERFC_S_TB = GR_ERFC_S_TB, GR_ShftXBi //poin.to S[i]
+ add GR_ERFC_XC_TB = GR_ERFC_XC_TB, GR_ShftXBi //poin.to XC[i]
+ sub GR_ShftPi = GR_ShftPi, GR_ShftPi_bias // 256*i
+}
+;;
+
+{ .mfi
+ ldfe FR_Xc = [GR_ERFC_XC_TB]
+ fma.s1 FR_Xpdx_hi = FR_AbsArg, f1, FR_dx // x + dx
+ add GR_ShftA14 = 0xE0, GR_ShftPi // pointer shift for A14
+
+
+}
+{ .mfi
+ ldfe FR_S = [GR_ERFC_S_TB]
+ fnma.s1 FR_r = FR_L_hi, FR_float_N, FR_norm_x//r= -L_hi*float_N+x
+ add GR_ShftA15 = 0xF0, GR_ShftPi // pointer shift for A15
+}
+;;
+
+{ .mfi
+ add GR_P_POINT_1 = GR_ERFC_P_TB, GR_ShftA14 // pointer to A14
+ fcmp.gt.s1 p9,p10 = FR_AbsArg, FR_dx //p9: x > dx, p10: x <= dx
+ extr.u GR_M1 = GR_N_fix, 6, 6 // Extract index M_1
+}
+{ .mfi
+ add GR_P_POINT_2 = GR_ERFC_P_TB, GR_ShftA15 // pointer to A15
+ nop.f 0
+ nop.i 0
+
+}
+;;
+
+{ .mfi
+ ldfe FR_A14 = [GR_P_POINT_1], -32
+ nop.f 0
+ extr.u GR_M2 = GR_N_fix, 0, 6 // Extract index M_2
+}
+{ .mfi
+ ldfe FR_A15 = [GR_P_POINT_2], -32
+ nop.f 0
+ shladd GR_ad_W1 = GR_M1,3,GR_ad_W1 // Point to W1
+}
+;;
+
+{ .mfi
+ ldfe FR_A12 = [GR_P_POINT_1], -64
+ nop.f 0
+ extr GR_K = GR_N_fix, 12, 32 // Extract limite range K
+}
+{ .mfi
+ ldfe FR_A13 = [GR_P_POINT_2], -64
+ nop.f 0
+ shladd GR_ad_T1 = GR_M1,2,GR_ad_T1 // Point to T1
+}
+;;
+
+{ .mfi
+ ldfe FR_A8 = [GR_P_POINT_1], 32
+ nop.f 0
+ add GR_exp_2_k = GR_exp_bias, GR_K // Form exponent of 2^k
+}
+{ .mfi
+ ldfe FR_A9 = [GR_P_POINT_2], 32
+ nop.f 0
+ shladd GR_ad_W2 = GR_M2,3,GR_ad_W2 // Point to W2
+}
+;;
+
+{ .mfi
+ ldfe FR_A10 = [GR_P_POINT_1], -96
+ nop.f 0
+ shladd GR_ad_T2 = GR_M2,2,GR_ad_T2 // Point to T2
+}
+{ .mfi
+ ldfe FR_A11 = [GR_P_POINT_2], -96
+ fnma.s1 FR_r = FR_L_lo, FR_float_N, FR_r //r = -L_lo*float_N + r
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe FR_A4 = [GR_P_POINT_1], 32
+(p10) fms.s1 FR_Tmp = FR_dx,f1, FR_Xpdx_hi //for lo of x+dx, x<=dx
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_A5 = [GR_P_POINT_2], 32
+(p9) fms.s1 FR_Tmp = FR_AbsArg, f1, FR_Xpdx_hi //for lo of x+dx, x>dx
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe FR_A6 = [GR_P_POINT_1], -64
+ frcpa.s1 FR_U,p11 = f1, FR_Xpdx_hi // hi of 1 /(x + dx)
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_A7 = [GR_P_POINT_2], -64
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe FR_A2 = [GR_P_POINT_1], -32
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_A3 = [GR_P_POINT_2], -32
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe FR_A0 = [GR_P_POINT_1], 224
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_A1 = [GR_P_POINT_2]
+ fms.s1 FR_LocArg = FR_AbsArg, f1, FR_Xc // xloc = x - x[i]
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfd FR_W1 = [GR_ad_W1],0 // Get W1
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfd FR_W2 = [GR_ad_W2],0 // Get W2
+ fma.s1 FR_poly = FR_r, FR_C3, FR_C2 // poly = r * A3 + A2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfs FR_T1 = [GR_ad_T1],0 // Get T1
+(p10) fma.s1 FR_Xpdx_lo = FR_AbsArg,f1, FR_Tmp//lo of x + dx , x <= dx
+ nop.i 0
+}
+{ .mfi
+ ldfs FR_T2 = [GR_ad_T2],0 // Get T2
+(p9) fma.s1 FR_Xpdx_lo = FR_dx,f1, FR_Tmp // lo of x + dx, x > dx
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_Tmp1 = FR_Xpdx_hi, FR_U, FR_2 // N-R, iter. N1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
+ nop.i 0
+}
+;;
+
+{ .mfi
+ setf.exp FR_scale = GR_exp_2_k // Set scale = 2^k
+ fma.s1 FR_P15_1_1 = FR_LocArg, FR_LocArg, f0 // xloc ^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_0_1 = FR_A15, FR_LocArg, FR_A14
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_1_2 = FR_A13, FR_LocArg, FR_A12
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly = FR_r, FR_poly, FR_C1 // poly = r * poly + A1
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_2_1 = FR_A9, FR_LocArg, FR_A8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_2_2 = FR_A11, FR_LocArg, FR_A10
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_U = FR_U, FR_Tmp1, f0 // N-R, iter. N1
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_3_1 = FR_A5, FR_LocArg, FR_A4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_3_2 = FR_A7, FR_LocArg, FR_A6
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_4_2 = FR_A3, FR_LocArg, FR_A2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_W = FR_W1, FR_W2, FR_W2 // W = W1 * W2 + W2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_T = FR_T1, FR_T2 // T = T1 * T2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_7_1 = FR_P15_0_1, FR_P15_1_1, FR_P15_1_2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_7_2 = FR_P15_1_1, FR_P15_1_1, f0 // xloc^4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_8_1 = FR_P15_1_1, FR_P15_2_2, FR_P15_2_1
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_Tmp = FR_Xpdx_hi, FR_U, FR_2 // N-R, iter. N2
+ nop.i 0
+}
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_poly = FR_rsq, FR_poly, FR_r // poly = rsq * poly + r
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_9_1 = FR_P15_1_1, FR_P15_4_2, FR_A0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_9_2 = FR_P15_1_1, FR_P15_3_2, FR_P15_3_1
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_W = FR_W, f1, FR_W1 // W = W + W1
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_T_scale = FR_T, FR_scale, f0 // T_scale = T * scale
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_13_1 = FR_P15_7_2, FR_P15_7_1, FR_P15_8_1
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_U = FR_U, FR_Tmp, f0 // N-R, iter. N2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_14_1 = FR_P15_7_2, FR_P15_9_2, FR_P15_9_1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_P15_14_2 = FR_P15_7_2, FR_P15_7_2, f0 // xloc^8
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_M = FR_T_scale, FR_S, f0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_Tmp = FR_Xpdx_hi, FR_U, FR_2 // N-R, iter. N3
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Q = FR_P15_14_2, FR_P15_13_1, FR_P15_14_1
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_H = FR_W, f1, FR_xsq_lo // H = W - xsq_lo
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_U = FR_U, FR_Tmp, f0 // N-R, iter. N3
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Q = FR_A1, FR_LocArg, FR_Q
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_Tmp = FR_Xpdx_hi, FR_U, f1 // for du
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_R = FR_H, FR_poly, FR_poly
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_res_pos_x_hi = FR_M, FR_U, f0 // M *U
+ nop.i 0
+
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_R = FR_R, f1, FR_H // R = H + P(r) + H*P(r)
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s0 FR_Tmpf = f8, f1, f0 // flag d
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_dU = FR_Xpdx_lo, FR_U, FR_Tmp
+ nop.i 0
+}
+;;
+
+// p7: we begin to calculate y(x) = 2 - erfcl(-x) in multi precision
+// for -6.5 <= x < 0
+{ .mfi
+ nop.m 0
+ fms.s1 FR_res_pos_x_lo = FR_M, FR_U, FR_res_pos_x_hi
+ nop.i 0
+
+}
+{ .mfi
+ nop.m 0
+(p7) fnma.s1 FR_Tmp1 = FR_res_pos_x_hi, f1, FR_2 //p7: x < 0
+ nop.i 0
+
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_G = FR_R, FR_Q, FR_Q
+ nop.i 0
+
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Tmp = FR_R, f1, FR_dU // R + du
+ nop.i 0
+
+}
+;;
+
+{ .mfi
+ nop.m 0
+(p7) fnma.s1 FR_Tmp2 = FR_Tmp1, f1, FR_2 //p7: x < 0
+ nop.i 0
+
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_G = FR_G, f1, FR_Tmp
+ nop.i 0
+
+}
+;;
+
+{ .mfi
+ nop.m 0
+(p7) fnma.s1 FR_Tmp2 = FR_res_pos_x_hi, f1, FR_Tmp2 //p7: x < 0
+ nop.i 0
+
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_V = FR_G, FR_res_pos_x_hi, f0 // V = G * M *U
+ nop.i 0
+
+}
+;;
+
+{ .mfi
+ nop.m 0
+(p7) fma.s1 FR_res_pos_x_lo = FR_res_pos_x_lo, f1, FR_V //p7: x < 0
+ nop.i 0
+
+}
+;;
+
+{ .mfi
+ nop.m 0
+(p7) fnma.s1 FR_Tmp2 = FR_res_pos_x_lo, f1, FR_Tmp2 //p7: x < 0
+ nop.i 0
+
+}
+;;
+
+
+//p6: result for 0 < x < = POS_ARG_ASYMP
+//p7: result for - NEG_ARG_ASYMP <= x < 0
+//p8: exit for - NEG_ARG_ASYMP <= x < UnfBound
+
+ERFC_RESULT:
+.pred.rel "mutex",p6,p7
+{ .mfi
+ nop.m 0
+(p6) fma.s0 f8 = FR_M, FR_U, FR_V // p6: x >= 0
+ nop.i 0
+}
+{ .mfb
+ mov GR_Parameter_TAG = 207
+(p7) fma.s0 f8 = FR_Tmp2, f1, FR_Tmp1 // p7: x < 0
+(p8) br.ret.sptk b0
+};;
+
+GLOBAL_LIBM_END(erfcl)
+// Entered via (p15) br.cond.spnt __libm_error_region
+// for x > POS_ARG_ASYMP
+// or
+// by falling through from ERFC_RESULT (p8 clear, no br.ret taken)
+// after .endp erfcl for UnfBound < = x < = POS_ARG_ASYMP
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+.prologue // Error stub: spill FR_Y, FR_X, FR_RESULT to a 64-byte frame, call __libm_error_support, reload f8
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address: old sp - 32 (= new sp + 32)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new 64-byte stack frame
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+{ .mmi
+ stfe [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 at sp+32, then advance pointer to sp+48
+ add GR_Parameter_X = 16,sp // Parameter 1 address: sp + 16
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0 (clobbered by br.call below)
+};;
+.body
+{ .mib
+ stfe [GR_Parameter_X] = FR_X // STORE Parameter 1 at sp+16
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address: sp + 48
+ nop.b 0
+}
+{ .mib
+ stfe [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 at sp+48
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Point back at Parameter 2 (sp + 32)
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp // Recompute address of the result slot (sp + 48)
+};;
+{ .mmi
+ ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer (drop the 64-byte frame)
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return to the caller of erfcl
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+.type __libm_error_support#,@function // external routine called by the stub above
+.global __libm_error_support#
+
+
+
diff --git a/sysdeps/ia64/fpu/s_erff.S b/sysdeps/ia64/fpu/s_erff.S
new file mode 100644
index 0000000000..204446fbdf
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_erff.S
@@ -0,0 +1,557 @@
+.file "erff.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 08/14/01 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/06/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// float erff(float)
+//
+// Overview of operation
+//==============================================================
+// Background
+//
+//
+// There are 8 paths:
+// 1. x = +/-0.0
+// Return erff(x) = +/-0.0
+//
+// 2. 0.0 < |x| < 0.125
+// Return erff(x) = x *Pol3(x^2),
+// where Pol3(x^2) = C3*x^6 + C2*x^4 + C1*x^2 + C0
+//
+// 3. 0.125 <= |x| < 4.0
+// Return erff(x) = sign(x)*PolD(x)*PolC(|x|) + sign(x)*PolA(|x|),
+// where sign(x)*PolD(x) = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4),
+// PolC(|x|) = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0,
+// PolA(|x|) = A3|x|^3 + A2*x^2 + A1*|x| + A0
+//
+// Actually the range 0.125 <= |x| < 4.0 is split into 5 subranges.
+// For each subrange there is a particular set of coefficients.
+// Below is the list of subranges:
+// 3.1 0.125 <= |x| < 0.25
+// 3.2 0.25 <= |x| < 0.5
+// 3.3 0.5 <= |x| < 1.0
+// 3.4 1.0 <= |x| < 2.0
+// 3.5 2.0 <= |x| < 4.0
+//
+// 4. 4.0 <= |x| < +INF
+// Return erff(x) = sign(x)*(1.0d - 2^(-53))
+//
+// 5. |x| = INF
+// Return erff(x) = sign(x) * 1.0
+//
+// 6. x = [S,Q]NaN
+// Return erff(x) = QNaN
+//
+// 7. x is positive denormal
+// Return erff(x) = C0*x - x^2,
+// where C0 = 2.0/sqrt(Pi)
+//
+// 8. x is negative denormal
+// Return erff(x) = C0*x + x^2,
+// where C0 = 2.0/sqrt(Pi)
+//
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input
+// f32 -> f59
+
+// General registers used:
+// r32 -> r45, r2, r3
+
+// Predicate registers used:
+// p0, p6 -> p12, p14, p15
+
+// p6 to filter out case when x = [Q,S]NaN or +/-0
+// p7 to filter out case when x = denormal
+// p8 set if |x| >= 0.125; reused on the denormal path for positive denormal x
+// p9 to filter out case when |x| = inf
+// p10 to filter out case when |x| < 0.125
+// p11 to filter out case when 0.125 <= |x| < 4.0
+// p12 to filter out case when |x| >= 4.0
+// p14 set to 1 for positive x
+// p15 set to 1 for negative x
+
+// Assembly macros
+//==============================================================
+rDataPtr = r2 // address of erff_data; later advanced to the 1.0-epsilon / C0 constants
+rDataPtr1 = r3 // not referenced by erff
+
+rBias = r33 // coefficient-set index (scaled by 16 or 8 to form table offsets)
+rCoeffAddr3 = r34 // address of C2,C3, then of D0,D1
+rCoeffAddr1 = r35 // address of C0,C1, then of D2,B0
+rCoeffAddr2 = r36 // address of A0,A1, then of A2,A3
+rOffset2 = r37 // x bits with sign and 2 top significand bits cleared, shifted right by 21
+rBias2 = r38 // 0x1F0: value of rOffset2 for 0.125 <= |x| < 0.25
+rMask = r39 // 0x806 << 20: sign bit and 2 top significand bits
+rArg = r40 // single precision bit pattern of x
+rBound = r41 // 0x3E0 << 20: bit pattern of 0.125f
+rSignBit = r42 // 1 << 31: sign bit mask
+rAbsArg = r43 // bit pattern of |x|
+rDataPtr2 = r44 // not referenced by erff
+rSaturation = r45 // 0x408 << 20: bit pattern of 4.0f
+
+//==============================================================
+fA0 = f32 // PolA coefficient A0; holds 1.0 - epsilon on the saturation path
+fA1 = f33 // PolA coefficient A1
+fA2 = f34 // PolA coefficient A2
+fA3 = f35 // PolA coefficient A3
+fC0 = f36 // PolC coefficient C0; holds C0*x on the denormal path
+fC1 = f37 // PolC coefficient C1
+fC2 = f38 // PolC coefficient C2
+fC3 = f39 // PolC coefficient C3
+fD0 = f40 // PolD coefficient D0
+fD1 = f41 // PolD coefficient D1
+fD2 = f42 // PolD coefficient D2
+fB0 = f43 // B0, overwritten with B0*x^4
+fArgSqr = f44 // x^2
+fAbsArg = f45 // |x|
+fSignumX = f46 // sign(x)*1.0
+fArg4 = f47 // x^4
+fArg4Sgn = f48 // sign(x)*x^4
+fArg3 = f49 // |x|^3
+fArg3Sgn = f50 // sign(x)*|x|^3
+fArg7Sgn = f51 // sign(x)*|x|^7
+fArg6Sgn = f52 // sign(x)*x^6
+fPolC = f53 // PolC accumulator
+fPolCTmp = f54 // C1*|x| + C0 (or C1*x^2 + C0 near zero)
+fPolA = f55 // PolA accumulator
+fPolATmp = f56 // A3*|x| + A2
+fPolD = f57 // PolD accumulator
+fPolDTmp = f58 // sign(x)*(|x|^7 + D2*x^6)
+fArgSqrSgn = f59 // sign(x)*x^2; written once, never read in erff
+
+// Data tables
+//==============================================================
+
+RODATA
+
+.align 16
+
+LOCAL_OBJECT_START(erff_data)
+// PolC/PolD coefficients for the erf(x), 0.125 <= |x| < 0.25 (byte offset 0)
+data8 0xBE4218BB56B49E66 // C0
+data8 0x3F7AFB8315DA322B // C1
+data8 0x3F615D6EBEE0CA32 // C2
+data8 0xBF468D71CF4F0918 // C3
+data8 0x40312115B0932F24 // D0
+data8 0xC0160D6CD0991EA3 // D1
+data8 0xBFE04A567A6DBE4A // D2
+data8 0xBF4207BC640D1509 // B0
+// PolC/PolD coefficients for the erf(x), 0.25 <= |x| < 0.5 (byte offset 64)
+data8 0x3F90849356383F58 // C0
+data8 0x3F830BD5BA240F09 // C1
+data8 0xBF3FA4970E2BCE23 // C2
+data8 0xBF6061798E58D0FD // C3
+data8 0xBF68C0D83DD22E02 // D0
+data8 0x401C0A9EE4108F94 // D1
+data8 0xC01056F9B5E387F5 // D2
+data8 0x3F1C9744E36A5706 // B0
+// PolC/PolD coefficients for the erf(x), 0.5 <= |x| < 1.0 (byte offset 128)
+data8 0x3F85F7D419A13DE3 // C0
+data8 0x3F791A13FF66D45A // C1
+data8 0x3F46B17B16B5929F // C2
+data8 0xBF5124947A8BF45E // C3
+data8 0x3FA1B3FD95EA9564 // D0
+data8 0x40250CECD79A020A // D1
+data8 0xC0190DC96FF66CCD // D2
+data8 0x3F4401AE28BA4DD5 // B0
+// PolC/PolD coefficients for the erf(x), 1.0 <= |x| < 2.0 (byte offset 192)
+data8 0xBF49E07E3584C3AE // C0
+data8 0x3F3166621131445C // C1
+data8 0xBF65B7FC1EAC2099 // C2
+data8 0x3F508C6BD211D736 // C3
+data8 0xC053FABD70601067 // D0
+data8 0x404A06640EE87808 // D1
+data8 0xC0283F30817A3F08 // D2
+data8 0xBF2F6DBBF4D6257F // B0
+// PolC/PolD coefficients for the erf(x), 2.0 <= |x| < 4.0 (byte offset 256)
+data8 0xBF849855D67E9407 // C0
+data8 0x3F5ECA5FEC01C70C // C1
+data8 0xBF483110C30FABA4 // C2
+data8 0x3F1618DA72860403 // C3
+data8 0xC08A5C9D5FE8B9F6 // D0
+data8 0x406EFF5F088CEC4B // D1
+data8 0xC03A5743DF38FDE0 // D2
+data8 0xBEE397A9FA5686A2 // B0
+// Polynomial coefficients for the erf(x), -0.125 < x < 0.125 (byte offset 320 = 0x14*16)
+data8 0x3FF20DD7504270CB // C0
+data8 0xBFD8127465AFE719 // C1
+data8 0x3FBCE2D77791DD77 // C2
+data8 0xBF9B582755CDF345 // C3
+// PolA coefficients for the erf(x), 0.125 <= |x| < 0.25 (byte offset 352)
+data8 0xBD54E7E451AF0E36 // A0
+data8 0x3FF20DD75043FE20 // A1
+data8 0xBE05680ACF8280E4 // A2
+data8 0xBFD812745E92C3D3 // A3
+// PolA coefficients for the erf(x), 0.25 <= |x| < 0.5 (byte offset 384)
+data8 0xBE1ACEC2859CB55F // A0
+data8 0x3FF20DD75E8D2B64 // A1
+data8 0xBEABC6A83208FCFC // A2
+data8 0xBFD81253E42E7B99 // A3
+// PolA coefficients for the erf(x), 0.5 <= |x| < 1.0 (byte offset 416)
+data8 0x3EABD5A2482B4979 // A0
+data8 0x3FF20DCAA52085D5 // A1
+data8 0x3F13A994A348795B // A2
+data8 0xBFD8167B2DFCDE44 // A3
+// PolA coefficients for the erf(x), 1.0 <= |x| < 2.0 (byte offset 448)
+data8 0xBF5BA377DDAB4E17 // A0
+data8 0x3FF2397F1D8FC0ED // A1
+data8 0xBF9945BFC1915C21 // A2
+data8 0xBFD747AAABB690D8 // A3
+// PolA coefficients for the erf(x), 2.0 <= |x| < 4.0 (byte offset 480)
+data8 0x3FF0E2920E0391AF // A0
+data8 0xC00D249D1A95A5AE // A1
+data8 0x40233905061C3803 // A2
+data8 0xC027560B851F7690 // A3
+// Constants for the saturation path (byte offset 512) and the denormal path (byte offset 520)
+data8 0x3FEFFFFFFFFFFFFF // 1.0 - epsilon (largest double below 1.0)
+data8 0x3FF20DD750429B6D // C0 = 2.0/sqrt(Pi)
+LOCAL_OBJECT_END(erff_data)
+
+
+.section .text // erff entry: classifies x, then evaluates the main path 0.125 <= |x| < 4.0
+GLOBAL_LIBM_ENTRY(erff)
+
+{ .mfi
+ alloc r32 = ar.pfs, 0, 14, 0, 0 // 14 local registers: r32 -> r45
+ fmerge.s fAbsArg = f1, f8 // |x|
+ addl rMask = 0x806, r0 // becomes 0x80600000 after the shl by 20 below
+}
+{ .mfi
+ addl rDataPtr = @ltoff(erff_data), gp // linkage-table slot holding the address of erff_data
+ fma.s1 fArgSqr = f8, f8, f0 // x^2
+ adds rSignBit = 0x1, r0 // becomes the sign bit mask after the shl by 31 below
+}
+;;
+
+{ .mfi
+ getf.s rArg = f8 // x in GR
+ fclass.m p7,p0 = f8, 0x0b // is x denormal ?
+ // mask = sign bit and 2 most significant bits of the significand
+ shl rMask = rMask, 20
+}
+{ .mfi
+ ld8 rDataPtr = [rDataPtr] // address of erff_data
+ nop.f 0
+ adds rBias2 = 0x1F0, r0 // rOffset2 value for 0.125 <= |x| < 0.25
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fmerge.s fSignumX = f8, f1 // signum(x)
+ shl rSignBit = rSignBit, 31 // mask for sign bit
+}
+{ .mfi
+ adds rBound = 0x3E0, r0 // upper bits of 0.125f, shifted into place below
+ nop.f 0
+ adds rSaturation = 0x408, r0 // upper bits of 4.0f, shifted into place below
+}
+;;
+
+{ .mfi
+ andcm rOffset2 = rArg, rMask // clear sign and 2 top significand bits
+ fclass.m p6,p0 = f8, 0xc7 // is x [S,Q]NaN or +/-0 ?
+ shl rBound = rBound, 20 // 0.125f in GR
+}
+{ .mfb
+ andcm rAbsArg = rArg, rSignBit // |x| in GR
+ nop.f 0
+(p7) br.cond.spnt erff_denormal // branch out if x is denormal
+}
+;;
+
+{ .mfi
+ adds rCoeffAddr2 = 352, rDataPtr // start of the A0..A3 sets (offset 352)
+ fclass.m p9,p0 = f8, 0x23 // is x +/- inf?
+ shr rOffset2 = rOffset2, 21 // biased exponent field of x, times 4
+}
+{ .mfi
+ cmp.lt p10, p8 = rAbsArg, rBound // |x| < 0.125?
+ nop.f 0
+ adds rCoeffAddr3 = 16, rDataPtr // offset of C2 within a coefficient set
+}
+;;
+
+{ .mfi
+(p8) sub rBias = rOffset2, rBias2 // 4 * subrange index when |x| >= 0.125
+ fma.s1 fArg4 = fArgSqr, fArgSqr, f0 // x^4
+ shl rSaturation = rSaturation, 20// 4.0 in GR (saturation bound)
+}
+{ .mfb
+(p10) adds rBias = 0x14, r0 // 0x14*16 = 320: the |x| < 0.125 coefficient set
+(p6) fma.s.s0 f8 = f8,f1,f8 // NaN or +/-0: result = x*1 + x
+(p6) br.ret.spnt b0 // exit for x = NaN or +/-0
+}
+;;
+
+{ .mfi
+ shladd rCoeffAddr1 = rBias, 4, rDataPtr // address of C0 of the selected set (rDataPtr + 16*rBias)
+ fma.s1 fArg3Sgn = fArgSqr, f8, f0 // sign(x)*|x|^3
+ // is |x| < 4.0?
+ cmp.lt p11, p12 = rAbsArg, rSaturation
+}
+{ .mfi
+ shladd rCoeffAddr3 = rBias, 4, rCoeffAddr3 // address of C2 of the selected set
+ fma.s1 fArg3 = fArgSqr, fAbsArg, f0 // |x|^3
+ shladd rCoeffAddr2 = rBias, 3, rCoeffAddr2 // address of A0 of the selected set (352 + 8*rBias)
+}
+;;
+
+{ .mfi
+(p11) ldfpd fC0, fC1 = [rCoeffAddr1]
+(p9) fmerge.s f8 = f8,f1 // +/- inf: result = sign(x)*1.0
+(p12) adds rDataPtr = 512, rDataPtr // address of the 1.0 - epsilon constant
+}
+{ .mfb
+(p11) ldfpd fC2, fC3 = [rCoeffAddr3], 16 // post-increment: pointer now at D0
+ nop.f 0
+(p9) br.ret.spnt b0 // exit for x = +/- inf
+}
+;;
+
+{ .mfi
+(p11) ldfpd fA0, fA1 = [rCoeffAddr2], 16 // post-increment: pointer now at A2
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ add rCoeffAddr1 = 48, rCoeffAddr1 // address of D2 of the selected set
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+(p11) ldfpd fD0, fD1 = [rCoeffAddr3]
+ nop.f 0
+ nop.i 0
+}
+{ .mfb
+(p11) ldfpd fD2, fB0 = [rCoeffAddr1]
+ // sign(x)*|x|^2 (fArgSqrSgn is not read again within erff)
+ fma.s1 fArgSqrSgn = fArgSqr, fSignumX, f0
+(p10) br.cond.spnt erff_near_zero // branch out if |x| < 0.125
+}
+;;
+
+{ .mfi
+(p11) ldfpd fA2, fA3 = [rCoeffAddr2], 16
+ fcmp.lt.s1 p15, p14 = f8,f0 // p15: x < 0, p14: x >= 0
+ nop.i 0
+}
+{ .mfb
+(p12) ldfd fA0 = [rDataPtr] // 1.0 - epsilon for the saturation path
+ fma.s1 fArg4Sgn = fArg4, fSignumX, f0 // sign(x)*|x|^4
+(p12) br.cond.spnt erff_saturation // branch out if |x| >= 4.0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fArg7Sgn = fArg4, fArg3Sgn, f0 // sign(x)*|x|^7
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fArg6Sgn = fArg3, fArg3Sgn, f0 // sign(x)*|x|^6
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fPolC = fC3, fAbsArg, fC2 // C3*|x| + C2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPolCTmp = fC1, fAbsArg, fC0 // C1*|x| + C0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fPolA = fA1, fAbsArg, fA0 // A1*|x| + A0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fPolD = fD1, fAbsArg, fD0 // D1*|x| + D0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // sign(x)*(|x|^7 + D2*x^6)
+ fma.s1 fPolDTmp = fArg6Sgn, fD2, fArg7Sgn
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fPolATmp = fA3, fAbsArg, fA2 // A3*|x| + A2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB0 = fB0, fArg4, f0 // B0*x^4
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ // C3*|x|^3 + C2*x^2 + C1*|x| + C0
+ fma.s1 fPolC = fPolC, fArgSqr, fPolCTmp
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // PolD = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4)
+ fma.d.s1 fPolD = fPolD, fArg4Sgn, fPolDTmp
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // PolA = A3*|x|^3 + A2*x^2 + A1*|x| + A0
+ fma.d.s1 fPolA = fPolATmp, fArgSqr, fPolA
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // PolC = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0
+ fma.d.s1 fPolC = fPolC, f1, fB0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+(p14) fma.s.s0 f8 = fPolC, fPolD, fPolA // for positive x: PolC*PolD + PolA
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p15) fms.s.s0 f8 = fPolC, fPolD, fPolA // for negative x: PolC*PolD - PolA
+ br.ret.sptk b0 // Exit for 0.125 <=|x|< 4.0
+};;
+
+
+// Here if |x| < 0.125: erff(x) = x*(C3*x^6 + C2*x^4 + C1*x^2 + C0), C0..C3 from table offset 320
+erff_near_zero:
+{ .mfi
+ nop.m 0
+ fma.s1 fPolC = fC3, fArgSqr, fC2 // C3*x^2 + C2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPolCTmp = fC1, fArgSqr, fC0 // C1*x^2 + C0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fPolC = fPolC, fArg4, fPolCTmp // C3*x^6 + C2*x^4 + C1*x^2 + C0
+ nop.i 0
+};;
+
+{ .mfb
+ nop.m 0
+ // x*(C3*x^6 + C2*x^4 + C1*x^2 + C0), rounded to single precision
+ fma.s.s0 f8 = fPolC, f8, f0
+ br.ret.sptk b0 // Exit for |x| < 0.125
+};;
+
+// Here if 4.0 <= |x| < +inf; fA0 was loaded with the constant at erff_data + 512
+erff_saturation:
+{ .mfb
+ nop.m 0
+ fma.s.s0 f8 = fA0, fSignumX, f0 // sign(x)*fA0, fA0 = 0x3FEFFFFFFFFFFFFF (1.0d - 2^(-53))
+ // result is rounded to single precision by fma.s
+ br.ret.sptk b0 // Exit for 4.0 <=|x|< +inf
+}
+;;
+
+// Here if x is single precision denormal: erff(x) = C0*x -/+ x^2, C0 = 2.0/sqrt(Pi)
+erff_denormal:
+{ .mfi
+ adds rDataPtr = 520, rDataPtr // address of C0
+ fclass.m p7,p8 = f8, 0x0a // is x -denormal ? (p7: negative, p8: otherwise)
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfd fC0 = [rDataPtr] // C0
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fC0 = fC0,f8,f0 // C0*x
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+(p7) fma.s.s0 f8 = f8,f8,fC0 // -denormal: x^2 + C0*x
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p8) fnma.s.s0 f8 = f8,f8,fC0 // +denormal: -x^2 + C0*x
+ br.ret.sptk b0 // Exit for denormal
+}
+;;
+
+GLOBAL_LIBM_END(erff)
diff --git a/sysdeps/ia64/fpu/s_erfl.S b/sysdeps/ia64/fpu/s_erfl.S
new file mode 100644
index 0000000000..902539be48
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_erfl.S
@@ -0,0 +1,1239 @@
+.file "erfl.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 11/21/01 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 08/14/02 Changed mli templates to mlx
+// 02/06/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// long double erfl(long double)
+//
+// Overview of operation
+//==============================================================
+//
+// Algorithm description
+// ---------------------
+//
+// There are 4 paths:
+//
+// 1. Special path: x = 0, Inf, NaNs, denormal
+// Return erfl(x) = +/-0.0 for zeros
+// Return erfl(x) = QNaN for NaNs
+// Return erfl(x) = sign(x)*1.0 for Inf
+// Return erfl(x) = (A0H+A0L)*x + x^2, ((A0H+A0L) = 2.0/sqrt(Pi))
+// for denormals
+//
+// 2. [0;1/8] path: 0.0 < |x| < 1/8
+// Return erfl(x) = x*(A1H+A1L) + x^3*A3 + ... + x^15*A15
+//
+// 3. Main path: 1/8 <= |x| < 6.53
+// For several ranges of 1/8 <= |x| < 6.53
+// Return erfl(x) = sign(x)*((A0H+A0L) + y*(A1H+A1L) + y^2*(A2H+A2L) +
+// + y^3*A3 + y^4*A4 + ... + y^25*A25 )
+// where y = (|x|/a) - b
+//
+// For each range there is particular set of coefficients.
+// Below is the list of ranges:
+// 1/8 <= |x| < 1/4 a = 0.125, b = 1.5
+// 1/4 <= |x| < 1/2 a = 0.25, b = 1.5
+// 1/2 <= |x| < 1.0 a = 0.5, b = 1.5
+// 1.0 <= |x| < 2.0 a = 1.0, b = 1.5
+// 2.0 <= |x| < 3.25 a = 2.0, b = 1.5
+// 3.25 <= |x| < 4.0 a = 2.0, b = 2.0
+// 4.0 <= |x| < 6.53 a = 4.0, b = 1.5
+// ( [3.25;4.0] subrange is separated to resolve monotonicity issues )
+//
+// 4. Saturation path: 6.53 <= |x| < +INF
+// Return erfl(x) = sign(x)*(1.0 - tiny_value)
+// (tiny_value ~ 1e-1233)
+//
+// Implementation notes
+// --------------------
+//
+// 1. Special path: x = 0, INF, NaNs, denormals
+//
+// This branch is cut off by one fclass operation.
+// Then zeros+nans, infinities and denormals processed separately.
+// For denormals we had to use multiprecision A0 coefficient to reach
+// necessary accuracy: (A0H+A0L)*x-x^2
+//
+// 2. [0;1/8] path: 0.0 < |x| < 1/8
+//
+// First coefficient of polynomial we must split to multiprecision too.
+// Also we can parallelise computations:
+// (x*(A1H+A1L)) calculated in parallel with "tail" (x^3*A3 + ... + x^15*A15)
+// Furthermore the second part is factorized using binary tree technique.
+//
+// 3. Main path: 1/8 <= |x| < 6.53
+//
+// Multiprecision has to be performed only for the first few
+// polynomial iterations (up to the 3rd degree of x)
+// Here we use the same parallelisation way as above:
+// Split whole polynomial to first, "multiprecision" part, and second,
+// so called "tail", native precision part.
+//
+// 1) Multiprecision part:
+// [v1=(A0H+A0L)+y*(A1H+A1L)] + [v2=y^2*((A2H+A2L)+y*A3)]
+// v1 and v2 terms calculated in parallel
+//
+// 2) Tail part:
+// v3 = x^4 * ( A4 + x*A5 + ... + x^21*A25 )
+// v3 is split into 2 even parts (10 coefficients in each one).
+// These 2 parts are also factorized using binary tree technique.
+//
+// So Multiprecision and Tail parts cost is almost the same
+// and we have both results ready before final summation.
+//
+// 4. Saturation path: 6.53 <= |x| < +INF
+//
+// We use formula sign(x)*(1.0 - tiny_value) instead of simple sign(x)*1.0
+// just to meet IEEE requirements for different rounding modes in this case.
+//
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8 - input & output
+// f32 -> f90
+
+// General registers used:
+// r2, r3, r32 -> r52
+
+// Predicate registers used:
+// p0, p6 -> p11, p14, p15
+
+// p6 - arg is zero, denormal or special IEEE
+// p7 - arg is in [4;8] binary interval
+// p8 - arg is in [3.25;4] interval
+// p9 - arg < 1/8
+// p10 - arg is NOT in [3.25;4] interval
+// p11 - arg in saturation domain
+// p14 - arg is positive
+// p15 - arg is negative
+
+// Assembly macros
+//==============================================================
+rDataPtr = r2 // presumably base of the main coefficient tables - users are outside this excerpt, confirm
+rTailDataPtr = r3 // presumably base of the "tail" coefficient tables - confirm
+
+rBias = r33
+rSignBit = r34
+rInterval = r35
+
+rArgExp = r36
+rArgSig = r37
+r3p25Offset = r38
+r2to4 = r39
+r1p25 = r40
+rOffset = r41
+r1p5 = r42
+rSaturation = r43
+r3p25Sign = r44
+rTiny = r45
+rAddr1 = r46
+rAddr2 = r47
+rTailAddr1 = r48
+rTailAddr2 = r49
+rTailOffset = r50
+rTailAddOffset = r51
+rShiftedDataPtr = r52
+
+//==============================================================
+fA0H = f32 // A0..A2 are kept as high/low pairs (AnH + AnL), see header
+fA0L = f33
+fA1H = f34
+fA1L = f35
+fA2H = f36
+fA2L = f37
+fA3 = f38 // A3 -> A25 (f38 -> f60): one register per coefficient
+fA4 = f39
+fA5 = f40
+fA6 = f41
+fA7 = f42
+fA8 = f43
+fA9 = f44
+fA10 = f45
+fA11 = f46
+fA12 = f47
+fA13 = f48
+fA14 = f49
+fA15 = f50
+fA16 = f51
+fA17 = f52
+fA18 = f53
+fA19 = f54
+fA20 = f55
+fA21 = f56
+fA22 = f57
+fA23 = f58
+fA24 = f59
+fA25 = f60
+
+fArgSqr = f61
+fArgCube = f62
+fArgFour = f63
+fArgEight = f64
+
+fArgAbsNorm = f65
+fArgAbsNorm2 = f66
+fArgAbsNorm2L = f67
+fArgAbsNorm3 = f68
+fArgAbsNorm4 = f69
+fArgAbsNorm11 = f70
+
+fRes = f71
+fResH = f72
+fResL = f73
+fRes1H = f74
+fRes1L = f75
+fRes1Hd = f76
+fRes2H = f77
+fRes2L = f78
+fRes3H = f79
+fRes3L = f80
+fRes4 = f81
+
+fTT = f82
+fTH = f83
+fTL = f84
+fTT2 = f85
+fTH2 = f86
+fTL2 = f87
+
+f1p5 = f88 // presumably the constant 1.5 - confirm
+f2p0 = f89 // presumably the constant 2.0 - confirm
+fTiny = f90 // presumably tiny_value of the saturation path - confirm
+
+
+// Data tables
+//==============================================================
+RODATA
+
+.align 64
+LOCAL_OBJECT_START(erfl_data)
+////////// Main tables ///////////
+_0p125_to_0p25_data: // exp = 2^-3
+// Coefficients for the erf(x), 1/8 <= |x| < 1/4: A3 (16 bytes), A2..A0 as H/L doubles, A25 down to A14 (16 bytes each)
+data8 0xACD9ED470F0BB048, 0x0000BFF4 //A3 = -6.5937529303909561891162915809e-04
+data8 0xBF6A254428DDB452 //A2H = -3.1915980570631852578089571182e-03
+data8 0xBC131B3BE3AC5079 //A2L = -2.5893976889070198978842231134e-19
+data8 0x3FC16E2D7093CD8C //A1H = 1.3617485043469590433318217038e-01
+data8 0x3C6979A52F906B4C //A1L = 1.1048096806003284897639351952e-17
+data8 0x3FCAC45E37FE2526 //A0H = 2.0911767705937583938791135552e-01
+data8 0x3C648D48536C61E3 //A0L = 8.9129592834861155344147026365e-18
+data8 0xD1FC135B4A30E746, 0x00003F90 //A25 = 6.3189963203954877364460345654e-34
+data8 0xB1C79B06DD8C988C, 0x00003F97 //A24 = 6.8478253118093953461840838106e-32
+data8 0xCC7AE121D1DEDA30, 0x0000BF9A //A23 = -6.3010264109146390803803408666e-31
+data8 0x8927B8841D1E0CA8, 0x0000BFA1 //A22 = -5.4098171537601308358556861717e-29
+data8 0xB4E84D6D0C8F3515, 0x00003FA4 //A21 = 5.7084320046554628404861183887e-28
+data8 0xC190EAE69A67959A, 0x00003FAA //A20 = 3.9090359419467121266470910523e-26
+data8 0x90122425D312F680, 0x0000BFAE //A19 = -4.6551806872355374409398000522e-25
+data8 0xF8456C9C747138D6, 0x0000BFB3 //A18 = -2.5670639225386507569611436435e-23
+data8 0xCDCAE0B3C6F65A3A, 0x00003FB7 //A17 = 3.4045511783329546779285646369e-22
+data8 0x8F41909107C62DCC, 0x00003FBD //A16 = 1.5167830861896169812375771948e-20
+data8 0x82F0FCB8A4B8C0A3, 0x0000BFC1 //A15 = -2.2182328575376704666050112195e-19
+data8 0x92E992C58B7C3847, 0x0000BFC6 //A14 = -7.9641369349930600223371163611e-18
+LOCAL_OBJECT_END(erfl_data)
+
+LOCAL_OBJECT_START(_0p25_to_0p5_data)
+// Polynomial coefficients for the erf(x), 1/4 <= |x| < 1/2
+data8 0xF083628E8F7CE71D, 0x0000BFF6 //A3 = -3.6699405305266733332335619531e-03
+data8 0xBF978749A434FE4E //A2H = -2.2977018973732214746075186440e-02
+data8 0xBC30B3FAFBC21107 //A2L = -9.0547407100537663337591537643e-19
+data8 0x3FCF5F0CDAF15313 //A1H = 2.4508820238647696654332719390e-01
+data8 0x3C1DFF29F5AD8117 //A1L = 4.0653155218104625249413579084e-19
+data8 0x3FD9DD0D2B721F38 //A0H = 4.0411690943482225790717166092e-01
+data8 0x3C874C71FEF1759E //A0L = 4.0416653425001310671815863946e-17
+data8 0xA621D99B8C12595E, 0x0000BFAB //A25 = -6.7100271986703749013021666304e-26
+data8 0xBD7BBACB439992E5, 0x00003FAE //A24 = 6.1225362452814749024566661525e-25
+data8 0xFF2FEFF03A98E410, 0x00003FB2 //A23 = 1.3192871864994282747963195183e-23
+data8 0xAE8180957ABE6FD5, 0x0000BFB6 //A22 = -1.4434787102181180110707433640e-22
+data8 0xAF0566617B453AA6, 0x0000BFBA //A21 = -2.3163848847252215762970075142e-21
+data8 0x8F33D3616B9B8257, 0x00003FBE //A20 = 3.0324297082969526400202995913e-20
+data8 0xD58AB73354438856, 0x00003FC1 //A19 = 3.6175397854863872232142412590e-19
+data8 0xD214550E2F3210DF, 0x0000BFC5 //A18 = -5.6942141660091333278722310354e-18
+data8 0xE2CA60C328F3BBF5, 0x0000BFC8 //A17 = -4.9177359011428870333915211291e-17
+data8 0x88D9BB274F9B3873, 0x00003FCD //A16 = 9.4959118337089189766177270051e-16
+data8 0xCA4A00AB538A2DB2, 0x00003FCF //A15 = 5.6146496538690657993449251855e-15
+data8 0x9CC8FFFBDDCF9853, 0x0000BFD4 //A14 = -1.3925319209173383944263942226e-13
+LOCAL_OBJECT_END(_0p25_to_0p5_data)
+
+LOCAL_OBJECT_START(_0p5_to_1_data)
+// Polynomial coefficients for the erf(x), 1/2 <= |x| < 1
+data8 0xDB742C8FB372DBE0, 0x00003FF6 //A3 = 3.3485993187250381721535255963e-03
+data8 0xBFBEDC5644353C26 //A2H = -1.2054957547410136142751468924e-01
+data8 0xBC6D7215B023455F //A2L = -1.2770012232203569059818773287e-17
+data8 0x3FD492E42D78D2C4 //A1H = 3.2146553459760363047337250464e-01
+data8 0x3C83A163CAC22E05 //A1L = 3.4053365952542489137756724868e-17
+data8 0x3FE6C1C9759D0E5F //A0H = 7.1115563365351508462453011816e-01
+data8 0x3C8B1432F2CBC455 //A0L = 4.6974407716428899960674098333e-17
+data8 0x95A6B92162813FF8, 0x00003FC3 //A25 = 1.0140763985766801318711038400e-18
+data8 0xFE5EC3217F457B83, 0x0000BFC6 //A24 = -1.3789434273280972156856405853e-17
+data8 0x9B49651031B5310B, 0x0000BFC8 //A23 = -3.3672435142472427475576375889e-17
+data8 0xDBF73927E19B7C8D, 0x00003FCC //A22 = 7.6315938248752024965922341872e-16
+data8 0xF55CBA3052730592, 0x00003FCB //A21 = 4.2563559623888750271176552350e-16
+data8 0xA1DC9380DA82CFF6, 0x0000BFD2 //A20 = -3.5940500736023122607663701015e-14
+data8 0xAAD1AE1067F3D577, 0x00003FD2 //A19 = 3.7929451192558641569555227613e-14
+data8 0xCD1DB83F3B9D2090, 0x00003FD7 //A18 = 1.4574374961011929143375716362e-12
+data8 0x87235ACB5E8BB298, 0x0000BFD9 //A17 = -3.8408559294899660346666452560e-12
+data8 0xDA417B78FF9F46B4, 0x0000BFDC //A16 = -4.9625621225715971268115023451e-11
+data8 0xF075762685484436, 0x00003FDE //A15 = 2.1869603559309150844390066920e-10
+data8 0xB989FDB3795165C7, 0x00003FE1 //A14 = 1.3499740992928183247608593000e-09
+LOCAL_OBJECT_END(_0p5_to_1_data)
+
+LOCAL_OBJECT_START(_1_to_2_data)
+// Polynomial coefficients for the erf(x), 1 <= |x| < 2.0
+data8 0x8E15015F5B55BEAC, 0x00003FFC //A3 = 1.3875200409423426678618977531e-01
+data8 0xBFC6D5A95D0A1B7E //A2H = -1.7839543383544403942764233761e-01
+data8 0xBC7499F704C80E02 //A2L = -1.7868888188464394090788198634e-17
+data8 0x3FBE723726B824A8 //A1H = 1.1893028922362935961842822508e-01
+data8 0x3C6B77F399C2AD27 //A1L = 1.1912589318015368492508652194e-17
+data8 0x3FEEEA5557137ADF //A0H = 9.6610514647531064991170524081e-01
+data8 0x3C963D0DDD0A762F //A0L = 7.7155271023949055047261953350e-17
+data8 0x8FAA405DAD409771, 0x0000BFDB //A25 = -1.6332824616946528652252813763e-11
+data8 0x941386F4697976D8, 0x0000BFDD //A24 = -6.7337295147729213955410252613e-11
+data8 0xBCBE75234530B404, 0x00003FDF //A23 = 3.4332329029092304943838374908e-10
+data8 0xF55E2CE71A00D040, 0x00003FDF //A22 = 4.4632156034175937694868068394e-10
+data8 0xA6CADFE489D2671F, 0x0000BFE3 //A21 = -4.8543000253822277507724949798e-09
+data8 0xA4C69F11FEAFB3A8, 0x00003FE2 //A20 = 2.3978044150868471771557059958e-09
+data8 0xD63441E3BED59703, 0x00003FE6 //A19 = 4.9873285553412397317802071288e-08
+data8 0xDFDAED9D3089D732, 0x0000BFE7 //A18 = -1.0424069510877052249228047044e-07
+data8 0xB47287FF165756A5, 0x0000BFE9 //A17 = -3.3610945128073834488448164164e-07
+data8 0xCDAF2DC0A79A9059, 0x00003FEB //A16 = 1.5324673941628851136481785187e-06
+data8 0x9FD6A7B2ECE8EDA9, 0x00003FEA //A15 = 5.9544479989469083598476592569e-07
+data8 0xEC6E63BB4507B585, 0x0000BFEE //A14 = -1.4092398243085031882423746824e-05
+LOCAL_OBJECT_END(_1_to_2_data)
+
+LOCAL_OBJECT_START(_2_to_3p25_data)
+// Polynomial coefficients for the erf(x), 2 <= |x| < 3.25
+data8 0xCEDBA58E8EE6F055, 0x00003FF7 //A3 = 6.3128050215859026984338771121e-03
+data8 0xBF5B60D5E974CBBD //A2H = -1.6710366233609740427984435840e-03
+data8 0xBC0E11E2AEC18AF6 //A2L = -2.0376133202996259839305825162e-19
+data8 0x3F32408E9BA3327E //A1H = 2.7850610389349567379974059733e-04
+data8 0x3BE41010E4B3B224 //A1L = 3.3987633691879253781833531576e-20
+data8 0x3FEFFFD1AC4135F9 //A0H = 9.9997790950300136092465663751e-01
+data8 0x3C8EEAFA1E97EAE0 //A0L = 5.3633970564750967956196033852e-17
+data8 0xBF9C6F2C6D7263C1, 0x00003FF0 //A25 = 4.5683639377039166585098497471e-05
+data8 0xCB4167CC4798096D, 0x00003FF0 //A24 = 4.8459885139772945417160731273e-05
+data8 0xE1394FECFE972D32, 0x0000BFF2 //A23 = -2.1479022581129892562916533804e-04
+data8 0xC7F9E47581FC2A5F, 0x0000BFF2 //A22 = -1.9071211076537531370822343363e-04
+data8 0xDD612EDFAA41BEAE, 0x00003FF2 //A21 = 2.1112405918671957390188348542e-04
+data8 0x8C166AA4CB2AD8FD, 0x0000BFF4 //A20 = -5.3439165021555312536009227942e-04
+data8 0xEFBE33D9F62B68D4, 0x0000BFF2 //A19 = -2.2863672131516067770956697877e-04
+data8 0xCCB92F5D91562494, 0x00003FF5 //A18 = 1.5619154280865226092321881421e-03
+data8 0x80A5DBE71D4BA0E2, 0x0000BFF6 //A17 = -1.9630109664962540123775799179e-03
+data8 0xA0ADEB2D4C41347A, 0x0000BFF4 //A16 = -6.1294315248639348947483422457e-04
+data8 0xB1F5D4911B911665, 0x00003FF7 //A15 = 5.4309165882071876864550213817e-03
+data8 0xF2F3D8D21E8762E0, 0x0000BFF7 //A14 = -7.4143227286535936033409745884e-03
+LOCAL_OBJECT_END(_2_to_3p25_data)
+
+LOCAL_OBJECT_START(_4_to_6p53_data)
+// Polynomial coefficients for the erf(x), 4 <= |x| < 6.53
+data8 0xDF3151BE8652827E, 0x00003FD5 //A3 = 3.9646979666953349095427642209e-13
+data8 0xBD1C4A9787DF888B //A2H = -2.5127788450714750484839908889e-14
+data8 0xB99B35483E4603FD //A2L = -3.3536613901268985626466020210e-31
+data8 0x3CD2DBF507F1A1F3 //A1H = 1.0468963266736687758710258897e-15
+data8 0x398A97B60913B4BD //A1L = 1.6388968267515149775818013207e-31
+data8 0x3FEFFFFFFFFFFFFF //A0H = 9.9999999999999988897769753748e-01
+data8 0x3C99CC25E658129E //A0L = 8.9502895736398715695745861054e-17
+data8 0xB367B21294713D39, 0x00003FFB //A25 = 8.7600127403270828432337605471e-02
+data8 0xCEE3A423ADEC0F4C, 0x00003FFD //A24 = 4.0408051429309221404807497715e-01
+data8 0xC389626CF2D727C0, 0x00003FFE //A23 = 7.6381507072332210580356159947e-01
+data8 0xD15A03E082D0A307, 0x00003FFE //A22 = 8.1777977210259904277239787430e-01
+data8 0x8FD3DA92675E8E00, 0x00003FFE //A21 = 5.6182638239203638864793584264e-01
+data8 0xFD375E6EE167AA58, 0x00003FFC //A20 = 2.4728152801285544751731937424e-01
+data8 0x89A9482FADE66AE1, 0x00003FFB //A19 = 6.7217410998398471333985773237e-02
+data8 0xC62E1F02606C04DD, 0x00003FF7 //A18 = 6.0479785358923404401184993359e-03
+data8 0xEE7BF2BE71CC531C, 0x0000BFF5 //A17 = -1.8194898432032114199803271708e-03
+data8 0x8084081981CDC79C, 0x0000BFF5 //A16 = -9.8049734947701208487713246099e-04
+data8 0x8975DFB834C118C3, 0x0000BFF0 //A15 = -3.2773123965143773578608926094e-05
+data8 0x965DA4A80008B7BC, 0x0000BFEE //A14 = -8.9624997201558650125662820562e-06
+LOCAL_OBJECT_END(_4_to_6p53_data)
+
+LOCAL_OBJECT_START(_3p25_to_4_data)
+// Polynomial coefficients for the erf(x), 3.25 <= |x| < 4
+data8 0xB01D29846286CE08, 0x00003FEE //A3 = 1.0497207328743021499800978059e-05
+data8 0xBEC10B1488AEB234 //A2H = -2.0317175474986489113480084279e-06
+data8 0xBB7F19701B8B74F9 //A2L = -4.1159669348226960337518214996e-22
+data8 0x3E910B1488AEB234 //A1H = 2.5396469343733111391850105348e-07
+data8 0x3B4F1944906D5D60 //A1L = 5.1448487494628801547474934193e-23
+data8 0x3FEFFFFFF7B91176 //A0H = 9.9999998458274208523732795584e-01
+data8 0x3C70B2865615DB3F //A0L = 1.4482653192002495179309994964e-17
+data8 0xA818D085D56F3021, 0x00003FEC //A25 = 2.5048394770210505593609705765e-06
+data8 0xD9C5C509AAE5561F, 0x00003FEC //A24 = 3.2450636894654766492719395406e-06
+data8 0x9682D71C549EEB07, 0x0000BFED //A23 = -4.4855801709974050650263470866e-06
+data8 0xBC230E1EB6FBF8B9, 0x00003FEA //A22 = 7.0086469577174843181452303996e-07
+data8 0xE1432649FF29D4DE, 0x0000BFEA //A21 = -8.3916747195472308725504497231e-07
+data8 0xB40CEEBD2803D2F0, 0x0000BFEF //A20 = -2.1463694318102769992677291330e-05
+data8 0xEAAB57ABFFA003EB, 0x00003FEF //A19 = 2.7974761309213643228699449426e-05
+data8 0xFBFA4D0B893A5BFB, 0x0000BFEE //A18 = -1.5019043571612821858165073446e-05
+data8 0xBB6AA248EED3E364, 0x0000BFF0 //A17 = -4.4683584873907316507141131797e-05
+data8 0x86C1B3AE3E500ED9, 0x00003FF2 //A16 = 1.2851395412345761361068234880e-04
+data8 0xB60729445F0C37B5, 0x0000BFF2 //A15 = -1.7359540313300841352152461287e-04
+data8 0xCA389F9E707337B1, 0x00003FF1 //A14 = 9.6426575465763394281615740282e-05
+LOCAL_OBJECT_END(_3p25_to_4_data)
+
+
+//////// "Tail" tables //////////
+LOCAL_OBJECT_START(_0p125_to_0p25_data_tail)
+// Polynomial coefficients for the erf(x), 1/8 <= |x| < 1/4
+data8 0x93086CBD21ED3962, 0x00003FCA //A13 = 1.2753071968462837024755878679e-16
+data8 0x83CB5045A6D4B419, 0x00003FCF //A12 = 3.6580237062957773626379648530e-15
+data8 0x8FCDB723209690EB, 0x0000BFD3 //A11 = -6.3861616307180801527566117146e-14
+data8 0xCAA173F680B5D56B, 0x0000BFD7 //A10 = -1.4397775466324880354578008779e-12
+data8 0xF0CEA934AD6AC013, 0x00003FDB //A9 = 2.7376616955640415767655526857e-11
+data8 0x81C69F9D0B5AB8EE, 0x00003FE0 //A8 = 4.7212187567505249115688961488e-10
+data8 0xA8B590298C20A194, 0x0000BFE4 //A7 = -9.8201697105565925460801441797e-09
+data8 0x84F3DE72AC964615, 0x0000BFE8 //A6 = -1.2382176987480830706988411266e-07
+data8 0xC01A1398868CC4BD, 0x00003FEC //A5 = 2.8625408039722670291121341583e-06
+data8 0xCC43247F4410C54A, 0x00003FEF //A4 = 2.4349960762505993017186935493e-05
+LOCAL_OBJECT_END(_0p125_to_0p25_data_tail)
+
+LOCAL_OBJECT_START(_0p25_to_0p5_data_tail)
+// Polynomial coefficients for the erf(x), 1/4 <= |x| < 1/2
+data8 0x8CEAC59AF361B78A, 0x0000BFD6 //A13 = -5.0063802958258679384986669123e-13
+data8 0x9BC67404F348C0CE, 0x00003FDB //A12 = 1.7709590771868743572061278273e-11
+data8 0xF4B5D0348AFAAC7A, 0x00003FDB //A11 = 2.7820329729584630464848160970e-11
+data8 0x83AB447FF619DA4A, 0x0000BFE2 //A10 = -1.9160363295631539615395477207e-09
+data8 0x82115AB487202E7B, 0x00003FE0 //A9 = 4.7318386460142606822119637959e-10
+data8 0xB84D5B0AE17054AA, 0x00003FE8 //A8 = 1.7164477188916895004843908951e-07
+data8 0xB2E085C1C4AA06E5, 0x0000BFE9 //A7 = -3.3318445266863554512523957574e-07
+data8 0xCD3CA2E6C3971666, 0x0000BFEE //A6 = -1.2233070175554502732980949519e-05
+data8 0xBA445C53F8DD40E6, 0x00003FF0 //A5 = 4.4409521535330413551781808621e-05
+data8 0xAA94D5E68033B764, 0x00003FF4 //A4 = 6.5071635765452563856926608000e-04
+LOCAL_OBJECT_END(_0p25_to_0p5_data_tail)
+
+LOCAL_OBJECT_START(_0p5_to_1_data_tail)
+// Polynomial coefficients for the erf(x), 1/2 <= |x| < 1
+data8 0x9ED99EDF111CB785, 0x0000BFE4 //A13 = -9.2462916180079278241704711522e-09
+data8 0xDEAF7539AE2FB062, 0x0000BFE5 //A12 = -2.5923990465973151101298441139e-08
+data8 0xA392D5E5CC9DB1A7, 0x00003FE9 //A11 = 3.0467952847327075747032372101e-07
+data8 0xC311A7619B96CA1A, 0x00003FE8 //A10 = 1.8167212632079596881709988649e-07
+data8 0x82082E6B6A93F116, 0x0000BFEE //A9 = -7.7505086843257228386931766018e-06
+data8 0x96D9997CF326A36D, 0x00003FEE //A8 = 8.9913605625817479172071008270e-06
+data8 0x97057D85DCB0ED99, 0x00003FF2 //A7 = 1.4402527482741758767786898553e-04
+data8 0xDC23BCB3599C0490, 0x0000BFF3 //A6 = -4.1988296144950673955519083419e-04
+data8 0xDA150C4867208A81, 0x0000BFF5 //A5 = -1.6638352864915033417887831090e-03
+data8 0x9A4DAF550A2CC29A, 0x00003FF8 //A4 = 9.4179355839141698591817907680e-03
+LOCAL_OBJECT_END(_0p5_to_1_data_tail)
+
+LOCAL_OBJECT_START(_1_to_2_data_tail)
+// Polynomial coefficients for the erf(x), 1 <= |x| < 2.0
+data8 0x969EAC5C7B46CAB9, 0x00003FEF //A13 = 1.7955281439310148162059582795e-05
+data8 0xA2ED832912E9FCD9, 0x00003FF1 //A12 = 7.7690020847111408916570845775e-05
+data8 0x85677C39C48E43E7, 0x0000BFF3 //A11 = -2.5444839340796031538582511806e-04
+data8 0xC2DAFA91683DAAE4, 0x0000BFF1 //A10 = -9.2914288456063075386925076097e-05
+data8 0xE01C061CBC6A2825, 0x00003FF5 //A9 = 1.7098195515864039518892834211e-03
+data8 0x9AD7271CAFD01C78, 0x0000BFF6 //A8 = -2.3626776207372761518718893636e-03
+data8 0x9B6B9D30EDD5F4FF, 0x0000BFF7 //A7 = -4.7430532011804570628999212874e-03
+data8 0x9E51EB9623F1D446, 0x00003FF9 //A6 = 1.9326171998839772791190405201e-02
+data8 0xF391B935C12546DE, 0x0000BFF8 //A5 = -1.4866286152953671441682166195e-02
+data8 0xB6AD4AE850DBF526, 0x0000BFFA //A4 = -4.4598858458861014323191919669e-02
+LOCAL_OBJECT_END(_1_to_2_data_tail)
+
+LOCAL_OBJECT_START(_2_to_3p25_data_tail)
+// Polynomial coefficients for the erf(x), 2 <= |x| < 3.25
+data8 0x847C24DAC7C7558B, 0x00003FF5 //A13 = 1.0107798565424606512130100541e-03
+data8 0xCB6340EAF02C3DF8, 0x00003FF8 //A12 = 1.2413800617425931997420375435e-02
+data8 0xB5163D252DBBC107, 0x0000BFF9 //A11 = -2.2105330871844825370020459523e-02
+data8 0x82FF9C0B68E331E4, 0x00003FF9 //A10 = 1.5991024756001692140897408128e-02
+data8 0xE9519E4A49752E04, 0x00003FF7 //A9 = 7.1203253651891723548763348088e-03
+data8 0x8D52F11B7AE846D9, 0x0000BFFA //A8 = -3.4502927613795425888684181521e-02
+data8 0xCCC5A3E32BC6FA30, 0x00003FFA //A7 = 4.9993171868423886228679106871e-02
+data8 0xC1791AD8284A1919, 0x0000BFFA //A6 = -4.7234635220336795411997070641e-02
+data8 0x853DAAA35A8A3C18, 0x00003FFA //A5 = 3.2529512934760303976755163452e-02
+data8 0x88E42D8F47FAB60E, 0x0000BFF9 //A4 = -1.6710366233609742619461063050e-02
+LOCAL_OBJECT_END(_2_to_3p25_data_tail)
+
+LOCAL_OBJECT_START(_4_to_6p53_data_tail)
+// Polynomial coefficients for the erf(x), 4 <= |x| < 6.53
+data8 0xD8235ABF08B8A6D1, 0x00003FEE //A13 = 1.2882834877224764938429832586e-05
+data8 0xAEDF44F9C77844C2, 0x0000BFEC //A12 = -2.6057980393716019511497492890e-06
+data8 0xCCD5490956A4FCFD, 0x00003FEA //A11 = 7.6306293047300300284923464089e-07
+data8 0xF71AF0126EE26AEA, 0x0000BFE8 //A10 = -2.3013467500738417953513680935e-07
+data8 0xE4CE68089858AC20, 0x00003FE6 //A9 = 5.3273112263151109935867439775e-08
+data8 0xBD15106FBBAEE593, 0x0000BFE4 //A8 = -1.1006037358336556244645388790e-08
+data8 0x8BBF9A5769B6E480, 0x00003FE2 //A7 = 2.0336075804332107927300019116e-09
+data8 0xB049D845D105E302, 0x0000BFDF //A6 = -3.2066683399502826067820249320e-10
+data8 0xBAC69B3F0DFE5483, 0x00003FDC //A5 = 4.2467901578369360007795282687e-11
+data8 0xA29C398F83F8A0D1, 0x0000BFD9 //A4 = -4.6216613698438694005327544047e-12
+LOCAL_OBJECT_END(_4_to_6p53_data_tail)
+
+LOCAL_OBJECT_START(_3p25_to_4_data_tail)
+// Polynomial coefficients for the erf(x), 3.25 <= |x| < 4
+data8 0x95BE1BEAD738160F, 0x00003FF2 //A13 = 1.4280568455209843005829620687e-04
+data8 0x8108C8FFAC0F0B21, 0x0000BFF4 //A12 = -4.9222685622046459346377033307e-04
+data8 0xD72A7FAEE7832BBE, 0x00003FF4 //A11 = 8.2079319302109644436194651098e-04
+data8 0x823AB4281CA7BBE7, 0x0000BFF5 //A10 = -9.9357079675971109178261577703e-04
+data8 0xFA1232D476048D11, 0x00003FF4 //A9 = 9.5394549599882496825916138915e-04
+data8 0xC463D7AF88025FB2, 0x0000BFF4 //A8 = -7.4916843357898101689031755368e-04
+data8 0xFEBE32B6B379D072, 0x00003FF3 //A7 = 4.8588363901002111193445057206e-04
+data8 0x882829BB68409BF3, 0x0000BFF3 //A6 = -2.5969865184916169002074135516e-04
+data8 0xED2F886E29DAAB09, 0x00003FF1 //A5 = 1.1309894347742479284610149994e-04
+data8 0xA4C07129436555B2, 0x0000BFF0 //A4 = -3.9279872584973887163830479579e-05
+LOCAL_OBJECT_END(_3p25_to_4_data_tail)
+
+
+LOCAL_OBJECT_START(_0_to_1o8_data)
+// Odd-degree polynomial coefficients for the erf(x), 0.0 <= |x| < 0.125
+data8 0x3FF20DD750429B6D, 0x3C71AE3A8DDFFEDE //A1H, A1L (two doubles, read with ldfpd)
+data8 0xF8B0DACE42525CC2, 0x0000BFEE //A15 (this and below: 16-byte entries, read with ldfe)
+data8 0xFCD02E1BF0EC2C37, 0x00003FF1 //A13
+data8 0xE016D968FE473B5E, 0x0000BFF4 //A11
+data8 0xAB2DE68711BF5A79, 0x00003FF7 //A9
+data8 0xDC16718944518309, 0x0000BFF9 //A7
+data8 0xE71790D0215F0C8F, 0x00003FFB //A5
+data8 0xC093A3581BCF3612, 0x0000BFFD //A3
+LOCAL_OBJECT_END(_0_to_1o8_data)
+
+
+LOCAL_OBJECT_START(_denorm_data)
+data8 0x3FF20DD750429B6D //A1H = 1.1283791670955125585606992900e+00 (high double of A1 ~ 2/sqrt(pi))
+data8 0x3C71AE3A914FED80 //A1L = 1.5335459613165880745599768129e-17 (low double of A1; pair read with ldfpd)
+LOCAL_OBJECT_END(_denorm_data)
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(erfl) // erfl entry: argument is read from f8, result is written to f8
+
+{ .mfi
+ alloc r32 = ar.pfs, 0, 21, 0, 0
+ fmerge.se fArgAbsNorm = f1, f8 // Signif. of x with sign/exp of 1.0 (1.0 <= value < 2.0)
+ addl rSignBit = 0x20000, r0 // Mask of the sign bit in getf.exp result
+}
+{ .mlx
+ addl rDataPtr = @ltoff(erfl_data), gp // Get common data ptr
+ movl r1p5 = 0x3FF8000000000000 // 1.5 in dbl repres.
+};;
+
+{ .mfi
+ getf.exp rArgExp = f8 // Get arg sign and biased exponent
+ fclass.m p6,p0 = f8, 0xEF // Filter 0, denormals and specials
+ // 0xEF = @qnan|@snan|@pos|@neg|@zero|@unorm|@inf
+ addl rBias = 0xfffc, r0 // Value to subtract from exp
+ // to get actual interval number (interval 0 is [1/8;1/4])
+}
+{ .mfi
+ ld8 rDataPtr = [rDataPtr] // Get real common data pointer
+ fma.s1 fArgSqr = f8, f8, f0 // x^2 (for [0;1/8] path)
+ addl r2to4 = 0x10000, r0 // biased exponent (as in rArgExp)
+ // for [2;4] binary interval
+};;
+
+{ .mfi
+ getf.sig rArgSig = f8 // Get arg significand
+ fcmp.lt.s1 p15, p14 = f8, f0 // p15: arg is negative, p14: otherwise
+ addl rSaturation = 0xd0e, r0 // First 12 bits of
+ // saturation value (6.53) signif.
+}
+{ .mfi
+ setf.d f1p5 = r1p5 // 1.5 construction
+ fma.s1 f2p0 = f1,f1,f1 // 2.0 construction (1*1+1)
+ addl r3p25Sign = 0xd00, r0 // First 12 bits of
+ // 3.25 value signif.
+};;
+
+{ .mfi
+ addl rTailDataPtr = 0x700, rDataPtr // Pointer to "tail" data
+ nop.f 0
+ andcm rArgExp = rArgExp, rSignBit // Remove sign of exp
+}
+{ .mfb
+ addl rTiny = 0xf000, r0 // Exponent of tiny value for saturation path
+ nop.f 0
+(p6) br.cond.spnt erfl_spec // Branch to zero, denorm & specs
+};;
+
+{ .mfi
+ sub rInterval = rArgExp, rBias // Get actual interval number
+ nop.f 0
+ shr.u rArgSig = rArgSig, 52 // Keep only top 12 bits of signif.
+}
+{ .mfi
+ adds rShiftedDataPtr = 0x10, rDataPtr // Second ptr to data
+ nop.f 0
+ cmp.eq p8, p10 = r2to4, rArgExp // If exp is in 2to4 interval?
+};;
+
+{ .mfi
+(p8) cmp.le p8, p10 = r3p25Sign, rArgSig // If top signif. bits are
+ // at least 0xd00 (1.625)? (means arg is in [3.25;4] interval)
+ nop.f 0
+ shl rOffset = rInterval, 8 // Make offset from
+ // interval number (0x100 bytes per main table)
+}
+{ .mfi
+ cmp.gt p9, p0 = 0x0, rInterval // If interval is less than 0
+ // (means arg is in [0; 1/8])
+ nop.f 0
+ cmp.eq p7, p0 = 0x5, rInterval // If arg is in [4:8] interv.?
+};;
+
+{ .mfi
+(p8) adds rOffset = 0x200, rOffset // Add additional offset
+ // if arg is in [3.25;4] (another data set, two tables further)
+ fma.s1 fArgCube = fArgSqr, f8, f0 // x^3 (for [0;1/8] path)
+ shl rTailOffset = rInterval, 7 // Make offset to "tail" data
+ // from interval number (first part: interval*0x80)
+}
+{ .mib
+ setf.exp fTiny = rTiny // Construct "tiny" value
+ // for saturation path
+ cmp.ltu p11, p0 = 0x5, rInterval // if interval > 5 (|arg| >= 8)
+(p9) br.cond.spnt _0_to_1o8 // Branch to [0;1/8] path
+};;
+
+{ .mfi
+ add rAddr1 = rDataPtr, rOffset // Get address for
+ // interval data
+ nop.f 0
+ shl rTailAddOffset = rInterval, 5 // Offset to interval
+ // "tail" data (second part: interval*0x20)
+}
+{ .mib
+ add rAddr2 = rShiftedDataPtr, rOffset // Get second
+ // address for interval data
+(p7) cmp.leu p11, p0 = rSaturation, rArgSig // if arg is
+ // in [6.53;8] interval
+(p11) br.cond.spnt _saturation // Branch to Saturation path
+};;
+
+{ .mmi
+ ldfe fA3 = [rAddr1], 0x90 // Load A3, then skip ahead to A20
+ ldfpd fA2H, fA2L = [rAddr2], 16 // Load A2High, A2Low
+ add rTailOffset = rTailOffset, rTailAddOffset // "Tail" offset = interval*0xA0
+};;
+
+{ .mmi
+ ldfe fA20 = [rAddr1], 16 // Load A20
+ ldfpd fA1H, fA1L = [rAddr2], 16 // Load A1High, A1Low
+(p8) adds rTailOffset = 0x140, rTailOffset // Additional offset
+ // for [3.25;4] interval
+};;
+
+{ .mmi
+ ldfe fA19 = [rAddr1], 16 // Load A19
+ ldfpd fA0H, fA0L = [rAddr2], 16 // Load A0High, A0Low
+ add rTailAddr1 = rTailDataPtr, rTailOffset // First tail
+ // data address
+};;
+
+.pred.rel "mutex",p8,p10
+{ .mfi
+ ldfe fA18 = [rAddr1], 16 // Load A18
+(p8) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f2p0 // Subtract 2.0
+ // from normalized arg (for [3.25;4] interval)
+ adds rTailAddr2 = 0x10, rTailAddr1 // Second tail
+ // data address
+}
+{ .mfi
+ ldfe fA25 = [rAddr2], 16 // Load A25
+(p10) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f1p5 // Subtract 1.5
+ // from normalized arg
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe fA17 = [rAddr1], 16 // Load A17
+ ldfe fA24 = [rAddr2], 16 // Load A24
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe fA16 = [rAddr1], 16 // Load A16
+ ldfe fA23 = [rAddr2], 16 // Load A23
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe fA15 = [rAddr1], 16 // Load A15
+ ldfe fA22 = [rAddr2], 16 // Load A22
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe fA14 = [rAddr1], 16 // Load A14
+ ldfe fA21 = [rAddr2], 16 // Load A21
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe fA13 = [rTailAddr1], 32 // Load A13
+ fms.s1 fArgAbsNorm2 = fArgAbsNorm, fArgAbsNorm, f0 // x^2 (x is the shifted arg)
+ nop.i 0
+}
+{ .mfi
+ ldfe fA12 = [rTailAddr2], 32 // Load A12
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe fA11 = [rTailAddr1], 32 // Load A11
+ fma.s1 fRes3H = fA3, fArgAbsNorm, fA2H // A3*x+A2H (high part of A3*x+A2)
+ nop.i 0
+}
+{ .mfi
+ ldfe fA10 = [rTailAddr2], 32 // Load A10
+ fma.s1 fTH = fA3, fArgAbsNorm, f0 // A3*x (used for low part of A3*x+A2)
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe fA9 = [rTailAddr1], 32 // Load A9
+ fma.s1 fTT2 = fA1L, fArgAbsNorm, f0 // A1L*x (part of A1*x+A0)
+ nop.i 0
+}
+{ .mfi
+ ldfe fA8 = [rTailAddr2], 32 // Load A8
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe fA7 = [rTailAddr1], 32 // Load A7
+ ldfe fA6 = [rTailAddr2], 32 // Load A6
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe fA5 = [rTailAddr1], 32 // Load A5
+ ldfe fA4 = [rTailAddr2], 32 // Load A4
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 fArgAbsNorm2L = fArgAbsNorm, fArgAbsNorm, fArgAbsNorm2
+ // Low part of x^2 (delta)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fArgAbsNorm4 = fArgAbsNorm2, fArgAbsNorm2, f0 // x^4
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 fRes3L = fA2H, f1, fRes3H // A2H - Res3H (low part of A3*x+A2)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 fArgAbsNorm3 = fArgAbsNorm2, fArgAbsNorm, f0 // x^3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fTH2 = fA1H, fArgAbsNorm, fTT2 // A1H*x + A1L*x (high part of A1*x)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA23 = fA24, fArgAbsNorm, fA23 // Polynomial tail: A23+A24*x
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA21 = fA22, fArgAbsNorm, fA21 // Polynomial tail: A21+A22*x
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA12 = fA13, fArgAbsNorm, fA12 // Polynomial tail: A12+A13*x
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes3L = fRes3L, f1, fTH // + A3*x (low part of A3*x+A2)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA19 = fA20, fArgAbsNorm, fA19 // Polynomial tail: A19+A20*x
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1H = fTH2, f1, fA0H // High part of A1*x+A0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fTL2 = fA1H, fArgAbsNorm, fTH2 // A1H*x - TH2 (low part of A1*x)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA8 = fA9, fArgAbsNorm, fA8 // Polynomial tail: A8+A9*x
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA10 = fA11, fArgAbsNorm, fA10 // Polynomial tail: A10+A11*x
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA16, fArgAbsNorm, fA15 // Polynomial tail: A15+A16*x
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA17 = fA18, fArgAbsNorm, fA17 // Polynomial tail: A17+A18*x
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fms.s1 fArgAbsNorm11 = fArgAbsNorm4, fArgAbsNorm4, f0 // x^8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA4 = fA5, fArgAbsNorm, fA4 // Polynomial tail: A4+A5*x
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes3L = fRes3L, f1, fA2L // + A2L (low part of A3*x+A2)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA6 = fA7, fArgAbsNorm, fA6 // Polynomial tail: A6+A7*x
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fTL2 = fTL2, f1, fTT2 // + A1L*x (low part of A1*x)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fRes1L = fA0H, f1, fRes1H // A0H - Res1H (low part of A1*x+A0)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA23 = fA25, fArgAbsNorm2, fA23 // Polynomial tail: A23..A25
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA12 = fA14, fArgAbsNorm2, fA12 // Polynomial tail: A12..A14
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA19 = fA21, fArgAbsNorm2, fA19 // Polynomial tail: A19..A22
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA8 = fA10, fArgAbsNorm2, fA8 // Polynomial tail: A8..A11
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA17, fArgAbsNorm2, fA15 // Polynomial tail: A15..A18
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fArgAbsNorm11 = fArgAbsNorm11, fArgAbsNorm3, f0 // x^11 = x^8*x^3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fTT = fRes3L, fArgAbsNorm2, f0 // (A3*x+A2)*x^2, low*x^2 term
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA4 = fA6, fArgAbsNorm2, fA4 // Polynomial tail: A4..A7
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1L = fRes1L, f1, fTH2 // + TH2 (low part of A1*x+A0)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA19 = fA23, fArgAbsNorm4, fA19 // Polynomial tail: A19..A25
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA8 = fA12, fArgAbsNorm4, fA8 // Polynomial tail: A8..A14
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fTT = fRes3H, fArgAbsNorm2L, fTT // (A3*x+A2)*x^2, + high*(x^2 delta)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1L = fRes1L, f1, fTL2 // + TL2 (low part of A1*x+A0)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA19, fArgAbsNorm4, fA15 // Polynomial tail: A15..A25
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA4 = fA8, fArgAbsNorm4, fA4 // Polynomial tail: A4..A14
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes2H = fRes3H, fArgAbsNorm2, fTT // (A3*x+A2)*x^2, high part
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1L = fRes1L, f1, fA0L // + A0L (low part of A1*x+A0)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes4 = fA15, fArgAbsNorm11, fA4 // Result of
+ // polynomial tail (A4..A25, relative to x^4)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 fRes2L = fRes3H, fArgAbsNorm2, fRes2H // (A3*x+A2)*x^2, low part
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fResH = fRes2H, f1, fRes1H // High result
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1L = fRes4, fArgAbsNorm4, fRes1L // Low part += (polynomial tail)*x^4
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes2L = fRes2L, f1, fTT // (A3*x+A2)*x^2, low part
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fResL = fRes1H, f1, fResH // Low result: Res1H - ResH
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1L = fRes1L, f1, fRes2L // Low result: sum of low parts
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fResL = fResL, f1, fRes2H // Low result: + Res2H
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p15) fneg fResH = fResH // Invert high result if arg is neg.
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fResL = fResL, f1, fRes1L // Low result
+ nop.i 0
+};;
+
+.pred.rel "mutex",p14,p15
+{ .mfi
+ nop.m 0
+(p14) fma.s0 f8 = fResH, f1, fResL // Arg >= 0: high + low
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p15) fms.s0 f8 = fResH, f1, fResL // Arg < 0: (negated high) - low
+ br.ret.sptk b0 // Main path return
+};;
+
+// saturation path ////////////////////////////////////////////////////////////
+_saturation:
+
+.pred.rel "mutex",p14,p15
+{ .mfi
+ nop.m 0
+(p14) fms.s0 f8 = f1, f1, fTiny // Arg >= 0: saturation result r = 1-tiny
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+(p15) fnma.s0 f8 = f1, f1, fTiny // Arg < 0: saturation result r = tiny-1
+ br.ret.sptk b0 // Saturation path return
+};;
+
+
+// 0, denormals and special IEEE numbers path /////////////////////////////////
+erfl_spec:
+
+{ .mfi
+ addl rDataPtr = 0xBE0, rDataPtr // Ptr to denormals coeffs
+ fclass.m p6,p0 = f8, 0x23 // To filter infinities
+ // 0x23 = @pos|@neg|@inf
+ nop.i 0
+};;
+
+{ .mfi
+ ldfpd fA1H, fA1L = [rDataPtr] // Load denormals coeffs A1H, A1L
+ fclass.m p7,p0 = f8, 0xC7 // To filter NaNs & Zeros
+ // 0xC7 = @pos|@neg|@zero|@qnan|@snan
+ nop.i 0
+};;
+
+{ .mfb
+ nop.m 0
+(p6) fmerge.s f8 = f8, f1 // +/-1 for INF args (sign of x, value 1.0)
+(p6) br.ret.spnt b0 // exit for x = INF
+};;
+
+{ .mfb
+ nop.m 0
+(p7) fma.s0 f8 = f8, f1, f8 // x*1+x: +/-0 for 0 args
+ // and NaNs for NaNs
+(p7) br.ret.spnt b0 // exit for x = NaN or +/-0
+};;
+
+{ .mfi
+ nop.m 0
+ fnorm.s0 f8 = f8 // Normalize arg (only denormals get here)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 fRes1H = f8, fA1H, f0 // HighRes = x*A1H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fRes1L = f8, fA1L, f0 // LowRes = x*A1L
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 fRes1Hd = f8, fA1H, fRes1H // HighRes delta (rounding error of x*A1H)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fRes1L, f1, fRes1Hd // LowRes+HighRes delta
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = f8, f8, fRes // r=x^2+r
+ nop.i 0
+};;
+
+{ .mfb
+ nop.m 0
+ fma.s0 f8 = fRes, f1, fRes1H // res = r+ResHigh
+ br.ret.sptk b0 // Denormals return (0, NaN, INF exited above)
+};;
+
+
+// 0 < |x| < 1/8 path /////////////////////////////////////////////////////////
+_0_to_1o8:
+
+{ .mmi
+ adds rAddr1 = 0xB60, rDataPtr // Ptr 1 to coeffs (starts at A1H, A1L)
+ adds rAddr2 = 0xB80, rDataPtr // Ptr 2 to coeffs (starts at A13)
+ nop.i 0
+};;
+
+{ .mmi
+ ldfpd fA1H, fA1L = [rAddr1], 16 // Load A1High, A1Low
+ ldfe fA13 = [rAddr2], 16 // Load A13
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe fA15 = [rAddr1], 48 // Load A15, then skip A13, A11
+ ldfe fA11 = [rAddr2], 32 // Load A11
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe fA9 = [rAddr1], 32 // Load A9
+ ldfe fA7 = [rAddr2], 32 // Load A7
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe fA5 = [rAddr1] // Load A5
+ ldfe fA3 = [rAddr2] // Load A3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 fRes1H = f8, fA1H, f0 // x*(A1H+A1L): high part x*A1H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fRes1L = f8, fA1L, f0 // x*(A1H+A1L): low part x*A1L
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA13, fArgSqr, fA11 // Polynomial tail: A11+A13*x^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fArgFour = fArgSqr, fArgSqr, f0 // x^4
+ nop.i 0
+};;
+
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA3 = fA5, fArgSqr, fA3 // Polynomial tail: A3+A5*x^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA7 = fA9, fArgSqr, fA7 // Polynomial tail: A7+A9*x^2
+ nop.i 0
+};;
+
+
+{ .mfi
+ nop.m 0
+ fms.s1 fRes1Hd = f8, fA1H, fRes1H // x*(A1H+A1L) delta
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA15, fArgFour, fA11 // Polynomial tail: A11..A15
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA3 = fA7, fArgFour, fA3 // Polynomial tail: A3..A9
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fArgEight = fArgFour, fArgFour, f0 // x^8
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 f8 = fRes1L, f1, fRes1Hd // Low part of x*(A1H+A1L)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fA11, fArgEight, fA3 //Polynomial tail result
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 f8 = fRes, fArgCube, f8 // (Polynomial tail)*x^3 + low part
+ nop.i 0
+};;
+
+{ .mfb
+ nop.m 0
+ fma.s0 f8 = f8, f1, fRes1H // (Polynomial tail)*x^3 +
+ // + x*(A1H+A1L)
+ br.ret.sptk b0 // [0;1/8] interval return
+};;
+
+
+GLOBAL_LIBM_END(erfl)
+
+
diff --git a/sysdeps/ia64/fpu/s_expm1.S b/sysdeps/ia64/fpu/s_expm1.S
index 19a237990c..41b9954ee8 100644
--- a/sysdeps/ia64/fpu/s_expm1.S
+++ b/sysdeps/ia64/fpu/s_expm1.S
@@ -1,10 +1,10 @@
.file "exp_m1.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2002, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,1694 +20,819 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
-// HISTORY
-// 2/02/00 Initial Version
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 02/02/00 Initial Version
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
+// 07/07/01 Improved speed of all paths
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 11/20/02 Improved speed, algorithm based on exp
+
+// API
+//==============================================================
+// double expm1(double)
+
+// Overview of operation
+//==============================================================
+// 1. Inputs of Nan, Inf, Zero, NatVal handled with special paths
+//
+// 2. |x| < 2^-60
+// Result = x, computed by x + x*x to handle appropriate flags and rounding
//
-// *********************************************************************
-//
-// Function: Combined exp(x) and expm1(x), where
-// x
-// exp(x) = e , for double precision x values
-// x
-// expm1(x) = e - 1 for double precision x values
-//
-// *********************************************************************
-//
-// Accuracy: Within .7 ulps for 80-bit floating point values
-// Very accurate for double precision values
-//
-// *********************************************************************
-//
-// Resources Used:
-//
-// Floating-Point Registers: f8 (Input and Return Value)
-// f9,f32-f61, f99-f102
-//
-// General Purpose Registers:
-// r32-r61
-// r62-r65 (Used to pass arguments to error handling routine)
-//
-// Predicate Registers: p6-p15
-//
-// *********************************************************************
-//
-// IEEE Special Conditions:
-//
-// Denormal fault raised on denormal inputs
-// Overflow exceptions raised when appropriate for exp and expm1
-// Underflow exceptions raised when appropriate for exp and expm1
-// (Error Handling Routine called for overflow and Underflow)
-// Inexact raised when appropriate by algorithm
-//
-// exp(inf) = inf
-// exp(-inf) = +0
-// exp(SNaN) = QNaN
-// exp(QNaN) = QNaN
-// exp(0) = 1
-// exp(EM_special Values) = QNaN
-// exp(inf) = inf
-// expm1(-inf) = -1
-// expm1(SNaN) = QNaN
-// expm1(QNaN) = QNaN
-// expm1(0) = 0
-// expm1(EM_special Values) = QNaN
-//
-// *********************************************************************
-//
-// Implementation and Algorithm Notes:
-//
-// ker_exp_64( in_FR : X,
-// in_GR : Flag,
-// in_GR : Expo_Range
-// out_FR : Y_hi,
-// out_FR : Y_lo,
-// out_FR : scale,
-// out_PR : Safe )
-//
-// On input, X is in register format and
-// Flag = 0 for exp,
-// Flag = 1 for expm1,
-//
-// On output, provided X and X_cor are real numbers, then
-//
-// scale*(Y_hi + Y_lo) approximates exp(X) if Flag is 0
-// scale*(Y_hi + Y_lo) approximates exp(X)-1 if Flag is 1
-//
-// The accuracy is sufficient for a highly accurate 64 sig.
-// bit implementation. Safe is set if there is no danger of
-// overflow/underflow when the result is composed from scale,
-// Y_hi and Y_lo. Thus, we can have a fast return if Safe is set.
-// Otherwise, one must prepare to handle the possible exception
-// appropriately. Note that SAFE not set (false) does not mean
-// that overflow/underflow will occur; only the setting of SAFE
-// guarantees the opposite.
-//
-// **** High Level Overview ****
-//
-// The method consists of three cases.
-//
-// If |X| < Tiny use case exp_tiny;
-// else if |X| < 2^(-6) use case exp_small;
-// else use case exp_regular;
-//
-// Case exp_tiny:
-//
-// 1 + X can be used to approximate exp(X) or exp(X+X_cor);
-// X + X^2/2 can be used to approximate exp(X) - 1
-//
-// Case exp_small:
-//
-// Here, exp(X), exp(X+X_cor), and exp(X) - 1 can all be
-// appproximated by a relatively simple polynomial.
-//
-// This polynomial resembles the truncated Taylor series
-//
-// exp(w) = 1 + w + w^2/2! + w^3/3! + ... + w^n/n!
-//
-// Case exp_regular:
-//
-// Here we use a table lookup method. The basic idea is that in
-// order to compute exp(X), we accurately decompose X into
-//
-// X = N * log(2)/(2^12) + r, |r| <= log(2)/2^13.
-//
-// Hence
-//
-// exp(X) = 2^( N / 2^12 ) * exp(r).
-//
-// The value 2^( N / 2^12 ) is obtained by simple combinations
-// of values calculated beforehand and stored in table; exp(r)
-// is approximated by a short polynomial because |r| is small.
-//
-// We elaborate this method in 4 steps.
-//
-// Step 1: Reduction
-//
-// The value 2^12/log(2) is stored as a double-extended number
-// L_Inv.
-//
-// N := round_to_nearest_integer( X * L_Inv )
-//
-// The value log(2)/2^12 is stored as two numbers L_hi and L_lo so
-// that r can be computed accurately via
-//
-// r := (X - N*L_hi) - N*L_lo
-//
-// We pick L_hi such that N*L_hi is representable in 64 sig. bits
-// and thus the FMA X - N*L_hi is error free. So r is the
-// 1 rounding error from an exact reduction with respect to
-//
-// L_hi + L_lo.
-//
-// In particular, L_hi has 30 significant bit and can be stored
-// as a double-precision number; L_lo has 64 significant bits and
-// stored as a double-extended number.
-//
-// In the case Flag = 2, we further modify r by
-//
-// r := r + X_cor.
-//
-// Step 2: Approximation
-//
-// exp(r) - 1 is approximated by a short polynomial of the form
-//
-// r + A_1 r^2 + A_2 r^3 + A_3 r^4 .
-//
-// Step 3: Composition from Table Values
-//
-// The value 2^( N / 2^12 ) can be composed from a couple of tables
-// of precalculated values. First, express N as three integers
-// K, M_1, and M_2 as
-//
-// N = K * 2^12 + M_1 * 2^6 + M_2
-//
-// Where 0 <= M_1, M_2 < 2^6; and K can be positive or negative.
-// When N is represented in 2's complement, M_2 is simply the 6
-// lsb's, M_1 is the next 6, and K is simply N shifted right
-// arithmetically (sign extended) by 12 bits.
-//
-// Now, 2^( N / 2^12 ) is simply
-//
-// 2^K * 2^( M_1 / 2^6 ) * 2^( M_2 / 2^12 )
-//
-// Clearly, 2^K needs no tabulation. The other two values are less
-// trivial because if we store each accurately to more than working
-// precision, than its product is too expensive to calculate. We
-// use the following method.
-//
-// Define two mathematical values, delta_1 and delta_2, implicitly
-// such that
-//
-// T_1 = exp( [M_1 log(2)/2^6] - delta_1 )
-// T_2 = exp( [M_2 log(2)/2^12] - delta_2 )
-//
-// are representable as 24 significant bits. To illustrate the idea,
-// we show how we define delta_1:
-//
-// T_1 := round_to_24_bits( exp( M_1 log(2)/2^6 ) )
-// delta_1 = (M_1 log(2)/2^6) - log( T_1 )
-//
-// The last equality means mathematical equality. We then tabulate
-//
-// W_1 := exp(delta_1) - 1
-// W_2 := exp(delta_2) - 1
-//
-// Both in double precision.
-//
-// From the tabulated values T_1, T_2, W_1, W_2, we compose the values
-// T and W via
+// 3. 2^-60 <= |x| < 2^-2
+// Result determined by 13th order Taylor series polynomial
+// expm1(x) = x + Q2*x^2 + ... + Q13*x^13
//
-// T := T_1 * T_2 ...exactly
-// W := W_1 + (1 + W_1)*W_2
+// 4. x <= -48.0
+// Here we know result is essentially -1 + eps, where eps only affects
+// rounded result. Set I.
//
-// W approximates exp( delta ) - 1 where delta = delta_1 + delta_2.
-// The mathematical product of T and (W+1) is an accurate representation
-// of 2^(M_1/2^6) * 2^(M_2/2^12).
+// 5. x >= 709.7827
+// Result overflows. Set I, O, and call error support
//
-// Step 4. Reconstruction
-//
-// Finally, we can reconstruct exp(X), exp(X) - 1.
-// Because
-//
-// X = K * log(2) + (M_1*log(2)/2^6 - delta_1)
-// + (M_2*log(2)/2^12 - delta_2)
-// + delta_1 + delta_2 + r ...accurately
-// We have
-//
-// exp(X) ~=~ 2^K * ( T + T*[exp(delta_1+delta_2+r) - 1] )
-// ~=~ 2^K * ( T + T*[exp(delta + r) - 1] )
-// ~=~ 2^K * ( T + T*[(exp(delta)-1)
-// + exp(delta)*(exp(r)-1)] )
-// ~=~ 2^K * ( T + T*( W + (1+W)*poly(r) ) )
-// ~=~ 2^K * ( Y_hi + Y_lo )
-//
-// where Y_hi = T and Y_lo = T*(W + (1+W)*poly(r))
-//
-// For exp(X)-1, we have
-//
-// exp(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1
-// ~=~ 2^K * ( Y_hi + Y_lo - 2^(-K) )
-//
-// and we combine Y_hi + Y_lo - 2^(-N) into the form of two
-// numbers Y_hi + Y_lo carefully.
-//
-// **** Algorithm Details ****
-//
-// A careful algorithm must be used to realize the mathematical ideas
-// accurately. We describe each of the three cases. We assume SAFE
-// is preset to be TRUE.
-//
-// Case exp_tiny:
-//
-// The important points are to ensure an accurate result under
-// different rounding directions and a correct setting of the SAFE
-// flag.
-//
-// If Flag is 1, then
-// SAFE := False ...possibility of underflow
-// Scale := 1.0
-// Y_hi := X
-// Y_lo := 2^(-17000)
-// Else
-// Scale := 1.0
-// Y_hi := 1.0
-// Y_lo := X ...for different rounding modes
-// Endif
-//
-// Case exp_small:
-//
-// Here we compute a simple polynomial. To exploit parallelism, we split
-// the polynomial into several portions.
-//
-// Let r = X
-//
-// If Flag is not 1 ...i.e. exp( argument )
-//
-// rsq := r * r;
-// r4 := rsq*rsq
-// poly_lo := P_3 + r*(P_4 + r*(P_5 + r*P_6))
-// poly_hi := r + rsq*(P_1 + r*P_2)
-// Y_lo := poly_hi + r4 * poly_lo
-// set lsb(Y_lo) to 1
-// Y_hi := 1.0
-// Scale := 1.0
-//
-// Else ...i.e. exp( argument ) - 1
-//
-// rsq := r * r
-// r4 := rsq * rsq
-// r6 := rsq * r4
-// poly_lo := r6*(Q_5 + r*(Q_6 + r*Q_7))
-// poly_hi := Q_1 + r*(Q_2 + r*(Q_3 + r*Q_4))
-// Y_lo := rsq*poly_hi + poly_lo
-// set lsb(Y_lo) to 1
-// Y_hi := X
-// Scale := 1.0
-//
-// Endif
-//
-// Case exp_regular:
-//
-// The previous description contain enough information except the
-// computation of poly and the final Y_hi and Y_lo in the case for
-// exp(X)-1.
-//
-// The computation of poly for Step 2:
-//
-// rsq := r*r
-// poly := r + rsq*(A_1 + r*(A_2 + r*A_3))
-//
-// For the case exp(X) - 1, we need to incorporate 2^(-K) into
-// Y_hi and Y_lo at the end of Step 4.
-//
-// If K > 10 then
-// Y_lo := Y_lo - 2^(-K)
-// Else
-// If K < -10 then
-// Y_lo := Y_hi + Y_lo
-// Y_hi := -2^(-K)
-// Else
-// Y_hi := Y_hi - 2^(-K)
-// End If
-// End If
-//
-
-#include "libm_support.h"
-
-GR_SAVE_PFS = r59
-GR_SAVE_B0 = r60
-GR_SAVE_GP = r61
-
-GR_Parameter_X = r62
-GR_Parameter_Y = r63
-GR_Parameter_RESULT = r64
-
-FR_X = f9
-FR_Y = f1
-FR_RESULT = f99
-
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
-
-.align 64
-Constants_exp_64_Arg:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_Arg,@object)
-data4 0x5C17F0BC,0xB8AA3B29,0x0000400B,0x00000000
-data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000
-data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000
-// /* Inv_L, L_hi, L_lo */
-ASM_SIZE_DIRECTIVE(Constants_exp_64_Arg)
-
-.align 64
-Constants_exp_64_Exponents:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_Exponents,@object)
-data4 0x0000007E,0x00000000,0xFFFFFF83,0xFFFFFFFF
-data4 0x000003FE,0x00000000,0xFFFFFC03,0xFFFFFFFF
-data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF
-data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF
-data4 0xFFFFFFE2,0xFFFFFFFF,0xFFFFFFC4,0xFFFFFFFF
-data4 0xFFFFFFBA,0xFFFFFFFF,0xFFFFFFBA,0xFFFFFFFF
-ASM_SIZE_DIRECTIVE(Constants_exp_64_Exponents)
-
-.align 64
-Constants_exp_64_A:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_A,@object)
-data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000
-data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000
-data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000
-// /* Reversed */
-ASM_SIZE_DIRECTIVE(Constants_exp_64_A)
-
-.align 64
-Constants_exp_64_P:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_P,@object)
-data4 0x43914A8A,0xD00D6C81,0x00003FF2,0x00000000
-data4 0x30304B30,0xB60BC4AC,0x00003FF5,0x00000000
-data4 0x7474C518,0x88888888,0x00003FF8,0x00000000
-data4 0x8DAE729D,0xAAAAAAAA,0x00003FFA,0x00000000
-data4 0xAAAAAF61,0xAAAAAAAA,0x00003FFC,0x00000000
-data4 0x000004C7,0x80000000,0x00003FFE,0x00000000
-// /* Reversed */
-ASM_SIZE_DIRECTIVE(Constants_exp_64_P)
-
-.align 64
-Constants_exp_64_Q:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_Q,@object)
-data4 0xA49EF6CA,0xD00D56F7,0x00003FEF,0x00000000
-data4 0x1C63493D,0xD00D59AB,0x00003FF2,0x00000000
-data4 0xFB50CDD2,0xB60B60B5,0x00003FF5,0x00000000
-data4 0x7BA68DC8,0x88888888,0x00003FF8,0x00000000
-data4 0xAAAAAC8D,0xAAAAAAAA,0x00003FFA,0x00000000
-data4 0xAAAAACCA,0xAAAAAAAA,0x00003FFC,0x00000000
-data4 0x00000000,0x80000000,0x00003FFE,0x00000000
-// /* Reversed */
-ASM_SIZE_DIRECTIVE(Constants_exp_64_Q)
-
-.align 64
-Constants_exp_64_T1:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_T1,@object)
-data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
-data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
-data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC
-data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D
-data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA
-data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516
-data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A
-data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4
-data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B
-data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD
-data4 0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15
-data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B
-data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5
-data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A
-data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177
-data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C
-ASM_SIZE_DIRECTIVE(Constants_exp_64_T1)
-
-.align 64
-Constants_exp_64_T2:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_T2,@object)
-data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
-data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
-data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
-data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
-data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
-data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
-data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
-data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
-data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
-data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
-data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
-data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
-data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
-data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
-data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
-data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37
-ASM_SIZE_DIRECTIVE(Constants_exp_64_T2)
-
-.align 64
-Constants_exp_64_W1:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_W1,@object)
-data4 0x00000000,0x00000000,0x171EC4B4,0xBE384454
-data4 0x4AA72766,0xBE694741,0xD42518F8,0xBE5D32B6
-data4 0x3A319149,0x3E68D96D,0x62415F36,0xBE68F4DA
-data4 0xC9C86A3B,0xBE6DDA2F,0xF49228FE,0x3E6B2E50
-data4 0x1188B886,0xBE49C0C2,0x1A4C2F1F,0x3E64BFC2
-data4 0x2CB98B54,0xBE6A2FBB,0x9A55D329,0x3E5DC5DE
-data4 0x39A7AACE,0x3E696490,0x5C66DBA5,0x3E54728B
-data4 0xBA1C7D7D,0xBE62B0DB,0x09F1AF5F,0x3E576E04
-data4 0x1A0DD6A1,0x3E612500,0x795FBDEF,0xBE66A419
-data4 0xE1BD41FC,0xBE5CDE8C,0xEA54964F,0xBE621376
-data4 0x476E76EE,0x3E6370BE,0x3427EB92,0x3E390D1A
-data4 0x2BF82BF8,0x3E1336DE,0xD0F7BD9E,0xBE5FF1CB
-data4 0x0CEB09DD,0xBE60A355,0x0980F30D,0xBE5CA37E
-data4 0x4C082D25,0xBE5C541B,0x3B467D29,0xBE5BBECA
-data4 0xB9D946C5,0xBE400D8A,0x07ED374A,0xBE5E2A08
-data4 0x365C8B0A,0xBE66CB28,0xD3403BCA,0x3E3AAD5B
-data4 0xC7EA21E0,0x3E526055,0xE72880D6,0xBE442C75
-data4 0x85222A43,0x3E58B2BB,0x522C42BF,0xBE5AAB79
-data4 0x469DC2BC,0xBE605CB4,0xA48C40DC,0xBE589FA7
-data4 0x1AA42614,0xBE51C214,0xC37293F4,0xBE48D087
-data4 0xA2D673E0,0x3E367A1C,0x114F7A38,0xBE51BEBB
-data4 0x661A4B48,0xBE6348E5,0x1D3B9962,0xBDF52643
-data4 0x35A78A53,0x3E3A3B5E,0x1CECD788,0xBE46C46C
-data4 0x7857D689,0xBE60B7EC,0xD14F1AD7,0xBE594D3D
-data4 0x4C9A8F60,0xBE4F9C30,0x02DFF9D2,0xBE521873
-data4 0x55E6D68F,0xBE5E4C88,0x667F3DC4,0xBE62140F
-data4 0x3BF88747,0xBE36961B,0xC96EC6AA,0x3E602861
-data4 0xD57FD718,0xBE3B5151,0xFC4A627B,0x3E561CD0
-data4 0xCA913FEA,0xBE3A5217,0x9A5D193A,0x3E40A3CC
-data4 0x10A9C312,0xBE5AB713,0xC5F57719,0x3E4FDADB
-data4 0xDBDF59D5,0x3E361428,0x61B4180D,0x3E5DB5DB
-data4 0x7408D856,0xBE42AD5F,0x31B2B707,0x3E2A3148
-ASM_SIZE_DIRECTIVE(Constants_exp_64_W1)
-
-.align 64
-Constants_exp_64_W2:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_W2,@object)
-data4 0x00000000,0x00000000,0x37A3D7A2,0xBE641F25
-data4 0xAD028C40,0xBE68DD57,0xF212B1B6,0xBE5C77D8
-data4 0x1BA5B070,0x3E57878F,0x2ECAE6FE,0xBE55A36A
-data4 0x569DFA3B,0xBE620608,0xA6D300A3,0xBE53B50E
-data4 0x223F8F2C,0x3E5B5EF2,0xD6DE0DF4,0xBE56A0D9
-data4 0xEAE28F51,0xBE64EEF3,0x367EA80B,0xBE5E5AE2
-data4 0x5FCBC02D,0x3E47CB1A,0x9BDAFEB7,0xBE656BA0
-data4 0x805AFEE7,0x3E6E70C6,0xA3415EBA,0xBE6E0509
-data4 0x49BFF529,0xBE56856B,0x00508651,0x3E66DD33
-data4 0xC114BC13,0x3E51165F,0xC453290F,0x3E53333D
-data4 0x05539FDA,0x3E6A072B,0x7C0A7696,0xBE47CD87
-data4 0xEB05C6D9,0xBE668BF4,0x6AE86C93,0xBE67C3E3
-data4 0xD0B3E84B,0xBE533904,0x556B53CE,0x3E63E8D9
-data4 0x63A98DC8,0x3E212C89,0x032A7A22,0xBE33138F
-data4 0xBC584008,0x3E530FA9,0xCCB93C97,0xBE6ADF82
-data4 0x8370EA39,0x3E5F9113,0xFB6A05D8,0x3E5443A4
-data4 0x181FEE7A,0x3E63DACD,0xF0F67DEC,0xBE62B29D
-data4 0x3DDE6307,0x3E65C483,0xD40A24C1,0x3E5BF030
-data4 0x14E437BE,0x3E658B8F,0xED98B6C7,0xBE631C29
-data4 0x04CF7C71,0x3E6335D2,0xE954A79D,0x3E529EED
-data4 0xF64A2FB8,0x3E5D9257,0x854ED06C,0xBE6BED1B
-data4 0xD71405CB,0x3E5096F6,0xACB9FDF5,0xBE3D4893
-data4 0x01B68349,0xBDFEB158,0xC6A463B9,0x3E628D35
-data4 0xADE45917,0xBE559725,0x042FC476,0xBE68C29C
-data4 0x01E511FA,0xBE67593B,0x398801ED,0xBE4A4313
-data4 0xDA7C3300,0x3E699571,0x08062A9E,0x3E5349BE
-data4 0x755BB28E,0x3E5229C4,0x77A1F80D,0x3E67E426
-data4 0x6B69C352,0xBE52B33F,0x084DA57F,0xBE6B3550
-data4 0xD1D09A20,0xBE6DB03F,0x2161B2C1,0xBE60CBC4
-data4 0x78A2B771,0x3E56ED9C,0x9D0FA795,0xBE508E31
-data4 0xFD1A54E9,0xBE59482A,0xB07FD23E,0xBE2A17CE
-data4 0x17365712,0x3E68BF5C,0xB3785569,0x3E3956F9
-ASM_SIZE_DIRECTIVE(Constants_exp_64_W2)
+// 6. 2^-2 <= x < 709.7827 or -48.0 < x <= -2^-2
+// This is the main path. The algorithm is described below:
-.section .text
-.proc expm1#
-.global expm1#
-.align 64
-
-expm1:
-#ifdef _LIBC
-.global __expm1#
-__expm1:
-#endif
-
-
-{ .mii
- alloc r32 = ar.pfs,0,30,4,0
-(p0) add r33 = 1, r0
-(p0) cmp.eq.unc p7, p0 = r0, r0
-}
-;;
-
-
-//
-// Set p7 true for expm1
-// Set Flag = r33 = 1 for expm1
-// These are really no longer necesary, but are a remnant
-// when this file had multiple entry points.
-// They should be carefully removed
+// Take the input x. w is "how many log2/128 in x?"
+// w = x * 128/log2
+// n = int(w)
+// x = n log2/128 + r + delta
+
+// n = 128M + index_1 + 2^4 index_2
+// x = M log2 + (log2/128) index_1 + (log2/8) index_2 + r + delta
+
+// exp(x) = 2^M 2^(index_1/128) 2^(index_2/8) exp(r) exp(delta)
+// Construct 2^M
+// Get 2^(index_1/128) from table_1;
+// Get 2^(index_2/8) from table_2;
+// Calculate exp(r) by a 5th order polynomial
+// r = x - n (log2/128)_high
+// delta = - n (log2/128)_low
+// Calculate exp(delta) as 1 + delta
+
+
+// Special values
+//==============================================================
+// expm1(+0) = +0.0
+// expm1(-0) = -0.0
+
+// expm1(+qnan) = +qnan
+// expm1(-qnan) = -qnan
+// expm1(+snan) = +qnan
+// expm1(-snan) = -qnan
+
+// expm1(-inf) = -1.0
+// expm1(+inf) = +inf
+
+// Overflow and Underflow
+//=======================
+// expm1(x) = largest double normal when
+// x = 709.7827 = 40862e42fefa39ef
+//
+// Underflow is handled as described in case 2 above.
+
+
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input
+// f9 -> f15, f32 -> f75
+
+// General registers used:
+// r14 -> r40
+
+// Predicate registers used:
+// p6 -> p15
+
+// Assembly macros
+//==============================================================
+
+rRshf = r14 // bit pattern of 1.1000 * 2^63 (right-shift constant, moved into fRSHF)
+rAD_TB1 = r15 // address of exp_Table_1; advanced by post-increment loads
+rAD_T1 = r15 // same register as rAD_TB1
+rAD_TB2 = r16 // address of exp_Table_2 (rAD_TB1 + 0x140)
+rAD_T2 = r16 // same register as rAD_TB2
+rAD_Ln2_lo = r17 // address of the ln2/128 lo entry (rAD_TB1 + 0x30)
+rAD_P = r17 // address of exp_p_table (rAD_TB2 + 0x80); reuses r17
+
+rN = r18
+rIndex_1 = r19
+rIndex_2_16 = r20
+
+rM = r21
+rBiased_M = r21 // same register as rM
+rIndex_1_16 = r22
+rSignexp_x = r23 // sign and exponent field of x, from getf.exp
+rExp_x = r24 // exponent of x: masked biased value, then bias subtracted
+rSig_inv_ln2 = r25 // significand of 1/ln2 (0xb8aa3b295c17f0bc)
+
+rAD_Q1 = r26 // address of exp_Q1_table (rAD_TB1 + 0x1e0)
+rAD_Q2 = r27 // address of exp_Q2_table (rAD_Q1 + 0x50)
+rTmp = r27 // same register as rAD_Q2
+rExp_bias = r28 // 0xffff, register-format exponent bias
+rExp_mask = r29 // 0x1ffff, mask for the exponent field of getf.exp
+rRshf_2to56 = r30 // bit pattern of 1.1000 * 2^(63+56), moved into fRSHF_2TO56
+
+rGt_ln = r31
+rExp_2tom56 = r31 // 0xffff-56, exponent used to form 2^-56; same register as rGt_ln
+
+
+GR_SAVE_B0 = r33
+GR_SAVE_PFS = r34
+GR_SAVE_GP = r35
+GR_SAVE_SP = r36
+
+GR_Parameter_X = r37
+GR_Parameter_Y = r38
+GR_Parameter_RESULT = r39
+GR_Parameter_TAG = r40
+
+
+FR_X = f10
+FR_Y = f1
+FR_RESULT = f8
+
+fRSHF_2TO56 = f6 // 1.1000 * 2^(63+56), set from rRshf_2to56
+fINV_LN2_2TO63 = f7 // 1/ln2 * 2^63, set from rSig_inv_ln2 with setf.sig
+fW_2TO56_RSH = f9 // fNormX * fINV_LN2_2TO63 + fRSHF_2TO56
+f2TOM56 = f11 // 2^-56, used to rescale fW_2TO56_RSH when forming fNfloat
+fP5 = f12 // P5 from exp_p_table
+fP54 = f50
+fP5432 = f50 // same register as fP54
+fP4 = f13 // P4 from exp_p_table
+fP3 = f14 // P3 from exp_p_table
+fP32 = f14 // same register as fP3
+fP2 = f15 // P2 from exp_p_table
+
+fLn2_by_128_hi = f33 // ln2/128 hi, loaded with ldfe from exp_Table_1
+fLn2_by_128_lo = f34 // ln2/128 lo, loaded with ldfe from exp_Table_1
+
+fRSHF = f35 // 1.1000 * 2^63, set from rRshf
+fNfloat = f36 // fW_2TO56_RSH * 2^-56 - fRSHF, i.e. N as a floating-point value
+fW = f37
+fR = f38
+fF = f39
+
+fRsq = f40
+fRcube = f41
+
+f2M = f42
+fS1 = f43
+fT1 = f44
+
+fMIN_DBL_OFLOW_ARG = f45 // first double in exp_Table_1
+fMAX_DBL_MINUS_1_ARG = f46 // second double in exp_Table_1 (-48.0)
+fMAX_DBL_NORM_ARG = f47 // third double in exp_Table_1
+fP_lo = f51
+fP_hi = f52
+fP = f53
+fS = f54
+
+fNormX = f56 // fnorm.s1 of the input f8
+
+fWre_urm_f8 = f57
+
+fGt_pln = f58
+fTmp = f58 // same register as fGt_pln
+
+fS2 = f59
+fT2 = f60
+fSm1 = f61
+
+fXsq = f62 // fNormX * fNormX, for the small path
+fX6 = f63
+fX4 = f63 // same register as fX6
+fQ7 = f64 // Q7 from exp_Q2_table; f64 is reused by the fQ76.. names below
+fQ76 = f64
+fQ7654 = f64
+fQ765432 = f64
+fQ6 = f65 // Q6 from exp_Q2_table
+fQ5 = f66 // Q5 from exp_Q1_table
+fQ54 = f66 // same register as fQ5
+fQ4 = f67 // Q4 from exp_Q2_table
+fQ3 = f68
+fQ32 = f68 // same register as fQ3
+fQ2 = f69
+fQD = f70 // QD from exp_Q1_table; f70 is reused by the fQDC.. names below
+fQDC = f70
+fQDCBA = f70
+fQDCBA98 = f70
+fQDCBA98765432 = f70
+fQC = f71 // QC from exp_Q1_table
+fQB = f72 // QB from exp_Q2_table
+fQBA = f72 // same register as fQB
+fQA = f73 // QA from exp_Q2_table
+fQ9 = f74 // Q9 from exp_Q1_table
+fQ98 = f74 // same register as fQ9
+fQ8 = f75 // Q8 from exp_Q1_table
+
+// Data tables
+//==============================================================
+
+RODATA
+.align 16
+
+// ************* DO NOT CHANGE ORDER OF THESE TABLES ********************
+
+// double-extended 1/ln(2)
+// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88
+// 3fff b8aa 3b29 5c17 f0bc
+// For speed the significand will be loaded directly with a movl and setf.sig
+// and the exponent will be bias+63 instead of bias+0. Thus subsequent
+// computations need to scale appropriately.
+// The constant 128/ln(2) is needed for the computation of w. This is also
+// obtained by scaling the computations.
+//
+// Two shifting constants are loaded directly with movl and setf.d.
+// 1. fRSHF_2TO56 = 1.1000..00 * 2^(63-7)
+// This constant is added to x*1/ln2 to shift the integer part of
+// x*128/ln2 into the rightmost bits of the significand.
+// The result of this fma is fW_2TO56_RSH.
+// 2. fRSHF = 1.1000..00 * 2^(63)
+// This constant is subtracted from fW_2TO56_RSH * 2^(-56) to give
+// the integer part of w, n, as a floating-point number.
+// The result of this fms is fNfloat.
+
+
+LOCAL_OBJECT_START(exp_Table_1)
+data8 0x40862e42fefa39f0 // smallest dbl overflow arg (loaded as fMIN_DBL_OFLOW_ARG)
+data8 0xc048000000000000 // approx largest arg for minus one result: -48.0 (fMAX_DBL_MINUS_1_ARG)
+data8 0x40862e42fefa39ef // largest dbl arg to give normal dbl result (fMAX_DBL_NORM_ARG)
+data8 0x0 // pad, keeps the 16-byte entries below at offset 0x20
+data8 0xb17217f7d1cf79ab , 0x00003ff7 // ln2/128 hi, offset 0x20: significand, then exponent
+data8 0xc9e3b39803f2f6af , 0x00003fb7 // ln2/128 lo, offset 0x30 (addressed via rAD_Ln2_lo)
+//
+// Table 1 is 2^(index_1/128) where
+// index_1 goes from 0 to 15
+// Each entry is 16 bytes (significand, then exponent), starting at offset 0x40.
+data8 0x8000000000000000 , 0x00003FFF // index_1 = 0 (1.0)
+data8 0x80B1ED4FD999AB6C , 0x00003FFF // index_1 = 1
+data8 0x8164D1F3BC030773 , 0x00003FFF // index_1 = 2
+data8 0x8218AF4373FC25EC , 0x00003FFF // index_1 = 3
+data8 0x82CD8698AC2BA1D7 , 0x00003FFF // index_1 = 4
+data8 0x8383594EEFB6EE37 , 0x00003FFF // index_1 = 5
+data8 0x843A28C3ACDE4046 , 0x00003FFF // index_1 = 6
+data8 0x84F1F656379C1A29 , 0x00003FFF // index_1 = 7
+data8 0x85AAC367CC487B15 , 0x00003FFF // index_1 = 8
+data8 0x8664915B923FBA04 , 0x00003FFF // index_1 = 9
+data8 0x871F61969E8D1010 , 0x00003FFF // index_1 = 10
+data8 0x87DB357FF698D792 , 0x00003FFF // index_1 = 11
+data8 0x88980E8092DA8527 , 0x00003FFF // index_1 = 12
+data8 0x8955EE03618E5FDD , 0x00003FFF // index_1 = 13
+data8 0x8A14D575496EFD9A , 0x00003FFF // index_1 = 14
+data8 0x8AD4C6452C728924 , 0x00003FFF // index_1 = 15
+LOCAL_OBJECT_END(exp_Table_1)
+
+// Table 2 is 2^(index_2/8) where
+// index_2 goes from 0 to 7
+LOCAL_OBJECT_START(exp_Table_2)
+data8 0x8000000000000000 , 0x00003FFF // index_2 = 0 (1.0); entries are significand, then exponent
+data8 0x8B95C1E3EA8BD6E7 , 0x00003FFF // index_2 = 1
+data8 0x9837F0518DB8A96F , 0x00003FFF // index_2 = 2
+data8 0xA5FED6A9B15138EA , 0x00003FFF // index_2 = 3
+data8 0xB504F333F9DE6484 , 0x00003FFF // index_2 = 4 (sqrt(2))
+data8 0xC5672A115506DADD , 0x00003FFF // index_2 = 5
+data8 0xD744FCCAD69D6AF4 , 0x00003FFF // index_2 = 6
+data8 0xEAC0C6E7DD24392F , 0x00003FFF // index_2 = 7
+LOCAL_OBJECT_END(exp_Table_2)
+
+
+LOCAL_OBJECT_START(exp_p_table)
+data8 0x3f8111116da21757 //P5, double, close to 1/5!; loaded with P4 by one ldfpd
+data8 0x3fa55555d787761c //P4, double, close to 1/4!
+data8 0x3fc5555555555414 //P3, double, close to 1/3!; loaded with P2 by one ldfpd
+data8 0x3fdffffffffffd6a //P2, double, close to 1/2!
+LOCAL_OBJECT_END(exp_p_table)
+
+LOCAL_OBJECT_START(exp_Q1_table)
+data8 0x3de6124613a86d09 // QD = 1/13! (double; loaded with QC by one ldfpd)
+data8 0x3e21eed8eff8d898 // QC = 1/12! (double)
+data8 0x3ec71de3a556c734 // Q9 = 1/9! (double; loaded with Q8 by one ldfpd)
+data8 0x3efa01a01a01a01a // Q8 = 1/8! (double)
+data8 0x8888888888888889,0x3ff8 // Q5 = 1/5! (significand, exponent; loaded with ldfe)
+data8 0xaaaaaaaaaaaaaaab,0x3ffc // Q3 = 1/3! (significand, exponent)
+data8 0x0,0x0 // Pad to avoid bank conflicts; brings the table to 0x50 bytes
+LOCAL_OBJECT_END(exp_Q1_table)
+
+LOCAL_OBJECT_START(exp_Q2_table)
+data8 0x3e5ae64567f544e4 // QB = 1/11! (double; loaded with QA by one ldfpd)
+data8 0x3e927e4fb7789f5c // QA = 1/10! (double)
+data8 0x3f2a01a01a01a01a // Q7 = 1/7! (double; loaded with Q6 by one ldfpd)
+data8 0x3f56c16c16c16c17 // Q6 = 1/6! (double)
+data8 0xaaaaaaaaaaaaaaab,0x3ffa // Q4 = 1/4! (significand, exponent; loaded with ldfe)
+data8 0x8000000000000000,0x3ffe // Q2 = 1/2! (significand, exponent)
+LOCAL_OBJECT_END(exp_Q2_table)
+.section .text
+GLOBAL_IEEE754_ENTRY(expm1)
-{ .mfi
-(p0) add r32 = 1,r0
-(p0) fnorm.s1 f9 = f8
- nop.i 999
+{ .mlx
+ getf.exp rSignexp_x = f8 // Must recompute if x unorm
+ movl rSig_inv_ln2 = 0xb8aa3b295c17f0bc // signif of 1/ln2
}
-
-
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p6, p8 = f8, 0x1E7
- nop.i 999
+{ .mlx
+ addl rAD_TB1 = @ltoff(exp_Table_1), gp
+ movl rRshf_2to56 = 0x4768000000000000 // 1.10000 2^(63+56)
}
+;;
+// We do this fnorm right at the beginning to normalize
+// any input unnormals so that SWA is not taken.
{ .mfi
- nop.m 999
-(p0) fclass.nm.unc p9, p0 = f8, 0x1FF
- nop.i 999
+ ld8 rAD_TB1 = [rAD_TB1]
+ fclass.m p6,p0 = f8,0x0b // Test for x=unorm
+ mov rExp_mask = 0x1ffff
}
-
{ .mfi
- nop.m 999
-(p0) mov f36 = f1
- nop.i 999 ;;
-}
-
-//
-// Identify NatVals, NaNs, Infs, and Zeros.
-// Identify EM unsupporteds.
-// Save special input registers
-//
-// Create FR_X_cor = 0.0
-// GR_Flag = 0
-// GR_Expo_Range = 1
-// FR_Scale = 1.0
-//
-
-{ .mfb
- nop.m 999
-(p0) mov f32 = f0
-(p6) br.cond.spnt EXP_64_SPECIAL ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p9) br.cond.spnt EXP_64_UNSUPPORTED ;;
-}
-
-//
-// Branch out for special input values
-//
-
-{ .mfi
-(p0) cmp.ne.unc p12, p13 = 0x01, r33
-(p0) fcmp.lt.unc.s0 p9,p0 = f8, f0
-(p0) cmp.eq.unc p15, p0 = r0, r0
-}
-
-//
-// Raise possible denormal operand exception
-// Normalize x
-//
-// This function computes exp( x + x_cor)
-// Input FR 1: FR_X
-// Input FR 2: FR_X_cor
-// Input GR 1: GR_Flag
-// Input GR 2: GR_Expo_Range
-// Output FR 3: FR_Y_hi
-// Output FR 4: FR_Y_lo
-// Output FR 5: FR_Scale
-// Output PR 1: PR_Safe
-
-//
-// Prepare to load constants
-// Set Safe = True
-//
-
-{ .mmi
-(p0) addl r34 = @ltoff(Constants_exp_64_Arg#), gp
-(p0) addl r40 = @ltoff(Constants_exp_64_W1#), gp
-(p0) addl r41 = @ltoff(Constants_exp_64_W2#), gp
-}
-;;
-
-{ .mmi
- ld8 r34 = [r34]
- ld8 r40 = [r40]
-(p0) addl r50 = @ltoff(Constants_exp_64_T1#), gp
-}
-;;
-
-
-{ .mmi
- ld8 r41 = [r41]
-(p0) ldfe f37 = [r34],16
-(p0) addl r51 = @ltoff(Constants_exp_64_T2#), gp
-}
-;;
-
-//
-// N = fcvt.fx(float_N)
-// Set p14 if -6 > expo_X
-//
-
-
-//
-// Bias = 0x0FFFF
-// expo_X = expo_X and Mask
-//
-
-//
-// Load L_lo
-// Set p10 if 14 < expo_X
-//
-
-{ .mmi
- ld8 r50 = [r50]
-(p0) ldfe f40 = [r34],16
- nop.i 999
+ mov rExp_bias = 0xffff
+ fnorm.s1 fNormX = f8
+ mov rExp_2tom56 = 0xffff-56
}
;;
-{ .mlx
- nop.m 999
-(p0) movl r58 = 0x0FFFF
-}
-;;
-
-//
-// Load W2_ptr
-// Branch to SMALL is expo_X < -6
-//
+// Form two constants we need
+// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 128
+// 1.1000..000 * 2^(63+63-7) to right shift int(w) into the significand
-//
-// float_N = X * L_Inv
-// expo_X = exponent of X
-// Mask = 0x1FFFF
-//
-
-{ .mmi
- ld8 r51 = [r51]
-(p0) ldfe f41 = [r34],16
+{ .mfi
+ setf.sig fINV_LN2_2TO63 = rSig_inv_ln2 // form 1/ln2 * 2^63
+ fclass.m p8,p0 = f8,0x07 // Test for x=0
+ nop.i 0
}
-;;
-
{ .mlx
-(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#), gp
-(p0) movl r39 = 0x1FFFF
-}
-;;
-
-{ .mmi
- ld8 r34 = [r34]
-(p0) getf.exp r37 = f9
- nop.i 999
+ setf.d fRSHF_2TO56 = rRshf_2to56 // Form 1.100 * 2^(63+56)
+ movl rRshf = 0x43e8000000000000 // 1.10000 2^63 for rshift
}
;;
-{ .mii
- nop.m 999
- nop.i 999
-(p0) and r37 = r37, r39 ;;
-}
-
-{ .mmi
-(p0) sub r37 = r37, r58 ;;
-(p0) cmp.gt.unc p14, p0 = -6, r37
-(p0) cmp.lt.unc p10, p0 = 14, r37 ;;
-}
-
{ .mfi
- nop.m 999
-//
-// Load L_inv
-// Set p12 true for Flag = 0 (exp)
-// Set p13 true for Flag = 1 (expm1)
-//
-(p0) fmpy.s1 f38 = f9, f37
- nop.i 999 ;;
+ setf.exp f2TOM56 = rExp_2tom56 // form 2^-56 for scaling Nfloat
+ fclass.m p9,p0 = f8,0x22 // Test for x=-inf
+ add rAD_TB2 = 0x140, rAD_TB1 // Point to Table 2
}
-
-{ .mfb
- nop.m 999
-//
-// Load L_hi
-// expo_X = expo_X - Bias
-// get W1_ptr
-//
-(p0) fcvt.fx.s1 f39 = f38
-(p14) br.cond.spnt EXP_SMALL ;;
-}
-
{ .mib
- nop.m 999
- nop.i 999
-(p10) br.cond.spnt EXP_HUGE ;;
-}
-
-{ .mmi
-(p0) shladd r34 = r32,4,r34
-(p0) addl r35 = @ltoff(Constants_exp_64_A#), gp
- nop.i 999
+ add rAD_Q1 = 0x1e0, rAD_TB1 // Point to Q table for small path
+ add rAD_Ln2_lo = 0x30, rAD_TB1 // Point to ln2_by_128_lo
+(p6) br.cond.spnt EXPM1_UNORM // Branch if x unorm
}
;;
-{ .mmi
- ld8 r35 = [r35]
- nop.m 999
- nop.i 999
-}
-;;
-
-//
-// Load T_1,T_2
-//
-
-{ .mmb
-(p0) ldfe f51 = [r35],16
-(p0) ld8 r45 = [r34],8
- nop.b 999 ;;
-}
-//
-// Set Safe = True if k >= big_expo_neg
-// Set Safe = False if k < big_expo_neg
-//
-
-{ .mmb
-(p0) ldfe f49 = [r35],16
-(p0) ld8 r48 = [r34],0
- nop.b 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// Branch to HUGE is expo_X > 14
-//
-(p0) fcvt.xf f38 = f39
- nop.i 999 ;;
-}
-
+EXPM1_COMMON:
{ .mfi
-(p0) getf.sig r52 = f39
- nop.f 999
- nop.i 999 ;;
-}
-
-{ .mii
- nop.m 999
-(p0) extr.u r43 = r52, 6, 6 ;;
-//
-// r = r - float_N * L_lo
-// K = extr(N_fix,12,52)
-//
-(p0) shladd r40 = r43,3,r40 ;;
-}
-
-{ .mfi
-(p0) shladd r50 = r43,2,r50
-(p0) fnma.s1 f42 = f40, f38, f9
-//
-// float_N = float(N)
-// N_fix = signficand N
-//
-(p0) extr.u r42 = r52, 0, 6
-}
-
-{ .mmi
-(p0) ldfd f43 = [r40],0 ;;
-(p0) shladd r41 = r42,3,r41
-(p0) shladd r51 = r42,2,r51
-}
-//
-// W_1_p1 = 1 + W_1
-//
-
-{ .mmi
-(p0) ldfs f44 = [r50],0 ;;
-(p0) ldfd f45 = [r41],0
-//
-// M_2 = extr(N_fix,0,6)
-// M_1 = extr(N_fix,6,6)
-// r = X - float_N * L_hi
-//
-(p0) extr r44 = r52, 12, 52
-}
-
-{ .mmi
-(p0) ldfs f46 = [r51],0 ;;
-(p0) sub r46 = r58, r44
-(p0) cmp.gt.unc p8, p15 = r44, r45
-}
-//
-// W = W_1 + W_1_p1*W_2
-// Load A_2
-// Bias_m_K = Bias - K
-//
-
-{ .mii
-(p0) ldfe f40 = [r35],16
-//
-// load A_1
-// poly = A_2 + r*A_3
-// rsq = r * r
-// neg_2_mK = exponent of Bias_m_k
-//
-(p0) add r47 = r58, r44 ;;
-//
-// Set Safe = True if k <= big_expo_pos
-// Set Safe = False if k > big_expo_pos
-// Load A_3
-//
-(p15) cmp.lt p8,p15 = r44,r48 ;;
-}
-
-{ .mmf
-(p0) setf.exp f61 = r46
-//
-// Bias_p + K = Bias + K
-// T = T_1 * T_2
-//
-(p0) setf.exp f36 = r47
-(p0) fnma.s1 f42 = f41, f38, f42 ;;
+ ldfpd fMIN_DBL_OFLOW_ARG, fMAX_DBL_MINUS_1_ARG = [rAD_TB1],16
+ fclass.m p10,p0 = f8,0x1e1 // Test for x=+inf, NaN, NaT
+ add rAD_Q2 = 0x50, rAD_Q1 // Point to Q table for small path
}
-
-{ .mfi
- nop.m 999
-//
-// Load W_1,W_2
-// Load big_exp_pos, load big_exp_neg
-//
-(p0) fadd.s1 f47 = f43, f1
- nop.i 999 ;;
+{ .mfb
+ nop.m 0
+ nop.f 0
+(p8) br.ret.spnt b0 // Exit for x=0, return x
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 f52 = f42, f51, f49
- nop.i 999
+ ldfd fMAX_DBL_NORM_ARG = [rAD_TB1],16
+ nop.f 0
+ and rExp_x = rExp_mask, rSignexp_x // Biased exponent of x
}
-
-{ .mfi
- nop.m 999
-(p0) fmpy.s1 f48 = f42, f42
- nop.i 999 ;;
+{ .mfb
+ setf.d fRSHF = rRshf // Form right shift const 1.100 * 2^63
+(p9) fms.d.s0 f8 = f0,f0,f1 // quick exit for x=-inf
+(p9) br.ret.spnt b0
}
+;;
{ .mfi
- nop.m 999
-(p0) fmpy.s1 f53 = f44, f46
- nop.i 999 ;;
+ ldfpd fQD, fQC = [rAD_Q1], 16 // Load coeff for small path
+ nop.f 0
+ sub rExp_x = rExp_x, rExp_bias // True exponent of x
}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 f54 = f45, f47, f43
- nop.i 999
+{ .mfb
+ ldfpd fQB, fQA = [rAD_Q2], 16 // Load coeff for small path
+(p10) fma.d.s0 f8 = f8, f1, f0 // For x=+inf, NaN, NaT
+(p10) br.ret.spnt b0 // Exit for x=+inf, NaN, NaT
}
+;;
{ .mfi
- nop.m 999
-(p0) fneg f61 = f61
- nop.i 999 ;;
+ ldfpd fQ9, fQ8 = [rAD_Q1], 16 // Load coeff for small path
+ fma.s1 fXsq = fNormX, fNormX, f0 // x*x for small path
+ cmp.gt p7, p8 = -2, rExp_x // Test |x| < 2^(-2)
}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 f52 = f42, f52, f40
- nop.i 999 ;;
+ ldfpd fQ7, fQ6 = [rAD_Q2], 16 // Load coeff for small path
+ nop.f 0
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fadd.s1 f55 = f54, f1
- nop.i 999
+ ldfe fQ5 = [rAD_Q1], 16 // Load coeff for small path
+ nop.f 0
+ nop.i 0
}
-
-{ .mfi
- nop.m 999
-//
-// W + Wp1 * poly
-//
-(p0) mov f34 = f53
- nop.i 999 ;;
+{ .mib
+ ldfe fQ4 = [rAD_Q2], 16 // Load coeff for small path
+(p7) cmp.gt.unc p6, p7 = -60, rExp_x // Test |x| < 2^(-60)
+(p7) br.cond.spnt EXPM1_SMALL // Branch if 2^-60 <= |x| < 2^-2
}
+;;
-{ .mfi
- nop.m 999
-//
-// A_1 + r * poly
-// Scale = setf_exp(Bias_p_k)
-//
-(p0) fma.s1 f52 = f48, f52, f42
- nop.i 999 ;;
-}
+// W = X * Inv_log2_by_128
+// By adding 1.10...0*2^63 we shift and get round_int(W) in significand.
+// We actually add 1.10...0*2^56 to X * Inv_log2 to do the same thing.
{ .mfi
- nop.m 999
-//
-// poly = r + rsq(A_1 + r*poly)
-// Wp1 = 1 + W
-// neg_2_mK = -neg_2_mK
-//
-(p0) fma.s1 f35 = f55, f52, f54
- nop.i 999 ;;
+ ldfe fLn2_by_128_hi = [rAD_TB1],32
+ fma.s1 fW_2TO56_RSH = fNormX, fINV_LN2_2TO63, fRSHF_2TO56
+ nop.i 0
}
-
{ .mfb
- nop.m 999
-(p0) fmpy.s1 f35 = f35, f53
-//
-// Y_hi = T
-// Y_lo = T * (W + Wp1*poly)
-//
-(p12) br.cond.sptk EXP_MAIN ;;
+ ldfe fLn2_by_128_lo = [rAD_Ln2_lo]
+(p6) fma.d.s0 f8 = f8, f8, f8 // If x < 2^-60, result=x+x*x
+(p6) br.ret.spnt b0 // Exit if x < 2^-60
}
-//
-// Branch if exp(x)
-// Continue for exp(x-1)
-//
+;;
-{ .mii
-(p0) cmp.lt.unc p12, p13 = 10, r44
- nop.i 999 ;;
-//
-// Set p12 if 10 < K, Else p13
-//
-(p13) cmp.gt.unc p13, p14 = -10, r44 ;;
-}
+// Divide arguments into the following categories:
+// Certain minus one p11 - -inf < x <= MAX_DBL_MINUS_1_ARG
+// Possible Overflow p14 - MAX_DBL_NORM_ARG < x < MIN_DBL_OFLOW_ARG
+// Certain Overflow p15 - MIN_DBL_OFLOW_ARG <= x < +inf
//
-// K > 10: Y_lo = Y_lo + neg_2_mK
-// K <=10: Set p13 if -10 > K, Else set p14
+// If the input is really a double arg, then there will never be "Possible
+// Overflow" arguments.
//
-{ .mfi
-(p13) cmp.eq p15, p0 = r0, r0
-(p14) fadd.s1 f34 = f61, f34
- nop.i 999 ;;
-}
+// After that last load, rAD_TB1 points to the beginning of table 1
{ .mfi
- nop.m 999
-(p12) fadd.s1 f35 = f35, f61
- nop.i 999 ;;
+ nop.m 0
+ fcmp.ge.s1 p15,p14 = fNormX,fMIN_DBL_OFLOW_ARG
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p13) fadd.s1 f35 = f35, f34
- nop.i 999
+ add rAD_P = 0x80, rAD_TB2
+ fcmp.le.s1 p11,p0 = fNormX,fMAX_DBL_MINUS_1_ARG
+ nop.i 0
}
+;;
{ .mfb
- nop.m 999
-//
-// K <= 10 and K < -10, Set Safe = True
-// K <= 10 and K < 10, Y_lo = Y_hi + Y_lo
-// K <= 10 and K > =-10, Y_hi = Y_hi + neg_2_mk
-//
-(p13) mov f34 = f61
-(p0) br.cond.sptk EXP_MAIN ;;
-}
-EXP_SMALL:
-
-{ .mmi
-(p12) addl r35 = @ltoff(Constants_exp_64_P#), gp
-(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#), gp
- nop.i 999
+ ldfpd fP5, fP4 = [rAD_P] ,16
+(p14) fcmp.gt.unc.s1 p14,p0 = fNormX,fMAX_DBL_NORM_ARG
+(p15) br.cond.spnt EXPM1_CERTAIN_OVERFLOW
}
;;
-{ .mmi
-(p12) ld8 r35 = [r35]
- ld8 r34 = [r34]
- nop.i 999
-}
-;;
+// Nfloat = round_int(W)
+// The significand of fW_2TO56_RSH contains the rounded integer part of W,
+// as a twos complement number in the lower bits (that is, it may be negative).
+// That twos complement number (called N) is put into rN.
+// Since fW_2TO56_RSH is scaled by 2^56, it must be multiplied by 2^-56
+// before the shift constant 1.10000 * 2^63 is subtracted to yield fNfloat.
+// Thus, fNfloat contains the floating point version of N
-{ .mmi
-(p13) addl r35 = @ltoff(Constants_exp_64_Q#), gp
- nop.m 999
- nop.i 999
+{ .mfb
+ ldfpd fP3, fP2 = [rAD_P]
+ fms.s1 fNfloat = fW_2TO56_RSH, f2TOM56, fRSHF
+(p11) br.cond.spnt EXPM1_CERTAIN_MINUS_ONE
}
;;
-
-//
-// Return
-// K <= 10 and K < 10, Y_hi = neg_2_mk
-//
-// /*******************************************************/
-// /*********** Branch EXP_SMALL *************************/
-// /*******************************************************/
-
{ .mfi
-(p13) ld8 r35 = [r35]
-(p0) mov f42 = f9
-(p0) add r34 = 0x48,r34
+ getf.sig rN = fW_2TO56_RSH
+ nop.f 0
+ nop.i 0
}
;;
-//
-// Flag = 0
-// r4 = rsq * rsq
-//
+// rIndex_1 has index_1
+// rIndex_2_16 has index_2 * 16
+// rBiased_M has M
+// rIndex_1_16 has index_1 * 16
+// r = x - Nfloat * ln2_by_128_hi
+// f = 1 - Nfloat * ln2_by_128_lo
{ .mfi
-(p0) ld8 r49 =[r34],0
- nop.f 999
- nop.i 999 ;;
-}
-
-{ .mii
- nop.m 999
- nop.i 999 ;;
-//
-// Flag = 1
-//
-(p0) cmp.lt.unc p14, p0 = r37, r49 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// r = X
-//
-(p0) fmpy.s1 f48 = f42, f42
- nop.i 999 ;;
-}
-
-{ .mfb
- nop.m 999
-//
-// rsq = r * r
-//
-(p0) fmpy.s1 f50 = f48, f48
-//
-// Is input very small?
-//
-(p14) br.cond.spnt EXP_VERY_SMALL ;;
-}
-//
-// Flag_not1: Y_hi = 1.0
-// Flag is 1: r6 = rsq * r4
-//
-
-{ .mfi
-(p12) ldfe f52 = [r35],16
-(p12) mov f34 = f1
-(p0) add r53 = 0x1,r0 ;;
-}
-
-{ .mfi
-(p13) ldfe f51 = [r35],16
-//
-// Flag_not_1: Y_lo = poly_hi + r4 * poly_lo
-//
-(p13) mov f34 = f9
- nop.i 999 ;;
-}
-
-{ .mmf
-(p12) ldfe f53 = [r35],16
-//
-// For Flag_not_1, Y_hi = X
-// Scale = 1
-// Create 0x000...01
-//
-(p0) setf.sig f37 = r53
-(p0) mov f36 = f1 ;;
-}
-
-{ .mmi
-(p13) ldfe f52 = [r35],16 ;;
-(p12) ldfe f54 = [r35],16
- nop.i 999 ;;
+ and rIndex_1 = 0x0f, rN
+ fnma.s1 fR = fNfloat, fLn2_by_128_hi, fNormX
+ shr rM = rN, 0x7
}
-
{ .mfi
-(p13) ldfe f53 = [r35],16
-(p13) fmpy.s1 f58 = f48, f50
- nop.i 999 ;;
+ and rIndex_2_16 = 0x70, rN
+ fnma.s1 fF = fNfloat, fLn2_by_128_lo, f1
+ nop.i 0
}
-//
-// Flag_not1: poly_lo = P_5 + r*P_6
-// Flag_1: poly_lo = Q_6 + r*Q_7
-//
+;;
-{ .mmi
-(p13) ldfe f54 = [r35],16 ;;
-(p12) ldfe f55 = [r35],16
- nop.i 999 ;;
-}
+// rAD_T1 has address of T1
+// rAD_T2 has address of T2
{ .mmi
-(p12) ldfe f56 = [r35],16 ;;
-(p13) ldfe f55 = [r35],16
- nop.i 999 ;;
+ add rBiased_M = rExp_bias, rM
+ add rAD_T2 = rAD_TB2, rIndex_2_16
+ shladd rAD_T1 = rIndex_1, 4, rAD_TB1
}
+;;
+// Create Scale = 2^M
+// Load T1 and T2
{ .mmi
-(p12) ldfe f57 = [r35],0 ;;
-(p13) ldfe f56 = [r35],16
- nop.i 999 ;;
-}
-
-{ .mfi
-(p13) ldfe f57 = [r35],0
- nop.f 999
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// For Flag_not_1, load p5,p6,p1,p2
-// Else load p5,p6,p1,p2
-//
-(p12) fma.s1 f60 = f52, f42, f53
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p13) fma.s1 f60 = f51, f42, f52
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p12) fma.s1 f60 = f60, f42, f54
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p12) fma.s1 f59 = f56, f42, f57
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p13) fma.s1 f60 = f42, f60, f53
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p12) fma.s1 f59 = f59, f48, f42
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// Flag_1: poly_lo = Q_5 + r*(Q_6 + r*Q_7)
-// Flag_not1: poly_lo = P_4 + r*(P_5 + r*P_6)
-// Flag_not1: poly_hi = (P_1 + r*P_2)
-//
-(p13) fmpy.s1 f60 = f60, f58
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p12) fma.s1 f60 = f60, f42, f55
- nop.i 999 ;;
+ setf.exp f2M = rBiased_M
+ ldfe fT2 = [rAD_T2]
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-//
-// Flag_1: poly_lo = r6 *(Q_5 + ....)
-// Flag_not1: poly_hi = r + rsq *(P_1 + r*P_2)
-//
-(p12) fma.s1 f35 = f60, f50, f59
- nop.i 999
+ ldfe fT1 = [rAD_T1]
+ fmpy.s0 fTmp = fLn2_by_128_lo, fLn2_by_128_lo // Force inexact
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p13) fma.s1 f59 = f54, f42, f55
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fP54 = fR, fP5, fP4
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-//
-// Flag_not1: Y_lo = rsq* poly_hi + poly_lo
-// Flag_1: poly_lo = rsq* poly_hi + poly_lo
-//
-(p13) fma.s1 f59 = f59, f42, f56
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// Flag_not_1: (P_1 + r*P_2)
-//
-(p13) fma.s1 f59 = f59, f42, f57
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fP32 = fR, fP3, fP2
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-//
-// Flag_not_1: poly_hi = r + rsq * (P_1 + r*P_2)
-//
-(p13) fma.s1 f35 = f59, f48, f60
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fRsq = fR, fR, f0
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-//
-// Create 0.000...01
-//
-(p0) for f37 = f35, f37
- nop.i 999 ;;
-}
-
-{ .mfb
- nop.m 999
-//
-// Set lsb of Y_lo to 1
-//
-(p0) fmerge.se f35 = f35,f37
-(p0) br.cond.sptk EXP_MAIN ;;
-}
-EXP_VERY_SMALL:
-
-{ .mmi
- nop.m 999
-(p13) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp
- nop.i 999;;
+ nop.m 0
+ fma.s1 fP5432 = fRsq, fP54, fP32
+ nop.i 0
}
+;;
{ .mfi
-(p13) ld8 r34 = [r34];
-(p12) mov f35 = f9
- nop.i 999 ;;
-}
-
-{ .mfb
- nop.m 999
-(p12) mov f34 = f1
-(p12) br.cond.sptk EXP_MAIN ;;
-}
-
-{ .mlx
-(p13) add r34 = 8,r34
-(p13) movl r39 = 0x0FFFE ;;
+ nop.m 0
+ fma.s1 fS2 = fF,fT2,f0
+ nop.i 0
}
-//
-// Load big_exp_neg
-// Create 1/2's exponent
-//
-
-{ .mii
-(p13) setf.exp f56 = r39
-(p13) shladd r34 = r32,4,r34 ;;
- nop.i 999
-}
-//
-// Negative exponents are stored after positive
-//
-
{ .mfi
-(p13) ld8 r45 = [r34],0
-//
-// Y_hi = x
-// Scale = 1
-//
-(p13) fmpy.s1 f35 = f9, f9
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fS1 = f2M,fT1,f0
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-//
-// Reset Safe if necessary
-// Create 1/2
-//
-(p13) mov f34 = f9
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fP = fRsq, fP5432, fR
+ nop.i 0
}
+;;
{ .mfi
-(p13) cmp.lt.unc p0, p15 = r37, r45
-(p13) mov f36 = f1
- nop.i 999 ;;
+ nop.m 0
+ fms.s1 fSm1 = fS1,fS2,f1 // S - 1.0
+ nop.i 0
}
-
{ .mfb
- nop.m 999
-//
-// Y_lo = x * x
-//
-(p13) fmpy.s1 f35 = f35, f56
-//
-// Y_lo = x*x/2
-//
-(p13) br.cond.sptk EXP_MAIN ;;
-}
-EXP_HUGE:
-
-{ .mfi
- nop.m 999
-(p0) fcmp.gt.unc.s1 p14, p0 = f9, f0
- nop.i 999
-}
-
-{ .mlx
- nop.m 999
-(p0) movl r39 = 0x15DC0 ;;
-}
-
-{ .mfi
-(p14) setf.exp f34 = r39
-(p14) mov f35 = f1
-(p14) cmp.eq p0, p15 = r0, r0 ;;
+ nop.m 0
+ fma.s1 fS = fS1,fS2,f0
+(p14) br.cond.spnt EXPM1_POSSIBLE_OVERFLOW
}
+;;
{ .mfb
- nop.m 999
-(p14) mov f36 = f34
-//
-// If x > 0, Set Safe = False
-// If x > 0, Y_hi = 2**(24,000)
-// If x > 0, Y_lo = 1.0
-// If x > 0, Scale = 2**(24,000)
-//
-(p14) br.cond.sptk EXP_MAIN ;;
-}
-
-{ .mlx
- nop.m 999
-(p12) movl r39 = 0xA240
-}
-
-{ .mlx
- nop.m 999
-(p12) movl r38 = 0xA1DC ;;
-}
-
-{ .mmb
-(p13) cmp.eq p15, p14 = r0, r0
-(p12) setf.exp f34 = r39
- nop.b 999 ;;
-}
-
-{ .mlx
-(p12) setf.exp f35 = r38
-(p13) movl r39 = 0xFF9C
+ nop.m 0
+ fma.d.s0 f8 = fS, fP, fSm1
+ br.ret.sptk b0 // Normal path exit
}
+;;
-{ .mfi
- nop.m 999
-(p13) fsub.s1 f34 = f0, f1
- nop.i 999 ;;
+// Here if 2^-60 <= |x| <2^-2
+// Compute 13th order polynomial
+EXPM1_SMALL:
+{ .mmf
+ ldfe fQ3 = [rAD_Q1], 16
+ ldfe fQ2 = [rAD_Q2], 16
+ fma.s1 fX4 = fXsq, fXsq, f0
}
+;;
{ .mfi
- nop.m 999
-(p12) mov f36 = f34
-(p12) cmp.eq p0, p15 = r0, r0 ;;
+ nop.m 0
+ fma.s1 fQDC = fQD, fNormX, fQC
+ nop.i 0
}
-
{ .mfi
-(p13) setf.exp f35 = r39
-(p13) mov f36 = f1
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fQBA = fQB, fNormX, fQA
+ nop.i 0
}
-EXP_MAIN:
+;;
{ .mfi
-(p0) cmp.ne.unc p12, p0 = 0x01, r33
-(p0) fmpy.s1 f101 = f36, f35
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fQ98 = fQ9, fNormX, fQ8
+ nop.i 0
}
-
-{ .mfb
- nop.m 999
-(p0) fma.d.s0 f99 = f34, f36, f101
-(p15) br.cond.sptk EXP_64_RETURN;;
-}
-
{ .mfi
- nop.m 999
-(p0) fsetc.s3 0x7F,0x01
- nop.i 999
-}
-
-{ .mlx
- nop.m 999
-(p0) movl r50 = 0x000000000103FF ;;
-}
-//
-// S0 user supplied status
-// S2 user supplied status + WRE + TD (Overflows)
-// S3 user supplied status + RZ + TD (Underflows)
-//
-//
-// If (Safe) is true, then
-// Compute result using user supplied status field.
-// No overflow or underflow here, but perhaps inexact.
-// Return
-// Else
-// Determine if overflow or underflow was raised.
-// Fetch +/- overflow threshold for IEEE single, double,
-// double extended
-//
-
-{ .mfi
-(p0) setf.exp f60 = r50
-(p0) fma.d.s3 f102 = f34, f36, f101
- nop.i 999
+ nop.m 0
+ fma.s1 fQ76= fQ7, fNormX, fQ6
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fsetc.s3 0x7F,0x40
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fQ54 = fQ5, fNormX, fQ4
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-//
-// For Safe, no need to check for over/under.
-// For expm1, handle errors like exp.
-//
-(p0) fsetc.s2 0x7F,0x42
- nop.i 999;;
+ nop.m 0
+ fma.s1 fX6 = fX4, fXsq, f0
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.d.s2 f100 = f34, f36, f101
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fQ32= fQ3, fNormX, fQ2
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fsetc.s2 0x7F,0x40
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fQDCBA = fQDC, fXsq, fQBA
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p7) fclass.m.unc p12, p0 = f102, 0x00F
- nop.i 999
+ nop.m 0
+ fma.s1 fQ7654 = fQ76, fXsq, fQ54
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p11, p0 = f102, 0x00F
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fQDCBA98 = fQDCBA, fXsq, fQ98
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p7) fcmp.ge.unc.s1 p10, p0 = f100, f60
- nop.i 999
+ nop.m 0
+ fma.s1 fQ765432 = fQ7654, fXsq, fQ32
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-//
-// Create largest double exponent + 1.
-// Create smallest double exponent - 1.
-//
-(p0) fcmp.ge.unc.s1 p8, p0 = f100, f60
- nop.i 999 ;;
-}
-//
-// fcmp: resultS2 >= + overflow threshold -> set (a) if true
-// fcmp: resultS2 <= - overflow threshold -> set (b) if true
-// fclass: resultS3 is denorm/unorm/0 -> set (d) if true
-//
-
-{ .mib
-(p10) mov r65 = 41
- nop.i 999
-(p10) br.cond.sptk __libm_error_region ;;
-}
-
-{ .mib
-(p8) mov r65 = 14
- nop.i 999
-(p8) br.cond.sptk __libm_error_region ;;
+ nop.m 0
+ fma.s1 fQDCBA98765432 = fQDCBA98, fX6, fQ765432
+ nop.i 0
}
-//
-// Report that exp overflowed
-//
+;;
-{ .mib
-(p12) mov r65 = 42
- nop.i 999
-(p12) br.cond.sptk __libm_error_region ;;
+{ .mfb
+ nop.m 0
+ fma.d.s0 f8 = fQDCBA98765432, fXsq, fNormX
+ br.ret.sptk b0 // Exit small branch
}
+;;
-{ .mib
-(p11) mov r65 = 15
- nop.i 999
-(p11) br.cond.sptk __libm_error_region ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-//
-// Report that exp underflowed
-//
-(p0) br.cond.sptk EXP_64_RETURN;;
-}
-EXP_64_SPECIAL:
+EXPM1_POSSIBLE_OVERFLOW:
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p6, p0 = f8, 0x0c3
- nop.i 999
-}
+// Here if fMAX_DBL_NORM_ARG < x < fMIN_DBL_OFLOW_ARG
+// This cannot happen if input is a double, only if input is higher precision.
+// Overflow is a possibility, not a certainty.
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p13, p8 = f8, 0x007
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p7) fclass.m.unc p14, p0 = f8, 0x007
- nop.i 999
-}
+// Recompute result using status field 2 with user's rounding mode,
+// and wre set. If result is larger than largest double, then we have
+// overflow
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p12, p9 = f8, 0x021
- nop.i 999 ;;
+ mov rGt_ln = 0x103ff // Exponent for largest dbl + 1 ulp
+ fsetc.s2 0x7F,0x42 // Get user's round mode, set wre
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p11, p0 = f8, 0x022
- nop.i 999
+ setf.exp fGt_pln = rGt_ln // Create largest double + 1 ulp
+ fma.d.s2 fWre_urm_f8 = fS, fP, fSm1 // Result with wre set
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p7) fclass.m.unc p10, p0 = f8, 0x022
- nop.i 999 ;;
+ nop.m 0
+ fsetc.s2 0x7F,0x40 // Turn off wre in sf2
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-//
-// Identify +/- 0, Inf, or -Inf
-// Generate the right kind of NaN.
-//
-(p13) fadd.d.s0 f99 = f0, f1
- nop.i 999 ;;
+ nop.m 0
+ fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow
+ nop.i 0
}
+;;
-{ .mfi
- nop.m 999
-(p14) mov f99 = f8
- nop.i 999 ;;
+{ .mfb
+ nop.m 0
+ nop.f 0
+(p6) br.cond.spnt EXPM1_CERTAIN_OVERFLOW // Branch if overflow
}
+;;
{ .mfb
- nop.m 999
-(p6) fadd.d.s0 f99 = f8, f1
-//
-// exp(+/-0) = 1
-// expm1(+/-0) = +/-0
-// No exceptions raised
-//
-(p6) br.cond.sptk EXP_64_RETURN;;
+ nop.m 0
+ fma.d.s0 f8 = fS, fP, fSm1
+ br.ret.sptk b0 // Exit if really no overflow
}
+;;
-{ .mib
- nop.m 999
- nop.i 999
-(p14) br.cond.sptk EXP_64_RETURN;;
+EXPM1_CERTAIN_OVERFLOW:
+{ .mmi
+ sub rTmp = rExp_mask, r0, 1
+;;
+ setf.exp fTmp = rTmp
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p11) mov f99 = f0
- nop.i 999 ;;
+ alloc r32=ar.pfs,1,4,4,0
+ fmerge.s FR_X = f8,f8
+ nop.i 0
}
-
{ .mfb
- nop.m 999
-(p10) fsub.d.s1 f99 = f0, f1
-//
-// exp(-Inf) = 0
-// expm1(-Inf) = -1
-// No exceptions raised.
-//
-(p10) br.cond.sptk EXP_64_RETURN;;
+ mov GR_Parameter_TAG = 41
+ fma.d.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result
+ br.cond.sptk __libm_error_region
}
+;;
+// Here if x unorm
+EXPM1_UNORM:
{ .mfb
- nop.m 999
-(p12) fmpy.d.s1 f99 = f8, f1
-//
-// exp(+Inf) = Inf
-// No exceptions raised.
-//
-(p0) br.cond.sptk EXP_64_RETURN;;
+ getf.exp rSignexp_x = fNormX // Must recompute if x unorm
+ fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag
+ br.cond.sptk EXPM1_COMMON
}
+;;
-
-EXP_64_UNSUPPORTED:
-
-{ .mfb
- nop.m 999
-(p0) fmpy.d.s0 f99 = f8, f0
- nop.b 0;;
+// here if result will be -1 and inexact, x <= -48.0
+EXPM1_CERTAIN_MINUS_ONE:
+{ .mmi
+ mov rTmp = 1
+;;
+ setf.exp fTmp = rTmp
+ nop.i 0
}
+;;
-EXP_64_RETURN:
{ .mfb
- nop.m 999
-(p0) mov f8 = f99
-(p0) br.ret.sptk b0
+ nop.m 0
+ fms.d.s0 FR_RESULT = fTmp, fTmp, f1 // Set I, rounded -1+eps result
+ br.ret.sptk b0
}
-.endp expm1
-ASM_SIZE_DIRECTIVE(expm1)
+;;
-.proc __libm_error_region
-__libm_error_region:
+GLOBAL_IEEE754_END(expm1)
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
-// (1)
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
@@ -1716,38 +841,32 @@ __libm_error_region:
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
+ add sp=-64,sp // Create new stack
nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ mov GR_SAVE_GP=gp // Save gp
};;
-
-// (2)
{ .mmi
stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
- add GR_Parameter_X = 16,sp // Parameter 1 address
+ add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0=b0 // Save b0
};;
-
.body
-// (3)
{ .mib
- stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
+ stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ nop.b 0
}
{ .mib
- stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
+ stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
- nop.m 0
- nop.m 0
add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
-
-// (4)
{ .mmi
ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
@@ -1760,9 +879,6 @@ __libm_error_region:
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
-
-
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/s_expm1f.S b/sysdeps/ia64/fpu/s_expm1f.S
index cc2c537ba2..0c5f2e67a8 100644
--- a/sysdeps/ia64/fpu/s_expm1f.S
+++ b/sysdeps/ia64/fpu/s_expm1f.S
@@ -1,10 +1,10 @@
-.file "exp_m1f.s"
+.file "expf_m1.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2002, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,1735 +20,649 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
-// HISTORY
-// 2/02/00 Initial Version
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+
+// History
+//*********************************************************************
+// 02/02/00 Initial Version
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
+// 07/07/01 Improved speed of all paths
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 11/20/02 Improved speed, algorithm based on expf
//
-// *********************************************************************
-//
-// Function: Combined expf(x) and expm1f(x), where
-// x
-// expf(x) = e , for single precision x values
-// x
-// expm1f(x) = e - 1 for single precision x values
-//
-// *********************************************************************
-//
-// Accuracy: Within .7 ulps for 80-bit floating point values
-// Very accurate for single precision values
-//
-// *********************************************************************
-//
-// Resources Used:
-//
-// Floating-Point Registers: f8 (Input and Return Value)
-// f9,f32-f61, f99-f102
-//
-// General Purpose Registers:
-// r32-r61
-// r62-r65 (Used to pass arguments to error handling routine)
-//
-// Predicate Registers: p6-p15
-//
-// *********************************************************************
-//
-// IEEE Special Conditions:
-//
-// Denormal fault raised on denormal inputs
-// Overflow exceptions raised when appropriate for exp and expm1
-// Underflow exceptions raised when appropriate for exp and expm1
-// (Error Handling Routine called for overflow and Underflow)
-// Inexact raised when appropriate by algorithm
-//
-// expf(inf) = inf
-// expf(-inf) = +0
-// expf(SNaN) = QNaN
-// expf(QNaN) = QNaN
-// expf(0) = 1
-// expf(EM_special Values) = QNaN
-// expf(inf) = inf
-// expm1f(-inf) = -1
-// expm1f(SNaN) = QNaN
-// expm1f(QNaN) = QNaN
-// expm1f(0) = 0
-// expm1f(EM_special Values) = QNaN
-//
-// *********************************************************************
-//
-// Implementation and Algorithm Notes:
-//
-// ker_exp_64( in_FR : X,
-// in_GR : Flag,
-// in_GR : Expo_Range
-// out_FR : Y_hi,
-// out_FR : Y_lo,
-// out_FR : scale,
-// out_PR : Safe )
-//
-// On input, X is in register format and
-// Flag = 0 for exp,
-// Flag = 1 for expm1,
-//
-// On output, provided X and X_cor are real numbers, then
-//
-// scale*(Y_hi + Y_lo) approximates expf(X) if Flag is 0
-// scale*(Y_hi + Y_lo) approximates expf(X)-1 if Flag is 1
-//
-// The accuracy is sufficient for a highly accurate 64 sig.
-// bit implementation. Safe is set if there is no danger of
-// overflow/underflow when the result is composed from scale,
-// Y_hi and Y_lo. Thus, we can have a fast return if Safe is set.
-// Otherwise, one must prepare to handle the possible exception
-// appropriately. Note that SAFE not set (false) does not mean
-// that overflow/underflow will occur; only the setting of SAFE
-// guarantees the opposite.
-//
-// **** High Level Overview ****
-//
-// The method consists of three cases.
-//
-// If |X| < Tiny use case exp_tiny;
-// else if |X| < 2^(-6) use case exp_small;
-// else use case exp_regular;
-//
-// Case exp_tiny:
-//
-// 1 + X can be used to approximate expf(X) or expf(X+X_cor);
-// X + X^2/2 can be used to approximate expf(X) - 1
-//
-// Case exp_small:
-//
-// Here, expf(X), expf(X+X_cor), and expf(X) - 1 can all be
-// appproximated by a relatively simple polynomial.
-//
-// This polynomial resembles the truncated Taylor series
-//
-// expf(w) = 1 + w + w^2/2! + w^3/3! + ... + w^n/n!
-//
-// Case exp_regular:
-//
-// Here we use a table lookup method. The basic idea is that in
-// order to compute expf(X), we accurately decompose X into
-//
-// X = N * log(2)/(2^12) + r, |r| <= log(2)/2^13.
-//
-// Hence
-//
-// expf(X) = 2^( N / 2^12 ) * expf(r).
-//
-// The value 2^( N / 2^12 ) is obtained by simple combinations
-// of values calculated beforehand and stored in table; expf(r)
-// is approximated by a short polynomial because |r| is small.
-//
-// We elaborate this method in 4 steps.
-//
-// Step 1: Reduction
-//
-// The value 2^12/log(2) is stored as a double-extended number
-// L_Inv.
-//
-// N := round_to_nearest_integer( X * L_Inv )
-//
-// The value log(2)/2^12 is stored as two numbers L_hi and L_lo so
-// that r can be computed accurately via
-//
-// r := (X - N*L_hi) - N*L_lo
-//
-// We pick L_hi such that N*L_hi is representable in 64 sig. bits
-// and thus the FMA X - N*L_hi is error free. So r is the
-// 1 rounding error from an exact reduction with respect to
-//
-// L_hi + L_lo.
-//
-// In particular, L_hi has 30 significant bit and can be stored
-// as a double-precision number; L_lo has 64 significant bits and
-// stored as a double-extended number.
-//
-// In the case Flag = 2, we further modify r by
-//
-// r := r + X_cor.
-//
-// Step 2: Approximation
-//
-// expf(r) - 1 is approximated by a short polynomial of the form
-//
-// r + A_1 r^2 + A_2 r^3 + A_3 r^4 .
-//
-// Step 3: Composition from Table Values
-//
-// The value 2^( N / 2^12 ) can be composed from a couple of tables
-// of precalculated values. First, express N as three integers
-// K, M_1, and M_2 as
-//
-// N = K * 2^12 + M_1 * 2^6 + M_2
-//
-// Where 0 <= M_1, M_2 < 2^6; and K can be positive or negative.
-// When N is represented in 2's complement, M_2 is simply the 6
-// lsb's, M_1 is the next 6, and K is simply N shifted right
-// arithmetically (sign extended) by 12 bits.
-//
-// Now, 2^( N / 2^12 ) is simply
-//
-// 2^K * 2^( M_1 / 2^6 ) * 2^( M_2 / 2^12 )
-//
-// Clearly, 2^K needs no tabulation. The other two values are less
-// trivial because if we store each accurately to more than working
-// precision, than its product is too expensive to calculate. We
-// use the following method.
-//
-// Define two mathematical values, delta_1 and delta_2, implicitly
-// such that
-//
-// T_1 = expf( [M_1 log(2)/2^6] - delta_1 )
-// T_2 = expf( [M_2 log(2)/2^12] - delta_2 )
-//
-// are representable as 24 significant bits. To illustrate the idea,
-// we show how we define delta_1:
-//
-// T_1 := round_to_24_bits( expf( M_1 log(2)/2^6 ) )
-// delta_1 = (M_1 log(2)/2^6) - log( T_1 )
-//
-// The last equality means mathematical equality. We then tabulate
-//
-// W_1 := expf(delta_1) - 1
-// W_2 := expf(delta_2) - 1
-//
-// Both in double precision.
-//
-// From the tabulated values T_1, T_2, W_1, W_2, we compose the values
-// T and W via
-//
-// T := T_1 * T_2 ...exactly
-// W := W_1 + (1 + W_1)*W_2
-//
-// W approximates expf( delta ) - 1 where delta = delta_1 + delta_2.
-// The mathematical product of T and (W+1) is an accurate representation
-// of 2^(M_1/2^6) * 2^(M_2/2^12).
-//
-// Step 4. Reconstruction
-//
-// Finally, we can reconstruct expf(X), expf(X) - 1.
-// Because
-//
-// X = K * log(2) + (M_1*log(2)/2^6 - delta_1)
-// + (M_2*log(2)/2^12 - delta_2)
-// + delta_1 + delta_2 + r ...accurately
-// We have
-//
-// expf(X) ~=~ 2^K * ( T + T*[expf(delta_1+delta_2+r) - 1] )
-// ~=~ 2^K * ( T + T*[expf(delta + r) - 1] )
-// ~=~ 2^K * ( T + T*[(expf(delta)-1)
-// + expf(delta)*(expf(r)-1)] )
-// ~=~ 2^K * ( T + T*( W + (1+W)*poly(r) ) )
-// ~=~ 2^K * ( Y_hi + Y_lo )
-//
-// where Y_hi = T and Y_lo = T*(W + (1+W)*poly(r))
-//
-// For expf(X)-1, we have
-//
-// expf(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1
-// ~=~ 2^K * ( Y_hi + Y_lo - 2^(-K) )
-//
-// and we combine Y_hi + Y_lo - 2^(-N) into the form of two
-// numbers Y_hi + Y_lo carefully.
-//
-// **** Algorithm Details ****
-//
-// A careful algorithm must be used to realize the mathematical ideas
-// accurately. We describe each of the three cases. We assume SAFE
-// is preset to be TRUE.
-//
-// Case exp_tiny:
-//
-// The important points are to ensure an accurate result under
-// different rounding directions and a correct setting of the SAFE
-// flag.
-//
-// If Flag is 1, then
-// SAFE := False ...possibility of underflow
-// Scale := 1.0
-// Y_hi := X
-// Y_lo := 2^(-17000)
-// Else
-// Scale := 1.0
-// Y_hi := 1.0
-// Y_lo := X ...for different rounding modes
-// Endif
-//
-// Case exp_small:
-//
-// Here we compute a simple polynomial. To exploit parallelism, we split
-// the polynomial into several portions.
-//
-// Let r = X
-//
-// If Flag is not 1 ...i.e. expf( argument )
-//
-// rsq := r * r;
-// r4 := rsq*rsq
-// poly_lo := P_3 + r*(P_4 + r*(P_5 + r*P_6))
-// poly_hi := r + rsq*(P_1 + r*P_2)
-// Y_lo := poly_hi + r4 * poly_lo
-// set lsb(Y_lo) to 1
-// Y_hi := 1.0
-// Scale := 1.0
-//
-// Else ...i.e. expf( argument ) - 1
-//
-// rsq := r * r
-// r4 := rsq * rsq
-// r6 := rsq * r4
-// poly_lo := r6*(Q_5 + r*(Q_6 + r*Q_7))
-// poly_hi := Q_1 + r*(Q_2 + r*(Q_3 + r*Q_4))
-// Y_lo := rsq*poly_hi + poly_lo
-// set lsb(Y_lo) to 1
-// Y_hi := X
-// Scale := 1.0
-//
-// Endif
-//
-// Case exp_regular:
-//
-// The previous description contain enough information except the
-// computation of poly and the final Y_hi and Y_lo in the case for
-// expf(X)-1.
-//
-// The computation of poly for Step 2:
-//
-// rsq := r*r
-// poly := r + rsq*(A_1 + r*(A_2 + r*A_3))
-//
-// For the case expf(X) - 1, we need to incorporate 2^(-K) into
-// Y_hi and Y_lo at the end of Step 4.
-//
-// If K > 10 then
-// Y_lo := Y_lo - 2^(-K)
-// Else
-// If K < -10 then
-// Y_lo := Y_hi + Y_lo
-// Y_hi := -2^(-K)
-// Else
-// Y_hi := Y_hi - 2^(-K)
-// End If
-// End If
//
+// API
+//*********************************************************************
+// float expm1f(float)
+//
+// Overview of operation
+//*********************************************************************
+// 1. Inputs of NaN, Inf, Zero, NatVal handled with special paths
+//
+// 2. |x| < 2^-40
+// Result = x, computed by x + x*x to handle appropriate flags and rounding
+//
+// 3. 2^-40 <= |x| < 2^-2
+// Result determined by 8th order Taylor series polynomial
+// expm1f(x) = x + A2*x^2 + ... + A8*x^8
+//
+// 4. x <= -24.0
+// Here we know result is essentially -1 + eps, where eps only affects
+// rounded result. Set I.
+//
+// 5. x >= 88.7228
+// Result overflows. Set I, O, and call error support
+//
+// 6. 2^-2 <= x < 88.7228 or -24.0 < x <= -2^-2
+// This is the main path. The algorithm is described below:
+
+// Take the input x. w is "how many log2/64 in x?"
+// w = x * 64/log2
+// NJ = int(w)
+// x = NJ*log2/64 + R
+
+// NJ = 64*n + j
+// x = n*log2 + (log2/64)*j + R
+//
+// So, exp(x) = 2^n * 2^(j/64)* exp(R)
+//
+// T = 2^n * 2^(j/64)
+// Construct 2^n
+// Get 2^(j/64) table
+// Note: all entries of the 2^(j/64) table are stored in DP format
+// with the exponent bits set to 0, so multiplication by 2^n can be
+// performed by a logical "or" with the bits representing 2^n
+
+// exp(R) = 1 + (exp(R) - 1)
+// P = exp(R) - 1 approximated by Taylor series of 3rd degree
+// P = A3*R^3 + A2*R^2 + R, A3 = 1/6, A2 = 1/2
+//
+
+// The final result is reconstructed as follows
+// expm1f(x) = T*P + (T - 1.0)
+
+// Special values
+//*********************************************************************
+// expm1f(+0) = +0.0
+// expm1f(-0) = -0.0
+
+// expm1f(+qnan) = +qnan
+// expm1f(-qnan) = -qnan
+// expm1f(+snan) = +qnan
+// expm1f(-snan) = -qnan
+
+// expm1f(-inf) = -1.0
+// expm1f(+inf) = +inf
+
+// Overflow and Underflow
+//*********************************************************************
+// expm1f(x) = largest single normal when
+// x = 88.7228 = 0x42b17217
+//
+// Underflow is handled as described in case 2 above.
+
+
+// Registers used
+//*********************************************************************
+// Floating Point registers used:
+// f8, input
+// f6,f7, f9 -> f15, f32 -> f45
+
+// General registers used:
+// r3, r20 -> r38
+
+// Predicate registers used:
+// p9 -> p15
+
+// Assembly macros
+//*********************************************************************
+// integer registers used
+// scratch
+rNJ = r3
+
+rExp_half = r20
+rSignexp_x = r21
+rExp_x = r22
+rExp_mask = r23
+rExp_bias = r24
+rTmp = r25
+rM1_lim = r25
+rGt_ln = r25
+rJ = r26
+rN = r27
+rTblAddr = r28
+rLn2Div64 = r29
+rRightShifter = r30
+r64DivLn2 = r31
+// stacked
+GR_SAVE_PFS = r32
+GR_SAVE_B0 = r33
+GR_SAVE_GP = r34
+GR_Parameter_X = r35
+GR_Parameter_Y = r36
+GR_Parameter_RESULT = r37
+GR_Parameter_TAG = r38
+
+// floating point registers used
+FR_X = f10
+FR_Y = f1
+FR_RESULT = f8
+// scratch
+fRightShifter = f6
+f64DivLn2 = f7
+fNormX = f9
+fNint = f10
+fN = f11
+fR = f12
+fLn2Div64 = f13
+fA2 = f14
+fA3 = f15
+// stacked
+fP = f32
+fX3 = f33
+fT = f34
+fMIN_SGL_OFLOW_ARG = f35
+fMAX_SGL_NORM_ARG = f36
+fMAX_SGL_MINUS_1_ARG = f37
+fA4 = f38
+fA43 = f38
+fA432 = f38
+fRSqr = f39
+fA5 = f40
+fTmp = f41
+fGt_pln = f41
+fXsq = f41
+fA7 = f42
+fA6 = f43
+fA65 = f43
+fTm1 = f44
+fA8 = f45
+fA87 = f45
+fA8765 = f45
+fA8765432 = f45
+fWre_urm_f8 = f45
+
+RODATA
+.align 16
+LOCAL_OBJECT_START(_expf_table)
+data8 0x3efa01a01a01a01a // A8 = 1/8!
+data8 0x3f2a01a01a01a01a // A7 = 1/7!
+data8 0x3f56c16c16c16c17 // A6 = 1/6!
+data8 0x3f81111111111111 // A5 = 1/5!
+data8 0x3fa5555555555555 // A4 = 1/4!
+data8 0x3fc5555555555555 // A3 = 1/3!
+//
+data4 0x42b17218 // Smallest sgl arg to overflow sgl result
+data4 0x42b17217 // Largest sgl arg to give sgl result
+//
+// 2^(j/64) table, j goes from 0 to 63
+data8 0x0000000000000000 // 2^(0/64)
+data8 0x00002C9A3E778061 // 2^(1/64)
+data8 0x000059B0D3158574 // 2^(2/64)
+data8 0x0000874518759BC8 // 2^(3/64)
+data8 0x0000B5586CF9890F // 2^(4/64)
+data8 0x0000E3EC32D3D1A2 // 2^(5/64)
+data8 0x00011301D0125B51 // 2^(6/64)
+data8 0x0001429AAEA92DE0 // 2^(7/64)
+data8 0x000172B83C7D517B // 2^(8/64)
+data8 0x0001A35BEB6FCB75 // 2^(9/64)
+data8 0x0001D4873168B9AA // 2^(10/64)
+data8 0x0002063B88628CD6 // 2^(11/64)
+data8 0x0002387A6E756238 // 2^(12/64)
+data8 0x00026B4565E27CDD // 2^(13/64)
+data8 0x00029E9DF51FDEE1 // 2^(14/64)
+data8 0x0002D285A6E4030B // 2^(15/64)
+data8 0x000306FE0A31B715 // 2^(16/64)
+data8 0x00033C08B26416FF // 2^(17/64)
+data8 0x000371A7373AA9CB // 2^(18/64)
+data8 0x0003A7DB34E59FF7 // 2^(19/64)
+data8 0x0003DEA64C123422 // 2^(20/64)
+data8 0x0004160A21F72E2A // 2^(21/64)
+data8 0x00044E086061892D // 2^(22/64)
+data8 0x000486A2B5C13CD0 // 2^(23/64)
+data8 0x0004BFDAD5362A27 // 2^(24/64)
+data8 0x0004F9B2769D2CA7 // 2^(25/64)
+data8 0x0005342B569D4F82 // 2^(26/64)
+data8 0x00056F4736B527DA // 2^(27/64)
+data8 0x0005AB07DD485429 // 2^(28/64)
+data8 0x0005E76F15AD2148 // 2^(29/64)
+data8 0x0006247EB03A5585 // 2^(30/64)
+data8 0x0006623882552225 // 2^(31/64)
+data8 0x0006A09E667F3BCD // 2^(32/64)
+data8 0x0006DFB23C651A2F // 2^(33/64)
+data8 0x00071F75E8EC5F74 // 2^(34/64)
+data8 0x00075FEB564267C9 // 2^(35/64)
+data8 0x0007A11473EB0187 // 2^(36/64)
+data8 0x0007E2F336CF4E62 // 2^(37/64)
+data8 0x00082589994CCE13 // 2^(38/64)
+data8 0x000868D99B4492ED // 2^(39/64)
+data8 0x0008ACE5422AA0DB // 2^(40/64)
+data8 0x0008F1AE99157736 // 2^(41/64)
+data8 0x00093737B0CDC5E5 // 2^(42/64)
+data8 0x00097D829FDE4E50 // 2^(43/64)
+data8 0x0009C49182A3F090 // 2^(44/64)
+data8 0x000A0C667B5DE565 // 2^(45/64)
+data8 0x000A5503B23E255D // 2^(46/64)
+data8 0x000A9E6B5579FDBF // 2^(47/64)
+data8 0x000AE89F995AD3AD // 2^(48/64)
+data8 0x000B33A2B84F15FB // 2^(49/64)
+data8 0x000B7F76F2FB5E47 // 2^(50/64)
+data8 0x000BCC1E904BC1D2 // 2^(51/64)
+data8 0x000C199BDD85529C // 2^(52/64)
+data8 0x000C67F12E57D14B // 2^(53/64)
+data8 0x000CB720DCEF9069 // 2^(54/64)
+data8 0x000D072D4A07897C // 2^(55/64)
+data8 0x000D5818DCFBA487 // 2^(56/64)
+data8 0x000DA9E603DB3285 // 2^(57/64)
+data8 0x000DFC97337B9B5F // 2^(58/64)
+data8 0x000E502EE78B3FF6 // 2^(59/64)
+data8 0x000EA4AFA2A490DA // 2^(60/64)
+data8 0x000EFA1BEE615A27 // 2^(61/64)
+data8 0x000F50765B6E4540 // 2^(62/64)
+data8 0x000FA7C1819E90D8 // 2^(63/64)
+LOCAL_OBJECT_END(_expf_table)
-#include "libm_support.h"
-
-
-GR_SAVE_B0 = r60
-GR_SAVE_PFS = r59
-GR_SAVE_GP = r61
-
-GR_Parameter_X = r62
-GR_Parameter_Y = r63
-GR_Parameter_RESULT = r64
-GR_Parameter_TAG = r65
-
-FR_X = f9
-FR_Y = f1
-FR_RESULT = f99
-
-
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
-
-.align 64
-Constants_exp_64_Arg:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_Arg,@object)
-data4 0x5C17F0BC,0xB8AA3B29,0x0000400B,0x00000000
-data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000
-data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000
-// /* Inv_L, L_hi, L_lo */
-ASM_SIZE_DIRECTIVE(Constants_exp_64_Arg)
-
-.align 64
-Constants_exp_64_Exponents:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_Exponents,@object)
-data4 0x0000007E,0x00000000,0xFFFFFF83,0xFFFFFFFF
-data4 0x000003FE,0x00000000,0xFFFFFC03,0xFFFFFFFF
-data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF
-data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF
-data4 0xFFFFFFE2,0xFFFFFFFF,0xFFFFFFC4,0xFFFFFFFF
-data4 0xFFFFFFBA,0xFFFFFFFF,0xFFFFFFBA,0xFFFFFFFF
-ASM_SIZE_DIRECTIVE(Constants_exp_64_Exponents)
-
-.align 64
-Constants_exp_64_A:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_A,@object)
-data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000
-data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000
-data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000
-// /* Reversed */
-ASM_SIZE_DIRECTIVE(Constants_exp_64_A)
-
-.align 64
-Constants_exp_64_P:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_P,@object)
-data4 0x43914A8A,0xD00D6C81,0x00003FF2,0x00000000
-data4 0x30304B30,0xB60BC4AC,0x00003FF5,0x00000000
-data4 0x7474C518,0x88888888,0x00003FF8,0x00000000
-data4 0x8DAE729D,0xAAAAAAAA,0x00003FFA,0x00000000
-data4 0xAAAAAF61,0xAAAAAAAA,0x00003FFC,0x00000000
-data4 0x000004C7,0x80000000,0x00003FFE,0x00000000
-// /* Reversed */
-ASM_SIZE_DIRECTIVE(Constants_exp_64_P)
-
-.align 64
-Constants_exp_64_Q:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_Q,@object)
-data4 0xA49EF6CA,0xD00D56F7,0x00003FEF,0x00000000
-data4 0x1C63493D,0xD00D59AB,0x00003FF2,0x00000000
-data4 0xFB50CDD2,0xB60B60B5,0x00003FF5,0x00000000
-data4 0x7BA68DC8,0x88888888,0x00003FF8,0x00000000
-data4 0xAAAAAC8D,0xAAAAAAAA,0x00003FFA,0x00000000
-data4 0xAAAAACCA,0xAAAAAAAA,0x00003FFC,0x00000000
-data4 0x00000000,0x80000000,0x00003FFE,0x00000000
-// /* Reversed */
-ASM_SIZE_DIRECTIVE(Constants_exp_64_Q)
-
-.align 64
-Constants_exp_64_T1:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_T1,@object)
-data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
-data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
-data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC
-data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D
-data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA
-data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516
-data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A
-data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4
-data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B
-data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD
-data4 0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15
-data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B
-data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5
-data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A
-data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177
-data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C
-ASM_SIZE_DIRECTIVE(Constants_exp_64_T1)
-
-.align 64
-Constants_exp_64_T2:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_T2,@object)
-data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
-data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
-data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
-data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
-data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
-data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
-data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
-data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
-data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
-data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
-data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
-data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
-data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
-data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
-data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
-data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37
-ASM_SIZE_DIRECTIVE(Constants_exp_64_T2)
-
-.align 64
-Constants_exp_64_W1:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_W1,@object)
-data4 0x00000000,0x00000000,0x171EC4B4,0xBE384454
-data4 0x4AA72766,0xBE694741,0xD42518F8,0xBE5D32B6
-data4 0x3A319149,0x3E68D96D,0x62415F36,0xBE68F4DA
-data4 0xC9C86A3B,0xBE6DDA2F,0xF49228FE,0x3E6B2E50
-data4 0x1188B886,0xBE49C0C2,0x1A4C2F1F,0x3E64BFC2
-data4 0x2CB98B54,0xBE6A2FBB,0x9A55D329,0x3E5DC5DE
-data4 0x39A7AACE,0x3E696490,0x5C66DBA5,0x3E54728B
-data4 0xBA1C7D7D,0xBE62B0DB,0x09F1AF5F,0x3E576E04
-data4 0x1A0DD6A1,0x3E612500,0x795FBDEF,0xBE66A419
-data4 0xE1BD41FC,0xBE5CDE8C,0xEA54964F,0xBE621376
-data4 0x476E76EE,0x3E6370BE,0x3427EB92,0x3E390D1A
-data4 0x2BF82BF8,0x3E1336DE,0xD0F7BD9E,0xBE5FF1CB
-data4 0x0CEB09DD,0xBE60A355,0x0980F30D,0xBE5CA37E
-data4 0x4C082D25,0xBE5C541B,0x3B467D29,0xBE5BBECA
-data4 0xB9D946C5,0xBE400D8A,0x07ED374A,0xBE5E2A08
-data4 0x365C8B0A,0xBE66CB28,0xD3403BCA,0x3E3AAD5B
-data4 0xC7EA21E0,0x3E526055,0xE72880D6,0xBE442C75
-data4 0x85222A43,0x3E58B2BB,0x522C42BF,0xBE5AAB79
-data4 0x469DC2BC,0xBE605CB4,0xA48C40DC,0xBE589FA7
-data4 0x1AA42614,0xBE51C214,0xC37293F4,0xBE48D087
-data4 0xA2D673E0,0x3E367A1C,0x114F7A38,0xBE51BEBB
-data4 0x661A4B48,0xBE6348E5,0x1D3B9962,0xBDF52643
-data4 0x35A78A53,0x3E3A3B5E,0x1CECD788,0xBE46C46C
-data4 0x7857D689,0xBE60B7EC,0xD14F1AD7,0xBE594D3D
-data4 0x4C9A8F60,0xBE4F9C30,0x02DFF9D2,0xBE521873
-data4 0x55E6D68F,0xBE5E4C88,0x667F3DC4,0xBE62140F
-data4 0x3BF88747,0xBE36961B,0xC96EC6AA,0x3E602861
-data4 0xD57FD718,0xBE3B5151,0xFC4A627B,0x3E561CD0
-data4 0xCA913FEA,0xBE3A5217,0x9A5D193A,0x3E40A3CC
-data4 0x10A9C312,0xBE5AB713,0xC5F57719,0x3E4FDADB
-data4 0xDBDF59D5,0x3E361428,0x61B4180D,0x3E5DB5DB
-data4 0x7408D856,0xBE42AD5F,0x31B2B707,0x3E2A3148
-ASM_SIZE_DIRECTIVE(Constants_exp_64_W1)
-
-.align 64
-Constants_exp_64_W2:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_W2,@object)
-data4 0x00000000,0x00000000,0x37A3D7A2,0xBE641F25
-data4 0xAD028C40,0xBE68DD57,0xF212B1B6,0xBE5C77D8
-data4 0x1BA5B070,0x3E57878F,0x2ECAE6FE,0xBE55A36A
-data4 0x569DFA3B,0xBE620608,0xA6D300A3,0xBE53B50E
-data4 0x223F8F2C,0x3E5B5EF2,0xD6DE0DF4,0xBE56A0D9
-data4 0xEAE28F51,0xBE64EEF3,0x367EA80B,0xBE5E5AE2
-data4 0x5FCBC02D,0x3E47CB1A,0x9BDAFEB7,0xBE656BA0
-data4 0x805AFEE7,0x3E6E70C6,0xA3415EBA,0xBE6E0509
-data4 0x49BFF529,0xBE56856B,0x00508651,0x3E66DD33
-data4 0xC114BC13,0x3E51165F,0xC453290F,0x3E53333D
-data4 0x05539FDA,0x3E6A072B,0x7C0A7696,0xBE47CD87
-data4 0xEB05C6D9,0xBE668BF4,0x6AE86C93,0xBE67C3E3
-data4 0xD0B3E84B,0xBE533904,0x556B53CE,0x3E63E8D9
-data4 0x63A98DC8,0x3E212C89,0x032A7A22,0xBE33138F
-data4 0xBC584008,0x3E530FA9,0xCCB93C97,0xBE6ADF82
-data4 0x8370EA39,0x3E5F9113,0xFB6A05D8,0x3E5443A4
-data4 0x181FEE7A,0x3E63DACD,0xF0F67DEC,0xBE62B29D
-data4 0x3DDE6307,0x3E65C483,0xD40A24C1,0x3E5BF030
-data4 0x14E437BE,0x3E658B8F,0xED98B6C7,0xBE631C29
-data4 0x04CF7C71,0x3E6335D2,0xE954A79D,0x3E529EED
-data4 0xF64A2FB8,0x3E5D9257,0x854ED06C,0xBE6BED1B
-data4 0xD71405CB,0x3E5096F6,0xACB9FDF5,0xBE3D4893
-data4 0x01B68349,0xBDFEB158,0xC6A463B9,0x3E628D35
-data4 0xADE45917,0xBE559725,0x042FC476,0xBE68C29C
-data4 0x01E511FA,0xBE67593B,0x398801ED,0xBE4A4313
-data4 0xDA7C3300,0x3E699571,0x08062A9E,0x3E5349BE
-data4 0x755BB28E,0x3E5229C4,0x77A1F80D,0x3E67E426
-data4 0x6B69C352,0xBE52B33F,0x084DA57F,0xBE6B3550
-data4 0xD1D09A20,0xBE6DB03F,0x2161B2C1,0xBE60CBC4
-data4 0x78A2B771,0x3E56ED9C,0x9D0FA795,0xBE508E31
-data4 0xFD1A54E9,0xBE59482A,0xB07FD23E,0xBE2A17CE
-data4 0x17365712,0x3E68BF5C,0xB3785569,0x3E3956F9
-ASM_SIZE_DIRECTIVE(Constants_exp_64_W2)
.section .text
-.proc expm1f#
-.global expm1f#
-.align 64
-
-expm1f:
-#ifdef _LIBC
-.global __expm1f#
-__expm1f:
-#endif
-
+GLOBAL_IEEE754_ENTRY(expm1f)
-{ .mii
- alloc r32 = ar.pfs,0,30,4,0
-(p0) add r33 = 1, r0
-(p0) cmp.eq.unc p7, p0 = r0, r0
-}
-;;
-
-//
-// Set p7 true for expm1
-// Set Flag = r33 = 1 for expm1
-// These are really no longer necesary, but are a remnant
-// when this file had multiple entry points.
-// They should be carefully removed
-
-
-{ .mfi
-(p0) add r32 = 0,r0
-(p0) fnorm.s1 f9 = f8
- nop.i 0
-}
-
-{ .mfi
- nop.m 0
-//
-// Set p7 false for exp
-// Set Flag = r33 = 0 for exp
-//
-(p0) fclass.m.unc p6, p8 = f8, 0x1E7
- nop.i 0 ;;
+{ .mlx
+ getf.exp rSignexp_x = f8 // Must recompute if x unorm
+ movl r64DivLn2 = 0x40571547652B82FE // 64/ln(2)
}
-
-{ .mfi
- nop.m 999
-(p0) fclass.nm.unc p9, p0 = f8, 0x1FF
- nop.i 0
+{ .mlx
+ addl rTblAddr = @ltoff(_expf_table),gp
+ movl rRightShifter = 0x43E8000000000000 // DP Right Shifter
}
+;;
{ .mfi
- nop.m 999
-(p0) mov f36 = f1
- nop.i 999 ;;
-}
-
-//
-// Identify NatVals, NaNs, Infs, and Zeros.
-// Identify EM unsupporteds.
-// Save special input registers
-//
-// Create FR_X_cor = 0.0
-// GR_Flag = 0
-// GR_Expo_Range = 0 (r32) for single precision
-// FR_Scale = 1.0
-//
-
-{ .mfb
- nop.m 999
-(p0) mov f32 = f0
-(p6) br.cond.spnt EXPF_64_SPECIAL ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p9) br.cond.spnt EXPF_64_UNSUPPORTED ;;
+ // point to the beginning of the table
+ ld8 rTblAddr = [rTblAddr]
+ fclass.m p14, p0 = f8 , 0x22 // test for -INF
+ mov rExp_mask = 0x1ffff // Exponent mask
}
-
-//
-// Branch out for special input values
-//
-
{ .mfi
-(p0) cmp.ne.unc p12, p13 = 0x01, r33
-(p0) fcmp.lt.unc.s0 p9,p0 = f8, f0
-(p0) cmp.eq.unc p15, p0 = r0, r0
-}
-
-//
-// Raise possible denormal operand exception
-// Normalize x
-//
-// This function computes expf( x + x_cor)
-// Input FR 1: FR_X
-// Input FR 2: FR_X_cor
-// Input GR 1: GR_Flag
-// Input GR 2: GR_Expo_Range
-// Output FR 3: FR_Y_hi
-// Output FR 4: FR_Y_lo
-// Output FR 5: FR_Scale
-// Output PR 1: PR_Safe
-
-//
-// Prepare to load constants
-// Set Safe = True
-//
-
-{ .mmi
-(p0) addl r34 = @ltoff(Constants_exp_64_Arg#),gp
-(p0) addl r40 = @ltoff(Constants_exp_64_W1#),gp
-(p0) addl r41 = @ltoff(Constants_exp_64_W2#),gp
-};;
-
-{ .mmi
- ld8 r34 = [r34]
- ld8 r40 = [r40]
-(p0) addl r50 = @ltoff(Constants_exp_64_T1#), gp
+ nop.m 0
+ fnorm.s1 fNormX = f8 // normalized x
+ nop.i 0
}
;;
-{ .mmi
- ld8 r41 = [r41]
-(p0) ldfe f37 = [r34],16
-(p0) addl r51 = @ltoff(Constants_exp_64_T2#), gp
-}
-;;
-//
-// N = fcvt.fx(float_N)
-// Set p14 if -6 > expo_X
-//
-//
-// Bias = 0x0FFFF
-// expo_X = expo_X and Mask
-//
-{ .mmi
- ld8 r50 = [r50]
-(p0) ldfe f40 = [r34],16
- nop.i 999
+{ .mfi
+ setf.d f64DivLn2 = r64DivLn2 // load 64/ln(2) to FP reg
+ fclass.m p9, p0 = f8 , 0x0b // test for x unorm
+ mov rExp_bias = 0xffff // Exponent bias
}
-;;
-
-{ .mlx
- nop.m 999
-(p0) movl r58 = 0x0FFFF
-};;
-
-//
-// Load W2_ptr
-// Branch to SMALL is expo_X < -6
-//
-//
-// float_N = X * L_Inv
-// expo_X = exponent of X
-// Mask = 0x1FFFF
-//
-
-{ .mmi
- ld8 r51 = [r51]
-(p0) ldfe f41 = [r34],16
-//
-// float_N = X * L_Inv
-// expo_X = exponent of X
-// Mask = 0x1FFFF
-//
- nop.i 0
-};;
-
{ .mlx
-(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#), gp
-(p0) movl r39 = 0x1FFFF
+ // load Right Shifter to FP reg
+ setf.d fRightShifter = rRightShifter
+ movl rLn2Div64 = 0x3F862E42FEFA39EF // DP ln(2)/64 in GR
}
;;
-{ .mmi
- ld8 r34 = [r34]
-(p0) getf.exp r37 = f9
- nop.i 999
-}
-;;
-
-{ .mii
- nop.m 999
- nop.i 999
-(p0) and r37 = r37, r39 ;;
-}
-
-{ .mmi
-(p0) sub r37 = r37, r58 ;;
-(p0) cmp.gt.unc p14, p0 = -6, r37
-(p0) cmp.lt.unc p10, p0 = 14, r37 ;;
-}
-
{ .mfi
- nop.m 999
-//
-// Load L_inv
-// Set p12 true for Flag = 0 (exp)
-// Set p13 true for Flag = 1 (expm1)
-//
-(p0) fmpy.s1 f38 = f9, f37
- nop.i 999 ;;
+ ldfpd fA8, fA7 = [rTblAddr], 16
+ fcmp.eq.s1 p13, p0 = f0, f8 // test for x = 0.0
+ mov rExp_half = 0xfffe
}
-
{ .mfb
- nop.m 999
-//
-// Load L_hi
-// expo_X = expo_X - Bias
-// get W1_ptr
-//
-(p0) fcvt.fx.s1 f39 = f38
-(p14) br.cond.spnt EXPF_SMALL ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p10) br.cond.spnt EXPF_HUGE ;;
-}
-
-{ .mmi
-(p0) shladd r34 = r32,4,r34
-(p0) addl r35 = @ltoff(Constants_exp_64_A#),gp
- nop.i 999
+ setf.d fLn2Div64 = rLn2Div64 // load ln(2)/64 to FP reg
+ nop.f 0
+(p9) br.cond.spnt EXPM1_UNORM // Branch if x unorm
}
;;
-{ .mmi
- ld8 r35 = [r35]
- nop.m 999
- nop.i 999
+EXPM1_COMMON:
+{ .mfb
+ ldfpd fA6, fA5 = [rTblAddr], 16
+(p14) fms.s.s0 f8 = f0, f0, f1 // result if x = -inf
+(p14) br.ret.spnt b0 // exit here if x = -inf
}
;;
-//
-// Load T_1,T_2
-//
-
-{ .mmb
-(p0) ldfe f51 = [r35],16
-(p0) ld8 r45 = [r34],8
- nop.b 999 ;;
-}
-//
-// Set Safe = True if k >= big_expo_neg
-// Set Safe = False if k < big_expo_neg
-//
-
-{ .mmb
-(p0) ldfe f49 = [r35],16
-(p0) ld8 r48 = [r34],0
- nop.b 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// Branch to HUGE is expo_X > 14
-//
-(p0) fcvt.xf f38 = f39
- nop.i 999 ;;
-}
-
-{ .mfi
-(p0) getf.sig r52 = f39
- nop.f 999
- nop.i 999 ;;
-}
-
-{ .mii
- nop.m 999
-(p0) extr.u r43 = r52, 6, 6 ;;
-//
-// r = r - float_N * L_lo
-// K = extr(N_fix,12,52)
-//
-(p0) shladd r40 = r43,3,r40 ;;
-}
-
-{ .mfi
-(p0) shladd r50 = r43,2,r50
-(p0) fnma.s1 f42 = f40, f38, f9
-//
-// float_N = float(N)
-// N_fix = signficand N
-//
-(p0) extr.u r42 = r52, 0, 6
-}
-
-{ .mmi
-(p0) ldfd f43 = [r40],0 ;;
-(p0) shladd r41 = r42,3,r41
-(p0) shladd r51 = r42,2,r51
-}
-//
-// W_1_p1 = 1 + W_1
-//
-
-{ .mmi
-(p0) ldfs f44 = [r50],0 ;;
-(p0) ldfd f45 = [r41],0
-//
-// M_2 = extr(N_fix,0,6)
-// M_1 = extr(N_fix,6,6)
-// r = X - float_N * L_hi
-//
-(p0) extr r44 = r52, 12, 52
-}
-
-{ .mmi
-(p0) ldfs f46 = [r51],0 ;;
-(p0) sub r46 = r58, r44
-(p0) cmp.gt.unc p8, p15 = r44, r45
-}
-//
-// W = W_1 + W_1_p1*W_2
-// Load A_2
-// Bias_m_K = Bias - K
-//
-
-{ .mii
-(p0) ldfe f40 = [r35],16
-//
-// load A_1
-// poly = A_2 + r*A_3
-// rsq = r * r
-// neg_2_mK = exponent of Bias_m_k
-//
-(p0) add r47 = r58, r44 ;;
-//
-// Set Safe = True if k <= big_expo_pos
-// Set Safe = False if k > big_expo_pos
-// Load A_3
-//
-(p15) cmp.lt p8,p15 = r44,r48 ;;
-}
-
-{ .mmf
-(p0) setf.exp f61 = r46
-//
-// Bias_p + K = Bias + K
-// T = T_1 * T_2
-//
-(p0) setf.exp f36 = r47
-(p0) fnma.s1 f42 = f41, f38, f42 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// Load W_1,W_2
-// Load big_exp_pos, load big_exp_neg
-//
-(p0) fadd.s1 f47 = f43, f1
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 f52 = f42, f51, f49
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p0) fmpy.s1 f48 = f42, f42
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fmpy.s1 f53 = f44, f46
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 f54 = f45, f47, f43
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p0) fneg f61 = f61
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 f52 = f42, f52, f40
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fadd.s1 f55 = f54, f1
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-//
-// W + Wp1 * poly
-//
-(p0) mov f34 = f53
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// A_1 + r * poly
-// Scale = setf_expf(Bias_p_k)
-//
-(p0) fma.s1 f52 = f48, f52, f42
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// poly = r + rsq(A_1 + r*poly)
-// Wp1 = 1 + W
-// neg_2_mK = -neg_2_mK
-//
-(p0) fma.s1 f35 = f55, f52, f54
- nop.i 999 ;;
-}
-
{ .mfb
- nop.m 999
-(p0) fmpy.s1 f35 = f35, f53
-//
-// Y_hi = T
-// Y_lo = T * (W + Wp1*poly)
-//
-(p12) br.cond.sptk EXPF_MAIN ;;
-}
-//
-// Branch if expf(x)
-// Continue for expf(x-1)
-//
-
-{ .mii
-(p0) cmp.lt.unc p12, p13 = 10, r44
- nop.i 999 ;;
-//
-// Set p12 if 10 < K, Else p13
-//
-(p13) cmp.gt.unc p13, p14 = -10, r44 ;;
+ ldfpd fA4, fA3 = [rTblAddr], 16
+ fclass.m p15, p0 = f8 , 0x1e1 // test for NaT,NaN,+Inf
+(p13) br.ret.spnt b0 // exit here if x =0.0, result is x
}
-//
-// K > 10: Y_lo = Y_lo + neg_2_mK
-// K <=10: Set p13 if -10 > K, Else set p14
-//
+;;
{ .mfi
-(p13) cmp.eq p15, p0 = r0, r0
-(p14) fadd.s1 f34 = f61, f34
- nop.i 999 ;;
+ // overflow thresholds
+ ldfps fMIN_SGL_OFLOW_ARG, fMAX_SGL_NORM_ARG = [rTblAddr], 8
+ fma.s1 fXsq = fNormX, fNormX, f0 // x^2 for small path
+ and rExp_x = rExp_mask, rSignexp_x // Biased exponent of x
}
-
-{ .mfi
- nop.m 999
-(p12) fadd.s1 f35 = f35, f61
- nop.i 999 ;;
+{ .mlx
+ nop.m 0
+ movl rM1_lim = 0xc1c00000 // Minus -1 limit (-24.0), SP
}
+;;
{ .mfi
- nop.m 999
-(p13) fadd.s1 f35 = f35, f34
- nop.i 999
+ setf.exp fA2 = rExp_half
+ // x*(64/ln(2)) + Right Shifter
+ fma.s1 fNint = fNormX, f64DivLn2, fRightShifter
+ sub rExp_x = rExp_x, rExp_bias // True exponent of x
}
-
{ .mfb
- nop.m 999
-//
-// K <= 10 and K < -10, Set Safe = True
-// K <= 10 and K < 10, Y_lo = Y_hi + Y_lo
-// K <= 10 and K > =-10, Y_hi = Y_hi + neg_2_mk
-//
-(p13) mov f34 = f61
-(p0) br.cond.sptk EXPF_MAIN ;;
-}
-EXPF_SMALL:
-{ .mmi
-(p12) addl r35 = @ltoff(Constants_exp_64_P#), gp
-(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#), gp
- nop.i 999
-}
-;;
-
-{ .mmi
-(p12) ld8 r35 = [r35]
- ld8 r34 = [r34]
- nop.i 999
+ nop.m 0
+(p15) fma.s.s0 f8 = f8, f1, f0 // result if x = NaT,NaN,+Inf
+(p15) br.ret.spnt b0 // exit here if x = NaT,NaN,+Inf
}
;;
-
-{ .mmi
-(p13) addl r35 = @ltoff(Constants_exp_64_Q#), gp
- nop.m 999
- nop.i 999
-}
-;;
-
-
-//
-// Return
-// K <= 10 and K < 10, Y_hi = neg_2_mk
-//
-// /*******************************************************/
-// /*********** Branch EXP_SMALL *************************/
-// /*******************************************************/
-
{ .mfi
-(p13) ld8 r35 = [r35]
-(p0) mov f42 = f9
-(p0) add r34 = 0x48,r34
+ setf.s fMAX_SGL_MINUS_1_ARG = rM1_lim // -1 threshold, -24.0
+ nop.f 0
+ cmp.gt p7, p8 = -2, rExp_x // Test |x| < 2^(-2)
}
;;
-//
-// Flag = 0
-// r4 = rsq * rsq
-//
-
{ .mfi
-(p0) ld8 r49 =[r34],0
- nop.f 999
- nop.i 999 ;;
-}
-
-{ .mii
- nop.m 999
- nop.i 999 ;;
-//
-// Flag = 1
-//
-(p0) cmp.lt.unc p14, p0 = r37, r49 ;;
+(p7) cmp.gt.unc p6, p7 = -40, rExp_x // Test |x| < 2^(-40)
+ fma.s1 fA87 = fA8, fNormX, fA7 // Small path, A8*x+A7
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-//
-// r = X
-//
-(p0) fmpy.s1 f48 = f42, f42
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fA65 = fA6, fNormX, fA5 // Small path, A6*x+A5
+ nop.i 0
}
+;;
{ .mfb
- nop.m 999
-//
-// rsq = r * r
-//
-(p0) fmpy.s1 f50 = f48, f48
-//
-// Is input very small?
-//
-(p14) br.cond.spnt EXPF_VERY_SMALL ;;
-}
-//
-// Flag_not1: Y_hi = 1.0
-// Flag is 1: r6 = rsq * r4
-//
-
-{ .mfi
-(p12) ldfe f52 = [r35],16
-(p12) mov f34 = f1
-(p0) add r53 = 0x1,r0 ;;
-}
-
-{ .mfi
-(p13) ldfe f51 = [r35],16
-//
-// Flag_not_1: Y_lo = poly_hi + r4 * poly_lo
-//
-(p13) mov f34 = f9
- nop.i 999 ;;
-}
-
-{ .mmf
-(p12) ldfe f53 = [r35],16
-//
-// For Flag_not_1, Y_hi = X
-// Scale = 1
-// Create 0x000...01
-//
-(p0) setf.sig f37 = r53
-(p0) mov f36 = f1 ;;
-}
-
-{ .mmi
-(p13) ldfe f52 = [r35],16 ;;
-(p12) ldfe f54 = [r35],16
- nop.i 999 ;;
-}
-
-{ .mfi
-(p13) ldfe f53 = [r35],16
-(p13) fmpy.s1 f58 = f48, f50
- nop.i 999 ;;
-}
-//
-// Flag_not1: poly_lo = P_5 + r*P_6
-// Flag_1: poly_lo = Q_6 + r*Q_7
-//
-
-{ .mmi
-(p13) ldfe f54 = [r35],16 ;;
-(p12) ldfe f55 = [r35],16
- nop.i 999 ;;
-}
-
-{ .mmi
-(p12) ldfe f56 = [r35],16 ;;
-(p13) ldfe f55 = [r35],16
- nop.i 999 ;;
-}
-
-{ .mmi
-(p12) ldfe f57 = [r35],0 ;;
-(p13) ldfe f56 = [r35],16
- nop.i 999 ;;
-}
-
-{ .mfi
-(p13) ldfe f57 = [r35],0
- nop.f 999
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// For Flag_not_1, load p5,p6,p1,p2
-// Else load p5,p6,p1,p2
-//
-(p12) fma.s1 f60 = f52, f42, f53
- nop.i 999 ;;
+ nop.m 0
+(p6) fma.s.s0 f8 = f8, f8, f8 // If x < 2^-40, result=x+x*x
+(p6) br.ret.spnt b0 // Exit if x < 2^-40
}
+;;
{ .mfi
- nop.m 999
-(p13) fma.s1 f60 = f51, f42, f52
- nop.i 999 ;;
+ nop.m 0
+ // check for overflow
+ fcmp.gt.s1 p15, p14 = fNormX, fMIN_SGL_OFLOW_ARG
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p12) fma.s1 f60 = f60, f42, f54
- nop.i 999 ;;
+ nop.m 0
+ fms.s1 fN = fNint, f1, fRightShifter // n in FP register
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p12) fma.s1 f59 = f56, f42, f57
- nop.i 999 ;;
+ nop.m 0
+(p7) fma.s1 fA43 = fA4, fNormX, fA3 // Small path, A4*x+A3
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p13) fma.s1 f60 = f42, f60, f53
- nop.i 999 ;;
+ getf.sig rNJ = fNint // bits of n, j
+(p7) fma.s1 fA8765 = fA87, fXsq, fA65 // Small path, A87*xsq+A65
+ nop.i 0
}
-
-{ .mfi
- nop.m 999
-(p12) fma.s1 f59 = f59, f48, f42
- nop.i 999 ;;
+{ .mfb
+ nop.m 0
+(p7) fma.s1 fX3 = fXsq, fNormX, f0 // Small path, x^3
+ // branch out if overflow
+(p15) br.cond.spnt EXPM1_CERTAIN_OVERFLOW
}
+;;
{ .mfi
- nop.m 999
-//
-// Flag_1: poly_lo = Q_5 + r*(Q_6 + r*Q_7)
-// Flag_not1: poly_lo = P_4 + r*(P_5 + r*P_6)
-// Flag_not1: poly_hi = (P_1 + r*P_2)
-//
-(p13) fmpy.s1 f60 = f60, f58
- nop.i 999 ;;
+ addl rN = 0xffff-63, rNJ // biased and shifted n
+ fnma.s1 fR = fLn2Div64, fN, fNormX // R = x - N*ln(2)/64
+ extr.u rJ = rNJ , 0 , 6 // bits of j
}
+;;
{ .mfi
- nop.m 999
-(p12) fma.s1 f60 = f60, f42, f55
- nop.i 999 ;;
+ shladd rJ = rJ, 3, rTblAddr // address in the 2^(j/64) table
+ // check for certain -1
+ fcmp.le.s1 p13, p0 = fNormX, fMAX_SGL_MINUS_1_ARG
+ shr rN = rN, 6 // biased n
}
-
{ .mfi
- nop.m 999
-//
-// Flag_1: poly_lo = r6 *(Q_5 + ....)
-// Flag_not1: poly_hi = r + rsq *(P_1 + r*P_2)
-//
-(p12) fma.s1 f35 = f60, f50, f59
- nop.i 999
+ nop.m 0
+(p7) fma.s1 fA432 = fA43, fNormX, fA2 // Small path, A43*x+A2
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p13) fma.s1 f59 = f54, f42, f55
- nop.i 999 ;;
+ ld8 rJ = [rJ]
+ nop.f 0
+ shl rN = rN , 52 // 2^n bits in DP format
}
+;;
-{ .mfi
- nop.m 999
-//
-// Flag_not1: Y_lo = rsq* poly_hi + poly_lo
-// Flag_1: poly_lo = rsq* poly_hi + poly_lo
-//
-(p13) fma.s1 f59 = f59, f42, f56
- nop.i 999 ;;
+{ .mmi
+ or rN = rN, rJ // bits of 2^n * 2^(j/64) in DP format
+(p13) mov rTmp = 1 // Make small value for -1 path
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-//
-// Flag_not_1: (P_1 + r*P_2)
-//
-(p13) fma.s1 f59 = f59, f42, f57
- nop.i 999 ;;
+ setf.d fT = rN // 2^n
+ // check for possible overflow (only happens if input higher precision)
+(p14) fcmp.gt.s1 p14, p0 = fNormX, fMAX_SGL_NORM_ARG
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-//
-// Flag_not_1: poly_hi = r + rsq * (P_1 + r*P_2)
-//
-(p13) fma.s1 f35 = f59, f48, f60
- nop.i 999 ;;
+ nop.m 0
+(p7) fma.s1 fA8765432 = fA8765, fX3, fA432 // A8765*x^3+A432
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-//
-// Create 0.000...01
-//
-(p0) for f37 = f35, f37
- nop.i 999 ;;
+(p13) setf.exp fTmp = rTmp // Make small value for -1 path
+ fma.s1 fP = fA3, fR, fA2 // A3*R + A2
+ nop.i 0
}
-
{ .mfb
- nop.m 999
-//
-// Set lsb of Y_lo to 1
-//
-(p0) fmerge.se f35 = f35,f37
-(p0) br.cond.sptk EXPF_MAIN ;;
-}
-EXPF_VERY_SMALL:
-
-{ .mmi
- nop.m 999
-(p13) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp
- nop.i 999;;
-}
-
-{ .mfi
-(p13) ld8 r34 = [r34];
-(p12) mov f35 = f9
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fRSqr = fR, fR, f0 // R^2
+(p13) br.cond.spnt EXPM1_CERTAIN_MINUS_ONE // Branch if x < -24.0
}
+;;
{ .mfb
- nop.m 999
-(p12) mov f34 = f1
-(p12) br.cond.sptk EXPF_MAIN ;;
-}
-
-{ .mlx
-(p13) add r34 = 8,r34
-(p13) movl r39 = 0x0FFFE ;;
-}
-//
-// Load big_exp_neg
-// Create 1/2's exponent
-//
-
-{ .mii
-(p13) setf.exp f56 = r39
-(p13) shladd r34 = r32,4,r34 ;;
- nop.i 999
-}
-//
-// Negative exponents are stored after positive
-//
-
-{ .mfi
-(p13) ld8 r45 = [r34],0
-//
-// Y_hi = x
-// Scale = 1
-//
-(p13) fmpy.s1 f35 = f9, f9
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// Reset Safe if necessary
-// Create 1/2
-//
-(p13) mov f34 = f9
- nop.i 999 ;;
+ nop.m 0
+(p7) fma.s.s0 f8 = fA8765432, fXsq, fNormX // Small path,
+ // result=xsq*A8765432+x
+(p7) br.ret.spnt b0 // Exit if 2^-40 <= |x| < 2^-2
}
+;;
{ .mfi
-(p13) cmp.lt.unc p0, p15 = r37, r45
-(p13) mov f36 = f1
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fP = fP, fRSqr, fR // P = (A3*R + A2)*Rsqr + R
+ nop.i 0
}
+;;
{ .mfb
- nop.m 999
-//
-// Y_lo = x * x
-//
-(p13) fmpy.s1 f35 = f35, f56
-//
-// Y_lo = x*x/2
-//
-(p13) br.cond.sptk EXPF_MAIN ;;
-}
-EXPF_HUGE:
-
-{ .mfi
- nop.m 999
-(p0) fcmp.gt.unc.s1 p14, p0 = f9, f0
- nop.i 999
-}
-
-{ .mlx
- nop.m 999
-(p0) movl r39 = 0x15DC0 ;;
-}
-
-{ .mfi
-(p14) setf.exp f34 = r39
-(p14) mov f35 = f1
-(p14) cmp.eq p0, p15 = r0, r0 ;;
+ nop.m 0
+ fms.s1 fTm1 = fT, f1, f1 // T - 1.0
+(p14) br.cond.spnt EXPM1_POSSIBLE_OVERFLOW
}
+;;
{ .mfb
- nop.m 999
-(p14) mov f36 = f34
-//
-// If x > 0, Set Safe = False
-// If x > 0, Y_hi = 2**(24,000)
-// If x > 0, Y_lo = 1.0
-// If x > 0, Scale = 2**(24,000)
-//
-(p14) br.cond.sptk EXPF_MAIN ;;
-}
-
-{ .mlx
- nop.m 999
-(p12) movl r39 = 0xA240
-}
-
-{ .mlx
- nop.m 999
-(p12) movl r38 = 0xA1DC ;;
-}
-
-{ .mmb
-(p13) cmp.eq p15, p14 = r0, r0
-(p12) setf.exp f34 = r39
- nop.b 999 ;;
-}
-
-{ .mlx
-(p12) setf.exp f35 = r38
-(p13) movl r39 = 0xFF9C
-}
-
-{ .mfi
- nop.m 999
-(p13) fsub.s1 f34 = f0, f1
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p12) mov f36 = f34
-(p12) cmp.eq p0, p15 = r0, r0 ;;
-}
-
-{ .mfi
-(p13) setf.exp f35 = r39
-(p13) mov f36 = f1
- nop.i 999 ;;
-}
-EXPF_MAIN:
-
-{ .mfi
-(p0) cmp.ne.unc p12, p0 = 0x01, r33
-(p0) fmpy.s1 f101 = f36, f35
- nop.i 999 ;;
+ nop.m 0
+ fma.s.s0 f8 = fP, fT, fTm1
+ br.ret.sptk b0 // Result for main path
+ // minus_one_limit < x < -2^-2
+ // and +2^-2 <= x < overflow_limit
}
+;;
+// Here if x unorm
+EXPM1_UNORM:
{ .mfb
- nop.m 999
-(p0) fma.s.s0 f99 = f34, f36, f101
-(p15) br.cond.sptk EXPF_64_RETURN ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fsetc.s3 0x7F,0x01
- nop.i 999
-}
-
-{ .mlx
- nop.m 999
-(p0) movl r50 = 0x0000000001007F ;;
-}
-//
-// S0 user supplied status
-// S2 user supplied status + WRE + TD (Overflows)
-// S3 user supplied status + RZ + TD (Underflows)
-//
-//
-// If (Safe) is true, then
-// Compute result using user supplied status field.
-// No overflow or underflow here, but perhaps inexact.
-// Return
-// Else
-// Determine if overflow or underflow was raised.
-// Fetch +/- overflow threshold for IEEE single, double,
-// double extended
-//
-
-{ .mfi
-(p0) setf.exp f60 = r50
-(p0) fma.s.s3 f102 = f34, f36, f101
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p0) fsetc.s3 0x7F,0x40
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// For Safe, no need to check for over/under.
-// For expm1, handle errors like exp.
-//
-(p0) fsetc.s2 0x7F,0x42
- nop.i 999;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s.s2 f100 = f34, f36, f101
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fsetc.s2 0x7F,0x40
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p7) fclass.m.unc p12, p0 = f102, 0x00F
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p11, p0 = f102, 0x00F
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p7) fcmp.ge.unc.s1 p10, p0 = f100, f60
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-//
-// Create largest double exponent + 1.
-// Create smallest double exponent - 1.
-//
-(p0) fcmp.ge.unc.s1 p8, p0 = f100, f60
- nop.i 999 ;;
-}
-//
-// fcmp: resultS2 >= + overflow threshold -> set (a) if true
-// fcmp: resultS2 <= - overflow threshold -> set (b) if true
-// fclass: resultS3 is denorm/unorm/0 -> set (d) if true
-//
-
-{ .mib
-(p10) mov GR_Parameter_TAG = 43
- nop.i 999
-(p10) br.cond.sptk __libm_error_region ;;
-}
-
-{ .mib
-(p8) mov GR_Parameter_TAG = 16
- nop.i 999
-(p8) br.cond.sptk __libm_error_region ;;
+ getf.exp rSignexp_x = fNormX // Must recompute if x unorm
+ fcmp.eq.s0 p6, p0 = f8, f0 // Set D flag
+ br.cond.sptk EXPM1_COMMON
}
-//
-// Report that exp overflowed
-//
-
-{ .mib
-(p12) mov GR_Parameter_TAG = 44
- nop.i 999
-(p12) br.cond.sptk __libm_error_region ;;
-}
-
-{ .mib
-(p11) mov GR_Parameter_TAG = 17
- nop.i 999
-(p11) br.cond.sptk __libm_error_region ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-//
-// Report that exp underflowed
-//
-(p0) br.cond.sptk EXPF_64_RETURN ;;
-}
-EXPF_64_SPECIAL:
+;;
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p6, p0 = f8, 0x0c3
- nop.i 999
+// here if result will be -1 and inexact, x <= -24.0
+EXPM1_CERTAIN_MINUS_ONE:
+{ .mfb
+ nop.m 0
+ fms.s.s0 f8 = fTmp, fTmp, f1 // Result -1, and Inexact set
+ br.ret.sptk b0
}
+;;
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p13, p8 = f8, 0x007
- nop.i 999 ;;
-}
+EXPM1_POSSIBLE_OVERFLOW:
-{ .mfi
- nop.m 999
-(p7) fclass.m.unc p14, p0 = f8, 0x007
- nop.i 999
-}
+// Here if fMAX_SGL_NORM_ARG < x < fMIN_SGL_OFLOW_ARG
+// This cannot happen if the input is a single, only if the input has higher precision.
+// Overflow is a possibility, not a certainty.
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p12, p9 = f8, 0x021
- nop.i 999 ;;
-}
+// Recompute result using status field 2 with user's rounding mode,
+// and wre set. If result is larger than largest single, then we have
+// overflow
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p11, p0 = f8, 0x022
- nop.i 999
+ mov rGt_ln = 0x1007f // Exponent for largest sgl + 1 ulp
+ fsetc.s2 0x7F,0x42 // Get user's round mode, set wre
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p7) fclass.m.unc p10, p0 = f8, 0x022
- nop.i 999 ;;
+ setf.exp fGt_pln = rGt_ln // Create largest single + 1 ulp
+ fma.s.s2 fWre_urm_f8 = fP, fT, fTm1 // Result with wre set
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-//
-// Identify +/- 0, Inf, or -Inf
-// Generate the right kind of NaN.
-//
-(p13) fadd.s.s0 f99 = f0, f1
- nop.i 999 ;;
+ nop.m 0
+ fsetc.s2 0x7F,0x40 // Turn off wre in sf2
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p14) mov f99 = f8
- nop.i 999 ;;
+ nop.m 0
+ fcmp.ge.s1 p6, p0 = fWre_urm_f8, fGt_pln // Test for overflow
+ nop.i 0
}
+;;
{ .mfb
- nop.m 999
-(p6) fadd.s.s0 f99 = f8, f1
-//
-// expf(+/-0) = 1
-// expm1f(+/-0) = +/-0
-// No exceptions raised
-//
-(p6) br.cond.sptk EXPF_64_RETURN ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p14) br.cond.sptk EXPF_64_RETURN ;;
-}
-
-{ .mfi
- nop.m 999
-(p11) mov f99 = f0
- nop.i 999 ;;
+ nop.m 0
+ nop.f 0
+(p6) br.cond.spnt EXPM1_CERTAIN_OVERFLOW // Branch if overflow
}
+;;
{ .mfb
- nop.m 999
-(p10) fsub.s.s1 f99 = f0, f1
-//
-// expf(-Inf) = 0
-// expm1f(-Inf) = -1
-// No exceptions raised.
-//
-(p10) br.cond.sptk EXPF_64_RETURN ;;
+ nop.m 0
+ fma.s.s0 f8 = fP, fT, fTm1
+ br.ret.sptk b0 // Exit if really no overflow
}
+;;
-{ .mfb
- nop.m 999
-(p12) fmpy.s.s1 f99 = f8, f1
-//
-// expf(+Inf) = Inf
-// No exceptions raised.
-//
-(p0) br.cond.sptk EXPF_64_RETURN ;;
+// here if overflow
+EXPM1_CERTAIN_OVERFLOW:
+{ .mmi
+ addl rTmp = 0x1FFFE, r0;;
+ setf.exp fTmp = rTmp
+ nop.i 999
}
-EXPF_64_UNSUPPORTED:
+;;
-{ .mfb
- nop.m 999
-(p0) fmpy.s.s0 f99 = f8, f0
- nop.b 0;;
+{ .mfi
+ alloc r32 = ar.pfs, 0, 3, 4, 0 // get some registers
+ fmerge.s FR_X = fNormX,fNormX
+ nop.i 0
}
-
-EXPF_64_RETURN:
{ .mfb
- nop.m 999
-(p0) mov f8 = f99
-(p0) br.ret.sptk b0
+ mov GR_Parameter_TAG = 43
+ fma.s.s0 FR_RESULT = fTmp, fTmp, f0 // Set I,O and +INF result
+ br.cond.sptk __libm_error_region
}
-.endp expm1f
-ASM_SIZE_DIRECTIVE(expm1f)
+;;
+GLOBAL_IEEE754_END(expm1f)
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
- add GR_Parameter_Y=-32,sp // Parameter 2 value
- nop.f 0
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
+ nop.f 999
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
- nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
- stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack
- add GR_Parameter_X = 16,sp // Parameter 1 address
+ stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0=b0 // Save b0
};;
.body
-{ .mib
- stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y
- nop.b 0 // Parameter 3 address
+{ .mfi
+ stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
+ nop.f 0
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
}
{ .mib
- stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
- add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
+
{ .mmi
- nop.m 0
- nop.m 0
- add GR_Parameter_RESULT = 48,sp
+ add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
+
{ .mmi
- ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
+ ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
- add sp = 64,sp // Restore stack pointer
- mov b0 = GR_SAVE_B0 // Restore return address
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
- mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
- br.ret.sptk b0 // Return
-};;
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
diff --git a/sysdeps/ia64/fpu/s_expm1l.S b/sysdeps/ia64/fpu/s_expm1l.S
index e53d3c8d7c..069856d244 100644
--- a/sysdeps/ia64/fpu/s_expm1l.S
+++ b/sysdeps/ia64/fpu/s_expm1l.S
@@ -1,10 +1,10 @@
-.file "exp_m1l.s"
+.file "expl_m1.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,15 +35,22 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 Initial Version
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
+// 07/07/01 Improved speed of all paths
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align;
+// used data8 for long double table values
+// 03/11/03 Improved accuracy and performance, corrected missing inexact flags
+// 04/17/03 Eliminated misplaced and unused data label
//
-// *********************************************************************
+//*********************************************************************
//
// Function: Combined expl(x) and expm1l(x), where
// x
@@ -51,20 +58,20 @@
// x
// expm1l(x) = e - 1 for double-extended precision x values
//
-// *********************************************************************
+//*********************************************************************
//
// Resources Used:
//
// Floating-Point Registers: f8 (Input and Return Value)
-// f9,f32-f61, f99-f102
+// f9-f15,f32-f77
//
// General Purpose Registers:
-// r32-r61
-// r62-r65 (Used to pass arguments to error handling routine)
+// r14-r38
+// r35-r38 (Used to pass arguments to error handling routine)
//
// Predicate Registers: p6-p15
//
-// *********************************************************************
+//*********************************************************************
//
// IEEE Special Conditions:
//
@@ -74,39 +81,37 @@
// (Error Handling Routine called for overflow and Underflow)
// Inexact raised when appropriate by algorithm
//
-// expl(inf) = inf
-// expl(-inf) = +0
-// expl(SNaN) = QNaN
-// expl(QNaN) = QNaN
-// expl(0) = 1
-// expl(EM_special Values) = QNaN
-// expl(inf) = inf
-// expm1l(-inf) = -1
-// expm1l(SNaN) = QNaN
-// expm1l(QNaN) = QNaN
-// expm1l(0) = 0
-// expm1l(EM_special Values) = QNaN
+// exp(inf) = inf
+// exp(-inf) = +0
+// exp(SNaN) = QNaN
+// exp(QNaN) = QNaN
+// exp(0) = 1
+// exp(EM_special Values) = QNaN
+// expm1(inf) = inf
+// expm1(-inf) = -1
+// expm1(SNaN) = QNaN
+// expm1(QNaN) = QNaN
+// expm1(0) = 0
+// expm1(EM_special Values) = QNaN
//
-// *********************************************************************
+//*********************************************************************
//
// Implementation and Algorithm Notes:
//
// ker_exp_64( in_FR : X,
-// in_GR : Flag,
-// in_GR : Expo_Range
// out_FR : Y_hi,
// out_FR : Y_lo,
// out_FR : scale,
// out_PR : Safe )
//
-// On input, X is in register format and
-// Flag = 0 for exp,
-// Flag = 1 for expm1,
+// On input, X is in register format and
+// p6 is true for exp,
+// p7 is true for expm1.
//
-// On output, provided X and X_cor are real numbers, then
+// On output,
//
-// scale*(Y_hi + Y_lo) approximates expl(X) if Flag is 0
-// scale*(Y_hi + Y_lo) approximates expl(X)-1 if Flag is 1
+// scale*(Y_hi + Y_lo) approximates exp(X) if exp
+// scale*(Y_hi + Y_lo) approximates exp(X)-1 if expm1
//
// The accuracy is sufficient for a highly accurate 64 sig.
// bit implementation. Safe is set if there is no danger of
@@ -122,36 +127,36 @@
// The method consists of three cases.
//
// If |X| < Tiny use case exp_tiny;
-// else if |X| < 2^(-6) use case exp_small;
+// else if |X| < 2^(-m) use case exp_small; m=12 for exp, m=7 for expm1
// else use case exp_regular;
//
// Case exp_tiny:
//
-// 1 + X can be used to approximate expl(X) or expl(X+X_cor);
-// X + X^2/2 can be used to approximate expl(X) - 1
+// 1 + X can be used to approximate exp(X)
+// X + X^2/2 can be used to approximate exp(X) - 1
//
// Case exp_small:
//
-// Here, expl(X), expl(X+X_cor), and expl(X) - 1 can all be
+// Here, exp(X) and exp(X) - 1 can both be
// appproximated by a relatively simple polynomial.
//
// This polynomial resembles the truncated Taylor series
//
-// expl(w) = 1 + w + w^2/2! + w^3/3! + ... + w^n/n!
+// exp(w) = 1 + w + w^2/2! + w^3/3! + ... + w^n/n!
//
// Case exp_regular:
//
// Here we use a table lookup method. The basic idea is that in
-// order to compute expl(X), we accurately decompose X into
+// order to compute exp(X), we accurately decompose X into
//
// X = N * log(2)/(2^12) + r, |r| <= log(2)/2^13.
//
// Hence
//
-// expl(X) = 2^( N / 2^12 ) * expl(r).
+// exp(X) = 2^( N / 2^12 ) * exp(r).
//
// The value 2^( N / 2^12 ) is obtained by simple combinations
-// of values calculated beforehand and stored in table; expl(r)
+// of values calculated beforehand and stored in table; exp(r)
// is approximated by a short polynomial because |r| is small.
//
// We elaborate this method in 4 steps.
@@ -178,13 +183,9 @@
// as a double-precision number; L_lo has 64 significant bits and
// stored as a double-extended number.
//
-// In the case Flag = 2, we further modify r by
-//
-// r := r + X_cor.
-//
// Step 2: Approximation
//
-// expl(r) - 1 is approximated by a short polynomial of the form
+// exp(r) - 1 is approximated by a short polynomial of the form
//
// r + A_1 r^2 + A_2 r^3 + A_3 r^4 .
//
@@ -213,19 +214,19 @@
// Define two mathematical values, delta_1 and delta_2, implicitly
// such that
//
-// T_1 = expl( [M_1 log(2)/2^6] - delta_1 )
-// T_2 = expl( [M_2 log(2)/2^12] - delta_2 )
+// T_1 = exp( [M_1 log(2)/2^6] - delta_1 )
+// T_2 = exp( [M_2 log(2)/2^12] - delta_2 )
//
// are representable as 24 significant bits. To illustrate the idea,
// we show how we define delta_1:
//
-// T_1 := round_to_24_bits( expl( M_1 log(2)/2^6 ) )
+// T_1 := round_to_24_bits( exp( M_1 log(2)/2^6 ) )
// delta_1 = (M_1 log(2)/2^6) - log( T_1 )
//
// The last equality means mathematical equality. We then tabulate
//
-// W_1 := expl(delta_1) - 1
-// W_2 := expl(delta_2) - 1
+// W_1 := exp(delta_1) - 1
+// W_2 := exp(delta_2) - 1
//
// Both in double precision.
//
@@ -235,13 +236,13 @@
// T := T_1 * T_2 ...exactly
// W := W_1 + (1 + W_1)*W_2
//
-// W approximates expl( delta ) - 1 where delta = delta_1 + delta_2.
+// W approximates exp( delta ) - 1 where delta = delta_1 + delta_2.
// The mathematical product of T and (W+1) is an accurate representation
// of 2^(M_1/2^6) * 2^(M_2/2^12).
//
// Step 4. Reconstruction
//
-// Finally, we can reconstruct expl(X), expl(X) - 1.
+// Finally, we can reconstruct exp(X), exp(X) - 1.
// Because
//
// X = K * log(2) + (M_1*log(2)/2^6 - delta_1)
@@ -249,18 +250,18 @@
// + delta_1 + delta_2 + r ...accurately
// We have
//
-// expl(X) ~=~ 2^K * ( T + T*[expl(delta_1+delta_2+r) - 1] )
-// ~=~ 2^K * ( T + T*[expl(delta + r) - 1] )
-// ~=~ 2^K * ( T + T*[(expl(delta)-1)
-// + expl(delta)*(expl(r)-1)] )
+// exp(X) ~=~ 2^K * ( T + T*[exp(delta_1+delta_2+r) - 1] )
+// ~=~ 2^K * ( T + T*[exp(delta + r) - 1] )
+// ~=~ 2^K * ( T + T*[(exp(delta)-1)
+// + exp(delta)*(exp(r)-1)] )
// ~=~ 2^K * ( T + T*( W + (1+W)*poly(r) ) )
// ~=~ 2^K * ( Y_hi + Y_lo )
//
// where Y_hi = T and Y_lo = T*(W + (1+W)*poly(r))
//
-// For expl(X)-1, we have
+// For exp(X)-1, we have
//
-// expl(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1
+// exp(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1
// ~=~ 2^K * ( Y_hi + Y_lo - 2^(-K) )
//
// and we combine Y_hi + Y_lo - 2^(-N) into the form of two
@@ -278,7 +279,7 @@
// different rounding directions and a correct setting of the SAFE
// flag.
//
-// If Flag is 1, then
+// If expm1, then
// SAFE := False ...possibility of underflow
// Scale := 1.0
// Y_hi := X
@@ -296,26 +297,25 @@
//
// Let r = X
//
-// If Flag is not 1 ...i.e. expl( argument )
+// If exp ...i.e. exp( argument )
//
// rsq := r * r;
// r4 := rsq*rsq
// poly_lo := P_3 + r*(P_4 + r*(P_5 + r*P_6))
// poly_hi := r + rsq*(P_1 + r*P_2)
// Y_lo := poly_hi + r4 * poly_lo
-// set lsb(Y_lo) to 1
// Y_hi := 1.0
// Scale := 1.0
//
-// Else ...i.e. expl( argument ) - 1
+// Else ...i.e. exp( argument ) - 1
//
// rsq := r * r
// r4 := rsq * rsq
-// r6 := rsq * r4
-// poly_lo := r6*(Q_5 + r*(Q_6 + r*Q_7))
-// poly_hi := Q_1 + r*(Q_2 + r*(Q_3 + r*Q_4))
-// Y_lo := rsq*poly_hi + poly_lo
-// set lsb(Y_lo) to 1
+// poly_lo := Q_7 + r*(Q_8 + r*Q_9)
+// poly_med:= Q_3 + r*Q_4 + rsq*(Q_5 + r*Q_6)
+// poly_med:= poly_med + r4*poly_lo
+// poly_hi := Q_1 + r*Q_2
+// Y_lo := rsq*(poly_hi + rsq*poly_med)
// Y_hi := X
// Scale := 1.0
//
@@ -325,14 +325,14 @@
//
// The previous description contain enough information except the
// computation of poly and the final Y_hi and Y_lo in the case for
-// expl(X)-1.
+// exp(X)-1.
//
// The computation of poly for Step 2:
//
// rsq := r*r
// poly := r + rsq*(A_1 + r*(A_2 + r*A_3))
//
-// For the case expl(X) - 1, we need to incorporate 2^(-K) into
+// For the case exp(X) - 1, we need to incorporate 2^(-K) into
// Y_hi and Y_lo at the end of Step 4.
//
// If K > 10 then
@@ -346,72 +346,197 @@
// End If
// End If
//
+//=======================================================
+// General Purpose Registers
+//
+GR_ad_Arg = r14
+GR_ad_A = r15
+GR_sig_inv_ln2 = r15
+GR_rshf_2to51 = r16
+GR_ad_PQ = r16
+GR_ad_Q = r16
+GR_signexp_x = r17
+GR_exp_x = r17
+GR_small_exp = r18
+GR_rshf = r18
+GR_exp_mask = r19
+GR_ad_W1 = r20
+GR_exp_2tom51 = r20
+GR_ad_W2 = r21
+GR_exp_underflow = r21
+GR_M2 = r22
+GR_huge_exp = r22
+GR_M1 = r23
+GR_huge_signif = r23
+GR_K = r24
+GR_one = r24
+GR_minus_one = r24
+GR_exp_bias = r25
+GR_ad_Limits = r26
+GR_N_fix = r26
+GR_exp_2_mk = r26
+GR_ad_P = r27
+GR_exp_2_k = r27
+GR_big_expo_neg = r28
+GR_very_small_exp = r29
+GR_exp_half = r29
+GR_ad_T1 = r30
+GR_ad_T2 = r31
-#include "libm_support.h"
+GR_SAVE_PFS = r32
+GR_SAVE_B0 = r33
+GR_SAVE_GP = r34
+GR_Parameter_X = r35
+GR_Parameter_Y = r36
+GR_Parameter_RESULT = r37
+GR_Parameter_TAG = r38
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+// Floating Point Registers
+//
+FR_norm_x = f9
+FR_RSHF_2TO51 = f10
+FR_INV_LN2_2TO63 = f11
+FR_W_2TO51_RSH = f12
+FR_2TOM51 = f13
+FR_RSHF = f14
+FR_Y_hi = f34
+FR_Y_lo = f35
+FR_scale = f36
+FR_tmp = f37
+FR_float_N = f38
+FR_N_signif = f39
+FR_L_hi = f40
+FR_L_lo = f41
+FR_r = f42
+FR_W1 = f43
+FR_T1 = f44
+FR_W2 = f45
+FR_T2 = f46
+FR_W1_p1 = f47
+FR_rsq = f48
+FR_A2 = f49
+FR_r4 = f50
+FR_A3 = f51
+FR_poly = f52
+FR_T = f53
+FR_W = f54
+FR_Wp1 = f55
+FR_p21 = f59
+FR_p210 = f59
+FR_p65 = f60
+FR_p654 = f60
+FR_p6543 = f60
+FR_2_mk = f61
+FR_P4Q7 = f61
+FR_P4 = f61
+FR_Q7 = f61
+FR_P3Q6 = f62
+FR_P3 = f62
+FR_Q6 = f62
+FR_q65 = f62
+FR_q6543 = f62
+FR_P2Q5 = f63
+FR_P2 = f63
+FR_Q5 = f63
+FR_P1Q4 = f64
+FR_P1 = f64
+FR_Q4 = f64
+FR_q43 = f64
+FR_Q3 = f65
+FR_Q2 = f66
+FR_q21 = f66
+FR_Q1 = f67
+FR_A1 = f68
+FR_P6Q9 = f68
+FR_P6 = f68
+FR_Q9 = f68
+FR_P5Q8 = f69
+FR_P5 = f69
+FR_Q8 = f69
+FR_q987 = f69
+FR_q98 = f69
+FR_q9876543 = f69
+FR_min_oflow_x = f70
+FR_huge_exp = f70
+FR_zero_uflow_x = f71
+FR_huge_signif = f71
+FR_huge = f72
+FR_small = f72
+FR_half = f73
+FR_T_scale = f74
+FR_result_lo = f75
+FR_W_T_scale = f76
+FR_Wp1_T_scale = f77
+FR_ftz = f77
+FR_half_x = f77
+//
-.align 64
-Constants_exp_64_Arg:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_Arg,@object)
-data4 0x5C17F0BC,0xB8AA3B29,0x0000400B,0x00000000
-data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000
-data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000
-// /* Inv_L, L_hi, L_lo */
-ASM_SIZE_DIRECTIVE(Constants_exp_64_Arg)
+FR_X = f9
+FR_Y = f0
+FR_RESULT = f15
-.align 64
-Constants_exp_64_Exponents:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_Exponents,@object)
-data4 0x0000007E,0x00000000,0xFFFFFF83,0xFFFFFFFF
-data4 0x000003FE,0x00000000,0xFFFFFC03,0xFFFFFFFF
-data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF
-data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF
-data4 0xFFFFFFE2,0xFFFFFFFF,0xFFFFFFC4,0xFFFFFFFF
-data4 0xFFFFFFBA,0xFFFFFFFF,0xFFFFFFBA,0xFFFFFFFF
-ASM_SIZE_DIRECTIVE(Constants_exp_64_Exponents)
+// ************* DO NOT CHANGE ORDER OF THESE TABLES ********************
-.align 64
-Constants_exp_64_A:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_A,@object)
-data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000
-data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000
-data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000
-// /* Reversed */
-ASM_SIZE_DIRECTIVE(Constants_exp_64_A)
+// double-extended 1/ln(2)
+// 3fff b8aa 3b29 5c17 f0bb be87fed0691d3e88
+// 3fff b8aa 3b29 5c17 f0bc
+// For speed the significand will be loaded directly with a movl and setf.sig
+// and the exponent will be bias+63 instead of bias+0. Thus subsequent
+// computations need to scale appropriately.
+// The constant 2^12/ln(2) is needed for the computation of N. This is also
+// obtained by scaling the computations.
+//
+// Two shifting constants are loaded directly with movl and setf.d.
+// 1. RSHF_2TO51 = 1.1000..00 * 2^(63-12)
+// This constant is added to x*1/ln2 to shift the integer part of
+// x*2^12/ln2 into the rightmost bits of the significand.
+// The result of this fma is N_signif.
+// 2. RSHF = 1.1000..00 * 2^(63)
+// This constant is subtracted from N_signif * 2^(-51) to give
+// the integer part of N, N_fix, as a floating-point number.
+// The result of this fms is float_N.
+RODATA
.align 64
-Constants_exp_64_P:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_P,@object)
-data4 0x43914A8A,0xD00D6C81,0x00003FF2,0x00000000
-data4 0x30304B30,0xB60BC4AC,0x00003FF5,0x00000000
-data4 0x7474C518,0x88888888,0x00003FF8,0x00000000
-data4 0x8DAE729D,0xAAAAAAAA,0x00003FFA,0x00000000
-data4 0xAAAAAF61,0xAAAAAAAA,0x00003FFC,0x00000000
-data4 0x000004C7,0x80000000,0x00003FFE,0x00000000
-// /* Reversed */
-ASM_SIZE_DIRECTIVE(Constants_exp_64_P)
+LOCAL_OBJECT_START(Constants_exp_64_Arg)
+//data8 0xB8AA3B295C17F0BC,0x0000400B // Inv_L = 2^12/log(2)
+data8 0xB17217F400000000,0x00003FF2 // L_hi = hi part log(2)/2^12
+data8 0xF473DE6AF278ECE6,0x00003FD4 // L_lo = lo part log(2)/2^12
+LOCAL_OBJECT_END(Constants_exp_64_Arg)
-.align 64
-Constants_exp_64_Q:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_Q,@object)
-data4 0xA49EF6CA,0xD00D56F7,0x00003FEF,0x00000000
-data4 0x1C63493D,0xD00D59AB,0x00003FF2,0x00000000
-data4 0xFB50CDD2,0xB60B60B5,0x00003FF5,0x00000000
-data4 0x7BA68DC8,0x88888888,0x00003FF8,0x00000000
-data4 0xAAAAAC8D,0xAAAAAAAA,0x00003FFA,0x00000000
-data4 0xAAAAACCA,0xAAAAAAAA,0x00003FFC,0x00000000
-data4 0x00000000,0x80000000,0x00003FFE,0x00000000
-// /* Reversed */
-ASM_SIZE_DIRECTIVE(Constants_exp_64_Q)
+LOCAL_OBJECT_START(Constants_exp_64_Limits)
+data8 0xb17217f7d1cf79ac,0x0000400c // Smallest long dbl oflow x
+data8 0xb220000000000000,0x0000c00c // Small long dbl uflow zero x
+LOCAL_OBJECT_END(Constants_exp_64_Limits)
-.align 64
-Constants_exp_64_T1:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_T1,@object)
+LOCAL_OBJECT_START(Constants_exp_64_A)
+data8 0xAAAAAAABB1B736A0,0x00003FFA // A3
+data8 0xAAAAAAAB90CD6327,0x00003FFC // A2
+data8 0xFFFFFFFFFFFFFFFF,0x00003FFD // A1
+LOCAL_OBJECT_END(Constants_exp_64_A)
+
+LOCAL_OBJECT_START(Constants_exp_64_P)
+data8 0xD00D6C8143914A8A,0x00003FF2 // P6
+data8 0xB60BC4AC30304B30,0x00003FF5 // P5
+data8 0x888888887474C518,0x00003FF8 // P4
+data8 0xAAAAAAAA8DAE729D,0x00003FFA // P3
+data8 0xAAAAAAAAAAAAAF61,0x00003FFC // P2
+data8 0x80000000000004C7,0x00003FFE // P1
+LOCAL_OBJECT_END(Constants_exp_64_P)
+
+LOCAL_OBJECT_START(Constants_exp_64_Q)
+data8 0x93F2AC5F7471F32E, 0x00003FE9 // Q9
+data8 0xB8DA0F3550B3E764, 0x00003FEC // Q8
+data8 0xD00D00D0028E89C4, 0x00003FEF // Q7
+data8 0xD00D00DAEB8C4E91, 0x00003FF2 // Q6
+data8 0xB60B60B60B60B6F5, 0x00003FF5 // Q5
+data8 0x888888888886CC23, 0x00003FF8 // Q4
+data8 0xAAAAAAAAAAAAAAAB, 0x00003FFA // Q3
+data8 0xAAAAAAAAAAAAAAAB, 0x00003FFC // Q2
+data8 0x8000000000000000, 0x00003FFE // Q1
+LOCAL_OBJECT_END(Constants_exp_64_Q)
+
+LOCAL_OBJECT_START(Constants_exp_64_T1)
data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC
@@ -428,11 +553,9 @@ data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5
data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A
data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177
data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C
-ASM_SIZE_DIRECTIVE(Constants_exp_64_T1)
+LOCAL_OBJECT_END(Constants_exp_64_T1)
-.align 64
-Constants_exp_64_T2:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_T2,@object)
+LOCAL_OBJECT_START(Constants_exp_64_T2)
data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
@@ -449,1124 +572,824 @@ data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37
-ASM_SIZE_DIRECTIVE(Constants_exp_64_T2)
+LOCAL_OBJECT_END(Constants_exp_64_T2)
-.align 64
-Constants_exp_64_W1:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_W1,@object)
-data4 0x00000000,0x00000000,0x171EC4B4,0xBE384454
-data4 0x4AA72766,0xBE694741,0xD42518F8,0xBE5D32B6
-data4 0x3A319149,0x3E68D96D,0x62415F36,0xBE68F4DA
-data4 0xC9C86A3B,0xBE6DDA2F,0xF49228FE,0x3E6B2E50
-data4 0x1188B886,0xBE49C0C2,0x1A4C2F1F,0x3E64BFC2
-data4 0x2CB98B54,0xBE6A2FBB,0x9A55D329,0x3E5DC5DE
-data4 0x39A7AACE,0x3E696490,0x5C66DBA5,0x3E54728B
-data4 0xBA1C7D7D,0xBE62B0DB,0x09F1AF5F,0x3E576E04
-data4 0x1A0DD6A1,0x3E612500,0x795FBDEF,0xBE66A419
-data4 0xE1BD41FC,0xBE5CDE8C,0xEA54964F,0xBE621376
-data4 0x476E76EE,0x3E6370BE,0x3427EB92,0x3E390D1A
-data4 0x2BF82BF8,0x3E1336DE,0xD0F7BD9E,0xBE5FF1CB
-data4 0x0CEB09DD,0xBE60A355,0x0980F30D,0xBE5CA37E
-data4 0x4C082D25,0xBE5C541B,0x3B467D29,0xBE5BBECA
-data4 0xB9D946C5,0xBE400D8A,0x07ED374A,0xBE5E2A08
-data4 0x365C8B0A,0xBE66CB28,0xD3403BCA,0x3E3AAD5B
-data4 0xC7EA21E0,0x3E526055,0xE72880D6,0xBE442C75
-data4 0x85222A43,0x3E58B2BB,0x522C42BF,0xBE5AAB79
-data4 0x469DC2BC,0xBE605CB4,0xA48C40DC,0xBE589FA7
-data4 0x1AA42614,0xBE51C214,0xC37293F4,0xBE48D087
-data4 0xA2D673E0,0x3E367A1C,0x114F7A38,0xBE51BEBB
-data4 0x661A4B48,0xBE6348E5,0x1D3B9962,0xBDF52643
-data4 0x35A78A53,0x3E3A3B5E,0x1CECD788,0xBE46C46C
-data4 0x7857D689,0xBE60B7EC,0xD14F1AD7,0xBE594D3D
-data4 0x4C9A8F60,0xBE4F9C30,0x02DFF9D2,0xBE521873
-data4 0x55E6D68F,0xBE5E4C88,0x667F3DC4,0xBE62140F
-data4 0x3BF88747,0xBE36961B,0xC96EC6AA,0x3E602861
-data4 0xD57FD718,0xBE3B5151,0xFC4A627B,0x3E561CD0
-data4 0xCA913FEA,0xBE3A5217,0x9A5D193A,0x3E40A3CC
-data4 0x10A9C312,0xBE5AB713,0xC5F57719,0x3E4FDADB
-data4 0xDBDF59D5,0x3E361428,0x61B4180D,0x3E5DB5DB
-data4 0x7408D856,0xBE42AD5F,0x31B2B707,0x3E2A3148
-ASM_SIZE_DIRECTIVE(Constants_exp_64_W1)
+LOCAL_OBJECT_START(Constants_exp_64_W1)
+data8 0x0000000000000000, 0xBE384454171EC4B4
+data8 0xBE6947414AA72766, 0xBE5D32B6D42518F8
+data8 0x3E68D96D3A319149, 0xBE68F4DA62415F36
+data8 0xBE6DDA2FC9C86A3B, 0x3E6B2E50F49228FE
+data8 0xBE49C0C21188B886, 0x3E64BFC21A4C2F1F
+data8 0xBE6A2FBB2CB98B54, 0x3E5DC5DE9A55D329
+data8 0x3E69649039A7AACE, 0x3E54728B5C66DBA5
+data8 0xBE62B0DBBA1C7D7D, 0x3E576E0409F1AF5F
+data8 0x3E6125001A0DD6A1, 0xBE66A419795FBDEF
+data8 0xBE5CDE8CE1BD41FC, 0xBE621376EA54964F
+data8 0x3E6370BE476E76EE, 0x3E390D1A3427EB92
+data8 0x3E1336DE2BF82BF8, 0xBE5FF1CBD0F7BD9E
+data8 0xBE60A3550CEB09DD, 0xBE5CA37E0980F30D
+data8 0xBE5C541B4C082D25, 0xBE5BBECA3B467D29
+data8 0xBE400D8AB9D946C5, 0xBE5E2A0807ED374A
+data8 0xBE66CB28365C8B0A, 0x3E3AAD5BD3403BCA
+data8 0x3E526055C7EA21E0, 0xBE442C75E72880D6
+data8 0x3E58B2BB85222A43, 0xBE5AAB79522C42BF
+data8 0xBE605CB4469DC2BC, 0xBE589FA7A48C40DC
+data8 0xBE51C2141AA42614, 0xBE48D087C37293F4
+data8 0x3E367A1CA2D673E0, 0xBE51BEBB114F7A38
+data8 0xBE6348E5661A4B48, 0xBDF526431D3B9962
+data8 0x3E3A3B5E35A78A53, 0xBE46C46C1CECD788
+data8 0xBE60B7EC7857D689, 0xBE594D3DD14F1AD7
+data8 0xBE4F9C304C9A8F60, 0xBE52187302DFF9D2
+data8 0xBE5E4C8855E6D68F, 0xBE62140F667F3DC4
+data8 0xBE36961B3BF88747, 0x3E602861C96EC6AA
+data8 0xBE3B5151D57FD718, 0x3E561CD0FC4A627B
+data8 0xBE3A5217CA913FEA, 0x3E40A3CC9A5D193A
+data8 0xBE5AB71310A9C312, 0x3E4FDADBC5F57719
+data8 0x3E361428DBDF59D5, 0x3E5DB5DB61B4180D
+data8 0xBE42AD5F7408D856, 0x3E2A314831B2B707
+LOCAL_OBJECT_END(Constants_exp_64_W1)
-.align 64
-Constants_exp_64_W2:
-ASM_TYPE_DIRECTIVE(Constants_exp_64_W2,@object)
-data4 0x00000000,0x00000000,0x37A3D7A2,0xBE641F25
-data4 0xAD028C40,0xBE68DD57,0xF212B1B6,0xBE5C77D8
-data4 0x1BA5B070,0x3E57878F,0x2ECAE6FE,0xBE55A36A
-data4 0x569DFA3B,0xBE620608,0xA6D300A3,0xBE53B50E
-data4 0x223F8F2C,0x3E5B5EF2,0xD6DE0DF4,0xBE56A0D9
-data4 0xEAE28F51,0xBE64EEF3,0x367EA80B,0xBE5E5AE2
-data4 0x5FCBC02D,0x3E47CB1A,0x9BDAFEB7,0xBE656BA0
-data4 0x805AFEE7,0x3E6E70C6,0xA3415EBA,0xBE6E0509
-data4 0x49BFF529,0xBE56856B,0x00508651,0x3E66DD33
-data4 0xC114BC13,0x3E51165F,0xC453290F,0x3E53333D
-data4 0x05539FDA,0x3E6A072B,0x7C0A7696,0xBE47CD87
-data4 0xEB05C6D9,0xBE668BF4,0x6AE86C93,0xBE67C3E3
-data4 0xD0B3E84B,0xBE533904,0x556B53CE,0x3E63E8D9
-data4 0x63A98DC8,0x3E212C89,0x032A7A22,0xBE33138F
-data4 0xBC584008,0x3E530FA9,0xCCB93C97,0xBE6ADF82
-data4 0x8370EA39,0x3E5F9113,0xFB6A05D8,0x3E5443A4
-data4 0x181FEE7A,0x3E63DACD,0xF0F67DEC,0xBE62B29D
-data4 0x3DDE6307,0x3E65C483,0xD40A24C1,0x3E5BF030
-data4 0x14E437BE,0x3E658B8F,0xED98B6C7,0xBE631C29
-data4 0x04CF7C71,0x3E6335D2,0xE954A79D,0x3E529EED
-data4 0xF64A2FB8,0x3E5D9257,0x854ED06C,0xBE6BED1B
-data4 0xD71405CB,0x3E5096F6,0xACB9FDF5,0xBE3D4893
-data4 0x01B68349,0xBDFEB158,0xC6A463B9,0x3E628D35
-data4 0xADE45917,0xBE559725,0x042FC476,0xBE68C29C
-data4 0x01E511FA,0xBE67593B,0x398801ED,0xBE4A4313
-data4 0xDA7C3300,0x3E699571,0x08062A9E,0x3E5349BE
-data4 0x755BB28E,0x3E5229C4,0x77A1F80D,0x3E67E426
-data4 0x6B69C352,0xBE52B33F,0x084DA57F,0xBE6B3550
-data4 0xD1D09A20,0xBE6DB03F,0x2161B2C1,0xBE60CBC4
-data4 0x78A2B771,0x3E56ED9C,0x9D0FA795,0xBE508E31
-data4 0xFD1A54E9,0xBE59482A,0xB07FD23E,0xBE2A17CE
-data4 0x17365712,0x3E68BF5C,0xB3785569,0x3E3956F9
-ASM_SIZE_DIRECTIVE(Constants_exp_64_W2)
-
-GR_SAVE_PFS = r59
-GR_SAVE_B0 = r60
-GR_SAVE_GP = r61
-GR_Parameter_X = r62
-GR_Parameter_Y = r63
-GR_Parameter_RESULT = r64
-GR_Parameter_TAG = r65
+LOCAL_OBJECT_START(Constants_exp_64_W2)
+data8 0x0000000000000000, 0xBE641F2537A3D7A2
+data8 0xBE68DD57AD028C40, 0xBE5C77D8F212B1B6
+data8 0x3E57878F1BA5B070, 0xBE55A36A2ECAE6FE
+data8 0xBE620608569DFA3B, 0xBE53B50EA6D300A3
+data8 0x3E5B5EF2223F8F2C, 0xBE56A0D9D6DE0DF4
+data8 0xBE64EEF3EAE28F51, 0xBE5E5AE2367EA80B
+data8 0x3E47CB1A5FCBC02D, 0xBE656BA09BDAFEB7
+data8 0x3E6E70C6805AFEE7, 0xBE6E0509A3415EBA
+data8 0xBE56856B49BFF529, 0x3E66DD3300508651
+data8 0x3E51165FC114BC13, 0x3E53333DC453290F
+data8 0x3E6A072B05539FDA, 0xBE47CD877C0A7696
+data8 0xBE668BF4EB05C6D9, 0xBE67C3E36AE86C93
+data8 0xBE533904D0B3E84B, 0x3E63E8D9556B53CE
+data8 0x3E212C8963A98DC8, 0xBE33138F032A7A22
+data8 0x3E530FA9BC584008, 0xBE6ADF82CCB93C97
+data8 0x3E5F91138370EA39, 0x3E5443A4FB6A05D8
+data8 0x3E63DACD181FEE7A, 0xBE62B29DF0F67DEC
+data8 0x3E65C4833DDE6307, 0x3E5BF030D40A24C1
+data8 0x3E658B8F14E437BE, 0xBE631C29ED98B6C7
+data8 0x3E6335D204CF7C71, 0x3E529EEDE954A79D
+data8 0x3E5D9257F64A2FB8, 0xBE6BED1B854ED06C
+data8 0x3E5096F6D71405CB, 0xBE3D4893ACB9FDF5
+data8 0xBDFEB15801B68349, 0x3E628D35C6A463B9
+data8 0xBE559725ADE45917, 0xBE68C29C042FC476
+data8 0xBE67593B01E511FA, 0xBE4A4313398801ED
+data8 0x3E699571DA7C3300, 0x3E5349BE08062A9E
+data8 0x3E5229C4755BB28E, 0x3E67E42677A1F80D
+data8 0xBE52B33F6B69C352, 0xBE6B3550084DA57F
+data8 0xBE6DB03FD1D09A20, 0xBE60CBC42161B2C1
+data8 0x3E56ED9C78A2B771, 0xBE508E319D0FA795
+data8 0xBE59482AFD1A54E9, 0xBE2A17CEB07FD23E
+data8 0x3E68BF5C17365712, 0x3E3956F9B3785569
+LOCAL_OBJECT_END(Constants_exp_64_W2)
-FR_X = f9
-FR_Y = f9
-FR_RESULT = f99
.section .text
-.proc expm1l#
-.global expm1l#
-.align 64
-expm1l:
-#ifdef _LIBC
-.global __expm1l#
-__expm1l:
-#endif
-{ .mii
-alloc r32 = ar.pfs,0,30,4,0
-(p0) add r33 = 1, r0
-(p0) cmp.eq.unc p7, p0 = r0, r0
-}
-{ .mbb
- nop.m 999
-(p0) br.cond.sptk exp_continue
- nop.b 999 ;;
-}
+
+GLOBAL_IEEE754_ENTRY(expm1l)
//
-// Set p7 true for expm1
-// Set Flag = r33 = 1 for expm1
+// Set p7 true for expm1, p6 false
//
-.endp expm1l
-ASM_SIZE_DIRECTIVE(expm1l)
-
-#ifdef _LIBC
-libm_hidden_def (__expm1l)
-#endif
-
-.section .text
-.proc expl#
-.global expl#
-.align 64
-expl:
-#ifdef _LIBC
-.global __ieee754_expl#
-__ieee754_expl:
-#endif
-{ .mii
-alloc r32 = ar.pfs,0,30,4,0
-(p0) add r33 = r0, r0
-(p0) cmp.eq.unc p0, p7 = r0, r0 ;;
+{ .mlx
+ getf.exp GR_signexp_x = f8 // Get sign and exponent of x, redo if unorm
+ movl GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
}
-exp_continue:
-{ .mfi
-(p0) add r32 = 2,r0
-(p0) fnorm.s1 f9 = f8
- nop.i 0
+{ .mlx
+ addl GR_ad_Arg = @ltoff(Constants_exp_64_Arg#),gp
+ movl GR_rshf_2to51 = 0x4718000000000000 // 1.10000 2^(63+51)
}
+;;
+
{ .mfi
-(p0) nop.m 0
+ ld8 GR_ad_Arg = [GR_ad_Arg] // Point to Arg table
+ fclass.m p8, p0 = f8, 0x1E7 // Test x for natval, nan, inf, zero
+ cmp.eq p7, p6 = r0, r0
+}
+{ .mfb
+ mov GR_exp_half = 0x0FFFE // Exponent of 0.5, for very small path
+ fnorm.s1 FR_norm_x = f8 // Normalize x
+ br.cond.sptk exp_continue
+}
+;;
+
+GLOBAL_IEEE754_END(expm1l)
+
+GLOBAL_IEEE754_ENTRY(expl)
//
-// Set p7 false for exp
-// Set Flag = r33 = 0 for exp
+// Set p7 false for exp, p6 true
//
-(p0) fclass.m.unc p6, p8 = f8, 0x1E7
- nop.i 0;;
+{ .mlx
+ getf.exp GR_signexp_x = f8 // Get sign and exponent of x, redo if unorm
+ movl GR_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
}
+{ .mlx
+ addl GR_ad_Arg = @ltoff(Constants_exp_64_Arg#),gp
+ movl GR_rshf_2to51 = 0x4718000000000000 // 1.10000 2^(63+51)
+}
+;;
+
{ .mfi
- nop.m 999
-(p0) fclass.nm.unc p9, p0 = f8, 0x1FF
- nop.i 0
+ ld8 GR_ad_Arg = [GR_ad_Arg] // Point to Arg table
+ fclass.m p8, p0 = f8, 0x1E7 // Test x for natval, nan, inf, zero
+ cmp.eq p6, p7 = r0, r0
}
{ .mfi
- nop.m 999
-(p0) mov f36 = f1
- nop.i 999 ;;
+ mov GR_exp_half = 0x0FFFE // Exponent of 0.5, for very small path
+ fnorm.s1 FR_norm_x = f8 // Normalize x
+ nop.i 999
}
-{ .mfb
- nop.m 999
-//
-// Identify NatVals, NaNs, Infs, and Zeros.
-// Identify EM unsupporteds.
-// Save special input registers
-(p0) mov f32 = f0
-//
-// Create FR_X_cor = 0.0
-// GR_Flag = 0
-// GR_Expo_Range = 2 (r32) for double-extended precision
-// FR_Scale = 1.0
-//
-(p6) br.cond.spnt EXPL_64_SPECIAL ;;
+;;
+
+exp_continue:
+// Form two constants we need
+// 1/ln2 * 2^63 to compute w = x * 1/ln2 * 2^12
+// 1.1000..000 * 2^(63+63-12) to right shift int(N) into the significand
+
+{ .mfi
+ setf.sig FR_INV_LN2_2TO63 = GR_sig_inv_ln2 // form 1/ln2 * 2^63
+ fclass.nm.unc p9, p0 = f8, 0x1FF // Test x for unsupported
+ mov GR_exp_2tom51 = 0xffff-51
+}
+{ .mlx
+ setf.d FR_RSHF_2TO51 = GR_rshf_2to51 // Form const 1.1000 * 2^(63+51)
+ movl GR_rshf = 0x43e8000000000000 // 1.10000 2^63 for right shift
+}
+;;
+
+{ .mfi
+ setf.exp FR_half = GR_exp_half // Form 0.5 for very small path
+ fma.s1 FR_scale = f1,f1,f0 // Scale = 1.0
+ mov GR_exp_bias = 0x0FFFF // Set exponent bias
}
{ .mib
- nop.m 999
- nop.i 999
-(p9) br.cond.spnt EXPL_64_UNSUPPORTED ;;
+ add GR_ad_Limits = 0x20, GR_ad_Arg // Point to Limits table
+ mov GR_exp_mask = 0x1FFFF // Form exponent mask
+(p8) br.cond.spnt EXP_64_SPECIAL // Branch if natval, nan, inf, zero
}
+;;
+
{ .mfi
-(p0) cmp.ne.unc p12, p13 = 0x01, r33
-//
-// Branch out for special input values
-//
-(p0) fcmp.lt.unc.s0 p9,p0 = f8, f0
-(p0) cmp.eq.unc p15, p0 = r0, r0
+ setf.exp FR_2TOM51 = GR_exp_2tom51 // Form 2^-51 for scaling float_N
+ nop.f 999
+ add GR_ad_A = 0x40, GR_ad_Arg // Point to A table
}
-{ .mmi
- nop.m 999
-//
-// Raise possible denormal operand exception
-// Normalize x
-//
-// This function computes expl( x + x_cor)
-// Input FR 1: FR_X
-// Input FR 2: FR_X_cor
-// Input GR 1: GR_Flag
-// Input GR 2: GR_Expo_Range
-// Output FR 3: FR_Y_hi
-// Output FR 4: FR_Y_lo
-// Output FR 5: FR_Scale
-// Output PR 1: PR_Safe
-(p0) addl r34 = @ltoff(Constants_exp_64_Arg#),gp
-(p0) addl r40 = @ltoff(Constants_exp_64_W1#),gp
-};;
-//
-// Prepare to load constants
-// Set Safe = True
-//
+{ .mib
+ setf.d FR_RSHF = GR_rshf // Form right shift const 1.1000 * 2^63
+ add GR_ad_T1 = 0x160, GR_ad_Arg // Point to T1 table
+(p9) br.cond.spnt EXP_64_UNSUPPORTED // Branch if unsupported
+}
+;;
-{ .mmi
- ld8 r34 = [r34]
- ld8 r40 = [r40]
-(p0) addl r41 = @ltoff(Constants_exp_64_W2#),gp
+.pred.rel "mutex",p6,p7
+{ .mfi
+ ldfe FR_L_hi = [GR_ad_Arg],16 // Get L_hi
+ fcmp.eq.s0 p9,p0 = f8, f0 // Dummy op to flag denormals
+(p6) add GR_ad_PQ = 0x30, GR_ad_A // Point to P table for exp
+}
+{ .mfi
+ ldfe FR_min_oflow_x = [GR_ad_Limits],16 // Get min x to cause overflow
+ fmpy.s1 FR_rsq = f8, f8 // rsq = x * x for small path
+(p7) add GR_ad_PQ = 0x90, GR_ad_A // Point to Q table for expm1
};;
{ .mmi
-(p0) ldfe f37 = [r34],16
-(p0) ld8 r41 = [r41] ;;
+ ldfe FR_L_lo = [GR_ad_Arg],16 // Get L_lo
+ ldfe FR_zero_uflow_x = [GR_ad_Limits],16 // Get x for zero uflow result
+ add GR_ad_W1 = 0x200, GR_ad_T1 // Point to W1 table
}
+;;
-//
-// N = fcvt.fx(float_N)
-// Set p14 if -6 > expo_X
-//
-//
-// Bias = 0x0FFFF
-// expo_X = expo_X and Mask
-//
-
-{ .mmi
-(p0) ldfe f40 = [r34],16
- nop.m 999
-//
-// Load L_lo
-// Set p10 if 14 < expo_X
-//
-(p0) addl r50 = @ltoff(Constants_exp_64_T1#),gp
+{ .mfi
+ ldfe FR_P6Q9 = [GR_ad_PQ],16 // P6(exp) or Q9(expm1) for small path
+ mov FR_r = FR_norm_x // r = X for small path
+ mov GR_very_small_exp = -60 // Exponent of x for very small path
}
-{ .mmi
- nop.m 999
- nop.m 999
-(p0) addl r51 = @ltoff(Constants_exp_64_T2#),gp ;;
+{ .mfi
+ add GR_ad_W2 = 0x400, GR_ad_T1 // Point to W2 table
+ nop.f 999
+(p7) mov GR_small_exp = -7 // Exponent of x for small path expm1
}
-//
-// Load W2_ptr
-// Branch to SMALL is expo_X < -6
-//
+;;
-{.mmi
-(p0) ld8 r50 = [r50]
-(p0) ld8 r51 = [r51]
-};;
+{ .mmi
+ ldfe FR_P5Q8 = [GR_ad_PQ],16 // P5(exp) or Q8(expm1) for small path
+ and GR_exp_x = GR_signexp_x, GR_exp_mask
+(p6) mov GR_small_exp = -12 // Exponent of x for small path exp
+}
+;;
-{ .mlx
-(p0) ldfe f41 = [r34],16
-//
-// float_N = X * L_Inv
-// expo_X = exponent of X
-// Mask = 0x1FFFF
-//
-(p0) movl r58 = 0x0FFFF
+// N_signif = X * Inv_log2_by_2^12
+// By adding 1.10...0*2^63 we shift and get round_int(N_signif) in significand.
+// We actually add 1.10...0*2^51 to X * Inv_log2 to do the same thing.
+{ .mfi
+ ldfe FR_P4Q7 = [GR_ad_PQ],16 // P4(exp) or Q7(expm1) for small path
+ fma.s1 FR_N_signif = FR_norm_x, FR_INV_LN2_2TO63, FR_RSHF_2TO51
+ nop.i 999
}
-{ .mlx
- nop.m 999
-(p0) movl r39 = 0x1FFFF ;;
+{ .mfi
+ sub GR_exp_x = GR_exp_x, GR_exp_bias // Get exponent
+ fmpy.s1 FR_r4 = FR_rsq, FR_rsq // Form r4 for small path
+ cmp.eq.unc p15, p0 = r0, r0 // Set Safe as default
}
+;;
+
{ .mmi
-(p0) getf.exp r37 = f9
- nop.m 999
-(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp ;;
+ ldfe FR_P3Q6 = [GR_ad_PQ],16 // P3(exp) or Q6(expm1) for small path
+ cmp.lt p14, p0 = GR_exp_x, GR_very_small_exp // Is |x| < 2^-60?
+ nop.i 999
}
-{ .mii
-(p0) ld8 r34 = [r34]
- nop.i 999
-(p0) and r37 = r37, r39 ;;
+;;
+
+{ .mfi
+ ldfe FR_P2Q5 = [GR_ad_PQ],16 // P2(exp) or Q5(expm1) for small path
+ fmpy.s1 FR_half_x = FR_half, FR_norm_x // 0.5 * x for very small path
+ cmp.lt p13, p0 = GR_exp_x, GR_small_exp // Is |x| < 2^-m?
}
-{ .mmi
-(p0) sub r37 = r37, r58 ;;
-(p0) cmp.gt.unc p14, p0 = -6, r37
-(p0) cmp.lt.unc p10, p0 = 14, r37 ;;
+{ .mib
+ nop.m 999
+ nop.i 999
+(p14) br.cond.spnt EXP_VERY_SMALL // Branch if |x| < 2^-60
}
+;;
+
{ .mfi
-(p0) nop.m 0
-//
-// Load L_inv
-// Set p12 true for Flag = 0 (exp)
-// Set p13 true for Flag = 1 (expm1)
-//
-(p0) fmpy.s1 f38 = f9, f37
- nop.i 999 ;;
+ ldfe FR_A3 = [GR_ad_A],16 // Get A3 for normal path
+ fcmp.ge.s1 p10,p0 = FR_norm_x, FR_min_oflow_x // Will result overflow?
+ mov GR_big_expo_neg = -16381 // -0x3ffd
}
{ .mfb
- nop.m 999
-//
-// Load L_hi
-// expo_X = expo_X - Bias
-// get W1_ptr
-//
-(p0) fcvt.fx.s1 f39 = f38
-(p14) br.cond.spnt EXPL_SMALL ;;
+ ldfe FR_P1Q4 = [GR_ad_PQ],16 // P1(exp) or Q4(expm1) for small path
+ nop.f 999
+(p13) br.cond.spnt EXP_SMALL // Branch if |x| < 2^-m
+ // m=12 for exp, m=7 for expm1
}
-{ .mib
- nop.m 999
- nop.i 999
-(p10) br.cond.spnt EXPL_HUGE ;;
+;;
+
+// Now we are on the main path for |x| >= 2^-m, m=12 for exp, m=7 for expm1
+//
+// float_N = round_int(N_signif)
+// The significand of N_signif contains the rounded integer part of X * 2^12/ln2,
+// as a twos complement number in the lower bits (that is, it may be negative).
+// That twos complement number (called N) is put into GR_N.
+
+// Since N_signif is scaled by 2^51, it must be multiplied by 2^-51
+// before the shift constant 1.10000 * 2^63 is subtracted to yield float_N.
+// Thus, float_N contains the floating point version of N
+
+
+{ .mfi
+ ldfe FR_A2 = [GR_ad_A],16 // Get A2 for main path
+ fcmp.lt.s1 p11,p0 = FR_norm_x, FR_zero_uflow_x // Certain zero, uflow?
+ add GR_ad_T2 = 0x100, GR_ad_T1 // Point to T2 table
}
-{ .mmi
-(p0) shladd r34 = r32,4,r34
+{ .mfi
nop.m 999
-(p0) addl r35 = @ltoff(Constants_exp_64_A#),gp ;;
-}
-//
-// Load T_1,T_2
-//
-{ .mmi
- nop.m 999
- ld8 r35 =[r35]
- nop.i 99
-};;
-{ .mmb
-(p0) ldfe f51 = [r35],16
-(p0) ld8 r45 = [r34],8
- nop.b 999 ;;
+ fms.s1 FR_float_N = FR_N_signif, FR_2TOM51, FR_RSHF // Form float_N
+ nop.i 999
}
-//
-// Set Safe = True if k >= big_expo_neg
-// Set Safe = False if k < big_expo_neg
-//
-{ .mmb
-(p0) ldfe f49 = [r35],16
-(p0) ld8 r48 = [r34],0
- nop.b 999 ;;
+;;
+
+{ .mbb
+ getf.sig GR_N_fix = FR_N_signif // Get N from significand
+(p10) br.cond.spnt EXP_OVERFLOW // Branch if result will overflow
+(p11) br.cond.spnt EXP_CERTAIN_UNDERFLOW_ZERO // Branch if certain zero, uflow
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Branch to HUGE is expo_X > 14
-//
-(p0) fcvt.xf f38 = f39
- nop.i 999 ;;
+ ldfe FR_A1 = [GR_ad_A],16 // Get A1 for main path
+ fnma.s1 FR_r = FR_L_hi, FR_float_N, FR_norm_x // r = -L_hi * float_N + x
+ extr.u GR_M1 = GR_N_fix, 6, 6 // Extract index M_1
}
{ .mfi
-(p0) getf.sig r52 = f39
- nop.f 999
- nop.i 999 ;;
+ and GR_M2 = 0x3f, GR_N_fix // Extract index M_2
+ nop.f 999
+ nop.i 999
}
-{ .mii
- nop.m 999
-(p0) extr.u r43 = r52, 6, 6 ;;
-//
-// r = r - float_N * L_lo
-// K = extr(N_fix,12,52)
-//
-(p0) shladd r40 = r43,3,r40 ;;
+;;
+
+// N_fix is only correct up to 50 bits because of our right shift technique.
+// Actually in the normal path we will have restricted K to about 14 bits.
+// Somewhat arbitrarily we extract 32 bits.
+{ .mfi
+ shladd GR_ad_W1 = GR_M1,3,GR_ad_W1 // Point to W1
+ nop.f 999
+ extr GR_K = GR_N_fix, 12, 32 // Extract limited range K
}
{ .mfi
-(p0) shladd r50 = r43,2,r50
-(p0) fnma.s1 f42 = f40, f38, f9
-//
-// float_N = float(N)
-// N_fix = signficand N
-//
-(p0) extr.u r42 = r52, 0, 6
+ shladd GR_ad_T1 = GR_M1,2,GR_ad_T1 // Point to T1
+ nop.f 999
+ shladd GR_ad_T2 = GR_M2,2,GR_ad_T2 // Point to T2
}
+;;
+
{ .mmi
-(p0) ldfd f43 = [r40],0 ;;
-(p0) shladd r41 = r42,3,r41
-(p0) shladd r51 = r42,2,r51
-}
-//
-// W_1_p1 = 1 + W_1
-//
-{ .mmi
-(p0) ldfs f44 = [r50],0 ;;
-(p0) ldfd f45 = [r41],0
-//
-// M_2 = extr(N_fix,0,6)
-// M_1 = extr(N_fix,6,6)
-// r = X - float_N * L_hi
-//
-(p0) extr r44 = r52, 12, 52
+ ldfs FR_T1 = [GR_ad_T1],0 // Get T1
+ ldfd FR_W1 = [GR_ad_W1],0 // Get W1
+ add GR_exp_2_k = GR_exp_bias, GR_K // Form exponent of 2^k
}
+;;
+
{ .mmi
-(p0) ldfs f46 = [r51],0 ;;
-(p0) sub r46 = r58, r44
-(p0) cmp.gt.unc p8, p15 = r44, r45
-}
-//
-// W = W_1 + W_1_p1*W_2
-// Load A_2
-// Bias_m_K = Bias - K
-//
-{ .mii
-(p0) ldfe f40 = [r35],16
-//
-// load A_1
-// poly = A_2 + r*A_3
-// rsq = r * r
-// neg_2_mK = exponent of Bias_m_k
-//
-(p0) add r47 = r58, r44 ;;
-//
-// Set Safe = True if k <= big_expo_pos
-// Set Safe = False if k > big_expo_pos
-// Load A_3
-//
-(p15) cmp.lt p8,p15 = r44,r48 ;;
+ ldfs FR_T2 = [GR_ad_T2],0 // Get T2
+ shladd GR_ad_W2 = GR_M2,3,GR_ad_W2 // Point to W2
+ sub GR_exp_2_mk = GR_exp_bias, GR_K // Form exponent of 2^-k
}
+;;
+
{ .mmf
-(p0) setf.exp f61 = r46
-//
-// Bias_p + K = Bias + K
-// T = T_1 * T_2
-//
-(p0) setf.exp f36 = r47
-(p0) fnma.s1 f42 = f41, f38, f42 ;;
+ ldfd FR_W2 = [GR_ad_W2],0 // Get W2
+ setf.exp FR_scale = GR_exp_2_k // Set scale = 2^k
+ fnma.s1 FR_r = FR_L_lo, FR_float_N, FR_r // r = -L_lo * float_N + r
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Load W_1,W_2
-// Load big_exp_pos, load big_exp_neg
-//
-(p0) fadd.s1 f47 = f43, f1
- nop.i 999 ;;
+ setf.exp FR_2_mk = GR_exp_2_mk // Form 2^-k
+ fma.s1 FR_poly = FR_r, FR_A3, FR_A2 // poly = r * A3 + A2
+ cmp.lt p8,p15 = GR_K,GR_big_expo_neg // Set Safe if K > big_expo_neg
}
{ .mfi
- nop.m 999
-(p0) fma.s1 f52 = f42, f51, f49
- nop.i 999
+ nop.m 999
+ fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fmpy.s1 f48 = f42, f42
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 FR_T = FR_T1, FR_T2 // T = T1 * T2
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fmpy.s1 f53 = f44, f46
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 FR_W1_p1 = FR_W1, f1 // W1_p1 = W1 + 1.0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fma.s1 f54 = f45, f47, f43
- nop.i 999
+(p7) cmp.lt.unc p8, p9 = 10, GR_K // If expm1, set p8 if K > 10
+ fma.s1 FR_poly = FR_r, FR_poly, FR_A1 // poly = r * poly + A1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fneg f61 = f61
- nop.i 999 ;;
+(p7) cmp.eq p15, p0 = r0, r0 // If expm1, set Safe flag
+ fma.s1 FR_T_scale = FR_T, FR_scale, f0 // T_scale = T * scale
+(p9) cmp.gt.unc p9, p10 = -10, GR_K // If expm1, set p9 if K < -10
+ // If expm1, set p10 if -10<=K<=10
}
{ .mfi
- nop.m 999
-(p0) fma.s1 f52 = f42, f52, f40
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_W = FR_W2, FR_W1_p1, FR_W1 // W = W2 * (W1+1.0) + W1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fadd.s1 f55 = f54, f1
- nop.i 999
+ nop.m 999
+ mov FR_Y_hi = FR_T // Assume Y_hi = T
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// W + Wp1 * poly
-//
-(p0) mov f34 = f53
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_poly = FR_rsq, FR_poly, FR_r // poly = rsq * poly + r
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// A_1 + r * poly
-// Scale = setf_expl(Bias_p_k)
-//
-(p0) fma.s1 f52 = f48, f52, f42
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 FR_Wp1_T_scale = FR_W, FR_T_scale, FR_T_scale // (W+1)*T*scale
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// poly = r + rsq(A_1 + r*poly)
-// Wp1 = 1 + W
-// neg_2_mK = -neg_2_mK
-//
-(p0) fma.s1 f35 = f55, f52, f54
- nop.i 999 ;;
-}
-{ .mfb
- nop.m 999
-(p0) fmpy.s1 f35 = f35, f53
-//
-// Y_hi = T
-// Y_lo = T * (W + Wp1*poly)
-//
-(p12) br.cond.sptk EXPL_MAIN ;;
-}
-//
-// Branch if expl(x)
-// Continue for expl(x-1)
-//
-{ .mii
-(p0) cmp.lt.unc p12, p13 = 10, r44
- nop.i 999 ;;
-//
-// Set p12 if 10 < K, Else p13
-//
-(p13) cmp.gt.unc p13, p14 = -10, r44 ;;
+ nop.m 999
+ fma.s1 FR_W_T_scale = FR_W, FR_T_scale, f0 // W*T*scale
+ nop.i 999
}
-//
-// K > 10: Y_lo = Y_lo + neg_2_mK
-// K <=10: Set p13 if -10 > K, Else set p14
-//
+;;
+
{ .mfi
-(p13) cmp.eq p15, p0 = r0, r0
-(p14) fadd.s1 f34 = f61, f34
- nop.i 999 ;;
+ nop.m 999
+(p9) fsub.s1 FR_Y_hi = f0, FR_2_mk // If expm1, if K < -10 set Y_hi
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p12) fadd.s1 f35 = f35, f61
- nop.i 999 ;;
+ nop.m 999
+(p10) fsub.s1 FR_Y_hi = FR_T, FR_2_mk // If expm1, if |K|<=10 set Y_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p13) fadd.s1 f35 = f35, f34
- nop.i 999
-}
-{ .mfb
- nop.m 999
-//
-// K <= 10 and K < -10, Set Safe = True
-// K <= 10 and K < 10, Y_lo = Y_hi + Y_lo
-// K <= 10 and K > =-10, Y_hi = Y_hi + neg_2_mk
-//
-(p13) mov f34 = f61
-(p0) br.cond.sptk EXPL_MAIN ;;
-}
-EXPL_SMALL:
-{ .mmi
nop.m 999
-(p0) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp
-(p12) addl r35 = @ltoff(Constants_exp_64_P#),gp ;;
+ fma.s1 FR_result_lo = FR_Wp1_T_scale, FR_poly, FR_W_T_scale
+ nop.i 999
}
-.pred.rel "mutex",p12,p13
-{ .mmi
-(p12) ld8 r35=[r35]
-nop.m 999
-(p13) addl r35 = @ltoff(Constants_exp_64_Q#),gp
-};;
-{ .mmi
-(p13) ld8 r35=[r35]
-(p0) ld8 r34=[r34]
-nop.i 999
-};;
+;;
+
+.pred.rel "mutex",p8,p9
+// If K > 10 adjust result_lo = result_lo - scale * 2^-k
+// If K < -10 adjust result_lo = result_lo + scale * T
{ .mfi
-(p0) add r34 = 0x48,r34
-//
-// Return
-// K <= 10 and K < 10, Y_hi = neg_2_mk
-//
-// /*******************************************************/
-// /*********** Branch EXPL_SMALL ************************/
-// /*******************************************************/
-(p0) mov f42 = f9
- nop.i 999 ;;
+ nop.m 999
+(p8) fnma.s1 FR_result_lo = FR_scale, FR_2_mk, FR_result_lo // If K > 10
+ nop.i 999
}
-//
-// Flag = 0
-// r4 = rsq * rsq
-//
{ .mfi
-(p0) ld8 r49 =[r34],0
- nop.f 999
- nop.i 999 ;;
-}
-{ .mii
- nop.m 999
- nop.i 999 ;;
-//
-// Flag = 1
-//
-(p0) cmp.lt.unc p14, p0 = r37, r49 ;;
+ nop.m 999
+(p9) fma.s1 FR_result_lo = FR_T_scale, f1, FR_result_lo // If K < -10
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// r = X
-//
-(p0) fmpy.s1 f48 = f42, f42
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s0 FR_tmp = FR_A1, FR_A1 // Dummy op to set inexact
+ nop.i 999
}
{ .mfb
- nop.m 999
-//
-// rsq = r * r
-//
-(p0) fmpy.s1 f50 = f48, f48
-//
-// Is input very small?
-//
-(p14) br.cond.spnt EXPL_VERY_SMALL ;;
-}
-//
-// Flag_not1: Y_hi = 1.0
-// Flag is 1: r6 = rsq * r4
-//
-{ .mfi
-(p12) ldfe f52 = [r35],16
-(p12) mov f34 = f1
-(p0) add r53 = 0x1,r0 ;;
-}
-{ .mfi
-(p13) ldfe f51 = [r35],16
-//
-// Flag_not_1: Y_lo = poly_hi + r4 * poly_lo
-//
-(p13) mov f34 = f9
- nop.i 999 ;;
-}
-{ .mmf
-(p12) ldfe f53 = [r35],16
-//
-// For Flag_not_1, Y_hi = X
-// Scale = 1
-// Create 0x000...01
-//
-(p0) setf.sig f37 = r53
-(p0) mov f36 = f1 ;;
+ nop.m 999
+(p15) fma.s0 f8 = FR_Y_hi, FR_scale, FR_result_lo // Safe result
+(p15) br.ret.sptk b0 // Safe exit for normal path
}
-{ .mmi
-(p13) ldfe f52 = [r35],16 ;;
-(p12) ldfe f54 = [r35],16
- nop.i 999 ;;
+;;
+
+// Here if unsafe, will only be here for exp with K < big_expo_neg
+{ .mfb
+ nop.m 999
+ fma.s0 FR_RESULT = FR_Y_hi, FR_scale, FR_result_lo // Prelim result
+ br.cond.sptk EXP_POSSIBLE_UNDERFLOW // Branch to unsafe code
}
+;;
+
+
+EXP_SMALL:
+// Here if 2^-60 <= |x| < 2^-m, m=12 for exp, m=7 for expm1
{ .mfi
-(p13) ldfe f53 = [r35],16
-(p13) fmpy.s1 f58 = f48, f50
- nop.i 999 ;;
-}
-//
-// Flag_not1: poly_lo = P_5 + r*P_6
-// Flag_1: poly_lo = Q_6 + r*Q_7
-//
-{ .mmi
-(p13) ldfe f54 = [r35],16 ;;
-(p12) ldfe f55 = [r35],16
- nop.i 999 ;;
-}
-{ .mmi
-(p12) ldfe f56 = [r35],16 ;;
-(p13) ldfe f55 = [r35],16
- nop.i 999 ;;
-}
-{ .mmi
-(p12) ldfe f57 = [r35],0 ;;
-(p13) ldfe f56 = [r35],16
- nop.i 999 ;;
+(p7) ldfe FR_Q3 = [GR_ad_Q],16 // Get Q3 for small path, if expm1
+(p6) fma.s1 FR_p65 = FR_P6, FR_r, FR_P5 // If exp, p65 = P6 * r + P5
+ nop.i 999
}
{ .mfi
-(p13) ldfe f57 = [r35],0
- nop.f 999
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-//
-// For Flag_not_1, load p5,p6,p1,p2
-// Else load p5,p6,p1,p2
-//
-(p12) fma.s1 f60 = f52, f42, f53
- nop.i 999 ;;
+ mov GR_minus_one = -1
+(p7) fma.s1 FR_q98 = FR_Q9, FR_r, FR_Q8 // If expm1, q98 = Q9 * r + Q8
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p13) fma.s1 f60 = f51, f42, f52
- nop.i 999 ;;
+(p7) ldfe FR_Q2 = [GR_ad_Q],16 // Get Q2 for small path, if expm1
+(p7) fma.s1 FR_q65 = FR_Q6, FR_r, FR_Q5 // If expm1, q65 = Q6 * r + Q5
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p12) fma.s1 f60 = f60, f42, f54
- nop.i 999 ;;
+ setf.sig FR_tmp = GR_minus_one // Create value to force inexact
+(p6) fma.s1 FR_p21 = FR_P2, FR_r, FR_P1 // If exp, p21 = P2 * r + P1
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p12) fma.s1 f59 = f56, f42, f57
- nop.i 999 ;;
+(p7) ldfe FR_Q1 = [GR_ad_Q],16 // Get Q1 for small path, if expm1
+(p7) fma.s1 FR_q43 = FR_Q4, FR_r, FR_Q3 // If expm1, q43 = Q4 * r + Q3
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p13) fma.s1 f60 = f42, f60, f53
- nop.i 999 ;;
+ nop.m 999
+(p6) fma.s1 FR_p654 = FR_p65, FR_r, FR_P4 // If exp, p654 = p65 * r + P4
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p12) fma.s1 f59 = f59, f48, f42
- nop.i 999 ;;
+ nop.m 999
+(p7) fma.s1 FR_q987 = FR_q98, FR_r, FR_Q7 // If expm1, q987 = q98 * r + Q7
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Flag_1: poly_lo = Q_5 + r*(Q_6 + r*Q_7)
-// Flag_not1: poly_lo = P_4 + r*(P_5 + r*P_6)
-// Flag_not1: poly_hi = (P_1 + r*P_2)
-//
-(p13) fmpy.s1 f60 = f60, f58
- nop.i 999 ;;
+ nop.m 999
+(p7) fma.s1 FR_q21 = FR_Q2, FR_r, FR_Q1 // If expm1, q21 = Q2 * r + Q1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p12) fma.s1 f60 = f60, f42, f55
- nop.i 999 ;;
+ nop.m 999
+(p6) fma.s1 FR_p210 = FR_p21, FR_rsq, FR_r // If exp, p210 = p21 * rsq + r
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Flag_1: poly_lo = r6 *(Q_5 + ....)
-// Flag_not1: poly_hi = r + rsq *(P_1 + r*P_2)
-//
-(p12) fma.s1 f35 = f60, f50, f59
- nop.i 999
+ nop.m 999
+(p7) fma.s1 FR_q6543 = FR_q65, FR_rsq, FR_q43 // If expm1, q6543 = q65*r2+q43
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p13) fma.s1 f59 = f54, f42, f55
- nop.i 999 ;;
+ nop.m 999
+(p6) fma.s1 FR_p6543 = FR_p654, FR_r, FR_P3 // If exp, p6543 = p654 * r + P3
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Flag_not1: Y_lo = rsq* poly_hi + poly_lo
-// Flag_1: poly_lo = rsq* poly_hi + poly_lo
-//
-(p13) fma.s1 f59 = f59, f42, f56
- nop.i 999 ;;
+ nop.m 999
+(p7) fma.s1 FR_q9876543 = FR_q987, FR_r4, FR_q6543 // If expm1, q9876543 = ...
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Flag_not_1: (P_1 + r*P_2)
-//
-(p13) fma.s1 f59 = f59, f42, f57
- nop.i 999 ;;
+ nop.m 999
+(p6) fma.s1 FR_Y_lo = FR_p6543, FR_r4, FR_p210 // If exp, form Y_lo
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Flag_not_1: poly_hi = r + rsq * (P_1 + r*P_2)
-//
-(p13) fma.s1 f35 = f59, f48, f60
- nop.i 999 ;;
+ nop.m 999
+(p7) fma.s1 FR_Y_lo = FR_q9876543, FR_rsq, FR_q21 // If expm1, form Y_lo
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Create 0.000...01
-//
-(p0) for f37 = f35, f37
- nop.i 999 ;;
-}
-{ .mfb
- nop.m 999
-//
-// Set lsb of Y_lo to 1
-//
-(p0) fmerge.se f35 = f35,f37
-(p0) br.cond.sptk EXPL_MAIN ;;
-}
-EXPL_VERY_SMALL:
-{ .mmi
- nop.m 999
- nop.m 999
-(p13) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp
+ nop.m 999
+ fmpy.s0 FR_tmp = FR_tmp, FR_tmp // Dummy op to set inexact
+ nop.i 999
}
+;;
+
+.pred.rel "mutex",p6,p7
{ .mfi
- nop.m 999
-(p12) mov f35 = f9
- nop.i 999 ;;
+ nop.m 999
+(p6) fma.s0 f8 = FR_Y_lo, f1, f1 // If exp, result = 1 + Y_lo
+ nop.i 999
}
{ .mfb
-(p13) ld8 r34 = [r34]
-(p12) mov f34 = f1
-(p12) br.cond.sptk EXPL_MAIN ;;
-}
-{ .mlx
-(p13) add r34 = 8,r34
-(p13) movl r39 = 0x0FFFE ;;
-}
-//
-// Load big_exp_neg
-// Create 1/2's exponent
-//
-{ .mii
-(p13) setf.exp f56 = r39
-(p13) shladd r34 = r32,4,r34 ;;
- nop.i 999
+ nop.m 999
+(p7) fma.s0 f8 = FR_Y_lo, FR_rsq, FR_norm_x // If expm1, result = Y_lo*r2+x
+ br.ret.sptk b0 // Exit for 2^-60 <= |x| < 2^-m
+ // m=12 for exp, m=7 for expm1
}
+;;
+
+
+EXP_VERY_SMALL:
//
-// Negative exponents are stored after positive
+// Here if 0 < |x| < 2^-60
+// If exp, result = 1.0 + x
+// If expm1, result = x +x*x/2, but have to check for possible underflow
//
+
{ .mfi
-(p13) ld8 r45 = [r34],0
-//
-// Y_hi = x
-// Scale = 1
-//
-(p13) fmpy.s1 f35 = f9, f9
- nop.i 999 ;;
+(p7) mov GR_exp_underflow = -16381 // Exponent for possible underflow
+(p6) fadd.s0 f8 = f1, FR_norm_x // If exp, result = 1+x
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Reset Safe if necessary
-// Create 1/2
-//
-(p13) mov f34 = f9
- nop.i 999 ;;
+ nop.m 999
+(p7) fmpy.s1 FR_result_lo = FR_half_x, FR_norm_x // If expm1 result_lo = x*x/2
+ nop.i 999
}
+;;
+
{ .mfi
-(p13) cmp.lt.unc p0, p15 = r37, r45
-(p13) mov f36 = f1
- nop.i 999 ;;
+(p7) cmp.lt.unc p0, p8 = GR_exp_x, GR_exp_underflow // Unsafe if expm1 x small
+(p7) mov FR_Y_hi = FR_norm_x // If expm1, Y_hi = x
+(p7) cmp.lt p0, p15 = GR_exp_x, GR_exp_underflow // Unsafe if expm1 x small
}
+;;
+
{ .mfb
- nop.m 999
-//
-// Y_lo = x * x
-//
-(p13) fmpy.s1 f35 = f35, f56
-//
-// Y_lo = x*x/2
-//
-(p13) br.cond.sptk EXPL_MAIN ;;
-}
-EXPL_HUGE:
-{ .mfi
- nop.m 999
-(p0) fcmp.gt.unc.s1 p14, p0 = f9, f0
- nop.i 999
-}
-{ .mlx
- nop.m 999
-(p0) movl r39 = 0x15DC0 ;;
-}
-{ .mfi
-(p14) setf.exp f34 = r39
-(p14) mov f35 = f1
-(p14) cmp.eq p0, p15 = r0, r0 ;;
+ nop.m 999
+(p8) fma.s0 f8 = FR_norm_x, f1, FR_result_lo // If expm1, result=x+x*x/2
+(p15) br.ret.sptk b0 // If Safe, exit
}
+;;
+
+// Here if expm1 and 0 < |x| < 2^-16381; may be possible underflow
{ .mfb
- nop.m 999
-(p14) mov f36 = f34
-//
-// If x > 0, Set Safe = False
-// If x > 0, Y_hi = 2**(24,000)
-// If x > 0, Y_lo = 1.0
-// If x > 0, Scale = 2**(24,000)
-//
-(p14) br.cond.sptk EXPL_MAIN ;;
-}
-{ .mlx
- nop.m 999
-(p12) movl r39 = 0xA240
-}
-{ .mlx
- nop.m 999
-(p12) movl r38 = 0xA1DC ;;
-}
-{ .mmb
-(p13) cmp.eq p15, p14 = r0, r0
-(p12) setf.exp f34 = r39
- nop.b 999 ;;
-}
-{ .mlx
-(p12) setf.exp f35 = r38
-(p13) movl r39 = 0xFF9C
-}
-{ .mfi
- nop.m 999
-(p13) fsub.s1 f34 = f0, f1
- nop.i 999 ;;
+ nop.m 999
+ fma.s0 FR_RESULT = FR_Y_hi, FR_scale, FR_result_lo // Prelim result
+ br.cond.sptk EXP_POSSIBLE_UNDERFLOW // Branch to unsafe code
}
-{ .mfi
- nop.m 999
-(p12) mov f36 = f34
-(p12) cmp.eq p0, p15 = r0, r0 ;;
+;;
+
+EXP_CERTAIN_UNDERFLOW_ZERO:
+// Here if x < zero_uflow_x
+// For exp, set result to tiny+0.0 and set I, U, and branch to error handling
+// For expm1, set result to tiny-1.0 and set I, and exit
+{ .mmi
+ alloc GR_SAVE_PFS = ar.pfs,0,3,4,0
+ nop.m 999
+ mov GR_one = 1
}
-{ .mfi
-(p13) setf.exp f35 = r39
-(p13) mov f36 = f1
- nop.i 999 ;;
+;;
+
+{ .mmi
+ setf.exp FR_small = GR_one // Form small value
+ nop.m 999
+(p6) mov GR_Parameter_TAG = 13 // Error tag for exp underflow
}
-EXPL_MAIN:
+;;
+
{ .mfi
-(p0) cmp.ne.unc p12, p0 = 0x01, r33
-(p0) fmpy.s1 f101 = f36, f35
- nop.i 999 ;;
+ nop.m 999
+ fmerge.s FR_X = f8,f8 // Save x for error call
+ nop.i 999
}
+;;
+
+.pred.rel "mutex",p6,p7
{ .mfb
- nop.m 999
-(p0) fma.s0 f99 = f34, f36, f101
-(p15) br.cond.sptk EXPL_64_RETURN ;;
-}
-{ .mfi
- nop.m 999
-(p0) fsetc.s3 0x7F,0x01
- nop.i 999
+ nop.m 999
+(p6) fma.s0 FR_RESULT = FR_small, FR_small, f0 // If exp, set I,U, tiny result
+(p6) br.cond.sptk __libm_error_region // If exp, go to error handling
}
-{ .mlx
- nop.m 999
-(p0) movl r50 = 0x00000000013FFF ;;
+{ .mfb
+ nop.m 999
+(p7) fms.s0 f8 = FR_small, FR_small, f1 // If expm1, set I, result -1.0
+(p7) br.ret.sptk b0 // If expm1, exit
+}
+;;
+
+
+EXP_OVERFLOW:
+// Here if x >= min_oflow_x
+{ .mmi
+ alloc GR_SAVE_PFS = ar.pfs,0,3,4,0
+ mov GR_huge_exp = 0x1fffe
+ nop.i 999
}
-//
-// S0 user supplied status
-// S2 user supplied status + WRE + TD (Overflows)
-// S3 user supplied status + RZ + TD (Underflows)
-//
-//
-// If (Safe) is true, then
-// Compute result using user supplied status field.
-// No overflow or underflow here, but perhaps inexact.
-// Return
-// Else
-// Determine if overflow or underflow was raised.
-// Fetch +/- overflow threshold for IEEE single, double,
-// double extended
-//
{ .mfi
-(p0) setf.exp f60 = r50
-(p0) fma.s3 f102 = f34, f36, f101
- nop.i 999
+ mov GR_huge_signif = -0x1
+ nop.f 999
+(p6) mov GR_Parameter_TAG = 12 // Error tag for exp overflow
}
-{ .mfi
- nop.m 999
-(p0) fsetc.s3 0x7F,0x40
- nop.i 999 ;;
+;;
+
+{ .mmf
+ setf.exp FR_huge_exp = GR_huge_exp // Create huge value
+ setf.sig FR_huge_signif = GR_huge_signif // Create huge value
+ fmerge.s FR_X = f8,f8 // Save x for error call
}
+;;
+
{ .mfi
- nop.m 999
-//
-// For Safe, no need to check for over/under.
-// For expm1, handle errors like exp.
-//
-(p0) fsetc.s2 0x7F,0x42
- nop.i 999;;
+ nop.m 999
+ fmerge.se FR_huge = FR_huge_exp, FR_huge_signif
+(p7) mov GR_Parameter_TAG = 39 // Error tag for expm1 overflow
}
-{ .mfi
- nop.m 999
-(p0) fma.s2 f100 = f34, f36, f101
- nop.i 999 ;;
+;;
+
+{ .mfb
+ nop.m 999
+ fma.s0 FR_RESULT = FR_huge, FR_huge, FR_huge // Force I, O, and Inf
+ br.cond.sptk __libm_error_region // Branch to error handling
}
+;;
+
+
+
+EXP_POSSIBLE_UNDERFLOW:
+// Here if exp and zero_uflow_x < x < about -11356 [where k < -16381]
+// Here if expm1 and |x| < 2^-16381
{ .mfi
- nop.m 999
-(p0) fsetc.s2 0x7F,0x40
- nop.i 999 ;;
+ alloc GR_SAVE_PFS = ar.pfs,0,3,4,0
+ fsetc.s2 0x7F,0x41 // Set FTZ and disable traps
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p7) fclass.m.unc p12, p0 = f102, 0x00F
- nop.i 999
+ nop.m 999
+ fma.s2 FR_ftz = FR_Y_hi, FR_scale, FR_result_lo // Result with FTZ
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p11, p0 = f102, 0x00F
- nop.i 999 ;;
+ nop.m 999
+ fsetc.s2 0x7F,0x40 // Disable traps (set s2 default)
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p7) fcmp.ge.unc.s1 p10, p0 = f100, f60
- nop.i 999
+ nop.m 999
+(p7) fclass.m.unc p12, p0 = FR_ftz, 0x00F // If expm1, FTZ result denorm, zero?
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Create largest double exponent + 1.
-// Create smallest double exponent - 1.
-//
-(p0) fcmp.ge.unc.s1 p8, p0 = f100, f60
- nop.i 999 ;;
-}
-//
-// fcmp: resultS2 >= + overflow threshold -> set (a) if true
-// fcmp: resultS2 <= - overflow threshold -> set (b) if true
-// fclass: resultS3 is denorm/unorm/0 -> set (d) if true
-//
-{ .mib
-(p10) mov GR_Parameter_TAG = 39
- nop.i 999
-(p10) br.cond.sptk __libm_error_region ;;
-}
-{ .mib
-(p8) mov GR_Parameter_TAG = 12
- nop.i 999
-(p8) br.cond.sptk __libm_error_region ;;
-}
-//
-// Report that exp overflowed
-//
-{ .mib
-(p12) mov GR_Parameter_TAG = 40
- nop.i 999
-(p12) br.cond.sptk __libm_error_region ;;
+ nop.m 999
+(p6) fclass.m.unc p11, p0 = FR_ftz, 0x00F // If exp, FTZ result denorm or zero?
+ nop.i 999
}
-{ .mib
-(p11) mov GR_Parameter_TAG = 13
- nop.i 999
-(p11) br.cond.sptk __libm_error_region ;;
+;;
+
+{ .mfb
+(p12) mov GR_Parameter_TAG = 40 // expm1 underflow
+ fmerge.s FR_X = f8,f8 // Save x for error call
+(p12) br.cond.spnt __libm_error_region // Branch on expm1 underflow
}
+;;
+
{ .mib
- nop.m 999
- nop.i 999
-//
-// Report that exp underflowed
-//
-(p0) br.cond.sptk EXPL_64_RETURN ;;
+(p11) mov GR_Parameter_TAG = 13 // exp underflow
+ nop.i 999
+(p11) br.cond.spnt __libm_error_region // Branch on exp underflow
}
-EXPL_64_SPECIAL:
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p6, p0 = f8, 0x0c3
- nop.i 999
-}
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p13, p8 = f8, 0x007
- nop.i 999 ;;
+;;
+
+{ .mfb
+ nop.m 999
+ mov f8 = FR_RESULT // Was safe after all
+ br.ret.sptk b0
}
+;;
+
+
+EXP_64_SPECIAL:
+// Here if x natval, nan, inf, zero
+// If x natval, +inf, or if expm1 and x zero, just return x.
+// The other cases must be tested for, and results set.
+// These cases do not generate exceptions.
{ .mfi
- nop.m 999
-(p7) fclass.m.unc p14, p0 = f8, 0x007
- nop.i 999
+ nop.m 999
+ fclass.m p8, p0 = f8, 0x0c3 // Is x nan?
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p12, p9 = f8, 0x021
- nop.i 999 ;;
+ nop.m 999
+(p6) fclass.m.unc p13, p0 = f8, 0x007 // If exp, is x zero?
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p11, p0 = f8, 0x022
- nop.i 999
+ nop.m 999
+(p6) fclass.m.unc p11, p0 = f8, 0x022 // If exp, is x -inf?
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p7) fclass.m.unc p10, p0 = f8, 0x022
- nop.i 999 ;;
+ nop.m 999
+(p8) fadd.s0 f8 = f8, f1 // If x nan, result quietized x
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Identify +/- 0, Inf, or -Inf
-// Generate the right kind of NaN.
-//
-(p13) fadd.s0 f99 = f0, f1
- nop.i 999 ;;
+ nop.m 999
+(p7) fclass.m.unc p10, p0 = f8, 0x022 // If expm1, is x -inf?
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p14) mov f99 = f8
- nop.i 999 ;;
-}
-{ .mfb
- nop.m 999
-(p6) fadd.s0 f99 = f8, f1
-//
-// expl(+/-0) = 1
-// expm1l(+/-0) = +/-0
-// No exceptions raised
-//
-(p6) br.cond.sptk EXPL_64_RETURN ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p14) br.cond.sptk EXPL_64_RETURN ;;
+ nop.m 999
+(p13) fadd.s0 f8 = f0, f1 // If exp and x zero, result 1.0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p11) mov f99 = f0
- nop.i 999 ;;
-}
-{ .mfb
- nop.m 999
-(p10) fsub.s1 f99 = f0, f1
-//
-// expl(-Inf) = 0
-// expm1l(-Inf) = -1
-// No exceptions raised.
-//
-(p10) br.cond.sptk EXPL_64_RETURN ;;
-}
-{ .mfb
- nop.m 999
-(p12) fmpy.s1 f99 = f8, f1
-//
-// expl(+Inf) = Inf
-// No exceptions raised.
-//
-(p0) br.cond.sptk EXPL_64_RETURN ;;
+ nop.m 999
+(p11) mov f8 = f0 // If exp and x -inf, result 0
+ nop.i 999
}
-EXPL_64_UNSUPPORTED:
+;;
+
{ .mfb
- nop.m 999
-(p0) fmpy.s0 f99 = f8, f0
-(p0) br.cond.sptk EXPL_64_RETURN ;;
+ nop.m 999
+(p10) fsub.s1 f8 = f0, f1 // If expm1, x -inf, result -1.0
+ br.ret.sptk b0 // Exit special cases
}
-EXPL_64_RETURN:
+;;
+
+
+EXP_64_UNSUPPORTED:
+// Here if x unsupported type
{ .mfb
nop.m 999
-(p0) mov f8 = f99
-(p0) br.ret.sptk b0
+ fmpy.s0 f8 = f8, f0 // Return nan
+ br.ret.sptk b0
}
-.endp
-ASM_SIZE_DIRECTIVE(expl)
+;;
-.proc __libm_error_region
-__libm_error_region:
+GLOBAL_IEEE754_END(expl)
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
@@ -1598,9 +1421,9 @@ __libm_error_region:
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
- nop.m 0
- nop.m 0
add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
{ .mmi
ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
@@ -1613,8 +1436,7 @@ __libm_error_region:
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region#)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/s_fabs.S b/sysdeps/ia64/fpu/s_fabs.S
index ea3908dbc3..3434389a3c 100644
--- a/sysdeps/ia64/fpu/s_fabs.S
+++ b/sysdeps/ia64/fpu/s_fabs.S
@@ -1,34 +1,82 @@
-/* Copyright (C) 2000 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- 02111-1307 USA. */
-
-#include <sysdep.h>
-#undef ret
-
-ENTRY (__fabs)
-{
- fabs fret0 = farg0
- br.ret.sptk.many rp
-}
-END (__fabs)
-
-strong_alias (__fabs, __fabsf)
-strong_alias (__fabs, __fabsl)
-
-weak_alias (__fabs, fabs)
-weak_alias (__fabsf, fabsf)
-weak_alias (__fabsl, fabsl)
+.file "fabs.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 02/02/00 Initial version
+// 02/07/02 Added __libm_fabs entry point to test in case compiler inlines
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// double fabs (double x)
+//
+// Overview of operation
+//==============================================================
+// returns absolute value of x
+
+// floating-point registers used: 1
+// f8, input
+
+.section .text
+.global __libm_fabs#
+
+.proc __libm_fabs#
+__libm_fabs: // Extra entry label (History 02/07/02); this .proc holds no instructions
+.endp __libm_fabs#
+
+GLOBAL_IEEE754_ENTRY(fabs)
+
+// Set invalid or denormal flags and take fault if necessary;
+// the predicates written by the compare (p6,p7) are not read afterwards
+
+{ .mfi
+ nop.m 999
+ fcmp.eq.unc.s0 p6,p7 = f8,f1 // Dummy compare of x (f8) against f1, status field s0
+ nop.i 999 ;;
+}
+
+{ .mfb
+ nop.m 999
+ fmerge.s f8 = f0,f8 // Result |x| in f8: sign from f0, rest from x
+ br.ret.sptk b0 ;;
+}
+
+GLOBAL_IEEE754_END(fabs)
diff --git a/sysdeps/ia64/fpu/s_fabsf.S b/sysdeps/ia64/fpu/s_fabsf.S
index 7e5abde625..71bb6da882 100644
--- a/sysdeps/ia64/fpu/s_fabsf.S
+++ b/sysdeps/ia64/fpu/s_fabsf.S
@@ -1 +1,82 @@
-/* __fabsf is in s_fabs.S. */
+.file "fabsf.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 02/02/00 Initial version
+// 02/07/02 Added __libm_fabsf entry point to test in case compiler inlines
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// float fabsf (float x)
+//
+// Overview of operation
+//==============================================================
+// returns absolute value of x
+
+// floating-point registers used: 1
+// f8, input
+
+.section .text
+.global __libm_fabsf#
+
+.proc __libm_fabsf#
+__libm_fabsf: // Extra entry label (History 02/07/02); this .proc holds no instructions
+.endp __libm_fabsf#
+
+GLOBAL_IEEE754_ENTRY(fabsf)
+
+// Set invalid or denormal flags and take fault if necessary;
+// the predicates written by the compare (p6,p7) are not read afterwards
+
+{ .mfi
+ nop.m 999
+ fcmp.eq.unc.s0 p6,p7 = f8,f1 // Dummy compare of x (f8) against f1, status field s0
+ nop.i 999 ;;
+}
+
+{ .mfb
+ nop.m 999
+ fmerge.s f8 = f0,f8 // Result |x| in f8: sign from f0, rest from x
+ br.ret.sptk b0 ;;
+}
+
+GLOBAL_IEEE754_END(fabsf)
diff --git a/sysdeps/ia64/fpu/s_fabsl.S b/sysdeps/ia64/fpu/s_fabsl.S
index 3d7a41fe2b..a048949147 100644
--- a/sysdeps/ia64/fpu/s_fabsl.S
+++ b/sysdeps/ia64/fpu/s_fabsl.S
@@ -1 +1,82 @@
-/* __fabsl is in s_fabs.S. */
+.file "fabsl.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 02/02/00 Initial version
+// 02/07/02 Added __libm_fabsl entry point to test in case compiler inlines
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// long double fabsl (long double x)
+//
+// Overview of operation
+//==============================================================
+// returns absolute value of x
+
+// floating-point registers used: 1
+// f8, input
+
+.section .text
+.global __libm_fabsl#
+
+.proc __libm_fabsl#
+__libm_fabsl: // Extra entry label (History 02/07/02); this .proc holds no instructions
+.endp __libm_fabsl#
+
+GLOBAL_IEEE754_ENTRY(fabsl)
+
+// Set invalid or denormal flags and take fault if necessary;
+// the predicates written by the compare (p6,p7) are not read afterwards
+
+{ .mfi
+ nop.m 999
+ fcmp.eq.unc.s0 p6,p7 = f8,f1 // Dummy compare of x (f8) against f1, status field s0
+ nop.i 999 ;;
+}
+
+{ .mfb
+ nop.m 999
+ fmerge.s f8 = f0,f8 // Result |x| in f8: sign from f0, rest from x
+ br.ret.sptk b0 ;;
+}
+
+GLOBAL_IEEE754_END(fabsl)
diff --git a/sysdeps/ia64/fpu/s_fdim.S b/sysdeps/ia64/fpu/s_fdim.S
new file mode 100644
index 0000000000..96ff67bf15
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_fdim.S
@@ -0,0 +1,227 @@
+.file "fdim.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 06/08/01 Initial version
+// 08/23/01 Corrected error tag number
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/28/03 Improved performance
+//
+// API
+//==============================================================
+// double fdim( double x, double y );
+// input floating point f8, f9
+// output floating point f8
+//
+//
+// Overview of operation
+//==============================================================
+// fdim determines the positive difference between the arguments
+// Result = x - y if x > y
+// = +0 if x <= y
+//
+// Error support is called if x-y overflows for x > y
+//
+
+// Registers used
+//==============================================================
+// General purpose registers: r14, r32 - r39
+
+rExpBig = r14
+
+// r36-39 parameters for libm_error_support
+GR_SAVE_B0 = r33
+GR_SAVE_GP = r34
+GR_SAVE_PFS = r35
+
+GR_Parameter_X = r36
+GR_Parameter_Y = r37
+GR_Parameter_RESULT = r38
+GR_Parameter_TAG = r39
+
+// Floating-point registers: f8 - f12
+
+f_tmp_result = f10
+fBig = f11
+fNormX = f12
+
+// Predicate registers: p6 - p10
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(fdim)
+
+{ .mfi
+ mov rExpBig = 0x103ff // Exponent to indicate overflow (0xffff + 0x400)
+ fcmp.le.s1 p6,p7 = f8, f9 // Is x <= y?
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnorm.s1 fNormX = f8 // Save x; stored as Parameter 1 on the error path
+ nop.i 0
+}
+;;
+
+{ .mfi
+ setf.exp fBig = rExpBig // Constant to test for overflow
+ fcmp.eq.s0 p8,p0 = f8, f9 // Dummy op to set Denormal or Invalid
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fclass.m p9,p10 = f8, 0x1e3 // Test for x natval, nan, inf (p9 if so, else p10)
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+(p6) fmerge.s f8 = f0, f0 // Result is +0 if x <= y
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p7) fms.d.s0 f8 = f8, f1, f9 // Result is x - y if x > y
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+(p10) fclass.m p9,p10 = f9, 0x1e3 // If x passed, test for y natval, nan, inf
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+(p10) fcmp.ge.s1 p8,p0 = f8, fBig // If x and y passed, test result for overflow
+ nop.i 0
+}
+;;
+
+{ .mbb
+(p9) cmp.ne p8,p0 = r0,r0 // Clear p8 if x or y natval,nan,inf
+(p8) br.cond.spnt FDIM_OVERFLOW // Branch if result overflows
+ br.ret.sptk b0 // Normal return
+}
+;;
+
+
+// Here if result will overflow
+FDIM_OVERFLOW:
+{ .mfi
+ alloc r32=ar.pfs,2,2,4,0 // Register frame r32-r39: 2 in, 2 local, 4 out
+ fms.d.s0 f_tmp_result = f8,f1,f9 // Normalize result force overflow; NOTE(review): f8 is already x - y here, confirm intent
+ nop.i 0
+}
+{ .mfb
+ mov GR_Parameter_TAG = 196 // Error code (tag parameter for libm_error_support)
+ nop.f 0
+ br.cond.sptk __libm_error_region // Branch to error code
+}
+;;
+
+GLOBAL_LIBM_END(fdim)
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+// Call error support to report possible range error
+.prologue
+
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address: old sp - 32 (new sp + 32)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new 64-byte stack frame
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+
+{ .mmi
+ stfd [GR_Parameter_Y] = f9,16 // STORE Parameter 2 (y) on stack, then advance pointer by 16
+ add GR_Parameter_X = 16,sp // Parameter 1 address (new sp + 16)
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+
+.body
+{ .mib
+ stfd [GR_Parameter_X] = fNormX // STORE Parameter 1 (saved x) on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address (new sp + 48)
+ nop.b 0
+}
+{ .mib
+ stfd [GR_Parameter_Y] = f_tmp_result // STORE Parameter 3 (tentative result) on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Point back at Parameter 2 (new sp + 32)
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+
+{ .mmi
+ add GR_Parameter_RESULT = 48,sp // Result slot address (sp + 48), recomputed after the call
+ nop.m 0
+ nop.i 0
+};;
+
+{ .mmi
+ ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
+
diff --git a/sysdeps/ia64/fpu/s_fdimf.S b/sysdeps/ia64/fpu/s_fdimf.S
new file mode 100644
index 0000000000..19e14d373a
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_fdimf.S
@@ -0,0 +1,227 @@
+.file "fdimf.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 06/08/01 Initial version
+// 08/23/01 Corrected error tag number
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/28/03 Improved performance; fixed parameters for call to error routine
+//
+// API
+//==============================================================
+// float fdimf( float x, float y );
+// input floating point f8, f9
+// output floating point f8
+//
+//
+// Overview of operation
+//==============================================================
+// fdimf determines the positive difference between the arguments
+// Result = x - y if x > y
+// = +0 if x <= y
+//
+// Error support is called if x-y overflows for x > y
+//
+
+// Registers used
+//==============================================================
+// General purpose registers: r14, r32 - r39
+
+rExpBig = r14
+
+// r36-39 parameters for libm_error_support
+GR_SAVE_B0 = r33
+GR_SAVE_GP = r34
+GR_SAVE_PFS = r35
+
+GR_Parameter_X = r36
+GR_Parameter_Y = r37
+GR_Parameter_RESULT = r38
+GR_Parameter_TAG = r39
+
+// Floating-point registers: f8 - f12
+
+f_tmp_result = f10
+fBig = f11
+fNormX = f12
+
+// Predicate registers: p6 - p10
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(fdimf)
+
+{ .mfi
+ mov rExpBig = 0x1007f // Exponent to indicate overflow (0xffff + 0x80)
+ fcmp.le.s1 p6,p7 = f8, f9 // Is x <= y?
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnorm.s1 fNormX = f8 // Save x; stored as Parameter 1 on the error path
+ nop.i 0
+}
+;;
+
+{ .mfi
+ setf.exp fBig = rExpBig // Constant to test for overflow
+ fcmp.eq.s0 p8,p0 = f8, f9 // Dummy op to set Denormal or Invalid
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fclass.m p9,p10 = f8, 0x1e3 // Test for x natval, nan, inf (p9 if so, else p10)
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+(p6) fmerge.s f8 = f0, f0 // Result is +0 if x <= y
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p7) fms.s.s0 f8 = f8, f1, f9 // Result is x - y if x > y
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+(p10) fclass.m p9,p10 = f9, 0x1e3 // If x passed, test for y natval, nan, inf
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+(p10) fcmp.ge.s1 p8,p0 = f8, fBig // If x and y passed, test result for overflow
+ nop.i 0
+}
+;;
+
+{ .mbb
+(p9) cmp.ne p8,p0 = r0,r0 // Clear p8 if x or y natval,nan,inf
+(p8) br.cond.spnt FDIM_OVERFLOW // Branch if result overflows
+ br.ret.sptk b0 // Normal return
+}
+;;
+
+
+// Here if result will overflow
+FDIM_OVERFLOW:
+{ .mfi
+ alloc r32=ar.pfs,2,2,4,0 // Register frame r32-r39: 2 in, 2 local, 4 out
+ fms.s.s0 f_tmp_result = f8,f1,f9 // Normalize result force overflow; NOTE(review): f8 is already x - y here, confirm intent
+ nop.i 0
+}
+{ .mfb
+ mov GR_Parameter_TAG = 197 // Error code (tag parameter for libm_error_support)
+ nop.f 0
+ br.cond.sptk __libm_error_region // Branch to error code
+}
+;;
+
+GLOBAL_LIBM_END(fdimf)
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+// Call error support to report possible range error
+.prologue
+
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address: old sp - 32 (new sp + 32)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new 64-byte stack frame
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+
+{ .mmi
+ stfs [GR_Parameter_Y] = f9,16 // STORE Parameter 2 (y) on stack, then advance pointer by 16
+ add GR_Parameter_X = 16,sp // Parameter 1 address (new sp + 16)
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+
+.body
+{ .mib
+ stfs [GR_Parameter_X] = fNormX // STORE Parameter 1 (saved x) on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address (new sp + 48)
+ nop.b 0
+}
+{ .mib
+ stfs [GR_Parameter_Y] = f_tmp_result // STORE Parameter 3 (tentative result) on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Point back at Parameter 2 (new sp + 32)
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+
+{ .mmi
+ add GR_Parameter_RESULT = 48,sp // Result slot address (sp + 48), recomputed after the call
+ nop.m 0
+ nop.i 0
+};;
+
+{ .mmi
+ ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
+
diff --git a/sysdeps/ia64/fpu/s_fdiml.S b/sysdeps/ia64/fpu/s_fdiml.S
new file mode 100644
index 0000000000..00876c3904
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_fdiml.S
@@ -0,0 +1,227 @@
+.file "fdiml.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 06/08/01 Initial version
+// 08/23/01 Corrected error tag number
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/28/03 Improved performance; fixed parameters for call to error routine
+//
+// API
+//==============================================================
+// long double fdiml( long double x, long double y );
+// input floating point f8, f9
+// output floating point f8
+//
+//
+// Overview of operation
+//==============================================================
+// fdiml determines the positive difference between the arguments
+// Result = x - y if x > y
+// = +0 if x <= y
+//
+// Error support is called if x-y overflows for x > y
+//
+
+// Registers used
+//==============================================================
+// General purpose registers: r14, r32 - r39
+
+rExpBig = r14
+
+// r36-39 parameters for libm_error_support
+GR_SAVE_B0 = r33
+GR_SAVE_GP = r34
+GR_SAVE_PFS = r35
+
+GR_Parameter_X = r36
+GR_Parameter_Y = r37
+GR_Parameter_RESULT = r38
+GR_Parameter_TAG = r39
+
+// Floating-point registers: f8 - f12
+
+f_tmp_result = f10
+fBig = f11
+fNormX = f12
+
+// Predicate registers: p6 - p10
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(fdiml)
+
+{ .mfi
+ mov rExpBig = 0x13fff // Exponent to indicate overflow (0xffff + 0x4000)
+ fcmp.le.s1 p6,p7 = f8, f9 // Is x <= y?
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnorm.s1 fNormX = f8 // Save x; stored as Parameter 1 on the error path
+ nop.i 0
+}
+;;
+
+{ .mfi
+ setf.exp fBig = rExpBig // Constant to test for overflow
+ fcmp.eq.s0 p8,p0 = f8, f9 // Dummy op to set Denormal or Invalid
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fclass.m p9,p10 = f8, 0x1e3 // Test for x natval, nan, inf (p9 if so, else p10)
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+(p6) fmerge.s f8 = f0, f0 // Result is +0 if x <= y
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p7) fms.s0 f8 = f8, f1, f9 // Result is x - y if x > y
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+(p10) fclass.m p9,p10 = f9, 0x1e3 // If x passed, test for y natval, nan, inf
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+(p10) fcmp.ge.s1 p8,p0 = f8, fBig // If x and y passed, test result for overflow
+ nop.i 0
+}
+;;
+
+{ .mbb
+(p9) cmp.ne p8,p0 = r0,r0 // Clear p8 if x or y natval,nan,inf
+(p8) br.cond.spnt FDIM_OVERFLOW // Branch if result overflows
+ br.ret.sptk b0 // Normal return
+}
+;;
+
+
+// Here if result will overflow
+FDIM_OVERFLOW:
+{ .mfi
+ alloc r32=ar.pfs,2,2,4,0 // Register frame r32-r39: 2 in, 2 local, 4 out
+ fms.s0 f_tmp_result = f8,f1,f9 // Normalize result force overflow; NOTE(review): f8 is already x - y here, confirm intent
+ nop.i 0
+}
+{ .mfb
+ mov GR_Parameter_TAG = 195 // Error code (tag parameter for libm_error_support)
+ nop.f 0
+ br.cond.sptk __libm_error_region // Branch to error code
+}
+;;
+
+GLOBAL_LIBM_END(fdiml)
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+// Call error support to report possible range error
+.prologue
+
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address: old sp - 32 (new sp + 32)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new 64-byte stack frame
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+
+{ .mmi
+ stfe [GR_Parameter_Y] = f9,16 // STORE Parameter 2 (y) on stack, then advance pointer by 16
+ add GR_Parameter_X = 16,sp // Parameter 1 address (new sp + 16)
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+
+.body
+{ .mib
+ stfe [GR_Parameter_X] = fNormX // STORE Parameter 1 (saved x) on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address (new sp + 48)
+ nop.b 0
+}
+{ .mib
+ stfe [GR_Parameter_Y] = f_tmp_result // STORE Parameter 3 (tentative result) on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Point back at Parameter 2 (new sp + 32)
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+
+{ .mmi
+ add GR_Parameter_RESULT = 48,sp // Result slot address (sp + 48), recomputed after the call
+ nop.m 0
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
+
diff --git a/sysdeps/ia64/fpu/s_floor.S b/sysdeps/ia64/fpu/s_floor.S
index 438b0fa867..9ed9d6dcdb 100644
--- a/sysdeps/ia64/fpu/s_floor.S
+++ b/sysdeps/ia64/fpu/s_floor.S
@@ -1,10 +1,10 @@
.file "floor.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,86 +20,68 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-.align 32
-.global floor#
-
-.section .text
-.proc floor#
-.align 32
-
// History
//==============================================================
-// 2/02/00: Initial version
-// 3/22/00: Updated to improve performance
-// 6/13/00: Improved speed, fixed setting of inexact flag
-// 6/27/00: Eliminated incorrect invalid flag setting
-// 2/07/01: Corrected sign of zero result in round to -inf mode
+// 02/02/00 Initial version
+// 03/22/00 Updated to improve performance
+// 06/13/00 Improved speed, fixed setting of inexact flag
+// 06/27/00 Eliminated incorrect invalid flag setting
+// 02/07/01 Corrected sign of zero result in round to -inf mode
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/28/03 Improved performance
+//==============================================================
// API
//==============================================================
// double floor(double x)
+//==============================================================
-// general input registers:
-
-floor_GR_FFFF = r14
-floor_GR_signexp = r15
-floor_GR_exponent = r16
-floor_GR_expmask = r17
-floor_GR_bigexp = r18
-
-
-// predicate registers used:
+// general input registers:
+// r14 - r18
-// p6 ==> Input is NaN, infinity, zero
-// p7 ==> Input is denormal
-// p8 ==> Input is <0
-// p9 ==> Input is >=0
-// p10 ==> Input is already an integer (bigger than largest integer)
-// p11 ==> Input is not a large integer
-// p12 ==> Input is a smaller integer
-// p13 ==> Input is not an even integer, so inexact must be set
+rSignexp = r14
+rExp = r15
+rExpMask = r16
+rBigexp = r17
+rM1 = r18
+// floating-point registers:
+// f8 - f13
-// floating-point registers used:
+fXInt = f9
+fNormX = f10
+fTmp = f11
+fAdj = f12
+fPreResult = f13
-FLOOR_NORM_f8 = f9
-FLOOR_FFFF = f10
-FLOOR_INEXACT = f11
-FLOOR_FLOAT_INT_f8 = f12
-FLOOR_INT_f8 = f13
-FLOOR_adj = f14
+// predicate registers used:
+// p6 - p9
// Overview of operation
//==============================================================
-
// double floor(double x)
-// Return an integer value (represented as a double) that is the largest
+// Return an integer value (represented as a double) that is the largest
// value not greater than x
// This is x rounded toward -infinity to an integral value.
// Inexact is set if x != floor(x)
-// **************************************************************************
-
-// Set denormal flag for denormal input and
-// and take denormal fault if necessary.
-
-// Is the input an integer value already?
+//==============================================================
// double_extended
// if the exponent is > 1003e => 3F(true) = 63(decimal)
@@ -120,121 +102,115 @@ FLOOR_adj = f14
// If we multiply by 2^23, we no longer have a fractional part
// So input is an integer value already.
-// If x is NAN, ZERO, or INFINITY, then return
-
-// qnan snan inf norm unorm 0 -+
-// 1 1 1 0 0 1 11 0xe7
-
-#include "libm_support.h"
-floor:
-#ifdef _LIBC
-.global __floor
-__floor:
-#endif
+.section .text
+GLOBAL_IEEE754_ENTRY(floor)
{ .mfi
- getf.exp floor_GR_signexp = f8
- fcvt.fx.trunc.s1 FLOOR_INT_f8 = f8
- addl floor_GR_bigexp = 0x10033, r0
+ getf.exp rSignexp = f8 // Get signexp, recompute if unorm
+ fclass.m p7,p0 = f8, 0x0b // Test x unorm
+ addl rBigexp = 0x10033, r0 // Set exponent at which x is integer
}
{ .mfi
- addl floor_GR_FFFF = -1,r0
- fcmp.lt.s1 p8,p9 = f8,f0
- mov floor_GR_expmask = 0x1FFFF ;;
+ mov rM1 = -1 // Set all ones
+ fcvt.fx.trunc.s1 fXInt = f8 // Convert to int in significand
+ mov rExpMask = 0x1FFFF // Form exponent mask
}
+;;
-// p7 ==> denorm
{ .mfi
- setf.sig FLOOR_FFFF = floor_GR_FFFF
- fclass.m p7,p0 = f8, 0x0b
- nop.i 999
+ nop.m 0
+ fcmp.lt.s1 p8,p9 = f8, f0 // Test x < 0
+ nop.i 0
}
-{ .mfi
- nop.m 999
- fnorm.s1 FLOOR_NORM_f8 = f8
- nop.i 999 ;;
+{ .mfb
+ setf.sig fTmp = rM1 // Make const for setting inexact
+ fnorm.s1 fNormX = f8 // Normalize input
+(p7) br.cond.spnt FLOOR_UNORM // Branch if x unorm
}
+;;
-// p6 ==> NAN, INF, ZERO
-{ .mfb
- nop.m 999
- fclass.m p6,p10 = f8, 0xe7
-(p7) br.cond.spnt L(FLOOR_DENORM) ;;
+FLOOR_COMMON:
+// Return here from FLOOR_UNORM
+{ .mfi
+ nop.m 0
+ fclass.m p6,p0 = f8, 0x1e7 // Test x natval, nan, inf, 0
+ nop.i 0
}
+;;
-L(FLOOR_COMMON):
.pred.rel "mutex",p8,p9
-// Set adjustment to subtract from trunc(x) for result
-// If x<0, adjustment is -1.0
-// If x>=0, adjustment is 0.0
{ .mfi
- and floor_GR_exponent = floor_GR_signexp, floor_GR_expmask
-(p8) fnma.s1 FLOOR_adj = f1,f1,f0
- nop.i 999
+ nop.m 0
+(p8) fnma.s1 fAdj = f1, f1, f0 // If x < 0, adjustment is -1
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p9) fadd.s1 FLOOR_adj = f0,f0
- nop.i 999 ;;
+ nop.m 0
+(p9) fma.s1 fAdj = f0, f0, f0 // If x >= 0, adjustment is 0
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
- fcmp.eq.s0 p12,p0 = f8,f0 // Dummy op to set denormal and invalid flag
- nop.i 999
+ nop.m 0
+ fcvt.xf fPreResult = fXInt // trunc(x)
+ nop.i 0
}
-{ .mfi
-(p10) cmp.ge.unc p10,p11 = floor_GR_exponent, floor_GR_bigexp
-(p6) fnorm.d f8 = f8
- nop.i 999 ;;
+{ .mfb
+ nop.m 0
+(p6) fma.d.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf, 0
+(p6) br.ret.spnt b0 // Exit if x natval, nan, inf, 0
}
+;;
-{ .mfi
- nop.m 999
-(p11) fcvt.xf FLOOR_FLOAT_INT_f8 = FLOOR_INT_f8
- nop.i 999 ;;
+{ .mmi
+ and rExp = rSignexp, rExpMask // Get biased exponent
+;;
+ cmp.ge p7,p6 = rExp, rBigexp // Is |x| >= 2^52?
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p10) fnorm.d f8 = FLOOR_NORM_f8
- nop.i 999 ;;
+ nop.m 0
+(p6) fma.d.s0 f8 = fPreResult, f1, fAdj // Result if !int, |x| < 2^52
+ nop.i 0
}
-
-
{ .mfi
- nop.m 999
-(p11) fadd.d f8 = FLOOR_FLOAT_INT_f8,FLOOR_adj
- nop.i 999 ;;
+ nop.m 0
+(p7) fma.d.s0 f8 = fNormX, f1, f0 // Result, if |x| >= 2^52
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p11) fcmp.eq.unc.s1 p12,p13 = FLOOR_FLOAT_INT_f8, FLOOR_NORM_f8
- nop.i 999 ;;
+ nop.m 0
+(p6) fcmp.eq.unc.s1 p8, p9 = fPreResult, fNormX // Is trunc(x) = x ?
+ nop.i 0
}
+;;
-// Set inexact if result not equal to input
{ .mfi
- nop.m 999
-(p13) fmpy.s0 FLOOR_INEXACT = FLOOR_FFFF,FLOOR_FFFF
- nop.i 999
+ nop.m 0
+(p9) fmpy.s0 fTmp = fTmp, fTmp // Dummy to set inexact
+ nop.i 0
}
-// Set result to input if integer
{ .mfb
- nop.m 999
-(p12) fnorm.d f8 = FLOOR_NORM_f8
- br.ret.sptk b0 ;;
+ nop.m 0
+(p8) fma.d.s0 f8 = fNormX, f1, f0 // If x int, result normalized x
+ br.ret.sptk b0 // Exit main path, x finite and nonzero
}
+;;
+
-// Here if input denorm
-L(FLOOR_DENORM):
+FLOOR_UNORM:
+// Here if x unorm
{ .mfb
- getf.exp floor_GR_signexp = FLOOR_NORM_f8
- fcvt.fx.trunc.s1 FLOOR_INT_f8 = FLOOR_NORM_f8
- br.cond.sptk L(FLOOR_COMMON) ;;
+ getf.exp rSignexp = fNormX // Get signexp, recompute if unorm
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag
+ br.cond.sptk FLOOR_COMMON // Return to main path
}
+;;
-.endp floor
-ASM_SIZE_DIRECTIVE(floor)
+GLOBAL_IEEE754_END(floor)
diff --git a/sysdeps/ia64/fpu/s_floorf.S b/sysdeps/ia64/fpu/s_floorf.S
index 15b2bbd31d..a3f2095931 100644
--- a/sysdeps/ia64/fpu/s_floorf.S
+++ b/sysdeps/ia64/fpu/s_floorf.S
@@ -1,10 +1,10 @@
.file "floorf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,85 +20,67 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-.align 32
-.global floorf#
-
-.section .text
-.proc floorf#
-.align 32
-
// History
//==============================================================
-// 2/02/00: Initial version
-// 6/13/00: Improved speed
-// 6/27/00: Eliminated incorrect invalid flag setting
-// 2/07/01: Corrected sign of zero result in round to -inf mode
+// 02/02/00 Initial version
+// 06/13/00 Improved speed
+// 06/27/00 Eliminated incorrect invalid flag setting
+// 02/07/01 Corrected sign of zero result in round to -inf mode
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/28/03 Improved performance
+//==============================================================
// API
//==============================================================
// float floorf(float x)
+//==============================================================
-// general input registers:
-
-floor_GR_FFFF = r14
-floor_GR_signexp = r15
-floor_GR_exponent = r16
-floor_GR_expmask = r17
-floor_GR_bigexp = r18
-
-
-// predicate registers used:
+// general input registers:
+// r14 - r18
-// p6 ==> Input is NaN, infinity, zero
-// p7 ==> Input is denormal
-// p8 ==> Input is <0
-// p9 ==> Input is >=0
-// p10 ==> Input is already an integer (bigger than largest integer)
-// p11 ==> Input is not a large integer
-// p12 ==> Input is a smaller integer
-// p13 ==> Input is not an even integer, so inexact must be set
+rSignexp = r14
+rExp = r15
+rExpMask = r16
+rBigexp = r17
+rM1 = r18
+// floating-point registers:
+// f8 - f13
-// floating-point registers used:
+fXInt = f9
+fNormX = f10
+fTmp = f11
+fAdj = f12
+fPreResult = f13
-FLOOR_NORM_f8 = f9
-FLOOR_FFFF = f10
-FLOOR_INEXACT = f11
-FLOOR_FLOAT_INT_f8 = f12
-FLOOR_INT_f8 = f13
-FLOOR_adj = f14
+// predicate registers used:
+// p6 - p9
// Overview of operation
//==============================================================
-
// float floorf(float x)
-// Return an integer value (represented as a float) that is the largest
+// Return an integer value (represented as a float) that is the largest
// value not greater than x
// This is x rounded toward -infinity to an integral value.
// Inexact is set if x != floorf(x)
-// **************************************************************************
-
-// Set denormal flag for denormal input and
-// and take denormal fault if necessary.
-
-// Is the input an integer value already?
+//==============================================================
// double_extended
// if the exponent is > 1003e => 3F(true) = 63(decimal)
@@ -119,119 +101,115 @@ FLOOR_adj = f14
// If we multiply by 2^23, we no longer have a fractional part
// So input is an integer value already.
-// If x is NAN, ZERO, or INFINITY, then return
-
-// qnan snan inf norm unorm 0 -+
-// 1 1 1 0 0 1 11 0xe7
-
-#include "libm_support.h"
-floorf:
-#ifdef _LIBC
-.global __floorf
-__floorf:
-#endif
+.section .text
+GLOBAL_IEEE754_ENTRY(floorf)
{ .mfi
- getf.exp floor_GR_signexp = f8
- fcvt.fx.trunc.s1 FLOOR_INT_f8 = f8
- addl floor_GR_bigexp = 0x10016, r0
+ getf.exp rSignexp = f8 // Get signexp, recompute if unorm
+ fclass.m p7,p0 = f8, 0x0b // Test x unorm
+ addl rBigexp = 0x10016, r0 // Set exponent at which x is integer
}
{ .mfi
- addl floor_GR_FFFF = -1,r0
- fcmp.lt.s1 p8,p9 = f8,f0
- mov floor_GR_expmask = 0x1FFFF ;;
+ mov rM1 = -1 // Set all ones
+ fcvt.fx.trunc.s1 fXInt = f8 // Convert to int in significand
+ mov rExpMask = 0x1FFFF // Form exponent mask
}
+;;
-// p7 ==> denorm
{ .mfi
- setf.sig FLOOR_FFFF = floor_GR_FFFF
- fclass.m p7,p0 = f8, 0x0b
- nop.i 999
+ nop.m 0
+ fcmp.lt.s1 p8,p9 = f8, f0 // Test x < 0
+ nop.i 0
}
-{ .mfi
- nop.m 999
- fnorm.s1 FLOOR_NORM_f8 = f8
- nop.i 999 ;;
+{ .mfb
+ setf.sig fTmp = rM1 // Make const for setting inexact
+ fnorm.s1 fNormX = f8 // Normalize input
+(p7) br.cond.spnt FLOOR_UNORM // Branch if x unorm
}
+;;
-// p6 ==> NAN, INF, ZERO
-{ .mfb
- nop.m 999
- fclass.m p6,p10 = f8, 0xe7
-(p7) br.cond.spnt L(FLOOR_DENORM) ;;
+FLOOR_COMMON:
+// Return here from FLOOR_UNORM
+{ .mfi
+ nop.m 0
+ fclass.m p6,p0 = f8, 0x1e7 // Test x natval, nan, inf, 0
+ nop.i 0
}
+;;
-L(FLOOR_COMMON):
.pred.rel "mutex",p8,p9
-// Set adjustment to subtract from trunc(x) for result
-// If x<0, adjustment is -1.0
-// If x>=0, adjustment is 0.0
{ .mfi
- and floor_GR_exponent = floor_GR_signexp, floor_GR_expmask
-(p8) fnma.s1 FLOOR_adj = f1,f1,f0
- nop.i 999
+ nop.m 0
+(p8) fnma.s1 fAdj = f1, f1, f0 // If x < 0, adjustment is -1
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p9) fadd.s1 FLOOR_adj = f0,f0
- nop.i 999 ;;
+ nop.m 0
+(p9) fma.s1 fAdj = f0, f0, f0 // If x >= 0, adjustment is 0
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
- fcmp.eq.s0 p12,p0 = f8,f0 // Dummy op to set denormal and invalid flag
- nop.i 999
+ nop.m 0
+ fcvt.xf fPreResult = fXInt // trunc(x)
+ nop.i 0
}
-{ .mfi
-(p10) cmp.ge.unc p10,p11 = floor_GR_exponent, floor_GR_bigexp
-(p6) fnorm.s f8 = f8
- nop.i 999 ;;
+{ .mfb
+ nop.m 0
+(p6) fma.s.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf, 0
+(p6) br.ret.spnt b0 // Exit if x natval, nan, inf, 0
}
+;;
-{ .mfi
- nop.m 999
-(p11) fcvt.xf FLOOR_FLOAT_INT_f8 = FLOOR_INT_f8
- nop.i 999 ;;
+{ .mmi
+ and rExp = rSignexp, rExpMask // Get biased exponent
+;;
+ cmp.ge p7,p6 = rExp, rBigexp // Is |x| >= 2^23?
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p10) fnorm.s f8 = FLOOR_NORM_f8
- nop.i 999 ;;
+ nop.m 0
+(p6) fma.s.s0 f8 = fPreResult, f1, fAdj // Result if !int, |x| < 2^23
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p11) fadd.s f8 = FLOOR_FLOAT_INT_f8,FLOOR_adj
- nop.i 999 ;;
+ nop.m 0
+(p7) fma.s.s0 f8 = fNormX, f1, f0 // Result, if |x| >= 2^23
+ nop.i 0
}
+;;
+
{ .mfi
- nop.m 999
-(p11) fcmp.eq.unc.s1 p12,p13 = FLOOR_FLOAT_INT_f8, FLOOR_NORM_f8
- nop.i 999 ;;
+ nop.m 0
+(p6) fcmp.eq.unc.s1 p8, p9 = fPreResult, fNormX // Is trunc(x) = x ?
+ nop.i 0
}
+;;
-// Set inexact if result not equal to input
{ .mfi
- nop.m 999
-(p13) fmpy.s0 FLOOR_INEXACT = FLOOR_FFFF,FLOOR_FFFF
- nop.i 999
+ nop.m 0
+(p9) fmpy.s0 fTmp = fTmp, fTmp // Dummy to set inexact
+ nop.i 0
}
-// Set result to input if integer
{ .mfb
- nop.m 999
-(p12) fnorm.s f8 = FLOOR_NORM_f8
- br.ret.sptk b0 ;;
+ nop.m 0
+(p8) fma.s.s0 f8 = fNormX, f1, f0 // If x int, result normalized x
+ br.ret.sptk b0 // Exit main path, x finite and nonzero
}
+;;
+
-// Here if input denorm
-L(FLOOR_DENORM):
+FLOOR_UNORM:
+// Here if x unorm
{ .mfb
- getf.exp floor_GR_signexp = FLOOR_NORM_f8
- fcvt.fx.trunc.s1 FLOOR_INT_f8 = FLOOR_NORM_f8
- br.cond.sptk L(FLOOR_COMMON) ;;
+ getf.exp rSignexp = fNormX // Get signexp, recompute if unorm
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag
+ br.cond.sptk FLOOR_COMMON // Return to main path
}
+;;
-.endp floorf
-ASM_SIZE_DIRECTIVE(floorf)
+GLOBAL_IEEE754_END(floorf)
diff --git a/sysdeps/ia64/fpu/s_floorl.S b/sysdeps/ia64/fpu/s_floorl.S
index 294578e1a7..345c4f30dd 100644
--- a/sysdeps/ia64/fpu/s_floorl.S
+++ b/sysdeps/ia64/fpu/s_floorl.S
@@ -1,10 +1,10 @@
.file "floorl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,85 +20,67 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-.align 32
-.global floorl#
-
-.section .text
-.proc floorl#
-.align 32
-
// History
//==============================================================
-// 2/02/00: Initial version
-// 6/13/00: Improved speed
-// 6/27/00: Eliminated incorrect invalid flag setting
-// 2/07/01: Corrected sign of zero result in round to -inf mode
+// 02/02/00 Initial version
+// 06/13/00 Improved speed
+// 06/27/00 Eliminated incorrect invalid flag setting
+// 02/07/01 Corrected sign of zero result in round to -inf mode
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/28/03 Improved performance
+//==============================================================
// API
//==============================================================
// long double floorl(long double x)
+//==============================================================
-// general input registers:
-
-floor_GR_FFFF = r14
-floor_GR_signexp = r15
-floor_GR_exponent = r16
-floor_GR_expmask = r17
-floor_GR_bigexp = r18
-
-
-// predicate registers used:
+// general input registers:
+// r14 - r18
-// p6 ==> Input is NaN, infinity, zero
-// p7 ==> Input is denormal
-// p8 ==> Input is <0
-// p9 ==> Input is >=0
-// p10 ==> Input is already an integer (bigger than largest integer)
-// p11 ==> Input is not a large integer
-// p12 ==> Input is a smaller integer
-// p13 ==> Input is not an even integer, so inexact must be set
+rSignexp = r14
+rExp = r15
+rExpMask = r16
+rBigexp = r17
+rM1 = r18
+// floating-point registers:
+// f8 - f13
-// floating-point registers used:
+fXInt = f9
+fNormX = f10
+fTmp = f11
+fAdj = f12
+fPreResult = f13
-FLOOR_NORM_f8 = f9
-FLOOR_FFFF = f10
-FLOOR_INEXACT = f11
-FLOOR_FLOAT_INT_f8 = f12
-FLOOR_INT_f8 = f13
-FLOOR_adj = f14
+// predicate registers used:
+// p6 - p9
// Overview of operation
//==============================================================
-
// long double floorl(long double x)
-// Return an integer value (represented as a long double) that is the largest
+// Return an integer value (represented as a long double) that is the largest
// value not greater than x
// This is x rounded toward -infinity to an integral value.
// Inexact is set if x != floorl(x)
-// **************************************************************************
-
-// Set denormal flag for denormal input and
-// and take denormal fault if necessary.
-
-// Is the input an integer value already?
+//==============================================================
// double_extended
// if the exponent is > 1003e => 3F(true) = 63(decimal)
@@ -119,119 +101,115 @@ FLOOR_adj = f14
// If we multiply by 2^23, we no longer have a fractional part
// So input is an integer value already.
-// If x is NAN, ZERO, or INFINITY, then return
-
-// qnan snan inf norm unorm 0 -+
-// 1 1 1 0 0 1 11 0xe7
-
-#include "libm_support.h"
-floorl:
-#ifdef _LIBC
-.global __floorl
-__floorl:
-#endif
+.section .text
+GLOBAL_IEEE754_ENTRY(floorl)
{ .mfi
- getf.exp floor_GR_signexp = f8
- fcvt.fx.trunc.s1 FLOOR_INT_f8 = f8
- addl floor_GR_bigexp = 0x1003e, r0
+ getf.exp rSignexp = f8 // Get signexp, recompute if unorm
+ fclass.m p7,p0 = f8, 0x0b // Test x unorm
+ addl rBigexp = 0x1003e, r0 // Set exponent at which x is integer
}
{ .mfi
- addl floor_GR_FFFF = -1,r0
- fcmp.lt.s1 p8,p9 = f8,f0
- mov floor_GR_expmask = 0x1FFFF ;;
+ mov rM1 = -1 // Set all ones
+ fcvt.fx.trunc.s1 fXInt = f8 // Convert to int in significand
+ mov rExpMask = 0x1FFFF // Form exponent mask
}
+;;
-// p7 ==> denorm
{ .mfi
- setf.sig FLOOR_FFFF = floor_GR_FFFF
- fclass.m p7,p0 = f8, 0x0b
- nop.i 999
+ nop.m 0
+ fcmp.lt.s1 p8,p9 = f8, f0 // Test x < 0
+ nop.i 0
}
-{ .mfi
- nop.m 999
- fnorm.s1 FLOOR_NORM_f8 = f8
- nop.i 999 ;;
+{ .mfb
+ setf.sig fTmp = rM1 // Make const for setting inexact
+ fnorm.s1 fNormX = f8 // Normalize input
+(p7) br.cond.spnt FLOOR_UNORM // Branch if x unorm
}
+;;
-// p6 ==> NAN, INF, ZERO
-{ .mfb
- nop.m 999
- fclass.m p6,p10 = f8, 0xe7
-(p7) br.cond.spnt L(FLOOR_DENORM) ;;
+FLOOR_COMMON:
+// Return here from FLOOR_UNORM
+{ .mfi
+ nop.m 0
+ fclass.m p6,p0 = f8, 0x1e7 // Test x natval, nan, inf, 0
+ nop.i 0
}
+;;
-L(FLOOR_COMMON):
.pred.rel "mutex",p8,p9
-// Set adjustment to subtract from trunc(x) for result
-// If x<0, adjustment is -1.0
-// If x>=0, adjustment is 0.0
{ .mfi
- and floor_GR_exponent = floor_GR_signexp, floor_GR_expmask
-(p8) fnma.s1 FLOOR_adj = f1,f1,f0
- nop.i 999
+ nop.m 0
+(p8) fnma.s1 fAdj = f1, f1, f0 // If x < 0, adjustment is -1
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p9) fadd.s1 FLOOR_adj = f0,f0
- nop.i 999 ;;
+ nop.m 0
+(p9) fma.s1 fAdj = f0, f0, f0 // If x >= 0, adjustment is 0
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
- fcmp.eq.s0 p12,p0 = f8,f0 // Dummy op to set denormal and invalid flag
- nop.i 999
+ nop.m 0
+ fcvt.xf fPreResult = fXInt // trunc(x)
+ nop.i 0
}
-{ .mfi
-(p10) cmp.ge.unc p10,p11 = floor_GR_exponent, floor_GR_bigexp
-(p6) fnorm f8 = f8
- nop.i 999 ;;
+{ .mfb
+ nop.m 0
+(p6) fma.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf, 0
+(p6) br.ret.spnt b0 // Exit if x natval, nan, inf, 0
}
+;;
-{ .mfi
- nop.m 999
-(p11) fcvt.xf FLOOR_FLOAT_INT_f8 = FLOOR_INT_f8
- nop.i 999 ;;
+{ .mmi
+ and rExp = rSignexp, rExpMask // Get biased exponent
+;;
+ cmp.ge p7,p6 = rExp, rBigexp // Is |x| >= 2^63?
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p10) fnorm f8 = FLOOR_NORM_f8
- nop.i 999 ;;
+ nop.m 0
+(p6) fma.s0 f8 = fPreResult, f1, fAdj // Result if !int, |x| < 2^63
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p11) fadd f8 = FLOOR_FLOAT_INT_f8,FLOOR_adj
- nop.i 999 ;;
+ nop.m 0
+(p7) fma.s0 f8 = fNormX, f1, f0 // Result, if |x| >= 2^63
+ nop.i 0
}
+;;
+
{ .mfi
- nop.m 999
-(p11) fcmp.eq.unc.s1 p12,p13 = FLOOR_FLOAT_INT_f8, FLOOR_NORM_f8
- nop.i 999 ;;
+ nop.m 0
+(p6) fcmp.eq.unc.s1 p8, p9 = fPreResult, fNormX // Is trunc(x) = x ?
+ nop.i 0
}
+;;
-// Set inexact if result not equal to input
{ .mfi
- nop.m 999
-(p13) fmpy.s0 FLOOR_INEXACT = FLOOR_FFFF,FLOOR_FFFF
- nop.i 999
+ nop.m 0
+(p9) fmpy.s0 fTmp = fTmp, fTmp // Dummy to set inexact
+ nop.i 0
}
-// Set result to input if integer
{ .mfb
- nop.m 999
-(p12) fnorm f8 = FLOOR_NORM_f8
- br.ret.sptk b0 ;;
+ nop.m 0
+(p8) fma.s0 f8 = fNormX, f1, f0 // If x int, result normalized x
+ br.ret.sptk b0 // Exit main path, x finite and nonzero
}
+;;
+
-// Here if input denorm
-L(FLOOR_DENORM):
+FLOOR_UNORM:
+// Here if x unorm
{ .mfb
- getf.exp floor_GR_signexp = FLOOR_NORM_f8
- fcvt.fx.trunc.s1 FLOOR_INT_f8 = FLOOR_NORM_f8
- br.cond.sptk L(FLOOR_COMMON) ;;
+ getf.exp rSignexp = fNormX // Get signexp, recompute if unorm
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag
+ br.cond.sptk FLOOR_COMMON // Return to main path
}
+;;
-.endp floorl
-ASM_SIZE_DIRECTIVE(floorl)
+GLOBAL_IEEE754_END(floorl)
diff --git a/sysdeps/ia64/fpu/s_fma.S b/sysdeps/ia64/fpu/s_fma.S
new file mode 100644
index 0000000000..7798790d50
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_fma.S
@@ -0,0 +1,71 @@
+.file "fma.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 06/07/01 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// double fma (double x, double y, double z)
+//
+// Overview of operation
+//==============================================================
+// returns x * y + z with one rounding error
+
+// All the special cases are handled by the fma instruction itself
+
+// floating-point registers used: 3
+// f8, input x, output
+// f9, input y
+// f10, input z
+
+.section .text
+GLOBAL_LIBM_ENTRY(fma)
+
+{ .mfb
+ nop.m 999
+ fma.d.s0 f8 = f8, f9, f10 // Result = x * y + z
+ br.ret.sptk b0
+}
+;;
+
+GLOBAL_LIBM_END(fma)
diff --git a/sysdeps/ia64/fpu/s_fmaf.S b/sysdeps/ia64/fpu/s_fmaf.S
new file mode 100644
index 0000000000..db112b2a6c
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_fmaf.S
@@ -0,0 +1,71 @@
+.file "fmaf.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 06/07/01 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// float fmaf (float x, float y, float z)
+//
+// Overview of operation
+//==============================================================
+// returns x * y + z with one rounding error
+
+// All the special cases are handled by the fma instruction itself
+
+// floating-point registers used: 3
+// f8, input x, output
+// f9, input y
+// f10, input z
+
+.section .text
+GLOBAL_LIBM_ENTRY(fmaf)
+
+{ .mfb
+ nop.m 999
+ fma.s.s0 f8 = f8, f9, f10 // Result = x * y + z
+ br.ret.sptk b0
+}
+;;
+
+GLOBAL_LIBM_END(fmaf)
diff --git a/sysdeps/ia64/fpu/s_fmal.S b/sysdeps/ia64/fpu/s_fmal.S
new file mode 100644
index 0000000000..2bdef0b3ed
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_fmal.S
@@ -0,0 +1,71 @@
+.file "fmal.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 06/07/01 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// long double fmal (long double x, long double y, long double z)
+//
+// Overview of operation
+//==============================================================
+// returns x * y + z with one rounding error
+
+// All the special cases are handled by the fma instruction itself
+
+// floating-point registers used: 3
+// f8, input x, output
+// f9, input y
+// f10, input z
+
+.section .text
+GLOBAL_LIBM_ENTRY(fmal)
+
+{ .mfb
+ nop.m 999
+ fma.s0 f8 = f8, f9, f10 // Result = x * y + z
+ br.ret.sptk b0
+}
+;;
+
+GLOBAL_LIBM_END(fmal)
diff --git a/sysdeps/ia64/fpu/s_fmax.S b/sysdeps/ia64/fpu/s_fmax.S
new file mode 100644
index 0000000000..6fd38dfe9d
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_fmax.S
@@ -0,0 +1,114 @@
+.file "fmax.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 05/31/01 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// double fmax (double x, double y)
+//
+// Overview of operation
+//==============================================================
+// returns the algebraic maximum of 2 input values
+//
+// Special cases:
+// fmax(x, nan) returns x if x is numeric // Must special case this one
+// fmax(nan, y) returns y if y is numeric
+// fmax(nan1, nan2) returns quietized nan2
+// fmax(+0,+0) returns +0
+// fmax(-0,+0) returns +0
+// fmax(-0,-0) returns -0
+// fmax(+0,-0) returns +0 // Must special case this one
+//
+// SNaN causes invalid to be set
+
+// floating-point registers used: 2
+// f8, input x, output
+// f9, input y
+
+.section .text
+GLOBAL_LIBM_ENTRY(fmax)
+
+{ .mfi
+ nop.m 999
+ fcmp.unord.s0 p6,p7 = f8, f9 // Is x or y a nan? Raise invalid or denormal
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p7) fclass.m.unc p8,p9 = f9, 0x06 // If no nan, is y=-0?
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p6) fclass.m.unc p10,p0 = f8, 0xc3 // Is x nan?
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p9) fmax.s0 f8 = f8, f9 // Normal case, no nan and y not -0
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p8) fmax.s0 f8 = f9, f8 // No nan and y -0
+ nop.i 999
+}
+;;
+
+{ .mfb
+ nop.m 999
+(p10) fmerge.s f8 = f9, f9 // If x nan, return y, else do nothing (returns x)
+ br.ret.sptk b0
+}
+;;
+
+GLOBAL_LIBM_END(fmax)
diff --git a/sysdeps/ia64/fpu/s_fmaxf.S b/sysdeps/ia64/fpu/s_fmaxf.S
new file mode 100644
index 0000000000..cac283c66a
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_fmaxf.S
@@ -0,0 +1,114 @@
+.file "fmaxf.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 05/31/01 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// float fmaxf (float x, float y)
+//
+// Overview of operation
+//==============================================================
+// returns the algebraic maximum of 2 input values
+//
+// Special cases:
+// fmaxf(x, nan) returns x if x is numeric // Must special case this one
+// fmaxf(nan, y) returns y if y is numeric
+// fmaxf(nan1, nan2) returns quietized nan2
+// fmaxf(+0,+0) returns +0
+// fmaxf(-0,+0) returns +0
+// fmaxf(-0,-0) returns -0
+// fmaxf(+0,-0) returns +0 // Must special case this one
+//
+// SNaN causes invalid to be set
+
+// floating-point registers used: 2
+// f8, input x, output
+// f9, input y
+
+.section .text
+GLOBAL_LIBM_ENTRY(fmaxf)
+
+{ .mfi
+ nop.m 999
+ fcmp.unord.s0 p6,p7 = f8, f9 // Is x or y a nan? Raise invalid or denormal
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p7) fclass.m.unc p8,p9 = f9, 0x06 // If no nan, is y=-0?
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p6) fclass.m.unc p10,p0 = f8, 0xc3 // Is x nan?
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p9) fmax.s0 f8 = f8, f9 // Normal case, no nan and y not -0
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p8) fmax.s0 f8 = f9, f8 // No nan and y -0
+ nop.i 999
+}
+;;
+
+{ .mfb
+ nop.m 999
+(p10) fmerge.s f8 = f9, f9 // If x nan, return y, else do nothing (returns x)
+ br.ret.sptk b0
+}
+;;
+
+GLOBAL_LIBM_END(fmaxf)
diff --git a/sysdeps/ia64/fpu/s_fmaxl.S b/sysdeps/ia64/fpu/s_fmaxl.S
new file mode 100644
index 0000000000..fb8861dcdf
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_fmaxl.S
@@ -0,0 +1,114 @@
+.file "fmaxl.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 05/31/01 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// long double fmaxl (long double x, long double y)
+//
+// Overview of operation
+//==============================================================
+// returns the algebraic maximum of 2 input values
+//
+// Special cases:
+// fmaxl(x, nan) returns x if x is numeric // Must special case this one
+// fmaxl(nan, y) returns y if y is numeric
+// fmaxl(nan1, nan2) returns quietized nan2
+// fmaxl(+0,+0) returns +0
+// fmaxl(-0,+0) returns +0
+// fmaxl(-0,-0) returns -0
+// fmaxl(+0,-0) returns +0 // Must special case this one
+//
+// SNaN causes invalid to be set
+
+// floating-point registers used: 2
+// f8, input x, output
+// f9, input y
+
+.section .text
+GLOBAL_LIBM_ENTRY(fmaxl)
+
+{ .mfi
+ nop.m 999
+ fcmp.unord.s0 p6,p7 = f8, f9 // Is x or y a nan? Raise invalid or denormal
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p7) fclass.m.unc p8,p9 = f9, 0x06 // If no nan, is y=-0?
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p6) fclass.m.unc p10,p0 = f8, 0xc3 // Is x nan?
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p9) fmax.s0 f8 = f8, f9 // Normal case, no nan and y not -0
+ nop.i 999
+}
+;;
+
+{ .mfi
+ nop.m 999
+(p8) fmax.s0 f8 = f9, f8 // No nan and y -0
+ nop.i 999
+}
+;;
+
+{ .mfb
+ nop.m 999
+(p10) fmerge.s f8 = f9, f9 // If x nan, return y, else do nothing (returns x)
+ br.ret.sptk b0
+}
+;;
+
+GLOBAL_LIBM_END(fmaxl)
diff --git a/sysdeps/ia64/fpu/s_frexp.c b/sysdeps/ia64/fpu/s_frexp.c
index 98349bca47..c67500695f 100644
--- a/sysdeps/ia64/fpu/s_frexp.c
+++ b/sysdeps/ia64/fpu/s_frexp.c
@@ -1,8 +1,10 @@
-//
-// Copyright (C) 2000, 2001, Intel Corporation
+/* file: frexp.c */
+
+
+// Copyright (c) 2000-2002, Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
+// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
//
// Redistribution and use in source and binary forms, with or without
@@ -19,14 +21,15 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
+
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
@@ -34,22 +37,30 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
//
+// History
+//=====================================================================
+// 2/02/00 Initial version
+// 1/23/02 Calls kernel with parameter to specify 32- or 64-bit int
//
+//=====================================================================
#include "libm_support.h"
+double __libm_frexp(double, int*, int);
+
double frexp(double x, int *y)
{
-#ifdef SIZE_INT_64
- return( __libm_frexp_8(x, y) );
+#ifdef SIZE_INT_64
+ return( __libm_frexp(x, y, 1) );
#else
-#ifdef SIZE_INT_32
- return( _GI___libm_frexp_4(x, y) );
+#ifdef SIZE_INT_32
+ return( __libm_frexp(x, y, 0) );
#endif
#endif
diff --git a/sysdeps/ia64/fpu/s_frexpf.c b/sysdeps/ia64/fpu/s_frexpf.c
index f666304147..c21a21dfba 100644
--- a/sysdeps/ia64/fpu/s_frexpf.c
+++ b/sysdeps/ia64/fpu/s_frexpf.c
@@ -1,8 +1,10 @@
-//
-// Copyright (C) 2000, 2001, Intel Corporation
+/* file: frexpf.c */
+
+
+// Copyright (c) 2000-2002, Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
+// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
//
// Redistribution and use in source and binary forms, with or without
@@ -19,14 +21,15 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
+
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
@@ -34,22 +37,30 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
//
+// History
+//=====================================================================
+// 2/02/00 Initial version
+// 1/23/02 Calls kernel with parameter to specify 32- or 64-bit int
//
+//=====================================================================
#include "libm_support.h"
+float __libm_frexpf(float, int*, int);
+
float frexpf(float x, int *y)
{
-#ifdef SIZE_INT_64
- return( __libm_frexp_8f(x, y) );
+#ifdef SIZE_INT_64
+ return( __libm_frexpf(x, y, 1) );
#else
-#ifdef SIZE_INT_32
- return( _GI___libm_frexp_4f(x, y) );
+#ifdef SIZE_INT_32
+ return( __libm_frexpf(x, y, 0) );
#endif
#endif
diff --git a/sysdeps/ia64/fpu/s_frexpl.c b/sysdeps/ia64/fpu/s_frexpl.c
index 3edc971e3f..13d44ab8b5 100644
--- a/sysdeps/ia64/fpu/s_frexpl.c
+++ b/sysdeps/ia64/fpu/s_frexpl.c
@@ -1,8 +1,10 @@
-//
-// Copyright (C) 2000, 2001, Intel Corporation
+/* file: frexpl.c */
+
+
+// Copyright (c) 2000-2002, Intel Corporation
// All rights reserved.
//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
+// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
//
// Redistribution and use in source and binary forms, with or without
@@ -19,14 +21,15 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
+
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
@@ -34,22 +37,30 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
//
+// History
+//=====================================================================
+// 2/02/00 Initial version
+// 1/23/02 Calls kernel with parameter to specify 32- or 64-bit int
//
+//=====================================================================
#include "libm_support.h"
+long double __libm_frexpl(long double, int*, int);
+
long double frexpl(long double x, int *y)
{
-#ifdef SIZE_INT_64
- return( __libm_frexp_8l(x, y) );
+#ifdef SIZE_INT_64
+ return( __libm_frexpl(x, y, 1) );
#else
-#ifdef SIZE_INT_32
- return( _GI___libm_frexp_4l(x, y) );
+#ifdef SIZE_INT_32
+ return( __libm_frexpl(x, y, 0) );
#endif
#endif
diff --git a/sysdeps/ia64/fpu/s_ilogb.S b/sysdeps/ia64/fpu/s_ilogb.S
index 61975dd941..3f2733cabd 100644
--- a/sysdeps/ia64/fpu/s_ilogb.S
+++ b/sysdeps/ia64/fpu/s_ilogb.S
@@ -1,10 +1,10 @@
.file "ilogb.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,234 +20,248 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/03/00 Initial version
-// 5/26/00 Fix bug when x a double-extended denormal;
+// 02/03/00 Initial version
+// 05/26/00 Fix bug when x a double-extended denormal;
// if x=0 call error routine, per C9X
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
-// 1/20/01 Fixed result for x=0, corrected error tag value.
-
-.align 32
-.global ilogb#
-
-.section .text
-.proc ilogb#
-.align 32
-
+// 01/20/01 Fixed result for x=0, corrected error tag value.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/20/03 Improved performance
+//
// API
//==============================================================
-// int = ilogb(double)
-
+// int ilogb( double x );
+//
// Overview of operation
//==============================================================
-// ilogb computes log2(x) as an int
+// The ilogb function extracts the exponent of x as an integer
// and returns it in r8
-
-// ilogb is similar to logb but differs in the following ways:
+//
+// ilogb is similar to logb but differs in the following ways:
// +-inf
// ilogb: returns INT_MAX
// logb: returns +inf
-// Nan returns FP_ILOGBNAN (which is either INT_MAX or INT_MIN)
+// Nan returns FP_ILOGBNAN (which is either INT_MAX or INT_MIN)
// ilogb: returns INT_MAX (7fffffff)
-// logb: returns QNAN (quieted SNAN)
+// logb: returns QNAN (quietized SNAN)
// 0 returns FP_ILOGB0 (which is either INT_MIN or -INT_MAX)
-// ilogb: returns INT_MIN (80000000)
-// logb: returns -inf
-
+// ilogb: returns INT_MIN (80000000)
+// logb: returns -inf, raises the divide-by-zero exception,
+// and calls libm_error_support to set domain error
+//
// Registers used
//==============================================================
+// general registers used:
+// r26 -> r39
+// r36 -> r39 used as parameters to error path
+//
+// predicate registers used:
+// p6 -> p10
+// floating-point registers used:
+// f9, f10, f11
+// f8, input
-// general local registers:
-// ar.pfs r32
-// r33 -> r37
-// r38 -> r41 used as parameters to error path
-
-// predicate registers used:
-// p6 - x nan, inf
-// p7 - x 0
-// p8 - x norm, unorm
-// p9 - x unorm
-
-// floating-point registers used:
-// f8 - f10
-
-#include "libm_support.h"
+rExpBias = r26
+rExpMask = r27
+rSignexp_x = r28
+rExp_x = r29
+rIntMax = r30
+rExp_2to64 = r31
GR_SAVE_PFS = r32
+rTrialResult = r33
GR_SAVE_B0 = r34
GR_SAVE_GP = r35
-GR_Parameter_X = r38
-GR_Parameter_Y = r39
-GR_Parameter_RESULT = r40
-GR_Parameter_TAG = r41
-FR_X = f8
-FR_Y = f0
-FR_RESULT = f0
+GR_Parameter_X = r36
+GR_Parameter_Y = r37
+GR_Parameter_RESULT = r38
+GR_Parameter_TAG = r39
+fTmp = f9
+fNorm_x = f10
+f2to64 = f11
-ilogb:
+.section .text
+GLOBAL_LIBM_ENTRY(ilogb)
-// Form signexp of 2^64 in case need to scale denormal
-{ .mmf
- alloc r32=ar.pfs,1,5,4,0
-(p0) mov r37 = 0x1003f
-(p0) fnorm f9 = f8 ;;
+// X NORMAL
+// TrueExp_x = exp(f8) - 0xffff
+// r8 = TrueExp_x
+{ .mfi
+ getf.exp rSignexp_x = f8
+ fclass.m p8,p0 = f8, 0x0b // Test for x unorm
+ mov rExpBias = 0xffff // Exponent bias
}
-
-// Form 2^64 in case need to scale denormal
{ .mfi
-(p0) setf.exp f10 = r37
-(p0) fclass.m.unc p7, p8 = f8, 0xe3
-(p0) mov r34 = 0xffff ;;
+ nop.m 0
+ fnorm.s1 fNorm_x = f8
+ mov rExpMask = 0x1ffff // Exponent mask
}
+;;
-// qnan snan inf norm unorm 0 -+
-// 1 1 1 0 0 0 11
-// e 3
-// X ZERO, returns INT_MIN
-// X INF or NAN, returns INT_MAX
+// Form signexp of 2^64 in case need to scale denormal
+{ .mfb
+ mov rExp_2to64 = 0x1003f
+ fclass.m p6,p9 = f8, 0x1e3 // Test x natval, nan, inf
+(p8) br.cond.spnt ILOGB_DENORM // Branch if x unorm
+}
+;;
+ILOGB_COMMON:
+// Return here from ILOGB_DENORM
{ .mfi
-(p0) mov r35 = 0x1ffff
-(p8) fclass.m.unc p6, p8 = f8, 0x07
- nop.i 999 ;;
+ and rExp_x = rSignexp_x, rExpMask // Get biased exponent
+ fclass.m p7,p10 = f8, 0x07 // Test x zero
+ nop.i 0
}
{ .mlx
- nop.m 999
-(p7) movl r8 = 0x000000007fffffff ;;
+ nop.m 0
+ movl rIntMax = 0x000000007fffffff // Form INT_MAX
}
+;;
-{ .mib
- nop.m 999
- nop.i 999
-(p6) br.cond.spnt L(ILOGB_ZERO) ;;
-}
-
-// Test for denormal
+.pred.rel "mutex",p6,p9
{ .mfi
- nop.m 999
-(p8) fclass.m.unc p9, p0 = f9, 0x0b
- nop.i 999 ;;
+(p9) sub r8 = rExp_x, rExpBias // Get true exponent for normal path
+(p6) fma.s0 fTmp = f8, f8, f0 // Dummy to set Invalid flag
+(p6) mov r8 = rIntMax // If nan, inf, return INT_MAX
+}
+{ .mbb
+ nop.m 0
+(p7) br.cond.spnt ILOGB_ZERO // Branch if x zero
+(p10) br.ret.sptk b0 // Exit if x not zero
}
+;;
-L(ILOGB_COMMON):
-// X NORMAL returns true exponent
-{ .mmi
- nop.m 999
-(p8) getf.exp r33 = f9
- nop.i 999 ;;
+
+ILOGB_DENORM:
+// Form 2^64 in case need to scale denormal
+// Check to see if double-extended denormal
+{ .mfi
+ setf.exp f2to64 = rExp_2to64
+ fclass.m p8,p0 = fNorm_x, 0x0b
+ nop.i 0
}
+;;
-// If denormal add 64 to exponent bias for scaling
-{ .mfb
-(p9) add r34 = 64, r34
- nop.f 999
-(p9) br.cond.spnt L(ILOGB_DENORM) ;;
+{ .mfi
+ nop.m 0
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag
+ nop.i 0
}
+;;
-{ .mmi
-(p8) and r36 = r35, r33
- nop.m 999
- nop.i 999 ;;
+// If double-extended denormal add 64 to exponent bias for scaling
+// If double-extended denormal form x * 2^64 which is normal
+{ .mfi
+(p8) add rExpBias = 64, rExpBias
+(p8) fmpy.s1 fNorm_x = fNorm_x, f2to64
+ nop.i 0
}
+;;
+// Logic is the same as normal path but use normalized input
{ .mib
-(p8) sub r8 = r36, r34
- nop.i 999
-(p0) br.ret.sptk b0 ;;
+ getf.exp rSignexp_x = fNorm_x
+ nop.i 0
+ br.cond.sptk ILOGB_COMMON // Return to main path
}
+;;
-L(ILOGB_DENORM):
-// Here if x denormal
-// Form x * 2^64 which is normal
-// Return to common code
-{ .mfb
- cmp.eq p8,p9 = r0,r0
- fmpy f9 = f9, f10
- br.cond.sptk L(ILOGB_COMMON) ;;
+ILOGB_ZERO:
+// Here if x zero
+// Return INT_MIN, call error support
+
+{ .mlx
+ alloc r32=ar.pfs,1,3,4,0
+ movl rTrialResult = 0x0000000080000000
+}
+{ .mib
+ mov GR_Parameter_TAG = 157 // Error code
+ nop.i 0
+ br.cond.sptk __libm_error_region // Call error support
}
+;;
-// X ZERO
-// return INT_MIN, call error support
-L(ILOGB_ZERO):
-{.mlx
- mov GR_Parameter_TAG = 157
-(p6) movl r33 = 0x0000000080000000 ;;
-};;
-.endp ilogb
-ASM_SIZE_DIRECTIVE(ilogb)
+GLOBAL_LIBM_END(ilogb)
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
+
{ .mfi
- add GR_Parameter_Y=-32,sp // Parameter 2 value
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
+ add sp=-64,sp // Create new stack
nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ mov GR_SAVE_GP=gp // Save gp
};;
+
{ .mmi
- stfd [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
- add GR_Parameter_X = 16,sp // Parameter 1 address
+ stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0=b0 // Save b0
};;
+
.body
{ .mib
- stfd [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y
- nop.b 0 // Parameter 3 address
+ stfd [GR_Parameter_X] = f8 // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
}
{ .mib
- stfd [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
+ stfd [GR_Parameter_Y] = f9 // Store Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
+
{ .mmi
- nop.m 0
- nop.m 0
add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
+
{ .mmi
- mov r8 = r33 // Store result
+ mov r8 = rTrialResult
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
+
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
- br.ret.sptk b0 // Return
+ br.ret.sptk b0
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
+
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/s_ilogbf.S b/sysdeps/ia64/fpu/s_ilogbf.S
index ffa6d3b672..1b6ade6148 100644
--- a/sysdeps/ia64/fpu/s_ilogbf.S
+++ b/sysdeps/ia64/fpu/s_ilogbf.S
@@ -1,10 +1,10 @@
.file "ilogbf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,234 +20,248 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/03/00 Initial version
-// 5/26/00 Fix bug when x a double-extended denormal;
+// 02/03/00 Initial version
+// 05/26/00 Fix bug when x a double-extended denormal;
// if x=0 call error routine, per C9X
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
-// 1/20/01 Fixed result for x=0
-
-.align 32
-.global ilogbf#
-
-.section .text
-.proc ilogbf#
-.align 32
-
+// 01/20/01 Fixed result for x=0
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/20/03 Improved performance
+//
// API
//==============================================================
-// int = ilogbf(float)
-
+// int ilogbf( float x );
+//
// Overview of operation
//==============================================================
-// ilogbf computes log2(x) as an int
+// The ilogbf function extracts the exponent of x as an integer
// and returns it in r8
-
-// ilogbf is similar to logbf but differs in the following ways:
+//
+// ilogbf is similar to logbf but differs in the following ways:
// +-inf
// ilogbf: returns INT_MAX
// logbf: returns +inf
-// Nan returns FP_ILOGBNAN (which is either INT_MAX or INT_MIN)
+// Nan returns FP_ILOGBNAN (which is either INT_MAX or INT_MIN)
// ilogbf: returns INT_MAX (7fffffff)
-// logbf: returns QNAN (quieted SNAN)
+// logbf: returns QNAN (quietized SNAN)
// 0 returns FP_ILOGB0 (which is either INT_MIN or -INT_MAX)
-// ilogbf: returns INT_MIN (80000000)
-// logbf: returns -inf
-
+// ilogbf: returns INT_MIN (80000000)
+// logbf: returns -inf, raises the divide-by-zero exception,
+// and calls libm_error_support to set domain error
+//
// Registers used
//==============================================================
+// general registers used:
+// r26 -> r39
+// r36 -> r39 used as parameters to error path
+//
+// predicate registers used:
+// p6 -> p10
+// floating-point registers used:
+// f9, f10, f11
+// f8, input
-// general local registers:
-// ar.pfs r32
-// r33 -> r37
-// r38 -> r41 used as parameters to error path
-
-// predicate registers used:
-// p6 - x nan, inf
-// p7 - x 0
-// p8 - x norm, unorm
-// p9 - x unorm
-
-// floating-point registers used:
-// f8 - f10
-
-#include "libm_support.h"
+rExpBias = r26
+rExpMask = r27
+rSignexp_x = r28
+rExp_x = r29
+rIntMax = r30
+rExp_2to64 = r31
GR_SAVE_PFS = r32
+rTrialResult = r33
GR_SAVE_B0 = r34
GR_SAVE_GP = r35
-GR_Parameter_X = r38
-GR_Parameter_Y = r39
-GR_Parameter_RESULT = r40
-GR_Parameter_TAG = r41
-FR_X = f8
-FR_Y = f0
-FR_RESULT = f0
+GR_Parameter_X = r36
+GR_Parameter_Y = r37
+GR_Parameter_RESULT = r38
+GR_Parameter_TAG = r39
+fTmp = f9
+fNorm_x = f10
+f2to64 = f11
-ilogbf:
+.section .text
+GLOBAL_LIBM_ENTRY(ilogbf)
-// Form signexp of 2^64 in case need to scale denormal
-{ .mmf
- alloc r32=ar.pfs,1,5,4,0
-(p0) mov r37 = 0x1003f
-(p0) fnorm f9 = f8 ;;
+// X NORMAL
+// TrueExp_x = exp(f8) - 0xffff
+// r8 = TrueExp_x
+{ .mfi
+ getf.exp rSignexp_x = f8
+ fclass.m p8,p0 = f8, 0x0b // Test for x unorm
+ mov rExpBias = 0xffff // Exponent bias
}
-
-// Form 2^64 in case need to scale denormal
{ .mfi
-(p0) setf.exp f10 = r37
-(p0) fclass.m.unc p7, p8 = f8, 0xe3
-(p0) mov r34 = 0xffff ;;
+ nop.m 0
+ fnorm.s1 fNorm_x = f8
+ mov rExpMask = 0x1ffff // Exponent mask
}
+;;
-// qnan snan inf norm unorm 0 -+
-// 1 1 1 0 0 0 11
-// e 3
-// X ZERO, returns INT_MIN
-// X INF or NAN, returns INT_MAX
+// Form signexp of 2^64 in case need to scale denormal
+{ .mfb
+ mov rExp_2to64 = 0x1003f
+ fclass.m p6,p9 = f8, 0x1e3 // Test x natval, nan, inf
+(p8) br.cond.spnt ILOGB_DENORM // Branch if x unorm
+}
+;;
+ILOGB_COMMON:
+// Return here from ILOGB_DENORM
{ .mfi
-(p0) mov r35 = 0x1ffff
-(p8) fclass.m.unc p6, p8 = f8, 0x07
- nop.i 999 ;;
+ and rExp_x = rSignexp_x, rExpMask // Get biased exponent
+ fclass.m p7,p10 = f8, 0x07 // Test x zero
+ nop.i 0
}
{ .mlx
- nop.m 999
-(p7) movl r8 = 0x000000007fffffff ;;
+ nop.m 0
+ movl rIntMax = 0x000000007fffffff // Form INT_MAX
}
+;;
-{ .mib
- nop.m 999
- nop.i 999
-(p6) br.cond.spnt L(ILOGB_ZERO) ;;
-}
-
-// Test for denormal
+.pred.rel "mutex",p6,p9
{ .mfi
- nop.m 999
-(p8) fclass.m.unc p9, p0 = f9, 0x0b
- nop.i 999 ;;
+(p9) sub r8 = rExp_x, rExpBias // Get true exponent for normal path
+(p6) fma.s0 fTmp = f8, f8, f0 // Dummy to set Invalid flag
+(p6) mov r8 = rIntMax // If nan, inf, return INT_MAX
+}
+{ .mbb
+ nop.m 0
+(p7) br.cond.spnt ILOGB_ZERO // Branch if x zero
+(p10) br.ret.sptk b0 // Exit if x not zero
}
+;;
-L(ILOGB_COMMON):
-// X NORMAL returns true exponent
-{ .mmi
- nop.m 999
-(p8) getf.exp r33 = f9
- nop.i 999 ;;
+
+ILOGB_DENORM:
+// Form 2^64 in case need to scale denormal
+// Check to see if double-extended denormal
+{ .mfi
+ setf.exp f2to64 = rExp_2to64
+ fclass.m p8,p0 = fNorm_x, 0x0b
+ nop.i 0
}
+;;
-// If denormal add 64 to exponent bias for scaling
-{ .mfb
-(p9) add r34 = 64, r34
- nop.f 999
-(p9) br.cond.spnt L(ILOGB_DENORM) ;;
+{ .mfi
+ nop.m 0
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag
+ nop.i 0
}
+;;
-{ .mmi
-(p8) and r36 = r35, r33
- nop.m 999
- nop.i 999 ;;
+// If double-extended denormal add 64 to exponent bias for scaling
+// If double-extended denormal form x * 2^64 which is normal
+{ .mfi
+(p8) add rExpBias = 64, rExpBias
+(p8) fmpy.s1 fNorm_x = fNorm_x, f2to64
+ nop.i 0
}
+;;
+// Logic is the same as normal path but use normalized input
{ .mib
-(p8) sub r8 = r36, r34
- nop.i 999
-(p0) br.ret.sptk b0 ;;
+ getf.exp rSignexp_x = fNorm_x
+ nop.i 0
+ br.cond.sptk ILOGB_COMMON // Return to main path
}
+;;
-L(ILOGB_DENORM):
-// Here if x denormal
-// Form x * 2^64 which is normal
-// Return to common code
-{ .mfb
- cmp.eq p8,p9 = r0,r0
- fmpy f9 = f9, f10
- br.cond.sptk L(ILOGB_COMMON) ;;
+ILOGB_ZERO:
+// Here if x zero
+// Return INT_MIN, call error support
+
+{ .mlx
+ alloc r32=ar.pfs,1,3,4,0
+ movl rTrialResult = 0x0000000080000000
+}
+{ .mib
+ mov GR_Parameter_TAG = 158 // Error code
+ nop.i 0
+ br.cond.sptk __libm_error_region // Call error support
}
+;;
-// X ZERO
-// return INT_MIN, call error support
-L(ILOGB_ZERO):
-{.mlx
- mov GR_Parameter_TAG = 158
-(p6) movl r33 = 0x0000000080000000 ;;
-};;
-.endp ilogbf
-ASM_SIZE_DIRECTIVE(ilogbf)
+GLOBAL_LIBM_END(ilogbf)
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
+
{ .mfi
- add GR_Parameter_Y=-32,sp // Parameter 2 value
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
+ add sp=-64,sp // Create new stack
nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ mov GR_SAVE_GP=gp // Save gp
};;
+
{ .mmi
- stfs [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
- add GR_Parameter_X = 16,sp // Parameter 1 address
+ stfs [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0=b0 // Save b0
};;
+
.body
{ .mib
- stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y
- nop.b 0 // Parameter 3 address
+ stfs [GR_Parameter_X] = f8 // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
}
{ .mib
- stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
+ stfs [GR_Parameter_Y] = f9 // Store Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
+
{ .mmi
- nop.m 0
- nop.m 0
add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
+
{ .mmi
- mov r8 = r33 // Store result
+ mov r8 = rTrialResult
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
+
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
- br.ret.sptk b0 // Return
+ br.ret.sptk b0
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
+
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/s_ilogbl.S b/sysdeps/ia64/fpu/s_ilogbl.S
index 240da060bf..e462fb706e 100644
--- a/sysdeps/ia64/fpu/s_ilogbl.S
+++ b/sysdeps/ia64/fpu/s_ilogbl.S
@@ -1,10 +1,10 @@
.file "ilogbl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,234 +20,248 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/03/00 Initial version
-// 5/26/00 Fix bug when x a double-extended denormal;
+// 02/03/00 Initial version
+// 05/26/00 Fix bug when x a double-extended denormal;
// if x=0 call error routine, per C9X
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
-// 1/20/01 Fixed result for x=0
-
-.align 32
-.global ilogbl#
-
-.section .text
-.proc ilogbl#
-.align 32
-
+// 01/20/01 Fixed result for x=0
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/20/03 Improved performance
+//
// API
//==============================================================
-// int = ilogbl(double_extended)
-
+// int ilogbl( long double x );
+//
// Overview of operation
//==============================================================
-// ilogbl computes log2(x) as an int
+// The ilogbl function extracts the exponent of x as an integer
// and returns it in r8
-
-// ilogbl is similar to logbl but differs in the following ways:
+//
+// ilogbl is similar to logbl but differs in the following ways:
// +-inf
// ilogbl: returns INT_MAX
// logbl: returns +inf
-// Nan returns FP_ILOGBNAN (which is either INT_MAX or INT_MIN)
+// Nan returns FP_ILOGBNAN (which is either INT_MAX or INT_MIN)
// ilogbl: returns INT_MAX (7fffffff)
-// logbl: returns QNAN (quieted SNAN)
+// logbl: returns QNAN (quietized SNAN)
// 0 returns FP_ILOGB0 (which is either INT_MIN or -INT_MAX)
-// ilogbl: returns INT_MIN (80000000)
-// logbl: returns -inf
-
+// ilogbl: returns INT_MIN (80000000)
+// logbl: returns -inf, raises the divide-by-zero exception,
+// and calls libm_error_support to set domain error
+//
// Registers used
//==============================================================
+// general registers used:
+// r26 -> r39
+// r36 -> r39 used as parameters to error path
+//
+// predicate registers used:
+// p6 -> p10
+// floating-point registers used:
+// f9, f10, f11
+// f8, input
-// general local registers:
-// ar.pfs r32
-// r33 -> r37
-// r38 -> r41 used as parameters to error path
-
-// predicate registers used:
-// p6 - x nan, inf
-// p7 - x 0
-// p8 - x norm, unorm
-// p9 - x unorm
-
-// floating-point registers used:
-// f8 - f10
-
-#include "libm_support.h"
+rExpBias = r26
+rExpMask = r27
+rSignexp_x = r28
+rExp_x = r29
+rIntMax = r30
+rExp_2to64 = r31
GR_SAVE_PFS = r32
+rTrialResult = r33
GR_SAVE_B0 = r34
GR_SAVE_GP = r35
-GR_Parameter_X = r38
-GR_Parameter_Y = r39
-GR_Parameter_RESULT = r40
-GR_Parameter_TAG = r41
-FR_X = f8
-FR_Y = f0
-FR_RESULT = f0
+GR_Parameter_X = r36
+GR_Parameter_Y = r37
+GR_Parameter_RESULT = r38
+GR_Parameter_TAG = r39
+fTmp = f9
+fNorm_x = f10
+f2to64 = f11
-ilogbl:
+.section .text
+GLOBAL_LIBM_ENTRY(ilogbl)
-// Form signexp of 2^64 in case need to scale denormal
-{ .mmf
- alloc r32=ar.pfs,1,5,4,0
-(p0) mov r37 = 0x1003f
-(p0) fnorm f9 = f8 ;;
+// X NORMAL
+// TrueExp_x = exp(f8) - 0xffff
+// r8 = TrueExp_x
+{ .mfi
+ getf.exp rSignexp_x = f8
+ fclass.m p8,p0 = f8, 0x0b // Test for x unorm
+ mov rExpBias = 0xffff // Exponent bias
}
-
-// Form 2^64 in case need to scale denormal
{ .mfi
-(p0) setf.exp f10 = r37
-(p0) fclass.m.unc p7, p8 = f8, 0xe3
-(p0) mov r34 = 0xffff ;;
+ nop.m 0
+ fnorm.s1 fNorm_x = f8
+ mov rExpMask = 0x1ffff // Exponent mask
}
+;;
-// qnan snan inf norm unorm 0 -+
-// 1 1 1 0 0 0 11
-// e 3
-// X ZERO, returns INT_MIN
-// X INF or NAN, returns INT_MAX
+// Form signexp of 2^64 in case need to scale denormal
+{ .mfb
+ mov rExp_2to64 = 0x1003f
+ fclass.m p6,p9 = f8, 0x1e3 // Test x natval, nan, inf
+(p8) br.cond.spnt ILOGB_DENORM // Branch if x unorm
+}
+;;
+ILOGB_COMMON:
+// Return here from ILOGB_DENORM
{ .mfi
-(p0) mov r35 = 0x1ffff
-(p8) fclass.m.unc p6, p8 = f8, 0x07
- nop.i 999 ;;
+ and rExp_x = rSignexp_x, rExpMask // Get biased exponent
+ fclass.m p7,p10 = f8, 0x07 // Test x zero
+ nop.i 0
}
{ .mlx
- nop.m 999
-(p7) movl r8 = 0x000000007fffffff ;;
+ nop.m 0
+ movl rIntMax = 0x000000007fffffff // Form INT_MAX
}
+;;
-{ .mib
- nop.m 999
- nop.i 999
-(p6) br.cond.spnt L(ILOGB_ZERO) ;;
-}
-
-// Test for denormal
+.pred.rel "mutex",p6,p9
{ .mfi
- nop.m 999
-(p8) fclass.m.unc p9, p0 = f9, 0x0b
- nop.i 999 ;;
+(p9) sub r8 = rExp_x, rExpBias // Get true exponent for normal path
+(p6) fma.s0 fTmp = f8, f8, f0 // Dummy to set Invalid flag
+(p6) mov r8 = rIntMax // If nan, inf, return INT_MAX
+}
+{ .mbb
+ nop.m 0
+(p7) br.cond.spnt ILOGB_ZERO // Branch if x zero
+(p10) br.ret.sptk b0 // Exit if x not zero
}
+;;
-L(ILOGB_COMMON):
-// X NORMAL returns true exponent
-{ .mmi
- nop.m 999
-(p8) getf.exp r33 = f9
- nop.i 999 ;;
+
+ILOGB_DENORM:
+// Form 2^64 in case need to scale denormal
+// Check to see if double-extended denormal
+{ .mfi
+ setf.exp f2to64 = rExp_2to64
+ fclass.m p8,p0 = fNorm_x, 0x0b
+ nop.i 0
}
+;;
-// If denormal add 64 to exponent bias for scaling
-{ .mfb
-(p9) add r34 = 64, r34
- nop.f 999
-(p9) br.cond.spnt L(ILOGB_DENORM) ;;
+{ .mfi
+ nop.m 0
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag
+ nop.i 0
}
+;;
-{ .mmi
-(p8) and r36 = r35, r33
- nop.m 999
- nop.i 999 ;;
+// If double-extended denormal add 64 to exponent bias for scaling
+// If double-extended denormal form x * 2^64 which is normal
+{ .mfi
+(p8) add rExpBias = 64, rExpBias
+(p8) fmpy.s1 fNorm_x = fNorm_x, f2to64
+ nop.i 0
}
+;;
+// Logic is the same as normal path but use normalized input
{ .mib
-(p8) sub r8 = r36, r34
- nop.i 999
-(p0) br.ret.sptk b0 ;;
+ getf.exp rSignexp_x = fNorm_x
+ nop.i 0
+ br.cond.sptk ILOGB_COMMON // Return to main path
}
+;;
-L(ILOGB_DENORM):
-// Here if x denormal
-// Form x * 2^64 which is normal
-// Return to common code
-{ .mfb
- cmp.eq p8,p9 = r0,r0
- fmpy f9 = f9, f10
- br.cond.sptk L(ILOGB_COMMON) ;;
+ILOGB_ZERO:
+// Here if x zero
+// Return INT_MIN, call error support
+
+{ .mlx
+ alloc r32=ar.pfs,1,3,4,0
+ movl rTrialResult = 0x0000000080000000
+}
+{ .mib
+ mov GR_Parameter_TAG = 156 // Error code
+ nop.i 0
+ br.cond.sptk __libm_error_region // Call error support
}
+;;
-// X ZERO
-// return INT_MIN, call error support
-L(ILOGB_ZERO):
-{.mlx
- mov GR_Parameter_TAG = 156
-(p6) movl r33 = 0x0000000080000000 ;;
-};;
-.endp ilogbl
-ASM_SIZE_DIRECTIVE(ilogbl)
+GLOBAL_LIBM_END(ilogbl)
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
+
{ .mfi
- add GR_Parameter_Y=-32,sp // Parameter 2 value
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
+ add sp=-64,sp // Create new stack
nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ mov GR_SAVE_GP=gp // Save gp
};;
+
{ .mmi
- stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
- add GR_Parameter_X = 16,sp // Parameter 1 address
+ stfe [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0=b0 // Save b0
};;
+
.body
{ .mib
- stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y
- nop.b 0 // Parameter 3 address
+ stfe [GR_Parameter_X] = f8 // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
}
{ .mib
- stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
+ stfe [GR_Parameter_Y] = f9 // Store Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
+
{ .mmi
- nop.m 0
- nop.m 0
add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
+
{ .mmi
- mov r8 = r33 // Store result
+ mov r8 = rTrialResult
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
+
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
- br.ret.sptk b0 // Return
+ br.ret.sptk b0
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
+
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/s_ldexp.S b/sysdeps/ia64/fpu/s_ldexp.S
deleted file mode 100644
index 4dcd671c9f..0000000000
--- a/sysdeps/ia64/fpu/s_ldexp.S
+++ /dev/null
@@ -1,380 +0,0 @@
-.file "ldexp.s"
-
-// Copyright (C) 2000, 2001, Intel Corporation
-// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// * Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// * The name of Intel Corporation may not be used to endorse or promote
-// products derived from this software without specific prior written
-// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
-//
-// History
-//==============================================================
-// 2/02/00 Initial version
-// 1/26/01 ldex pcompletely reworked and now standalone version
-//
-// API
-//==============================================================
-// double = ldexp (double x, int n)
-// input floating point f8 and int n (r33)
-// output floating point f8
-//
-// Returns x* 2**n using an fma and detects overflow
-// and underflow.
-//
-//
-
-#include "libm_support.h"
-
-FR_Big = f6
-FR_NBig = f7
-FR_Floating_X = f8
-FR_Result = f8
-FR_Result2 = f9
-FR_Result3 = f11
-FR_Norm_X = f12
-FR_Two_N = f14
-FR_Two_to_Big = f15
-
-GR_N_Biased = r15
-GR_Big = r16
-GR_NBig = r17
-GR_Scratch = r18
-GR_Scratch1 = r19
-GR_Bias = r20
-GR_N_as_int = r21
-
-GR_SAVE_B0 = r32
-GR_SAVE_GP = r33
-GR_SAVE_PFS = r34
-GR_Parameter_X = r35
-GR_Parameter_Y = r36
-GR_Parameter_RESULT = r37
-GR_Tag = r38
-
-.align 32
-.global ldexp
-
-.section .text
-.proc ldexp
-.align 32
-
-ldexp:
-
-//
-// Is x NAN, INF, ZERO, +-?
-// Build the exponent Bias
-//
-{ .mfi
- alloc r32=ar.pfs,1,2,4,0
- fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero
- addl GR_Bias = 0x0FFFF,r0
-}
-
-//
-// Sign extend input
-// Is N zero?
-// Normalize x
-//
-{ .mfi
- cmp.eq.unc p6,p0 = r33,r0
- fnorm.s1 FR_Norm_X = FR_Floating_X
- sxt4 GR_N_as_int = r33
-}
-;;
-
-//
-// Normalize x
-// Branch and return special values.
-// Create -35000
-// Create 35000
-//
-{ .mfi
- addl GR_Big = 35000,r0
- nop.f 0
- add GR_N_Biased = GR_Bias,GR_N_as_int
-}
-{ .mfb
- addl GR_NBig = -35000,r0
-(p7) fma.d.s0 FR_Result = FR_Floating_X,f1, f0
-(p7) br.ret.spnt b0
-};;
-
-//
-// Build the exponent Bias
-// Return x when N = 0
-//
-{ .mfi
- setf.exp FR_Two_N = GR_N_Biased
- nop.f 0
- addl GR_Scratch1 = 0x063BF,r0
-}
-{ .mfb
- addl GR_Scratch = 0x019C3F,r0
-(p6) fma.d.s0 FR_Result = FR_Floating_X,f1, f0
-(p6) br.ret.spnt b0
-};;
-
-//
-// Create 2*big
-// Create 2**-big
-// Is N > 35000
-// Is N < -35000
-// Raise Denormal operand flag with compare
-// Main path, create 2**N
-//
-{ .mfi
- setf.exp FR_NBig = GR_Scratch1
- nop.f 0
- cmp.ge.unc p6, p0 = GR_N_as_int, GR_Big
-}
-{ .mfi
- setf.exp FR_Big = GR_Scratch
- fcmp.ge.s0 p0,p11 = FR_Floating_X,f0
- cmp.le.unc p8, p0 = GR_N_as_int, GR_NBig
-};;
-
-//
-// Adjust 2**N if N was very small or very large
-//
-{ .mfi
- nop.m 0
-(p6) fma.s1 FR_Two_N = FR_Big,f1,f0
- nop.i 0
-}
-{ .mlx
- nop.m 999
-(p0) movl GR_Scratch = 0x00000000000303FF
-};;
-
-
-{ .mfi
- nop.m 0
-(p8) fma.s1 FR_Two_N = FR_NBig,f1,f0
- nop.i 0
-}
-{ .mlx
- nop.m 999
-(p0) movl GR_Scratch1= 0x00000000000103FF
-};;
-
-// Set up necessary status fields
-//
-// S0 user supplied status
-// S2 user supplied status + WRE + TD (Overflows)
-// S3 user supplied status + FZ + TD (Underflows)
-//
-{ .mfi
- nop.m 999
-(p0) fsetc.s3 0x7F,0x41
- nop.i 999
-}
-{ .mfi
- nop.m 999
-(p0) fsetc.s2 0x7F,0x42
- nop.i 999
-};;
-
-//
-// Do final operation
-//
-{ .mfi
- setf.exp FR_NBig = GR_Scratch
- fma.d.s0 FR_Result = FR_Two_N,FR_Norm_X,f0
- nop.i 999
-}
-{ .mfi
- nop.m 999
- fma.d.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0
- nop.i 999
-};;
-{ .mfi
- setf.exp FR_Big = GR_Scratch1
- fma.d.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0
- nop.i 999
-};;
-
-//
-// Check for overflow or underflow.
-// Restore s3
-// Restore s2
-//
-{ .mfi
- nop.m 0
- fsetc.s3 0x7F,0x40
- nop.i 999
-}
-{ .mfi
- nop.m 0
- fsetc.s2 0x7F,0x40
- nop.i 999
-};;
-
-//
-// Is the result zero?
-//
-{ .mfi
- nop.m 999
- fclass.m.unc p6, p0 = FR_Result3, 0x007
- nop.i 999
-}
-{ .mfi
- addl GR_Tag = 146, r0
- fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big
- nop.i 0
-};;
-
-//
-// Detect masked underflow - Tiny + Inexact Only
-//
-{ .mfi
- nop.m 999
-(p6) fcmp.neq.unc.s1 p6, p0 = FR_Result , FR_Result2
- nop.i 999
-};;
-
-//
-// Is result bigger the allowed range?
-// Branch out for underflow
-//
-{ .mfb
-(p6) addl GR_Tag = 147, r0
-(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig
-(p6) br.cond.spnt L(LDEXP_UNDERFLOW)
-};;
-
-//
-// Branch out for overflow
-//
-{ .mbb
- nop.m 0
-(p7) br.cond.spnt L(LDEXP_OVERFLOW)
-(p9) br.cond.spnt L(LDEXP_OVERFLOW)
-};;
-
-//
-// Return from main path.
-//
-{ .mfb
- nop.m 999
- nop.f 0
- br.ret.sptk b0;;
-}
-
-.endp ldexp
-ASM_SIZE_DIRECTIVE(ldexp)
-.proc __libm_error_region
-__libm_error_region:
-
-L(LDEXP_OVERFLOW):
-L(LDEXP_UNDERFLOW):
-
-//
-// Get stack address of N
-//
-.prologue
-{ .mfi
- add GR_Parameter_Y=-32,sp
- nop.f 0
-.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs
-}
-//
-// Adjust sp
-//
-{ .mfi
-.fframe 64
- add sp=-64,sp
- nop.f 0
- mov GR_SAVE_GP=gp
-};;
-
-//
-// Store N on stack in correct position
-// Locate the address of x on stack
-//
-{ .mmi
- st8 [GR_Parameter_Y] = GR_N_as_int,16
- add GR_Parameter_X = 16,sp
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0
-};;
-
-//
-// Store x on the stack.
-// Get address for result on stack.
-//
-.body
-{ .mib
- stfd [GR_Parameter_X] = FR_Norm_X
- add GR_Parameter_RESULT = 0,GR_Parameter_Y
- nop.b 0
-}
-{ .mib
- stfd [GR_Parameter_Y] = FR_Result
- add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support#
-};;
-
-//
-// Get location of result on stack
-//
-{ .mmi
- nop.m 0
- nop.m 0
- add GR_Parameter_RESULT = 48,sp
-};;
-
-//
-// Get the new result
-//
-{ .mmi
- ldfd FR_Result = [GR_Parameter_RESULT]
-.restore sp
- add sp = 64,sp
- mov b0 = GR_SAVE_B0
-};;
-
-//
-// Restore gp, ar.pfs and return
-//
-{ .mib
- mov gp = GR_SAVE_GP
- mov ar.pfs = GR_SAVE_PFS
- br.ret.sptk b0
-};;
-
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
-
-.type __libm_error_support#,@function
-.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/s_ldexp.c b/sysdeps/ia64/fpu/s_ldexp.c
new file mode 100644
index 0000000000..015b6508c1
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_ldexp.c
@@ -0,0 +1,62 @@
+/* file: ldexp.c */
+
+
+// Copyright (c) 2000, 2001, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
+// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//
+
+#include "libm_support.h"
+
+double __libm_ldexp(double, int, int);
+
+
+double ldexp(double x, int n)
+{
+
+#ifdef SIZE_INT_64
+ return __libm_ldexp(x,n,1);
+#else
+
+#ifdef SIZE_INT_32
+ return __libm_ldexp(x,n,0);
+#endif
+
+#endif
+
+}
diff --git a/sysdeps/ia64/fpu/s_ldexpf.c b/sysdeps/ia64/fpu/s_ldexpf.c
new file mode 100644
index 0000000000..eae4051873
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_ldexpf.c
@@ -0,0 +1,62 @@
+/* file: ldexpf.c */
+
+
+// Copyright (c) 2000, 2001, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
+// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//
+
+#include "libm_support.h"
+
+float __libm_ldexpf(float, int, int);
+
+
+float ldexpf(float x, int n)
+{
+
+#ifdef SIZE_INT_64
+ return __libm_ldexpf(x,n,1);
+#else
+
+#ifdef SIZE_INT_32
+ return __libm_ldexpf(x,n,0);
+#endif
+
+#endif
+
+}
diff --git a/sysdeps/ia64/fpu/s_ldexpl.c b/sysdeps/ia64/fpu/s_ldexpl.c
new file mode 100644
index 0000000000..91d826841f
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_ldexpl.c
@@ -0,0 +1,62 @@
+/* file: ldexpl.c */
+
+
+// Copyright (c) 2000, 2001, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
+// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//
+
+#include "libm_support.h"
+
+long double __libm_ldexpl(long double, int, int);
+
+
+long double ldexpl(long double x, int n)
+{
+
+#ifdef SIZE_INT_64
+ return __libm_ldexpl(x,n,1);
+#else
+
+#ifdef SIZE_INT_32
+ return __libm_ldexpl(x,n,0);
+#endif
+
+#endif
+
+}
diff --git a/sysdeps/ia64/fpu/s_libm_ldexp.S b/sysdeps/ia64/fpu/s_libm_ldexp.S
new file mode 100644
index 0000000000..1fc2c3f80c
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_libm_ldexp.S
@@ -0,0 +1,382 @@
+.file "libm_ldexp.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 02/02/00 Initial version
+// 01/26/01 ldexp completely reworked and now standalone version
+// 01/04/02 Added handling for int 32 or 64 bits
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// double __libm_ldexp (double x, int n, int int_type)
+// input floating point f8 and int n (r33), int int_type (r34)
+// output floating point f8
+//
+// int_type = 0 if int is 32 bits
+// int_type = 1 if int is 64 bits
+//
+// Returns x* 2**n using an fma and detects overflow
+// and underflow.
+//
+//
+
+FR_Big = f6
+FR_NBig = f7
+FR_Floating_X = f8
+FR_Result = f8
+FR_Result2 = f9
+FR_Result3 = f11
+FR_Norm_X = f12
+FR_Two_N = f14
+FR_Two_to_Big = f15
+// General registers holding N, the bias, the N limits and exponent patterns
+GR_N_Biased = r15
+GR_Big = r16
+GR_NBig = r17
+GR_Scratch = r18
+GR_Scratch1 = r19
+GR_Bias = r20
+GR_N_as_int = r21
+// Stacked registers r32-r38: saved state and error-handler arguments
+GR_SAVE_B0 = r32
+GR_SAVE_GP = r33
+GR_SAVE_PFS = r34
+GR_Parameter_X = r35
+GR_Parameter_Y = r36
+GR_Parameter_RESULT = r37
+GR_Tag = r38
+
+.section .text
+GLOBAL_LIBM_ENTRY(__libm_ldexp)
+// In: x = f8, n = r33, int_type = r34.  Out: x * 2**n in f8.
+//
+// p7 <- x is NaN, infinity or zero (either sign)
+// GR_Bias <- 0xFFFF, the exponent bias later added to N
+//
+{ .mfi
+ alloc r32=ar.pfs,3,0,4,0
+ fclass.m p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero
+ addl GR_Bias = 0x0FFFF,r0
+}
+
+//
+// p6 <- N is zero (tested on the raw r33, before any sign extension)
+// Normalize x
+// p8 <- int is 32 bits (int_type == 0), p9 <- otherwise
+//
+{ .mfi
+ cmp.eq p6,p0 = r33,r0
+ fnorm.s1 FR_Norm_X = FR_Floating_X
+ cmp.eq p8,p9 = r34,r0
+}
+;;
+
+// GR_N_as_int <- N as a full 64-bit value
+{ .mfi
+(p9) mov GR_N_as_int = r33 // Copy N if int is 64 bits
+ nop.f 0
+(p8) sxt4 GR_N_as_int = r33 // Sign extend N if int is 32 bits
+}
+;;
+
+//
+// Return x itself (computed as x*1+0) when p7 flagged a special value.
+// GR_Big / GR_NBig <- +35000 / -35000, the limits N is compared against
+// GR_N_Biased <- N + bias
+//
+{ .mfi
+ addl GR_Big = 35000,r0
+ nop.f 0
+ add GR_N_Biased = GR_Bias,GR_N_as_int
+}
+{ .mfb
+ addl GR_NBig = -35000,r0
+(p7) fma.d.s0 FR_Result = FR_Floating_X,f1, f0
+(p7) br.ret.spnt b0
+};;
+
+//
+// FR_Two_N <- 2**N; scratch regs <- 0xFFFF - 40000 and 0xFFFF + 40000
+// Return x when N = 0
+//
+{ .mfi
+ setf.exp FR_Two_N = GR_N_Biased
+ nop.f 0
+ addl GR_Scratch1 = 0x063BF,r0
+}
+{ .mfb
+ addl GR_Scratch = 0x019C3F,r0
+(p6) fma.d.s0 FR_Result = FR_Floating_X,f1, f0
+(p6) br.ret.spnt b0
+};;
+
+//
+// FR_Big <- 2**40000 (exponent pattern from GR_Scratch)
+// FR_NBig <- 2**-40000 (exponent pattern from GR_Scratch1)
+// p6 <- N >= 35000
+// p8 <- N <= -35000
+// Raise Denormal operand flag with compare
+// (2**N for the main path was already formed in the previous group)
+//
+{ .mfi
+ setf.exp FR_NBig = GR_Scratch1
+ nop.f 0
+ cmp.ge p6, p0 = GR_N_as_int, GR_Big
+}
+{ .mfi
+ setf.exp FR_Big = GR_Scratch
+ fcmp.ge.s0 p0,p11 = FR_Floating_X,f0
+ cmp.le p8, p0 = GR_N_as_int, GR_NBig
+};;
+
+// Replace 2**N by 2**40000 (p6) or 2**-40000 (p8) when N hit a limit.
+// The movl's fetch the exponent patterns for the +-2**1024 bounds set below.
+//
+{ .mfi
+ nop.m 0
+(p6) fma.s1 FR_Two_N = FR_Big,f1,f0
+ nop.i 0
+}
+{ .mlx
+ nop.m 999
+ movl GR_Scratch = 0x00000000000303FF
+};;
+
+
+{ .mfi
+ nop.m 0
+(p8) fma.s1 FR_Two_N = FR_NBig,f1,f0
+ nop.i 0
+}
+{ .mlx
+ nop.m 999
+ movl GR_Scratch1= 0x00000000000103FF
+};;
+
+// Set up necessary status fields
+//
+// S0 user supplied status
+// S2 user supplied status + WRE + TD (Overflows)
+// S3 user supplied status + FZ + TD (Underflows)
+//
+{ .mfi
+ nop.m 999
+ fsetc.s3 0x7F,0x41
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+ fsetc.s2 0x7F,0x42
+ nop.i 999
+};;
+
+// Do final operation: 2**N * x under s0 (the result), s3 and s2 (probes)
+// FR_NBig <- -2**1024, FR_Big <- +2**1024 (0xFFFF + 1024; bit 17 = sign)
+//
+{ .mfi
+ setf.exp FR_NBig = GR_Scratch
+ fma.d.s0 FR_Result = FR_Two_N,FR_Norm_X,f0
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+ fma.d.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0
+ nop.i 999
+};;
+{ .mfi
+ setf.exp FR_Big = GR_Scratch1
+ fma.d.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0
+ nop.i 999
+};;
+
+//
+// Probes are done: put the status fields back before testing the results.
+// Restore s3
+// Restore s2
+//
+{ .mfi
+ nop.m 0
+ fsetc.s3 0x7F,0x40
+ nop.i 999
+}
+{ .mfi
+ nop.m 0
+ fsetc.s2 0x7F,0x40
+ nop.i 999
+};;
+
+//
+// p6 <- s3 probe is +-0; p7 <- s2 probe >= 2**1024 (else p8); tag = 146
+//
+{ .mfi
+ nop.m 999
+ fclass.m p6, p0 = FR_Result3, 0x007
+ nop.i 999
+}
+{ .mfi
+ addl GR_Tag = 146, r0
+ fcmp.ge.s1 p7, p8 = FR_Result2 , FR_Big
+ nop.i 0
+};;
+
+//
+// Masked underflow (tiny + inexact only): keep p6 only if s0 and s2 differ
+//
+{ .mfi
+ nop.m 999
+(p6) fcmp.neq.unc.s1 p6, p0 = FR_Result , FR_Result2
+ nop.i 999
+};;
+
+//
+// p9 <- s2 probe <= -2**1024 (result beyond the allowed range, negative)
+// Underflow: tag = 147 and branch out
+//
+{ .mfb
+(p6) addl GR_Tag = 147, r0
+(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig
+(p6) br.cond.spnt LDEXP_UNDERFLOW
+};;
+
+//
+// Branch out for overflow (p7: positive side, p9: negative side)
+//
+{ .mbb
+ nop.m 0
+(p7) br.cond.spnt LDEXP_OVERFLOW
+(p9) br.cond.spnt LDEXP_OVERFLOW
+};;
+
+//
+// Return from main path.
+//
+{ .mfb
+ nop.m 999
+ nop.f 0
+ br.ret.sptk b0;;
+}
+
+GLOBAL_LIBM_END(__libm_ldexp)
+__libm_error_region:
+// Overflow and underflow both land here with GR_Tag (r38) already set.
+LDEXP_OVERFLOW:
+LDEXP_UNDERFLOW:
+// Frame after 'add sp=-64': x at sp+16, N at sp+32, result at sp+48.
+//
+// GR_Parameter_Y <- stack address for N (old sp - 32)
+//
+.prologue
+{ .mfi
+ add GR_Parameter_Y=-32,sp
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs
+}
+//
+// Adjust sp
+//
+{ .mfi
+.fframe 64
+ add sp=-64,sp
+ nop.f 0
+ mov GR_SAVE_GP=gp
+};;
+
+//
+// Store N on stack; the post-increment moves GR_Parameter_Y to the result slot
+// Locate the address of x on stack
+//
+{ .mmi
+ st8 [GR_Parameter_Y] = GR_N_as_int,16
+ add GR_Parameter_X = 16,sp
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0
+};;
+
+//
+// Store x and the s0 result; GR_Parameter_RESULT <- address of the result.
+// GR_Parameter_Y steps back 16 to N before calling the error handler.
+//
+.body
+{ .mib
+ stfd [GR_Parameter_X] = FR_Norm_X
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y
+ nop.b 0
+}
+{ .mib
+ stfd [GR_Parameter_Y] = FR_Result
+ add GR_Parameter_Y = -16,GR_Parameter_Y
+ br.call.sptk b0=__libm_error_support#
+};;
+
+//
+// Get location of result on stack
+//
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp
+};;
+
+//
+// Reload the result (presumably updated by __libm_error_support) and pop frame
+//
+{ .mmi
+ ldfd FR_Result = [GR_Parameter_RESULT]
+.restore sp
+ add sp = 64,sp
+ mov b0 = GR_SAVE_B0
+};;
+
+//
+// Restore gp, ar.pfs and return
+//
+{ .mib
+ mov gp = GR_SAVE_GP
+ mov ar.pfs = GR_SAVE_PFS
+ br.ret.sptk b0
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/s_ldexpf.S b/sysdeps/ia64/fpu/s_libm_ldexpf.S
index 36f0111fe1..d7f161c93d 100644
--- a/sysdeps/ia64/fpu/s_ldexpf.S
+++ b/sysdeps/ia64/fpu/s_libm_ldexpf.S
@@ -1,10 +1,10 @@
-//.file "ldexpf.s"
+.file "libm_ldexpf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,26 +35,30 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 1/26/01 ldexpf completely reworked and now standalone version
+// 02/02/00 Initial version
+// 01/26/01 ldexpf completely reworked and now standalone version
+// 01/04/02 Added handling for int 32 or 64 bits
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
-// float = ldexpf (float x, int n)
-// input floating point f8 and int n (r33)
+// float __libm_ldexpf (float x, int n, int int_type)
+// input floating point f8 and int n (r33), int int_type (r34)
// output floating point f8
//
+// int_type = 0 if int is 32 bits
+// int_type = 1 if int is 64 bits
+//
// Returns x* 2**n using an fma and detects overflow
// and underflow.
//
//
-#include "libm_support.h"
-
FR_Big = f6
FR_NBig = f7
FR_Floating_X = f8
@@ -81,39 +85,39 @@ GR_Parameter_Y = r36
GR_Parameter_RESULT = r37
GR_Tag = r38
-.align 32
-.global ldexpf
-
.section .text
-.proc ldexpf
-.align 32
-
-ldexpf:
+GLOBAL_LIBM_ENTRY(__libm_ldexpf)
//
// Is x NAN, INF, ZERO, +-?
// Build the exponent Bias
//
{ .mfi
- alloc r32=ar.pfs,1,2,4,0
- fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero
+ alloc r32=ar.pfs,3,0,4,0
+ fclass.m p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero
addl GR_Bias = 0x0FFFF,r0
}
-
//
-// Sign extend input
// Is N zero?
// Normalize x
+// Is integer type 32 bits?
//
{ .mfi
- cmp.eq.unc p6,p0 = r33,r0
+ cmp.eq p6,p0 = r33,r0
fnorm.s1 FR_Norm_X = FR_Floating_X
- sxt4 GR_N_as_int = r33
+ cmp.eq p8,p9 = r34,r0
+}
+;;
+
+// Sign extend N if int is 32 bits
+{ .mfi
+(p9) mov GR_N_as_int = r33 // Copy N if int is 64 bits
+ nop.f 0
+(p8) sxt4 GR_N_as_int = r33 // Sign extend N if int is 32 bits
}
;;
//
-// Normalize x
// Branch and return special values.
// Create -35000
// Create 35000
@@ -155,12 +159,12 @@ ldexpf:
{ .mfi
setf.exp FR_NBig = GR_Scratch1
nop.f 0
- cmp.ge.unc p6, p0 = GR_N_as_int, GR_Big
+ cmp.ge p6, p0 = GR_N_as_int, GR_Big
}
{ .mfi
setf.exp FR_Big = GR_Scratch
fcmp.ge.s0 p0,p11 = FR_Floating_X,f0
- cmp.le.unc p8, p0 = GR_N_as_int, GR_NBig
+ cmp.le p8, p0 = GR_N_as_int, GR_NBig
};;
//
@@ -173,7 +177,7 @@ ldexpf:
}
{ .mlx
nop.m 999
-(p0) movl GR_Scratch = 0x000000000003007F
+ movl GR_Scratch = 0x000000000003007F
};;
@@ -184,7 +188,7 @@ ldexpf:
}
{ .mlx
nop.m 999
-(p0) movl GR_Scratch1= 0x000000000001007F
+ movl GR_Scratch1= 0x000000000001007F
};;
// Set up necessary status fields
@@ -195,12 +199,12 @@ ldexpf:
//
{ .mfi
nop.m 999
-(p0) fsetc.s3 0x7F,0x41
+ fsetc.s3 0x7F,0x41
nop.i 999
}
{ .mfi
nop.m 999
-(p0) fsetc.s2 0x7F,0x42
+ fsetc.s2 0x7F,0x42
nop.i 999
};;
@@ -243,12 +247,12 @@ ldexpf:
//
{ .mfi
nop.m 999
- fclass.m.unc p6, p0 = FR_Result3, 0x007
+ fclass.m p6, p0 = FR_Result3, 0x007
nop.i 999
}
{ .mfi
addl GR_Tag = 148, r0
- fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big
+ fcmp.ge.s1 p7, p8 = FR_Result2 , FR_Big
nop.i 0
};;
@@ -268,7 +272,7 @@ ldexpf:
{ .mfb
(p6) addl GR_Tag = 149, r0
(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig
-(p6) br.cond.spnt L(ldexpf_UNDERFLOW)
+(p6) br.cond.spnt LDEXPF_UNDERFLOW
};;
//
@@ -276,8 +280,8 @@ ldexpf:
//
{ .mbb
nop.m 0
-(p7) br.cond.spnt L(ldexpf_OVERFLOW)
-(p9) br.cond.spnt L(ldexpf_OVERFLOW)
+(p7) br.cond.spnt LDEXPF_OVERFLOW
+(p9) br.cond.spnt LDEXPF_OVERFLOW
};;
//
@@ -289,13 +293,11 @@ ldexpf:
br.ret.sptk b0;;
}
-.endp ldexpf
-ASM_SIZE_DIRECTIVE(ldexpf)
-.proc __libm_error_region
+GLOBAL_LIBM_END(__libm_ldexpf)
__libm_error_region:
-L(ldexpf_OVERFLOW):
-L(ldexpf_UNDERFLOW):
+LDEXPF_OVERFLOW:
+LDEXPF_UNDERFLOW:
//
// Get stack address of N
@@ -372,8 +374,7 @@ L(ldexpf_UNDERFLOW):
br.ret.sptk b0
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/s_ldexpl.S b/sysdeps/ia64/fpu/s_libm_ldexpl.S
index fb5d3fd452..72d45602cf 100644
--- a/sysdeps/ia64/fpu/s_ldexpl.S
+++ b/sysdeps/ia64/fpu/s_libm_ldexpl.S
@@ -1,10 +1,10 @@
-//.file "ldexpl.s"
+.file "libm_ldexpl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,26 +35,30 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 1/26/01 ldexpl completely reworked and now standalone version
+// 02/02/00 Initial version
+// 01/26/01 ldexpl completely reworked and now standalone version
+// 01/04/02 Added handling for int 32 or 64 bits
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
-// double-extended = ldexpl (double-extended x, int n)
-// input floating point f8 and int n (r34)
+// long double __libm_ldexpl (long double x, int n, int int_type)
+// input floating point f8 and int n (r34), int int_type (r35)
// output floating point f8
//
+// int_type = 0 if int is 32 bits
+// int_type = 1 if int is 64 bits
+//
// Returns x* 2**n using an fma and detects overflow
// and underflow.
//
//
-#include "libm_support.h"
-
FR_Big = f6
FR_NBig = f7
FR_Floating_X = f8
@@ -81,39 +85,40 @@ GR_Parameter_Y = r36
GR_Parameter_RESULT = r37
GR_Tag = r38
-.align 32
-.global ldexpl
-
.section .text
-.proc ldexpl
-.align 32
-
-ldexpl:
+GLOBAL_LIBM_ENTRY(__libm_ldexpl)
//
// Is x NAN, INF, ZERO, +-?
// Build the exponent Bias
//
{ .mfi
- alloc r32=ar.pfs,2,1,4,0
- fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero
+ alloc r32=ar.pfs,3,0,4,0
+ fclass.m p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero
addl GR_Bias = 0x0FFFF,r0
}
//
-// Sign extend input
// Is N zero?
// Normalize x
+// Is integer type 32 bits?
//
{ .mfi
- cmp.eq.unc p6,p0 = r34,r0
+ cmp.eq p6,p0 = r34,r0
fnorm.s1 FR_Norm_X = FR_Floating_X
- sxt4 GR_N_as_int = r34
+ cmp.eq p8,p9 = r35,r0
+}
+;;
+
+// Sign extend N if int is 32 bits
+{ .mfi
+(p9) mov GR_N_as_int = r34 // Copy N if int is 64 bits
+ nop.f 0
+(p8) sxt4 GR_N_as_int = r34 // Sign extend N if int is 32 bits
}
;;
//
-// Normalize x
// Branch and return special values.
// Create -35000
// Create 35000
@@ -125,7 +130,7 @@ ldexpl:
}
{ .mfb
addl GR_NBig = -35000,r0
-(p7) fma.s0 FR_Result = FR_Floating_X,f1, f0
+(p7) fma.s0 FR_Result = FR_Floating_X,f1, f0
(p7) br.ret.spnt b0
};;
@@ -140,7 +145,7 @@ ldexpl:
}
{ .mfb
addl GR_Scratch = 0x019C3F,r0
-(p6) fma.s0 FR_Result = FR_Floating_X,f1, f0
+(p6) fma.s0 FR_Result = FR_Floating_X,f1, f0
(p6) br.ret.spnt b0
};;
@@ -155,12 +160,12 @@ ldexpl:
{ .mfi
setf.exp FR_NBig = GR_Scratch1
nop.f 0
- cmp.ge.unc p6, p0 = GR_N_as_int, GR_Big
+ cmp.ge p6, p0 = GR_N_as_int, GR_Big
}
{ .mfi
setf.exp FR_Big = GR_Scratch
fcmp.ge.s0 p0,p11 = FR_Floating_X,f0
- cmp.le.unc p8, p0 = GR_N_as_int, GR_NBig
+ cmp.le p8, p0 = GR_N_as_int, GR_NBig
};;
//
@@ -173,7 +178,7 @@ ldexpl:
}
{ .mlx
nop.m 999
-(p0) movl GR_Scratch = 0x0000000000033FFF
+ movl GR_Scratch = 0x0000000000033FFF
};;
@@ -184,7 +189,7 @@ ldexpl:
}
{ .mlx
nop.m 999
-(p0) movl GR_Scratch1= 0x0000000000013FFF
+ movl GR_Scratch1= 0x0000000000013FFF
};;
// Set up necessary status fields
@@ -195,12 +200,12 @@ ldexpl:
//
{ .mfi
nop.m 999
-(p0) fsetc.s3 0x7F,0x41
+ fsetc.s3 0x7F,0x41
nop.i 999
}
{ .mfi
nop.m 999
-(p0) fsetc.s2 0x7F,0x42
+ fsetc.s2 0x7F,0x42
nop.i 999
};;
@@ -209,17 +214,17 @@ ldexpl:
//
{ .mfi
setf.exp FR_NBig = GR_Scratch
- fma.s0 FR_Result = FR_Two_N,FR_Norm_X,f0
+ fma.s0 FR_Result = FR_Two_N,FR_Norm_X,f0
nop.i 999
}
{ .mfi
nop.m 999
- fma.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0
+ fma.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0
nop.i 999
};;
{ .mfi
setf.exp FR_Big = GR_Scratch1
- fma.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0
+ fma.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0
nop.i 999
};;
@@ -243,12 +248,12 @@ ldexpl:
//
{ .mfi
nop.m 999
- fclass.m.unc p6, p0 = FR_Result3, 0x007
+ fclass.m p6, p0 = FR_Result3, 0x007
nop.i 999
}
{ .mfi
addl GR_Tag = 144, r0
- fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big
+ fcmp.ge.s1 p7, p8 = FR_Result2 , FR_Big
nop.i 0
};;
@@ -268,7 +273,7 @@ ldexpl:
{ .mfb
(p6) addl GR_Tag = 145, r0
(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig
-(p6) br.cond.spnt L(ldexpl_UNDERFLOW)
+(p6) br.cond.spnt LDEXPL_UNDERFLOW
};;
//
@@ -276,8 +281,8 @@ ldexpl:
//
{ .mbb
nop.m 0
-(p7) br.cond.spnt L(ldexpl_OVERFLOW)
-(p9) br.cond.spnt L(ldexpl_OVERFLOW)
+(p7) br.cond.spnt LDEXPL_OVERFLOW
+(p9) br.cond.spnt LDEXPL_OVERFLOW
};;
//
@@ -289,13 +294,11 @@ ldexpl:
br.ret.sptk b0;;
}
-.endp ldexpl
-ASM_SIZE_DIRECTIVE(ldexpl)
-.proc __libm_error_region
+GLOBAL_LIBM_END(__libm_ldexpl)
__libm_error_region:
-L(ldexpl_OVERFLOW):
-L(ldexpl_UNDERFLOW):
+LDEXPL_OVERFLOW:
+LDEXPL_UNDERFLOW:
//
// Get stack address of N
@@ -372,8 +375,7 @@ L(ldexpl_UNDERFLOW):
br.ret.sptk b0
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/s_scalbn.S b/sysdeps/ia64/fpu/s_libm_scalbn.S
index 50d14b4e30..fb7ab93ff3 100644
--- a/sysdeps/ia64/fpu/s_scalbn.S
+++ b/sysdeps/ia64/fpu/s_libm_scalbn.S
@@ -1,10 +1,10 @@
-.file "scalbn.s"
+.file "libm_scalbn.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,26 +35,30 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 1/26/01 Scalbn completely reworked and now standalone version
+// 02/02/00 Initial version
+// 01/26/01 Scalbn completely reworked and now standalone version
+// 01/04/02 Added handling for int 32 or 64 bits
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
-// double = scalbn (double x, int n)
-// input floating point f8 and int n (r33)
+// double __libm_scalbn (double x, int n, int int_type)
+// input floating point f8 and int n (r33), int int_type (r34)
// output floating point f8
//
+// int_type = 0 if int is 32 bits
+// int_type = 1 if int is 64 bits
+//
// Returns x* 2**n using an fma and detects overflow
// and underflow.
//
//
-#include "libm_support.h"
-
FR_Big = f6
FR_NBig = f7
FR_Floating_X = f8
@@ -81,39 +85,40 @@ GR_Parameter_Y = r36
GR_Parameter_RESULT = r37
GR_Tag = r38
-.align 32
-.global scalbn
-
.section .text
-.proc scalbn
-.align 32
-
-scalbn:
+GLOBAL_LIBM_ENTRY(__libm_scalbn)
//
// Is x NAN, INF, ZERO, +-?
// Build the exponent Bias
//
{ .mfi
- alloc r32=ar.pfs,1,2,4,0
- fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero
+ alloc r32=ar.pfs,3,0,4,0
+ fclass.m p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero
addl GR_Bias = 0x0FFFF,r0
}
//
-// Sign extend input
// Is N zero?
// Normalize x
+// Is integer type 32 bits?
//
{ .mfi
- cmp.eq.unc p6,p0 = r33,r0
+ cmp.eq p6,p0 = r33,r0
fnorm.s1 FR_Norm_X = FR_Floating_X
- sxt4 GR_N_as_int = r33
+ cmp.eq p8,p9 = r34,r0
+}
+;;
+
+// Sign extend N if int is 32 bits
+{ .mfi
+(p9) mov GR_N_as_int = r33 // Copy N if int is 64 bits
+ nop.f 0
+(p8) sxt4 GR_N_as_int = r33 // Sign extend N if int is 32 bits
}
;;
//
-// Normalize x
// Branch and return special values.
// Create -35000
// Create 35000
@@ -155,12 +160,12 @@ scalbn:
{ .mfi
setf.exp FR_NBig = GR_Scratch1
nop.f 0
- cmp.ge.unc p6, p0 = GR_N_as_int, GR_Big
+ cmp.ge p6, p0 = GR_N_as_int, GR_Big
}
{ .mfi
setf.exp FR_Big = GR_Scratch
fcmp.ge.s0 p0,p11 = FR_Floating_X,f0
- cmp.le.unc p8, p0 = GR_N_as_int, GR_NBig
+ cmp.le p8, p0 = GR_N_as_int, GR_NBig
};;
//
@@ -173,7 +178,7 @@ scalbn:
}
{ .mlx
nop.m 999
-(p0) movl GR_Scratch = 0x00000000000303FF
+ movl GR_Scratch = 0x00000000000303FF
};;
@@ -184,7 +189,7 @@ scalbn:
}
{ .mlx
nop.m 999
-(p0) movl GR_Scratch1= 0x00000000000103FF
+ movl GR_Scratch1= 0x00000000000103FF
};;
// Set up necessary status fields
@@ -195,12 +200,12 @@ scalbn:
//
{ .mfi
nop.m 999
-(p0) fsetc.s3 0x7F,0x41
+ fsetc.s3 0x7F,0x41
nop.i 999
}
{ .mfi
nop.m 999
-(p0) fsetc.s2 0x7F,0x42
+ fsetc.s2 0x7F,0x42
nop.i 999
};;
@@ -243,12 +248,12 @@ scalbn:
//
{ .mfi
nop.m 999
- fclass.m.unc p6, p0 = FR_Result3, 0x007
+ fclass.m p6, p0 = FR_Result3, 0x007
nop.i 999
}
{ .mfi
addl GR_Tag = 176, r0
- fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big
+ fcmp.ge.s1 p7, p8 = FR_Result2 , FR_Big
nop.i 0
};;
@@ -268,7 +273,7 @@ scalbn:
{ .mfb
(p6) addl GR_Tag = 177, r0
(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig
-(p6) br.cond.spnt L(SCALBN_UNDERFLOW)
+(p6) br.cond.spnt SCALBN_UNDERFLOW
};;
//
@@ -276,8 +281,8 @@ scalbn:
//
{ .mbb
nop.m 0
-(p7) br.cond.spnt L(SCALBN_OVERFLOW)
-(p9) br.cond.spnt L(SCALBN_OVERFLOW)
+(p7) br.cond.spnt SCALBN_OVERFLOW
+(p9) br.cond.spnt SCALBN_OVERFLOW
};;
//
@@ -289,13 +294,11 @@ scalbn:
br.ret.sptk b0;;
}
-.endp scalbn
-ASM_SIZE_DIRECTIVE(scalbn)
-.proc __libm_error_region
+GLOBAL_LIBM_END(__libm_scalbn)
__libm_error_region:
-L(SCALBN_OVERFLOW):
-L(SCALBN_UNDERFLOW):
+SCALBN_OVERFLOW:
+SCALBN_UNDERFLOW:
//
// Get stack address of N
@@ -372,8 +375,7 @@ L(SCALBN_UNDERFLOW):
br.ret.sptk b0
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(scalbn)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/s_libm_scalbnf.S b/sysdeps/ia64/fpu/s_libm_scalbnf.S
new file mode 100644
index 0000000000..57ab2cc283
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_libm_scalbnf.S
@@ -0,0 +1,381 @@
+.file "libm_scalbnf.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 02/02/00 Initial version
+// 01/26/01 scalbnf completely reworked and now standalone version
+// 01/04/02 Added handling for int 32 or 64 bits
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// float __libm_scalbnf (float x, int n, int int_type)
+// input floating point f8 and int n (r33), int int_type (r34)
+// output floating point f8
+//
+// int_type = 0 if int is 32 bits
+// int_type = 1 if int is 64 bits
+//
+// Returns x* 2**n using an fma and detects overflow
+// and underflow.
+//
+//
+
+FR_Big = f6
+FR_NBig = f7
+FR_Floating_X = f8
+FR_Result = f8
+FR_Result2 = f9
+FR_Result3 = f11
+FR_Norm_X = f12
+FR_Two_N = f14
+FR_Two_to_Big = f15
+
+GR_N_Biased = r15
+GR_Big = r16
+GR_NBig = r17
+GR_Scratch = r18
+GR_Scratch1 = r19
+GR_Bias = r20
+GR_N_as_int = r21
+
+GR_SAVE_B0 = r32
+GR_SAVE_GP = r33
+GR_SAVE_PFS = r34
+GR_Parameter_X = r35
+GR_Parameter_Y = r36
+GR_Parameter_RESULT = r37
+GR_Tag = r38
+
+.section .text
+GLOBAL_LIBM_ENTRY(__libm_scalbnf)
+
+//
+// Is x NAN, INF, ZERO, +-?
+// Build the exponent Bias
+//
+{ .mfi
+ alloc r32=ar.pfs,3,0,4,0
+ fclass.m p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero
+ addl GR_Bias = 0x0FFFF,r0
+}
+
+//
+// Is N zero?
+// Normalize x
+// Is integer type 32 bits?
+//
+{ .mfi
+ cmp.eq p6,p0 = r33,r0
+ fnorm.s1 FR_Norm_X = FR_Floating_X
+ cmp.eq p8,p9 = r34,r0
+}
+;;
+
+// Sign extend N if int is 32 bits
+{ .mfi
+(p9) mov GR_N_as_int = r33 // Copy N if int is 64 bits
+ nop.f 0
+(p8) sxt4 GR_N_as_int = r33 // Sign extend N if int is 32 bits
+}
+;;
+
+//
+// Branch and return special values.
+// Create -35000
+// Create 35000
+//
+{ .mfi
+ addl GR_Big = 35000,r0
+ nop.f 0
+ add GR_N_Biased = GR_Bias,GR_N_as_int
+}
+{ .mfb
+ addl GR_NBig = -35000,r0
+(p7) fma.s.s0 FR_Result = FR_Floating_X,f1, f0
+(p7) br.ret.spnt b0
+};;
+
+//
+// Build the exponent Bias
+// Return x when N = 0
+//
+{ .mfi
+ setf.exp FR_Two_N = GR_N_Biased
+ nop.f 0
+ addl GR_Scratch1 = 0x063BF,r0
+}
+{ .mfb
+ addl GR_Scratch = 0x019C3F,r0
+(p6) fma.s.s0 FR_Result = FR_Floating_X,f1, f0
+(p6) br.ret.spnt b0
+};;
+
+//
+// Create 2**big
+// Create 2**-big
+// Is N > 35000
+// Is N < -35000
+// Raise Denormal operand flag with compare
+// Main path, create 2**N
+//
+{ .mfi
+ setf.exp FR_NBig = GR_Scratch1
+ nop.f 0
+ cmp.ge p6, p0 = GR_N_as_int, GR_Big
+}
+{ .mfi
+ setf.exp FR_Big = GR_Scratch
+ fcmp.ge.s0 p0,p11 = FR_Floating_X,f0
+ cmp.le p8, p0 = GR_N_as_int, GR_NBig
+};;
+
+//
+// Adjust 2**N if N was very small or very large
+//
+{ .mfi
+ nop.m 0
+(p6) fma.s1 FR_Two_N = FR_Big,f1,f0
+ nop.i 0
+}
+{ .mlx
+ nop.m 999
+ movl GR_Scratch = 0x000000000003007F
+};;
+
+
+{ .mfi
+ nop.m 0
+(p8) fma.s1 FR_Two_N = FR_NBig,f1,f0
+ nop.i 0
+}
+{ .mlx
+ nop.m 999
+ movl GR_Scratch1= 0x000000000001007F
+};;
+
+// Set up necessary status fields
+//
+// S0 user supplied status
+// S2 user supplied status + WRE + TD (Overflows)
+// S3 user supplied status + FZ + TD (Underflows)
+//
+{ .mfi
+ nop.m 999
+ fsetc.s3 0x7F,0x41
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+ fsetc.s2 0x7F,0x42
+ nop.i 999
+};;
+
+//
+// Do final operation
+//
+{ .mfi
+ setf.exp FR_NBig = GR_Scratch
+ fma.s.s0 FR_Result = FR_Two_N,FR_Norm_X,f0
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+ fma.s.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0
+ nop.i 999
+};;
+{ .mfi
+ setf.exp FR_Big = GR_Scratch1
+ fma.s.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0
+ nop.i 999
+};;
+
+// Check for overflow or underflow.
+// Restore s3
+// Restore s2
+//
+{ .mfi
+ nop.m 0
+ fsetc.s3 0x7F,0x40
+ nop.i 999
+}
+{ .mfi
+ nop.m 0
+ fsetc.s2 0x7F,0x40
+ nop.i 999
+};;
+
+//
+// Is the result zero?
+//
+{ .mfi
+ nop.m 999
+ fclass.m p6, p0 = FR_Result3, 0x007
+ nop.i 999
+}
+{ .mfi
+ addl GR_Tag = 178, r0
+ fcmp.ge.s1 p7, p8 = FR_Result2 , FR_Big
+ nop.i 0
+};;
+
+//
+// Detect masked underflow - Tiny + Inexact Only
+//
+{ .mfi
+ nop.m 999
+(p6) fcmp.neq.unc.s1 p6, p0 = FR_Result , FR_Result2
+ nop.i 999
+};;
+
+//
+// Is result bigger than the allowed range?
+// Branch out for underflow
+//
+{ .mfb
+(p6) addl GR_Tag = 179, r0
+(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig
+(p6) br.cond.spnt SCALBNF_UNDERFLOW
+};;
+
+//
+// Branch out for overflow
+//
+{ .mbb
+ nop.m 0
+(p7) br.cond.spnt SCALBNF_OVERFLOW
+(p9) br.cond.spnt SCALBNF_OVERFLOW
+};;
+
+//
+// Return from main path.
+//
+{ .mfb
+ nop.m 999
+ nop.f 0
+ br.ret.sptk b0;;
+}
+
+GLOBAL_LIBM_END(__libm_scalbnf)
+__libm_error_region:
+
+SCALBNF_OVERFLOW:
+SCALBNF_UNDERFLOW:
+
+//
+// Get stack address of N
+//
+.prologue
+{ .mfi
+ add GR_Parameter_Y=-32,sp
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs
+}
+//
+// Adjust sp
+//
+{ .mfi
+.fframe 64
+ add sp=-64,sp
+ nop.f 0
+ mov GR_SAVE_GP=gp
+};;
+
+//
+// Store N on stack in correct position
+// Locate the address of x on stack
+//
+{ .mmi
+ st8 [GR_Parameter_Y] = GR_N_as_int,16
+ add GR_Parameter_X = 16,sp
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0
+};;
+
+//
+// Store x on the stack.
+// Get address for result on stack.
+//
+.body
+{ .mib
+ stfs [GR_Parameter_X] = FR_Norm_X
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y
+ nop.b 0
+}
+{ .mib
+ stfs [GR_Parameter_Y] = FR_Result
+ add GR_Parameter_Y = -16,GR_Parameter_Y
+ br.call.sptk b0=__libm_error_support#
+};;
+
+//
+// Get location of result on stack
+//
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp
+};;
+
+//
+// Get the new result
+//
+{ .mmi
+ ldfs FR_Result = [GR_Parameter_RESULT]
+.restore sp
+ add sp = 64,sp
+ mov b0 = GR_SAVE_B0
+};;
+
+//
+// Restore gp, ar.pfs and return
+//
+{ .mib
+ mov gp = GR_SAVE_GP
+ mov ar.pfs = GR_SAVE_PFS
+ br.ret.sptk b0
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/s_scalbnl.S b/sysdeps/ia64/fpu/s_libm_scalbnl.S
index 9e54a2ec0a..6eb6e17f67 100644
--- a/sysdeps/ia64/fpu/s_scalbnl.S
+++ b/sysdeps/ia64/fpu/s_libm_scalbnl.S
@@ -1,10 +1,10 @@
-//.file "scalbnl.s"
+.file "libm_scalbnl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,26 +35,30 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 1/26/01 scalbnl completely reworked and now standalone version
+// 02/02/00 Initial version
+// 01/26/01 scalbnl completely reworked and now standalone version
+// 01/04/02 Added handling for int 32 or 64 bits
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
-// double-extended = scalbnl (double-extended x, int n)
-// input floating point f8 and int n (r34)
+// long double __libm_scalbnl (long double x, int n, int int_type)
+// input floating point f8 and int n (r34), int int_type (r35)
// output floating point f8
//
+// int_type = 0 if int is 32 bits
+// int_type = 1 if int is 64 bits
+//
// Returns x* 2**n using an fma and detects overflow
// and underflow.
//
//
-#include "libm_support.h"
-
FR_Big = f6
FR_NBig = f7
FR_Floating_X = f8
@@ -81,39 +85,40 @@ GR_Parameter_Y = r36
GR_Parameter_RESULT = r37
GR_Tag = r38
-.align 32
-.global scalbnl
-
.section .text
-.proc scalbnl
-.align 32
-
-scalbnl:
+GLOBAL_LIBM_ENTRY(__libm_scalbnl)
//
// Is x NAN, INF, ZERO, +-?
// Build the exponent Bias
//
{ .mfi
- alloc r32=ar.pfs,2,1,4,0
- fclass.m.unc p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero
+ alloc r32=ar.pfs,3,0,4,0
+ fclass.m p7,p0 = FR_Floating_X, 0xe7 //@snan | @qnan | @inf | @zero
addl GR_Bias = 0x0FFFF,r0
}
//
-// Sign extend input
// Is N zero?
// Normalize x
+// Is integer type 32 bits?
//
{ .mfi
- cmp.eq.unc p6,p0 = r34,r0
+ cmp.eq p6,p0 = r34,r0
fnorm.s1 FR_Norm_X = FR_Floating_X
- sxt4 GR_N_as_int = r34
+ cmp.eq p8,p9 = r35,r0
+}
+;;
+
+// Sign extend N if int is 32 bits
+{ .mfi
+(p9) mov GR_N_as_int = r34 // Copy N if int is 64 bits
+ nop.f 0
+(p8) sxt4 GR_N_as_int = r34 // Sign extend N if int is 32 bits
}
;;
//
-// Normalize x
// Branch and return special values.
// Create -35000
// Create 35000
@@ -155,12 +160,12 @@ scalbnl:
{ .mfi
setf.exp FR_NBig = GR_Scratch1
nop.f 0
- cmp.ge.unc p6, p0 = GR_N_as_int, GR_Big
+ cmp.ge p6, p0 = GR_N_as_int, GR_Big
}
{ .mfi
setf.exp FR_Big = GR_Scratch
fcmp.ge.s0 p0,p11 = FR_Floating_X,f0
- cmp.le.unc p8, p0 = GR_N_as_int, GR_NBig
+ cmp.le p8, p0 = GR_N_as_int, GR_NBig
};;
//
@@ -173,7 +178,7 @@ scalbnl:
}
{ .mlx
nop.m 999
-(p0) movl GR_Scratch = 0x0000000000033FFF
+ movl GR_Scratch = 0x0000000000033FFF
};;
@@ -184,7 +189,7 @@ scalbnl:
}
{ .mlx
nop.m 999
-(p0) movl GR_Scratch1= 0x0000000000013FFF
+ movl GR_Scratch1= 0x0000000000013FFF
};;
// Set up necessary status fields
@@ -195,12 +200,12 @@ scalbnl:
//
{ .mfi
nop.m 999
-(p0) fsetc.s3 0x7F,0x41
+ fsetc.s3 0x7F,0x41
nop.i 999
}
{ .mfi
nop.m 999
-(p0) fsetc.s2 0x7F,0x42
+ fsetc.s2 0x7F,0x42
nop.i 999
};;
@@ -209,17 +214,17 @@ scalbnl:
//
{ .mfi
setf.exp FR_NBig = GR_Scratch
- fma.s0 FR_Result = FR_Two_N,FR_Norm_X,f0
+ fma.s0 FR_Result = FR_Two_N,FR_Norm_X,f0
nop.i 999
}
{ .mfi
nop.m 999
- fma.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0
+ fma.s3 FR_Result3 = FR_Two_N,FR_Norm_X,f0
nop.i 999
};;
{ .mfi
setf.exp FR_Big = GR_Scratch1
- fma.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0
+ fma.s2 FR_Result2 = FR_Two_N,FR_Norm_X,f0
nop.i 999
};;
@@ -243,12 +248,12 @@ scalbnl:
//
{ .mfi
nop.m 999
- fclass.m.unc p6, p0 = FR_Result3, 0x007
+ fclass.m p6, p0 = FR_Result3, 0x007
nop.i 999
}
{ .mfi
addl GR_Tag = 174, r0
- fcmp.ge.unc.s1 p7, p8 = FR_Result2 , FR_Big
+ fcmp.ge.s1 p7, p8 = FR_Result2 , FR_Big
nop.i 0
};;
@@ -268,7 +273,7 @@ scalbnl:
{ .mfb
(p6) addl GR_Tag = 175, r0
(p8) fcmp.le.unc.s1 p9, p10 = FR_Result2 , FR_NBig
-(p6) br.cond.spnt L(scalbnl_UNDERFLOW)
+(p6) br.cond.spnt SCALBNL_UNDERFLOW
};;
//
@@ -276,8 +281,8 @@ scalbnl:
//
{ .mbb
nop.m 0
-(p7) br.cond.spnt L(scalbnl_OVERFLOW)
-(p9) br.cond.spnt L(scalbnl_OVERFLOW)
+(p7) br.cond.spnt SCALBNL_OVERFLOW
+(p9) br.cond.spnt SCALBNL_OVERFLOW
};;
//
@@ -289,13 +294,11 @@ scalbnl:
br.ret.sptk b0;;
}
-.endp scalbnl
-ASM_SIZE_DIRECTIVE(scalbnl)
-.proc __libm_error_region
+GLOBAL_LIBM_END(__libm_scalbnl)
__libm_error_region:
-L(scalbnl_OVERFLOW):
-L(scalbnl_UNDERFLOW):
+SCALBNL_OVERFLOW:
+SCALBNL_UNDERFLOW:
//
// Get stack address of N
@@ -372,8 +375,7 @@ L(scalbnl_UNDERFLOW):
br.ret.sptk b0
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/s_log1p.S b/sysdeps/ia64/fpu/s_log1p.S
index 0d96c14a55..cd3551984a 100644
--- a/sysdeps/ia64/fpu/s_log1p.S
+++ b/sysdeps/ia64/fpu/s_log1p.S
@@ -1,10 +1,10 @@
-.file "log1p.s"
+.file "log1p.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,1608 +20,1082 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 Initial version
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
+// 06/29/01 Improved speed of all paths
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 10/02/02 Improved performance by basing on log algorithm
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+// 04/18/03 Eliminate possible WAW dependency warning
//
-// *********************************************************************
-//
-// Function: log1p(x) = ln(x+1), for double precision x values
-//
-// *********************************************************************
-//
-// Accuracy: Very accurate for double precision values
-//
-// *********************************************************************
-//
-// Resources Used:
-//
-// Floating-Point Registers: f8 (Input and Return Value)
-// f9,f33-f55,f99
-//
-// General Purpose Registers:
-// r32-r53
-// r54-r57 (Used to pass arguments to error handling routine)
-//
-// Predicate Registers: p6-p15
-//
-// *********************************************************************
-//
-// IEEE Special Conditions:
-//
-// Denormal fault raised on denormal inputs
-// Overflow exceptions cannot occur
-// Underflow exceptions raised when appropriate for log1p
-// (Error Handling Routine called for underflow)
-// Inexact raised when appropriate by algorithm
-//
-// log1p(inf) = inf
-// log1p(-inf) = QNaN
-// log1p(+/-0) = +/-0
-// log1p(-1) = -inf
-// log1p(SNaN) = QNaN
-// log1p(QNaN) = QNaN
-// log1p(EM_special Values) = QNaN
-//
-// *********************************************************************
-//
-// Computation is based on the following kernel.
-//
-// ker_log_64( in_FR : X,
-// in_FR : E,
-// in_FR : Em1,
-// in_GR : Expo_Range,
-// out_FR : Y_hi,
-// out_FR : Y_lo,
-// out_FR : Scale,
-// out_PR : Safe )
-//
-// Overview
-//
-// The method consists of three cases.
-//
-// If |X+Em1| < 2^(-80) use case log1p_small;
-// elseif |X+Em1| < 2^(-7) use case log_near1;
-// else use case log_regular;
-//
-// Case log1p_small:
-//
-// log( 1 + (X+Em1) ) can be approximated by (X+Em1).
-//
-// Case log_near1:
-//
-// log( 1 + (X+Em1) ) can be approximated by a simple polynomial
-// in W = X+Em1. This polynomial resembles the truncated Taylor
-// series W - W^/2 + W^3/3 - ...
-//
-// Case log_regular:
-//
-// Here we use a table lookup method. The basic idea is that in
-// order to compute log(Arg) for an argument Arg in [1,2), we
-// construct a value G such that G*Arg is close to 1 and that
-// log(1/G) is obtainable easily from a table of values calculated
-// beforehand. Thus
-//
-// log(Arg) = log(1/G) + log(G*Arg)
-// = log(1/G) + log(1 + (G*Arg - 1))
-//
-// Because |G*Arg - 1| is small, the second term on the right hand
-// side can be approximated by a short polynomial. We elaborate
-// this method in four steps.
-//
-// Step 0: Initialization
-//
-// We need to calculate log( E + X ). Obtain N, S_hi, S_lo such that
-//
-// E + X = 2^N * ( S_hi + S_lo ) exactly
-//
-// where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
-// that |S_lo| <= ulp(S_hi).
-//
-// Step 1: Argument Reduction
-//
-// Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
-//
-// G := G_1 * G_2 * G_3
-// r := (G * S_hi - 1) + G * S_lo
-//
-// These G_j's have the property that the product is exactly
-// representable and that |r| < 2^(-12) as a result.
-//
-// Step 2: Approximation
-//
-//
-// log(1 + r) is approximated by a short polynomial poly(r).
-//
-// Step 3: Reconstruction
-//
-//
-// Finally, log( E + X ) is given by
-//
-// log( E + X ) = log( 2^N * (S_hi + S_lo) )
-// ~=~ N*log(2) + log(1/G) + log(1 + r)
-// ~=~ N*log(2) + log(1/G) + poly(r).
-//
-// **** Algorithm ****
-//
-// Case log1p_small:
-//
-// Although log(1 + (X+Em1)) is basically X+Em1, we would like to
-// preserve the inexactness nature as well as consistent behavior
-// under different rounding modes. Note that this case can only be
-// taken if E is set to be 1.0. In this case, Em1 is zero, and that
-// X can be very tiny and thus the final result can possibly underflow.
-// Thus, we compare X against a threshold that is dependent on the
-// input Expo_Range. If |X| is smaller than this threshold, we set
-// SAFE to be FALSE.
-//
-// The result is returned as Y_hi, Y_lo, and in the case of SAFE
-// is FALSE, an additional value Scale is also returned.
-//
-// W := X + Em1
-// Threshold := Threshold_Table( Expo_Range )
-// Tiny := Tiny_Table( Expo_Range )
-//
-// If ( |W| > Threshold ) then
-// Y_hi := W
-// Y_lo := -W*W
-// Else
-// Y_hi := W
-// Y_lo := -Tiny
-// Scale := 2^(-100)
-// Safe := FALSE
-// EndIf
-//
-//
-// One may think that Y_lo should be -W*W/2; however, it does not matter
-// as Y_lo will be rounded off completely except for the correct effect in
-// directed rounding. Clearly -W*W is simplier to compute. Moreover,
-// because of the difference in exponent value, Y_hi + Y_lo or
-// Y_hi + Scale*Y_lo is always inexact.
-//
-// Case log_near1:
-//
-// Here we compute a simple polynomial. To exploit parallelism, we split
-// the polynomial into two portions.
-//
-// W := X + Em1
-// Wsq := W * W
-// W4 := Wsq*Wsq
-// W6 := W4*Wsq
-// Y_hi := W + Wsq*(P_1 + W*(P_2 + W*(P_3 + W*P_4))
-// Y_lo := W6*(P_5 + W*(P_6 + W*(P_7 + W*P_8)))
-// set lsb(Y_lo) to be 1
-//
-// Case log_regular:
-//
-// We present the algorithm in four steps.
-//
-// Step 0. Initialization
-// ----------------------
-//
-// Z := X + E
-// N := unbaised exponent of Z
-// S_hi := 2^(-N) * Z
-// S_lo := 2^(-N) * { (max(X,E)-Z) + min(X,E) }
-//
-// Note that S_lo is always 0 for the case E = 0.
-//
-// Step 1. Argument Reduction
-// --------------------------
-//
-// Let
-//
-// Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63
-//
-// We obtain G_1, G_2, G_3 by the following steps.
-//
+// API
+//==============================================================
+// double log1p(double)
//
-// Define X_0 := 1.d_1 d_2 ... d_14. This is extracted
-// from S_hi.
+// log1p(x) = log(x+1)
//
-// Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated
-// to lsb = 2^(-4).
+// Overview of operation
+//==============================================================
+// Background
+// ----------
//
-// Define index_1 := [ d_1 d_2 d_3 d_4 ].
+// This algorithm is based on the fact that
+// log1p(x) = log(1+x) and
+// log(a b) = log(a) + log(b).
+// In our case we have 1+x = 2^N f, where 1 <= f < 2.
+// So
+// log(1+x) = log(2^N f) = log(2^N) + log(f) = N*log(2) + log(f)
//
-// Fetch Z_1 := (1/A_1) rounded UP in fixed point with
-// fixed point lsb = 2^(-15).
-// Z_1 looks like z_0.z_1 z_2 ... z_15
-// Note that the fetching is done using index_1.
-// A_1 is actually not needed in the implementation
-// and is used here only to explain how is the value
-// Z_1 defined.
+// To calculate log(f) we do the following
+// log(f) = log(f * frcpa(f) / frcpa(f)) =
+// = log(f * frcpa(f)) + log(1/frcpa(f))
//
-// Fetch G_1 := (1/A_1) truncated to 21 sig. bits.
-// floating pt. Again, fetching is done using index_1. A_1
-// explains how G_1 is defined.
+// According to the definition of IA-64's frcpa instruction, its result is a
+// floating-point value that approximates 1/f using a lookup on the top
+// 8 bits of the significand of the input number + 1, with relative
+// error < 2^(-8.886). So we have the following
//
-// Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14)
-// = 1.0 0 0 0 d_5 ... d_14
-// This is accomplised by integer multiplication.
-// It is proved that X_1 indeed always begin
-// with 1.0000 in fixed point.
+// |(1/f - frcpa(f)) / (1/f)| = |1 - f*frcpa(f)| < 1/256
//
+// and
//
-// Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1
-// truncated to lsb = 2^(-8). Similar to A_1,
-// A_2 is not needed in actual implementation. It
-// helps explain how some of the values are defined.
+// log(f) = log(f * frcpa(f)) + log(1/frcpa(f)) =
+// = log(1 + r) + T
//
-// Define index_2 := [ d_5 d_6 d_7 d_8 ].
+// The first value can be computed by polynomial P(r) approximating
+// log(1 + r) on |r| < 1/256 and the second is precomputed tabular
+// value defined by top 8 bit of f.
//
-// Fetch Z_2 := (1/A_2) rounded UP in fixed point with
-// fixed point lsb = 2^(-15). Fetch done using index_2.
-// Z_2 looks like z_0.z_1 z_2 ... z_15
+// Finally we have that log(1+x) ~ (N*log(2) + T) + P(r)
//
-// Fetch G_2 := (1/A_2) truncated to 21 sig. bits.
-// floating pt.
+// Note that if input argument is close to 0.0 (in our case it means
+// that |x| < 1/256) we can use just polynomial approximation
+// because 1+x = 2^0 * f = f = 1 + r and
+// log(1+x) = log(1 + r) ~ P(r)
//
-// Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14)
-// = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14
-// This is accomplised by integer multiplication.
-// It is proved that X_2 indeed always begin
-// with 1.00000000 in fixed point.
//
+// Implementation
+// --------------
//
-// Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1.
-// This is 2^(-14) + X_2 truncated to lsb = 2^(-13).
+// 1. |x| >= 2^(-8), and x > -1
+// InvX = frcpa(x+1)
+// r = InvX*(x+1) - 1
+// P(r) = r*((r*A3 - A2) + r^4*((A4 + r*A5) + r^2*(A6 + r*A7)),
+// all coefficients are calculated in quad and rounded to double
+// precision. A7,A6,A5,A4 are stored in memory whereas A3 and A2
+// created with setf.
//
-// Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ].
+// N = float(n) where n is true unbiased exponent of x
//
-// Fetch G_3 := (1/A_3) truncated to 21 sig. bits.
-// floating pt. Fetch is done using index_3.
+// T is tabular value of log(1/frcpa(x)) calculated in quad precision
+// and represented by two floating-point numbers 64-bit Thi and 32-bit Tlo.
+// To load Thi,Tlo we get bits from 55 to 62 of register format significand
+// as index and calculate two addresses
+// ad_Thi = Thi_table_base_addr + 8 * index
+// ad_Tlo = Tlo_table_base_addr + 4 * index
//
-// Compute G := G_1 * G_2 * G_3.
+// L1 (log(2)) is calculated in quad
+// precision and represented by two floating-point 64-bit numbers L1hi,L1lo
+// stored in memory.
//
-// This is done exactly since each of G_j only has 21 sig. bits.
+// And final result = ((L1hi*N + Thi) + (N*L1lo + Tlo)) + P(r)
//
-// Compute
//
-// r := (G*S_hi - 1) + G*S_lo using 2 FMA operations.
+// 2. 2^(-80) <= |x| < 2^(-8)
+// r = x
+// P(r) = r*((r*A3 - A2) + r^4*((A4 + r*A5) + r^2*(A6 + r*A7)),
+// A7,A6,A5,A4,A3,A2 are the same as in case |x| >= 1/256
//
-// thus, r approximates G*(S_hi+S_lo) - 1 to within a couple of
-// rounding errors.
+// And final results
+// log(1+x) = P(r)
//
+// 3. 0 < |x| < 2^(-80)
+// Although log1p(x) is basically x, we would like to preserve the inexactness
+// nature as well as consistent behavior under different rounding modes.
+// We can do this by computing the result as
//
-// Step 2. Approximation
-// ---------------------
+// log1p(x) = x - x*x
//
-// This step computes an approximation to log( 1 + r ) where r is the
-// reduced argument just obtained. It is proved that |r| <= 1.9*2^(-13);
-// thus log(1+r) can be approximated by a short polynomial:
//
-// log(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5
+// Note: NaT, any NaNs, +/-INF, +/-0, negatives and unnormalized numbers are
+// filtered and processed on special branches.
//
+
//
-// Step 3. Reconstruction
-// ----------------------
+// Special values
+//==============================================================
//
-// This step computes the desired result of log(X+E):
+// log1p(-1) = -inf // Call error support
//
-// log(X+E) = log( 2^N * (S_hi + S_lo) )
-// = N*log(2) + log( S_hi + S_lo )
-// = N*log(2) + log(1/G) +
-// log(1 + C*(S_hi+S_lo) - 1 )
+// log1p(+qnan) = +qnan
+// log1p(-qnan) = -qnan
+// log1p(+snan) = +qnan
+// log1p(-snan) = -qnan
//
-// log(2), log(1/G_j) are stored as pairs of (single,double) numbers:
-// log2_hi, log2_lo, log1byGj_hi, log1byGj_lo. The high parts are
-// single-precision numbers and the low parts are double precision
-// numbers. These have the property that
+// log1p(x),x<-1= QNAN Indefinite // Call error support
+// log1p(-inf) = QNAN Indefinite
+// log1p(+inf) = +inf
+// log1p(+/-0) = +/-0
//
-// N*log2_hi + SUM ( log1byGj_hi )
//
-// is computable exactly in double-extended precision (64 sig. bits).
-// Finally
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input
+// f7 -> f15, f32 -> f40
//
-// Y_hi := N*log2_hi + SUM ( log1byGj_hi )
-// Y_lo := poly_hi + [ poly_lo +
-// ( SUM ( log1byGj_lo ) + N*log2_lo ) ]
-// set lsb(Y_lo) to be 1
+// General registers used:
+// r8 -> r11
+// r14 -> r20
//
+// Predicate registers used:
+// p6 -> p12
-#include "libm_support.h"
-
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+// Assembly macros
+//==============================================================
+GR_TAG = r8
+GR_ad_1 = r8
+GR_ad_2 = r9
+GR_Exp = r10
+GR_N = r11
-// P_7, P_6, P_5, P_4, P_3, P_2, and P_1
+GR_signexp_x = r14
+GR_exp_mask = r15
+GR_exp_bias = r16
+GR_05 = r17
+GR_A3 = r18
+GR_Sig = r19
+GR_Ind = r19
+GR_exp_x = r20
-.align 64
-Constants_P:
-ASM_TYPE_DIRECTIVE(Constants_P,@object)
-data4 0xEFD62B15,0xE3936754,0x00003FFB,0x00000000
-data4 0xA5E56381,0x8003B271,0x0000BFFC,0x00000000
-data4 0x73282DB0,0x9249248C,0x00003FFC,0x00000000
-data4 0x47305052,0xAAAAAA9F,0x0000BFFC,0x00000000
-data4 0xCCD17FC9,0xCCCCCCCC,0x00003FFC,0x00000000
-data4 0x00067ED5,0x80000000,0x0000BFFD,0x00000000
-data4 0xAAAAAAAA,0xAAAAAAAA,0x00003FFD,0x00000000
-data4 0xFFFFFFFE,0xFFFFFFFF,0x0000BFFD,0x00000000
-ASM_SIZE_DIRECTIVE(Constants_P)
-
-// log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1
-.align 64
-Constants_Q:
-ASM_TYPE_DIRECTIVE(Constants_Q,@object)
-data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
-data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
-data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
-data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
-data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
-data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
-ASM_SIZE_DIRECTIVE(Constants_Q)
-
-// Z1 - 16 bit fixed, G1 and H1 - IEEE single
-
-.align 64
-Constants_Z_G_H_h1:
-ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h1,@object)
-data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000
-data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000,0x617D741C,0x3DA163A6
-data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000,0xCBD3D5BB,0x3E2C55E6
-data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000,0xD86EA5E7,0xBE3EB0BF
-data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000,0x86B12760,0x3E2E6A8C
-data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000,0x5C0739BA,0x3E47574C
-data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000,0x13E8AF2F,0x3E20E30F
-data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000,0xF2C630BD,0xBE42885B
-data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000,0x97E577C6,0x3E497F34
-data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000,0xA6B0A5AB,0x3E3E6A6E
-data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000,0xD328D9BE,0xBDF43E3C
-data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000,0x0ADB090A,0x3E4094C3
-data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000,0xFC1FE510,0xBE28FBB2
-data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000,0x10FDE3FA,0x3E3A7895
-data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000,0x7CC8C98F,0x3E508CE5
-data4 0x00004211,0x3F042108,0x3F29516A,0x00000000,0xA223106C,0xBE534874
-ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h1)
-
-// Z2 - 16 bit fixed, G2 and H2 - IEEE single
+GR_SAVE_B0 = r33
+GR_SAVE_PFS = r34
+GR_SAVE_GP = r35
+GR_SAVE_SP = r36
-.align 64
-Constants_Z_G_H_h2:
-ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h2,@object)
-data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000
-data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000,0x22C42273,0x3DB5A116
-data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000,0x21F86ED3,0x3DE620CF
-data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000,0x484F34ED,0xBDAFA07E
-data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000,0x3860BCF6,0xBDFE07F0
-data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000,0xA78093D6,0x3DEA370F
-data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000,0x72A753D0,0x3DFF5791
-data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000,0xA7EF896B,0x3DFEBE6C
-data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000,0x409ECB43,0x3E0CF156
-data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000,0xFFEF71DF,0xBE0B6F97
-data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000,0x5D59EEE8,0xBE080483
-data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000,0xA9192A74,0x3E1F91E9
-data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000,0xBF72A8CD,0xBE139A06
-data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000,0xF8FBA6CF,0x3E1D9202
-data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000,0xBA796223,0xBE1DCCC4
-data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000,0xB6B7C239,0xBE049391
-ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h2)
-
-// G3 and H3 - IEEE single and h3 -IEEE double
+GR_Parameter_X = r37
+GR_Parameter_Y = r38
+GR_Parameter_RESULT = r39
+GR_Parameter_TAG = r40
-.align 64
-Constants_Z_G_H_h3:
-ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h3,@object)
-data4 0x3F7FFC00,0x38800100,0x562224CD,0x3D355595
-data4 0x3F7FF400,0x39400480,0x06136FF6,0x3D8200A2
-data4 0x3F7FEC00,0x39A00640,0xE8DE9AF0,0x3DA4D68D
-data4 0x3F7FE400,0x39E00C41,0xB10238DC,0xBD8B4291
-data4 0x3F7FDC00,0x3A100A21,0x3B1952CA,0xBD89CCB8
-data4 0x3F7FD400,0x3A300F22,0x1DC46826,0xBDB10707
-data4 0x3F7FCC08,0x3A4FF51C,0xF43307DB,0x3DB6FCB9
-data4 0x3F7FC408,0x3A6FFC1D,0x62DC7872,0xBD9B7C47
-data4 0x3F7FBC10,0x3A87F20B,0x3F89154A,0xBDC3725E
-data4 0x3F7FB410,0x3A97F68B,0x62B9D392,0xBD93519D
-data4 0x3F7FAC18,0x3AA7EB86,0x0F21BD9D,0x3DC18441
-data4 0x3F7FA420,0x3AB7E101,0x2245E0A6,0xBDA64B95
-data4 0x3F7F9C20,0x3AC7E701,0xAABB34B8,0x3DB4B0EC
-data4 0x3F7F9428,0x3AD7DD7B,0x6DC40A7E,0x3D992337
-data4 0x3F7F8C30,0x3AE7D474,0x4F2083D3,0x3DC6E17B
-data4 0x3F7F8438,0x3AF7CBED,0x811D4394,0x3DAE314B
-data4 0x3F7F7C40,0x3B03E1F3,0xB08F2DB1,0xBDD46F21
-data4 0x3F7F7448,0x3B0BDE2F,0x6D34522B,0xBDDC30A4
-data4 0x3F7F6C50,0x3B13DAAA,0xB1F473DB,0x3DCB0070
-data4 0x3F7F6458,0x3B1BD766,0x6AD282FD,0xBDD65DDC
-data4 0x3F7F5C68,0x3B23CC5C,0xF153761A,0xBDCDAB83
-data4 0x3F7F5470,0x3B2BC997,0x341D0F8F,0xBDDADA40
-data4 0x3F7F4C78,0x3B33C711,0xEBC394E8,0x3DCD1BD7
-data4 0x3F7F4488,0x3B3BBCC6,0x52E3E695,0xBDC3532B
-data4 0x3F7F3C90,0x3B43BAC0,0xE846B3DE,0xBDA3961E
-data4 0x3F7F34A0,0x3B4BB0F4,0x785778D4,0xBDDADF06
-data4 0x3F7F2CA8,0x3B53AF6D,0xE55CE212,0x3DCC3ED1
-data4 0x3F7F24B8,0x3B5BA620,0x9E382C15,0xBDBA3103
-data4 0x3F7F1CC8,0x3B639D12,0x5C5AF197,0x3D635A0B
-data4 0x3F7F14D8,0x3B6B9444,0x71D34EFC,0xBDDCCB19
-data4 0x3F7F0CE0,0x3B7393BC,0x52CD7ADA,0x3DC74502
-data4 0x3F7F04F0,0x3B7B8B6D,0x7D7F2A42,0xBDB68F17
-ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h3)
-
-//
-// Exponent Thresholds and Tiny Thresholds
-// for 8, 11, 15, and 17 bit exponents
-//
-// Expo_Range Value
-//
-// 0 (8 bits) 2^(-126)
-// 1 (11 bits) 2^(-1022)
-// 2 (15 bits) 2^(-16382)
-// 3 (17 bits) 2^(-16382)
-//
-// Tiny_Table
-// ----------
-// Expo_Range Value
-//
-// 0 (8 bits) 2^(-16382)
-// 1 (11 bits) 2^(-16382)
-// 2 (15 bits) 2^(-16382)
-// 3 (17 bits) 2^(-16382)
-//
-.align 64
-Constants_Threshold:
-ASM_TYPE_DIRECTIVE(Constants_Threshold,@object)
-data4 0x00000000,0x80000000,0x00003F81,0x00000000
-data4 0x00000000,0x80000000,0x00000001,0x00000000
-data4 0x00000000,0x80000000,0x00003C01,0x00000000
-data4 0x00000000,0x80000000,0x00000001,0x00000000
-data4 0x00000000,0x80000000,0x00000001,0x00000000
-data4 0x00000000,0x80000000,0x00000001,0x00000000
-data4 0x00000000,0x80000000,0x00000001,0x00000000
-data4 0x00000000,0x80000000,0x00000001,0x00000000
-ASM_SIZE_DIRECTIVE(Constants_Threshold)
-.align 64
-Constants_1_by_LN10:
-ASM_TYPE_DIRECTIVE(Constants_1_by_LN10,@object)
-data4 0x37287195,0xDE5BD8A9,0x00003FFD,0x00000000
-data4 0xACCF70C8,0xD56EAABE,0x00003FBD,0x00000000
-ASM_SIZE_DIRECTIVE(Constants_1_by_LN10)
+FR_NormX = f7
+FR_RcpX = f9
+FR_r = f10
+FR_r2 = f11
+FR_r4 = f12
+FR_N = f13
+FR_Ln2hi = f14
+FR_Ln2lo = f15
-FR_Input_X = f8
-FR_Neg_One = f9
-FR_E = f33
-FR_Em1 = f34
-FR_Y_hi = f34
-// Shared with Em1
-FR_Y_lo = f35
-FR_Scale = f36
-FR_X_Prime = f37
-FR_Z = f38
-FR_S_hi = f38
-// Shared with Z
-FR_W = f39
-FR_G = f40
-FR_wsq = f40
-// Shared with G
-FR_H = f41
-FR_w4 = f41
-// Shared with H
-FR_h = f42
-FR_w6 = f42
-// Shared with h
-FR_G_tmp = f43
-FR_poly_lo = f43
-// Shared with G_tmp
-FR_P8 = f43
-// Shared with G_tmp
-FR_H_tmp = f44
-FR_poly_hi = f44
- // Shared with H_tmp
-FR_P7 = f44
-// Shared with H_tmp
-FR_h_tmp = f45
-FR_rsq = f45
-// Shared with h_tmp
-FR_P6 = f45
-// Shared with h_tmp
-FR_abs_W = f46
-FR_r = f46
-// Shared with abs_W
-FR_AA = f47
-FR_log2_hi = f47
-// Shared with AA
-FR_BB = f48
-FR_log2_lo = f48
-// Shared with BB
-FR_S_lo = f49
-FR_two_negN = f50
-FR_float_N = f51
-FR_Q4 = f52
-FR_dummy = f52
-// Shared with Q4
-FR_P4 = f52
-// Shared with Q4
-FR_Threshold = f52
-// Shared with Q4
-FR_Q3 = f53
-FR_P3 = f53
-// Shared with Q3
-FR_Tiny = f53
-// Shared with Q3
-FR_Q2 = f54
-FR_P2 = f54
-// Shared with Q2
-FR_1LN10_hi = f54
-// Shared with Q2
-FR_Q1 = f55
-FR_P1 = f55
-// Shared with Q1
-FR_1LN10_lo = f55
-// Shared with Q1
-FR_P5 = f98
-FR_SCALE = f98
-FR_Output_X_tmp = f99
+FR_A7 = f32
+FR_A6 = f33
+FR_A5 = f34
+FR_A4 = f35
+FR_A3 = f36
+FR_A2 = f37
-GR_Expo_Range = r32
-GR_Table_Base = r34
-GR_Table_Base1 = r35
-GR_Table_ptr = r36
-GR_Index2 = r37
-GR_signif = r38
-GR_X_0 = r39
-GR_X_1 = r40
-GR_X_2 = r41
-GR_Z_1 = r42
-GR_Z_2 = r43
-GR_N = r44
-GR_Bias = r45
-GR_M = r46
-GR_ScaleN = r47
-GR_Index3 = r48
-GR_Perturb = r49
-GR_Table_Scale = r50
+FR_Thi = f38
+FR_NxLn2hipThi = f38
+FR_NxLn2pT = f38
+FR_Tlo = f39
+FR_NxLn2lopTlo = f39
+FR_Xp1 = f40
-GR_SAVE_PFS = r51
-GR_SAVE_B0 = r52
-GR_SAVE_GP = r53
-GR_Parameter_X = r54
-GR_Parameter_Y = r55
-GR_Parameter_RESULT = r56
+FR_Y = f1
+FR_X = f10
+FR_RESULT = f8
-GR_Parameter_TAG = r57
+// Data
+//==============================================================
+RODATA
+.align 16
+
+LOCAL_OBJECT_START(log_data)
+// coefficients of polynomial approximation
+data8 0x3FC2494104381A8E // A7
+data8 0xBFC5556D556BBB69 // A6
+data8 0x3FC999999988B5E9 // A5
+data8 0xBFCFFFFFFFF6FFF5 // A4
+//
+// hi parts of ln(1/frcpa(1+i/256)), i=0...255
+data8 0x3F60040155D5889D // 0
+data8 0x3F78121214586B54 // 1
+data8 0x3F841929F96832EF // 2
+data8 0x3F8C317384C75F06 // 3
+data8 0x3F91A6B91AC73386 // 4
+data8 0x3F95BA9A5D9AC039 // 5
+data8 0x3F99D2A8074325F3 // 6
+data8 0x3F9D6B2725979802 // 7
+data8 0x3FA0C58FA19DFAA9 // 8
+data8 0x3FA2954C78CBCE1A // 9
+data8 0x3FA4A94D2DA96C56 // 10
+data8 0x3FA67C94F2D4BB58 // 11
+data8 0x3FA85188B630F068 // 12
+data8 0x3FAA6B8ABE73AF4C // 13
+data8 0x3FAC441E06F72A9E // 14
+data8 0x3FAE1E6713606D06 // 15
+data8 0x3FAFFA6911AB9300 // 16
+data8 0x3FB0EC139C5DA600 // 17
+data8 0x3FB1DBD2643D190B // 18
+data8 0x3FB2CC7284FE5F1C // 19
+data8 0x3FB3BDF5A7D1EE64 // 20
+data8 0x3FB4B05D7AA012E0 // 21
+data8 0x3FB580DB7CEB5701 // 22
+data8 0x3FB674F089365A79 // 23
+data8 0x3FB769EF2C6B568D // 24
+data8 0x3FB85FD927506A47 // 25
+data8 0x3FB9335E5D594988 // 26
+data8 0x3FBA2B0220C8E5F4 // 27
+data8 0x3FBB0004AC1A86AB // 28
+data8 0x3FBBF968769FCA10 // 29
+data8 0x3FBCCFEDBFEE13A8 // 30
+data8 0x3FBDA727638446A2 // 31
+data8 0x3FBEA3257FE10F79 // 32
+data8 0x3FBF7BE9FEDBFDE5 // 33
+data8 0x3FC02AB352FF25F3 // 34
+data8 0x3FC097CE579D204C // 35
+data8 0x3FC1178E8227E47B // 36
+data8 0x3FC185747DBECF33 // 37
+data8 0x3FC1F3B925F25D41 // 38
+data8 0x3FC2625D1E6DDF56 // 39
+data8 0x3FC2D1610C868139 // 40
+data8 0x3FC340C59741142E // 41
+data8 0x3FC3B08B6757F2A9 // 42
+data8 0x3FC40DFB08378003 // 43
+data8 0x3FC47E74E8CA5F7C // 44
+data8 0x3FC4EF51F6466DE4 // 45
+data8 0x3FC56092E02BA516 // 46
+data8 0x3FC5D23857CD74D4 // 47
+data8 0x3FC6313A37335D76 // 48
+data8 0x3FC6A399DABBD383 // 49
+data8 0x3FC70337DD3CE41A // 50
+data8 0x3FC77654128F6127 // 51
+data8 0x3FC7E9D82A0B022D // 52
+data8 0x3FC84A6B759F512E // 53
+data8 0x3FC8AB47D5F5A30F // 54
+data8 0x3FC91FE49096581B // 55
+data8 0x3FC981634011AA75 // 56
+data8 0x3FC9F6C407089664 // 57
+data8 0x3FCA58E729348F43 // 58
+data8 0x3FCABB55C31693AC // 59
+data8 0x3FCB1E104919EFD0 // 60
+data8 0x3FCB94EE93E367CA // 61
+data8 0x3FCBF851C067555E // 62
+data8 0x3FCC5C0254BF23A5 // 63
+data8 0x3FCCC000C9DB3C52 // 64
+data8 0x3FCD244D99C85673 // 65
+data8 0x3FCD88E93FB2F450 // 66
+data8 0x3FCDEDD437EAEF00 // 67
+data8 0x3FCE530EFFE71012 // 68
+data8 0x3FCEB89A1648B971 // 69
+data8 0x3FCF1E75FADF9BDE // 70
+data8 0x3FCF84A32EAD7C35 // 71
+data8 0x3FCFEB2233EA07CD // 72
+data8 0x3FD028F9C7035C1C // 73
+data8 0x3FD05C8BE0D9635A // 74
+data8 0x3FD085EB8F8AE797 // 75
+data8 0x3FD0B9C8E32D1911 // 76
+data8 0x3FD0EDD060B78080 // 77
+data8 0x3FD122024CF0063F // 78
+data8 0x3FD14BE2927AECD4 // 79
+data8 0x3FD180618EF18ADF // 80
+data8 0x3FD1B50BBE2FC63B // 81
+data8 0x3FD1DF4CC7CF242D // 82
+data8 0x3FD214456D0EB8D4 // 83
+data8 0x3FD23EC5991EBA49 // 84
+data8 0x3FD2740D9F870AFB // 85
+data8 0x3FD29ECDABCDFA03 // 86
+data8 0x3FD2D46602ADCCEE // 87
+data8 0x3FD2FF66B04EA9D4 // 88
+data8 0x3FD335504B355A37 // 89
+data8 0x3FD360925EC44F5C // 90
+data8 0x3FD38BF1C3337E74 // 91
+data8 0x3FD3C25277333183 // 92
+data8 0x3FD3EDF463C1683E // 93
+data8 0x3FD419B423D5E8C7 // 94
+data8 0x3FD44591E0539F48 // 95
+data8 0x3FD47C9175B6F0AD // 96
+data8 0x3FD4A8B341552B09 // 97
+data8 0x3FD4D4F39089019F // 98
+data8 0x3FD501528DA1F967 // 99
+data8 0x3FD52DD06347D4F6 // 100
+data8 0x3FD55A6D3C7B8A89 // 101
+data8 0x3FD5925D2B112A59 // 102
+data8 0x3FD5BF406B543DB1 // 103
+data8 0x3FD5EC433D5C35AD // 104
+data8 0x3FD61965CDB02C1E // 105
+data8 0x3FD646A84935B2A1 // 106
+data8 0x3FD6740ADD31DE94 // 107
+data8 0x3FD6A18DB74A58C5 // 108
+data8 0x3FD6CF31058670EC // 109
+data8 0x3FD6F180E852F0B9 // 110
+data8 0x3FD71F5D71B894EF // 111
+data8 0x3FD74D5AEFD66D5C // 112
+data8 0x3FD77B79922BD37D // 113
+data8 0x3FD7A9B9889F19E2 // 114
+data8 0x3FD7D81B037EB6A6 // 115
+data8 0x3FD8069E33827230 // 116
+data8 0x3FD82996D3EF8BCA // 117
+data8 0x3FD85855776DCBFA // 118
+data8 0x3FD8873658327CCE // 119
+data8 0x3FD8AA75973AB8CE // 120
+data8 0x3FD8D992DC8824E4 // 121
+data8 0x3FD908D2EA7D9511 // 122
+data8 0x3FD92C59E79C0E56 // 123
+data8 0x3FD95BD750EE3ED2 // 124
+data8 0x3FD98B7811A3EE5B // 125
+data8 0x3FD9AF47F33D406B // 126
+data8 0x3FD9DF270C1914A7 // 127
+data8 0x3FDA0325ED14FDA4 // 128
+data8 0x3FDA33440224FA78 // 129
+data8 0x3FDA57725E80C382 // 130
+data8 0x3FDA87D0165DD199 // 131
+data8 0x3FDAAC2E6C03F895 // 132
+data8 0x3FDADCCC6FDF6A81 // 133
+data8 0x3FDB015B3EB1E790 // 134
+data8 0x3FDB323A3A635948 // 135
+data8 0x3FDB56FA04462909 // 136
+data8 0x3FDB881AA659BC93 // 137
+data8 0x3FDBAD0BEF3DB164 // 138
+data8 0x3FDBD21297781C2F // 139
+data8 0x3FDC039236F08818 // 140
+data8 0x3FDC28CB1E4D32FC // 141
+data8 0x3FDC4E19B84723C1 // 142
+data8 0x3FDC7FF9C74554C9 // 143
+data8 0x3FDCA57B64E9DB05 // 144
+data8 0x3FDCCB130A5CEBAF // 145
+data8 0x3FDCF0C0D18F326F // 146
+data8 0x3FDD232075B5A201 // 147
+data8 0x3FDD490246DEFA6B // 148
+data8 0x3FDD6EFA918D25CD // 149
+data8 0x3FDD9509707AE52F // 150
+data8 0x3FDDBB2EFE92C554 // 151
+data8 0x3FDDEE2F3445E4AE // 152
+data8 0x3FDE148A1A2726CD // 153
+data8 0x3FDE3AFC0A49FF3F // 154
+data8 0x3FDE6185206D516D // 155
+data8 0x3FDE882578823D51 // 156
+data8 0x3FDEAEDD2EAC990C // 157
+data8 0x3FDED5AC5F436BE2 // 158
+data8 0x3FDEFC9326D16AB8 // 159
+data8 0x3FDF2391A21575FF // 160
+data8 0x3FDF4AA7EE03192C // 161
+data8 0x3FDF71D627C30BB0 // 162
+data8 0x3FDF991C6CB3B379 // 163
+data8 0x3FDFC07ADA69A90F // 164
+data8 0x3FDFE7F18EB03D3E // 165
+data8 0x3FE007C053C5002E // 166
+data8 0x3FE01B942198A5A0 // 167
+data8 0x3FE02F74400C64EA // 168
+data8 0x3FE04360BE7603AC // 169
+data8 0x3FE05759AC47FE33 // 170
+data8 0x3FE06B5F1911CF51 // 171
+data8 0x3FE078BF0533C568 // 172
+data8 0x3FE08CD9687E7B0E // 173
+data8 0x3FE0A10074CF9019 // 174
+data8 0x3FE0B5343A234476 // 175
+data8 0x3FE0C974C89431CD // 176
+data8 0x3FE0DDC2305B9886 // 177
+data8 0x3FE0EB524BAFC918 // 178
+data8 0x3FE0FFB54213A475 // 179
+data8 0x3FE114253DA97D9F // 180
+data8 0x3FE128A24F1D9AFF // 181
+data8 0x3FE1365252BF0864 // 182
+data8 0x3FE14AE558B4A92D // 183
+data8 0x3FE15F85A19C765B // 184
+data8 0x3FE16D4D38C119FA // 185
+data8 0x3FE18203C20DD133 // 186
+data8 0x3FE196C7BC4B1F3A // 187
+data8 0x3FE1A4A738B7A33C // 188
+data8 0x3FE1B981C0C9653C // 189
+data8 0x3FE1CE69E8BB106A // 190
+data8 0x3FE1DC619DE06944 // 191
+data8 0x3FE1F160A2AD0DA3 // 192
+data8 0x3FE2066D7740737E // 193
+data8 0x3FE2147DBA47A393 // 194
+data8 0x3FE229A1BC5EBAC3 // 195
+data8 0x3FE237C1841A502E // 196
+data8 0x3FE24CFCE6F80D9A // 197
+data8 0x3FE25B2C55CD5762 // 198
+data8 0x3FE2707F4D5F7C40 // 199
+data8 0x3FE285E0842CA383 // 200
+data8 0x3FE294294708B773 // 201
+data8 0x3FE2A9A2670AFF0C // 202
+data8 0x3FE2B7FB2C8D1CC0 // 203
+data8 0x3FE2C65A6395F5F5 // 204
+data8 0x3FE2DBF557B0DF42 // 205
+data8 0x3FE2EA64C3F97654 // 206
+data8 0x3FE3001823684D73 // 207
+data8 0x3FE30E97E9A8B5CC // 208
+data8 0x3FE32463EBDD34E9 // 209
+data8 0x3FE332F4314AD795 // 210
+data8 0x3FE348D90E7464CF // 211
+data8 0x3FE35779F8C43D6D // 212
+data8 0x3FE36621961A6A99 // 213
+data8 0x3FE37C299F3C366A // 214
+data8 0x3FE38AE2171976E7 // 215
+data8 0x3FE399A157A603E7 // 216
+data8 0x3FE3AFCCFE77B9D1 // 217
+data8 0x3FE3BE9D503533B5 // 218
+data8 0x3FE3CD7480B4A8A2 // 219
+data8 0x3FE3E3C43918F76C // 220
+data8 0x3FE3F2ACB27ED6C6 // 221
+data8 0x3FE4019C2125CA93 // 222
+data8 0x3FE4181061389722 // 223
+data8 0x3FE42711518DF545 // 224
+data8 0x3FE436194E12B6BF // 225
+data8 0x3FE445285D68EA69 // 226
+data8 0x3FE45BCC464C893A // 227
+data8 0x3FE46AED21F117FC // 228
+data8 0x3FE47A1527E8A2D3 // 229
+data8 0x3FE489445EFFFCCB // 230
+data8 0x3FE4A018BCB69835 // 231
+data8 0x3FE4AF5A0C9D65D7 // 232
+data8 0x3FE4BEA2A5BDBE87 // 233
+data8 0x3FE4CDF28F10AC46 // 234
+data8 0x3FE4DD49CF994058 // 235
+data8 0x3FE4ECA86E64A683 // 236
+data8 0x3FE503C43CD8EB68 // 237
+data8 0x3FE513356667FC57 // 238
+data8 0x3FE522AE0738A3D7 // 239
+data8 0x3FE5322E26867857 // 240
+data8 0x3FE541B5CB979809 // 241
+data8 0x3FE55144FDBCBD62 // 242
+data8 0x3FE560DBC45153C6 // 243
+data8 0x3FE5707A26BB8C66 // 244
+data8 0x3FE587F60ED5B8FF // 245
+data8 0x3FE597A7977C8F31 // 246
+data8 0x3FE5A760D634BB8A // 247
+data8 0x3FE5B721D295F10E // 248
+data8 0x3FE5C6EA94431EF9 // 249
+data8 0x3FE5D6BB22EA86F5 // 250
+data8 0x3FE5E6938645D38F // 251
+data8 0x3FE5F673C61A2ED1 // 252
+data8 0x3FE6065BEA385926 // 253
+data8 0x3FE6164BFA7CC06B // 254
+data8 0x3FE62643FECF9742 // 255
+//
+// two parts of ln(2)
+data8 0x3FE62E42FEF00000,0x3DD473DE6AF278ED
+//
+// lo parts of ln(1/frcpa(1+i/256)), i=0...255
+data4 0x20E70672 // 0
+data4 0x1F60A5D0 // 1
+data4 0x218EABA0 // 2
+data4 0x21403104 // 3
+data4 0x20E9B54E // 4
+data4 0x21EE1382 // 5
+data4 0x226014E3 // 6
+data4 0x2095E5C9 // 7
+data4 0x228BA9D4 // 8
+data4 0x22932B86 // 9
+data4 0x22608A57 // 10
+data4 0x220209F3 // 11
+data4 0x212882CC // 12
+data4 0x220D46E2 // 13
+data4 0x21FA4C28 // 14
+data4 0x229E5BD9 // 15
+data4 0x228C9838 // 16
+data4 0x2311F954 // 17
+data4 0x221365DF // 18
+data4 0x22BD0CB3 // 19
+data4 0x223D4BB7 // 20
+data4 0x22A71BBE // 21
+data4 0x237DB2FA // 22
+data4 0x23194C9D // 23
+data4 0x22EC639E // 24
+data4 0x2367E669 // 25
+data4 0x232E1D5F // 26
+data4 0x234A639B // 27
+data4 0x2365C0E0 // 28
+data4 0x234646C1 // 29
+data4 0x220CBF9C // 30
+data4 0x22A00FD4 // 31
+data4 0x2306A3F2 // 32
+data4 0x23745A9B // 33
+data4 0x2398D756 // 34
+data4 0x23DD0B6A // 35
+data4 0x23DE338B // 36
+data4 0x23A222DF // 37
+data4 0x223164F8 // 38
+data4 0x23B4E87B // 39
+data4 0x23D6CCB8 // 40
+data4 0x220C2099 // 41
+data4 0x21B86B67 // 42
+data4 0x236D14F1 // 43
+data4 0x225A923F // 44
+data4 0x22748723 // 45
+data4 0x22200D13 // 46
+data4 0x23C296EA // 47
+data4 0x2302AC38 // 48
+data4 0x234B1996 // 49
+data4 0x2385E298 // 50
+data4 0x23175BE5 // 51
+data4 0x2193F482 // 52
+data4 0x23BFEA90 // 53
+data4 0x23D70A0C // 54
+data4 0x231CF30A // 55
+data4 0x235D9E90 // 56
+data4 0x221AD0CB // 57
+data4 0x22FAA08B // 58
+data4 0x23D29A87 // 59
+data4 0x20C4B2FE // 60
+data4 0x2381B8B7 // 61
+data4 0x23F8D9FC // 62
+data4 0x23EAAE7B // 63
+data4 0x2329E8AA // 64
+data4 0x23EC0322 // 65
+data4 0x2357FDCB // 66
+data4 0x2392A9AD // 67
+data4 0x22113B02 // 68
+data4 0x22DEE901 // 69
+data4 0x236A6D14 // 70
+data4 0x2371D33E // 71
+data4 0x2146F005 // 72
+data4 0x23230B06 // 73
+data4 0x22F1C77D // 74
+data4 0x23A89FA3 // 75
+data4 0x231D1241 // 76
+data4 0x244DA96C // 77
+data4 0x23ECBB7D // 78
+data4 0x223E42B4 // 79
+data4 0x23801BC9 // 80
+data4 0x23573263 // 81
+data4 0x227C1158 // 82
+data4 0x237BD749 // 83
+data4 0x21DDBAE9 // 84
+data4 0x23401735 // 85
+data4 0x241D9DEE // 86
+data4 0x23BC88CB // 87
+data4 0x2396D5F1 // 88
+data4 0x23FC89CF // 89
+data4 0x2414F9A2 // 90
+data4 0x2474A0F5 // 91
+data4 0x24354B60 // 92
+data4 0x23C1EB40 // 93
+data4 0x2306DD92 // 94
+data4 0x24353B6B // 95
+data4 0x23CD1701 // 96
+data4 0x237C7A1C // 97
+data4 0x245793AA // 98
+data4 0x24563695 // 99
+data4 0x23C51467 // 100
+data4 0x24476B68 // 101
+data4 0x212585A9 // 102
+data4 0x247B8293 // 103
+data4 0x2446848A // 104
+data4 0x246A53F8 // 105
+data4 0x246E496D // 106
+data4 0x23ED1D36 // 107
+data4 0x2314C258 // 108
+data4 0x233244A7 // 109
+data4 0x245B7AF0 // 110
+data4 0x24247130 // 111
+data4 0x22D67B38 // 112
+data4 0x2449F620 // 113
+data4 0x23BBC8B8 // 114
+data4 0x237D3BA0 // 115
+data4 0x245E8F13 // 116
+data4 0x2435573F // 117
+data4 0x242DE666 // 118
+data4 0x2463BC10 // 119
+data4 0x2466587D // 120
+data4 0x2408144B // 121
+data4 0x2405F0E5 // 122
+data4 0x22381CFF // 123
+data4 0x24154F9B // 124
+data4 0x23A4E96E // 125
+data4 0x24052967 // 126
+data4 0x2406963F // 127
+data4 0x23F7D3CB // 128
+data4 0x2448AFF4 // 129
+data4 0x24657A21 // 130
+data4 0x22FBC230 // 131
+data4 0x243C8DEA // 132
+data4 0x225DC4B7 // 133
+data4 0x23496EBF // 134
+data4 0x237C2B2B // 135
+data4 0x23A4A5B1 // 136
+data4 0x2394E9D1 // 137
+data4 0x244BC950 // 138
+data4 0x23C7448F // 139
+data4 0x2404A1AD // 140
+data4 0x246511D5 // 141
+data4 0x24246526 // 142
+data4 0x23111F57 // 143
+data4 0x22868951 // 144
+data4 0x243EB77F // 145
+data4 0x239F3DFF // 146
+data4 0x23089666 // 147
+data4 0x23EBFA6A // 148
+data4 0x23C51312 // 149
+data4 0x23E1DD5E // 150
+data4 0x232C0944 // 151
+data4 0x246A741F // 152
+data4 0x2414DF8D // 153
+data4 0x247B5546 // 154
+data4 0x2415C980 // 155
+data4 0x24324ABD // 156
+data4 0x234EB5E5 // 157
+data4 0x2465E43E // 158
+data4 0x242840D1 // 159
+data4 0x24444057 // 160
+data4 0x245E56F0 // 161
+data4 0x21AE30F8 // 162
+data4 0x23FB3283 // 163
+data4 0x247A4D07 // 164
+data4 0x22AE314D // 165
+data4 0x246B7727 // 166
+data4 0x24EAD526 // 167
+data4 0x24B41DC9 // 168
+data4 0x24EE8062 // 169
+data4 0x24A0C7C4 // 170
+data4 0x24E8DA67 // 171
+data4 0x231120F7 // 172
+data4 0x24401FFB // 173
+data4 0x2412DD09 // 174
+data4 0x248C131A // 175
+data4 0x24C0A7CE // 176
+data4 0x243DD4C8 // 177
+data4 0x24457FEB // 178
+data4 0x24DEEFBB // 179
+data4 0x243C70AE // 180
+data4 0x23E7A6FA // 181
+data4 0x24C2D311 // 182
+data4 0x23026255 // 183
+data4 0x2437C9B9 // 184
+data4 0x246BA847 // 185
+data4 0x2420B448 // 186
+data4 0x24C4CF5A // 187
+data4 0x242C4981 // 188
+data4 0x24DE1525 // 189
+data4 0x24F5CC33 // 190
+data4 0x235A85DA // 191
+data4 0x24A0B64F // 192
+data4 0x244BA0A4 // 193
+data4 0x24AAF30A // 194
+data4 0x244C86F9 // 195
+data4 0x246D5B82 // 196
+data4 0x24529347 // 197
+data4 0x240DD008 // 198
+data4 0x24E98790 // 199
+data4 0x2489B0CE // 200
+data4 0x22BC29AC // 201
+data4 0x23F37C7A // 202
+data4 0x24987FE8 // 203
+data4 0x22AFE20B // 204
+data4 0x24C8D7C2 // 205
+data4 0x24B28B7D // 206
+data4 0x23B6B271 // 207
+data4 0x24C77CB6 // 208
+data4 0x24EF1DCA // 209
+data4 0x24A4F0AC // 210
+data4 0x24CF113E // 211
+data4 0x2496BBAB // 212
+data4 0x23C7CC8A // 213
+data4 0x23AE3961 // 214
+data4 0x2410A895 // 215
+data4 0x23CE3114 // 216
+data4 0x2308247D // 217
+data4 0x240045E9 // 218
+data4 0x24974F60 // 219
+data4 0x242CB39F // 220
+data4 0x24AB8D69 // 221
+data4 0x23436788 // 222
+data4 0x24305E9E // 223
+data4 0x243E71A9 // 224
+data4 0x23C2A6B3 // 225
+data4 0x23FFE6CF // 226
+data4 0x2322D801 // 227
+data4 0x24515F21 // 228
+data4 0x2412A0D6 // 229
+data4 0x24E60D44 // 230
+data4 0x240D9251 // 231
+data4 0x247076E2 // 232
+data4 0x229B101B // 233
+data4 0x247B12DE // 234
+data4 0x244B9127 // 235
+data4 0x2499EC42 // 236
+data4 0x21FC3963 // 237
+data4 0x23E53266 // 238
+data4 0x24CE102D // 239
+data4 0x23CC45D2 // 240
+data4 0x2333171D // 241
+data4 0x246B3533 // 242
+data4 0x24931129 // 243
+data4 0x24405FFA // 244
+data4 0x24CF464D // 245
+data4 0x237095CD // 246
+data4 0x24F86CBD // 247
+data4 0x24E2D84B // 248
+data4 0x21ACBB44 // 249
+data4 0x24F43A8C // 250
+data4 0x249DB931 // 251
+data4 0x24A385EF // 252
+data4 0x238B1279 // 253
+data4 0x2436213E // 254
+data4 0x24F18A3B // 255
+LOCAL_OBJECT_END(log_data)
+
+
+// Code
+//==============================================================
.section .text
-.proc log1p#
-.global log1p#
-.align 64
-log1p:
-#ifdef _LIBC
-.global __log1p
-__log1p:
-#endif
-
+GLOBAL_IEEE754_ENTRY(log1p)
{ .mfi
-alloc r32 = ar.pfs,0,22,4,0
-(p0) fsub.s1 FR_Neg_One = f0,f1
-(p0) cmp.eq.unc p7, p0 = r0, r0
+ getf.exp GR_signexp_x = f8 // if x is unorm then must recompute
+ fadd.s1 FR_Xp1 = f8, f1 // Form 1+x
+ mov GR_05 = 0xfffe
}
-
-{ .mfi
-(p0) cmp.ne.unc p14, p0 = r0, r0
-(p0) fnorm.s1 FR_X_Prime = FR_Input_X
-(p0) cmp.eq.unc p15, p0 = r0, r0 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3
- nop.i 999
-}
-;;
-
-{ .mfi
- nop.m 999
-(p0) fclass.nm.unc p10, p0 = FR_Input_X, 0x1FF
- nop.i 999
+{ .mlx
+ addl GR_ad_1 = @ltoff(log_data),gp
+ movl GR_A3 = 0x3fd5555555555557 // double precision memory
+ // representation of A3
}
;;
{ .mfi
- nop.m 999
-(p0) fcmp.eq.unc.s1 p9, p0 = FR_Input_X, f0
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p0) fadd FR_Em1 = f0,f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fadd FR_E = f0,f1
- nop.i 999 ;;
+ ld8 GR_ad_1 = [GR_ad_1]
+ fclass.m p8,p0 = f8,0xb // Is x unorm?
+ mov GR_exp_mask = 0x1ffff
}
-
{ .mfi
- nop.m 999
-(p0) fcmp.eq.unc.s1 p8, p0 = FR_Input_X, FR_Neg_One
- nop.i 999
+ nop.m 0
+ fnorm.s1 FR_NormX = f8 // Normalize x
+ mov GR_exp_bias = 0xffff
}
+;;
{ .mfi
- nop.m 999
-(p0) fcmp.lt.unc.s1 p13, p0 = FR_Input_X, FR_Neg_One
- nop.i 999
-}
-
-
-L(LOG_BEGIN):
-
-{ .mfi
- nop.m 999
-(p0) fadd.s1 FR_Z = FR_X_Prime, FR_E
- nop.i 999
+ setf.exp FR_A2 = GR_05 // create A2 = 0.5
+ fclass.m p9,p0 = f8,0x1E1 // is x NaN, NaT or +Inf?
+ nop.i 0
}
-
-{ .mlx
- nop.m 999
-(p0) movl GR_Table_Scale = 0x0000000000000018 ;;
-}
-
-{ .mmi
- nop.m 999
-//
-// Create E = 1 and Em1 = 0
-// Check for X == 0, meaning log(1+0)
-// Check for X < -1, meaning log(negative)
-// Check for X == -1, meaning log(0)
-// Normalize x
-// Identify NatVals, NaNs, Infs.
-// Identify EM unsupporteds.
-// Identify Negative values - us S1 so as
-// not to raise denormal operand exception
-// Set p15 to true for log1p
-// Set p14 to false for log1p
-// Set p7 true for log and log1p
-//
-(p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h1#),gp
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p0) fmax.s1 FR_AA = FR_X_Prime, FR_E
- nop.i 999 ;;
+{ .mib
+ setf.d FR_A3 = GR_A3 // create A3
+ add GR_ad_2 = 16,GR_ad_1 // address of A5,A4
+(p8) br.cond.spnt log1p_unorm // Branch if x=unorm
}
+;;
+log1p_common:
{ .mfi
- ld8 GR_Table_Base = [GR_Table_Base]
-(p0) fmin.s1 FR_BB = FR_X_Prime, FR_E
- nop.i 999
+ nop.m 0
+ frcpa.s1 FR_RcpX,p0 = f1,FR_Xp1
+ nop.i 0
}
-
{ .mfb
- nop.m 999
-(p0) fadd.s1 FR_W = FR_X_Prime, FR_Em1
-//
-// Begin load of constants base
-// FR_Z = Z = |x| + E
-// FR_W = W = |x| + Em1
-// AA = fmax(|x|,E)
-// BB = fmin(|x|,E)
-//
-(p6) br.cond.spnt L(LOG_64_special) ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p10) br.cond.spnt L(LOG_64_unsupported) ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p13) br.cond.spnt L(LOG_64_negative) ;;
-}
-
-{ .mib
-(p0) getf.sig GR_signif = FR_Z
- nop.i 999
-(p9) br.cond.spnt L(LOG_64_one) ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p8) br.cond.spnt L(LOG_64_zero) ;;
+ nop.m 0
+(p9) fma.d.s0 f8 = f8,f1,f0 // set V-flag
+(p9) br.ret.spnt b0 // exit for NaN, NaT and +Inf
}
+;;
{ .mfi
-(p0) getf.exp GR_N = FR_Z
-//
-// Raise possible denormal operand exception
-// Create Bias
-//
-// This function computes ln( x + e )
-// Input FR 1: FR_X = FR_Input_X
-// Input FR 2: FR_E = FR_E
-// Input FR 3: FR_Em1 = FR_Em1
-// Input GR 1: GR_Expo_Range = GR_Expo_Range = 1
-// Output FR 4: FR_Y_hi
-// Output FR 5: FR_Y_lo
-// Output FR 6: FR_Scale
-// Output PR 7: PR_Safe
-//
-(p0) fsub.s1 FR_S_lo = FR_AA, FR_Z
-//
-// signif = getf.sig(Z)
-// abs_W = fabs(w)
-//
-(p0) extr.u GR_Table_ptr = GR_signif, 59, 4 ;;
+ getf.exp GR_Exp = FR_Xp1 // signexp of x+1
+ fclass.m p10,p0 = FR_Xp1,0x3A // is 1+x < 0?
+ and GR_exp_x = GR_exp_mask, GR_signexp_x // biased exponent of x
}
-
{ .mfi
- nop.m 999
-(p0) fmerge.se FR_S_hi = f1,FR_Z
-(p0) extr.u GR_X_0 = GR_signif, 49, 15
-}
-
-{ .mmi
- nop.m 999
-(p0) addl GR_Table_Base1 = @ltoff(Constants_Z_G_H_h2#),gp
- nop.i 999
+ ldfpd FR_A7,FR_A6 = [GR_ad_1]
+ nop.f 0
+ nop.i 0
}
;;
-{ .mlx
- ld8 GR_Table_Base1 = [GR_Table_Base1]
-(p0) movl GR_Bias = 0x000000000000FFFF ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fabs FR_abs_W = FR_W
-(p0) pmpyshr2.u GR_Table_ptr = GR_Table_ptr,GR_Table_Scale,0
-}
-
{ .mfi
- nop.m 999
-//
-// Branch out for special input values
-//
-(p0) fcmp.lt.unc.s0 p8, p0 = FR_Input_X, f0
- nop.i 999 ;;
+ getf.sig GR_Sig = FR_Xp1 // get significand to calculate index
+ // for Thi,Tlo if |x| >= 2^-8
+ fcmp.eq.s1 p12,p0 = f8,f0 // is x equal to 0?
+ sub GR_exp_x = GR_exp_x, GR_exp_bias // true exponent of x
}
+;;
{ .mfi
- nop.m 999
-//
-// X_0 = extr.u(signif,49,15)
-// Index1 = extr.u(signif,59,4)
-//
-(p0) fadd.s1 FR_S_lo = FR_S_lo, FR_BB
- nop.i 999 ;;
-}
-
-{ .mii
- nop.m 999
- nop.i 999 ;;
-//
-// Offset_to_Z1 = 24 * Index1
-// For performance, don't use result
-// for 3 or 4 cycles.
-//
-(p0) add GR_Table_ptr = GR_Table_ptr, GR_Table_Base ;;
+ sub GR_N = GR_Exp,GR_exp_bias // true exponent of x+1
+ fcmp.eq.s1 p11,p0 = FR_Xp1,f0 // is x = -1?
+ cmp.gt p6,p7 = -8, GR_exp_x // Is |x| < 2^-8
}
-//
-// Add Base to Offset for Z1
-// Create Bias
-
-{ .mmi
-(p0) ld4 GR_Z_1 = [GR_Table_ptr],4 ;;
-(p0) ldfs FR_G = [GR_Table_ptr],4
- nop.i 999 ;;
-}
-
-{ .mmi
-(p0) ldfs FR_H = [GR_Table_ptr],8 ;;
-(p0) ldfd FR_h = [GR_Table_ptr],0
-(p0) pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15
+{ .mfb
+ ldfpd FR_A5,FR_A4 = [GR_ad_2],16
+ nop.f 0
+(p10) br.cond.spnt log1p_lt_minus_1 // jump if x < -1
}
-//
-// Load Z_1
-// Get Base of Table2
-//
+;;
+// p6 is true if |x| < 1/256
+// p7 is true if |x| >= 1/256
+.pred.rel "mutex",p6,p7
{ .mfi
-(p0) getf.exp GR_M = FR_abs_W
- nop.f 999
- nop.i 999 ;;
-}
-
-{ .mii
- nop.m 999
- nop.i 999 ;;
-//
-// M = getf.exp(abs_W)
-// S_lo = AA - Z
-// X_1 = pmpyshr2(X_0,Z_1,15)
-//
-(p0) sub GR_M = GR_M, GR_Bias ;;
+(p7) add GR_ad_1 = 0x820,GR_ad_1 // address of log(2) parts
+(p6) fms.s1 FR_r = f8,f1,f0 // range reduction for |x|<1/256
+(p6) cmp.gt.unc p10,p0 = -80, GR_exp_x // Is |x| < 2^-80
}
-//
-// M = M - Bias
-// Load G1
-// N = getf.exp(Z)
-//
-
-{ .mii
-(p0) cmp.gt.unc p11, p0 = -80, GR_M
-(p0) cmp.gt.unc p12, p0 = -7, GR_M ;;
-(p0) extr.u GR_Index2 = GR_X_1, 6, 4 ;;
-}
-
-{ .mib
- nop.m 999
-//
-// if -80 > M, set p11
-// Index2 = extr.u(X_1,6,4)
-// if -7 > M, set p12
-// Load H1
-//
-(p0) pmpyshr2.u GR_Index2 = GR_Index2,GR_Table_Scale,0
-(p11) br.cond.spnt L(log1p_small) ;;
+{ .mfb
+(p7) setf.sig FR_N = GR_N // copy unbiased exponent of x to the
+ // significand field of FR_N
+(p7) fms.s1 FR_r = FR_RcpX,FR_Xp1,f1 // range reduction for |x|>=1/256
+(p12) br.ret.spnt b0 // exit for x=0, return x
}
+;;
{ .mib
- nop.m 999
- nop.i 999
-(p12) br.cond.spnt L(log1p_near) ;;
-}
-
-{ .mii
-(p0) sub GR_N = GR_N, GR_Bias
-//
-// poly_lo = r * poly_lo
-//
-(p0) add GR_Perturb = 0x1, r0 ;;
-(p0) sub GR_ScaleN = GR_Bias, GR_N
-}
-
-{ .mii
-(p0) setf.sig FR_float_N = GR_N
- nop.i 999 ;;
-//
-// Prepare Index2 - pmpyshr2.u(X_1,Z_2,15)
-// Load h1
-// S_lo = S_lo + BB
-// Branch for -80 > M
-//
-(p0) add GR_Index2 = GR_Index2, GR_Table_Base1
-}
-
-{ .mmi
-(p0) setf.exp FR_two_negN = GR_ScaleN
- nop.m 999
-(p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h3#),gp
-};;
-
-//
-// Index2 points to Z2
-// Branch for -7 > M
-//
-
-{ .mmb
-(p0) ld4 GR_Z_2 = [GR_Index2],4
- ld8 GR_Table_Base = [GR_Table_Base]
- nop.b 999 ;;
-}
-(p0) nop.i 999
-//
-// Load Z_2
-// N = N - Bias
-// Tablebase points to Table3
-//
-
-{ .mmi
-(p0) ldfs FR_G_tmp = [GR_Index2],4 ;;
-//
-// Load G_2
-// pmpyshr2 X_2= (X_1,Z_2,15)
-// float_N = setf.sig(N)
-// ScaleN = Bias - N
-//
-(p0) ldfs FR_H_tmp = [GR_Index2],8
- nop.i 999 ;;
-}
-//
-// Load H_2
-// two_negN = setf.exp(scaleN)
-// G = G_1 * G_2
-//
-
-{ .mfi
-(p0) ldfd FR_h_tmp = [GR_Index2],0
- nop.f 999
-(p0) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 ;;
-}
-
-{ .mii
- nop.m 999
-(p0) extr.u GR_Index3 = GR_X_2, 1, 5 ;;
-//
-// Load h_2
-// H = H_1 + H_2
-// h = h_1 + h_2
-// Index3 = extr.u(X_2,1,5)
-//
-(p0) shladd GR_Index3 = GR_Index3,4,GR_Table_Base
+(p7) ldfpd FR_Ln2hi,FR_Ln2lo = [GR_ad_1],16
+(p7) extr.u GR_Ind = GR_Sig,55,8 // get bits from 55 to 62 as index
+(p11) br.cond.spnt log1p_eq_minus_1 // jump if x = -1
}
-
-{ .mmi
- nop.m 999
- nop.m 999
-//
-// float_N = fcvt.xf(float_N)
-// load G3
-//
-(p0) addl GR_Table_Base = @ltoff(Constants_Q#),gp ;;
-}
-
-{ .mfi
-ld8 GR_Table_Base = [GR_Table_Base]
-nop.f 999
-nop.i 999
-} ;;
-
-{ .mfi
-(p0) ldfe FR_log2_hi = [GR_Table_Base],16
-(p0) fmpy.s1 FR_S_lo = FR_S_lo, FR_two_negN
- nop.i 999 ;;
-}
-
-{ .mmf
- nop.m 999
-//
-// G = G3 * G
-// Load h3
-// Load log2_hi
-// H = H + H3
-//
-(p0) ldfe FR_log2_lo = [GR_Table_Base],16
-(p0) fmpy.s1 FR_G = FR_G, FR_G_tmp ;;
-}
-
-{ .mmf
-(p0) ldfs FR_G_tmp = [GR_Index3],4
-//
-// h = h + h3
-// r = G * S_hi + 1
-// Load log2_lo
-//
-(p0) ldfe FR_Q4 = [GR_Table_Base],16
-(p0) fadd.s1 FR_h = FR_h, FR_h_tmp ;;
-}
-
-{ .mfi
-(p0) ldfe FR_Q3 = [GR_Table_Base],16
-(p0) fadd.s1 FR_H = FR_H, FR_H_tmp
- nop.i 999 ;;
-}
-
-{ .mmf
-(p0) ldfs FR_H_tmp = [GR_Index3],4
-(p0) ldfe FR_Q2 = [GR_Table_Base],16
-//
-// Comput Index for Table3
-// S_lo = S_lo * two_negN
-//
-(p0) fcvt.xf FR_float_N = FR_float_N ;;
-}
-//
-// If S_lo == 0, set p8 false
-// Load H3
-// Load ptr to table of polynomial coeff.
-//
+;;
{ .mmf
-(p0) ldfd FR_h_tmp = [GR_Index3],0
-(p0) ldfe FR_Q1 = [GR_Table_Base],0
-(p0) fcmp.eq.unc.s1 p0, p8 = FR_S_lo, f0 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fmpy.s1 FR_G = FR_G, FR_G_tmp
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fadd.s1 FR_H = FR_H, FR_H_tmp
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fms.s1 FR_r = FR_G, FR_S_hi, f1
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p0) fadd.s1 FR_h = FR_h, FR_h_tmp
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// Load Q4
-// Load Q3
-// Load Q2
-// Load Q1
-//
-(p8) fma.s1 FR_r = FR_G, FR_S_lo, FR_r
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-//
-// poly_lo = r * Q4 + Q3
-// rsq = r* r
-//
-(p0) fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// If (S_lo!=0) r = s_lo * G + r
-//
-(p0) fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
- nop.i 999
-}
-//
-// Create a 0x00000....01
-// poly_lo = poly_lo * rsq + h
-//
-
-{ .mfi
-(p0) setf.sig FR_dummy = GR_Perturb
-(p0) fmpy.s1 FR_rsq = FR_r, FR_r
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// h = N * log2_lo + h
-// Y_hi = n * log2_hi + H
-//
-(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// poly_lo = r * poly_o + Q2
-// poly_hi = Q1 * rsq + r
-//
-(p0) fmpy.s1 FR_poly_lo = FR_poly_lo, FR_r
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_rsq, FR_h
- nop.i 999 ;;
-}
-
-{ .mfb
- nop.m 999
-(p0) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo
-//
-// Create the FR for a binary "or"
-// Y_lo = poly_hi + poly_lo
-//
-// (p0) for FR_dummy = FR_Y_lo,FR_dummy ;;
-//
-// Turn the lsb of Y_lo ON
-//
-// (p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_dummy ;;
-//
-// Merge the new lsb into Y_lo, for alone doesn't
-//
-(p0) br.cond.sptk L(LOG_main) ;;
-}
-
-
-L(log1p_near):
-
-{ .mmi
- nop.m 999
- nop.m 999
-// /*******************************************************/
-// /*********** Branch log1p_near ************************/
-// /*******************************************************/
-(p0) addl GR_Table_Base = @ltoff(Constants_P#),gp ;;
-}
-//
-// Load base address of poly. coeff.
-//
-{.mmi
- nop.m 999
- ld8 GR_Table_Base = [GR_Table_Base]
- nop.i 999
-};;
-
-{ .mmb
-(p0) add GR_Table_ptr = 0x40,GR_Table_Base
-//
-// Address tables with separate pointers
-//
-(p0) ldfe FR_P8 = [GR_Table_Base],16
- nop.b 999 ;;
+(p7) shladd GR_ad_2 = GR_Ind,3,GR_ad_2 // address of Thi
+(p7) shladd GR_ad_1 = GR_Ind,2,GR_ad_1 // address of Tlo
+(p10) fnma.d.s0 f8 = f8,f8,f8 // If |x| very small, result=x-x*x
}
+;;
{ .mmb
-(p0) ldfe FR_P4 = [GR_Table_ptr],16
-//
-// Load P4
-// Load P8
-//
-(p0) ldfe FR_P7 = [GR_Table_Base],16
- nop.b 999 ;;
-}
-
-{ .mmf
-(p0) ldfe FR_P3 = [GR_Table_ptr],16
-//
-// Load P3
-// Load P7
-//
-(p0) ldfe FR_P6 = [GR_Table_Base],16
-(p0) fmpy.s1 FR_wsq = FR_W, FR_W ;;
-}
-
-{ .mfi
-(p0) ldfe FR_P2 = [GR_Table_ptr],16
- nop.f 999
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 FR_Y_hi = FR_W, FR_P4, FR_P3
- nop.i 999
-}
-//
-// Load P2
-// Load P6
-// Wsq = w * w
-// Y_hi = p4 * w + p3
-//
-
-{ .mfi
-(p0) ldfe FR_P5 = [GR_Table_Base],16
-(p0) fma.s1 FR_Y_lo = FR_W, FR_P8, FR_P7
- nop.i 999 ;;
-}
-
-{ .mfi
-(p0) ldfe FR_P1 = [GR_Table_ptr],16
-//
-// Load P1
-// Load P5
-// Y_lo = p8 * w + P7
-//
-(p0) fmpy.s1 FR_w4 = FR_wsq, FR_wsq
- nop.i 999 ;;
+(p7) ldfd FR_Thi = [GR_ad_2]
+(p7) ldfs FR_Tlo = [GR_ad_1]
+(p10) br.ret.spnt b0 // Exit if |x| < 2^(-80)
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P2
- nop.i 999
+ nop.m 0
+ fma.s1 FR_r2 = FR_r,FR_r,f0 // r^2
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P6
-(p0) add GR_Perturb = 0x1, r0 ;;
+ nop.m 0
+ fms.s1 FR_A2 = FR_A3,FR_r,FR_A2 // A3*r+A2
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-//
-// w4 = w2 * w2
-// Y_hi = y_hi * w + p2
-// Y_lo = y_lo * w + p6
-// Create perturbation bit
-//
-(p0) fmpy.s1 FR_w6 = FR_w4, FR_wsq
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 FR_A6 = FR_A7,FR_r,FR_A6 // A7*r+A6
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P1
- nop.i 999
+ nop.m 0
+ fma.s1 FR_A4 = FR_A5,FR_r,FR_A4 // A5*r+A4
+ nop.i 0
}
-//
-// Y_hi = y_hi * w + p1
-// w6 = w4 * w2
-//
+;;
{ .mfi
-(p0) setf.sig FR_Q4 = GR_Perturb
-(p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P5
- nop.i 999 ;;
+ nop.m 0
+(p7) fcvt.xf FR_N = FR_N
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_Y_hi = FR_wsq,FR_Y_hi, FR_W
- nop.i 999
-}
-
-{ .mfb
- nop.m 999
-//
-// Y_hi = y_hi * wsq + w
-// Y_lo = y_lo * w + p5
-//
-(p0) fmpy.s1 FR_Y_lo = FR_w6, FR_Y_lo
-//
-// Y_lo = y_lo * w6
-//
-// (p0) for FR_dummy = FR_Y_lo,FR_dummy ;;
-//
-// Set lsb on: Taken out to improve performance
-//
-// (p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_dummy ;;
-//
-// Make sure it's on in Y_lo also. Taken out to improve
-// performance
-//
-(p0) br.cond.sptk L(LOG_main) ;;
-}
-
-
-L(log1p_small):
-
-{ .mmi
- nop.m 999
- nop.m 999
-// /*******************************************************/
-// /*********** Branch log1p_small ***********************/
-// /*******************************************************/
-(p0) addl GR_Table_Base = @ltoff(Constants_Threshold#),gp
+ nop.m 0
+ fma.s1 FR_r4 = FR_r2,FR_r2,f0 // r^4
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p0) mov FR_Em1 = FR_W
-(p0) cmp.eq.unc p7, p0 = r0, r0 ;;
-}
-
-{ .mlx
- ld8 GR_Table_Base = [GR_Table_Base]
-(p0) movl GR_Expo_Range = 0x0000000000000002 ;;
-}
-//
-// Set Safe to true
-// Set Expo_Range = 0 for single
-// Set Expo_Range = 2 for double
-// Set Expo_Range = 4 for double-extended
-//
-
-{ .mmi
-(p0) shladd GR_Table_Base = GR_Expo_Range,4,GR_Table_Base ;;
-(p0) ldfe FR_Threshold = [GR_Table_Base],16
- nop.i 999
-}
-
-{ .mlx
- nop.m 999
-(p0) movl GR_Bias = 0x000000000000FF9B ;;
+ nop.m 0
+ // (A3*r+A2)*r^2+r
+ fma.s1 FR_A2 = FR_A2,FR_r2,FR_r
+ nop.i 0
}
+;;
{ .mfi
-(p0) ldfe FR_Tiny = [GR_Table_Base],0
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ // (A7*r+A6)*r^2+(A5*r+A4)
+ fma.s1 FR_A4 = FR_A6,FR_r2,FR_A4
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fcmp.gt.unc.s1 p13, p12 = FR_abs_W, FR_Threshold
- nop.i 999 ;;
+ nop.m 0
+ // N*Ln2hi+Thi
+(p7) fma.s1 FR_NxLn2hipThi = FR_N,FR_Ln2hi,FR_Thi
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p13) fnmpy.s1 FR_Y_lo = FR_W, FR_W
- nop.i 999
+ nop.m 0
+ // N*Ln2lo+Tlo
+(p7) fma.s1 FR_NxLn2lopTlo = FR_N,FR_Ln2lo,FR_Tlo
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p13) fadd FR_SCALE = f0, f1
- nop.i 999 ;;
+ nop.m 0
+(p7) fma.s1 f8 = FR_A4,FR_r4,FR_A2 // P(r) if |x| >= 1/256
+ nop.i 0
}
-
{ .mfi
- nop.m 999
-(p12) fsub.s1 FR_Y_lo = f0, FR_Tiny
-(p12) cmp.ne.unc p7, p0 = r0, r0
+ nop.m 0
+ // (N*Ln2hi+Thi) + (N*Ln2lo+Tlo)
+(p7) fma.s1 FR_NxLn2pT = FR_NxLn2hipThi,f1,FR_NxLn2lopTlo
+ nop.i 0
}
+;;
+.pred.rel "mutex",p6,p7
{ .mfi
-(p12) setf.exp FR_SCALE = GR_Bias
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+(p6) fma.d.s0 f8 = FR_A4,FR_r4,FR_A2 // result if 2^(-80) <= |x| < 1/256
+ nop.i 0
}
-
-//
-// Set p7 to SAFE = FALSE
-// Set Scale = 2^-100
-//
{ .mfb
- nop.m 999
-(p0) fma.d.s0 FR_Input_X = FR_Y_lo,FR_SCALE,FR_Y_hi
-(p0) br.ret.sptk b0
+ nop.m 0
+(p7) fma.d.s0 f8 = f8,f1,FR_NxLn2pT // result if |x| >= 1/256
+ br.ret.sptk b0 // Exit if |x| >= 2^(-80)
}
;;
-L(LOG_64_one):
-
+.align 32
+log1p_unorm:
+// Here if x=unorm
{ .mfb
- nop.m 999
-(p0) fmpy.d.s0 FR_Input_X = FR_Input_X, f0
-(p0) br.ret.sptk b0
+ getf.exp GR_signexp_x = FR_NormX // recompute biased exponent
+ nop.f 0
+ br.cond.sptk log1p_common
}
;;
-//
-// Raise divide by zero for +/-0 input.
-//
-L(LOG_64_zero):
-
+.align 32
+log1p_eq_minus_1:
+// Here if x=-1
{ .mfi
-(p0) mov GR_Parameter_TAG = 140
-//
-// If we have log1p(0), return -Inf.
-//
-(p0) fsub.s0 FR_Output_X_tmp = f0, f1
- nop.i 999 ;;
+ nop.m 0
+ fmerge.s FR_X = f8,f8 // keep input argument for subsequent
+ // call of __libm_error_support#
+ nop.i 0
}
-{ .mfb
- nop.m 999
-(p0) frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0
-(p0) br.cond.sptk L(LOG_ERROR_Support) ;;
-}
-
-L(LOG_64_special):
+;;
{ .mfi
- nop.m 999
-//
-// Return -Inf or value from handler.
-//
-(p0) fclass.m.unc p7, p0 = FR_Input_X, 0x1E1
- nop.i 999 ;;
+ mov GR_TAG = 140 // set libm error in case of log1p(-1).
+ frcpa.s0 f8,p0 = f8,f0 // log1p(-1) should be equal to -INF.
+ // We can get it using frcpa because it
+ // sets result to the IEEE-754 mandated
+ // quotient of f8/f0.
+ nop.i 0
}
-{ .mfb
- nop.m 999
-//
-// Check for Natval, QNan, SNaN, +Inf
-//
-(p7) fmpy.d.s0 f8 = FR_Input_X, f1
-//
-// For SNaN raise invalid and return QNaN.
-// For QNaN raise invalid and return QNaN.
-// For +Inf return +Inf.
-//
-(p7) br.ret.sptk b0
+{ .mib
+ nop.m 0
+ nop.i 0
+ br.cond.sptk log_libm_err
}
;;
-//
-// For -Inf raise invalid and return QNaN.
-//
-
-{ .mfb
-(p0) mov GR_Parameter_TAG = 141
-(p0) fmpy.d.s0 FR_Output_X_tmp = FR_Input_X, f0
-(p0) br.cond.sptk L(LOG_ERROR_Support) ;;
+.align 32
+log1p_lt_minus_1:
+// Here if x < -1
+{ .mfi
+ nop.m 0
+ fmerge.s FR_X = f8,f8
+ nop.i 0
}
+;;
-//
-// Report that log1p(-Inf) computed
-//
-
-L(LOG_64_unsupported):
-
-//
-// Return generated NaN or other value .
-//
-
-{ .mfb
- nop.m 999
-(p0) fmpy.d.s0 FR_Input_X = FR_Input_X, f0
-(p0) br.ret.sptk b0 ;;
+{ .mfi
+ mov GR_TAG = 141 // set libm error in case of x < -1.
+ frcpa.s0 f8,p0 = f0,f0 // log1p(x) x < -1 should be equal to NaN.
+ // We can get it using frcpa because it
+ // sets result to the IEEE-754 mandated
+ // quotient of f0/f0 i.e. NaN.
+ nop.i 0
}
+;;
-L(LOG_64_negative):
-
-{ .mfi
- nop.m 999
-//
-// Deal with x < 0 in a special way
-//
-(p0) frcpa.s0 FR_Output_X_tmp, p8 = f0, f0
-//
-// Deal with x < 0 in a special way - raise
-// invalid and produce QNaN indefinite.
-//
-(p0) mov GR_Parameter_TAG = 141
+.align 32
+log_libm_err:
+{ .mmi
+ alloc r32 = ar.pfs,1,4,4,0
+ mov GR_Parameter_TAG = GR_TAG
+ nop.i 0
}
+;;
-.endp log1p#
-ASM_SIZE_DIRECTIVE(log1p)
+GLOBAL_IEEE754_END(log1p)
-.proc __libm_error_region
-__libm_error_region:
-L(LOG_ERROR_Support):
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
-
-// (1)
{ .mfi
- add GR_Parameter_Y=-32,sp // Parameter 2 value
+ add GR_Parameter_Y = -32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS = ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
+ add sp = -64,sp // Create new stack
nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ mov GR_SAVE_GP = gp // Save gp
};;
-
-
-// (2)
{ .mmi
- stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
+ stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0 = b0 // Save b0
};;
-
.body
-// (3)
{ .mib
- stfd [GR_Parameter_X] =FR_Input_X // STORE Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
}
{ .mib
- stfd [GR_Parameter_Y] = FR_Output_X_tmp // STORE Parameter 3 on stack
+ stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
- nop.m 0
- nop.m 0
add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
-
-// (4)
{ .mmi
- ldfd FR_Input_X = [GR_Parameter_RESULT] // Get return result off stack
+ ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
- add sp = 64,sp // Restore stack pointer
- mov b0 = GR_SAVE_B0 // Restore return address
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
- mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
- br.ret.sptk b0
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
};;
-
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
-
-.proc __libm_LOG_main
-__libm_LOG_main:
-L(LOG_main):
-
-//
-// kernel_log_64 computes ln(X + E)
-//
-
-{ .mfi
- nop.m 999
-(p7) fadd.d.s0 FR_Input_X = FR_Y_lo,FR_Y_hi
- nop.i 999
-}
-
-{ .mmi
- nop.m 999
- nop.m 999
-(p14) addl GR_Table_Base = @ltoff(Constants_1_by_LN10#),gp ;;
-}
-
-{ .mmi
- nop.m 999
-(p14) ld8 GR_Table_Base = [GR_Table_Base]
- nop.i 999
-};;
-
-{ .mmi
-(p14) ldfe FR_1LN10_hi = [GR_Table_Base],16 ;;
-(p14) ldfe FR_1LN10_lo = [GR_Table_Base]
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p14) fmpy.s1 FR_Output_X_tmp = FR_Y_lo,FR_1LN10_hi
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p14) fma.s1 FR_Output_X_tmp = FR_Y_hi,FR_1LN10_lo,FR_Output_X_tmp
- nop.i 999 ;;
-}
-
-{ .mfb
- nop.m 999
-(p14) fma.d.s0 FR_Input_X = FR_Y_hi,FR_1LN10_hi,FR_Output_X_tmp
-(p0) br.ret.sptk b0 ;;
-}
-.endp __libm_LOG_main
-ASM_SIZE_DIRECTIVE(__libm_LOG_main)
-
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
+
diff --git a/sysdeps/ia64/fpu/s_log1pf.S b/sysdeps/ia64/fpu/s_log1pf.S
index 8aff9b895a..a148d4b272 100644
--- a/sysdeps/ia64/fpu/s_log1pf.S
+++ b/sysdeps/ia64/fpu/s_log1pf.S
@@ -1,10 +1,10 @@
-.file "log1pf.s"
+.file "log1pf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,1610 +20,768 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 Initial version
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
+// 06/29/01 Improved speed of all paths
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 10/02/02 Improved performance by basing on log algorithm
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+// 04/18/03 Eliminate possible WAW dependency warning
//
-// *********************************************************************
-//
-// Function: log1pf(x) = ln(x+1), for single precision values
-//
-// *********************************************************************
-//
-// Accuracy: Very accurate for single precision values
-//
-// *********************************************************************
-//
-// Resources Used:
-//
-// Floating-Point Registers: f8 (Input and Return Value)
-// f9,f33-f55,f99
-//
-// General Purpose Registers:
-// r32-r53
-// r54-r57 (Used to pass arguments to error handling routine)
-//
-// Predicate Registers: p6-p15
-//
-// *********************************************************************
-//
-// IEEE Special Conditions:
-//
-// Denormal fault raised on denormal inputs
-// Overflow exceptions cannot occur
-// Underflow exceptions raised when appropriate for log1pf
-// (Error Handling Routine called for underflow)
-// Inexact raised when appropriate by algorithm
-//
-// log1pf(inf) = inf
-// log1pf(-inf) = QNaN
-// log1pf(+/-0) = +/-0
-// log1pf(-1) = -inf
-// log1pf(SNaN) = QNaN
-// log1pf(QNaN) = QNaN
-// log1pf(EM_special Values) = QNaN
-//
-// *********************************************************************
-//
-// Computation is based on the following kernel.
-//
-// ker_log_64( in_FR : X,
-// in_FR : E,
-// in_FR : Em1,
-// in_GR : Expo_Range,
-// out_FR : Y_hi,
-// out_FR : Y_lo,
-// out_FR : Scale,
-// out_PR : Safe )
-//
-// Overview
-//
-// The method consists of three cases.
-//
-// If |X+Em1| < 2^(-80) use case log1pf_small;
-// elseif |X+Em1| < 2^(-7) use case log_near1;
-// else use case log_regular;
-//
-// Case log1pf_small:
-//
-// log( 1 + (X+Em1) ) can be approximated by (X+Em1).
-//
-// Case log_near1:
-//
-// log( 1 + (X+Em1) ) can be approximated by a simple polynomial
-// in W = X+Em1. This polynomial resembles the truncated Taylor
-// series W - W^/2 + W^3/3 - ...
-//
-// Case log_regular:
-//
-// Here we use a table lookup method. The basic idea is that in
-// order to compute log(Arg) for an argument Arg in [1,2), we
-// construct a value G such that G*Arg is close to 1 and that
-// log(1/G) is obtainable easily from a table of values calculated
-// beforehand. Thus
-//
-// log(Arg) = log(1/G) + log(G*Arg)
-// = log(1/G) + log(1 + (G*Arg - 1))
-//
-// Because |G*Arg - 1| is small, the second term on the right hand
-// side can be approximated by a short polynomial. We elaborate
-// this method in four steps.
-//
-// Step 0: Initialization
-//
-// We need to calculate log( E + X ). Obtain N, S_hi, S_lo such that
-//
-// E + X = 2^N * ( S_hi + S_lo ) exactly
-//
-// where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
-// that |S_lo| <= ulp(S_hi).
-//
-// Step 1: Argument Reduction
-//
-// Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
-//
-// G := G_1 * G_2 * G_3
-// r := (G * S_hi - 1) + G * S_lo
-//
-// These G_j's have the property that the product is exactly
-// representable and that |r| < 2^(-12) as a result.
-//
-// Step 2: Approximation
-//
-//
-// log(1 + r) is approximated by a short polynomial poly(r).
-//
-// Step 3: Reconstruction
-//
-//
-// Finally, log( E + X ) is given by
-//
-// log( E + X ) = log( 2^N * (S_hi + S_lo) )
-// ~=~ N*log(2) + log(1/G) + log(1 + r)
-// ~=~ N*log(2) + log(1/G) + poly(r).
-//
-// **** Algorithm ****
-//
-// Case log1pf_small:
-//
-// Although log(1 + (X+Em1)) is basically X+Em1, we would like to
-// preserve the inexactness nature as well as consistent behavior
-// under different rounding modes. Note that this case can only be
-// taken if E is set to be 1.0. In this case, Em1 is zero, and that
-// X can be very tiny and thus the final result can possibly underflow.
-// Thus, we compare X against a threshold that is dependent on the
-// input Expo_Range. If |X| is smaller than this threshold, we set
-// SAFE to be FALSE.
-//
-// The result is returned as Y_hi, Y_lo, and in the case of SAFE
-// is FALSE, an additional value Scale is also returned.
-//
-// W := X + Em1
-// Threshold := Threshold_Table( Expo_Range )
-// Tiny := Tiny_Table( Expo_Range )
-//
-// If ( |W| > Threshold ) then
-// Y_hi := W
-// Y_lo := -W*W
-// Else
-// Y_hi := W
-// Y_lo := -Tiny
-// Scale := 2^(-100)
-// Safe := FALSE
-// EndIf
-//
-//
-// One may think that Y_lo should be -W*W/2; however, it does not matter
-// as Y_lo will be rounded off completely except for the correct effect in
-// directed rounding. Clearly -W*W is simplier to compute. Moreover,
-// because of the difference in exponent value, Y_hi + Y_lo or
-// Y_hi + Scale*Y_lo is always inexact.
-//
-// Case log_near1:
-//
-// Here we compute a simple polynomial. To exploit parallelism, we split
-// the polynomial into two portions.
-//
-// W := X + Em1
-// Wsq := W * W
-// W4 := Wsq*Wsq
-// W6 := W4*Wsq
-// Y_hi := W + Wsq*(P_1 + W*(P_2 + W*(P_3 + W*P_4))
-// Y_lo := W6*(P_5 + W*(P_6 + W*(P_7 + W*P_8)))
-// set lsb(Y_lo) to be 1
-//
-// Case log_regular:
-//
-// We present the algorithm in four steps.
-//
-// Step 0. Initialization
-// ----------------------
-//
-// Z := X + E
-// N := unbaised exponent of Z
-// S_hi := 2^(-N) * Z
-// S_lo := 2^(-N) * { (max(X,E)-Z) + min(X,E) }
-//
-// Note that S_lo is always 0 for the case E = 0.
-//
-// Step 1. Argument Reduction
-// --------------------------
-//
-// Let
-//
-// Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63
-//
-// We obtain G_1, G_2, G_3 by the following steps.
-//
+// API
+//==============================================================
+// float log1pf(float)
//
-// Define X_0 := 1.d_1 d_2 ... d_14. This is extracted
-// from S_hi.
+// log1p(x) = log(x+1)
//
-// Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated
-// to lsb = 2^(-4).
+// Overview of operation
+//==============================================================
+// Background
+// ----------
//
-// Define index_1 := [ d_1 d_2 d_3 d_4 ].
+// This algorithm is based on the fact that
+// log1p(x) = log(1+x) and
+// log(a b) = log(a) + log(b).
+// In our case we have 1+x = 2^N f, where 1 <= f < 2.
+// So
+// log(1+x) = log(2^N f) = log(2^N) + log(f) = N*log(2) + log(f)
//
-// Fetch Z_1 := (1/A_1) rounded UP in fixed point with
-// fixed point lsb = 2^(-15).
-// Z_1 looks like z_0.z_1 z_2 ... z_15
-// Note that the fetching is done using index_1.
-// A_1 is actually not needed in the implementation
-// and is used here only to explain how is the value
-// Z_1 defined.
+// To calculate log(f) we do the following
+// log(f) = log(f * frcpa(f) / frcpa(f)) =
+// = log(f * frcpa(f)) + log(1/frcpa(f))
//
-// Fetch G_1 := (1/A_1) truncated to 21 sig. bits.
-// floating pt. Again, fetching is done using index_1. A_1
-// explains how G_1 is defined.
+// By the definition of IA-64's frcpa instruction, frcpa(f) is a
+// floating-point value that approximates 1/f using a lookup on the
+// top 8 bits of the significand of (input number + 1), with relative
+// error < 2^(-8.886). So we have the following
//
-// Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14)
-// = 1.0 0 0 0 d_5 ... d_14
-// This is accomplised by integer multiplication.
-// It is proved that X_1 indeed always begin
-// with 1.0000 in fixed point.
+// |(1/f - frcpa(f)) / (1/f)| = |1 - f*frcpa(f)| < 1/256
//
+// and
//
-// Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1
-// truncated to lsb = 2^(-8). Similar to A_1,
-// A_2 is not needed in actual implementation. It
-// helps explain how some of the values are defined.
+// log(f) = log(f * frcpa(f)) + log(1/frcpa(f)) =
+// = log(1 + r) + T
//
-// Define index_2 := [ d_5 d_6 d_7 d_8 ].
+// The first value can be computed by a polynomial P(r) approximating
+// log(1 + r) on |r| < 1/256, and the second is a precomputed tabular
+// value defined by the top 8 bits of f.
//
-// Fetch Z_2 := (1/A_2) rounded UP in fixed point with
-// fixed point lsb = 2^(-15). Fetch done using index_2.
-// Z_2 looks like z_0.z_1 z_2 ... z_15
+// Finally we have that log(1+x) ~ (N*log(2) + T) + P(r)
//
-// Fetch G_2 := (1/A_2) truncated to 21 sig. bits.
-// floating pt.
+// Note that if input argument is close to 0.0 (in our case it means
+// that |x| < 1/256) we can use just polynomial approximation
+// because 1+x = 2^0 * f = f = 1 + r and
+// log(1+x) = log(1 + r) ~ P(r)
//
-// Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14)
-// = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14
-// This is accomplised by integer multiplication.
-// It is proved that X_2 indeed always begin
-// with 1.00000000 in fixed point.
//
+// Implementation
+// --------------
//
-// Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1.
-// This is 2^(-14) + X_2 truncated to lsb = 2^(-13).
+// 1. |x| >= 2^(-8), and x > -1
+// InvX = frcpa(x+1)
+// r = InvX*(x+1) - 1
+// P(r) = r*((1 - A2*r) + r^2*(A3 - A4*r)) = r*P2(r),
+// A4,A3,A2 are created with setf instruction.
+// We use Taylor series and so A4 = 1/4, A3 = 1/3,
+// A2 = 1/2 rounded to double.
//
-// Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ].
+// N = float(n) where n is true unbiased exponent of x
//
-// Fetch G_3 := (1/A_3) truncated to 21 sig. bits.
-// floating pt. Fetch is done using index_3.
+// T is tabular value of log(1/frcpa(x)) calculated in quad precision
+// and rounded to double. To load T we get bits from 55 to 62 of register
+// format significand as index and calculate address
+// ad_T = table_base_addr + 8 * index
//
-// Compute G := G_1 * G_2 * G_3.
+// L1 (log(2)) is calculated in quad precision and rounded to double;
+// it's created with setf
//
-// This is done exactly since each of G_j only has 21 sig. bits.
+// And final result = P2(r)*r + (T + N*L1)
//
-// Compute
//
-// r := (G*S_hi - 1) + G*S_lo using 2 FMA operations.
+// 2. 2^(-40) <= |x| < 2^(-8)
+// r = x
+// P(r) = r*((1 - A2*r) + r^2*(A3 - A4*r)) = r*P2(r),
+// A4,A3,A2 are the same as in case |x| >= 1/256
//
-// thus, r approximates G*(S_hi+S_lo) - 1 to within a couple of
-// rounding errors.
+// And final result = P2(r)*r
//
+// 3. 0 < |x| < 2^(-40)
+// Although log1p(x) is basically x, we would like to preserve the inexactness
+// nature as well as consistent behavior under different rounding modes.
+// We can do this by computing the result as
//
-// Step 2. Approximation
-// ---------------------
+// log1p(x) = x - x*x
//
-// This step computes an approximation to log( 1 + r ) where r is the
-// reduced argument just obtained. It is proved that |r| <= 1.9*2^(-13);
-// thus log(1+r) can be approximated by a short polynomial:
//
-// log(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5
+// Note: NaT, any NaNs, +/-INF, +/-0, negatives and unnormalized numbers are
+// filtered and processed on special branches.
//
+
//
-// Step 3. Reconstruction
-// ----------------------
+// Special values
+//==============================================================
//
-// This step computes the desired result of log(X+E):
+// log1p(-1) = -inf // Call error support
//
-// log(X+E) = log( 2^N * (S_hi + S_lo) )
-// = N*log(2) + log( S_hi + S_lo )
-// = N*log(2) + log(1/G) +
-// log(1 + C*(S_hi+S_lo) - 1 )
+// log1p(+qnan) = +qnan
+// log1p(-qnan) = -qnan
+// log1p(+snan) = +qnan
+// log1p(-snan) = -qnan
//
-// log(2), log(1/G_j) are stored as pairs of (single,double) numbers:
-// log2_hi, log2_lo, log1byGj_hi, log1byGj_lo. The high parts are
-// single-precision numbers and the low parts are double precision
-// numbers. These have the property that
+// log1p(x),x<-1= QNAN Indefinite // Call error support
+// log1p(-inf) = QNAN Indefinite
+// log1p(+inf) = +inf
+// log1p(+/-0) = +/-0
//
-// N*log2_hi + SUM ( log1byGj_hi )
//
-// is computable exactly in double-extended precision (64 sig. bits).
-// Finally
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input
+// f7 -> f15, f32 -> f36
//
-// Y_hi := N*log2_hi + SUM ( log1byGj_hi )
-// Y_lo := poly_hi + [ poly_lo +
-// ( SUM ( log1byGj_lo ) + N*log2_lo ) ]
-// set lsb(Y_lo) to be 1
+// General registers used:
+// r8 -> r11
+// r14 -> r22
//
+// Predicate registers used:
+// p6 -> p12
-#include "libm_support.h"
-
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+// Assembly macros
+//==============================================================
+GR_TAG = r8
+GR_ad_T = r9
+GR_Exp = r10
+GR_N = r11
-// P_7, P_6, P_5, P_4, P_3, P_2, and P_1
+GR_signexp_x = r14
+GR_exp_mask = r15
+GR_exp_bias = r16
+GR_05 = r17
+GR_A3 = r18
+GR_Sig = r19
+GR_Ind = r19
+GR_exp_x = r20
+GR_Ln2 = r21
+GR_025 = r22
-.align 64
-Constants_P:
-ASM_TYPE_DIRECTIVE(Constants_P,@object)
-data4 0xEFD62B15,0xE3936754,0x00003FFB,0x00000000
-data4 0xA5E56381,0x8003B271,0x0000BFFC,0x00000000
-data4 0x73282DB0,0x9249248C,0x00003FFC,0x00000000
-data4 0x47305052,0xAAAAAA9F,0x0000BFFC,0x00000000
-data4 0xCCD17FC9,0xCCCCCCCC,0x00003FFC,0x00000000
-data4 0x00067ED5,0x80000000,0x0000BFFD,0x00000000
-data4 0xAAAAAAAA,0xAAAAAAAA,0x00003FFD,0x00000000
-data4 0xFFFFFFFE,0xFFFFFFFF,0x0000BFFD,0x00000000
-ASM_SIZE_DIRECTIVE(Constants_P)
-
-// log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1
-.align 64
-Constants_Q:
-ASM_TYPE_DIRECTIVE(Constants_Q,@object)
-data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
-data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
-data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
-data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
-data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
-data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
-ASM_SIZE_DIRECTIVE(Constants_Q)
-
-// Z1 - 16 bit fixed, G1 and H1 - IEEE single
-
-.align 64
-Constants_Z_G_H_h1:
-ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h1,@object)
-data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000
-data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000,0x617D741C,0x3DA163A6
-data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000,0xCBD3D5BB,0x3E2C55E6
-data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000,0xD86EA5E7,0xBE3EB0BF
-data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000,0x86B12760,0x3E2E6A8C
-data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000,0x5C0739BA,0x3E47574C
-data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000,0x13E8AF2F,0x3E20E30F
-data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000,0xF2C630BD,0xBE42885B
-data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000,0x97E577C6,0x3E497F34
-data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000,0xA6B0A5AB,0x3E3E6A6E
-data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000,0xD328D9BE,0xBDF43E3C
-data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000,0x0ADB090A,0x3E4094C3
-data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000,0xFC1FE510,0xBE28FBB2
-data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000,0x10FDE3FA,0x3E3A7895
-data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000,0x7CC8C98F,0x3E508CE5
-data4 0x00004211,0x3F042108,0x3F29516A,0x00000000,0xA223106C,0xBE534874
-ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h1)
-
-// Z2 - 16 bit fixed, G2 and H2 - IEEE single
+GR_SAVE_B0 = r33
+GR_SAVE_PFS = r34
+GR_SAVE_GP = r35
+GR_SAVE_SP = r36
-.align 64
-Constants_Z_G_H_h2:
-ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h2,@object)
-data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000
-data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000,0x22C42273,0x3DB5A116
-data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000,0x21F86ED3,0x3DE620CF
-data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000,0x484F34ED,0xBDAFA07E
-data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000,0x3860BCF6,0xBDFE07F0
-data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000,0xA78093D6,0x3DEA370F
-data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000,0x72A753D0,0x3DFF5791
-data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000,0xA7EF896B,0x3DFEBE6C
-data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000,0x409ECB43,0x3E0CF156
-data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000,0xFFEF71DF,0xBE0B6F97
-data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000,0x5D59EEE8,0xBE080483
-data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000,0xA9192A74,0x3E1F91E9
-data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000,0xBF72A8CD,0xBE139A06
-data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000,0xF8FBA6CF,0x3E1D9202
-data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000,0xBA796223,0xBE1DCCC4
-data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000,0xB6B7C239,0xBE049391
-ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h2)
-
-// G3 and H3 - IEEE single and h3 -IEEE double
+GR_Parameter_X = r37
+GR_Parameter_Y = r38
+GR_Parameter_RESULT = r39
+GR_Parameter_TAG = r40
-.align 64
-Constants_Z_G_H_h3:
-ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h3,@object)
-data4 0x3F7FFC00,0x38800100,0x562224CD,0x3D355595
-data4 0x3F7FF400,0x39400480,0x06136FF6,0x3D8200A2
-data4 0x3F7FEC00,0x39A00640,0xE8DE9AF0,0x3DA4D68D
-data4 0x3F7FE400,0x39E00C41,0xB10238DC,0xBD8B4291
-data4 0x3F7FDC00,0x3A100A21,0x3B1952CA,0xBD89CCB8
-data4 0x3F7FD400,0x3A300F22,0x1DC46826,0xBDB10707
-data4 0x3F7FCC08,0x3A4FF51C,0xF43307DB,0x3DB6FCB9
-data4 0x3F7FC408,0x3A6FFC1D,0x62DC7872,0xBD9B7C47
-data4 0x3F7FBC10,0x3A87F20B,0x3F89154A,0xBDC3725E
-data4 0x3F7FB410,0x3A97F68B,0x62B9D392,0xBD93519D
-data4 0x3F7FAC18,0x3AA7EB86,0x0F21BD9D,0x3DC18441
-data4 0x3F7FA420,0x3AB7E101,0x2245E0A6,0xBDA64B95
-data4 0x3F7F9C20,0x3AC7E701,0xAABB34B8,0x3DB4B0EC
-data4 0x3F7F9428,0x3AD7DD7B,0x6DC40A7E,0x3D992337
-data4 0x3F7F8C30,0x3AE7D474,0x4F2083D3,0x3DC6E17B
-data4 0x3F7F8438,0x3AF7CBED,0x811D4394,0x3DAE314B
-data4 0x3F7F7C40,0x3B03E1F3,0xB08F2DB1,0xBDD46F21
-data4 0x3F7F7448,0x3B0BDE2F,0x6D34522B,0xBDDC30A4
-data4 0x3F7F6C50,0x3B13DAAA,0xB1F473DB,0x3DCB0070
-data4 0x3F7F6458,0x3B1BD766,0x6AD282FD,0xBDD65DDC
-data4 0x3F7F5C68,0x3B23CC5C,0xF153761A,0xBDCDAB83
-data4 0x3F7F5470,0x3B2BC997,0x341D0F8F,0xBDDADA40
-data4 0x3F7F4C78,0x3B33C711,0xEBC394E8,0x3DCD1BD7
-data4 0x3F7F4488,0x3B3BBCC6,0x52E3E695,0xBDC3532B
-data4 0x3F7F3C90,0x3B43BAC0,0xE846B3DE,0xBDA3961E
-data4 0x3F7F34A0,0x3B4BB0F4,0x785778D4,0xBDDADF06
-data4 0x3F7F2CA8,0x3B53AF6D,0xE55CE212,0x3DCC3ED1
-data4 0x3F7F24B8,0x3B5BA620,0x9E382C15,0xBDBA3103
-data4 0x3F7F1CC8,0x3B639D12,0x5C5AF197,0x3D635A0B
-data4 0x3F7F14D8,0x3B6B9444,0x71D34EFC,0xBDDCCB19
-data4 0x3F7F0CE0,0x3B7393BC,0x52CD7ADA,0x3DC74502
-data4 0x3F7F04F0,0x3B7B8B6D,0x7D7F2A42,0xBDB68F17
-ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h3)
-
-//
-// Exponent Thresholds and Tiny Thresholds
-// for 8, 11, 15, and 17 bit exponents
-//
-// Expo_Range Value
-//
-// 0 (8 bits) 2^(-126)
-// 1 (11 bits) 2^(-1022)
-// 2 (15 bits) 2^(-16382)
-// 3 (17 bits) 2^(-16382)
-//
-// Tiny_Table
-// ----------
-// Expo_Range Value
-//
-// 0 (8 bits) 2^(-16382)
-// 1 (11 bits) 2^(-16382)
-// 2 (15 bits) 2^(-16382)
-// 3 (17 bits) 2^(-16382)
-//
-.align 64
-Constants_Threshold:
-ASM_TYPE_DIRECTIVE(Constants_Threshold,@object)
-data4 0x00000000,0x80000000,0x00003F81,0x00000000
-data4 0x00000000,0x80000000,0x00000001,0x00000000
-data4 0x00000000,0x80000000,0x00003C01,0x00000000
-data4 0x00000000,0x80000000,0x00000001,0x00000000
-data4 0x00000000,0x80000000,0x00000001,0x00000000
-data4 0x00000000,0x80000000,0x00000001,0x00000000
-data4 0x00000000,0x80000000,0x00000001,0x00000000
-data4 0x00000000,0x80000000,0x00000001,0x00000000
-ASM_SIZE_DIRECTIVE(Constants_Threshold)
-.align 64
-Constants_1_by_LN10:
-ASM_TYPE_DIRECTIVE(Constants_1_by_LN10,@object)
-data4 0x37287195,0xDE5BD8A9,0x00003FFD,0x00000000
-data4 0xACCF70C8,0xD56EAABE,0x00003FBD,0x00000000
-ASM_SIZE_DIRECTIVE(Constants_1_by_LN10)
+FR_NormX = f7
+FR_RcpX = f9
+FR_r = f10
+FR_r2 = f11
+FR_r4 = f12
+FR_N = f13
+FR_Ln2 = f14
+FR_Xp1 = f15
-FR_Input_X = f8
-FR_Neg_One = f9
-FR_E = f33
-FR_Em1 = f34
-FR_Y_hi = f34
-// Shared with Em1
-FR_Y_lo = f35
-FR_Scale = f36
-FR_X_Prime = f37
-FR_Z = f38
-FR_S_hi = f38
-// Shared with Z
-FR_W = f39
-FR_G = f40
-FR_wsq = f40
-// Shared with G
-FR_H = f41
-FR_w4 = f41
-// Shared with H
-FR_h = f42
-FR_w6 = f42
-// Shared with h
-FR_G_tmp = f43
-FR_poly_lo = f43
-// Shared with G_tmp
-FR_P8 = f43
-// Shared with G_tmp
-FR_H_tmp = f44
-FR_poly_hi = f44
- // Shared with H_tmp
-FR_P7 = f44
-// Shared with H_tmp
-FR_h_tmp = f45
-FR_rsq = f45
-// Shared with h_tmp
-FR_P6 = f45
-// Shared with h_tmp
-FR_abs_W = f46
-FR_r = f46
-// Shared with abs_W
-FR_AA = f47
-FR_log2_hi = f47
-// Shared with AA
-FR_BB = f48
-FR_log2_lo = f48
-// Shared with BB
-FR_S_lo = f49
-FR_two_negN = f50
-FR_float_N = f51
-FR_Q4 = f52
-FR_dummy = f52
-// Shared with Q4
-FR_P4 = f52
-// Shared with Q4
-FR_Threshold = f52
-// Shared with Q4
-FR_Q3 = f53
-FR_P3 = f53
-// Shared with Q3
-FR_Tiny = f53
-// Shared with Q3
-FR_Q2 = f54
-FR_P2 = f54
-// Shared with Q2
-FR_1LN10_hi = f54
-// Shared with Q2
-FR_Q1 = f55
-FR_P1 = f55
-// Shared with Q1
-FR_1LN10_lo = f55
-// Shared with Q1
-FR_P5 = f98
-FR_SCALE = f98
-FR_Output_X_tmp = f99
+FR_A4 = f33
+FR_A3 = f34
+FR_A2 = f35
-GR_Expo_Range = r32
-GR_Table_Base = r34
-GR_Table_Base1 = r35
-GR_Table_ptr = r36
-GR_Index2 = r37
-GR_signif = r38
-GR_X_0 = r39
-GR_X_1 = r40
-GR_X_2 = r41
-GR_Z_1 = r42
-GR_Z_2 = r43
-GR_N = r44
-GR_Bias = r45
-GR_M = r46
-GR_ScaleN = r47
-GR_Index3 = r48
-GR_Perturb = r49
-GR_Table_Scale = r50
+FR_T = f36
+FR_NxLn2pT = f36
-GR_SAVE_PFS = r51
-GR_SAVE_B0 = r52
-GR_SAVE_GP = r53
-GR_Parameter_X = r54
-GR_Parameter_Y = r55
-GR_Parameter_RESULT = r56
+FR_Y = f1
+FR_X = f10
+FR_RESULT = f8
-GR_Parameter_TAG = r57
+// Data
+//==============================================================
+RODATA
+.align 16
+
+LOCAL_OBJECT_START(log_data)
+// ln(1/frcpa(1+i/256)), i=0...255
+data8 0x3F60040155D5889E // 0
+data8 0x3F78121214586B54 // 1
+data8 0x3F841929F96832F0 // 2
+data8 0x3F8C317384C75F06 // 3
+data8 0x3F91A6B91AC73386 // 4
+data8 0x3F95BA9A5D9AC039 // 5
+data8 0x3F99D2A8074325F4 // 6
+data8 0x3F9D6B2725979802 // 7
+data8 0x3FA0C58FA19DFAAA // 8
+data8 0x3FA2954C78CBCE1B // 9
+data8 0x3FA4A94D2DA96C56 // 10
+data8 0x3FA67C94F2D4BB58 // 11
+data8 0x3FA85188B630F068 // 12
+data8 0x3FAA6B8ABE73AF4C // 13
+data8 0x3FAC441E06F72A9E // 14
+data8 0x3FAE1E6713606D07 // 15
+data8 0x3FAFFA6911AB9301 // 16
+data8 0x3FB0EC139C5DA601 // 17
+data8 0x3FB1DBD2643D190B // 18
+data8 0x3FB2CC7284FE5F1C // 19
+data8 0x3FB3BDF5A7D1EE64 // 20
+data8 0x3FB4B05D7AA012E0 // 21
+data8 0x3FB580DB7CEB5702 // 22
+data8 0x3FB674F089365A7A // 23
+data8 0x3FB769EF2C6B568D // 24
+data8 0x3FB85FD927506A48 // 25
+data8 0x3FB9335E5D594989 // 26
+data8 0x3FBA2B0220C8E5F5 // 27
+data8 0x3FBB0004AC1A86AC // 28
+data8 0x3FBBF968769FCA11 // 29
+data8 0x3FBCCFEDBFEE13A8 // 30
+data8 0x3FBDA727638446A2 // 31
+data8 0x3FBEA3257FE10F7A // 32
+data8 0x3FBF7BE9FEDBFDE6 // 33
+data8 0x3FC02AB352FF25F4 // 34
+data8 0x3FC097CE579D204D // 35
+data8 0x3FC1178E8227E47C // 36
+data8 0x3FC185747DBECF34 // 37
+data8 0x3FC1F3B925F25D41 // 38
+data8 0x3FC2625D1E6DDF57 // 39
+data8 0x3FC2D1610C86813A // 40
+data8 0x3FC340C59741142E // 41
+data8 0x3FC3B08B6757F2A9 // 42
+data8 0x3FC40DFB08378003 // 43
+data8 0x3FC47E74E8CA5F7C // 44
+data8 0x3FC4EF51F6466DE4 // 45
+data8 0x3FC56092E02BA516 // 46
+data8 0x3FC5D23857CD74D5 // 47
+data8 0x3FC6313A37335D76 // 48
+data8 0x3FC6A399DABBD383 // 49
+data8 0x3FC70337DD3CE41B // 50
+data8 0x3FC77654128F6127 // 51
+data8 0x3FC7E9D82A0B022D // 52
+data8 0x3FC84A6B759F512F // 53
+data8 0x3FC8AB47D5F5A310 // 54
+data8 0x3FC91FE49096581B // 55
+data8 0x3FC981634011AA75 // 56
+data8 0x3FC9F6C407089664 // 57
+data8 0x3FCA58E729348F43 // 58
+data8 0x3FCABB55C31693AD // 59
+data8 0x3FCB1E104919EFD0 // 60
+data8 0x3FCB94EE93E367CB // 61
+data8 0x3FCBF851C067555F // 62
+data8 0x3FCC5C0254BF23A6 // 63
+data8 0x3FCCC000C9DB3C52 // 64
+data8 0x3FCD244D99C85674 // 65
+data8 0x3FCD88E93FB2F450 // 66
+data8 0x3FCDEDD437EAEF01 // 67
+data8 0x3FCE530EFFE71012 // 68
+data8 0x3FCEB89A1648B971 // 69
+data8 0x3FCF1E75FADF9BDE // 70
+data8 0x3FCF84A32EAD7C35 // 71
+data8 0x3FCFEB2233EA07CD // 72
+data8 0x3FD028F9C7035C1C // 73
+data8 0x3FD05C8BE0D9635A // 74
+data8 0x3FD085EB8F8AE797 // 75
+data8 0x3FD0B9C8E32D1911 // 76
+data8 0x3FD0EDD060B78081 // 77
+data8 0x3FD122024CF0063F // 78
+data8 0x3FD14BE2927AECD4 // 79
+data8 0x3FD180618EF18ADF // 80
+data8 0x3FD1B50BBE2FC63B // 81
+data8 0x3FD1DF4CC7CF242D // 82
+data8 0x3FD214456D0EB8D4 // 83
+data8 0x3FD23EC5991EBA49 // 84
+data8 0x3FD2740D9F870AFB // 85
+data8 0x3FD29ECDABCDFA04 // 86
+data8 0x3FD2D46602ADCCEE // 87
+data8 0x3FD2FF66B04EA9D4 // 88
+data8 0x3FD335504B355A37 // 89
+data8 0x3FD360925EC44F5D // 90
+data8 0x3FD38BF1C3337E75 // 91
+data8 0x3FD3C25277333184 // 92
+data8 0x3FD3EDF463C1683E // 93
+data8 0x3FD419B423D5E8C7 // 94
+data8 0x3FD44591E0539F49 // 95
+data8 0x3FD47C9175B6F0AD // 96
+data8 0x3FD4A8B341552B09 // 97
+data8 0x3FD4D4F3908901A0 // 98
+data8 0x3FD501528DA1F968 // 99
+data8 0x3FD52DD06347D4F6 // 100
+data8 0x3FD55A6D3C7B8A8A // 101
+data8 0x3FD5925D2B112A59 // 102
+data8 0x3FD5BF406B543DB2 // 103
+data8 0x3FD5EC433D5C35AE // 104
+data8 0x3FD61965CDB02C1F // 105
+data8 0x3FD646A84935B2A2 // 106
+data8 0x3FD6740ADD31DE94 // 107
+data8 0x3FD6A18DB74A58C5 // 108
+data8 0x3FD6CF31058670EC // 109
+data8 0x3FD6F180E852F0BA // 110
+data8 0x3FD71F5D71B894F0 // 111
+data8 0x3FD74D5AEFD66D5C // 112
+data8 0x3FD77B79922BD37E // 113
+data8 0x3FD7A9B9889F19E2 // 114
+data8 0x3FD7D81B037EB6A6 // 115
+data8 0x3FD8069E33827231 // 116
+data8 0x3FD82996D3EF8BCB // 117
+data8 0x3FD85855776DCBFB // 118
+data8 0x3FD8873658327CCF // 119
+data8 0x3FD8AA75973AB8CF // 120
+data8 0x3FD8D992DC8824E5 // 121
+data8 0x3FD908D2EA7D9512 // 122
+data8 0x3FD92C59E79C0E56 // 123
+data8 0x3FD95BD750EE3ED3 // 124
+data8 0x3FD98B7811A3EE5B // 125
+data8 0x3FD9AF47F33D406C // 126
+data8 0x3FD9DF270C1914A8 // 127
+data8 0x3FDA0325ED14FDA4 // 128
+data8 0x3FDA33440224FA79 // 129
+data8 0x3FDA57725E80C383 // 130
+data8 0x3FDA87D0165DD199 // 131
+data8 0x3FDAAC2E6C03F896 // 132
+data8 0x3FDADCCC6FDF6A81 // 133
+data8 0x3FDB015B3EB1E790 // 134
+data8 0x3FDB323A3A635948 // 135
+data8 0x3FDB56FA04462909 // 136
+data8 0x3FDB881AA659BC93 // 137
+data8 0x3FDBAD0BEF3DB165 // 138
+data8 0x3FDBD21297781C2F // 139
+data8 0x3FDC039236F08819 // 140
+data8 0x3FDC28CB1E4D32FD // 141
+data8 0x3FDC4E19B84723C2 // 142
+data8 0x3FDC7FF9C74554C9 // 143
+data8 0x3FDCA57B64E9DB05 // 144
+data8 0x3FDCCB130A5CEBB0 // 145
+data8 0x3FDCF0C0D18F326F // 146
+data8 0x3FDD232075B5A201 // 147
+data8 0x3FDD490246DEFA6B // 148
+data8 0x3FDD6EFA918D25CD // 149
+data8 0x3FDD9509707AE52F // 150
+data8 0x3FDDBB2EFE92C554 // 151
+data8 0x3FDDEE2F3445E4AF // 152
+data8 0x3FDE148A1A2726CE // 153
+data8 0x3FDE3AFC0A49FF40 // 154
+data8 0x3FDE6185206D516E // 155
+data8 0x3FDE882578823D52 // 156
+data8 0x3FDEAEDD2EAC990C // 157
+data8 0x3FDED5AC5F436BE3 // 158
+data8 0x3FDEFC9326D16AB9 // 159
+data8 0x3FDF2391A2157600 // 160
+data8 0x3FDF4AA7EE03192D // 161
+data8 0x3FDF71D627C30BB0 // 162
+data8 0x3FDF991C6CB3B379 // 163
+data8 0x3FDFC07ADA69A910 // 164
+data8 0x3FDFE7F18EB03D3E // 165
+data8 0x3FE007C053C5002E // 166
+data8 0x3FE01B942198A5A1 // 167
+data8 0x3FE02F74400C64EB // 168
+data8 0x3FE04360BE7603AD // 169
+data8 0x3FE05759AC47FE34 // 170
+data8 0x3FE06B5F1911CF52 // 171
+data8 0x3FE078BF0533C568 // 172
+data8 0x3FE08CD9687E7B0E // 173
+data8 0x3FE0A10074CF9019 // 174
+data8 0x3FE0B5343A234477 // 175
+data8 0x3FE0C974C89431CE // 176
+data8 0x3FE0DDC2305B9886 // 177
+data8 0x3FE0EB524BAFC918 // 178
+data8 0x3FE0FFB54213A476 // 179
+data8 0x3FE114253DA97D9F // 180
+data8 0x3FE128A24F1D9AFF // 181
+data8 0x3FE1365252BF0865 // 182
+data8 0x3FE14AE558B4A92D // 183
+data8 0x3FE15F85A19C765B // 184
+data8 0x3FE16D4D38C119FA // 185
+data8 0x3FE18203C20DD133 // 186
+data8 0x3FE196C7BC4B1F3B // 187
+data8 0x3FE1A4A738B7A33C // 188
+data8 0x3FE1B981C0C9653D // 189
+data8 0x3FE1CE69E8BB106B // 190
+data8 0x3FE1DC619DE06944 // 191
+data8 0x3FE1F160A2AD0DA4 // 192
+data8 0x3FE2066D7740737E // 193
+data8 0x3FE2147DBA47A394 // 194
+data8 0x3FE229A1BC5EBAC3 // 195
+data8 0x3FE237C1841A502E // 196
+data8 0x3FE24CFCE6F80D9A // 197
+data8 0x3FE25B2C55CD5762 // 198
+data8 0x3FE2707F4D5F7C41 // 199
+data8 0x3FE285E0842CA384 // 200
+data8 0x3FE294294708B773 // 201
+data8 0x3FE2A9A2670AFF0C // 202
+data8 0x3FE2B7FB2C8D1CC1 // 203
+data8 0x3FE2C65A6395F5F5 // 204
+data8 0x3FE2DBF557B0DF43 // 205
+data8 0x3FE2EA64C3F97655 // 206
+data8 0x3FE3001823684D73 // 207
+data8 0x3FE30E97E9A8B5CD // 208
+data8 0x3FE32463EBDD34EA // 209
+data8 0x3FE332F4314AD796 // 210
+data8 0x3FE348D90E7464D0 // 211
+data8 0x3FE35779F8C43D6E // 212
+data8 0x3FE36621961A6A99 // 213
+data8 0x3FE37C299F3C366A // 214
+data8 0x3FE38AE2171976E7 // 215
+data8 0x3FE399A157A603E7 // 216
+data8 0x3FE3AFCCFE77B9D1 // 217
+data8 0x3FE3BE9D503533B5 // 218
+data8 0x3FE3CD7480B4A8A3 // 219
+data8 0x3FE3E3C43918F76C // 220
+data8 0x3FE3F2ACB27ED6C7 // 221
+data8 0x3FE4019C2125CA93 // 222
+data8 0x3FE4181061389722 // 223
+data8 0x3FE42711518DF545 // 224
+data8 0x3FE436194E12B6BF // 225
+data8 0x3FE445285D68EA69 // 226
+data8 0x3FE45BCC464C893A // 227
+data8 0x3FE46AED21F117FC // 228
+data8 0x3FE47A1527E8A2D3 // 229
+data8 0x3FE489445EFFFCCC // 230
+data8 0x3FE4A018BCB69835 // 231
+data8 0x3FE4AF5A0C9D65D7 // 232
+data8 0x3FE4BEA2A5BDBE87 // 233
+data8 0x3FE4CDF28F10AC46 // 234
+data8 0x3FE4DD49CF994058 // 235
+data8 0x3FE4ECA86E64A684 // 236
+data8 0x3FE503C43CD8EB68 // 237
+data8 0x3FE513356667FC57 // 238
+data8 0x3FE522AE0738A3D8 // 239
+data8 0x3FE5322E26867857 // 240
+data8 0x3FE541B5CB979809 // 241
+data8 0x3FE55144FDBCBD62 // 242
+data8 0x3FE560DBC45153C7 // 243
+data8 0x3FE5707A26BB8C66 // 244
+data8 0x3FE587F60ED5B900 // 245
+data8 0x3FE597A7977C8F31 // 246
+data8 0x3FE5A760D634BB8B // 247
+data8 0x3FE5B721D295F10F // 248
+data8 0x3FE5C6EA94431EF9 // 249
+data8 0x3FE5D6BB22EA86F6 // 250
+data8 0x3FE5E6938645D390 // 251
+data8 0x3FE5F673C61A2ED2 // 252
+data8 0x3FE6065BEA385926 // 253
+data8 0x3FE6164BFA7CC06B // 254
+data8 0x3FE62643FECF9743 // 255
+LOCAL_OBJECT_END(log_data)
+
+
+// Code
+//==============================================================
.section .text
-.proc log1pf#
-.global log1pf#
-.align 64
-log1pf:
-#ifdef _LIBC
-.global __log1pf
-__log1pf:
-#endif
-
-{ .mfi
-alloc r32 = ar.pfs,0,22,4,0
-(p0) fsub.s1 FR_Neg_One = f0,f1
-(p0) cmp.eq.unc p7, p0 = r0, r0
-}
-
+GLOBAL_IEEE754_ENTRY(log1pf)
{ .mfi
-(p0) cmp.ne.unc p14, p0 = r0, r0
-(p0) fnorm.s1 FR_X_Prime = FR_Input_X
-(p0) cmp.eq.unc p15, p0 = r0, r0 ;;
+ getf.exp GR_signexp_x = f8 // if x is unorm then must recompute
+ fadd.s1 FR_Xp1 = f8, f1 // Form 1+x
+ mov GR_05 = 0xfffe
}
-
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3
- nop.i 999
+{ .mlx
+ addl GR_ad_T = @ltoff(log_data),gp
+ movl GR_A3 = 0x3fd5555555555555 // double precision memory
+ // representation of A3
}
;;
{ .mfi
- nop.m 999
-(p0) fclass.nm.unc p10, p0 = FR_Input_X, 0x1FF
- nop.i 999
+ ld8 GR_ad_T = [GR_ad_T]
+ fclass.m p8,p0 = f8,0xb // Is x unorm?
+ mov GR_exp_mask = 0x1ffff
}
-;;
-
{ .mfi
- nop.m 999
-(p0) fcmp.eq.unc.s1 p9, p0 = FR_Input_X, f0
- nop.i 999
+ mov GR_025 = 0xfffd // Exponent of 0.25
+ fnorm.s1 FR_NormX = f8 // Normalize x
+ mov GR_exp_bias = 0xffff
}
+;;
{ .mfi
- nop.m 999
-(p0) fadd FR_Em1 = f0,f0
- nop.i 999 ;;
+ setf.exp FR_A2 = GR_05 // create A2 = 0.5
+ fclass.m p9,p0 = f8,0x1E1 // is x NaN, NaT or +Inf?
+ nop.i 0
}
-
-{ .mfi
- nop.m 999
-(p0) fadd FR_E = f0,f1
- nop.i 999 ;;
+{ .mib
+ setf.d FR_A3 = GR_A3 // create A3
+ nop.i 0
+(p8) br.cond.spnt log1p_unorm // Branch if x=unorm
}
+;;
+log1p_common:
{ .mfi
- nop.m 999
-(p0) fcmp.eq.unc.s1 p8, p0 = FR_Input_X, FR_Neg_One
- nop.i 999
+ setf.exp FR_A4 = GR_025 // create A4 = 0.25
+ frcpa.s1 FR_RcpX,p0 = f1,FR_Xp1
+ nop.i 0
}
-
-{ .mfi
- nop.m 999
-(p0) fcmp.lt.unc.s1 p13, p0 = FR_Input_X, FR_Neg_One
- nop.i 999
+{ .mfb
+ nop.m 0
+(p9) fma.s.s0 f8 = f8,f1,f0 // set V-flag
+(p9) br.ret.spnt b0 // exit for NaN, NaT and +Inf
}
-
-
-L(LOG_BEGIN):
+;;
{ .mfi
- nop.m 999
-(p0) fadd.s1 FR_Z = FR_X_Prime, FR_E
- nop.i 999
+ getf.exp GR_Exp = FR_Xp1 // signexp of x+1
+ fclass.m p10,p0 = FR_Xp1,0x3A // is 1+x < 0?
+ and GR_exp_x = GR_exp_mask, GR_signexp_x // biased exponent of x
}
-
{ .mlx
- nop.m 999
-(p0) movl GR_Table_Scale = 0x0000000000000018 ;;
-}
-
-{ .mmi
- nop.m 999
-//
-// Create E = 1 and Em1 = 0
-// Check for X == 0, meaning log(1+0)
-// Check for X < -1, meaning log(negative)
-// Check for X == -1, meaning log(0)
-// Normalize x
-// Identify NatVals, NaNs, Infs.
-// Identify EM unsupporteds.
-// Identify Negative values - us S1 so as
-// not to raise denormal operand exception
-// Set p15 to true for log1pf
-// Set p14 to false for log1pf
-// Set p7 true for log and log1pf
-//
-(p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h1#),gp
- nop.i 999
+ nop.m 0
+ movl GR_Ln2 = 0x3FE62E42FEFA39EF // double precision memory
+ // representation of log(2)
}
+;;
{ .mfi
- nop.m 999
-(p0) fmax.s1 FR_AA = FR_X_Prime, FR_E
- nop.i 999 ;;
+ getf.sig GR_Sig = FR_Xp1 // get significand to calculate index
+ // for T if |x| >= 2^-8
+ fcmp.eq.s1 p12,p0 = f8,f0 // is x equal to 0?
+ sub GR_exp_x = GR_exp_x, GR_exp_bias // true exponent of x
}
+;;
{ .mfi
- ld8 GR_Table_Base = [GR_Table_Base]
-(p0) fmin.s1 FR_BB = FR_X_Prime, FR_E
- nop.i 999
+ sub GR_N = GR_Exp,GR_exp_bias // true exponent of x+1
+ fcmp.eq.s1 p11,p0 = FR_Xp1,f0 // is x = -1?
+ cmp.gt p6,p7 = -8, GR_exp_x // Is |x| < 2^-8
}
-
{ .mfb
- nop.m 999
-(p0) fadd.s1 FR_W = FR_X_Prime, FR_Em1
-//
-// Begin load of constants base
-// FR_Z = Z = |x| + E
-// FR_W = W = |x| + Em1
-// AA = fmax(|x|,E)
-// BB = fmin(|x|,E)
-//
-(p6) br.cond.spnt L(LOG_64_special) ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p10) br.cond.spnt L(LOG_64_unsupported) ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p13) br.cond.spnt L(LOG_64_negative) ;;
-}
-
-{ .mib
-(p0) getf.sig GR_signif = FR_Z
- nop.i 999
-(p9) br.cond.spnt L(LOG_64_one) ;;
-}
-
-{ .mib
- nop.m 999
- nop.i 999
-(p8) br.cond.spnt L(LOG_64_zero) ;;
-}
-
-{ .mfi
-(p0) getf.exp GR_N = FR_Z
-//
-// Raise possible denormal operand exception
-// Create Bias
-//
-// This function computes ln( x + e )
-// Input FR 1: FR_X = FR_Input_X
-// Input FR 2: FR_E = FR_E
-// Input FR 3: FR_Em1 = FR_Em1
-// Input GR 1: GR_Expo_Range = GR_Expo_Range = 1
-// Output FR 4: FR_Y_hi
-// Output FR 5: FR_Y_lo
-// Output FR 6: FR_Scale
-// Output PR 7: PR_Safe
-//
-(p0) fsub.s1 FR_S_lo = FR_AA, FR_Z
-//
-// signif = getf.sig(Z)
-// abs_W = fabs(w)
-//
-(p0) extr.u GR_Table_ptr = GR_signif, 59, 4 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fmerge.se FR_S_hi = f1,FR_Z
-(p0) extr.u GR_X_0 = GR_signif, 49, 15
-}
-
-{ .mmi
- nop.m 999
-(p0) addl GR_Table_Base1 = @ltoff(Constants_Z_G_H_h2#),gp
- nop.i 999
+ nop.m 0
+ nop.f 0
+(p10) br.cond.spnt log1p_lt_minus_1 // jump if x < -1
}
;;
-{ .mlx
- ld8 GR_Table_Base1 = [GR_Table_Base1]
-(p0) movl GR_Bias = 0x000000000000FFFF ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fabs FR_abs_W = FR_W
-(p0) pmpyshr2.u GR_Table_ptr = GR_Table_ptr,GR_Table_Scale,0
-}
-
-{ .mfi
- nop.m 999
-//
-// Branch out for special input values
-//
-(p0) fcmp.lt.unc.s0 p8, p0 = FR_Input_X, f0
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// X_0 = extr.u(signif,49,15)
-// Index1 = extr.u(signif,59,4)
-//
-(p0) fadd.s1 FR_S_lo = FR_S_lo, FR_BB
- nop.i 999 ;;
-}
-
-{ .mii
- nop.m 999
- nop.i 999 ;;
-//
-// Offset_to_Z1 = 24 * Index1
-// For performance, don't use result
-// for 3 or 4 cycles.
-//
-(p0) add GR_Table_ptr = GR_Table_ptr, GR_Table_Base ;;
-}
-//
-// Add Base to Offset for Z1
-// Create Bias
-
-{ .mmi
-(p0) ld4 GR_Z_1 = [GR_Table_ptr],4 ;;
-(p0) ldfs FR_G = [GR_Table_ptr],4
- nop.i 999 ;;
-}
-
-{ .mmi
-(p0) ldfs FR_H = [GR_Table_ptr],8 ;;
-(p0) ldfd FR_h = [GR_Table_ptr],0
-(p0) pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15
-}
-//
-// Load Z_1
-// Get Base of Table2
-//
-
+// p6 is true if |x| < 1/256
+// p7 is true if |x| >= 1/256
+.pred.rel "mutex",p6,p7
{ .mfi
-(p0) getf.exp GR_M = FR_abs_W
- nop.f 999
- nop.i 999 ;;
-}
-
-{ .mii
- nop.m 999
- nop.i 999 ;;
-//
-// M = getf.exp(abs_W)
-// S_lo = AA - Z
-// X_1 = pmpyshr2(X_0,Z_1,15)
-//
-(p0) sub GR_M = GR_M, GR_Bias ;;
+ nop.m 0
+(p6) fms.s1 FR_r = f8,f1,f0 // range reduction for |x|<1/256
+(p6) cmp.gt.unc p10,p0 = -40, GR_exp_x // Is |x| < 2^-40
}
-//
-// M = M - Bias
-// Load G1
-// N = getf.exp(Z)
-//
-
-{ .mii
-(p0) cmp.gt.unc p11, p0 = -80, GR_M
-(p0) cmp.gt.unc p12, p0 = -7, GR_M ;;
-(p0) extr.u GR_Index2 = GR_X_1, 6, 4 ;;
-}
-
-{ .mib
- nop.m 999
-//
-// if -80 > M, set p11
-// Index2 = extr.u(X_1,6,4)
-// if -7 > M, set p12
-// Load H1
-//
-(p0) pmpyshr2.u GR_Index2 = GR_Index2,GR_Table_Scale,0
-(p11) br.cond.spnt L(log1pf_small) ;;
+{ .mfb
+(p7) setf.sig FR_N = GR_N // copy unbiased exponent of x+1 to the
+ // significand field of FR_N
+(p7) fms.s1 FR_r = FR_RcpX,FR_Xp1,f1 // range reduction for |x|>=1/256
+(p12) br.ret.spnt b0 // exit for x=0, return x
}
+;;
{ .mib
- nop.m 999
- nop.i 999
-(p12) br.cond.spnt L(log1pf_near) ;;
-}
-
-{ .mii
-(p0) sub GR_N = GR_N, GR_Bias
-//
-// poly_lo = r * poly_lo
-//
-(p0) add GR_Perturb = 0x1, r0 ;;
-(p0) sub GR_ScaleN = GR_Bias, GR_N
-}
-
-{ .mii
-(p0) setf.sig FR_float_N = GR_N
- nop.i 999 ;;
-//
-// Prepare Index2 - pmpyshr2.u(X_1,Z_2,15)
-// Load h1
-// S_lo = S_lo + BB
-// Branch for -80 > M
-//
-(p0) add GR_Index2 = GR_Index2, GR_Table_Base1
-}
-
-{ .mmi
-(p0) setf.exp FR_two_negN = GR_ScaleN
- nop.m 999
-(p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h3#),gp
-};;
-
-//
-// Index2 points to Z2
-// Branch for -7 > M
-//
-
-{ .mmb
-(p0) ld4 GR_Z_2 = [GR_Index2],4
- ld8 GR_Table_Base = [GR_Table_Base]
- nop.b 999 ;;
-}
-(p0) nop.i 999
-//
-// Load Z_2
-// N = N - Bias
-// Tablebase points to Table3
-//
-
-{ .mmi
-(p0) ldfs FR_G_tmp = [GR_Index2],4 ;;
-//
-// Load G_2
-// pmpyshr2 X_2= (X_1,Z_2,15)
-// float_N = setf.sig(N)
-// ScaleN = Bias - N
-//
-(p0) ldfs FR_H_tmp = [GR_Index2],8
- nop.i 999 ;;
-}
-//
-// Load H_2
-// two_negN = setf.exp(scaleN)
-// G = G_1 * G_2
-//
-
-{ .mfi
-(p0) ldfd FR_h_tmp = [GR_Index2],0
- nop.f 999
-(p0) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 ;;
-}
-
-{ .mii
- nop.m 999
-(p0) extr.u GR_Index3 = GR_X_2, 1, 5 ;;
-//
-// Load h_2
-// H = H_1 + H_2
-// h = h_1 + h_2
-// Index3 = extr.u(X_2,1,5)
-//
-(p0) shladd GR_Index3 = GR_Index3,4,GR_Table_Base
-}
-
-{ .mmi
- nop.m 999
- nop.m 999
-//
-// float_N = fcvt.xf(float_N)
-// load G3
-//
-(p0) addl GR_Table_Base = @ltoff(Constants_Q#),gp ;;
-}
-
-{ .mfi
-ld8 GR_Table_Base = [GR_Table_Base]
-nop.f 999
-nop.i 999
-} ;;
-
-{ .mfi
-(p0) ldfe FR_log2_hi = [GR_Table_Base],16
-(p0) fmpy.s1 FR_S_lo = FR_S_lo, FR_two_negN
- nop.i 999 ;;
-}
-
-{ .mmf
- nop.m 999
-//
-// G = G3 * G
-// Load h3
-// Load log2_hi
-// H = H + H3
-//
-(p0) ldfe FR_log2_lo = [GR_Table_Base],16
-(p0) fmpy.s1 FR_G = FR_G, FR_G_tmp ;;
-}
-
-{ .mmf
-(p0) ldfs FR_G_tmp = [GR_Index3],4
-//
-// h = h + h3
-// r = G * S_hi + 1
-// Load log2_lo
-//
-(p0) ldfe FR_Q4 = [GR_Table_Base],16
-(p0) fadd.s1 FR_h = FR_h, FR_h_tmp ;;
-}
-
-{ .mfi
-(p0) ldfe FR_Q3 = [GR_Table_Base],16
-(p0) fadd.s1 FR_H = FR_H, FR_H_tmp
- nop.i 999 ;;
-}
-
-{ .mmf
-(p0) ldfs FR_H_tmp = [GR_Index3],4
-(p0) ldfe FR_Q2 = [GR_Table_Base],16
-//
-// Comput Index for Table3
-// S_lo = S_lo * two_negN
-//
-(p0) fcvt.xf FR_float_N = FR_float_N ;;
+ setf.d FR_Ln2 = GR_Ln2 // create log(2)
+(p7) extr.u GR_Ind = GR_Sig,55,8 // get bits from 55 to 62 as index
+(p11) br.cond.spnt log1p_eq_minus_1 // jump if x = -1
}
-//
-// If S_lo == 0, set p8 false
-// Load H3
-// Load ptr to table of polynomial coeff.
-//
+;;
{ .mmf
-(p0) ldfd FR_h_tmp = [GR_Index3],0
-(p0) ldfe FR_Q1 = [GR_Table_Base],0
-(p0) fcmp.eq.unc.s1 p0, p8 = FR_S_lo, f0 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fmpy.s1 FR_G = FR_G, FR_G_tmp
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fadd.s1 FR_H = FR_H, FR_H_tmp
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fms.s1 FR_r = FR_G, FR_S_hi, f1
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p0) fadd.s1 FR_h = FR_h, FR_h_tmp
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// Load Q4
-// Load Q3
-// Load Q2
-// Load Q1
-//
-(p8) fma.s1 FR_r = FR_G, FR_S_lo, FR_r
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-//
-// poly_lo = r * Q4 + Q3
-// rsq = r* r
-//
-(p0) fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// If (S_lo!=0) r = s_lo * G + r
-//
-(p0) fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
- nop.i 999
-}
-//
-// Create a 0x00000....01
-// poly_lo = poly_lo * rsq + h
-//
-
-{ .mfi
-(p0) setf.sig FR_dummy = GR_Perturb
-(p0) fmpy.s1 FR_rsq = FR_r, FR_r
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// h = N * log2_lo + h
-// Y_hi = n * log2_hi + H
-//
-(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// poly_lo = r * poly_o + Q2
-// poly_hi = Q1 * rsq + r
-//
-(p0) fmpy.s1 FR_poly_lo = FR_poly_lo, FR_r
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_rsq, FR_h
- nop.i 999 ;;
-}
-
-{ .mfb
- nop.m 999
-(p0) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo
-//
-// Create the FR for a binary "or"
-// Y_lo = poly_hi + poly_lo
-//
-// (p0) for FR_dummy = FR_Y_lo,FR_dummy ;;
-//
-// Turn the lsb of Y_lo ON
-//
-// (p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_dummy ;;
-//
-// Merge the new lsb into Y_lo, for alone doesn't
-//
-(p0) br.cond.sptk L(LOG_main) ;;
-}
-
-
-L(log1pf_near):
-
-{ .mmi
- nop.m 999
- nop.m 999
-// /*******************************************************/
-// /*********** Branch log1pf_near ************************/
-// /*******************************************************/
-(p0) addl GR_Table_Base = @ltoff(Constants_P#),gp ;;
-}
-//
-// Load base address of poly. coeff.
-//
-{.mmi
- nop.m 999
- ld8 GR_Table_Base = [GR_Table_Base]
- nop.i 999
-};;
-
-{ .mmb
-(p0) add GR_Table_ptr = 0x40,GR_Table_Base
-//
-// Address tables with separate pointers
-//
-(p0) ldfe FR_P8 = [GR_Table_Base],16
- nop.b 999 ;;
+(p7) shladd GR_ad_T = GR_Ind,3,GR_ad_T // address of T
+ nop.m 0
+(p10) fnma.s.s0 f8 = f8,f8,f8 // If |x| very small, result=x-x*x
}
+;;
{ .mmb
-(p0) ldfe FR_P4 = [GR_Table_ptr],16
-//
-// Load P4
-// Load P8
-//
-(p0) ldfe FR_P7 = [GR_Table_Base],16
- nop.b 999 ;;
-}
-
-{ .mmf
-(p0) ldfe FR_P3 = [GR_Table_ptr],16
-//
-// Load P3
-// Load P7
-//
-(p0) ldfe FR_P6 = [GR_Table_Base],16
-(p0) fmpy.s1 FR_wsq = FR_W, FR_W ;;
-}
-
-{ .mfi
-(p0) ldfe FR_P2 = [GR_Table_ptr],16
- nop.f 999
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 FR_Y_hi = FR_W, FR_P4, FR_P3
- nop.i 999
-}
-//
-// Load P2
-// Load P6
-// Wsq = w * w
-// Y_hi = p4 * w + p3
-//
-
-{ .mfi
-(p0) ldfe FR_P5 = [GR_Table_Base],16
-(p0) fma.s1 FR_Y_lo = FR_W, FR_P8, FR_P7
- nop.i 999 ;;
-}
-
-{ .mfi
-(p0) ldfe FR_P1 = [GR_Table_ptr],16
-//
-// Load P1
-// Load P5
-// Y_lo = p8 * w + P7
-//
-(p0) fmpy.s1 FR_w4 = FR_wsq, FR_wsq
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P2
- nop.i 999
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P6
-(p0) add GR_Perturb = 0x1, r0 ;;
-}
-
-{ .mfi
- nop.m 999
-//
-// w4 = w2 * w2
-// Y_hi = y_hi * w + p2
-// Y_lo = y_lo * w + p6
-// Create perturbation bit
-//
-(p0) fmpy.s1 FR_w6 = FR_w4, FR_wsq
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P1
- nop.i 999
-}
-//
-// Y_hi = y_hi * w + p1
-// w6 = w4 * w2
-//
-
-{ .mfi
-(p0) setf.sig FR_Q4 = GR_Perturb
-(p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P5
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p0) fma.s1 FR_Y_hi = FR_wsq,FR_Y_hi, FR_W
- nop.i 999
-}
-
-{ .mfb
- nop.m 999
-//
-// Y_hi = y_hi * wsq + w
-// Y_lo = y_lo * w + p5
-//
-(p0) fmpy.s1 FR_Y_lo = FR_w6, FR_Y_lo
-//
-// Y_lo = y_lo * w6
-//
-// (p0) for FR_dummy = FR_Y_lo,FR_dummy ;;
-//
-// Set lsb on: Taken out to improve performance
-//
-// (p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_dummy ;;
-//
-// Make sure it's on in Y_lo also. Taken out to improve
-// performance
-//
-(p0) br.cond.sptk L(LOG_main) ;;
-}
-
-
-L(log1pf_small):
-
-{ .mmi
- nop.m 999
- nop.m 999
-// /*******************************************************/
-// /*********** Branch log1pf_small ***********************/
-// /*******************************************************/
-(p0) addl GR_Table_Base = @ltoff(Constants_Threshold#),gp
+(p7) ldfd FR_T = [GR_ad_T]
+ nop.m 0
+(p10) br.ret.spnt b0 // Exit if |x| < 2^-40
}
+;;
{ .mfi
- nop.m 999
-(p0) mov FR_Em1 = FR_W
-(p0) cmp.eq.unc p7, p0 = r0, r0 ;;
-}
-
-{ .mlx
- ld8 GR_Table_Base = [GR_Table_Base]
-(p0) movl GR_Expo_Range = 0x0000000000000002 ;;
-}
-//
-// Set Safe to true
-// Set Expo_Range = 0 for single
-// Set Expo_Range = 2 for double
-// Set Expo_Range = 4 for double-extended
-//
-
-{ .mmi
-(p0) shladd GR_Table_Base = GR_Expo_Range,4,GR_Table_Base ;;
-(p0) ldfe FR_Threshold = [GR_Table_Base],16
- nop.i 999
+ nop.m 0
+ fma.s1 FR_r2 = FR_r,FR_r,f0 // r^2
+ nop.i 0
}
-
-{ .mlx
- nop.m 999
-(p0) movl GR_Bias = 0x000000000000FF9B ;;
-}
-
{ .mfi
-(p0) ldfe FR_Tiny = [GR_Table_Base],0
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fnma.s1 FR_A2 = FR_A2,FR_r,f1 // 1.0 - A2*r
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p0) fcmp.gt.unc.s1 p13, p12 = FR_abs_W, FR_Threshold
- nop.i 999 ;;
+ nop.m 0
+ fnma.s1 FR_A3 = FR_A4,FR_r,FR_A3 // A3 - A4*r
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p13) fnmpy.s1 FR_Y_lo = FR_W, FR_W
- nop.i 999
+ nop.m 0
+(p7) fcvt.xf FR_N = FR_N
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p13) fadd FR_SCALE = f0, f1
- nop.i 999 ;;
+ nop.m 0
+ // (A3*r+A2)*r^2+r
+ fma.s1 FR_A2 = FR_A3,FR_r2,FR_A2 // (A4*r+A3)*r^2+(A2*r+1)
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p12) fsub.s1 FR_Y_lo = f0, FR_Tiny
-(p12) cmp.ne.unc p7, p0 = r0, r0
+ nop.m 0
+ // N*Ln2hi+T
+(p7) fma.s1 FR_NxLn2pT = FR_N,FR_Ln2,FR_T
+ nop.i 0
}
+;;
+.pred.rel "mutex",p6,p7
{ .mfi
-(p12) setf.exp FR_SCALE = GR_Bias
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+(p6) fma.s.s0 f8 = FR_A2,FR_r,f0 // result if 2^(-40) <= |x| < 1/256
+ nop.i 0
}
-
-//
-// Set p7 to SAFE = FALSE
-// Set Scale = 2^-100
-//
{ .mfb
- nop.m 999
-(p0) fma.s.s0 FR_Input_X = FR_Y_lo,FR_SCALE,FR_Y_hi
-(p0) br.ret.sptk b0
+ nop.m 0
+(p7) fma.s.s0 f8 = FR_A2,FR_r,FR_NxLn2pT // result if |x| >= 1/256
+ br.ret.sptk b0 // Exit if |x| >= 2^(-40)
}
;;
-L(LOG_64_one):
-
+.align 32
+log1p_unorm:
+// Here if x=unorm
{ .mfb
- nop.m 999
-(p0) fmpy.s.s0 FR_Input_X = FR_Input_X, f0
-(p0) br.ret.sptk b0
+ getf.exp GR_signexp_x = FR_NormX // recompute biased exponent
+ nop.f 0
+ br.cond.sptk log1p_common
}
;;
-//
-// Raise divide by zero for +/-0 input.
-//
-
-L(LOG_64_zero):
+.align 32
+log1p_eq_minus_1:
+// Here if x=-1
{ .mfi
-(p0) mov GR_Parameter_TAG = 142
-//
-// If we have log1pf(0), return -Inf.
-//
-(p0) fsub.s0 FR_Output_X_tmp = f0, f1
- nop.i 999 ;;
+ nop.m 0
+ fmerge.s FR_X = f8,f8 // keep input argument for subsequent
+ // call of __libm_error_support#
+ nop.i 0
}
-{ .mfb
- nop.m 999
-(p0) frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0
-(p0) br.cond.sptk L(LOG_ERROR_Support) ;;
-}
-
-L(LOG_64_special):
+;;
{ .mfi
- nop.m 999
-//
-// Return -Inf or value from handler.
-//
-(p0) fclass.m.unc p7, p0 = FR_Input_X, 0x1E1
- nop.i 999 ;;
+ mov GR_TAG = 142 // set libm error in case of log1p(-1).
+ frcpa.s0 f8,p0 = f8,f0 // log1p(-1) should be equal to -INF.
+ // We can get it using frcpa because it
+ // sets result to the IEEE-754 mandated
+ // quotient of f8/f0.
+ nop.i 0
}
-
-{ .mfb
- nop.m 999
-//
-// Check for Natval, QNan, SNaN, +Inf
-//
-(p7) fmpy.s.s0 f8 = FR_Input_X, f1
-//
-// For SNaN raise invalid and return QNaN.
-// For QNaN raise invalid and return QNaN.
-// For +Inf return +Inf.
-//
-(p7) br.ret.sptk b0
+{ .mib
+ nop.m 0
+ nop.i 0
+ br.cond.sptk log_libm_err
}
;;
-//
-// For -Inf raise invalid and return QNaN.
-//
-
-{ .mfb
-(p0) mov GR_Parameter_TAG = 143
-(p0) fmpy.s.s0 FR_Output_X_tmp = FR_Input_X, f0
-(p0) br.cond.sptk L(LOG_ERROR_Support) ;;
+.align 32
+log1p_lt_minus_1:
+// Here if x < -1
+{ .mfi
+ nop.m 0
+ fmerge.s FR_X = f8,f8
+ nop.i 0
}
+;;
-//
-// Report that log1pf(-Inf) computed
-//
-
-L(LOG_64_unsupported):
-
-//
-// Return generated NaN or other value .
-//
-
-{ .mfb
- nop.m 999
-(p0) fmpy.s.s0 FR_Input_X = FR_Input_X, f0
-(p0) br.ret.sptk b0 ;;
+{ .mfi
+ mov GR_TAG = 143 // set libm error in case of x < -1.
+ frcpa.s0 f8,p0 = f0,f0 // log1p(x) x < -1 should be equal to NaN.
+ // We can get it using frcpa because it
+ // sets result to the IEEE-754 mandated
+ // quotient of f0/f0 i.e. NaN.
+ nop.i 0
}
+;;
-L(LOG_64_negative):
-
-{ .mfi
- nop.m 999
-//
-// Deal with x < 0 in a special way
-//
-(p0) frcpa.s0 FR_Output_X_tmp, p8 = f0, f0
-//
-// Deal with x < 0 in a special way - raise
-// invalid and produce QNaN indefinite.
-//
-(p0) mov GR_Parameter_TAG = 143;;
+.align 32
+log_libm_err:
+{ .mmi
+ alloc r32 = ar.pfs,1,4,4,0
+ mov GR_Parameter_TAG = GR_TAG
+ nop.i 0
}
+;;
-.endp log1pf#
-ASM_SIZE_DIRECTIVE(log1pf)
+GLOBAL_IEEE754_END(log1pf)
-.proc __libm_error_region
-__libm_error_region:
-L(LOG_ERROR_Support):
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
-
-// (1)
{ .mfi
- add GR_Parameter_Y=-32,sp // Parameter 2 value
+ add GR_Parameter_Y = -32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS = ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
+ add sp = -64,sp // Create new stack
nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ mov GR_SAVE_GP = gp // Save gp
};;
-
-
-// (2)
{ .mmi
- stfs [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
+ stfs [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0 = b0 // Save b0
};;
-
.body
-// (3)
{ .mib
- stfs [GR_Parameter_X] =FR_Input_X // STORE Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ stfs [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
}
{ .mib
- stfs [GR_Parameter_Y] = FR_Output_X_tmp // STORE Parameter 3 on stack
+ stfs [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
- nop.m 0
- nop.m 0
add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
-
-// (4)
{ .mmi
- ldfs FR_Input_X = [GR_Parameter_RESULT] // Get return result off stack
+ ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
- add sp = 64,sp // Restore stack pointer
- mov b0 = GR_SAVE_B0 // Restore return address
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
- mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
- br.ret.sptk b0
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
};;
-
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
-
-
-.proc __libm_LOG_main
-__libm_LOG_main:
-L(LOG_main):
-
-//
-// kernel_log_64 computes ln(X + E)
-//
-
-{ .mfi
- nop.m 999
-(p7) fadd.s.s0 FR_Input_X = FR_Y_lo,FR_Y_hi
- nop.i 999
-}
-
-{ .mmi
- nop.m 999
- nop.m 999
-(p14) addl GR_Table_Base = @ltoff(Constants_1_by_LN10#),gp ;;
-}
-
-{ .mmi
- nop.m 999
-(p14) ld8 GR_Table_Base = [GR_Table_Base]
- nop.i 999
-};;
-
-{ .mmi
-(p14) ldfe FR_1LN10_hi = [GR_Table_Base],16 ;;
-(p14) ldfe FR_1LN10_lo = [GR_Table_Base]
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p14) fmpy.s1 FR_Output_X_tmp = FR_Y_lo,FR_1LN10_hi
- nop.i 999 ;;
-}
-
-{ .mfi
- nop.m 999
-(p14) fma.s1 FR_Output_X_tmp = FR_Y_hi,FR_1LN10_lo,FR_Output_X_tmp
- nop.i 999 ;;
-}
-
-{ .mfb
- nop.m 999
-(p14) fma.s.s0 FR_Input_X = FR_Y_hi,FR_1LN10_hi,FR_Output_X_tmp
-(p0) br.ret.sptk b0 ;;
-}
-.endp __libm_LOG_main
-ASM_SIZE_DIRECTIVE(__libm_LOG_main)
-
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
.global __libm_error_support#
+
diff --git a/sysdeps/ia64/fpu/s_log1pl.S b/sysdeps/ia64/fpu/s_log1pl.S
index 7cd3f7834c..d392a58edf 100644
--- a/sysdeps/ia64/fpu/s_log1pl.S
+++ b/sysdeps/ia64/fpu/s_log1pl.S
@@ -1,10 +1,10 @@
.file "log1pl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,55 +35,49 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-// *********************************************************************
+//*********************************************************************
//
// History:
-// 2/02/00 hand-optimized
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 Initial version
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
+// 05/21/01 Removed logl and log10l, putting them in a separate file
+// 06/29/01 Improved speed of all paths
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align;
+// used data8 for long double table values
//
-// *********************************************************************
+//*********************************************************************
//
-// *********************************************************************
+//*********************************************************************
//
-// Function: Combined logl(x), log1pl(x), and log10l(x) where
-// logl(x) = ln(x), for double-extended precision x values
-// log1pl(x) = ln(x+1), for double-extended precision x values
-// log10l(x) = log (x), for double-extended precision x values
-// 10
+// Function: log1pl(x) = ln(x+1), for double-extended precision x values
//
-// *********************************************************************
+//*********************************************************************
//
// Resources Used:
//
// Floating-Point Registers: f8 (Input and Return Value)
-// f9,f33-f55,f99
+// f34-f82
//
// General Purpose Registers:
-// r32-r53
-// r54-r57 (Used to pass arguments to error handling routine)
+// r32-r56
+// r53-r56 (Used to pass arguments to error handling routine)
//
-// Predicate Registers: p6-p15
+// Predicate Registers: p6-p13
//
-// *********************************************************************
+//*********************************************************************
//
// IEEE Special Conditions:
//
-// Denormal fault raised on denormal inputs
+// Denormal fault raised on denormal inputs
// Overflow exceptions cannot occur
// Underflow exceptions raised when appropriate for log1p
-// (Error Handling Routine called for underflow)
// Inexact raised when appropriate by algorithm
//
-// logl(inf) = inf
-// logl(-inf) = QNaN
-// logl(+/-0) = -inf
-// logl(SNaN) = QNaN
-// logl(QNaN) = QNaN
-// logl(EM_special Values) = QNaN
// log1pl(inf) = inf
// log1pl(-inf) = QNaN
// log1pl(+/-0) = +/-0
@@ -91,54 +85,37 @@
// log1pl(SNaN) = QNaN
// log1pl(QNaN) = QNaN
// log1pl(EM_special Values) = QNaN
-// log10l(inf) = inf
-// log10l(-inf) = QNaN
-// log10l(+/-0) = -inf
-// log10l(SNaN) = QNaN
-// log10l(QNaN) = QNaN
-// log10l(EM_special Values) = QNaN
-//
-// *********************************************************************
-//
-// Computation is based on the following kernel.
-//
-// ker_log_64( in_FR : X,
-// in_FR : E,
-// in_FR : Em1,
-// in_GR : Expo_Range,
-// out_FR : Y_hi,
-// out_FR : Y_lo,
-// out_FR : Scale,
-// out_PR : Safe )
-//
+//
+//*********************************************************************
+//
// Overview
//
// The method consists of three cases.
//
-// If |X+Em1| < 2^(-80) use case log1pl_small;
-// elseif |X+Em1| < 2^(-7) use case log_near1;
-// else use case log_regular;
+// If |X| < 2^(-80) use case log1p_small;
+// else if |X| < 2^(-7) use case log_near1;
+// else use case log_regular;
//
-// Case log1pl_small:
+// Case log1p_small:
//
-// logl( 1 + (X+Em1) ) can be approximated by (X+Em1).
+// log1pl( X ) = logl( X+1 ) can be approximated by X
//
// Case log_near1:
//
-// logl( 1 + (X+Em1) ) can be approximated by a simple polynomial
-// in W = X+Em1. This polynomial resembles the truncated Taylor
+// log1pl( X ) = log( X+1 ) can be approximated by a simple polynomial
+// in W = X. This polynomial resembles the truncated Taylor
// series W - W^2/2 + W^3/3 - ...
//
// Case log_regular:
//
// Here we use a table lookup method. The basic idea is that in
-// order to compute logl(Arg) for an argument Arg in [1,2), we
-// construct a value G such that G*Arg is close to 1 and that
+// order to compute logl(Arg) = log1pl (Arg-1) for an argument Arg in [1,2),
+// we construct a value G such that G*Arg is close to 1 and that
// logl(1/G) is obtainable easily from a table of values calculated
// beforehand. Thus
//
-// logl(Arg) = logl(1/G) + logl(G*Arg)
-// = logl(1/G) + logl(1 + (G*Arg - 1))
+// logl(Arg) = logl(1/G) + logl(G*Arg)
+// = logl(1/G) + logl(1 + (G*Arg - 1))
//
// Because |G*Arg - 1| is small, the second term on the right hand
// side can be approximated by a short polynomial. We elaborate
@@ -146,9 +123,9 @@
//
// Step 0: Initialization
//
-// We need to calculate logl( E + X ). Obtain N, S_hi, S_lo such that
+// We need to calculate logl( X+1 ). Obtain N, S_hi, S_lo such that
//
-// E + X = 2^N * ( S_hi + S_lo ) exactly
+// X+1 = 2^N * ( S_hi + S_lo ) exactly
//
// where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
// that |S_lo| <= ulp(S_hi).
@@ -157,8 +134,8 @@
//
// Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
//
-// G := G_1 * G_2 * G_3
-// r := (G * S_hi - 1) + G * S_lo
+// G := G_1 * G_2 * G_3
+// r := (G * S_hi - 1) + G * S_lo
//
// These G_j's have the property that the product is exactly
// representable and that |r| < 2^(-12) as a result.
@@ -171,61 +148,34 @@
// Step 3: Reconstruction
//
//
-// Finally, logl( E + X ) is given by
+// Finally, log1pl( X ) = logl( X+1 ) is given by
//
-// logl( E + X ) = logl( 2^N * (S_hi + S_lo) )
+// logl( X+1 ) = logl( 2^N * (S_hi + S_lo) )
// ~=~ N*logl(2) + logl(1/G) + logl(1 + r)
// ~=~ N*logl(2) + logl(1/G) + poly(r).
//
// **** Algorithm ****
//
-// Case log1pl_small:
-//
-// Although logl(1 + (X+Em1)) is basically X+Em1, we would like to
-// preserve the inexactness nature as well as consistent behavior
-// under different rounding modes. Note that this case can only be
-// taken if E is set to be 1.0. In this case, Em1 is zero, and that
-// X can be very tiny and thus the final result can possibly underflow.
-// Thus, we compare X against a threshold that is dependent on the
-// input Expo_Range. If |X| is smaller than this threshold, we set
-// SAFE to be FALSE.
-//
-// The result is returned as Y_hi, Y_lo, and in the case of SAFE
-// is FALSE, an additional value Scale is also returned.
-//
-// W := X + Em1
-// Threshold := Threshold_Table( Expo_Range )
-// Tiny := Tiny_Table( Expo_Range )
-//
-// If ( |W| > Threshold ) then
-// Y_hi := W
-// Y_lo := -W*W
-// Else
-// Y_hi := W
-// Y_lo := -Tiny
-// Scale := 2^(-100)
-// Safe := FALSE
-// EndIf
-//
-//
-// One may think that Y_lo should be -W*W/2; however, it does not matter
-// as Y_lo will be rounded off completely except for the correct effect in
-// directed rounding. Clearly -W*W is simplier to compute. Moreover,
-// because of the difference in exponent value, Y_hi + Y_lo or
-// Y_hi + Scale*Y_lo is always inexact.
+// Case log1p_small:
+//
+// Although log1pl(X) is basically X, we would like to preserve the inexactness
+// nature as well as consistent behavior under different rounding modes.
+// We can do this by computing the result as
+//
+// log1pl(X) = X - X*X
+//
//
// Case log_near1:
//
// Here we compute a simple polynomial. To exploit parallelism, we split
// the polynomial into two portions.
//
-// W := X + Em1
-// Wsq := W * W
-// W4 := Wsq*Wsq
-// W6 := W4*Wsq
-// Y_hi := W + Wsq*(P_1 + W*(P_2 + W*(P_3 + W*P_4))
-// Y_lo := W6*(P_5 + W*(P_6 + W*(P_7 + W*P_8)))
-// set lsb(Y_lo) to be 1
+// W := X
+// Wsq := W * W
+// W4 := Wsq*Wsq
+// W6 := W4*Wsq
+// Y_hi := W + Wsq*(P_1 + W*(P_2 + W*(P_3 + W*P_4)))
+// Y_lo := W6*(P_5 + W*(P_6 + W*(P_7 + W*P_8)))
//
// Case log_regular:
//
@@ -234,89 +184,87 @@
// Step 0. Initialization
// ----------------------
//
-// Z := X + E
+// Z := X + 1
// N := unbiased exponent of Z
// S_hi := 2^(-N) * Z
-// S_lo := 2^(-N) * { (max(X,E)-Z) + min(X,E) }
-//
-// Note that S_lo is always 0 for the case E = 0.
+// S_lo := 2^(-N) * { (max(X,1)-Z) + min(X,1) }
//
// Step 1. Argument Reduction
// --------------------------
//
// Let
//
-// Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63
+// Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63
//
// We obtain G_1, G_2, G_3 by the following steps.
//
//
-// Define X_0 := 1.d_1 d_2 ... d_14. This is extracted
-// from S_hi.
+// Define X_0 := 1.d_1 d_2 ... d_14. This is extracted
+// from S_hi.
//
-// Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated
-// to lsb = 2^(-4).
+// Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated
+// to lsb = 2^(-4).
//
-// Define index_1 := [ d_1 d_2 d_3 d_4 ].
+// Define index_1 := [ d_1 d_2 d_3 d_4 ].
//
-// Fetch Z_1 := (1/A_1) rounded UP in fixed point with
-// fixed point lsb = 2^(-15).
-// Z_1 looks like z_0.z_1 z_2 ... z_15
-// Note that the fetching is done using index_1.
-// A_1 is actually not needed in the implementation
-// and is used here only to explain how is the value
-// Z_1 defined.
+// Fetch Z_1 := (1/A_1) rounded UP in fixed point with
+// fixed point lsb = 2^(-15).
+// Z_1 looks like z_0.z_1 z_2 ... z_15
+// Note that the fetching is done using index_1.
+// A_1 is actually not needed in the implementation
+// and is used here only to explain how the value
+// Z_1 is defined.
//
-// Fetch G_1 := (1/A_1) truncated to 21 sig. bits.
-// floating pt. Again, fetching is done using index_1. A_1
-// explains how G_1 is defined.
+// Fetch G_1 := (1/A_1) truncated to 21 sig. bits.
+// floating pt. Again, fetching is done using index_1. A_1
+// explains how G_1 is defined.
//
-// Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14)
-// = 1.0 0 0 0 d_5 ... d_14
-// This is accomplised by integer multiplication.
-// It is proved that X_1 indeed always begin
-// with 1.0000 in fixed point.
+// Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14)
+// = 1.0 0 0 0 d_5 ... d_14
+// This is accomplished by integer multiplication.
+// It is proved that X_1 indeed always begins
+// with 1.0000 in fixed point.
//
//
-// Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1
-// truncated to lsb = 2^(-8). Similar to A_1,
-// A_2 is not needed in actual implementation. It
-// helps explain how some of the values are defined.
+// Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1
+// truncated to lsb = 2^(-8). Similar to A_1,
+// A_2 is not needed in actual implementation. It
+// helps explain how some of the values are defined.
//
-// Define index_2 := [ d_5 d_6 d_7 d_8 ].
+// Define index_2 := [ d_5 d_6 d_7 d_8 ].
//
-// Fetch Z_2 := (1/A_2) rounded UP in fixed point with
-// fixed point lsb = 2^(-15). Fetch done using index_2.
-// Z_2 looks like z_0.z_1 z_2 ... z_15
+// Fetch Z_2 := (1/A_2) rounded UP in fixed point with
+// fixed point lsb = 2^(-15). Fetch done using index_2.
+// Z_2 looks like z_0.z_1 z_2 ... z_15
//
-// Fetch G_2 := (1/A_2) truncated to 21 sig. bits.
-// floating pt.
+// Fetch G_2 := (1/A_2) truncated to 21 sig. bits.
+// floating pt.
//
-// Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14)
-// = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14
-// This is accomplised by integer multiplication.
-// It is proved that X_2 indeed always begin
-// with 1.00000000 in fixed point.
+// Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14)
+// = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14
+// This is accomplished by integer multiplication.
+// It is proved that X_2 indeed always begins
+// with 1.00000000 in fixed point.
//
//
-// Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1.
-// This is 2^(-14) + X_2 truncated to lsb = 2^(-13).
+// Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1.
+// This is 2^(-14) + X_2 truncated to lsb = 2^(-13).
//
-// Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ].
+// Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ].
//
-// Fetch G_3 := (1/A_3) truncated to 21 sig. bits.
-// floating pt. Fetch is done using index_3.
+// Fetch G_3 := (1/A_3) truncated to 21 sig. bits.
+// floating pt. Fetch is done using index_3.
//
-// Compute G := G_1 * G_2 * G_3.
+// Compute G := G_1 * G_2 * G_3.
//
-// This is done exactly since each of G_j only has 21 sig. bits.
+// This is done exactly since each of G_j only has 21 sig. bits.
//
-// Compute
+// Compute
//
-// r := (G*S_hi - 1) + G*S_lo using 2 FMA operations.
+// r := (G*S_hi - 1) + G*S_lo using 2 FMA operations.
//
-// thus, r approximates G*(S_hi+S_lo) - 1 to within a couple of
-// rounding errors.
+// Thus r approximates G*(S_hi + S_lo) - 1 to within a couple of
+// rounding errors.
//
//
// Step 2. Approximation
@@ -326,1258 +274,878 @@
// reduced argument just obtained. It is proved that |r| <= 1.9*2^(-13);
// thus logl(1+r) can be approximated by a short polynomial:
//
-// logl(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5
+// logl(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5
//
//
// Step 3. Reconstruction
// ----------------------
//
-// This step computes the desired result of logl(X+E):
+// This step computes the desired result of logl(X+1):
//
-// logl(X+E) = logl( 2^N * (S_hi + S_lo) )
-// = N*logl(2) + logl( S_hi + S_lo )
-// = N*logl(2) + logl(1/G) +
-// logl(1 + C*(S_hi+S_lo) - 1 )
+// logl(X+1) = logl( 2^N * (S_hi + S_lo) )
+// = N*logl(2) + logl( S_hi + S_lo )
+// = N*logl(2) + logl(1/G) +
+// logl(1 + G * ( S_hi + S_lo ) - 1 )
//
// logl(2), logl(1/G_j) are stored as pairs of (single,double) numbers:
// log2_hi, log2_lo, log1byGj_hi, log1byGj_lo. The high parts are
// single-precision numbers and the low parts are double precision
// numbers. These have the property that
//
-// N*log2_hi + SUM ( log1byGj_hi )
+// N*log2_hi + SUM ( log1byGj_hi )
//
// is computable exactly in double-extended precision (64 sig. bits).
// Finally
//
-// Y_hi := N*log2_hi + SUM ( log1byGj_hi )
-// Y_lo := poly_hi + [ poly_lo +
-// ( SUM ( log1byGj_lo ) + N*log2_lo ) ]
-// set lsb(Y_lo) to be 1
+// Y_hi := N*log2_hi + SUM ( log1byGj_hi )
+// Y_lo := poly_hi + [ poly_lo +
+// ( SUM ( log1byGj_lo ) + N*log2_lo ) ]
//
-#include "libm_support.h"
+RODATA
+.align 64
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+// ************* DO NOT CHANGE THE ORDER OF THESE TABLES *************
-// P_7, P_6, P_5, P_4, P_3, P_2, and P_1
+// P_8, P_7, P_6, P_5, P_4, P_3, P_2, and P_1
+
+LOCAL_OBJECT_START(Constants_P)
+//data4 0xEFD62B15,0xE3936754,0x00003FFB,0x00000000
+//data4 0xA5E56381,0x8003B271,0x0000BFFC,0x00000000
+//data4 0x73282DB0,0x9249248C,0x00003FFC,0x00000000
+//data4 0x47305052,0xAAAAAA9F,0x0000BFFC,0x00000000
+//data4 0xCCD17FC9,0xCCCCCCCC,0x00003FFC,0x00000000
+//data4 0x00067ED5,0x80000000,0x0000BFFD,0x00000000
+//data4 0xAAAAAAAA,0xAAAAAAAA,0x00003FFD,0x00000000
+//data4 0xFFFFFFFE,0xFFFFFFFF,0x0000BFFD,0x00000000
+data8 0xE3936754EFD62B15,0x00003FFB
+data8 0x8003B271A5E56381,0x0000BFFC
+data8 0x9249248C73282DB0,0x00003FFC
+data8 0xAAAAAA9F47305052,0x0000BFFC
+data8 0xCCCCCCCCCCD17FC9,0x00003FFC
+data8 0x8000000000067ED5,0x0000BFFD
+data8 0xAAAAAAAAAAAAAAAA,0x00003FFD
+data8 0xFFFFFFFFFFFFFFFE,0x0000BFFD
+LOCAL_OBJECT_END(Constants_P)
-.align 64
-Constants_P:
-ASM_TYPE_DIRECTIVE(Constants_P,@object)
-data4 0xEFD62B15,0xE3936754,0x00003FFB,0x00000000
-data4 0xA5E56381,0x8003B271,0x0000BFFC,0x00000000
-data4 0x73282DB0,0x9249248C,0x00003FFC,0x00000000
-data4 0x47305052,0xAAAAAA9F,0x0000BFFC,0x00000000
-data4 0xCCD17FC9,0xCCCCCCCC,0x00003FFC,0x00000000
-data4 0x00067ED5,0x80000000,0x0000BFFD,0x00000000
-data4 0xAAAAAAAA,0xAAAAAAAA,0x00003FFD,0x00000000
-data4 0xFFFFFFFE,0xFFFFFFFF,0x0000BFFD,0x00000000
-ASM_SIZE_DIRECTIVE(Constants_P)
-
// log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1
-.align 64
-Constants_Q:
-ASM_TYPE_DIRECTIVE(Constants_Q,@object)
-data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
-data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
-data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
-data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
-data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
-data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
-ASM_SIZE_DIRECTIVE(Constants_Q)
-
-// Z1 - 16 bit fixed, G1 and H1 - IEEE single
-
-.align 64
-Constants_Z_G_H_h1:
-ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h1,@object)
-data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000
-data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000,0x617D741C,0x3DA163A6
-data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000,0xCBD3D5BB,0x3E2C55E6
-data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000,0xD86EA5E7,0xBE3EB0BF
-data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000,0x86B12760,0x3E2E6A8C
-data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000,0x5C0739BA,0x3E47574C
-data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000,0x13E8AF2F,0x3E20E30F
-data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000,0xF2C630BD,0xBE42885B
-data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000,0x97E577C6,0x3E497F34
-data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000,0xA6B0A5AB,0x3E3E6A6E
-data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000,0xD328D9BE,0xBDF43E3C
-data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000,0x0ADB090A,0x3E4094C3
-data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000,0xFC1FE510,0xBE28FBB2
-data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000,0x10FDE3FA,0x3E3A7895
-data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000,0x7CC8C98F,0x3E508CE5
-data4 0x00004211,0x3F042108,0x3F29516A,0x00000000,0xA223106C,0xBE534874
-ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h1)
-
-// Z2 - 16 bit fixed, G2 and H2 - IEEE single
-
-.align 64
-Constants_Z_G_H_h2:
-ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h2,@object)
-data4 0x00008000,0x3F800000,0x00000000,0x00000000,0x00000000,0x00000000
-data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000,0x22C42273,0x3DB5A116
-data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000,0x21F86ED3,0x3DE620CF
-data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000,0x484F34ED,0xBDAFA07E
-data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000,0x3860BCF6,0xBDFE07F0
-data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000,0xA78093D6,0x3DEA370F
-data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000,0x72A753D0,0x3DFF5791
-data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000,0xA7EF896B,0x3DFEBE6C
-data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000,0x409ECB43,0x3E0CF156
-data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000,0xFFEF71DF,0xBE0B6F97
-data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000,0x5D59EEE8,0xBE080483
-data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000,0xA9192A74,0x3E1F91E9
-data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000,0xBF72A8CD,0xBE139A06
-data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000,0xF8FBA6CF,0x3E1D9202
-data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000,0xBA796223,0xBE1DCCC4
-data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000,0xB6B7C239,0xBE049391
-ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h2)
-
-// G3 and H3 - IEEE single and h3 -IEEE double
-
-.align 64
-Constants_Z_G_H_h3:
-ASM_TYPE_DIRECTIVE(Constants_Z_G_H_h3,@object)
-data4 0x3F7FFC00,0x38800100,0x562224CD,0x3D355595
-data4 0x3F7FF400,0x39400480,0x06136FF6,0x3D8200A2
-data4 0x3F7FEC00,0x39A00640,0xE8DE9AF0,0x3DA4D68D
-data4 0x3F7FE400,0x39E00C41,0xB10238DC,0xBD8B4291
-data4 0x3F7FDC00,0x3A100A21,0x3B1952CA,0xBD89CCB8
-data4 0x3F7FD400,0x3A300F22,0x1DC46826,0xBDB10707
-data4 0x3F7FCC08,0x3A4FF51C,0xF43307DB,0x3DB6FCB9
-data4 0x3F7FC408,0x3A6FFC1D,0x62DC7872,0xBD9B7C47
-data4 0x3F7FBC10,0x3A87F20B,0x3F89154A,0xBDC3725E
-data4 0x3F7FB410,0x3A97F68B,0x62B9D392,0xBD93519D
-data4 0x3F7FAC18,0x3AA7EB86,0x0F21BD9D,0x3DC18441
-data4 0x3F7FA420,0x3AB7E101,0x2245E0A6,0xBDA64B95
-data4 0x3F7F9C20,0x3AC7E701,0xAABB34B8,0x3DB4B0EC
-data4 0x3F7F9428,0x3AD7DD7B,0x6DC40A7E,0x3D992337
-data4 0x3F7F8C30,0x3AE7D474,0x4F2083D3,0x3DC6E17B
-data4 0x3F7F8438,0x3AF7CBED,0x811D4394,0x3DAE314B
-data4 0x3F7F7C40,0x3B03E1F3,0xB08F2DB1,0xBDD46F21
-data4 0x3F7F7448,0x3B0BDE2F,0x6D34522B,0xBDDC30A4
-data4 0x3F7F6C50,0x3B13DAAA,0xB1F473DB,0x3DCB0070
-data4 0x3F7F6458,0x3B1BD766,0x6AD282FD,0xBDD65DDC
-data4 0x3F7F5C68,0x3B23CC5C,0xF153761A,0xBDCDAB83
-data4 0x3F7F5470,0x3B2BC997,0x341D0F8F,0xBDDADA40
-data4 0x3F7F4C78,0x3B33C711,0xEBC394E8,0x3DCD1BD7
-data4 0x3F7F4488,0x3B3BBCC6,0x52E3E695,0xBDC3532B
-data4 0x3F7F3C90,0x3B43BAC0,0xE846B3DE,0xBDA3961E
-data4 0x3F7F34A0,0x3B4BB0F4,0x785778D4,0xBDDADF06
-data4 0x3F7F2CA8,0x3B53AF6D,0xE55CE212,0x3DCC3ED1
-data4 0x3F7F24B8,0x3B5BA620,0x9E382C15,0xBDBA3103
-data4 0x3F7F1CC8,0x3B639D12,0x5C5AF197,0x3D635A0B
-data4 0x3F7F14D8,0x3B6B9444,0x71D34EFC,0xBDDCCB19
-data4 0x3F7F0CE0,0x3B7393BC,0x52CD7ADA,0x3DC74502
-data4 0x3F7F04F0,0x3B7B8B6D,0x7D7F2A42,0xBDB68F17
-ASM_SIZE_DIRECTIVE(Constants_Z_G_H_h3)
+LOCAL_OBJECT_START(Constants_Q) // 6 entries x 16 bytes, read sequentially with ldfe; log1pl reaches it as Constants_P + 0x80
+//data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
+//data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
+//data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
+//data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
+//data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
+//data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
+data8 0xB172180000000000,0x00003FFE // log2_hi, approx 0.693147 (ln 2 with the low 40 significand bits zero)
+data8 0x82E308654361C4C6,0x0000BFE2 // log2_lo, approx -1.9e-9 (negative correction so that hi + lo ~ ln 2)
+data8 0xCCCCCAF2328833CB,0x00003FFC // Q_4, approx +1/5
+data8 0x80000077A9D4BAFB,0x0000BFFD // Q_3, approx -1/4
+data8 0xAAAAAAAAAAABE3D2,0x00003FFD // Q_2, approx +1/3
+data8 0xFFFFFFFFFFFFDAB7,0x0000BFFD // Q_1, approx -1/2
+LOCAL_OBJECT_END(Constants_Q)
+
+// 1/ln10_hi, 1/ln10_lo
+
+LOCAL_OBJECT_START(Constants_1_by_LN10) // 2 entries x 16 bytes; log1pl computes its address as Constants_Q + 0x60
+//data4 0x37287195,0xDE5BD8A9,0x00003FFD,0x00000000
+//data4 0xACCF70C8,0xD56EAABE,0x00003FBB,0x00000000
+data8 0xDE5BD8A937287195,0x00003FFD // 1/ln10_hi, approx 0.434294
+data8 0xD56EAABEACCF70C8,0x00003FBB // 1/ln10_lo, positive correction of magnitude about 2^-68
+LOCAL_OBJECT_END(Constants_1_by_LN10)
+
+
+// Z1 - 16 bit fixed
-//
-// Exponent Thresholds and Tiny Thresholds
-// for 8, 11, 15, and 17 bit exponents
-//
-// Expo_Range Value
-//
-// 0 (8 bits) 2^(-126)
-// 1 (11 bits) 2^(-1022)
-// 2 (15 bits) 2^(-16382)
-// 3 (17 bits) 2^(-16382)
-//
-// Tiny_Table
-// ----------
-// Expo_Range Value
-//
-// 0 (8 bits) 2^(-16382)
-// 1 (11 bits) 2^(-16382)
-// 2 (15 bits) 2^(-16382)
-// 3 (17 bits) 2^(-16382)
-//
+LOCAL_OBJECT_START(Constants_Z_1) // 16 entries x 4 bytes; log1pl loads this label via @ltoff and derives every other table address from it by fixed offsets
+data4 0x00008000 // index 0; entry selected with shladd GR_Index1, 2 and read with ld4
+data4 0x00007879
+data4 0x000071C8
+data4 0x00006BCB
+data4 0x00006667
+data4 0x00006187
+data4 0x00005D18
+data4 0x0000590C
+data4 0x00005556
+data4 0x000051EC
+data4 0x00004EC5
+data4 0x00004BDB
+data4 0x00004925
+data4 0x0000469F
+data4 0x00004445
+data4 0x00004211 // index 15
+LOCAL_OBJECT_END(Constants_Z_1)
-.align 64
-Constants_Threshold:
-ASM_TYPE_DIRECTIVE(Constants_Threshold,@object)
-data4 0x00000000,0x80000000,0x00003F81,0x00000000
-data4 0x00000000,0x80000000,0x00000001,0x00000000
-data4 0x00000000,0x80000000,0x00003C01,0x00000000
-data4 0x00000000,0x80000000,0x00000001,0x00000000
-data4 0x00000000,0x80000000,0x00000001,0x00000000
-data4 0x00000000,0x80000000,0x00000001,0x00000000
-data4 0x00000000,0x80000000,0x00000001,0x00000000
-data4 0x00000000,0x80000000,0x00000001,0x00000000
-ASM_SIZE_DIRECTIVE(Constants_Threshold)
+// G1 and H1 - IEEE single and h1 - IEEE double
-.align 64
-Constants_1_by_LN10:
-ASM_TYPE_DIRECTIVE(Constants_1_by_LN10,@object)
-data4 0x37287195,0xDE5BD8A9,0x00003FFD,0x00000000
-data4 0xACCF70C8,0xD56EAABE,0x00003FBB,0x00000000
-ASM_SIZE_DIRECTIVE(Constants_1_by_LN10)
-
-FR_Input_X = f8
-FR_Neg_One = f9
-FR_E = f33
-FR_Em1 = f34
-FR_Y_hi = f34
-// Shared with Em1
-FR_Y_lo = f35
-FR_Scale = f36
-FR_X_Prime = f37
-FR_Z = f38
-FR_S_hi = f38
-// Shared with Z
-FR_W = f39
-FR_G = f40
-FR_wsq = f40
-// Shared with G
-FR_H = f41
-FR_w4 = f41
-// Shared with H
-FR_h = f42
-FR_w6 = f42
-// Shared with h
-FR_G_tmp = f43
-FR_poly_lo = f43
-// Shared with G_tmp
-FR_P8 = f43
-// Shared with G_tmp
-FR_H_tmp = f44
-FR_poly_hi = f44
- // Shared with H_tmp
-FR_P7 = f44
-// Shared with H_tmp
-FR_h_tmp = f45
-FR_rsq = f45
-// Shared with h_tmp
-FR_P6 = f45
-// Shared with h_tmp
-FR_abs_W = f46
-FR_r = f46
-// Shared with abs_W
-FR_AA = f47
-FR_log2_hi = f47
-// Shared with AA
-FR_BB = f48
-FR_log2_lo = f48
-// Shared with BB
-FR_S_lo = f49
-FR_two_negN = f50
-FR_float_N = f51
-FR_Q4 = f52
-FR_dummy = f52
-// Shared with Q4
-FR_P4 = f52
-// Shared with Q4
-FR_Threshold = f52
-// Shared with Q4
-FR_Q3 = f53
-FR_P3 = f53
-// Shared with Q3
-FR_Tiny = f53
-// Shared with Q3
-FR_Q2 = f54
-FR_P2 = f54
-// Shared with Q2
-FR_1LN10_hi = f54
-// Shared with Q2
-FR_Q1 = f55
-FR_P1 = f55
-// Shared with Q1
-FR_1LN10_lo = f55
-// Shared with Q1
-FR_P5 = f98
-FR_SCALE = f98
-FR_Output_X_tmp = f99
-
-GR_Expo_Range = r32
-GR_Table_Base = r34
-GR_Table_Base1 = r35
-GR_Table_ptr = r36
-GR_Index2 = r37
-GR_signif = r38
-GR_X_0 = r39
-GR_X_1 = r40
-GR_X_2 = r41
-GR_Z_1 = r42
-GR_Z_2 = r43
-GR_N = r44
-GR_Bias = r45
-GR_M = r46
-GR_ScaleN = r47
-GR_Index3 = r48
-GR_Perturb = r49
-GR_Table_Scale = r50
+LOCAL_OBJECT_START(Constants_G_H_h1) // 16 entries x 16 bytes at Constants_Z_1 + 0x40; entry selected with shladd GR_Index1, 4
+data4 0x3F800000,0x00000000 // entry 0: G_1 = 1.0f, H_1 = 0.0f (pair read with ldfps)
+data8 0x0000000000000000 // entry 0: h_1 = 0.0 (read with ldfd)
+data4 0x3F70F0F0,0x3D785196
+data8 0x3DA163A6617D741C
+data4 0x3F638E38,0x3DF13843
+data8 0x3E2C55E6CBD3D5BB
+data4 0x3F579430,0x3E2FF9A0
+data8 0xBE3EB0BFD86EA5E7
+data4 0x3F4CCCC8,0x3E647FD6
+data8 0x3E2E6A8C86B12760
+data4 0x3F430C30,0x3E8B3AE7
+data8 0x3E47574C5C0739BA
+data4 0x3F3A2E88,0x3EA30C68
+data8 0x3E20E30F13E8AF2F
+data4 0x3F321640,0x3EB9CEC8
+data8 0xBE42885BF2C630BD
+data4 0x3F2AAAA8,0x3ECF9927
+data8 0x3E497F3497E577C6
+data4 0x3F23D708,0x3EE47FC5
+data8 0x3E3E6A6EA6B0A5AB
+data4 0x3F1D89D8,0x3EF8947D
+data8 0xBDF43E3CD328D9BE
+data4 0x3F17B420,0x3F05F3A1
+data8 0x3E4094C30ADB090A
+data4 0x3F124920,0x3F0F4303
+data8 0xBE28FBB2FC1FE510
+data4 0x3F0D3DC8,0x3F183EBF
+data8 0x3E3A789510FDE3FA
+data4 0x3F088888,0x3F20EC80
+data8 0x3E508CE57CC8C98F
+data4 0x3F042108,0x3F29516A
+data8 0xBE534874A223106C
+LOCAL_OBJECT_END(Constants_G_H_h1)
-//
-// Added for unwind support
-//
+// Z2 - 16 bit fixed
+
+LOCAL_OBJECT_START(Constants_Z_2) // 16 entries x 4 bytes at Constants_Z_1 + 0x140; entry selected with shladd GR_Index2, 2
+data4 0x00008000 // index 0; read with ld4 into GR_Z_2
+data4 0x00007F81
+data4 0x00007F02
+data4 0x00007E85
+data4 0x00007E08
+data4 0x00007D8D
+data4 0x00007D12
+data4 0x00007C98
+data4 0x00007C20
+data4 0x00007BA8
+data4 0x00007B31
+data4 0x00007ABB
+data4 0x00007A45
+data4 0x000079D1
+data4 0x0000795D
+data4 0x000078EB // index 15
+LOCAL_OBJECT_END(Constants_Z_2)
+
+// G2 and H2 - IEEE single and h2 - IEEE double
+
+LOCAL_OBJECT_START(Constants_G_H_h2) // 16 entries x 16 bytes at Constants_Z_1 + 0x180; entry selected with shladd GR_Index2, 4
+data4 0x3F800000,0x00000000 // entry 0: G_2 = 1.0f, H_2 = 0.0f (pair read with ldfps)
+data8 0x0000000000000000 // entry 0: h_2 = 0.0 (read with ldfd)
+data4 0x3F7F00F8,0x3B7F875D
+data8 0x3DB5A11622C42273
+data4 0x3F7E03F8,0x3BFF015B
+data8 0x3DE620CF21F86ED3
+data4 0x3F7D08E0,0x3C3EE393
+data8 0xBDAFA07E484F34ED
+data4 0x3F7C0FC0,0x3C7E0586
+data8 0xBDFE07F03860BCF6
+data4 0x3F7B1880,0x3C9E75D2
+data8 0x3DEA370FA78093D6
+data4 0x3F7A2328,0x3CBDC97A
+data8 0x3DFF579172A753D0
+data4 0x3F792FB0,0x3CDCFE47
+data8 0x3DFEBE6CA7EF896B
+data4 0x3F783E08,0x3CFC15D0
+data8 0x3E0CF156409ECB43
+data4 0x3F774E38,0x3D0D874D
+data8 0xBE0B6F97FFEF71DF
+data4 0x3F766038,0x3D1CF49B
+data8 0xBE0804835D59EEE8
+data4 0x3F757400,0x3D2C531D
+data8 0x3E1F91E9A9192A74
+data4 0x3F748988,0x3D3BA322
+data8 0xBE139A06BF72A8CD
+data4 0x3F73A0D0,0x3D4AE46F
+data8 0x3E1D9202F8FBA6CF
+data4 0x3F72B9D0,0x3D5A1756
+data8 0xBE1DCCC4BA796223
+data4 0x3F71D488,0x3D693B9D
+data8 0xBE049391B6B7C239
+LOCAL_OBJECT_END(Constants_G_H_h2)
+
+// G3 and H3 - IEEE single and h3 - IEEE double
+
+LOCAL_OBJECT_START(Constants_G_H_h3) // 32 entries x 16 bytes at Constants_Z_1 + 0x280; entry selected with shladd GR_Index3, 4 (Index3 is a 5-bit field)
+data4 0x3F7FFC00,0x38800100 // entry 0: G_3, H_3 as IEEE singles (pair read with ldfps)
+data8 0x3D355595562224CD // entry 0: h_3 as IEEE double (read with ldfd)
+data4 0x3F7FF400,0x39400480
+data8 0x3D8200A206136FF6
+data4 0x3F7FEC00,0x39A00640
+data8 0x3DA4D68DE8DE9AF0
+data4 0x3F7FE400,0x39E00C41
+data8 0xBD8B4291B10238DC
+data4 0x3F7FDC00,0x3A100A21
+data8 0xBD89CCB83B1952CA
+data4 0x3F7FD400,0x3A300F22
+data8 0xBDB107071DC46826
+data4 0x3F7FCC08,0x3A4FF51C
+data8 0x3DB6FCB9F43307DB
+data4 0x3F7FC408,0x3A6FFC1D
+data8 0xBD9B7C4762DC7872
+data4 0x3F7FBC10,0x3A87F20B
+data8 0xBDC3725E3F89154A
+data4 0x3F7FB410,0x3A97F68B
+data8 0xBD93519D62B9D392
+data4 0x3F7FAC18,0x3AA7EB86
+data8 0x3DC184410F21BD9D
+data4 0x3F7FA420,0x3AB7E101
+data8 0xBDA64B952245E0A6
+data4 0x3F7F9C20,0x3AC7E701
+data8 0x3DB4B0ECAABB34B8
+data4 0x3F7F9428,0x3AD7DD7B
+data8 0x3D9923376DC40A7E
+data4 0x3F7F8C30,0x3AE7D474
+data8 0x3DC6E17B4F2083D3
+data4 0x3F7F8438,0x3AF7CBED
+data8 0x3DAE314B811D4394
+data4 0x3F7F7C40,0x3B03E1F3
+data8 0xBDD46F21B08F2DB1
+data4 0x3F7F7448,0x3B0BDE2F
+data8 0xBDDC30A46D34522B
+data4 0x3F7F6C50,0x3B13DAAA
+data8 0x3DCB0070B1F473DB
+data4 0x3F7F6458,0x3B1BD766
+data8 0xBDD65DDC6AD282FD
+data4 0x3F7F5C68,0x3B23CC5C
+data8 0xBDCDAB83F153761A
+data4 0x3F7F5470,0x3B2BC997
+data8 0xBDDADA40341D0F8F
+data4 0x3F7F4C78,0x3B33C711
+data8 0x3DCD1BD7EBC394E8
+data4 0x3F7F4488,0x3B3BBCC6
+data8 0xBDC3532B52E3E695
+data4 0x3F7F3C90,0x3B43BAC0
+data8 0xBDA3961EE846B3DE
+data4 0x3F7F34A0,0x3B4BB0F4
+data8 0xBDDADF06785778D4
+data4 0x3F7F2CA8,0x3B53AF6D
+data8 0x3DCC3ED1E55CE212
+data4 0x3F7F24B8,0x3B5BA620
+data8 0xBDBA31039E382C15
+data4 0x3F7F1CC8,0x3B639D12
+data8 0x3D635A0B5C5AF197
+data4 0x3F7F14D8,0x3B6B9444
+data8 0xBDDCCB1971D34EFC
+data4 0x3F7F0CE0,0x3B7393BC
+data8 0x3DC7450252CD7ADA
+data4 0x3F7F04F0,0x3B7B8B6D
+data8 0xBDB68F177D7F2A42
+LOCAL_OBJECT_END(Constants_G_H_h3)
-GR_SAVE_PFS = r51
-GR_SAVE_B0 = r52
-GR_SAVE_GP = r53
-GR_Parameter_X = r54
-GR_Parameter_Y = r55
-GR_Parameter_RESULT = r56
-GR_Parameter_TAG = r57
+
+// Floating Point Registers
+
+FR_Input_X = f8 // input argument x as received
+
+FR_Y_hi = f34
+FR_Y_lo = f35
+
+FR_Scale = f36
+FR_X_Prime = f37 // x after fnorm.s1
+FR_S_hi = f38 // significand of x+1 carrying the sign and exponent of 1.0 (fmerge.se f1, FR_Z)
+FR_W = f39 // w = x (formed as x - 0)
+FR_G = f40 // G_1 from table 1, later multiplied by G_2
+
+FR_H = f41 // H_1 from table 1, later summed with H_2
+FR_wsq = f42 // w * w (near1 path)
+FR_w4 = f43
+FR_h = f44 // h_1 from table 1
+FR_w6 = f45
+
+FR_G2 = f46
+FR_H2 = f47
+FR_poly_lo = f48
+FR_P8 = f49
+FR_poly_hi = f50
+
+FR_P7 = f51
+FR_h2 = f52
+FR_rsq = f53
+FR_P6 = f54
+FR_r = f55
+
+FR_log2_hi = f56
+FR_log2_lo = f57
+FR_p87 = f58
+FR_p876 = f58 // same register as FR_p87
+FR_p8765 = f58 // same register as FR_p87
+FR_float_N = f59 // integer N placed with setf.sig, then converted with fcvt.xf
+FR_Q4 = f60
+
+FR_p43 = f61
+FR_p432 = f61 // same register as FR_p43
+FR_p4321 = f61 // same register as FR_p43
+FR_P4 = f62
+FR_G3 = f63
+FR_H3 = f64
+FR_h3 = f65
+
+FR_Q3 = f66
+FR_P3 = f67
+FR_Q2 = f68
+FR_P2 = f69
+FR_1LN10_hi = f70
+
+FR_Q1 = f71
+FR_P1 = f72
+FR_1LN10_lo = f73
+FR_P5 = f74
+FR_rcub = f75
+
+FR_Output_X_tmp = f76
+FR_Neg_One = f77 // -1.0, built with fmerge.ns f1, f1
+FR_Z = f78 // x + 1
+FR_AA = f79 // max(x, 1.0), used to form S_lo
+FR_BB = f80 // min(x, 1.0), used to form S_lo
+FR_S_lo = f81 // AA - Z, then + BB on the regular path
+FR_2_to_minus_N = f82 // 2^(-N), built with setf.exp from GR_minus_N
FR_X = f8
FR_Y = f0
-FR_RESULT = f99
+FR_RESULT = f76
-.section .text
-.proc logl#
-.global logl#
-.align 64
-logl:
-#ifdef _LIBC
-.global __ieee754_logl
-__ieee754_logl:
-#endif
-{ .mfi
-alloc r32 = ar.pfs,0,22,4,0
-(p0) fnorm.s1 FR_X_Prime = FR_Input_X
-(p0) cmp.eq.unc p7, p0 = r0, r0
-}
-{ .mfi
-(p0) cmp.ne.unc p14, p0 = r0, r0
-(p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3
-(p0) cmp.ne.unc p15, p0 = r0, r0 ;;
-}
-{ .mfi
- nop.m 0
-(p0) fclass.nm.unc p10, p0 = FR_Input_X, 0x1FF
- nop.i 0
-}
-{ .mfi
-nop.m 999
-(p0) fcmp.eq.unc.s1 p8, p0 = FR_Input_X, f0
- nop.i 0
-}
-{ .mfi
- nop.m 999
-(p0) fcmp.lt.unc.s1 p13, p0 = FR_Input_X, f0
- nop.i 0
-}
-{ .mfi
- nop.m 999
-(p0) fcmp.eq.unc.s1 p9, p0 = FR_Input_X, f1
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p0) fsub.s1 FR_Em1 = f0,f1
- nop.i 999
-}
-{ .mfb
- nop.m 999
-(p0) fadd FR_E = f0,f0
-//
-// Create E = 0 and Em1 = -1
-// Check for X == 1, meaning logl(1)
-// Check for X < 0, meaning logl(negative)
-// Check for X == 0, meaning logl(0)
-// Identify NatVals, NaNs, Infs.
-// Identify EM unsupporteds.
-// Identify Negative values - us S1 so as
-// not to raise denormal operand exception
-// Set p15 to false for log
-// Set p14 to false for log
-// Set p7 true for log and log1p
-//
-(p0) br.cond.sptk L(LOGL_BEGIN) ;;
-}
-.endp logl
-ASM_SIZE_DIRECTIVE(logl)
+// General Purpose Registers
-.section .text
-.proc log10l#
-.global log10l#
-.align 64
-log10l:
-#ifdef _LIBC
-.global __ieee754_log10l
-__ieee754_log10l:
-#endif
-{ .mfi
-alloc r32 = ar.pfs,0,22,4,0
-(p0) fadd FR_E = f0,f0
- nop.i 0
-}
-{ .mfi
- nop.m 0
-(p0) fsub.s1 FR_Em1 = f0,f1
- nop.i 0
-}
-{ .mfi
-(p0) cmp.ne.unc p15, p0 = r0, r0
-(p0) fcmp.eq.unc.s1 p9, p0 = FR_Input_X, f1
- nop.i 0
-}
-{ .mfi
-(p0) cmp.eq.unc p14, p0 = r0, r0
-(p0) fcmp.lt.unc.s1 p13, p0 = FR_Input_X, f0
-(p0) cmp.ne.unc p7, p0 = r0, r0 ;;
-}
-{ .mfi
- nop.m 999
-(p0) fcmp.eq.unc.s1 p8, p0 = FR_Input_X, f0
- nop.i 999
-}
-{ .mfi
- nop.m 999
-(p0) fclass.nm.unc p10, p0 = FR_Input_X, 0x1FF
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3
- nop.i 999
-}
-{ .mfb
- nop.m 999
-(p0) fnorm.s1 FR_X_Prime = FR_Input_X
-//
-// Create E = 0 and Em1 = -1
-// Check for X == 1, meaning logl(1)
-// Check for X < 0, meaning logl(negative)
-// Check for X == 0, meaning logl(0)
-// Identify NatVals, NaNs, Infs.
-// Identify EM unsupporteds.
-// Identify Negative values - us S1 so as
-// Identify Negative values - us S1 so as
-// not to raise denormal operand exception
-// Set p15 to false for log10
-// Set p14 to true for log10
-// Set p7 to false for log10
-//
-(p0) br.cond.sptk L(LOGL_BEGIN) ;;
-}
+GR_ad_p = r33 // walking pointer into Constants_P (post-incremented by the ldfe loads)
+GR_Index1 = r34 // 4-bit field at bits 59-62 of the significand of x+1
+GR_Index2 = r35 // bits 6-9 of GR_X_1
+GR_signif = r36 // significand of x+1 (getf.sig FR_Z)
+GR_X_0 = r37 // top 15 bits (49-63) of the significand
+GR_X_1 = r38 // pmpyshr2.u of GR_X_0 and GR_Z_1, shift 15
+GR_X_2 = r39 // pmpyshr2.u of GR_X_1 and GR_Z_2, shift 15
+GR_minus_N = r39 // shares r39 with GR_X_2
+GR_Z_1 = r40
+GR_Z_2 = r41
+GR_N = r42 // exponent field of x+1, then minus GR_Bias
+GR_Bias = r43 // 0x0FFFF
+GR_M = r44 // sign+exponent of w, masked with GR_exp_mask
+GR_Index3 = r45 // bits 1-5 of GR_X_2
+GR_exp_2tom80 = r45 // shares r45 with GR_Index3
+GR_ad_p2 = r46 // second pointer into Constants_P, starting at P_4
+GR_exp_mask = r47 // 0x1FFFF
+GR_exp_2tom7 = r48 // 0x0fff8
+GR_ad_ln10 = r49
+GR_ad_tbl_1 = r50
+GR_ad_tbl_2 = r51
+GR_ad_tbl_3 = r52
+GR_ad_q = r53
+GR_ad_z_1 = r54
+GR_ad_z_2 = r55
+GR_ad_z_3 = r56
+GR_minus_N = r39 // NOTE(review): repeats the GR_minus_N = r39 assignment made earlier in this list; same value, so redundant
+
+//
+// Added for unwind support
+//
-.endp log10l
-ASM_SIZE_DIRECTIVE(log10l)
+GR_SAVE_PFS = r50 // same register as GR_ad_tbl_1
+GR_SAVE_B0 = r51 // same register as GR_ad_tbl_2
+GR_SAVE_GP = r52 // same register as GR_ad_tbl_3
+GR_Parameter_X = r53 // same register as GR_ad_q; r53-r56 are the 4 output registers of the log1pl alloc (0 in, 21 local, 4 out)
+GR_Parameter_Y = r54 // same register as GR_ad_z_1
+GR_Parameter_RESULT = r55 // same register as GR_ad_z_2
+GR_Parameter_TAG = r56 // same register as GR_ad_z_3
.section .text
-.proc log1pl#
-.global log1pl#
-.align 64
-log1pl:
-#ifdef _LIBC
-.global __log1pl
-__log1pl:
-#endif
+GLOBAL_IEEE754_ENTRY(log1pl)
{ .mfi
-alloc r32 = ar.pfs,0,22,4,0
-(p0) fsub.s1 FR_Neg_One = f0,f1
-(p0) cmp.eq.unc p7, p0 = r0, r0
-}
-{ .mfi
-(p0) cmp.ne.unc p14, p0 = r0, r0
-(p0) fnorm.s1 FR_X_Prime = FR_Input_X
-(p0) cmp.eq.unc p15, p0 = r0, r0 ;;
+ alloc r32 = ar.pfs,0,21,4,0
+ fclass.m p6, p0 = FR_Input_X, 0x1E3 // Test for natval, nan, inf
+ nop.i 999
}
{ .mfi
- nop.m 0
-(p0) fclass.m.unc p6, p0 = FR_Input_X, 0x1E3
- nop.i 0
+ addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp
+ fma.s1 FR_Z = FR_Input_X, f1, f1 // x+1
+ nop.i 999
}
+;;
+
{ .mfi
nop.m 999
-(p0) fclass.nm.unc p10, p0 = FR_Input_X, 0x1FF
- nop.i 0
+ fmerge.ns FR_Neg_One = f1, f1 // Form -1.0
+ nop.i 999
}
{ .mfi
nop.m 999
-(p0) fcmp.eq.unc.s1 p9, p0 = FR_Input_X, f0
- nop.i 0
+ fnorm.s1 FR_X_Prime = FR_Input_X // Normalize x
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fadd FR_Em1 = f0,f0
- nop.i 999 ;;
+ ld8 GR_ad_z_1 = [GR_ad_z_1] // Get pointer to Constants_Z_1
+ nop.f 999
+ mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7
}
-{ .mfi
- nop.m 999
-(p0) fadd FR_E = f0,f1
- nop.i 999 ;;
+;;
+
+{ .mfb
+ getf.sig GR_signif = FR_Z // Get significand of x+1
+ fcmp.eq.s1 p9, p0 = FR_Input_X, f0 // Test for x=0
+(p6) br.cond.spnt LOG1P_special // Branch for nan, inf, natval
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fcmp.eq.unc.s1 p8, p0 = FR_Input_X, FR_Neg_One
- nop.i 999
+ add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1
+ fcmp.lt.s1 p13, p0 = FR_X_Prime, FR_Neg_One // Test for x<-1
+ add GR_ad_p = -0x100, GR_ad_z_1 // Point to Constants_P
}
{ .mfi
- nop.m 999
-(p0) fcmp.lt.unc.s1 p13, p0 = FR_Input_X, FR_Neg_One
- nop.i 999
+ add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
+ nop.f 999
+ add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2
}
-L(LOGL_BEGIN):
+;;
+
{ .mfi
- nop.m 999
-(p0) fadd.s1 FR_Z = FR_X_Prime, FR_E
- nop.i 999
-}
-{ .mlx
- nop.m 999
-(p0) movl GR_Table_Scale = 0x0000000000000018 ;;
+ add GR_ad_q = 0x080, GR_ad_p // Point to Constants_Q
+ fcmp.eq.s1 p8, p0 = FR_X_Prime, FR_Neg_One // Test for x=-1
+ extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif
}
-{ .mmi
- nop.m 999
- nop.m 999
-//
-// Create E = 1 and Em1 = 0
-// Check for X == 0, meaning logl(1+0)
-// Check for X < -1, meaning logl(negative)
-// Check for X == -1, meaning logl(0)
-// Normalize x
-// Identify NatVals, NaNs, Infs.
-// Identify EM unsupporteds.
-// Identify Negative values - us S1 so as
-// not to raise denormal operand exception
-// Set p15 to true for log1p
-// Set p14 to false for log1p
-// Set p7 true for log and log1p
-//
-(p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h1#),gp
+{ .mfb
+ add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3
+ nop.f 999
+(p9) br.ret.spnt b0 // Exit if x=0, return input
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fmax.s1 FR_AA = FR_X_Prime, FR_E
- nop.i 999 ;;
+ shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
+ fclass.nm p10, p0 = FR_Input_X, 0x1FF // Test for unsupported
+ extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of significand
}
{ .mfi
- ld8 GR_Table_Base = [GR_Table_Base]
-(p0) fmin.s1 FR_BB = FR_X_Prime, FR_E
- nop.i 999
-}
-{ .mfb
- nop.m 999
-(p0) fadd.s1 FR_W = FR_X_Prime, FR_Em1
-//
-// Begin load of constants base
-// FR_Z = Z = |x| + E
-// FR_W = W = |x| + Em1
-// AA = fmax(|x|,E)
-// BB = fmin(|x|,E)
-//
-(p6) br.cond.spnt L(LOGL_64_special) ;;
+ ldfe FR_P8 = [GR_ad_p],16 // Load P_8 for near1 path
+ fsub.s1 FR_W = FR_X_Prime, f0 // W = x
+ add GR_ad_ln10 = 0x060, GR_ad_q // Point to Constants_1_by_LN10
}
-{ .mib
- nop.m 999
- nop.i 999
-(p10) br.cond.spnt L(LOGL_64_unsupported) ;;
+;;
+
+{ .mfi
+ ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
+ fmax.s1 FR_AA = FR_X_Prime, f1 // For S_lo, form AA = max(X,1.0)
+ mov GR_exp_mask = 0x1FFFF // Create exponent mask
}
{ .mib
- nop.m 999
- nop.i 999
-(p13) br.cond.spnt L(LOGL_64_negative) ;;
+ shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1
+ mov GR_Bias = 0x0FFFF // Create exponent bias
+(p13) br.cond.spnt LOG1P_LT_Minus_1 // Branch if x<-1
}
-{ .mib
-(p0) getf.sig GR_signif = FR_Z
- nop.i 999
-(p9) br.cond.spnt L(LOGL_64_one) ;;
+;;
+
+{ .mfb
+ ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
+ fmerge.se FR_S_hi = f1,FR_Z // Form |x+1|
+(p8) br.cond.spnt LOG1P_EQ_Minus_1 // Branch if x=-1
}
-{ .mib
- nop.m 999
- nop.i 999
-(p8) br.cond.spnt L(LOGL_64_zero) ;;
+;;
+
+{ .mmb
+ getf.exp GR_N = FR_Z // Get N = exponent of x+1
+ ldfd FR_h = [GR_ad_tbl_1] // Load h_1
+(p10) br.cond.spnt LOG1P_unsupported // Branch for unsupported type
}
+;;
+
{ .mfi
-(p0) getf.exp GR_N = FR_Z
-//
-// Raise possible denormal operand exception
-// Create Bias
-//
-// This function computes ln( x + e )
-// Input FR 1: FR_X = FR_Input_X
-// Input FR 2: FR_E = FR_E
-// Input FR 3: FR_Em1 = FR_Em1
-// Input GR 1: GR_Expo_Range = GR_Expo_Range = 1
-// Output FR 4: FR_Y_hi
-// Output FR 5: FR_Y_lo
-// Output FR 6: FR_Scale
-// Output PR 7: PR_Safe
-//
-(p0) fsub.s1 FR_S_lo = FR_AA, FR_Z
+ ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
+ fcmp.eq.s0 p8, p0 = FR_Input_X, f0 // Dummy op to flag denormals
+ pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1
+}
+;;
+
//
-// signif = getf.sig(Z)
-// abs_W = fabs(w)
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
//
-(p0) extr.u GR_Table_ptr = GR_signif, 59, 4 ;;
-}
-{ .mfi
- nop.m 999
-(p0) fmerge.se FR_S_hi = f1,FR_Z
-(p0) extr.u GR_X_0 = GR_signif, 49, 15
-}
{ .mmi
- nop.m 999
- nop.m 999
-(p0) addl GR_Table_Base1 = @ltoff(Constants_Z_G_H_h2#),gp ;;
-}
-{ .mlx
- ld8 GR_Table_Base1 = [GR_Table_Base1]
-(p0) movl GR_Bias = 0x000000000000FFFF ;;
-}
-{ .mfi
- nop.m 999
-(p0) fabs FR_abs_W = FR_W
-(p0) pmpyshr2.u GR_Table_ptr = GR_Table_ptr,GR_Table_Scale,0
-}
-{ .mfi
- nop.m 999
-//
-// Branch out for special input values
-//
-(p0) fcmp.lt.unc.s0 p8, p0 = FR_Input_X, f0
- nop.i 999 ;;
+ ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
+ sub GR_N = GR_N, GR_Bias
+ mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80
}
+;;
+
{ .mfi
- nop.m 999
-//
-// X_0 = extr.u(signif,49,15)
-// Index1 = extr.u(signif,59,4)
-//
-(p0) fadd.s1 FR_S_lo = FR_S_lo, FR_BB
- nop.i 999 ;;
+ ldfe FR_Q4 = [GR_ad_q],16 // Load Q4
+ fms.s1 FR_S_lo = FR_AA, f1, FR_Z // Form S_lo = AA - Z
+ sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N)
}
-{ .mii
- nop.m 999
- nop.i 999 ;;
-//
-// Offset_to_Z1 = 24 * Index1
-// For performance, don't use result
-// for 3 or 4 cycles.
-//
-(p0) add GR_Table_ptr = GR_Table_ptr, GR_Table_Base ;;
+;;
+
+{ .mmf
+ ldfe FR_Q3 = [GR_ad_q],16 // Load Q3
+ setf.sig FR_float_N = GR_N // Put integer N into rightmost significand
+ fmin.s1 FR_BB = FR_X_Prime, f1 // For S_lo, form BB = min(X,1.0)
}
-//
-// Add Base to Offset for Z1
-// Create Bias
+;;
+
{ .mmi
-(p0) ld4 GR_Z_1 = [GR_Table_ptr],4 ;;
-(p0) ldfs FR_G = [GR_Table_ptr],4
- nop.i 999 ;;
+ getf.exp GR_M = FR_W // Get signexp of w = x
+ ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
}
+;;
+
{ .mmi
-(p0) ldfs FR_H = [GR_Table_ptr],8 ;;
-(p0) ldfd FR_h = [GR_Table_ptr],0
-(p0) pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15
-}
-//
-// Load Z_1
-// Get Base of Table2
-//
-{ .mfi
-(p0) getf.exp GR_M = FR_abs_W
- nop.f 999
- nop.i 999 ;;
-}
-{ .mii
- nop.m 999
- nop.i 999 ;;
-//
-// M = getf.exp(abs_W)
-// S_lo = AA - Z
-// X_1 = pmpyshr2(X_0,Z_1,15)
-//
-(p0) sub GR_M = GR_M, GR_Bias ;;
-}
-//
-// M = M - Bias
-// Load G1
-// N = getf.exp(Z)
-//
-{ .mii
-(p0) cmp.gt.unc p11, p0 = -80, GR_M
-(p0) cmp.gt.unc p12, p0 = -7, GR_M ;;
-(p0) extr.u GR_Index2 = GR_X_1, 6, 4 ;;
-}
-{ .mib
- nop.m 999
-//
-// if -80 > M, set p11
-// Index2 = extr.u(X_1,6,4)
-// if -7 > M, set p12
-// Load H1
-//
-(p0) pmpyshr2.u GR_Index2 = GR_Index2,GR_Table_Scale,0
-(p11) br.cond.spnt L(log1pl_small) ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p12) br.cond.spnt L(log1pl_near) ;;
-}
-{ .mii
-(p0) sub GR_N = GR_N, GR_Bias
-//
-// poly_lo = r * poly_lo
-//
-(p0) add GR_Perturb = 0x1, r0 ;;
-(p0) sub GR_ScaleN = GR_Bias, GR_N
-}
-{ .mii
-(p0) setf.sig FR_float_N = GR_N
- nop.i 999 ;;
-//
-// Prepare Index2 - pmpyshr2.u(X_1,Z_2,15)
-// Load h1
-// S_lo = S_lo + BB
-// Branch for -80 > M
-//
-(p0) add GR_Index2 = GR_Index2, GR_Table_Base1
+ ldfe FR_Q1 = [GR_ad_q] // Load Q1
+ shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
+ add GR_ad_p2 = 0x30,GR_ad_p // Point to P_4
}
+;;
+
{ .mmi
-(p0) setf.exp FR_two_negN = GR_ScaleN
- nop.m 999
-(p0) addl GR_Table_Base = @ltoff(Constants_Z_G_H_h3#),gp ;;
+ ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
+ shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2
+ and GR_M = GR_exp_mask, GR_M // Get exponent of w = x
}
-//
-// Index2 points to Z2
-// Branch for -7 > M
-//
-{ .mmb
-(p0) ld4 GR_Z_2 = [GR_Index2],4
-(p0) ld8 GR_Table_Base = [GR_Table_Base]
- nop.b 999 ;;
-}
-(p0) nop.i 999
-//
-// Load Z_2
-// N = N - Bias
-// Tablebase points to Table3
-//
+;;
+
{ .mmi
-(p0) ldfs FR_G_tmp = [GR_Index2],4 ;;
-//
-// Load G_2
-// pmpyshr2 X_2= (X_1,Z_2,15)
-// float_N = setf.sig(N)
-// ScaleN = Bias - N
-//
-(p0) ldfs FR_H_tmp = [GR_Index2],8
- nop.i 999 ;;
+ ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
+ cmp.lt p8, p9 = GR_M, GR_exp_2tom7 // Test |x| < 2^-7
+ cmp.lt p7, p0 = GR_M, GR_exp_2tom80 // Test |x| < 2^-80
}
-//
-// Load H_2
-// two_negN = setf.exp(scaleN)
-// G = G_1 * G_2
-//
+;;
+
+// Small path is separate code
+// p7 is for the small path: |x| < 2^-80
+// near1 and regular paths are merged.
+// p8 is for the near1 path: |x| < 2^-7
+// p9 is for regular path: |x| >= 2^-7
+
{ .mfi
-(p0) ldfd FR_h_tmp = [GR_Index2],0
- nop.f 999
-(p0) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 ;;
+ ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
+ nop.f 999
+ nop.i 999
}
-{ .mii
- nop.m 999
-(p0) extr.u GR_Index3 = GR_X_2, 1, 5 ;;
-//
-// Load h_2
-// H = H_1 + H_2
-// h = h_1 + h_2
-// Index3 = extr.u(X_2,1,5)
-//
-(p0) shladd GR_Index3 = GR_Index3,4,GR_Table_Base
+{ .mfb
+(p9) setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N)
+(p7) fnma.s0 f8 = FR_X_Prime, FR_X_Prime, FR_X_Prime // Result x - x*x
+(p7) br.ret.spnt b0 // Branch if |x| < 2^-80
}
+;;
+
{ .mmi
- nop.m 999
- nop.m 999
-//
-// float_N = fcvt.xf(float_N)
-// load G3
-//
-(p0) addl GR_Table_Base = @ltoff(Constants_Q#),gp ;;
+(p8) ldfe FR_P7 = [GR_ad_p],16 // Load P_7 for near1 path
+(p8) ldfe FR_P4 = [GR_ad_p2],16 // Load P_4 for near1 path
+(p9) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
}
-{ .mmi
- nop.m 999
- ld8 GR_Table_Base = [GR_Table_Base]
- nop.i 999
-};;
+;;
-{ .mfi
-(p0) ldfe FR_log2_hi = [GR_Table_Base],16
-(p0) fmpy.s1 FR_S_lo = FR_S_lo, FR_two_negN
- nop.i 999 ;;
-}
-{ .mmf
- nop.m 999
//
-// G = G3 * G
-// Load h3
-// Load log2_hi
-// H = H + H3
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
//
-(p0) ldfe FR_log2_lo = [GR_Table_Base],16
-(p0) fmpy.s1 FR_G = FR_G, FR_G_tmp ;;
-}
{ .mmf
-(p0) ldfs FR_G_tmp = [GR_Index3],4
-//
-// h = h + h3
-// r = G * S_hi + 1
-// Load log2_lo
-//
-(p0) ldfe FR_Q4 = [GR_Table_Base],16
-(p0) fadd.s1 FR_h = FR_h, FR_h_tmp ;;
-}
-{ .mfi
-(p0) ldfe FR_Q3 = [GR_Table_Base],16
-(p0) fadd.s1 FR_H = FR_H, FR_H_tmp
- nop.i 999 ;;
+(p8) ldfe FR_P6 = [GR_ad_p],16 // Load P_6 for near1 path
+(p8) ldfe FR_P3 = [GR_ad_p2],16 // Load P_3 for near1 path
+(p9) fma.s1 FR_S_lo = FR_S_lo, f1, FR_BB // S_lo = S_lo + BB
}
+;;
+
{ .mmf
-(p0) ldfs FR_H_tmp = [GR_Index3],4
-(p0) ldfe FR_Q2 = [GR_Table_Base],16
-//
-// Comput Index for Table3
-// S_lo = S_lo * two_negN
-//
-(p0) fcvt.xf FR_float_N = FR_float_N ;;
+(p8) ldfe FR_P5 = [GR_ad_p],16 // Load P_5 for near1 path
+(p8) ldfe FR_P2 = [GR_ad_p2],16 // Load P_2 for near1 path
+(p8) fmpy.s1 FR_wsq = FR_W, FR_W // wsq = w * w for near1 path
}
-//
-// If S_lo == 0, set p8 false
-// Load H3
-// Load ptr to table of polynomial coeff.
-//
-{ .mmf
-(p0) ldfd FR_h_tmp = [GR_Index3],0
-(p0) ldfe FR_Q1 = [GR_Table_Base],0
-(p0) fcmp.eq.unc.s1 p0, p8 = FR_S_lo, f0 ;;
+;;
+
+{ .mmi
+(p8) ldfe FR_P1 = [GR_ad_p2],16 ;; // Load P_1 for near1 path
+ nop.m 999
+(p9) extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fmpy.s1 FR_G = FR_G, FR_G_tmp
- nop.i 999 ;;
+(p9) shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3
+(p9) fcvt.xf FR_float_N = FR_float_N
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fadd.s1 FR_H = FR_H, FR_H_tmp
- nop.i 999 ;;
+(p9) ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
+ nop.f 999
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fms.s1 FR_r = FR_G, FR_S_hi, f1
- nop.i 999
+(p9) ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
+(p9) fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fadd.s1 FR_h = FR_h, FR_h_tmp
- nop.i 999 ;;
+ nop.m 999
+(p9) fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
+ nop.i 999
}
-{ .mfi
- nop.m 999
-(p0) fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H
- nop.i 999 ;;
+;;
+
+{ .mmf
+ nop.m 999
+ nop.m 999
+(p9) fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Load Q4
-// Load Q3
-// Load Q2
-// Load Q1
-//
-(p8) fma.s1 FR_r = FR_G, FR_S_lo, FR_r
- nop.i 999
+ nop.m 999
+(p8) fmpy.s1 FR_w4 = FR_wsq, FR_wsq // w4 = w^4 for near1 path
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// poly_lo = r * Q4 + Q3
-// rsq = r* r
-//
-(p0) fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h
- nop.i 999 ;;
+ nop.m 999
+(p8) fma.s1 FR_p87 = FR_W, FR_P8, FR_P7 // p87 = w * P8 + P7
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// If (S_lo!=0) r = s_lo * G + r
-//
-(p0) fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
- nop.i 999
+ nop.m 999
+(p9) fma.s1 FR_S_lo = FR_S_lo, FR_2_to_minus_N, f0 // S_lo = S_lo * 2^(-N)
+ nop.i 999
}
-//
-// Create a 0x00000....01
-// poly_lo = poly_lo * rsq + h
-//
{ .mfi
-(p0) setf.sig FR_dummy = GR_Perturb
-(p0) fmpy.s1 FR_rsq = FR_r, FR_r
- nop.i 999 ;;
+ nop.m 999
+(p8) fma.s1 FR_p43 = FR_W, FR_P4, FR_P3 // p43 = w * P4 + P3
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// h = N * log2_lo + h
-// Y_hi = n * log2_hi + H
-//
-(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
- nop.i 999
+ nop.m 999
+(p9) fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
- nop.i 999 ;;
+ nop.m 999
+(p9) fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// poly_lo = r * poly_o + Q2
-// poly_hi = Q1 * rsq + r
-//
-(p0) fmpy.s1 FR_poly_lo = FR_poly_lo, FR_r
- nop.i 999 ;;
+ nop.m 999
+(p9) fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_rsq, FR_h
- nop.i 999 ;;
-}
-{ .mfb
- nop.m 999
-(p0) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo
-//
-// Create the FR for a binary "or"
-// Y_lo = poly_hi + poly_lo
-//
-// (p0) for FR_dummy = FR_Y_lo,FR_dummy ;;
-//
-// Turn the lsb of Y_lo ON
-//
-// (p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_dummy ;;
-//
-// Merge the new lsb into Y_lo, for alone doesn't
-//
-(p0) br.cond.sptk LOGL_main ;;
-}
-L(log1pl_near):
-{ .mmi
- nop.m 999
- nop.m 999
-// /*******************************************************/
-// /*********** Branch log1pl_near ************************/
-// /*******************************************************/
-(p0) addl GR_Table_Base = @ltoff(Constants_P#),gp ;;
-}
-{ .mmi
nop.m 999
- ld8 GR_Table_Base = [GR_Table_Base]
+(p8) fmpy.s1 FR_w6 = FR_w4, FR_wsq // w6 = w^6 for near1 path
nop.i 999
-};;
-//
-// Load base address of poly. coeff.
-//
-{ .mmb
-(p0) add GR_Table_ptr = 0x40,GR_Table_Base
-//
-// Address tables with separate pointers
-//
-(p0) ldfe FR_P8 = [GR_Table_Base],16
- nop.b 999 ;;
-}
-{ .mmb
-(p0) ldfe FR_P4 = [GR_Table_ptr],16
-//
-// Load P4
-// Load P8
-//
-(p0) ldfe FR_P7 = [GR_Table_Base],16
- nop.b 999 ;;
-}
-{ .mmf
-(p0) ldfe FR_P3 = [GR_Table_ptr],16
-//
-// Load P3
-// Load P7
-//
-(p0) ldfe FR_P6 = [GR_Table_Base],16
-(p0) fmpy.s1 FR_wsq = FR_W, FR_W ;;
}
+;;
+
{ .mfi
-(p0) ldfe FR_P2 = [GR_Table_ptr],16
- nop.f 999
- nop.i 999 ;;
+ nop.m 999
+(p8) fma.s1 FR_p432 = FR_W, FR_p43, FR_P2 // p432 = w * p43 + P2
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_Y_hi = FR_W, FR_P4, FR_P3
- nop.i 999
+ nop.m 999
+(p8) fma.s1 FR_p876 = FR_W, FR_p87, FR_P6 // p876 = w * p87 + P6
+ nop.i 999
}
-//
-// Load P2
-// Load P6
-// Wsq = w * w
-// Y_hi = p4 * w + p3
-//
+;;
+
{ .mfi
-(p0) ldfe FR_P5 = [GR_Table_Base],16
-(p0) fma.s1 FR_Y_lo = FR_W, FR_P8, FR_P7
- nop.i 999 ;;
+ nop.m 999
+(p9) fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1
+ nop.i 999
}
{ .mfi
-(p0) ldfe FR_P1 = [GR_Table_ptr],16
-//
-// Load P1
-// Load P5
-// Y_lo = p8 * w + P7
-//
-(p0) fmpy.s1 FR_w4 = FR_wsq, FR_wsq
- nop.i 999 ;;
+ nop.m 999
+(p9) fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi = N * log2_hi + H
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P2
- nop.i 999
+ nop.m 999
+(p9) fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h = N * log2_lo + h
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P6
-(p0) add GR_Perturb = 0x1, r0 ;;
+ nop.m 999
+(p9) fma.s1 FR_r = FR_G, FR_S_lo, FR_r // r = G * S_lo + (G * S_hi - 1)
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// w4 = w2 * w2
-// Y_hi = y_hi * w + p2
-// Y_lo = y_lo * w + p6
-// Create perturbation bit
-//
-(p0) fmpy.s1 FR_w6 = FR_w4, FR_wsq
- nop.i 999 ;;
+ nop.m 999
+(p8) fma.s1 FR_p4321 = FR_W, FR_p432, FR_P1 // p4321 = w * p432 + P1
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_Y_hi = FR_W, FR_Y_hi, FR_P1
- nop.i 999
+ nop.m 999
+(p8) fma.s1 FR_p8765 = FR_W, FR_p876, FR_P5 // p8765 = w * p876 + P5
+ nop.i 999
}
-//
-// Y_hi = y_hi * w + p1
-// w6 = w4 * w2
-//
+;;
+
{ .mfi
-(p0) setf.sig FR_Q4 = GR_Perturb
-(p0) fma.s1 FR_Y_lo = FR_W, FR_Y_lo, FR_P5
- nop.i 999 ;;
+ nop.m 999
+(p9) fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_dummy = FR_wsq,FR_Y_hi, f0
- nop.i 999
+ nop.m 999
+(p9) fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fma.s1 FR_Y_hi = FR_W,f1,f0
- nop.i 999
-};;
-{ .mfb
- nop.m 999
-//
-// Y_hi = w
-// Y_lo = y_lo * w + p5
-//
-(p0) fma.s1 FR_Y_lo = FR_w6, FR_Y_lo,FR_dummy
-//
-// Y_lo = y_lo * w6 + y_high order part.
-//
-// performance
-//
-(p0) br.cond.sptk LOGL_main ;;
-}
-L(log1pl_small):
-{ .mmi
- nop.m 999
-// /*******************************************************/
-// /*********** Branch log1pl_small ***********************/
-// /*******************************************************/
-(p0) addl GR_Table_Base = @ltoff(Constants_Threshold#),gp
+ nop.m 999
+(p8) fma.s1 FR_Y_lo = FR_wsq, FR_p4321, f0 // Y_lo = wsq * p4321
+ nop.i 999
}
{ .mfi
nop.m 999
-(p0) mov FR_Em1 = FR_W
-(p0) cmp.eq.unc p7, p0 = r0, r0 ;;
-}
-{ .mlx
- ld8 GR_Table_Base = [GR_Table_Base]
-(p0) movl GR_Expo_Range = 0x0000000000000004 ;;
-}
-//
-// Set Safe to true
-// Set Expo_Range = 0 for single
-// Set Expo_Range = 2 for double
-// Set Expo_Range = 4 for double-extended
-//
-{ .mmi
-(p0) shladd GR_Table_Base = GR_Expo_Range,4,GR_Table_Base ;;
-(p0) ldfe FR_Threshold = [GR_Table_Base],16
- nop.i 999
-}
-{ .mlx
- nop.m 999
-(p0) movl GR_Bias = 0x000000000000FF9B ;;
+(p8) fma.s1 FR_Y_hi = FR_W, f1, f0 // Y_hi = w for near1 path
+ nop.i 999
}
+;;
+
{ .mfi
-(p0) ldfe FR_Tiny = [GR_Table_Base],0
- nop.f 999
- nop.i 999 ;;
+ nop.m 999
+(p9) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo = poly_lo * r + Q2
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fcmp.gt.unc.s1 p13, p12 = FR_abs_W, FR_Threshold
- nop.i 999 ;;
+ nop.m 999
+(p9) fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p13) fnmpy.s1 FR_Y_lo = FR_W, FR_W
- nop.i 999
+ nop.m 999
+(p8) fma.s1 FR_Y_lo = FR_w6, FR_p8765,FR_Y_lo // Y_lo = w6 * p8765 + w2 * p4321
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p13) fadd FR_SCALE = f0, f1
- nop.i 999 ;;
+ nop.m 999
+(p9) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1 * rsq + r
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p12) fsub.s1 FR_Y_lo = f0, FR_Tiny
-(p12) cmp.ne.unc p7, p0 = r0, r0
+ nop.m 999
+(p9) fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h // poly_lo = poly_lo*r^3 + h
+ nop.i 999
}
+;;
+
{ .mfi
-(p12) setf.exp FR_SCALE = GR_Bias
- nop.f 999
- nop.i 999 ;;
-}
-{ .mfb
- nop.m 999
-//
-// Set p7 to SAFE = FALSE
-// Set Scale = 2^-100
-//
-(p0) fma.s0 f8 = FR_Y_lo,FR_SCALE,FR_Y_hi
-(p0) br.ret.sptk b0 ;;
+ nop.m 999
+(p9) fadd.s1 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo = poly_hi + poly_lo
+ nop.i 999
}
-L(LOGL_64_one):
+;;
+
+// Remainder of code is common for near1 and regular paths
{ .mfb
- nop.m 999
-(p0) fmpy.s0 f8 = FR_Input_X, f0
-(p0) br.ret.sptk b0 ;;
+ nop.m 999
+ fadd.s0 f8 = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi
+ br.ret.sptk b0 // Common exit for near1 and regular paths, |x| >= 2^-80
}
-//
-// Raise divide by zero for +/-0 input.
-//
-L(LOGL_64_zero):
-{ .mfi
-(p0) mov GR_Parameter_TAG = 0
+;;
+
+
+// Here if x=-1
+LOG1P_EQ_Minus_1:
//
-// If we have logl(1), log10l(1) or log1pl(0), return 0.
+// If x=-1 raise divide by zero and return -inf
//
-(p0) fsub.s0 FR_Output_X_tmp = f0, f1
- nop.i 999 ;;
-}
-{ .mii
-(p14) mov GR_Parameter_TAG = 6
- nop.i 999 ;;
-(p15) mov GR_Parameter_TAG = 138 ;;
-}
-{ .mfb
- nop.m 999
-(p0) frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0
-(p0) br.cond.sptk __libm_error_region ;;
+{ .mfi
+ mov GR_Parameter_TAG = 138
+ fsub.s1 FR_Output_X_tmp = f0, f1
+ nop.i 999
}
+;;
+
{ .mfb
- nop.m 999
-//
-// Report that logl(0) computed
-// { .mfb
-(p0) mov FR_Input_X = FR_Output_X_tmp
-(p0) br.ret.sptk b0 ;;
+ nop.m 999
+ frcpa.s0 FR_Output_X_tmp, p8 = FR_Output_X_tmp, f0
+ br.cond.sptk __libm_error_region
}
+;;
-L(LOGL_64_special):
+LOG1P_special:
{ .mfi
- nop.m 999
-//
-// Return -Inf or value from handler.
-//
-(p0) fclass.m.unc p7, p0 = FR_Input_X, 0x1E1
- nop.i 999 ;;
+ nop.m 999
+ fclass.m.unc p8, p0 = FR_Input_X, 0x1E1 // Test for natval, nan, +inf
+ nop.i 999
}
-{ .mfb
- nop.m 999
-//
-// Check for Natval, QNan, SNaN, +Inf
-//
-(p7) fmpy.s0 f8 = FR_Input_X, f1
+;;
+
//
// For SNaN raise invalid and return QNaN.
// For QNaN raise invalid and return QNaN.
// For +Inf return +Inf.
//
-(p7) br.ret.sptk b0 ;;
+{ .mfb
+ nop.m 999
+(p8) fmpy.s0 f8 = FR_Input_X, f1
+(p8) br.ret.sptk b0 // Return for natval, nan, +inf
}
+;;
+
//
// For -Inf raise invalid and return QNaN.
//
-{ .mii
-(p0) mov GR_Parameter_TAG = 1
- nop.i 999 ;;
-(p14) mov GR_Parameter_TAG = 7 ;;
-}
-{ .mfi
-(p15) mov GR_Parameter_TAG = 139
- nop.f 999
- nop.i 999 ;;
-}
{ .mfb
- nop.m 999
-(p0) fmpy.s0 FR_Output_X_tmp = FR_Input_X, f0
-(p0) br.cond.sptk __libm_error_region ;;
+ mov GR_Parameter_TAG = 139
+ fmpy.s0 FR_Output_X_tmp = FR_Input_X, f0
+ br.cond.sptk __libm_error_region
}
-//
-// Report that logl(-Inf) computed
-// Report that log10l(-Inf) computed
-// Report that log1p(-Inf) computed
-//
-{ .mfb
- nop.m 0
-(p0) mov FR_Input_X = FR_Output_X_tmp
-(p0) br.ret.sptk b0 ;;
-}
-L(LOGL_64_unsupported):
-{ .mfb
- nop.m 999
+;;
+
+
+LOG1P_unsupported:
//
-// Return generated NaN or other value .
+// Return generated NaN or other value.
//
-(p0) fmpy.s0 f8 = FR_Input_X, f0
-(p0) br.ret.sptk b0 ;;
+{ .mfb
+ nop.m 999
+ fmpy.s0 f8 = FR_Input_X, f0
+ br.ret.sptk b0
}
-L(LOGL_64_negative):
-{ .mfi
- nop.m 999
-//
-// Deal with x < 0 in a special way
-//
-(p0) frcpa.s0 FR_Output_X_tmp, p8 = f0, f0
+;;
+
+// Here if -inf < x < -1
+LOG1P_LT_Minus_1:
//
-// Deal with x < 0 in a special way - raise
+// Deal with x < -1 in a special way - raise
// invalid and produce QNaN indefinite.
//
-(p0) mov GR_Parameter_TAG = 1 ;;
-}
-{ .mii
-(p14) mov GR_Parameter_TAG = 7
- nop.i 999 ;;
-(p15) mov GR_Parameter_TAG = 139
+{ .mfb
+ mov GR_Parameter_TAG = 139
+ frcpa.s0 FR_Output_X_tmp, p8 = f0, f0
+ br.cond.sptk __libm_error_region
}
-.endp log1pl
-ASM_SIZE_DIRECTIVE(log1pl)
+;;
+
-.proc __libm_error_region
-__libm_error_region:
+GLOBAL_IEEE754_END(log1pl)
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
@@ -1609,8 +1177,8 @@ __libm_error_region:
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
- nop.m 0
- nop.m 0
+ nop.m 999
+ nop.m 999
add GR_Parameter_RESULT = 48,sp
};;
{ .mmi
@@ -1625,52 +1193,7 @@ __libm_error_region:
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
-
-.proc LOGL_main
-LOGL_main:
-{ .mfi
- nop.m 999
-//
-// kernel_log_64 computes ln(X + E)
-//
-(p7) fadd.s0 FR_Input_X = FR_Y_lo,FR_Y_hi
- nop.i 0
-}
-{ .mmi
- nop.m 999
- nop.m 999
-(p14) addl GR_Table_Base = @ltoff(Constants_1_by_LN10#),gp ;;
-}
-{ .mmi
- nop.m 999
-(p14) ld8 GR_Table_Base = [GR_Table_Base]
- nop.i 999
-};;
-
-{ .mmi
-(p14) ldfe FR_1LN10_hi = [GR_Table_Base],16 ;;
-(p14) ldfe FR_1LN10_lo = [GR_Table_Base]
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p14) fmpy.s1 FR_Output_X_tmp = FR_Y_lo,FR_1LN10_hi
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p14) fma.s1 FR_Output_X_tmp = FR_Y_hi,FR_1LN10_lo,FR_Output_X_tmp
- nop.i 999 ;;
-}
-{ .mfb
- nop.m 999
-(p14) fma.s0 FR_Input_X = FR_Y_hi,FR_1LN10_hi,FR_Output_X_tmp
-(p0) br.ret.sptk b0 ;;
-}
-.endp LOGL_main
-ASM_SIZE_DIRECTIVE(LOGL_main)
+LOCAL_LIBM_END(__libm_error_region#)
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/s_logb.S b/sysdeps/ia64/fpu/s_logb.S
index 76c4fe778e..dfe581a826 100644
--- a/sysdeps/ia64/fpu/s_logb.S
+++ b/sysdeps/ia64/fpu/s_logb.S
@@ -1,10 +1,10 @@
.file "logb.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,41 +20,43 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 2/16/00 Modified to conform to C9X
-// 3/16/00 Improved speed
-// 4/04/00 Unwind support added
-// 5/30/00 Fixed bug when x double-extended denormal
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 Initial version
+// 02/16/00 Modified to conform to C9X
+// 03/16/00 Improved speed
+// 04/04/00 Unwind support added
+// 05/30/00 Fixed bug when x double-extended denormal
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/20/03 Improved performance
//
// API
//==============================================================
-// double logb( double x);
+// double logb( double x );
//
// Overview of operation
//==============================================================
-// The logb function extracts the exponent of x as an integer in
-// floating-point format.
+// The logb function extracts the exponent of x as an integer in
+// floating-point format.
// logb computes log2 of x as a double
//
// logb is similar to ilogb but differs in the following ways:
@@ -71,217 +73,168 @@
//
// Registers used
//==============================================================
-// general registers used:
-// ar.pfs r32
-// r33 -> r37
-// r38 -> r41 used as parameters to error path
+// general registers used:
+// r26 -> r38
+// r35 -> r38 used as parameters to error path
//
-// predicate registers used:
+// predicate registers used:
// p6, p7, p8
-// floating-point registers used:
+// floating-point registers used:
// f9, f10, f11
// f8, input
-#include "libm_support.h"
+rExpBias = r26
+rExpMask = r27
+rSignexp_x = r28
+rExp_x = r29
+rTrueExp_x = r30
+rExp_2to64 = r31
-GR_SAVE_B0 = r34
-GR_SAVE_GP = r35
GR_SAVE_PFS = r32
+GR_SAVE_B0 = r33
+GR_SAVE_GP = r34
-GR_Parameter_X = r38
-GR_Parameter_Y = r39
-GR_Parameter_RESULT = r40
+GR_Parameter_X = r35
+GR_Parameter_Y = r36
+GR_Parameter_RESULT = r37
+GR_Parameter_TAG = r38
-.align 32
-.global logb#
+fExp_in_signif = f9
+fNorm_x = f10
+fFloat_Exp = f10
+f2to64 = f11
.section .text
-.proc logb#
-.align 32
+GLOBAL_LIBM_ENTRY(logb)
-
-logb:
-
-// qnan snan inf norm unorm 0 -+
-// 0 0 0 0 1 0 11
-// 0 b
-{ .mfi
- alloc r32=ar.pfs,1,5,4,0
-(p0) fclass.m.unc p8,p0 = f8, 0x0b
- nop.i 999
-}
// X NORMAL
-// r37 = exp(f8) - - 0xffff
-// sig(f8) = r37
+// TrueExp_x = exp(f8) - 0xffff
+// sig = TrueExp_x
// f8 = convert_to_fp (sig))
{ .mfi
-(p0) getf.exp r35 = f8
-(p0) fnorm f10=f8
- nop.i 999 ;;
+ getf.exp rSignexp_x = f8
+ fclass.m p8,p0 = f8, 0x0b // Test for x unorm
+ mov rExpBias = 0xffff // Exponent bias
}
-
-// qnan snan inf norm unorm 0 -+
-// 1 1 1 0 0 0 11
-// e 3
-{ .mmf
-(p0) mov r33 = 0xffff
-(p0) mov r34 = 0x1ffff
-(p0) fclass.m.unc p6,p0 = f8, 0xe3 ;;
+{ .mfi
+ nop.m 0
+ fnorm.s1 fNorm_x = f8
+ mov rExpMask = 0x1ffff // Exponent mask
}
+;;
+// Form signexp of 2^64 in case we need to scale a denormal
{ .mfb
-(p0) and r36 = r35, r34
-(p0) fclass.m.unc p7,p0 = f8, 0x07
-(p8) br.cond.spnt L(LOGB_DENORM) ;;
+ mov rExp_2to64 = 0x1003f
+ fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf
+(p8) br.cond.spnt LOGB_DENORM // Branch if x unorm
}
+;;
-{ .mib
-(p0) sub r37 = r36, r33
- nop.i 999
-(p6) br.cond.spnt L(LOGB_NAN_INF) ;;
+LOGB_COMMON:
+// Return here from LOGB_DENORM
+{ .mfi
+ and rExp_x = rSignexp_x, rExpMask // Get biased exponent
+ fclass.m p7,p0 = f8, 0x07 // Test x zero
+ nop.i 0
}
+;;
-{ .mib
-(p0) setf.sig f9 = r37
- nop.i 999
-(p7) br.cond.spnt L(LOGB_ZERO) ;;
+// X NAN or INFINITY, return f8 * f8
+{ .mfb
+ sub rTrueExp_x = rExp_x, rExpBias // Get true exponent
+(p6) fma.d.s0 f8= f8,f8,f0 // Result if x natval, nan, inf
+(p6) br.ret.spnt b0 // Exit if x natval, nan, inf
}
+;;
-{ .mfi
- nop.m 999
-(p0) fcvt.xf f10 = f9
- nop.i 999 ;;
+{ .mib
+ setf.sig fExp_in_signif = rTrueExp_x // Exponent as integer in fp
+ nop.i 999
+(p7) br.cond.spnt LOGB_ZERO // Branch if x zero
}
+;;
+// Result can be represented in fewer than 24 bits, so no precision completer
+// is needed.
{ .mfb
- nop.m 999
-(p0) fnorm.d f8 = f10
-(p0) br.ret.sptk b0 ;;
+ nop.m 0
+ fcvt.xf f8 = fExp_in_signif
+ br.ret.sptk b0 // Exit main path, 0 < |x| < inf
}
+;;
-L(LOGB_DENORM):
-// Form signexp of 2^64 in case need to scale denormal
+LOGB_DENORM:
+// Form 2^64 in case we need to scale a denormal
// Check to see if double-extended denormal
{ .mfi
-(p0) mov r38 = 0x1003f
-(p0) fclass.m.unc p8,p0 = f10, 0x0b
- nop.i 999 ;;
+ setf.exp f2to64 = rExp_2to64
+ fclass.m p8,p0 = fNorm_x, 0x0b
+ nop.i 0
}
+;;
-// Form 2^64 in case need to scale denormal
{ .mfi
-(p0) setf.exp f11 = r38
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag
+ nop.i 0
}
+;;
// If double-extended denormal add 64 to exponent bias for scaling
// If double-extended denormal form x * 2^64 which is normal
{ .mfi
-(p8) add r33 = 64, r33
-(p8) fmpy f10 = f10, f11
- nop.i 999 ;;
+(p8) add rExpBias = 64, rExpBias
+(p8) fmpy.s1 fNorm_x = fNorm_x, f2to64
+ nop.i 0
}
+;;
// Logic is the same as normal path but use normalized input
-{ .mmi
-(p0) getf.exp r35 = f10 ;;
- nop.m 999
- nop.i 999 ;;
-}
-
-{ .mmi
-(p0) and r36 = r35, r34 ;;
-(p0) sub r37 = r36, r33
- nop.i 999 ;;
-}
-
-{ .mmi
-(p0) setf.sig f9 = r37
- nop.m 999
- nop.i 999 ;;
+{ .mib
+ getf.exp rSignexp_x = fNorm_x
+ nop.i 0
+ br.cond.sptk LOGB_COMMON // Return to main path
}
+;;
-{ .mfi
- nop.m 999
-(p0) fcvt.xf f10 = f9
- nop.i 999 ;;
-}
+LOGB_ZERO:
+// Here if x zero
+// f10 = -|f8|
+// f9 = 1.0/f10 = -1.0/|f8| = -inf
-{ .mfb
- nop.m 999
-(p0) fnorm.d f8 = f10
-(p0) br.ret.sptk b0 ;;
+{ .mmf
+ alloc r32=ar.pfs,1,2,4,0
+ mov GR_Parameter_TAG = 151 // Error code
+ fmerge.ns f10 = f0,f8
}
+;;
-L(LOGB_NAN_INF):
-
-// X NAN or INFINITY, return f8 * f8
{ .mfb
- nop.m 999
-(p0) fma.d f8= f8,f8,f0
-(p0) br.ret.sptk b0 ;;
-}
-
-.endp logb#
-ASM_SIZE_DIRECTIVE(logb)
-
-// Stack operations when calling error support.
-// (1) (2) (3) (call) (4)
-// sp -> + psp -> + psp -> + sp -> +
-// | | | |
-// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8
-// | | | |
-// | <-GR_Y Y2->| Y2 ->| <- GR_Y |
-// | | | |
-// | | <- GR_X X1 ->| |
-// | | | |
-// sp-64 -> + sp -> + sp -> + +
-// save ar.pfs save b0 restore gp
-// save gp restore ar.pfs
-
-
-
-.proc __libm_error_region
-__libm_error_region:
-L(LOGB_ZERO):
-.prologue
-
-// f9 = |f8|
-// f10 = -f9 = -|f8|
-// f9 = 1.0/f10 = -1.0/-|f8|
-
-{ .mfi
- mov r41 = 151 // Error code
-(p0) fmerge.s f9 = f0,f8
- nop.i 999
+ nop.m 0
+ frcpa.s0 f9,p6 = f1,f10 // Produce -inf, Z flag
+ br.cond.sptk __libm_error_region // Call error support
}
;;
+GLOBAL_LIBM_END(logb)
-{ .mfi
- nop.m 999
- fmerge.ns f10 = f0,f9
- nop.i 999
-}
-;;
+LOCAL_LIBM_ENTRY(__libm_error_region)
+.prologue
-// (1)
{ .mfi
- add GR_Parameter_Y=-32,sp // Parameter 2 value
- frcpa f9,p6 = f1,f10
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
+ nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
+ add sp=-64,sp // Create new stack
nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ mov GR_SAVE_GP=gp // Save gp
};;
-
-// (2)
{ .mmi
stfd [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
add GR_Parameter_X = 16,sp // Parameter 1 address
@@ -290,38 +243,38 @@ L(LOGB_ZERO):
};;
.body
-// (3)
{ .mib
stfd [GR_Parameter_X] = f8 // STORE Parameter 1 on stack
add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
- nop.b 0
+ nop.b 0
}
{ .mib
stfd [GR_Parameter_Y] = f9 // Store Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
br.call.sptk b0=__libm_error_support# // Call error handling function
};;
+
{ .mmi
- nop.m 0
- nop.m 0
add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
-// (4)
{ .mmi
ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
+
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
br.ret.sptk b0
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
+
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/s_logbf.S b/sysdeps/ia64/fpu/s_logbf.S
index f2f671f892..1d605cd97c 100644
--- a/sysdeps/ia64/fpu/s_logbf.S
+++ b/sysdeps/ia64/fpu/s_logbf.S
@@ -1,10 +1,10 @@
.file "logbf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,44 +20,46 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 2/16/00 Modified to conform to C9X
-// 3/16/00 Improved speed
-// 4/04/00 Unwind support added
-// 5/30/00 Fixed bug when x double-extended denormal
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 Initial version
+// 02/16/00 Modified to conform to C9X
+// 03/16/00 Improved speed
+// 04/04/00 Unwind support added
+// 05/30/00 Fixed bug when x double-extended denormal
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/20/03 Improved performance
//
// API
//==============================================================
-// float logbf( float x);
+// float logbf( float x );
//
// Overview of operation
//==============================================================
-// The logbf function extracts the exponent of x as an integer in
-// floating-point format.
+// The logbf function extracts the exponent of x as an integer in
+// floating-point format.
// logbf computes log2 of x as a float
-
-// logbf is similar to ilogbf but differs in the following ways:
+//
+// logbf is similar to ilogbf but differs in the following ways:
// +-inf
// ilogbf: returns INT_MAX
// logbf: returns +inf
@@ -71,243 +73,207 @@
//
// Registers used
//==============================================================
-// general registers used:
-// ar.pfs r32
-// r33 -> r37
-// r38 -> r41 used as parameters to error path
+// general registers used:
+// r26 -> r38
+// r35 -> r38 used as parameters to error path
//
-// predicate registers used:
+// predicate registers used:
// p6, p7, p8
-//
-// floating-point registers used:
+// floating-point registers used:
// f9, f10, f11
// f8, input
-#include "libm_support.h"
-
-GR_SAVE_B0 = r34
-// r40 is address of table of coefficients
-GR_SAVE_PFS = r32
-GR_SAVE_GP = r35
+rExpBias = r26
+rExpMask = r27
+rSignexp_x = r28
+rExp_x = r29
+rTrueExp_x = r30
+rExp_2to64 = r31
-GR_Parameter_X = r38
-GR_Parameter_Y = r39
-GR_Parameter_RESULT = r40
-GR_Parameter_TAG = r41
+GR_SAVE_PFS = r32
+GR_SAVE_B0 = r33
+GR_SAVE_GP = r34
-FR_X = f8
-FR_Y = f0
-FR_RESULT = f10
+GR_Parameter_X = r35
+GR_Parameter_Y = r36
+GR_Parameter_RESULT = r37
+GR_Parameter_TAG = r38
-
-.align 32
-.global logbf#
+fExp_in_signif = f9
+fNorm_x = f10
+fFloat_Exp = f10
+f2to64 = f11
.section .text
-.proc logbf#
-.align 32
-
+GLOBAL_LIBM_ENTRY(logbf)
-logbf:
-
-// qnan snan inf norm unorm 0 -+
-// 0 0 0 0 1 0 11
-// 0 b
-{ .mfi
- alloc r32=ar.pfs,1,5,4,0
-(p0) fclass.m.unc p8,p0 = f8, 0x0b
- nop.i 999
-}
// X NORMAL
-// r37 = exp(f8) - - 0xffff
-// sig(f8) = r37
+// TrueExp_x = exp(f8) - 0xffff
+// sig = TrueExp_x
// f8 = convert_to_fp (sig))
{ .mfi
-(p0) getf.exp r35 = f8
-(p0) fnorm f10=f8
- nop.i 999 ;;
+ getf.exp rSignexp_x = f8
+ fclass.m p8,p0 = f8, 0x0b // Test for x unorm
+ mov rExpBias = 0xffff // Exponent bias
}
-
-// qnan snan inf norm unorm 0 -+
-// 1 1 1 0 0 0 11
-// e 3
-{ .mmf
-(p0) mov r33 = 0xffff
-(p0) mov r34 = 0x1ffff
-(p0) fclass.m.unc p6,p0 = f8, 0xe3 ;;
+{ .mfi
+ nop.m 0
+ fnorm.s1 fNorm_x = f8
+ mov rExpMask = 0x1ffff // Exponent mask
}
+;;
+// Form signexp of 2^64 in case we need to scale a denormal
{ .mfb
-(p0) and r36 = r35, r34
-(p0) fclass.m.unc p7,p0 = f8, 0x07
-(p8) br.cond.spnt L(LOGB_DENORM) ;;
+ mov rExp_2to64 = 0x1003f
+ fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf
+(p8) br.cond.spnt LOGB_DENORM // Branch if x unorm
}
+;;
-{ .mib
-(p0) sub r37 = r36, r33
- nop.i 999
-(p6) br.cond.spnt L(LOGB_NAN_INF) ;;
+LOGB_COMMON:
+// Return here from LOGB_DENORM
+{ .mfi
+ and rExp_x = rSignexp_x, rExpMask // Get biased exponent
+ fclass.m p7,p0 = f8, 0x07 // Test x zero
+ nop.i 0
}
+;;
-{ .mib
-(p0) setf.sig f9 = r37
- nop.i 999
-(p7) br.cond.spnt L(LOGB_ZERO) ;;
+// X NAN or INFINITY, return f8 * f8
+{ .mfb
+ sub rTrueExp_x = rExp_x, rExpBias // Get true exponent
+(p6) fma.s.s0 f8= f8,f8,f0 // Result if x natval, nan, inf
+(p6) br.ret.spnt b0 // Exit if x natval, nan, inf
}
+;;
-{ .mfi
- nop.m 999
-(p0) fcvt.xf f10 = f9
- nop.i 999 ;;
+{ .mib
+ setf.sig fExp_in_signif = rTrueExp_x // Exponent as integer in fp
+ nop.i 999
+(p7) br.cond.spnt LOGB_ZERO // Branch if x zero
}
+;;
+// Result can be represented in fewer than 24 bits, so no precision completer
+// is needed.
{ .mfb
- nop.m 999
-(p0) fnorm.s f8 = f10
-(p0) br.ret.sptk b0 ;;
+ nop.m 0
+ fcvt.xf f8 = fExp_in_signif
+ br.ret.sptk b0 // Exit main path, 0 < |x| < inf
}
+;;
-L(LOGB_DENORM):
-// Form signexp of 2^64 in case need to scale denormal
+LOGB_DENORM:
+// Form 2^64 in case we need to scale a denormal
// Check to see if double-extended denormal
{ .mfi
-(p0) mov r38 = 0x1003f
-(p0) fclass.m.unc p8,p0 = f10, 0x0b
- nop.i 999 ;;
+ setf.exp f2to64 = rExp_2to64
+ fclass.m p8,p0 = fNorm_x, 0x0b
+ nop.i 0
}
+;;
-// Form 2^64 in case need to scale denormal
{ .mfi
-(p0) setf.exp f11 = r38
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag
+ nop.i 0
}
+;;
// If double-extended denormal add 64 to exponent bias for scaling
// If double-extended denormal form x * 2^64 which is normal
{ .mfi
-(p8) add r33 = 64, r33
-(p8) fmpy f10 = f10, f11
- nop.i 999 ;;
+(p8) add rExpBias = 64, rExpBias
+(p8) fmpy.s1 fNorm_x = fNorm_x, f2to64
+ nop.i 0
}
+;;
// Logic is the same as normal path but use normalized input
-{ .mmi
-(p0) getf.exp r35 = f10 ;;
- nop.m 999
- nop.i 999 ;;
-}
-
-{ .mmi
-(p0) and r36 = r35, r34 ;;
-(p0) sub r37 = r36, r33
- nop.i 999 ;;
-}
-
-{ .mmi
-(p0) setf.sig f9 = r37
- nop.m 999
- nop.i 999 ;;
+{ .mib
+ getf.exp rSignexp_x = fNorm_x
+ nop.i 0
+ br.cond.sptk LOGB_COMMON // Return to main path
}
+;;
-{ .mfi
- nop.m 999
-(p0) fcvt.xf f10 = f9
- nop.i 999 ;;
-}
+LOGB_ZERO:
+// Here if x zero
+// f10 = -|f8|
+// f9 = 1.0/f10 = -1.0/|f8| = -inf
-{ .mfb
- nop.m 999
-(p0) fnorm.s f8 = f10
-(p0) br.ret.sptk b0 ;;
+{ .mmf
+ alloc r32=ar.pfs,1,2,4,0
+ mov GR_Parameter_TAG = 152 // Error code
+ fmerge.ns f10 = f0,f8
}
+;;
-L(LOGB_NAN_INF):
-
-// X NAN or INFINITY, return f8 * f8
{ .mfb
- nop.m 999
-(p0) fma.s f8= f8,f8,f0
-(p0) br.ret.sptk b0 ;;
+ nop.m 0
+ frcpa.s0 f9,p6 = f1,f10 // Produce -inf, Z flag
+ br.cond.sptk __libm_error_region // Call error support
}
+;;
-L(LOGB_ZERO):
-
-// X ZERO
-// return -1.0/fabs(f8)=-inf, set divide-by-zero flag, call error support
-{ .mfi
- nop.m 999
-(p0) fmerge.s f9 = f0,f8
- nop.i 999 ;;
-}
+GLOBAL_LIBM_END(logbf)
-{ .mfi
- nop.m 999
-(p0) fmerge.ns f10 = f0,f9
- nop.i 999 ;;
-}
+LOCAL_LIBM_ENTRY(__libm_error_region)
+.prologue
{ .mfi
- nop.m 999
-(p0) frcpa f10,p6 = f1,f10
- nop.i 999 ;;
-}
-
-.endp logbf
-ASM_SIZE_DIRECTIVE(logbf)
-
-
-.proc __libm_error_region
-__libm_error_region:
-.prologue
-{ .mii
- add GR_Parameter_Y=-32,sp // Parameter 2 value
-(p0) mov GR_Parameter_TAG = 152
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
+ nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
+ add sp=-64,sp // Create new stack
nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ mov GR_SAVE_GP=gp // Save gp
};;
+
{ .mmi
- stfs [GR_Parameter_Y] = FR_Y,16 // Store Parameter 2 on stack
- add GR_Parameter_X = 16,sp // Parameter 1 address
+ stfs [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0=b0 // Save b0
};;
+
.body
{ .mib
- stfs [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y
- nop.b 0 // Parameter 3 address
+ stfs [GR_Parameter_X] = f8 // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
}
{ .mib
- stfs [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
+ stfs [GR_Parameter_Y] = f9 // Store Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
+
{ .mmi
- nop.m 0
- nop.m 0
add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
+
{ .mmi
ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
+
{ .mib
- mov gp = GR_SAVE_GP // Restore gp
+ mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
- br.ret.sptk b0 // Return
-};;
+ br.ret.sptk b0
+};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
.type __libm_error_support#,@function
diff --git a/sysdeps/ia64/fpu/s_logbl.S b/sysdeps/ia64/fpu/s_logbl.S
index 38b131f3aa..6a08e94201 100644
--- a/sysdeps/ia64/fpu/s_logbl.S
+++ b/sysdeps/ia64/fpu/s_logbl.S
@@ -1,10 +1,10 @@
.file "logbl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,44 +20,46 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00 Initial version
-// 2/16/00 Modified to conform to C9X
-// 3/16/00 Improved speed
-// 4/04/00 Unwind support added
-// 5/30/00 Fixed bug when x double-extended denormal
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 Initial version
+// 02/16/00 Modified to conform to C9X
+// 03/16/00 Improved speed
+// 04/04/00 Unwind support added
+// 05/30/00 Fixed bug when x double-extended denormal
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/20/03 Improved performance
//
// API
//==============================================================
-// long double logbl( long double x);
+// long double logbl( long double x );
//
// Overview of operation
//==============================================================
-// The logbl function extracts the exponent of x as an integer in
-// floating-point format.
+// The logbl function extracts the exponent of x as an integer in
+// floating-point format.
// logbl computes floor(log2(|x|)) as a long double
//
-// logbl is similar to ilogbl but differs in the following ways:
+// logbl is similar to ilogbl but differs in the following ways:
// +-inf
// ilogbl: returns INT_MAX
// logbl: returns +inf
@@ -71,229 +73,208 @@
//
// Registers used
//==============================================================
-// general registers used:
-// ar.pfs r32
-// r33 -> r37
-// r38 -> r41 used as parameters to error path
+// general registers used:
+// r26 -> r38
+// r35 -> r38 used as parameters to error path
//
-// predicate registers used:
+// predicate registers used:
// p6, p7, p8
-//
-// floating-point registers used:
+// floating-point registers used:
// f9, f10, f11
// f8, input
-#include "libm_support.h"
+rExpBias = r26
+rExpMask = r27
+rSignexp_x = r28
+rExp_x = r29
+rTrueExp_x = r30
+rExp_2to64 = r31
GR_SAVE_PFS = r32
-GR_SAVE_B0 = r34
-GR_SAVE_GP = r35
-GR_Parameter_X = r38
-GR_Parameter_Y = r39
-GR_Parameter_RESULT = r40
-GR_Parameter_TAG = r41
+GR_SAVE_B0 = r33
+GR_SAVE_GP = r34
-FR_X = f8
-FR_Y = f0
-FR_RESULT = f10
+GR_Parameter_X = r35
+GR_Parameter_Y = r36
+GR_Parameter_RESULT = r37
+GR_Parameter_TAG = r38
-.align 32
-.global logbl#
+fExp_in_signif = f9
+fNorm_x = f10
+fFloat_Exp = f10
+f2to64 = f11
.section .text
-.proc logbl#
-.align 32
-
+GLOBAL_LIBM_ENTRY(logbl)
-logbl:
-
-// qnan snan inf norm unorm 0 -+
-// 0 0 0 0 1 0 11
-// 0 b
-{ .mfi
- alloc r32=ar.pfs,1,5,4,0
-(p0) fclass.m.unc p8,p0 = f8, 0x0b
- nop.i 999
-}
// X NORMAL
-// r37 = exp(f8) - - 0xffff
-// sig(f8) = r37
+// TrueExp_x = exp(f8) - 0xffff
+// sig = TrueExp_x
// f8 = convert_to_fp (sig)
{ .mfi
-(p0) getf.exp r35 = f8
-(p0) fnorm f10=f8
- nop.i 999 ;;
+ getf.exp rSignexp_x = f8
+ fclass.m p8,p0 = f8, 0x0b // Test for x unorm
+ mov rExpBias = 0xffff // Exponent bias
}
-
-// qnan snan inf norm unorm 0 -+
-// 1 1 1 0 0 0 11
-// e 3
-{ .mmf
-(p0) mov r33 = 0xffff
-(p0) mov r34 = 0x1ffff
-(p0) fclass.m.unc p6,p0 = f8, 0xe3 ;;
+{ .mfi
+ nop.m 0
+ fnorm.s1 fNorm_x = f8
+ mov rExpMask = 0x1ffff // Exponent mask
}
+;;
+// Form signexp of 2^64 in case we need to scale a denormal
{ .mfb
-(p0) and r36 = r35, r34
-(p0) fclass.m.unc p7,p0 = f8, 0x07
-(p8) br.cond.spnt L(LOGB_DENORM) ;;
+ mov rExp_2to64 = 0x1003f
+ fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf
+(p8) br.cond.spnt LOGB_DENORM // Branch if x unorm
}
+;;
-{ .mib
-(p0) sub r37 = r36, r33
- nop.i 999
-(p6) br.cond.spnt L(LOGB_NAN_INF) ;;
+LOGB_COMMON:
+// Return here from LOGB_DENORM
+{ .mfi
+ and rExp_x = rSignexp_x, rExpMask // Get biased exponent
+ fclass.m p7,p0 = f8, 0x07 // Test x zero
+ nop.i 0
}
+;;
+
+// X NaTVal, NaN or INFINITY, return f8 * f8
+{ .mfb
+ sub rTrueExp_x = rExp_x, rExpBias // Get true exponent
+(p6) fma.s0 f8= f8,f8,f0 // Result if x natval, nan, inf
+(p6) br.ret.spnt b0 // Exit if x natval, nan, inf
+}
+;;
{ .mib
-(p0) setf.sig f9 = r37
+ setf.sig fExp_in_signif = rTrueExp_x // Exponent as integer in fp
nop.i 999
-(p7) br.cond.spnt L(LOGB_ZERO) ;;
-}
-{ .mfi
- nop.m 999
-(p0) fcvt.xf f10 = f9
- nop.i 999 ;;
+(p7) br.cond.spnt LOGB_ZERO
}
+;;
+// Result can be represented in fewer than 24 bits, so no precision completer
+// is needed.
{ .mfb
- nop.m 999
-(p0) fnorm f8 = f10
-(p0) br.ret.sptk b0 ;;
+ nop.m 0
+ fcvt.xf f8 = fExp_in_signif
+ br.ret.sptk b0 // Exit main path, 0 < |x| < inf
}
+;;
-L(LOGB_DENORM):
-// Form signexp of 2^64 in case need to scale denormal
+LOGB_DENORM:
+// Form 2^64 in case we need to scale a denormal
// Check to see if double-extended denormal
{ .mfi
-(p0) mov r38 = 0x1003f
-(p0) fclass.m.unc p8,p0 = f10, 0x0b
- nop.i 999 ;;
+ setf.exp f2to64 = rExp_2to64
+ fclass.m p8,p0 = fNorm_x, 0x0b
+ nop.i 0
}
+;;
-// Form 2^64 in case need to scale denormal
{ .mfi
-(p0) setf.exp f11 = r38
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag
+ nop.i 0
}
+;;
// If double-extended denormal add 64 to exponent bias for scaling
// If double-extended denormal form x * 2^64 which is normal
{ .mfi
-(p8) add r33 = 64, r33
-(p8) fmpy f10 = f10, f11
- nop.i 999 ;;
+(p8) add rExpBias = 64, rExpBias
+(p8) fmpy.s1 fNorm_x = fNorm_x, f2to64
+ nop.i 0
}
+;;
// Logic is the same as normal path but use normalized input
-{ .mmi
-(p0) getf.exp r35 = f10 ;;
- nop.m 999
- nop.i 999 ;;
-}
-
-{ .mmi
-(p0) and r36 = r35, r34 ;;
-(p0) sub r37 = r36, r33
- nop.i 999 ;;
-}
-
-{ .mmi
-(p0) setf.sig f9 = r37
- nop.m 999
- nop.i 999 ;;
+{ .mib
+ getf.exp rSignexp_x = fNorm_x
+ nop.i 0
+ br.cond.sptk LOGB_COMMON // Return to main path
}
+;;
-{ .mfi
- nop.m 999
-(p0) fcvt.xf f10 = f9
- nop.i 999 ;;
-}
+LOGB_ZERO:
+// Here if x zero
+// f10 = -|f8|
+// f9 = 1.0/f10 = -1.0/|f8| = -inf
-{ .mfb
- nop.m 999
-(p0) fnorm f8 = f10
-(p0) br.ret.sptk b0 ;;
+{ .mmf
+ alloc r32=ar.pfs,1,2,4,0
+ mov GR_Parameter_TAG = 150 // Error code
+ fmerge.ns f10 = f0,f8
}
+;;
-L(LOGB_NAN_INF):
-
-// X NAN or INFINITY, return f8 * f8
{ .mfb
- nop.m 999
-(p0) fma f8= f8,f8,f0
-(p0) br.ret.sptk b0 ;;
+ nop.m 0
+ frcpa.s0 f9,p6 = f1,f10 // Produce -inf, Z flag
+ br.cond.sptk __libm_error_region // Call error support
}
+;;
-L(LOGB_ZERO):
-{.mfi
- nop.m 0
-(p0) frcpa.s0 f10,p6 = f1,f0
- nop.i 0
-};;
-{.mfi
- mov GR_Parameter_TAG = 150
-(p0) fms.s1 f10 = f0,f0,f10
- nop.i 0
-};;
-// X ZERO
-// return -1.0/fabs(f8)=-inf, set divide-by-zero flag, call error support
-.endp logbl
-ASM_SIZE_DIRECTIVE(logbl)
+GLOBAL_LIBM_END(logbl)
-.proc __libm_error_region
-__libm_error_region:
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
+
{ .mfi
- add GR_Parameter_Y=-32,sp // Parameter 2 value
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
+ add sp=-64,sp // Create new stack
nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ mov GR_SAVE_GP=gp // Save gp
};;
+
{ .mmi
- stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
- add GR_Parameter_X = 16,sp // Parameter 1 address
+ stfe [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0=b0 // Save b0
};;
+
.body
{ .mib
- stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
- add GR_Parameter_RESULT = 0,GR_Parameter_Y
- nop.b 0 // Parameter 3 address
+ stfe [GR_Parameter_X] = f8 // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
}
{ .mib
- stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
+ stfe [GR_Parameter_Y] = f9 // Store Parameter 3 on stack
add GR_Parameter_Y = -16,GR_Parameter_Y
- br.call.sptk b0=__libm_error_support# // Call error handling function
+ br.call.sptk b0=__libm_error_support# // Call error handling function
};;
+
{ .mmi
- nop.m 0
- nop.m 0
add GR_Parameter_RESULT = 48,sp
+ nop.m 0
+ nop.i 0
};;
+
{ .mmi
ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
add sp = 64,sp // Restore stack pointer
mov b0 = GR_SAVE_B0 // Restore return address
};;
+
{ .mib
mov gp = GR_SAVE_GP // Restore gp
mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
- br.ret.sptk b0 // Return
+ br.ret.sptk b0
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region)
+
.type __libm_error_support#,@function
.global __libm_error_support#
diff --git a/sysdeps/ia64/fpu/s_modf.S b/sysdeps/ia64/fpu/s_modf.S
index e8e672adfe..2008bbfc5c 100644
--- a/sysdeps/ia64/fpu/s_modf.S
+++ b/sysdeps/ia64/fpu/s_modf.S
@@ -1,10 +1,10 @@
.file "modf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,14 +35,16 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00: Initial version
-// 4/04/00: Improved speed, corrected result for NaN input
+// 02/02/00 Initial version
+// 04/04/00 Improved speed, corrected result for NaN input
// 12/22/00 Fixed so inexact flag is never set, and invalid is not set for
// qnans nor for inputs larger than 2^63.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
@@ -97,8 +99,6 @@
// p13 --------------------------------------------------->|
//
-#include "libm_support.h"
-
// floating-point registers used:
MODF_NORM_F8 = f9
MODF_FRACTION_PART = f10
@@ -115,23 +115,17 @@ modf_exp = r18
// r33 = iptr
-.align 32
-.global modf#
-
.section .text
-.proc modf#
-.align 32
-
+GLOBAL_LIBM_ENTRY(modf)
// Main path is p9, p11, p8 FALSE and p12 TRUE
// Assume input is normalized and get signexp
// Normalize input just in case
// Form exponent bias
-modf:
{ .mfi
getf.exp modf_signexp = f8
- fnorm MODF_NORM_F8 = f8
+ fnorm.s0 MODF_NORM_F8 = f8
addl modf_GR_FFFF = 0xffff, r0
}
// Get integer part of input
@@ -176,10 +170,10 @@ modf:
{ .mfb
(p10) cmp.ge.unc p9,p12 = modf_exp, modf_GR_no_frac
(p6) fclass.m.unc p6,p7 = f8, 0x23
-(p8) br.cond.spnt L(MODF_DENORM) ;;
+(p8) br.cond.spnt MODF_DENORM ;;
}
-L(MODF_COMMON):
+MODF_COMMON:
// For HUGE set fraction to signed 0
{ .mfi
nop.m 999
@@ -189,7 +183,7 @@ L(MODF_COMMON):
// For HUGE set integer part to normalized input
{ .mfi
nop.m 999
-(p9) fnorm.d MODF_INTEGER_PART = MODF_NORM_F8
+(p9) fnorm.d.s0 MODF_INTEGER_PART = MODF_NORM_F8
nop.i 999 ;;
}
@@ -201,7 +195,7 @@ L(MODF_COMMON):
}
{ .mfi
nop.m 999
-(p11) fnorm.d f8 = MODF_NORM_F8
+(p11) fnorm.d.s0 f8 = MODF_NORM_F8
nop.i 999 ;;
}
@@ -242,7 +236,7 @@ L(MODF_COMMON):
// For NORMAL test if fraction part is zero; if so append correct sign
{ .mfi
nop.m 999
-(p12) fcmp.eq.unc p7,p0 = MODF_NORM_F8, MODF_INTEGER_PART
+(p12) fcmp.eq.unc.s0 p7,p0 = MODF_NORM_F8, MODF_INTEGER_PART
nop.i 999 ;;
}
@@ -259,7 +253,7 @@ L(MODF_COMMON):
br.ret.sptk b0 ;;
}
-L(MODF_DENORM):
+MODF_DENORM:
// If x unorm get signexp from normalized input
// If x unorm get integer part from normalized input
{ .mfi
@@ -278,8 +272,7 @@ L(MODF_DENORM):
{ .mfb
(p10) cmp.ge.unc p9,p12 = modf_exp, modf_GR_no_frac
nop.f 999
- br.cond.spnt L(MODF_COMMON) ;;
+ br.cond.spnt MODF_COMMON ;;
}
-.endp modf
-ASM_SIZE_DIRECTIVE(modf)
+GLOBAL_LIBM_END(modf)
diff --git a/sysdeps/ia64/fpu/s_modff.S b/sysdeps/ia64/fpu/s_modff.S
index 6aa43c884d..edc1120971 100644
--- a/sysdeps/ia64/fpu/s_modff.S
+++ b/sysdeps/ia64/fpu/s_modff.S
@@ -1,10 +1,10 @@
.file "modff.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,14 +35,16 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00: Initial version
-// 4/04/00: Improved speed, corrected result for NaN input
+// 02/02/00 Initial version
+// 04/04/00 Improved speed, corrected result for NaN input
// 12/22/00 Fixed so inexact flag is never set, and invalid is not set for
// qnans nor for inputs larger than 2^63.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
@@ -97,8 +99,6 @@
// p13 --------------------------------------------------->|
//
-#include "libm_support.h"
-
// floating-point registers used:
MODF_NORM_F8 = f9
MODF_FRACTION_PART = f10
@@ -115,23 +115,17 @@ modf_exp = r18
// r33 = iptr
-.align 32
-.global modff#
-
.section .text
-.proc modff#
-.align 32
-
+GLOBAL_LIBM_ENTRY(modff)
// Main path is p9, p11, p8 FALSE and p12 TRUE
// Assume input is normalized and get signexp
// Normalize input just in case
// Form exponent bias
-modff:
{ .mfi
getf.exp modf_signexp = f8
- fnorm MODF_NORM_F8 = f8
+ fnorm.s0 MODF_NORM_F8 = f8
addl modf_GR_FFFF = 0xffff, r0
}
// Get integer part of input
@@ -176,10 +170,10 @@ modff:
{ .mfb
(p10) cmp.ge.unc p9,p12 = modf_exp, modf_GR_no_frac
(p6) fclass.m.unc p6,p7 = f8, 0x23
-(p8) br.cond.spnt L(MODF_DENORM) ;;
+(p8) br.cond.spnt MODF_DENORM ;;
}
-L(MODF_COMMON):
+MODF_COMMON:
// For HUGE set fraction to signed 0
{ .mfi
nop.m 999
@@ -189,7 +183,7 @@ L(MODF_COMMON):
// For HUGE set integer part to normalized input
{ .mfi
nop.m 999
-(p9) fnorm.s MODF_INTEGER_PART = MODF_NORM_F8
+(p9) fnorm.s.s0 MODF_INTEGER_PART = MODF_NORM_F8
nop.i 999 ;;
}
@@ -201,7 +195,7 @@ L(MODF_COMMON):
}
{ .mfi
nop.m 999
-(p11) fnorm.s f8 = MODF_NORM_F8
+(p11) fnorm.s.s0 f8 = MODF_NORM_F8
nop.i 999 ;;
}
@@ -242,7 +236,7 @@ L(MODF_COMMON):
// For NORMAL test if fraction part is zero; if so append correct sign
{ .mfi
nop.m 999
-(p12) fcmp.eq.unc p7,p0 = MODF_NORM_F8, MODF_INTEGER_PART
+(p12) fcmp.eq.unc.s0 p7,p0 = MODF_NORM_F8, MODF_INTEGER_PART
nop.i 999 ;;
}
@@ -259,7 +253,7 @@ L(MODF_COMMON):
br.ret.sptk b0 ;;
}
-L(MODF_DENORM):
+MODF_DENORM:
// If x unorm get signexp from normalized input
// If x unorm get integer part from normalized input
{ .mfi
@@ -278,8 +272,7 @@ L(MODF_DENORM):
{ .mfb
(p10) cmp.ge.unc p9,p12 = modf_exp, modf_GR_no_frac
nop.f 999
- br.cond.spnt L(MODF_COMMON) ;;
+ br.cond.spnt MODF_COMMON ;;
}
-.endp modff
-ASM_SIZE_DIRECTIVE(modff)
+GLOBAL_LIBM_END(modff)
diff --git a/sysdeps/ia64/fpu/s_modfl.S b/sysdeps/ia64/fpu/s_modfl.S
index b5eb509adf..eaf410cb6c 100644
--- a/sysdeps/ia64/fpu/s_modfl.S
+++ b/sysdeps/ia64/fpu/s_modfl.S
@@ -1,10 +1,10 @@
.file "modfl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,15 +35,17 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00: Initial version
-// 4/04/00: Improved speed, corrected result for NaN input
-// 5/30/00 Fixed bug for exponent 0x1003e
+// 02/02/00 Initial version
+// 04/04/00 Improved speed, corrected result for NaN input
+// 05/30/00 Fixed bug for exponent 0x1003e
// 12/22/00 Fixed so inexact flag is never set, and invalid is not set for
// qnans nor for inputs larger than 2^63.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
@@ -92,8 +94,6 @@
// p13 --------------------------------------------------->|
//
-#include "libm_support.h"
-
// floating-point registers used:
MODF_NORM_F8 = f9
MODF_FRACTION_PART = f10
@@ -110,23 +110,17 @@ modf_exp = r18
// r34 = iptr
-.align 32
-.global modfl#
-
.section .text
-.proc modfl#
-.align 32
-
+GLOBAL_LIBM_ENTRY(modfl)
// Main path is p9, p11, p8 FALSE and p12 TRUE
// Assume input is normalized and get signexp
// Normalize input just in case
// Form exponent bias
-modfl:
{ .mfi
getf.exp modf_signexp = f8
- fnorm MODF_NORM_F8 = f8
+ fnorm.s0 MODF_NORM_F8 = f8
addl modf_GR_FFFF = 0xffff, r0
}
// Get integer part of input
@@ -171,10 +165,10 @@ modfl:
{ .mfb
(p10) cmp.ge.unc p9,p12 = modf_exp, modf_GR_no_frac
(p6) fclass.m.unc p6,p7 = f8, 0x23
-(p8) br.cond.spnt L(MODF_DENORM) ;;
+(p8) br.cond.spnt MODF_DENORM ;;
}
-L(MODF_COMMON):
+MODF_COMMON:
// For HUGE set fraction to signed 0
{ .mfi
nop.m 999
@@ -184,7 +178,7 @@ L(MODF_COMMON):
// For HUGE set integer part to normalized input
{ .mfi
nop.m 999
-(p9) fnorm MODF_INTEGER_PART = MODF_NORM_F8
+(p9) fnorm.s0 MODF_INTEGER_PART = MODF_NORM_F8
nop.i 999 ;;
}
@@ -196,7 +190,7 @@ L(MODF_COMMON):
}
{ .mfi
nop.m 999
-(p11) fnorm f8 = MODF_NORM_F8
+(p11) fnorm.s0 f8 = MODF_NORM_F8
nop.i 999 ;;
}
@@ -237,7 +231,7 @@ L(MODF_COMMON):
// For NORMAL test if fraction part is zero; if so append correct sign
{ .mfi
nop.m 999
-(p12) fcmp.eq.unc p7,p0 = MODF_NORM_F8, MODF_INTEGER_PART
+(p12) fcmp.eq.unc.s0 p7,p0 = MODF_NORM_F8, MODF_INTEGER_PART
nop.i 999 ;;
}
@@ -254,7 +248,7 @@ L(MODF_COMMON):
br.ret.sptk b0 ;;
}
-L(MODF_DENORM):
+MODF_DENORM:
// If x unorm get signexp from normalized input
// If x unorm get integer part from normalized input
{ .mfi
@@ -273,8 +267,7 @@ L(MODF_DENORM):
{ .mfb
(p10) cmp.ge.unc p9,p12 = modf_exp, modf_GR_no_frac
nop.f 999
- br.cond.spnt L(MODF_COMMON) ;;
+ br.cond.spnt MODF_COMMON ;;
}
-.endp modfl
-ASM_SIZE_DIRECTIVE(modfl)
+GLOBAL_LIBM_END(modfl)
diff --git a/sysdeps/ia64/fpu/s_nearbyint.S b/sysdeps/ia64/fpu/s_nearbyint.S
index 6ee01ea260..cba74e61d3 100644
--- a/sysdeps/ia64/fpu/s_nearbyint.S
+++ b/sysdeps/ia64/fpu/s_nearbyint.S
@@ -1,11 +1,10 @@
.file "nearbyint.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 10/19/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
-// Bob Norin, Tom Rowan, Shane Story, and Ping Tak Peter Tang of the
-// Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -21,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -36,20 +35,19 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 10/19/2000: Created
-// 2/08/01 Corrected behavior for all rounding modes.
+// 10/19/00 Created
+// 02/08/01 Corrected behavior for all rounding modes.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//==============================================================
//
// API
//==============================================================
// double nearbyint(double x)
-
-#include "libm_support.h"
-
//
// general registers used:
//
@@ -110,15 +108,8 @@ NEARBYINT_INT_f8 = f11
// 1 1 1 0 0 1 11 0xe7
-.align 32
-.global nearbyint#
-
.section .text
-.proc nearbyint#
-.align 32
-
-
-nearbyint:
+GLOBAL_LIBM_ENTRY(nearbyint)
{ .mfi
mov nearbyint_GR_fpsr = ar40 // Read the fpsr--need to check rc.s0
@@ -141,7 +132,7 @@ nearbyint:
{ .mfb
nop.m 999
-(p6) fnorm.d f8 = f8
+(p6) fnorm.d.s0 f8 = f8
(p6) br.ret.spnt b0 // Exit if x nan, inf, zero
;;
}
@@ -177,11 +168,11 @@ nearbyint:
// Check to see if s0 rounding mode is round to nearest. If not then set s2
// rounding mode to that of s0 and repeat conversions.
-L(NEARBYINT_COMMON):
+NEARBYINT_COMMON:
{ .mfb
cmp.ne p11,p0 = nearbyint_GR_rcs0, r0
(p6) fclass.m.unc p9,p10 = NEARBYINT_FLOAT_INT_f8, 0x07 // Test for result=0
-(p11) br.cond.spnt L(NEARBYINT_NOT_ROUND_NEAREST) // Branch if not round to nearest
+(p11) br.cond.spnt NEARBYINT_NOT_ROUND_NEAREST // Branch if not round to nearest
;;
}
@@ -200,13 +191,13 @@ L(NEARBYINT_COMMON):
}
{ .mfb
nop.m 999
-(p10) fnorm.d f8 = NEARBYINT_FLOAT_INT_f8
+(p10) fnorm.d.s0 f8 = NEARBYINT_FLOAT_INT_f8
br.ret.sptk b0
;;
}
-L(NEARBYINT_NOT_ROUND_NEAREST):
+NEARBYINT_NOT_ROUND_NEAREST:
// Set rounding mode of s2 to that of s0
{ .mfi
mov nearbyint_GR_rcs0 = r0 // Clear so we don't come back here
@@ -225,10 +216,9 @@ L(NEARBYINT_NOT_ROUND_NEAREST):
{ .mfb
nop.m 999
fcvt.xf NEARBYINT_FLOAT_INT_f8 = NEARBYINT_INT_f8
- br.cond.sptk L(NEARBYINT_COMMON)
+ br.cond.sptk NEARBYINT_COMMON
;;
}
-.endp nearbyint
-ASM_SIZE_DIRECTIVE(nearbyint)
+GLOBAL_LIBM_END(nearbyint)
diff --git a/sysdeps/ia64/fpu/s_nearbyintf.S b/sysdeps/ia64/fpu/s_nearbyintf.S
index 7050ddc52c..6471232513 100644
--- a/sysdeps/ia64/fpu/s_nearbyintf.S
+++ b/sysdeps/ia64/fpu/s_nearbyintf.S
@@ -1,11 +1,10 @@
.file "nearbyintf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 10/19/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
-// Bob Norin, Tom Rowan, Shane Story, and Ping Tak Peter Tang of the
-// Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -21,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -36,20 +35,19 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 10/19/2000: Created
-// 2/08/01 Corrected behavior for all rounding modes.
+// 10/19/00 Created
+// 02/08/01 Corrected behavior for all rounding modes.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//==============================================================
//
// API
//==============================================================
// float nearbyintf(float x)
-
-#include "libm_support.h"
-
//
// general registers used:
//
@@ -110,15 +108,8 @@ NEARBYINT_INT_f8 = f11
// 1 1 1 0 0 1 11 0xe7
-.align 32
-.global nearbyintf#
-
.section .text
-.proc nearbyintf#
-.align 32
-
-
-nearbyintf:
+GLOBAL_LIBM_ENTRY(nearbyintf)
{ .mfi
mov nearbyint_GR_fpsr = ar40 // Read the fpsr--need to check rc.s0
@@ -141,7 +132,7 @@ nearbyintf:
{ .mfb
nop.m 999
-(p6) fnorm.s f8 = f8
+(p6) fnorm.s.s0 f8 = f8
(p6) br.ret.spnt b0 // Exit if x nan, inf, zero
;;
}
@@ -177,11 +168,11 @@ nearbyintf:
// Check to see if s0 rounding mode is round to nearest. If not then set s2
// rounding mode to that of s0 and repeat conversions.
-L(NEARBYINT_COMMON):
+NEARBYINT_COMMON:
{ .mfb
cmp.ne p11,p0 = nearbyint_GR_rcs0, r0
(p6) fclass.m.unc p9,p10 = NEARBYINT_FLOAT_INT_f8, 0x07 // Test for result=0
-(p11) br.cond.spnt L(NEARBYINT_NOT_ROUND_NEAREST) // Branch if not round to nearest
+(p11) br.cond.spnt NEARBYINT_NOT_ROUND_NEAREST // Branch if not round to nearest
;;
}
@@ -200,13 +191,13 @@ L(NEARBYINT_COMMON):
}
{ .mfb
nop.m 999
-(p10) fnorm.s f8 = NEARBYINT_FLOAT_INT_f8
+(p10) fnorm.s.s0 f8 = NEARBYINT_FLOAT_INT_f8
br.ret.sptk b0
;;
}
-L(NEARBYINT_NOT_ROUND_NEAREST):
+NEARBYINT_NOT_ROUND_NEAREST:
// Set rounding mode of s2 to that of s0
{ .mfi
mov nearbyint_GR_rcs0 = r0 // Clear so we don't come back here
@@ -225,10 +216,9 @@ L(NEARBYINT_NOT_ROUND_NEAREST):
{ .mfb
nop.m 999
fcvt.xf NEARBYINT_FLOAT_INT_f8 = NEARBYINT_INT_f8
- br.cond.sptk L(NEARBYINT_COMMON)
+ br.cond.sptk NEARBYINT_COMMON
;;
}
-.endp nearbyintf
-ASM_SIZE_DIRECTIVE(nearbyintf)
+GLOBAL_LIBM_END(nearbyintf)
diff --git a/sysdeps/ia64/fpu/s_nearbyintl.S b/sysdeps/ia64/fpu/s_nearbyintl.S
index 95ba6ab260..9c4c2e4f16 100644
--- a/sysdeps/ia64/fpu/s_nearbyintl.S
+++ b/sysdeps/ia64/fpu/s_nearbyintl.S
@@ -1,11 +1,10 @@
.file "nearbyintl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 10/19/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
-// Bob Norin, Tom Rowan, Shane Story, and Ping Tak Peter Tang of the
-// Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -21,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -36,20 +35,19 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 10/19/2000: Created
-// 2/08/01 Corrected behavior for all rounding modes.
+// 10/19/00 Created
+// 02/08/01 Corrected behavior for all rounding modes.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//==============================================================
//
// API
//==============================================================
// long double nearbyintl(long double x)
-
-#include "libm_support.h"
-
//
// general registers used:
//
@@ -111,15 +109,8 @@ NEARBYINT_SIGNED_FLOAT_INT_f8 = f12
// 1 1 1 0 0 1 11 0xe7
-.align 32
-.global nearbyintl#
-
.section .text
-.proc nearbyintl#
-.align 32
-
-
-nearbyintl:
+GLOBAL_LIBM_ENTRY(nearbyintl)
{ .mfi
mov nearbyint_GR_fpsr = ar40 // Read the fpsr--need to check rc.s0
@@ -142,7 +133,7 @@ nearbyintl:
{ .mfb
nop.m 999
-(p6) fnorm f8 = f8
+(p6) fnorm.s0 f8 = f8
(p6) br.ret.spnt b0 // Exit if x nan, inf, zero
;;
}
@@ -180,11 +171,11 @@ nearbyintl:
// rounding mode to that of s0 and repeat conversions.
// Must merge the original sign for cases where the result is zero or the input
// is the largest that still has a fraction (0x1007dfffffffffff)
-L(NEARBYINT_COMMON):
+NEARBYINT_COMMON:
{ .mfb
cmp.ne p11,p0 = nearbyint_GR_rcs0, r0
(p6) fmerge.s NEARBYINT_SIGNED_FLOAT_INT_f8 = f8, NEARBYINT_FLOAT_INT_f8
-(p11) br.cond.spnt L(NEARBYINT_NOT_ROUND_NEAREST) // Branch if not round to nearest
+(p11) br.cond.spnt NEARBYINT_NOT_ROUND_NEAREST // Branch if not round to nearest
;;
}
@@ -197,13 +188,13 @@ L(NEARBYINT_COMMON):
{ .mfb
nop.m 999
-(p6) fnorm f8 = NEARBYINT_SIGNED_FLOAT_INT_f8
+(p6) fnorm.s0 f8 = NEARBYINT_SIGNED_FLOAT_INT_f8
br.ret.sptk b0
;;
}
-L(NEARBYINT_NOT_ROUND_NEAREST):
+NEARBYINT_NOT_ROUND_NEAREST:
// Set rounding mode of s2 to that of s0
{ .mfi
mov nearbyint_GR_rcs0 = r0 // Clear so we don't come back here
@@ -222,10 +213,9 @@ L(NEARBYINT_NOT_ROUND_NEAREST):
{ .mfb
nop.m 999
fcvt.xf NEARBYINT_FLOAT_INT_f8 = NEARBYINT_INT_f8
- br.cond.sptk L(NEARBYINT_COMMON)
+ br.cond.sptk NEARBYINT_COMMON
;;
}
-.endp nearbyintl
-ASM_SIZE_DIRECTIVE(nearbyintl)
+GLOBAL_LIBM_END(nearbyintl)
diff --git a/sysdeps/ia64/fpu/s_nextafter.S b/sysdeps/ia64/fpu/s_nextafter.S
new file mode 100644
index 0000000000..8c77aa492b
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_nextafter.S
@@ -0,0 +1,495 @@
+.file "nextafter.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 02/02/00 Initial version
+// 03/03/00 Modified to conform to C9X, and improve speed of main path
+// 03/14/00 Fixed case where x is a power of 2, and x > y, improved speed
+// 04/04/00 Unwind support added
+// 05/12/00 Fixed erroneous denormal flag setting for exponent change cases 1,3
+// 08/15/00 Bundle added after call to __libm_error_support to properly
+// set [the previously overwritten] GR_Parameter_RESULT.
+// 09/09/00 Updated fcmp so that qnans do not raise invalid
+// 12/15/00 Corrected behavior when both args are zero to conform to C99, and
+// fixed flag settings for several cases
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// double nextafter( double x, double y );
+// input floating point f8, f9
+// output floating point f8
+//
+// Registers used
+//==============================================================
+nextafter_GR_max_pexp = r14
+nextafter_GR_min_pexp = r15
+nextafter_GR_exp = r16
+nextafter_GR_sig = r17
+nextafter_GR_lnorm_sig = r18
+nextafter_GR_sign_mask = r19
+nextafter_GR_exp_mask = r20
+nextafter_GR_sden_sig = r21
+nextafter_GR_new_sig = r22
+nextafter_GR_new_exp = r23
+nextafter_GR_lden_sig = r24
+nextafter_GR_snorm_sig = r25
+nextafter_GR_exp1 = r26
+nextafter_GR_x_exp = r27
+nextafter_GR_min_den_rexp = r28
+// r36-39 parameters for libm_error_support
+
+GR_SAVE_B0 = r34
+GR_SAVE_GP = r35
+GR_SAVE_PFS = r32
+
+GR_Parameter_X = r36
+GR_Parameter_Y = r37
+GR_Parameter_RESULT = r38
+
+NEXTAFTER_lnorm_sig = f10
+NEXTAFTER_lnorm_exp = f11
+NEXTAFTER_lnorm = f12
+NEXTAFTER_sden_sig = f13
+NEXTAFTER_sden_exp = f14
+NEXTAFTER_sden = f15
+NEXTAFTER_save_f8 = f33
+NEXTAFTER_new_exp = f34
+NEXTAFTER_new_sig = f35
+NEXTAFTER_lden_sig = f36
+NEXTAFTER_snorm_sig = f37
+NEXTAFTER_exp1 = f38
+NEXTAFTER_tmp = f39
+
+//
+// Overview of operation
+//==============================================================
+// nextafter determines the next representable value
+// after x in the direction of y.
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(nextafter)
+// Entry: x is in f8, y is in f9; the result is returned in f8
+// Extract signexp from x
+// Is x < y ? p10 if yes, p11 if no
+// Form smallest denormal significand = ulp size
+{ .mfi
+ getf.exp nextafter_GR_exp = f8
+ fcmp.lt.s1 p10,p11 = f8, f9
+ addl nextafter_GR_sden_sig = 0x800, r0 // 0x800 = 1 << 11
+}
+// Form largest normal significand 0xfffffffffffff800
+// Form smallest normal exponent
+{ .mfi
+ addl nextafter_GR_lnorm_sig = -0x800,r0
+ nop.f 999
+ addl nextafter_GR_min_pexp = 0x0fc01, r0 ;; // 0x0fc01 = 0xffff - 1022
+}
+// Extract significand from x
+// Is x=y?
+// Form largest normal exponent
+{ .mfi
+ getf.sig nextafter_GR_sig = f8
+ fcmp.eq.s0 p6,p0 = f8, f9
+ addl nextafter_GR_max_pexp = 0x103fe, r0 // 0x103fe = 0xffff + 1023
+}
+// Move largest normal significand to fp reg for special cases
+{ .mfi
+ setf.sig NEXTAFTER_lnorm_sig = nextafter_GR_lnorm_sig
+ nop.f 999
+ addl nextafter_GR_sign_mask = 0x20000, r0 ;; // signexp >= 0x20000 is treated as x<0 below
+}
+
+// Move smallest denormal significand and signexp to fp regs
+// Is x=nan?
+// Set p12 and p13 based on whether significand increases or decreases
+// It increases (p12 set) if x<y and x>=0 or if x>y and x<0
+// It decreases (p13 set) if x<y and x<0 or if x>y and x>=0
+{ .mfi
+ setf.sig NEXTAFTER_sden_sig = nextafter_GR_sden_sig
+ fclass.m p8,p0 = f8, 0xc3
+(p10) cmp.lt p12,p13 = nextafter_GR_exp, nextafter_GR_sign_mask
+}
+{ .mfi // NOTE(review): no F-slot instruction is listed here (s_nextafterf.S has nop.f 999) -- confirm the assembler pads the bundle
+ setf.exp NEXTAFTER_sden_exp = nextafter_GR_min_pexp
+(p11) cmp.ge p12,p13 = nextafter_GR_exp, nextafter_GR_sign_mask ;;
+}
+
+.pred.rel "mutex",p12,p13
+
+// Form expected new significand, adding or subtracting 1 ulp increment
+// If x=y set result to y
+// Form smallest normal significand and largest denormal significand
+{ .mfi
+(p12) add nextafter_GR_new_sig = nextafter_GR_sig, nextafter_GR_sden_sig
+(p6) fmerge.s f8=f9,f9
+ dep.z nextafter_GR_snorm_sig = 1,63,1 // 0x8000000000000000
+}
+{ .mlx
+(p13) sub nextafter_GR_new_sig = nextafter_GR_sig, nextafter_GR_sden_sig
+ movl nextafter_GR_lden_sig = 0x7ffffffffffff800 ;;
+}
+
+// Move expected result significand and signexp to fp regs
+// Is y=nan?
+// Form new exponent in case result exponent needs incrementing or decrementing
+{ .mfi
+ setf.exp NEXTAFTER_new_exp = nextafter_GR_exp
+ fclass.m p9,p0 = f9, 0xc3
+(p12) add nextafter_GR_exp1 = 1, nextafter_GR_exp
+}
+{ .mib
+ setf.sig NEXTAFTER_new_sig = nextafter_GR_new_sig
+(p13) add nextafter_GR_exp1 = -1, nextafter_GR_exp
+(p6) br.ret.spnt b0 ;; // Exit if x=y
+}
+
+// Move largest normal signexp to fp reg for special cases
+// Is x=zero?
+{ .mfi
+ setf.exp NEXTAFTER_lnorm_exp = nextafter_GR_max_pexp
+ fclass.m p7,p0 = f8, 0x7
+ nop.i 999
+}
+{ .mfb
+ nop.m 999
+(p8) fma.s0 f8 = f8,f1,f9 // x=nan path: result is x*1 + y
+(p8) br.ret.spnt b0 ;; // Exit if x=nan
+}
+
+// Move exp+-1 and smallest normal significand to fp regs for special cases
+// Is x=inf?
+{ .mfi
+ setf.exp NEXTAFTER_exp1 = nextafter_GR_exp1
+ fclass.m p6,p0 = f8, 0x23
+ addl nextafter_GR_exp_mask = 0x1ffff, r0
+}
+{ .mfb
+ setf.sig NEXTAFTER_snorm_sig = nextafter_GR_snorm_sig
+(p9) fma.s0 f8 = f8,f1,f9 // y=nan path: result is x*1 + y
+(p9) br.ret.spnt b0 ;; // Exit if y=nan
+}
+
+// Move largest denormal significand to fp regs for special cases
+// Save x
+{ .mfb
+ setf.sig NEXTAFTER_lden_sig = nextafter_GR_lden_sig
+ mov NEXTAFTER_save_f8 = f8
+(p7) br.cond.spnt NEXTAFTER_ZERO ;; // Branch if x=0
+}
+
+// Mask off the sign to get x_exp
+{ .mfb
+ and nextafter_GR_x_exp = nextafter_GR_exp_mask, nextafter_GR_exp
+ nop.f 999
+(p6) br.cond.spnt NEXTAFTER_INF ;; // Branch if x=inf
+}
+
+// Check 6 special cases when significand rolls over:
+// 1 sig size incr, x_sig=max_sig, x_exp < max_exp
+// Set p6, result is sig=min_sig, exp++
+// 2 sig size incr, x_sig=max_sig, x_exp >= max_exp
+// Set p7, result is inf, signal overflow
+// 3 sig size decr, x_sig=min_sig, x_exp > min_exp
+// Set p8, result is sig=max_sig, exp--
+// 4 sig size decr, x_sig=min_sig, min_den_rexp < x_exp <= min_exp
+// Set p9, result is sig=max_den_sig, exp same, signal underflow and inexact
+// 5 sig size decr, x_sig=min_den_sig, x_exp <= min_exp
+// Set p10, result is zero, sign of x, signal underflow and inexact
+// 6 sig size decr, x_sig=min_sig, x_exp <= min_den_rexp
+// Set p14, result is zero, sign of x, signal underflow and inexact
+//
+// Form exponent of smallest double denormal (if normalized register format)
+{ .mmi
+ adds nextafter_GR_min_den_rexp = -52, nextafter_GR_min_pexp
+(p12) cmp.eq.unc p6,p0 = nextafter_GR_new_sig, r0
+(p13) cmp.eq.unc p8,p10 = nextafter_GR_new_sig, nextafter_GR_lden_sig ;;
+}
+
+{ .mmi
+(p6) cmp.lt.unc p6,p7 = nextafter_GR_x_exp, nextafter_GR_max_pexp
+(p8) cmp.gt.unc p8,p9 = nextafter_GR_x_exp, nextafter_GR_min_pexp
+(p10) cmp.eq.unc p10,p0 = nextafter_GR_new_sig, r0 ;;
+}
+
+// Create small normal in case need to generate underflow flag
+{ .mfi
+(p10) cmp.le.unc p10,p0 = nextafter_GR_x_exp, nextafter_GR_min_pexp
+ fmerge.se NEXTAFTER_tmp = NEXTAFTER_sden_exp, NEXTAFTER_lnorm_sig
+(p9) cmp.gt.unc p9,p14 = nextafter_GR_x_exp, nextafter_GR_min_den_rexp
+}
+// Branch if cases 1, 2, 3
+{ .bbb
+(p6) br.cond.spnt NEXTAFTER_EXPUP
+(p7) br.cond.spnt NEXTAFTER_OVERFLOW
+(p8) br.cond.spnt NEXTAFTER_EXPDOWN ;;
+}
+
+// Branch if cases 4, 5, 6
+{ .bbb
+(p9) br.cond.spnt NEXTAFTER_NORM_TO_DENORM
+(p10) br.cond.spnt NEXTAFTER_UNDERFLOW_TO_ZERO
+(p14) br.cond.spnt NEXTAFTER_UNDERFLOW_TO_ZERO ;;
+}
+
+// Here if no special cases
+// Set p6 if result will be a denormal, so can force underflow flag
+// Case 1: x_exp=min_exp, x_sig=unnormalized
+// Case 2: x_exp<min_exp
+{ .mfi
+ cmp.lt p6,p7 = nextafter_GR_x_exp, nextafter_GR_min_pexp
+ fmerge.se f8 = NEXTAFTER_new_exp, NEXTAFTER_new_sig
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+ nop.f 999
+(p7) tbit.z p6,p0 = nextafter_GR_new_sig, 63 ;; // p6 also set when bit 63 of the new significand is clear
+}
+
+NEXTAFTER_COMMON_FINISH: // Shared exit: expects the result in f8 and p6 set when flags must be forced
+// Force underflow and inexact if denormal result
+{ .mfi
+ nop.m 999
+(p6) fma.d.s0 NEXTAFTER_tmp = NEXTAFTER_tmp,NEXTAFTER_tmp,f0 // tmp*tmp; the value written to tmp is not read again
+ nop.i 999 ;;
+}
+
+// Final normalization to result precision and exit
+{ .mfb
+ nop.m 999
+ fnorm.d.s0 f8 = f8
+ br.ret.sptk b0;;
+}
+
+//Special cases
+NEXTAFTER_EXPUP: // Case 1: significand wrapped on increment, so use exponent+1 with the smallest normal significand
+{ .mfb
+ cmp.lt p6,p7 = nextafter_GR_x_exp, nextafter_GR_min_pexp // p6 (force flags at finish) only if x_exp < min_pexp
+ fmerge.se f8 = NEXTAFTER_exp1, NEXTAFTER_snorm_sig
+ br.cond.sptk NEXTAFTER_COMMON_FINISH ;;
+}
+
+NEXTAFTER_EXPDOWN: // Case 3: significand dropped below min_sig, so use exponent-1 with the largest normal significand
+{ .mfb
+ cmp.lt p6,p7 = nextafter_GR_x_exp, nextafter_GR_min_pexp // p6 (force flags at finish) only if x_exp < min_pexp
+ fmerge.se f8 = NEXTAFTER_exp1, NEXTAFTER_lnorm_sig
+ br.cond.sptk NEXTAFTER_COMMON_FINISH ;;
+}
+
+NEXTAFTER_NORM_TO_DENORM: // Case 4: keep the exponent of x and use the largest denormal significand
+{ .mfi
+ nop.m 999
+ fmerge.se f8 = NEXTAFTER_new_exp, NEXTAFTER_lden_sig
+ nop.i 999
+}
+// Force underflow and inexact (unconditional on this path), then return directly
+{ .mfb
+ nop.m 999
+ fma.d.s0 NEXTAFTER_tmp = NEXTAFTER_tmp,NEXTAFTER_tmp,f0
+ br.ret.sptk b0 ;;
+}
+
+NEXTAFTER_UNDERFLOW_TO_ZERO: // Cases 5 and 6: result is zero carrying the sign of x
+{ .mfb
+ cmp.eq p6,p0 = r0,r0 // Always true: makes the common finish force the flags
+ fmerge.s f8 = NEXTAFTER_save_f8,f0 // Sign of saved x merged onto f0
+ br.cond.sptk NEXTAFTER_COMMON_FINISH ;;
+}
+
+NEXTAFTER_INF:
+// Here if f8 is +- infinity
+// INF (the x=y and nan cases have already returned before this point)
+// if f8 is +inf, no matter what y is return largest double
+// if f8 is -inf, no matter what y is return -largest double
+
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTAFTER_lnorm = NEXTAFTER_lnorm_exp,NEXTAFTER_lnorm_sig // Largest normal: max exponent with largest significand
+ nop.i 999 ;;
+}
+
+{ .mfb
+ nop.m 999
+ fmerge.s f8 = f8,NEXTAFTER_lnorm // Keep the sign of x
+ br.ret.sptk b0 ;;
+}
+
+NEXTAFTER_ZERO:
+
+// Here if f8 is +- zero
+// ZERO
+// if f8 is zero and y is +, return + smallest double denormal
+// if f8 is zero and y is -, return - smallest double denormal
+
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTAFTER_sden = NEXTAFTER_sden_exp,NEXTAFTER_sden_sig // min_pexp exponent with the 1-ulp significand
+ nop.i 999 ;;
+}
+
+// Create small normal to generate underflow flag
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTAFTER_tmp = NEXTAFTER_sden_exp, NEXTAFTER_lnorm_sig
+ nop.i 999 ;;
+}
+
+// Add correct sign from direction arg
+{ .mfi
+ nop.m 999
+ fmerge.s f8 = f9,NEXTAFTER_sden
+ nop.i 999 ;;
+}
+
+{ .mfb
+ nop.m 999
+ fma.d.s0 NEXTAFTER_tmp = NEXTAFTER_tmp,NEXTAFTER_tmp,f0 // tmp*tmp, run for its status flags; returns without the final fnorm
+ br.ret.sptk b0 ;;
+}
+
+GLOBAL_LIBM_END(nextafter)
+// Stack operations when calling error support.
+// (1) (2) (3) (call) (4)
+// sp -> + psp -> + psp -> + sp -> +
+// | | | |
+// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8
+// | | | |
+// | <-GR_Y Y2->| Y2 ->| <- GR_Y |
+// | | | |
+// | | <- GR_X X1 ->| |
+// | | | |
+// sp-64 -> + sp -> + sp -> + +
+// save ar.pfs save b0 restore gp
+// save gp restore ar.pfs
+
+
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+NEXTAFTER_OVERFLOW:
+// Here if f8 is finite, but result will be infinite
+// Use frcpa to generate infinity of correct sign
+// Call error support to report possible range error
+.prologue
+
+{ .mfi
+ alloc r32=ar.pfs,2,2,4,0 // r32 is GR_SAVE_PFS; r36-r39 carry the error-support parameters
+ frcpa.s1 f8,p6 = NEXTAFTER_save_f8, f0 // Saved x divided by zero (f0)
+ nop.i 999 ;;
+}
+
+// Create largest double
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTAFTER_lnorm = NEXTAFTER_lnorm_exp,NEXTAFTER_lnorm_sig
+ nop.i 999 ;;
+}
+
+// Force overflow and inexact flags to be set
+{ .mfi
+ mov r39 = 154 // Error code
+ fma.d.s0 NEXTAFTER_tmp = NEXTAFTER_lnorm,NEXTAFTER_lnorm,f0 // lnorm*lnorm, run for its status flags
+ nop.i 999
+}
+;;
+
+// (1)
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address (old sp - 32)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+
+
+// (2)
+{ .mmi
+ stfd [GR_Parameter_Y] = f9,16 // STORE Parameter 2 on stack, then advance the pointer by 16
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+
+.body
+// (3)
+{ .mib
+ stfd [GR_Parameter_X] = NEXTAFTER_save_f8 // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
+}
+{ .mib
+ stfd [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Step back by 16 to the parameter 2 slot
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp // Re-form the result address (sp+48) after the call
+};;
+
+// (4)
+{ .mmi
+ ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
+
diff --git a/sysdeps/ia64/fpu/s_nextafterf.S b/sysdeps/ia64/fpu/s_nextafterf.S
new file mode 100644
index 0000000000..6d2a92796d
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_nextafterf.S
@@ -0,0 +1,502 @@
+.file "nextafterf.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 02/02/00 Initial version
+// 03/03/00 Modified to conform to C9X, and improve speed of main path
+// 03/14/00 Fixed case where x is a power of 2, and x > y, improved speed
+// 04/04/00 Unwind support added
+// 05/12/00 Fixed erroneous denormal flag setting for exponent change cases 1,3
+// 08/15/00 Bundle added after call to __libm_error_support to properly
+// set [the previously overwritten] GR_Parameter_RESULT.
+// 09/09/00 Updated fcmp so that qnans do not raise invalid
+// 12/15/00 Corrected behavior when both args are zero to conform to C99, and
+// fixed flag settings for several cases
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// float nextafterf( float x, float y );
+// input floating point f8, f9
+// output floating point f8
+//
+// Registers used
+//==============================================================
+nextafter_GR_max_pexp = r14
+nextafter_GR_min_pexp = r15
+nextafter_GR_exp = r16
+nextafter_GR_sig = r17
+nextafter_GR_lnorm_sig = r18
+nextafter_GR_sign_mask = r19
+nextafter_GR_exp_mask = r20
+nextafter_GR_sden_sig = r21
+nextafter_GR_new_sig = r22
+nextafter_GR_new_exp = r23
+nextafter_GR_lden_sig = r24
+nextafter_GR_snorm_sig = r25
+nextafter_GR_exp1 = r26
+nextafter_GR_x_exp = r27
+nextafter_GR_min_den_rexp = r28
+// r36-39 parameters for libm_error_support
+
+GR_SAVE_B0 = r34
+GR_SAVE_GP = r35
+GR_SAVE_PFS = r32
+
+GR_Parameter_X = r36
+GR_Parameter_Y = r37
+GR_Parameter_RESULT = r38
+
+NEXTAFTER_lnorm_sig = f10
+NEXTAFTER_lnorm_exp = f11
+NEXTAFTER_lnorm = f12
+NEXTAFTER_sden_sig = f13
+NEXTAFTER_sden_exp = f14
+NEXTAFTER_sden = f15
+NEXTAFTER_save_f8 = f33
+NEXTAFTER_new_exp = f34
+NEXTAFTER_new_sig = f35
+NEXTAFTER_lden_sig = f36
+NEXTAFTER_snorm_sig = f37
+NEXTAFTER_exp1 = f38
+NEXTAFTER_tmp = f39
+
+//
+// Overview of operation
+//==============================================================
+// nextafterf determines the next representable value
+// after x in the direction of y.
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(nextafterf)
+// Entry: x is in f8, y is in f9; the result is returned in f8
+// Extract signexp from x
+// Form smallest denormal significand = ulp size
+{ .mlx
+ getf.exp nextafter_GR_exp = f8
+ movl nextafter_GR_sden_sig = 0x0000010000000000 // 1 << 40
+}
+// Form largest normal exponent
+// Is x < y ? p10 if yes, p11 if no
+// Form smallest normal exponent
+{ .mfi
+ addl nextafter_GR_max_pexp = 0x1007e, r0 // 0x1007e = 0xffff + 127
+ fcmp.lt.s1 p10,p11 = f8, f9
+ addl nextafter_GR_min_pexp = 0x0ff81, r0 ;; // 0x0ff81 = 0xffff - 126
+}
+
+// Extract significand from x; is x=y?
+{ .mfi
+ getf.sig nextafter_GR_sig = f8
+ fcmp.eq.s0 p6,p0 = f8, f9
+ nop.i 0
+}
+// Form largest normal significand
+// (24 one-bits, left-justified in the 64-bit significand)
+{ .mlx
+ nop.m 0
+ movl nextafter_GR_lnorm_sig = 0xffffff0000000000 ;;
+}
+
+// Move largest normal significand to fp reg for special cases
+{ .mfi
+ setf.sig NEXTAFTER_lnorm_sig = nextafter_GR_lnorm_sig
+ nop.f 0
+ addl nextafter_GR_sign_mask = 0x20000, r0 ;; // signexp >= 0x20000 is treated as x<0 below
+}
+
+// Move smallest denormal significand and signexp to fp regs
+// Is x=nan?
+// Set p12 and p13 based on whether significand increases or decreases
+// It increases (p12 set) if x<y and x>=0 or if x>y and x<0
+// It decreases (p13 set) if x<y and x<0 or if x>y and x>=0
+{ .mfi
+ setf.sig NEXTAFTER_sden_sig = nextafter_GR_sden_sig
+ fclass.m p8,p0 = f8, 0xc3
+(p10) cmp.lt p12,p13 = nextafter_GR_exp, nextafter_GR_sign_mask
+}
+{ .mfi
+ setf.exp NEXTAFTER_sden_exp = nextafter_GR_min_pexp
+ nop.f 999
+(p11) cmp.ge p12,p13 = nextafter_GR_exp, nextafter_GR_sign_mask ;;
+}
+
+.pred.rel "mutex",p12,p13
+
+// Form expected new significand, adding or subtracting 1 ulp increment
+// If x=y set result to y
+// Form smallest normal significand and largest denormal significand
+{ .mfi
+(p12) add nextafter_GR_new_sig = nextafter_GR_sig, nextafter_GR_sden_sig
+(p6) fmerge.s f8=f9,f9
+ dep.z nextafter_GR_snorm_sig = 1,63,1 // 0x8000000000000000
+}
+{ .mlx
+(p13) sub nextafter_GR_new_sig = nextafter_GR_sig, nextafter_GR_sden_sig
+ movl nextafter_GR_lden_sig = 0x7fffff0000000000 ;;
+}
+
+// Move expected result significand and signexp to fp regs
+// Is y=nan?
+// Form new exponent in case result exponent needs incrementing or decrementing
+{ .mfi
+ setf.exp NEXTAFTER_new_exp = nextafter_GR_exp
+ fclass.m p9,p0 = f9, 0xc3
+(p12) add nextafter_GR_exp1 = 1, nextafter_GR_exp
+}
+{ .mib
+ setf.sig NEXTAFTER_new_sig = nextafter_GR_new_sig
+(p13) add nextafter_GR_exp1 = -1, nextafter_GR_exp
+(p6) br.ret.spnt b0 ;; // Exit if x=y
+}
+
+// Move largest normal signexp to fp reg for special cases
+// Is x=zero?
+{ .mfi
+ setf.exp NEXTAFTER_lnorm_exp = nextafter_GR_max_pexp
+ fclass.m p7,p0 = f8, 0x7
+ nop.i 999
+}
+{ .mfb
+ nop.m 999
+(p8) fma.s0 f8 = f8,f1,f9 // x=nan path: result is x*1 + y
+(p8) br.ret.spnt b0 ;; // Exit if x=nan
+}
+
+// Move exp+-1 and smallest normal significand to fp regs for special cases
+// Is x=inf?
+{ .mfi
+ setf.exp NEXTAFTER_exp1 = nextafter_GR_exp1
+ fclass.m p6,p0 = f8, 0x23
+ addl nextafter_GR_exp_mask = 0x1ffff, r0
+}
+{ .mfb
+ setf.sig NEXTAFTER_snorm_sig = nextafter_GR_snorm_sig
+(p9) fma.s0 f8 = f8,f1,f9 // y=nan path: result is x*1 + y
+(p9) br.ret.spnt b0 ;; // Exit if y=nan
+}
+
+// Move largest denormal significand to fp regs for special cases
+// Save x
+{ .mfb
+ setf.sig NEXTAFTER_lden_sig = nextafter_GR_lden_sig
+ mov NEXTAFTER_save_f8 = f8
+(p7) br.cond.spnt NEXTAFTER_ZERO ;; // Branch if x=0
+}
+
+// Mask off the sign to get x_exp
+{ .mfb
+ and nextafter_GR_x_exp = nextafter_GR_exp_mask, nextafter_GR_exp
+ nop.f 999
+(p6) br.cond.spnt NEXTAFTER_INF ;; // Branch if x=inf
+}
+
+// Check 6 special cases when significand rolls over:
+// 1 sig size incr, x_sig=max_sig, x_exp < max_exp
+// Set p6, result is sig=min_sig, exp++
+// 2 sig size incr, x_sig=max_sig, x_exp >= max_exp
+// Set p7, result is inf, signal overflow
+// 3 sig size decr, x_sig=min_sig, x_exp > min_exp
+// Set p8, result is sig=max_sig, exp--
+// 4 sig size decr, x_sig=min_sig, min_den_rexp < x_exp <= min_exp
+// Set p9, result is sig=max_den_sig, exp same, signal underflow and inexact
+// 5 sig size decr, x_sig=min_den_sig, x_exp <= min_exp
+// Set p10, result is zero, sign of x, signal underflow and inexact
+// 6 sig size decr, x_sig=min_sig, x_exp <= min_den_rexp
+// Set p14, result is zero, sign of x, signal underflow and inexact
+//
+// Form exponent of smallest float denormal (if normalized register format)
+{ .mmi
+ adds nextafter_GR_min_den_rexp = -23, nextafter_GR_min_pexp
+(p12) cmp.eq.unc p6,p0 = nextafter_GR_new_sig, r0
+(p13) cmp.eq.unc p8,p10 = nextafter_GR_new_sig, nextafter_GR_lden_sig ;;
+}
+
+{ .mmi
+(p6) cmp.lt.unc p6,p7 = nextafter_GR_x_exp, nextafter_GR_max_pexp
+(p8) cmp.gt.unc p8,p9 = nextafter_GR_x_exp, nextafter_GR_min_pexp
+(p10) cmp.eq.unc p10,p0 = nextafter_GR_new_sig, r0 ;;
+}
+
+// Create small normal in case need to generate underflow flag
+{ .mfi
+(p10) cmp.le.unc p10,p0 = nextafter_GR_x_exp, nextafter_GR_min_pexp
+ fmerge.se NEXTAFTER_tmp = NEXTAFTER_sden_exp, NEXTAFTER_lnorm_sig
+(p9) cmp.gt.unc p9,p14 = nextafter_GR_x_exp, nextafter_GR_min_den_rexp
+}
+// Branch if cases 1, 2, 3
+{ .bbb
+(p6) br.cond.spnt NEXTAFTER_EXPUP
+(p7) br.cond.spnt NEXTAFTER_OVERFLOW
+(p8) br.cond.spnt NEXTAFTER_EXPDOWN ;;
+}
+
+// Branch if cases 4, 5, 6
+{ .bbb
+(p9) br.cond.spnt NEXTAFTER_NORM_TO_DENORM
+(p10) br.cond.spnt NEXTAFTER_UNDERFLOW_TO_ZERO
+(p14) br.cond.spnt NEXTAFTER_UNDERFLOW_TO_ZERO ;;
+}
+
+// Here if no special cases
+// Set p6 if result will be a denormal, so can force underflow flag
+// Case 1: x_exp=min_exp, x_sig=unnormalized
+// Case 2: x_exp<min_exp
+{ .mfi
+ cmp.lt p6,p7 = nextafter_GR_x_exp, nextafter_GR_min_pexp
+ fmerge.se f8 = NEXTAFTER_new_exp, NEXTAFTER_new_sig
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+ nop.f 999
+(p7) tbit.z p6,p0 = nextafter_GR_new_sig, 63 ;; // p6 also set when bit 63 of the new significand is clear
+}
+
+NEXTAFTER_COMMON_FINISH: // Shared exit: expects the result in f8 and p6 set when flags must be forced
+// Force underflow and inexact if denormal result
+{ .mfi
+ nop.m 999
+(p6) fma.s.s0 NEXTAFTER_tmp = NEXTAFTER_tmp,NEXTAFTER_tmp,f0 // tmp*tmp; the value written to tmp is not read again
+ nop.i 999 ;;
+}
+
+// Final normalization to result precision and exit
+{ .mfb
+ nop.m 999
+ fnorm.s.s0 f8 = f8
+ br.ret.sptk b0;;
+}
+
+//Special cases
+NEXTAFTER_EXPUP: // Case 1: significand wrapped on increment, so use exponent+1 with the smallest normal significand
+{ .mfb
+ cmp.lt p6,p7 = nextafter_GR_x_exp, nextafter_GR_min_pexp // p6 (force flags at finish) only if x_exp < min_pexp
+ fmerge.se f8 = NEXTAFTER_exp1, NEXTAFTER_snorm_sig
+ br.cond.sptk NEXTAFTER_COMMON_FINISH ;;
+}
+
+NEXTAFTER_EXPDOWN: // Case 3: significand dropped below min_sig, so use exponent-1 with the largest normal significand
+{ .mfb
+ cmp.lt p6,p7 = nextafter_GR_x_exp, nextafter_GR_min_pexp // p6 (force flags at finish) only if x_exp < min_pexp
+ fmerge.se f8 = NEXTAFTER_exp1, NEXTAFTER_lnorm_sig
+ br.cond.sptk NEXTAFTER_COMMON_FINISH ;;
+}
+
+NEXTAFTER_NORM_TO_DENORM: // Case 4: keep the exponent of x and use the largest denormal significand
+{ .mfi
+ nop.m 999
+ fmerge.se f8 = NEXTAFTER_new_exp, NEXTAFTER_lden_sig
+ nop.i 999
+}
+// Force underflow and inexact (unconditional on this path), then return directly
+{ .mfb
+ nop.m 999
+ fma.s.s0 NEXTAFTER_tmp = NEXTAFTER_tmp,NEXTAFTER_tmp,f0
+ br.ret.sptk b0 ;;
+}
+
+NEXTAFTER_UNDERFLOW_TO_ZERO: // Cases 5 and 6: result is zero carrying the sign of x
+{ .mfb
+ cmp.eq p6,p0 = r0,r0 // Always true: makes the common finish force the flags
+ fmerge.s f8 = NEXTAFTER_save_f8,f0 // Sign of saved x merged onto f0
+ br.cond.sptk NEXTAFTER_COMMON_FINISH ;;
+}
+
+NEXTAFTER_INF:
+// Here if f8 is +- infinity
+// INF (the x=y and nan cases have already returned before this point)
+// if f8 is +inf, no matter what y is return largest float
+// if f8 is -inf, no matter what y is return -largest float
+
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTAFTER_lnorm = NEXTAFTER_lnorm_exp,NEXTAFTER_lnorm_sig // Largest normal: max exponent with largest significand
+ nop.i 999 ;;
+}
+
+{ .mfb
+ nop.m 999
+ fmerge.s f8 = f8,NEXTAFTER_lnorm // Keep the sign of x
+ br.ret.sptk b0 ;;
+}
+
+NEXTAFTER_ZERO:
+
+// Here if f8 is +- zero
+// ZERO
+// if f8 is zero and y is +, return + smallest float denormal
+// if f8 is zero and y is -, return - smallest float denormal
+
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTAFTER_sden = NEXTAFTER_sden_exp,NEXTAFTER_sden_sig // min_pexp exponent with the 1-ulp significand
+ nop.i 999 ;;
+}
+
+// Create small normal to generate underflow flag
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTAFTER_tmp = NEXTAFTER_sden_exp, NEXTAFTER_lnorm_sig
+ nop.i 999 ;;
+}
+
+// Add correct sign from direction arg
+{ .mfi
+ nop.m 999
+ fmerge.s f8 = f9,NEXTAFTER_sden
+ nop.i 999 ;;
+}
+
+// Force underflow and inexact flags; returns without the final fnorm
+{ .mfb
+ nop.m 999
+ fma.s.s0 NEXTAFTER_tmp = NEXTAFTER_tmp,NEXTAFTER_tmp,f0
+ br.ret.sptk b0 ;;
+}
+
+GLOBAL_LIBM_END(nextafterf)
+// Stack operations when calling error support.
+// (1) (2) (3) (call) (4)
+// sp -> + psp -> + psp -> + sp -> +
+// | | | |
+// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8
+// | | | |
+// | <-GR_Y Y2->| Y2 ->| <- GR_Y |
+// | | | |
+// | | <- GR_X X1 ->| |
+// | | | |
+// sp-64 -> + sp -> + sp -> + +
+// save ar.pfs save b0 restore gp
+// save gp restore ar.pfs
+
+
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+NEXTAFTER_OVERFLOW:
+// Here if f8 is finite, but result will be infinite
+// Use frcpa to generate infinity of correct sign
+// Call error support to report possible range error
+.prologue
+
+{ .mfi
+ alloc r32=ar.pfs,2,2,4,0 // r32 is GR_SAVE_PFS; r36-r39 carry the error-support parameters
+ frcpa.s1 f8,p6 = NEXTAFTER_save_f8, f0 // Saved x divided by zero (f0)
+ nop.i 999
+}
+
+// Create largest float
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTAFTER_lnorm = NEXTAFTER_lnorm_exp,NEXTAFTER_lnorm_sig
+ nop.i 999 ;;
+}
+
+// Force overflow and inexact flags to be set
+{ .mfi
+ mov r39 = 155 // Error code
+ fma.s.s0 NEXTAFTER_tmp = NEXTAFTER_lnorm,NEXTAFTER_lnorm,f0 // lnorm*lnorm, run for its status flags
+ nop.i 999
+}
+;;
+
+// (1)
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address (old sp - 32)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+
+
+// (2)
+{ .mmi
+ stfs [GR_Parameter_Y] = f9,16 // STORE Parameter 2 on stack, then advance the pointer by 16
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+
+.body
+// (3)
+{ .mib
+ stfs [GR_Parameter_X] = NEXTAFTER_save_f8 // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
+}
+{ .mib
+ stfs [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Step back by 16 to the parameter 2 slot
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp // Re-form the result address (sp+48) after the call
+};;
+
+// (4)
+{ .mmi
+ ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
+
diff --git a/sysdeps/ia64/fpu/s_nextafterl.S b/sysdeps/ia64/fpu/s_nextafterl.S
new file mode 100644
index 0000000000..05bdd9c17a
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_nextafterl.S
@@ -0,0 +1,501 @@
+.file "nextafterl.s"
+
+
+// Copyright (c) 2000 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 02/02/00 Initial version
+// 03/03/00 Modified to conform to C9X, and improve speed of main path
+// 03/14/00 Fixed case where x is a power of 2, and x > y, improved speed
+// 04/04/00 Unwind support added
+// 05/12/00 Fixed erroneous denormal flag setting for exponent change cases 1,3
+// 08/15/00 Bundle added after call to __libm_error_support to properly
+// set [the previously overwritten] GR_Parameter_RESULT.
+// 09/09/00 Updated fcmp so that qnans do not raise invalid.
+// 12/15/00 Fixed case of smallest long double normal to largest denormal,
+// now adhere to C99 for two zero args, and fixed flag settings
+// for several cases
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// long double nextafterl( long double x, long double y );
+// input floating point f8, f9
+// output floating point f8
+//
+// Registers used
+//==============================================================
+nextafter_GR_max_pexp = r14 // largest normal exponent (0x13ffe)
+nextafter_GR_min_pexp = r15 // smallest normal exponent (0x0c001)
+nextafter_GR_exp = r16 // signexp of x
+nextafter_GR_sig = r17 // significand of x
+nextafter_GR_lnorm_sig = r18 // largest normal significand (all ones)
+nextafter_GR_sign_mask = r19 // 0x20000, compared with signexp to test sign of x
+nextafter_GR_exp_mask = r20 // 0x1ffff, used to mask the sign off signexp
+nextafter_GR_sden_sig = r21 // smallest denormal significand = 1 ulp
+nextafter_GR_new_sig = r22 // significand of x plus or minus 1 ulp
+nextafter_GR_new_exp = r23 // not referenced elsewhere in this file
+nextafter_GR_lden_sig = r24 // largest denormal significand (0x7fffffffffffffff)
+nextafter_GR_snorm_sig = r25 // smallest normal significand (0x8000000000000000)
+nextafter_GR_exp1 = r26 // signexp of x plus or minus 1
+nextafter_GR_x_exp = r27 // signexp of x with the sign masked off
+// r36-39 parameters for libm_error_support
+
+GR_SAVE_B0 = r34
+GR_SAVE_GP = r35
+GR_SAVE_PFS = r32
+
+GR_Parameter_X = r36
+GR_Parameter_Y = r37
+GR_Parameter_RESULT = r38
+
+NEXTAFTER_lnorm_sig = f10 // fp copy of largest normal significand
+NEXTAFTER_lnorm_exp = f11 // fp copy of largest normal exponent
+NEXTAFTER_lnorm = f12 // largest normal, merged from the two above
+NEXTAFTER_sden_sig = f13 // fp copy of smallest denormal significand
+NEXTAFTER_den_exp = f14 // written once, not read elsewhere in this file
+NEXTAFTER_sden = f15 // smallest denormal, built in the zero path
+NEXTAFTER_snorm_exp = f32 // fp copy of smallest normal exponent
+NEXTAFTER_save_f8 = f33 // copy of x
+NEXTAFTER_new_exp = f34 // fp copy of signexp of x
+NEXTAFTER_new_sig = f35 // fp copy of the adjusted significand
+NEXTAFTER_lden_sig = f36 // fp copy of largest denormal significand
+NEXTAFTER_snorm_sig = f37 // fp copy of smallest normal significand
+NEXTAFTER_exp1 = f38 // fp copy of signexp of x plus or minus 1
+NEXTAFTER_tmp = f39 // scratch operand used only to raise flags
+
+//
+// Overview of operation
+//==============================================================
+// nextafterl determines the next representable value
+// after x in the direction of y.
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(nextafterl)
+
+// Extract signexp from x
+// Is x < y ? p10 if yes, p11 if no
+// Form smallest denormal significand = ulp size
+{ .mfi
+ getf.exp nextafter_GR_exp = f8
+ fcmp.lt.s1 p10,p11 = f8, f9
+ addl nextafter_GR_sden_sig = 0x1, r0
+}
+// Form largest normal significand 0xffffffffffffffff
+// Form smallest normal exponent
+{ .mfi
+ addl nextafter_GR_lnorm_sig = -0x1,r0
+ nop.f 999
+ addl nextafter_GR_min_pexp = 0x0c001, r0 ;;
+}
+
+// Extract significand from x
+// Is x=y? This fcmp also sets Invalid and Denormal if required
+// Form largest normal exponent
+{ .mfi
+ getf.sig nextafter_GR_sig = f8
+ fcmp.eq.s0 p6,p0 = f8, f9
+ addl nextafter_GR_max_pexp = 0x13ffe, r0
+}
+// Move largest normal significand to fp reg for special cases
+{ .mfi
+ setf.sig NEXTAFTER_lnorm_sig = nextafter_GR_lnorm_sig
+ nop.f 999
+ addl nextafter_GR_sign_mask = 0x20000, r0 ;;
+}
+
+// Move smallest denormal significand to fp reg
+// Is x=nan?
+// Set p12 and p13 based on whether significand increases or decreases
+// It increases (p12 set) if x<y and x>=0 or if x>y and x<0
+// It decreases (p13 set) if x<y and x<0 or if x>y and x>=0
+{ .mfi
+ setf.sig NEXTAFTER_sden_sig = nextafter_GR_sden_sig
+ fclass.m p8,p0 = f8, 0xc3
+(p10) cmp.lt p12,p13 = nextafter_GR_exp, nextafter_GR_sign_mask // x<y: p12 if sign of x clear, else p13
+}
+// Move smallest normal exp to fp regs
+{ .mfi
+ setf.exp NEXTAFTER_snorm_exp = nextafter_GR_min_pexp
+ nop.f 999
+(p11) cmp.ge p12,p13 = nextafter_GR_exp, nextafter_GR_sign_mask ;; // not x<y: p12 if sign of x set, else p13
+}
+
+.pred.rel "mutex",p12,p13
+
+// Form expected new significand, adding or subtracting 1 ulp increment
+// If x=y set result to y
+// Form smallest normal significand and largest denormal significand
+{ .mfi
+(p12) add nextafter_GR_new_sig = nextafter_GR_sig, nextafter_GR_sden_sig
+(p6) fmerge.s f8=f9,f9
+ dep.z nextafter_GR_snorm_sig = 1,63,1 // 0x8000000000000000
+}
+{ .mlx
+(p13) sub nextafter_GR_new_sig = nextafter_GR_sig, nextafter_GR_sden_sig
+ movl nextafter_GR_lden_sig = 0x7fffffffffffffff ;;
+}
+
+// Move expected result significand and signexp to fp regs
+// Is y=nan?
+// Form new exponent in case result exponent needs incrementing or decrementing
+{ .mfi
+ setf.exp NEXTAFTER_new_exp = nextafter_GR_exp
+ fclass.m p9,p0 = f9, 0xc3
+(p12) add nextafter_GR_exp1 = 1, nextafter_GR_exp
+}
+{ .mib
+ setf.sig NEXTAFTER_new_sig = nextafter_GR_new_sig
+(p13) add nextafter_GR_exp1 = -1, nextafter_GR_exp
+(p6) br.ret.spnt b0 ;; // Exit if x=y
+}
+
+// Move largest normal signexp to fp reg for special cases
+// Is x=zero?
+{ .mfi
+ setf.exp NEXTAFTER_lnorm_exp = nextafter_GR_max_pexp
+ fclass.m p7,p0 = f8, 0x7
+ nop.i 999
+}
+{ .mfb
+ setf.exp NEXTAFTER_den_exp = nextafter_GR_min_pexp
+(p8) fma.s0 f8 = f8,f1,f9 // x is nan: result = x*1 + y
+(p8) br.ret.spnt b0 ;; // Exit if x=nan
+}
+
+// Move exp+-1 and smallest normal significand to fp regs for special cases
+// Is x=inf?
+{ .mfi
+ setf.exp NEXTAFTER_exp1 = nextafter_GR_exp1
+ fclass.m p6,p0 = f8, 0x23
+ addl nextafter_GR_exp_mask = 0x1ffff, r0
+}
+{ .mfb
+ setf.sig NEXTAFTER_snorm_sig = nextafter_GR_snorm_sig
+(p9) fma.s0 f8 = f8,f1,f9 // y is nan: result = x*1 + y
+(p9) br.ret.spnt b0 ;; // Exit if y=nan
+}
+
+// Move largest denormal significand to fp regs for special cases
+// Save x
+{ .mfb
+ setf.sig NEXTAFTER_lden_sig = nextafter_GR_lden_sig
+ mov NEXTAFTER_save_f8 = f8
+(p7) br.cond.spnt NEXTAFTER_ZERO ;; // Branch to zero handling if x=0
+}
+
+// Mask off the sign to get x_exp
+{ .mfb
+ and nextafter_GR_x_exp = nextafter_GR_exp_mask, nextafter_GR_exp
+ nop.f 999
+(p6) br.cond.spnt NEXTAFTER_INF ;; // Branch to inf handling if x=inf
+}
+
+// Check 5 special cases when significand rolls over:
+// 1 sig size incr, x_sig=max_sig, x_exp < max_exp
+// Set p6, result is sig=min_sig, exp++
+// 2 sig size incr, x_sig=max_sig, x_exp >= max_exp
+// Set p7, result is inf, signal overflow
+// 3 sig size decr, x_sig=min_sig, x_exp > min_exp
+// Set p8, result is sig=max_sig, exp--
+// 4 sig size decr, x_sig=min_sig, x_exp <= min_exp
+// Set p9, result is sig=max_den_sig, exp-1, signal underflow and inexact
+// 5 sig size decr, x_sig=min_den_sig (new sig is zero)
+// Set p10, result is zero, sign of x, signal underflow and inexact
+//
+{ .mmi
+(p12) cmp.eq.unc p6,p0 = nextafter_GR_new_sig, r0 // p6: incremented sig wrapped to 0
+(p13) cmp.eq.unc p9,p10 = nextafter_GR_new_sig, nextafter_GR_lden_sig // p9: x_sig was min_sig, p10: it was not
+ nop.i 999
+;;
+}
+
+{ .mmi
+(p6) cmp.lt.unc p6,p7 = nextafter_GR_x_exp, nextafter_GR_max_pexp // p6: case 1, p7: case 2
+(p10) cmp.eq.unc p10,p0 = nextafter_GR_new_sig, r0 // p10: case 5, decremented sig is 0
+(p9) cmp.le.unc p9,p8 = nextafter_GR_x_exp, nextafter_GR_min_pexp // p9: case 4, p8: case 3
+;;
+}
+
+// Create small normal in case need to generate underflow flag
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTAFTER_tmp = NEXTAFTER_snorm_exp, NEXTAFTER_lnorm_sig
+ nop.i 999
+}
+// Branch if cases 1, 2, 3
+{ .bbb
+(p6) br.cond.spnt NEXTAFTER_EXPUP
+(p7) br.cond.spnt NEXTAFTER_OVERFLOW
+(p8) br.cond.spnt NEXTAFTER_EXPDOWN ;;
+}
+
+// Branch if cases 4, 5
+{ .mbb
+ nop.m 999
+(p9) br.cond.spnt NEXTAFTER_NORM_TO_DENORM
+(p10) br.cond.spnt NEXTAFTER_UNDERFLOW_TO_ZERO
+;;
+}
+
+// Here if no special cases
+// Set p6 if result will be a denormal, so can force underflow flag
+// Case 1: x_exp=min_exp, x_sig=unnormalized
+// Case 2: x_exp<min_exp
+{ .mfi
+ cmp.lt p6,p7 = nextafter_GR_x_exp, nextafter_GR_min_pexp // p6: x_exp < min_exp
+ fmerge.se f8 = NEXTAFTER_new_exp, NEXTAFTER_new_sig // signexp of x with adjusted sig
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+ nop.f 999
+(p6) tbit.z p6,p0 = nextafter_GR_new_sig, 63 ;; // p6 stays set only if bit 63 of new sig is 0
+}
+
+NEXTAFTER_COMMON_FINISH:
+// Force underflow and inexact if denormal result (p6 set by the caller path)
+{ .mfi
+ nop.m 999
+(p6) fma.s0 NEXTAFTER_tmp = NEXTAFTER_tmp,NEXTAFTER_tmp,f0 // square of small normal, result unused
+ nop.i 999 ;;
+}
+
+// Final normalization to result precision and exit
+{ .mfb
+ nop.m 999
+ fnorm.s0 f8 = f8
+ br.ret.sptk b0;;
+}
+
+//Special cases
+NEXTAFTER_EXPUP:
+{ .mfb
+ cmp.lt p6,p7 = nextafter_GR_x_exp, nextafter_GR_min_pexp // p6 requests the underflow flag in common finish
+ fmerge.se f8 = NEXTAFTER_exp1, NEXTAFTER_snorm_sig // signexp+1 with smallest normal sig
+ br.cond.sptk NEXTAFTER_COMMON_FINISH ;;
+}
+
+NEXTAFTER_EXPDOWN:
+{ .mfb
+ cmp.lt p6,p7 = nextafter_GR_x_exp, nextafter_GR_min_pexp // p6 requests the underflow flag in common finish
+ fmerge.se f8 = NEXTAFTER_exp1, NEXTAFTER_lnorm_sig // signexp-1 with largest normal sig
+ br.cond.sptk NEXTAFTER_COMMON_FINISH ;;
+}
+
+NEXTAFTER_NORM_TO_DENORM:
+{ .mfi
+ nop.m 999
+ fmerge.se f8 = NEXTAFTER_exp1, NEXTAFTER_lden_sig // signexp-1 with largest denormal sig
+ nop.i 999
+}
+// Force underflow and inexact, then return without the common fnorm
+{ .mfb
+ nop.m 999
+ fma.s0 NEXTAFTER_tmp = NEXTAFTER_tmp,NEXTAFTER_tmp,f0
+ br.ret.sptk b0 ;;
+}
+
+NEXTAFTER_UNDERFLOW_TO_ZERO:
+{ .mfb
+ cmp.eq p6,p0 = r0,r0 // always true: request the underflow flag in common finish
+ fmerge.s f8 = NEXTAFTER_save_f8,f0 // zero with the sign of x
+ br.cond.sptk NEXTAFTER_COMMON_FINISH ;;
+}
+
+NEXTAFTER_INF:
+// Here if f8 is +- infinity
+// INF
+// if f8 is +inf, no matter what y is return largest long double
+// if f8 is -inf, no matter what y is return -largest long double
+
+// Create largest long double
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTAFTER_lnorm = NEXTAFTER_lnorm_exp,NEXTAFTER_lnorm_sig
+ nop.i 999 ;;
+}
+
+{ .mfb
+ nop.m 999
+ fmerge.s f8 = f8,NEXTAFTER_lnorm // sign of x on largest long double
+ br.ret.sptk b0 ;;
+}
+
+NEXTAFTER_ZERO:
+
+// Here if f8 is +- zero
+// ZERO
+// if f8 is zero and y is +, return + smallest long double denormal
+// if f8 is zero and y is -, return - smallest long double denormal
+
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTAFTER_sden = f0,NEXTAFTER_sden_sig // signexp of f0 with significand of 1 ulp
+ nop.i 999 ;;
+}
+
+// Create small normal to generate underflow flag
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTAFTER_tmp = NEXTAFTER_snorm_exp, NEXTAFTER_lnorm_sig
+ nop.i 999 ;;
+}
+
+// Add correct sign from direction arg
+{ .mfi
+ nop.m 999
+ fmerge.s f8 = f9,NEXTAFTER_sden // sign of y on smallest denormal
+ nop.i 999 ;;
+}
+
+// Force underflow and inexact flags
+{ .mfb
+ nop.m 999
+ fma.s0 NEXTAFTER_tmp = NEXTAFTER_tmp,NEXTAFTER_tmp,f0 // result unused, only the flags matter
+ br.ret.sptk b0 ;;
+}
+
+GLOBAL_LIBM_END(nextafterl)
+// Stack operations when calling error support.
+// (1) (2) (3) (call) (4)
+// sp -> + psp -> + psp -> + sp -> +
+// | | | |
+// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8
+// | | | |
+// | <-GR_Y Y2->| Y2 ->| <- GR_Y |
+// | | | |
+// | | <- GR_X X1 ->| |
+// | | | |
+// sp-64 -> + sp -> + sp -> + +
+// save ar.pfs save b0 restore gp
+// save gp restore ar.pfs
+
+
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+NEXTAFTER_OVERFLOW:
+// Here if f8 is finite, but result will be infinite
+// Use frcpa to generate infinity of correct sign
+// Call error support to report possible range error
+.prologue
+
+{ .mfi
+ alloc r32=ar.pfs,2,2,4,0 // r32-r35 input/local, r36-r39 output
+ frcpa.s1 f8,p6 = NEXTAFTER_save_f8, f0 // x divided by zero
+ nop.i 999 ;;
+}
+
+// Create largest long double
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTAFTER_lnorm = NEXTAFTER_lnorm_exp,NEXTAFTER_lnorm_sig
+ nop.i 999 ;;
+}
+
+// Force overflow and inexact flags to be set
+{ .mfi
+ mov r39 = 153 // Error code, last of the r36-39 parameters
+ fma.s0 NEXTAFTER_tmp = NEXTAFTER_lnorm,NEXTAFTER_lnorm,f0 // square of largest normal, result unused
+ nop.i 999
+}
+;;
+
+// (1)
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address (old sp - 32)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+
+
+// (2)
+{ .mmi
+ stfe [GR_Parameter_Y] = f9,16 // STORE Parameter 2 on stack, then advance by 16
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+
+.body
+// (3)
+{ .mib
+ stfe [GR_Parameter_X] = NEXTAFTER_save_f8 // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
+}
+{ .mib
+ stfe [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Back to Parameter 2 address
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp // Recompute result address after the call
+};;
+
+// (4)
+{ .mmi
+ ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
+
diff --git a/sysdeps/ia64/fpu/s_nextafterl.c b/sysdeps/ia64/fpu/s_nextafterl.c
deleted file mode 100644
index f59f16848f..0000000000
--- a/sysdeps/ia64/fpu/s_nextafterl.c
+++ /dev/null
@@ -1 +0,0 @@
-#include <sysdeps/i386/fpu/s_nextafterl.c>
diff --git a/sysdeps/ia64/fpu/s_nexttoward.S b/sysdeps/ia64/fpu/s_nexttoward.S
new file mode 100644
index 0000000000..f8fac1e072
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_nexttoward.S
@@ -0,0 +1,488 @@
+.file "nexttoward.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 08/15/01 Initial version
+// 08/23/01 Corrected error tag number
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// double nexttoward( double x, long double y );
+// input floating point f8, f9
+// output floating point f8
+//
+// Registers used
+//==============================================================
+nexttoward_GR_max_pexp = r14 // largest normal exponent (0x103fe)
+nexttoward_GR_min_pexp = r15 // smallest normal exponent (0x0fc01)
+nexttoward_GR_exp = r16 // signexp of x
+nexttoward_GR_sig = r17 // significand of x
+nexttoward_GR_lnorm_sig = r18 // largest normal significand (0xfffffffffffff800)
+nexttoward_GR_sign_mask = r19 // 0x20000, compared with signexp to test sign of x
+nexttoward_GR_exp_mask = r20 // 0x1ffff, used to mask the sign off signexp
+nexttoward_GR_sden_sig = r21 // smallest denormal significand = 1 ulp (0x800)
+nexttoward_GR_new_sig = r22 // significand of x plus or minus 1 ulp
+nexttoward_GR_new_exp = r23 // not referenced elsewhere in this file
+nexttoward_GR_lden_sig = r24 // largest denormal significand (0x7ffffffffffff800)
+nexttoward_GR_snorm_sig = r25 // smallest normal significand (0x8000000000000000)
+nexttoward_GR_exp1 = r26 // signexp of x plus or minus 1
+nexttoward_GR_x_exp = r27 // signexp of x with the sign masked off
+nexttoward_GR_min_den_rexp = r28 // min_pexp - 52
+// r36-39 parameters for libm_error_support
+
+GR_SAVE_B0 = r34
+GR_SAVE_GP = r35
+GR_SAVE_PFS = r32
+
+GR_Parameter_X = r36
+GR_Parameter_Y = r37
+GR_Parameter_RESULT = r38
+
+NEXTTOWARD_lnorm_sig = f10 // fp copy of largest normal significand
+NEXTTOWARD_lnorm_exp = f11 // fp copy of largest normal exponent
+NEXTTOWARD_lnorm = f12 // largest normal, merged from the two above
+NEXTTOWARD_sden_sig = f13 // fp copy of smallest denormal significand
+NEXTTOWARD_sden_exp = f14 // fp copy of min_pexp
+NEXTTOWARD_sden = f15 // smallest denormal, built in the zero path
+NEXTTOWARD_save_f8 = f33 // copy of x
+NEXTTOWARD_new_exp = f34 // fp copy of signexp of x
+NEXTTOWARD_new_sig = f35 // fp copy of the adjusted significand
+NEXTTOWARD_lden_sig = f36 // fp copy of largest denormal significand
+NEXTTOWARD_snorm_sig = f37 // fp copy of smallest normal significand
+NEXTTOWARD_exp1 = f38 // fp copy of signexp of x plus or minus 1
+NEXTTOWARD_tmp = f39 // scratch operand used only to raise flags
+
+//
+// Overview of operation
+//==============================================================
+// nexttoward determines the next representable value
+// after x in the direction of y.
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(nexttoward)
+
+// Extract signexp from x
+// Is x < y ? p10 if yes, p11 if no
+// Form smallest denormal significand = ulp size
+{ .mfi
+ getf.exp nexttoward_GR_exp = f8
+ fcmp.lt.s1 p10,p11 = f8, f9
+ addl nexttoward_GR_sden_sig = 0x800, r0
+}
+// Form largest normal significand 0xfffffffffffff800
+// Form smallest normal exponent
+{ .mfi
+ addl nexttoward_GR_lnorm_sig = -0x800,r0
+ nop.f 999
+ addl nexttoward_GR_min_pexp = 0x0fc01, r0 ;;
+}
+// Extract significand from x
+// Is x=y?
+// Form largest normal exponent
+{ .mfi
+ getf.sig nexttoward_GR_sig = f8
+ fcmp.eq.s0 p6,p0 = f8, f9
+ addl nexttoward_GR_max_pexp = 0x103fe, r0
+}
+// Move largest normal significand to fp reg for special cases
+{ .mfi
+ setf.sig NEXTTOWARD_lnorm_sig = nexttoward_GR_lnorm_sig
+ nop.f 999
+ addl nexttoward_GR_sign_mask = 0x20000, r0 ;;
+}
+
+// Move smallest denormal significand and smallest normal exp to fp regs
+// Is x=nan?
+// Set p12 and p13 based on whether significand increases or decreases
+// It increases (p12 set) if x<y and x>=0 or if x>y and x<0
+// It decreases (p13 set) if x<y and x<0 or if x>y and x>=0
+{ .mfi
+ setf.sig NEXTTOWARD_sden_sig = nexttoward_GR_sden_sig
+ fclass.m p8,p0 = f8, 0xc3
+(p10) cmp.lt p12,p13 = nexttoward_GR_exp, nexttoward_GR_sign_mask // x<y: p12 if sign of x clear, else p13
+}
+{ .mfi
+ setf.exp NEXTTOWARD_sden_exp = nexttoward_GR_min_pexp // NOTE(review): bundle has 2 instructions for 3 slots, confirm assembler pads the f slot
+(p11) cmp.ge p12,p13 = nexttoward_GR_exp, nexttoward_GR_sign_mask ;; // not x<y: p12 if sign of x set, else p13
+}
+
+.pred.rel "mutex",p12,p13
+
+// Form expected new significand, adding or subtracting 1 ulp increment
+// If x=y set result to y
+// Form smallest normal significand and largest denormal significand
+{ .mfi
+(p12) add nexttoward_GR_new_sig = nexttoward_GR_sig, nexttoward_GR_sden_sig
+(p6) fnorm.d.s0 f8=f9 // x=y: result is y normalized at double precision
+ dep.z nexttoward_GR_snorm_sig = 1,63,1 // 0x8000000000000000
+}
+{ .mlx
+(p13) sub nexttoward_GR_new_sig = nexttoward_GR_sig, nexttoward_GR_sden_sig
+ movl nexttoward_GR_lden_sig = 0x7ffffffffffff800 ;;
+}
+
+// Move expected result significand and signexp to fp regs
+// Is y=nan?
+// Form new exponent in case result exponent needs incrementing or decrementing
+{ .mfi
+ setf.exp NEXTTOWARD_new_exp = nexttoward_GR_exp
+ fclass.m p9,p0 = f9, 0xc3
+(p12) add nexttoward_GR_exp1 = 1, nexttoward_GR_exp
+}
+{ .mib
+ setf.sig NEXTTOWARD_new_sig = nexttoward_GR_new_sig
+(p13) add nexttoward_GR_exp1 = -1, nexttoward_GR_exp
+(p6) br.ret.spnt b0 ;; // Exit if x=y
+}
+
+// Move largest normal signexp to fp reg for special cases
+// Is x=zero?
+{ .mfi
+ setf.exp NEXTTOWARD_lnorm_exp = nexttoward_GR_max_pexp
+ fclass.m p7,p0 = f8, 0x7
+ nop.i 999
+}
+{ .mfb
+ nop.m 999
+(p8) fma.s0 f8 = f8,f1,f9 // x is nan: result = x*1 + y
+(p8) br.ret.spnt b0 ;; // Exit if x=nan
+}
+
+// Move exp+-1 and smallest normal significand to fp regs for special cases
+// Is x=inf?
+{ .mfi
+ setf.exp NEXTTOWARD_exp1 = nexttoward_GR_exp1
+ fclass.m p6,p0 = f8, 0x23
+ addl nexttoward_GR_exp_mask = 0x1ffff, r0
+}
+{ .mfb
+ setf.sig NEXTTOWARD_snorm_sig = nexttoward_GR_snorm_sig
+(p9) fma.s0 f8 = f8,f1,f9 // y is nan: result = x*1 + y
+(p9) br.ret.spnt b0 ;; // Exit if y=nan
+}
+
+// Move largest denormal significand to fp regs for special cases
+// Save x
+{ .mfb
+ setf.sig NEXTTOWARD_lden_sig = nexttoward_GR_lden_sig
+ mov NEXTTOWARD_save_f8 = f8
+(p7) br.cond.spnt NEXTTOWARD_ZERO ;; // Branch to zero handling if x=0
+}
+
+// Mask off the sign to get x_exp
+{ .mfb
+ and nexttoward_GR_x_exp = nexttoward_GR_exp_mask, nexttoward_GR_exp
+ nop.f 999
+(p6) br.cond.spnt NEXTTOWARD_INF ;; // Branch to inf handling if x=inf
+}
+
+// Check 6 special cases when significand rolls over:
+// 1 sig size incr, x_sig=max_sig, x_exp < max_exp
+// Set p6, result is sig=min_sig, exp++
+// 2 sig size incr, x_sig=max_sig, x_exp >= max_exp
+// Set p7, result is inf, signal overflow
+// 3 sig size decr, x_sig=min_sig, x_exp > min_exp
+// Set p8, result is sig=max_sig, exp--
+// 4 sig size decr, x_sig=min_sig, min_exp-52 < x_exp <= min_exp
+// Set p9, result is sig=max_den_sig, exp same, signal underflow and inexact
+// 5 sig size decr, x_sig=min_den_sig, x_exp <= min_exp
+// Set p10, result is zero, sign of x, signal underflow and inexact
+// 6 sig size decr, x_sig=min_sig, x_exp <= min_exp-52
+// Set p14, result is zero, sign of x, signal underflow and inexact
+//
+// Form exponent of smallest double denormal (if normalized register format)
+{ .mmi
+ adds nexttoward_GR_min_den_rexp = -52, nexttoward_GR_min_pexp
+(p12) cmp.eq.unc p6,p0 = nexttoward_GR_new_sig, r0 // p6: incremented sig wrapped to 0
+(p13) cmp.eq.unc p8,p10 = nexttoward_GR_new_sig, nexttoward_GR_lden_sig ;; // p8: x_sig was min_sig, p10: it was not
+}
+
+{ .mmi
+(p6) cmp.lt.unc p6,p7 = nexttoward_GR_x_exp, nexttoward_GR_max_pexp // p6: case 1, p7: case 2
+(p8) cmp.gt.unc p8,p9 = nexttoward_GR_x_exp, nexttoward_GR_min_pexp // p8: case 3, p9: cases 4 and 6
+(p10) cmp.eq.unc p10,p0 = nexttoward_GR_new_sig, r0 ;; // p10: decremented sig is 0
+}
+
+// Create small normal in case need to generate underflow flag
+{ .mfi
+(p10) cmp.le.unc p10,p0 = nexttoward_GR_x_exp, nexttoward_GR_min_pexp // p10: case 5
+ fmerge.se NEXTTOWARD_tmp = NEXTTOWARD_sden_exp, NEXTTOWARD_lnorm_sig
+(p9) cmp.gt.unc p9,p14 = nexttoward_GR_x_exp, nexttoward_GR_min_den_rexp // p9: case 4, p14: case 6
+}
+// Branch if cases 1, 2, 3
+{ .bbb
+(p6) br.cond.spnt NEXTTOWARD_EXPUP
+(p7) br.cond.spnt NEXTTOWARD_OVERFLOW
+(p8) br.cond.spnt NEXTTOWARD_EXPDOWN ;;
+}
+
+// Branch if cases 4, 5, 6
+{ .bbb
+(p9) br.cond.spnt NEXTTOWARD_NORM_TO_DENORM
+(p10) br.cond.spnt NEXTTOWARD_UNDERFLOW_TO_ZERO
+(p14) br.cond.spnt NEXTTOWARD_UNDERFLOW_TO_ZERO ;;
+}
+
+// Here if no special cases
+// Set p6 if result will be a denormal, so can force underflow flag
+// Case 1: x_exp=min_exp, x_sig=unnormalized
+// Case 2: x_exp<min_exp
+{ .mfi
+ cmp.lt p6,p7 = nexttoward_GR_x_exp, nexttoward_GR_min_pexp // p6: x_exp < min_exp, p7: otherwise
+ fmerge.se f8 = NEXTTOWARD_new_exp, NEXTTOWARD_new_sig // signexp of x with adjusted sig
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+ nop.f 999
+(p7) tbit.z p6,p0 = nexttoward_GR_new_sig, 63 ;; // x_exp >= min_exp: p6 if bit 63 of new sig is 0
+}
+
+NEXTTOWARD_COMMON_FINISH:
+// Force underflow and inexact if denormal result (p6 set by the caller path)
+{ .mfi
+ nop.m 999
+(p6) fma.d.s0 NEXTTOWARD_tmp = NEXTTOWARD_tmp,NEXTTOWARD_tmp,f0 // square of small normal, result unused
+ nop.i 999 ;;
+}
+
+// Final normalization to result precision and exit
+{ .mfb
+ nop.m 999
+ fnorm.d.s0 f8 = f8
+ br.ret.sptk b0;;
+}
+
+//Special cases
+NEXTTOWARD_EXPUP:
+{ .mfb
+ cmp.lt p6,p7 = nexttoward_GR_x_exp, nexttoward_GR_min_pexp // p6 requests the underflow flag in common finish
+ fmerge.se f8 = NEXTTOWARD_exp1, NEXTTOWARD_snorm_sig // signexp+1 with smallest normal sig
+ br.cond.sptk NEXTTOWARD_COMMON_FINISH ;;
+}
+
+NEXTTOWARD_EXPDOWN:
+{ .mfb
+ cmp.lt p6,p7 = nexttoward_GR_x_exp, nexttoward_GR_min_pexp // p6 requests the underflow flag in common finish
+ fmerge.se f8 = NEXTTOWARD_exp1, NEXTTOWARD_lnorm_sig // signexp-1 with largest normal sig
+ br.cond.sptk NEXTTOWARD_COMMON_FINISH ;;
+}
+
+NEXTTOWARD_NORM_TO_DENORM:
+{ .mfi
+ nop.m 999
+ fmerge.se f8 = NEXTTOWARD_new_exp, NEXTTOWARD_lden_sig // signexp of x with largest denormal sig
+ nop.i 999
+}
+// Force underflow and inexact, then return without the common fnorm
+{ .mfb
+ nop.m 999
+ fma.d.s0 NEXTTOWARD_tmp = NEXTTOWARD_tmp,NEXTTOWARD_tmp,f0
+ br.ret.sptk b0 ;;
+}
+
+NEXTTOWARD_UNDERFLOW_TO_ZERO:
+{ .mfb
+ cmp.eq p6,p0 = r0,r0 // always true: request the underflow flag in common finish
+ fmerge.s f8 = NEXTTOWARD_save_f8,f0 // zero with the sign of x
+ br.cond.sptk NEXTTOWARD_COMMON_FINISH ;;
+}
+
+NEXTTOWARD_INF:
+// Here if f8 is +- infinity
+// INF
+// if f8 is +inf, no matter what y is return largest double
+// if f8 is -inf, no matter what y is return -largest double
+
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTTOWARD_lnorm = NEXTTOWARD_lnorm_exp,NEXTTOWARD_lnorm_sig // Create largest double
+ nop.i 999 ;;
+}
+
+{ .mfb
+ nop.m 999
+ fmerge.s f8 = f8,NEXTTOWARD_lnorm // sign of x on largest double
+ br.ret.sptk b0 ;;
+}
+
+NEXTTOWARD_ZERO:
+
+// Here if f8 is +- zero
+// ZERO
+// if f8 is zero and y is +, return + smallest double denormal
+// if f8 is zero and y is -, return - smallest double denormal
+
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTTOWARD_sden = NEXTTOWARD_sden_exp,NEXTTOWARD_sden_sig // min_pexp exponent with significand of 1 ulp
+ nop.i 999 ;;
+}
+
+// Create small normal to generate underflow flag
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTTOWARD_tmp = NEXTTOWARD_sden_exp, NEXTTOWARD_lnorm_sig
+ nop.i 999 ;;
+}
+
+// Add correct sign from direction arg
+{ .mfi
+ nop.m 999
+ fmerge.s f8 = f9,NEXTTOWARD_sden // sign of y on smallest denormal
+ nop.i 999 ;;
+}
+
+// Force underflow and inexact flags
+{ .mfb
+ nop.m 999
+ fma.d.s0 NEXTTOWARD_tmp = NEXTTOWARD_tmp,NEXTTOWARD_tmp,f0 // result unused, only the flags matter
+ br.ret.sptk b0 ;;
+}
+
+GLOBAL_LIBM_END(nexttoward)
+// Stack operations when calling error support.
+// (1) (2) (3) (call) (4)
+// sp -> + psp -> + psp -> + sp -> +
+// | | | |
+// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8
+// | | | |
+// | <-GR_Y Y2->| Y2 ->| <- GR_Y |
+// | | | |
+// | | <- GR_X X1 ->| |
+// | | | |
+// sp-64 -> + sp -> + sp -> + +
+// save ar.pfs save b0 restore gp
+// save gp restore ar.pfs
+
+
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+NEXTTOWARD_OVERFLOW:
+// Here if f8 is finite, but result will be infinite
+// Use frcpa to generate infinity of correct sign
+// Call error support to report possible range error
+.prologue
+
+{ .mfi
+ alloc r32=ar.pfs,2,2,4,0 // r32-r35 input/local, r36-r39 output
+ frcpa.s1 f8,p6 = NEXTTOWARD_save_f8, f0 // x divided by zero
+ nop.i 999 ;;
+}
+
+// Create largest double
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTTOWARD_lnorm = NEXTTOWARD_lnorm_exp,NEXTTOWARD_lnorm_sig
+ nop.i 999 ;;
+}
+
+// Force overflow and inexact flags to be set
+{ .mfi
+ mov r39 = 199 // Error code, last of the r36-39 parameters
+ fma.d.s0 NEXTTOWARD_tmp = NEXTTOWARD_lnorm,NEXTTOWARD_lnorm,f0 // square of largest normal, result unused
+ nop.i 999
+}
+;;
+
+// (1)
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address (old sp - 32)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+
+
+// (2)
+{ .mmi
+ stfd [GR_Parameter_Y] = f9,16 // STORE Parameter 2 on stack, then advance by 16
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+
+.body
+// (3)
+{ .mib
+ stfd [GR_Parameter_X] = NEXTTOWARD_save_f8 // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
+}
+{ .mib
+ stfd [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Back to Parameter 2 address
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp // Recompute result address after the call
+};;
+
+// (4)
+{ .mmi
+ ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
+
diff --git a/sysdeps/ia64/fpu/s_nexttoward.c b/sysdeps/ia64/fpu/s_nexttoward.c
deleted file mode 100644
index aee2bb5895..0000000000
--- a/sysdeps/ia64/fpu/s_nexttoward.c
+++ /dev/null
@@ -1 +0,0 @@
-#include <sysdeps/i386/fpu/s_nexttoward.c>
diff --git a/sysdeps/ia64/fpu/s_nexttowardf.S b/sysdeps/ia64/fpu/s_nexttowardf.S
new file mode 100644
index 0000000000..fb1adaea5b
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_nexttowardf.S
@@ -0,0 +1,494 @@
+.file "nexttowardf.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 08/15/01 Initial version
+// 08/23/01 Corrected error tag number
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// float nexttowardf( float x, long double y );
+// input floating point f8, f9
+// output floating point f8
+//
+// Registers used
+//==============================================================
+nexttoward_GR_max_pexp = r14 // largest normal float exponent (0x1007e)
+nexttoward_GR_min_pexp = r15 // smallest normal float exponent (0x0ff81)
+nexttoward_GR_exp = r16 // signexp of x (from getf.exp)
+nexttoward_GR_sig = r17 // significand of x (from getf.sig)
+nexttoward_GR_lnorm_sig = r18 // largest normal significand
+nexttoward_GR_sign_mask = r19 // 0x20000, compared with signexp to test the sign of x
+nexttoward_GR_exp_mask = r20 // 0x1ffff, masks the sign off signexp
+nexttoward_GR_sden_sig = r21 // smallest denormal significand = 1 ulp
+nexttoward_GR_new_sig = r22 // significand of x plus or minus 1 ulp
+nexttoward_GR_new_exp = r23 // not referenced by the code in this file
+nexttoward_GR_lden_sig = r24 // largest denormal significand
+nexttoward_GR_snorm_sig = r25 // smallest normal significand
+nexttoward_GR_exp1 = r26 // signexp of x plus or minus 1
+nexttoward_GR_x_exp = r27 // signexp of x with the sign masked off
+nexttoward_GR_min_den_rexp = r28 // nexttoward_GR_min_pexp - 23
+// r36-r38 hold the X, Y and RESULT addresses and r39 the error code for libm_error_support
+
+GR_SAVE_B0 = r34 // b0 saved across the error support call
+GR_SAVE_GP = r35 // gp saved across the error support call
+GR_SAVE_PFS = r32 // ar.pfs saved across the error support call
+
+GR_Parameter_X = r36
+GR_Parameter_Y = r37
+GR_Parameter_RESULT = r38
+
+NEXTTOWARD_lnorm_sig = f10
+NEXTTOWARD_lnorm_exp = f11
+NEXTTOWARD_lnorm = f12 // largest float, merged from lnorm_exp and lnorm_sig
+NEXTTOWARD_sden_sig = f13
+NEXTTOWARD_sden_exp = f14 // loaded with nexttoward_GR_min_pexp
+NEXTTOWARD_sden = f15 // smallest float denormal, merged from sden_exp and sden_sig
+NEXTTOWARD_save_f8 = f33 // copy of x taken before f8 is overwritten
+NEXTTOWARD_new_exp = f34 // loaded with the signexp of x
+NEXTTOWARD_new_sig = f35
+NEXTTOWARD_lden_sig = f36
+NEXTTOWARD_snorm_sig = f37
+NEXTTOWARD_exp1 = f38
+NEXTTOWARD_tmp = f39 // operand and result of the fma used only to raise flags
+
+//
+// Overview of operation
+//==============================================================
+// nexttowardf determines the next representable value
+// after x in the direction of y.
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(nexttowardf)
+// On entry f8 = x and f9 = y; the result is returned in f8.
+// Extract signexp from x
+// Form smallest denormal significand = ulp size
+{ .mlx
+ getf.exp nexttoward_GR_exp = f8
+ movl nexttoward_GR_sden_sig = 0x0000010000000000 // only bit 40 set
+}
+// Form largest normal exponent
+// Is x < y ? p10 if yes, p11 if no
+// Form smallest normal exponent
+{ .mfi
+ addl nexttoward_GR_max_pexp = 0x1007e, r0
+ fcmp.lt.s1 p10,p11 = f8, f9
+ addl nexttoward_GR_min_pexp = 0x0ff81, r0 ;;
+}
+
+// Is x=y?
+{ .mfi
+ getf.sig nexttoward_GR_sig = f8
+ fcmp.eq.s0 p6,p0 = f8, f9
+ nop.i 0
+}
+// Extract significand from x
+// Form largest normal significand
+{ .mlx
+ nop.m 0
+ movl nexttoward_GR_lnorm_sig = 0xffffff0000000000 ;; // bits 63..40 set
+}
+
+// Move largest normal significand to fp reg for special cases
+{ .mfi
+ setf.sig NEXTTOWARD_lnorm_sig = nexttoward_GR_lnorm_sig
+ nop.f 0
+ addl nexttoward_GR_sign_mask = 0x20000, r0 ;; // signexp below this value is treated as x>=0
+}
+
+// Move smallest denormal significand and signexp to fp regs
+// Is x=nan?
+// Set p12 and p13 based on whether significand increases or decreases
+// It increases (p12 set) if x<y and x>=0 or if x>y and x<0
+// It decreases (p13 set) if x<y and x<0 or if x>y and x>=0
+{ .mfi
+ setf.sig NEXTTOWARD_sden_sig = nexttoward_GR_sden_sig
+ fclass.m p8,p0 = f8, 0xc3
+(p10) cmp.lt p12,p13 = nexttoward_GR_exp, nexttoward_GR_sign_mask
+}
+{ .mfi
+ setf.exp NEXTTOWARD_sden_exp = nexttoward_GR_min_pexp
+ nop.f 999
+(p11) cmp.ge p12,p13 = nexttoward_GR_exp, nexttoward_GR_sign_mask ;;
+}
+
+.pred.rel "mutex",p12,p13
+
+// Form expected new significand, adding or subtracting 1 ulp increment
+// If x=y set result to y
+// Form smallest normal significand and largest denormal significand
+{ .mfi
+(p12) add nexttoward_GR_new_sig = nexttoward_GR_sig, nexttoward_GR_sden_sig
+(p6) fnorm.s.s0 f8=f9 //Normalise
+ dep.z nexttoward_GR_snorm_sig = 1,63,1 // 0x8000000000000000
+}
+{ .mlx
+(p13) sub nexttoward_GR_new_sig = nexttoward_GR_sig, nexttoward_GR_sden_sig
+ movl nexttoward_GR_lden_sig = 0x7fffff0000000000 ;;
+}
+
+// Move expected result significand and signexp to fp regs
+// Is y=nan?
+// Form new exponent in case result exponent needs incrementing or decrementing
+{ .mfi
+ setf.exp NEXTTOWARD_new_exp = nexttoward_GR_exp
+ fclass.m p9,p0 = f9, 0xc3
+(p12) add nexttoward_GR_exp1 = 1, nexttoward_GR_exp
+}
+{ .mib
+ setf.sig NEXTTOWARD_new_sig = nexttoward_GR_new_sig
+(p13) add nexttoward_GR_exp1 = -1, nexttoward_GR_exp
+(p6) br.ret.spnt b0 ;; // Exit if x=y
+}
+
+// Move largest normal signexp to fp reg for special cases
+// Is x=zero?
+{ .mfi
+ setf.exp NEXTTOWARD_lnorm_exp = nexttoward_GR_max_pexp
+ fclass.m p7,p0 = f8, 0x7
+ nop.i 999
+}
+{ .mfb
+ nop.m 999
+(p8) fma.s0 f8 = f8,f1,f9 // Result is f8*f1+f9 when x is nan
+(p8) br.ret.spnt b0 ;; // Exit if x=nan
+}
+
+// Move exp+-1 and smallest normal significand to fp regs for special cases
+// Is x=inf?
+{ .mfi
+ setf.exp NEXTTOWARD_exp1 = nexttoward_GR_exp1
+ fclass.m p6,p0 = f8, 0x23
+ addl nexttoward_GR_exp_mask = 0x1ffff, r0
+}
+{ .mfb
+ setf.sig NEXTTOWARD_snorm_sig = nexttoward_GR_snorm_sig
+(p9) fma.s0 f8 = f8,f1,f9 // Result is f8*f1+f9 when y is nan
+(p9) br.ret.spnt b0 ;; // Exit if y=nan
+}
+
+// Move largest denormal significand to fp regs for special cases
+// Save x
+{ .mfb
+ setf.sig NEXTTOWARD_lden_sig = nexttoward_GR_lden_sig
+ mov NEXTTOWARD_save_f8 = f8
+(p7) br.cond.spnt NEXTTOWARD_ZERO ;; // Exit if x=0
+}
+
+// Mask off the sign to get x_exp
+{ .mfb
+ and nexttoward_GR_x_exp = nexttoward_GR_exp_mask, nexttoward_GR_exp
+ nop.f 999
+(p6) br.cond.spnt NEXTTOWARD_INF ;; // Exit if x=inf
+}
+
+// Check 6 special cases when significand rolls over:
+// 1 sig size incr, x_sig=max_sig, x_exp < max_exp
+// Set p6, result is sig=min_sig, exp++
+// 2 sig size incr, x_sig=max_sig, x_exp >= max_exp
+// Set p7, result is inf, signal overflow
+// 3 sig size decr, x_sig=min_sig, x_exp > min_exp
+// Set p8, result is sig=max_sig, exp--
+// 4 sig size decr, x_sig=min_sig, x_exp = min_exp
+// Set p9, result is sig=max_den_sig, exp same, signal underflow and inexact
+// 5 sig size decr, x_sig=min_den_sig, x_exp = min_exp
+// Set p10, result is zero, sign of x, signal underflow and inexact
+// 6 sig size decr, x_sig=min_sig, x_exp < min_exp
+// Set p14, result is zero, sign of x, signal underflow and inexact
+//
+// Form exponent of smallest float denormal (if normalized register format)
+{ .mmi
+ adds nexttoward_GR_min_den_rexp = -23, nexttoward_GR_min_pexp
+(p12) cmp.eq.unc p6,p0 = nexttoward_GR_new_sig, r0 // p6: increment wrapped the significand to 0
+(p13) cmp.eq.unc p8,p10 = nexttoward_GR_new_sig, nexttoward_GR_lden_sig ;; // p8: decrement gave lden_sig, else p10
+}
+
+{ .mmi
+(p6) cmp.lt.unc p6,p7 = nexttoward_GR_x_exp, nexttoward_GR_max_pexp // Case 1 (p6) or case 2 (p7)
+(p8) cmp.gt.unc p8,p9 = nexttoward_GR_x_exp, nexttoward_GR_min_pexp // Case 3 (p8), else p9
+(p10) cmp.eq.unc p10,p0 = nexttoward_GR_new_sig, r0 ;; // p10: decrement gave a zero significand
+}
+
+// Create small normal in case need to generate underflow flag
+{ .mfi
+(p10) cmp.le.unc p10,p0 = nexttoward_GR_x_exp, nexttoward_GR_min_pexp // Case 5 (p10)
+ fmerge.se NEXTTOWARD_tmp = NEXTTOWARD_sden_exp, NEXTTOWARD_lnorm_sig
+(p9) cmp.gt.unc p9,p14 = nexttoward_GR_x_exp, nexttoward_GR_min_den_rexp // Case 4 (p9) or case 6 (p14)
+}
+// Branch if cases 1, 2, 3
+{ .bbb
+(p6) br.cond.spnt NEXTTOWARD_EXPUP
+(p7) br.cond.spnt NEXTTOWARD_OVERFLOW
+(p8) br.cond.spnt NEXTTOWARD_EXPDOWN ;;
+}
+
+// Branch if cases 4, 5, 6
+{ .bbb
+(p9) br.cond.spnt NEXTTOWARD_NORM_TO_DENORM
+(p10) br.cond.spnt NEXTTOWARD_UNDERFLOW_TO_ZERO
+(p14) br.cond.spnt NEXTTOWARD_UNDERFLOW_TO_ZERO ;;
+}
+
+// Here if no special cases
+// Set p6 if result will be a denormal, so can force underflow flag
+// Case 1: x_exp=min_exp, x_sig=unnormalized
+// Case 2: x_exp<min_exp
+{ .mfi
+ cmp.lt p6,p7 = nexttoward_GR_x_exp, nexttoward_GR_min_pexp
+ fmerge.se f8 = NEXTTOWARD_new_exp, NEXTTOWARD_new_sig
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+ nop.f 999
+(p7) tbit.z p6,p0 = nexttoward_GR_new_sig, 63 ;; // x_exp>=min_exp: p6 if bit 63 of new_sig is clear
+}
+
+NEXTTOWARD_COMMON_FINISH:
+// Force underflow and inexact if denormal result
+{ .mfi
+ nop.m 999
+(p6) fma.s.s0 NEXTTOWARD_tmp = NEXTTOWARD_tmp,NEXTTOWARD_tmp,f0 // Squares the small normal built above
+ nop.i 999 ;;
+}
+
+// Final normalization to result precision and exit
+{ .mfb
+ nop.m 999
+ fnorm.s.s0 f8 = f8
+ br.ret.sptk b0;;
+}
+
+//Special cases
+NEXTTOWARD_EXPUP:
+{ .mfb
+ cmp.lt p6,p7 = nexttoward_GR_x_exp, nexttoward_GR_min_pexp // p6 selects the flag-raising fma at COMMON_FINISH
+ fmerge.se f8 = NEXTTOWARD_exp1, NEXTTOWARD_snorm_sig
+ br.cond.sptk NEXTTOWARD_COMMON_FINISH ;;
+}
+
+NEXTTOWARD_EXPDOWN:
+{ .mfb
+ cmp.lt p6,p7 = nexttoward_GR_x_exp, nexttoward_GR_min_pexp // p6 selects the flag-raising fma at COMMON_FINISH
+ fmerge.se f8 = NEXTTOWARD_exp1, NEXTTOWARD_lnorm_sig
+ br.cond.sptk NEXTTOWARD_COMMON_FINISH ;;
+}
+
+NEXTTOWARD_NORM_TO_DENORM:
+{ .mfi
+ nop.m 999
+ fmerge.se f8 = NEXTTOWARD_new_exp, NEXTTOWARD_lden_sig // signexp of x with largest denormal significand
+ nop.i 999
+}
+// Force underflow and inexact
+{ .mfb
+ nop.m 999
+ fma.s.s0 NEXTTOWARD_tmp = NEXTTOWARD_tmp,NEXTTOWARD_tmp,f0
+ br.ret.sptk b0 ;;
+}
+
+NEXTTOWARD_UNDERFLOW_TO_ZERO:
+{ .mfb
+ cmp.eq p6,p0 = r0,r0 // Always sets p6 so COMMON_FINISH raises the flags
+ fmerge.s f8 = NEXTTOWARD_save_f8,f0 // Sign of x merged onto f0
+ br.cond.sptk NEXTTOWARD_COMMON_FINISH ;;
+}
+
+NEXTTOWARD_INF:
+// Here if f8 is +- infinity
+// INF
+// if f8 is +inf, no matter what y is return largest float
+// if f8 is -inf, no matter what y is return -largest float
+
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTTOWARD_lnorm = NEXTTOWARD_lnorm_exp,NEXTTOWARD_lnorm_sig
+ nop.i 999 ;;
+}
+
+{ .mfb
+ nop.m 999
+ fmerge.s f8 = f8,NEXTTOWARD_lnorm // Sign of x merged onto largest float
+ br.ret.sptk b0 ;;
+}
+
+NEXTTOWARD_ZERO:
+
+// Here if f8 is +- zero
+// ZERO
+// if f8 is zero and y is +, return + smallest float denormal
+// if f8 is zero and y is -, return - smallest float denormal
+
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTTOWARD_sden = NEXTTOWARD_sden_exp,NEXTTOWARD_sden_sig
+ nop.i 999 ;;
+}
+
+// Create small normal to generate underflow flag
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTTOWARD_tmp = NEXTTOWARD_sden_exp, NEXTTOWARD_lnorm_sig
+ nop.i 999 ;;
+}
+
+// Add correct sign from direction arg
+{ .mfi
+ nop.m 999
+ fmerge.s f8 = f9,NEXTTOWARD_sden
+ nop.i 999;;
+}
+
+// Force underflow and inexact flags
+{ .mfb
+ nop.m 999
+ fma.s.s0 NEXTTOWARD_tmp = NEXTTOWARD_tmp,NEXTTOWARD_tmp,f0
+ br.ret.sptk b0 ;;
+}
+
+GLOBAL_LIBM_END(nexttowardf)
+// Stack operations when calling error support.
+// (1) (2) (3) (call) (4)
+// sp -> + psp -> + psp -> + sp -> +
+// | | | |
+// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8
+// | | | |
+// | <-GR_Y Y2->| Y2 ->| <- GR_Y |
+// | | | |
+// | | <- GR_X X1 ->| |
+// | | | |
+// sp-64 -> + sp -> + sp -> + +
+// save ar.pfs save b0 restore gp
+// save gp restore ar.pfs
+
+
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+NEXTTOWARD_OVERFLOW:
+// Here if f8 is finite, but result will be infinite
+// Use frcpa to generate infinity of correct sign
+// Call error support to report possible range error
+.prologue
+
+{ .mfi
+ alloc r32=ar.pfs,2,2,4,0 // r32 (GR_SAVE_PFS) receives ar.pfs
+ frcpa.s1 f8,p6 = NEXTTOWARD_save_f8, f0
+ nop.i 999
+}
+
+// Create largest float
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTTOWARD_lnorm = NEXTTOWARD_lnorm_exp,NEXTTOWARD_lnorm_sig
+ nop.i 999 ;;
+}
+
+// Force overflow and inexact flags to be set
+{ .mfi
+ mov r39 = 200 // Error code
+ fma.s.s0 NEXTTOWARD_tmp = NEXTTOWARD_lnorm,NEXTTOWARD_lnorm,f0
+ nop.i 999
+}
+;;
+
+// (1)
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address (sp still has its entry value)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+
+
+// (2)
+{ .mmi
+ stfs [GR_Parameter_Y] = f9,16 // STORE Parameter 2 on stack, then advance GR_Parameter_Y by 16
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+
+.body
+// (3)
+{ .mib
+ stfs [GR_Parameter_X] = NEXTTOWARD_save_f8 // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
+}
+{ .mib
+ stfs [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Back to the Parameter 2 address
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp // Recompute the Parameter 3 address
+};;
+
+// (4)
+{ .mmi
+ ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+
+// __libm_error_support is declared here and is not defined in this file
+.type __libm_error_support#,@function
+.global __libm_error_support#
+
diff --git a/sysdeps/ia64/fpu/s_nexttowardf.c b/sysdeps/ia64/fpu/s_nexttowardf.c
deleted file mode 100644
index 55e95f6916..0000000000
--- a/sysdeps/ia64/fpu/s_nexttowardf.c
+++ /dev/null
@@ -1 +0,0 @@
-#include <sysdeps/i386/fpu/s_nexttowardf.c>
diff --git a/sysdeps/ia64/fpu/s_nexttowardl.S b/sysdeps/ia64/fpu/s_nexttowardl.S
new file mode 100644
index 0000000000..9c79f2cd1e
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_nexttowardl.S
@@ -0,0 +1,492 @@
+.file "nexttowardl.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 08/15/01 Initial version
+// 08/23/01 Corrected error tag number
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// long double nexttowardl( long double x, long double y );
+// input floating point f8, f9
+// output floating point f8
+//
+// Registers used
+//==============================================================
+nexttoward_GR_max_pexp = r14 // largest normal exponent (0x13ffe)
+nexttoward_GR_min_pexp = r15 // smallest normal exponent (0x0c001)
+nexttoward_GR_exp = r16 // signexp of x (from getf.exp)
+nexttoward_GR_sig = r17 // significand of x (from getf.sig)
+nexttoward_GR_lnorm_sig = r18 // largest normal significand
+nexttoward_GR_sign_mask = r19 // 0x20000, compared with signexp to test the sign of x
+nexttoward_GR_exp_mask = r20 // 0x1ffff, masks the sign off signexp
+nexttoward_GR_sden_sig = r21 // smallest denormal significand = 1 ulp
+nexttoward_GR_new_sig = r22 // significand of x plus or minus 1 ulp
+nexttoward_GR_new_exp = r23 // not referenced by the code in this file
+nexttoward_GR_lden_sig = r24 // largest denormal significand
+nexttoward_GR_snorm_sig = r25 // smallest normal significand
+nexttoward_GR_exp1 = r26 // signexp of x plus or minus 1
+nexttoward_GR_x_exp = r27 // signexp of x with the sign masked off
+// r36-r38 hold the X, Y and RESULT addresses and r39 the error code for libm_error_support
+
+GR_SAVE_B0 = r34 // b0 saved across the error support call
+GR_SAVE_GP = r35 // gp saved across the error support call
+GR_SAVE_PFS = r32 // ar.pfs saved across the error support call
+
+GR_Parameter_X = r36
+GR_Parameter_Y = r37
+GR_Parameter_RESULT = r38
+
+NEXTTOWARD_lnorm_sig = f10
+NEXTTOWARD_lnorm_exp = f11
+NEXTTOWARD_lnorm = f12 // largest long double, merged from lnorm_exp and lnorm_sig
+NEXTTOWARD_sden_sig = f13
+NEXTTOWARD_den_exp = f14 // written once by setf.exp, never read in this file
+NEXTTOWARD_sden = f15 // smallest long double denormal
+NEXTTOWARD_snorm_exp = f32 // loaded with nexttoward_GR_min_pexp
+NEXTTOWARD_save_f8 = f33 // copy of x taken before f8 is overwritten
+NEXTTOWARD_new_exp = f34 // loaded with the signexp of x
+NEXTTOWARD_new_sig = f35
+NEXTTOWARD_lden_sig = f36
+NEXTTOWARD_snorm_sig = f37
+NEXTTOWARD_exp1 = f38
+NEXTTOWARD_tmp = f39 // operand and result of the fma used only to raise flags
+
+//
+// Overview of operation
+//==============================================================
+// nexttowardl determines the next representable value
+// after x in the direction of y.
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(nexttowardl)
+// On entry f8 = x and f9 = y; the result is returned in f8.
+// Extract signexp from x
+// Is x < y ? p10 if yes, p11 if no
+// Form smallest denormal significand = ulp size
+{ .mfi
+ getf.exp nexttoward_GR_exp = f8
+ fcmp.lt.s1 p10,p11 = f8, f9
+ addl nexttoward_GR_sden_sig = 0x1, r0
+}
+// Form largest normal significand 0xffffffffffffffff
+// Form smallest normal exponent
+{ .mfi
+ addl nexttoward_GR_lnorm_sig = -0x1,r0
+ nop.f 999
+ addl nexttoward_GR_min_pexp = 0x0c001, r0 ;;
+}
+
+// Extract significand from x
+// Is x=y? This fcmp also sets Invalid and Denormal if required
+// Form largest normal exponent
+{ .mfi
+ getf.sig nexttoward_GR_sig = f8
+ fcmp.eq.s0 p6,p0 = f8, f9
+ addl nexttoward_GR_max_pexp = 0x13ffe, r0
+}
+// Move largest normal significand to fp reg for special cases
+{ .mfi
+ setf.sig NEXTTOWARD_lnorm_sig = nexttoward_GR_lnorm_sig
+ nop.f 999
+ addl nexttoward_GR_sign_mask = 0x20000, r0 ;; // signexp below this value is treated as x>=0
+}
+
+// Move smallest denormal significand and exp to fp regs
+// Is x=nan?
+// Set p12 and p13 based on whether significand increases or decreases
+// It increases (p12 set) if x<y and x>=0 or if x>y and x<0
+// It decreases (p13 set) if x<y and x<0 or if x>y and x>=0
+{ .mfi
+ setf.sig NEXTTOWARD_sden_sig = nexttoward_GR_sden_sig
+ fclass.m p8,p0 = f8, 0xc3
+(p10) cmp.lt p12,p13 = nexttoward_GR_exp, nexttoward_GR_sign_mask
+}
+// Move smallest normal exp to fp regs
+{ .mfi
+ setf.exp NEXTTOWARD_snorm_exp = nexttoward_GR_min_pexp
+ nop.f 999
+(p11) cmp.ge p12,p13 = nexttoward_GR_exp, nexttoward_GR_sign_mask ;;
+}
+
+.pred.rel "mutex",p12,p13
+
+// Form expected new significand, adding or subtracting 1 ulp increment
+// If x=y set result to y
+// Form smallest normal significand and largest denormal significand
+{ .mfi
+(p12) add nexttoward_GR_new_sig = nexttoward_GR_sig, nexttoward_GR_sden_sig
+(p6) fmerge.s f8=f9,f9 // Sign and remaining fields both from f9
+ dep.z nexttoward_GR_snorm_sig = 1,63,1 // 0x8000000000000000
+}
+{ .mlx
+(p13) sub nexttoward_GR_new_sig = nexttoward_GR_sig, nexttoward_GR_sden_sig
+ movl nexttoward_GR_lden_sig = 0x7fffffffffffffff ;;
+}
+
+// Move expected result significand and signexp to fp regs
+// Is y=nan?
+// Form new exponent in case result exponent needs incrementing or decrementing
+{ .mfi
+ setf.exp NEXTTOWARD_new_exp = nexttoward_GR_exp
+ fclass.m p9,p0 = f9, 0xc3
+(p12) add nexttoward_GR_exp1 = 1, nexttoward_GR_exp
+}
+{ .mib
+ setf.sig NEXTTOWARD_new_sig = nexttoward_GR_new_sig
+(p13) add nexttoward_GR_exp1 = -1, nexttoward_GR_exp
+(p6) br.ret.spnt b0 ;; // Exit if x=y
+}
+
+// Move largest normal signexp to fp reg for special cases
+// Is x=zero?
+{ .mfi
+ setf.exp NEXTTOWARD_lnorm_exp = nexttoward_GR_max_pexp
+ fclass.m p7,p0 = f8, 0x7
+ nop.i 999
+}
+{ .mfb
+ setf.exp NEXTTOWARD_den_exp = nexttoward_GR_min_pexp // NEXTTOWARD_den_exp is not read anywhere in this file
+(p8) fma.s0 f8 = f8,f1,f9 // Result is f8*f1+f9 when x is nan
+(p8) br.ret.spnt b0 ;; // Exit if x=nan
+}
+
+// Move exp+-1 and smallest normal significand to fp regs for special cases
+// Is x=inf?
+{ .mfi
+ setf.exp NEXTTOWARD_exp1 = nexttoward_GR_exp1
+ fclass.m p6,p0 = f8, 0x23
+ addl nexttoward_GR_exp_mask = 0x1ffff, r0
+}
+{ .mfb
+ setf.sig NEXTTOWARD_snorm_sig = nexttoward_GR_snorm_sig
+(p9) fma.s0 f8 = f8,f1,f9 // Result is f8*f1+f9 when y is nan
+(p9) br.ret.spnt b0 ;; // Exit if y=nan
+}
+
+// Move largest denormal significand to fp regs for special cases
+// Save x
+{ .mfb
+ setf.sig NEXTTOWARD_lden_sig = nexttoward_GR_lden_sig
+ mov NEXTTOWARD_save_f8 = f8
+(p7) br.cond.spnt NEXTTOWARD_ZERO ;; // Exit if x=0
+}
+
+// Mask off the sign to get x_exp
+{ .mfb
+ and nexttoward_GR_x_exp = nexttoward_GR_exp_mask, nexttoward_GR_exp
+ nop.f 999
+(p6) br.cond.spnt NEXTTOWARD_INF ;; // Exit if x=inf
+}
+
+// Check 5 special cases when significand rolls over:
+// 1 sig size incr, x_sig=max_sig, x_exp < max_exp
+// Set p6, result is sig=min_sig, exp++
+// 2 sig size incr, x_sig=max_sig, x_exp >= max_exp
+// Set p7, result is inf, signal overflow
+// 3 sig size decr, x_sig=min_sig, x_exp > min_exp
+// Set p8, result is sig=max_sig, exp--
+// 4 sig size decr, x_sig=min_sig, x_exp = min_exp
+// Set p9, result is sig=max_den_sig, exp same, signal underflow and inexact
+// 5 sig size decr, x_sig=min_den_sig, x_exp = min_exp
+// Set p10, result is zero, sign of x, signal underflow and inexact
+//
+{ .mmi
+(p12) cmp.eq.unc p6,p0 = nexttoward_GR_new_sig, r0 // p6: increment wrapped the significand to 0
+(p13) cmp.eq.unc p9,p10 = nexttoward_GR_new_sig, nexttoward_GR_lden_sig // p9: decrement gave lden_sig, else p10
+ nop.i 999
+;;
+}
+
+{ .mmi
+(p6) cmp.lt.unc p6,p7 = nexttoward_GR_x_exp, nexttoward_GR_max_pexp // Case 1 (p6) or case 2 (p7)
+(p10) cmp.eq.unc p10,p0 = nexttoward_GR_new_sig, r0 // Case 5 (p10): decrement gave a zero significand
+(p9) cmp.le.unc p9,p8 = nexttoward_GR_x_exp, nexttoward_GR_min_pexp // Case 4 (p9) or case 3 (p8)
+;;
+}
+
+// Create small normal in case need to generate underflow flag
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTTOWARD_tmp = NEXTTOWARD_snorm_exp, NEXTTOWARD_lnorm_sig
+ nop.i 999
+}
+// Branch if cases 1, 2, 3
+{ .bbb
+(p6) br.cond.spnt NEXTTOWARD_EXPUP
+(p7) br.cond.spnt NEXTTOWARD_OVERFLOW
+(p8) br.cond.spnt NEXTTOWARD_EXPDOWN ;;
+}
+
+// Branch if cases 4, 5
+{ .mbb
+ nop.m 999
+(p9) br.cond.spnt NEXTTOWARD_NORM_TO_DENORM
+(p10) br.cond.spnt NEXTTOWARD_UNDERFLOW_TO_ZERO
+;;
+}
+
+// Here if no special cases
+// Set p6 if result will be a denormal, so can force underflow flag
+// Case 1: x_exp=min_exp, x_sig=unnormalized
+// Case 2: x_exp<min_exp
+{ .mfi
+ cmp.lt p6,p7 = nexttoward_GR_x_exp, nexttoward_GR_min_pexp
+ fmerge.se f8 = NEXTTOWARD_new_exp, NEXTTOWARD_new_sig
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
+ nop.f 999
+(p6) tbit.z p6,p0 = nexttoward_GR_new_sig, 63 ;; // NOTE(review): qualified by p6, so p6 survives only if x_exp<min_exp and bit 63 of new_sig is clear; nexttowardf uses (p7) here - confirm intended
+}
+
+NEXTTOWARD_COMMON_FINISH:
+// Force underflow and inexact if denormal result
+{ .mfi
+ nop.m 999
+(p6) fma.s0 NEXTTOWARD_tmp = NEXTTOWARD_tmp,NEXTTOWARD_tmp,f0 // Squares the small normal built above
+ nop.i 999 ;;
+}
+
+// Final normalization to result precision and exit
+{ .mfb
+ nop.m 999
+ fnorm.s0 f8 = f8
+ br.ret.sptk b0;;
+}
+
+//Special cases
+NEXTTOWARD_EXPUP:
+{ .mfb
+ cmp.lt p6,p7 = nexttoward_GR_x_exp, nexttoward_GR_min_pexp // p6 selects the flag-raising fma at COMMON_FINISH
+ fmerge.se f8 = NEXTTOWARD_exp1, NEXTTOWARD_snorm_sig
+ br.cond.sptk NEXTTOWARD_COMMON_FINISH ;;
+}
+
+NEXTTOWARD_EXPDOWN:
+{ .mfb
+ cmp.lt p6,p7 = nexttoward_GR_x_exp, nexttoward_GR_min_pexp // p6 selects the flag-raising fma at COMMON_FINISH
+ fmerge.se f8 = NEXTTOWARD_exp1, NEXTTOWARD_lnorm_sig
+ br.cond.sptk NEXTTOWARD_COMMON_FINISH ;;
+}
+
+NEXTTOWARD_NORM_TO_DENORM:
+{ .mfi
+ nop.m 999
+ fmerge.se f8 = NEXTTOWARD_exp1, NEXTTOWARD_lden_sig // signexp of x minus 1 with largest denormal significand
+ nop.i 999
+}
+// Force underflow and inexact
+{ .mfb
+ nop.m 999
+ fma.s0 NEXTTOWARD_tmp = NEXTTOWARD_tmp,NEXTTOWARD_tmp,f0
+ br.ret.sptk b0 ;;
+}
+
+NEXTTOWARD_UNDERFLOW_TO_ZERO:
+{ .mfb
+ cmp.eq p6,p0 = r0,r0 // Always sets p6 so COMMON_FINISH raises the flags
+ fmerge.s f8 = NEXTTOWARD_save_f8,f0 // Sign of x merged onto f0
+ br.cond.sptk NEXTTOWARD_COMMON_FINISH ;;
+}
+
+NEXTTOWARD_INF:
+// Here if f8 is +- infinity
+// INF
+// if f8 is +inf, no matter what y is return largest long double
+// if f8 is -inf, no matter what y is return -largest long double
+
+// Create largest long double
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTTOWARD_lnorm = NEXTTOWARD_lnorm_exp,NEXTTOWARD_lnorm_sig
+ nop.i 999 ;;
+}
+
+{ .mfb
+ nop.m 999
+ fmerge.s f8 = f8,NEXTTOWARD_lnorm // Sign of x merged onto largest long double
+ br.ret.sptk b0 ;;
+}
+
+NEXTTOWARD_ZERO:
+
+// Here if f8 is +- zero
+// ZERO
+// if f8 is zero and y is +, return + smallest long double denormal
+// if f8 is zero and y is -, return - smallest long double denormal
+
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTTOWARD_sden = f0,NEXTTOWARD_sden_sig // Sign and exponent fields come from f0
+ nop.i 999 ;;
+}
+
+// Create small normal to generate underflow flag
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTTOWARD_tmp = NEXTTOWARD_snorm_exp, NEXTTOWARD_lnorm_sig
+ nop.i 999 ;;
+}
+
+// Add correct sign from direction arg
+{ .mfi
+ nop.m 999
+ fmerge.s f8 = f9,NEXTTOWARD_sden
+ nop.i 999 ;;
+}
+
+// Force underflow and inexact flags
+{ .mfb
+ nop.m 999
+ fma.s0 NEXTTOWARD_tmp = NEXTTOWARD_tmp,NEXTTOWARD_tmp,f0
+ br.ret.sptk b0 ;;
+}
+
+GLOBAL_LIBM_END(nexttowardl)
+// Stack operations when calling error support.
+// (1) (2) (3) (call) (4)
+// sp -> + psp -> + psp -> + sp -> +
+// | | | |
+// | | <- GR_Y R3 ->| <- GR_RESULT | -> f8
+// | | | |
+// | <-GR_Y Y2->| Y2 ->| <- GR_Y |
+// | | | |
+// | | <- GR_X X1 ->| |
+// | | | |
+// sp-64 -> + sp -> + sp -> + +
+// save ar.pfs save b0 restore gp
+// save gp restore ar.pfs
+
+
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+NEXTTOWARD_OVERFLOW:
+// Here if f8 is finite, but result will be infinite
+// Use frcpa to generate infinity of correct sign
+// Call error support to report possible range error
+.prologue
+
+{ .mfi
+ alloc r32=ar.pfs,2,2,4,0 // r32 (GR_SAVE_PFS) receives ar.pfs
+ frcpa.s1 f8,p6 = NEXTTOWARD_save_f8, f0
+ nop.i 999 ;;
+}
+
+// Create largest long double
+{ .mfi
+ nop.m 999
+ fmerge.se NEXTTOWARD_lnorm = NEXTTOWARD_lnorm_exp,NEXTTOWARD_lnorm_sig
+ nop.i 999 ;;
+}
+
+// Force overflow and inexact flags to be set
+{ .mfi
+ mov r39 = 198 // Error code
+ fma.s0 NEXTTOWARD_tmp = NEXTTOWARD_lnorm,NEXTTOWARD_lnorm,f0
+ nop.i 999
+}
+;;
+
+// (1)
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address (sp still has its entry value)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+
+
+// (2)
+{ .mmi
+ stfe [GR_Parameter_Y] = f9,16 // STORE Parameter 2 on stack, then advance GR_Parameter_Y by 16
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+
+.body
+// (3)
+{ .mib
+ stfe [GR_Parameter_X] = NEXTTOWARD_save_f8 // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
+}
+{ .mib
+ stfe [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Back to the Parameter 2 address
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp // Recompute the Parameter 3 address
+};;
+
+// (4)
+{ .mmi
+ ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+
+// __libm_error_support is declared here and is not defined in this file
+.type __libm_error_support#,@function
+.global __libm_error_support#
+
diff --git a/sysdeps/ia64/fpu/s_rint.S b/sysdeps/ia64/fpu/s_rint.S
index d04f06a31f..1735d9b498 100644
--- a/sysdeps/ia64/fpu/s_rint.S
+++ b/sysdeps/ia64/fpu/s_rint.S
@@ -1,10 +1,10 @@
.file "rint.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,74 +20,68 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00: Initial version
-// 2/08/01 Corrected behavior for all rounding modes.
-//
+// 02/02/00 Initial version
+// 02/08/01 Corrected behavior for all rounding modes.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/20/03 Improved performance
+//==============================================================
+
// API
//==============================================================
// double rint(double x)
+//==============================================================
-#include "libm_support.h"
-
-//
-// general registers used:
-//
-rint_GR_FFFF = r14
-rint_GR_signexp = r15
-rint_GR_exponent = r16
-rint_GR_17ones = r17
-rint_GR_10033 = r18
-rint_GR_fpsr = r19
-rint_GR_rcs0 = r20
-rint_GR_rcs0_mask = r21
+// general input registers:
+// r14 - r21
+rSignexp = r14
+rExp = r15
+rExpMask = r16
+rBigexp = r17
+rM1 = r18
+rFpsr = r19
+rRcs0 = r20
+rRcs0Mask = r21
-// predicate registers used:
-// p6-11
+// floating-point registers:
+// f8 - f11
-// floating-point registers used:
+fXInt = f9
+fNormX = f10
+fTmp = f11
-RINT_NORM_f8 = f9
-RINT_FFFF = f10
-RINT_INEXACT = f11
-RINT_FLOAT_INT_f8 = f12
-RINT_INT_f8 = f13
+// predicate registers used:
+// p6 - p10
// Overview of operation
//==============================================================
-
// double rint(double x)
-// Return an integer value (represented as a double) that is x rounded to integer in current
-// rounding mode
+// Return an integer value (represented as a double) that is x
+// rounded to integer in current rounding mode
// Inexact is set if x != rint(x)
-// *******************************************************************************
-
-// Set denormal flag for denormal input and
-// and take denormal fault if necessary.
-
-// Is the input an integer value already?
+//==============================================================
// double_extended
-// if the exponent is >= 1003e => 3F(true) = 63(decimal)
+// if the exponent is >= 1003e => 3F(true) = 63(decimal)
// we have a significand of 64 bits 1.63-bits.
// If we multiply by 2^63, we no longer have a fractional part
// So input is an integer value already.
@@ -100,155 +94,136 @@ RINT_INT_f8 = f13
// So input is an integer value already.
// single
-// if the exponent is >= 10016 => 17(true) = 23(decimal)
-// we have a significand of 53 bits 1.52-bits. (implicit 1)
-// If we multiply by 2^52, we no longer have a fractional part
+// if the exponent is >= 10016 => 17(true) = 23(decimal)
+// we have a significand of 24 bits 1.23-bits. (implicit 1)
+// If we multiply by 2^23, we no longer have a fractional part
// So input is an integer value already.
-// If x is NAN, ZERO, or INFINITY, then return
-
-// qnan snan inf norm unorm 0 -+
-// 1 1 1 0 0 1 11 0xe7
-
-
-.align 32
-.global rint#
-
.section .text
-.proc rint#
-.align 32
-
-
-rint:
-#ifdef _LIBC
-.global __rint
-.type __rint,@function
-__rint:
-#endif
+GLOBAL_IEEE754_ENTRY(rint)
{ .mfi
- mov rint_GR_fpsr = ar40 // Read the fpsr--need to check rc.s0
- fcvt.fx.s1 RINT_INT_f8 = f8
- addl rint_GR_10033 = 0x10033, r0
+ getf.exp rSignexp = f8 // Get signexp, recompute if unorm
+ fclass.m p7,p0 = f8, 0x0b // Test x unorm
+ addl rBigexp = 0x10033, r0 // Set exponent at which is integer
}
{ .mfi
- mov rint_GR_FFFF = -1
- fnorm.s1 RINT_NORM_f8 = f8
- mov rint_GR_17ones = 0x1FFFF
-;;
+ mov rM1 = -1 // Set all ones
+ fcvt.fx.s1 fXInt = f8 // Convert to int in significand
+ mov rExpMask = 0x1FFFF // Form exponent mask
}
+;;
{ .mfi
- setf.sig RINT_FFFF = rint_GR_FFFF
- fclass.m.unc p6,p0 = f8, 0xe7
- mov rint_GR_rcs0_mask = 0x0c00
-;;
+ mov rFpsr = ar40 // Read fpsr -- check rc.s0
+ fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf
+ nop.i 0
}
-
{ .mfb
- nop.m 999
-(p6) fnorm.d f8 = f8
-(p6) br.ret.spnt b0 // Exit if x nan, inf, zero
-;;
+ setf.sig fTmp = rM1 // Make const for setting inexact
+ fnorm.s1 fNormX = f8 // Normalize input
+(p7) br.cond.spnt RINT_UNORM // Branch if x unorm
}
-
-{ .mfi
- nop.m 999
- fcvt.xf RINT_FLOAT_INT_f8 = RINT_INT_f8
- nop.i 999
;;
+
+
+RINT_COMMON:
+// Return here from RINT_UNORM
+{ .mfb
+ and rExp = rSignexp, rExpMask // Get biased exponent
+(p6) fma.d.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf
+(p6) br.ret.spnt b0 // Exit if x natval, nan, inf
}
+;;
{ .mfi
- getf.exp rint_GR_signexp = RINT_NORM_f8
- fcmp.eq.s0 p8,p0 = f8,f0 // Dummy op to set denormal
- nop.i 999
-;;
+ mov rRcs0Mask = 0x0c00 // Mask for rc.s0
+ fcvt.xf f8 = fXInt // Result assume |x| < 2^52
+ cmp.ge p7,p8 = rExp, rBigexp // Is |x| >= 2^52?
}
-
-
-{ .mii
- nop.m 999
- nop.i 999
- and rint_GR_exponent = rint_GR_signexp, rint_GR_17ones
;;
-}
-{ .mmi
- cmp.ge.unc p7,p6 = rint_GR_exponent, rint_GR_10033
- and rint_GR_rcs0 = rint_GR_rcs0_mask, rint_GR_fpsr
- nop.i 999
-;;
+// We must correct result if |x| >= 2^52
+{ .mfi
+ nop.m 0
+(p7) fma.d.s0 f8 = fNormX, f1, f0 // If |x| >= 2^52, result x
+ nop.i 0
}
-
-// Check to see if s0 rounding mode is round to nearest. If not then set s2
-// rounding mode to that of s0 and repeat conversions.
-L(RINT_COMMON):
-{ .mfb
- cmp.ne p11,p0 = rint_GR_rcs0, r0
-(p6) fclass.m.unc p9,p10 = RINT_FLOAT_INT_f8, 0x07 // Test for result=0
-(p11) br.cond.spnt L(RINT_NOT_ROUND_NEAREST) // Branch if not round to nearest
;;
-}
{ .mfi
- nop.m 999
-(p6) fcmp.eq.unc.s1 p0,p8 = RINT_FLOAT_INT_f8, RINT_NORM_f8
- nop.i 999
+ nop.m 0
+ fcmp.eq.unc.s1 p0, p9 = f8, fNormX // Is result = x ?
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p7) fnorm.d.s0 f8 = f8
- nop.i 999
-;;
+ nop.m 0
+(p8) fmerge.s f8 = fNormX, f8 // Make sure sign rint(x) = sign x
+ nop.i 0
}
+;;
-// If result is zero, merge sign of input
{ .mfi
- nop.m 999
-(p9) fmerge.s f8 = f8, RINT_FLOAT_INT_f8
- nop.i 999
+(p8) and rRcs0 = rFpsr, rRcs0Mask // Get rounding mode for sf0
+ nop.f 0
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p10) fnorm.d f8 = RINT_FLOAT_INT_f8
- nop.i 999
;;
+
+// If |x| < 2^52 we must test for other rounding modes
+{ .mfi
+(p8) cmp.ne.unc p10,p0 = rRcs0, r0 // Test for other rounding modes
+(p9) fmpy.s0 fTmp = fTmp, fTmp // Dummy to set inexact
+ nop.i 0
+}
+{ .mbb
+ nop.m 0
+(p10) br.cond.spnt RINT_NOT_ROUND_NEAREST // Branch if not round nearest
+ br.ret.sptk b0 // Exit main path if round nearest
}
+;;
+
+
+RINT_UNORM:
+// Here if x unorm
{ .mfb
- nop.m 999
-(p8) fmpy.s0 RINT_INEXACT = RINT_FFFF,RINT_FFFF // Dummy to set inexact
- br.ret.sptk b0
-;;
+ getf.exp rSignexp = fNormX // Get signexp, recompute if unorm
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag
+ br.cond.sptk RINT_COMMON // Return to main path
}
+;;
-L(RINT_NOT_ROUND_NEAREST):
-// Set rounding mode of s2 to that of s0
+RINT_NOT_ROUND_NEAREST:
+// Here if not round to nearest, and |x| < 2^52
+// Set rounding mode of s2 to that of s0, and repeat the conversion using s2
{ .mfi
- mov rint_GR_rcs0 = r0 // Clear so we don't come back here
- fsetc.s2 0x7f, 0x40
- nop.i 999
-;;
+ nop.m 0
+ fsetc.s2 0x7f, 0x40
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
- fcvt.fx.s2 RINT_INT_f8 = f8
- nop.i 999
+ nop.m 0
+ fcvt.fx.s2 fXInt = fNormX // Convert to int in significand
+ nop.i 0
+}
;;
+
+{ .mfi
+ nop.m 0
+ fcvt.xf f8 = fXInt // Expected result
+ nop.i 0
}
+;;
+// Be sure sign of result = sign of input. Fixes cases where result is 0.
{ .mfb
- nop.m 999
- fcvt.xf RINT_FLOAT_INT_f8 = RINT_INT_f8
- br.cond.sptk L(RINT_COMMON)
-;;
+ nop.m 0
+ fmerge.s f8 = fNormX, f8
+ br.ret.sptk b0 // Exit main path
}
+;;
-
-.endp rint
-ASM_SIZE_DIRECTIVE(rint)
-#ifdef _LIBC
-ASM_SIZE_DIRECTIVE(__rint)
-#endif
+GLOBAL_IEEE754_END(rint)
diff --git a/sysdeps/ia64/fpu/s_rintf.S b/sysdeps/ia64/fpu/s_rintf.S
index 73cb98a048..05d6b411f2 100644
--- a/sysdeps/ia64/fpu/s_rintf.S
+++ b/sysdeps/ia64/fpu/s_rintf.S
@@ -1,10 +1,10 @@
.file "rintf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,74 +20,68 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00: Initial version
-// 2/08/01 Corrected behavior for all rounding modes.
-//
+// 02/02/00 Initial version
+// 02/08/01 Corrected behavior for all rounding modes.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/20/03 Improved performance
+//==============================================================
+
// API
//==============================================================
// float rintf(float x)
+//==============================================================
-#include "libm_support.h"
-
-//
-// general registers used:
-//
-rint_GR_FFFF = r14
-rint_GR_signexp = r15
-rint_GR_exponent = r16
-rint_GR_17ones = r17
-rint_GR_10033 = r18
-rint_GR_fpsr = r19
-rint_GR_rcs0 = r20
-rint_GR_rcs0_mask = r21
+// general input registers:
+// r14 - r21
+rSignexp = r14
+rExp = r15
+rExpMask = r16
+rBigexp = r17
+rM1 = r18
+rFpsr = r19
+rRcs0 = r20
+rRcs0Mask = r21
-// predicate registers used:
-// p6-11
+// floating-point registers:
+// f8 - f11
-// floating-point registers used:
+fXInt = f9
+fNormX = f10
+fTmp = f11
-RINT_NORM_f8 = f9
-RINT_FFFF = f10
-RINT_INEXACT = f11
-RINT_FLOAT_INT_f8 = f12
-RINT_INT_f8 = f13
+// predicate registers used:
+// p6 - p10
// Overview of operation
//==============================================================
-
// float rintf(float x)
-// Return an integer value (represented as a float) that is x rounded to integer in current
-// rounding mode
-// Inexact is set if x != rintf(x)
-// *******************************************************************************
-
-// Set denormal flag for denormal input and
-// and take denormal fault if necessary.
-
-// Is the input an integer value already?
+// Return an integer value (represented as a float) that is x
+// rounded to integer in current rounding mode
+// Inexact is set if x != rintf(x)
+//==============================================================
// double_extended
-// if the exponent is >= 1003e => 3F(true) = 63(decimal)
+// if the exponent is >= 1003e => 3F(true) = 63(decimal)
// we have a significand of 64 bits 1.63-bits.
// If we multiply by 2^63, we no longer have a fractional part
// So input is an integer value already.
@@ -100,155 +94,136 @@ RINT_INT_f8 = f13
// So input is an integer value already.
// single
-// if the exponent is >= 10016 => 17(true) = 23(decimal)
-// we have a significand of 53 bits 1.52-bits. (implicit 1)
-// If we multiply by 2^52, we no longer have a fractional part
+// if the exponent is >= 10016 => 17(true) = 23(decimal)
+// we have a significand of 24 bits 1.23-bits. (implicit 1)
+// If we multiply by 2^23, we no longer have a fractional part
// So input is an integer value already.
-// If x is NAN, ZERO, or INFINITY, then return
-
-// qnan snan inf norm unorm 0 -+
-// 1 1 1 0 0 1 11 0xe7
-
-
-.align 32
-.global rintf#
-
.section .text
-.proc rintf#
-.align 32
-
-
-rintf:
-#ifdef _LIBC
-.global __rintf
-.type __rintf,@function
-__rintf:
-#endif
+GLOBAL_IEEE754_ENTRY(rintf)
{ .mfi
- mov rint_GR_fpsr = ar40 // Read the fpsr--need to check rc.s0
- fcvt.fx.s1 RINT_INT_f8 = f8
- addl rint_GR_10033 = 0x10016, r0
+ getf.exp rSignexp = f8 // Get signexp, recompute if unorm
+ fclass.m p7,p0 = f8, 0x0b // Test x unorm
+ addl rBigexp = 0x10016, r0 // Set exponent at which is integer
}
{ .mfi
- mov rint_GR_FFFF = -1
- fnorm.s1 RINT_NORM_f8 = f8
- mov rint_GR_17ones = 0x1FFFF
-;;
+ mov rM1 = -1 // Set all ones
+ fcvt.fx.s1 fXInt = f8 // Convert to int in significand
+ mov rExpMask = 0x1FFFF // Form exponent mask
}
+;;
{ .mfi
- setf.sig RINT_FFFF = rint_GR_FFFF
- fclass.m.unc p6,p0 = f8, 0xe7
- mov rint_GR_rcs0_mask = 0x0c00
-;;
+ mov rFpsr = ar40 // Read fpsr -- check rc.s0
+ fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf
+ nop.i 0
}
-
{ .mfb
- nop.m 999
-(p6) fnorm.s f8 = f8
-(p6) br.ret.spnt b0 // Exit if x nan, inf, zero
-;;
+ setf.sig fTmp = rM1 // Make const for setting inexact
+ fnorm.s1 fNormX = f8 // Normalize input
+(p7) br.cond.spnt RINT_UNORM // Branch if x unorm
}
-
-{ .mfi
- nop.m 999
- fcvt.xf RINT_FLOAT_INT_f8 = RINT_INT_f8
- nop.i 999
;;
+
+
+RINT_COMMON:
+// Return here from RINT_UNORM
+{ .mfb
+ and rExp = rSignexp, rExpMask // Get biased exponent
+(p6) fma.s.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf
+(p6) br.ret.spnt b0 // Exit if x natval, nan, inf
}
+;;
{ .mfi
- getf.exp rint_GR_signexp = RINT_NORM_f8
- fcmp.eq.s0 p8,p0 = f8,f0 // Dummy op to set denormal
- nop.i 999
-;;
+ mov rRcs0Mask = 0x0c00 // Mask for rc.s0
+ fcvt.xf f8 = fXInt // Result assume |x| < 2^23
+ cmp.ge p7,p8 = rExp, rBigexp // Is |x| >= 2^23?
}
-
-
-{ .mii
- nop.m 999
- nop.i 999
- and rint_GR_exponent = rint_GR_signexp, rint_GR_17ones
;;
-}
-{ .mmi
- cmp.ge.unc p7,p6 = rint_GR_exponent, rint_GR_10033
- and rint_GR_rcs0 = rint_GR_rcs0_mask, rint_GR_fpsr
- nop.i 999
-;;
+// We must correct result if |x| >= 2^23
+{ .mfi
+ nop.m 0
+(p7) fma.s.s0 f8 = fNormX, f1, f0 // If |x| >= 2^23, result x
+ nop.i 0
}
-
-// Check to see if s0 rounding mode is round to nearest. If not then set s2
-// rounding mode to that of s0 and repeat conversions.
-L(RINT_COMMON):
-{ .mfb
- cmp.ne p11,p0 = rint_GR_rcs0, r0
-(p6) fclass.m.unc p9,p10 = RINT_FLOAT_INT_f8, 0x07 // Test for result=0
-(p11) br.cond.spnt L(RINT_NOT_ROUND_NEAREST) // Branch if not round to nearest
;;
-}
{ .mfi
- nop.m 999
-(p6) fcmp.eq.unc.s1 p0,p8 = RINT_FLOAT_INT_f8, RINT_NORM_f8
- nop.i 999
+ nop.m 0
+ fcmp.eq.unc.s1 p0, p9 = f8, fNormX // Is result = x ?
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p7) fnorm.s.s0 f8 = f8
- nop.i 999
-;;
+ nop.m 0
+(p8) fmerge.s f8 = fNormX, f8 // Make sure sign rint(x) = sign x
+ nop.i 0
}
+;;
-// If result is zero, merge sign of input
{ .mfi
- nop.m 999
-(p9) fmerge.s f8 = f8, RINT_FLOAT_INT_f8
- nop.i 999
+(p8) and rRcs0 = rFpsr, rRcs0Mask // Get rounding mode for sf0
+ nop.f 0
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p10) fnorm.s f8 = RINT_FLOAT_INT_f8
- nop.i 999
;;
+
+// If |x| < 2^23 we must test for other rounding modes
+{ .mfi
+(p8) cmp.ne.unc p10,p0 = rRcs0, r0 // Test for other rounding modes
+(p9) fmpy.s0 fTmp = fTmp, fTmp // Dummy to set inexact
+ nop.i 0
+}
+{ .mbb
+ nop.m 0
+(p10) br.cond.spnt RINT_NOT_ROUND_NEAREST // Branch if not round nearest
+ br.ret.sptk b0 // Exit main path if round nearest
}
+;;
+
+
+RINT_UNORM:
+// Here if x unorm
{ .mfb
- nop.m 999
-(p8) fmpy.s0 RINT_INEXACT = RINT_FFFF,RINT_FFFF // Dummy to set inexact
- br.ret.sptk b0
-;;
+ getf.exp rSignexp = fNormX // Get signexp, recompute if unorm
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag
+ br.cond.sptk RINT_COMMON // Return to main path
}
+;;
-L(RINT_NOT_ROUND_NEAREST):
-// Set rounding mode of s2 to that of s0
+RINT_NOT_ROUND_NEAREST:
+// Here if not round to nearest, and |x| < 2^23
+// Set rounding mode of s2 to that of s0, and repeat the conversion using s2
{ .mfi
- mov rint_GR_rcs0 = r0 // Clear so we don't come back here
- fsetc.s2 0x7f, 0x40
- nop.i 999
-;;
+ nop.m 0
+ fsetc.s2 0x7f, 0x40
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
- fcvt.fx.s2 RINT_INT_f8 = f8
- nop.i 999
+ nop.m 0
+ fcvt.fx.s2 fXInt = fNormX // Convert to int in significand
+ nop.i 0
+}
;;
+
+{ .mfi
+ nop.m 0
+ fcvt.xf f8 = fXInt // Expected result
+ nop.i 0
}
+;;
+// Be sure sign of result = sign of input. Fixes cases where result is 0.
{ .mfb
- nop.m 999
- fcvt.xf RINT_FLOAT_INT_f8 = RINT_INT_f8
- br.cond.sptk L(RINT_COMMON)
-;;
+ nop.m 0
+ fmerge.s f8 = fNormX, f8
+ br.ret.sptk b0 // Exit main path
}
+;;
-
-.endp rintf
-ASM_SIZE_DIRECTIVE(rintf)
-#ifdef _LIBC
-ASM_SIZE_DIRECTIVE(__rintf)
-#endif
+GLOBAL_IEEE754_END(rintf)
diff --git a/sysdeps/ia64/fpu/s_rintl.S b/sysdeps/ia64/fpu/s_rintl.S
index 857e8d5208..b5402149ec 100644
--- a/sysdeps/ia64/fpu/s_rintl.S
+++ b/sysdeps/ia64/fpu/s_rintl.S
@@ -1,10 +1,10 @@
.file "rintl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,76 +20,68 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00: Initial version
-// 5/24/00 Fixed case of 2^63 - 1 + 0.5 (0x1007dffffffffffffffff)
-// 2/08/01 Corrected behavior for all rounding modes.
-//
+// 02/02/00 Initial version
+// 02/08/01 Corrected behavior for all rounding modes.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/20/03 Improved performance
+//==============================================================
+
// API
//==============================================================
// long double rintl(long double x)
+//==============================================================
-#include "libm_support.h"
-
-//
-// general registers used:
-//
-rint_GR_FFFF = r14
-rint_GR_signexp = r15
-rint_GR_exponent = r16
-rint_GR_17ones = r17
-rint_GR_10033 = r18
-rint_GR_fpsr = r19
-rint_GR_rcs0 = r20
-rint_GR_rcs0_mask = r21
+// general input registers:
+// r14 - r21
+rSignexp = r14
+rExp = r15
+rExpMask = r16
+rBigexp = r17
+rM1 = r18
+rFpsr = r19
+rRcs0 = r20
+rRcs0Mask = r21
-// predicate registers used:
-// p6-11
+// floating-point registers:
+// f8 - f11
-// floating-point registers used:
+fXInt = f9
+fNormX = f10
+fTmp = f11
-RINT_NORM_f8 = f9
-RINT_FFFF = f10
-RINT_INEXACT = f11
-RINT_FLOAT_INT_f8 = f12
-RINT_INT_f8 = f13
-RINT_SIGNED_FLOAT_INT_f8 = f14
+// predicate registers used:
+// p6 - p10
// Overview of operation
//==============================================================
-
// long double rintl(long double x)
-// Return an integer value (represented as a long double) that is x rounded to integer in current
-// rounding mode
-// Inexact is set if x != rintl(x)
-// *******************************************************************************
-
-// Set denormal flag for denormal input and
-// and take denormal fault if necessary.
-
-// Is the input an integer value already?
+// Return an integer value (represented as a long double) that is x
+// rounded to integer in current rounding mode
+// Inexact is set if x != rintl(x)
+//==============================================================
// double_extended
-// if the exponent is >= 1003e => 3F(true) = 63(decimal)
+// if the exponent is >= 1003e => 3F(true) = 63(decimal)
// we have a significand of 64 bits 1.63-bits.
// If we multiply by 2^63, we no longer have a fractional part
// So input is an integer value already.
@@ -102,151 +94,136 @@ RINT_SIGNED_FLOAT_INT_f8 = f14
// So input is an integer value already.
// single
-// if the exponent is >= 10016 => 17(true) = 23(decimal)
-// we have a significand of 53 bits 1.52-bits. (implicit 1)
-// If we multiply by 2^52, we no longer have a fractional part
+// if the exponent is >= 10016 => 17(true) = 23(decimal)
+// we have a significand of 24 bits 1.23-bits. (implicit 1)
+// If we multiply by 2^23, we no longer have a fractional part
// So input is an integer value already.
-// If x is NAN, ZERO, or INFINITY, then return
-
-// qnan snan inf norm unorm 0 -+
-// 1 1 1 0 0 1 11 0xe7
-
-
-.align 32
-.global rintl#
-
.section .text
-.proc rintl#
-.align 32
-
-
-rintl:
-#ifdef _LIBC
-.global __rintl
-.type __rintl,@function
-__rintl:
-#endif
+GLOBAL_IEEE754_ENTRY(rintl)
{ .mfi
- mov rint_GR_fpsr = ar40 // Read the fpsr--need to check rc.s0
- fcvt.fx.s1 RINT_INT_f8 = f8
- addl rint_GR_10033 = 0x1003e, r0
+ getf.exp rSignexp = f8 // Get signexp, recompute if unorm
+ fclass.m p7,p0 = f8, 0x0b // Test x unorm
+ addl rBigexp = 0x1003e, r0 // Set exponent at which is integer
}
{ .mfi
- mov rint_GR_FFFF = -1
- fnorm.s1 RINT_NORM_f8 = f8
- mov rint_GR_17ones = 0x1FFFF
-;;
+ mov rM1 = -1 // Set all ones
+ fcvt.fx.s1 fXInt = f8 // Convert to int in significand
+ mov rExpMask = 0x1FFFF // Form exponent mask
}
+;;
{ .mfi
- setf.sig RINT_FFFF = rint_GR_FFFF
- fclass.m.unc p6,p0 = f8, 0xe7
- mov rint_GR_rcs0_mask = 0x0c00
-;;
+ mov rFpsr = ar40 // Read fpsr -- check rc.s0
+ fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf
+ nop.i 0
}
-
{ .mfb
- nop.m 999
-(p6) fnorm f8 = f8
-(p6) br.ret.spnt b0 // Exit if x nan, inf, zero
-;;
+ setf.sig fTmp = rM1 // Make const for setting inexact
+ fnorm.s1 fNormX = f8 // Normalize input
+(p7) br.cond.spnt RINT_UNORM // Branch if x unorm
}
-
-{ .mfi
- nop.m 999
- fcvt.xf RINT_FLOAT_INT_f8 = RINT_INT_f8
- nop.i 999
;;
+
+
+RINT_COMMON:
+// Return here from RINT_UNORM
+{ .mfb
+ and rExp = rSignexp, rExpMask // Get biased exponent
+(p6) fma.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf
+(p6) br.ret.spnt b0 // Exit if x natval, nan, inf
}
+;;
{ .mfi
- getf.exp rint_GR_signexp = RINT_NORM_f8
- fcmp.eq.s0 p8,p0 = f8,f0 // Dummy op to set denormal
- nop.i 999
-;;
+ mov rRcs0Mask = 0x0c00 // Mask for rc.s0
+ fcvt.xf f8 = fXInt // Result assume |x| < 2^63
+ cmp.ge p7,p8 = rExp, rBigexp // Is |x| >= 2^63?
}
-
-
-{ .mii
- nop.m 999
- nop.i 999
- and rint_GR_exponent = rint_GR_signexp, rint_GR_17ones
;;
-}
-{ .mmi
- cmp.ge.unc p7,p6 = rint_GR_exponent, rint_GR_10033
- and rint_GR_rcs0 = rint_GR_rcs0_mask, rint_GR_fpsr
- nop.i 999
-;;
+// We must correct result if |x| >= 2^63
+{ .mfi
+ nop.m 0
+(p7) fma.s0 f8 = fNormX, f1, f0 // If |x| >= 2^63, result x
+ nop.i 0
}
-
-// Check to see if s0 rounding mode is round to nearest. If not then set s2
-// rounding mode to that of s0 and repeat conversions.
-// Must merge the original sign for cases where the result is zero or the input
-// is the largest that still has a fraction (0x1007dfffffffffff)
-L(RINT_COMMON):
-{ .mfb
- cmp.ne p11,p0 = rint_GR_rcs0, r0
-(p6) fmerge.s RINT_SIGNED_FLOAT_INT_f8 = f8, RINT_FLOAT_INT_f8
-(p11) br.cond.spnt L(RINT_NOT_ROUND_NEAREST) // Branch if not round to nearest
;;
-}
{ .mfi
- nop.m 999
-(p6) fcmp.eq.unc.s1 p0,p8 = RINT_FLOAT_INT_f8, RINT_NORM_f8
- nop.i 999
+ nop.m 0
+ fcmp.eq.unc.s1 p0, p9 = f8, fNormX // Is result = x ?
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p7) fnorm.s0 f8 = f8
- nop.i 999
-;;
+ nop.m 0
+(p8) fmerge.s f8 = fNormX, f8 // Make sure sign rint(x) = sign x
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
-(p6) fnorm f8 = RINT_SIGNED_FLOAT_INT_f8
- nop.i 999
+(p8) and rRcs0 = rFpsr, rRcs0Mask // Get rounding mode for sf0
+ nop.f 0
+ nop.i 0
+}
;;
+
+// If |x| < 2^63 we must test for other rounding modes
+{ .mfi
+(p8) cmp.ne.unc p10,p0 = rRcs0, r0 // Test for other rounding modes
+(p9) fmpy.s0 fTmp = fTmp, fTmp // Dummy to set inexact
+ nop.i 0
+}
+{ .mbb
+ nop.m 0
+(p10) br.cond.spnt RINT_NOT_ROUND_NEAREST // Branch if not round nearest
+ br.ret.sptk b0 // Exit main path if round nearest
}
+;;
+
+
+RINT_UNORM:
+// Here if x unorm
{ .mfb
- nop.m 999
-(p8) fmpy.s0 RINT_INEXACT = RINT_FFFF,RINT_FFFF // Dummy to set inexact
- br.ret.sptk b0
-;;
+ getf.exp rSignexp = fNormX // Get signexp, recompute if unorm
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag
+ br.cond.sptk RINT_COMMON // Return to main path
}
+;;
-L(RINT_NOT_ROUND_NEAREST):
-// Set rounding mode of s2 to that of s0
+RINT_NOT_ROUND_NEAREST:
+// Here if not round to nearest, and |x| < 2^63
+// Set rounding mode of s2 to that of s0, and repeat the conversion using s2
{ .mfi
- mov rint_GR_rcs0 = r0 // Clear so we don't come back here
- fsetc.s2 0x7f, 0x40
- nop.i 999
-;;
+ nop.m 0
+ fsetc.s2 0x7f, 0x40
+ nop.i 0
}
+;;
{ .mfi
- nop.m 999
- fcvt.fx.s2 RINT_INT_f8 = f8
- nop.i 999
+ nop.m 0
+ fcvt.fx.s2 fXInt = fNormX // Convert to int in significand
+ nop.i 0
+}
;;
+
+{ .mfi
+ nop.m 0
+ fcvt.xf f8 = fXInt // Expected result
+ nop.i 0
}
+;;
+// Be sure sign of result = sign of input. Fixes cases where result is 0.
{ .mfb
- nop.m 999
- fcvt.xf RINT_FLOAT_INT_f8 = RINT_INT_f8
- br.cond.sptk L(RINT_COMMON)
-;;
+ nop.m 0
+ fmerge.s f8 = fNormX, f8
+ br.ret.sptk b0 // Exit main path
}
+;;
-
-.endp rintl
-ASM_SIZE_DIRECTIVE(rintl)
-#ifdef _LIBC
-ASM_SIZE_DIRECTIVE(__rintl)
-#endif
+GLOBAL_IEEE754_END(rintl)
diff --git a/sysdeps/ia64/fpu/s_round.S b/sysdeps/ia64/fpu/s_round.S
index b08ede1740..04033b4aa2 100644
--- a/sysdeps/ia64/fpu/s_round.S
+++ b/sysdeps/ia64/fpu/s_round.S
@@ -1,11 +1,10 @@
.file "round.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 10/25/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
-// Bob Norin, Tom Rowan, Shane Story, and Ping Tak Peter Tang of the
-// Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -21,229 +20,202 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 10/25/2000: Created
+// 10/25/00 Initial version
+// 06/14/01 Changed cmp to an equivalent form
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/20/03 Improved performance and reduced code size
+// 04/18/03 Eliminate possible WAW dependency warning
//==============================================================
-//
+
// API
//==============================================================
// double round(double x)
-//
+//==============================================================
-#include "libm_support.h"
+// general input registers:
+// r14 - r19
-// general input registers:
-//
-round_GR_half = r14
-round_GR_big = r15
-round_GR_expmask = r16
-round_GR_signexp = r17
-round_GR_exp = r18
-round_GR_expdiff = r19
-
-// predicate registers used:
-// p6 - p10
+rSignexp = r14
+rExp = r15
+rExpMask = r16
+rBigexp = r17
+rExpHalf = r18
+rExpMHalf = r19
+
+// floating-point registers:
+// f8 - f13
-// floating-point registers used:
+fXtruncInt = f9
+fNormX = f10
+fHalf = f11
+fMHalf = f12
+fRem = f13
-ROUND_NORM_f8 = f9
-ROUND_TRUNC_f8 = f10
-ROUND_RINT_f8 = f11
-ROUND_FLOAT_TRUNC_f8 = f12
-ROUND_FLOAT_RINT_f8 = f13
-ROUND_REMAINDER = f14
-ROUND_HALF = f15
+// predicate registers used:
+// p6 - p10
// Overview of operation
//==============================================================
-
// double round(double x)
-// Return an integer value (represented as a double) that is x
-// rounded to nearest integer, halfway cases rounded away from
-// zero.
+// Return an integer value (represented as a double) that is x
+// rounded to nearest integer, halfway cases rounded away from
+// zero.
// if x>0 result = trunc(x+0.5)
// if x<0 result = trunc(x-0.5)
-// *******************************************************************************
-
-// Set denormal flag for denormal input and
-// and take denormal fault if necessary.
+//
+//==============================================================
-// If x is NAN, ZERO, INFINITY, or >= 2^52 then return
+// double_extended
+// if the exponent is >= 1003e => 3F(true) = 63(decimal)
+// we have a significand of 64 bits 1.63-bits.
+// If we multiply by 2^63, we no longer have a fractional part
+// So input is an integer value already.
-// qnan snan inf norm unorm 0 -+
-// 1 1 1 0 0 1 11 0xe7
+// double
+// if the exponent is >= 10033 => 34(true) = 52(decimal)
+// 34 + 3ff = 433
+// we have a significand of 53 bits 1.52-bits. (implicit 1)
+// If we multiply by 2^52, we no longer have a fractional part
+// So input is an integer value already.
+// single
+// if the exponent is >= 10016 => 17(true) = 23(decimal)
+// we have a significand of 24 bits 1.23-bits. (implicit 1)
+// If we multiply by 2^23, we no longer have a fractional part
+// So input is an integer value already.
-.align 32
-.global round#
.section .text
-.proc round#
-.align 32
-
+GLOBAL_LIBM_ENTRY(round)
-round:
-
-// Get exponent for +0.5
-// Truncate x to integer
{ .mfi
- addl round_GR_half = 0x0fffe, r0
- fcvt.fx.trunc.s1 ROUND_TRUNC_f8 = f8
- nop.i 999
-}
-
-// Get signexp of x
-// Normalize input
-// Form exponent mask
-{ .mfi
- getf.exp round_GR_signexp = f8
- fnorm ROUND_NORM_f8 = f8
- addl round_GR_expmask = 0x1ffff, r0 ;;
+ getf.exp rSignexp = f8 // Get signexp, recompute if unorm
+ fcvt.fx.trunc.s1 fXtruncInt = f8 // Convert to int in significand
+ addl rBigexp = 0x10033, r0 // Set exponent at which is integer
}
-
-// Form +0.5
-// Round x to integer
{ .mfi
- setf.exp ROUND_HALF = round_GR_half
- fcvt.fx.s1 ROUND_RINT_f8 = f8
- nop.i 999 ;;
+ mov rExpHalf = 0x0FFFE // Form sign and exponent of 0.5
+ fnorm.s1 fNormX = f8 // Normalize input
+ mov rExpMask = 0x1FFFF // Form exponent mask
}
-// Get exp of x
-// Test for NAN, INF, ZERO
-// Get exponent at which input has no fractional part
-{ .mfi
- and round_GR_exp = round_GR_expmask, round_GR_signexp
- fclass.m p8,p9 = f8,0xe7
- addl round_GR_big = 0x10033, r0 ;;
-}
-
-// Get exp-bigexp
-// If exp is so big there is no fractional part, then turn on p8, off p9
-{ .mmi
- sub round_GR_expdiff = round_GR_exp, round_GR_big ;;
-#ifdef _LIBC
-(p9) cmp.lt.or.andcm p8,p9 = r0, round_GR_expdiff
-#else
-(p9) cmp.ge.or.andcm p8,p9 = round_GR_expdiff, r0
-#endif
- nop.i 999 ;;
-}
-
-// Set p6 if x<0, else set p7
-{ .mfi
- nop.m 999
-(p9) fcmp.lt.unc p6,p7 = f8,f0
- nop.i 999
+;;
+
+{ .mmf
+ setf.exp fHalf = rExpHalf // Form 0.5
+ mov rExpMHalf = 0x2FFFE // Form sign and exponent of -0.5
+ fclass.m p7,p0 = f8, 0x0b // Test x unorm
}
-
-// If NAN, INF, ZERO, or no fractional part, result is just normalized input
-{ .mfi
- nop.m 999
-(p8) fnorm.d.s0 f8 = f8
- nop.i 999 ;;
+;;
+
+{ .mfb
+ setf.exp fMHalf = rExpMHalf // Form -0.5
+ fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf
+(p7) br.cond.spnt ROUND_UNORM // Branch if x unorm
}
+;;
-// Float the truncated integer
+ROUND_COMMON:
+// Return here from ROUND_UNORM
{ .mfi
- nop.m 999
-(p9) fcvt.xf ROUND_FLOAT_TRUNC_f8 = ROUND_TRUNC_f8
- nop.i 999 ;;
+ nop.m 0
+ fcmp.lt.s1 p8,p9 = f8, f0 // Test if x < 0
+ nop.i 0
+}
+{ .mfb
+ and rExp = rSignexp, rExpMask // Get biased exponent
+(p6) fma.d.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf
+(p6) br.ret.spnt b0 // Exit if x natval, nan, inf
}
+;;
-// Float the rounded integer to get preliminary result
{ .mfi
- nop.m 999
-(p9) fcvt.xf ROUND_FLOAT_RINT_f8 = ROUND_RINT_f8
- nop.i 999 ;;
-}
-
-// If x<0 and the difference of the truncated input minus the input is 0.5
-// then result = truncated input - 1.0
-// Else if x>0 and the difference of the input minus truncated input is 0.5
-// then result = truncated input + 1.0
-// Else
-// result = rounded input
-// Endif
-{ .mfi
- nop.m 999
-(p6) fsub.s1 ROUND_REMAINDER = ROUND_FLOAT_TRUNC_f8, ROUND_NORM_f8
- nop.i 999
+ cmp.lt p6,p0 = rExp, rExpHalf // Is |x| < 0.5?
+ fcvt.xf f8 = fXtruncInt // Pre-Result if 0.5 <= |x| < 2^52
+ cmp.ge p7,p0 = rExp, rBigexp // Is |x| >= 2^52?
}
-
{ .mfi
- nop.m 999
-(p7) fsub.s1 ROUND_REMAINDER = ROUND_NORM_f8, ROUND_FLOAT_TRUNC_f8
- nop.i 999 ;;
+ cmp.lt p10,p0 = rExp, rExpHalf // Is |x| < 0.5?
+ nop.f 0
+ nop.i 0
}
+;;
-// Assume preliminary result is rounded integer
+// We must correct result if |x| < 0.5, or |x| >= 2^52
+.pred.rel "mutex",p6,p7
{ .mfi
- nop.m 999
-(p9) fnorm.d.s0 f8 = ROUND_FLOAT_RINT_f8
- nop.i 999
+ nop.m 0
+(p6) fmerge.s f8 = fNormX, f0 // If |x| < 0.5, result sgn(x)*0
+ nop.i 0
}
-
-// If x<0, test if result=0
-{ .mfi
- nop.m 999
-(p6) fcmp.eq.unc p10,p0 = ROUND_FLOAT_RINT_f8,f0
- nop.i 999 ;;
+{ .mfb
+(p7) cmp.eq p10,p0 = r0, r0 // Also turn on p10 if |x| >= 2^52
+(p7) fma.d.s0 f8 = fNormX, f1, f0 // If |x| >= 2^52, result x
+(p10) br.ret.spnt b0 // Exit |x| < 0.5 or |x| >= 2^52
}
+;;
-// If x<0 and result=0, set result=-0
+// Here if 0.5 <= |x| < 2^52
{ .mfi
- nop.m 999
-(p10) fmerge.ns f8 = f1,f8
- nop.i 999
+ nop.m 0
+ fms.s1 fRem = fNormX, f1, f8 // Get remainder = x - trunc(x)
+ nop.i 0
}
-
-// If x<0, test if remainder=0.5
+;;
+
{ .mfi
- nop.m 999
-(p6) fcmp.eq.unc p6,p0 = ROUND_REMAINDER, ROUND_HALF
- nop.i 999 ;;
+ nop.m 0
+(p8) fcmp.le.s1 p8,p0 = fRem, fMHalf
+ nop.i 0
}
-
-// If x>0, test if remainder=0.5
{ .mfi
- nop.m 999
-(p7) fcmp.eq.unc p7,p0 = ROUND_REMAINDER, ROUND_HALF
- nop.i 999 ;;
+ nop.m 0
+(p9) fcmp.ge.s1 p9,p0 = fRem, fHalf
+ nop.i 0
}
+;;
-// If x<0 and remainder=0.5, result=truncated-1.0
-// If x>0 and remainder=0.5, result=truncated+1.0
-// Exit
-.pred.rel "mutex",p6,p7
+// If x < 0 and remainder <= -0.5, then subtract 1 from result
+// If x > 0 and remainder >= +0.5, then add 1 to result
+.pred.rel "mutex",p8,p9
{ .mfi
- nop.m 999
-(p6) fsub.d.s0 f8 = ROUND_FLOAT_TRUNC_f8,f1
- nop.i 999
+ nop.m 0
+(p8) fms.d.s0 f8 = f8, f1, f1
+ nop.i 0
}
-
{ .mfb
- nop.m 999
-(p7) fadd.d.s0 f8 = ROUND_FLOAT_TRUNC_f8,f1
- br.ret.sptk b0 ;;
+ nop.m 0
+(p9) fma.d.s0 f8 = f8, f1, f1
+ br.ret.sptk b0
+}
+;;
+
+
+ROUND_UNORM:
+// Here if x unorm
+{ .mfb
+ getf.exp rSignexp = fNormX // Get signexp, recompute if unorm
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag
+ br.cond.sptk ROUND_COMMON // Return to main path
}
+;;
-.endp round
-ASM_SIZE_DIRECTIVE(round)
+GLOBAL_LIBM_END(round)
diff --git a/sysdeps/ia64/fpu/s_roundf.S b/sysdeps/ia64/fpu/s_roundf.S
index 42ee60b218..1e8dc78777 100644
--- a/sysdeps/ia64/fpu/s_roundf.S
+++ b/sysdeps/ia64/fpu/s_roundf.S
@@ -1,11 +1,10 @@
.file "roundf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 10/25/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
-// Bob Norin, Tom Rowan, Shane Story, and Ping Tak Peter Tang of the
-// Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -21,229 +20,202 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 10/25/2000: Created
+// 10/25/00 Initial version
+// 06/14/01 Changed cmp to an equivalent form
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/20/03 Improved performance and reduced code size
+// 04/18/03 Eliminate possible WAW dependency warning
//==============================================================
-//
+
// API
//==============================================================
// float roundf(float x)
-//
+//==============================================================
-#include "libm_support.h"
+// general input registers:
+// r14 - r19
-// general input registers:
-//
-roundf_GR_half = r14
-roundf_GR_big = r15
-roundf_GR_expmask = r16
-roundf_GR_signexp = r17
-roundf_GR_exp = r18
-roundf_GR_expdiff = r19
-
-// predicate registers used:
-// p6 - p10
+rSignexp = r14
+rExp = r15
+rExpMask = r16
+rBigexp = r17
+rExpHalf = r18
+rExpMHalf = r19
+
+// floating-point registers:
+// f8 - f13
-// floating-point registers used:
+fXtruncInt = f9
+fNormX = f10
+fHalf = f11
+fMHalf = f12
+fRem = f13
-ROUNDF_NORM_f8 = f9
-ROUNDF_TRUNC_f8 = f10
-ROUNDF_RINT_f8 = f11
-ROUNDF_FLOAT_TRUNC_f8 = f12
-ROUNDF_FLOAT_RINT_f8 = f13
-ROUNDF_REMAINDER = f14
-ROUNDF_HALF = f15
+// predicate registers used:
+// p6 - p10
// Overview of operation
//==============================================================
-
// float roundf(float x)
-// Return an integer value (represented as a float) that is x
-// rounded to nearest integer, halfway cases rounded away from
-// zero.
+// Return an integer value (represented as a float) that is x
+// rounded to nearest integer, halfway cases rounded away from
+// zero.
// if x>0 result = trunc(x+0.5)
// if x<0 result = trunc(x-0.5)
-// *******************************************************************************
-
-// Set denormal flag for denormal input and
-// and take denormal fault if necessary.
+//
+//==============================================================
-// If x is NAN, ZERO, INFINITY, or >= 2^23 then return
+// double_extended
+// if the exponent is >= 1003e => 3F(true) = 63(decimal)
+// we have a significand of 64 bits 1.63-bits.
+// If we multiply by 2^63, we no longer have a fractional part
+// So input is an integer value already.
-// qnan snan inf norm unorm 0 -+
-// 1 1 1 0 0 1 11 0xe7
+// double
+// if the exponent is >= 10033 => 34(true) = 52(decimal)
+// 34 + 3ff = 433
+// we have a significand of 53 bits 1.52-bits. (implicit 1)
+// If we multiply by 2^52, we no longer have a fractional part
+// So input is an integer value already.
+// single
+// if the exponent is >= 10016 => 17(true) = 23(decimal)
+// we have a significand of 24 bits 1.23-bits. (implicit 1)
+// If we multiply by 2^23, we no longer have a fractional part
+// So input is an integer value already.
-.align 32
-.global roundf#
.section .text
-.proc roundf#
-.align 32
-
+GLOBAL_LIBM_ENTRY(roundf)
-roundf:
-
-// Get exponent for +0.5
-// Truncate x to integer
{ .mfi
- addl roundf_GR_half = 0x0fffe, r0
- fcvt.fx.trunc.s1 ROUNDF_TRUNC_f8 = f8
- nop.i 999
-}
-
-// Get signexp of x
-// Normalize input
-// Form exponent mask
-{ .mfi
- getf.exp roundf_GR_signexp = f8
- fnorm ROUNDF_NORM_f8 = f8
- addl roundf_GR_expmask = 0x1ffff, r0 ;;
+ getf.exp rSignexp = f8 // Get signexp, recompute if unorm
+ fcvt.fx.trunc.s1 fXtruncInt = f8 // Convert to int in significand
+ addl rBigexp = 0x10016, r0 // Set exponent at which is integer
}
-
-// Form +0.5
-// Round x to integer
{ .mfi
- setf.exp ROUNDF_HALF = roundf_GR_half
- fcvt.fx.s1 ROUNDF_RINT_f8 = f8
- nop.i 999 ;;
+ mov rExpHalf = 0x0FFFE // Form sign and exponent of 0.5
+ fnorm.s1 fNormX = f8 // Normalize input
+ mov rExpMask = 0x1FFFF // Form exponent mask
}
-// Get exp of x
-// Test for NAN, INF, ZERO
-// Get exponent at which input has no fractional part
-{ .mfi
- and roundf_GR_exp = roundf_GR_expmask, roundf_GR_signexp
- fclass.m p8,p9 = f8,0xe7
- addl roundf_GR_big = 0x10016, r0 ;;
-}
-
-// Get exp-bigexp
-// If exp is so big there is no fractional part, then turn on p8, off p9
-{ .mmi
- sub roundf_GR_expdiff = roundf_GR_exp, roundf_GR_big ;;
-#ifdef _LIBC
-(p9) cmp.lt.or.andcm p8,p9 = r0, roundf_GR_expdiff
-#else
-(p9) cmp.ge.or.andcm p8,p9 = roundf_GR_expdiff, r0
-#endif
- nop.i 999 ;;
-}
-
-// Set p6 if x<0, else set p7
-{ .mfi
- nop.m 999
-(p9) fcmp.lt.unc p6,p7 = f8,f0
- nop.i 999
+;;
+
+{ .mmf
+ setf.exp fHalf = rExpHalf // Form 0.5
+ mov rExpMHalf = 0x2FFFE // Form sign and exponent of -0.5
+ fclass.m p7,p0 = f8, 0x0b // Test x unorm
}
-
-// If NAN, INF, ZERO, or no fractional part, result is just normalized input
-{ .mfi
- nop.m 999
-(p8) fnorm.s.s0 f8 = f8
- nop.i 999 ;;
+;;
+
+{ .mfb
+ setf.exp fMHalf = rExpMHalf // Form -0.5
+ fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf
+(p7) br.cond.spnt ROUND_UNORM // Branch if x unorm
}
+;;
-// Float the truncated integer
+ROUND_COMMON:
+// Return here from ROUND_UNORM
{ .mfi
- nop.m 999
-(p9) fcvt.xf ROUNDF_FLOAT_TRUNC_f8 = ROUNDF_TRUNC_f8
- nop.i 999 ;;
+ nop.m 0
+ fcmp.lt.s1 p8,p9 = f8, f0 // Test if x < 0
+ nop.i 0
+}
+{ .mfb
+ and rExp = rSignexp, rExpMask // Get biased exponent
+(p6) fma.s.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf
+(p6) br.ret.spnt b0 // Exit if x natval, nan, inf
}
+;;
-// Float the rounded integer to get preliminary result
{ .mfi
- nop.m 999
-(p9) fcvt.xf ROUNDF_FLOAT_RINT_f8 = ROUNDF_RINT_f8
- nop.i 999 ;;
-}
-
-// If x<0 and the difference of the truncated input minus the input is 0.5
-// then result = truncated input - 1.0
-// Else if x>0 and the difference of the input minus truncated input is 0.5
-// then result = truncated input + 1.0
-// Else
-// result = rounded input
-// Endif
-{ .mfi
- nop.m 999
-(p6) fsub.s1 ROUNDF_REMAINDER = ROUNDF_FLOAT_TRUNC_f8, ROUNDF_NORM_f8
- nop.i 999
+ cmp.lt p6,p0 = rExp, rExpHalf // Is |x| < 0.5?
+ fcvt.xf f8 = fXtruncInt // Pre-Result if 0.5 <= |x| < 2^23
+ cmp.ge p7,p0 = rExp, rBigexp // Is |x| >= 2^23?
}
-
{ .mfi
- nop.m 999
-(p7) fsub.s1 ROUNDF_REMAINDER = ROUNDF_NORM_f8, ROUNDF_FLOAT_TRUNC_f8
- nop.i 999 ;;
+ cmp.lt p10,p0 = rExp, rExpHalf // Is |x| < 0.5?
+ nop.f 0
+ nop.i 0
}
+;;
-// Assume preliminary result is rounded integer
+// We must correct result if |x| < 0.5, or |x| >= 2^23
+.pred.rel "mutex",p6,p7
{ .mfi
- nop.m 999
-(p9) fnorm.s.s0 f8 = ROUNDF_FLOAT_RINT_f8
- nop.i 999
+ nop.m 0
+(p6) fmerge.s f8 = fNormX, f0 // If |x| < 0.5, result sgn(x)*0
+ nop.i 0
}
-
-// If x<0, test if result=0
-{ .mfi
- nop.m 999
-(p6) fcmp.eq.unc p10,p0 = ROUNDF_FLOAT_RINT_f8,f0
- nop.i 999 ;;
+{ .mfb
+(p7) cmp.eq p10,p0 = r0, r0 // Also turn on p10 if |x| >= 2^23
+(p7) fma.s.s0 f8 = fNormX, f1, f0 // If |x| >= 2^23, result x
+(p10) br.ret.spnt b0 // Exit |x| < 0.5 or |x| >= 2^23
}
+;;
-// If x<0 and result=0, set result=-0
+// Here if 0.5 <= |x| < 2^23
{ .mfi
- nop.m 999
-(p10) fmerge.ns f8 = f1,f8
- nop.i 999
+ nop.m 0
+ fms.s1 fRem = fNormX, f1, f8 // Get remainder = x - trunc(x)
+ nop.i 0
}
-
-// If x<0, test if remainder=0.5
+;;
+
{ .mfi
- nop.m 999
-(p6) fcmp.eq.unc p6,p0 = ROUNDF_REMAINDER, ROUNDF_HALF
- nop.i 999 ;;
+ nop.m 0
+(p8) fcmp.le.s1 p8,p0 = fRem, fMHalf
+ nop.i 0
}
-
-// If x>0, test if remainder=0.5
{ .mfi
- nop.m 999
-(p7) fcmp.eq.unc p7,p0 = ROUNDF_REMAINDER, ROUNDF_HALF
- nop.i 999 ;;
+ nop.m 0
+(p9) fcmp.ge.s1 p9,p0 = fRem, fHalf
+ nop.i 0
}
+;;
-// If x<0 and remainder=0.5, result=truncated-1.0
-// If x>0 and remainder=0.5, result=truncated+1.0
-// Exit
-.pred.rel "mutex",p6,p7
+// If x < 0 and remainder <= -0.5, then subtract 1 from result
+// If x > 0 and remainder >= +0.5, then add 1 to result
+.pred.rel "mutex",p8,p9
{ .mfi
- nop.m 999
-(p6) fsub.s.s0 f8 = ROUNDF_FLOAT_TRUNC_f8,f1
- nop.i 999
+ nop.m 0
+(p8) fms.s.s0 f8 = f8, f1, f1
+ nop.i 0
}
-
{ .mfb
- nop.m 999
-(p7) fadd.s.s0 f8 = ROUNDF_FLOAT_TRUNC_f8,f1
- br.ret.sptk b0 ;;
+ nop.m 0
+(p9) fma.s.s0 f8 = f8, f1, f1
+ br.ret.sptk b0
+}
+;;
+
+
+ROUND_UNORM:
+// Here if x unorm
+{ .mfb
+ getf.exp rSignexp = fNormX // Get signexp, recompute if unorm
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag
+ br.cond.sptk ROUND_COMMON // Return to main path
}
+;;
-.endp roundf
-ASM_SIZE_DIRECTIVE(roundf)
+GLOBAL_LIBM_END(roundf)
diff --git a/sysdeps/ia64/fpu/s_roundl.S b/sysdeps/ia64/fpu/s_roundl.S
index b30f590917..79dff00c06 100644
--- a/sysdeps/ia64/fpu/s_roundl.S
+++ b/sysdeps/ia64/fpu/s_roundl.S
@@ -1,11 +1,10 @@
.file "roundl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 10/25/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
-// Bob Norin, Tom Rowan, Shane Story, and Ping Tak Peter Tang of the
-// Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -21,229 +20,202 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 10/25/2000: Created
+// 10/25/00 Initial version
+// 06/14/01 Changed cmp to an equivalent form
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/20/03 Improved performance and reduced code size
+// 04/18/03 Eliminate possible WAW dependency warning
//==============================================================
-//
+
// API
//==============================================================
// long double roundl(long double x)
-//
+//==============================================================
-#include "libm_support.h"
+// general input registers:
+// r14 - r19
-// general input registers:
-//
-roundl_GR_half = r14
-roundl_GR_big = r15
-roundl_GR_expmask = r16
-roundl_GR_signexp = r17
-roundl_GR_exp = r18
-roundl_GR_expdiff = r19
-
-// predicate registers used:
-// p6 - p10
+rSignexp = r14
+rExp = r15
+rExpMask = r16
+rBigexp = r17
+rExpHalf = r18
+rExpMHalf = r19
+
+// floating-point registers:
+// f8 - f13
-// floating-point registers used:
+fXtruncInt = f9
+fNormX = f10
+fHalf = f11
+fMHalf = f12
+fRem = f13
-ROUNDL_NORM_f8 = f9
-ROUNDL_TRUNC_f8 = f10
-ROUNDL_RINT_f8 = f11
-ROUNDL_FLOAT_TRUNC_f8 = f12
-ROUNDL_FLOAT_RINT_f8 = f13
-ROUNDL_REMAINDER = f14
-ROUNDL_HALF = f15
+// predicate registers used:
+// p6 - p10
// Overview of operation
//==============================================================
-
// long double roundl(long double x)
-// Return an integer value (represented as a long double) that is x
-// rounded to nearest integer, halfway cases rounded away from
-// zero.
+// Return an integer value (represented as a long double) that is x
+// rounded to nearest integer, halfway cases rounded away from
+// zero.
// if x>0 result = trunc(x+0.5)
// if x<0 result = trunc(x-0.5)
-// *******************************************************************************
-
-// Set denormal flag for denormal input and
-// and take denormal fault if necessary.
+//
+//==============================================================
-// If x is NAN, ZERO, INFINITY, or >= 2^63 then return
+// double_extended
+// if the exponent is >= 1003e => 3F(true) = 63(decimal)
+// we have a significand of 64 bits 1.63-bits.
+// If we multiply by 2^63, we no longer have a fractional part
+// So input is an integer value already.
-// qnan snan inf norm unorm 0 -+
-// 1 1 1 0 0 1 11 0xe7
+// double
+// if the exponent is >= 10033 => 34(true) = 52(decimal)
+// 34 + 3ff = 433
+// we have a significand of 53 bits 1.52-bits. (implicit 1)
+// If we multiply by 2^52, we no longer have a fractional part
+// So input is an integer value already.
+// single
+// if the exponent is >= 10016 => 17(true) = 23(decimal)
+// we have a significand of 24 bits 1.23-bits. (implicit 1)
+// If we multiply by 2^23, we no longer have a fractional part
+// So input is an integer value already.
-.align 32
-.global roundl#
.section .text
-.proc roundl#
-.align 32
-
+GLOBAL_LIBM_ENTRY(roundl)
-roundl:
-
-// Get exponent for +0.5
-// Truncate x to integer
{ .mfi
- addl roundl_GR_half = 0x0fffe, r0
- fcvt.fx.trunc.s1 ROUNDL_TRUNC_f8 = f8
- nop.i 999
-}
-
-// Get signexp of x
-// Normalize input
-// Form exponent mask
-{ .mfi
- getf.exp roundl_GR_signexp = f8
- fnorm ROUNDL_NORM_f8 = f8
- addl roundl_GR_expmask = 0x1ffff, r0 ;;
+ getf.exp rSignexp = f8 // Get signexp, recompute if unorm
+ fcvt.fx.trunc.s1 fXtruncInt = f8 // Convert to int in significand
+ addl rBigexp = 0x1003e, r0 // Set exponent at which is integer
}
-
-// Form +0.5
-// Round x to integer
{ .mfi
- setf.exp ROUNDL_HALF = roundl_GR_half
- fcvt.fx.s1 ROUNDL_RINT_f8 = f8
- nop.i 999 ;;
+ mov rExpHalf = 0x0FFFE // Form sign and exponent of 0.5
+ fnorm.s1 fNormX = f8 // Normalize input
+ mov rExpMask = 0x1FFFF // Form exponent mask
}
-// Get exp of x
-// Test for NAN, INF, ZERO
-// Get exponent at which input has no fractional part
-{ .mfi
- and roundl_GR_exp = roundl_GR_expmask, roundl_GR_signexp
- fclass.m p8,p9 = f8,0xe7
- addl roundl_GR_big = 0x1003e, r0 ;;
-}
-
-// Get exp-bigexp
-// If exp is so big there is no fractional part, then turn on p8, off p9
-{ .mmi
- sub roundl_GR_expdiff = roundl_GR_exp, roundl_GR_big ;;
-#ifdef _LIBC
-(p9) cmp.lt.or.andcm p8,p9 = r0, roundl_GR_expdiff
-#else
-(p9) cmp.ge.or.andcm p8,p9 = roundl_GR_expdiff, r0
-#endif
- nop.i 999 ;;
-}
-
-// Set p6 if x<0, else set p7
-{ .mfi
- nop.m 999
-(p9) fcmp.lt.unc p6,p7 = f8,f0
- nop.i 999
+;;
+
+{ .mmf
+ setf.exp fHalf = rExpHalf // Form 0.5
+ mov rExpMHalf = 0x2FFFE // Form sign and exponent of -0.5
+ fclass.m p7,p0 = f8, 0x0b // Test x unorm
}
-
-// If NAN, INF, ZERO, or no fractional part, result is just normalized input
-{ .mfi
- nop.m 999
-(p8) fnorm.s0 f8 = f8
- nop.i 999 ;;
+;;
+
+{ .mfb
+ setf.exp fMHalf = rExpMHalf // Form -0.5
+ fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf
+(p7) br.cond.spnt ROUND_UNORM // Branch if x unorm
}
+;;
-// Float the truncated integer
+ROUND_COMMON:
+// Return here from ROUND_UNORM
{ .mfi
- nop.m 999
-(p9) fcvt.xf ROUNDL_FLOAT_TRUNC_f8 = ROUNDL_TRUNC_f8
- nop.i 999 ;;
+ nop.m 0
+ fcmp.lt.s1 p8,p9 = f8, f0 // Test if x < 0
+ nop.i 0
+}
+{ .mfb
+ and rExp = rSignexp, rExpMask // Get biased exponent
+(p6) fma.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf
+(p6) br.ret.spnt b0 // Exit if x natval, nan, inf
}
+;;
-// Float the rounded integer to get preliminary result
{ .mfi
- nop.m 999
-(p9) fcvt.xf ROUNDL_FLOAT_RINT_f8 = ROUNDL_RINT_f8
- nop.i 999 ;;
-}
-
-// If x<0 and the difference of the truncated input minus the input is 0.5
-// then result = truncated input - 1.0
-// Else if x>0 and the difference of the input minus truncated input is 0.5
-// then result = truncated input + 1.0
-// Else
-// result = rounded input
-// Endif
-{ .mfi
- nop.m 999
-(p6) fsub.s1 ROUNDL_REMAINDER = ROUNDL_FLOAT_TRUNC_f8, ROUNDL_NORM_f8
- nop.i 999
+ cmp.lt p6,p0 = rExp, rExpHalf // Is |x| < 0.5?
+ fcvt.xf f8 = fXtruncInt // Pre-Result if 0.5 <= |x| < 2^63
+ cmp.ge p7,p0 = rExp, rBigexp // Is |x| >= 2^63?
}
-
{ .mfi
- nop.m 999
-(p7) fsub.s1 ROUNDL_REMAINDER = ROUNDL_NORM_f8, ROUNDL_FLOAT_TRUNC_f8
- nop.i 999 ;;
+ cmp.lt p10,p0 = rExp, rExpHalf // Is |x| < 0.5?
+ nop.f 0
+ nop.i 0
}
+;;
-// Assume preliminary result is rounded integer
+// We must correct result if |x| < 0.5, or |x| >= 2^63
+.pred.rel "mutex",p6,p7
{ .mfi
- nop.m 999
-(p9) fnorm.s0 f8 = ROUNDL_FLOAT_RINT_f8
- nop.i 999
+ nop.m 0
+(p6) fmerge.s f8 = fNormX, f0 // If |x| < 0.5, result sgn(x)*0
+ nop.i 0
}
-
-// If x<0, test if result=0
-{ .mfi
- nop.m 999
-(p6) fcmp.eq.unc p10,p0 = ROUNDL_FLOAT_RINT_f8,f0
- nop.i 999 ;;
+{ .mfb
+(p7) cmp.eq p10,p0 = r0, r0 // Also turn on p10 if |x| >= 2^63
+(p7) fma.s0 f8 = fNormX, f1, f0 // If |x| >= 2^63, result x
+(p10) br.ret.spnt b0 // Exit |x| < 0.5 or |x| >= 2^63
}
+;;
-// If x<0 and result=0, set result=-0
+// Here if 0.5 <= |x| < 2^63
{ .mfi
- nop.m 999
-(p10) fmerge.ns f8 = f1,f8
- nop.i 999
+ nop.m 0
+ fms.s1 fRem = fNormX, f1, f8 // Get remainder = x - trunc(x)
+ nop.i 0
}
-
-// If x<0, test if remainder=0.5
+;;
+
{ .mfi
- nop.m 999
-(p6) fcmp.eq.unc p6,p0 = ROUNDL_REMAINDER, ROUNDL_HALF
- nop.i 999 ;;
+ nop.m 0
+(p8) fcmp.le.s1 p8,p0 = fRem, fMHalf
+ nop.i 0
}
-
-// If x>0, test if remainder=0.5
{ .mfi
- nop.m 999
-(p7) fcmp.eq.unc p7,p0 = ROUNDL_REMAINDER, ROUNDL_HALF
- nop.i 999 ;;
+ nop.m 0
+(p9) fcmp.ge.s1 p9,p0 = fRem, fHalf
+ nop.i 0
}
+;;
-// If x<0 and remainder=0.5, result=truncated-1.0
-// If x>0 and remainder=0.5, result=truncated+1.0
-// Exit
-.pred.rel "mutex",p6,p7
+// If x < 0 and remainder <= -0.5, then subtract 1 from result
+// If x > 0 and remainder >= +0.5, then add 1 to result
+.pred.rel "mutex",p8,p9
{ .mfi
- nop.m 999
-(p6) fsub.s0 f8 = ROUNDL_FLOAT_TRUNC_f8,f1
- nop.i 999
+ nop.m 0
+(p8) fms.s0 f8 = f8, f1, f1
+ nop.i 0
}
-
{ .mfb
- nop.m 999
-(p7) fadd.s0 f8 = ROUNDL_FLOAT_TRUNC_f8,f1
- br.ret.sptk b0 ;;
+ nop.m 0
+(p9) fma.s0 f8 = f8, f1, f1
+ br.ret.sptk b0
+}
+;;
+
+
+ROUND_UNORM:
+// Here if x unorm
+{ .mfb
+ getf.exp rSignexp = fNormX // Get signexp, recompute if unorm
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag
+ br.cond.sptk ROUND_COMMON // Return to main path
}
+;;
-.endp roundl
-ASM_SIZE_DIRECTIVE(roundl)
+GLOBAL_LIBM_END(roundl)
diff --git a/sysdeps/ia64/fpu/s_scalblnf.c b/sysdeps/ia64/fpu/s_scalblnf.c
new file mode 100644
index 0000000000..97de090738
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_scalblnf.c
@@ -0,0 +1,62 @@
+/* file: scalblnf.c */
+
+
+// Copyright (c) 2000, 2001, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
+// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//
+
+#include "libm_support.h"
+
+float __libm_scalblnf(float, long int, int);
+
+
+// scalblnf: forwards x and n to __libm_scalblnf together with a flag that is
+// 1 under SIZE_LONG_INT_64 and 0 under SIZE_LONG_INT_32.
+float scalblnf(float x, long int n)
+{
+#if defined SIZE_LONG_INT_64
+   return __libm_scalblnf(x,n,1);
+#elif defined SIZE_LONG_INT_32
+   return __libm_scalblnf(x,n,0);
+#else
+   // Neither macro defined: do not fall off the end of a non-void function.
+   // NOTE(review): assumes the flag means "long int is 64 bits" -- confirm.
+   return __libm_scalblnf(x,n,sizeof(long int) == 8);
+#endif
+}
diff --git a/sysdeps/ia64/fpu/s_scalbn.c b/sysdeps/ia64/fpu/s_scalbn.c
new file mode 100644
index 0000000000..b0bd44a53c
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_scalbn.c
@@ -0,0 +1,62 @@
+/* file: scalbn.c */
+
+
+// Copyright (c) 2000, 2001, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
+// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//
+
+#include "libm_support.h"
+
+double __libm_scalbn(double, int, int);
+
+
+double scalbn(double x, int n)
+{
+// Forward to __libm_scalbn; the last argument is 1 when SIZE_INT_64 is
+// defined and 0 when SIZE_INT_32 is defined (presumably an int-width flag).
+#if defined(SIZE_INT_64)
+ return __libm_scalbn(x,n,1);
+#elif defined(SIZE_INT_32)
+ return __libm_scalbn(x,n,0);
+#else
+// Without a size macro the body would be empty and control would fall off
+// the end of a non-void function, so fail the build instead.
+#error "scalbn: define SIZE_INT_32 or SIZE_INT_64"
+#endif
+}
diff --git a/sysdeps/ia64/fpu/s_scalbnf.c b/sysdeps/ia64/fpu/s_scalbnf.c
new file mode 100644
index 0000000000..176c2edbd8
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_scalbnf.c
@@ -0,0 +1,62 @@
+/* file: scalbnf.c */
+
+
+// Copyright (c) 2000, 2001, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
+// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//
+
+#include "libm_support.h"
+
+float __libm_scalbnf(float, int, int);
+
+
+float scalbnf(float x, int n)
+{
+// Forward to __libm_scalbnf; the last argument is 1 when SIZE_INT_64 is
+// defined and 0 when SIZE_INT_32 is defined (presumably an int-width flag).
+#if defined(SIZE_INT_64)
+ return __libm_scalbnf(x,n,1);
+#elif defined(SIZE_INT_32)
+ return __libm_scalbnf(x,n,0);
+#else
+// Without a size macro the body would be empty and control would fall off
+// the end of a non-void function, so fail the build instead.
+#error "scalbnf: define SIZE_INT_32 or SIZE_INT_64"
+#endif
+}
diff --git a/sysdeps/ia64/fpu/s_scalbnl.c b/sysdeps/ia64/fpu/s_scalbnl.c
new file mode 100644
index 0000000000..d19ddd3c8e
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_scalbnl.c
@@ -0,0 +1,62 @@
+/* file: scalbnl.c */
+
+
+// Copyright (c) 2000, 2001, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
+// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//
+
+#include "libm_support.h"
+
+long double __libm_scalbnl(long double, int, int);
+
+
+long double scalbnl(long double x, int n)
+{
+// Forward to __libm_scalbnl; the last argument is 1 when SIZE_INT_64 is
+// defined and 0 when SIZE_INT_32 is defined (presumably an int-width flag).
+#if defined(SIZE_INT_64)
+ return __libm_scalbnl(x,n,1);
+#elif defined(SIZE_INT_32)
+ return __libm_scalbnl(x,n,0);
+#else
+// Without a size macro the body would be empty and control would fall off
+// the end of a non-void function, so fail the build instead.
+#error "scalbnl: define SIZE_INT_32 or SIZE_INT_64"
+#endif
+}
diff --git a/sysdeps/ia64/fpu/s_significand.S b/sysdeps/ia64/fpu/s_significand.S
index 84141daf4d..720e043e5c 100644
--- a/sysdeps/ia64/fpu/s_significand.S
+++ b/sysdeps/ia64/fpu/s_significand.S
@@ -1,10 +1,10 @@
.file "significand.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,13 +35,15 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00: Initial version
-// 4/04/00 Unwind support added
-// 5/31/00: Fixed bug when x a double-extended denormal
+// 02/02/00 Initial version
+// 04/04/00 Unwind support added
+// 05/31/00 Fixed bug when x is a double-extended denormal
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
@@ -56,18 +58,10 @@
// p6, p7
//
// floating-point registers used:
-// f8, f9, f10
-
-#include "libm_support.h"
-
-.align 32
-.global significand#
+// f8, f9, f10
.section .text
-.proc significand#
-.align 32
-
-significand:
+GLOBAL_LIBM_ENTRY(significand)
// qnan snan inf norm unorm 0 -+
// 1 1 1 0 0 1 11
@@ -75,19 +69,19 @@ significand:
// f10 gets f8(sign) with f1(exp,significand)
{ .mfi
nop.m 999
-(p0) fmerge.s f10 = f8,f1
+ fmerge.s f10 = f8,f1
nop.i 999
}
{ .mfi
nop.m 999
-(p0) fnorm f9 = f8
+ fnorm.s0 f9 = f8
nop.i 999 ;;
}
// Test for denormal input
{ .mfi
nop.m 999
-(p0) fclass.m.unc p7,p0 = f8, 0x0b
+ fclass.m.unc p7,p0 = f8, 0x0b
nop.i 999 ;;
}
@@ -97,14 +91,14 @@ significand:
// return sign(f8) exp(f8) significand(f8), normalized.
{ .mfi
nop.m 999
-(p0) fclass.m.unc p0,p6 = f8, 0xe7
+ fclass.m.unc p0,p6 = f8, 0xe7
nop.i 999 ;;
}
{ .mmb
nop.m 999
nop.m 999
-(p7) br.cond.spnt L(SIGNIFICAND_DENORM) ;; // Branch if x denormal
+(p7) br.cond.spnt SIGNIFICAND_DENORM ;; // Branch if x denormal
}
{ .mfi
@@ -115,29 +109,29 @@ significand:
{ .mfb
nop.m 999
-(p0) fnorm.d f8 = f8
-(p0) br.ret.sptk b0 ;;
+ fnorm.d.s0 f8 = f8
+ br.ret.sptk b0 ;;
}
-L(SIGNIFICAND_DENORM):
+SIGNIFICAND_DENORM:
// Here if x denorm
{ .mfi
nop.m 999
-(p0) fmerge.se f8 = f10,f9
+ fmerge.se f8 = f10,f9
nop.i 999 ;;
}
// Check if fnorm(x) still denormal, means x double-extended denormal
{ .mfi
nop.m 999
-(p0) fclass.m.unc p7,p0 = f9, 0x0b
+ fclass.m.unc p7,p0 = f9, 0x0b
nop.i 999 ;;
}
// This will be the final result unless x double-extended denormal
{ .mfi
nop.m 999
-(p0) fnorm.d f8 = f8
+ fnorm.d.s0 f8 = f8
nop.i 999 ;;
}
@@ -152,9 +146,8 @@ L(SIGNIFICAND_DENORM):
// Final normalization if x double-extended denorm
{ .mfb
nop.m 999
-(p7) fnorm.d f8 = f8
-(p0) br.ret.sptk b0 ;;
+(p7) fnorm.d.s0 f8 = f8
+ br.ret.sptk b0 ;;
}
-.endp significand
-ASM_SIZE_DIRECTIVE(significand)
+GLOBAL_LIBM_END(significand)
diff --git a/sysdeps/ia64/fpu/s_significandf.S b/sysdeps/ia64/fpu/s_significandf.S
index d8cdc159f6..5c8299b944 100644
--- a/sysdeps/ia64/fpu/s_significandf.S
+++ b/sysdeps/ia64/fpu/s_significandf.S
@@ -1,10 +1,10 @@
.file "significandf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,13 +35,15 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00: Initial version
-// 2/03/00: Modified to improve speed
-// 5/31/00: Fixed bug when x a double-extended denormal
+// 02/02/00 Initial version
+// 02/03/00 Modified to improve speed
+// 05/31/00 Fixed bug when x is a double-extended denormal
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
@@ -55,18 +57,10 @@
// p6, p7
//
// floating-point registers used:
-// f8, f9, f10
-
-#include "libm_support.h"
-
-.align 32
-.global significandf#
+// f8, f9, f10
.section .text
-.proc significandf#
-.align 32
-
-significandf:
+GLOBAL_LIBM_ENTRY(significandf)
// qnan snan inf norm unorm 0 -+
// 1 1 1 0 0 1 11
@@ -74,19 +68,19 @@ significandf:
// f10 gets f8(sign) with f1(exp,significand)
{ .mfi
nop.m 999
-(p0) fmerge.s f10 = f8,f1
+ fmerge.s f10 = f8,f1
nop.i 999
}
{ .mfi
nop.m 999
-(p0) fnorm f9 = f8
+ fnorm.s0 f9 = f8
nop.i 999 ;;
}
// Test for denormal input
{ .mfi
nop.m 999
-(p0) fclass.m.unc p7,p0 = f8, 0x0b
+ fclass.m.unc p7,p0 = f8, 0x0b
nop.i 999 ;;
}
@@ -96,14 +90,14 @@ significandf:
// return sign(f8) exp(f8) significand(f8), normalized.
{ .mfi
nop.m 999
-(p0) fclass.m.unc p0,p6 = f8, 0xe7
+ fclass.m.unc p0,p6 = f8, 0xe7
nop.i 999 ;;
}
{ .mmb
nop.m 999
nop.m 999
-(p7) br.cond.spnt L(SIGNIFICAND_DENORM) ;; // Branch if x denormal
+(p7) br.cond.spnt SIGNIFICAND_DENORM ;; // Branch if x denormal
}
{ .mfi
@@ -114,29 +108,29 @@ significandf:
{ .mfb
nop.m 999
-(p0) fnorm.s f8 = f8
-(p0) br.ret.sptk b0 ;;
+ fnorm.s.s0 f8 = f8
+ br.ret.sptk b0 ;;
}
-L(SIGNIFICAND_DENORM):
+SIGNIFICAND_DENORM:
// Here if x denorm
{ .mfi
nop.m 999
-(p0) fmerge.se f8 = f10,f9
+ fmerge.se f8 = f10,f9
nop.i 999 ;;
}
// Check if fnorm(x) still denormal, means x double-extended denormal
{ .mfi
nop.m 999
-(p0) fclass.m.unc p7,p0 = f9, 0x0b
+ fclass.m.unc p7,p0 = f9, 0x0b
nop.i 999 ;;
}
// This will be the final result unless x double-extended denormal
{ .mfi
nop.m 999
-(p0) fnorm.s f8 = f8
+ fnorm.s.s0 f8 = f8
nop.i 999 ;;
}
@@ -151,9 +145,8 @@ L(SIGNIFICAND_DENORM):
// Final normalization if x double-extended denorm
{ .mfb
nop.m 999
-(p7) fnorm.s f8 = f8
-(p0) br.ret.sptk b0 ;;
+(p7) fnorm.s.s0 f8 = f8
+ br.ret.sptk b0 ;;
}
-.endp significandf
-ASM_SIZE_DIRECTIVE(significandf)
+GLOBAL_LIBM_END(significandf)
diff --git a/sysdeps/ia64/fpu/s_significandl.S b/sysdeps/ia64/fpu/s_significandl.S
index 268d3567d0..f62df4310c 100644
--- a/sysdeps/ia64/fpu/s_significandl.S
+++ b/sysdeps/ia64/fpu/s_significandl.S
@@ -1,10 +1,10 @@
.file "significandl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,13 +35,15 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00: Initial version
-// 2/03/00: Modified to improve speed
-// 5/31/00: Fixed bug when x a double-extended denormal
+// 02/02/00 Initial version
+// 02/03/00 Modified to improve speed
+// 05/31/00 Fixed bug when x is a double-extended denormal
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
@@ -56,18 +58,10 @@
// p6, p7
//
// floating-point registers used:
-// f8, f9, f10
-
-#include "libm_support.h"
-
-.align 32
-.global significandl#
+// f8, f9, f10
.section .text
-.proc significandl#
-.align 32
-
-significandl:
+GLOBAL_LIBM_ENTRY(significandl)
// qnan snan inf norm unorm 0 -+
// 1 1 1 0 0 1 11
@@ -75,19 +69,19 @@ significandl:
// f10 gets f8(sign) with f1(exp,significand)
{ .mfi
nop.m 999
-(p0) fmerge.s f10 = f8,f1
+ fmerge.s f10 = f8,f1
nop.i 999
}
{ .mfi
nop.m 999
-(p0) fnorm f9 = f8
+ fnorm.s0 f9 = f8
nop.i 999 ;;
}
// Test for denormal input
{ .mfi
nop.m 999
-(p0) fclass.m.unc p7,p0 = f8, 0x0b
+ fclass.m.unc p7,p0 = f8, 0x0b
nop.i 999 ;;
}
@@ -97,14 +91,14 @@ significandl:
// return sign(f8) exp(f8) significand(f8), normalized.
{ .mfi
nop.m 999
-(p0) fclass.m.unc p0,p6 = f8, 0xe7
+ fclass.m.unc p0,p6 = f8, 0xe7
nop.i 999 ;;
}
{ .mmb
nop.m 999
nop.m 999
-(p7) br.cond.spnt L(SIGNIFICAND_DENORM) ;; // Branch if x denormal
+(p7) br.cond.spnt SIGNIFICAND_DENORM ;; // Branch if x denormal
}
{ .mfi
@@ -115,29 +109,29 @@ significandl:
{ .mfb
nop.m 999
-(p0) fnorm f8 = f8
-(p0) br.ret.sptk b0 ;;
+ fnorm.s0 f8 = f8
+ br.ret.sptk b0 ;;
}
-L(SIGNIFICAND_DENORM):
+SIGNIFICAND_DENORM:
// Here if x denorm
{ .mfi
nop.m 999
-(p0) fmerge.se f8 = f10,f9
+ fmerge.se f8 = f10,f9
nop.i 999 ;;
}
// Check if fnorm(x) still denormal, means x double-extended denormal
{ .mfi
nop.m 999
-(p0) fclass.m.unc p7,p0 = f9, 0x0b
+ fclass.m.unc p7,p0 = f9, 0x0b
nop.i 999 ;;
}
// This will be the final result unless x double-extended denormal
{ .mfi
nop.m 999
-(p0) fnorm f8 = f8
+ fnorm.s0 f8 = f8
nop.i 999 ;;
}
@@ -152,9 +146,8 @@ L(SIGNIFICAND_DENORM):
// Final normalization if x double-extended denorm
{ .mfb
nop.m 999
-(p7) fnorm f8 = f8
-(p0) br.ret.sptk b0 ;;
+(p7) fnorm.s0 f8 = f8
+ br.ret.sptk b0 ;;
}
-.endp significandl
-ASM_SIZE_DIRECTIVE(significandl)
+GLOBAL_LIBM_END(significandl)
diff --git a/sysdeps/ia64/fpu/s_sincos.c b/sysdeps/ia64/fpu/s_sincos.c
index 1ddbc2122a..41254ae60a 100644
--- a/sysdeps/ia64/fpu/s_sincos.c
+++ b/sysdeps/ia64/fpu/s_sincos.c
@@ -1,9 +1 @@
-#include <math.h>
-
-void
-__sincos (double x, double *s, double *c)
-{
- *s = sin (x);
- *c = cos (x);
-}
-weak_alias (__sincos, sincos)
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/s_sincosf.c b/sysdeps/ia64/fpu/s_sincosf.c
index efd0fe3038..41254ae60a 100644
--- a/sysdeps/ia64/fpu/s_sincosf.c
+++ b/sysdeps/ia64/fpu/s_sincosf.c
@@ -1,9 +1 @@
-#include <math.h>
-
-void
-__sincosf (float x, float *s, float *c)
-{
- *s = sinf (x);
- *c = cosf (x);
-}
-weak_alias (__sincosf, sincosf)
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/s_sincosl.c b/sysdeps/ia64/fpu/s_sincosl.c
index a835b772e2..41254ae60a 100644
--- a/sysdeps/ia64/fpu/s_sincosl.c
+++ b/sysdeps/ia64/fpu/s_sincosl.c
@@ -1,9 +1 @@
-#include <math.h>
-
-void
-__sincosl (long double x, long double *s, long double *c)
-{
- *s = sinl (x);
- *c = cosl (x);
-}
-weak_alias (__sincosl, sincosl)
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/s_tan.S b/sysdeps/ia64/fpu/s_tan.S
index 3a497fcf4c..3000f5ee06 100644
--- a/sysdeps/ia64/fpu/s_tan.S
+++ b/sysdeps/ia64/fpu/s_tan.S
@@ -1,10 +1,10 @@
-.file "tan.s"
+.file "tancot.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -32,20 +32,24 @@
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00: Initial version
-// 4/04/00 Unwind support added
+// 02/02/00 Initial version
+// 04/04/00 Unwind support added
// 12/27/00 Improved speed
+// 02/21/01 Updated to call tanl
+// 05/30/02 Added cot
+// 02/10/03 Reordered header: .section, .global, .proc, .align
//
// API
//==============================================================
-// double tan( double x);
+// double tan(double x);
+// double cot(double x);
//
// Overview of operation
//==============================================================
@@ -61,11 +65,14 @@
// Nfloat = round_int(tan_W)
//
// tan_r = x - Nfloat * (pi/2)_hi
-// tan_r = tan_r - Nfloat * (pi/2)_lo
+// a) tan_r = tan_r - Nfloat * (pi/2)_lo (for tan)
+// b) tan_r = Nfloat * (pi/2)_lo - tan_r (for cot)
//
// We have two paths: p8, when Nfloat is even and p9. when Nfloat is odd.
-// p8: tan(X) = tan(r)
-// p9: tan(X) = -cot(r)
+// a) for tan: p8: tan(X) = tan(r)
+// p9: tan(X) = -cot(r)
+// b) for cot: p9: cot(X) = cot(r)
+// p8: cot(X) = -tan(r)
//
// Each is evaluated as a series. The p9 path requires 1/r.
//
@@ -75,19 +82,16 @@
// Registers used
//==============================================================
//
-// predicate registers used:
-// p6-10
+// predicate registers used:
+// p6-12
//
-// floating-point registers used:
-// f10-15, f32-105
+// floating-point registers used:
+// f10-15, f32-106
// f8, input
//
// general registers used
-// r14-18, r32-43
+// r14-26, r32-39
//
-
-#include "libm_support.h"
-
// Assembly macros
//==============================================================
TAN_INV_PI_BY_2_2TO64 = f10
@@ -105,28 +109,28 @@ tan_Pi_by_2_lo = f34
tan_P0 = f35
tan_P1 = f36
tan_P2 = f37
-tan_P3 = f38
-tan_P4 = f39
-tan_P5 = f40
+tan_P3 = f38
+tan_P4 = f39
+tan_P5 = f40
tan_P6 = f41
tan_P7 = f42
-tan_P8 = f43
-tan_P9 = f44
-tan_P10 = f45
+tan_P8 = f43
+tan_P9 = f44
+tan_P10 = f45
tan_P11 = f46
-tan_P12 = f47
+tan_P12 = f47
tan_P13 = f48
tan_P14 = f49
tan_P15 = f50
-tan_Q0 = f51
-tan_Q1 = f52
-tan_Q2 = f53
-tan_Q3 = f54
-tan_Q4 = f55
-tan_Q5 = f56
-tan_Q6 = f57
-tan_Q7 = f58
+tan_Q0 = f51
+tan_Q1 = f52
+tan_Q2 = f53
+tan_Q3 = f54
+tan_Q4 = f55
+tan_Q5 = f56
+tan_Q6 = f57
+tan_Q7 = f58
tan_Q8 = f59
tan_Q9 = f60
tan_Q10 = f61
@@ -153,19 +157,19 @@ tan_v10 = f79
tan_v2 = f80
tan_v9 = f81
tan_v1 = f82
-tan_int_Nfloat = f83
-tan_Nfloat = f84
+tan_int_Nfloat = f83
+tan_Nfloat = f84
-tan_NORM_f8 = f85
+tan_NORM_f8 = f85
tan_W = f86
tan_y0 = f87
-tan_d = f88
-tan_y1 = f89
-tan_dsq = f90
-tan_y2 = f91
-tan_d4 = f92
-tan_inv_r = f93
+tan_d = f88
+tan_y1 = f89
+tan_dsq = f90
+tan_y2 = f91
+tan_d4 = f92
+tan_inv_r = f93
tan_z1 = f94
tan_z2 = f95
@@ -180,6 +184,7 @@ tan_z10 = f103
tan_z11 = f104
tan_z12 = f105
+arg_copy = f106
/////////////////////////////////////////////////////////////
@@ -188,37 +193,33 @@ tan_GR_rshf_2to64 = r15
tan_GR_exp_2tom64 = r16
tan_GR_n = r17
tan_GR_rshf = r18
-
-tan_AD = r33
-tan_GR_10009 = r34
-tan_GR_17_ones = r35
-tan_GR_N_odd_even = r36
-tan_GR_N = r37
-tan_signexp = r38
-tan_exp = r39
-tan_ADQ = r40
-
-GR_SAVE_PFS = r41
-GR_SAVE_B0 = r42
-GR_SAVE_GP = r43
-
-
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+tan_AD = r19
+tan_GR_10009 = r20
+tan_GR_17_ones = r21
+tan_GR_N_odd_even = r22
+tan_GR_N = r23
+tan_signexp = r24
+tan_exp = r25
+tan_ADQ = r26
+
+GR_SAVE_B0 = r33
+GR_SAVE_PFS = r34
+GR_SAVE_GP = r35
+GR_Parameter_X = r36
+GR_Parameter_Y = r37
+GR_Parameter_RESULT = r38
+GR_Parameter_Tag = r39
+
+
+RODATA
.align 16
-double_tan_constants:
-ASM_TYPE_DIRECTIVE(double_tan_constants,@object)
-// data8 0xA2F9836E4E44152A, 0x00003FFE // 2/pi
+LOCAL_OBJECT_START(double_tan_constants)
data8 0xC90FDAA22168C234, 0x00003FFF // pi/2 hi
-
- data8 0xBEEA54580DDEA0E1 // P14
+ data8 0xBEEA54580DDEA0E1 // P14
data8 0x3ED3021ACE749A59 // P15
- data8 0xBEF312BD91DC8DA1 // P12
+ data8 0xBEF312BD91DC8DA1 // P12
data8 0x3EFAE9AFC14C5119 // P13
data8 0x3F2F342BF411E769 // P8
data8 0x3F1A60FC9F3B0227 // P9
@@ -232,10 +233,9 @@ ASM_TYPE_DIRECTIVE(double_tan_constants,@object)
data8 0x3FC11111111111C2 // P1
data8 0x3FABA1BA1BA0E850 // P2
data8 0x3F9664F4886725A7 // P3
-ASM_SIZE_DIRECTIVE(double_tan_constants)
+LOCAL_OBJECT_END(double_tan_constants)
-double_Q_tan_constants:
-ASM_TYPE_DIRECTIVE(double_Q_tan_constants,@object)
+LOCAL_OBJECT_START(double_Q_tan_constants)
data8 0xC4C6628B80DC1CD1, 0x00003FBF // pi/2 lo
data8 0x3E223A73BA576E48 // Q8
data8 0x3DF54AD8D1F2CA43 // Q9
@@ -248,35 +248,19 @@ ASM_TYPE_DIRECTIVE(double_Q_tan_constants,@object)
data8 0x3F61566ABBFFB489 // Q2
data8 0x3F2BBD77945C1733 // Q3
data8 0x3D927FB33E2B0E04 // Q10
-ASM_SIZE_DIRECTIVE(double_Q_tan_constants)
+LOCAL_OBJECT_END(double_Q_tan_constants)
-
-.align 32
-.global tan#
-#ifdef _LIBC
-.global __tan#
-#endif
+.section .text
////////////////////////////////////////////////////////
-
-
-.section .text
-.proc tan#
-#ifdef _LIBC
-.proc __tan#
-#endif
-.align 32
-tan:
-#ifdef _LIBC
-__tan:
-#endif
+LOCAL_LIBM_ENTRY(cot)
// The initial fnorm will take any unmasked faults and
// normalize any single/double unorms
{ .mlx
- alloc r32=ar.pfs,1,11,0,0
+ cmp.eq p12, p11 = r0, r0 // set p12=1, p11=0 for cot
movl tan_GR_sig_inv_pi_by_2 = 0xA2F9836E4E44152A // significand of 2/pi
}
{ .mlx
@@ -285,18 +269,47 @@ __tan:
}
;;
-{ .mfi
- ld8 tan_AD = [tan_AD]
- fnorm tan_NORM_f8 = f8
+{ .mlx
mov tan_GR_exp_2tom64 = 0xffff-64 // exponent of scaling factor 2^-64
+ movl tan_GR_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift
}
+{ .mfb
+ ld8 tan_AD = [tan_AD]
+ fnorm.s0 tan_NORM_f8 = f8
+ br.cond.sptk COMMON_PATH
+}
+;;
+
+LOCAL_LIBM_END(cot)
+
+GLOBAL_IEEE754_ENTRY(tan)
+// The initial fnorm will take any unmasked faults and
+// normalize any single/double unorms
+
{ .mlx
- nop.m 999
+ cmp.eq p11, p12 = r0, r0 // set p11=1, p12=0 for tan
+ movl tan_GR_sig_inv_pi_by_2 = 0xA2F9836E4E44152A // significand of 2/pi
+}
+{ .mlx
+ addl tan_AD = @ltoff(double_tan_constants), gp
+ movl tan_GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+63+1)
+}
+;;
+
+{ .mlx
+ mov tan_GR_exp_2tom64 = 0xffff-64 // exponent of scaling factor 2^-64
movl tan_GR_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift
}
+{ .mfi
+ ld8 tan_AD = [tan_AD]
+ fnorm.s0 tan_NORM_f8 = f8
+ nop.i 0
+}
;;
+// Common path for both tan and cot
+COMMON_PATH:
// Form two constants we need
// 2/pi * 2^1 * 2^63, scaled by 2^64 since we just loaded the significand
// 1.1000...000 * 2^(63+63+1) to right shift int(W) into the significand
@@ -313,7 +326,7 @@ __tan:
{ .mmf
setf.exp TAN_2TOM64 = tan_GR_exp_2tom64
adds tan_ADQ = double_Q_tan_constants - double_tan_constants, tan_AD
- fclass.m.unc p6,p0 = f8, 0x07 // Test for x=0
+(p11) fclass.m.unc p6,p0 = f8, 0x07 // Test for x=0 (tan)
}
;;
@@ -323,79 +336,79 @@ __tan:
// 1.1000...000 * 2^63, the right shift constant
{ .mmf
setf.d TAN_RSHF = tan_GR_rshf
- ldfe tan_Pi_by_2_hi = [tan_AD],16
+ ldfe tan_Pi_by_2_hi = [tan_AD],16
fclass.m.unc p7,p0 = f8, 0x23 // Test for x=inf
}
;;
{ .mfb
- ldfe tan_Pi_by_2_lo = [tan_ADQ],16
+ ldfe tan_Pi_by_2_lo = [tan_ADQ],16
fclass.m.unc p8,p0 = f8, 0xc3 // Test for x=nan
-(p6) br.ret.spnt b0 ;; // Exit for x=0
+(p6) br.ret.spnt b0 ;; // Exit for x=0 (tan only)
}
{ .mfi
- ldfpd tan_P14,tan_P15 = [tan_AD],16
+ ldfpd tan_P14,tan_P15 = [tan_AD],16
(p7) frcpa.s0 f8,p9=f0,f0 // Set qnan indef if x=inf
mov tan_GR_10009 = 0x10009
}
{ .mib
- ldfpd tan_Q8,tan_Q9 = [tan_ADQ],16
+ ldfpd tan_Q8,tan_Q9 = [tan_ADQ],16
nop.i 999
(p7) br.ret.spnt b0 ;; // Exit for x=inf
}
{ .mfi
- ldfpd tan_P12,tan_P13 = [tan_AD],16
-(p8) fma.d f8=f8,f1,f8 // Set qnan if x=nan
+ ldfpd tan_P12,tan_P13 = [tan_AD],16
+(p12) fclass.m.unc p6,p0 = f8, 0x07 // Test for x=0 (cot)
nop.i 999
}
-{ .mib
- ldfpd tan_Q4,tan_Q5 = [tan_ADQ],16
- nop.i 999
+{ .mfb
+ ldfpd tan_Q4,tan_Q5 = [tan_ADQ],16
+(p8) fma.d.s0 f8=f8,f1,f8 // Set qnan if x=nan
(p8) br.ret.spnt b0 ;; // Exit for x=nan
}
-{ .mmi
- getf.exp tan_signexp = tan_NORM_f8
- ldfpd tan_P8,tan_P9 = [tan_AD],16
- nop.i 999 ;;
+{ .mmf
+ getf.exp tan_signexp = tan_NORM_f8
+ ldfpd tan_P8,tan_P9 = [tan_AD],16
+ fmerge.s arg_copy = f8, f8 ;; // Save input for error call
}
-// Multiply x by scaled 2/pi and add large const to shift integer part of W to
+// Multiply x by scaled 2/pi and add large const to shift integer part of W to
// rightmost bits of significand
-{ .mfi
+{ .mmf
+ alloc r32=ar.pfs,0,4,4,0
ldfpd tan_Q6,tan_Q7 = [tan_ADQ],16
fma.s1 TAN_W_2TO64_RSH = tan_NORM_f8,TAN_INV_PI_BY_2_2TO64,TAN_RSHF_2TO64
- nop.i 999 ;;
-}
+};;
-{ .mmi
- ldfpd tan_P10,tan_P11 = [tan_AD],16
- nop.m 999
- and tan_exp = tan_GR_17_ones, tan_signexp ;;
+{ .mmf
+ ldfpd tan_P10,tan_P11 = [tan_AD],16
+ and tan_exp = tan_GR_17_ones, tan_signexp
+(p6) frcpa.s0 f8, p0 = f1, f8 ;; // cot(+-0) = +-Inf
}
// p7 is true if we must call DBX TAN
// p7 is true if f8 exp is > 0x10009 (which includes all ones
// NAN or inf)
-{ .mmi
- ldfpd tan_Q0,tan_Q1 = [tan_ADQ],16
- cmp.ge.unc p7,p0 = tan_exp,tan_GR_10009
- nop.i 999 ;;
+{ .mmb
+ ldfpd tan_Q0,tan_Q1 = [tan_ADQ],16
+ cmp.ge.unc p7,p0 = tan_exp,tan_GR_10009
+(p7) br.cond.spnt TAN_DBX ;;
}
{ .mmb
- ldfpd tan_P4,tan_P5 = [tan_AD],16
- nop.m 999
-(p7) br.cond.spnt L(TAN_DBX) ;;
+ ldfpd tan_P4,tan_P5 = [tan_AD],16
+(p6) mov GR_Parameter_Tag = 226 // (cot)
+(p6) br.cond.spnt __libm_error_region ;; // call error support if cot(+-0)
}
{ .mmi
- ldfpd tan_Q2,tan_Q3 = [tan_ADQ],16
+ ldfpd tan_Q2,tan_Q3 = [tan_ADQ],16
nop.m 999
nop.i 999 ;;
}
@@ -404,8 +417,8 @@ __tan:
// TAN_NFLOAT = Round_Int_Nearest(tan_W)
{ .mfi
- ldfpd tan_P6,tan_P7 = [tan_AD],16
- fms.s1 TAN_NFLOAT = TAN_W_2TO64_RSH,TAN_2TOM64,TAN_RSHF
+ ldfpd tan_P6,tan_P7 = [tan_AD],16
+ fms.s1 TAN_NFLOAT = TAN_W_2TO64_RSH,TAN_2TOM64,TAN_RSHF
nop.i 999 ;;
}
@@ -418,22 +431,22 @@ __tan:
{ .mfi
- ldfpd tan_P0,tan_P1 = [tan_AD],16
+ ldfpd tan_P0,tan_P1 = [tan_AD],16
nop.f 999
nop.i 999 ;;
}
-{ .mfi
+{ .mmi
getf.sig tan_GR_n = TAN_W_2TO64_RSH
- nop.f 999
+ ldfpd tan_P2,tan_P3 = [tan_AD]
nop.i 999 ;;
}
// tan_r = -tan_Nfloat * tan_Pi_by_2_hi + x
{ .mfi
- ldfpd tan_P2,tan_P3 = [tan_AD]
- fnma.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_hi, tan_NORM_f8
+(p12) add tan_GR_n = 0x1, tan_GR_n // N = N + 1 (for cot)
+ fnma.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_hi, tan_NORM_f8
nop.i 999 ;;
}
@@ -441,42 +454,49 @@ __tan:
// p8 ==> even
// p9 ==> odd
{ .mmi
- and tan_GR_N_odd_even = 0x1, tan_GR_n ;;
+ and tan_GR_N_odd_even = 0x1, tan_GR_n ;;
nop.m 999
cmp.eq.unc p8,p9 = tan_GR_N_odd_even, r0 ;;
}
-// tan_r = tan_r -tan_Nfloat * tan_Pi_by_2_lo
+.pred.rel "mutex", p11, p12
+// tan_r = tan_r -tan_Nfloat * tan_Pi_by_2_lo (tan)
{ .mfi
nop.m 999
- fnma.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_lo, tan_r
+(p11) fnma.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_lo, tan_r
+ nop.i 999
+}
+// tan_r = -(tan_r -tan_Nfloat * tan_Pi_by_2_lo) (cot)
+{ .mfi
+ nop.m 999
+(p12) fms.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_lo, tan_r
nop.i 999 ;;
}
{ .mfi
nop.m 999
- fma.s1 tan_rsq = tan_r, tan_r, f0
+ fma.s1 tan_rsq = tan_r, tan_r, f0
nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p9) frcpa.s1 tan_y0, p10 = f1,tan_r
+(p9) frcpa.s1 tan_y0, p0 = f1,tan_r
nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p8) fma.s1 tan_v18 = tan_rsq, tan_P15, tan_P14
+(p8) fma.s1 tan_v18 = tan_rsq, tan_P15, tan_P14
nop.i 999
}
{ .mfi
nop.m 999
-(p8) fma.s1 tan_v4 = tan_rsq, tan_P1, tan_P0
+(p8) fma.s1 tan_v4 = tan_rsq, tan_P1, tan_P0
nop.i 999 ;;
}
@@ -484,12 +504,12 @@ __tan:
{ .mfi
nop.m 999
-(p8) fma.s1 tan_v16 = tan_rsq, tan_P13, tan_P12
- nop.i 999
+(p8) fma.s1 tan_v16 = tan_rsq, tan_P13, tan_P12
+ nop.i 999
}
{ .mfi
nop.m 999
-(p8) fma.s1 tan_v17 = tan_rsq, tan_rsq, f0
+(p8) fma.s1 tan_v17 = tan_rsq, tan_rsq, f0
nop.i 999 ;;
}
@@ -497,12 +517,12 @@ __tan:
{ .mfi
nop.m 999
-(p8) fma.s1 tan_v12 = tan_rsq, tan_P9, tan_P8
- nop.i 999
+(p8) fma.s1 tan_v12 = tan_rsq, tan_P9, tan_P8
+ nop.i 999
}
{ .mfi
nop.m 999
-(p8) fma.s1 tan_v13 = tan_rsq, tan_P11, tan_P10
+(p8) fma.s1 tan_v13 = tan_rsq, tan_P11, tan_P10
nop.i 999 ;;
}
@@ -510,12 +530,12 @@ __tan:
{ .mfi
nop.m 999
-(p8) fma.s1 tan_v7 = tan_rsq, tan_P5, tan_P4
- nop.i 999
+(p8) fma.s1 tan_v7 = tan_rsq, tan_P5, tan_P4
+ nop.i 999
}
{ .mfi
nop.m 999
-(p8) fma.s1 tan_v8 = tan_rsq, tan_P7, tan_P6
+(p8) fma.s1 tan_v8 = tan_rsq, tan_P7, tan_P6
nop.i 999 ;;
}
@@ -523,12 +543,12 @@ __tan:
{ .mfi
nop.m 999
-(p9) fnma.s1 tan_d = tan_r, tan_y0, f1
- nop.i 999
+(p9) fnma.s1 tan_d = tan_r, tan_y0, f1
+ nop.i 999
}
{ .mfi
nop.m 999
-(p8) fma.s1 tan_v5 = tan_rsq, tan_P3, tan_P2
+(p8) fma.s1 tan_v5 = tan_rsq, tan_P3, tan_P2
nop.i 999 ;;
}
@@ -536,36 +556,36 @@ __tan:
{ .mfi
nop.m 999
-(p9) fma.s1 tan_z11 = tan_rsq, tan_Q9, tan_Q8
+(p9) fma.s1 tan_z11 = tan_rsq, tan_Q9, tan_Q8
nop.i 999
}
{ .mfi
nop.m 999
-(p9) fma.s1 tan_z12 = tan_rsq, tan_rsq, f0
+(p9) fma.s1 tan_z12 = tan_rsq, tan_rsq, f0
nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p8) fma.s1 tan_v15 = tan_v17, tan_v18, tan_v16
- nop.i 999
+(p8) fma.s1 tan_v15 = tan_v17, tan_v18, tan_v16
+ nop.i 999
}
{ .mfi
nop.m 999
-(p9) fma.s1 tan_z7 = tan_rsq, tan_Q5, tan_Q4
+(p9) fma.s1 tan_z7 = tan_rsq, tan_Q5, tan_Q4
nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p8) fma.s1 tan_v11 = tan_v17, tan_v13, tan_v12
+(p8) fma.s1 tan_v11 = tan_v17, tan_v13, tan_v12
nop.i 999
}
{ .mfi
nop.m 999
-(p9) fma.s1 tan_z8 = tan_rsq, tan_Q7, tan_Q6
+(p9) fma.s1 tan_z8 = tan_rsq, tan_Q7, tan_Q6
nop.i 999 ;;
}
@@ -573,13 +593,13 @@ __tan:
{ .mfi
nop.m 999
-(p8) fma.s1 tan_v14 = tan_v17, tan_v17, f0
- nop.i 999
+(p8) fma.s1 tan_v14 = tan_v17, tan_v17, f0
+ nop.i 999
}
{ .mfi
nop.m 999
-(p9) fma.s1 tan_z3 = tan_rsq, tan_Q1, tan_Q0
- nop.i 999 ;;
+(p9) fma.s1 tan_z3 = tan_rsq, tan_Q1, tan_Q0
+ nop.i 999 ;;
}
@@ -587,12 +607,12 @@ __tan:
{ .mfi
nop.m 999
-(p8) fma.s1 tan_v3 = tan_v17, tan_v5, tan_v4
+(p8) fma.s1 tan_v3 = tan_v17, tan_v5, tan_v4
nop.i 999
}
{ .mfi
nop.m 999
-(p8) fma.s1 tan_v6 = tan_v17, tan_v8, tan_v7
+(p8) fma.s1 tan_v6 = tan_v17, tan_v8, tan_v7
nop.i 999 ;;
}
@@ -600,89 +620,89 @@ __tan:
{ .mfi
nop.m 999
-(p9) fma.s1 tan_y1 = tan_y0, tan_d, tan_y0
- nop.i 999
+(p9) fma.s1 tan_y1 = tan_y0, tan_d, tan_y0
+ nop.i 999
}
{ .mfi
nop.m 999
-(p9) fma.s1 tan_dsq = tan_d, tan_d, f0
- nop.i 999 ;;
+(p9) fma.s1 tan_dsq = tan_d, tan_d, f0
+ nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p9) fma.s1 tan_z10 = tan_z12, tan_Q10, tan_z11
- nop.i 999
+(p9) fma.s1 tan_z10 = tan_z12, tan_Q10, tan_z11
+ nop.i 999
}
{ .mfi
nop.m 999
-(p9) fma.s1 tan_z9 = tan_z12, tan_z12,f0
+(p9) fma.s1 tan_z9 = tan_z12, tan_z12,f0
nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p9) fma.s1 tan_z4 = tan_rsq, tan_Q3, tan_Q2
- nop.i 999
+(p9) fma.s1 tan_z4 = tan_rsq, tan_Q3, tan_Q2
+ nop.i 999
}
{ .mfi
nop.m 999
-(p9) fma.s1 tan_z6 = tan_z12, tan_z8, tan_z7
- nop.i 999 ;;
+(p9) fma.s1 tan_z6 = tan_z12, tan_z8, tan_z7
+ nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p8) fma.s1 tan_v10 = tan_v14, tan_v15, tan_v11
- nop.i 999 ;;
+(p8) fma.s1 tan_v10 = tan_v14, tan_v15, tan_v11
+ nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p9) fma.s1 tan_y2 = tan_y1, tan_d, tan_y0
- nop.i 999
+(p9) fma.s1 tan_y2 = tan_y1, tan_d, tan_y0
+ nop.i 999
}
{ .mfi
nop.m 999
-(p9) fma.s1 tan_d4 = tan_dsq, tan_dsq, tan_d
+(p9) fma.s1 tan_d4 = tan_dsq, tan_dsq, tan_d
nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p8) fma.s1 tan_v2 = tan_v14, tan_v6, tan_v3
+(p8) fma.s1 tan_v2 = tan_v14, tan_v6, tan_v3
nop.i 999
}
{ .mfi
nop.m 999
-(p8) fma.s1 tan_v9 = tan_v14, tan_v14, f0
+(p8) fma.s1 tan_v9 = tan_v14, tan_v14, f0
nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p9) fma.s1 tan_z2 = tan_z12, tan_z4, tan_z3
- nop.i 999
+(p9) fma.s1 tan_z2 = tan_z12, tan_z4, tan_z3
+ nop.i 999
}
{ .mfi
nop.m 999
-(p9) fma.s1 tan_z5 = tan_z9, tan_z10, tan_z6
+(p9) fma.s1 tan_z5 = tan_z9, tan_z10, tan_z6
nop.i 999 ;;
}
{ .mfi
nop.m 999
-(p9) fma.s1 tan_inv_r = tan_d4, tan_y2, tan_y0
- nop.i 999
+(p9) fma.s1 tan_inv_r = tan_d4, tan_y2, tan_y0
+ nop.i 999
}
{ .mfi
nop.m 999
@@ -694,12 +714,12 @@ __tan:
{ .mfi
nop.m 999
-(p8) fma.s1 tan_v1 = tan_v9, tan_v10, tan_v2
- nop.i 999
+(p8) fma.s1 tan_v1 = tan_v9, tan_v10, tan_v2
+ nop.i 999
}
{ .mfi
nop.m 999
-(p9) fma.s1 tan_z1 = tan_z9, tan_z5, tan_z2
+(p9) fma.s1 tan_z1 = tan_z9, tan_z5, tan_z2
nop.i 999 ;;
}
@@ -707,64 +727,150 @@ __tan:
{ .mfi
nop.m 999
-(p8) fma.d.s0 f8 = tan_v1, tan_rcube, tan_r
- nop.i 999
+(p8) fma.d.s0 f8 = tan_v1, tan_rcube, tan_r
+ nop.i 999
}
{ .mfb
nop.m 999
-(p9) fms.d.s0 f8 = tan_r, tan_z1, tan_inv_r
- br.ret.sptk b0 ;;
+(p9) fms.d.s0 f8 = tan_r, tan_z1, tan_inv_r
+ br.ret.sptk b0 ;;
}
-.endp tan#
-ASM_SIZE_DIRECTIVE(tan)
-
+GLOBAL_IEEE754_END(tan)
-.proc __libm_callout
-__libm_callout:
-L(TAN_DBX):
+LOCAL_LIBM_ENTRY(__libm_callout)
+TAN_DBX:
.prologue
{ .mfi
- nop.m 0
- fmerge.s f9 = f0,f0
-.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs
+ nop.m 0
+ fmerge.s f9 = f0,f0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs
}
;;
{ .mfi
- mov GR_SAVE_GP=gp
- nop.f 0
-.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0
+ mov GR_SAVE_GP=gp
+ nop.f 0
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0
}
.body
-{ .mfb
+{ .mmb
nop.m 999
- nop.f 999
- br.call.sptk.many b0=__libm_tan# ;;
+ nop.m 999
+(p11) br.cond.sptk.many call_tanl ;;
}
+// Here if we should call cotl
+{ .mmb
+ nop.m 999
+ nop.m 999
+ br.call.sptk.many b0=__libm_cotl# ;;
+}
{ .mfi
- mov gp = GR_SAVE_GP
- fnorm.d f8 = f8
- mov b0 = GR_SAVE_B0
+ mov gp = GR_SAVE_GP
+ fnorm.d.s0 f8 = f8
+ mov b0 = GR_SAVE_B0
}
;;
+{ .mib
+ nop.m 999
+ mov ar.pfs = GR_SAVE_PFS
+ br.ret.sptk b0
+;;
+}
+
+// Here if we should call tanl
+call_tanl:
+{ .mmb
+ nop.m 999
+ nop.m 999
+ br.call.sptk.many b0=__libm_tanl# ;;
+}
+
+{ .mfi
+ mov gp = GR_SAVE_GP
+ fnorm.d.s0 f8 = f8
+ mov b0 = GR_SAVE_B0
+}
+;;
{ .mib
- nop.m 999
+ nop.m 999
mov ar.pfs = GR_SAVE_PFS
br.ret.sptk b0
;;
}
+LOCAL_LIBM_END(__libm_callout)
+
+.type __libm_tanl#,@function
+.global __libm_tanl#
+.type __libm_cotl#,@function
+.global __libm_cotl#
+
+LOCAL_LIBM_ENTRY(__libm_error_region)
+.prologue
+
+// (1)
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+
+// (2)
+{ .mmi
+ stfd [GR_Parameter_Y] = f1,16 // STORE Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+
+.body
+// (3)
+{ .mib
+ stfd [GR_Parameter_X] = arg_copy // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
+}
+{ .mib
+ stfd [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp
+};;
+
+// (4)
+{ .mmi
+ ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
-.endp __libm_callout
-ASM_SIZE_DIRECTIVE(__libm_callout)
+.type __libm_error_support#,@function
+.global __libm_error_support#
-.type __libm_tan#,@function
-.global __libm_tan#
diff --git a/sysdeps/ia64/fpu/s_tanf.S b/sysdeps/ia64/fpu/s_tanf.S
index a84009e2fe..48f82345f9 100644
--- a/sysdeps/ia64/fpu/s_tanf.S
+++ b/sysdeps/ia64/fpu/s_tanf.S
@@ -1,10 +1,10 @@
-.file "tanf.s"
+.file "tancotf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -32,739 +32,658 @@
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
// History
//==============================================================
-// 2/02/00: Initial version
-// 4/04/00 Unwind support added
+// 02/02/00 Initial version
+// 04/04/00 Unwind support added
// 12/27/00 Improved speed
+// 02/21/01 Updated to call tanl
+// 05/30/02 Improved speed, added cotf.
+// 11/25/02 Added explicit completer on fnorm
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+// 04/17/03 Eliminated redundant stop bits
//
-// API
+// APIs
//==============================================================
-// float tan( float x);
+// float tanf(float)
+// float cotf(float)
//
-// Overview of operation
+// Algorithm Description for tanf
//==============================================================
-// If the input value in radians is |x| >= 1.xxxxx 2^10 call the
-// older slower version.
+// The tanf function computes the principal value of the tangent of x,
+// where x is the argument in radians.
//
-// The new algorithm is used when |x| <= 1.xxxxx 2^9.
+// There are 5 paths:
+// 1. x = +/-0.0
+// Return tanf(x) = +/-0.0
//
-// Represent the input X as Nfloat * pi/2 + r
-// where r can be negative and |r| <= pi/4
+// 2. x = [S,Q]NaN
+// Return tanf(x) = QNaN
//
-// tan_W = x * 2/pi
-// Nfloat = round_int(tan_W)
+// 3. x = +/-Inf
+// Return tanf(x) = QNaN
//
-// tan_r = x - Nfloat * (pi/2)_hi
-// tan_r = tan_r - Nfloat * (pi/2)_lo
+// 4. x = r + (Pi/2)*N, N = RoundInt(x*(2/Pi)), N is even, |r|<Pi/4
+// Return tanf(x) = P19(r) = A1*r + A3*r^3 + A5*r^5 + ... + A19*r^19 =
+// = r*(A1 + A3*t + A5*t^2 + ... + A19*t^9) = r*P9(t), where t = r^2
//
-// We have two paths: p8, when Nfloat is even and p9. when Nfloat is odd.
-// p8: tan(X) = tan(r)
-// p9: tan(X) = -cot(r)
+// 5. x = r + (Pi/2)*N, N = RoundInt(x*(2/Pi)), N is odd, |r|<Pi/4
+// Return tanf(x) = -1/r + P11(r) = -1/r + B1*r + B3*r^3 + ... + B11*r^11 =
+// = -1/r + r*(B1 + B3*t + B5*t^2 + ... + B11*t^5) = -1/r + r*P5(t),
+// where t = r^2
//
-// Each is evaluated as a series. The p9 path requires 1/r.
+// Algorithm Description for cotf
+//==============================================================
+// The cotf function computes the principal value of the cotangent of x,
+// where x is the argument in radians.
//
-// The coefficients used in the series are stored in a table as
-// are the pi constants.
+// There are 5 paths:
+// 1. x = +/-0.0
+// Return cotf(x) = +/-Inf and error handling is called
//
-// Registers used
-//==============================================================
+// 2. x = [S,Q]NaN
+// Return cotf(x) = QNaN
//
-// predicate registers used:
-// p6-10
+// 3. x = +/-Inf
+// Return cotf(x) = QNaN
//
-// floating-point registers used:
-// f10-15, f32-105
+// 4. x = r + (Pi/2)*N, N = RoundInt(x*(2/Pi)), N is odd, |r|<Pi/4
+// Return cotf(x) = P19(-r) = A1*(-r) + A3*(-r^3) + ... + A19*(-r^19) =
+// = -r*(A1 + A3*t + A5*t^2 + ... + A19*t^9) = -r*P9(t), where t = r^2
+//
+// 5. x = r + (Pi/2)*N, N = RoundInt(x*(2/Pi)), N is even, |r|<Pi/4
+// Return cotf(x) = 1/r + P11(-r) = 1/r + B1*(-r) + ... + B11*(-r^11) =
+// = 1/r - r*(B1 + B3*t + B5*t^2 + ... + B11*t^5) = 1/r - r*P5(t),
+// where t = r^2
+//
+// We set p10 and clear p11 if computing tanf, vice versa for cotf.
+//
+//
+// Registers used
+//==============================================================
+// Floating Point registers used:
// f8, input
+// f32 -> f80
//
-// general registers used
-// r14-18, r32-43
+// General registers used:
+// r14 -> r23, r32 -> r39
+//
+// Predicate registers used:
+// p6 -> p13
//
-
-#include "libm_support.h"
-
// Assembly macros
//==============================================================
-TAN_INV_PI_BY_2_2TO64 = f10
-TAN_RSHF_2TO64 = f11
-TAN_2TOM64 = f12
-TAN_RSHF = f13
-TAN_W_2TO64_RSH = f14
-TAN_NFLOAT = f15
-
-tan_Inv_Pi_by_2 = f32
-tan_Pi_by_2_hi = f33
-tan_Pi_by_2_lo = f34
-
-
-tan_P0 = f35
-tan_P1 = f36
-tan_P2 = f37
-tan_P3 = f38
-tan_P4 = f39
-tan_P5 = f40
-tan_P6 = f41
-tan_P7 = f42
-tan_P8 = f43
-tan_P9 = f44
-tan_P10 = f45
-tan_P11 = f46
-tan_P12 = f47
-tan_P13 = f48
-tan_P14 = f49
-tan_P15 = f50
-
-tan_Q0 = f51
-tan_Q1 = f52
-tan_Q2 = f53
-tan_Q3 = f54
-tan_Q4 = f55
-tan_Q5 = f56
-tan_Q6 = f57
-tan_Q7 = f58
-tan_Q8 = f59
-tan_Q9 = f60
-tan_Q10 = f61
-
-tan_r = f62
-tan_rsq = f63
-tan_rcube = f64
-
-tan_v18 = f65
-tan_v16 = f66
-tan_v17 = f67
-tan_v12 = f68
-tan_v13 = f69
-tan_v7 = f70
-tan_v8 = f71
-tan_v4 = f72
-tan_v5 = f73
-tan_v15 = f74
-tan_v11 = f75
-tan_v14 = f76
-tan_v3 = f77
-tan_v6 = f78
-tan_v10 = f79
-tan_v2 = f80
-tan_v9 = f81
-tan_v1 = f82
-tan_int_Nfloat = f83
-tan_Nfloat = f84
-
-tan_NORM_f8 = f85
-tan_W = f86
-
-tan_y0 = f87
-tan_d = f88
-tan_y1 = f89
-tan_dsq = f90
-tan_y2 = f91
-tan_d4 = f92
-tan_inv_r = f93
-
-tan_z1 = f94
-tan_z2 = f95
-tan_z3 = f96
-tan_z4 = f97
-tan_z5 = f98
-tan_z6 = f99
-tan_z7 = f100
-tan_z8 = f101
-tan_z9 = f102
-tan_z10 = f103
-tan_z11 = f104
-tan_z12 = f105
-
-
-/////////////////////////////////////////////////////////////
-
-tan_GR_sig_inv_pi_by_2 = r14
-tan_GR_rshf_2to64 = r15
-tan_GR_exp_2tom64 = r16
-tan_GR_n = r17
-tan_GR_rshf = r18
-
-tan_AD = r33
-tan_GR_10009 = r34
-tan_GR_17_ones = r35
-tan_GR_N_odd_even = r36
-tan_GR_N = r37
-tan_signexp = r38
-tan_exp = r39
-tan_ADQ = r40
-
-GR_SAVE_PFS = r41
-GR_SAVE_B0 = r42
-GR_SAVE_GP = r43
-
-
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
+// integer registers
+rExp = r14
+rSignMask = r15
+rRshf = r16
+rScFctrExp = r17
+rIntN = r18
+rSigRcpPiby2 = r19
+rScRshf = r20
+rCoeffA = r21
+rCoeffB = r22
+rExpCut = r23
+
+GR_SAVE_B0 = r33
+GR_SAVE_PFS = r34
+GR_SAVE_GP = r35
+GR_Parameter_X = r36
+GR_Parameter_Y = r37
+GR_Parameter_RESULT = r38
+GR_Parameter_Tag = r39
+
+//==============================================================
+// floating point registers
+fScRcpPiby2 = f32
+fScRshf = f33
+fNormArg = f34
+fScFctr = f35
+fRshf = f36
+fShiftedN = f37
+fN = f38
+fR = f39
+fA01 = f40
+fA03 = f41
+fA05 = f42
+fA07 = f43
+fA09 = f44
+fA11 = f45
+fA13 = f46
+fA15 = f47
+fA17 = f48
+fA19 = f49
+fB01 = f50
+fB03 = f51
+fB05 = f52
+fB07 = f53
+fB09 = f54
+fB11 = f55
+fA03_01 = f56
+fA07_05 = f57
+fA11_09 = f58
+fA15_13 = f59
+fA19_17 = f60
+fA11_05 = f61
+fA19_13 = f62
+fA19_05 = f63
+fRbyA03_01 = f64
+fB03_01 = f65
+fB07_05 = f66
+fB11_09 = f67
+fB11_05 = f68
+fRbyB03_01 = f69
+fRbyB11_01 = f70
+fRp2 = f71
+fRp4 = f72
+fRp8 = f73
+fRp5 = f74
+fY0 = f75
+fY1 = f76
+fD = f77
+fDp2 = f78
+fInvR = f79
+fPiby2 = f80
+//==============================================================
-.align 16
-double_tan_constants:
-ASM_TYPE_DIRECTIVE(double_tan_constants,@object)
-// data8 0xA2F9836E4E44152A, 0x00003FFE // 2/pi
- data8 0xC90FDAA22168C234, 0x00003FFF // pi/2 hi
-
- data8 0xBEEA54580DDEA0E1 // P14
- data8 0x3ED3021ACE749A59 // P15
- data8 0xBEF312BD91DC8DA1 // P12
- data8 0x3EFAE9AFC14C5119 // P13
- data8 0x3F2F342BF411E769 // P8
- data8 0x3F1A60FC9F3B0227 // P9
- data8 0x3EFF246E78E5E45B // P10
- data8 0x3F01D9D2E782875C // P11
- data8 0x3F8226E34C4499B6 // P4
- data8 0x3F6D6D3F12C236AC // P5
- data8 0x3F57DA1146DCFD8B // P6
- data8 0x3F43576410FE3D75 // P7
- data8 0x3FD5555555555555 // P0
- data8 0x3FC11111111111C2 // P1
- data8 0x3FABA1BA1BA0E850 // P2
- data8 0x3F9664F4886725A7 // P3
-ASM_SIZE_DIRECTIVE(double_tan_constants)
-
-double_Q_tan_constants:
-ASM_TYPE_DIRECTIVE(double_Q_tan_constants,@object)
- data8 0xC4C6628B80DC1CD1, 0x00003FBF // pi/2 lo
- data8 0x3E223A73BA576E48 // Q8
- data8 0x3DF54AD8D1F2CA43 // Q9
- data8 0x3EF66A8EE529A6AA // Q4
- data8 0x3EC2281050410EE6 // Q5
- data8 0x3E8D6BB992CC3CF5 // Q6
- data8 0x3E57F88DE34832E4 // Q7
- data8 0x3FD5555555555555 // Q0
- data8 0x3F96C16C16C16DB8 // Q1
- data8 0x3F61566ABBFFB489 // Q2
- data8 0x3F2BBD77945C1733 // Q3
- data8 0x3D927FB33E2B0E04 // Q10
-ASM_SIZE_DIRECTIVE(double_Q_tan_constants)
-
-
-
-.align 32
-.global tanf#
-#ifdef _LIBC
-.global __tanf#
-#endif
-
-////////////////////////////////////////////////////////
+RODATA
+.align 16
+LOCAL_OBJECT_START(coeff_A)
+data8 0x3FF0000000000000 // A1 = 1.00000000000000000000e+00
+data8 0x3FD5555556BCE758 // A3 = 3.33333334641442641606e-01
+data8 0x3FC111105C2DAE48 // A5 = 1.33333249100689099175e-01
+data8 0x3FABA1F876341060 // A7 = 5.39701122561673229739e-02
+data8 0x3F965FB86D12A38D // A9 = 2.18495194027670719750e-02
+data8 0x3F8265F62415F9D6 // A11 = 8.98353860497717439465e-03
+data8 0x3F69E3AE64CCF58D // A13 = 3.16032468108912746342e-03
+data8 0x3F63920D09D0E6F6 // A15 = 2.38897844840557235331e-03
+LOCAL_OBJECT_END(coeff_A)
+
+LOCAL_OBJECT_START(coeff_B)
+data8 0xC90FDAA22168C235, 0x3FFF // pi/2
+data8 0x3FD55555555358DB // B1 = 3.33333333326107426583e-01
+data8 0x3F96C16C252F643F // B3 = 2.22222230621336129239e-02
+data8 0x3F61566243AB3C60 // B5 = 2.11638633968606896785e-03
+data8 0x3F2BC1169BD4438B // B7 = 2.11748132564551094391e-04
+data8 0x3EF611B4CEA056A1 // B9 = 2.10467959860990200942e-05
+data8 0x3EC600F9E32194BF // B11 = 2.62305891234274186608e-06
+data8 0xBF42BA7BCC177616 // A17 =-5.71546981685324877205e-04
+data8 0x3F4F2614BC6D3BB8 // A19 = 9.50584530849832782542e-04
+LOCAL_OBJECT_END(coeff_B)
.section .text
-.proc tanf#
-#ifdef _LIBC
-.proc __tanf#
-#endif
-.align 32
-tanf:
-#ifdef _LIBC
-__tanf:
-#endif
-// The initial fnorm will take any unmasked faults and
-// normalize any single/double unorms
+
+LOCAL_LIBM_ENTRY(cotf)
{ .mlx
- alloc r32=ar.pfs,1,11,0,0
- movl tan_GR_sig_inv_pi_by_2 = 0xA2F9836E4E44152A // significand of 2/pi
+ getf.exp rExp = f8 // ***** Get 2^17 * s + E
+ movl rSigRcpPiby2= 0xA2F9836E4E44152A // significand of 2/Pi
}
{ .mlx
- addl tan_AD = @ltoff(double_tan_constants), gp
- movl tan_GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+63+1)
+ addl rCoeffA = @ltoff(coeff_A), gp
+ movl rScRshf = 0x47e8000000000000 // 1.5*2^(63+63+1)
}
;;
{ .mfi
- ld8 tan_AD = [tan_AD]
- fnorm tan_NORM_f8 = f8
- mov tan_GR_exp_2tom64 = 0xffff-64 // exponent of scaling factor 2^-64
+ alloc r32 = ar.pfs, 0, 4, 4, 0
+ fclass.m p9, p0 = f8, 0xc3 // Test for x=nan
+ cmp.eq p11, p10 = r0, r0 // if p11=1 we compute cotf
}
-{ .mlx
- nop.m 999
- movl tan_GR_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift
+{ .mib
+ ld8 rCoeffA = [rCoeffA]
+ mov rExpCut = 0x10009 // cutoff for exponent
+ br.cond.sptk Common_Path
}
;;
+LOCAL_LIBM_END(cotf)
-// Form two constants we need
-// 2/pi * 2^1 * 2^63, scaled by 2^64 since we just loaded the significand
-// 1.1000...000 * 2^(63+63+1) to right shift int(W) into the significand
-{ .mmi
- setf.sig TAN_INV_PI_BY_2_2TO64 = tan_GR_sig_inv_pi_by_2
- setf.d TAN_RSHF_2TO64 = tan_GR_rshf_2to64
- mov tan_GR_17_ones = 0x1ffff ;;
-}
-
+GLOBAL_IEEE754_ENTRY(tanf)
-// Form another constant
-// 2^-64 for scaling Nfloat
-// 1.1000...000 * 2^63, the right shift constant
-{ .mmf
- setf.exp TAN_2TOM64 = tan_GR_exp_2tom64
- adds tan_ADQ = double_Q_tan_constants - double_tan_constants, tan_AD
- fclass.m.unc p6,p0 = f8, 0x07 // Test for x=0
+{ .mlx
+ getf.exp rExp = f8 // ***** Get 2^17 * s + E
+ movl rSigRcpPiby2= 0xA2F9836E4E44152A // significand of 2/Pi
}
-;;
-
-
-// Form another constant
-// 2^-64 for scaling Nfloat
-// 1.1000...000 * 2^63, the right shift constant
-{ .mmf
- setf.d TAN_RSHF = tan_GR_rshf
- ldfe tan_Pi_by_2_hi = [tan_AD],16
- fclass.m.unc p7,p0 = f8, 0x23 // Test for x=inf
+{ .mlx
+ addl rCoeffA = @ltoff(coeff_A), gp
+ movl rScRshf = 0x47e8000000000000 // 1.5*2^(63+63+1)
}
;;
-{ .mfb
- ldfe tan_Pi_by_2_lo = [tan_ADQ],16
- fclass.m.unc p8,p0 = f8, 0xc3 // Test for x=nan
-(p6) br.ret.spnt b0 ;; // Exit for x=0
-}
-
{ .mfi
- ldfpd tan_P14,tan_P15 = [tan_AD],16
-(p7) frcpa.s0 f8,p9=f0,f0 // Set qnan indef if x=inf
- mov tan_GR_10009 = 0x10009
+ alloc r32 = ar.pfs, 0, 4, 4, 0
+ fclass.m p9, p0 = f8, 0xc3 // Test for x=nan
+ cmp.eq p10, p11 = r0, r0 // if p10=1 we compute tanf
}
{ .mib
- ldfpd tan_Q8,tan_Q9 = [tan_ADQ],16
- nop.i 999
-(p7) br.ret.spnt b0 ;; // Exit for x=inf
+ ld8 rCoeffA = [rCoeffA]
+ mov rExpCut = 0x10009 // cutoff for exponent
+ nop.b 0
}
+;;
+// Below is the common path for both tanf and cotf
+Common_Path:
{ .mfi
- ldfpd tan_P12,tan_P13 = [tan_AD],16
-(p8) fma.s f8=f8,f1,f8 // Set qnan if x=nan
- nop.i 999
+ setf.sig fScRcpPiby2 = rSigRcpPiby2 // 2^(63+1)*(2/Pi)
+ fclass.m p8, p0 = f8, 0x23 // Test for x=inf
+ mov rSignMask = 0x1ffff // 17 ones: mask that strips the sign bit
}
-{ .mib
- ldfpd tan_Q4,tan_Q5 = [tan_ADQ],16
- nop.i 999
-(p8) br.ret.spnt b0 ;; // Exit for x=nan
+{ .mlx
+ setf.d fScRshf = rScRshf // 1.5*2^(63+63+1)
+ movl rRshf = 0x43e8000000000000 // 1.5 2^63 for right shift
}
+;;
-{ .mmi
- getf.exp tan_signexp = tan_NORM_f8
- ldfpd tan_P8,tan_P9 = [tan_AD],16
- nop.i 999 ;;
+{ .mfi
+ and rSignMask = rSignMask, rExp // clear sign bit
+(p10) fclass.m.unc p7, p0 = f8, 0x07 // Test for x=0 (for tanf)
+ mov rScFctrExp = 0xffff-64 // exp of scaling factor
+}
+{ .mfb
+ adds rCoeffB = coeff_B - coeff_A, rCoeffA
+(p9) fma.s.s0 f8 = f8, f1, f8 // Set qnan if x=nan
+(p9) br.ret.spnt b0 // Exit for x=nan
}
+;;
-// Multiply x by scaled 2/pi and add large const to shift integer part of W to
-// rightmost bits of significand
{ .mfi
- ldfpd tan_Q6,tan_Q7 = [tan_ADQ],16
- fma.s1 TAN_W_2TO64_RSH = tan_NORM_f8,TAN_INV_PI_BY_2_2TO64,TAN_RSHF_2TO64
- nop.i 999 ;;
+ cmp.ge p6, p0 = rSignMask, rExpCut // p6 = (E >= 0x10009)
+(p8) frcpa.s0 f8, p0 = f0, f0 // Set qnan indef if x=inf
+ mov GR_Parameter_Tag = 227 // (cotf)
}
-
-{ .mmi
- ldfpd tan_P10,tan_P11 = [tan_AD],16
- nop.m 999
- and tan_exp = tan_GR_17_ones, tan_signexp ;;
+{ .mbb
+ ldfe fPiby2 = [rCoeffB], 16
+(p8) br.ret.spnt b0 // Exit for x=inf
+(p6) br.cond.spnt Huge_Argument // Branch if |x|>=2^10
}
+;;
+{ .mfi
+ nop.m 0
+(p11) fclass.m.unc p6, p0 = f8, 0x07 // Test for x=0 (for cotf)
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ fnorm.s0 fNormArg = f8
+(p7) br.ret.spnt b0 // Exit for x=0 (for tanf)
+}
+;;
-// p7 is true if we must call DBX TAN
-// p7 is true if f8 exp is > 0x10009 (which includes all ones
-// NAN or inf)
-{ .mmi
- ldfpd tan_Q0,tan_Q1 = [tan_ADQ],16
- cmp.ge.unc p7,p0 = tan_exp,tan_GR_10009
- nop.i 999 ;;
+{ .mmf
+ ldfpd fA01, fA03 = [rCoeffA], 16
+ ldfpd fB01, fB03 = [rCoeffB], 16
+ fmerge.s f10 = f8, f8 // Save input for error call
}
+;;
+{ .mmf
+ setf.exp fScFctr = rScFctrExp // get as real
+ setf.d fRshf = rRshf // get right shifter as real
+(p6) frcpa.s0 f8, p0 = f1, f8 // cotf(+-0) = +-Inf
+}
+;;
{ .mmb
- ldfpd tan_P4,tan_P5 = [tan_AD],16
- nop.m 999
-(p7) br.cond.spnt L(TAN_DBX) ;;
+ ldfpd fA05, fA07 = [rCoeffA], 16
+ ldfpd fB05, fB07 = [rCoeffB], 16
+(p6) br.cond.spnt __libm_error_region // call error support if cotf(+-0)
}
-
+;;
{ .mmi
- ldfpd tan_Q2,tan_Q3 = [tan_ADQ],16
- nop.m 999
- nop.i 999 ;;
-}
-
-
-
-// TAN_NFLOAT = Round_Int_Nearest(tan_W)
-{ .mfi
- ldfpd tan_P6,tan_P7 = [tan_AD],16
- fms.s1 TAN_NFLOAT = TAN_W_2TO64_RSH,TAN_2TOM64,TAN_RSHF
- nop.i 999 ;;
+ ldfpd fA09, fA11 = [rCoeffA], 16
+ ldfpd fB09, fB11 = [rCoeffB], 16
+ nop.i 0
}
-
+;;
{ .mfi
- ldfd tan_Q10 = [tan_ADQ]
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fShiftedN = fNormArg,fScRcpPiby2,fScRshf // x*2^64*(2/Pi)+ScRshf
+ nop.i 0
}
-
+;;
{ .mfi
- ldfpd tan_P0,tan_P1 = [tan_AD],16
- nop.f 999
- nop.i 999 ;;
+ nop.m 0
+ fms.s1 fN = fShiftedN, fScFctr, fRshf // N = Y*2^(-64) - Rshf
+ nop.i 0
}
+;;
-
+.pred.rel "mutex", p10, p11
{ .mfi
- getf.sig tan_GR_n = TAN_W_2TO64_RSH
- nop.f 999
- nop.i 999 ;;
+ getf.sig rIntN = fShiftedN // get N as integer
+(p10) fnma.s1 fR = fN, fPiby2, fNormArg // R = x - (Pi/2)*N (tanf)
+ nop.i 0
}
-
-// tan_r = -tan_Nfloat * tan_Pi_by_2_hi + x
{ .mfi
- ldfpd tan_P2,tan_P3 = [tan_AD]
- fnma.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_hi, tan_NORM_f8
- nop.i 999 ;;
+ nop.m 0
+(p11) fms.s1 fR = fN, fPiby2, fNormArg // R = (Pi/2)*N - x (cotf)
+ nop.i 0
}
+;;
-
-// p8 ==> even
-// p9 ==> odd
{ .mmi
- and tan_GR_N_odd_even = 0x1, tan_GR_n ;;
- nop.m 999
- cmp.eq.unc p8,p9 = tan_GR_N_odd_even, r0 ;;
+ ldfpd fA13, fA15 = [rCoeffA], 16
+ ldfpd fA17, fA19 = [rCoeffB], 16
+ nop.i 0
}
+;;
-
-// tan_r = tan_r -tan_Nfloat * tan_Pi_by_2_lo
-{ .mfi
- nop.m 999
- fnma.s1 tan_r = TAN_NFLOAT, tan_Pi_by_2_lo, tan_r
- nop.i 999 ;;
-}
-
-
+Return_From_Huges:
{ .mfi
- nop.m 999
- fma.s1 tan_rsq = tan_r, tan_r, f0
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fRp2 = fR, fR, f0 // R^2
+(p11) add rIntN = 0x1, rIntN // N = N + 1 (cotf)
}
-
+;;
{ .mfi
- nop.m 999
-(p9) frcpa.s1 tan_y0, p10 = f1,tan_r
- nop.i 999 ;;
+ nop.m 0
+ frcpa.s1 fY0, p0 = f1, fR // Y0 ~ 1/R
+ tbit.z p8, p9 = rIntN, 0 // p8=1 if N is even
}
+;;
-
+// Below are mixed polynomial calculations (mixed for even and odd N)
{ .mfi
- nop.m 999
-(p8) fma.s1 tan_v18 = tan_rsq, tan_P15, tan_P14
- nop.i 999
+ nop.m 0
+(p9) fma.s1 fB03_01 = fRp2, fB03, fB01 // R^2*B3 + B1
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p8) fma.s1 tan_v4 = tan_rsq, tan_P1, tan_P0
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fRp4 = fRp2, fRp2, f0 // R^4
+ nop.i 0
}
-
-
+;;
{ .mfi
- nop.m 999
-(p8) fma.s1 tan_v16 = tan_rsq, tan_P13, tan_P12
- nop.i 999
+ nop.m 0
+(p8) fma.s1 fA15_13 = fRp2, fA15, fA13 // R^2*A15 + A13
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p8) fma.s1 tan_v17 = tan_rsq, tan_rsq, f0
- nop.i 999 ;;
+ nop.m 0
+(p8) fma.s1 fA19_17 = fRp2, fA19, fA17 // R^2*A19 + A17
+ nop.i 0
}
-
-
+;;
{ .mfi
- nop.m 999
-(p8) fma.s1 tan_v12 = tan_rsq, tan_P9, tan_P8
- nop.i 999
+ nop.m 0
+(p8) fma.s1 fA07_05 = fRp2, fA07, fA05 // R^2*A7 + A5
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p8) fma.s1 tan_v13 = tan_rsq, tan_P11, tan_P10
- nop.i 999 ;;
+ nop.m 0
+(p8) fma.s1 fA11_09 = fRp2, fA11, fA09 // R^2*A11 + A9
+ nop.i 0
}
-
-
+;;
{ .mfi
- nop.m 999
-(p8) fma.s1 tan_v7 = tan_rsq, tan_P5, tan_P4
- nop.i 999
+ nop.m 0
+(p9) fma.s1 fB07_05 = fRp2, fB07, fB05 // R^2*B7 + B5
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p8) fma.s1 tan_v8 = tan_rsq, tan_P7, tan_P6
- nop.i 999 ;;
+ nop.m 0
+(p9) fma.s1 fB11_09 = fRp2, fB11, fB09 // R^2*B11 + B9
+ nop.i 0
}
-
-
+;;
{ .mfi
- nop.m 999
-(p9) fnma.s1 tan_d = tan_r, tan_y0, f1
- nop.i 999
+ nop.m 0
+(p9) fnma.s1 fD = fR, fY0, f1 // D = 1 - R*Y0
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p8) fma.s1 tan_v5 = tan_rsq, tan_P3, tan_P2
- nop.i 999 ;;
+ nop.m 0
+(p8) fma.s1 fA03_01 = fRp2, fA03, fA01 // R^2*A3 + A1
+ nop.i 0
}
-
-
+;;
{ .mfi
- nop.m 999
-(p9) fma.s1 tan_z11 = tan_rsq, tan_Q9, tan_Q8
- nop.i 999
+ nop.m 0
+ fma.s1 fRp8 = fRp4, fRp4, f0 // R^8
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p9) fma.s1 tan_z12 = tan_rsq, tan_rsq, f0
- nop.i 999 ;;
+ nop.m 0
+ fma.s1 fRp5 = fR, fRp4, f0 // R^5
+ nop.i 0
}
-
+;;
{ .mfi
- nop.m 999
-(p8) fma.s1 tan_v15 = tan_v17, tan_v18, tan_v16
- nop.i 999
+ nop.m 0
+(p8) fma.s1 fA11_05 = fRp4, fA11_09, fA07_05 // R^4*(R^2*A11 + A9) + ...
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p9) fma.s1 tan_z7 = tan_rsq, tan_Q5, tan_Q4
- nop.i 999 ;;
+ nop.m 0
+(p8) fma.s1 fA19_13 = fRp4, fA19_17, fA15_13 // R^4*(R^2*A19 + A17) + ..
+ nop.i 0
}
-
+;;
{ .mfi
- nop.m 999
-(p8) fma.s1 tan_v11 = tan_v17, tan_v13, tan_v12
- nop.i 999
+ nop.m 0
+(p9) fma.s1 fB11_05 = fRp4, fB11_09, fB07_05 // R^4*(R^2*B11 + B9) + ...
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p9) fma.s1 tan_z8 = tan_rsq, tan_Q7, tan_Q6
- nop.i 999 ;;
+ nop.m 0
+(p9) fma.s1 fRbyB03_01 = fR, fB03_01, f0 // R*(R^2*B3 + B1)
+ nop.i 0
}
-
-
+;;
{ .mfi
- nop.m 999
-(p8) fma.s1 tan_v14 = tan_v17, tan_v17, f0
- nop.i 999
+ nop.m 0
+(p9) fma.s1 fY1 = fY0, fD, fY0 // Y1 = Y0*D + Y0
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p9) fma.s1 tan_z3 = tan_rsq, tan_Q1, tan_Q0
- nop.i 999 ;;
+ nop.m 0
+(p9) fma.s1 fDp2 = fD, fD, f0 // D^2
+ nop.i 0
}
-
-
-
+;;
{ .mfi
- nop.m 999
-(p8) fma.s1 tan_v3 = tan_v17, tan_v5, tan_v4
- nop.i 999
+ nop.m 0
+ // R^8*(R^6*A19 + R^4*A17 + R^2*A15 + A13) + R^6*A11 + R^4*A9 + R^2*A7 + A5
+(p8) fma.d.s1 fA19_05 = fRp8, fA19_13, fA11_05
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p8) fma.s1 tan_v6 = tan_v17, tan_v8, tan_v7
- nop.i 999 ;;
+ nop.m 0
+(p8) fma.d.s1 fRbyA03_01 = fR, fA03_01, f0 // R*(R^2*A3 + A1)
+ nop.i 0
}
-
-
+;;
{ .mfi
- nop.m 999
-(p9) fma.s1 tan_y1 = tan_y0, tan_d, tan_y0
- nop.i 999
+ nop.m 0
+(p9) fma.d.s1 fInvR = fY1, fDp2, fY1 // 1/R = Y1*D^2 + Y1
+ nop.i 0
}
{ .mfi
- nop.m 999
-(p9) fma.s1 tan_dsq = tan_d, tan_d, f0
- nop.i 999 ;;
+ nop.m 0
+ // R^5*(R^6*B11 + R^4*B9 + R^2*B7 + B5) + R^3*B3 + R*B1
+(p9) fma.d.s1 fRbyB11_01 = fRp5, fB11_05, fRbyB03_01
+ nop.i 0
}
+;;
-
+.pred.rel "mutex", p8, p9
{ .mfi
- nop.m 999
-(p9) fma.s1 tan_z10 = tan_z12, tan_Q10, tan_z11
- nop.i 999
+ nop.m 0
+ // Result = R^5*(R^14*A19 + R^12*A17 + R^10*A15 + ...) + R^3*A3 + R*A1
+(p8) fma.s.s0 f8 = fRp5, fA19_05, fRbyA03_01
+ nop.i 0
}
-{ .mfi
- nop.m 999
-(p9) fma.s1 tan_z9 = tan_z12, tan_z12,f0
- nop.i 999 ;;
+{ .mfb
+ nop.m 0
+ // Result = -1/R + R^11*B11 + R^9*B9 + R^7*B7 + R^5*B5 + R^3*B3 + R*B1
+(p9) fnma.s.s0 f8 = f1, fInvR, fRbyB11_01
+ br.ret.sptk b0 // exit for main path
}
+;;
+GLOBAL_IEEE754_END(tanf)
+
+LOCAL_LIBM_ENTRY(__libm_callout)
+Huge_Argument:
+.prologue
{ .mfi
- nop.m 999
-(p9) fma.s1 tan_z4 = tan_rsq, tan_Q3, tan_Q2
- nop.i 999
-}
-{ .mfi
- nop.m 999
-(p9) fma.s1 tan_z6 = tan_z12, tan_z8, tan_z7
- nop.i 999 ;;
+ nop.m 0
+ fmerge.s f9 = f0,f0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs
}
-
-
+;;
{ .mfi
- nop.m 999
-(p8) fma.s1 tan_v10 = tan_v14, tan_v15, tan_v11
- nop.i 999 ;;
+ mov GR_SAVE_GP=gp
+ nop.f 0
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0
}
-
-
-{ .mfi
+.body
+{ .mmb
nop.m 999
-(p9) fma.s1 tan_y2 = tan_y1, tan_d, tan_y0
- nop.i 999
-}
-{ .mfi
nop.m 999
-(p9) fma.s1 tan_d4 = tan_dsq, tan_dsq, tan_d
- nop.i 999 ;;
+(p10) br.cond.sptk.many call_tanl ;;
}
-
-{ .mfi
+// Here if we should call cotl (p10=0, p11=1)
+{ .mmb
nop.m 999
-(p8) fma.s1 tan_v2 = tan_v14, tan_v6, tan_v3
- nop.i 999
-}
-{ .mfi
nop.m 999
-(p8) fma.s1 tan_v9 = tan_v14, tan_v14, f0
- nop.i 999 ;;
+ br.call.sptk.many b0=__libm_cotl# ;;
}
-
{ .mfi
- nop.m 999
-(p9) fma.s1 tan_z2 = tan_z12, tan_z4, tan_z3
- nop.i 999
+ mov gp = GR_SAVE_GP
+ fnorm.s.s0 f8 = f8
+ mov b0 = GR_SAVE_B0
}
-{ .mfi
+;;
+
+{ .mib
nop.m 999
-(p9) fma.s1 tan_z5 = tan_z9, tan_z10, tan_z6
- nop.i 999 ;;
+ mov ar.pfs = GR_SAVE_PFS
+ br.ret.sptk b0
+;;
}
-
-{ .mfi
+// Here if we should call tanl (p10=1, p11=0)
+call_tanl:
+{ .mmb
nop.m 999
-(p9) fma.s1 tan_inv_r = tan_d4, tan_y2, tan_y0
- nop.i 999
-}
-{ .mfi
nop.m 999
-(p8) fma.s1 tan_rcube = tan_rsq, tan_r, f0
- nop.i 999 ;;
+ br.call.sptk.many b0=__libm_tanl# ;;
}
-
-
{ .mfi
- nop.m 999
-(p8) fma.s1 tan_v1 = tan_v9, tan_v10, tan_v2
- nop.i 999
+ mov gp = GR_SAVE_GP
+ fnorm.s.s0 f8 = f8
+ mov b0 = GR_SAVE_B0
}
-{ .mfi
+;;
+
+{ .mib
nop.m 999
-(p9) fma.s1 tan_z1 = tan_z9, tan_z5, tan_z2
- nop.i 999 ;;
+ mov ar.pfs = GR_SAVE_PFS
+ br.ret.sptk b0
+;;
}
+LOCAL_LIBM_END(__libm_callout)
-
-{ .mfi
- nop.m 999
-(p8) fma.s.s0 f8 = tan_v1, tan_rcube, tan_r
- nop.i 999
-}
-{ .mfb
- nop.m 999
-(p9) fms.s.s0 f8 = tan_r, tan_z1, tan_inv_r
- br.ret.sptk b0 ;;
-}
-.endp tanf#
-ASM_SIZE_DIRECTIVE(tanf#)
+.type __libm_tanl#,@function
+.global __libm_tanl#
+.type __libm_cotl#,@function
+.global __libm_cotl#
-.proc __libm_callout
-__libm_callout:
-L(TAN_DBX):
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
+// (1)
{ .mfi
- nop.m 0
- fmerge.s f9 = f0,f0
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
+ nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
-;;
-
{ .mfi
- mov GR_SAVE_GP=gp
- nop.f 0
+.fframe 64
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+
+// (2)
+{ .mmi
+ stfs [GR_Parameter_Y] = f1,16 // STORE Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0
-}
+ mov GR_SAVE_B0=b0 // Save b0
+};;
.body
-{ .mfb
- nop.m 999
- nop.f 999
- br.call.sptk.many b0=__libm_tan# ;;
-}
-
-
-{ .mfi
- mov gp = GR_SAVE_GP
- fnorm.s f8 = f8
- mov b0 = GR_SAVE_B0
+// (3)
+{ .mib
+ stfs [GR_Parameter_X] = f10 // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
}
-;;
-
+{ .mib
+ stfs [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp
+};;
+// (4)
+{ .mmi
+ ldfs f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
{ .mib
- nop.m 999
- mov ar.pfs = GR_SAVE_PFS
- br.ret.sptk b0
-;;
-}
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+LOCAL_LIBM_END(__libm_error_region)
-.endp __libm_callout
-ASM_SIZE_DIRECTIVE(__libm_callout)
+.type __libm_error_support#,@function
+.global __libm_error_support#
-.type __libm_tan#,@function
-.global __libm_tan#
diff --git a/sysdeps/ia64/fpu/s_tanh.S b/sysdeps/ia64/fpu/s_tanh.S
new file mode 100644
index 0000000000..c8583980fb
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_tanh.S
@@ -0,0 +1,987 @@
+.file "tanh.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================================
+// 05/30/01 Initial version
+// 12/04/01 Rewritten version with erf-like algorithm.
+// Performance improved.
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 08/14/02 Changed mli templates to mlx
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================================
+// double tanh(double)
+//
+// Overview of operation
+//==============================================================================
+//
+// Algorithm description
+// ---------------------
+//
+// There are 4 paths:
+//
+// 1. Special path: x = 0, Inf, NaNs, denormals
+// Return tanh(x) = +/-0.0 for zeros
+// Return tanh(x) = QNaN for NaNs
+// Return tanh(x) = sign(x)*1.0 for Inf
+// Return tanh(x) = x + x^2 for - denormals
+// Return tanh(x) = x - x^2 for + denormals
+//
+// 2. Near zero path: 0.0 < |x| < 0.25
+// Return tanh(x) = x + x^3*A3 + ... + x^19*A19
+//
+// 3. Main path: 0.25 <= |x| < 19.0625
+// For several ranges of 0.25 <= |x| < 19.0625
+// Return tanh(x) = sign(x)*(A0 + y*A1 + y^2*A2 +
+// + y^3*A3 + ... + y^19*A19)
+// where y = (|x|/a) - b
+//
+// For each range there is particular set of coefficients.
+// Below is the list of ranges:
+// 1/4 <= |x| < 1/2 a = 0.25, b = 1.0
+// 1/2 <= |x| < 1.0 a = 0.5, b = 1.0
+// 1.0 <= |x| < 2.0 a = 1.0, b = 1.0
+// 2.0 <= |x| < 3.25 a = 2.0, b = 1.0
+// 3.25 <= |x| < 4.0 a = 2.0, b = 2.0
+// 4.0 <= |x| < 6.5 a = 4.0, b = 1.0
+// 6.5 <= |x| < 8.0 a = 4.0, b = 2.0
+// 8.0 <= |x| < 13.0 a = 8.0, b = 1.0
+// 13.0 <= |x| < 16.0 a = 8.0, b = 2.0
+// 16.0 <= |x| < 19.0625 a = 16.0, b = 1.0
+// ( the [3.25;4.0], [6.5;8.0], [13.0;16.0] subranges are handled
+// separately to resolve monotonicity issues )
+//
+// 4. Saturation path: 19.0625 <= |x| < +INF
+// Return tanh(x) = sign(x)*(1.0 - tiny_value)
+// (tiny_value ~ 2^(-63))
+//
+// Registers used
+//==============================================================================
+// Floating Point registers used:
+// f8 = input, output
+// f32 -> f64
+//
+// General registers used:
+// r32 -> r51, r2, r3
+//
+// Predicate registers used:
+// p6, p8, p10, p11, p12, p14, p15
+// p6 arg is zero, denormal or special IEEE
+// p8 to filter out case when signd(x) > 1.625
+// p10 to filter out case when |x| < 0.25
+// p11 to filter out case when signd(x) <= 1.625
+// p12 to filter out case when |x| >= 19.0625
+// p14 set to 1 for positive x
+// p15 set to 1 for negative x
+
+// Assembly macros
+//==============================================================================
+rDataPtr = r2
+rDataPtr1 = r3
+
+rBias = r33
+rCoeffAddr3 = r34
+rThreeAndQ = r35
+rCoeffAddr2 = r36
+rMask = r37
+rArg = r38
+rSignBit = r39
+rAbsArg = r40
+rSaturation = r41
+rIndex = r42
+rCoeffAddr1 = r43
+rCoeffAddr4 = r44
+rShiftedArg = r45
+rShiftedArgMasked = r46
+rBiasedExpOf4 = r47
+rShiftedAbsArg = r48
+rArgSgnd = r49
+r1625Sgnd = r50
+rTwo = r51
+
+//==============================================================================
+fA0 = f32
+fA1 = f33
+fA2 = f34
+fA3 = f35
+fA4 = f36
+fA5 = f37
+fA6 = f38
+fA7 = f39
+fA8 = f40
+fA9 = f41
+fA10 = f42
+fA11 = f43
+fA12 = f44
+fA13 = f45
+fA14 = f46
+fA15 = f47
+fA16 = f48
+fA17 = f49
+fA18 = f50
+fA19 = f51
+fArgSqr = f52
+fArgAbsNorm = f53
+fSignumX = f54
+fRes = f55
+fThreeAndQ = f56
+fArgAbs = f57
+fTSqr = f58
+fTQuadr = f59
+fTDeg3 = f60
+fTDeg7 = f61
+fArgAbsNormSgn = f62
+fTQuadrSgn = f63
+fTwo = f64
+
+// Data tables
+//==============================================================================
+RODATA
+
+.align 16
+
+LOCAL_OBJECT_START(tanh_data)
+// CAUTION: The order of these table coefficients shouldn't be changed!
+
+// Main path coefficients:
+// Coefficients ##0..15 ("main" coefficient tables)
+// Polynomial coefficients for the tanh(x), 0.25 <= |x| < 0.5
+data8 0xE9D218BC9A3FB55A, 0x00003FC7 //A19
+data8 0xC8C0D38687F36EBA, 0x00003FCE //A18
+data8 0xA2663E519FAC8A43, 0x0000BFD2 //A17
+data8 0xD913F0490674B0DF, 0x00003FD3 //A16
+data8 0xF75D84789DE0AE52, 0x00003FD6 //A15
+data8 0xACB3C40EEF3A06F0, 0x0000BFD9 //A14
+data8 0xEBD7F5DC02CFD5BA, 0x0000BFDB //A13
+data8 0x8B52CDF66D709E2A, 0x00003FDF //A12
+data8 0x9EC21F28E05C4A3E, 0x00003FE0 //A11
+data8 0xC412B44D0176F3ED, 0x0000BFE4 //A10
+data8 0x97BF35A34DD1EA4C, 0x0000BFE0 //A9
+data8 0xF89F5B39E3A3AA36, 0x00003FE9 //A8
+data8 0xF2BA654BCEEBA433, 0x0000BFEA //A7
+data8 0x8E1C15876AA589AD, 0x0000BFEF //A6
+data8 0x942226246A8C2A86, 0x00003FF1 //A5
+data8 0x8F06D9FF7DB47261, 0x00003FF4 //A4
+//
+// Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0
+data8 0xC4A7B8FB672A8520, 0x00003FDC //A19
+data8 0xA20724B847E13499, 0x0000BFE0 //A18
+data8 0xE17DB53F02E4D340, 0x00003FE2 //A17
+data8 0x90264A1012F4CA6F, 0x0000BFE4 //A16
+data8 0xEBEC9F776F0BF415, 0x0000BFE0 //A15
+data8 0x89AF912B305B45A4, 0x00003FE7 //A14
+data8 0xB4A960B81F5EC36A, 0x0000BFE7 //A13
+data8 0x969A4E95B2DA86B5, 0x0000BFEA //A12
+data8 0x8A3FC0EC082305CB, 0x00003FEC //A11
+data8 0x83D7795BCBE24373, 0x00003FEC //A10
+data8 0xDCBF42AEB82932EC, 0x0000BFEF //A9
+data8 0x83318E61ECAFD804, 0x00003FF0 //A8
+data8 0xEA4DE5746975A914, 0x00003FF2 //A7
+data8 0xCE63E8FA6B96480B, 0x0000BFF4 //A6
+data8 0xDF017BE0D4FE45D8, 0x0000BFF4 //A5
+data8 0xA8A0C6E2226DF3CD, 0x00003FF8 //A4
+//
+// Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0
+data8 0x8E89D2EBFDAA160B, 0x00003FE9 //A19
+data8 0xDD9226310A272046, 0x0000BFEC //A18
+data8 0xA038042D28B0D665, 0x00003FEF //A17
+data8 0x8C04796F03516306, 0x0000BFF1 //A16
+data8 0x9CD6A9CB4E90A2FD, 0x00003FF2 //A15
+data8 0xC8980E166F5A84FD, 0x0000BFF2 //A14
+data8 0x9ADFE65F56B7BCFD, 0x00003FED //A13
+data8 0x8B11FDFB5D0A7B96, 0x00003FF4 //A12
+data8 0x8209A125E829CBFA, 0x0000BFF5 //A11
+data8 0xCF38AAC17B85BD76, 0x00003FF1 //A10
+data8 0xD5C2E248D8AB99AB, 0x00003FF6 //A9
+data8 0xE12BE2785727F2D6, 0x0000BFF7 //A8
+data8 0x9FC9EF90F87BF1E2, 0x00003FF6 //A7
+data8 0x9B02FE0DAF42C08F, 0x00003FF9 //A6
+data8 0xBDACE06F531D9491, 0x0000BFFA //A5
+data8 0xE3048AD1DB2F648C, 0x00003FF9 //A4
+//
+// Polynomial coefficients for the tanh(x), 2.0 <= |x| < 3.25
+data8 0x856EC3B0330A385A, 0x00003FEB //A19
+data8 0xC641D69DAE2D429C, 0x0000BFF2 //A18
+data8 0xC683EB0BE1343FFF, 0x00003FF5 //A17
+data8 0xC358954224E4E823, 0x0000BFF7 //A16
+data8 0xF813A8D6D396BC5F, 0x00003FF8 //A15
+data8 0xE0ECDFED078D37D6, 0x0000BFF9 //A14
+data8 0x950E4E619855E316, 0x00003FFA //A13
+data8 0x8453B8F93370FB58, 0x0000BFFA //A12
+data8 0xFDBA28430AEC95BA, 0x00003FF7 //A11
+data8 0x9371AAC1FDB1E664, 0x00003FFA //A10
+data8 0xAC972DA97782D88A, 0x0000BFFB //A9
+data8 0xE18F47B10B9CE1BC, 0x00003FFB //A8
+data8 0xAB7C81230BF13BC6, 0x0000BFFB //A7
+data8 0xA6CAAD4A3E31A7D5, 0x0000BFF8 //A6
+data8 0x9CABD76D1D5C3878, 0x00003FFC //A5
+data8 0x92906D077941CAA9, 0x0000BFFD //A4
+//
+// Polynomial coefficients for the tanh(x), 4.0 <= |x| < 6.5
+data8 0x9232D19F71709AC9, 0x0000BFF5 //A19
+data8 0x819E31323F5DD3F8, 0x00003FF8 //A18
+data8 0xDA8E1CDB8D23DC29, 0x0000BFF9 //A17
+data8 0xE97C7CD8FC0486D8, 0x00003FFA //A16
+data8 0xB0C4AD234D88C9F2, 0x0000BFFB //A15
+data8 0xC5989BFB28FDE267, 0x00003FFB //A14
+data8 0x9B26520EC4EFEE8E, 0x0000BFFB //A13
+data8 0xC4B6F758AD21E574, 0x00003FF9 //A12
+data8 0xCC36E3FFA10D2CFF, 0x00003FFA //A11
+data8 0x8738696FB06A5CED, 0x0000BFFC //A10
+data8 0xD31981825BF39228, 0x00003FFC //A9
+data8 0x82C58FB9BEE43992, 0x0000BFFD //A8
+data8 0x88D5AAE49164B6F3, 0x00003FFD //A7
+data8 0xF4CA0B968AF2DDE2, 0x0000BFFC //A6
+data8 0xB99874B482BD17EE, 0x00003FFC //A5
+data8 0xE93FB2F99431DC1D, 0x0000BFFB //A4
+//
+// Polynomial coefficients for the tanh(x), 8.0 <= |x| < 13.0
+data8 0xAAA9EB7EADA85CEC, 0x00003FF5 //A19
+data8 0x980C80EE05A6BE78, 0x0000BFF8 //A18
+data8 0x818DA9F5396390A5, 0x00003FFA //A17
+data8 0x8D8CC21E23D8A6A2, 0x0000BFFB //A16
+data8 0xE0EC19E55A886765, 0x00003FFB //A15
+data8 0x8C11197A7E6244C5, 0x0000BFFC //A14
+data8 0x901D2BF203C2F7F3, 0x00003FFC //A13
+data8 0xFEACAEE66EE803E5, 0x0000BFFB //A12
+data8 0xC684E4925E318C3F, 0x00003FFB //A11
+data8 0x8A9D8A970565F28D, 0x0000BFFB //A10
+data8 0xAE34C61DE5CEA4D4, 0x00003FFA //A9
+data8 0xC44C5714BD6208A0, 0x0000BFF9 //A8
+data8 0xC4612F7D6C8BDB79, 0x00003FF8 //A7
+data8 0xABD91DCE40D5EECB, 0x0000BFF7 //A6
+data8 0x80E375C1B847B72F, 0x00003FF6 //A5
+data8 0xA11C7DD978CF700A, 0x0000BFF4 //A4
+//
+// Polynomial coefficients for the tanh(x), 16.0 <= |x| < 19.0625
+data8 0xE29D17C510F86F6B, 0x00003FF3 //A19
+data8 0x88FE52EB39A3A98C, 0x0000BFF5 //A18
+data8 0xA406547E50360693, 0x00003FF5 //A17
+data8 0x83E6260B71C6D7DE, 0x0000BFF5 //A16
+data8 0xA36AB5B0CBC97B85, 0x00003FF4 //A15
+data8 0xA94931E0B7BA6C14, 0x0000BFF3 //A14
+data8 0x9A4596DAF350AD63, 0x00003FF2 //A13
+data8 0xFE47643F375AECA5, 0x0000BFF0 //A12
+data8 0xBF8433C5ABEE63B1, 0x00003FEF //A11
+data8 0x83CEE05D7AE90A0A, 0x0000BFEE //A10
+data8 0xA4CC45480BCEB02D, 0x00003FEC //A9
+data8 0xB967CBDCBC16CB10, 0x0000BFEA //A8
+data8 0xB9681B214EDC098D, 0x00003FE8 //A7
+data8 0xA23B20D87B80DFA8, 0x0000BFE6 //A6
+data8 0xF358B2C46F10CBAF, 0x00003FE3 //A5
+data8 0x98176FD06229A385, 0x0000BFE1 //A4
+//
+// Binary subranges
+// Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4.0
+data8 0xEF2EE841288F6706, 0x00003FE9 //A19
+data8 0xE65D5B74B85F82A6, 0x00003FEB //A18
+data8 0xE495FC21E42A79FF, 0x00003FEA //A17
+data8 0xF99B267A913CF3E5, 0x00003FEC //A16
+data8 0xFE3D700F4A0A0FDE, 0x0000BFEC //A15
+data8 0x8F91BB4EE4E4EA52, 0x00003FEE //A14
+data8 0xBCA9F41A5C6EF8BA, 0x0000BFEE //A13
+data8 0xF93E00884027A9CF, 0x00003FED //A12
+data8 0xC4D4036A61BABC2F, 0x00003FEF //A11
+data8 0x86CC2AD1AD47C7D5, 0x0000BFF2 //A10
+data8 0xD3065DEF4CE9AD32, 0x00003FF3 //A9
+data8 0x82C44125F568D54E, 0x0000BFF5 //A8
+data8 0x88D588729BAF14CA, 0x00003FF6 //A7
+data8 0xF4CA0661307243C7, 0x0000BFF6 //A6
+data8 0xB998746D57061F74, 0x00003FF7 //A5
+data8 0xE93FB2F482327C19, 0x0000BFF7 //A4
+//
+// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0
+data8 0xEB189B71ADC40BE2, 0x00003FEA //A19
+data8 0xA60B46F9FF6DC2DF, 0x00003FEA //A18
+data8 0xBB061CDD9F368B9D, 0x00003FEC //A17
+data8 0x841E08BDF5429991, 0x0000BFEC //A16
+data8 0xDD33990B433F25BE, 0x00003FED //A15
+data8 0xBA5DE6B870F0A2BB, 0x0000BFEE //A14
+data8 0xA71D489AAA6DACF0, 0x00003FEF //A13
+data8 0x874CCB2B8F3FBC0E, 0x0000BFF0 //A12
+data8 0xCB1D2E9754EA534A, 0x00003FF0 //A11
+data8 0x8BA5ABB53BA6ABCF, 0x0000BFF1 //A10
+data8 0xAE91FD1C2391A32B, 0x00003FF1 //A9
+data8 0xC465A74B798E5761, 0x0000BFF1 //A8
+data8 0xC4666152397D15C1, 0x00003FF1 //A7
+data8 0xABD9E63CA575B950, 0x0000BFF1 //A6
+data8 0x80E38B18E8D0F460, 0x00003FF1 //A5
+data8 0xA11C80E20AAFDD3C, 0x0000BFF0 //A4
+//
+// Polynomial coefficients for the tanh(x), 13.0 <= |x| < 16.0
+data8 0xBECD0AF7E22E5594, 0x00003FE9 //A19
+data8 0xE2834E2D68C1128C, 0x00003FEA //A18
+data8 0x97B117611B317379, 0x00003FEB //A17
+data8 0xEE91A0D39A772F6B, 0x00003FEA //A16
+data8 0x92F6EC377DCADA4F, 0x00003FEA //A15
+data8 0xD8FCCD6A3277FAB7, 0x00003FE8 //A14
+data8 0xC15AB9CB0C3DCFE0, 0x00003FE7 //A13
+data8 0xC3C659704A7147CD, 0x00003FE2 //A12
+data8 0xFA17F09D27C97912, 0x00003FE4 //A11
+data8 0xF664147182B94788, 0x0000BFE3 //A10
+data8 0xA6C89FA741464DA1, 0x00003FE3 //A9
+data8 0xB90FE464A825EFA8, 0x0000BFE2 //A8
+data8 0xB973AE0FD86EC024, 0x00003FE1 //A7
+data8 0xA23A087F96846951, 0x0000BFE0 //A6
+data8 0xF358D8A7FC012D5D, 0x00003FDE //A5
+data8 0x98176E2309B7C73A, 0x0000BFDD //A4
+//
+//
+// Coefficients ##16..19 ("tail" coefficient tables)
+// Polynomial coefficients for the tanh(x), 0.25 <= |x| < 0.5
+data8 0x838F209ABB9BA7B3, 0x0000BFF7 //A3
+data8 0xEBC0AC78DA4FC500, 0x0000BFF8 //A2
+data8 0xF0A4D02960B60E69, 0x00003FFC //A1
+data8 0xFACBF534D0E42F8A, 0x00003FFC //A0
+//
+// Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0
+data8 0xC0ECBDC0A0D133A6, 0x0000BFF8 //A3
+data8 0xBA13A076BF8E812F, 0x0000BFFB //A2
+data8 0xC954A37D1A1CA070, 0x00003FFD //A1
+data8 0xEC9A9EBAB4579B29, 0x00003FFD //A0
+//
+// Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0
+data8 0xD42E9175A6EA1397, 0x00003FFB //A3
+data8 0xA3C361378A55CF56, 0x0000BFFD //A2
+data8 0xD706E07CC8622983, 0x00003FFD //A1
+data8 0xC2F7D5A8A79CA2AC, 0x00003FFE //A0
+//
+// Polynomial coefficients for the tanh(x), 2.0 <= |x| < 3.25
+data8 0xAC7A7F8776817C7E, 0x00003FFD //A3
+data8 0x8B7CE95E69FCFE9A, 0x0000BFFD //A2
+data8 0x90B161317028D995, 0x00003FFC //A1
+data8 0xF6CA82F0DE1E9E9A, 0x00003FFE //A0
+//
+// Polynomial coefficients for the tanh(x), 4.0 <= |x| < 6.5
+data8 0xE9E072407BC22DC6, 0x00003FFA //A3
+data8 0xAFA4A913D8E6BB4A, 0x0000BFF9 //A2
+data8 0xAFC2D6A885BAA875, 0x00003FF7 //A1
+data8 0xFFD40B84505A10B2, 0x00003FFE //A0
+//
+// Polynomial coefficients for the tanh(x), 8.0 <= |x| < 13.0
+data8 0xA11C8A1FED168CD5, 0x00003FF2 //A3
+data8 0xF1AAD6B02063A5F5, 0x0000BFEF //A2
+data8 0xF1AADA46AD341C34, 0x00003FEC //A1
+data8 0xFFFFFC39548FC34B, 0x00003FFE //A0
+//
+// Polynomial coefficients for the tanh(x), 16.0 <= |x| < 19.0625
+data8 0x98176FD1F0950C16, 0x00003FDE //A3
+data8 0xE42327BB09C8B2A5, 0x0000BFDA //A2
+data8 0xE42327BB0B154F13, 0x00003FD6 //A1
+data8 0xFFFFFFFFFFF8DEE7, 0x00003FFE //A0
+//
+// Binary subranges
+// Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4.0
+data8 0xE9E072404329293B, 0x00003FF7 //A3
+data8 0xAFA4A913D798300B, 0x0000BFF7 //A2
+data8 0xAFC2D6A885B48567, 0x00003FF6 //A1
+data8 0xFFD40B84505A10B4, 0x00003FFE //A0
+//
+// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0
+data8 0xA11C8A63815F7A28, 0x00003FEF //A3
+data8 0xF1AAD6B65B0EBF53, 0x0000BFED //A2
+data8 0xF1AADA46E799831F, 0x00003FEB //A1
+data8 0xFFFFFC39548FC348, 0x00003FFE //A0
+//
+// Polynomial coefficients for the tanh(x), 13.0 <= |x| < 16.0
+data8 0x98176FE982140A59, 0x00003FDB //A3
+data8 0xE42327B9B0D7202F, 0x0000BFD8 //A2
+data8 0xE42327BB13076BD6, 0x00003FD5 //A1
+data8 0xFFFFFFFFFFF8DEE7, 0x00003FFE //A0
+//
+//
+// Polynomial coefficients for the tanh(x), 0.0 <= |x| < 0.25
+// ('tanh_near_zero' path)
+data8 0xBF2BA5D26E479D0C //A9
+data8 0x3F4336D96F81EE26 //A8
+data8 0xBF8226E34AE197B0 //A5
+data8 0x3F9664F488148657 //A4
+data8 0xAAAAAAAAAAAAAA99, 0x0000BFFD //A1
+data8 0xBF57D91925BB5EE2 //A7
+data8 0x3F6D6D36C3D5B7A1 //A6
+data8 0xBFABA1BA1BA19D32 //A3
+data8 0x3FC1111111111108 //A2
+//
+//
+// 1.0 - 2^(-63)
+// ('tanh_saturation' path)
+data8 0xFFFFFFFFFFFFFFFF, 0x00003FFE
+LOCAL_OBJECT_END(tanh_data)
+
+// CAUTION: The order of table coefficients shouldn't be changed!
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(tanh)
+{ .mfi
+ alloc r32 = ar.pfs, 0, 20, 0, 0
+ fmerge.se fArgAbsNorm = f1, f8 // normalized x
+ adds rSignBit = 0x1, r0 // Bit for sign removing
+}
+{ .mfi
+ addl rDataPtr = @ltoff(tanh_data), gp // Data pointer
+ fma.s1 fTwo = f1, f1, f1 // 2.0 construct
+ addl rArgSgnd = 0xfff, r0 // mask for exponent
+};;
+
+{ .mfi
+ getf.d rArg = f8 // x in GR
+ fclass.m p6,p0 = f8, 0xEF // Filter 0, denormals and specials
+ // 0xEF = @qnan|@snan|@pos|@neg|@zero|@unorm|@inf
+ shl rArgSgnd = rArgSgnd, 52 // mask for exponent
+}
+{ .mlx
+ ld8 rDataPtr = [rDataPtr] // Real data pointer
+ movl r1625Sgnd = 0xA000000000000 // 1.625 signd
+ // 1.625 significand used to filter values greater than 3.25, 6.5, 13.0
+ // to enter binary subranges
+};;
+
+{ .mfi
+ addl rBias = 0x3FD00, r0 // bias of 0.25 << 8
+ fma.s1 fArgSqr = f8, f8, f0 // x^2
+ shl rSignBit = rSignBit, 63 // mask for sign bit
+}
+{ .mlx
+ addl rMask = 0x7FF00, r0 // Mask for index bits
+ movl rTwo = 0x4000000000000000 // 2.0
+};;
+
+{ .mfi
+ andcm rArgSgnd = rArg, rArgSgnd // Remove exponent
+ nop.f 0
+ shr.u rShiftedArg = rArg, 44 // Select only necessary bits of arg
+}
+{ .mfb
+ andcm rAbsArg = rArg, rSignBit // Remove sign
+ nop.f 0
+(p6) br.cond.spnt _tanh_spec // Branch to zero, denorm & specs
+};;
+
+{ .mfi
+ and rShiftedArgMasked = rShiftedArg, rMask // bias of x << 8
+ fmerge.s fArgAbs = f1, f8 // |x|
+ shr rShiftedAbsArg = rAbsArg, 44 // Select only necessary
+ // bits of absolute arg
+}
+{ .mfi
+ cmp.gt p8, p11 = rArgSgnd, r1625Sgnd // p8 = 1 if
+ // signd(x) > 1.625 - to filter values greater than 3.25, 6.5, 13.0
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ sub rIndex = rShiftedArgMasked, rBias // index << 8
+ nop.f 0
+ cmp.lt p10, p0 = rShiftedArgMasked, rBias // p10=1 if |x|<0.25
+}
+{ .mfb
+(p8) cmp.gt p8, p11 = rAbsArg, rTwo // If arg is greater than 2.0?
+ // (then we should use binary subranges)
+ nop.f 0
+(p10) br.cond.spnt tanh_near_zero // branch out if |x| < 0.25
+};;
+
+.pred.rel "mutex",p8,p11
+{ .mfi
+(p8) add rIndex = 0x400, rIndex // Make pointer to binary
+ // subranges
+(p11) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f1 // |x|/a - 1.0
+ addl rSaturation = 0x40331, r0 // shifted bits of 19.0625
+}
+{ .mfi
+ nop.m 0
+(p8) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, fTwo // |x|/a - 2.0
+ // this is only for binary subranges [3.25;4], [6.5;8], [13.0;16]
+ nop.i 0
+}
+;;
+
+{ .mfi
+ add rCoeffAddr1 = rDataPtr, rIndex// coeff. ##0,2,..14
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ adds rCoeffAddr2 = 16, rCoeffAddr1 // Shifted pointer to coeffs
+ fmerge.s fSignumX = f8, f1 // signum(x)
+ nop.i 0
+}
+{ .mfb
+ cmp.le p12, p0 = rSaturation, rShiftedAbsArg // |x|>=19.0625?
+ nop.f 0
+(p12) br.cond.spnt tanh_saturation // branch out if |x| >= 19.0625
+};;
+
+{.mfi
+ ldfe fA19 = [rCoeffAddr1], 32 // Load A19
+ nop.f 0
+ nop.i 0
+}
+{.mfi
+ ldfe fA18 = [rCoeffAddr2], 32 // Load A18
+ nop.f 0
+ adds rCoeffAddr3 = 0xA00, rDataPtr // Pointer to "tail"
+ // coefficients tables
+};;
+
+{.mfi
+ ldfe fA17 = [rCoeffAddr1], 32 // Load A17
+ nop.f 0
+ nop.i 0
+}
+{.mfi
+ ldfe fA16 = [rCoeffAddr2], 32 // Load A16
+ nop.f 0
+ nop.i 0
+};;
+
+{.mfi
+ ldfe fA15 = [rCoeffAddr1], 32 // Load A15
+ fma.s1 fTSqr = fArgAbsNorm, fArgAbsNorm, f0 // y^2
+ shr.u rIndex = rIndex, 2 // Index for "tail" tables
+}
+{.mfi
+ ldfe fA14 = [rCoeffAddr2], 32 // Load A14
+ nop.f 0
+ adds rCoeffAddr4 = 16, r0 // Shifted pointer
+ // to "tail" tables
+};;
+
+{.mfi
+ ldfe fA13 = [rCoeffAddr1], 32 // Load A13
+ nop.f 0
+ add rCoeffAddr3 = rCoeffAddr3, rIndex // "tail" coeffs to load
+ // ##16..19
+}
+{.mfi
+ ldfe fA12 = [rCoeffAddr2], 32 // Load A12
+ nop.f 0
+ cmp.lt p15, p14 = rArg, r0 // Arg positive (p14)
+ // or negative (p15)?
+};;
+
+{.mfi
+ ldfe fA11 = [rCoeffAddr1], 32 // Load A11
+ nop.f 0
+ add rCoeffAddr4 = rCoeffAddr3, rCoeffAddr4 // shifted "tail"
+ // coeffs to load
+}
+{.mfi
+ ldfe fA10 = [rCoeffAddr2], 32 // Load A10
+ nop.f 0
+ nop.i 0
+};;
+
+{.mfi
+ ldfe fA9 = [rCoeffAddr1], 32 // Load A9
+ nop.f 0
+ nop.i 0
+}
+{.mfi
+ ldfe fA8 = [rCoeffAddr2], 32 // Load A8
+ nop.f 0
+ nop.i 0
+};;
+
+{.mfi
+ ldfe fA7 = [rCoeffAddr1], 32 // Load A7
+ nop.f 0
+ nop.i 0
+}
+{.mfi
+ ldfe fA6 = [rCoeffAddr2], 32 // Load A6
+ nop.f 0
+ nop.i 0
+};;
+
+{.mfi
+ ldfe fA5 = [rCoeffAddr1], 32 // Load A5
+ fma.s1 fTDeg3 = fArgAbsNorm, fTSqr, f0 // y^3
+ nop.i 0
+}
+{.mfi
+ ldfe fA4 = [rCoeffAddr2], 32 // Load A4
+ fma.s1 fTQuadr = fTSqr, fTSqr, f0 // y^4
+ nop.i 0
+};;
+
+// Path #3 Polynomial Pol19(y) computation; y = fArgAbsNorm
+{.mfi
+ ldfe fA3 = [rCoeffAddr3], 32 // Load A3
+ fma.s1 fArgAbsNormSgn = fArgAbsNorm, fSignumX, f0 // sign(x)*y
+ nop.i 0
+}
+{.mfi
+ ldfe fA2 = [rCoeffAddr4], 32 // Load A2
+ nop.f 0
+ nop.i 0
+};;
+
+{.mfi
+ ldfe fA1 = [rCoeffAddr3], 32 // Load A1
+ fma.s1 fRes = fA19, fArgAbsNorm, fA18 // Polynomial
+ nop.i 0
+}
+{.mfi
+ ldfe fA0 = [rCoeffAddr4], 32 // Load A0
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA17 = fA17, fArgAbsNorm, fA16 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, fArgAbsNorm, fA14 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fTDeg7 = fTDeg3, fTQuadr, f0 // Polynomial
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA13 = fA13, fArgAbsNorm, fA12 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, fArgAbsNorm, fA10 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, fArgAbsNorm, fA8 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fRes, fTSqr, fA17 // Polynomial
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA7 = fA7, fArgAbsNorm, fA6 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA5 = fA5, fArgAbsNorm, f0 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, fTSqr, fA13 // Polynomial
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA4 = fA4, fArgAbsNorm, fA3 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA2 = fA2, fArgAbsNorm, fA1 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, fTSqr, fA9 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA7 = fA7, fTSqr, fA5 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fRes, fTQuadr, fA15 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA4 = fA4, fTSqr, fA2 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fRes, fTQuadr, fA11 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA4 = fA7, fTDeg3, fA4 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fRes, fTDeg7, fA4 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ // result for negative argument
+(p15) fms.d.s0 f8 = fRes, fArgAbsNormSgn, fA0 // Polynomial
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ // result for positive argument
+(p14) fma.d.s0 f8 = fRes, fArgAbsNormSgn, fA0 // Polynomial
+ br.ret.sptk b0
+};;
+
+
+// |x| < 0.25 Path /////////////////////////////////////////////////////////////
+.align 32
+tanh_near_zero:
+{ .mfi
+ adds rCoeffAddr1 = 0xC80, rDataPtr // address of A9
+ fma.s0 fTSqr = fArgSqr, fArgSqr, f0 // x^4
+ nop.i 0
+}
+{ .mfi
+ adds rCoeffAddr2 = 0xCB0, rDataPtr // address of A7
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ ldfpd fA9, fA8 = [rCoeffAddr1], 16 // Load A9, A8
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA7, fA6 = [rCoeffAddr2], 16 // Load A7, A6
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ ldfpd fA5, fA4 = [rCoeffAddr1], 16 // Load A5, A4
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA3, fA2 = [rCoeffAddr2], 16 // Load A3, A2
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe fA1 = [rCoeffAddr1] // Load A1
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fTQuadr = fTSqr, fTSqr, f0 // x^8 (fTSqr holds x^4 here)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fA9, fArgSqr, fA8 // Polynomial
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA7 = fA7, fArgSqr, fA6 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA3 = fA3, fArgSqr, fA2 // Polynomial
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA5 = fA5, fArgSqr, fA4 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA1 = fA1, fArgSqr, f0 // Polynomial
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fTQuadrSgn = fTQuadr, f8, f0 // x^8 * x (not read below)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fRes, fTSqr, fA7 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA1 = fA3, fTSqr, fA1 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fRes, fTSqr, fA5 // Polynomial
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fRes, fTQuadr, fA1 // Polynomial
+ nop.i 0
+};;
+
+{ .mfb
+ nop.m 0
+ fma.d.s0 f8 = fRes, f8, f8 // x+x*Polynomial
+ br.ret.sptk b0 // Exit for |x| < 0.25
+};;
+
+
+
+
+
+// 19.0625 <= |x| < +inf Saturation path ///////////////////////////////////////
+.align 32
+tanh_saturation:
+{ .mfi
+ adds rDataPtr = 0xCD0, rDataPtr // address of A0
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe fA0 = [rDataPtr] // Load A0 = 1.0 - 2^(-63)
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfb
+ nop.m 0
+ fma.d.s0 f8 = fA0, fSignumX, f0 // sign(x)*(1.0-2^(-63))
+ br.ret.sptk b0 // Exit for 19.0625 <=|x|< +inf
+};;
+
+
+
+
+
+// 0, denormals and special IEEE numbers path /////////////////////////////////
+_tanh_spec:
+
+{ .mfi
+ cmp.lt p15, p14 = rArg, r0 // Is arg negative (p15)
+ // or positive (p14)?
+ fclass.m p6,p0 = f8, 0x23 // To filter infinities
+ // 0x23 = @pos|@neg|@inf
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fclass.m p7,p0 = f8, 0xC7 // To filter NaNs & Zeros
+ // 0xC7 = @pos|@neg|@zero|@qnan|@snan
+ nop.i 0
+};;
+
+{ .mfb
+ nop.m 0
+(p6) fmerge.s f8 = f8, f1 // +/-1 for INF args
+(p6) br.ret.spnt b0 // exit for x = INF
+};;
+
+{ .mfb
+ nop.m 0
+(p7) fma.d.s0 f8 = f8, f1, f8 // +/-0 for 0 args
+ // and NaNs for NaNs
+(p7) br.ret.spnt b0 // exit for x = NaN or +/-0
+};;
+
+{ .mfi
+ nop.m 0
+ fnorm.s0 f8 = f8 // Normalize arg
+ nop.i 0
+};;
+
+.pred.rel "mutex",p14,p15
+{ .mfi
+ nop.m 0
+(p14) fnma.d.s0 f8 = f8, f8, f8 // res = r-r^2
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p15) fma.d.s0 f8 = f8, f8, f8 // res = r+r^2
+ br.ret.sptk b0 // 0, denormals, specials return
+};;
+
+GLOBAL_LIBM_END(tanh)
+
diff --git a/sysdeps/ia64/fpu/s_tanhf.S b/sysdeps/ia64/fpu/s_tanhf.S
new file mode 100644
index 0000000000..344ca4ec5a
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_tanhf.S
@@ -0,0 +1,581 @@
+.file "tanhf.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 05/30/01 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// float tanhf(float)
+//
+// Overview of operation
+//==============================================================
+// Background
+//
+//
+// There are 9 paths:
+// 1. x = +/-0.0
+// Return tanhf(x) = +/-0.0
+//
+// 2. 0.0 < |x| < 0.3125
+// Return tanhf(x) = x + x^3*Pol3(x^2),
+// where Pol3(x^2) = C3*x^6 + C2*x^4 + C1*x^2 + C0
+//
+// 3. 0.3125 <= |x| < 8.0
+// Return tanhf(x) = sign(x)*PolD(x)*PolC(|x|) + sign(x)*PolA(|x|),
+// where sign(x)*PolD(x) = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4),
+// PolC(|x|) = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0,
+// PolA(|x|) = A3|x|^3 + A2*x^2 + A1*|x| + A0
+//
+// Actually range 0.3125<=|x|< 8.0 is split to 5 subranges.
+// For each subrange there is particular set of coefficients.
+// Below is the list of subranges:
+// 3.1 0.3125 <= |x| < 0.5
+// 3.2 0.5 <= |x| < 1.0
+// 3.3 1.0 <= |x| < 2.0
+// 3.4 2.0 <= |x| < 4.0
+// 3.5 4.0 <= |x| < 8.0
+//
+// 4. 8.0 <= |x| < 9.125
+// Return tanhf(x) = sign(x)*(A3|x|^3 + A2*x^2 + A1*|x| + A0)
+//
+// 5. 9.125 <= |x| < +INF
+// Return tanhf(x) = sign(x)*(1.0d - 2^(-53))
+//
+// 6. |x| = INF
+// Return tanhf(x) = sign(x) * 1.0
+//
+// 7. x = [S,Q]NaN
+// Return tanhf(x) = QNaN
+//
+// 8. x is positive denormal
+// Return tanhf(x) = x - x^2
+//
+// 9. x is negative denormal
+// Return tanhf(x) = x + x^2
+//
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8, input
+// f32 -> f59
+
+// General registers used:
+// r32 -> r46, r2, r3
+
+// Predicate registers used:
+// p0, p6 -> p15
+
+// p6 to filter out case when x = [Q,S]NaN or +/-0
+// p7 to filter out case when x = denormal
+// p8 set if |x| >= 0.3125, used also to process denormal input
+// p9 to filter out case when |x| = inf
+// p10 to filter out case when |x| < 0.3125
+// p11 set if |x| < 9.125 (predicates the coefficient loads)
+// p12 to filter out case when |x| >= 9.125
+// p13 to filter out case when 8.0 <= |x| < 9.125
+// p14 set to 1 for positive x
+// p15 set to 1 for negative x
+
+// Assembly macros
+//==============================================================
+rDataPtr = r2 // address of tanhf_data (via @ltoff); +544 on saturation path
+rDataPtr1 = r3 // not referenced by the code in this file
+
+rBias = r33 // coefficient-set index, in 16-byte units
+rCoeffAddr3 = r34 // address of C2,C3, then of D0,D1
+rNearSaturation = r35 // 0x14, compared with rBias to set p13
+rCoeffAddr1 = r36 // address of C0,C1, then of D2,B0
+rCoeffAddr2 = r37 // address of A0,A1, then of A2,A3
+rOffset2 = r38 // (rArg with rMask bits cleared) >> 21
+rBias2 = r39 // 0x1F4, subtracted from rOffset2 to form rBias
+rMask = r40 // 0x806 << 20: sign bit and 2 top significand bits
+rArg = r41 // single precision image of x (getf.s)
+rBound = r42 // 0x3EA << 20 = 0x3EA00000, i.e. 0.3125f
+rSignBit = r43 // 1 << 31
+rAbsArg = r44 // rArg with the sign bit cleared
+rDataPtr2 = r45 // not referenced by the code in this file
+rSaturation = r46 // 0x4112 << 16 = 0x41120000, i.e. 9.125f
+
+//==============================================================
+fA0 = f32 // A0 of PolA; holds 1.0 - epsilon on the saturation path
+fA1 = f33 // A1 of PolA
+fA2 = f34 // A2 of PolA
+fA3 = f35 // A3 of PolA
+fC0 = f36 // C0 of PolC (C0 of Pol3 when |x| < 0.3125)
+fC1 = f37 // C1
+fC2 = f38 // C2
+fC3 = f39 // C3
+fD0 = f40 // D0 of PolD
+fD1 = f41 // D1 of PolD
+fD2 = f42 // D2 of PolD
+fB0 = f43 // B0, later overwritten with B0*x^4
+fArgSqr = f44 // x^2
+fAbsArg = f45 // |x|
+fSignumX = f46 // 1.0 with the sign of x
+fArg4 = f47 // x^4
+fArg4Sgn = f48 // sign(x)*x^4
+fArg3 = f49 // |x|^3
+fArg3Sgn = f50 // sign(x)*|x|^3
+fArg7Sgn = f51 // sign(x)*|x|^7
+fArg6Sgn = f52 // sign(x)*x^6
+fPolC = f53 // PolC accumulator (Pol3 on the near-zero path)
+fPolCTmp = f54 // C1*|x| + C0 (C1*x^2 + C0 on the near-zero path)
+fPolA = f55 // PolA accumulator
+fPolATmp = f56 // partial sum of PolA
+fPolD = f57 // PolD accumulator
+fPolDTmp = f58 // sign(x)*(|x|^7 + D2*x^6)
+fArgSqrSgn = f59 // sign(x)*x^2
+
+// Data tables
+//==============================================================
+
+RODATA
+
+.align 16
+
+LOCAL_OBJECT_START(tanhf_data)
+// PolC/PolD coefficients for the tanh(x), 0.3125 <= |x| < 0.5 (byte offset 0)
+data8 0x3F9BEEDFDD177D7B // C0
+data8 0x3F970D10C7F32458 // C1
+data8 0x3F766D6B051F3A38 // C2
+data8 0xBF732F2001B23402 // C3
+data8 0xBF854BE1CE1ED499 // D0
+data8 0x4013C944F3999A16 // D1
+data8 0xC01106C6975222C0 // D2
+data8 0x3F783D5ACCF9EBE8 // B0
+// PolC/PolD coefficients for the tanh(x), 0.5 <= |x| < 1.0 (byte offset 64)
+data8 0xBF5D631440786869 // C0
+data8 0xBF575D79A0D52069 // C1
+data8 0xBF7E2237B7EFC705 // C2
+data8 0x3F6A7ACBC273041F // C3
+data8 0xC040E32EA52D91EB // D0
+data8 0x403D19463E5DB4D7 // D1
+data8 0xC02216F61F759F39 // D2
+data8 0xBF55B4EA0B844BE7 // B0
+// PolC/PolD coefficients for the tanh(x), 1.0 <= |x| < 2.0 (byte offset 128)
+data8 0x3F8637DBE5B3E690 // C0
+data8 0xBF7F7FEC158C07F5 // C1
+data8 0x3F711C586706838A // C2
+data8 0xBF50EF7EF605554E // C3
+data8 0xC054D45448354E25 // D0
+data8 0x404ADFEEA282E730 // D1
+data8 0xC028AEE456D59549 // D2
+data8 0x3F25232D1BED59A8 // B0
+// PolC/PolD coefficients for the tanh(x), 2.0 <= |x| < 4.0 (byte offset 192)
+data8 0xBF52602285F2D06C // C0
+data8 0x3F2E57C298FFE1E0 // C1
+data8 0xBF15ED575DB3C811 // C2
+data8 0x3EE428878A08525C // C3
+data8 0xC0895A26849039C1 // D0
+data8 0x406E3C60BBFBB575 // D1
+data8 0xC03A06F62867C75A // D2
+data8 0xBEB114C70F1C723E // B0
+// PolC/PolD coefficients for the tanh(x), 4.0 <= |x| < 8.0 (byte offset 256)
+data8 0x3EF4B22BD17039A3 // C0
+data8 0xBEB704ADC040C57F // C1
+data8 0x3E937A98288AFE1A // C2
+data8 0xBE4F33B2C9FFE7E7 // C3
+data8 0xC0BE48CFADE2431E // D0
+data8 0x4090E74249760FDD // D1
+data8 0xC04B6F537FCF2F1E // D2
+data8 0x3E0DCD879C91ADEA // B0
+// Pol3 coefficients for the tanh(x), -0.3125 < x < 0.3125 (byte offset 320)
+data8 0xBFD555551E8245B7 // C0 of Pol3 (loaded into fC0)
+data8 0x3FC110E63F52E689 // C1 of Pol3 (loaded into fC1)
+data8 0xBFAB8CD6A5B7BAFA // C2 of Pol3 (loaded into fC2)
+data8 0x3F945D467FCEB553 // C3 of Pol3 (loaded into fC3)
+// PolA coefficient sets follow, 32 bytes each, starting at byte offset 352
+// PolA coefficients for the tanh(x), 0.3125 <= |x| < 0.5 (byte offset 352)
+data8 0xBE3DCC92FCAECBB6 // A0
+data8 0x3FF0000043B7D267 // A1
+data8 0xBED18BF28ACFC4B1 // A2
+data8 0xBFD554A56F82837E // A3
+// PolA coefficients for the tanh(x), 0.5 <= |x| < 1.0 (byte offset 384)
+data8 0x3EFD6054758539F9 // A0
+data8 0x3FEFFBFC77198EBE // A1
+data8 0x3F700327CA98D237 // A2
+data8 0xBFD68955F5BB2FA1 // A3
+// PolA coefficients for the tanh(x), 1.0 <= |x| < 2.0 (byte offset 416)
+data8 0xBF71A53F229DF01B // A0
+data8 0x3FF0AECFD730DE50 // A1
+data8 0xBFC882F88E5DF3BA // A2
+data8 0x3FC6EDF212CA2A8D // A3
+// PolA coefficients for the tanh(x), 2.0 <= |x| < 4.0 (byte offset 448)
+data8 0xBFAF0B712E9EDA47 // A0
+data8 0x3FF1C208080BEA64 // A1
+data8 0x3FC3D29B20C8946E // A2
+data8 0xBFF04514ED900A6A // A3
+// PolA coefficients for the tanh(x), 4.0 <= |x| < 8.0 (byte offset 480)
+data8 0xBFB1DEA49A831CBC // A0
+data8 0x3FFA729FC7085674 // A1
+data8 0xBFF2F44D923A8FA4 // A2
+data8 0x3FE092FC5712227E // A3
+// PolA coefficients for the tanh(x), 8.0 <= |x| < 9.125 (byte offset 512)
+data8 0x3FEFFF5769EE3041 // A0
+data8 0x3EFBBF148D850891 // A1
+data8 0xBEC86BCEF0F5C2FE // A2
+data8 0x3E7CBA4F3A885A5C // A3
+// Saturation constant for 9.125 <= |x| < +inf (byte offset 544)
+data8 0x3FEFFFFFFFFFFFFF // 1.0 - 2^(-53), largest double below 1.0
+LOCAL_OBJECT_END(tanhf_data)
+
+.section .text
+GLOBAL_LIBM_ENTRY(tanhf)
+
+{ .mfi
+ alloc r32 = ar.pfs, 1, 14, 0, 0 // frame: 1 input + 14 locals (r32..r46)
+ fmerge.s fAbsArg = f1, f8 // |x|
+ addl rMask = 0x806, r0 // becomes 0x80600000 after the shl below
+}
+{ .mfi
+ addl rDataPtr = @ltoff(tanhf_data), gp // linkage-table slot of tanhf_data
+ fma.s1 fArgSqr = f8, f8, f0 // x^2
+ adds rSignBit = 0x1, r0
+}
+;;
+
+{ .mfi
+ getf.s rArg = f8 // x in GR
+ fclass.m p7,p0 = f8, 0x0b // is x denormal ?
+ // rMask: sign bit and 2 most bits in significand
+ shl rMask = rMask, 20
+}
+{ .mfi
+ ld8 rDataPtr = [rDataPtr] // address of tanhf_data
+ nop.f 0
+ adds rBias2 = 0x1F4, r0 // rOffset2 value for 0.3125 <= |x| < 0.5
+}
+;;
+
+{ .mfi
+ adds rNearSaturation = 0x14, r0 // rBias value for 8.0 <= |x| < 16.0
+ fmerge.s fSignumX = f8, f1 // signum(x)
+ shl rSignBit = rSignBit, 31 // mask for sign bit
+}
+{ .mfi
+ adds rBound = 0x3EA, r0
+ nop.f 0
+ addl rSaturation = 0x4112, r0
+}
+;;
+
+{ .mfi
+ andcm rOffset2 = rArg, rMask // clear sign and 2 top significand bits
+ fclass.m p6,p0 = f8, 0xc7 // is x [S,Q]NaN or +/-0 ?
+ shl rBound = rBound, 20 // 0.3125f in GR (0x3EA00000)
+}
+{ .mfb
+ andcm rAbsArg = rArg, rSignBit // |x| in GR
+ nop.f 0
+(p7) br.cond.spnt tanhf_denormal // branch out if x is denormal
+}
+;;
+
+{ .mfi
+ adds rCoeffAddr2 = 352, rDataPtr // first PolA set (byte offset 352)
+ fclass.m p9,p0 = f8, 0x23 // is x +/- inf?
+ shr rOffset2 = rOffset2, 21 // biased exponent of x times 4
+}
+{ .mfi
+ cmp.lt p10, p8 = rAbsArg, rBound // |x| < 0.3125?
+ nop.f 0
+ adds rCoeffAddr3 = 16, rDataPtr // C2 of the first coefficient set
+}
+;;
+
+{ .mfi
+(p8) sub rBias = rOffset2, rBias2 // set index: 4 per binade, 16-byte units
+ fma.s1 fArg4 = fArgSqr, fArgSqr, f0 // x^4
+ shl rSaturation = rSaturation, 16 // 9.125f in GR (0x41120000)
+}
+{ .mfb
+(p10) adds rBias = 0x14, r0 // |x| < 0.3125: Pol3 set at byte offset 320
+(p6) fma.s.s0 f8 = f8,f1,f8 // NaN or +/-0
+(p6) br.ret.spnt b0 // exit for x = NaN or +/-0
+}
+;;
+
+{ .mfi
+ shladd rCoeffAddr1 = rBias, 4, rDataPtr // &C0 = base + 16*rBias
+ fma.s1 fArg3Sgn = fArgSqr, f8, f0 // sign(x)*|x|^3
+ // is |x| < 9.125?
+ cmp.lt p11, p12 = rAbsArg, rSaturation
+}
+{ .mfi
+ shladd rCoeffAddr3 = rBias, 4, rCoeffAddr3 // &C2 = base + 16 + 16*rBias
+ fma.s1 fArg3 = fArgSqr, fAbsArg, f0 // |x|^3
+ shladd rCoeffAddr2 = rBias, 3, rCoeffAddr2 // &A0 = base + 352 + 8*rBias
+}
+;;
+
+{ .mfi
+(p11) ldfpd fC0, fC1 = [rCoeffAddr1]
+(p9) fmerge.s f8 = f8,f1 // +/- inf
+(p12) adds rDataPtr = 544, rDataPtr // address of the 1.0 - epsilon constant
+}
+{ .mfb
+(p11) ldfpd fC2, fC3 = [rCoeffAddr3], 16 // post-increment to &D0
+ nop.f 0
+(p9) br.ret.spnt b0 // exit for x = +/- inf
+}
+;;
+
+{ .mfi
+(p11) ldfpd fA0, fA1 = [rCoeffAddr2], 16 // post-increment to &A2
+ nop.f 0
+(p8) cmp.eq.unc p13, p0 = rBias, rNearSaturation // p13: |x| >= 0.3125 and rBias = 0x14
+}
+{ .mfi
+ add rCoeffAddr1 = 48, rCoeffAddr1 // &D2
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+(p11) ldfpd fD0, fD1 = [rCoeffAddr3]
+ nop.f 0
+ nop.i 0
+}
+{ .mfb
+(p11) ldfpd fD2, fB0 = [rCoeffAddr1]
+ // sign(x)*|x|^2
+ fma.s1 fArgSqrSgn = fArgSqr, fSignumX, f0
+(p10) br.cond.spnt tanhf_near_zero
+}
+;;
+
+{ .mfi
+(p11) ldfpd fA2, fA3 = [rCoeffAddr2], 16
+ fcmp.lt.s1 p15, p14 = f8,f0 // p15: x < 0, p14: otherwise
+ nop.i 0
+}
+{ .mfb
+(p12) ldfd fA0 = [rDataPtr] // fA0 = 1.0 - epsilon for the saturation path
+ fma.s1 fArg4Sgn = fArg4, fSignumX, f0 // sign(x)*|x|^4
+(p12) br.cond.spnt tanhf_saturation
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fArg7Sgn = fArg4, fArg3Sgn, f0 // sign(x)*|x|^7
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ fma.s1 fArg6Sgn = fArg3, fArg3Sgn, f0 // sign(x)*|x|^6
+(p13) br.cond.spnt tanhf_close_to_saturation
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fPolC = fC3, fAbsArg, fC2 // C3*|x| + C2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPolCTmp = fC1, fAbsArg, fC0 // C1*|x| + C0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fPolA = fA1, fAbsArg, fA0 // A1*|x| + A0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fPolD = fD1, fAbsArg, fD0 // D1*|x| + D0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // sign(x)*(|x|^7 + D2*x^6)
+ fma.s1 fPolDTmp = fArg6Sgn, fD2, fArg7Sgn
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fPolATmp = fA3, fAbsArg, fA2 // A3*|x| + A2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB0 = fB0, fArg4, f0 // B0*x^4
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ // C3*|x|^3 + C2*x^2 + C1*|x| + C0
+ fma.s1 fPolC = fPolC, fArgSqr, fPolCTmp
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // PolD = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4)
+ fma.d.s1 fPolD = fPolD, fArg4Sgn, fPolDTmp
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // PolA = A3|x|^3 + A2*x^2 + A1*|x| + A0
+ fma.d.s1 fPolA = fPolATmp, fArgSqr, fPolA
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // PolC = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0
+ fma.d.s1 fPolC = fPolC, f1, fB0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+(p14) fma.s.s0 f8 = fPolC, fPolD, fPolA // for positive x: PolC*PolD + PolA
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p15) fms.s.s0 f8 = fPolC, fPolD, fPolA // for negative x: PolC*PolD - PolA
+ br.ret.sptk b0 // Exit for 0.3125 <=|x|< 8.0
+};;
+
+
+// Here if |x| < 0.3125; fC0..fC3 hold the set at byte offset 320 of tanhf_data
+tanhf_near_zero:
+{ .mfi
+ nop.m 0
+ fma.s1 fPolC = fC3, fArgSqr, fC2 // C3*x^2 + C2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPolCTmp = fC1, fArgSqr, fC0 // C1*x^2 + C0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fPolC = fPolC, fArg4, fPolCTmp // C3*x^6 + C2*x^4 + C1*x^2 + C0
+ nop.i 0
+};;
+
+{ .mfb
+ nop.m 0
+ // x + x^3*(C3*x^6 + C2*x^4 + C1*x^2 + C0)
+ fma.s.s0 f8 = fPolC, fArg3Sgn, f8
+ br.ret.sptk b0 // Exit for |x| < 0.3125
+};;
+
+// Here if 9.125 <= |x| < +inf; fA0 was loaded from byte offset 544 of tanhf_data
+tanhf_saturation:
+{ .mfb
+ nop.m 0
+ fma.s.s0 f8 = fA0, fSignumX, f0 // sign(x)*(1.0d - 2^(-53))
+ // fA0 = 0x3FEFFFFFFFFFFFFF, result is produced in single precision
+ br.ret.sptk b0 // Exit for 9.125 <=|x|< +inf
+}
+;;
+
+// Here if 8.0 <= |x| < 9.125: sign(x)*(A3*|x|^3 + A2*x^2 + A1*|x| + A0)
+tanhf_close_to_saturation:
+{ .mfi
+ nop.m 0
+ fma.s1 fPolATmp = fA1, fAbsArg, fA0 // A1*|x| + A0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPolA = fA3, fAbsArg, fA2 // A3*|x| + A2
+ nop.i 0
+}
+;;
+
+.pred.rel "mutex", p14, p15
+{ .mfi
+ nop.m 0
+ // for positive x: (A3*|x| + A2)*x^2 + (A1*|x| + A0)
+(p14) fma.s.s0 f8 = fPolA, fArgSqr, fPolATmp
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ // for negative x: (A3*|x| + A2)*(-x^2) - (A1*|x| + A0)
+(p15) fms.s.s0 f8 = fPolA, fArgSqrSgn, fPolATmp
+ br.ret.sptk b0 // Exit for 8.0 <=|x|< 9.125
+};;
+
+// Here if x is single precision denormal: return x + x^2 (x < 0) or x - x^2 (x > 0)
+tanhf_denormal:
+{ .mfi
+ nop.m 0
+ fclass.m p7,p8 = f8, 0x0a // is x -denormal ? (p7: yes, p8: no)
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+(p7) fma.s.s0 f8 = f8,f8,f8 // -denormal: x + x*x
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p8) fnma.s.s0 f8 = f8,f8,f8 // +denormal: x - x*x
+ br.ret.sptk b0 // Exit for denormal
+}
+;;
+
+GLOBAL_LIBM_END(tanhf)
diff --git a/sysdeps/ia64/fpu/s_tanhl.S b/sysdeps/ia64/fpu/s_tanhl.S
new file mode 100644
index 0000000000..ab00994c85
--- /dev/null
+++ b/sysdeps/ia64/fpu/s_tanhl.S
@@ -0,0 +1,1347 @@
+.file "tanhl.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 11/29/01 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 08/14/02 Changed mli templates to mlx
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+// API
+//==============================================================
+// long double tanhl(long double)
+//
+// Overview of operation
+//==============================================================
+//
+// Algorithm description
+// ---------------------
+//
+// There are 4 paths:
+//
+// 1. Special path: x = 0, Inf, NaNs, denormal
+// Return tanhl(x) = +/-0.0 for zeros
+// Return tanhl(x) = QNaN for NaNs
+// Return tanhl(x) = sign(x)*1.0 for Inf
+// Return tanhl(x) = x + x^2 for - denormals
+// Return tanhl(x) = x - x^2 for + denormals
+//
+// 2. [0;1/8] path: 0.0 < |x| < 1/8
+// Return tanhl(x) = x + x^3*A3 + ... + x^15*A15
+//
+// 3. Main path: 1/8 <= |x| < 22.8
+// For several ranges of 1/8 <= |x| < 22.8
+// Return tanhl(x) = sign(x)*((A0H+A0L) + y*(A1H+A1L) + y^2*(A2H+A2L) +
+// + y^3*A3 + y^4*A4 + ... + y^25*A25 )
+// where y = (|x|/a) - b
+//
+// For each range there is particular set of coefficients.
+// Below is the list of ranges:
+// 1/8 <= |x| < 1/4 a = 0.125, b = 1.5
+// 1/4 <= |x| < 1/2 a = 0.25, b = 1.5
+// 1/2 <= |x| < 1.0 a = 0.5, b = 1.5
+// 1.0 <= |x| < 2.0 a = 1.0, b = 1.5
+// 2.0 <= |x| < 3.25 a = 2.0, b = 1.5
+// 3.25 <= |x| < 4.0 a = 2.0, b = 2.0
+// 4.0 <= |x| < 6.5 a = 4.0, b = 1.5
+// 6.5 <= |x| < 8.0 a = 4.0, b = 2.0
+// 8.0 <= |x| < 13.0 a = 8.0, b = 1.5
+// 13.0 <= |x| < 16.0 a = 8.0, b = 2.0
+// 16.0 <= |x| < 22.8 a = 16.0, b = 1.5
+// ( [3.25;4.0], [6.5;8.0], [13.0;16.0] subranges are separated
+// to resolve monotonicity issues )
+//
+// 4. Saturation path: 22.8 <= |x| < +INF
+// Return tanhl(x) = sign(x)*(1.0 - tiny_value)
+// (tiny_value ~ 1e-1233)
+//
+// Implementation notes
+// --------------------
+//
+// 1. Special path: x = 0, INF, NaNs, denormals
+//
+// This branch is cut off by one fclass operation.
+// Then zeros+nans, infinities and denormals are processed separately.
+// For denormals we use simple fma operation x+x*x (- for +denorms)
+//
+// 2. [0;1/8] path: 0.0 < |x| < 1/8
+//
+// Here we use simple polynomial computations, where last step
+// is performed as x + x^3*A3+...
+// The rest of polynomial is factorized using binary tree technique.
+//
+// 3. Main path: 1/8 <= |x| < 22.8
+//
+// Multiprecision has to be performed only for first few
+// polynomial iterations (up to 3-rd x degree)
+// Here we use the same parallelisation way as above:
+// Split whole polynomial to first, "multiprecision" part, and second,
+// so called "tail", native precision part.
+//
+// 1) Multiprecision part:
+// [v1=(A0H+A0L)+y*(A1H+A1L)] + [v2=y^2*((A2H+A2L)+y*A3)]
+// v1 and v2 terms calculated in parallel
+//
+// 2) Tail part:
+// v3 = x^4 * ( A4 + x*A5 + ... + x^21*A25 )
+// v3 is split into 2 even parts (10 coefficients in each one).
+// These 2 parts are also factorized using binary tree technique.
+//
+// So Multiprecision and Tail parts cost is almost the same
+// and we have both results ready before final summation.
+//
+// Some tricks were applied to maintain symmetry in directed
+// rounding modes (to +/-inf). We had to set the result sign
+// not at the last operation but much earlier and at
+// several places.
+//
+// 4. Saturation path: 22.8 <= |x| < +INF
+//
+// We use formula sign(x)*(1.0 - tiny_value) instead of simple sign(x)*1.0
+// just to meet IEEE requirements for different rounding modes in this case.
+//
+// Registers used
+//==============================================================
+// Floating Point registers used:
+// f8 - input & output
+// f32 -> f92
+
+// General registers used:
+// r2, r3, r32 -> r52
+
+// Predicate registers used:
+// p0, p6 -> p11, p14, p15
+
+// p6 - arg is zero, denormal or special IEEE
+// p7 - arg is in [16;32] binary interval
+// p8 - arg is in one of subranges
+// [3.25;4.0], [6.5;8.0], [13.0;16.0]
+// p9 - arg < 1/8
+// p10 - arg is NOT in one of subranges
+// [3.25;4.0], [6.5;8.0], [13.0;16.0]
+// p11 - arg in saturation domain
+// p14 - arg is positive
+// p15 - arg is negative
+
+// Assembly macros
+//==============================================================
+rDataPtr = r2 // NOTE(review): uses of the GR aliases lie beyond this excerpt
+rTailDataPtr = r3
+
+rBias = r33
+rSignBit = r34
+rInterval = r35
+
+rArgExp = r36
+rArgSig = r37
+r3p25Offset = r38
+r2to4 = r39
+r1p25 = r40
+rOffset = r41
+r1p5 = r42
+rSaturation = r43
+r1625Sign = r44
+rTiny = r45
+rAddr1 = r46
+rAddr2 = r47
+rTailAddr1 = r48
+rTailAddr2 = r49
+rTailOffset = r50
+rTailAddOffset = r51
+rShiftedDataPtr = r52
+
+//==============================================================
+fA0H = f32 // A0H..A2L: high/low parts used as (AnH+AnL) in the main-path formula
+fA0L = f33
+fA1H = f34
+fA1L = f35
+fA2H = f36
+fA2L = f37
+fA3 = f38 // A3..A25: polynomial coefficients of the main-path formula
+fA4 = f39
+fA5 = f40
+fA6 = f41
+fA7 = f42
+fA8 = f43
+fA9 = f44
+fA10 = f45
+fA11 = f46
+fA12 = f47
+fA13 = f48
+fA14 = f49
+fA15 = f50
+fA16 = f51
+fA17 = f52
+fA18 = f53
+fA19 = f54
+fA20 = f55
+fA21 = f56
+fA22 = f57
+fA23 = f58
+fA24 = f59
+fA25 = f60
+
+fArgSqr = f61
+fArgCube = f62
+fArgFour = f63
+fArgEight = f64
+
+fArgAbsNorm = f65
+fArgAbsNorm2 = f66
+fArgAbsNorm2L = f67
+fArgAbsNorm3 = f68
+fArgAbsNorm4 = f69
+fArgAbsNorm11 = f70
+
+fRes = f71
+fResH = f72
+fResL = f73
+fRes1H = f74
+fRes1L = f75
+fRes1Hd = f76
+fRes2H = f77
+fRes2L = f78
+fRes3H = f79
+fRes3L = f80
+fRes4 = f81
+
+fTT = f82
+fTH = f83
+fTL = f84
+fTT2 = f85
+fTH2 = f86
+fTL2 = f87
+
+f1p5 = f88 // presumably the constant 1.5 (b in y = |x|/a - b) - TODO confirm
+f2p0 = f89 // presumably the constant 2.0 (b in y = |x|/a - b) - TODO confirm
+fTiny = f90 // presumably tiny_value of the saturation path - TODO confirm
+fSignumX = f91
+fArgAbsNorm4X = f92
+
+// Data tables
+//==============================================================
+RODATA
+
+.align 16
+LOCAL_OBJECT_START(tanhl_data)
+
+////////// Main tables: A3, A2H..A0L, then A25..A14 for each range ///////////
+_0p125_to_0p25_data: // exp = 2^-3
+// Coefficients in y = |x|/0.125 - 1.5 for the tanh(x), 1/8 <= |x| < 1/4
+data8 0x93D27D6AE7E835F8, 0x0000BFF4 //A3 = -5.6389704216278164626050408239e-04
+data8 0xBF66E8668A78A8BC //A2H = -2.7963640930198357253955165902e-03
+data8 0xBBD5384EFD0E7A54 //A2L = -1.7974001252014762983581666453e-20
+data8 0x3FBEE69E31DB6156 //A1H = 1.2070645062647619716322822114e-01
+data8 0x3C43A0B4E24A3DCA //A1L = 2.1280460108882061756490131241e-18
+data8 0x3FC7B8FF903BF776 //A0H = 1.8533319990813951205765874874e-01
+data8 0x3C593F1A61986FD4 //A0L = 5.4744612262799573374268254539e-18
+data8 0xDB9E6735560AAE5A, 0x0000BFA3 //A25 = -3.4649731131719154051239475238e-28
+data8 0xF0DDE953E4327704, 0x00003FA4 //A24 = 7.6004173864565644629900702857e-28
+data8 0x8532AED11DEC5612, 0x00003FAB //A23 = 5.3798235684551098715428515761e-26
+data8 0xAEF72A34D88B0038, 0x0000BFAD //A22 = -2.8267199091484508912273222600e-25
+data8 0x9645EF1DCB759DDD, 0x0000BFB2 //A21 = -7.7689413112830095709522203109e-24
+data8 0xA5D12364E121F70F, 0x00003FB5 //A20 = 6.8580281614531622113161030550e-23
+data8 0x9CF166EA815AC705, 0x00003FB9 //A19 = 1.0385615003184753213024737634e-21
+data8 0x852B1D0252498752, 0x0000BFBD //A18 = -1.4099753997949827217635356478e-20
+data8 0x9270F5716D25EC9F, 0x0000BFC0 //A17 = -1.2404055949090177751123473821e-19
+data8 0xC216A9C4EEBDDDCA, 0x00003FC4 //A16 = 2.6303900460415782677749729120e-18
+data8 0xDCE944D89FF592F2, 0x00003FC6 //A15 = 1.1975620514752377092265425941e-17
+data8 0x83C8DDF213711381, 0x0000BFCC //A14 = -4.5721980583985311263109531319e-16
+LOCAL_OBJECT_END(tanhl_data)
+
+LOCAL_OBJECT_START(_0p25_to_0p5_data)
+// Coefficients in y = |x|/0.25 - 1.5 for the tanh(x), 1/4 <= |x| < 1/2
+data8 0xB6E27B747C47C8AD, 0x0000BFF6 //A3 = -2.7905990032063258105302045572e-03
+data8 0xBF93FD54E226F8F7 //A2H = -1.9521070769536099515084615064e-02
+data8 0xBC491BC884F6F18A //A2L = -2.7222721075104525371410300625e-18
+data8 0x3FCBE3FBB015A591 //A1H = 2.1789499376181400980279079249e-01
+data8 0x3C76AFC2D1AE35F7 //A1L = 1.9677459707672596091076696742e-17
+data8 0x3FD6EF53DE8C8FAF //A0H = 3.5835739835078589399230963863e-01
+data8 0x3C8E2A1C14355F9D //A0L = 5.2327050592919416045278607775e-17
+data8 0xF56D363AAE3BAD53, 0x00003FBB //A25 = 6.4963882412697389947564301120e-21
+data8 0xAD6348526CEEB897, 0x0000BFBD //A24 = -1.8358149767147407353343152624e-20
+data8 0x85D96A988565FD65, 0x0000BFC1 //A23 = -2.2674950494950919052759556703e-19
+data8 0xD52CAF6B1E4D9717, 0x00003FC3 //A22 = 1.4445269502644677106995571101e-18
+data8 0xBD7E1BE5CBEF7A01, 0x00003FC5 //A21 = 5.1362075721080004718090799595e-18
+data8 0xAE84A9B12ADD6948, 0x0000BFC9 //A20 = -7.5685210830925426342786733068e-17
+data8 0xEAC2D5FCF80E250C, 0x00003FC6 //A19 = 1.2726423522879522181100392135e-17
+data8 0xE0D2A8AC8C2EDB95, 0x00003FCE //A18 = 3.1200443098733419749016380203e-15
+data8 0xB22F0AB7B417F78E, 0x0000BFD0 //A17 = -9.8911854977385933809488291835e-15
+data8 0xE25A627BAEFFA7A4, 0x0000BFD3 //A16 = -1.0052095388666003876301743498e-13
+data8 0xC90F32EC4A17F908, 0x00003FD6 //A15 = 7.1430637679768183097897337145e-13
+data8 0x905F6F124AF956B1, 0x00003FD8 //A14 = 2.0516607231389483452611375485e-12
+LOCAL_OBJECT_END(_0p25_to_0p5_data)
+
+LOCAL_OBJECT_START(_0p5_to_1_data)
+// Coefficients in y = |x|/0.5 - 1.5 for the tanh(x), 1/2 <= |x| < 1
+data8 0xAB402BE491EE72A7, 0x00003FF7 //A3 = 5.2261556931080934657023772945e-03
+data8 0xBFB8403D3DDA87BE //A2H = -9.4730212784752659826992271519e-02
+data8 0xBC6FF7BC2AB71A8B //A2L = -1.3863786398568460929625760740e-17
+data8 0x3FD3173B1EFA6EF4 //A1H = 2.9829290414066567116435635398e-01
+data8 0x3C881E4DCABDE840 //A1L = 4.1838710466827119847963316219e-17
+data8 0x3FE45323E552F228 //A0H = 6.3514895238728730220145735075e-01
+data8 0x3C739D5832BF7BCF //A0L = 1.7012977006567066423682445459e-17
+data8 0xF153980BECD8AE12, 0x00003FD0 //A25 = 1.3396313991261493342597057700e-14
+data8 0xEC9ACCD245368129, 0x0000BFD3 //A24 = -1.0507358886349528807350792383e-13
+data8 0x8AE6498CA36D2D1A, 0x00003FD4 //A23 = 1.2336759149738309660361813001e-13
+data8 0x8DF02FBF5AC70E64, 0x00003FD7 //A22 = 1.0085317723615282268326194551e-12
+data8 0x9E15C7125DA204EE, 0x0000BFD9 //A21 = -4.4930478919612724261941857560e-12
+data8 0xA62C6F39BDDCEC1C, 0x00003FD7 //A20 = 1.1807342457875095150035780314e-12
+data8 0xDFD8D65D30F80F52, 0x00003FDC //A19 = 5.0896919887121116317817665996e-11
+data8 0xB795AFFD458F743E, 0x0000BFDE //A18 = -1.6696932710534097241291327756e-10
+data8 0xFEF30234CB01EC89, 0x0000BFDD //A17 = -1.1593749714588103589483091370e-10
+data8 0xA2F638356E13761E, 0x00003FE2 //A16 = 2.3714062288761887457674853605e-09
+data8 0xC429CC0D031E4FD5, 0x0000BFE3 //A15 = -5.7091025466377379046489586383e-09
+data8 0xC78363FF929EFF62, 0x0000BFE4 //A14 = -1.1613199289622686725595739572e-08
+LOCAL_OBJECT_END(_0p5_to_1_data)
+
+LOCAL_OBJECT_START(_1_to_2_data)
+// Coefficients in y = |x|/1.0 - 1.5 for the tanh(x), 1 <= |x| < 2.0
+data8 0xB3D8FB48A548D99A, 0x00003FFB //A3 = 8.7816203264683800892441646129e-02
+data8 0xBFC4EFBD8FB38E3B //A2H = -1.6356629864377389416141284073e-01
+data8 0xBC77687FD8087B23 //A2L = -2.0303377679446772162287121190e-17
+data8 0x3FC72165282C6F72 //A1H = 1.8070663892364852154415189034e-01
+data8 0x3C64E01F7A76D777 //A1L = 9.0532964466719018524360408402e-18
+data8 0x3FECF6F9786DF577 //A0H = 9.0514825364486639625027919465e-01
+data8 0x3C8834EDCE71A65B //A0L = 4.1992023813070331863928976191e-17
+data8 0xC3EEEB3EFA688094, 0x00003FE2 //A25 = 2.8512044383274095705865793485e-09
+data8 0x88461973672AEB12, 0x0000BFE1 //A24 = -9.9152258079470849685057375343e-10
+data8 0xFC2AF9950DC5027E, 0x0000BFE4 //A23 = -1.4678101918123116001692289670e-08
+data8 0x9C80CA742F89B7B5, 0x00003FE6 //A22 = 3.6438714992394138274843759814e-08
+data8 0xA0B3D7FAA606260A, 0x0000BFE6 //A21 = -3.7416469848124568887944709492e-08
+data8 0xDA5858432FBD9D9D, 0x0000BFE6 //A20 = -5.0837429421503142141842414978e-08
+data8 0xB0244D1E1AE9C1B0, 0x00003FE9 //A19 = 3.2808967255272595749004827841e-07
+data8 0xC8D3109ACF740738, 0x0000BFEA //A18 = -7.4812945767507614821609020680e-07
+data8 0xBB0F3440EEA55BBF, 0x00003FEA //A17 = 6.9685053481643125932497676583e-07
+data8 0xC13A8B08D8576C19, 0x00003FEB //A16 = 1.4396658837712390333960587173e-06
+data8 0xFF3A1163CC5522A1, 0x0000BFED //A15 = -7.6063522055104010298762276148e-06
+data8 0x8672AF27EB0823B7, 0x00003FEF //A14 = 1.6027448793338500004496520337e-05
+LOCAL_OBJECT_END(_1_to_2_data)
+
+LOCAL_OBJECT_START(_2_to_3p25_data)
+// Coefficients in y = |x|/2.0 - 1.5 for the tanh(x), 2 <= |x| < 3.25
+data8 0xD45657BEC559E366, 0x00003FFA //A3 = 5.1840155367548909799883161889e-02
+data8 0xBFA41B109CA6AB81 //A2H = -3.9268988726084870510835145296e-02
+data8 0xBC2C3D708A4E56C5 //A2L = -7.6544669252238280132415018518e-19
+data8 0x3F9434A517BBC5F4 //A1H = 1.9732074330880380874653212686e-02
+data8 0x3C3ED62DD9585229 //A1L = 1.6716574468135097509707871438e-18
+data8 0x3FEFD77D111A0AFF //A0H = 9.9505475368673035330147058630e-01
+data8 0x3C9C415E151C6CA5 //A0L = 9.8030409604070051319822874013e-17
+data8 0xB1596391D4534D52, 0x00003FEC //A25 = 2.6427086526487251988631279067e-06
+data8 0xC4DC44E243D1AF5F, 0x00003FEF //A24 = 2.3467591534149209236830008333e-05
+data8 0xAED5786023982BB8, 0x00003FF0 //A23 = 4.1683642395739762658623742687e-05
+data8 0xCF39926C9FBC6A10, 0x00003FF0 //A22 = 4.9406263949321793291856681624e-05
+data8 0xA255A72359928142, 0x00003FF0 //A21 = 3.8703580278108400672236161973e-05
+data8 0xA2E573B9FC332C0D, 0x00003FED //A20 = 4.8546879618263642155709302480e-06
+data8 0x82C7BD01830ACA93, 0x00003FF0 //A19 = 3.1180436075031301077175550468e-05
+data8 0xB38AF4C76E96444B, 0x0000BFF0 //A18 = -4.2806338675404452784440167120e-05
+data8 0xEC08FF0FB194464C, 0x00003FF0 //A17 = 5.6275163156181928637744511210e-05
+data8 0xB850825D9E235135, 0x0000BFF0 //A16 = -4.3943998628289568813056822585e-05
+data8 0xF98436E838763687, 0x0000BFEF //A15 = -2.9744680263523220185672219686e-05
+data8 0xE1851A2D00737A5D, 0x00003FF2 //A14 = 2.1507256570895163202182573369e-04
+LOCAL_OBJECT_END(_2_to_3p25_data)
+
+LOCAL_OBJECT_START(_4_to_6p5_data)
+// Coefficients in y = |x|/4.0 - 1.5 for the tanh(x), 4 <= |x| < 6.5
+data8 0x896FDBD321A0BE58, 0x00003FF5 //A3 = 1.0485606995331904734870550114e-03
+data8 0xBF39C522B95A37D6 //A2H = -3.9321992640217512306882730044e-04
+data8 0xBBA9B3EC39A45338 //A2L = -2.7213922673282819034134988241e-21
+data8 0x3F19C5377A48B5AD //A1H = 9.8306189621330793766869338146e-05
+data8 0x3BCAFCB1D08A891C //A1L = 1.1429476443042275163117526657e-20
+data8 0x3FEFFFE63ABE253B //A0H = 9.9998771165079547440512897083e-01
+data8 0x3C9BB74C4EE0D16F //A0L = 9.6159219890436197391279544561e-17
+data8 0x8D86121D469AFA7E, 0x0000BFEF //A25 = -1.6870941388985743600323604423e-05
+data8 0x9D3656A36593C5C4, 0x00003FEF //A24 = 1.8741161763079973068909254398e-05
+data8 0xDCD772D5BF9ADB96, 0x00003FF0 //A23 = 5.2652739523018349983563695656e-05
+data8 0xFF79ADCF0DCBCC2D, 0x00003FF1 //A22 = 1.2182012003034659966028035977e-04
+data8 0x84D24E394DEFD0D2, 0x00003FF1 //A21 = 6.3334229517535065590380468696e-05
+data8 0xA66B56BFD2782544, 0x00003FF1 //A20 = 7.9354902476954571736114945842e-05
+data8 0xFB15771FBF3155FE, 0x0000BFEE //A19 = -1.4965763624796745134798717707e-05
+data8 0xC774790126BE54C3, 0x00003FEF //A18 = 2.3776885435831770523136610539e-05
+data8 0x825A13DACB8C68CD, 0x00003FEF //A17 = 1.5539153272890695426189818556e-05
+data8 0xCFF96E6810AACE27, 0x0000BFF1 //A16 = -9.9169893703251156059893890295e-05
+data8 0x8A85D2061B865024, 0x00003FF3 //A15 = 2.6421115104625621420758344535e-04
+data8 0x922EC6F3CFE0496E, 0x0000BFF4 //A14 = -5.5764283474946207558456581668e-04
+LOCAL_OBJECT_END(_4_to_6p5_data)
+
+LOCAL_OBJECT_START(_8_to_13_data)
+// Coefficients in y = |x|/8.0 - 1.5 for the tanh(x), 8 <= |x| < 13
+data8 0xDD6050A898303460, 0x00003FE6 //A3 = 5.1543170295688189081352133793e-08
+data8 0xBE44C1078FDBADC0 //A2H = -9.6643444318955652627581125180e-09
+data8 0xBAF95FCAA6DBBA6F //A2L = -1.3118146684038113473094275420e-24
+data8 0x3E14C1078FE26748 //A1H = 1.2080430540780827633746315479e-09
+data8 0x3A88168082F37D95 //A1L = 9.7290246966246404028418245094e-27
+data8 0x3FEFFFFFFFF59F7C //A0H = 9.9999999992449728480892190419e-01
+data8 0x3C7C068EBC5C2EEB //A0L = 2.4308346546749583521003998922e-17
+data8 0x9DC155C77A6C46E5, 0x00003FF2 //A25 = 1.5044709695520252096006763473e-04
+data8 0xF2F9E09CA47F46E9, 0x00003FF3 //A24 = 4.6344010077547944693833282056e-04
+data8 0xCBFD67E704734BC8, 0x00003FF4 //A23 = 7.7815958662026429864083620142e-04
+data8 0xC18DC821CD67E621, 0x00003FF4 //A22 = 7.3834928521190855055818897104e-04
+data8 0x8AF72BCAB05A296E, 0x00003FF4 //A21 = 5.3011135848666430331904214879e-04
+data8 0xC2E73BE9B9AB4007, 0x00003FF2 //A20 = 1.8587423129049905806822275188e-04
+data8 0xE7E8C2058E2FF9F7, 0x00003FF1 //A19 = 1.1058292891321512917337425414e-04
+data8 0xC46309F52E429F97, 0x0000BFF0 //A18 = -4.6822278664829811025251866877e-05
+data8 0x81966C1E007E9BEB, 0x00003FF1 //A17 = 6.1792176836716291200611553354e-05
+data8 0x8CEDC4BEFCAB9A7E, 0x0000BFF1 //A16 = -6.7200080564674449915571760779e-05
+data8 0x8B64E9FA53210018, 0x00003FF1 //A15 = 6.6468331917938095774361868182e-05
+data8 0x82DEDAA539A3A3F1, 0x0000BFF1 //A14 = -6.2403928644276709411156885292e-05
+LOCAL_OBJECT_END(_8_to_13_data)
+
+LOCAL_OBJECT_START(_16_to_22p8_data)
+// Polynomial coefficients for the tanh(x), 16 <= |x| < 22.88
+data8 0x992C00F33DDE804D, 0x00003FCE //A3 = 2.1256869805798788337547274131e-15
+data8 0x3C8D42EA28102760 //A2H = 5.0760412270332007485198379096e-17
+data8 0x391A747B43B072DD //A2L = 1.2737621993898125881520341053e-33
+data8 0x3C309BC5C3CB4D5F //A1H = 9.0034785192019775952205276560e-19
+data8 0x38A8EF3B5C9DCE71 //A1L = 9.3793162715476168397242934494e-36
+data8 0x3FF0000000000000 //A0H = 1.0000000000000000000000000000e+00
+data8 0x3BACC66AFD5CA22A //A0L = 3.0466790472070565954180861749e-21
+data8 0xF020FB351C2F37CB, 0x00003FF1 //A25 = 1.1450235038836625246604146870e-04
+data8 0xBE80596C51302A7B, 0x00003FF4 //A24 = 7.2670503421185030764546828414e-04
+data8 0x91343CF8577E0131, 0x00003FF6 //A23 = 2.2156380512949603402001207105e-03
+data8 0x8D029A8679641286, 0x00003FF7 //A22 = 4.3032888906494613055765544559e-03
+data8 0xC3713F64D8DC4BAB, 0x00003FF7 //A21 = 5.9644279041951657632420721490e-03
+data8 0xCD678C455A5D06C2, 0x00003FF7 //A20 = 6.2684473911812928601693994403e-03
+data8 0xA9E1C825BDCEEBCC, 0x00003FF7 //A19 = 5.1843859941826642445235686826e-03
+data8 0xE29C919AD93F6EB9, 0x00003FF6 //A18 = 3.4578185539872939928152204329e-03
+data8 0xF7E615A75994A607, 0x00003FF5 //A17 = 1.8913175041916131006881986311e-03
+data8 0xE102EFE0F7F2B2AD, 0x00003FF4 //A16 = 8.5835064987089641065525269712e-04
+data8 0xAAD62946DEE96996, 0x00003FF3 //A15 = 3.2584489313998677644253007210e-04
+data8 0xDA2470DE110B293E, 0x00003FF1 //A14 = 1.0401837693241806604296821650e-04
+LOCAL_OBJECT_END(_16_to_22p8_data)
+
+LOCAL_OBJECT_START(_3p25_to_4_data)
+// Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4
+data8 0xE9E07240432926E6, 0x00003FF7 //A3 = 7.1373517862636557382403555215e-03
+data8 0xBF75F495227AF306 //A2H = -5.3602052282115727338540622782e-03
+data8 0xBBBE92D355A6B716 //A2L = -6.4741983326810209847018826624e-21
+data8 0x3F65F85AD510B690 //A1H = 2.6819013660517934671823070403e-03
+data8 0x3C159A0B73E6EC01 //A1L = 2.9275813076637328121849573333e-19
+data8 0x3FEFFA81708A0B42 //A0H = 9.9932929973906703402519724477e-01
+data8 0x3C66857246C19DC6 //A0L = 9.7670460995685717424398031188e-18
+data8 0xE6B6B8365B1E4D6C, 0x00003FE3 //A25 = 6.7146538162212081470554423396e-09
+data8 0xE0453CEEF483A510, 0x00003FE2 //A24 = 3.2635647369924061614015292015e-09
+data8 0x9C7D83B56E92CF1A, 0x00003FE5 //A23 = 1.8217867585545497089756353348e-08
+data8 0xA94635C48ABA9EB4, 0x0000BFE4 //A22 = -9.8530586070049930796756799547e-09
+data8 0xB1B0C14443067646, 0x00003FE5 //A21 = 2.0685890807654992387562340307e-08
+data8 0x9C6E549781E293C3, 0x00003FDE //A20 = 1.4227314592865135171341122138e-10
+data8 0xB0CBFCE7C80F57A7, 0x0000BFE7 //A19 = -8.2327438416004542109809245219e-08
+data8 0xB151AB3876E896E1, 0x00003FE9 //A18 = 3.3028241036175815328309577940e-07
+data8 0xFCF3A5C1A5CB7EEE, 0x0000BFEA //A17 = -9.4231869277542043001280640966e-07
+data8 0x96A9016C7C95BEDA, 0x00003FEC //A16 = 2.2450115975007100522962781833e-06
+data8 0x9B9B0A3901DEC05B, 0x0000BFED //A15 = -4.6374089937147736266514566049e-06
+data8 0x8987DF26A6789CCF, 0x00003FEE //A14 = 8.1974714257536543772040700977e-06
+LOCAL_OBJECT_END(_3p25_to_4_data)
+
+LOCAL_OBJECT_START(_6p5_to_8_data)
+// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0
+data8 0xA11C8A63815E5657, 0x00003FEF //A3 = 1.9205985861286093001394561449e-05
+data8 0xBEDE355AD6CB61D8 //A2H = -7.2022479400070228499307345427e-06
+data8 0xBB8E6B50B8468A63 //A2L = -8.0518953122203408718779840543e-22
+data8 0x3EBE355B48DCF330 //A1H = 1.8005623902549165889479948488e-06
+data8 0x3B5837550FFA98DA //A1L = 8.0124491698609178046195694087e-23
+data8 0x3FEFFFFF872A91F8 //A0H = 9.9999977492967584424832239165e-01
+data8 0x3C8A43B839B4EB63 //A0L = 4.5561696441306660142461355317e-17
+data8 0xB5BC1948966B8826, 0x0000BFE6 //A25 = -4.2313421330480692560677276010e-08
+data8 0x91D0BE367389BDFC, 0x0000BFE8 //A24 = -1.3580117599617083801153887619e-07
+data8 0xFFD950AF282AB36C, 0x0000BFE8 //A23 = -2.3827784451962439125197203287e-07
+data8 0x959B1770EBB8903A, 0x0000BFE9 //A22 = -2.7866256690165347051403663794e-07
+data8 0xCC78060D1C0CFF3C, 0x0000BFE8 //A21 = -1.9042644867126442102188429523e-07
+data8 0xF8919BAF2E87F31D, 0x0000BFE8 //A20 = -2.3149771783868910586746973299e-07
+data8 0xC5B6AC942A3F2440, 0x00003FE8 //A19 = 1.8413511183396213757149263639e-07
+data8 0xABF1A4703056450A, 0x0000BFEA //A18 = -6.4054099983863829656292958643e-07
+data8 0xBB543D8BDB670453, 0x00003FEB //A17 = 1.3957102903892251890348444989e-06
+data8 0xC9D6F37700C1D092, 0x0000BFEC //A16 = -3.0076451968978522605262647414e-06
+data8 0xCA6EF4BB64E49EC8, 0x00003FED //A15 = 6.0329860989478473738709576062e-06
+data8 0xBE25D0FD069D0A93, 0x0000BFEE //A14 = -1.1333687314965721384777951065e-05
+LOCAL_OBJECT_END(_6p5_to_8_data)
+
+LOCAL_OBJECT_START(_13_to_16_data)
+// Polynomial coefficients for the tanh(x), 13 <= |x| < 16
+data8 0x98176FD2075BDBD5, 0x00003FDB //A3 = 1.7290807363028159200235264756e-11
+data8 0xBD8C8464F76162D1 //A2H = -3.2420263805679445515400340441e-12
+data8 0xBA2D56B508E0F1FD //A2L = -1.8515322669984580704502445180e-28
+data8 0x3D5C8464F761639C //A1H = 4.0525329757100331782338488690e-13
+data8 0x3A0A09D9E328E620 //A1L = 4.1081479300866418212862258651e-29
+data8 0x3FEFFFFFFFFFFF1B //A0H = 9.9999999999997457589273608392e-01
+data8 0x3C9B9B089E9BFD89 //A0L = 9.5776165728054091471814161399e-17
+data8 0xC5395B9EC765BDB7, 0x00003FE6 //A25 = 4.5919803498257974411526879804e-08
+data8 0x9A0F1FCB1DC24C3A, 0x00003FE8 //A24 = 1.4347869798460288751020493795e-07
+data8 0x8AA5C3459FAD0B28, 0x00003FE9 //A23 = 2.5825111356333853968900510087e-07
+data8 0x9578B747988CFF9D, 0x00003FE9 //A22 = 2.7841245127068220034870119246e-07
+data8 0x810DF1A589D9CAF1, 0x00003FE9 //A21 = 2.4038267971021370956311255310e-07
+data8 0x8A00D77B9416EB75, 0x00003FE8 //A20 = 1.2852557749068320312899366352e-07
+data8 0xB2436C4A1849C498, 0x00003FE7 //A19 = 8.3010350873515703893886683374e-08
+data8 0xEA6405B18356600B, 0x00003FE3 //A18 = 6.8216675390299296071261114202e-09
+data8 0xF7606C022194B7E8, 0x00003FE5 //A17 = 2.8798432098264655723769995993e-08
+data8 0xAF4B0C453FCAF34E, 0x0000BFE5 //A16 = -2.0406809167824936143455638336e-08
+data8 0xC324C1F10D5FA7CC, 0x00003FE5 //A15 = 2.2717703170390130238356558599e-08
+data8 0xB34A2E3A4D3B9C31, 0x0000BFE5 //A14 = -2.0872076027950789618606920471e-08
+LOCAL_OBJECT_END(_13_to_16_data)
+
+
+//////// "Tail" tables //////////
+LOCAL_OBJECT_START(_0p125_to_0p25_data_tail)
+// Polynomial coefficients for the tanh(x), 1/8 <= |x| < 1/4
+data8 0x9D7D206E97ADC83A, 0x0000BFCC //A13 = -5.4639895428711257047470806445e-16
+data8 0xA8972B666A845810, 0x00003FD3 //A12 = 7.4869224589947988668562043110e-14
+data8 0x9A5B31511C9F4698, 0x0000BFD4 //A11 = -1.3709586467430093373657009487e-13
+data8 0xCBB8047BCB274982, 0x0000BFDA //A10 = -1.1580074124926108509393610532e-11
+data8 0xF95EB849E5F9247C, 0x00003FDC //A9 = 5.6700173336564916962945623180e-11
+data8 0xE7893404C6A53386, 0x00003FE1 //A8 = 1.6846457582993065168777704528e-09
+data8 0xF2E5C7E2B5F55ECC, 0x0000BFE4 //A7 = -1.4138500046802141367543484859e-08
+data8 0xF43906FF53A002C0, 0x0000BFE8 //A6 = -2.2745017243678613107034288816e-07
+data8 0xC6175D5E47D1D259, 0x00003FEC //A5 = 2.9517899220726077077586632607e-06
+data8 0xE7C2AE92CB36769B, 0x00003FEF //A4 = 2.7628001723157068127646694830e-05
+LOCAL_OBJECT_END(_0p125_to_0p25_data_tail)
+
+LOCAL_OBJECT_START(_0p25_to_0p5_data_tail)
+// Polynomial coefficients for the tanh(x), 1/4 <= |x| < 1/2
+data8 0x9E2972C008B9965E, 0x0000BFDC //A13 = -3.5961854154738002253192260213e-11
+data8 0xC3EABA3D219BEA8A, 0x00003FDB //A12 = 2.2273173303628274478819473067e-11
+data8 0xC50FB68D960D5CD9, 0x00003FE1 //A11 = 1.4338102430978399800743148719e-09
+data8 0xB3BB92499EF2D583, 0x0000BFE3 //A10 = -5.2309100551458044083112632491e-09
+data8 0xBD915BE632F1D04E, 0x0000BFE6 //A9 = -4.4137194873936112573773943707e-08
+data8 0xBC48C813FA819141, 0x00003FE9 //A8 = 3.5070684356359066908197915734e-07
+data8 0xD3E34EA031AC611B, 0x00003FEA //A7 = 7.8934400708919584259192272835e-07
+data8 0x8EAC489D859541CD, 0x0000BFEF //A6 = -1.7007944944124693133572815137e-05
+data8 0x98D4D7E5D1508B8A, 0x00003FEF //A5 = 1.8218924920302265989878708948e-05
+data8 0xAC262F3F8CF49C02, 0x00003FF4 //A4 = 6.5669692402266433496312492412e-04
+LOCAL_OBJECT_END(_0p25_to_0p5_data_tail)
+
+LOCAL_OBJECT_START(_0p5_to_1_data_tail)
+// Polynomial coefficients for the tanh(x), 1/2 <= |x| < 1
+data8 0xDF67FB36FFA2A538, 0x00003FE7 //A13 = 1.0403160796697495720021114635e-07
+data8 0xB7FB80FB5AFA63A4, 0x0000BFE8 //A12 = -1.7134699677764282023124981753e-07
+data8 0xC87625A0BA7D6C5F, 0x0000BFEA //A11 = -7.4677732458471897291461679095e-07
+data8 0x90DA375DD9AF6D79, 0x00003FED //A10 = 4.3169381418023765618186668159e-06
+data8 0x82DFB03317B17316, 0x0000BFED //A9 = -3.9003426534601562552753368105e-06
+data8 0xAA582FD4F3438BB4, 0x0000BFF0 //A8 = -4.0613288845040776435400454867e-05
+data8 0xB1532D8CF763B21C, 0x00003FF2 //A7 = 1.6911021594787399557528570601e-04
+data8 0x82E12AEF7CAB76C6, 0x0000BFEF //A6 = -1.5602059530458172761585925044e-05
+data8 0x83256E3D0FBA5C93, 0x0000BFF6 //A5 = -2.0011324059500451791903108104e-03
+data8 0xCC4AB2EC0965499B, 0x00003FF7 //A4 = 6.2344907419841579664122448353e-03
+LOCAL_OBJECT_END(_0p5_to_1_data_tail)
+
+LOCAL_OBJECT_START(_1_to_2_data_tail)
+// Polynomial coefficients for the tanh(x), 1 <= |x| < 2.0
+data8 0xCCAEE174EAC17F78, 0x0000BFEE //A13 = -1.2200065117856038355953618829e-05
+data8 0xA39DD0981D1A2776, 0x0000BFF0 //A12 = -3.9009204899026604074167603200e-05
+data8 0xB7104FA27FAF80D0, 0x00003FF2 //A11 = 1.7458316338540792661905876072e-04
+data8 0xB219A7274436A734, 0x0000BFF3 //A10 = -3.3969918595931391572998415468e-04
+data8 0xCCD9D03C0C73CECF, 0x00003FF2 //A9 = 1.9536097875337884986025498958e-04
+data8 0x85321EA40CFEEBEE, 0x00003FF5 //A8 = 1.0162031558369402750607778300e-03
+data8 0x81F272C08C308220, 0x0000BFF7 //A7 = -3.9656696618251138315464862909e-03
+data8 0xE8761C6BDEA9ED87, 0x00003FF7 //A6 = 7.0941580558970243020090656343e-03
+data8 0xAE4E9F3691F66877, 0x0000BFF6 //A5 = -2.6597155288710984120834711909e-03
+data8 0xCC8286B331BD8AAA, 0x0000BFF9 //A4 = -2.4964583478826523250880337777e-02
+LOCAL_OBJECT_END(_1_to_2_data_tail)
+
+LOCAL_OBJECT_START(_2_to_3p25_data_tail)
+// Polynomial coefficients for the tanh(x), 2 <= |x| < 3.25
+data8 0x92E1711A3BD6408B, 0x0000BFF4 //A13 = -5.6030514548041036913731470443e-04
+data8 0x8B9BD885FF3E98C5, 0x00003FF5 //A12 = 1.0651304064581604055612602669e-03
+data8 0xD041356C7FA26A22, 0x0000BFF5 //A11 = -1.5888574328066952147023520244e-03
+data8 0xDFA210BE9BE6B7FD, 0x00003FF5 //A10 = 1.7061849060196387827639060629e-03
+data8 0x8ECC3606808028E9, 0x0000BFF4 //A9 = -5.4472999329435778312080340471e-04
+data8 0xD5C053B8EEBD10C8, 0x0000BFF6 //A8 = -3.2615856552479930645151033322e-03
+data8 0xB7BFD63AC5051539, 0x00003FF8 //A7 = 1.1215171059191957498023766643e-02
+data8 0xC367C59D7FA3ADA2, 0x0000BFF9 //A6 = -2.3853193251842394834616848995e-02
+data8 0x9FC9FB890BB053CF, 0x00003FFA //A5 = 3.9010984954739386625695104667e-02
+data8 0xD01D077B42E7ED76, 0x0000BFFA //A4 = -5.0808934425896607486919526567e-02
+LOCAL_OBJECT_END(_2_to_3p25_data_tail)
+
+LOCAL_OBJECT_START(_4_to_6p5_data_tail)
+// Polynomial coefficients for the tanh(x), 4 <= |x| < 6.5
+data8 0x870CCE8C76C52C7E, 0x00003FF5 //A13 = 1.0303499350193060915603525934e-03
+data8 0xE1431E54AD2A738B, 0x0000BFF5 //A12 = -1.7186140560972621669872002486e-03
+data8 0xAB20056533E28734, 0x00003FF6 //A11 = 2.6111615345168277554841545330e-03
+data8 0xECCB91D64718B9BD, 0x0000BFF6 //A10 = -3.6132079169671860943878776041e-03
+data8 0x94771DA3B8C2EB4F, 0x00003FF7 //A9 = 4.5308012699419563988381317896e-03
+data8 0xA7497377E4946F2C, 0x0000BFF7 //A8 = -5.1051915941441437592654444804e-03
+data8 0xA76B2D6FCA088AE9, 0x00003FF7 //A7 = 5.1092120989582196669504468168e-03
+data8 0x928C8961F33C9560, 0x0000BFF7 //A6 = -4.4723196805537430568162704711e-03
+data8 0xDBDDDF6CDE9AB9BE, 0x00003FF6 //A5 = 3.3548994514326736175581084349e-03
+data8 0x896E211733AD9D40, 0x0000BFF6 //A4 = -2.0970183170010094667442967500e-03
+LOCAL_OBJECT_END(_4_to_6p5_data_tail)
+
+LOCAL_OBJECT_START(_8_to_13_data_tail)
+// Polynomial coefficients for the tanh(x), 8 <= |x| < 13
+data8 0xE50C3476BED020AA, 0x00003FF0 //A13 = 5.4609221347524272615754239857e-05
+data8 0xBA16F5F4EDC0EABC, 0x0000BFF0 //A12 = -4.4367239594986428539386662937e-05
+data8 0x8B916C2F002C3D91, 0x00003FF0 //A11 = 3.3275617838067362533536610680e-05
+data8 0xBFE8031097CB4442, 0x0000BFEF //A10 = -2.2877013297722792747267224605e-05
+data8 0xEFE1FFD106B2DA41, 0x00003FEE //A9 = 1.4298129659899553350478452989e-05
+data8 0x86EF1FF403A6622E, 0x0000BFEE //A8 = -8.0426979849841642112688693288e-06
+data8 0x86EF200FD047306B, 0x00003FED //A7 = 4.0213490418736097707257704218e-06
+data8 0xEC22782377882553, 0x0000BFEB //A6 = -1.7593402092805559754997565942e-06
+data8 0xB119DA1DB7C47773, 0x00003FEA //A5 = 6.5975257917246601211360847253e-07
+data8 0xDD6050A7761D67BB, 0x0000BFE8 //A4 = -2.0617268111985310661707082242e-07
+LOCAL_OBJECT_END(_8_to_13_data_tail)
+
+LOCAL_OBJECT_START(_16_to_22p8_data_tail)
+// Polynomial coefficients for the tanh(x), 16 <= |x| < 22.88
+data8 0xEAF4AF87336E81B1, 0x00003FEF //A13 = 2.8008914392791730186582989654e-05
+data8 0xD5B309EA768E2711, 0x00003FED //A12 = 6.3687375204024238267961143128e-06
+data8 0xA4048CA537113538, 0x00003FEB //A11 = 1.2220276227448617951538196845e-06
+data8 0xD3EC78BB3425377D, 0x00003FE8 //A10 = 1.9736934193679794194181457250e-07
+data8 0xE5763CD37440266E, 0x00003FE5 //A9 = 2.6712876934440631473215182284e-08
+data8 0xCECA765EEB4A265F, 0x00003FE2 //A8 = 3.0092031912460315516888139627e-09
+data8 0x99ABF588DF81A52E, 0x00003FDF //A7 = 2.7952722177649984066847682907e-10
+data8 0xB9C78918294A4685, 0x00003FDB //A6 = 2.1120676552098603524020495036e-11
+data8 0xB3A3C42AD539D50F, 0x00003FD7 //A5 = 1.2764169243389521270291967366e-12
+data8 0x86BC347939478174, 0x00003FD3 //A4 = 5.9834437707863962671883176163e-14
+LOCAL_OBJECT_END(_16_to_22p8_data_tail)
+
+LOCAL_OBJECT_START(_3p25_to_4_data_tail)
+// Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4
+data8 0xBE9A2BE19F21BA1C, 0x0000BFEE //A13 = -1.1360778336288065244475976873e-05
+data8 0xF84910F515BDB014, 0x00003FED //A12 = 7.3994819819577018481862729782e-06
+data8 0xC4C84FB788AA4007, 0x00003FEF //A11 = 2.3458298013663976251972482656e-05
+data8 0x86CC6243C170E5ED, 0x0000BFF2 //A10 = -1.2855374755847770638424932233e-04
+data8 0xD3065AC539ABABFF, 0x00003FF3 //A9 = 4.0249790677367806832685138089e-04
+data8 0x82C4413795EC381B, 0x0000BFF5 //A8 = -9.9767013652382759950854031514e-04
+data8 0x88D588720888899A, 0x00003FF6 //A7 = 2.0879228705174076794011525274e-03
+data8 0xF4CA066137741469, 0x0000BFF6 //A6 = -3.7351861548964870836350490741e-03
+data8 0xB998746D56E81737, 0x00003FF7 //A5 = 5.6639259807333999973200378964e-03
+data8 0xE93FB2F48233275B, 0x0000BFF7 //A4 = -7.1181892208343798194003322900e-03
+LOCAL_OBJECT_END(_3p25_to_4_data_tail)
+
+LOCAL_OBJECT_START(_6p5_to_8_data_tail)
+// Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0
+data8 0xA6881D7D21774BFD, 0x00003FEF //A13 = 1.9852125640303530752913966680e-05
+data8 0x875E983AA042E605, 0x0000BFF0 //A12 = -3.2274606306629334402383651599e-05
+data8 0xCB19E01E94FC133C, 0x00003FF0 //A11 = 4.8423069963831314927026982707e-05
+data8 0x8BA5E8D9E72D56B2, 0x0000BFF1 //A10 = -6.6589395655200734237190902534e-05
+data8 0xAE91F647ED4E46B2, 0x00003FF1 //A9 = 8.3241541003842930001632190258e-05
+data8 0xC465A7E0B22F884E, 0x0000BFF1 //A8 = -9.3649431639051891449916386619e-05
+data8 0xC4666148AA01A4D7, 0x00003FF1 //A7 = 9.3650780646160216748407869111e-05
+data8 0xABD9E63D181B0C6C, 0x0000BFF1 //A6 = -8.1945023256769295802996591839e-05
+data8 0x80E38B18E509387A, 0x00003FF1 //A5 = 6.1458988764532931141264026311e-05
+data8 0xA11C80E20ADA5A64, 0x0000BFF0 //A4 = -3.8411937140983728563216440713e-05
+LOCAL_OBJECT_END(_6p5_to_8_data_tail)
+
+LOCAL_OBJECT_START(_13_to_16_data_tail)
+// Polynomial coefficients for the tanh(x), 13 <= |x| < 16
+data8 0x9D6CCDA4767CA6D9, 0x00003FE5 //A13 = 1.8326683535066775712253572575e-08
+data8 0xFFAF154F334BF403, 0x0000BFE4 //A12 = -1.4882762852665077172347508377e-08
+data8 0xBFC68FA7C61B6C17, 0x00003FE4 //A11 = 1.1162810813806544919835662888e-08
+data8 0x83D8439A6B19A015, 0x0000BFE4 //A10 = -7.6743763372603959795701788561e-09
+data8 0xA4CE5BE9DC6A2962, 0x00003FE3 //A9 = 4.7964885012772346158732715382e-09
+data8 0xB96826C0697253CA, 0x0000BFE2 //A8 = -2.6980246373950994097953903952e-09
+data8 0xB96826CADDC00E35, 0x00003FE1 //A7 = 1.3490123232313844006540534789e-09
+data8 0xA23B21F1155DF322, 0x0000BFE0 //A6 = -5.9019289132168830718664922372e-10
+data8 0xF358B2E9A50C349C, 0x00003FDE //A5 = 2.2132233424669131155945897524e-10
+data8 0x98176FD2074C1D77, 0x0000BFDD //A4 = -6.9163229452106125388824134881e-11
+LOCAL_OBJECT_END(_13_to_16_data_tail)
+
+LOCAL_OBJECT_START(_0_to_1o8_data)
+// Polynomial coefficients for the tanh(x), 0.0 <= |x| < 0.125
+data8 0xBA0EC1879495150B, 0x0000BFF5 // A15 = -1.4195071451378679802688367813e-03
+data8 0xEB5A82898D1BCBA4, 0x00003FF6 // A13 = 3.5912102408030526706365632879e-03
+data8 0x91370DAFE0B64438, 0x0000BFF8 // A11 = -8.8632234251336964576640807982e-03
+data8 0xB327A435358F1200, 0x00003FF9 // A9 = 2.1869488447622383899199238857e-02
+data8 0xDD0DD0DD07A0775F, 0x0000BFFA // A7 = -5.3968253967902161405327069187e-02
+data8 0x888888888887C299, 0x00003FFC // A5 = 1.3333333333333264660338062012e-01
+data8 0xAAAAAAAAAAAAAA98, 0x0000BFFD // A3 = -3.3333333333333333282255458755e-01
+LOCAL_OBJECT_END(_0_to_1o8_data)
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(tanhl)
+// Entry: filter special args, turn the exponent of x into a table interval number
+{ .mfi
+ alloc r32 = ar.pfs, 0, 21, 0, 0
+ fmerge.se fArgAbsNorm = f1, f8 // significand of x with sign and exponent of 1.0
+ addl rSignBit = 0x20000, r0 // Mask of the sign bit in getf.exp result
+}
+{ .mlx
+ addl rDataPtr = @ltoff(tanhl_data), gp // Get common data ptr
+ movl r1p5 = 0x3FF8000000000000 // 1.5 in dbl repres.
+};;
+
+{ .mfi
+ getf.exp rArgExp = f8 // Get arg exponent
+ fclass.m p6,p0 = f8, 0xEF // Filter 0, denormals and specials
+ // 0xEF = @qnan|@snan|@pos|@neg|@zero|@unorm|@inf
+ addl rBias = 0xfffc, r0 // Value to subtract from exp
+ // to get actual interval number
+}
+{ .mfi
+ ld8 rDataPtr = [rDataPtr] // Get real common data pointer
+ fma.s1 fArgSqr = f8, f8, f0 // x^2 (for [0;1/8] path)
+ addl r2to4 = 0x10000, r0 // biased exponent
+ // for [2;4) binary interval
+};;
+
+{ .mfi
+ getf.sig rArgSig = f8 // Get arg significand
+ fcmp.lt.s1 p15, p14 = f8, f0 // Is arg negative/positive?
+ addl rSaturation = 0xb70, r0 // First 12 bits of
+ // saturation value signif.
+}
+{ .mfi
+ setf.d f1p5 = r1p5 // 1.5 construction
+ fma.s1 f2p0 = f1,f1,f1 // 2.0 construction
+ addl r1625Sign = 0xd01, r0 // First 12 bits of
+ // 1.625 value signif.
+ // 1.625 significand used to filter values greater than 3.25, 6.5, 13.0
+};;
+
+{ .mfi
+ addl rTailDataPtr = 0xB00, rDataPtr // Pointer to "tail" data
+ fmerge.s fSignumX = f8, f1 // signum(x)
+ andcm rArgExp = rArgExp, rSignBit // Remove sign of exp
+}
+{ .mfb
+ addl rTiny = 0xf000, r0 // Exponent of tiny value for saturation path
+ nop.f 0
+(p6) br.cond.spnt tanhl_spec // Branch to zero, denorm & specs
+};;
+
+{ .mfi
+ sub rInterval = rArgExp, rBias // Get actual interval number
+ nop.f 0
+ shr.u rArgSig = rArgSig, 52 // Leave only top 12 bits of signif.
+}
+{ .mfi
+ adds rShiftedDataPtr = 0x10, rDataPtr // Second ptr to data
+ nop.f 0
+ cmp.ge p8, p10 = rArgExp, r2to4 // If exp >= 2to4 interval?
+};;
+
+{ .mfi
+(p8) cmp.le p8, p10 = r1625Sign, rArgSig // If signd is greater
+ // than 1.625? (arg is at one of binary subranges)
+ nop.f 0
+ shl rOffset = rInterval, 8 // Make offset from
+ // interval number
+}
+{ .mfi
+ cmp.gt p9, p0 = 0x0, rInterval // If interval is less than 0
+ // (means arg is in [0; 1/8])
+ nop.f 0
+ cmp.eq p7, p0 = 0x7, rInterval // If arg is in [16;32) interv.?
+};;
+
+{ .mfi
+(p8) adds rOffset = 0x400, rOffset // Add additional offset
+ // (arg is at one of binary subranges)
+ fma.s1 fArgCube = fArgSqr, f8, f0 // x^3 (for [0;1/8] path)
+ shl rTailOffset = rInterval, 7 // Make offset to "tail" data
+ // from interval number
+}
+{ .mib
+ setf.exp fTiny = rTiny // Construct "tiny" value
+ // for saturation path
+ cmp.ltu p11, p0 = 0x7, rInterval // if |arg| >= 32
+(p9) br.cond.spnt _0_to_1o8
+};;
+
+{ .mfi
+ add rAddr1 = rDataPtr, rOffset // Get address for
+ // interval data
+ nop.f 0
+ shl rTailAddOffset = rInterval, 5 // Offset to interval
+ // "tail" data
+}
+{ .mib
+ add rAddr2 = rShiftedDataPtr, rOffset // Get second
+ // address for interval data
+(p7) cmp.leu p11, p0 = rSaturation, rArgSig // if arg is
+ // in [22.8;32] interval
+(p11) br.cond.spnt _saturation // Branch to Saturation path
+};;
+// Load main-table and "tail" coefficients; shift the normalized argument
+{ .mmi
+ ldfe fA3 = [rAddr1], 0x90 // Load A3, then skip to A20
+ ldfpd fA2H, fA2L = [rAddr2], 16 // Load A2High, A2Low
+ add rTailOffset = rTailOffset, rTailAddOffset // "Tail" offset
+};;
+
+{ .mmi
+ ldfe fA20 = [rAddr1], 16 // Load A20
+ ldfpd fA1H, fA1L = [rAddr2], 16 // Load A1High, A1Low
+(p8) adds rTailOffset = 0x280, rTailOffset // Additional offset
+ // (arg is at one of binary subranges)
+};;
+
+{ .mmi
+ ldfe fA19 = [rAddr1], 16 // Load A19
+ ldfpd fA0H, fA0L = [rAddr2], 16 // Load A0High, A0Low
+ add rTailAddr1 = rTailDataPtr, rTailOffset // First tail
+ // data address
+};;
+
+.pred.rel "mutex",p8,p10
+{ .mfi
+ ldfe fA18 = [rAddr1], 16 // Load A18
+(p8) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f2p0 // Subtract 2.0
+ // (arg is at one of binary subranges)
+ adds rTailAddr2 = 0x10, rTailAddr1 // Second tail
+ // data address
+}
+{ .mfi
+ ldfe fA25 = [rAddr2], 16 // Load A25
+(p10) fms.s1 fArgAbsNorm = fArgAbsNorm, f1, f1p5 // Subtract 1.5
+ // from normalized arg
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe fA17 = [rAddr1], 16 // Load A17
+ ldfe fA24 = [rAddr2], 16 // Load A24
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe fA16 = [rAddr1], 16 // Load A16
+ ldfe fA23 = [rAddr2], 16 // Load A23
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe fA15 = [rAddr1], 16 // Load A15
+ ldfe fA22 = [rAddr2], 16 // Load A22
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe fA14 = [rAddr1], 16 // Load A14
+ ldfe fA21 = [rAddr2], 16 // Load A21
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe fA13 = [rTailAddr1], 32 // Load A13
+ fms.s1 fArgAbsNorm2 = fArgAbsNorm, fArgAbsNorm, f0 // x^2
+ nop.i 0
+}
+{ .mfi
+ ldfe fA12 = [rTailAddr2], 32 // Load A12
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe fA11 = [rTailAddr1], 32 // Load A11
+ fma.s1 fRes3H = fA3, fArgAbsNorm, fA2H // (A3*x+A2)*x^2
+ nop.i 0
+}
+{ .mfi
+ ldfe fA10 = [rTailAddr2], 32 // Load A10
+ fma.s1 fTH = fA3, fArgAbsNorm, f0 // (A3*x+A2)*x^2
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe fA9 = [rTailAddr1], 32 // Load A9
+ fma.s1 fTT2 = fA1L, fArgAbsNorm, f0 // A1*x+A0
+ nop.i 0
+}
+{ .mfi
+ ldfe fA8 = [rTailAddr2], 32 // Load A8
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe fA7 = [rTailAddr1], 32 // Load A7
+ ldfe fA6 = [rTailAddr2], 32 // Load A6
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe fA5 = [rTailAddr1], 32 // Load A5
+ ldfe fA4 = [rTailAddr2], 32 // Load A4
+ nop.i 0
+};;
+// Evaluate polynomial: A0..A3 terms as high/low pairs, A4..A25 as one tail sum
+{ .mfi
+ nop.m 0
+ fms.s1 fArgAbsNorm2L = fArgAbsNorm, fArgAbsNorm, fArgAbsNorm2
+ // Low part of x^2 (delta)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fArgAbsNorm4 = fArgAbsNorm2, fArgAbsNorm2, f0 // x^4
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 fRes3L = fA2H, f1, fRes3H // (A3*x+A2)*x^2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 fArgAbsNorm3 = fArgAbsNorm2, fArgAbsNorm, f0 // x^3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fTH2 = fA1H, fArgAbsNorm, fTT2 // A1*x+A0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA23 = fA24, fArgAbsNorm, fA23 // Polynomial tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA21 = fA22, fArgAbsNorm, fA21 // Polynomial tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA12 = fA13, fArgAbsNorm, fA12 // Polynomial tail
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes3L = fRes3L, f1, fTH // (A3*x+A2)*x^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA19 = fA20, fArgAbsNorm, fA19 // Polynomial tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1H = fTH2, f1, fA0H // A1*x+A0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fTL2 = fA1H, fArgAbsNorm, fTH2 // A1*x+A0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA8 = fA9, fArgAbsNorm, fA8 // Polynomial tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA10 = fA11, fArgAbsNorm, fA10 // Polynomial tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA16, fArgAbsNorm, fA15 // Polynomial tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA17 = fA18, fArgAbsNorm, fA17 // Polynomial tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 fArgAbsNorm11 = fArgAbsNorm4, fArgAbsNorm4, f0 // x^8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA4 = fA5, fArgAbsNorm, fA4 // Polynomial tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes3L = fRes3L, f1, fA2L // (A3*x+A2)*x^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA6 = fA7, fArgAbsNorm, fA6 // Polynomial tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fTL2 = fTL2, f1, fTT2 // A1*x+A0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fRes1L = fA0H, f1, fRes1H // A1*x+A0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA23 = fA25, fArgAbsNorm2, fA23 // Polynomial tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA12 = fA14, fArgAbsNorm2, fA12 // Polynomial tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA19 = fA21, fArgAbsNorm2, fA19 // Polynomial tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA8 = fA10, fArgAbsNorm2, fA8 // Polynomial tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA17, fArgAbsNorm2, fA15 // Polynomial tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fArgAbsNorm11 = fArgAbsNorm11, fArgAbsNorm3, f0 // x^11
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fTT = fRes3L, fArgAbsNorm2, f0 // (A3*x+A2)*x^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA4 = fA6, fArgAbsNorm2, fA4 // Polynomial tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1L = fRes1L, f1, fTH2 // A1*x+A0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fArgAbsNorm4X = fArgAbsNorm4, fSignumX, f0 // x^4 * signum
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA19 = fA23, fArgAbsNorm4, fA19 // Polynomial tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA8 = fA12, fArgAbsNorm4, fA8 // Polynomial tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fTT = fRes3H, fArgAbsNorm2L, fTT // (A3*x+A2)*x^2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1L = fRes1L, f1, fTL2 // A1*x+A0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA19, fArgAbsNorm4, fA15 // Polynomial tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA4 = fA8, fArgAbsNorm4, fA4 // Polynomial tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes2H = fRes3H, fArgAbsNorm2, fTT // (A3*x+A2)*x^2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1L = fRes1L, f1, fA0L // A1*x+A0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes4 = fA15, fArgAbsNorm11, fA4 // Result of
+ // polynomial tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 fRes2L = fRes3H, fArgAbsNorm2, fRes2H // (A3*x+A2)*x^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fResH = fRes2H, f1, fRes1H // High result
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p14) fma.s1 fRes1L = fRes4, fArgAbsNorm4X, fRes1L // tail*x^4 + low of A1*x+A0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fms.s1 fRes1L = fRes4, fArgAbsNorm4X, fRes1L // same, negated (arg < 0)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes2L = fRes2L, f1, fTT // (A3*x+A2)*x^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fResL = fRes1H, f1, fResH // Low result
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s0 fRes1L = fRes2L, fSignumX, fRes1L // Low result
+ // .s0 - for symmetry issue resolving at +/-inf rounding mode
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fResL = fResL, f1, fRes2H // Low result
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p14) fma.s0 fResL = fRes1L, f1, fResL // Low result
+ // .s0 - for symmetry issue resolving at +/-inf rounding mode
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fms.s0 fResL = fRes1L, f1, fResL // Low result
+ // .s0 - for symmetry issue resolving at +/-inf rounding mode
+ nop.i 0
+};;
+
+.pred.rel "mutex",p14,p15
+{ .mfi
+ nop.m 0
+(p14) fma.s0 f8 = fResL, f1, fResH// Add high and low results
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p15) fms.s0 f8 = fResL, f1, fResH // Low minus high result (arg < 0)
+ br.ret.sptk b0 // Main path return
+};;
+
+// saturation path ////////////////////////////////////////////////////////////
+_saturation:
+
+.pred.rel "mutex",p14,p15
+{ .mfi
+ nop.m 0
+(p14) fms.s0 f8 = f1, f1, fTiny // Saturation result r = 1-tiny
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+(p15) fnma.s0 f8 = f1, f1, fTiny // Saturation result r = tiny-1
+ br.ret.sptk b0 // Saturation path return
+};;
+
+
+// 0, denormals and special IEEE numbers path /////////////////////////////////
+tanhl_spec:
+
+{ .mfi
+ nop.m 0
+ fclass.m p6,p0 = f8, 0x23 // To filter infinities
+ // 0x23 = @pos|@neg|@inf
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fclass.m p7,p0 = f8, 0xC7 // To filter NaNs & Zeros
+ // 0xC7 = @pos|@neg|@zero|@qnan|@snan
+ nop.i 0
+};;
+
+{ .mfb
+ nop.m 0
+(p6) fmerge.s f8 = f8, f1 // +/-1 for INF args
+(p6) br.ret.spnt b0 // exit for x = INF
+};;
+
+{ .mfb
+ nop.m 0
+(p7) fma.s0 f8 = f8, f1, f8 // +/-0 for 0 args
+ // and NaNs for NaNs
+(p7) br.ret.spnt b0 // exit for x = NaN or +/-0
+};;
+
+{ .mfi
+ nop.m 0
+ fnorm.s0 f8 = f8 // Normalize arg (not INF, NaN or 0 here)
+ nop.i 0
+};;
+
+.pred.rel "mutex",p14,p15
+{ .mfi
+ nop.m 0
+(p14) fnma.s0 f8 = f8, f8, f8 // res = r-r^2 (arg not negative)
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p15) fma.s0 f8 = f8, f8, f8 // res = r+r^2 (arg negative)
+ br.ret.sptk b0 // 0, denormals, IEEE specials return
+};;
+
+
+// 0 < |x| < 1/8 path /////////////////////////////////////////////////////////
+_0_to_1o8:
+
+{ .mmi
+ adds rAddr1 = 0x11e0, rDataPtr // Ptr 1 to coeffs (A15, A11, A7, A3)
+ adds rAddr2 = 0x11f0, rDataPtr // Ptr 2 to coeffs (A13, A9, A5)
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe fA15 = [rAddr1], 32 // Load A15
+ ldfe fA13 = [rAddr2], 32 // Load A13
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe fA11 = [rAddr1], 32 // Load A11
+ ldfe fA9 = [rAddr2], 32 // Load A9
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe fA7 = [rAddr1], 32 // Load A7
+ ldfe fA5 = [rAddr2] // Load A5
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe fA3 = [rAddr1] // Load A3
+ fma.s1 fA11 = fA13, fArgSqr, fA11 // Polynomial tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fArgFour = fArgSqr, fArgSqr, f0 // x^4
+ nop.i 0
+};;
+
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA3 = fA5, fArgSqr, fA3 // Polynomial tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA7 = fA9, fArgSqr, fA7 // Polynomial tail
+ nop.i 0
+};;
+
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA15, fArgFour, fA11 // Polynomial tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA3 = fA7, fArgFour, fA3 // Polynomial tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fArgEight = fArgFour, fArgFour, f0 // x^8
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes = fA11, fArgEight, fA3 //Polynomial tail result
+ nop.i 0
+};;
+
+{ .mfb
+ nop.m 0
+ fma.s0 f8 = fRes, fArgCube, f8 // x + (Polynomial tail)*x^3
+ br.ret.sptk b0 // [0;1/8] interval return
+};;
+
+GLOBAL_LIBM_END(tanhl)
+
+
+
diff --git a/sysdeps/ia64/fpu/s_tanl.S b/sysdeps/ia64/fpu/s_tanl.S
index e13e6c6cbd..345a059c5f 100644
--- a/sysdeps/ia64/fpu/s_tanl.S
+++ b/sysdeps/ia64/fpu/s_tanl.S
@@ -1,10 +1,10 @@
-.file "tanl.s"
+.file "tancotl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,50 +35,77 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-// *********************************************************************
+//*********************************************************************
//
// History:
//
-// 2/02/2000 (hand-optimized)
-// 4/04/00 Unwind support added
+// 02/02/00 (hand-optimized)
+// 04/04/00 Unwind support added
// 12/28/00 Fixed false invalid flags
+// 02/06/02 Improved speed
+// 05/07/02 Changed interface to __libm_pi_by_2_reduce
+// 05/30/02 Added cotl
+// 02/10/03 Reordered header: .section, .global, .proc, .align;
+// used data8 for long double table values
+// 05/15/03 Reformatted data tables
//
-// *********************************************************************
+//*********************************************************************
//
-// Function: tanl(x) = tangent(x), for double-extended precision x values
+// Functions: tanl(x) = tangent(x), for double-extended precision x values
+// cotl(x) = cotangent(x), for double-extended precision x values
//
-// *********************************************************************
+//*********************************************************************
//
// Resources Used:
//
// Floating-Point Registers: f8 (Input and Return Value)
// f9-f15
-// f32-f112
+// f32-f121
//
// General Purpose Registers:
-// r32-r48
-// r49-r50 (Used to pass arguments to pi_by_2 reduce routine)
+// r14-r26,r32-r57
//
// Predicate Registers: p6-p15
//
-// *********************************************************************
+//*********************************************************************
//
-// IEEE Special Conditions:
+// IEEE Special Conditions for tanl:
//
// Denormal fault raised on denormal inputs
// Overflow exceptions do not occur
-// Underflow exceptions raised when appropriate for tan
+// Underflow exceptions raised when appropriate for tan
// (No specialized error handling for this routine)
// Inexact raised when appropriate by algorithm
//
-// tan(SNaN) = QNaN
-// tan(QNaN) = QNaN
-// tan(inf) = QNaN
-// tan(+/-0) = +/-0
+// tanl(SNaN) = QNaN
+// tanl(QNaN) = QNaN
+// tanl(inf) = QNaN
+// tanl(+/-0) = +/-0
+//
+//*********************************************************************
+//
+// IEEE Special Conditions for cotl:
+//
+// Denormal fault raised on denormal inputs
+// Overflow exceptions occur at zero and near zero
+// Underflow exceptions do not occur
+// Inexact raised when appropriate by algorithm
+//
+// cotl(SNaN) = QNaN
+// cotl(QNaN) = QNaN
+// cotl(inf) = QNaN
+// cotl(+/-0) = +/-Inf and error handling is called
+//
+//*********************************************************************
//
-// *********************************************************************
+// Below are mathematical and algorithmic descriptions for tanl.
+// For cotl we use the identity cot(x) = -tan(x + Pi/2).
+// So, to compute cot(x) we just need to increment N (N = N + 1)
+// and invert the sign of the computed result.
+//
+//*********************************************************************
//
// Mathematical Description
//
@@ -106,13 +133,13 @@
// -------
//
// tan(r + c) = r + c + r^3/3 ...accurately
-// -cot(r + c) = -1/(r+c) + r/3 ...accurately
+// -cot(r + c) = -1/(r+c) + r/3 ...accurately
//
// Case 4:
// -------
//
// tan(r + c) = r + c + r^3/3 + 2r^5/15 ...accurately
-// -cot(r + c) = -1/(r+c) + r/3 + r^3/45 ...accurately
+// -cot(r + c) = -1/(r+c) + r/3 + r^3/45 ...accurately
//
//
// The only cases left are Cases 1 and 3 of the argument reduction
@@ -143,13 +170,13 @@
// Since Arg = N pi/4 + r + c accurately, we have
//
// tan(Arg) = tan(r+c) for N even,
-// = -cot(r+c) otherwise.
+// = -cot(r+c) otherwise.
//
// Here for this case, both tan(r) and -cot(r) can be approximated
// by simple polynomials:
//
// tan(r) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
-// -cot(r) = -1/r + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13
+// -cot(r) = -1/r + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13
//
// accurately. Since |r| is relatively small, tan(r+c) and
// -cot(r+c) can be accurately approximated by replacing r with
@@ -178,21 +205,21 @@
// The required calculation is either
//
// tan(r + c) = tan(r) + correction, or
-// -cot(r + c) = -cot(r) + correction.
+// -cot(r + c) = -cot(r) + correction.
//
// Specifically,
//
// tan(r + c) = tan(r) + c tan'(r) + O(c^2)
-// = tan(r) + c sec^2(r) + O(c^2)
-// = tan(r) + c SEC_sq ...accurately
+// = tan(r) + c sec^2(r) + O(c^2)
+// = tan(r) + c SEC_sq ...accurately
// as long as SEC_sq approximates sec^2(r)
// to, say, 5 bits or so.
//
// Similarly,
//
-// -cot(r + c) = -cot(r) - c cot'(r) + O(c^2)
-// = -cot(r) + c csc^2(r) + O(c^2)
-// = -cot(r) + c CSC_sq ...accurately
+// -cot(r + c) = -cot(r) - c cot'(r) + O(c^2)
+// = -cot(r) + c csc^2(r) + O(c^2)
+// = -cot(r) + c CSC_sq ...accurately
// as long as CSC_sq approximates csc^2(r)
// to, say, 5 bits or so.
//
@@ -208,14 +235,14 @@
// where
//
// B = 2^k * 1.b_1 b_2 ... b_5 1
-// x = |r| - B
+// x = |r| - B
//
// Now,
// tan(B) + tan(x)
// tan( B + x ) = ------------------------
// 1 - tan(B)*tan(x)
//
-// / \
+// / \
// | tan(B) + tan(x) |
// = tan(B) + | ------------------------ - tan(B) |
@@ -248,7 +275,7 @@
// cot( B + x ) = ------------------------
// tan(B) + tan(x)
//
-// / \
+// / \
// | 1 - tan(B)*tan(x) |
// = cot(B) + | ----------------------- - cot(B) |
@@ -273,7 +300,7 @@
// Arg = N * pi/2 + r + c ...accurately
//
// tan(Arg) = tan(r) + correction if N is even;
-// = -cot(r) + correction otherwise.
+// = -cot(r) + correction otherwise.
//
// For Cases 2 and 4,
//
@@ -292,8 +319,8 @@
// tan(Arg) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
// + c*(1 + r^2) N even
//
-// = -1/(r+c) + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13
-// + Q1_1*c N odd
+// = -1/(r+c) + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13
+// + Q1_1*c N odd
//
// Case normal_r: 2^(-2) <= |r| <= pi/4
//
@@ -304,15 +331,15 @@
//
// tan(Arg) = tan(r) + c*sec^2(r)
// = tan( sgn_r * (B+x) ) + c * sec^2(|r|)
-// = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(|r|) )
-// = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(B) )
+// = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(|r|) )
+// = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(B) )
//
// since B approximates |r| to 2^(-6) in relative accuracy.
//
// / (1/[sin(B)*cos(B)]) * tan(x)
// tan(Arg) = sgn_r * | tan(B) + --------------------------------
// \ cot(B) - tan(x)
-// \
+// \
// + CORR |
// /
@@ -324,15 +351,15 @@
//
// tan(Arg) = -cot(r) + c*csc^2(r)
// = -cot( sgn_r * (B+x) ) + c * csc^2(|r|)
-// = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(|r|) )
-// = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(B) )
+// = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(|r|) )
+// = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(B) )
//
// since B approximates |r| to 2^(-6) in relative accuracy.
//
// / (1/[sin(B)*cos(B)]) * tan(x)
// tan(Arg) = sgn_r * | -cot(B) + --------------------------------
// \ tan(B) + tan(x)
-// \
+// \
// + CORR |
// /
@@ -356,8 +383,8 @@
// For N even,
//
// rsq := r * r
-// Result := c + r * rsq * P1_1
-// Result := r + Result ...in user-defined rounding
+// Poly := c + r * rsq * P1_1
+// Result := r + Poly ...in user-defined rounding
//
// For N odd,
// S_hi := -frcpa(r) ...8 bits
@@ -375,8 +402,8 @@
// For N even,
//
// rsq := r * r
-// Result := c + r * rsq * (P1_1 + rsq * P1_2)
-// Result := r + Result ...in user-defined rounding
+// Poly := c + r * rsq * (P1_1 + rsq * P1_2)
+// Result := r + Poly ...in user-defined rounding
//
// For N odd,
// S_hi := -frcpa(r) ...8 bits
@@ -414,8 +441,8 @@
// Poly2 := P1_4 + rsq*(P1_5 + rsq*(P1_6 + ... rsq*P1_9))
// CORR := c * ( 1 + rsq )
// Poly := Poly1 + r_to_the_8*Poly2
-// Result := r*Poly + CORR
-// Result := r + Result ...in user-defined rounding
+// Poly := r*Poly + CORR
+// Result := r + Poly ...in user-defined rounding
// ...note that Poly1 and r_to_the_8 can be computed in parallel
// ...with Poly2 (Poly1 is intentionally set to be much
// ...shorter than Poly2 so that r_to_the_8 and CORR can be hidden)
@@ -434,8 +461,8 @@
// rsq := r*r
// P := Q1_1 + rsq*(Q1_2 + rsq*(Q1_3 + ... + rsq*Q1_7))
//
-// Result := r*P + S_lo
-// Result := S_hi + Result ...in user-defined rounding
+// Poly := r*P + S_lo
+// Result := S_hi + Poly ...in user-defined rounding
//
//
// Algorithm for the case of normal_r
@@ -454,7 +481,7 @@
// / (1/[sin(B)*cos(B)]) * tan(x)
// sgn_r * | tan(B) + -------------------------------- +
// \ cot(B) - tan(x)
-// \
+// \
// CORR |
// /
@@ -463,7 +490,7 @@
// calculated beforehand and stored in a table. Specifically,
// the table values are
//
-// tan(B) as T_hi + T_lo;
+// tan(B) as T_hi + T_lo;
// cot(B) as C_hi + C_lo;
// 1/[sin(B)*cos(B)] as SC_inv
//
@@ -559,7 +586,7 @@
// / (1/[sin(B)*cos(B)]) * tan(x)
// sgn_r * | -cot(B) + -------------------------------- +
// \ tan(B) + tan(x)
-// \
+// \
// CORR |
// /
@@ -568,7 +595,7 @@
// calculated beforehand and stored in a table. Specifically,
// the table values are
//
-// tan(B) as T_hi + T_lo;
+// tan(B) as T_hi + T_lo;
// cot(B) as C_hi + C_lo;
// 1/[sin(B)*cos(B)] as SC_inv
//
@@ -675,254 +702,382 @@
//
//
-#include "libm_support.h"
-
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
-.align 128
-
-TANL_BASE_CONSTANTS:
-ASM_TYPE_DIRECTIVE(TANL_BASE_CONSTANTS,@object)
-data4 0x4B800000, 0xCB800000, 0x38800000, 0xB8800000 // two**24, -two**24
- // two**-14, -two**-14
-data4 0x4E44152A, 0xA2F9836E, 0x00003FFE, 0x00000000 // two_by_pi
-data4 0xCE81B9F1, 0xC84D32B0, 0x00004016, 0x00000000 // P_0
-data4 0x2168C235, 0xC90FDAA2, 0x00003FFF, 0x00000000 // P_1
-data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD, 0x00000000 // P_2
-data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C, 0x00000000 // P_3
-data4 0x5F000000, 0xDF000000, 0x00000000, 0x00000000 // two_to_63, -two_to_63
-data4 0x6EC6B45A, 0xA397E504, 0x00003FE7, 0x00000000 // Inv_P_0
-data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF, 0x00000000 // d_1
-data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C, 0x00000000 // d_2
-data4 0x2168C234, 0xC90FDAA2, 0x00003FFE, 0x00000000 // PI_BY_4
-data4 0x2168C234, 0xC90FDAA2, 0x0000BFFE, 0x00000000 // MPI_BY_4
-data4 0x3E800000, 0xBE800000, 0x00000000, 0x00000000 // two**-2, -two**-2
-data4 0x2F000000, 0xAF000000, 0x00000000, 0x00000000 // two**-33, -two**-33
-data4 0xAAAAAABD, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P1_1
-data4 0x88882E6A, 0x88888888, 0x00003FFC, 0x00000000 // P1_2
-data4 0x0F0177B6, 0xDD0DD0DD, 0x00003FFA, 0x00000000 // P1_3
-data4 0x646B8C6D, 0xB327A440, 0x00003FF9, 0x00000000 // P1_4
-data4 0x1D5F7D20, 0x91371B25, 0x00003FF8, 0x00000000 // P1_5
-data4 0x61C67914, 0xEB69A5F1, 0x00003FF6, 0x00000000 // P1_6
-data4 0x019318D2, 0xBEDD37BE, 0x00003FF5, 0x00000000 // P1_7
-data4 0x3C794015, 0x9979B146, 0x00003FF4, 0x00000000 // P1_8
-data4 0x8C6EB58A, 0x8EBD21A3, 0x00003FF3, 0x00000000 // P1_9
-data4 0xAAAAAAB4, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // Q1_1
-data4 0x0B5FC93E, 0xB60B60B6, 0x00003FF9, 0x00000000 // Q1_2
-data4 0x0C9BBFBF, 0x8AB355E0, 0x00003FF6, 0x00000000 // Q1_3
-data4 0xCBEE3D4C, 0xDDEBBC89, 0x00003FF2, 0x00000000 // Q1_4
-data4 0x5F80BBB6, 0xB3548A68, 0x00003FEF, 0x00000000 // Q1_5
-data4 0x4CED5BF1, 0x91362560, 0x00003FEC, 0x00000000 // Q1_6
-data4 0x8EE92A83, 0xF189D95A, 0x00003FE8, 0x00000000 // Q1_7
-data4 0xAAAB362F, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P2_1
-data4 0xE97A6097, 0x88888886, 0x00003FFC, 0x00000000 // P2_2
-data4 0x25E716A1, 0xDD108EE0, 0x00003FFA, 0x00000000 // P2_3
+RODATA
+.align 16
+
+LOCAL_OBJECT_START(TANL_BASE_CONSTANTS)
+
+tanl_table_1:
+data8 0xA2F9836E4E44152A, 0x00003FFE // two_by_pi
+data8 0xC84D32B0CE81B9F1, 0x00004016 // P_0
+data8 0xC90FDAA22168C235, 0x00003FFF // P_1
+data8 0xECE675D1FC8F8CBB, 0x0000BFBD // P_2
+data8 0xB7ED8FBBACC19C60, 0x0000BF7C // P_3
+LOCAL_OBJECT_END(TANL_BASE_CONSTANTS)
+
+LOCAL_OBJECT_START(tanl_table_2)
+data8 0xC90FDAA22168C234, 0x00003FFE // PI_BY_4
+data8 0xA397E5046EC6B45A, 0x00003FE7 // Inv_P_0
+data8 0x8D848E89DBD171A1, 0x0000BFBF // d_1
+data8 0xD5394C3618A66F8E, 0x0000BF7C // d_2
+data4 0x3E800000 // two**-2
+data4 0xBE800000 // -two**-2
+data4 0x00000000 // pad
+data4 0x00000000 // pad
+LOCAL_OBJECT_END(tanl_table_2)
+
+LOCAL_OBJECT_START(tanl_table_p1)
+data8 0xAAAAAAAAAAAAAABD, 0x00003FFD // P1_1
+data8 0x8888888888882E6A, 0x00003FFC // P1_2
+data8 0xDD0DD0DD0F0177B6, 0x00003FFA // P1_3
+data8 0xB327A440646B8C6D, 0x00003FF9 // P1_4
+data8 0x91371B251D5F7D20, 0x00003FF8 // P1_5
+data8 0xEB69A5F161C67914, 0x00003FF6 // P1_6
+data8 0xBEDD37BE019318D2, 0x00003FF5 // P1_7
+data8 0x9979B1463C794015, 0x00003FF4 // P1_8
+data8 0x8EBD21A38C6EB58A, 0x00003FF3 // P1_9
+LOCAL_OBJECT_END(tanl_table_p1)
+
+LOCAL_OBJECT_START(tanl_table_q1)
+data8 0xAAAAAAAAAAAAAAB4, 0x00003FFD // Q1_1
+data8 0xB60B60B60B5FC93E, 0x00003FF9 // Q1_2
+data8 0x8AB355E00C9BBFBF, 0x00003FF6 // Q1_3
+data8 0xDDEBBC89CBEE3D4C, 0x00003FF2 // Q1_4
+data8 0xB3548A685F80BBB6, 0x00003FEF // Q1_5
+data8 0x913625604CED5BF1, 0x00003FEC // Q1_6
+data8 0xF189D95A8EE92A83, 0x00003FE8 // Q1_7
+LOCAL_OBJECT_END(tanl_table_q1)
+
+LOCAL_OBJECT_START(tanl_table_p2)
+data8 0xAAAAAAAAAAAB362F, 0x00003FFD // P2_1
+data8 0x88888886E97A6097, 0x00003FFC // P2_2
+data8 0xDD108EE025E716A1, 0x00003FFA // P2_3
+LOCAL_OBJECT_END(tanl_table_p2)
+
+LOCAL_OBJECT_START(tanl_table_tm2)
//
// Entries T_hi double-precision memory format
// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
// Entries T_lo single-precision memory format
// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
//
-data4 0x62400794, 0x3FD09BC3, 0x23A05C32, 0x00000000
-data4 0xDFFBC074, 0x3FD124A9, 0x240078B2, 0x00000000
-data4 0x5BD4920F, 0x3FD1AE23, 0x23826B8E, 0x00000000
-data4 0x15E2701D, 0x3FD23835, 0x22D31154, 0x00000000
-data4 0x63739C2D, 0x3FD2C2E4, 0x2265C9E2, 0x00000000
-data4 0xAFEEA48B, 0x3FD34E36, 0x245C05EB, 0x00000000
-data4 0x7DBB35D1, 0x3FD3DA31, 0x24749F2D, 0x00000000
-data4 0x67321619, 0x3FD466DA, 0x2462CECE, 0x00000000
-data4 0x1F94A4D5, 0x3FD4F437, 0x246D0DF1, 0x00000000
-data4 0x740C3E6D, 0x3FD5824D, 0x240A85B5, 0x00000000
-data4 0x4CB1E73D, 0x3FD61123, 0x23F96E33, 0x00000000
-data4 0xAD9EA64B, 0x3FD6A0BE, 0x247C5393, 0x00000000
-data4 0xB804FD01, 0x3FD73125, 0x241F3B29, 0x00000000
-data4 0xAB53EE83, 0x3FD7C25E, 0x2479989B, 0x00000000
-data4 0xE6640EED, 0x3FD8546F, 0x23B343BC, 0x00000000
-data4 0xE8AF1892, 0x3FD8E75F, 0x241454D1, 0x00000000
-data4 0x53928BDA, 0x3FD97B35, 0x238613D9, 0x00000000
-data4 0xEB9DE4DE, 0x3FDA0FF6, 0x22859FA7, 0x00000000
-data4 0x99ECF92D, 0x3FDAA5AB, 0x237A6D06, 0x00000000
-data4 0x6D8F1796, 0x3FDB3C5A, 0x23952F6C, 0x00000000
-data4 0x9CFB8BE4, 0x3FDBD40A, 0x2280FC95, 0x00000000
-data4 0x87943100, 0x3FDC6CC3, 0x245D2EC0, 0x00000000
-data4 0xB736C500, 0x3FDD068C, 0x23C4AD7D, 0x00000000
-data4 0xE1DDBC31, 0x3FDDA16D, 0x23D076E6, 0x00000000
-data4 0xEB515A93, 0x3FDE3D6E, 0x244809A6, 0x00000000
-data4 0xE6E9E5F1, 0x3FDEDA97, 0x220856C8, 0x00000000
-data4 0x1963CE69, 0x3FDF78F1, 0x244BE993, 0x00000000
-data4 0x7D635BCE, 0x3FE00C41, 0x23D21799, 0x00000000
-data4 0x1C302CD3, 0x3FE05CAB, 0x248A1B1D, 0x00000000
-data4 0xDB6A1FA0, 0x3FE0ADB9, 0x23D53E33, 0x00000000
-data4 0x4A20BA81, 0x3FE0FF72, 0x24DB9ED5, 0x00000000
-data4 0x153FA6F5, 0x3FE151D9, 0x24E9E451, 0x00000000
+data8 0x3FD09BC362400794
+data4 0x23A05C32, 0x00000000
+data8 0x3FD124A9DFFBC074
+data4 0x240078B2, 0x00000000
+data8 0x3FD1AE235BD4920F
+data4 0x23826B8E, 0x00000000
+data8 0x3FD2383515E2701D
+data4 0x22D31154, 0x00000000
+data8 0x3FD2C2E463739C2D
+data4 0x2265C9E2, 0x00000000
+data8 0x3FD34E36AFEEA48B
+data4 0x245C05EB, 0x00000000
+data8 0x3FD3DA317DBB35D1
+data4 0x24749F2D, 0x00000000
+data8 0x3FD466DA67321619
+data4 0x2462CECE, 0x00000000
+data8 0x3FD4F4371F94A4D5
+data4 0x246D0DF1, 0x00000000
+data8 0x3FD5824D740C3E6D
+data4 0x240A85B5, 0x00000000
+data8 0x3FD611234CB1E73D
+data4 0x23F96E33, 0x00000000
+data8 0x3FD6A0BEAD9EA64B
+data4 0x247C5393, 0x00000000
+data8 0x3FD73125B804FD01
+data4 0x241F3B29, 0x00000000
+data8 0x3FD7C25EAB53EE83
+data4 0x2479989B, 0x00000000
+data8 0x3FD8546FE6640EED
+data4 0x23B343BC, 0x00000000
+data8 0x3FD8E75FE8AF1892
+data4 0x241454D1, 0x00000000
+data8 0x3FD97B3553928BDA
+data4 0x238613D9, 0x00000000
+data8 0x3FDA0FF6EB9DE4DE
+data4 0x22859FA7, 0x00000000
+data8 0x3FDAA5AB99ECF92D
+data4 0x237A6D06, 0x00000000
+data8 0x3FDB3C5A6D8F1796
+data4 0x23952F6C, 0x00000000
+data8 0x3FDBD40A9CFB8BE4
+data4 0x2280FC95, 0x00000000
+data8 0x3FDC6CC387943100
+data4 0x245D2EC0, 0x00000000
+data8 0x3FDD068CB736C500
+data4 0x23C4AD7D, 0x00000000
+data8 0x3FDDA16DE1DDBC31
+data4 0x23D076E6, 0x00000000
+data8 0x3FDE3D6EEB515A93
+data4 0x244809A6, 0x00000000
+data8 0x3FDEDA97E6E9E5F1
+data4 0x220856C8, 0x00000000
+data8 0x3FDF78F11963CE69
+data4 0x244BE993, 0x00000000
+data8 0x3FE00C417D635BCE
+data4 0x23D21799, 0x00000000
+data8 0x3FE05CAB1C302CD3
+data4 0x248A1B1D, 0x00000000
+data8 0x3FE0ADB9DB6A1FA0
+data4 0x23D53E33, 0x00000000
+data8 0x3FE0FF724A20BA81
+data4 0x24DB9ED5, 0x00000000
+data8 0x3FE151D9153FA6F5
+data4 0x24E9E451, 0x00000000
+LOCAL_OBJECT_END(tanl_table_tm2)
+
+LOCAL_OBJECT_START(tanl_table_tm1)
//
// Entries T_hi double-precision memory format
// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
// Entries T_lo single-precision memory format
// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
//
-data4 0xBA1BE39E, 0x3FE1CEC4, 0x24B60F9E, 0x00000000
-data4 0x5ABD9B2D, 0x3FE277E4, 0x248C2474, 0x00000000
-data4 0x0272B110, 0x3FE32418, 0x247B8311, 0x00000000
-data4 0x890E2DF0, 0x3FE3D38B, 0x24C55751, 0x00000000
-data4 0x46236871, 0x3FE4866D, 0x24E5BC34, 0x00000000
-data4 0x45E044B0, 0x3FE53CEE, 0x24001BA4, 0x00000000
-data4 0x82EC06E4, 0x3FE5F742, 0x24B973DC, 0x00000000
-data4 0x25DF43F9, 0x3FE6B5A1, 0x24895440, 0x00000000
-data4 0xCAFD348C, 0x3FE77844, 0x240021CA, 0x00000000
-data4 0xCEED6B92, 0x3FE83F6B, 0x24C45372, 0x00000000
-data4 0xA34F3665, 0x3FE90B58, 0x240DAD33, 0x00000000
-data4 0x2C1E56B4, 0x3FE9DC52, 0x24F846CE, 0x00000000
-data4 0x27041578, 0x3FEAB2A4, 0x2323FB6E, 0x00000000
-data4 0x9DD8C373, 0x3FEB8E9F, 0x24B3090B, 0x00000000
-data4 0x65C9AA7B, 0x3FEC709B, 0x2449F611, 0x00000000
-data4 0xACCF8435, 0x3FED58F4, 0x23616A7E, 0x00000000
-data4 0x97635082, 0x3FEE480F, 0x24C2FEAE, 0x00000000
-data4 0xF0ACC544, 0x3FEF3E57, 0x242CE964, 0x00000000
-data4 0xF7E06E4B, 0x3FF01E20, 0x2480D3EE, 0x00000000
-data4 0x8A798A69, 0x3FF0A125, 0x24DB8967, 0x00000000
+data8 0x3FE1CEC4BA1BE39E
+data4 0x24B60F9E, 0x00000000
+data8 0x3FE277E45ABD9B2D
+data4 0x248C2474, 0x00000000
+data8 0x3FE324180272B110
+data4 0x247B8311, 0x00000000
+data8 0x3FE3D38B890E2DF0
+data4 0x24C55751, 0x00000000
+data8 0x3FE4866D46236871
+data4 0x24E5BC34, 0x00000000
+data8 0x3FE53CEE45E044B0
+data4 0x24001BA4, 0x00000000
+data8 0x3FE5F74282EC06E4
+data4 0x24B973DC, 0x00000000
+data8 0x3FE6B5A125DF43F9
+data4 0x24895440, 0x00000000
+data8 0x3FE77844CAFD348C
+data4 0x240021CA, 0x00000000
+data8 0x3FE83F6BCEED6B92
+data4 0x24C45372, 0x00000000
+data8 0x3FE90B58A34F3665
+data4 0x240DAD33, 0x00000000
+data8 0x3FE9DC522C1E56B4
+data4 0x24F846CE, 0x00000000
+data8 0x3FEAB2A427041578
+data4 0x2323FB6E, 0x00000000
+data8 0x3FEB8E9F9DD8C373
+data4 0x24B3090B, 0x00000000
+data8 0x3FEC709B65C9AA7B
+data4 0x2449F611, 0x00000000
+data8 0x3FED58F4ACCF8435
+data4 0x23616A7E, 0x00000000
+data8 0x3FEE480F97635082
+data4 0x24C2FEAE, 0x00000000
+data8 0x3FEF3E57F0ACC544
+data4 0x242CE964, 0x00000000
+data8 0x3FF01E20F7E06E4B
+data4 0x2480D3EE, 0x00000000
+data8 0x3FF0A1258A798A69
+data4 0x24DB8967, 0x00000000
+LOCAL_OBJECT_END(tanl_table_tm1)
+
+LOCAL_OBJECT_START(tanl_table_cm2)
//
// Entries C_hi double-precision memory format
// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
// Entries C_lo single-precision memory format
// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
//
-data4 0xE63EFBD0, 0x400ED3E2, 0x259D94D4, 0x00000000
-data4 0xC515DAB5, 0x400DDDB4, 0x245F0537, 0x00000000
-data4 0xBE19A79F, 0x400CF57A, 0x25D4EA9F, 0x00000000
-data4 0xD15298ED, 0x400C1A06, 0x24AE40A0, 0x00000000
-data4 0x164B2708, 0x400B4A4C, 0x25A5AAB6, 0x00000000
-data4 0x5285B068, 0x400A855A, 0x25524F18, 0x00000000
-data4 0x3FFA549F, 0x4009CA5A, 0x24C999C0, 0x00000000
-data4 0x646AF623, 0x4009188A, 0x254FD801, 0x00000000
-data4 0x6084D0E7, 0x40086F3C, 0x2560F5FD, 0x00000000
-data4 0xA29A76EE, 0x4007CDD2, 0x255B9D19, 0x00000000
-data4 0x6C8ECA95, 0x400733BE, 0x25CB021B, 0x00000000
-data4 0x1F8DDC52, 0x4006A07E, 0x24AB4722, 0x00000000
-data4 0xC298AD58, 0x4006139B, 0x252764E2, 0x00000000
-data4 0xBAD7164B, 0x40058CAB, 0x24DAF5DB, 0x00000000
-data4 0xAE31A5D3, 0x40050B4B, 0x25EA20F4, 0x00000000
-data4 0x89F85A8A, 0x40048F21, 0x2583A3E8, 0x00000000
-data4 0xA862380D, 0x400417DA, 0x25DCC4CC, 0x00000000
-data4 0x1088FCFE, 0x4003A52B, 0x2430A492, 0x00000000
-data4 0xCD3527D5, 0x400336CC, 0x255F77CF, 0x00000000
-data4 0x5760766D, 0x4002CC7F, 0x25DA0BDA, 0x00000000
-data4 0x11CE02E3, 0x40026607, 0x256FF4A2, 0x00000000
-data4 0xD37BBE04, 0x4002032C, 0x25208AED, 0x00000000
-data4 0x7F050775, 0x4001A3BD, 0x24B72DD6, 0x00000000
-data4 0xA554848A, 0x40014789, 0x24AB4DAA, 0x00000000
-data4 0x323E81B7, 0x4000EE65, 0x2584C440, 0x00000000
-data4 0x21CF1293, 0x40009827, 0x25C9428D, 0x00000000
-data4 0x3D415EEB, 0x400044A9, 0x25DC8482, 0x00000000
-data4 0xBD72C577, 0x3FFFE78F, 0x257F5070, 0x00000000
-data4 0x75EFD28E, 0x3FFF4AC3, 0x23EBBF7A, 0x00000000
-data4 0x60B52DDE, 0x3FFEB2AF, 0x22EECA07, 0x00000000
-data4 0x35204180, 0x3FFE1F19, 0x24191079, 0x00000000
-data4 0x54F7E60A, 0x3FFD8FCA, 0x248D3058, 0x00000000
+data8 0x400ED3E2E63EFBD0
+data4 0x259D94D4, 0x00000000
+data8 0x400DDDB4C515DAB5
+data4 0x245F0537, 0x00000000
+data8 0x400CF57ABE19A79F
+data4 0x25D4EA9F, 0x00000000
+data8 0x400C1A06D15298ED
+data4 0x24AE40A0, 0x00000000
+data8 0x400B4A4C164B2708
+data4 0x25A5AAB6, 0x00000000
+data8 0x400A855A5285B068
+data4 0x25524F18, 0x00000000
+data8 0x4009CA5A3FFA549F
+data4 0x24C999C0, 0x00000000
+data8 0x4009188A646AF623
+data4 0x254FD801, 0x00000000
+data8 0x40086F3C6084D0E7
+data4 0x2560F5FD, 0x00000000
+data8 0x4007CDD2A29A76EE
+data4 0x255B9D19, 0x00000000
+data8 0x400733BE6C8ECA95
+data4 0x25CB021B, 0x00000000
+data8 0x4006A07E1F8DDC52
+data4 0x24AB4722, 0x00000000
+data8 0x4006139BC298AD58
+data4 0x252764E2, 0x00000000
+data8 0x40058CABBAD7164B
+data4 0x24DAF5DB, 0x00000000
+data8 0x40050B4BAE31A5D3
+data4 0x25EA20F4, 0x00000000
+data8 0x40048F2189F85A8A
+data4 0x2583A3E8, 0x00000000
+data8 0x400417DAA862380D
+data4 0x25DCC4CC, 0x00000000
+data8 0x4003A52B1088FCFE
+data4 0x2430A492, 0x00000000
+data8 0x400336CCCD3527D5
+data4 0x255F77CF, 0x00000000
+data8 0x4002CC7F5760766D
+data4 0x25DA0BDA, 0x00000000
+data8 0x4002660711CE02E3
+data4 0x256FF4A2, 0x00000000
+data8 0x4002032CD37BBE04
+data4 0x25208AED, 0x00000000
+data8 0x4001A3BD7F050775
+data4 0x24B72DD6, 0x00000000
+data8 0x40014789A554848A
+data4 0x24AB4DAA, 0x00000000
+data8 0x4000EE65323E81B7
+data4 0x2584C440, 0x00000000
+data8 0x4000982721CF1293
+data4 0x25C9428D, 0x00000000
+data8 0x400044A93D415EEB
+data4 0x25DC8482, 0x00000000
+data8 0x3FFFE78FBD72C577
+data4 0x257F5070, 0x00000000
+data8 0x3FFF4AC375EFD28E
+data4 0x23EBBF7A, 0x00000000
+data8 0x3FFEB2AF60B52DDE
+data4 0x22EECA07, 0x00000000
+data8 0x3FFE1F1935204180
+data4 0x24191079, 0x00000000
+data8 0x3FFD8FCA54F7E60A
+data4 0x248D3058, 0x00000000
+LOCAL_OBJECT_END(tanl_table_cm2)
+
+LOCAL_OBJECT_START(tanl_table_cm1)
//
// Entries C_hi double-precision memory format
// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
// Entries C_lo single-precision memory format
// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
//
-data4 0x79F6FADE, 0x3FFCC06A, 0x239C7886, 0x00000000
-data4 0x891662A6, 0x3FFBB91F, 0x250BD191, 0x00000000
-data4 0x529F155D, 0x3FFABFB6, 0x256CC3E6, 0x00000000
-data4 0x2E964AE9, 0x3FF9D300, 0x250843E3, 0x00000000
-data4 0x89DCB383, 0x3FF8F1EF, 0x2277C87E, 0x00000000
-data4 0x7C87DBD6, 0x3FF81B93, 0x256DA6CF, 0x00000000
-data4 0x1042EDE4, 0x3FF74F14, 0x2573D28A, 0x00000000
-data4 0x1784B360, 0x3FF68BAF, 0x242E489A, 0x00000000
-data4 0x7C923C4C, 0x3FF5D0B5, 0x2532D940, 0x00000000
-data4 0xF418EF20, 0x3FF51D88, 0x253C7DD6, 0x00000000
-data4 0x02F88DAE, 0x3FF4719A, 0x23DB59BF, 0x00000000
-data4 0x49DA0788, 0x3FF3CC66, 0x252B4756, 0x00000000
-data4 0x0B980DB8, 0x3FF32D77, 0x23FE585F, 0x00000000
-data4 0xE56C987A, 0x3FF2945F, 0x25378A63, 0x00000000
-data4 0xB16523F6, 0x3FF200BD, 0x247BB2E0, 0x00000000
-data4 0x8CE27778, 0x3FF17235, 0x24446538, 0x00000000
-data4 0xFDEFE692, 0x3FF0E873, 0x2514638F, 0x00000000
-data4 0x33154062, 0x3FF0632C, 0x24A7FC27, 0x00000000
-data4 0xB3EF115F, 0x3FEFC42E, 0x248FD0FE, 0x00000000
-data4 0x135D26F6, 0x3FEEC9E8, 0x2385C719, 0x00000000
+data8 0x3FFCC06A79F6FADE
+data4 0x239C7886, 0x00000000
+data8 0x3FFBB91F891662A6
+data4 0x250BD191, 0x00000000
+data8 0x3FFABFB6529F155D
+data4 0x256CC3E6, 0x00000000
+data8 0x3FF9D3002E964AE9
+data4 0x250843E3, 0x00000000
+data8 0x3FF8F1EF89DCB383
+data4 0x2277C87E, 0x00000000
+data8 0x3FF81B937C87DBD6
+data4 0x256DA6CF, 0x00000000
+data8 0x3FF74F141042EDE4
+data4 0x2573D28A, 0x00000000
+data8 0x3FF68BAF1784B360
+data4 0x242E489A, 0x00000000
+data8 0x3FF5D0B57C923C4C
+data4 0x2532D940, 0x00000000
+data8 0x3FF51D88F418EF20
+data4 0x253C7DD6, 0x00000000
+data8 0x3FF4719A02F88DAE
+data4 0x23DB59BF, 0x00000000
+data8 0x3FF3CC6649DA0788
+data4 0x252B4756, 0x00000000
+data8 0x3FF32D770B980DB8
+data4 0x23FE585F, 0x00000000
+data8 0x3FF2945FE56C987A
+data4 0x25378A63, 0x00000000
+data8 0x3FF200BDB16523F6
+data4 0x247BB2E0, 0x00000000
+data8 0x3FF172358CE27778
+data4 0x24446538, 0x00000000
+data8 0x3FF0E873FDEFE692
+data4 0x2514638F, 0x00000000
+data8 0x3FF0632C33154062
+data4 0x24A7FC27, 0x00000000
+data8 0x3FEFC42EB3EF115F
+data4 0x248FD0FE, 0x00000000
+data8 0x3FEEC9E8135D26F6
+data4 0x2385C719, 0x00000000
+LOCAL_OBJECT_END(tanl_table_cm1)
+
+LOCAL_OBJECT_START(tanl_table_scim2)
//
// Entries SC_inv in Swapped IEEE format (extended)
// Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
//
-data4 0x1BF30C9E, 0x839D6D4A, 0x00004001, 0x00000000
-data4 0x554B0EB0, 0x80092804, 0x00004001, 0x00000000
-data4 0xA1CF0DE9, 0xF959F94C, 0x00004000, 0x00000000
-data4 0x77378677, 0xF3086BA0, 0x00004000, 0x00000000
-data4 0xCCD4723C, 0xED154515, 0x00004000, 0x00000000
-data4 0x1C27CF25, 0xE7790944, 0x00004000, 0x00000000
-data4 0x8DDACB88, 0xE22D037D, 0x00004000, 0x00000000
-data4 0x89C73522, 0xDD2B2D8A, 0x00004000, 0x00000000
-data4 0xBB2C1171, 0xD86E1A23, 0x00004000, 0x00000000
-data4 0xDFF5E0F9, 0xD3F0E288, 0x00004000, 0x00000000
-data4 0x283BEBD5, 0xCFAF16B1, 0x00004000, 0x00000000
-data4 0x0D88DD53, 0xCBA4AFAA, 0x00004000, 0x00000000
-data4 0xCA67C43D, 0xC7CE03CC, 0x00004000, 0x00000000
-data4 0x0CA0DDB0, 0xC427BC82, 0x00004000, 0x00000000
-data4 0xF13D8CAB, 0xC0AECD57, 0x00004000, 0x00000000
-data4 0x71ECE6B1, 0xBD606C38, 0x00004000, 0x00000000
-data4 0xA44C4929, 0xBA3A0A96, 0x00004000, 0x00000000
-data4 0xE5CCCEC1, 0xB7394F6F, 0x00004000, 0x00000000
-data4 0x9637D8BC, 0xB45C1203, 0x00004000, 0x00000000
-data4 0x92CB051B, 0xB1A05528, 0x00004000, 0x00000000
-data4 0x6BA2FFD0, 0xAF04432B, 0x00004000, 0x00000000
-data4 0x7221235F, 0xAC862A23, 0x00004000, 0x00000000
-data4 0x5F00A9D1, 0xAA2478AF, 0x00004000, 0x00000000
-data4 0x81E082BF, 0xA7DDBB0C, 0x00004000, 0x00000000
-data4 0x45684FEE, 0xA5B0987D, 0x00004000, 0x00000000
-data4 0x627A8F53, 0xA39BD0F5, 0x00004000, 0x00000000
-data4 0x6EC5C8B0, 0xA19E3B03, 0x00004000, 0x00000000
-data4 0x91CD7C66, 0x9FB6C1F0, 0x00004000, 0x00000000
-data4 0x1FA3DF8A, 0x9DE46410, 0x00004000, 0x00000000
-data4 0xA8F6B888, 0x9C263139, 0x00004000, 0x00000000
-data4 0xC27B0450, 0x9A7B4968, 0x00004000, 0x00000000
-data4 0x5EE614EE, 0x98E2DB7E, 0x00004000, 0x00000000
+data8 0x839D6D4A1BF30C9E, 0x00004001
+data8 0x80092804554B0EB0, 0x00004001
+data8 0xF959F94CA1CF0DE9, 0x00004000
+data8 0xF3086BA077378677, 0x00004000
+data8 0xED154515CCD4723C, 0x00004000
+data8 0xE77909441C27CF25, 0x00004000
+data8 0xE22D037D8DDACB88, 0x00004000
+data8 0xDD2B2D8A89C73522, 0x00004000
+data8 0xD86E1A23BB2C1171, 0x00004000
+data8 0xD3F0E288DFF5E0F9, 0x00004000
+data8 0xCFAF16B1283BEBD5, 0x00004000
+data8 0xCBA4AFAA0D88DD53, 0x00004000
+data8 0xC7CE03CCCA67C43D, 0x00004000
+data8 0xC427BC820CA0DDB0, 0x00004000
+data8 0xC0AECD57F13D8CAB, 0x00004000
+data8 0xBD606C3871ECE6B1, 0x00004000
+data8 0xBA3A0A96A44C4929, 0x00004000
+data8 0xB7394F6FE5CCCEC1, 0x00004000
+data8 0xB45C12039637D8BC, 0x00004000
+data8 0xB1A0552892CB051B, 0x00004000
+data8 0xAF04432B6BA2FFD0, 0x00004000
+data8 0xAC862A237221235F, 0x00004000
+data8 0xAA2478AF5F00A9D1, 0x00004000
+data8 0xA7DDBB0C81E082BF, 0x00004000
+data8 0xA5B0987D45684FEE, 0x00004000
+data8 0xA39BD0F5627A8F53, 0x00004000
+data8 0xA19E3B036EC5C8B0, 0x00004000
+data8 0x9FB6C1F091CD7C66, 0x00004000
+data8 0x9DE464101FA3DF8A, 0x00004000
+data8 0x9C263139A8F6B888, 0x00004000
+data8 0x9A7B4968C27B0450, 0x00004000
+data8 0x98E2DB7E5EE614EE, 0x00004000
+LOCAL_OBJECT_END(tanl_table_scim2)
+
+LOCAL_OBJECT_START(tanl_table_scim1)
//
// Entries SC_inv in Swapped IEEE format (extended)
// Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
//
-data4 0x13B2B5BA, 0x969F335C, 0x00004000, 0x00000000
-data4 0xD4C0F548, 0x93D446D9, 0x00004000, 0x00000000
-data4 0x61B798AF, 0x9147094F, 0x00004000, 0x00000000
-data4 0x758787AC, 0x8EF317CC, 0x00004000, 0x00000000
-data4 0xB99EEFDB, 0x8CD498B3, 0x00004000, 0x00000000
-data4 0xDFF8BC37, 0x8AE82A7D, 0x00004000, 0x00000000
-data4 0xE3C55D42, 0x892AD546, 0x00004000, 0x00000000
-data4 0xD15573C1, 0x8799FEA9, 0x00004000, 0x00000000
-data4 0x435A4B4C, 0x86335F88, 0x00004000, 0x00000000
-data4 0x3E93A87B, 0x84F4FB6E, 0x00004000, 0x00000000
-data4 0x80A382FB, 0x83DD1952, 0x00004000, 0x00000000
-data4 0xA4CB8C9E, 0x82EA3D7F, 0x00004000, 0x00000000
-data4 0x6861D0A8, 0x821B247C, 0x00004000, 0x00000000
-data4 0x63E8D244, 0x816EBED1, 0x00004000, 0x00000000
-data4 0x27E4CFC6, 0x80E42D91, 0x00004000, 0x00000000
-data4 0x28E64AFD, 0x807ABF8D, 0x00004000, 0x00000000
-data4 0x863B4FD8, 0x8031EF26, 0x00004000, 0x00000000
-data4 0xAE8C11FD, 0x800960AD, 0x00004000, 0x00000000
-data4 0x5FDBEC21, 0x8000E147, 0x00004000, 0x00000000
-data4 0xA07791FA, 0x80186650, 0x00004000, 0x00000000
-ASM_SIZE_DIRECTIVE(TANL_BASE_CONSTANTS)
-
-Arg = f8
+data8 0x969F335C13B2B5BA, 0x00004000
+data8 0x93D446D9D4C0F548, 0x00004000
+data8 0x9147094F61B798AF, 0x00004000
+data8 0x8EF317CC758787AC, 0x00004000
+data8 0x8CD498B3B99EEFDB, 0x00004000
+data8 0x8AE82A7DDFF8BC37, 0x00004000
+data8 0x892AD546E3C55D42, 0x00004000
+data8 0x8799FEA9D15573C1, 0x00004000
+data8 0x86335F88435A4B4C, 0x00004000
+data8 0x84F4FB6E3E93A87B, 0x00004000
+data8 0x83DD195280A382FB, 0x00004000
+data8 0x82EA3D7FA4CB8C9E, 0x00004000
+data8 0x821B247C6861D0A8, 0x00004000
+data8 0x816EBED163E8D244, 0x00004000
+data8 0x80E42D9127E4CFC6, 0x00004000
+data8 0x807ABF8D28E64AFD, 0x00004000
+data8 0x8031EF26863B4FD8, 0x00004000
+data8 0x800960ADAE8C11FD, 0x00004000
+data8 0x8000E1475FDBEC21, 0x00004000
+data8 0x80186650A07791FA, 0x00004000
+LOCAL_OBJECT_END(tanl_table_scim1)
+
+Arg = f8
+Save_Norm_Arg = f8 // For input to reduction routine
Result = f8
-fp_tmp = f9
+r = f8 // For output from reduction routine
+c = f9 // For output from reduction routine
U_2 = f10
-rsq = f11
+rsq = f11
C_hi = f12
C_lo = f13
T_hi = f14
T_lo = f15
-N_0 = f32
d_1 = f33
-MPI_BY_4 = f34
+N_0 = f34
tail = f35
tanx = f36
Cx = f37
@@ -949,8 +1104,6 @@ P1_7 = f51
P1_8 = f52
P1_9 = f53
-TWO_TO_63 = f54
-NEGTWO_TO_63 = f55
x = f56
xsq = f57
Tx = f58
@@ -966,12 +1119,10 @@ B = f67
SC_inv = f68
Pos_r = f69
N_0_fix = f70
-PI_BY_4 = f71
-NEGTWO_TO_NEG2 = f72
-TWO_TO_24 = f73
+d_2 = f71
+PI_BY_4 = f72
TWO_TO_NEG14 = f74
TWO_TO_NEG33 = f75
-NEGTWO_TO_24 = f76
NEGTWO_TO_NEG14 = f76
NEGTWO_TO_NEG33 = f77
two_by_PI = f78
@@ -982,13 +1133,14 @@ P_2 = f82
P_3 = f83
s_val = f84
w = f85
-c = f86
-r = f87
+B_mask1 = f86
+B_mask2 = f87
+w2 = f88
A = f89
a = f90
t = f91
U_1 = f92
-d_2 = f93
+NEGTWO_TO_NEG2 = f93
TWO_TO_NEG2 = f94
Q1_1 = f95
Q1_2 = f96
@@ -1009,609 +1161,641 @@ V_hiabs = f110
V = f111
Inv_P_0 = f112
+FR_inv_pi_2to63 = f113
+FR_rshf_2to64 = f114
+FR_2tom64 = f115
+FR_rshf = f116
+Norm_Arg = f117
+Abs_Arg = f118
+TWO_TO_NEG65 = f119
+fp_tmp = f120
+mOne = f121
+
+GR_sig_inv_pi = r14
+GR_rshf_2to64 = r15
+GR_exp_2tom64 = r16
+GR_rshf = r17
+GR_exp_2_to_63 = r18
+GR_exp_2_to_24 = r19
+GR_signexp_x = r20
+GR_exp_x = r21
+GR_exp_mask = r22
+GR_exp_2tom14 = r23
+GR_exp_m2tom14 = r24
+GR_exp_2tom33 = r25
+GR_exp_m2tom33 = r26
+
GR_SAVE_B0 = r33
GR_SAVE_GP = r34
GR_SAVE_PFS = r35
-delta1 = r36
+table_base = r36
table_ptr1 = r37
table_ptr2 = r38
-i_0 = r39
-i_1 = r40
-N_fix_gr = r41
-N_inc = r42
-exp_Arg = r43
-exp_r = r44
-sig_r = r45
-lookup = r46
-table_offset = r47
-Create_B = r48
+table_ptr3 = r39
+lookup = r40
+N_fix_gr = r41
+GR_exp_2tom2 = r42
+GR_exp_2tom65 = r43
+exp_r = r44
+sig_r = r45
+bmask1 = r46
+table_offset = r47
+bmask2 = r48
gr_tmp = r49
+cot_flag = r50
+
+GR_SAVE_B0 = r51
+GR_SAVE_PFS = r52
+GR_SAVE_GP = r53
+GR_Parameter_X = r54
+GR_Parameter_Y = r55
+GR_Parameter_RESULT = r56
+GR_Parameter_Tag = r57
+
.section .text
-.global tanl
-.proc tanl
-tanl:
-#ifdef _LIBC
-.global __tanl
-.proc __tanl
-__tanl:
-#endif
-{ .mfi
-alloc r32 = ar.pfs, 0,17,2,0
-(p0) fclass.m.unc p6,p0 = Arg, 0x1E7
- addl gr_tmp = -1,r0
-}
-{ .mfi
- nop.m 0
-(p0) fclass.nm.unc p7,p0 = Arg, 0x1FF
- nop.i 0
+.global __libm_tanl#
+.global __libm_cotl#
+
+.proc __libm_cotl#
+__libm_cotl:
+.endp __libm_cotl#
+LOCAL_LIBM_ENTRY(cotl)
+
+{ .mlx
+ alloc r32 = ar.pfs, 0,22,4,0
+ movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi
+}
+{ .mlx
+ mov GR_exp_mask = 0x1ffff // Exponent mask
+ movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64)
+}
+;;
+
+// Check for NatVals, Infs, NaNs, and Zeros
+{ .mfi
+ getf.exp GR_signexp_x = Arg // Get sign and exponent of x
+ fclass.m p6,p0 = Arg, 0x1E7 // Test for natval, nan, inf, zero
+ mov cot_flag = 0x1
+}
+{ .mfb
+ addl table_base = @ltoff(TANL_BASE_CONSTANTS), gp // Pointer to table ptr
+ fnorm.s1 Norm_Arg = Arg // Normalize x
+ br.cond.sptk COMMON_PATH
};;
+LOCAL_LIBM_END(cotl)
+
+.proc __libm_tanl#
+__libm_tanl:
+.endp __libm_tanl#
+GLOBAL_IEEE754_ENTRY(tanl)
+
+{ .mlx
+ alloc r32 = ar.pfs, 0,22,4,0
+ movl GR_sig_inv_pi = 0xa2f9836e4e44152a // significand of 1/pi
+}
+{ .mlx
+ mov GR_exp_mask = 0x1ffff // Exponent mask
+ movl GR_rshf_2to64 = 0x47e8000000000000 // 1.1000 2^(63+64)
+}
+;;
+
+// Check for NatVals, Infs, NaNs, and Zeros
{ .mfi
-(p0) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp
- nop.f 999
+ getf.exp GR_signexp_x = Arg // Get sign and exponent of x
+ fclass.m p6,p0 = Arg, 0x1E7 // Test for natval, nan, inf, zero
+ mov cot_flag = 0x0
+}
+{ .mfi
+ addl table_base = @ltoff(TANL_BASE_CONSTANTS), gp // Pointer to table ptr
+ fnorm.s1 Norm_Arg = Arg // Normalize x
nop.i 0
+};;
+
+// Common path for both tanl and cotl
+COMMON_PATH:
+{ .mfi
+ setf.sig FR_inv_pi_2to63 = GR_sig_inv_pi // Form 1/pi * 2^63
+ fclass.m p9, p0 = Arg, 0x0b // Test x denormal
+ mov GR_exp_2tom64 = 0xffff - 64 // Scaling constant to compute N
+}
+{ .mlx
+ setf.d FR_rshf_2to64 = GR_rshf_2to64 // Form const 1.1000 * 2^(63+64)
+ movl GR_rshf = 0x43e8000000000000 // Form const 1.1000 * 2^63
}
;;
-{ .mmi
-(p0) ld8 table_ptr1 = [table_ptr1]
- setf.sig fp_tmp = gr_tmp // Make a constant so fmpy produces inexact
- nop.i 999
+
+// Check for everything - if false, then must be pseudo-zero or pseudo-nan.
+// Branch out to deal with special values.
+{ .mfi
+ addl gr_tmp = -1,r0
+ fclass.nm p7,p0 = Arg, 0x1FF // Test x unsupported
+ mov GR_exp_2_to_63 = 0xffff + 63 // Exponent of 2^63
+}
+{ .mfb
+ ld8 table_base = [table_base] // Get pointer to constant table
+ fms.s1 mOne = f0, f0, f1
+(p6) br.cond.spnt TANL_SPECIAL // Branch if x natval, nan, inf, zero
}
;;
-//
-// Check for NatVals, Infs , NaNs, and Zeros
-// Check for everything - if false, then must be pseudo-zero
-// or pseudo-nan.
-// Local table pointer
-//
-{ .mbb
-(p0) add table_ptr2 = 96, table_ptr1
-(p6) br.cond.spnt L(TANL_SPECIAL)
-(p7) br.cond.spnt L(TANL_SPECIAL) ;;
+{ .mmb
+ setf.sig fp_tmp = gr_tmp // Make a constant so fmpy produces inexact
+ mov GR_exp_2_to_24 = 0xffff + 24 // Exponent of 2^24
+(p9) br.cond.spnt TANL_DENORMAL // Branch if x denormal
}
+;;
+
+TANL_COMMON:
+// Return to here if x denormal
//
-// Point to Inv_P_0
-// Branch out to deal with unsupporteds and special values.
-//
-{ .mmf
-(p0) ldfs TWO_TO_24 = [table_ptr1],4
-(p0) ldfs TWO_TO_63 = [table_ptr2],4
-//
-// Load -2**24, load -2**63.
-//
-(p0) fcmp.eq.s0 p0, p6 = Arg, f1 ;;
-}
+// Do fcmp to generate Denormal exception
+// - can't do FNORM (will generate Underflow when U is unmasked!)
+// Branch out to deal with unsupported values.
{ .mfi
-(p0) ldfs NEGTWO_TO_63 = [table_ptr2],12
-(p0) fnorm.s1 Arg = Arg
- nop.i 999
+ setf.exp FR_2tom64 = GR_exp_2tom64 // Form 2^-64 for scaling N_float
+ fcmp.eq.s0 p0, p6 = Arg, f1 // Dummy to flag denormals
+ add table_ptr1 = 0, table_base // Point to tanl_table_1
}
-//
-// Load 2**24, Load 2**63.
-//
-{ .mmi
-(p0) ldfs NEGTWO_TO_24 = [table_ptr1],12 ;;
-//
-// Do fcmp to generate Denormal exception
-// - can't do FNORM (will generate Underflow when U is unmasked!)
-// Normalize input argument.
-//
-(p0) ldfe two_by_PI = [table_ptr1],16
- nop.i 999
+{ .mib
+ setf.d FR_rshf = GR_rshf // Form right shift const 1.1000 * 2^63
+ add table_ptr2 = 80, table_base // Point to tanl_table_2
+(p7) br.cond.spnt TANL_UNSUPPORTED // Branch if x unsupported type
}
-{ .mmi
-(p0) ldfe Inv_P_0 = [table_ptr2],16 ;;
-(p0) ldfe d_1 = [table_ptr2],16
- nop.i 999
+;;
+
+{ .mfi
+ and GR_exp_x = GR_exp_mask, GR_signexp_x // Get exponent of x
+ fmpy.s1 Save_Norm_Arg = Norm_Arg, f1 // Save x if large arg reduction
+ dep.z bmask1 = 0x7c, 56, 8 // Form mask to get 5 msb of r
+ // bmask1 = 0x7c00000000000000
}
+;;
+
//
// Decide about the paths to take:
-// PR_1 and PR_3 set if -2**24 < Arg < 2**24 - CASE 1 OR 2
-// OTHERWISE - CASE 3 OR 4
-// Load inverse of P_0 .
-// Set PR_6 if Arg <= -2**63
-// Are there any Infs, NaNs, or zeros?
+// Set PR_6 if |Arg| >= 2**63
+// Set PR_8 if |Arg| >= 2**24 - CASE 3 OR 4
+// OTHERWISE fall through (PR_8 clear) - CASE 1 OR 2
//
-{ .mmi
-(p0) ldfe P_0 = [table_ptr1],16 ;;
-(p0) ldfe d_2 = [table_ptr2],16
- nop.i 999
+// Branch out if the magnitude of the input argument is >= 2^63
+// - do this branch before the next.
+{ .mfi
+ ldfe two_by_PI = [table_ptr1],16 // Load 2/pi
+ nop.f 999
+ dep.z bmask2 = 0x41, 57, 7 // Form mask to OR to produce B
+ // bmask2 = 0x8200000000000000
}
-//
-// Set PR_8 if Arg <= -2**24
-// Set PR_6 if Arg >= 2**63
-//
-{ .mmi
-(p0) ldfe P_1 = [table_ptr1],16 ;;
-(p0) ldfe PI_BY_4 = [table_ptr2],16
- nop.i 999
+{ .mib
+ ldfe PI_BY_4 = [table_ptr2],16 // Load pi/4
+ cmp.ge p6,p0 = GR_exp_x, GR_exp_2_to_63 // Is |x| >= 2^63
+(p6) br.cond.spnt TANL_ARG_TOO_LARGE // Branch if |x| >= 2^63
}
-//
-// Set PR_8 if Arg >= 2**24
-//
+;;
+
{ .mmi
-(p0) ldfe P_2 = [table_ptr1],16 ;;
-(p0) ldfe MPI_BY_4 = [table_ptr2],16
- nop.i 999
-}
-//
-// Load P_2 and PI_BY_4
-//
-{ .mfi
-(p0) ldfe P_3 = [table_ptr1],16
- nop.f 999
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p0) fcmp.le.unc.s1 p6,p7 = Arg,NEGTWO_TO_63
- nop.i 999
+ ldfe P_0 = [table_ptr1],16 // Load P_0
+ ldfe Inv_P_0 = [table_ptr2],16 // Load Inv_P_0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fcmp.le.unc.s1 p8,p9 = Arg,NEGTWO_TO_24
- nop.i 999 ;;
+ ldfe P_1 = [table_ptr1],16 // Load P_1
+ fmerge.s Abs_Arg = f0, Norm_Arg // Get |x|
+ mov GR_exp_m2tom33 = 0x2ffff - 33 // Form signexp of -2^-33
}
{ .mfi
- nop.m 999
-(p7) fcmp.ge.s1 p6,p0 = Arg,TWO_TO_63
- nop.i 999
+ ldfe d_1 = [table_ptr2],16 // Load d_1 for 2^24 <= |x| < 2^63
+ nop.f 999
+ mov GR_exp_2tom33 = 0xffff - 33 // Form signexp of 2^-33
}
-{ .mfi
- nop.m 999
-(p9) fcmp.ge.s1 p8,p0 = Arg,TWO_TO_24
- nop.i 999 ;;
+;;
+
+{ .mmi
+ ldfe P_2 = [table_ptr1],16 // Load P_2
+ ldfe d_2 = [table_ptr2],16 // Load d_2 for 2^24 <= |x| < 2^63
+ cmp.ge p8,p0 = GR_exp_x, GR_exp_2_to_24 // Is |x| >= 2^24
}
-{ .mib
- nop.m 999
- nop.i 999
-//
-// Load P_3 and -PI_BY_4
-//
-(p6) br.cond.spnt L(TANL_ARG_TOO_LARGE) ;;
+;;
+
+// Use special scaling to right shift so N=Arg * 2/pi is in rightmost bits
+// Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24
+{ .mfb
+ ldfe P_3 = [table_ptr1],16 // Load P_3
+ fma.s1 N_fix = Norm_Arg, FR_inv_pi_2to63, FR_rshf_2to64
+(p8) br.cond.spnt TANL_LARGER_ARG // Branch if 2^24 <= |x| < 2^63
}
-{ .mib
- nop.m 999
- nop.i 999
-//
-// Load 2**(-2).
-// Load -2**(-2).
-// Branch out if we have a special argument.
-// Branch out if the magnitude of the input argument is too large
-// - do this branch before the next.
+;;
+
+// Here if 0 < |x| < 2^24
+// ARGUMENT REDUCTION CODE - CASE 1 and 2
//
-(p8) br.cond.spnt L(TANL_LARGER_ARG) ;;
+{ .mmf
+ setf.exp TWO_TO_NEG33 = GR_exp_2tom33 // Form 2^-33
+ setf.exp NEGTWO_TO_NEG33 = GR_exp_m2tom33 // Form -2^-33
+ fmerge.s r = Norm_Arg,Norm_Arg // Assume r=x, ok if |x| < pi/4
}
+;;
+
//
-// Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24
+// If |Arg| < pi/4, set PR_8, else pi/4 <=|Arg| < 2^24 - set PR_9.
//
+// Case 2: Convert integer N_fix back to normalized floating-point value.
{ .mfi
-(p0) ldfs TWO_TO_NEG2 = [table_ptr2],4
-// ARGUMENT REDUCTION CODE - CASE 1 and 2
-// Load 2**(-2).
-// Load -2**(-2).
-(p0) fmpy.s1 N = Arg,two_by_PI
- nop.i 999 ;;
+ getf.sig sig_r = Norm_Arg // Get sig_r if 1/4 <= |x| < pi/4
+ fcmp.lt.s1 p8,p9= Abs_Arg,PI_BY_4 // Test |x| < pi/4
+ mov GR_exp_2tom2 = 0xffff - 2 // Form signexp of 2^-2
}
{ .mfi
-(p0) ldfs NEGTWO_TO_NEG2 = [table_ptr2],12
-//
-// N = Arg * 2/pi
-//
-(p0) fcmp.lt.unc.s1 p8,p9= Arg,PI_BY_4
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-//
-// if Arg < pi/4, set PR_8.
-//
-(p8) fcmp.gt.s1 p8,p9= Arg,MPI_BY_4
- nop.i 999 ;;
+ ldfps TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2] // Load 2^-2, -2^-2
+ fms.s1 N = N_fix, FR_2tom64, FR_rshf // Use scaling to get N floated
+ mov N_fix_gr = r0 // Assume N=0, ok if |x| < pi/4
}
+;;
+
//
// Case 1: Is |r| < 2**(-2).
// Arg is the same as r in this case.
// r = Arg
// c = 0
//
+// Case 2: Place integer part of N in GP register.
{ .mfi
-(p8) mov N_fix_gr = r0
-//
-// if Arg > -pi/4, reset PR_8.
-// Select the case when |Arg| < pi/4 - set PR[8] = true.
-// Else Select the case when |Arg| >= pi/4 - set PR[9] = true.
-//
-(p0) fcvt.fx.s1 N_fix = N
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-//
-// Grab the integer part of N .
-//
-(p8) mov r = Arg
- nop.i 999
-}
-{ .mfi
- nop.m 999
-(p8) mov c = f0
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p8) fcmp.lt.unc.s1 p10, p11 = Arg, TWO_TO_NEG2
- nop.i 999 ;;
+(p9) getf.sig N_fix_gr = N_fix
+ fmerge.s c = f0, f0 // Assume c=0, ok if |x| < pi/4
+ cmp.lt p10, p0 = GR_exp_x, GR_exp_2tom2 // Test if |x| < 1/4
}
+;;
+
{ .mfi
- nop.m 999
-(p10) fcmp.gt.s1 p10,p0 = Arg, NEGTWO_TO_NEG2
- nop.i 999 ;;
+ setf.sig B_mask1 = bmask1 // Form mask to get 5 msb of r
+ nop.f 999
+ mov exp_r = GR_exp_x // Get exp_r if 1/4 <= |x| < pi/4
}
-{ .mfi
- nop.m 999
-//
-// Case 2: Place integer part of N in GP register.
-//
-(p9) fcvt.xf N = N_fix
- nop.i 999 ;;
-}
-{ .mib
-(p9) getf.sig N_fix_gr = N_fix
- nop.i 999
-//
-// Case 2: Convert integer N_fix back to normalized floating-point value.
-//
-(p10) br.cond.spnt L(TANL_SMALL_R) ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p8) br.cond.sptk L(TANL_NORMAL_R) ;;
+{ .mbb
+ setf.sig B_mask2 = bmask2 // Form mask to form B from r
+(p10) br.cond.spnt TANL_SMALL_R // Branch if 0 < |x| < 1/4
+(p8) br.cond.spnt TANL_NORMAL_R // Branch if 1/4 <= |x| < pi/4
}
+;;
+
+// Here if pi/4 <= |x| < 2^24
//
// Case 1: PR_3 is only affected when PR_1 is set.
//
-{ .mmi
-(p9) ldfs TWO_TO_NEG33 = [table_ptr2], 4 ;;
//
-// Case 2: Load 2**(-33).
+// Case 2: w = N * P_2
+// Case 2: s_val = -N * P_1 + Arg
//
-(p9) ldfs NEGTWO_TO_NEG33 = [table_ptr2], 4
- nop.i 999 ;;
+
+{ .mfi
+ nop.m 999
+ fnma.s1 s_val = N, P_1, Norm_Arg
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Case 2: Load -2**(-33).
-//
-(p9) fnma.s1 s_val = N, P_1, Arg
- nop.i 999
+ nop.m 999
+ fmpy.s1 w = N, P_2 // w = N * P_2 for |s| >= 2^-33
+ nop.i 999
}
+;;
+
+// Case 2_reduce: w = N * P_3 (change sign)
{ .mfi
- nop.m 999
-(p9) fmpy.s1 w = N, P_2
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 w2 = N, P_3 // w = N * P_3 for |s| < 2^-33
+ nop.i 999
}
+;;
+
+// Case 1_reduce: r = s + w (change sign)
{ .mfi
- nop.m 999
-//
-// Case 2: w = N * P_2
-// Case 2: s_val = -N * P_1 + Arg
-//
-(p0) fcmp.lt.unc.s1 p9,p8 = s_val, TWO_TO_NEG33
- nop.i 999 ;;
+ nop.m 999
+ fsub.s1 r = s_val, w // r = s_val - w for |s| >= 2^-33
+ nop.i 999
}
+;;
+
+// Case 2_reduce: U_1 = N * P_2 + w
{ .mfi
- nop.m 999
+ nop.m 999
+ fma.s1 U_1 = N, P_2, w2 // U_1 = N * P_2 + w for |s| < 2^-33
+ nop.i 999
+}
+;;
+
//
// Decide between case_1 and case_2 reduce:
+// Case 1_reduce: |s| >= 2**(-33)
+// Case 2_reduce: |s| < 2**(-33)
//
-(p9) fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33
- nop.i 999 ;;
+{ .mfi
+ nop.m 999
+ fcmp.lt.s1 p9, p8 = s_val, TWO_TO_NEG33
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Case 1_reduce: s <= -2**(-33) or s >= 2**(-33)
-// Case 2_reduce: -2**(-33) < s < 2**(-33)
-//
-(p8) fsub.s1 r = s_val, w
- nop.i 999
+ nop.m 999
+(p9) fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33
+ nop.i 999
}
+;;
+
+// Case 1_reduce: c = s - r
{ .mfi
- nop.m 999
-(p9) fmpy.s1 w = N, P_3
- nop.i 999 ;;
+ nop.m 999
+ fsub.s1 c = s_val, r // c = s_val - r for |s| >= 2^-33
+ nop.i 999
}
+;;
+
+// Case 2_reduce: r is complete here - continue to calculate c .
+// r = s - U_1
{ .mfi
- nop.m 999
-(p9) fma.s1 U_1 = N, P_2, w
- nop.i 999
+ nop.m 999
+(p9) fsub.s1 r = s_val, U_1
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
+(p9) fms.s1 U_2 = N, P_2, U_1
+ nop.i 999
+}
+;;
+
//
// Case 1_reduce: Is |r| < 2**(-2), if so set PR_10
-// else set PR_11.
+// else set PR_13.
//
-(p8) fsub.s1 c = s_val, r
- nop.i 999 ;;
-}
+
{ .mfi
- nop.m 999
-//
-// Case 1_reduce: r = s + w (change sign)
-// Case 2_reduce: w = N * P_3 (change sign)
-//
-(p8) fcmp.lt.unc.s1 p10, p11 = r, TWO_TO_NEG2
- nop.i 999 ;;
+ nop.m 999
+ fand B = B_mask1, r
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p10) fcmp.gt.s1 p10, p11 = r, NEGTWO_TO_NEG2
- nop.i 999 ;;
+ nop.m 999
+(p8) fcmp.lt.unc.s1 p10, p13 = r, TWO_TO_NEG2
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p9) fsub.s1 r = s_val, U_1
- nop.i 999
+(p8) getf.sig sig_r = r // Get signif of r if |s| >= 2^-33
+ nop.f 999
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
+(p8) getf.exp exp_r = r // Extract signexp of r if |s| >= 2^-33
+(p10) fcmp.gt.s1 p10, p13 = r, NEGTWO_TO_NEG2
+ nop.i 999
+}
+;;
+
// Case 1_reduce: c is complete here.
+// Case 1: Branch to SMALL_R or NORMAL_R.
// c = c + w (w has not been negated.)
-// Case 2_reduce: r is complete here - continue to calculate c .
-// r = s - U_1
-//
-(p9) fms.s1 U_2 = N, P_2, U_1
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
+ nop.m 999
+(p8) fsub.s1 c = c, w // c = c - w for |s| >= 2^-33
+ nop.i 999
+}
+{ .mbb
+ nop.m 999
+(p10) br.cond.spnt TANL_SMALL_R // Branch if pi/4 < |x| < 2^24 and |r|<1/4
+(p13) br.cond.sptk TANL_NORMAL_R_A // Branch if pi/4 < |x| < 2^24 and |r|>=1/4
+}
+;;
+
+
+// Here if pi/4 < |x| < 2^24 and |s| < 2^-33
//
-// Case 1_reduce: c = s - r
-// Case 2_reduce: U_1 = N * P_2 + w
+// Is i_1 = lsb of N_fix_gr even or odd?
+// if i_1 == 0, set p11, else set p12.
//
-(p8) fsub.s1 c = c, w
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-(p9) fsub.s1 s_val = s_val, r
- nop.i 999
+ nop.m 999
+ fsub.s1 s_val = s_val, r
+ add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl)
}
-{ .mfb
- nop.m 999
+{ .mfi
+ nop.m 999
//
// Case 2_reduce:
// U_2 = N * P_2 - U_1
// Not needed until later.
//
-(p9) fadd.s1 U_2 = U_2, w
+ fadd.s1 U_2 = U_2, w2
//
// Case 2_reduce:
// s = s - r
// U_2 = U_2 + w
//
-(p10) br.cond.spnt L(TANL_SMALL_R) ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p11) br.cond.sptk L(TANL_NORMAL_R) ;;
+ nop.i 999
}
-{ .mii
- nop.m 999
+;;
+
//
// Case 2_reduce:
// c = c - U_2
// c is complete here
// Argument reduction ends here.
//
-(p9) extr.u i_1 = N_fix_gr, 0, 1 ;;
-(p9) cmp.eq.unc p11, p12 = 0x0000,i_1 ;;
-}
-{ .mfi
- nop.m 999
-//
-// Is i_1 even or odd?
-// if i_1 == 0, set p11, else set p12.
-//
-(p11) fmpy.s1 rsq = r, r
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-(p12) frcpa.s1 S_hi,p0 = f1, r
- nop.i 999
+ nop.m 999
+ fmpy.s1 rsq = r, r
+ tbit.z p11, p12 = N_fix_gr, 0 ;; // Set p11 if N even, p12 if odd
}
-
-
-//
-// Case 1: Branch to SMALL_R or NORMAL_R.
-// Case 1 is done now.
-//
-
{ .mfi
-(p9) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp
-(p9) fsub.s1 c = s_val, U_1
- nop.i 999 ;;
+ nop.m 999
+(p12) frcpa.s1 S_hi,p0 = f1, r
+ nop.i 999
}
-;;
-
-{ .mmi
-(p9) ld8 table_ptr1 = [table_ptr1]
+{ .mfi
nop.m 999
+ fsub.s1 c = s_val, U_1
nop.i 999
}
;;
-
{ .mmi
-(p9) add table_ptr1 = 224, table_ptr1 ;;
-(p9) ldfe P1_1 = [table_ptr1],144
- nop.i 999 ;;
+ add table_ptr1 = 160, table_base ;; // Point to tanl_table_p1
+ ldfe P1_1 = [table_ptr1],144
+ nop.i 999 ;;
}
//
-// Get [i_1] - lsb of N_fix_gr .
// Load P1_1 and point to Q1_1 .
//
{ .mfi
-(p9) ldfe Q1_1 = [table_ptr1] , 0
+ ldfe Q1_1 = [table_ptr1]
//
// N even: rsq = r * Z
// N odd: S_hi = frcpa(r)
//
-(p12) fmerge.ns S_hi = S_hi, S_hi
- nop.i 999
+(p12) fmerge.ns S_hi = S_hi, S_hi
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// Case 2_reduce:
// c = s - U_1
//
-(p9) fsub.s1 c = c, U_2
- nop.i 999 ;;
+(p9) fsub.s1 c = c, U_2
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-(p12) fma.s1 poly1 = S_hi, r, f1
- nop.i 999 ;;
+ nop.m 999
+(p12) fma.s1 poly1 = S_hi, r, f1
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N odd: Change sign of S_hi
//
-(p11) fmpy.s1 rsq = rsq, P1_1
- nop.i 999 ;;
+(p11) fmpy.s1 rsq = rsq, P1_1
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-(p12) fma.s1 S_hi = S_hi, poly1, S_hi
- nop.i 999 ;;
+ nop.m 999
+(p12) fma.s1 S_hi = S_hi, poly1, S_hi
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even: rsq = rsq * P1_1
// N odd: poly1 = 1.0 + S_hi * r 16 bits partial account for necessary
//
-(p11) fma.s1 Result = r, rsq, c
- nop.i 999 ;;
+(p11) fma.s1 Poly = r, rsq, c
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
-// N even: Result = c + r * rsq
+// N even: Poly = c + r * rsq
// N odd: S_hi = S_hi + S_hi*poly1 16 bits account for necessary
//
-(p12) fma.s1 poly1 = S_hi, r, f1
- nop.i 999 ;;
+(p12) fma.s1 poly1 = S_hi, r, f1
+(p11) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl
}
{ .mfi
- nop.m 999
+ nop.m 999
//
-// N even: Result = Result + r
+// N even: Result = Poly + r
// N odd: poly1 = 1.0 + S_hi * r 32 bits partial
//
-(p11) fadd.s0 Result = r, Result
- nop.i 999 ;;
+(p14) fadd.s0 Result = r, Poly // for tanl
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+(p15) fms.s0 Result = r, mOne, Poly // for cotl
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p12) fma.s1 S_hi = S_hi, poly1, S_hi
- nop.i 999 ;;
+ nop.m 999
+(p12) fma.s1 S_hi = S_hi, poly1, S_hi
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even: Result1 = Result + r
// N odd: S_hi = S_hi * poly1 + S_hi 32 bits
//
-(p12) fma.s1 poly1 = S_hi, r, f1
- nop.i 999 ;;
+(p12) fma.s1 poly1 = S_hi, r, f1
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N odd: poly1 = S_hi * r + 1.0 64 bits partial
//
-(p12) fma.s1 S_hi = S_hi, poly1, S_hi
- nop.i 999 ;;
+(p12) fma.s1 S_hi = S_hi, poly1, S_hi
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N odd: poly1 = S_hi * poly + 1.0 64 bits
//
-(p12) fma.s1 poly1 = S_hi, r, f1
- nop.i 999 ;;
+(p12) fma.s1 poly1 = S_hi, r, f1
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N odd: poly1 = S_hi * r + 1.0
//
-(p12) fma.s1 poly1 = S_hi, c, poly1
- nop.i 999 ;;
+(p12) fma.s1 poly1 = S_hi, c, poly1
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N odd: poly1 = S_hi * c + poly1
//
-(p12) fmpy.s1 S_lo = S_hi, poly1
- nop.i 999 ;;
+(p12) fmpy.s1 S_lo = S_hi, poly1
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N odd: S_lo = S_hi * poly1
//
-(p12) fma.s1 S_lo = Q1_1, r, S_lo
- nop.i 999
+(p12) fma.s1 S_lo = Q1_1, r, S_lo
+(p12) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N odd: Result = S_hi + S_lo
//
-(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
- nop.i 999 ;;
+ fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
+ nop.i 999 ;;
}
-{ .mfb
- nop.m 999
+{ .mfi
+ nop.m 999
//
// N odd: S_lo = S_lo + Q1_1 * r
//
-(p12) fadd.s0 Result = S_hi, S_lo
-(p0) br.ret.sptk b0 ;;
+(p14) fadd.s0 Result = S_hi, S_lo // for tanl
+ nop.i 999
+}
+{ .mfb
+ nop.m 999
+(p15) fms.s0 Result = S_hi, mOne, S_lo // for cotl
+ br.ret.sptk b0 ;; // Exit for pi/4 <= |x| < 2^24 and |s| < 2^-33
}
-L(TANL_LARGER_ARG):
-
+TANL_LARGER_ARG:
+// Here if 2^24 <= |x| < 2^63
//
// ARGUMENT REDUCTION CODE - CASE 3 and 4
//
-{ .mfi
-(p0) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp
-(p0) fmpy.s1 N_0 = Arg, Inv_P_0
- nop.i 999
+{ .mmf
+ mov GR_exp_2tom14 = 0xffff - 14 // Form signexp of 2^-14
+ mov GR_exp_m2tom14 = 0x2ffff - 14 // Form signexp of -2^-14
+ fmpy.s1 N_0 = Norm_Arg, Inv_P_0
}
;;
{ .mmi
-(p0) ld8 table_ptr1 = [table_ptr1]
- nop.m 999
+ setf.exp TWO_TO_NEG14 = GR_exp_2tom14 // Form 2^-14
+ setf.exp NEGTWO_TO_NEG14 = GR_exp_m2tom14// Form -2^-14
nop.i 999
}
;;
@@ -1622,661 +1806,605 @@ L(TANL_LARGER_ARG):
// N_0 = Arg * Inv_P_0
//
{ .mmi
-(p0) add table_ptr1 = 8, table_ptr1 ;;
-//
-// Point to 2*-14
-//
-(p0) ldfs TWO_TO_NEG14 = [table_ptr1], 4
- nop.i 999 ;;
+ add table_ptr2 = 144, table_base ;; // Point to 2^-2
+ ldfps TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2]
+ nop.i 999
}
-//
-// Load 2**(-14).
-//
-{ .mmi
-(p0) ldfs NEGTWO_TO_NEG14 = [table_ptr1], 180 ;;
+;;
+
//
// N_0_fix = integer part of N_0 .
-// Adjust table_ptr1 to beginning of table.
//
-(p0) ldfs TWO_TO_NEG2 = [table_ptr1], 4
- nop.i 999 ;;
-}
//
// Make N_0 the integer part.
//
{ .mfi
-(p0) ldfs NEGTWO_TO_NEG2 = [table_ptr1]
-//
-// Load -2**(-14).
-//
-(p0) fcvt.fx.s1 N_0_fix = N_0
- nop.i 999 ;;
+ nop.m 999
+ fcvt.fx.s1 N_0_fix = N_0
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-(p0) fcvt.xf N_0 = N_0_fix
- nop.i 999 ;;
+ setf.sig B_mask1 = bmask1 // Form mask to get 5 msb of r
+ fcvt.xf N_0 = N_0_fix
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-(p0) fnma.s1 ArgPrime = N_0, P_0, Arg
- nop.i 999
+ setf.sig B_mask2 = bmask2 // Form mask to form B from r
+ fnma.s1 ArgPrime = N_0, P_0, Norm_Arg
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fmpy.s1 w = N_0, d_1
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 w = N_0, d_1
+ nop.i 999 ;;
}
-{ .mfi
- nop.m 999
//
// ArgPrime = -N_0 * P_0 + Arg
// w = N_0 * d_1
//
-(p0) fmpy.s1 N = ArgPrime, two_by_PI
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
//
// N = ArgPrime * 2/pi
//
-(p0) fcvt.fx.s1 N_fix = N
- nop.i 999 ;;
-}
+// fcvt.fx.s1 N_fix = N
+// Use special scaling to right shift so N=ArgPrime * 2/pi is in rightmost bits
+// Already in Cases 3 or 4 here (2**24 <= |Arg| < 2**63), so no branch follows
{ .mfi
- nop.m 999
-//
-// N_fix is the integer part.
-//
-(p0) fcvt.xf N = N_fix
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 N_fix = ArgPrime, FR_inv_pi_2to63, FR_rshf_2to64
+
+ nop.i 999 ;;
}
+// Convert integer N_fix back to normalized floating-point value.
{ .mfi
-(p0) getf.sig N_fix_gr = N_fix
- nop.f 999
- nop.i 999 ;;
+ nop.m 999
+ fms.s1 N = N_fix, FR_2tom64, FR_rshf // Use scaling to get N floated
+ nop.i 999
}
-{ .mfi
- nop.m 999
+;;
+
//
// N is the integer part of the reduced-reduced argument.
// Put the integer in a GP register.
//
-(p0) fnma.s1 s_val = N, P_1, ArgPrime
- nop.i 999
-}
{ .mfi
- nop.m 999
-(p0) fnma.s1 w = N, P_2, w
- nop.i 999 ;;
+ getf.sig N_fix_gr = N_fix
+ nop.f 999
+ nop.i 999
}
-{ .mfi
- nop.m 999
+;;
+
//
// s_val = -N*P_1 + ArgPrime
// w = -N*P_2 + w
//
-(p0) fcmp.lt.unc.s1 p11, p10 = s_val, TWO_TO_NEG14
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-//
-// Case 3: r = s_val + w (Z complete)
-// Case 4: U_hi = N_0 * d_1
-//
-(p10) fmpy.s1 V_hi = N, P_2
- nop.i 999
+ nop.m 999
+ fnma.s1 s_val = N, P_1, ArgPrime
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p11) fmpy.s1 U_hi = N_0, d_1
- nop.i 999 ;;
+ nop.m 999
+ fnma.s1 w = N, P_2, w
+ nop.i 999
}
-{ .mfi
- nop.m 999
-//
-// Case 3: r = s_val + w (Z complete)
+;;
+
+// Case 4: V_hi = N * P_2
// Case 4: U_hi = N_0 * d_1
-//
-(p11) fmpy.s1 V_hi = N, P_2
- nop.i 999
-}
{ .mfi
- nop.m 999
-(p11) fmpy.s1 U_hi = N_0, d_1
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 V_hi = N, P_2 // V_hi = N * P_2 for |s| < 2^-14
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Decide between case 3 and 4:
-// Case 3: s <= -2**(-14) or s >= 2**(-14)
-// Case 4: -2**(-14) < s < 2**(-14)
-//
-(p10) fadd.s1 r = s_val, w
- nop.i 999
+ nop.m 999
+ fmpy.s1 U_hi = N_0, d_1 // U_hi = N_0 * d_1 for |s| < 2^-14
+ nop.i 999
}
+;;
+
+// Case 3: r = s_val + w (Z complete)
+// Case 4: w = N * P_3
{ .mfi
- nop.m 999
-(p11) fmpy.s1 w = N, P_3
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 r = s_val, w // r = s_val + w for |s| >= 2^-14
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Case 4: We need abs of both U_hi and V_hi - dont
-// worry about switched sign of V_hi .
-//
-(p11) fsub.s1 A = U_hi, V_hi
- nop.i 999
+ nop.m 999
+ fmpy.s1 w2 = N, P_3 // w = N * P_3 for |s| < 2^-14
+ nop.i 999
}
-{ .mfi
- nop.m 999
-//
+;;
+
// Case 4: A = U_hi + V_hi
// Note: Worry about switched sign of V_hi, so subtract instead of add.
-//
-(p11) fnma.s1 V_lo = N, P_2, V_hi
- nop.i 999 ;;
+// Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup)
+// Note: the (-) is still missing for V_hi.
+{ .mfi
+ nop.m 999
+ fsub.s1 A = U_hi, V_hi // A = U_hi - V_hi for |s| < 2^-14
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p11) fms.s1 U_lo = N_0, d_1, U_hi
- nop.i 999 ;;
+ nop.m 999
+ fnma.s1 V_lo = N, P_2, V_hi // V_lo = V_hi - N * P_2 for |s| < 2^-14
+ nop.i 999
}
+;;
+
+// Decide between case 3 and 4:
+// Case 3: |s| >= 2**(-14) Set p10
+// Case 4: |s| < 2**(-14) Set p11
+//
+// Case 4: U_lo = N_0 * d_1 - U_hi
{ .mfi
- nop.m 999
-(p11) fabs V_hiabs = V_hi
- nop.i 999
+ nop.m 999
+ fms.s1 U_lo = N_0, d_1, U_hi // U_lo = N_0*d_1 - U_hi for |s| < 2^-14
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Case 4: V_hi = N * P_2
-// w = N * P_3
-// Note the product does not include the (-) as in the writeup
-// so (-) missing for V_hi and w .
-(p10) fadd.s1 r = s_val, w
- nop.i 999 ;;
+ nop.m 999
+ fcmp.lt.s1 p11, p10 = s_val, TWO_TO_NEG14
+ nop.i 999
}
+;;
+
+// Case 4: We need abs of both U_hi and V_hi - don't
+// worry about switched sign of V_hi.
{ .mfi
- nop.m 999
-//
-// Case 3: c = s_val - r
-// Case 4: U_lo = N_0 * d_1 - U_hi
-//
-(p11) fabs U_hiabs = U_hi
- nop.i 999
+ nop.m 999
+ fabs V_hiabs = V_hi // |V_hi| for |s| < 2^-14
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p11) fmpy.s1 w = N, P_3
- nop.i 999 ;;
+ nop.m 999
+(p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14
+ nop.i 999
}
+;;
+
+// Case 3: c = s_val - r
{ .mfi
- nop.m 999
-//
-// Case 4: Set P_12 if U_hiabs >= V_hiabs
-//
-(p11) fadd.s1 C_hi = s_val, A
- nop.i 999 ;;
+ nop.m 999
+ fabs U_hiabs = U_hi // |U_hi| for |s| < 2^-14
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
+ fsub.s1 c = s_val, r // c = s_val - r for |s| >= 2^-14
+ nop.i 999
+}
+;;
+
+// For Case 3, |s| >= 2^-14, determine if |r| < 1/4
//
// Case 4: C_hi = s_val + A
//
-(p11) fadd.s1 t = U_lo, V_lo
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-//
-// Case 3: Is |r| < 2**(-2), if so set PR_7
-// else set PR_8.
-// Case 3: If PR_7 is set, prepare to branch to Small_R.
-// Case 3: If PR_8 is set, prepare to branch to Normal_R.
-//
-(p10) fsub.s1 c = s_val, r
- nop.i 999 ;;
+ nop.m 999
+(p11) fadd.s1 C_hi = s_val, A // C_hi = s_val + A for |s| < 2^-14
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Case 3: c = (s - r) + w (c complete)
-//
-(p11) fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs
- nop.i 999
+ nop.m 999
+(p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p11) fms.s1 w = N_0, d_2, w
- nop.i 999 ;;
+ getf.sig sig_r = r // Get signif of r if |s| >= 2^-14
+ fand B = B_mask1, r
+ nop.i 999
}
+;;
+
+// Case 4: t = U_lo + V_lo
{ .mfi
- nop.m 999
-//
-// Case 4: V_hi = N * P_2
-// w = N * P_3
-// Note the product does not include the (-) as in the writeup
-// so (-) missing for V_hi and w .
-//
-(p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2
- nop.i 999 ;;
+ getf.exp exp_r = r // Extract signexp of r if |s| >= 2^-14
+(p11) fadd.s1 t = U_lo, V_lo // t = U_lo + V_lo for |s| < 2^-14
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
(p14) fcmp.gt.s1 p14, p15 = r, NEGTWO_TO_NEG2
- nop.i 999 ;;
+ nop.i 999
}
-{ .mfb
- nop.m 999
+;;
+
+// Case 3: c = (s - r) + w (c complete)
+{ .mfi
+ nop.m 999
+(p10) fadd.s1 c = c, w // c = c + w for |s| >= 2^-14
+ nop.i 999
+}
+{ .mbb
+ nop.m 999
+(p14) br.cond.spnt TANL_SMALL_R // Branch if 2^24 <= |x| < 2^63 and |r|< 1/4
+(p15) br.cond.sptk TANL_NORMAL_R_A // Branch if 2^24 <= |x| < 2^63 and |r|>=1/4
+}
+;;
+
+
+// Here if 2^24 <= |x| < 2^63 and |s| < 2^-14 >>>>>>> Case 4.
//
-// Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup)
-// Note: the (-) is still missing for V_hi .
+// Case 4: Set P_12 if U_hiabs >= V_hiabs
// Case 4: w = w + N_0 * d_2
// Note: the (-) is now incorporated in w .
-//
-(p10) fadd.s1 c = c, w
-//
-// Case 4: t = U_lo + V_lo
-// Note: remember V_lo should be (-), subtract instead of add. NO
-//
-(p14) br.cond.spnt L(TANL_SMALL_R) ;;
-}
-{ .mib
- nop.m 999
- nop.i 999
-(p15) br.cond.spnt L(TANL_NORMAL_R) ;;
-}
{ .mfi
- nop.m 999
-//
-// Case 3: Vector off when |r| < 2**(-2). Recall that PR_3 will be true.
-// The remaining stuff is for Case 4.
-//
-(p12) fsub.s1 a = U_hi, A
-(p11) extr.u i_1 = N_fix_gr, 0, 1 ;;
+ add table_ptr1 = 160, table_base // Point to tanl_table_p1
+ fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Case 4: C_lo = s_val - C_hi
-//
-(p11) fadd.s1 t = t, w
- nop.i 999
+ nop.m 999
+ fms.s1 w2 = N_0, d_2, w2
+ nop.i 999
}
+;;
+
+// Case 4: C_lo = s_val - C_hi
{ .mfi
- nop.m 999
-(p13) fadd.s1 a = V_hi, A
- nop.i 999 ;;
+ ldfe P1_1 = [table_ptr1], 16 // Load P1_1
+ fsub.s1 C_lo = s_val, C_hi
+ nop.i 999
}
-
-
+;;
//
// Case 4: a = U_hi - A
// a = V_hi - A (do an add to account for missing (-) on V_hi
//
-
{ .mfi
-(p11) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp
-(p11) fsub.s1 C_lo = s_val, C_hi
- nop.i 999
+ ldfe P1_2 = [table_ptr1], 128 // Load P1_2
+(p12) fsub.s1 a = U_hi, A
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+(p13) fadd.s1 a = V_hi, A
+ nop.i 999
}
;;
+// Case 4: t = U_lo + V_lo + w
+{ .mfi
+ ldfe Q1_1 = [table_ptr1], 16 // Load Q1_1
+ fadd.s1 t = t, w2
+ nop.i 999
+}
+;;
-
-//
// Case 4: a = (U_hi - A) + V_hi
// a = (V_hi - A) + U_hi
// In each case account for negative missing form V_hi .
//
-
-
-{ .mmi
-(p11) ld8 table_ptr1 = [table_ptr1]
+{ .mfi
+ ldfe Q1_2 = [table_ptr1], 16 // Load Q1_2
+(p12) fsub.s1 a = a, V_hi
+ nop.i 999
+}
+{ .mfi
nop.m 999
+(p13) fsub.s1 a = U_hi, a
nop.i 999
}
;;
-
//
// Case 4: C_lo = (s_val - C_hi) + A
//
-{ .mmi
-(p11) add table_ptr1 = 224, table_ptr1 ;;
-(p11) ldfe P1_1 = [table_ptr1], 16
- nop.i 999 ;;
-}
{ .mfi
-(p11) ldfe P1_2 = [table_ptr1], 128
-//
-// Case 4: w = U_lo + V_lo + w
-//
-(p12) fsub.s1 a = a, V_hi
- nop.i 999 ;;
-}
-//
-// Case 4: r = C_hi + C_lo
-//
-{ .mfi
-(p11) ldfe Q1_1 = [table_ptr1], 16
-(p11) fadd.s1 C_lo = C_lo, A
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 C_lo = C_lo, A
+ nop.i 999 ;;
}
//
-// Case 4: c = C_hi - r
-// Get [i_1] - lsb of N_fix_gr.
+// Case 4: t = t + a
//
{ .mfi
-(p11) ldfe Q1_2 = [table_ptr1], 16
- nop.f 999
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 t = t, a
+ nop.i 999
}
+;;
+
+// Case 4: C_lo = C_lo + t
+// Case 4: r = C_hi + C_lo
{ .mfi
- nop.m 999
-(p13) fsub.s1 a = U_hi, a
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 C_lo = C_lo, t
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p11) fadd.s1 t = t, a
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 r = C_hi, C_lo
+ nop.i 999
}
-{ .mfi
- nop.m 999
+;;
+
//
-// Case 4: t = t + a
+// Case 4: c = C_hi - r
//
-(p11) fadd.s1 C_lo = C_lo, t
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-//
-// Case 4: C_lo = C_lo + t
-//
-(p11) fadd.s1 r = C_hi, C_lo
- nop.i 999 ;;
+ nop.m 999
+ fsub.s1 c = C_hi, r
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p11) fsub.s1 c = C_hi, r
- nop.i 999
+ nop.m 999
+ fmpy.s1 rsq = r, r
+ add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl)
}
-{ .mfi
- nop.m 999
-//
+;;
+
// Case 4: c = c + C_lo finished.
-// Is i_1 even or odd?
-// if i_1 == 0, set PR_4, else set PR_5.
//
-// r and c have been computed.
-// We known whether this is the sine or cosine routine.
-// Make sure ftz mode is set - should be automatic when using wre
-(p0) fmpy.s1 rsq = r, r
- nop.i 999 ;;
-}
+// Is i_1 = lsb of N_fix_gr even or odd?
+// if i_1 == 0, set PR_11, else set PR_12.
+//
{ .mfi
- nop.m 999
-(p11) fadd.s1 c = c , C_lo
-(p11) cmp.eq.unc p11, p12 = 0x0000, i_1 ;;
+ nop.m 999
+ fadd.s1 c = c , C_lo
+ tbit.z p11, p12 = N_fix_gr, 0
}
+;;
+
+// r and c have been computed.
{ .mfi
- nop.m 999
+ nop.m 999
(p12) frcpa.s1 S_hi, p0 = f1, r
- nop.i 999
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N odd: Change sign of S_hi
//
-(p11) fma.s1 Result = rsq, P1_2, P1_1
- nop.i 999 ;;
+(p11) fma.s1 Poly = rsq, P1_2, P1_1
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fma.s1 P = rsq, Q1_2, Q1_1
- nop.i 999
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N odd: Result = S_hi + S_lo (User supplied rounding mode for C1)
//
-(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
- nop.i 999 ;;
+ fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even: rsq = r * r
// N odd: S_hi = frcpa(r)
//
(p12) fmerge.ns S_hi = S_hi, S_hi
- nop.i 999
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even: rsq = rsq * P1_2 + P1_1
// N odd: poly1 = 1.0 + S_hi * r 16 bits partial account for necessary
//
-(p11) fmpy.s1 Result = rsq, Result
- nop.i 999 ;;
+(p11) fmpy.s1 Poly = rsq, Poly
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fma.s1 poly1 = S_hi, r,f1
- nop.i 999
+(p11) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl
}
{ .mfi
- nop.m 999
+ nop.m 999
//
-// N even: Result = Result * rsq
+// N even: Poly = Poly * rsq
// N odd: S_hi = S_hi + S_hi*poly1 16 bits account for necessary
//
-(p11) fma.s1 Result = r, Result, c
- nop.i 999 ;;
+(p11) fma.s1 Poly = r, Poly, c
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fma.s1 S_hi = S_hi, poly1, S_hi
- nop.i 999
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N odd: S_hi = S_hi * poly1 + S_hi 32 bits
//
-(p11) fadd.s0 Result= r, Result
- nop.i 999 ;;
+(p14) fadd.s0 Result = r, Poly // for tanl
+ nop.i 999 ;;
}
+
+.pred.rel "mutex",p15,p12
{ .mfi
- nop.m 999
+ nop.m 999
+(p15) fms.s0 Result = r, mOne, Poly // for cotl
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
(p12) fma.s1 poly1 = S_hi, r, f1
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
-// N even: Result = Result * r + c
+// N even: Poly = Poly * r + c
// N odd: poly1 = 1.0 + S_hi * r 32 bits partial
//
(p12) fma.s1 S_hi = S_hi, poly1, S_hi
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fma.s1 poly1 = S_hi, r, f1
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
-// N even: Result1 = Result + r (Rounding mode S0)
+// N even: Result = Poly + r (Rounding mode S0)
// N odd: poly1 = S_hi * r + 1.0 64 bits partial
//
(p12) fma.s1 S_hi = S_hi, poly1, S_hi
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N odd: poly1 = S_hi * poly + S_hi 64 bits
//
(p12) fma.s1 poly1 = S_hi, r, f1
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N odd: poly1 = S_hi * r + 1.0
//
(p12) fma.s1 poly1 = S_hi, c, poly1
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N odd: poly1 = S_hi * c + poly1
//
(p12) fmpy.s1 S_lo = S_hi, poly1
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N odd: S_lo = S_hi * poly1
//
(p12) fma.s1 S_lo = P, r, S_lo
- nop.i 999 ;;
+(p12) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl
+}
+
+{ .mfi
+ nop.m 999
+(p14) fadd.s0 Result = S_hi, S_lo // for tanl
+ nop.i 999
}
{ .mfb
- nop.m 999
+ nop.m 999
//
// N odd: S_lo = S_lo + r * P
//
-(p12) fadd.s0 Result = S_hi, S_lo
-(p0) br.ret.sptk b0 ;;
+(p15) fms.s0 Result = S_hi, mOne, S_lo // for cotl
+ br.ret.sptk b0 ;; // Exit for 2^24 <= |x| < 2^63 and |s| < 2^-14
}
-L(TANL_SMALL_R):
-{ .mii
- nop.m 999
-(p0) extr.u i_1 = N_fix_gr, 0, 1 ;;
-(p0) cmp.eq.unc p11, p12 = 0x0000, i_1
-}
+TANL_SMALL_R:
+// Here if |r| < 1/4
+// r and c have been computed.
+// *****************************************************************
+// *****************************************************************
+// *****************************************************************
+// N odd: S_hi = frcpa(r)
+// Get [i_1] - lsb of N_fix_gr. Set p11 if N even, p12 if N odd.
+// N even: rsq = r * r
{ .mfi
- nop.m 999
-(p0) fmpy.s1 rsq = r, r
- nop.i 999 ;;
+ add table_ptr1 = 160, table_base // Point to tanl_table_p1
+ frcpa.s1 S_hi, p0 = f1, r // S_hi for N odd
+ add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl)
}
{ .mfi
-(p0) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp
-(p12) frcpa.s1 S_hi, p0 = f1, r
- nop.i 999
+ add table_ptr2 = 400, table_base // Point to Q1_7
+ fmpy.s1 rsq = r, r
+ nop.i 999
}
;;
-
{ .mmi
-(p0) ld8 table_ptr1 = [table_ptr1]
- nop.m 999
- nop.i 999
+ ldfe P1_1 = [table_ptr1], 16
+;;
+ ldfe P1_2 = [table_ptr1], 16
+ tbit.z p11, p12 = N_fix_gr, 0
}
;;
-// *****************************************************************
-// *****************************************************************
-// *****************************************************************
-
-{ .mmi
-(p0) add table_ptr1 = 224, table_ptr1 ;;
-(p0) ldfe P1_1 = [table_ptr1], 16
- nop.i 999 ;;
-}
-// r and c have been computed.
-// We known whether this is the sine or cosine routine.
-// Make sure ftz mode is set - should be automatic when using wre
-// |r| < 2**(-2)
{ .mfi
-(p0) ldfe P1_2 = [table_ptr1], 16
-(p11) fmpy.s1 r_to_the_8 = rsq, rsq
- nop.i 999 ;;
+ ldfe P1_3 = [table_ptr1], 96
+ nop.f 999
+ nop.i 999
}
-//
-// Set table_ptr1 to beginning of constant table.
-// Get [i_1] - lsb of N_fix_gr.
-//
+;;
+
{ .mfi
-(p0) ldfe P1_3 = [table_ptr1], 96
-//
-// N even: rsq = r * r
-// N odd: S_hi = frcpa(r)
-//
+(p11) ldfe P1_9 = [table_ptr1], -16
(p12) fmerge.ns S_hi = S_hi, S_hi
- nop.i 999 ;;
+ nop.i 999
}
-//
-// Is i_1 even or odd?
-// if i_1 == 0, set PR_11.
-// if i_1 != 0, set PR_12.
-//
{ .mfi
-(p11) ldfe P1_9 = [table_ptr1], -16
+ nop.m 999
+(p11) fmpy.s1 r_to_the_8 = rsq, rsq
+ nop.i 999
+}
+;;
+
//
// N even: Poly2 = P1_7 + Poly2 * rsq
// N odd: poly2 = Q1_5 + poly2 * rsq
//
+{ .mfi
+(p11) ldfe P1_8 = [table_ptr1], -16
(p11) fadd.s1 CORR = rsq, f1
- nop.i 999 ;;
+ nop.i 999
}
-{ .mmi
-(p11) ldfe P1_8 = [table_ptr1], -16 ;;
+;;
+
//
// N even: Poly1 = P1_2 + P1_3 * rsq
-// N odd: poly1 = 1.0 + S_hi * r
+// N odd: poly1 = 1.0 + S_hi * r
// 16 bits partial account for necessary (-1)
//
+{ .mmi
(p11) ldfe P1_7 = [table_ptr1], -16
- nop.i 999 ;;
+;;
+(p11) ldfe P1_6 = [table_ptr1], -16
+ nop.i 999
}
+;;
+
//
// N even: Poly1 = P1_1 + Poly1 * rsq
// N odd: S_hi = S_hi + S_hi * poly1) 16 bits account for necessary
//
-{ .mfi
-(p11) ldfe P1_6 = [table_ptr1], -16
//
// N even: Poly2 = P1_5 + Poly2 * rsq
// N odd: poly2 = Q1_3 + poly2 * rsq
//
+{ .mfi
+(p11) ldfe P1_5 = [table_ptr1], -16
(p11) fmpy.s1 r_to_the_8 = r_to_the_8, r_to_the_8
- nop.i 999 ;;
+ nop.i 999
}
+{ .mfi
+ nop.m 999
+(p12) fma.s1 poly1 = S_hi, r, f1
+ nop.i 999
+}
+;;
+
//
// N even: Poly1 = Poly1 * rsq
// N odd: poly1 = 1.0 + S_hi * r 32 bits partial
//
-{ .mfi
-(p11) ldfe P1_5 = [table_ptr1], -16
-(p12) fma.s1 poly1 = S_hi, r, f1
- nop.i 999 ;;
-}
//
// N even: CORR = CORR * c
@@ -2290,44 +2418,30 @@ L(TANL_SMALL_R):
{ .mmf
(p11) ldfe P1_4 = [table_ptr1], -16
-(p0) addl table_ptr2 = @ltoff(TANL_BASE_CONSTANTS), gp
-(p11) fmpy.s1 CORR = CORR, c
-}
-;;
-
-
-{ .mmi
-(p0) ld8 table_ptr2 = [table_ptr2]
nop.m 999
- nop.i 999
+(p11) fmpy.s1 CORR = CORR, c
}
;;
-
-{ .mii
-(p0) add table_ptr2 = 464, table_ptr2
- nop.i 999 ;;
- nop.i 999
-}
{ .mfi
- nop.m 999
+ nop.m 999
(p11) fma.s1 Poly1 = P1_3, rsq, P1_2
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
-(p0) ldfe Q1_7 = [table_ptr2], -16
+(p12) ldfe Q1_7 = [table_ptr2], -16
(p12) fma.s1 S_hi = S_hi, poly1, S_hi
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
-(p0) ldfe Q1_6 = [table_ptr2], -16
+(p12) ldfe Q1_6 = [table_ptr2], -16
(p11) fma.s1 Poly2 = P1_9, rsq, P1_8
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mmi
-(p0) ldfe Q1_5 = [table_ptr2], -16 ;;
+(p12) ldfe Q1_5 = [table_ptr2], -16 ;;
(p12) ldfe Q1_4 = [table_ptr2], -16
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
(p12) ldfe Q1_3 = [table_ptr2], -16
@@ -2336,735 +2450,795 @@ L(TANL_SMALL_R):
// N odd: poly2 = Q1_6 + Q1_7 * rsq
//
(p11) fma.s1 Poly1 = Poly1, rsq, P1_1
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
(p12) ldfe Q1_2 = [table_ptr2], -16
(p12) fma.s1 poly1 = S_hi, r, f1
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
(p12) ldfe Q1_1 = [table_ptr2], -16
(p11) fma.s1 Poly2 = Poly2, rsq, P1_7
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even: CORR = rsq + 1
// N even: r_to_the_8 = rsq * rsq
//
(p11) fmpy.s1 Poly1 = Poly1, rsq
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fma.s1 S_hi = S_hi, poly1, S_hi
- nop.i 999
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fma.s1 poly2 = Q1_7, rsq, Q1_6
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
(p11) fma.s1 Poly2 = Poly2, rsq, P1_6
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fma.s1 poly1 = S_hi, r, f1
- nop.i 999
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fma.s1 poly2 = poly2, rsq, Q1_5
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
(p11) fma.s1 Poly2= Poly2, rsq, P1_5
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fma.s1 S_hi = S_hi, poly1, S_hi
- nop.i 999
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fma.s1 poly2 = poly2, rsq, Q1_4
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even: r_to_the_8 = r_to_the_8 * r_to_the_8
// N odd: poly1 = S_hi * r + 1.0 64 bits partial
//
(p11) fma.s1 Poly2 = Poly2, rsq, P1_4
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
-// N even: Result = CORR + Poly * r
+// N even: Poly = CORR + Poly * r
// N odd: P = Q1_1 + poly2 * rsq
//
(p12) fma.s1 poly1 = S_hi, r, f1
- nop.i 999
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fma.s1 poly2 = poly2, rsq, Q1_3
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even: Poly2 = P1_4 + Poly2 * rsq
// N odd: poly2 = Q1_2 + poly2 * rsq
//
(p11) fma.s1 Poly = Poly2, r_to_the_8, Poly1
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fma.s1 poly1 = S_hi, c, poly1
- nop.i 999
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fma.s1 poly2 = poly2, rsq, Q1_2
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even: Poly = Poly1 + Poly2 * r_to_the_8
// N odd: S_hi = S_hi * poly1 + S_hi 64 bits
//
-(p11) fma.s1 Result = Poly, r, CORR
- nop.i 999 ;;
+(p11) fma.s1 Poly = Poly, r, CORR
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
-// N even: Result = r + Result (User supplied rounding mode)
+// N even: Result = r + Poly (User supplied rounding mode)
// N odd: poly1 = S_hi * c + poly1
//
(p12) fmpy.s1 S_lo = S_hi, poly1
- nop.i 999
+(p11) tbit.z.unc p14, p15 = cot_flag, 0 // p14=1 for tanl; p15=1 for cotl
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fma.s1 P = poly2, rsq, Q1_1
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N odd: poly1 = S_hi * r + 1.0
//
//
// N odd: S_lo = S_hi * poly1
//
-(p11) fadd.s0 Result = Result, r
- nop.i 999 ;;
+(p14) fadd.s0 Result = Poly, r // for tanl
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
+(p15) fms.s0 Result = Poly, mOne, r // for cotl
+ nop.i 999 ;;
+}
+
+{ .mfi
+ nop.m 999
//
// N odd: S_lo = Q1_1 * c + S_lo
//
(p12) fma.s1 S_lo = Q1_1, c, S_lo
- nop.i 999
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N odd: Result = S_lo + r * P
//
(p12) fma.s1 Result = P, r, S_lo
- nop.i 999 ;;
+(p12) tbit.z.unc p14, p15 = cot_flag, 0 ;; // p14=1 for tanl; p15=1 for cotl
}
-{ .mfb
- nop.m 999
+
//
// N odd: Result = Result + S_hi (user supplied rounding mode)
//
-(p12) fadd.s0 Result = Result, S_hi
-(p0) br.ret.sptk b0 ;;
+{ .mfi
+ nop.m 999
+(p14) fadd.s0 Result = Result, S_hi // for tanl
+ nop.i 999
+}
+{ .mfb
+ nop.m 999
+(p15) fms.s0 Result = Result, mOne, S_hi // for cotl
+ br.ret.sptk b0 ;; // Exit |r| < 1/4 path
}
-L(TANL_NORMAL_R):
-{ .mfi
-(p0) getf.sig sig_r = r
+TANL_NORMAL_R:
+// Here if 1/4 <= |x| < pi/4 or if |x| >= 2^63 and |r| >= 1/4
// *******************************************************************
// *******************************************************************
// *******************************************************************
//
// r and c have been computed.
-// Make sure ftz mode is set - should be automatic when using wre
-//
//
-// Get [i_1] - lsb of N_fix_gr alone.
-//
-(p0) fmerge.s Pos_r = f1, r
-(p0) extr.u i_1 = N_fix_gr, 0, 1 ;;
-}
-{ .mfi
- nop.m 999
-(p0) fmerge.s sgn_r = r, f1
-(p0) cmp.eq.unc p11, p12 = 0x0000, i_1 ;;
-}
-{ .mfi
- nop.m 999
- nop.f 999
-(p0) extr.u lookup = sig_r, 58, 5
-}
-{ .mlx
- nop.m 999
-(p0) movl Create_B = 0x8200000000000000 ;;
-}
{ .mfi
-(p0) addl table_ptr1 = @ltoff(TANL_BASE_CONSTANTS), gp
- nop.f 999
-(p0) dep Create_B = lookup, Create_B, 58, 5
-}
-;;
-
-
-//
-// Get [i_1] - lsb of N_fix_gr alone.
-// Pos_r = abs (r)
-//
-
-
-{ .mmi
-(p0) ld8 table_ptr1 = [table_ptr1]
nop.m 999
+ fand B = B_mask1, r
nop.i 999
}
;;
-
+TANL_NORMAL_R_A:
+// Enter here if pi/4 <= |x| < 2^63 and |r| >= 1/4
+// Get the 5 bits of r for the lookup. 1.xxxxx ....
{ .mmi
- nop.m 999
-(p0) setf.sig B = Create_B
-//
-// Set table_ptr1 and table_ptr2 to base address of
-// constant table.
-//
-(p0) add table_ptr1 = 480, table_ptr1 ;;
-}
-{ .mmb
- nop.m 999
-//
-// Is i_1 or i_0 == 0 ?
-// Create the constant 1 00000 1000000000000000000000...
-//
-(p0) ldfe P2_1 = [table_ptr1], 16
- nop.b 999
+ add table_ptr1 = 416, table_base // Point to tanl_table_p2
+ mov GR_exp_2tom65 = 0xffff - 65 // Scaling constant for B
+ extr.u lookup = sig_r, 58, 5
}
+;;
+
{ .mmi
- nop.m 999 ;;
-(p0) getf.exp exp_r = Pos_r
- nop.i 999
+ ldfe P2_1 = [table_ptr1], 16
+ setf.exp TWO_TO_NEG65 = GR_exp_2tom65 // 2^-65 for scaling B if exp_r=-2
+ add N_fix_gr = N_fix_gr, cot_flag // N = N + 1 (for cotl)
}
-//
-// Get r's exponent
-// Get r's significand
-//
-{ .mmi
-(p0) ldfe P2_2 = [table_ptr1], 16 ;;
-//
-// Get the 5 bits or r for the lookup. 1.xxxxx ....
-// from sig_r.
-// Grab lsb of exp of B
-//
-(p0) ldfe P2_3 = [table_ptr1], 16
- nop.i 999 ;;
+;;
+
+.pred.rel "mutex",p11,p12
+// B = 2^63 * 1.xxxxx 100...0
+{ .mfi
+ ldfe P2_2 = [table_ptr1], 16
+ for B = B_mask2, B
+ mov table_offset = 512 // Assume table offset is 512
}
-{ .mii
- nop.m 999
-(p0) andcm table_offset = 0x0001, exp_r ;;
-(p0) shl table_offset = table_offset, 9 ;;
+;;
+
+{ .mfi
+ ldfe P2_3 = [table_ptr1], 16
+ fmerge.s Pos_r = f1, r
+ tbit.nz p8,p9 = exp_r, 0
}
-{ .mii
- nop.m 999
-//
-// Deposit 0 00000 1000000000000000000000... on
-// 1 xxxxx yyyyyyyyyyyyyyyyyyyyyy...,
-// getting rid of the ys.
+;;
+
// Is B = 2** -2 or B= 2** -1? If 2**-1, then
// we want an offset of 512 for table addressing.
-//
-(p0) shladd table_offset = lookup, 4, table_offset ;;
-//
-// B = ........ 1xxxxx 1000000000000000000...
-//
-(p0) add table_ptr1 = table_ptr1, table_offset ;;
-}
-{ .mmb
- nop.m 999
-//
-// B = ........ 1xxxxx 1000000000000000000...
-// Convert B so it has the same exponent as Pos_r
-//
-(p0) ldfd T_hi = [table_ptr1], 8
- nop.b 999 ;;
+{ .mii
+ add table_ptr2 = 1296, table_base // Point to tanl_table_cm2
+(p9) shladd table_offset = lookup, 4, table_offset
+(p8) shladd table_offset = lookup, 4, r0
}
+;;
+{ .mmi
+ add table_ptr1 = table_ptr1, table_offset // Point to T_hi
+ add table_ptr2 = table_ptr2, table_offset // Point to C_hi
+ add table_ptr3 = 2128, table_base // Point to tanl_table_scim2
+}
+;;
+{ .mmi
+ ldfd T_hi = [table_ptr1], 8 // Load T_hi
+;;
+ ldfd C_hi = [table_ptr2], 8 // Load C_hi
+ add table_ptr3 = table_ptr3, table_offset // Point to SC_inv
+}
+;;
//
// x = |r| - B
-// Load T_hi.
-// Load C_hi.
//
-
-{ .mmf
-(p0) addl table_ptr2 = @ltoff(TANL_BASE_CONSTANTS), gp
-(p0) ldfs T_lo = [table_ptr1]
-(p0) fmerge.se B = Pos_r, B
+// Convert B so it has the same exponent as Pos_r before subtracting
+{ .mfi
+ ldfs T_lo = [table_ptr1] // Load T_lo
+(p9) fnma.s1 x = B, FR_2tom64, Pos_r
+ nop.i 999
}
-;;
-
-
-{ .mmi
-(p0) ld8 table_ptr2 = [table_ptr2]
+{ .mfi
nop.m 999
+(p8) fnma.s1 x = B, TWO_TO_NEG65, Pos_r
nop.i 999
}
;;
-
-{ .mii
-(p0) add table_ptr2 = 1360, table_ptr2
- nop.i 999 ;;
-(p0) add table_ptr2 = table_ptr2, table_offset ;;
+{ .mfi
+ ldfs C_lo = [table_ptr2] // Load C_lo
+ nop.f 999
+ nop.i 999
}
+;;
+
{ .mfi
-(p0) ldfd C_hi = [table_ptr2], 8
-(p0) fsub.s1 x = Pos_r, B
- nop.i 999 ;;
+ ldfe SC_inv = [table_ptr3] // Load SC_inv
+ fmerge.s sgn_r = r, f1
+ tbit.z p11, p12 = N_fix_gr, 0 // p11 if N even, p12 if odd
+
}
-{ .mii
-(p0) ldfs C_lo = [table_ptr2],255
- nop.i 999 ;;
+;;
+
//
// xsq = x * x
// N even: Tx = T_hi * x
-// Load T_lo.
-// Load C_lo - increment pointer to get SC_inv
-// - cant get all the way, do an add later.
-//
-(p0) add table_ptr2 = 569, table_ptr2 ;;
-}
//
// N even: Tx1 = Tx + 1
// N odd: Cx1 = 1 - Cx
//
+
{ .mfi
-(p0) ldfe SC_inv = [table_ptr2], 0
- nop.f 999
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p0) fmpy.s1 xsq = x, x
- nop.i 999
+ nop.m 999
+ fmpy.s1 xsq = x, x
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
(p11) fmpy.s1 Tx = T_hi, x
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p12) fmpy.s1 Cx = C_hi, x
- nop.i 999 ;;
+ nop.i 999
}
-{ .mfi
- nop.m 999
+;;
+
//
// N odd: Cx = C_hi * x
//
-(p0) fma.s1 P = P2_3, xsq, P2_2
- nop.i 999
-}
{ .mfi
- nop.m 999
+ nop.m 999
+(p12) fmpy.s1 Cx = C_hi, x
+ nop.i 999
+}
+;;
//
// N even and odd: P = P2_3 + P2_2 * xsq
//
+{ .mfi
+ nop.m 999
+ fma.s1 P = P2_3, xsq, P2_2
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
(p11) fadd.s1 Tx1 = Tx, f1
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even: D = C_hi - tanx
// N odd: D = T_hi + tanx
//
(p11) fmpy.s1 CORR = SC_inv, T_hi
- nop.i 999
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fmpy.s1 Sx = SC_inv, x
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 Sx = SC_inv, x
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fmpy.s1 CORR = SC_inv, C_hi
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fsub.s1 V_hi = f1, Cx
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-(p0) fma.s1 P = P, xsq, P2_1
- nop.i 999
+ nop.m 999
+ fma.s1 P = P, xsq, P2_1
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even and odd: P = P2_1 + P * xsq
//
(p11) fma.s1 V_hi = Tx, Tx1, f1
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even: Result = sgn_r * tail + T_hi (user rounding mode for C1)
// N odd: Result = sgn_r * tail + C_hi (user rounding mode for C1)
//
-(p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
- nop.i 999 ;;
+ fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-(p0) fmpy.s1 CORR = CORR, c
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 CORR = CORR, c
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fnma.s1 V_hi = Cx,V_hi,f1
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even: V_hi = Tx * Tx1 + 1
// N odd: Cx1 = 1 - Cx * Cx1
//
-(p0) fmpy.s1 P = P, xsq
- nop.i 999
+ fmpy.s1 P = P, xsq
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even and odd: P = P * xsq
//
(p11) fmpy.s1 V_hi = V_hi, T_hi
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even and odd: tail = P * tail + V_lo
//
(p11) fmpy.s1 T_hi = sgn_r, T_hi
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
-(p0) fmpy.s1 CORR = CORR, sgn_r
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 CORR = CORR, sgn_r
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fmpy.s1 V_hi = V_hi,C_hi
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even: V_hi = T_hi * V_hi
// N odd: V_hi = C_hi * V_hi
//
-(p0) fma.s1 tanx = P, x, x
- nop.i 999
+ fma.s1 tanx = P, x, x
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fnmpy.s1 C_hi = sgn_r, C_hi
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even: V_lo = 1 - V_hi + C_hi
// N odd: V_lo = 1 - V_hi + T_hi
//
(p11) fadd.s1 CORR = CORR, T_lo
- nop.i 999
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fsub.s1 CORR = CORR, C_lo
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even and odd: tanx = x + x * P
// N even and odd: Sx = SC_inv * x
//
(p11) fsub.s1 D = C_hi, tanx
- nop.i 999
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fadd.s1 D = T_hi, tanx
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N odd: CORR = SC_inv * C_hi
// N even: CORR = SC_inv * T_hi
//
-(p0) fnma.s1 D = V_hi, D, f1
- nop.i 999 ;;
+ fnma.s1 D = V_hi, D, f1
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even and odd: D = 1 - V_hi * D
// N even and odd: CORR = CORR * c
//
-(p0) fma.s1 V_hi = V_hi, D, V_hi
- nop.i 999 ;;
+ fma.s1 V_hi = V_hi, D, V_hi
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even and odd: V_hi = V_hi + V_hi * D
// N even and odd: CORR = sgn_r * CORR
//
(p11) fnma.s1 V_lo = V_hi, C_hi, f1
- nop.i 999
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fnma.s1 V_lo = V_hi, T_hi, f1
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even: CORR = COOR + T_lo
// N odd: CORR = CORR - C_lo
//
(p11) fma.s1 V_lo = tanx, V_hi, V_lo
- nop.i 999
+ tbit.nz p15, p0 = cot_flag, 0 // p15=1 if we compute cotl
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fnma.s1 V_lo = tanx, V_hi, V_lo
- nop.i 999 ;;
+ nop.i 999 ;;
}
+
{ .mfi
- nop.m 999
+ nop.m 999
+(p15) fms.s1 T_hi = f0, f0, T_hi // to correct result's sign for cotl
+ nop.i 999
+}
+{ .mfi
+ nop.m 999
+(p15) fms.s1 C_hi = f0, f0, C_hi // to correct result's sign for cotl
+ nop.i 999
+};;
+
+{ .mfi
+ nop.m 999
+(p15) fms.s1 sgn_r = f0, f0, sgn_r // to correct result's sign for cotl
+ nop.i 999
+};;
+
+{ .mfi
+ nop.m 999
//
// N even: V_lo = V_lo + V_hi * tanx
// N odd: V_lo = V_lo - V_hi * tanx
//
(p11) fnma.s1 V_lo = C_lo, V_hi, V_lo
- nop.i 999
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
(p12) fnma.s1 V_lo = T_lo, V_hi, V_lo
- nop.i 999 ;;
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even: V_lo = V_lo - V_hi * C_lo
// N odd: V_lo = V_lo - V_hi * T_lo
//
-(p0) fmpy.s1 V_lo = V_hi, V_lo
- nop.i 999 ;;
+ fmpy.s1 V_lo = V_hi, V_lo
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even and odd: V_lo = V_lo * V_hi
//
-(p0) fadd.s1 tail = V_hi, V_lo
- nop.i 999 ;;
+ fadd.s1 tail = V_hi, V_lo
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even and odd: tail = V_hi + V_lo
//
-(p0) fma.s1 tail = tail, P, V_lo
- nop.i 999 ;;
+ fma.s1 tail = tail, P, V_lo
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even: T_hi = sgn_r * T_hi
// N odd : C_hi = -sgn_r * C_hi
//
-(p0) fma.s1 tail = tail, Sx, CORR
- nop.i 999 ;;
+ fma.s1 tail = tail, Sx, CORR
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even and odd: tail = Sx * tail + CORR
//
-(p0) fma.s1 tail = V_hi, Sx, tail
- nop.i 999 ;;
+ fma.s1 tail = V_hi, Sx, tail
+ nop.i 999 ;;
}
{ .mfi
- nop.m 999
+ nop.m 999
//
// N even an odd: tail = Sx * V_hi + tail
//
(p11) fma.s0 Result = sgn_r, tail, T_hi
- nop.i 999
+ nop.i 999
}
{ .mfb
- nop.m 999
+ nop.m 999
(p12) fma.s0 Result = sgn_r, tail, C_hi
-(p0) br.ret.sptk b0 ;;
+ br.ret.sptk b0 ;; // Exit for 1/4 <= |r| < pi/4
}
-L(TANL_SPECIAL):
+TANL_DENORMAL:
+// Here if x denormal
{ .mfb
- nop.m 999
-(p0) fmpy.s0 Arg = Arg, f0
-(p0) br.ret.sptk b0 ;;
+ getf.exp GR_signexp_x = Norm_Arg // Get sign and exponent of x
+ nop.f 999
+ br.cond.sptk TANL_COMMON // Return to common code
}
+;;
+
+
+TANL_SPECIAL:
+TANL_UNSUPPORTED:
//
// Code for NaNs, Unsupporteds, Infs, or +/- zero ?
// Invalid raised for Infs and SNaNs.
//
-.endp tanl
-ASM_SIZE_DIRECTIVE(tanl)
+{ .mfi
+ nop.m 999
+ fmerge.s f10 = f8, f8 // Save input for error call
+ tbit.nz p6, p7 = cot_flag, 0 // p6=1 if we compute cotl
+}
+;;
-// *******************************************************************
-// *******************************************************************
-// *******************************************************************
-//
-// Special Code to handle very large argument case.
-// Call int pi_by_2_reduce(&x,&r,&c)
-// for |arguments| >= 2**63
-// (Arg or x) is in f8
-// Address to save r and c as double
-// *******************************************************************
-// *******************************************************************
-// *******************************************************************
+{ .mfi
+ nop.m 999
+(p6) fclass.m p6, p7 = f8, 0x7 // Test for zero (cotl only)
+ nop.i 999
+}
+;;
+
+.pred.rel "mutex", p6, p7
+{ .mfi
+(p6) mov GR_Parameter_Tag = 225 // (cotl)
+(p6) frcpa.s0 f8, p0 = f1, f8 // cotl(+-0) = +-Inf
+ nop.i 999
+}
+{ .mfb
+ nop.m 999
+(p7) fmpy.s0 f8 = f8, f0
+(p7) br.ret.sptk b0
+}
+;;
+
+GLOBAL_IEEE754_END(tanl)
-.proc __libm_callout
-__libm_callout:
-L(TANL_ARG_TOO_LARGE):
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
+
+// (1)
{ .mfi
- add r50=-32,sp // Parameter: r address
- nop.f 0
+ add GR_Parameter_Y=-32,sp // Parameter 2 value
+ nop.f 0
.save ar.pfs,GR_SAVE_PFS
- mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
- add sp=-64,sp // Create new stack
- nop.f 0
- mov GR_SAVE_GP=gp // Save gp
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
};;
+
+// (2)
{ .mmi
- stfe [r50] = f0,16 // Clear Parameter r on stack
- add r49 = 16,sp // Parameter x address
+ stfe [GR_Parameter_Y] = f1,16 // STORE Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
- mov GR_SAVE_B0=b0 // Save b0
+ mov GR_SAVE_B0=b0 // Save b0
};;
+
.body
+// (3)
{ .mib
- stfe [r50] = f0,-16 // Clear Parameter c on stack
- nop.i 0
- nop.b 0
+ stfe [GR_Parameter_X] = f10 // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
}
{ .mib
- stfe [r49] = Arg // Store Parameter x on stack
- nop.i 0
-(p0) br.call.sptk b0=__libm_pi_by_2_reduce# ;;
+ stfe [GR_Parameter_Y] = f8 // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp
+};;
+
+// (4)
+{ .mmi
+ ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
+
+
+// *******************************************************************
+// *******************************************************************
+// *******************************************************************
//
-// Load 2^-2
+// Special Code to handle very large argument case.
+// Call int __libm_pi_by_2_reduce(x,r,c) for |arguments| >= 2**63
+// The interface is custom:
+// On input:
+// (Arg or x) is in f8
+// On output:
+// r is in f8
+// c is in f9
+// N is in r8
+// We know also that __libm_pi_by_2_reduce preserves f10-15, f71-127. We
+// use this to eliminate save/restore of key fp registers in this calling
+// function.
//
+// *******************************************************************
+// *******************************************************************
+// *******************************************************************
+
+LOCAL_LIBM_ENTRY(__libm_callout)
+TANL_ARG_TOO_LARGE:
+.prologue
+{ .mfi
+ add table_ptr2 = 144, table_base // Point to 2^-2
+ nop.f 999
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+;;
+
+// Load 2^-2, -2^-2
{ .mmi
-(p0) ldfe Arg =[r49],16
+ ldfps TWO_TO_NEG2, NEGTWO_TO_NEG2 = [table_ptr2]
+ setf.sig B_mask1 = bmask1 // Form mask to get 5 msb of r
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+
+.body
//
-// Call argument reduction
+// Call argument reduction with x in f8
+// Returns with N in r8, r in f8, c in f9
+// Assumes f71-127 are preserved across the call
//
-(p0) ldfs TWO_TO_NEG2 = [table_ptr2],4
-// Get Arg off stack
-// Get r off stack - hi order part
-// Get c off stack - lo order part
-(p0) mov N_fix_gr = r8 ;;
-}
-{ .mmb
-(p0) ldfe r =[r50],16
-(p0) ldfs NEGTWO_TO_NEG2 = [table_ptr2],4
- nop.b 999 ;;
+{ .mib
+ setf.sig B_mask2 = bmask2 // Form mask to form B from r
+ mov GR_SAVE_GP=gp // Save gp
+ br.call.sptk b0=__libm_pi_by_2_reduce#
}
+;;
+
+//
+// Is |r| < 2**(-2)
+//
{ .mfi
-(p0) ldfe c =[r50],-32
- nop.f 999
- nop.i 999 ;;
+ getf.sig sig_r = r // Extract significand of r
+ fcmp.lt.s1 p6, p0 = r, TWO_TO_NEG2
+ mov gp = GR_SAVE_GP // Restore gp
}
+;;
+
{ .mfi
-.restore sp
- add sp = 64,sp // Restore stack pointer
+ getf.exp exp_r = r // Extract signexp of r
+ nop.f 999
+ mov b0 = GR_SAVE_B0 // Restore return address
+}
+;;
+
//
-// Is |r| < 2**(-2)
+// Get N_fix_gr
//
-(p0) fcmp.lt.unc.s1 p6, p0 = r, TWO_TO_NEG2
-mov b0 = GR_SAVE_B0 // Restore return address
-};;
{ .mfi
- mov gp = GR_SAVE_GP // Restore gp
-(p6) fcmp.gt.unc.s1 p6, p0 = r, NEGTWO_TO_NEG2
- mov ar.pfs = GR_SAVE_PFS // Restore gp
-};;
+ mov N_fix_gr = r8
+(p6) fcmp.gt.unc.s1 p6, p0 = r, NEGTWO_TO_NEG2
+ mov ar.pfs = GR_SAVE_PFS // Restore pfs
+}
+;;
+
{ .mbb
- nop.m 999
-(p6) br.cond.spnt L(TANL_SMALL_R)
-(p0) br.cond.sptk L(TANL_NORMAL_R) ;;
+ nop.m 999
+(p6) br.cond.spnt TANL_SMALL_R // Branch if |r| < 1/4
+ br.cond.sptk TANL_NORMAL_R // Branch if 1/4 <= |r| < pi/4
}
+;;
-.endp __libm_callout
-ASM_SIZE_DIRECTIVE(__libm_callout)
+LOCAL_LIBM_END(__libm_callout)
.type __libm_pi_by_2_reduce#,@function
.global __libm_pi_by_2_reduce#
diff --git a/sysdeps/ia64/fpu/s_trunc.S b/sysdeps/ia64/fpu/s_trunc.S
index 0be91200e3..b9ad03b5a8 100644
--- a/sysdeps/ia64/fpu/s_trunc.S
+++ b/sysdeps/ia64/fpu/s_trunc.S
@@ -1,11 +1,10 @@
.file "trunc.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 7/7/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
-// Bob Norin, Shane Story, and Ping Tak Peter Tang of the
-// Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -21,33 +20,28 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-.align 32
-.global trunc#
-
-.section .text
-.proc trunc#
-.align 32
-
// History
//==============================================================
-// 7/7/00: Created
+// 07/07/00 Created
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/20/03 Improved performance and reduced code size
//==============================================================
// API
@@ -55,25 +49,28 @@
// double trunc(double x)
//==============================================================
-#include "libm_support.h"
+// general input registers:
+// r14 - r18
-// general input registers:
-TRUNC_GR_FFFF = r14
-TRUNC_GR_signexp = r15
-TRUNC_GR_exponent = r16
-TRUNC_GR_expmask = r17
-TRUNC_GR_bigexp = r18
+rExpBias = r14
+rSignexp = r15
+rExp = r16
+rExpMask = r17
+rBigexp = r18
// floating-point registers:
-// f8, f9, f11, f12
+// f8 - f10
+
+fXtruncInt = f9
+fNormX = f10
-// predicate registers used:
-// p6, p7, p8, p9, p10, p11
+// predicate registers used:
+// p6, p7
// Overview of operation
//==============================================================
// double trunc(double x)
-// Return an integer value (represented as a double) less than or
+// Return an integer value (represented as a double) less than or
// equal to x in magnitude.
// This is x rounded toward zero to an integral value.
//==============================================================
@@ -97,105 +94,73 @@ TRUNC_GR_bigexp = r18
// If we multiply by 2^23, we no longer have a fractional part
// So input is an integer value already.
-trunc:
+.section .text
+GLOBAL_LIBM_ENTRY(trunc)
{ .mfi
- getf.exp TRUNC_GR_signexp = f8
- fcvt.fx.trunc.s1 f9 = f8
- addl TRUNC_GR_bigexp = 0x10033, r0
+ getf.exp rSignexp = f8 // Get sign and exponent of x (redone in TRUNC_UNORM if x is unorm)
+ fcvt.fx.trunc.s1 fXtruncInt = f8 // Truncate x to an integer held in the significand
+ addl rBigexp = 0x10033, r0 // Biased exponent of 2^52: at or above it x is already an integer
}
{ .mfi
- mov TRUNC_GR_FFFF = 0x0FFFF
- fnorm.d f11 = f8
- mov TRUNC_GR_expmask = 0x1FFFF
-};;
-// get the exponent of x
-// convert x to integer in signficand of f9
-// Normalize x - this will raise invalid on SNaNs, the
-// denormal operand flag - and possibly a spurious U flag
-// get exponent only mask (will exclude sign bit)
+ mov rExpBias = 0x0FFFF // Form exponent bias
+ fnorm.s1 fNormX = f8 // Normalize input
+ mov rExpMask = 0x1FFFF // Form exponent mask (drops the sign bit)
+}
+;;
{ .mfi
nop.m 0
- fclass.m p7,p8 = f8, 0x0b
+ fclass.m p7,p0 = f8, 0x0b // Test x unorm
nop.i 0
}
-{ .mfi
- nop.m 0
- fcmp.eq.unc.s1 p9,p0 = f8,f0
- nop.i 0
-};;
-// fclass to set p7 if unnorm
-{ .mmi
- and TRUNC_GR_exponent = TRUNC_GR_signexp, TRUNC_GR_expmask ;;
-(p8) cmp.ge.unc p10,p11 = TRUNC_GR_exponent, TRUNC_GR_bigexp
-(p8) cmp.ne.unc p6,p0 = TRUNC_GR_exponent, TRUNC_GR_signexp
-};;
-// Get the exponent of x
-// Test if exponent such that result already an integer
-// Test if x < 0
-{ .mmi
-(p9) cmp.eq.andcm p10,p11 = r0, r0
-(p6) cmp.lt.unc p6,p0 = TRUNC_GR_exponent, TRUNC_GR_FFFF
- nop.i 0
-};;
-// If -1 < x < 0, set p6, turn off p10 and p11, and set result to -0.0
-{ .mfb
-(p6) cmp.eq.andcm p10,p11 = r0, r0
-(p6) fmerge.s f8 = f8, f0
- nop.b 0
-};;
-// If not a unnorm, set p10 if x already is a big int, nan, or inf?
-// If not a unnorm, set p10 if x already is a big int, nan, or inf?
-.pred.rel "mutex",p10,p11
+;;
+
{ .mfb
nop.m 0
-(p11) fcvt.xf f8 = f9
- nop.b 0
+ fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf
+(p7) br.cond.spnt TRUNC_UNORM // Branch if x unorm
}
+;;
+
+TRUNC_COMMON:
+// Return here from TRUNC_UNORM
{ .mfb
+ and rExp = rSignexp, rExpMask // Get biased exponent
+(p6) fma.d.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf
+(p6) br.ret.spnt b0 // Exit if x natval, nan, inf
+}
+;;
+
+{ .mfi
+ cmp.lt p6,p0 = rExp, rExpBias // Is |x| < 1?
+ fcvt.xf f8 = fXtruncInt // Result, assume 1 <= |x| < 2^52
+ cmp.ge p7,p0 = rExp, rBigexp // Is |x| >= 2^52?
+}
+;;
+
+// We must correct result if |x| < 1, or |x| >= 2^52
+.pred.rel "mutex",p6,p7
+{ .mfi
nop.m 0
-(p10) fma.d.s1 f8 = f11,f1,f0
-(p8) br.ret.sptk b0
-};;
-// If not a unnorm and not an big int, nan,or +/-inf convert signficand
-// back to f8.
-// If not a unorm and a big int, nan, or +/-inf, return fnorm'd x
-// If not a unorm, Return
-// If unnorm, get the exponent again - perhaps it wasn't a denorm.
-{ .mfb
-(p7) getf.exp TRUNC_GR_signexp = f11
-(p7) fcvt.fx.trunc.s1 f12 = f11
- nop.b 0
-};;
-{ .mfb
- and TRUNC_GR_exponent = TRUNC_GR_signexp, TRUNC_GR_expmask
- fcmp.lt.unc.s1 p9,p0 = f8,f0
- nop.b 0
-};;
-{ .mfb
- cmp.ge.unc p10,p11 = TRUNC_GR_exponent, TRUNC_GR_bigexp
- nop.f 0
- nop.b 0
-};;
-// If a unnorm, check to see if value is already a big int.
+(p6) fmerge.s f8 = fNormX, f0 // If |x| < 1, result sgn(x)*0
+ nop.i 0
+}
{ .mfb
- nop.m 0
-(p11) fcvt.xf f8 = f12
- nop.b 0
+ nop.m 0
+(p7) fma.d.s0 f8 = fNormX, f1, f0 // If |x| >= 2^52, result x
+ br.ret.sptk b0 // Exit main path
}
-{ .mfi
- nop.m 0
-(p10) fma.d.s1 f8 = f11,f1,f0
- nop.i 0
-};;
+;;
+
+
+TRUNC_UNORM:
+// Here if x unorm
{ .mfb
- nop.m 0
-(p9) fmerge.ns f8 = f1,f8
- br.ret.sptk b0
-};;
-// If so return it. Otherwise, return (fcvt.xf(fcvt.fx.trunc(x)))
-// Make sure the result is negative if it should be - that is
-// negative(denormal) -> -0.
-.endp trunc
-ASM_SIZE_DIRECTIVE(trunc)
+ getf.exp rSignexp = fNormX // Recompute signexp from the normalized input
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag; p7 is rewritten in TRUNC_COMMON
+ br.cond.sptk TRUNC_COMMON // Return to main path
+}
+;;
+
+GLOBAL_LIBM_END(trunc)
diff --git a/sysdeps/ia64/fpu/s_truncf.S b/sysdeps/ia64/fpu/s_truncf.S
index 0ac4181209..ff40bc7101 100644
--- a/sysdeps/ia64/fpu/s_truncf.S
+++ b/sysdeps/ia64/fpu/s_truncf.S
@@ -1,11 +1,10 @@
.file "truncf.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 7/7/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
-// Bob Norin, Shane Story, and Ping Tak Peter Tang of the
-// Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -21,33 +20,28 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-.align 32
-.global truncf#
-
-.section .text
-.proc truncf#
-.align 32
-
// History
//==============================================================
-// 7/7/00: Created
+// 07/07/00 Created
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/20/03 Improved performance and reduced code size
//==============================================================
// API
@@ -55,25 +49,28 @@
// float truncf(float x)
//==============================================================
-#include "libm_support.h"
+// general input registers:
+// r14 - r18
-// general input registers:
-TRUNC_GR_FFFF = r14
-TRUNC_GR_signexp = r15
-TRUNC_GR_exponent = r16
-TRUNC_GR_expmask = r17
-TRUNC_GR_bigexp = r18
+rExpBias = r14
+rSignexp = r15
+rExp = r16
+rExpMask = r17
+rBigexp = r18
// floating-point registers:
-// f8, f9, f11, f12
+// f8 - f10
+
+fXtruncInt = f9
+fNormX = f10
-// predicate registers used:
-// p6, p7, p8, p9, p10, p11
+// predicate registers used:
+// p6, p7
// Overview of operation
//==============================================================
// float truncf(float x)
-// Return an integer value (represented as a float) less than or
+// Return an integer value (represented as a float) less than or
// equal to x in magnitude.
// This is x rounded toward zero to an integral value.
//==============================================================
@@ -97,105 +94,73 @@ TRUNC_GR_bigexp = r18
// If we multiply by 2^23, we no longer have a fractional part
// So input is an integer value already.
-truncf:
+.section .text
+GLOBAL_LIBM_ENTRY(truncf)
{ .mfi
- getf.exp TRUNC_GR_signexp = f8
- fcvt.fx.trunc.s1 f9 = f8
- addl TRUNC_GR_bigexp = 0x10016, r0
+ getf.exp rSignexp = f8 // Get sign and exponent of x (redone in TRUNC_UNORM if x is unorm)
+ fcvt.fx.trunc.s1 fXtruncInt = f8 // Truncate x to an integer held in the significand
+ addl rBigexp = 0x10016, r0 // Biased exponent of 2^23: at or above it x is already an integer
}
{ .mfi
- mov TRUNC_GR_FFFF = 0x0FFFF
- fnorm.s f11 = f8
- mov TRUNC_GR_expmask = 0x1FFFF
-};;
-// get the exponent of x
-// convert x to integer in signficand of f9
-// Normalize x - this will raise invalid on SNaNs, the
-// denormal operand flag - and possibly a spurious U flag
-// get exponent only mask (will exclude sign bit)
+ mov rExpBias = 0x0FFFF // Form exponent bias
+ fnorm.s1 fNormX = f8 // Normalize input
+ mov rExpMask = 0x1FFFF // Form exponent mask (drops the sign bit)
+}
+;;
{ .mfi
nop.m 0
- fclass.m p7,p8 = f8, 0x0b
+ fclass.m p7,p0 = f8, 0x0b // Test x unorm
nop.i 0
}
-{ .mfi
- nop.m 0
- fcmp.eq.unc.s1 p9,p0 = f8,f0
- nop.i 0
-};;
-// fclass to set p7 if unnorm
-{ .mmi
- and TRUNC_GR_exponent = TRUNC_GR_signexp, TRUNC_GR_expmask ;;
-(p8) cmp.ge.unc p10,p11 = TRUNC_GR_exponent, TRUNC_GR_bigexp
-(p8) cmp.ne.unc p6,p0 = TRUNC_GR_exponent, TRUNC_GR_signexp
-};;
-// Get the exponent of x
-// Test if exponent such that result already an integer
-// Test if x < 0
-{ .mmi
-(p9) cmp.eq.andcm p10,p11 = r0, r0
-(p6) cmp.lt.unc p6,p0 = TRUNC_GR_exponent, TRUNC_GR_FFFF
- nop.i 0
-};;
-// If -1 < x < 0, set p6, turn off p10 and p11, and set result to -0.0
-{ .mfb
-(p6) cmp.eq.andcm p10,p11 = r0, r0
-(p6) fmerge.s f8 = f8, f0
- nop.b 0
-};;
-// If not a unnorm, set p10 if x already is a big int, nan, or inf?
-// If not a unnorm, set p10 if x already is a big int, nan, or inf?
-.pred.rel "mutex",p10,p11
+;;
+
{ .mfb
nop.m 0
-(p11) fcvt.xf f8 = f9
- nop.b 0
+ fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf
+(p7) br.cond.spnt TRUNC_UNORM // Branch if x unorm
}
+;;
+
+TRUNC_COMMON:
+// Return here from TRUNC_UNORM
{ .mfb
+ and rExp = rSignexp, rExpMask // Get biased exponent
+(p6) fma.s.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf
+(p6) br.ret.spnt b0 // Exit if x natval, nan, inf
+}
+;;
+
+{ .mfi
+ cmp.lt p6,p0 = rExp, rExpBias // Is |x| < 1?
+ fcvt.xf f8 = fXtruncInt // Result, assume 1 <= |x| < 2^23
+ cmp.ge p7,p0 = rExp, rBigexp // Is |x| >= 2^23?
+}
+;;
+
+// We must correct result if |x| < 1, or |x| >= 2^23
+.pred.rel "mutex",p6,p7
+{ .mfi
nop.m 0
-(p10) fma.s.s1 f8 = f11,f1,f0
-(p8) br.ret.sptk b0
-};;
-// If not a unnorm and not an big int, nan,or +/-inf convert signficand
-// back to f8.
-// If not a unorm and a big int, nan, or +/-inf, return fnorm'd x
-// If not a unorm, Return
-// If unnorm, get the exponent again - perhaps it wasn't a denorm.
-{ .mfb
-(p7) getf.exp TRUNC_GR_signexp = f11
-(p7) fcvt.fx.trunc.s1 f12 = f11
- nop.b 0
-};;
-{ .mfb
- and TRUNC_GR_exponent = TRUNC_GR_signexp, TRUNC_GR_expmask
- fcmp.lt.unc.s1 p9,p0 = f8,f0
- nop.b 0
-};;
-{ .mfb
- cmp.ge.unc p10,p11 = TRUNC_GR_exponent, TRUNC_GR_bigexp
- nop.f 0
- nop.b 0
-};;
-// If a unnorm, check to see if value is already a big int.
+(p6) fmerge.s f8 = fNormX, f0 // If |x| < 1, result sgn(x)*0
+ nop.i 0
+}
{ .mfb
- nop.m 0
-(p11) fcvt.xf f8 = f12
- nop.b 0
+ nop.m 0
+(p7) fma.s.s0 f8 = fNormX, f1, f0 // If |x| >= 2^23, result x
+ br.ret.sptk b0 // Exit main path
}
-{ .mfi
- nop.m 0
-(p10) fma.s.s1 f8 = f11,f1,f0
- nop.i 0
-};;
+;;
+
+
+TRUNC_UNORM:
+// Here if x unorm
{ .mfb
- nop.m 0
-(p9) fmerge.ns f8 = f1,f8
- br.ret.sptk b0
-};;
-// If so return it. Otherwise, return (fcvt.xf(fcvt.fx.trunc(x)))
-// Make sure the result is negative if it should be - that is
-// negative(denormal) -> -0.
-.endp truncf
-ASM_SIZE_DIRECTIVE(truncf)
+ getf.exp rSignexp = fNormX // Recompute signexp from the normalized input
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag; p7 is rewritten in TRUNC_COMMON
+ br.cond.sptk TRUNC_COMMON // Return to main path
+}
+;;
+
+GLOBAL_LIBM_END(truncf)
diff --git a/sysdeps/ia64/fpu/s_truncl.S b/sysdeps/ia64/fpu/s_truncl.S
index 91bf96ce90..1afa19ba2b 100644
--- a/sysdeps/ia64/fpu/s_truncl.S
+++ b/sysdeps/ia64/fpu/s_truncl.S
@@ -1,11 +1,10 @@
.file "truncl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 7/7/2000 by John Harrison, Cristina Iordache, Ted Kubaska,
-// Bob Norin, Shane Story, and Ping Tak Peter Tang of the
-// Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -21,59 +20,57 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
// Intel Corporation is the author of this code, and requests that all
-// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
-.align 32
-.global truncl#
-
-.section .text
-.proc truncl#
-.align 32
-
// History
//==============================================================
-// 7/7/00: Created
+// 07/07/00 Created
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 01/20/03 Improved performance and reduced code size
//==============================================================
// API
//==============================================================
-// long double truncl(float x)
+// long double truncl(long double x)
//==============================================================
-#include "libm_support.h"
+// general input registers:
+// r14 - r18
-// general input registers:
-TRUNC_GR_FFFF = r14
-TRUNC_GR_signexp = r15
-TRUNC_GR_exponent = r16
-TRUNC_GR_expmask = r17
-TRUNC_GR_bigexp = r18
+rExpBias = r14
+rSignexp = r15
+rExp = r16
+rExpMask = r17
+rBigexp = r18
// floating-point registers:
-// f8, f9, f11, f12
+// f8 - f10
-// predicate registers used:
-// p6, p7, p8, p9, p10, p11
+fXtruncInt = f9
+fNormX = f10
+
+// predicate registers used:
+// p6, p7
// Overview of operation
//==============================================================
// long double truncl(long double x)
-// Return an integer value (represented as a long double) less than or
+// Return an integer value (represented as a long double) less than or
// equal to x in magnitude.
// This is x rounded toward zero to an integral value.
//==============================================================
@@ -97,105 +94,73 @@ TRUNC_GR_bigexp = r18
// If we multiply by 2^23, we no longer have a fractional part
// So input is an integer value already.
-truncl:
+.section .text
+GLOBAL_LIBM_ENTRY(truncl)
{ .mfi
- getf.exp TRUNC_GR_signexp = f8
- fcvt.fx.trunc.s1 f9 = f8
- addl TRUNC_GR_bigexp = 0x1003e, r0
+ getf.exp rSignexp = f8 // Get sign and exponent of x (redone in TRUNC_UNORM if x is unorm)
+ fcvt.fx.trunc.s1 fXtruncInt = f8 // Truncate x to an integer held in the significand
+ addl rBigexp = 0x1003e, r0 // Biased exponent of 2^63: at or above it x is already an integer
}
{ .mfi
- mov TRUNC_GR_FFFF = 0x0FFFF
- fnorm f11 = f8
- mov TRUNC_GR_expmask = 0x1FFFF
-};;
-// get the exponent of x
-// convert x to integer in signficand of f9
-// Normalize x - this will raise invalid on SNaNs, the
-// denormal operand flag - and possibly a spurious U flag
-// get exponent only mask (will exclude sign bit)
+ mov rExpBias = 0x0FFFF // Form exponent bias
+ fnorm.s1 fNormX = f8 // Normalize input
+ mov rExpMask = 0x1FFFF // Form exponent mask (drops the sign bit)
+}
+;;
{ .mfi
nop.m 0
- fclass.m p7,p8 = f8, 0x0b
+ fclass.m p7,p0 = f8, 0x0b // Test x unorm
nop.i 0
}
-{ .mfi
- nop.m 0
- fcmp.eq.unc.s1 p9,p0 = f8,f0
- nop.i 0
-};;
-// fclass to set p7 if unnorm
-{ .mmi
- and TRUNC_GR_exponent = TRUNC_GR_signexp, TRUNC_GR_expmask ;;
-(p8) cmp.ge.unc p10,p11 = TRUNC_GR_exponent, TRUNC_GR_bigexp
-(p8) cmp.ne.unc p6,p0 = TRUNC_GR_exponent, TRUNC_GR_signexp
-};;
-// Get the exponent of x
-// Test if exponent such that result already an integer
-// Test if x < 0
-{ .mmi
-(p9) cmp.eq.andcm p10,p11 = r0, r0
-(p6) cmp.lt.unc p6,p0 = TRUNC_GR_exponent, TRUNC_GR_FFFF
- nop.i 0
-};;
-// If -1 < x < 0, set p6, turn off p10 and p11, and set result to -0.0
-{ .mfb
-(p6) cmp.eq.andcm p10,p11 = r0, r0
-(p6) fmerge.s f8 = f8, f0
- nop.b 0
-};;
-// If not a unnorm, set p10 if x already is a big int, nan, or inf?
-// If not a unnorm, set p10 if x already is a big int, nan, or inf?
-.pred.rel "mutex",p10,p11
+;;
+
{ .mfb
nop.m 0
-(p11) fcvt.xf f8 = f9
- nop.b 0
+ fclass.m p6,p0 = f8, 0x1e3 // Test x natval, nan, inf
+(p7) br.cond.spnt TRUNC_UNORM // Branch if x unorm
}
+;;
+
+TRUNC_COMMON:
+// Return here from TRUNC_UNORM
{ .mfb
+ and rExp = rSignexp, rExpMask // Get biased exponent
+(p6) fma.s0 f8 = f8, f1, f0 // Result if x natval, nan, inf
+(p6) br.ret.spnt b0 // Exit if x natval, nan, inf
+}
+;;
+
+{ .mfi
+ cmp.lt p6,p0 = rExp, rExpBias // Is |x| < 1?
+ fcvt.xf f8 = fXtruncInt // Result, assume 1 <= |x| < 2^63
+ cmp.ge p7,p0 = rExp, rBigexp // Is |x| >= 2^63?
+}
+;;
+
+// We must correct result if |x| < 1, or |x| >= 2^63
+.pred.rel "mutex",p6,p7
+{ .mfi
nop.m 0
-(p10) fma.s1 f8 = f11,f1,f0
-(p8) br.ret.sptk b0
-};;
-// If not a unnorm and not an big int, nan,or +/-inf convert signficand
-// back to f8.
-// If not a unorm and a big int, nan, or +/-inf, return fnorm'd x
-// If not a unorm, Return
-// If unnorm, get the exponent again - perhaps it wasn't a denorm.
-{ .mfb
-(p7) getf.exp TRUNC_GR_signexp = f11
-(p7) fcvt.fx.trunc.s1 f12 = f11
- nop.b 0
-};;
-{ .mfb
- and TRUNC_GR_exponent = TRUNC_GR_signexp, TRUNC_GR_expmask
- fcmp.lt.unc.s1 p9,p0 = f8,f0
- nop.b 0
-};;
-{ .mfb
- cmp.ge.unc p10,p11 = TRUNC_GR_exponent, TRUNC_GR_bigexp
- nop.f 0
- nop.b 0
-};;
-// If a unnorm, check to see if value is already a big int.
+(p6) fmerge.s f8 = fNormX, f0 // If |x| < 1, result sgn(x)*0
+ nop.i 0
+}
{ .mfb
- nop.m 0
-(p11) fcvt.xf f8 = f12
- nop.b 0
+ nop.m 0
+(p7) fma.s0 f8 = fNormX, f1, f0 // If |x| >= 2^63, result x
+ br.ret.sptk b0 // Exit main path
}
-{ .mfi
- nop.m 0
-(p10) fma.s1 f8 = f11,f1,f0
- nop.i 0
-};;
+;;
+
+
+TRUNC_UNORM:
+// Here if x unorm
{ .mfb
- nop.m 0
-(p9) fmerge.ns f8 = f1,f8
- br.ret.sptk b0
-};;
-// If so return it. Otherwise, return (fcvt.xf(fcvt.fx.trunc(x)))
-// Make sure the result is negative if it should be - that is
-// negative(denormal) -> -0.
-.endp truncl
-ASM_SIZE_DIRECTIVE(truncl)
+ getf.exp rSignexp = fNormX // Recompute signexp from the normalized input
+ fcmp.eq.s0 p7,p0 = f8, f0 // Dummy op to set denormal flag; p7 is rewritten in TRUNC_COMMON
+ br.cond.sptk TRUNC_COMMON // Return to main path
+}
+;;
+
+GLOBAL_LIBM_END(truncl)
diff --git a/sysdeps/ia64/fpu/t_exp.c b/sysdeps/ia64/fpu/t_exp.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/t_exp.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_acosh.c b/sysdeps/ia64/fpu/w_acosh.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_acosh.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_acoshf.c b/sysdeps/ia64/fpu/w_acoshf.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_acoshf.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_acoshl.c b/sysdeps/ia64/fpu/w_acoshl.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_acoshl.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_atanh.c b/sysdeps/ia64/fpu/w_atanh.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_atanh.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_atanhf.c b/sysdeps/ia64/fpu/w_atanhf.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_atanhf.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_atanhl.c b/sysdeps/ia64/fpu/w_atanhl.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_atanhl.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_exp10.c b/sysdeps/ia64/fpu/w_exp10.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_exp10.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_exp10f.c b/sysdeps/ia64/fpu/w_exp10f.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_exp10f.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_exp10l.c b/sysdeps/ia64/fpu/w_exp10l.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_exp10l.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_exp2.c b/sysdeps/ia64/fpu/w_exp2.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_exp2.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_exp2f.c b/sysdeps/ia64/fpu/w_exp2f.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_exp2f.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_exp2l.c b/sysdeps/ia64/fpu/w_exp2l.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_exp2l.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_expl.c b/sysdeps/ia64/fpu/w_expl.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_expl.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_lgamma.c b/sysdeps/ia64/fpu/w_lgamma.c
new file mode 100644
index 0000000000..fb799df68d
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_lgamma.c
@@ -0,0 +1,80 @@
+/* file: lgamma.c */
+
+// Copyright (c) 2002 Intel Corporation
+// All rights reserved.
+//
+// Contributed 2002 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//
+
+// History
+//==============================================================
+// 02/04/02: Initial version
+// 02/22/02: Removed lgammaf/gammaf
+//
+/*
+// FUNCTIONS: double lgamma(double x)
+// double gamma(double x)
+// Natural logarithm of GAMMA function
+*/
+
+#include "libm_support.h"
+
+
+extern double __libm_lgamma(double /*x*/, int* /*signgam*/, int /*signgamsz*/);
+
+
+double __ieee754_lgamma(double x) /* Thin wrapper: forwards x to __libm_lgamma and returns its result. */
+{
+#ifdef __POSIX__
+ extern int signgam; /* __POSIX__ build: hand the callee the global signgam */
+#else
+ int signgam; /* otherwise a local scratch; nothing reads it after the call */
+#endif
+ return __libm_lgamma(x, &signgam, sizeof(signgam)); /* pass signgam's address and its size in bytes */
+}
+weak_alias(__ieee754_lgamma, lgamma) /* presumably exports the wrapper under the public name lgamma */
+
+double __ieee754_gamma(double x) /* Same body as __ieee754_lgamma: forwards x to __libm_lgamma. */
+{
+#ifdef __POSIX__
+ extern int signgam; /* __POSIX__ build: hand the callee the global signgam */
+#else
+ int signgam; /* otherwise a local scratch; nothing reads it after the call */
+#endif
+ return __libm_lgamma(x, &signgam, sizeof(signgam)); /* pass signgam's address and its size in bytes */
+}
+weak_alias(__ieee754_gamma, gamma) /* presumably exports the wrapper under the public name gamma */
diff --git a/sysdeps/ia64/fpu/w_lgamma_r.c b/sysdeps/ia64/fpu/w_lgamma_r.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_lgamma_r.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_lgammaf.c b/sysdeps/ia64/fpu/w_lgammaf.c
new file mode 100644
index 0000000000..bda3741f78
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_lgammaf.c
@@ -0,0 +1,80 @@
+/* file: lgammaf.c */
+
+// Copyright (c) 2002 Intel Corporation
+// All rights reserved.
+//
+// Contributed 2002 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//
+
+// History
+//==============================================================
+// 02/04/02: Initial version
+// 02/22/02: Removed lgamma/gamma
+//
+/*
+// FUNCTIONS: float lgammaf(float x)
+// float gammaf(float x)
+// Natural logarithm of GAMMA function
+*/
+
+#include "libm_support.h"
+
+
+extern float __libm_lgammaf(float /*x*/, int* /*signgam*/, int /*signgamsz*/);
+
+
+float __ieee754_lgammaf(float x) /* Thin wrapper: forwards x to __libm_lgammaf and returns its result. */
+{
+#ifdef __POSIX__
+ extern int signgam; /* __POSIX__ build: hand the callee the global signgam */
+#else
+ int signgam; /* otherwise a local scratch; nothing reads it after the call */
+#endif
+ return __libm_lgammaf(x, &signgam, sizeof(signgam)); /* pass signgam's address and its size in bytes */
+}
+weak_alias(__ieee754_lgammaf, lgammaf) /* presumably exports the wrapper under the public name lgammaf */
+
+float __ieee754_gammaf(float x) /* Same body as __ieee754_lgammaf: forwards x to __libm_lgammaf. */
+{
+#ifdef __POSIX__
+ extern int signgam; /* __POSIX__ build: hand the callee the global signgam */
+#else
+ int signgam; /* otherwise a local scratch; nothing reads it after the call */
+#endif
+ return __libm_lgammaf(x, &signgam, sizeof(signgam)); /* pass signgam's address and its size in bytes */
+}
+weak_alias(__ieee754_gammaf, gammaf) /* presumably exports the wrapper under the public name gammaf */
diff --git a/sysdeps/ia64/fpu/w_lgammaf_r.c b/sysdeps/ia64/fpu/w_lgammaf_r.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_lgammaf_r.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_lgammal.c b/sysdeps/ia64/fpu/w_lgammal.c
new file mode 100644
index 0000000000..9f9f356e98
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_lgammal.c
@@ -0,0 +1,79 @@
+/* file: lgammal.c */
+
+// Copyright (c) 2002 Intel Corporation
+// All rights reserved.
+//
+// Contributed 2002 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//
+
+// History
+//==============================================================
+// 08/15/02: Initial version
+//
+/*
+// FUNCTIONS: long double lgammal(long double x)
+// long double gammal(long double x)
+// Natural logarithm of GAMMA function
+*/
+
+#include "libm_support.h"
+/* Long double helper defined outside this file: takes x, a sign slot and its size. */
+/* Declared as returning long double so the wrappers below do not go through double. */
+extern long double __libm_lgammal(long double /*x*/, int* /*signgam*/, int /*signgamsz*/);
+
+
+long double __ieee754_lgammal(long double x) /* natural logarithm of GAMMA(x) */
+{
+#ifndef __POSIX__
+ int signgam; /* private scratch slot; it is never read back in this function */
+#else
+ extern int signgam; /* hand the helper the global signgam object */
+#endif
+ return __libm_lgammal(x, &signgam, (int) sizeof signgam);
+}
+weak_alias(__ieee754_lgammal, lgammal)
+
+long double __ieee754_gammal(long double x) /* same helper call as __ieee754_lgammal */
+{
+#ifndef __POSIX__
+ int signgam; /* private scratch slot; it is never read back in this function */
+#else
+ extern int signgam; /* hand the helper the global signgam object */
+#endif
+ return __libm_lgammal(x, &signgam, (int) sizeof signgam);
+}
+weak_alias(__ieee754_gammal, gammal)
diff --git a/sysdeps/ia64/fpu/w_lgammal_r.c b/sysdeps/ia64/fpu/w_lgammal_r.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_lgammal_r.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_log2.c b/sysdeps/ia64/fpu/w_log2.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_log2.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_log2f.c b/sysdeps/ia64/fpu/w_log2f.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_log2f.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_log2l.c b/sysdeps/ia64/fpu/w_log2l.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_log2l.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_sinh.c b/sysdeps/ia64/fpu/w_sinh.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_sinh.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_sinhf.c b/sysdeps/ia64/fpu/w_sinhf.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_sinhf.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_sinhl.c b/sysdeps/ia64/fpu/w_sinhl.c
new file mode 100644
index 0000000000..41254ae60a
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_sinhl.c
@@ -0,0 +1 @@
+/* Not needed. */
diff --git a/sysdeps/ia64/fpu/w_tgamma.S b/sysdeps/ia64/fpu/w_tgamma.S
new file mode 100644
index 0000000000..7d654d0343
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_tgamma.S
@@ -0,0 +1,1835 @@
+.file "tgamma.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
+// LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,
+// EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code,and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//*********************************************************************
+//
+// History:
+// 10/12/01 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+// 04/04/03 Changed error codes for overflow and negative integers
+// 04/10/03 Changed code for overflow near zero handling
+//
+//*********************************************************************
+//
+//*********************************************************************
+//
+// Function: tgamma(x) computes the principal value of the GAMMA
+// function of x.
+//
+//*********************************************************************
+//
+// Resources Used:
+//
+// Floating-Point Registers: f8-f15
+// f33-f87
+//
+// General Purpose Registers:
+// r8-r11
+// r14-r28
+// r32-r36
+// r37-r40 (Used to pass arguments to error handling routine)
+//
+// Predicate Registers: p6-p15
+//
+//*********************************************************************
+//
+// IEEE Special Conditions:
+//
+// tgamma(+inf) = +inf
+// tgamma(-inf) = QNaN
+// tgamma(+/-0) = +/-inf
+// tgamma(x<0, x - integer) = QNaN
+// tgamma(SNaN) = QNaN
+// tgamma(QNaN) = QNaN
+//
+//*********************************************************************
+//
+// Overview
+//
+// The method consists of three cases.
+//
+// If 2 <= x < OVERFLOW_BOUNDARY use case tgamma_regular;
+// else if 0 < x < 2 use case tgamma_from_0_to_2;
+// else if -(i+1) < x < -i, i = 0...184 use case tgamma_negatives;
+//
+// Case 2 <= x < OVERFLOW_BOUNDARY
+// -------------------------------
+// Here we use algorithm based on the recursive formula
+// GAMMA(x+1) = x*GAMMA(x). For that we subdivide interval
+// [2; OVERFLOW_BOUNDARY] into intervals [16*n; 16*(n+1)] and
+// approximate GAMMA(x) by polynomial of 22nd degree on each
+// [16*n; 16*n+1], recursive formula is used to expand GAMMA(x)
+// to [16*n; 16*(n+1)]. In other words we need to find n, i and r
+// such that x = 16 * n + i + r where n and i are integer numbers
+// and r is fractional part of x. So GAMMA(x) = GAMMA(16*n+i+r) =
+// = (x-1)*(x-2)*...*(x-i)*GAMMA(x-i) =
+// = (x-1)*(x-2)*...*(x-i)*GAMMA(16*n+r) ~
+// ~ (x-1)*(x-2)*...*(x-i)*P22n(r).
+//
+// Step 1: Reduction
+// -----------------
+// N = [x] with truncate
+// r = x - N, note 0 <= r < 1
+//
+// n = N & ~0xF - index of table that contains coefficient of
+// polynomial approximation
+// i = N & 0xF - is used in recursive formula
+//
+//
+// Step 2: Approximation
+// ---------------------
+// We use factorized minimax approximation polynomials
+// P22n(r) = A22*(r^2+C01(n)*r+C00(n))*
+// *(r^2+C11(n)*r+C10(n))*...*(r^2+CA1(n)*r+CA0(n))
+//
+// Step 3: Recursion
+// -----------------
+// In case when i > 0 we need to multiply P22n(r) by product
+// R(i)=(x-1)*(x-2)*...*(x-i). To reduce number of fp-instructions
+// we can calculate R as follow:
+// R(i) = ((x-1)*(x-2))*((x-3)*(x-4))*...*((x-(i-1))*(x-i)) if i is
+// even or R = ((x-1)*(x-2))*((x-3)*(x-4))*...*((x-(i-2))*(x-(i-1)))*
+// *(x-i) if i is odd. In both cases we need to calculate
+// R2(i) = (x^2-3*x+2)*(x^2-7*x+12)*...*(x^2-(4*j-1)*x+2*j*(2*j-1)) =
+// = (x^2-3*x+2)*(x^2-7*x+12)*...*((x^2+x)+2*j*(2*(j-1)+(1-2*x))) =
+// = (RA+2*(0+RB))*(RA+4*(2+RB))*...*(RA+2*j*(2*(j-1)+RB))
+// where j = 1..[i/2], RA = x^2+x, RB = 1-2*x.
+//
+// Step 4: Reconstruction
+// ----------------------
+// Reconstruction is just simple multiplication i.e.
+// GAMMA(x) = P22n(r)*R(i)
+//
+// Case 0 < x < 2
+// --------------
+// To calculate GAMMA(x) on this interval we do the following:
+// if 1 <= x < 1.25 then GAMMA(x) = P15(x-1)
+// if 1.25 <= x < 1.5 then GAMMA(x) = P15(x-x_min) where
+// x_min is the point of local minimum on the [1; 2] interval.
+// if 1.5 <= x < 2.0 then GAMMA(x) = P15(x-1.5)
+// and
+// if 0 < x < 1 then GAMMA(x) = GAMMA(x+1)/x
+//
+// Case -(i+1) < x < -i, i = 0...184
+// ----------------------------------
+// Here we use the fact that GAMMA(-x) = PI/(x*GAMMA(x)*sin(PI*x)) and
+// so we need to calculate GAMMA(x), sin(PI*x)/PI. Calculation of
+// GAMMA(x) is described above.
+//
+// Step 1: Reduction
+// -----------------
+// Note that the period of sin(PI*x) is 2 and the range reduction for
+// sin(PI*x) is similar to the range reduction for GAMMA(x),
+// i.e. r = x - [x], with the exception of cases
+// when r > 0.5 (in such cases r = 1 - (x - [x])).
+//
+// Step 2: Approximation
+// ---------------------
+// To approximate sin(PI*x)/PI = sin(PI*(2*n+r))/PI =
+// = (-1)^n*sin(PI*r)/PI Taylor series is used.
+// sin(PI*r)/PI ~ S21(r).
+//
+// Step 3: Division
+// ----------------
+// To calculate 1/(x*GAMMA(x)*S21(r)) we use the frcpa instruction
+// followed by Newton-Raphson iterations.
+//
+//
+//*********************************************************************
+// Symbolic register names used by tgamma; several names share one register.
+GR_Sig = r8 // significand bits of x (getf.sig from f8)
+GR_TAG = r8
+GR_ad_Data = r9 // address of tgamma_data, later advanced through the table
+GR_SigRqLin = r10
+GR_iSig = r11 // significand bits of FR_iXt
+GR_ExpOf1 = r11
+GR_ExpOf8 = r11 // exponent field 0x10002 (8), source for FR_8
+
+// Integer scratch registers
+GR_Sig2 = r14
+GR_Addr_Mask1 = r15 // 0x780, or 0 when p12 is set
+GR_Sign_Exp = r16 // sign and exponent field of x (getf.exp from f8)
+GR_Tbl_Offs = r17 // byte offset into the coefficient tables
+GR_Addr_Mask2 = r18 // GR_Addr_Mask1 + GR_Addr_Mask1
+GR_ad_Co = r19 // load pointer, stepped by 32 bytes per ldfe
+GR_Bit2 = r19 // GR_Sign_Exp & 4
+GR_ad_Ce = r20 // starts at GR_ad_Co + 16, stepped by 32 bytes per ldfe
+GR_ad_Co7 = r21 // load pointer for C50, C70, C90
+GR_NzOvfBound = r21
+GR_ad_Ce7 = r22 // GR_ad_Co7 + 16: load pointer for C60, C80, CA0
+GR_Tbl_Ind = r23 // GR_Tbl_Offs >> 8, compared against 0xB
+GR_Tbl_16xInd = r24 // halved GR_Tbl_Offs >> 3
+GR_ExpOf025 = r24
+GR_ExpOf05 = r25 // exponent field 0xFFFE (0.5), source for FR_05
+GR_0x30033 = r26 // constant 0x30033, compared with GR_Sign_Exp when x < 0
+GR_10 = r26 // 0x4024000000000000, moved to FR_10 with setf.d
+GR_12 = r27 // 0x4028000000000000, moved to FR_12 with setf.d
+GR_185 = r27 // constant 185
+GR_14 = r28 // 0x402C000000000000, moved to FR_14 with setf.d
+GR_2 = r28
+GR_fpsr = r28
+// Save slots: presumably used around the error-handler call - not visible here
+GR_SAVE_B0 = r33
+GR_SAVE_PFS = r34
+GR_SAVE_GP = r35
+GR_SAVE_SP = r36
+// Arguments for the error handling routine (see "Resources Used" above)
+GR_Parameter_X = r37
+GR_Parameter_Y = r38
+GR_Parameter_RESULT = r39
+GR_Parameter_TAG = r40
+
+
+// Floating-point register names
+FR_X = f10
+FR_Y = f1 // tgamma is single argument function
+FR_RESULT = f8 // f8 is also where the code reads the argument x
+
+FR_AbsX = f9 // 0 - FR_NormX, written only when x < 0
+FR_NormX = f9 // x*1 + 0
+FR_r02 = f11 // x, reduced by 1 under p7
+FR_AbsXp1 = f12
+FR_X2pX = f13 // RA = x^2 + |x|
+FR_1m2X = f14 // RB = 1 - 2|x|
+FR_Rq1 = f14 // overwrites RB with RB*2 + RA, i.e. (x-1)*(x-2)
+FR_Xt = f15 // FR_iXt converted back to floating point: [x]
+
+FR_r = f33 // fractional part r
+FR_OvfBound = f34 // first double of tgamma_data
+FR_Xmin = f35 // second double of tgamma_data
+FR_2 = f36 // 1*1 + 1
+FR_Rcp1 = f36
+FR_Rcp3 = f36
+FR_4 = f37 // 2*2
+FR_5 = f38 // 2*2 + 1
+FR_6 = f39 // 2*2 + 2
+FR_8 = f40 // built from GR_ExpOf8 with setf.exp
+FR_10 = f41 // built from GR_10 with setf.d
+FR_12 = f42 // built from GR_12 with setf.d
+FR_14 = f43 // built from GR_14 with setf.d
+FR_GAMMA = f43
+FR_05 = f44 // built from GR_ExpOf05 with setf.exp
+// Rq2..Rq7: further terms of the recursion product, e.g. (x-3)*(x-4)
+FR_Rq2 = f45
+FR_Rq3 = f46
+FR_Rq4 = f47
+FR_Rq5 = f48
+FR_Rq6 = f49
+FR_Rq7 = f50
+FR_RqLin = f51
+
+FR_InvAn = f52
+// Polynomial coefficients; Cxx, Axx, rsN and rN names overlay the same registers
+FR_C01 = f53
+FR_A15 = f53
+FR_C11 = f54
+FR_A14 = f54
+FR_C21 = f55
+FR_A13 = f55
+FR_C31 = f56
+FR_A12 = f56
+FR_C41 = f57
+FR_A11 = f57
+FR_C51 = f58
+FR_A10 = f58
+FR_C61 = f59
+FR_A9 = f59
+FR_C71 = f60
+FR_A8 = f60
+FR_C81 = f61
+FR_A7 = f61
+FR_C91 = f62
+FR_A6 = f62
+FR_CA1 = f63
+FR_A5 = f63
+FR_C00 = f64
+FR_A4 = f64
+FR_rs2 = f64
+FR_C10 = f65
+FR_A3 = f65
+FR_rs3 = f65
+FR_C20 = f66
+FR_A2 = f66
+FR_rs4 = f66
+FR_C30 = f67
+FR_A1 = f67
+FR_rs7 = f67
+FR_C40 = f68
+FR_A0 = f68
+FR_rs8 = f68
+FR_C50 = f69
+FR_r2 = f69
+FR_C60 = f70
+FR_r3 = f70
+FR_C70 = f71
+FR_r4 = f71
+FR_C80 = f72
+FR_r7 = f72
+FR_C90 = f73
+FR_r8 = f73
+FR_CA0 = f74
+FR_An = f75
+// S3..S21: named after the sin(pi*x)/pi entries of tgamma_data
+FR_S21 = f76
+FR_S19 = f77
+FR_Rcp0 = f77 // frcpa approximation of 1/x on the 0 < x < 2 path
+FR_Rcp2 = f77
+FR_S17 = f78
+FR_S15 = f79
+FR_S13 = f80
+FR_S11 = f81
+FR_S9 = f82
+FR_S7 = f83
+FR_S5 = f84
+FR_S3 = f85
+
+FR_iXt = f86 // x truncated to integer (fcvt.fx.trunc)
+FR_rs = f87 // reduced argument for sin(pi*x), set to 0*0 + r
+
+
+// Data tables
+//==============================================================
+RODATA
+.align 16
+
+LOCAL_OBJECT_START(tgamma_data)
+data8 0x406573FAE561F648 // overflow boundary (171.624376956302739927196)
+data8 0x3FDD8B618D5AF8FE // point of local minimum (0.461632144968362356785)
+// Section 1: per interval 16 entries of 16 bytes (significand, sign+exponent)
+//[2; 3]
+data8 0xEF0E85C9AE40ABE2,0x00004000 // C01
+data8 0xCA2049DDB4096DD8,0x00004000 // C11
+data8 0x99A203B4DC2D1A8C,0x00004000 // C21
+data8 0xBF5D9D9C0C295570,0x00003FFF // C31
+data8 0xE8DD037DEB833BAB,0x00003FFD // C41
+data8 0xB6AE39A2A36AA03A,0x0000BFFE // C51
+data8 0x804960DC2850277B,0x0000C000 // C61
+data8 0xD9F3973841C09F80,0x0000C000 // C71
+data8 0x9C198A676F8A2239,0x0000C001 // C81
+data8 0xC98B7DAE02BE3226,0x0000C001 // C91
+data8 0xE9CAF31AC69301BA,0x0000C001 // CA1
+data8 0xFBBDD58608A0D172,0x00004000 // C00
+data8 0xFDD0316D1E078301,0x00004000 // C10
+data8 0x8630B760468C15E4,0x00004001 // C20
+data8 0x93EDE20E47D9152E,0x00004001 // C30
+data8 0xA86F3A38C77D6B19,0x00004001 // C40
+//[16; 17]
+data8 0xF87F757F365EE813,0x00004000 // C01
+data8 0xECA84FBA92759DA4,0x00004000 // C11
+data8 0xD4E0A55E07A8E913,0x00004000 // C21
+data8 0xB0EB45E94C8A5F7B,0x00004000 // C31
+data8 0x8050D6B4F7C8617D,0x00004000 // C41
+data8 0x8471B111AA691E5A,0x00003FFF // C51
+data8 0xADAF462AF96585C9,0x0000BFFC // C61
+data8 0xD327C7A587A8C32B,0x0000BFFF // C71
+data8 0xDEF5192B4CF5E0F1,0x0000C000 // C81
+data8 0xBADD64BB205AEF02,0x0000C001 // C91
+data8 0x9330A24AA67D6860,0x0000C002 // CA1
+data8 0xF57EEAF36D8C47BE,0x00004000 // C00
+data8 0x807092E12A251B38,0x00004001 // C10
+data8 0x8C458F80DEE7ED1C,0x00004001 // C20
+data8 0x9F30C731DC77F1A6,0x00004001 // C30
+data8 0xBAC4E7E099C3A373,0x00004001 // C40
+//[32; 33]
+data8 0xC3059A415F142DEF,0x00004000 // C01
+data8 0xB9C1DAC24664587A,0x00004000 // C11
+data8 0xA7101D910992FFB2,0x00004000 // C21
+data8 0x8A9522B8E4AA0AB4,0x00004000 // C31
+data8 0xC76A271E4BA95DCC,0x00003FFF // C41
+data8 0xC5D6DE2A38DB7FF2,0x00003FFE // C51
+data8 0xDBA42086997818B2,0x0000BFFC // C61
+data8 0xB8EDDB1424C1C996,0x0000BFFF // C71
+data8 0xBF7372FB45524B5D,0x0000C000 // C81
+data8 0xA03DDE759131580A,0x0000C001 // C91
+data8 0xFDA6FC4022C1FFE3,0x0000C001 // CA1
+data8 0x9759ABF797B2533D,0x00004000 // C00
+data8 0x9FA160C6CF18CEC5,0x00004000 // C10
+data8 0xB0EFF1E3530E0FCD,0x00004000 // C20
+data8 0xCCD60D5C470165D1,0x00004000 // C30
+data8 0xF5E53F6307B0B1C1,0x00004000 // C40
+//[48; 49]
+data8 0xAABE577FBCE37F5E,0x00004000 // C01
+data8 0xA274CAEEB5DF7172,0x00004000 // C11
+data8 0x91B90B6646C1B924,0x00004000 // C21
+data8 0xF06718519CA256D9,0x00003FFF // C31
+data8 0xAA9EE181C0E30263,0x00003FFF // C41
+data8 0xA07BDB5325CB28D2,0x00003FFE // C51
+data8 0x86C8B873204F9219,0x0000BFFD // C61
+data8 0xB0192C5D3E4787D6,0x0000BFFF // C71
+data8 0xB1E0A6263D4C19EF,0x0000C000 // C81
+data8 0x93BA32A118EAC9AE,0x0000C001 // C91
+data8 0xE942A39CD9BEE887,0x0000C001 // CA1
+data8 0xE838B0957B0D3D0D,0x00003FFF // C00
+data8 0xF60E0F00074FCF34,0x00003FFF // C10
+data8 0x89869936AE00C2A5,0x00004000 // C20
+data8 0xA0FE4E8AA611207F,0x00004000 // C30
+data8 0xC3B1229CFF1DDAFE,0x00004000 // C40
+//[64; 65]
+data8 0x9C00DDF75CDC6183,0x00004000 // C01
+data8 0x9446AE9C0F6A833E,0x00004000 // C11
+data8 0x84ABC5083310B774,0x00004000 // C21
+data8 0xD9BA3A0977B1ED83,0x00003FFF // C31
+data8 0x989B18C99411D300,0x00003FFF // C41
+data8 0x886E66402318CE6F,0x00003FFE // C51
+data8 0x99028C2468F18F38,0x0000BFFD // C61
+data8 0xAB72D17DCD40CCE1,0x0000BFFF // C71
+data8 0xA9D9AC9BE42C2EF9,0x0000C000 // C81
+data8 0x8C11D983AA177AD2,0x0000C001 // C91
+data8 0xDC779E981C1F0F06,0x0000C001 // CA1
+data8 0xC1FD4AC85965E8D6,0x00003FFF // C00
+data8 0xCE3D2D909D389EC2,0x00003FFF // C10
+data8 0xE7F79980AD06F5D8,0x00003FFF // C20
+data8 0x88DD9F73C8680B5D,0x00004000 // C30
+data8 0xA7D6CB2CB2D46F9D,0x00004000 // C40
+//[80; 81]
+data8 0x91C7FF4E993430D0,0x00004000 // C01
+data8 0x8A6E7AB83E45A7E9,0x00004000 // C11
+data8 0xF72D6382E427BEA9,0x00003FFF // C21
+data8 0xC9E2E4F9B3B23ED6,0x00003FFF // C31
+data8 0x8BEFEF56AE05D775,0x00003FFF // C41
+data8 0xEE9666AB6A185560,0x00003FFD // C51
+data8 0xA6AFAF5CEFAEE04D,0x0000BFFD // C61
+data8 0xA877EAFEF1F9C880,0x0000BFFF // C71
+data8 0xA45BD433048ECA15,0x0000C000 // C81
+data8 0x86BD1636B774CC2E,0x0000C001 // C91
+data8 0xD3721BE006E10823,0x0000C001 // CA1
+data8 0xA97EFABA91854208,0x00003FFF // C00
+data8 0xB4AF0AEBB3F97737,0x00003FFF // C10
+data8 0xCC38241936851B0B,0x00003FFF // C20
+data8 0xF282A6261006EA84,0x00003FFF // C30
+data8 0x95B8E9DB1BD45BAF,0x00004000 // C40
+//[96; 97]
+data8 0x8A1FA3171B35A106,0x00004000 // C01
+data8 0x830D5B8843890F21,0x00004000 // C11
+data8 0xE98B0F1616677A23,0x00003FFF // C21
+data8 0xBDF8347F5F67D4EC,0x00003FFF // C31
+data8 0x825F15DE34EC055D,0x00003FFF // C41
+data8 0xD4846186B8AAC7BE,0x00003FFD // C51
+data8 0xB161093AB14919B1,0x0000BFFD // C61
+data8 0xA65758EEA4800EF4,0x0000BFFF // C71
+data8 0xA046B67536FA329C,0x0000C000 // C81
+data8 0x82BBEC1BCB9E9068,0x0000C001 // C91
+data8 0xCC9DE2B23BA91B0B,0x0000C001 // CA1
+data8 0x983B16148AF77F94,0x00003FFF // C00
+data8 0xA2A4D8EE90FEE5DD,0x00003FFF // C10
+data8 0xB89446FA37FF481C,0x00003FFF // C20
+data8 0xDC5572648485FB01,0x00003FFF // C30
+data8 0x88CD5D7DB976129A,0x00004000 // C40
+//[112; 113]
+data8 0x8417098FD62AC5E3,0x00004000 // C01
+data8 0xFA7896486B779CBB,0x00003FFF // C11
+data8 0xDEC98B14AF5EEBD1,0x00003FFF // C21
+data8 0xB48E153C6BF0B5A3,0x00003FFF // C31
+data8 0xF597B038BC957582,0x00003FFE // C41
+data8 0xBFC6F0884A415694,0x00003FFD // C51
+data8 0xBA075A1392BDB5E5,0x0000BFFD // C61
+data8 0xA4B79E01B44C7DB4,0x0000BFFF // C71
+data8 0x9D12FA7711BFAB0F,0x0000C000 // C81
+data8 0xFF24C47C8E108AB4,0x0000C000 // C91
+data8 0xC7325EC86562606A,0x0000C001 // CA1
+data8 0x8B47DCD9E1610938,0x00003FFF // C00
+data8 0x9518B111B70F88B8,0x00003FFF // C10
+data8 0xA9CC197206F68682,0x00003FFF // C20
+data8 0xCB98294CC0D7A6A6,0x00003FFF // C30
+data8 0xFE09493EA9165181,0x00003FFF // C40
+//[128; 129]
+data8 0xFE53D03442270D90,0x00003FFF // C01
+data8 0xF0F857BAEC1993E4,0x00003FFF // C11
+data8 0xD5FF6D70DBBC2FD3,0x00003FFF // C21
+data8 0xACDAA5F4988B1074,0x00003FFF // C31
+data8 0xE92E069F8AD75B54,0x00003FFE // C41
+data8 0xAEBB64645BD94234,0x00003FFD // C51
+data8 0xC13746249F39B43C,0x0000BFFD // C61
+data8 0xA36B74F5B6297A1F,0x0000BFFF // C71
+data8 0x9A77860DF180F6E5,0x0000C000 // C81
+data8 0xF9F8457D84410A0C,0x0000C000 // C91
+data8 0xC2BF44C649EB8597,0x0000C001 // CA1
+data8 0x81225E7489BCDC0E,0x00003FFF // C00
+data8 0x8A788A09CE0EED11,0x00003FFF // C10
+data8 0x9E2E6F86D1B1D89C,0x00003FFF // C20
+data8 0xBE6866B21CF6CCB5,0x00003FFF // C30
+data8 0xEE94426EC1486AAE,0x00003FFF // C40
+//[144; 145]
+data8 0xF6113E09732A6497,0x00003FFF // C01
+data8 0xE900D45931B04FC8,0x00003FFF // C11
+data8 0xCE9FD58F745EBA5D,0x00003FFF // C21
+data8 0xA663A9636C864C86,0x00003FFF // C31
+data8 0xDEBF5315896CE629,0x00003FFE // C41
+data8 0xA05FEA415EBD7737,0x00003FFD // C51
+data8 0xC750F112BD9C4031,0x0000BFFD // C61
+data8 0xA2593A35C51C6F6C,0x0000BFFF // C71
+data8 0x9848E1DA7FB40C8C,0x0000C000 // C81
+data8 0xF59FEE87A5759A4B,0x0000C000 // C91
+data8 0xBF00203909E45A1D,0x0000C001 // CA1
+data8 0xF1D8E157200127E5,0x00003FFE // C00
+data8 0x81DD5397CB08D487,0x00003FFF // C10
+data8 0x94C1DC271A8B766F,0x00003FFF // C20
+data8 0xB3AFAF9B5D6EDDCF,0x00003FFF // C30
+data8 0xE1FB4C57CA81BE1E,0x00003FFF // C40
+//[160; 161]
+data8 0xEEFFE5122AC72FFD,0x00003FFF // C01
+data8 0xE22F70BB52AD54B3,0x00003FFF // C11
+data8 0xC84FF021FE993EEA,0x00003FFF // C21
+data8 0xA0DA2208EB5B2752,0x00003FFF // C31
+data8 0xD5CDD2FCF8AD2DF5,0x00003FFE // C41
+data8 0x940BEC6DCD811A59,0x00003FFD // C51
+data8 0xCC954EF4FD4EBB81,0x0000BFFD // C61
+data8 0xA1712E29A8C04554,0x0000BFFF // C71
+data8 0x966B55DFB243521A,0x0000C000 // C81
+data8 0xF1E6A2B9CEDD0C4C,0x0000C000 // C91
+data8 0xBBC87BCC031012DB,0x0000C001 // CA1
+data8 0xE43974E6D2818583,0x00003FFE // C00
+data8 0xF5702A516B64C5B7,0x00003FFE // C10
+data8 0x8CEBCB1B32E19471,0x00003FFF // C20
+data8 0xAAC10F05BB77E0AF,0x00003FFF // C30
+data8 0xD776EFCAB205CC58,0x00003FFF // C40
+//[176; 177]
+data8 0xE8DA614119811E5D,0x00003FFF // C01
+data8 0xDC415E0288B223D8,0x00003FFF // C11
+data8 0xC2D2243E44EC970E,0x00003FFF // C21
+data8 0x9C086664B5307BEA,0x00003FFF // C31
+data8 0xCE03D7A08B461156,0x00003FFE // C41
+data8 0x894BE3BAAAB66ADC,0x00003FFD // C51
+data8 0xD131EDD71A702D4D,0x0000BFFD // C61
+data8 0xA0A907CDDBE10898,0x0000BFFF // C71
+data8 0x94CC3CD9C765C808,0x0000C000 // C81
+data8 0xEEA85F237815FC0D,0x0000C000 // C91
+data8 0xB8FA04B023E43F91,0x0000C001 // CA1
+data8 0xD8B2C7D9FCBD7EF9,0x00003FFE // C00
+data8 0xE9566E93AAE7E38F,0x00003FFE // C10
+data8 0x8646E78AABEF0255,0x00003FFF // C20
+data8 0xA32AEDB62E304345,0x00003FFF // C30
+data8 0xCE83E40280EE7DF0,0x00003FFF // C40
+// Section 2: 7 entries per interval (C50..CA0 and An); the code reaches it
+// by adding 0xC00 to the address of the first C01 entry
+//[2; 3]
+data8 0xC44FB47E90584083,0x00004001 // C50
+data8 0xE863EE77E1C45981,0x00004001 // C60
+data8 0x8AC15BE238B9D70E,0x00004002 // C70
+data8 0xA5D94B6592350EF4,0x00004002 // C80
+data8 0xC379DB3E20A148B3,0x00004002 // C90
+data8 0xDACA49B73974F6C9,0x00004002 // CA0
+data8 0x810E496A1AFEC895,0x00003FE1 // An
+//[16; 17]
+data8 0xE17C0357AAF3F817,0x00004001 // C50
+data8 0x8BA8804750FBFBFE,0x00004002 // C60
+data8 0xB18EAB3CB64BEBEE,0x00004002 // C70
+data8 0xE90AB7015AF1C28F,0x00004002 // C80
+data8 0xA0AB97CE9E259196,0x00004003 // C90
+data8 0xF5E0E0A000C2D720,0x00004003 // CA0
+data8 0xD97F0F87EC791954,0x00004005 // An
+//[32; 33]
+data8 0x980C293F3696040D,0x00004001 // C50
+data8 0xC0DBFFBB948A9A4E,0x00004001 // C60
+data8 0xFAB54625E9A588A2,0x00004001 // C70
+data8 0xA7E08176D6050FBF,0x00004002 // C80
+data8 0xEBAAEC4952270A9F,0x00004002 // C90
+data8 0xB7479CDAD20550FE,0x00004003 // CA0
+data8 0xAACD45931C3FF634,0x00004054 // An
+//[48; 49]
+data8 0xF5180F0000419AD5,0x00004000 // C50
+data8 0x9D507D07BFBB2273,0x00004001 // C60
+data8 0xCEB53F7A13A383E3,0x00004001 // C70
+data8 0x8BAFEF9E0A49128F,0x00004002 // C80
+data8 0xC58EF912D39E228C,0x00004002 // C90
+data8 0x9A88118422BA208E,0x00004003 // CA0
+data8 0xBD6C0E2477EC12CB,0x000040AC // An
+//[64; 65]
+data8 0xD410AC48BF7748DA,0x00004000 // C50
+data8 0x89399B90AFEBD931,0x00004001 // C60
+data8 0xB596DF8F77EB8560,0x00004001 // C70
+data8 0xF6D9445A047FB4A6,0x00004001 // C80
+data8 0xAF52F0DD65221357,0x00004002 // C90
+data8 0x8989B45BFC881989,0x00004003 // CA0
+data8 0xB7FCAE86E6E10D5A,0x0000410B // An
+//[80; 81]
+data8 0xBE759740E3B5AA84,0x00004000 // C50
+data8 0xF8037B1B07D27609,0x00004000 // C60
+data8 0xA4F6F6C7F0977D4F,0x00004001 // C70
+data8 0xE131960233BF02C4,0x00004001 // C80
+data8 0xA06DF43D3922BBE2,0x00004002 // C90
+data8 0xFC266AB27255A360,0x00004002 // CA0
+data8 0xD9F4B012EDAFEF2F,0x0000416F // An
+//[96; 97]
+data8 0xAEFC84CDA8E1EAA6,0x00004000 // C50
+data8 0xE5009110DB5F3C8A,0x00004000 // C60
+data8 0x98F5F48738E7B232,0x00004001 // C70
+data8 0xD17EE64E21FFDC6B,0x00004001 // C80
+data8 0x9596F7A7E36145CC,0x00004002 // C90
+data8 0xEB64DBE50E125CAF,0x00004002 // CA0
+data8 0xA090530D79E32D2E,0x000041D8 // An
+//[112; 113]
+data8 0xA33AEA22A16B2655,0x00004000 // C50
+data8 0xD682B93BD7D7945C,0x00004000 // C60
+data8 0x8FC854C6E6E30CC3,0x00004001 // C70
+data8 0xC5754D828AFFDC7A,0x00004001 // C80
+data8 0x8D41216B397139C2,0x00004002 // C90
+data8 0xDE78D746848116E5,0x00004002 // CA0
+data8 0xB8A297A2DC0630DB,0x00004244 // An
+//[128; 129]
+data8 0x99EB00F11D95E292,0x00004000 // C50
+data8 0xCB005CB911EB779A,0x00004000 // C60
+data8 0x8879AA2FDFF3A37A,0x00004001 // C70
+data8 0xBBDA538AD40CAC2C,0x00004001 // C80
+data8 0x8696D849D311B9DE,0x00004002 // C90
+data8 0xD41E1C041481199F,0x00004002 // CA0
+data8 0xEBA1A43D34EE61EE,0x000042B3 // An
+//[144; 145]
+data8 0x924F822578AA9F3D,0x00004000 // C50
+data8 0xC193FAF9D3B36960,0x00004000 // C60
+data8 0x827AE3A6B68ED0CA,0x00004001 // C70
+data8 0xB3F52A27EED23F0B,0x00004001 // C80
+data8 0x811A079FB3C94D79,0x00004002 // C90
+data8 0xCB94415470B6F8D2,0x00004002 // CA0
+data8 0x80A0260DCB3EC9AC,0x00004326 // An
+//[160; 161]
+data8 0x8BF24091E88B331D,0x00004000 // C50
+data8 0xB9ADE01187E65201,0x00004000 // C60
+data8 0xFAE4508F6E7625FE,0x00004000 // C70
+data8 0xAD516668AD6D7367,0x00004001 // C80
+data8 0xF8F5FF171154F637,0x00004001 // C90
+data8 0xC461321268990C82,0x00004002 // CA0
+data8 0xC3B693F344B0E6FE,0x0000439A // An
+//
+//[176; 177]
+data8 0x868545EB42A258ED,0x00004000 // C50
+data8 0xB2EF04ACE8BA0E6E,0x00004000 // C60
+data8 0xF247D22C22E69230,0x00004000 // C70
+data8 0xA7A1AB93E3981A90,0x00004001 // C80
+data8 0xF10951733E2C697F,0x00004001 // C90
+data8 0xBE3359BFAD128322,0x00004002 // CA0
+data8 0x8000000000000000,0x00003fff // unlabelled in original; presumably An (bit pattern of 1.0)
+// 1/An entries for negative arguments are interleaved with the sine series
+//[160; 161] for negatives
+data8 0xA76DBD55B2E32D71,0x00003C63 // 1/An
+// S21 and S19 are addressed at offsets 0x550 and 0x560 from the start of section 2
+// sin(pi*x)/pi
+data8 0xBCBC4342112F52A2,0x00003FDE // S21
+data8 0xFAFCECB86536F655,0x0000BFE3 // S19
+data8 0x87E4C97F9CF09B92,0x00003FE9 // S17
+data8 0xEA124C68E704C5CB,0x0000BFED // S15
+data8 0x9BA38CFD59C8AA1D,0x00003FF2 // S13
+data8 0x99C0B552303D5B21,0x0000BFF6 // S11
+//
+//[176; 177] for negatives
+data8 0xBA5D5869211696FF,0x00003BEC // 1/An
+//
+// sin(pi*x)/pi
+data8 0xD63402E79A853175,0x00003FF9 // S9
+data8 0xC354723906DB36BA,0x0000BFFC // S7
+data8 0xCFCE5A015E236291,0x00003FFE // S5
+data8 0xD28D3312983E9918,0x0000BFFF // S3
+//
+// P15 coefficient sets A15..A0 (see "Case 0 < x < 2" in the overview above)
+// [1.0;1.25]
+data8 0xA405530B067ECD3C,0x0000BFFC // A15
+data8 0xF5B5413F95E1C282,0x00003FFD // A14
+data8 0xC4DED71C782F76C8,0x0000BFFE // A13
+data8 0xECF7DDDFD27C9223,0x00003FFE // A12
+data8 0xFB73D31793068463,0x0000BFFE // A11
+data8 0xFF173B7E66FD1D61,0x00003FFE // A10
+data8 0xFFA5EF3959089E94,0x0000BFFE // A9
+data8 0xFF8153BD42E71A4F,0x00003FFE // A8
+data8 0xFEF9CAEE2CB5B533,0x0000BFFE // A7
+data8 0xFE3F02E5EDB6811E,0x00003FFE // A6
+data8 0xFB64074CED2658FB,0x0000BFFE // A5
+data8 0xFB52882A095B18A4,0x00003FFE // A4
+data8 0xE8508C7990A0DAC0,0x0000BFFE // A3
+data8 0xFD32C611D8A881D0,0x00003FFE // A2
+data8 0x93C467E37DB0C536,0x0000BFFE // A1
+data8 0x8000000000000000,0x00003FFF // A0
+//
+// [1.25;1.5]
+data8 0xD038092400619677,0x0000BFF7 // A15
+data8 0xEA6DE925E6EB8C8F,0x00003FF3 // A14
+data8 0xC53F83645D4597FC,0x0000BFF7 // A13
+data8 0xE366DB2FB27B7ECD,0x00003FF7 // A12
+data8 0xAC8FD5E11F6EEAD8,0x0000BFF8 // A11
+data8 0xFB14010FB3697785,0x00003FF8 // A10
+data8 0xB6F91CB5C371177B,0x0000BFF9 // A9
+data8 0x85A262C6F8FEEF71,0x00003FFA // A8
+data8 0xC038E6E3261568F9,0x0000BFFA // A7
+data8 0x8F4BDE8883232364,0x00003FFB // A6
+data8 0xBCFBBD5786537E9A,0x0000BFFB // A5
+data8 0xA4C08BAF0A559479,0x00003FFC // A4
+data8 0x85D74FA063E81476,0x0000BFFC // A3
+data8 0xDB629FB9BBDC1C4E,0x00003FFD // A2
+data8 0xF4F8FBC7C0C9D317,0x00003FC6 // A1
+data8 0xE2B6E4153A57746C,0x00003FFE // A0
+//
+// [1.25;1.5] NOTE(review): label repeats the previous one; presumably [1.5;2.0] - confirm
+data8 0x9533F9D3723B448C,0x0000BFF2 // A15
+data8 0xF1F75D3C561CBBAF,0x00003FF5 // A14
+data8 0xBA55A9A1FC883523,0x0000BFF8 // A13
+data8 0xB5D5E9E5104FA995,0x00003FFA // A12
+data8 0xFD84F35B70CD9AE2,0x0000BFFB // A11
+data8 0x87445235F4688CC5,0x00003FFD // A10
+data8 0xE7F236EBFB9F774E,0x0000BFFD // A9
+data8 0xA6605F2721F787CE,0x00003FFE // A8
+data8 0xCF579312AD7EAD72,0x0000BFFE // A7
+data8 0xE96254A2407A5EAC,0x00003FFE // A6
+data8 0xF41312A8572ED346,0x0000BFFE // A5
+data8 0xF9535027C1B1F795,0x00003FFE // A4
+data8 0xE7E82D0C613A8DE4,0x0000BFFE // A3
+data8 0xFD23CD9741B460B8,0x00003FFE // A2
+data8 0x93C30FD9781DBA88,0x0000BFFE // A1
+data8 0xFFFFF1781FDBEE84,0x00003FFE // A0
+LOCAL_OBJECT_END(tgamma_data)
+
+
+//==============================================================
+// Code
+//==============================================================
+
+.section .text
+GLOBAL_LIBM_ENTRY(tgamma)
+{ .mfi
+ getf.exp GR_Sign_Exp = f8
+ fma.s1 FR_1m2X = f8,f1,f8 // 2x
+ addl GR_ad_Data = @ltoff(tgamma_data), gp
+}
+{ .mfi
+ mov GR_ExpOf8 = 0x10002 // 8
+ fcvt.fx.trunc.s1 FR_iXt = f8 // [x]
+ mov GR_ExpOf05 = 0xFFFE // 0.5
+};;
+{ .mfi
+ getf.sig GR_Sig = f8
+ fma.s1 FR_2 = f1,f1,f1 // 2
+ mov GR_Addr_Mask1 = 0x780
+}
+{ .mlx
+ setf.exp FR_8 = GR_ExpOf8
+ movl GR_10 = 0x4024000000000000
+};;
+{ .mfi
+ ld8 GR_ad_Data = [GR_ad_Data]
+ fcmp.lt.s1 p14,p15 = f8,f0
+ tbit.z p12,p13 = GR_Sign_Exp,0x10 // p13 if x >= 2
+}
+{ .mlx
+ and GR_Bit2 = 4,GR_Sign_Exp
+ movl GR_12 = 0x4028000000000000
+};;
+{ .mfi
+ setf.d FR_10 = GR_10
+ fma.s1 FR_r02 = f8,f1,f0
+ extr.u GR_Tbl_Offs = GR_Sig,58,6
+}
+{ .mfi
+(p12) mov GR_Addr_Mask1 = r0
+ fma.s1 FR_NormX = f8,f1,f0
+ cmp.ne p8,p0 = GR_Bit2,r0
+};;
+{ .mfi
+(p8) shladd GR_Tbl_Offs = GR_Tbl_Offs,4,r0
+ fclass.m p10,p0 = f8,0x1E7 // Test x for NaTVal, NaN, +/-0, +/-INF
+ tbit.nz p11,p0 = GR_Sign_Exp,1
+}
+{ .mlx
+ add GR_Addr_Mask2 = GR_Addr_Mask1,GR_Addr_Mask1
+ movl GR_14 = 0x402C000000000000
+};;
+.pred.rel "mutex",p14,p15
+{ .mfi
+ setf.d FR_12 = GR_12
+(p14) fma.s1 FR_1m2X = f1,f1,FR_1m2X // RB=1-2|x|
+ tbit.nz p8,p9 = GR_Sign_Exp,0
+}
+{ .mfi
+ ldfpd FR_OvfBound,FR_Xmin = [GR_ad_Data],16
+(p15) fms.s1 FR_1m2X = f1,f1,FR_1m2X // RB=1-2|x|
+(p11) shladd GR_Tbl_Offs = GR_Tbl_Offs,2,r0
+};;
+.pred.rel "mutex",p9,p8
+{ .mfi
+ setf.d FR_14 = GR_14
+ fma.s1 FR_4 = FR_2,FR_2,f0
+(p8) and GR_Tbl_Offs = GR_Tbl_Offs, GR_Addr_Mask1
+}
+{ .mfi
+ setf.exp FR_05 = GR_ExpOf05
+ fma.s1 FR_6 = FR_2,FR_2,FR_2
+(p9) and GR_Tbl_Offs = GR_Tbl_Offs, GR_Addr_Mask2
+};;
+.pred.rel "mutex",p9,p8
+{ .mfi
+(p8) shladd GR_ad_Co = GR_Tbl_Offs,1,GR_ad_Data
+ fcvt.xf FR_Xt = FR_iXt // [x]
+(p15) tbit.z.unc p11,p0 = GR_Sign_Exp,0x10 // p11 if 0 < x < 2
+}
+{ .mfi
+(p9) add GR_ad_Co = GR_ad_Data,GR_Tbl_Offs
+ fma.s1 FR_5 = FR_2,FR_2,f1
+(p15) cmp.lt.unc p7,p6 = GR_ExpOf05,GR_Sign_Exp // p7 if 0 < x < 1
+};;
+{ .mfi
+ add GR_ad_Ce = 16,GR_ad_Co
+(p11) frcpa.s1 FR_Rcp0,p0 = f1,f8
+ sub GR_Tbl_Offs = GR_ad_Co,GR_ad_Data
+}
+{ .mfb
+ ldfe FR_C01 = [GR_ad_Co],32
+(p7) fms.s1 FR_r02 = FR_r02,f1,f1
+ // jump if x is NaTVal, NaN, +/-0, +/-INF
+(p10) br.cond.spnt tgamma_spec
+};;
+.pred.rel "mutex",p14,p15
+{ .mfi
+ ldfe FR_C11 = [GR_ad_Ce],32
+(p14) fms.s1 FR_X2pX = f8,f8,f8 // RA=x^2+|x|
+ shr GR_Tbl_Ind = GR_Tbl_Offs,8
+}
+{ .mfb
+ ldfe FR_C21 = [GR_ad_Co],32
+(p15) fma.s1 FR_X2pX = f8,f8,f8 // RA=x^2+x
+ // jump if 0 < x < 2
+(p11) br.cond.spnt tgamma_from_0_to_2
+};;
+{ .mfi
+ ldfe FR_C31 = [GR_ad_Ce],32
+ fma.s1 FR_Rq2 = FR_2,f1,FR_1m2X // 2 + B
+ cmp.ltu p7,p0=0xB,GR_Tbl_Ind
+}
+{ .mfb
+ ldfe FR_C41 = [GR_ad_Co],32
+ fma.s1 FR_Rq3 = FR_2,FR_2,FR_1m2X // 4 + B
+ // jump if GR_Tbl_Ind > 11, i.e |x| is more than 192
+(p7) br.cond.spnt tgamma_spec_res
+};;
+{ .mfi
+ ldfe FR_C51 = [GR_ad_Ce],32
+ fma.s1 FR_Rq4 = FR_6,f1,FR_1m2X // 6 + B
+ shr GR_Tbl_Offs = GR_Tbl_Offs,1
+}
+{ .mfi
+ ldfe FR_C61 = [GR_ad_Co],32
+ fma.s1 FR_Rq5 = FR_4,FR_2,FR_1m2X // 8 + B
+ nop.i 0
+};;
+{ .mfi
+ ldfe FR_C71 = [GR_ad_Ce],32
+(p14) fms.s1 FR_r = FR_Xt,f1,f8 // r = |x| - [|x|]
+ shr GR_Tbl_16xInd = GR_Tbl_Offs,3
+}
+{ .mfi
+ ldfe FR_C81 = [GR_ad_Co],32
+(p15) fms.s1 FR_r = f8,f1,FR_Xt // r = x - [x]
+ add GR_ad_Data = 0xC00,GR_ad_Data
+};;
+{ .mfi
+ ldfe FR_C91 = [GR_ad_Ce],32
+ fma.s1 FR_Rq6 = FR_5,FR_2,FR_1m2X // 10 + B
+(p14) mov GR_0x30033 = 0x30033
+}
+{ .mfi
+ ldfe FR_CA1 = [GR_ad_Co],32
+ fma.s1 FR_Rq7 = FR_6,FR_2,FR_1m2X // 12 + B
+ sub GR_Tbl_Offs = GR_Tbl_Offs,GR_Tbl_16xInd
+};;
+{ .mfi
+ ldfe FR_C00 = [GR_ad_Ce],32
+ fma.s1 FR_Rq1 = FR_Rq1,FR_2,FR_X2pX // (x-1)*(x-2)
+(p13) cmp.eq.unc p8,p0 = r0,GR_Tbl_16xInd // index is 0 i.e. arg from [2;16)
+}
+{ .mfi
+ ldfe FR_C10 = [GR_ad_Co],32
+(p14) fms.s1 FR_AbsX = f0,f0,FR_NormX // absolute value of argument
+ add GR_ad_Co7 = GR_ad_Data,GR_Tbl_Offs
+};;
+{ .mfi
+ ldfe FR_C20 = [GR_ad_Ce],32
+ fma.s1 FR_Rq2 = FR_Rq2,FR_4,FR_X2pX // (x-3)*(x-4)
+ add GR_ad_Ce7 = 16,GR_ad_Co7
+}
+{ .mfi
+ ldfe FR_C30 = [GR_ad_Co],32
+ fma.s1 FR_Rq3 = FR_Rq3,FR_6,FR_X2pX // (x-5)*(x-6)
+ nop.i 0
+};;
+{ .mfi
+ ldfe FR_C40 = [GR_ad_Ce],32
+ fma.s1 FR_Rq4 = FR_Rq4,FR_8,FR_X2pX // (x-7)*(x-8)
+(p14) cmp.leu.unc p7,p0 = GR_0x30033,GR_Sign_Exp
+}
+{ .mfb
+ ldfe FR_C50 = [GR_ad_Co7],32
+ fma.s1 FR_Rq5 = FR_Rq5,FR_10,FR_X2pX // (x-9)*(x-10)
+ // jump if x is less or equal to -2^52, i.e. x is big negative integer
+(p7) br.cond.spnt tgamma_singularity
+};;
+{ .mfi
+ ldfe FR_C60 = [GR_ad_Ce7],32
+ fma.s1 FR_C01 = FR_C01,f1,FR_r
+ add GR_ad_Ce = 0x560,GR_ad_Data
+}
+{ .mfi
+ ldfe FR_C70 = [GR_ad_Co7],32
+ fma.s1 FR_rs = f0,f0,FR_r // reduced arg for sin(pi*x)
+ add GR_ad_Co = 0x550,GR_ad_Data
+};;
+{ .mfi
+ ldfe FR_C80 = [GR_ad_Ce7],32
+ fma.s1 FR_C11 = FR_C11,f1,FR_r
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_C90 = [GR_ad_Co7],32
+ fma.s1 FR_C21 = FR_C21,f1,FR_r
+ nop.i 0
+};;
+.pred.rel "mutex",p12,p13
+{ .mfi
+(p13) getf.sig GR_iSig = FR_iXt
+ fcmp.lt.s1 p11,p0 = FR_05,FR_r
+ mov GR_185 = 185
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Rq6 = FR_Rq6,FR_12,FR_X2pX // (x-11)*(x-12)
+ nop.i 0
+};;
+{ .mfi
+ ldfe FR_CA0 = [GR_ad_Ce7],32
+ fma.s1 FR_C31 = FR_C31,f1,FR_r
+(p12) mov GR_iSig = 0
+}
+{ .mfi
+ ldfe FR_An = [GR_ad_Co7],0x80
+ fma.s1 FR_C41 = FR_C41,f1,FR_r
+ nop.i 0
+};;
+{ .mfi
+(p14) getf.sig GR_Sig = FR_r
+ fma.s1 FR_C51 = FR_C51,f1,FR_r
+(p14) sub GR_iSig = r0,GR_iSig
+}
+{ .mfi
+ ldfe FR_S21 = [GR_ad_Co],32
+ fma.s1 FR_C61 = FR_C61,f1,FR_r
+ nop.i 0
+};;
+{ .mfi
+ ldfe FR_S19 = [GR_ad_Ce],32
+ fma.s1 FR_C71 = FR_C71,f1,FR_r
+ and GR_SigRqLin = 0xF,GR_iSig
+}
+{ .mfi
+ ldfe FR_S17 = [GR_ad_Co],32
+ fma.s1 FR_C81 = FR_C81,f1,FR_r
+ mov GR_2 = 2
+};;
+{ .mfi
+(p14) ldfe FR_InvAn = [GR_ad_Co7]
+ fma.s1 FR_C91 = FR_C91,f1,FR_r
+ // if significand of r is 0 then the argument is a negative integer
+(p14) cmp.eq.unc p12,p0 = r0,GR_Sig
+}
+{ .mfb
+(p8) sub GR_SigRqLin = GR_SigRqLin,GR_2 // subtract 2 if 2 <= x < 16
+ fma.s1 FR_CA1 = FR_CA1,f1,FR_r
+ // jump if x is negative integer such that -2^52 < x < -185
+(p12) br.cond.spnt tgamma_singularity
+};;
+{ .mfi
+ setf.sig FR_Xt = GR_SigRqLin
+(p11) fms.s1 FR_rs = f1,f1,FR_r
+(p14) cmp.ltu.unc p7,p0 = GR_185,GR_iSig
+}
+{ .mfb
+ ldfe FR_S15 = [GR_ad_Ce],32
+ fma.s1 FR_Rq7 = FR_Rq7,FR_14,FR_X2pX // (x-13)*(x-14)
+ // jump if x is noninteger such that -2^52 < x < -185
+(p7) br.cond.spnt tgamma_underflow
+};;
+{ .mfi
+ ldfe FR_S13 = [GR_ad_Co],48
+ fma.s1 FR_C01 = FR_C01,FR_r,FR_C00
+ and GR_Sig2 = 0xE,GR_SigRqLin
+}
+{ .mfi
+ ldfe FR_S11 = [GR_ad_Ce],48
+ fma.s1 FR_C11 = FR_C11,FR_r,FR_C10
+ nop.i 0
+};;
+{ .mfi
+ ldfe FR_S9 = [GR_ad_Co],32
+ fma.s1 FR_C21 = FR_C21,FR_r,FR_C20
+ // should we mul by polynomial of recursion?
+ cmp.eq p13,p12 = r0,GR_SigRqLin
+}
+{ .mfi
+ ldfe FR_S7 = [GR_ad_Ce],32
+ fma.s1 FR_C31 = FR_C31,FR_r,FR_C30
+ nop.i 0
+};;
+{ .mfi
+ ldfe FR_S5 = [GR_ad_Co],32
+ fma.s1 FR_C41 = FR_C41,FR_r,FR_C40
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_S3 = [GR_ad_Ce],32
+ fma.s1 FR_C51 = FR_C51,FR_r,FR_C50
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C61 = FR_C61,FR_r,FR_C60
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C71 = FR_C71,FR_r,FR_C70
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C81 = FR_C81,FR_r,FR_C80
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C91 = FR_C91,FR_r,FR_C90
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_CA1 = FR_CA1,FR_r,FR_CA0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C01 = FR_C01,FR_C11,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C21 = FR_C21,FR_C31,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rs2 = FR_rs,FR_rs,f0
+(p12) cmp.lt.unc p7,p0 = 2,GR_Sig2 // should mul by FR_Rq2?
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C41 = FR_C41,FR_C51,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p7) fma.s1 FR_Rq1 = FR_Rq1,FR_Rq2,f0
+(p12) cmp.lt.unc p9,p0 = 6,GR_Sig2 // should mul by FR_Rq4?
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C61 = FR_C61,FR_C71,f0
+(p15) cmp.eq p11,p0 = r0,r0
+}
+{ .mfi
+ nop.m 0
+(p9) fma.s1 FR_Rq3 = FR_Rq3,FR_Rq4,f0
+(p12) cmp.lt.unc p8,p0 = 10,GR_Sig2 // should mul by FR_Rq6?
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C81 = FR_C81,FR_C91,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p8) fma.s1 FR_Rq5 = FR_Rq5,FR_Rq6,f0
+(p14) cmp.ltu p0,p11 = 0x9,GR_Tbl_Ind
+};;
+{ .mfi
+ nop.m 0
+ fcvt.xf FR_RqLin = FR_Xt
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p11) fma.s1 FR_CA1 = FR_CA1,FR_An,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S21 = FR_S21,FR_rs2,FR_S19
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S17 = FR_S17,FR_rs2,FR_S15
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C01 = FR_C01,FR_C21,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rs4 = FR_rs2,FR_rs2,f0
+(p12) cmp.lt.unc p8,p0 = 4,GR_Sig2 // should mul by FR_Rq3?
+};;
+{ .mfi
+ nop.m 0
+(p8) fma.s1 FR_Rq1 = FR_Rq1,FR_Rq3,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S13 = FR_S13,FR_rs2,FR_S11
+(p12) cmp.lt.unc p9,p0 = 12,GR_Sig2 // should mul by FR_Rq7?
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C41 = FR_C41,FR_C61,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p9) fma.s1 FR_Rq5 = FR_Rq5,FR_Rq7,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C81 = FR_C81,FR_CA1,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S9 = FR_S9,FR_rs2,FR_S7
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S5 = FR_S5,FR_rs2,FR_S3
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rs3 = FR_rs2,FR_rs,f0
+(p12) tbit.nz.unc p6,p0 = GR_SigRqLin,0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rs8 = FR_rs4,FR_rs4,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S21 = FR_S21,FR_rs4,FR_S17
+ mov GR_ExpOf1 = 0x2FFFF
+}
+{ .mfi
+ nop.m 0
+(p6) fms.s1 FR_RqLin = FR_AbsX,f1,FR_RqLin
+(p12) cmp.lt.unc p8,p0 = 8,GR_Sig2 // should mul by FR_Rq5?
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C01 = FR_C01,FR_C41,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p8) fma.s1 FR_Rq1 = FR_Rq1,FR_Rq5,f0
+(p14) cmp.gtu.unc p7,p0 = GR_Sign_Exp,GR_ExpOf1
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S13 = FR_S13,FR_rs4,FR_S9
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p7) fma.s1 FR_C81 = FR_C81,FR_AbsX,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_AbsXp1 = f1,f1,FR_AbsX // |x|+1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fcmp.lt.unc.s1 p0,p10 = FR_AbsX,FR_OvfBound // x >= overflow_boundary
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rs7 = FR_rs4,FR_rs3,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S5 = FR_S5,FR_rs3,FR_rs
+ nop.i 0
+};;
+{ .mib
+(p14) cmp.lt p13,p0 = r0,r0 // set p13 to 0 if x < 0
+(p12) cmp.eq.unc p8,p9 = 1,GR_SigRqLin
+(p10) br.cond.spnt tgamma_spec_res
+};;
+{ .mfi
+ getf.sig GR_Sig = FR_iXt
+(p6) fma.s1 FR_Rq1 = FR_Rq1,FR_RqLin,f0
+ // should we mul by polynomial of recursion?
+(p15) cmp.eq.unc p0,p11 = r0,GR_SigRqLin
+}
+{ .mfb
+ nop.m 0
+ fma.s1 FR_GAMMA = FR_C01,FR_C81,f0
+(p11) br.cond.spnt tgamma_positives
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S21 = FR_S21,FR_rs8,FR_S13
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p13) fma.d.s0 f8 = FR_C01,FR_C81,f0
+(p13) br.ret.spnt b0
+};;
+.pred.rel "mutex",p8,p9
+{ .mfi
+ nop.m 0
+(p9) fma.s1 FR_GAMMA = FR_GAMMA,FR_Rq1,f0
+ tbit.z p6,p7 = GR_Sig,0 // p6 if sin<0, p7 if sin>0
+}
+{ .mfi
+ nop.m 0
+(p8) fma.s1 FR_GAMMA = FR_GAMMA,FR_RqLin,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_S21 = FR_S21,FR_rs7,FR_S5
+ nop.i 0
+};;
+.pred.rel "mutex",p6,p7
+{ .mfi
+ nop.m 0
+(p6) fnma.s1 FR_GAMMA = FR_GAMMA,FR_S21,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p7) fma.s1 FR_GAMMA = FR_GAMMA,FR_S21,f0
+ mov GR_Sig2 = 1
+};;
+{ .mfi
+ nop.m 0
+ frcpa.s1 FR_Rcp0,p0 = f1,FR_GAMMA
+ cmp.ltu p13,p0 = GR_Sign_Exp,GR_ExpOf1
+};;
+// NR method: iteration #1
+{ .mfi
+(p13) getf.exp GR_Sign_Exp = FR_AbsX
+ fnma.s1 FR_Rcp1 = FR_Rcp0,FR_GAMMA,f1 // t = 1 - r0*x
+(p13) shl GR_Sig2 = GR_Sig2,63
+};;
+{ .mfi
+(p13) getf.sig GR_Sig = FR_AbsX
+ nop.f 0
+(p13) mov GR_NzOvfBound = 0xFBFF
+};;
+{ .mfi
+(p13) cmp.ltu.unc p8,p0 = GR_Sign_Exp,GR_NzOvfBound // p8 <- overflow
+ nop.f 0
+(p13) cmp.eq.unc p9,p0 = GR_Sign_Exp,GR_NzOvfBound
+};;
+{ .mfb
+ nop.m 0
+(p13) fma.d.s0 FR_X = f1,f1,f8 // set deno & inexact flags
+(p8) br.cond.spnt tgamma_ovf_near_0 //tgamma_neg_overflow
+};;
+{ .mib
+ nop.m 0
+(p9) cmp.eq.unc p8,p0 = GR_Sig,GR_Sig2
+(p8) br.cond.spnt tgamma_ovf_near_0_boundary //tgamma_neg_overflow
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Rcp1 = FR_Rcp0,FR_Rcp1,FR_Rcp0
+ nop.i 0
+};;
+// NR method: iteration #2
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_Rcp2 = FR_Rcp1,FR_GAMMA,f1 // t = 1 - r1*x
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_Rcp2 = FR_Rcp1,FR_Rcp2,FR_Rcp1
+ nop.i 0
+};;
+// NR method: iteration #3
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_Rcp3 = FR_Rcp2,FR_GAMMA,f1 // t = 1 - r2*x
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_Rcp2 = FR_Rcp2,FR_AbsXp1,f0
+(p14) cmp.ltu p10,p11 = 0x9,GR_Tbl_Ind
+};;
+.pred.rel "mutex",p10,p11
+{ .mfi
+ nop.m 0
+(p10) fma.s1 FR_GAMMA = FR_Rcp2,FR_Rcp3,FR_Rcp2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p11) fma.d.s0 f8 = FR_Rcp2,FR_Rcp3,FR_Rcp2
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+(p10) fma.d.s0 f8 = FR_GAMMA,FR_InvAn,f0
+ br.ret.sptk b0
+};;
+
+
+// here if x >= 3
+//--------------------------------------------------------------------
+.align 32
+tgamma_positives:
+.pred.rel "mutex",p8,p9
+{ .mfi
+ nop.m 0
+(p9) fma.d.s0 f8 = FR_GAMMA,FR_Rq1,f0
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p8) fma.d.s0 f8 = FR_GAMMA,FR_RqLin,f0
+ br.ret.sptk b0
+};;
+
+// here if 0 < x < 2
+//--------------------------------------------------------------------
+.align 32
+tgamma_from_0_to_2:
+{ .mfi
+ getf.exp GR_Sign_Exp = FR_r02
+ fms.s1 FR_r = FR_r02,f1,FR_Xmin
+ mov GR_ExpOf025 = 0xFFFD
+}
+{ .mfi
+ add GR_ad_Co = 0x1200,GR_ad_Data
+(p6) fnma.s1 FR_Rcp1 = FR_Rcp0,FR_NormX,f1 // t = 1 - r0*x
+(p6) mov GR_Sig2 = 1
+};;
+{ .mfi
+(p6) getf.sig GR_Sig = FR_NormX
+ nop.f 0
+(p6) shl GR_Sig2 = GR_Sig2,63
+}
+{ .mfi
+ add GR_ad_Ce = 0x1210,GR_ad_Data
+ nop.f 0
+(p6) mov GR_NzOvfBound = 0xFBFF
+};;
+{ .mfi
+ cmp.eq p8,p0 = GR_Sign_Exp,GR_ExpOf05 // r02 >= 1/2
+ nop.f 0
+ cmp.eq p9,p10 = GR_Sign_Exp,GR_ExpOf025 // r02 >= 1/4
+}
+{ .mfi
+(p6) cmp.ltu.unc p11,p0 = GR_Sign_Exp,GR_NzOvfBound // p11 <- overflow
+ nop.f 0
+(p6) cmp.eq.unc p12,p0 = GR_Sign_Exp,GR_NzOvfBound
+};;
+.pred.rel "mutex",p8,p9
+{ .mfi
+(p8) add GR_ad_Co = 0x200,GR_ad_Co
+(p6) fma.d.s0 FR_X = f1,f1,f8 // set deno & inexact flags
+(p9) add GR_ad_Co = 0x100,GR_ad_Co
+}
+{ .mib
+(p8) add GR_ad_Ce = 0x200,GR_ad_Ce
+(p9) add GR_ad_Ce = 0x100,GR_ad_Ce
+(p11) br.cond.spnt tgamma_ovf_near_0 //tgamma_spec_res
+};;
+{ .mfi
+ ldfe FR_A15 = [GR_ad_Co],32
+ nop.f 0
+(p12) cmp.eq.unc p13,p0 = GR_Sig,GR_Sig2
+}
+{ .mfb
+ ldfe FR_A14 = [GR_ad_Ce],32
+ nop.f 0
+(p13) br.cond.spnt tgamma_ovf_near_0_boundary //tgamma_spec_res
+};;
+{ .mfi
+ ldfe FR_A13 = [GR_ad_Co],32
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_A12 = [GR_ad_Ce],32
+ nop.f 0
+ nop.i 0
+};;
+.pred.rel "mutex",p9,p10
+{ .mfi
+ ldfe FR_A11 = [GR_ad_Co],32
+(p10) fma.s1 FR_r2 = FR_r02,FR_r02,f0
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_A10 = [GR_ad_Ce],32
+(p9) fma.s1 FR_r2 = FR_r,FR_r,f0
+ nop.i 0
+};;
+{ .mfi
+ ldfe FR_A9 = [GR_ad_Co],32
+(p6) fma.s1 FR_Rcp1 = FR_Rcp0,FR_Rcp1,FR_Rcp0
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_A8 = [GR_ad_Ce],32
+(p10) fma.s1 FR_r = f0,f0,FR_r02
+ nop.i 0
+};;
+{ .mfi
+ ldfe FR_A7 = [GR_ad_Co],32
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_A6 = [GR_ad_Ce],32
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ ldfe FR_A5 = [GR_ad_Co],32
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_A4 = [GR_ad_Ce],32
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ ldfe FR_A3 = [GR_ad_Co],32
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_A2 = [GR_ad_Ce],32
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ ldfe FR_A1 = [GR_ad_Co],32
+ fma.s1 FR_r4 = FR_r2,FR_r2,f0
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_A0 = [GR_ad_Ce],32
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p6) fnma.s1 FR_Rcp2 = FR_Rcp1,FR_NormX,f1 // t = 1 - r1*x
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A15 = FR_A15,FR_r,FR_A14
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A11 = FR_A11,FR_r,FR_A10
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r8 = FR_r4,FR_r4,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p6) fma.s1 FR_Rcp2 = FR_Rcp1,FR_Rcp2,FR_Rcp1
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A7 = FR_A7,FR_r,FR_A6
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A3 = FR_A3,FR_r,FR_A2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A15 = FR_A15,FR_r,FR_A13
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A11 = FR_A11,FR_r,FR_A9
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p6) fnma.s1 FR_Rcp3 = FR_Rcp2,FR_NormX,f1 // t = 1 - r2*x
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A7 = FR_A7,FR_r,FR_A5
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A3 = FR_A3,FR_r,FR_A1
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A15 = FR_A15,FR_r,FR_A12
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A11 = FR_A11,FR_r,FR_A8
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p6) fma.s1 FR_Rcp3 = FR_Rcp2,FR_Rcp3,FR_Rcp2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A7 = FR_A7,FR_r,FR_A4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A3 = FR_A3,FR_r,FR_A0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A15 = FR_A15,FR_r4,FR_A11
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A7 = FR_A7,FR_r4,FR_A3
+ nop.i 0
+};;
+.pred.rel "mutex",p6,p7
+{ .mfi
+ nop.m 0
+(p6) fma.s1 FR_A15 = FR_A15,FR_r8,FR_A7
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p7) fma.d.s0 f8 = FR_A15,FR_r8,FR_A7
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+(p6) fma.d.s0 f8 = FR_A15,FR_Rcp3,f0
+ br.ret.sptk b0
+};;
+
+// overflow
+//--------------------------------------------------------------------
+.align 32
+tgamma_ovf_near_0_boundary:
+.pred.rel "mutex",p14,p15
+{ .mfi
+ mov GR_fpsr = ar.fpsr
+ nop.f 0
+(p15) mov r8 = 0x7ff
+}
+{ .mfi
+ nop.m 0
+ nop.f 0
+(p14) mov r8 = 0xfff
+};;
+{ .mfi
+ nop.m 0
+ nop.f 0
+ shl r8 = r8,52
+};;
+{ .mfi
+ sub r8 = r8,r0,1
+ nop.f 0
+ extr.u GR_fpsr = GR_fpsr,10,2 // rounding mode
+};;
+.pred.rel "mutex",p14,p15
+{ .mfi
+ // set p8 to 0 in case of overflow and to 1 otherwise
+ // for negative arg:
+ // no overflow if rounding mode either Z or +Inf, i.e.
+ // GR_fpsr > 1
+(p14) cmp.lt p8,p0 = 1,GR_fpsr
+ nop.f 0
+ // for positive arg:
+ // no overflow if rounding mode either Z or -Inf, i.e.
+ // (GR_fpsr & 1) == 0
+(p15) tbit.z p0,p8 = GR_fpsr,0
+};;
+{ .mib
+(p8) setf.d f8 = r8 // set result to 0x7fefffffffffffff without
+ // OVERFLOW flag raising
+ nop.i 0
+(p8) br.ret.sptk b0
+};;
+.align 32
+tgamma_ovf_near_0:
+{ .mfi
+ mov r8 = 0x1FFFE
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ setf.exp f9 = r8
+ fmerge.s FR_X = f8,f8
+ mov GR_TAG = 258 // overflow
+};;
+.pred.rel "mutex",p14,p15
+{ .mfi
+ nop.m 0
+(p15) fma.d.s0 f8 = f9,f9,f0 // Set I,O and +INF result
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p14) fnma.d.s0 f8 = f9,f9,f0 // Set I,O and -INF result
+ br.cond.sptk tgamma_libm_err
+};;
+// overflow or absolute value of x is too big
+//--------------------------------------------------------------------
+.align 32
+tgamma_spec_res:
+{ .mfi
+ mov GR_0x30033 = 0x30033
+(p14) fcmp.eq.unc.s1 p10,p11 = f8,FR_Xt
+(p15) mov r8 = 0x1FFFE
+};;
+{ .mfi
+(p15) setf.exp f9 = r8
+ nop.f 0
+ nop.i 0
+};;
+{ .mfb
+(p11) cmp.ltu.unc p7,p8 = GR_0x30033,GR_Sign_Exp
+ nop.f 0
+(p10) br.cond.spnt tgamma_singularity
+};;
+.pred.rel "mutex",p7,p8
+{ .mbb
+ nop.m 0
+(p7) br.cond.spnt tgamma_singularity
+(p8) br.cond.spnt tgamma_underflow
+};;
+{ .mfi
+ nop.m 0
+ fmerge.s FR_X = f8,f8
+ mov GR_TAG = 258 // overflow
+}
+{ .mfb
+ nop.m 0
+(p15) fma.d.s0 f8 = f9,f9,f0 // Set I,O and +INF result
+ br.cond.sptk tgamma_libm_err
+};;
+
+// x is negative integer or +/-0
+//--------------------------------------------------------------------
+.align 32
+tgamma_singularity:
+{ .mfi
+ nop.m 0
+ fmerge.s FR_X = f8,f8
+ mov GR_TAG = 259 // negative
+}
+{ .mfb
+ nop.m 0
+ frcpa.s0 f8,p0 = f0,f0
+ br.cond.sptk tgamma_libm_err
+};;
+// x is negative noninteger with big absolute value
+//--------------------------------------------------------------------
+.align 32
+tgamma_underflow:
+{ .mmi
+ getf.sig GR_Sig = FR_iXt
+ mov r11 = 0x00001
+ nop.i 0
+};;
+{ .mfi
+ setf.exp f9 = r11
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ nop.f 0
+ tbit.z p6,p7 = GR_Sig,0
+};;
+.pred.rel "mutex",p6,p7
+{ .mfi
+ nop.m 0
+(p6) fms.d.s0 f8 = f9,f9,f9
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p7) fma.d.s0 f8 = f9,f9,f9
+ br.ret.sptk b0
+};;
+
+// x for natval, nan, +/-inf or +/-0
+//--------------------------------------------------------------------
+.align 32
+tgamma_spec:
+{ .mfi
+ nop.m 0
+ fclass.m p6,p0 = f8,0x1E1 // Test x for natval, nan, +inf
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fclass.m p7,p8 = f8,0x7 // +/-0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fmerge.s FR_X = f8,f8
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p6) fma.d.s0 f8 = f8,f1,f8
+(p6) br.ret.spnt b0
+};;
+.pred.rel "mutex",p7,p8
+{ .mfi
+(p7) mov GR_TAG = 259 // negative
+(p7) frcpa.s0 f8,p0 = f1,f8
+ nop.i 0
+}
+{ .mib
+ nop.m 0
+ nop.i 0
+(p8) br.cond.spnt tgamma_singularity
+};;
+
+.align 32
+tgamma_libm_err:
+{ .mfi
+ alloc r32 = ar.pfs,1,4,4,0
+ nop.f 0
+ mov GR_Parameter_TAG = GR_TAG
+};;
+
+GLOBAL_LIBM_END(tgamma)
+LOCAL_LIBM_ENTRY(__libm_error_region) // Spill X, Y, RESULT to a 64-byte frame, call __libm_error_support, reload f8
+.prologue
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address: old sp-32 (= new sp+32)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new 64-byte stack frame
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+{ .mmi
+ stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 at sp+32, then advance pointer to sp+48
+ add GR_Parameter_X = 16,sp // Parameter 1 address (sp+16)
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+.body
+{ .mib
+ stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address (sp+48)
+ nop.b 0
+}
+{ .mib
+ stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Point back at Parameter 2 (sp+32)
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp // Recompute result address (sp+48) after the call
+};;
+{ .mmi
+ ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer (drop the 64-byte frame)
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+.type __libm_error_support#,@function
+.global __libm_error_support#
+
diff --git a/sysdeps/ia64/fpu/w_tgammaf.S b/sysdeps/ia64/fpu/w_tgammaf.S
new file mode 100644
index 0000000000..4363ca27b8
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_tgammaf.S
@@ -0,0 +1,1328 @@
+.file "tgammaf.s"
+
+
+// Copyright (c) 2001 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2001 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
+// LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,
+// EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code,and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//*********************************************************************
+//
+// History:
+// 11/30/01 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+// 04/04/03 Changed error codes for overflow and negative integers
+// 04/10/03 Changed code for overflow near zero handling
+//
+//*********************************************************************
+//
+//*********************************************************************
+//
+// Function: tgammaf(x) computes the principal value of the GAMMA
+// function of x.
+//
+//*********************************************************************
+//
+// Resources Used:
+//
+// Floating-Point Registers: f8-f15
+// f33-f75
+//
+// General Purpose Registers:
+// r8-r11
+// r14-r29
+// r32-r36
+// r37-r40 (Used to pass arguments to error handling routine)
+//
+// Predicate Registers: p6-p15
+//
+//*********************************************************************
+//
+// IEEE Special Conditions:
+//
+// tgammaf(+inf) = +inf
+// tgammaf(-inf) = QNaN
+// tgammaf(+/-0) = +/-inf
+// tgammaf(x<0, x - integer) = QNaN
+// tgammaf(SNaN) = QNaN
+// tgammaf(QNaN) = QNaN
+//
+//*********************************************************************
+//
+// Overview
+//
+// The method consists of three cases.
+//
+// If 2 <= x < OVERFLOW_BOUNDARY use case tgamma_regular;
+// else if 0 < x < 2 use case tgamma_from_0_to_2;
+// else if -(i+1) < x < -i, i = 0...43 use case tgamma_negatives;
+//
+// Case 2 <= x < OVERFLOW_BOUNDARY
+// -------------------------------
+// Here we use an algorithm based on the recursive formula
+// GAMMA(x+1) = x*GAMMA(x). For that we subdivide the interval
+// [2; OVERFLOW_BOUNDARY] into intervals [8*n; 8*(n+1)] and
+// approximate GAMMA(x) by a polynomial of 12th degree on each
+// [8*n; 8*n+1]; the recursive formula is used to extend GAMMA(x)
+// to [8*n; 8*(n+1)]. In other words we need to find n, i and r
+// such that x = 8 * n + i + r where n and i are integer numbers
+// and r is fractional part of x. So GAMMA(x) = GAMMA(8*n+i+r) =
+// = (x-1)*(x-2)*...*(x-i)*GAMMA(x-i) =
+// = (x-1)*(x-2)*...*(x-i)*GAMMA(8*n+r) ~
+// ~ (x-1)*(x-2)*...*(x-i)*P12n(r).
+//
+// Step 1: Reduction
+// -----------------
+// N = [x] with truncate
+// r = x - N, note 0 <= r < 1
+//
+// n = N & ~0x7 - index of table that contains coefficients of
+// polynomial approximation
+// i = N & 0x7 - is used in recursive formula
+//
+//
+// Step 2: Approximation
+// ---------------------
+// We use factorized minimax approximation polynomials
+// P12n(r) = A12*(r^2+C01(n)*r+C00(n))*
+// *(r^2+C11(n)*r+C10(n))*...*(r^2+C51(n)*r+C50(n))
+//
+// Step 3: Recursion
+// -----------------
+// In case when i > 0 we need to multiply P12n(r) by product
+// R(i,x)=(x-1)*(x-2)*...*(x-i). To reduce number of fp-instructions
+// we can calculate R as follows:
+// R(i,x) = ((x-1)*(x-2))*((x-3)*(x-4))*...*((x-(i-1))*(x-i)) if i is
+// even or R = ((x-1)*(x-2))*((x-3)*(x-4))*...*((x-(i-2))*(x-(i-1)))*
+// *(x-i) if i is odd. In both cases we need to calculate
+// R2(i,x) = (x^2-3*x+2)*(x^2-7*x+12)*...*(x^2-(4*j-1)*x+2*j*(2*j-1)) =
+// = ((x^2-x)+2*(1-x))*((x^2-x)+6*(2-x))*...*((x^2-x)+2*(2*j-1)*(j-x)) =
+// = (RA+2*RB)*(RA+6*(1+RB))*...*(RA+2*(2*j-1)*(j-1+RB))
+// where j = 1..[i/2], RA = x^2-x, RB = 1-x.
+//
+// Step 4: Reconstruction
+// ----------------------
+// Reconstruction is just simple multiplication i.e.
+// GAMMA(x) = P12n(r)*R(i,x)
+//
+// Case 0 < x < 2
+// --------------
+// To calculate GAMMA(x) on this interval we do the following:
+// if 1.0 <= x < 1.25 then GAMMA(x) = P7(x-1)
+// if 1.25 <= x < 1.5 then GAMMA(x) = P7(x-x_min) where
+// x_min is point of local minimum on [1; 2] interval.
+// if 1.5 <= x < 1.75 then GAMMA(x) = P7(x-1.5)
+// if 1.75 <= x < 2.0 then GAMMA(x) = P7(x-1.5)
+// and
+// if 0 < x < 1 then GAMMA(x) = GAMMA(x+1)/x
+//
+// Case -(i+1) < x < -i, i = 0...43
+// ----------------------------------
+// Here we use the fact that GAMMA(-x) = -PI/(x*GAMMA(x)*sin(PI*x)) and
+// so we need to calculate GAMMA(x), sin(PI*x)/PI. Calculation of
+// GAMMA(x) is described above.
+//
+// Step 1: Reduction
+// -----------------
+// Note that the period of sin(PI*x) is 2 and the range reduction for
+// sin(PI*x) is similar to the range reduction for GAMMA(x),
+// i.e. rs = x - round(x) and |rs| <= 0.5.
+//
+// Step 2: Approximation
+// ---------------------
+// To approximate sin(PI*x)/PI = sin(PI*(n+rs))/PI =
+// = (-1)^n*sin(PI*rs)/PI, where n = round(x), Taylor series is used.
+// sin(PI*rs)/PI ~ S17(rs).
+//
+// Step 3: Division
+// ----------------
+// To calculate 1/x and 1/(GAMMA(x)*S12(rs)) we use the frcpa
+// instruction followed by Newton-Raphson iterations.
+//
+//
+//*********************************************************************
+// Register aliases for tgammaf; several names deliberately share one physical register
+GR_ad_Data = r8 // address of tgammaf_data (loaded through @ltoff at entry)
+GR_TAG = r8 // same register as GR_ad_Data
+GR_SignExp = r9 // getf.exp of the argument f8
+GR_Sig = r10
+GR_ArgNz = r10 // same register as GR_Sig
+GR_RqDeg = r11
+
+GR_NanBound = r14
+GR_ExpOf025 = r15
+GR_ExpOf05 = r16 // set to 0xFFFE at entry, source of FR_05
+GR_ad_Co = r17
+GR_ad_Ce = r18 // coefficient pointer: table base + 0x50 + 16*GR_Offs
+GR_TblOffs = r19
+GR_Arg = r20 // getf.d of the argument f8
+GR_Exp2Ind = r21 // GR_SignExp & 7
+GR_TblOffsMask = r21 // same register as GR_Exp2Ind
+GR_Offs = r22 // coefficient table offset in 16-byte units
+GR_OvfNzBound = r23
+GR_ZeroResBound = r24
+GR_ad_SinO = r25
+GR_ad_SinE = r26
+GR_Correction = r27
+GR_Tbl12Offs = r28
+GR_NzBound = r28 // same register as GR_Tbl12Offs
+GR_ExpOf1 = r29 // set to 0xFFFF at entry
+GR_fpsr = r29 // same register as GR_ExpOf1
+
+GR_SAVE_B0 = r33
+GR_SAVE_PFS = r34
+GR_SAVE_GP = r35
+GR_SAVE_SP = r36
+
+GR_Parameter_X = r37 // r37-r40 pass arguments to the error handling routine
+GR_Parameter_Y = r38
+GR_Parameter_RESULT = r39
+GR_Parameter_TAG = r40
+
+
+FR_X = f10
+FR_Y = f1
+FR_RESULT = f8 // f8 also carries the input argument
+
+FR_iXt = f11 // [x] as integer (fcvt.fx.trunc of f8)
+FR_Xt = f12 // [x] converted back with fcvt.xf
+FR_r = f13
+FR_r2 = f14
+FR_r4 = f15
+
+FR_C01 = f33
+FR_A7 = f33 // same register as FR_C01
+FR_C11 = f34
+FR_A6 = f34 // same register as FR_C11
+FR_C21 = f35
+FR_A5 = f35 // same register as FR_C21
+FR_C31 = f36
+FR_A4 = f36 // same register as FR_C31
+FR_C41 = f37
+FR_A3 = f37 // same register as FR_C41
+FR_C51 = f38
+FR_A2 = f38 // same register as FR_C51
+
+FR_C00 = f39
+FR_A1 = f39 // same register as FR_C00
+FR_C10 = f40
+FR_A0 = f40 // same register as FR_C10
+FR_C20 = f41
+FR_C30 = f42
+FR_C40 = f43
+FR_C50 = f44
+FR_An = f45
+FR_OvfBound = f46
+FR_InvAn = f47
+
+FR_Multplr = f48
+FR_NormX = f49 // f8*1+0 computed with fma.s1 at entry
+FR_X2mX = f50 // x^2-|x|
+FR_1mX = f51 // 1 - |x|
+FR_Rq0 = f51 // same register as FR_1mX
+FR_Rq1 = f52
+FR_Rq2 = f53
+FR_Rq3 = f54
+
+FR_Rcp0 = f55
+FR_Rcp1 = f56
+FR_Rcp2 = f57
+
+FR_InvNormX1 = f58
+FR_InvNormX2 = f59
+
+FR_rs = f60 // first receives round(x) from fcvt.fx.s1
+FR_rs2 = f61
+
+FR_LocalMin = f62 // first entry of tgammaf_data
+FR_10 = f63 // second entry of tgammaf_data (10.0)
+
+FR_05 = f64 // built with setf.exp from GR_ExpOf05
+
+FR_S32 = f65
+FR_S31 = f66
+FR_S01 = f67
+FR_S11 = f68
+FR_S21 = f69
+FR_S00 = f70
+FR_S10 = f71
+FR_S20 = f72
+
+FR_GAMMA = f73
+FR_2 = f74 // 1*1+1
+FR_6 = f75
+
+
+
+
+// Data tables for tgammaf: scalar constants, per-interval coefficient tables, sin table, 1/An
+//==============================================================
+RODATA
+.align 16
+LOCAL_OBJECT_START(tgammaf_data)
+data8 0x3FDD8B618D5AF8FE // local minimum (0.461632144968362356785), loaded into FR_LocalMin
+data8 0x4024000000000000 // 10.0, loaded into FR_10
+data8 0x3E90FC992FF39E13 // S32
+data8 0xBEC144B2760626E2 // S31
+// Each interval table below is 14 doubles (0x70 bytes); the code indexes them with GR_Offs*16
+//[2; 8) GR_Offs = 0
+data8 0x4009EFD1BA0CB3B4 // C01
+data8 0x3FFFB35378FF4822 // C11
+data8 0xC01032270413B896 // C41
+data8 0xC01F171A4C0D6827 // C51
+data8 0x40148F8E197396AC // C20
+data8 0x401C601959F1249C // C30
+data8 0x3EE21AD881741977 // An
+data8 0x4041852200000000 // overflow boundary (35.04010009765625)
+data8 0x3FD9CE68F695B198 // C21
+data8 0xBFF8C30AC900DA03 // C31
+data8 0x400E17D2F0535C02 // C00
+data8 0x4010689240F7FAC8 // C10
+data8 0x402563147DDCCF8D // C40
+data8 0x4033406D0480A21C // C50
+//
+//[8; 16) GR_Offs = 0x7
+data8 0x4006222BAE0B793B // C01
+data8 0x4002452733473EDA // C11
+data8 0xC0010EF3326FDDB3 // C41
+data8 0xC01492B817F99C0F // C51
+data8 0x40099C905A249B75 // C20
+data8 0x4012B972AE0E533D // C30
+data8 0x3FE6F6DB91D0D4CC // An
+data8 0x4041852200000000 // overflow boundary
+data8 0x3FF545828F7B73C5 // C21
+data8 0xBFBBD210578764DF // C31
+data8 0x4000542098F53CFC // C00
+data8 0x40032C1309AD6C81 // C10
+data8 0x401D7331E19BD2E1 // C40
+data8 0x402A06807295EF57 // C50
+//
+//[16; 24) GR_Offs = 0xE
+data8 0x4000131002867596 // C01
+data8 0x3FFAA362D5D1B6F2 // C11
+data8 0xBFFCB6985697DB6D // C41
+data8 0xC0115BEE3BFC3B3B // C51
+data8 0x3FFE62FF83456F73 // C20
+data8 0x4007E33478A114C4 // C30
+data8 0x41E9B2B73795ED57 // An
+data8 0x4041852200000000 // overflow boundary
+data8 0x3FEEB1F345BC2769 // C21
+data8 0xBFC3BBE6E7F3316F // C31
+data8 0x3FF14E07DA5E9983 // C00
+data8 0x3FF53B76BF81E2C0 // C10
+data8 0x4014051E0269A3DC // C40
+data8 0x40229D4227468EDB // C50
+//
+//[24; 32) GR_Offs = 0xE + 0x7
+data8 0x3FFAF7BD498384DE // C01
+data8 0x3FF62AD8B4D1C3D2 // C11
+data8 0xBFFABCADCD004C32 // C41
+data8 0xC00FADE97C097EC9 // C51
+data8 0x3FF6DA9ED737707E // C20
+data8 0x4002A29E9E0C782C // C30
+data8 0x44329D5B5167C6C3 // An
+data8 0x4041852200000000 // overflow boundary
+data8 0x3FE8943CBBB4B727 // C21
+data8 0xBFCB39D466E11756 // C31
+data8 0x3FE879AF3243D8C1 // C00
+data8 0x3FEEC7DEBB14CE1E // C10
+data8 0x401017B79BA80BCB // C40
+data8 0x401E941DC3C4DE80 // C50
+//
+//[32; 40) GR_Offs = 0x1C
+data8 0x3FF7ECB3A0E8FE5C // C01
+data8 0x3FF3815A8516316B // C11
+data8 0xBFF9ABD8FCC000C3 // C41
+data8 0xC00DD89969A4195B // C51
+data8 0x3FF2E43139CBF563 // C20
+data8 0x3FFF96DC3474A606 // C30
+data8 0x46AFF4CA9B0DDDF0 // An
+data8 0x4041852200000000 // overflow boundary
+data8 0x3FE4CE76DA1B5783 // C21
+data8 0xBFD0524DB460BC4E // C31
+data8 0x3FE35852DF14E200 // C00
+data8 0x3FE8C7610359F642 // C10
+data8 0x400BCF750EC16173 // C40
+data8 0x401AC14E02EA701C // C50
+//
+//[40; 48) GR_Offs = 0x1C + 0x7
+data8 0x3FF5DCE4D8193097 // C01
+data8 0x3FF1B0D8C4974FFA // C11
+data8 0xBFF8FB450194CAEA // C41
+data8 0xC00C9658E030A6C4 // C51
+data8 0x3FF068851118AB46 // C20
+data8 0x3FFBF7C7BB46BF7D // C30
+data8 0x3FF0000000000000 // An (1.0 here; 1/An for this interval is stored at the end of the table)
+data8 0x4041852200000000 // overflow boundary
+data8 0x3FE231DEB11D847A // C21
+data8 0xBFD251ECAFD7E935 // C31
+data8 0x3FE0368AE288F6BF // C00
+data8 0x3FE513AE4215A70C // C10
+data8 0x4008F960F7141B8B // C40
+data8 0x40183BA08134397B // C50
+//
+//[1.0; 1.25)
+data8 0xBFD9909648921868 // A7
+data8 0x3FE96FFEEEA8520F // A6
+data8 0xBFED0800D93449B8 // A3
+data8 0x3FEFA648D144911C // A2
+data8 0xBFEE3720F7720B4D // A5
+data8 0x3FEF4857A010CA3B // A4
+data8 0xBFE2788CCD545AA4 // A1
+data8 0x3FEFFFFFFFE9209E // A0
+//
+//[1.25; 1.5)
+data8 0xBFB421236426936C // A7
+data8 0x3FAF237514F36691 // A6
+data8 0xBFC0BADE710A10B9 // A3
+data8 0x3FDB6C5465BBEF1F // A2
+data8 0xBFB7E7F83A546EBE // A5
+data8 0x3FC496A01A545163 // A4
+data8 0xBDEE86A39D8452EB // A1
+data8 0x3FEC56DC82A39AA2 // A0
+//
+//[1.5; 1.75)
+data8 0xBF94730B51795867 // A7
+data8 0x3FBF4203E3816C7B // A6
+data8 0xBFE85B427DBD23E4 // A3
+data8 0x3FEE65557AB26771 // A2
+data8 0xBFD59D31BE3AB42A // A5
+data8 0x3FE3C90CC8F09147 // A4
+data8 0xBFE245971DF735B8 // A1
+data8 0x3FEFFC613AE7FBC8 // A0
+//
+//[1.75; 2.0)
+data8 0xBF7746A85137617E // A7
+data8 0x3FA96E37D09735F3 // A6
+data8 0xBFE3C24AC40AC0BB // A3
+data8 0x3FEC56A80A977CA5 // A2
+data8 0xBFC6F0E707560916 // A5
+data8 0x3FDB262D949175BE // A4
+data8 0xBFE1C1AEDFB25495 // A1
+data8 0x3FEFEE1E644B2022 // A0
+//
+// sin(pi*x)/pi
+data8 0xC026FB0D377656CC // S01
+data8 0x3FFFB15F95A22324 // S11
+data8 0x406CE58F4A41C6E7 // S10
+data8 0x404453786302C61E // S20
+data8 0xC023D59A47DBFCD3 // S21
+data8 0x405541D7ABECEFCA // S00
+//
+// 1/An for [40; 48)
+data8 0xCAA7576DE621FCD5, 0x3F68
+LOCAL_OBJECT_END(tgammaf_data)
+
+//==============================================================
+// Code
+//==============================================================
+
+.section .text
+GLOBAL_LIBM_ENTRY(tgammaf)
+{ .mfi
+ getf.exp GR_SignExp = f8 // sign and exponent field of x
+ fma.s1 FR_NormX = f8,f1,f0 // x*1 + 0: working copy of x
+ addl GR_ad_Data = @ltoff(tgammaf_data), gp // linkage-table slot of the data table
+}
+{ .mfi
+ mov GR_ExpOf05 = 0xFFFE // exponent field of 0.5
+ fcvt.fx.trunc.s1 FR_iXt = f8 // [x]
+ mov GR_Offs = 0 // 2 <= x < 8
+};;
+{ .mfi
+ getf.d GR_Arg = f8 // x as a 64-bit double image
+ fcmp.lt.s1 p14,p15 = f8,f0 // p14 if x < 0, p15 otherwise
+ mov GR_Tbl12Offs = 0
+}
+{ .mfi
+ setf.exp FR_05 = GR_ExpOf05
+ fma.s1 FR_2 = f1,f1,f1 // 2
+ mov GR_Correction = 0
+};;
+{ .mfi
+ ld8 GR_ad_Data = [GR_ad_Data] // address of tgammaf_data
+ fclass.m p10,p0 = f8,0x1E7 // is x NaTVal, NaN, +/-0 or +/-INF?
+ tbit.z p12,p13 = GR_SignExp,16 // p13 if |x| >= 2
+}
+{ .mfi
+ mov GR_ExpOf1 = 0xFFFF // exponent field of 1.0
+ fcvt.fx.s1 FR_rs = f8 // round(x)
+ and GR_Exp2Ind = 7,GR_SignExp // low 3 exponent bits
+};;
+.pred.rel "mutex",p14,p15
+{ .mfi
+(p15) cmp.eq.unc p11,p0 = GR_ExpOf1,GR_SignExp // p11 if 1 <= x < 2
+(p14) fma.s1 FR_1mX = f1,f1,f8 // 1 - |x|
+ mov GR_Sig = 0 // if |x| < 2
+}
+{ .mfi
+(p13) cmp.eq.unc p7,p0 = 2,GR_Exp2Ind // p7 if 8 <= |x| < 16
+(p15) fms.s1 FR_1mX = f1,f1,f8 // 1 - |x|
+(p13) cmp.eq.unc p8,p0 = 3,GR_Exp2Ind // p8 if 16 <= |x| < 32
+};;
+.pred.rel "mutex",p7,p8
+{ .mfi
+(p7) mov GR_Offs = 0x7 // 8 <= |x| < 16
+ nop.f 0
+(p8) tbit.z.unc p0,p6 = GR_Arg,51 // p6 if 24 <= |x| < 32
+}
+{ .mib
+(p13) cmp.lt.unc p9,p0 = 3,GR_Exp2Ind // p9 if 32 <= |x|
+(p8) mov GR_Offs = 0xE // 16 <= |x| < 32
+ // jump if x is NaTVal, NaN, +/-0 or +/-INF?
+(p10) br.cond.spnt tgammaf_spec_args
+};;
+.pred.rel "mutex",p14,p15
+.pred.rel "mutex",p6,p9
+{ .mfi
+(p9) mov GR_Offs = 0x1C // 32 <= |x|
+(p14) fma.s1 FR_X2mX = FR_NormX,FR_NormX,FR_NormX // x^2-|x|
+(p9) tbit.z.unc p0,p8 = GR_Arg,50 // p8 if mantissa bit 50 is set (40 <= |x| case)
+}
+{ .mfi
+ ldfpd FR_LocalMin,FR_10 = [GR_ad_Data],16
+(p15) fms.s1 FR_X2mX = FR_NormX,FR_NormX,FR_NormX // x^2-|x|
+(p6) add GR_Offs = 0x7,GR_Offs // 24 <= x < 32
+};;
+.pred.rel "mutex",p8,p12
+{ .mfi
+ add GR_ad_Ce = 0x50,GR_ad_Data
+(p15) fcmp.lt.unc.s1 p10,p0 = f8,f1 // p10 if 0 <= x < 1
+ mov GR_OvfNzBound = 2
+}
+{ .mib
+ ldfpd FR_S32,FR_S31 = [GR_ad_Data],16
+(p8) add GR_Offs = 0x7,GR_Offs // 40 <= |x|
+ // jump if 1 <= x < 2
+(p11) br.cond.spnt tgammaf_from_1_to_2
+};;
+{ .mfi
+ shladd GR_ad_Ce = GR_Offs,4,GR_ad_Ce // 16 bytes per GR_Offs unit
+ fcvt.xf FR_Xt = FR_iXt // [x]
+(p13) cmp.eq.unc p7,p0 = r0,GR_Offs // p7 if 2 <= |x| < 8
+}
+{ .mfi
+ shladd GR_ad_Co = GR_Offs,4,GR_ad_Data // 16 bytes per GR_Offs unit
+ fma.s1 FR_6 = FR_2,FR_2,FR_2 // 2*2 + 2 = 6
+ mov GR_ExpOf05 = 0x7FC // top 13 bits of the double 0.5 (compared after shr 51)
+};;
+{ .mfi
+(p13) getf.sig GR_Sig = FR_iXt // if |x| >= 2
+ frcpa.s1 FR_Rcp0,p0 = f1,FR_NormX // initial approximation of 1/x
+(p10) shr GR_Arg = GR_Arg,51 // keep sign, exponent and top mantissa bit
+}
+{ .mib
+ ldfpd FR_C01,FR_C11 = [GR_ad_Co],16
+(p7) mov GR_Correction = 2
+ // jump if 0 < x < 1
+(p10) br.cond.spnt tgammaf_from_0_to_1
+};;
+{ .mfi
+ ldfpd FR_C21,FR_C31 = [GR_ad_Ce],16
+ fma.s1 FR_Rq2 = f1,f1,FR_1mX // 2 - |x|
+(p14) sub GR_Correction = r0,GR_Correction
+}
+{ .mfi
+ ldfpd FR_C41,FR_C51 = [GR_ad_Co],16
+(p14) fcvt.xf FR_rs = FR_rs // round(x) back to floating point
+(p14) add GR_ad_SinO = 0x3A0,GR_ad_Data
+};;
+.pred.rel "mutex",p14,p15
+{ .mfi
+ ldfpd FR_C00,FR_C10 = [GR_ad_Ce],16
+ nop.f 0
+(p14) sub GR_Sig = GR_Correction,GR_Sig
+}
+{ .mfi
+ ldfpd FR_C20,FR_C30 = [GR_ad_Co],16
+ fma.s1 FR_Rq1 = FR_1mX,FR_2,FR_X2mX // (x-1)*(x-2)
+(p15) sub GR_Sig = GR_Sig,GR_Correction
+};;
+{ .mfi
+(p14) ldfpd FR_S01,FR_S11 = [GR_ad_SinO],16
+ fma.s1 FR_Rq3 = FR_2,f1,FR_1mX // 3 - |x|
+ and GR_RqDeg = 0x6,GR_Sig
+}
+{ .mfi
+ ldfpd FR_C40,FR_C50 = [GR_ad_Ce],16
+(p14) fma.d.s0 FR_X = f0,f0,f8 // set deno flag
+ mov GR_NanBound = 0x30016 // -2^23
+};;
+.pred.rel "mutex",p14,p15
+{ .mfi
+(p14) add GR_ad_SinE = 0x3C0,GR_ad_Data
+(p15) fms.s1 FR_r = FR_NormX,f1,FR_Xt // r = x - [x]
+ cmp.eq p8,p0 = 2,GR_RqDeg
+}
+{ .mfi
+ ldfpd FR_An,FR_OvfBound = [GR_ad_Co]
+(p14) fms.s1 FR_r = FR_Xt,f1,FR_NormX // r = |x - [x]|
+ cmp.eq p9,p0 = 4,GR_RqDeg
+};;
+.pred.rel "mutex",p8,p9
+{ .mfi
+(p14) ldfpd FR_S21,FR_S00 = [GR_ad_SinE],16
+(p8) fma.s1 FR_Rq0 = FR_2,f1,FR_1mX // (3-x)
+ tbit.z p0,p6 = GR_Sig,0 // p6 if GR_Sig is odd
+}
+{ .mfi
+(p14) ldfpd FR_S10,FR_S20 = [GR_ad_SinO],16
+(p9) fma.s1 FR_Rq0 = FR_2,FR_2,FR_1mX // (5-x)
+ cmp.eq p10,p0 = 6,GR_RqDeg
+};;
+{ .mfi
+(p14) getf.s GR_Arg = f8 // x as a 32-bit single image
+(p14) fcmp.eq.unc.s1 p13,p0 = FR_NormX,FR_Xt // p13 if x == [x]
+(p14) mov GR_ZeroResBound = 0xC22C // -43
+}
+{ .mfi
+(p14) ldfe FR_InvAn = [GR_ad_SinE]
+(p10) fma.s1 FR_Rq0 = FR_6,f1,FR_1mX // (7-x)
+ cmp.eq p7,p0 = r0,GR_RqDeg
+};;
+{ .mfi
+(p14) cmp.ge.unc p11,p0 = GR_SignExp,GR_NanBound
+ fma.s1 FR_Rq2 = FR_Rq2,FR_6,FR_X2mX // (x-3)*(x-4)
+(p14) shl GR_ZeroResBound = GR_ZeroResBound,16 // 0xC22C0000, single image of -43
+}
+{ .mfb
+(p14) mov GR_OvfNzBound = 0x802
+(p14) fms.s1 FR_rs = FR_rs,f1,FR_NormX // rs = round(x) - x
+ // jump if x <= -2^23, i.e. x is a negative integer
+(p11) br.cond.spnt tgammaf_singularity
+};;
+{ .mfi
+ nop.m 0
+(p7) fma.s1 FR_Rq1 = f0,f0,f1 // Rq1 = 1
+(p14) shl GR_OvfNzBound = GR_OvfNzBound,20 // 0x80200000
+}
+{ .mfb
+ nop.m 0
+ fma.s1 FR_Rq3 = FR_Rq3,FR_10,FR_X2mX // (x-5)*(x-6)
+ // jump if x is negative integer such that -2^23 < x < 0
+(p13) br.cond.spnt tgammaf_singularity
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C01 = FR_C01,f1,FR_r
+(p14) mov GR_ExpOf05 = 0xFFFE
+}
+{ .mfi
+(p14) cmp.eq.unc p7,p0 = GR_Arg,GR_OvfNzBound
+ fma.s1 FR_C11 = FR_C11,f1,FR_r
+(p14) cmp.ltu.unc p11,p0 = GR_Arg,GR_OvfNzBound
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C21 = FR_C21,f1,FR_r
+(p14) cmp.ltu.unc p9,p0 = GR_ZeroResBound,GR_Arg
+}
+{ .mfb
+ nop.m 0
+ fma.s1 FR_C31 = FR_C31,f1,FR_r
+ // jump if argument is close to 0 negative
+(p11) br.cond.spnt tgammaf_overflow
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C41 = FR_C41,f1,FR_r
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ fma.s1 FR_C51 = FR_C51,f1,FR_r
+ // jump if x is negative noninteger such that -2^23 < x < -43
+(p9) br.cond.spnt tgammaf_underflow
+};;
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_rs2 = FR_rs,FR_rs,f0 // rs^2
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p14) fma.s1 FR_S01 = FR_rs,FR_rs,FR_S01
+ // jump if argument is 0x80200000
+(p7) br.cond.spnt tgammaf_overflow_near0_bound
+};;
+{ .mfi
+ nop.m 0
+(p6) fnma.s1 FR_Rq1 = FR_Rq1,FR_Rq0,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p10) fma.s1 FR_Rq2 = FR_Rq2,FR_Rq3,f0
+ and GR_Sig = 0x7,GR_Sig
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C01 = FR_C01,FR_r,FR_C00
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C11 = FR_C11,FR_r,FR_C10
+ cmp.eq p6,p7 = r0,GR_Sig // p6 if |x| from one of base intervals
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C21 = FR_C21,FR_r,FR_C20
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C31 = FR_C31,FR_r,FR_C30
+(p7) cmp.lt.unc p9,p0 = 2,GR_RqDeg
+};;
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_S11 = FR_rs,FR_rs,FR_S11
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_S21 = FR_rs,FR_rs,FR_S21
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C41 = FR_C41,FR_r,FR_C40
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_S32 = FR_rs2,FR_S32,FR_S31
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p9) fma.s1 FR_Rq1 = FR_Rq1,FR_Rq2,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C51 = FR_C51,FR_r,FR_C50
+ nop.i 0
+};;
+{ .mfi
+(p14) getf.exp GR_SignExp = FR_rs
+ fma.s1 FR_C01 = FR_C01,FR_C11,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_S01 = FR_S01,FR_rs2,FR_S00
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C21 = FR_C21,FR_C31,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // NR-iteration
+(p14) fnma.s1 FR_InvNormX1 = FR_Rcp0,FR_NormX,f1
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_S11 = FR_S11,FR_rs2,FR_S10
+(p14) tbit.z.unc p11,p12 = GR_SignExp,17 // p12 if rs is negative
+}
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_S21 = FR_S21,FR_rs2,FR_S20
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p15) fcmp.lt.unc.s1 p0,p13 = FR_NormX,FR_OvfBound // p13 if x >= overflow boundary
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_S32 = FR_rs2,FR_S32,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C41 = FR_C41,FR_C51,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p7) fma.s1 FR_An = FR_Rq1,FR_An,f0
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+ nop.f 0
+ // jump if x >= 35.04010009765625
+(p13) br.cond.spnt tgammaf_overflow
+};;
+{ .mfi
+ nop.m 0
+ // NR-iteration
+(p14) fma.s1 FR_InvNormX1 = FR_Rcp0,FR_InvNormX1,FR_Rcp0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_S01 = FR_S01,FR_S11,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_S21 = FR_S21,FR_S32,f0
+ nop.i 0
+};;
+{ .mfi
+(p14) getf.exp GR_SignExp = FR_NormX
+ fma.s1 FR_C01 = FR_C01,FR_C21,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_C41 = FR_C41,FR_An,f0
+(p14) mov GR_ExpOf1 = 0x2FFFF // sign bit + exponent field of 1.0
+};;
+{ .mfi
+ nop.m 0
+ // NR-iteration
+(p14) fnma.s1 FR_InvNormX2 = FR_InvNormX1,FR_NormX,f1
+ nop.i 0
+};;
+.pred.rel "mutex",p11,p12
+{ .mfi
+ nop.m 0
+(p12) fnma.s1 FR_S01 = FR_S01,FR_S21,f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p11) fma.s1 FR_S01 = FR_S01,FR_S21,f0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_GAMMA = FR_C01,FR_C41,f0
+(p14) tbit.z.unc p6,p7 = GR_Sig,0
+}
+{ .mfb
+ nop.m 0
+(p15) fma.s.s0 f8 = FR_C01,FR_C41,f0
+(p15) br.ret.spnt b0 // exit for positives
+};;
+.pred.rel "mutex",p11,p12
+{ .mfi
+ nop.m 0
+(p12) fms.s1 FR_S01 = FR_rs,FR_S01,FR_rs
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p11) fma.s1 FR_S01 = FR_rs,FR_S01,FR_rs
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ // NR-iteration
+ fma.s1 FR_InvNormX2 = FR_InvNormX1,FR_InvNormX2,FR_InvNormX1
+ cmp.eq p10,p0 = 0x23,GR_Offs // p10 if the 1/An scaling applies
+};;
+.pred.rel "mutex",p6,p7
+{ .mfi
+ nop.m 0
+(p6) fma.s1 FR_GAMMA = FR_S01,FR_GAMMA,f0
+ cmp.gtu p8,p0 = GR_SignExp,GR_ExpOf1
+}
+{ .mfi
+ nop.m 0
+(p7) fnma.s1 FR_GAMMA = FR_S01,FR_GAMMA,f0
+ cmp.eq p9,p0 = GR_SignExp,GR_ExpOf1
+};;
+{ .mfi
+ nop.m 0
+ // NR-iteration
+ fnma.s1 FR_InvNormX1 = FR_InvNormX2,FR_NormX,f1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p10) fma.s1 FR_InvNormX2 = FR_InvNormX2,FR_InvAn,f0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ frcpa.s1 FR_Rcp0,p0 = f1,FR_GAMMA // initial approximation of 1/FR_GAMMA
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fms.s1 FR_Multplr = FR_NormX,f1,f1 // x - 1
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ // NR-iteration
+ fnma.s1 FR_Rcp1 = FR_Rcp0,FR_GAMMA,f1
+ nop.i 0
+};;
+.pred.rel "mutex",p8,p9
+{ .mfi
+ nop.m 0
+ // 1/x or 1/(An*x)
+(p8) fma.s1 FR_Multplr = FR_InvNormX2,FR_InvNormX1,FR_InvNormX2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p9) fma.s1 FR_Multplr = f1,f1,f0 // multiplier = 1
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ // NR-iteration
+ fma.s1 FR_Rcp1 = FR_Rcp0,FR_Rcp1,FR_Rcp0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ // NR-iteration
+ fnma.s1 FR_Rcp2 = FR_Rcp1,FR_GAMMA,f1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // NR-iteration
+ fma.s1 FR_Rcp1 = FR_Rcp1,FR_Multplr,f0
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+ fma.s.s0 f8 = FR_Rcp1,FR_Rcp2,FR_Rcp1 // final single-precision result for negative x
+ br.ret.sptk b0
+};;
+
+// here if 0 < x < 1 (GR_Arg holds the top 13 bits of the double image of x)
+//--------------------------------------------------------------------
+.align 32
+tgammaf_from_0_to_1:
+{ .mfi
+ cmp.lt p7,p0 = GR_Arg,GR_ExpOf05 // p7 if x < 0.5
+ // NR-iteration
+ fnma.s1 FR_Rcp1 = FR_Rcp0,FR_NormX,f1
+ cmp.eq p8,p0 = GR_Arg,GR_ExpOf05 // p8 if 0.5 <= x < 0.75
+}
+{ .mfi
+ cmp.gt p9,p0 = GR_Arg,GR_ExpOf05 // p9 if 0.75 <= x
+ fma.s1 FR_r = f0,f0,FR_NormX // reduced arg for (0;1)
+ mov GR_ExpOf025 = 0x7FA // top 13 bits of the double 0.25
+};;
+{ .mfi
+ getf.s GR_ArgNz = f8 // x as a 32-bit single image
+ fma.d.s0 FR_X = f0,f0,f8 // set deno flag
+ shl GR_OvfNzBound = GR_OvfNzBound,20 // 2 << 20 = 0x00200000
+}
+{ .mfi
+(p8) mov GR_Tbl12Offs = 0x80 // 0.5 <= x < 0.75
+ nop.f 0
+(p7) cmp.ge.unc p6,p0 = GR_Arg,GR_ExpOf025 // p6 if 0.25 <= x < 0.5
+};;
+.pred.rel "mutex",p6,p9
+{ .mfi
+(p9) mov GR_Tbl12Offs = 0xC0 // 0.75 <= x < 1
+ nop.f 0
+(p6) mov GR_Tbl12Offs = 0x40 // 0.25 <= x < 0.5
+}
+{ .mfi
+ add GR_ad_Ce = 0x2C0,GR_ad_Data
+ nop.f 0
+ add GR_ad_Co = 0x2A0,GR_ad_Data
+};;
+{ .mfi
+ add GR_ad_Co = GR_ad_Co,GR_Tbl12Offs // select the coefficient set of the subinterval
+ nop.f 0
+ cmp.lt p12,p0 = GR_ArgNz,GR_OvfNzBound // p12 if x is below the near-zero overflow bound
+}
+{ .mib
+ add GR_ad_Ce = GR_ad_Ce,GR_Tbl12Offs // select the coefficient set of the subinterval
+ cmp.eq p7,p0 = GR_ArgNz,GR_OvfNzBound
+ // jump if argument is 0x00200000
+(p7) br.cond.spnt tgammaf_overflow_near0_bound
+};;
+{ .mmb
+ ldfpd FR_A7,FR_A6 = [GR_ad_Co],16
+ ldfpd FR_A5,FR_A4 = [GR_ad_Ce],16
+ // jump if argument is close to 0 positive
+(p12) br.cond.spnt tgammaf_overflow
+};;
+{ .mfi
+ ldfpd FR_A3,FR_A2 = [GR_ad_Co],16
+ // NR-iteration
+ fma.s1 FR_Rcp1 = FR_Rcp0,FR_Rcp1,FR_Rcp0
+ nop.i 0
+}
+{ .mfb
+ ldfpd FR_A1,FR_A0 = [GR_ad_Ce],16
+ nop.f 0
+ br.cond.sptk tgamma_from_0_to_2 // continue with the shared polynomial evaluation
+};;
+
+// here if 1 <= x < 2; falls through into tgamma_from_0_to_2
+//--------------------------------------------------------------------
+.align 32
+tgammaf_from_1_to_2:
+{ .mfi
+ add GR_ad_Co = 0x2A0,GR_ad_Data
+ fms.s1 FR_r = f0,f0,FR_1mX // r = -(1 - x) = x - 1
+ shr GR_TblOffs = GR_Arg,47 // bring mantissa bits 51:50 down to bits 4:3
+}
+{ .mfi
+ add GR_ad_Ce = 0x2C0,GR_ad_Data
+ nop.f 0
+ mov GR_TblOffsMask = 0x18 // mask for bits 4:3
+};;
+{ .mfi
+ nop.m 0
+ nop.f 0
+ and GR_TblOffs = GR_TblOffs,GR_TblOffsMask // 8 * (quarter index of x within [1;2))
+};;
+{ .mfi
+ shladd GR_ad_Co = GR_TblOffs,3,GR_ad_Co // 64 bytes per quarter
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ shladd GR_ad_Ce = GR_TblOffs,3,GR_ad_Ce // 64 bytes per quarter
+ nop.f 0
+ cmp.eq p6,p7 = 8,GR_TblOffs // p6 if 1.25 <= x < 1.5
+};;
+{ .mmi
+ ldfpd FR_A7,FR_A6 = [GR_ad_Co],16
+ ldfpd FR_A5,FR_A4 = [GR_ad_Ce],16
+ nop.i 0
+};;
+{ .mmi
+ ldfpd FR_A3,FR_A2 = [GR_ad_Co],16
+ ldfpd FR_A1,FR_A0 = [GR_ad_Ce],16
+ nop.i 0
+};;
+
+.align 32
+tgamma_from_0_to_2: // shared degree-7 polynomial evaluation for 0 < x < 1 (p10) and 1 <= x < 2 (p11)
+{ .mfi
+ nop.m 0
+(p6) fms.s1 FR_r = FR_r,f1,FR_LocalMin // r -= FR_LocalMin on the p6 subinterval
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ // NR-iteration
+(p10) fnma.s1 FR_Rcp2 = FR_Rcp1,FR_NormX,f1
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r2 = FR_r,FR_r,f0 // r^2
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A7 = FR_A7,FR_r,FR_A6 // A7*r + A6
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A5 = FR_A5,FR_r,FR_A4 // A5*r + A4
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A3 = FR_A3,FR_r,FR_A2 // A3*r + A2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A1 = FR_A1,FR_r,FR_A0 // A1*r + A0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ // NR-iteration
+(p10) fma.s1 FR_Rcp2 = FR_Rcp1,FR_Rcp2,FR_Rcp1
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A7 = FR_A7,FR_r2,FR_A5 // (A7*r + A6)*r^2 + (A5*r + A4)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r4 = FR_r2,FR_r2,f0 // r^4
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_A3 = FR_A3,FR_r2,FR_A1 // (A3*r + A2)*r^2 + (A1*r + A0)
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+(p10) fma.s1 FR_GAMMA = FR_A7,FR_r4,FR_A3 // 0 < x < 1: polynomial value, still to be scaled by 1/x
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p11) fma.s.s0 f8 = FR_A7,FR_r4,FR_A3 // 1 <= x < 2: polynomial value is the result
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+(p10) fma.s.s0 f8 = FR_GAMMA,FR_Rcp2,f0 // 0 < x < 1: result = polynomial * (1/x)
+ br.ret.sptk b0
+};;
+
+
+// overflow boundary near 0 (argument 0x00200000 or 0x80200000)
+//--------------------------------------------------------------------
+.align 32
+tgammaf_overflow_near0_bound:
+.pred.rel "mutex",p14,p15
+{ .mfi
+ mov GR_fpsr = ar.fpsr
+ nop.f 0
+(p15) mov r8 = 0x7f8 // positive x: 0x7f8 << 20 = 0x7f800000
+}
+{ .mfi
+ nop.m 0
+ nop.f 0
+(p14) mov r8 = 0xff8 // negative x: 0xff8 << 20 = 0xff800000
+};;
+{ .mfi
+ nop.m 0
+ nop.f 0
+ shl r8 = r8,20
+};;
+{ .mfi
+ sub r8 = r8,r0,1 // r8 - 0 - 1: 0x7f7fffff or 0xff7fffff
+ nop.f 0
+ extr.u GR_fpsr = GR_fpsr,10,2 // rounding mode
+};;
+.pred.rel "mutex",p14,p15
+{ .mfi
+ // set p8 to 0 in case of overflow and to 1 otherwise
+ // for negative arg:
+ // no overflow if rounding mode either Z or +Inf, i.e.
+ // GR_fpsr > 1
+(p14) cmp.lt p8,p0 = 1,GR_fpsr
+ nop.f 0
+ // for positive arg:
+ // no overflow if rounding mode either Z or -Inf, i.e.
+ // (GR_fpsr & 1) == 1
+(p15) tbit.z p0,p8 = GR_fpsr,0
+};;
+{ .mib
+(p8) setf.s f8 = r8 // set result to 0x7f7fffff (0xff7fffff for negative x) without
+ // OVERFLOW flag raising
+ nop.i 0
+(p8) br.ret.sptk b0 // otherwise fall through to tgammaf_overflow
+};;
+
+.align 32
+tgammaf_overflow: // build a signed overflowed result, then report it through tgammaf_libm_err
+{ .mfi
+ nop.m 0
+ nop.f 0
+ mov r8 = 0x1FFFE // very large exponent field, so that f9*f9 overflows
+};;
+{ .mfi
+ setf.exp f9 = r8
+ fmerge.s FR_X = f8,f8 // keep the original x for the error handler
+ nop.i 0
+};;
+.pred.rel "mutex",p14,p15
+{ .mfi
+ nop.m 0
+(p14) fnma.s.s0 f8 = f9,f9,f0 // set I,O and -INF result
+ mov GR_TAG = 261 // overflow
+}
+{ .mfb
+ nop.m 0
+(p15) fma.s.s0 f8 = f9,f9,f0 // set I,O and +INF result
+ br.cond.sptk tgammaf_libm_err
+};;
+
+// x is negative integer or -INF (+/-0 is completed in tgammaf_spec_args)
+//--------------------------------------------------------------------
+.align 32
+tgammaf_singularity:
+{ .mfi
+ nop.m 0
+ fmerge.s FR_X = f8,f8 // keep the original x for the error handler
+ mov GR_TAG = 262 // negative
+}
+{ .mfb
+ nop.m 0
+ frcpa.s0 f8,p0 = f0,f0 // 0/0: NaN result
+ br.cond.sptk tgammaf_libm_err
+};;
+// x is negative noninteger with big absolute value: return a signed underflowed result
+//--------------------------------------------------------------------
+.align 32
+tgammaf_underflow:
+{ .mfi
+ mov r8 = 0x00001 // smallest nonzero exponent field
+ nop.f 0
+ tbit.z p6,p7 = GR_Sig,0 // result sign is chosen by the low bit of GR_Sig
+};;
+{ .mfi
+ setf.exp f9 = r8 // f9 = tiny positive value
+ nop.f 0
+ nop.i 0
+};;
+.pred.rel "mutex",p6,p7
+{ .mfi
+ nop.m 0
+(p6) fms.s.s0 f8 = f9,f9,f9 // tiny*tiny - tiny: negative result that underflows in single precision
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p7) fma.s.s0 f8 = f9,f9,f9 // tiny*tiny + tiny: positive result that underflows in single precision
+ br.ret.sptk b0
+};;
+
+// x is natval, nan, +/-inf or +/-0; the +/-0 case falls through to tgammaf_libm_err
+//--------------------------------------------------------------------
+.align 32
+tgammaf_spec_args:
+{ .mfi
+ nop.m 0
+ fclass.m p6,p0 = f8,0x1E1 // Test x for natval, nan, +inf
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fclass.m p7,p8 = f8,0x7 // +/-0
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fmerge.s FR_X = f8,f8 // keep the original x for the error handler
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p6) fma.s.s0 f8 = f8,f1,f8 // x*1 + x: result for natval, nan, +inf
+(p6) br.ret.spnt b0
+};;
+.pred.rel "mutex",p7,p8
+{ .mfi
+(p7) mov GR_TAG = 262 // negative (same tag as tgammaf_singularity)
+(p7) frcpa.s0 f8,p0 = f1,f8 // 1/(+/-0): signed infinity result
+ nop.i 0
+}
+{ .mib
+ nop.m 0
+ nop.i 0
+(p8) br.cond.spnt tgammaf_singularity // remaining non-zero case is -inf
+};;
+
+.align 32
+tgammaf_libm_err: // common error exit; continues into __libm_error_region below
+{ .mfi
+ alloc r32 = ar.pfs,1,4,4,0 // 1 input, 4 local, 4 output registers
+ nop.f 0
+ mov GR_Parameter_TAG = GR_TAG // error tag argument for __libm_error_support
+};;
+
+GLOBAL_LIBM_END(tgammaf)
+LOCAL_LIBM_ENTRY(__libm_error_region)
+.prologue
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address (entry sp - 32)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new 64-byte stack frame
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+{ .mmi
+ stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack, then advance pointer by 16
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+.body
+{ .mib
+ stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
+}
+{ .mib
+ stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // Back to the Parameter 2 address
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 0
+ nop.m 0
+ add GR_Parameter_RESULT = 48,sp // Recompute Parameter 3 address (sp + 48)
+};;
+{ .mmi
+ ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack; NOTE(review): double-format spill/reload in a float routine - confirm against __libm_error_support
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region)
+.type __libm_error_support#,@function
+.global __libm_error_support#
+
diff --git a/sysdeps/ia64/fpu/w_tgammal.S b/sysdeps/ia64/fpu/w_tgammal.S
new file mode 100644
index 0000000000..75b1069d21
--- /dev/null
+++ b/sysdeps/ia64/fpu/w_tgammal.S
@@ -0,0 +1,4485 @@
+.file "tgammal.s"
+
+
+// Copyright (c) 2002 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2002 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+// History
+//==============================================================
+// 01/16/02 Initial version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align;
+// used data8 for long double table values
+// 03/17/03 Moved tgammal_libm_err label into .proc region
+// 04/10/03 Changed error codes for overflow and negative integers
+//
+// API
+//==============================================================
+// long double tgammal(long double)
+//
+// Resources Used:
+//
+// Floating-Point Registers: f8-f15
+// f32-f127
+//
+// General Purpose Registers: r32-r67
+//
+// Predicate Registers: p6-p15
+//
+//*********************************************************************
+//
+// IEEE Special Conditions:
+//
+// tgammal(+inf) = +inf
+// tgammal(-inf) = QNaN
+// tgammal(+/-0) = +/-inf
+// tgammal(x<0, x - integer) = QNaN
+// tgammal(SNaN) = QNaN
+// tgammal(QNaN) = QNaN
+//
+//*********************************************************************
+// Overview of operation
+//==============================================================
+//
+// Algorithm description
+// ---------------------
+//
+// There are 3 main paths in the implementation
+// (and additional special values branches)
+//
+// 1) |X| >= 13 - Stirling formula computation
+// a) Positive arguments:
+// TGAMMAL(X) = exp((X-0.5)*ln(X) - X + C + S(Z)),
+// where C = 0.5*ln(2*Pi), Z = 1/X, S(Z) - Bernoulli polynomial
+// (up to 'B18' term).
+// Some of these calculations are done in multiprecision.
+// Ln returns a multiprecision result too,
+// and exp also accepts and returns a pair of values.
+//
+// b) Negative arguments
+// TGAMMAL(-X) = PI/(X*TGAMMAL(X)*sin(PI*X)).
+// (X*sin(PI*X))/PI calculated in parallel with TGAMMAL.
+// Here we use polynomial of 9th degree with 2 multiprecision steps.
+// Argument range reduction is:
+// N = [x] with round to nearest, r = x - N, -0.5 <= r < 0.5
+// After ((X-0.5)*ln(X) - X + C + S(Z)) completed we just invert
+// its result and compute exp with negative argument (1/exp(x)=exp(-x))
+// Then we multiply the exp result by PI/(X*sin(PI*X)).
+//
+// 2) 1 <= |X| < 13 - Polynomial part
+// a) Positive arguments:
+// All values are split into intervals such as:
+// #0->[2;3], #1->[3;4], #2->[4;5]...
+// For even intervals we just use polynomial computation with degree 20
+// and first 6 multiprecision computations.
+// Range reduction looks like
+// N = [x] with truncate, r = x - N - 0.5, -0.5 <= r < 0.5
+// For odd intervals we use the recurrence formula:
+// TGAMMAL(X) = TGAMMAL(X-1)*(X-1)
+// The [1;2] interval is split into 3 subranges:
+// [1;1.25], [1.25;1.75], [1.75;2] with the same polynomial forms
+//
+// b) Negative arguments
+// TGAMMAL(-X) = PI/(X*TGAMMAL(X)*sin(PI*X)).
+// (X*sin(PI*X))/PI calculated in parallel with TGAMMAL.
+// After multiplication by TGAMMAL(X) result we calculate reciprocal
+// and get final result.
+//
+// 3) 0 < |X| < 1 - Near 0 part
+// a) Here we use the recurrence formula TGAMMAL(X) = TGAMMAL(X+1)/X
+// TGAMMAL(X+1) is calculated as shown above,
+// and the 1/X result is obtained in parallel. Then we just multiply these values.
+// The only additional separate subrange is [0;0.125], which has its own
+// set of polynomial constants.
+//
+// b) Negative arguments
+// TGAMMAL(-X) = PI/(TGAMMAL(X+1)*sin(PI*X)).
+// There is no need to compute 1/X.
+
+
+
+RODATA
+
+.align 16
+LOCAL_OBJECT_START(Constants_Tgammal_log_80_Q)
+// One row per value (significand low word, significand high word, sign/exponent, pad): log2_hi, log2_lo, Q_6, Q_5, Q_4, Q_3, Q_2, Q_1
+data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
+data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
+data4 0xA51BE0AF,0x92492453,0x00003FFC,0x00000000
+data4 0xA0CFD29F,0xAAAAAB73,0x0000BFFC,0x00000000
+data4 0xCCCE3872,0xCCCCCCCC,0x00003FFC,0x00000000
+data4 0xFFFFB4FB,0xFFFFFFFF,0x0000BFFC,0x00000000
+data4 0xAAAAAAAB,0xAAAAAAAA,0x00003FFD,0x00000000
+data4 0x00000000,0x80000000,0x0000BFFE,0x00000000
+LOCAL_OBJECT_END(Constants_Tgammal_log_80_Q)
+
+.align 64
+LOCAL_OBJECT_START(Constants_Tgammal_log_80_Z_G_H_h1)
+// Two rows per entry: {Z1 - 16 bit fixed, G1 and H1 IEEE single, pad}, then {h1 as significand low/high words, sign/exponent, pad}
+data4 0x00008000,0x3F800000,0x00000000,0x00000000
+data4 0x00000000,0x00000000,0x00000000,0x00000000
+data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000
+data4 0xEBA0E0D1,0x8B1D330B,0x00003FDA,0x00000000
+data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000
+data4 0x9EADD553,0xE2AF365E,0x00003FE2,0x00000000
+data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000
+data4 0x752F34A2,0xF585FEC3,0x0000BFE3,0x00000000
+data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000
+data4 0x893B03F3,0xF3546435,0x00003FE2,0x00000000
+data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000
+data4 0x39CDD2AC,0xBABA62E0,0x00003FE4,0x00000000
+data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000
+data4 0x457978A1,0x8718789F,0x00003FE2,0x00000000
+data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000
+data4 0x3185E56A,0x9442DF96,0x0000BFE4,0x00000000
+data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000
+data4 0x2BBE2CBD,0xCBF9A4BF,0x00003FE4,0x00000000
+data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000
+data4 0x852D5935,0xF3537535,0x00003FE3,0x00000000
+data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000
+data4 0x46CDF32F,0xA1F1E699,0x0000BFDF,0x00000000
+data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000
+data4 0xD8484CE3,0x84A61856,0x00003FE4,0x00000000
+data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000
+data4 0xFF28821B,0xC7DD97E0,0x0000BFE2,0x00000000
+data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000
+data4 0xEF1FD32F,0xD3C4A887,0x00003FE3,0x00000000
+data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000
+data4 0x464C76DA,0x84672BE6,0x00003FE5,0x00000000
+data4 0x00004211,0x3F042108,0x3F29516A,0x00000000
+data4 0x18835FB9,0x9A43A511,0x0000BFE5,0x00000000
+LOCAL_OBJECT_END(Constants_Tgammal_log_80_Z_G_H_h1)
+
+.align 64
+LOCAL_OBJECT_START(Constants_Tgammal_log_80_Z_G_H_h2)
+// Two rows per entry: {Z2 - 16 bit fixed, G2 and H2 IEEE single, pad}, then {h2 as significand low/high words, sign/exponent, pad}
+data4 0x00008000,0x3F800000,0x00000000,0x00000000
+data4 0x00000000,0x00000000,0x00000000,0x00000000
+data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000
+data4 0x211398BF,0xAD08B116,0x00003FDB,0x00000000
+data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000
+data4 0xC376958E,0xB106790F,0x00003FDE,0x00000000
+data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000
+data4 0x79A7679A,0xFD03F242,0x0000BFDA,0x00000000
+data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000
+data4 0x05E7AE08,0xF03F81C3,0x0000BFDF,0x00000000
+data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000
+data4 0x049EB22F,0xD1B87D3C,0x00003FDE,0x00000000
+data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000
+data4 0x3A9E81E0,0xFABC8B95,0x00003FDF,0x00000000
+data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000
+data4 0x7C4B5443,0xF5F3653F,0x00003FDF,0x00000000
+data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000
+data4 0xF65A1773,0xE78AB204,0x00003FE0,0x00000000
+data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000
+data4 0x7B8EF695,0xDB7CBFFF,0x0000BFE0,0x00000000
+data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000
+data4 0xCF773FB3,0xC0241AEA,0x0000BFE0,0x00000000
+data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000
+data4 0xC9539FDF,0xFC8F4D48,0x00003FE1,0x00000000
+data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000
+data4 0x954665C2,0x9CD035FB,0x0000BFE1,0x00000000
+data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000
+data4 0xDD367A30,0xEC9017C7,0x00003FE1,0x00000000
+data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000
+data4 0xCB11189C,0xEE6625D3,0x0000BFE1,0x00000000
+data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000
+data4 0xBE11C424,0xA49C8DB5,0x0000BFE0,0x00000000
+LOCAL_OBJECT_END(Constants_Tgammal_log_80_Z_G_H_h2)
+
+.align 64
+LOCAL_OBJECT_START(Constants_Tgammal_log_80_h3_G_H)
+// 32 rows of {h3 IEEE double extended (first three words), G3 IEEE single (fourth word)}, followed by 32 H3 IEEE singles
+data4 0x112666B0,0xAAACAAB1,0x00003FD3,0x3F7FFC00
+data4 0x9B7FAD21,0x90051030,0x00003FD8,0x3F7FF400
+data4 0xF4D783C4,0xA6B46F46,0x00003FDA,0x3F7FEC00
+data4 0x11C6DDCA,0xDA148D88,0x0000BFD8,0x3F7FE400
+data4 0xCA964D95,0xCE65C1D8,0x0000BFD8,0x3F7FDC00
+data4 0x23412D13,0x883838EE,0x0000BFDB,0x3F7FD400
+data4 0x983ED687,0xB7E5CFA1,0x00003FDB,0x3F7FCC08
+data4 0xE3C3930B,0xDBE23B16,0x0000BFD9,0x3F7FC408
+data4 0x48AA4DFC,0x9B92F1FC,0x0000BFDC,0x3F7FBC10
+data4 0xCE9C8F7E,0x9A8CEB15,0x0000BFD9,0x3F7FB410
+data4 0x0DECE74A,0x8C220879,0x00003FDC,0x3F7FAC18
+data4 0x2F053150,0xB25CA912,0x0000BFDA,0x3F7FA420
+data4 0xD9A5BE20,0xA5876555,0x00003FDB,0x3F7F9C20
+data4 0x2053F087,0xC919BB6E,0x00003FD9,0x3F7F9428
+data4 0x041E9A77,0xB70BDA79,0x00003FDC,0x3F7F8C30
+data4 0xEA1C9C30,0xF18A5C08,0x00003FDA,0x3F7F8438
+data4 0x796D89E5,0xA3790D84,0x0000BFDD,0x3F7F7C40
+data4 0xA2915A3A,0xE1852369,0x0000BFDD,0x3F7F7448
+data4 0xA39ED868,0xD803858F,0x00003FDC,0x3F7F6C50
+data4 0x9417EBB7,0xB2EEE356,0x0000BFDD,0x3F7F6458
+data4 0x9BB0D07F,0xED5C1F8A,0x0000BFDC,0x3F7F5C68
+data4 0xE87C740A,0xD6D201A0,0x0000BFDD,0x3F7F5470
+data4 0x1CA74025,0xE8DEBF5E,0x00003FDC,0x3F7F4C78
+data4 0x1F34A7EB,0x9A995A97,0x0000BFDC,0x3F7F4488
+data4 0x359EED97,0x9CB0F742,0x0000BFDA,0x3F7F3C90
+data4 0xBBC6A1C8,0xD6F833C2,0x0000BFDD,0x3F7F34A0
+data4 0xE71090EC,0xE1F68F2A,0x00003FDC,0x3F7F2CA8
+data4 0xC160A74F,0xD1881CF1,0x0000BFDB,0x3F7F24B8
+data4 0xD78CB5A4,0x9AD05AE2,0x00003FD6,0x3F7F1CC8
+data4 0x9A77DC4B,0xE658CB8E,0x0000BFDD,0x3F7F14D8
+data4 0x6BD6D312,0xBA281296,0x00003FDC,0x3F7F0CE0
+data4 0xF95210D0,0xB478BBEB,0x0000BFDB,0x3F7F04F0
+data4 0x38800100,0x39400480,0x39A00640,0x39E00C41 // H's start here
+data4 0x3A100A21,0x3A300F22,0x3A4FF51C,0x3A6FFC1D
+data4 0x3A87F20B,0x3A97F68B,0x3AA7EB86,0x3AB7E101
+data4 0x3AC7E701,0x3AD7DD7B,0x3AE7D474,0x3AF7CBED
+data4 0x3B03E1F3,0x3B0BDE2F,0x3B13DAAA,0x3B1BD766
+data4 0x3B23CC5C,0x3B2BC997,0x3B33C711,0x3B3BBCC6
+data4 0x3B43BAC0,0x3B4BB0F4,0x3B53AF6D,0x3B5BA620
+data4 0x3B639D12,0x3B6B9444,0x3B7393BC,0x3B7B8B6D
+LOCAL_OBJECT_END(Constants_Tgammal_log_80_h3_G_H)
+
+.align 64
+LOCAL_OBJECT_START(Constants_Tgammal_stirling)
+//0.5*ln(2*Pi)=9.1893853320467266954096885e-01 + 7.2239360881843238220057778e-17
+data8 0x3FED67F1C864BEB4, 0x3C94D252F2400510
+// Stirling series coefficients derived from the Bernoulli numbers (B2 as significand + sign/exponent, the rest as IEEE double)
+data8 0xAAAAAAAAAAAAAAAB, 0x00003FFB //B2 = 8.3333333333333333333333333333e-02
+data8 0xBF66C16C16C16C17 //B4 = -2.7777777777777777777777777778e-03
+data8 0x3F4A01A01A01A01A //B6 = 7.9365079365079365079365079365e-04
+data8 0xBF43813813813814 //B8 = -5.9523809523809523809523809524e-04
+data8 0x3F4B951E2B18FF23 //B10 = 8.4175084175084175084175084175e-04
+data8 0xBF5F6AB0D9993C7D //B12 = -1.9175269175269175269175269175e-03
+data8 0x3F7A41A41A41A41A //B14 = 6.4102564102564102564102564103e-03
+data8 0xBF9E4286CB0F5398 //B16 = -2.9550653594771241830065359477e-02
+data8 0x3FC6FE96381E0680 //B18 = 1.7964437236883057316493849002e-01
+data8 0x3FE0000000000000 // 0.5
+LOCAL_OBJECT_END(Constants_Tgammal_stirling)
+
+.align 64
+LOCAL_OBJECT_START(Constants_Tgammal_sin)
+// Polynomial coefficients for sin(Pi*x)/Pi, 0 <= |x| < 0.5 (A2, A1 as hi + lo IEEE double pairs; A9..A3 as significand + sign/exponent)
+//A2 = 8.1174242528335360802316245099e-01 + 5.1302254650266899774269946201e-18
+data8 0x3FE9F9CB402BC46C, 0x3C57A8B3819B7CEC
+//A1 = -1.6449340668482264060656916627e+00 + -3.0210280454695477893051351574e-17
+data8 0xBFFA51A6625307D3, 0xBC816A402079D0EF
+data8 0xF3AEF1FFCCE6C813, 0x0000BFE3 //A9 = -7.0921197799923779127089910470e-09
+data8 0x87D54408E6D4BB9D, 0x00003FE9 //A8 = 2.5300880778252693946712766029e-07
+data8 0xEA12033DCE7B8ED9, 0x0000BFED //A7 = -6.9758403885461690048189307819e-06
+data8 0x9BA38C952A59D1A8, 0x00003FF2 //A6 = 1.4842878710882320255092707181e-04
+data8 0x99C0B55178FF0E38, 0x0000BFF6 //A5 = -2.3460810348048124421268761990e-03
+data8 0xD63402E798FEC896, 0x00003FF9 //A4 = 2.6147847817611456327417812320e-02
+data8 0xC354723906D95E92, 0x0000BFFC //A3 = -1.9075182412208257558294507774e-01
+LOCAL_OBJECT_END(Constants_Tgammal_sin)
+
+.align 64
+LOCAL_OBJECT_START(Constants_Tgammal_exp_64_Arg)
+data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000 // L_hi = hi part of ln(2)/2^12 (significand low/high words, sign/exponent, pad)
+data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000 // L_lo = lo part of ln(2)/2^12
+LOCAL_OBJECT_END(Constants_Tgammal_exp_64_Arg)
+
+LOCAL_OBJECT_START(Constants_Tgammal_exp_64_A)
+data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000 // A3 (approximately 1/24)
+data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000 // A2 (approximately 1/6)
+data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000 // A1 (approximately 1/2)
+LOCAL_OBJECT_END(Constants_Tgammal_exp_64_A)
+
+LOCAL_OBJECT_START(Constants_Tgammal_exp_64_T1)
+data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29 // 64 IEEE singles: entry j is approximately 2^(j/64), j = 0..63
+data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
+data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC
+data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D
+data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA
+data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516
+data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A
+data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4
+data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B
+data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD
+data4 0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15
+data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B
+data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5
+data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A
+data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177
+data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C
+LOCAL_OBJECT_END(Constants_Tgammal_exp_64_T1)
+
+LOCAL_OBJECT_START(Constants_Tgammal_exp_64_T2)
+// 64 single-precision entries.  Entry j (j = 0..63) is 2^(j/4096) rounded
+// to single precision: the first entry is 1.0 and the last is about
+// 1.010718.  Together with T1 this covers steps of 1/4096 in the exponent.
+data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
+data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
+data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
+data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
+data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
+data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
+data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
+data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
+data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
+data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
+data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
+data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
+data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
+data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
+data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
+data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37
+LOCAL_OBJECT_END(Constants_Tgammal_exp_64_T2)
+
+LOCAL_OBJECT_START(Constants_Tgammal_exp_64_W1)
+// 64 double-precision entries, two per line.  The first entry is zero and
+// every other entry has magnitude below 2^-24.
+// NOTE(review): presumably entry j is the correction that restores the
+// precision lost when 2^(j/64) was rounded to single precision in
+// Constants_Tgammal_exp_64_T1 -- confirm against the exp code.
+data8 0x0000000000000000, 0xBE384454171EC4B4
+data8 0xBE6947414AA72766, 0xBE5D32B6D42518F8
+data8 0x3E68D96D3A319149, 0xBE68F4DA62415F36
+data8 0xBE6DDA2FC9C86A3B, 0x3E6B2E50F49228FE
+data8 0xBE49C0C21188B886, 0x3E64BFC21A4C2F1F
+data8 0xBE6A2FBB2CB98B54, 0x3E5DC5DE9A55D329
+data8 0x3E69649039A7AACE, 0x3E54728B5C66DBA5
+data8 0xBE62B0DBBA1C7D7D, 0x3E576E0409F1AF5F
+data8 0x3E6125001A0DD6A1, 0xBE66A419795FBDEF
+data8 0xBE5CDE8CE1BD41FC, 0xBE621376EA54964F
+data8 0x3E6370BE476E76EE, 0x3E390D1A3427EB92
+data8 0x3E1336DE2BF82BF8, 0xBE5FF1CBD0F7BD9E
+data8 0xBE60A3550CEB09DD, 0xBE5CA37E0980F30D
+data8 0xBE5C541B4C082D25, 0xBE5BBECA3B467D29
+data8 0xBE400D8AB9D946C5, 0xBE5E2A0807ED374A
+data8 0xBE66CB28365C8B0A, 0x3E3AAD5BD3403BCA
+data8 0x3E526055C7EA21E0, 0xBE442C75E72880D6
+data8 0x3E58B2BB85222A43, 0xBE5AAB79522C42BF
+data8 0xBE605CB4469DC2BC, 0xBE589FA7A48C40DC
+data8 0xBE51C2141AA42614, 0xBE48D087C37293F4
+data8 0x3E367A1CA2D673E0, 0xBE51BEBB114F7A38
+data8 0xBE6348E5661A4B48, 0xBDF526431D3B9962
+data8 0x3E3A3B5E35A78A53, 0xBE46C46C1CECD788
+data8 0xBE60B7EC7857D689, 0xBE594D3DD14F1AD7
+data8 0xBE4F9C304C9A8F60, 0xBE52187302DFF9D2
+data8 0xBE5E4C8855E6D68F, 0xBE62140F667F3DC4
+data8 0xBE36961B3BF88747, 0x3E602861C96EC6AA
+data8 0xBE3B5151D57FD718, 0x3E561CD0FC4A627B
+data8 0xBE3A5217CA913FEA, 0x3E40A3CC9A5D193A
+data8 0xBE5AB71310A9C312, 0x3E4FDADBC5F57719
+data8 0x3E361428DBDF59D5, 0x3E5DB5DB61B4180D
+data8 0xBE42AD5F7408D856, 0x3E2A314831B2B707
+LOCAL_OBJECT_END(Constants_Tgammal_exp_64_W1)
+
+LOCAL_OBJECT_START(Constants_Tgammal_exp_64_W2)
+// 64 double-precision entries, two per line.  The first entry is zero and
+// every other entry has magnitude below 2^-24.
+// NOTE(review): presumably entry j is the correction that restores the
+// precision lost when 2^(j/4096) was rounded to single precision in
+// Constants_Tgammal_exp_64_T2 -- confirm against the exp code.
+data8 0x0000000000000000, 0xBE641F2537A3D7A2
+data8 0xBE68DD57AD028C40, 0xBE5C77D8F212B1B6
+data8 0x3E57878F1BA5B070, 0xBE55A36A2ECAE6FE
+data8 0xBE620608569DFA3B, 0xBE53B50EA6D300A3
+data8 0x3E5B5EF2223F8F2C, 0xBE56A0D9D6DE0DF4
+data8 0xBE64EEF3EAE28F51, 0xBE5E5AE2367EA80B
+data8 0x3E47CB1A5FCBC02D, 0xBE656BA09BDAFEB7
+data8 0x3E6E70C6805AFEE7, 0xBE6E0509A3415EBA
+data8 0xBE56856B49BFF529, 0x3E66DD3300508651
+data8 0x3E51165FC114BC13, 0x3E53333DC453290F
+data8 0x3E6A072B05539FDA, 0xBE47CD877C0A7696
+data8 0xBE668BF4EB05C6D9, 0xBE67C3E36AE86C93
+data8 0xBE533904D0B3E84B, 0x3E63E8D9556B53CE
+data8 0x3E212C8963A98DC8, 0xBE33138F032A7A22
+data8 0x3E530FA9BC584008, 0xBE6ADF82CCB93C97
+data8 0x3E5F91138370EA39, 0x3E5443A4FB6A05D8
+data8 0x3E63DACD181FEE7A, 0xBE62B29DF0F67DEC
+data8 0x3E65C4833DDE6307, 0x3E5BF030D40A24C1
+data8 0x3E658B8F14E437BE, 0xBE631C29ED98B6C7
+data8 0x3E6335D204CF7C71, 0x3E529EEDE954A79D
+data8 0x3E5D9257F64A2FB8, 0xBE6BED1B854ED06C
+data8 0x3E5096F6D71405CB, 0xBE3D4893ACB9FDF5
+data8 0xBDFEB15801B68349, 0x3E628D35C6A463B9
+data8 0xBE559725ADE45917, 0xBE68C29C042FC476
+data8 0xBE67593B01E511FA, 0xBE4A4313398801ED
+data8 0x3E699571DA7C3300, 0x3E5349BE08062A9E
+data8 0x3E5229C4755BB28E, 0x3E67E42677A1F80D
+data8 0xBE52B33F6B69C352, 0xBE6B3550084DA57F
+data8 0xBE6DB03FD1D09A20, 0xBE60CBC42161B2C1
+data8 0x3E56ED9C78A2B771, 0xBE508E319D0FA795
+data8 0xBE59482AFD1A54E9, 0xBE2A17CEB07FD23E
+data8 0x3E68BF5C17365712, 0x3E3956F9B3785569
+LOCAL_OBJECT_END(Constants_Tgammal_exp_64_W2)
+
+
+
+LOCAL_OBJECT_START(Constants_Tgammal_poly)
+// Degree-20 polynomial coefficients for six intervals.  Every interval uses
+// the same interleaved layout, 21 entries of 16 bytes each:
+//   A5, A4, A3   - two doubles each (high part, low part)
+//   A20 .. A13   - 64-bit significand word, then sign/exponent word
+//   A2, A1, A0   - two doubles each (high part, low part)
+//   A12 .. A6    - 64-bit significand word, then sign/exponent word
+// A0 equals the gamma function at the interval midpoint (for example
+// 1.3293... = Gamma(2.5) and 11.6317... = Gamma(4.5)).
+
+// Polynomial coefficients for the tgammal(x), 2 <= |x| < 3
+//A5 = 2.8360780594841213109180699803e-02 + 2.2504152891014320704380000000e-19
+data8 0x3F9D0A9BC49353D2, 0x3C109AEA0F23CE2D
+//A4 = 1.0967323400216015538699565468e-01 + 9.9225166000430644587276000000e-18
+data8 0x3FBC138B89492C5B, 0x3C66E138506D5652
+//A3 = 2.5387124684114281691904579930e-01 + 2.2667777637607113205546600000e-17
+data8 0x3FD03F6D2FA4F4F8, 0x3C7A2258DA8CD8B1
+data8 0xC5866457328BC39B, 0x00003FE3 //A20 = 5.7487331964156762795056629138e-09
+data8 0xE93D9F1ACD59C929, 0x0000BFE4 //A19= -1.3576396100397317396956445658e-08
+data8 0xE33389C8F6CBA813, 0x00003FE5 //A18 = 2.6449714924964597501721434271e-08
+data8 0x8FE7B25B9CD26D2A, 0x0000BFE7 //A17= -6.7011017946055513660266853311e-08
+data8 0xB89F4721BFBC15B0, 0x00003FE8 //A16 = 1.7194280320370423615174419192e-07
+data8 0xE49CBDC1874EBABA, 0x0000BFE9 //A15= -4.2582353660153782928729466776e-07
+data8 0x913AF50A336129CA, 0x00003FEB //A14 = 1.0820500665257088283172211622e-06
+data8 0xABCF0F7313B3B332, 0x0000BFEC //A13= -2.5601510627710417669568115706e-06
+//A2 = 6.5455857798133676439533701341e-01 + 1.3292075193155190798867000000e-18
+data8 0x3FE4F224D4B7E01C, 0x3C3885014A2B8319
+//A1 = 9.3473452162608550164435428087e-01 + 3.2785154201417136611642400000e-17
+data8 0x3FEDE9585F1A7093, 0x3C82E63C1B5028BF
+//A0 = 1.3293403881791368004172682049e+00 + 2.2005689328949279282607500000e-16
+data8 0x3FF544FA6D47B38F, 0x3CAFB6AA9829E81F
+data8 0xF3668F799997C76D, 0x00003FED //A12 = 7.2539039479124273660331538367e-06
+data8 0xD6C6BBD54CDEAEB1, 0x0000BFEE //A11= -1.2801665282681088568639378920e-05
+data8 0x809E4763B06F6883, 0x00003FF1 //A10 = 6.1329973609906572700697893187e-05
+data8 0x8443B000F8F9A71A, 0x00003FED //A9 = 3.9417864189995544394564413428e-06
+data8 0xC5C7E6D62A6991D8, 0x00003FF4 //A8 = 7.5447412886334708803357581519e-04
+data8 0xD2AF690725C62D88, 0x00003FF5 //A7 = 1.6074004848394703022110823298e-03
+data8 0xAA44E635D4B7B682, 0x00003FF8 //A6 = 1.0392403425906843901680697839e-02
+//
+// Polynomial coefficients for the tgammal(x), 4 <= |x| < 5
+//A5 = 1.1600674810589555185913468449e+00 + 3.0229979112715124660731000000e-17
+data8 0x3FF28FA2EB44D22E, 0x3C816D285234C815
+//A4 = 3.1374268565470946334983182169e+00 + 1.3694868953995008497659600000e-16
+data8 0x400919734073B1E1, 0x3CA3BC83CD7E9565
+//A3 = 7.0834593993741057360580271052e+00 + 3.3899702569039156457249800000e-16
+data8 0x401C5576617B6C1F, 0x3CB86D6431213296
+data8 0xA4A5FB49C094966B, 0x00003FDA //A20 = 9.3591760106637809309720130828e-12
+data8 0xA9260DA0F51D7ED8, 0x00003FDD //A19 = 7.6919898428091669411809372180e-11
+data8 0xA16441DFB14BD6E1, 0x00003FE0 //A18 = 5.8713933014370867331213494535e-10
+data8 0x95F098D9C2234849, 0x00003FE3 //A17 = 4.3638234584169302324461091035e-09
+data8 0x8581817400E5AD2B, 0x00003FE6 //A16 = 3.1084260332429955234755367839e-08
+data8 0xE272940E373EBE15, 0x00003FE8 //A15 = 2.1089573544273993580820317236e-07
+data8 0xB6B3391145D226FB, 0x00003FEB //A14 = 1.3612217421122787182942706259e-06
+data8 0x8B9428C4DF95FCD5, 0x00003FEE //A13 = 8.3195416382628990683949003789e-06
+//A2 = 1.2665135075272345943631080445e+01 + 9.8721896915973874255877000000e-16
+data8 0x4029548C95A76F38, 0x3CD1C8BE715B8E13
+//A1 = 1.6154969393303069580269948347e+01 + 9.6850518810678379641029000000e-16
+data8 0x403027AC12FC1E1E, 0x3CD172711C15501B
+//A0 = 1.1631728396567448058362970187e+01 + 8.7078125362814179268673000000e-16
+data8 0x40274371E7866C65, 0x3CCF5F8A1A5FACA0
+data8 0xC94A903114272C03, 0x00003FF0 //A12 = 4.7991576836334427243159066630e-05
+data8 0x8844262960E04BE6, 0x00003FF3 //A11 = 2.5990716419283017929486175141e-04
+data8 0xAC5418A76767678D, 0x00003FF5 //A10 = 1.3147621245497801180184809726e-03
+data8 0xCA231B6EFE959132, 0x00003FF7 //A9 = 6.1687358811367989146517222415e-03
+data8 0xDA38E39C13819D2A, 0x00003FF9 //A8 = 2.6638454961912040754759086920e-02
+data8 0xD696DF8D8389FE53, 0x00003FFB //A7 = 1.0477995539298934056097943975e-01
+data8 0xBDD5C153048BC435, 0x00003FFD //A6 = 3.7077144754791605130056406006e-01
+//
+// Polynomial coefficients for the tgammal(x), 6 <= |x| < 7
+//A5 = 6.7169398121054200601065531373e+01 + 2.9481001527213915901489600000e-15
+data8 0x4050CAD76B377BA0, 0x3CEA8DDB2B2DE93E
+//A4 = 1.6115104376855398982115730178e+02 + 1.3422421925418824418257300000e-14
+data8 0x406424D559BDC687, 0x3D0E397FDB5B33DC
+//A3 = 3.1812194028053562533386866562e+02 + 3.9881709875858650942409600000e-14
+data8 0x4073E1F377A6CF73, 0x3D26738F63FE9C4C
+data8 0xD6E1B5FF90CAABD3, 0x00003FE1 //A20 = 1.5634700199277480081025480635e-09
+data8 0xD451987B925DD37E, 0x00003FE4 //A19 = 1.2358576813211397717382327174e-08
+data8 0xBFC151B67FA58E6B, 0x00003FE7 //A18 = 8.9292951435632759686382657901e-08
+data8 0xA9034C5E1D67572E, 0x00003FEA //A17 = 6.2962205718327848327368724720e-07
+data8 0x8E40F6EAA30A71EC, 0x00003FED //A16 = 4.2394926442967995119170095258e-06
+data8 0xE3C3541B03A1C350, 0x00003FEF //A15 = 2.7151465666109594512258841637e-05
+data8 0xACE2E58436B2DDCE, 0x00003FF2 //A14 = 1.6487723793339152877117376243e-04
+data8 0xF7EAF8D8D1CAA3D1, 0x00003FF4 //A13 = 9.4573158112768812533636022369e-04
+//A2 = 4.8664351544258869353143381886e+02 + 4.7424047995944376868895400000e-14
+data8 0x407E6A4BD6D9463B, 0x3D2AB2868D79E192
+//A1 = 5.1615277644992545447166776285e+02 + 3.0901956935588717379242200000e-14
+data8 0x40802138E2DC003B, 0x3D216570FB601AEA
+//A0 = 2.8788527781504433278314536437e+02 + 2.8213174117085164944959600000e-14
+data8 0x4071FE2A1911F7D6, 0x3D1FC3E4CF4DB5AF
+data8 0xA72B88E48D3D1BAB, 0x00003FF7 //A12 = 5.1016252919939028020562237471e-03
+data8 0xD2EFB1067DB4FFB2, 0x00003FF9 //A11 = 2.5749059441230515023024615917e-02
+data8 0xF788AF9522205C24, 0x00003FFB //A10 = 1.2086617635601742290221382521e-01
+data8 0x861A6CE06CB29EAF, 0x00003FFE //A9 = 5.2384071807018493367136112163e-01
+data8 0x84FBDE0947718B58, 0x00004000 //A8 = 2.0778727617851237754568261869e+00
+data8 0xEEC1371E265A2C3A, 0x00004001 //A7 = 7.4610858525146049022238037342e+00
+data8 0xBF514B9BE68ED59D, 0x00004003 //A6 = 2.3914694993947572859629197920e+01
+//
+// Polynomial coefficients for the tgammal(x), 8 <= |x| < 9
+//A5 = 5.8487447114416836484451778233e+03 + 4.7365465221455983144182900000e-13
+data8 0x40B6D8BEA568B6FD, 0x3D60AA4D44C2589B
+//A4 = 1.2796464063087094473303295672e+04 + 1.2373341702514898266244200000e-12
+data8 0x40C8FE3B666B532D, 0x3D75C4752C5B4783
+//A3 = 2.2837606581322281272150576115e+04 + 2.6598064610627891398831000000e-13
+data8 0x40D64D66D23A7764, 0x3D52B77B3A10EA5C
+data8 0xB23418F75B0BE22A, 0x00003FE9 //A20 = 3.3192989594206801808678663868e-07
+data8 0xA984A7BC8B856ED2, 0x00003FEC //A19 = 2.5260177918662350066375115788e-06
+data8 0x921A49729416372C, 0x00003FEF //A18 = 1.7416797068239475136398213598e-05
+data8 0xF5BB9415CC399CA4, 0x00003FF1 //A17 = 1.1717449586392814601938207599e-04
+data8 0xC50B91A40B81F9DF, 0x00003FF4 //A16 = 7.5166775151159345732094429036e-04
+data8 0x96002572326DB203, 0x00003FF7 //A15 = 4.5776541559407384162139204300e-03
+data8 0xD81A1A595E4157BA, 0x00003FF9 //A14 = 2.6379634345126284099420760736e-02
+data8 0x92B700D0CFECADD8, 0x00003FFC //A13 = 1.4327622675407940907282658100e-01
+//A2 = 3.1237895525940199149772524834e+04 + 3.1280450505163186432331700000e-12
+data8 0x40DE8179504C0878, 0x3D8B83BB33FBB766
+//A1 = 2.9192841741344487672904506326e+04 + 7.9300780509779689630767000000e-13
+data8 0x40DC8235DF171691, 0x3D6BE6C780EE54DF
+//A0 = 1.4034407293483411194756627083e+04 + 1.4038139346291543309253700000e-12
+data8 0x40CB693422315F90, 0x3D78B23746113FCE
+data8 0xBAE50807548BC711, 0x00003FFE //A12 = 7.3005724123917935346868107005e-01
+data8 0xDE28B1F57E68CFB6, 0x00004000 //A11 = 3.4712338349724065462763671443e+00
+data8 0xF4DCA5A5FF901118, 0x00004002 //A10 = 1.5303868912154033908205911714e+01
+data8 0xF85AAA1AD5E84E5E, 0x00004004 //A9 = 6.2088539523416399361048051373e+01
+data8 0xE5AA8BB1BF02934D, 0x00004006 //A8 = 2.2966619406617480799195651466e+02
+data8 0xBF6CFEFD67F59845, 0x00004008 //A7 = 7.6570306334640770654588802417e+02
+data8 0x8DB5D2F001635C29, 0x0000400A //A6 = 2.2673639984182571062068713002e+03
+//
+// Polynomial coefficients for the tgammal(x), 10 <= |x| < 11
+//A5 = 7.2546009516580589115619659424e+05 + 1.0343348865365065212891728822e-10
+data8 0x412623A830B99290, 0x3DDC6E7C157611C4
+//A4 = 1.4756292870840241666883230209e+06 + 8.1516565365333844166705674775e-11
+data8 0x4136842D497E56AF, 0x3DD66837E4C3F9EE
+//A3 = 2.4356116926500420086085796356e+06 + 3.5508860076560925641351069404e-10
+data8 0x4142950DD8A8C1AF, 0x3DF866C8E3DD0980
+data8 0xB7FD0D1EEAC38EB4, 0x00003FF1 //A20 = 8.7732544640091602721643775932e-05
+data8 0xA9345C64AC750AE9, 0x00003FF4 //A19 = 6.4546407626804942279126469603e-04
+data8 0x8BEABC81BE1E93C9, 0x00003FF7 //A18 = 4.2699261134524096128048819443e-03
+data8 0xE1CD281EDD7315F8, 0x00003FF9 //A17 = 2.7563646660310313164706189622e-02
+data8 0xAD8A5BA6D0FD9758, 0x00003FFC //A16 = 1.6947310643831556048460963841e-01
+data8 0xFCDDA464AD3F182E, 0x00003FFE //A15 = 9.8775699098518676937088606052e-01
+data8 0xAE0DCE2F7B60D1AE, 0x00004001 //A14 = 5.4391852309591064073782104822e+00
+data8 0xE1745D9ABEB8D1A7, 0x00004003 //A13 = 2.8181819161363002758615770457e+01
+//A2 = 3.0619656223573554307222366333e+06 + 1.0819940302945474471259520006e-10
+data8 0x41475C66CFA967E4, 0x3DDDBDDB2A27334B
+//A1 = 2.6099413018962685018777847290e+06 + 3.6851882860056025385268615240e-10
+data8 0x4143E98AA6A48974, 0x3DF9530D42589AB6
+//A0 = 1.1332783889487853739410638809e+06 + 1.9339350553312096248591829758e-10
+data8 0x41314ADE639225C9, 0x3DEA946DD6C2C8D3
+data8 0x88BCFAAE71812A1C, 0x00004006 //A12 = 1.3673820009490115307300592012e+02
+data8 0x9A770F5AB540A326, 0x00004008 //A11 = 6.1786031215382040427126476507e+02
+data8 0xA170C1D2C6B413FC, 0x0000400A //A10 = 2.5830473201524594051391525170e+03
+data8 0x9AE56061CB02EB55, 0x0000400C //A9 = 9.9133441230507404119297200255e+03
+data8 0x872390769650FBE2, 0x0000400E //A8 = 3.4595564309496661629764193479e+04
+data8 0xD3E5E8D6923910C1, 0x0000400F //A7 = 1.0849181904819284819615140521e+05
+data8 0x930D70602F50B754, 0x00004011 //A6 = 3.0116351174131169193070583741e+05
+//
+// Polynomial coefficients for the tgammal(x), 12 <= |x| < 13
+//A5 = 1.2249876249976964294910430908e+08 + 6.0051348061679753770848000000e-09
+data8 0x419D34BB29FFC39D, 0x3E39CAB72E01818D
+//A4 = 2.3482765927605420351028442383e+08 + 1.1874729051592862323641700000e-08
+data8 0x41ABFE5F168D56FA, 0x3E4980338AA7B04B
+//A3 = 3.6407329688125067949295043945e+08 + 2.6657200942150363994658700000e-08
+data8 0x41B5B35150E199A5, 0x3E5C9F79C0EB5300
+data8 0xE89AE0F8D726329D, 0x00003FF9 //A20 = 2.8394164465429105626588451540e-02
+data8 0xCF90981F86E38013, 0x00003FFC //A19 = 2.0270002071785908652476845915e-01
+data8 0xA56C658079CA8C4A, 0x00003FFF //A18 = 1.2923704984019263122675412350e+00
+data8 0x80AEF96A67C5615A, 0x00004002 //A17 = 8.0427183300456238315262463506e+00
+data8 0xBE886D7529678931, 0x00004004 //A16 = 4.7633230047847868242503413461e+01
+data8 0x858EDBA4CE2F7508, 0x00004007 //A15 = 2.6711607799594541057655957154e+02
+data8 0xB0B0A3AF388274F0, 0x00004009 //A14 = 1.4135199810126975119809102782e+03
+data8 0xDBA87137988751EF, 0x0000400B //A13 = 7.0290552818218513870879313985e+03
+//A2 = 4.2828433593031734228134155273e+08 + 3.9760422293645854535247300000e-08
+data8 0x41B98719AFEE2947, 0x3E6558A17E0D3007
+//A1 = 3.4008253676084774732589721680e+08 + 1.2558352335001093116071000000e-09
+data8 0x41B4453F68C2C6EB, 0x3E159338C5BC7EC3
+//A0 = 1.3684336546556583046913146973e+08 + 2.6786516700381562934240300000e-08
+data8 0x41A05020CAEE5EA5, 0x3E5CC3058A858579
+data8 0xFF5E3940FB4BA576, 0x0000400D //A12 = 3.2687111823895439312116108631e+04
+data8 0x8A08C124C7F74B6C, 0x00004010 //A11 = 1.4134701786994123329786229006e+05
+data8 0x89D701953540BFFB, 0x00004012 //A10 = 5.6459209892773907605385652281e+05
+data8 0xFC46344B3116C3AD, 0x00004013 //A9 = 2.0666305367147234406757715163e+06
+data8 0xD183EBD7A400151F, 0x00004015 //A8 = 6.8653979211730981618367536737e+06
+data8 0x9C083A40742112F4, 0x00004017 //A7 = 2.0451444503543981795037456447e+07
+data8 0xCD3C475B1A8B6662, 0x00004018 //A6 = 5.3801245423495149598177886823e+07
+LOCAL_OBJECT_END(Constants_Tgammal_poly)
+
+
+LOCAL_OBJECT_START(Constants_Tgammal_poly_splitted)
+// Degree-20 polynomial coefficients for four sub-intervals of small |x|.
+// Every sub-interval uses the same interleaved layout, 21 entries of
+// 16 bytes each:
+//   A5, A4, A3   - two doubles each (high part, low part)
+//   A20 .. A13   - 64-bit significand word, then sign/exponent word
+//   A2, A1, A0   - two doubles each (high part, low part)
+//   A12 .. A6    - 64-bit significand word, then sign/exponent word
+
+// Polynomial coefficients for the tgammal(x), 1 <= |x| < 1.25
+//A5 = -9.8199506890310417350775651357e-01+ -3.2546247786122976510752200000e-17
+data8 0xBFEF6C80EC38B509, 0xBC82C2FA7A3DE3BD
+//A4 = 9.8172808683439960475425323239e-01 + 4.4847611775298520359811400000e-17
+data8 0x3FEF6A51055096B0, 0x3C89DA56DE95EFE4
+//A3 = -9.0747907608088618225394839101e-01 +-1.0244057366544064435443970000e-16
+data8 0xBFED0A118F324B62, 0xBC9D86C7B9EBCFFF
+data8 0xB8E3FDAA66CC738E, 0x00003FFB //A20 = 9.0278608095877488976217714815e-02
+data8 0xA76067AE1738699C, 0x0000BFFD //A19 =-3.2690738678103132837070881737e-01
+data8 0x9D66B13718408C44, 0x00003FFE //A18 = 6.1484820933424283818320582920e-01
+data8 0xD4AC67BBB4AE5599, 0x0000BFFE //A17 =-8.3075569470082063491389474937e-01
+data8 0xF1426ED1C1488DB3, 0x00003FFE //A16 = 9.4241993542644505594957058785e-01
+data8 0xFC12EB07AA6F4B6B, 0x0000BFFE //A15 =-9.8466366707947121954333549690e-01
+data8 0xFF2B32CFE5B0DDC8, 0x00003FFE //A14 = 9.9675290656677214804168895915e-01
+data8 0xFFD8E7E6FF3662EA, 0x0000BFFE //A13 =-9.9940347089360552383472582319e-01
+//A2 = 9.8905599532797250361682017683e-01 + 5.1760162410376024240867300000e-17
+data8 0x3FEFA658C23B1578, 0x3C8DD673A61F6FE7
+//A1 = -5.7721566490153275452712478000e-01+ -1.0607935612223465065923310000e-16
+data8 0xBFE2788CFC6FB618, 0xBC9E9346622D53B7
+//A0 = 9.9999999999999988897769753748e-01 + 1.1102230245372554544790880000e-16
+data8 0x3FEFFFFFFFFFFFFF, 0x3C9FFFFFFFF51E4E
+data8 0xFFF360DF628F0BC9, 0x00003FFE //A12 = 9.9980740979895815468216470840e-01
+data8 0xFFEF8F9A72B40480, 0x0000BFFE //A11 = -9.9974916001038145045939523470e-01
+data8 0xFFE037B8C7E39952, 0x00003FFE //A10 = 9.9951504002809911822597567307e-01
+data8 0xFFC01E08F348BED2, 0x0000BFFE //A9 = -9.9902522772325406705059517941e-01
+data8 0xFF83DAC83119B52C, 0x00003FFE //A8 = 9.9810569179053383842734164901e-01
+data8 0xFEF9F8AB891ABB24, 0x0000BFFE //A7 = -9.9600176036720260345608796766e-01
+data8 0xFE3F0537573C8235, 0x00003FFE //A6 = 9.9314911461918778676646301341e-01
+//
+// Polynomial coefficients for the tgammal(x), 1.25 <= |x| < 1.75
+//A5 = -7.7523052299853054125655660300e-02+ -1.2693512521686721504433600000e-17
+data8 0xBFB3D88CFE50601B, 0xBC6D44ED60EE2170
+//A4 = 1.4464535904462152982041800442e-01 + 2.5426820829345729856648800000e-17
+data8 0x3FC283BD374EB2A9, 0x3C7D50AC436187C3
+//A3 = -1.0729480456477220873257039102e-01+ -6.2429894945456418196551000000e-18
+data8 0xBFBB77AC1CA2EBA5, 0xBC5CCA6BCC422D41
+data8 0xF732D2689F323283, 0x00003FF2 //A20 = 2.3574688251652899567587145422e-04
+data8 0xB6B00E23DE89D13A, 0x0000BFF3 //A19 =-3.4844916488842618776630058875e-04
+data8 0xE98396FE4A1B2799, 0x00003FF3 //A18 =4.4539265198744452020440735977e-04
+data8 0xAF8D235A640DB1A2, 0x0000BFF4 //A17 =-6.6967514303333563295261178346e-04
+data8 0x8513B736C918B261, 0x00003FF5 //A16 = 1.0152970456990865810615917715e-03
+data8 0xC790A1A2C78D8E17, 0x0000BFF5 //A15 =-1.5225598630329403515321688394e-03
+data8 0x959706CFA638CDE2, 0x00003FF6 //A14 = 2.2825614575133879623648932383e-03
+data8 0xE050A6021E129860, 0x0000BFF6 //A13 =-3.4227757733947066666295285936e-03
+//A2 = 4.1481345368830113695679528973e-01 + 3.1252439808354284892632100000e-17
+data8 0x3FDA8C4DBA620D56, 0x3C82040BCB483C76
+//A1 = 3.2338397448885010387886751460e-02 + 3.4437825798552300531443100000e-18
+data8 0x3FA08EA88EE561B1, 0x3C4FC366D6C64806
+//A0 = 8.8622692545275794095971377828e-01 + 7.2689375867553992399219000000e-17
+data8 0x3FEC5BF891B4EF6A, 0x3C94F3877D311C0C
+data8 0xA8275AADC09D16FC, 0x00003FF7 //A12 = 5.1316445128621071486146117136e-03
+data8 0xFBFE2CE9215267A2, 0x0000BFF7 //A11= -7.6902121820788373000579382408e-03
+data8 0xBCC8EEAB67ECD91D, 0x00003FF8 //A10 = 1.1522515369164312742737727262e-02
+data8 0x8D1614BB97E5E8C2, 0x0000BFF9 //A9 = -1.7222443097804730395560633583e-02
+data8 0xD3A963578BE291E3, 0x00003FF9 //A8 = 2.5837606456090186343624210891e-02
+data8 0x9BA7EAE64C42FDF7, 0x0000BFFA //A7 = -3.8001935555045161419575037512e-02
+data8 0xF0115BA1A77607E7, 0x00003FFA //A6 = 5.8610303817173477119764956736e-02
+//
+// Polynomial coefficients for the tgammal(x), 1.75 <= |x| < 2.0
+//A5 = 2.6698206874501426502654943818e-04 + 3.4033756836921062797887300000e-20
+data8 0x3F317F3740FE2A68, 0x3BE417093234B06E
+//A4 = 7.4249010753513894345090307070e-02 + 3.9810018444482764697014200000e-18
+data8 0x3FB301FBB0F25A92, 0x3C525BEFFABB622F
+//A3 = -8.1576919247086265851720554565e-02+ -5.2716624487804746360745000000e-19
+data8 0xBFB4E239984650AC, 0xBC2372F1C4F276FF
+data8 0xFEF3AEE71038E9A3, 0x00003FEB //A20 = 1.8995395865421509009969188571e-06
+data8 0xA11CFA2672BF876A, 0x0000BFEB //A19 =-1.2003868221414015771269244270e-06
+data8 0xF8E107215DAE2164, 0x00003FEC //A18 = 3.7085863210303833432006027217e-06
+data8 0xBCDDD3FC011EF7D6, 0x00003FEC //A17 = 2.8143303971756051015245433043e-06
+data8 0x8683C4687FA22E68, 0x00003FEE //A16 = 8.0177018464360416764308252462e-06
+data8 0xFDA09E5D33E32968, 0x00003FEE //A15 = 1.5117372062443781157389064848e-05
+data8 0xFFB00D0CFF4089B4, 0x00003FEF //A14 = 3.0480348961227424242198174995e-05
+data8 0xFEF6C39566785085, 0x00003FF0 //A13 = 6.0788135974125244644334004947e-05
+//A2 = 4.1184033042643969357854416558e-01 + 1.2103396182129232634761000000e-18
+data8 0x3FDA5B978B96BEBF, 0x3C3653AAD0A139E4
+//A1 = -4.2278433509846713445057275749e-01+ -4.9429151528135657430413000000e-18
+data8 0xBFDB0EE6072093CE, 0xBC56CB907027554F
+//A0 = 1.0000000000000000000000000000e+00 + 1.0969171200000000000000000000e-31
+data8 0x3FF0000000000000, 0x3981CC6A5B20B4D5
+data8 0xFF2B7BA9A8D68C37, 0x00003FF1 //A12 = 1.2167446884801403650547161615e-04
+data8 0xFCA53468E3692EF1, 0x00003FF2 //A11 = 2.4094136329542400976250900707e-04
+data8 0x808D698A9C993615, 0x00003FF4 //A10 = 4.9038845704938303659791698883e-04
+data8 0xF10F8E3FB8BB4AFB, 0x00003FF4 //A9 = 9.1957383840999861214472423976e-04
+data8 0x89E224E42F93F005, 0x00003FF6 //A8 = 2.1039333407187324139473634747e-03
+data8 0xBAF374824937A323, 0x00003FF6 //A7 = 2.8526458211545152218493600470e-03
+data8 0xB6BF7564F52140C6, 0x00003FF8 //A6 = 1.1154045718131014476684982178e-02
+//
+// Polynomial coefficients for the tgammal(x), 0.0 <= |x| < 0.125
+//A5 = -9.8199506890314514073736518185e-01+ -5.9363811993837985890950900000e-17
+data8 0xBFEF6C80EC38B67A, 0xBC911C46B447C81F
+//A4 = 9.8172808683440015986576554496e-01 + 2.7457414262802803699834200000e-17
+data8 0x3FEF6A51055096B5, 0x3C7FA7FF90ACAD1F
+//A3 = -9.0747907608088618225394839101e-01 + -1.0676255850934306734701780000e-16
+data8 0xBFED0A118F324B62, 0xBC9EC5AFB633438D
+data8 0x9217E83FA207CB80, 0x00003FFD //A20 = 2.8533864762086088781083621561e-01
+data8 0xA8DABFA52FDF03EC, 0x0000BFFE //A19= -6.5958783896337186303285832783e-01
+data8 0xE331ED293AF39F9B, 0x00003FFE //A18 = 8.8748056656454687449654731184e-01
+data8 0xF9163C5DDB52419D, 0x0000BFFE //A17= -9.7299554149078295602977718525e-01
+data8 0xFEC0A1C672CB9265, 0x00003FFE //A16 = 9.9512683005268190987854104489e-01
+data8 0xFFD2D65B8EA7B5F4, 0x0000BFFE //A15= -9.9931087241443958201592847861e-01
+data8 0xFFF93AA39EE53445, 0x00003FFE //A14 = 9.9989668364186884793382816496e-01
+data8 0xFFFB99A9A3F5F480, 0x0000BFFE //A13= -9.9993286506283835663204999212e-01
+//A2 = 9.8905599532797250361682017683e-01 + 5.1778575360788420716540100000e-17
+data8 0x3FEFA658C23B1578, 0x3C8DD92B45408D07
+//A1 = -5.7721566490153275452712478000e-01+ -1.0607938730998824663273110000e-16
+data8 0xBFE2788CFC6FB618, 0xBC9E9346F8FDE55B
+//A0 = 9.9999999999999988897769753748e-01 + 1.1102230246251564036631420000e-16
+data8 0x3FEFFFFFFFFFFFFF, 0x3C9FFFFFFFFFFFFF
+data8 0xFFF7FEBB545812C1, 0x00003FFE //A12 = 9.9987785409425126648628395084e-01
+data8 0xFFF00C02E943A3F2, 0x0000BFFE //A11= -9.9975657530855116454438747397e-01
+data8 0xFFE0420AADC53820, 0x00003FFE //A10 = 9.9951565514290485919027183699e-01
+data8 0xFFC01EB42EF27EEB, 0x0000BFFE //A9 = -9.9902526759155739377365522320e-01
+data8 0xFF83DAD0BF23FF12, 0x00003FFE //A8 = 9.9810569378236378800364235948e-01
+data8 0xFEF9F8ABDBCDB2F3, 0x0000BFFE //A7 = -9.9600176044241699109053158187e-01
+data8 0xFE3F05375988491D, 0x00003FFE //A6 = 9.9314911462127599008937257662e-01
+LOCAL_OBJECT_END(Constants_Tgammal_poly_splitted)
+
+.align 64
+LOCAL_OBJECT_START(Constants_Tgammal_common)
+// Shared constants: four doubles (0.5, 1.5, 0.25, 0) followed by the
+// positive overflow threshold (64-bit significand, then sign/exponent).
+data8 0x3FE0000000000000 // 0.5
+data8 0x3FF8000000000000 // 1.5
+data8 0x3FD0000000000000 // 0.25
+data8 0x0000000000000000 // 0
+data8 0xDB718C066B352E21, 0x00004009 // Positive overflow value (about 1755.5)
+LOCAL_OBJECT_END(Constants_Tgammal_common)
+
+
+
+//=======================================================
+// Lgamma registers
+//
+// Symbolic names for the registers of this part.  Several names map to the
+// same physical register (GR_l_Log_Table1/GR_l_BIAS = r34,
+// FR_l_h_3/FR_l_poly_hi = f10, FR_l_GS_hi/FR_l_Y_lo = f13,
+// FR_l_G_1/FR_l_G = f15, FR_l_H_1/FR_l_H = f32,
+// FR_l_h/FR_l_h_1/FR_l_N = f33).
+// NOTE(review): presumably aliased values are never live at the same time
+// -- confirm against the code before changing any assignment.
+
+// General Purpose Registers
+GR_l_Log_Table = r33
+GR_l_Log_Table1 = r34
+GR_l_BIAS = r34
+GR_l_Index1 = r35
+GR_l_Index2 = r36
+GR_l_signif_Z = r37
+GR_l_X_0 = r38
+GR_l_X_1 = r39
+GR_l_X_2 = r40
+GR_l_Z_1 = r41
+GR_l_Z_2 = r42
+GR_l_N = r43
+GR_l_Index3 = r44
+GR_l_Stirling_Table = r45
+GR_l_N_Unbiased = r46
+
+// Floating Point Registers
+FR_l_logl_X = f8
+
+FR_l_h_3 = f10
+FR_l_poly_hi = f10
+FR_l_W = f11
+FR_l_S = f12
+FR_l_GS_hi = f13
+FR_l_Y_lo = f13
+FR_l_r_cor = f14
+FR_l_G_1 = f15
+FR_l_G = f15
+FR_l_H_1 = f32
+FR_l_H = f32
+FR_l_h = f33
+FR_l_h_1 = f33
+FR_l_N = f33
+FR_l_G_2 = f34
+FR_l_H_2 = f35
+FR_l_h_2 = f36
+FR_l_G_3 = f37
+FR_l_log2_hi = f38
+FR_l_GS_lo = f39
+FR_l_H_3 = f40
+FR_l_float_N = f41
+FR_l_Q_4 = f42
+FR_l_Q_3 = f43
+FR_l_Q_2 = f44
+FR_l_Q_1 = f45
+FR_l_Q_5 = f46
+FR_l_Q_6 = f47
+FR_l_log2_lo = f48
+FR_l_r = f49
+FR_l_poly_lo = f50
+FR_l_poly = f51
+FR_l_rsq = f52
+FR_l_Y_lo_res = f53
+
+FR_l_Y0 = f55
+FR_l_Q0 = f56
+FR_l_E0 = f57
+FR_l_E2 = f58
+FR_l_E1 = f59
+FR_l_Y1 = f60
+FR_l_E3 = f61
+FR_l_Y2 = f62
+
+FR_l_Z = f63
+FR_l_Z2 = f64
+FR_l_Z4 = f65
+FR_l_Z8 = f66
+
+FR_l_CH = f67
+FR_l_CL = f68
+
+FR_l_B2 = f69
+FR_l_B4 = f70
+FR_l_B6 = f71
+FR_l_B8 = f72
+FR_l_B10 = f73
+FR_l_B12 = f74
+FR_l_B14 = f75
+FR_l_B16 = f76
+FR_l_B18 = f77
+FR_l_Half = f78
+FR_l_SS = f79
+FR_l_AbsX_m_Half = f80
+FR_l_CXH = f81
+FR_l_CXL = f82
+FR_l_SSCXH = f83
+FR_l_SSCXL = f84
+FR_l_XYH = f85
+FR_l_XYL = f86
+FR_l_Temp = f87
+
+FR_l_logl_YHi = f88
+FR_l_logl_YLo = f89
+
+FR_l_SignedXYH = f123
+
+FR_l_AbsX = f127
+
+
+
+//=======================================================
+// Negative part registers
+//
+// FR_n_Y1 .. FR_n_ResL below are assigned f55-f66, the same registers that
+// the FR_l_Y0 .. FR_l_Z8 names of the Lgamma section use.
+
+// General Purpose Registers
+GR_n_sin_Table = r47
+GR_n_XN = r48
+
+// Floating Point Registers
+FR_n_IXNS = f125
+FR_n_IXN = f126
+
+FR_n_XNS = f90
+FR_n_XS = f91
+FR_n_XS2 = f92
+FR_n_XS2L = f93
+FR_n_XS4 = f94
+FR_n_XS7 = f95
+FR_n_XS8 = f96
+FR_n_TT = f97
+FR_n_TH = f98
+FR_n_TL = f99
+
+FR_n_A2H = f100
+FR_n_A2L = f101
+FR_n_A1H = f102
+FR_n_A1L = f103
+FR_n_A9 = f104
+FR_n_A8 = f105
+FR_n_A7 = f106
+FR_n_A6 = f107
+FR_n_A5 = f108
+FR_n_A4 = f109
+FR_n_A3 = f110
+
+FR_n_PolyH = f111
+FR_n_PolyL = f112
+
+FR_n_Poly1H = f113
+FR_n_SinxH = f113 // the same as FR_n_Poly1H
+FR_n_Poly1L = f114
+FR_n_SinxL = f114 // the same as FR_n_Poly1L
+
+FR_n_Tail = f115
+FR_n_NegOne = f116
+
+FR_n_Y0 = f117
+
+FR_n_Q0 = f118
+FR_n_E0 = f119
+
+FR_n_E2 = f120
+FR_n_E1 = f121
+
+FR_n_Y1 = f55
+FR_n_E3 = f56
+
+FR_n_Y2 = f57
+FR_n_R0 = f58
+
+FR_n_E4 = f59
+FR_n_RcpResH = f60
+
+FR_n_Y3 = f61
+FR_n_R1 = f62
+FR_n_Temp = f63
+
+FR_n_RcpResL = f64
+
+FR_n_ResH = f65
+FR_n_ResL = f66
+
+
+
+
+//=======================================================
+// Exp registers
+//
+// GR_e_signexp_x and GR_e_exp_x share r35.  The expl input and output names
+// share registers as well (X = f123, Y = f124).
+// NOTE(review): presumably the result pair overwrites the argument pair in
+// place -- confirm against the exp code.
+
+// General Purpose Registers
+GR_e_ad_Arg = r33
+GR_e_ad_A = r34
+GR_e_signexp_x = r35
+GR_e_exp_x = r35
+GR_e_exp_mask = r36
+GR_e_ad_W1 = r37
+GR_e_ad_W2 = r38
+GR_e_M2 = r39
+GR_e_M1 = r40
+GR_e_K = r41
+GR_e_exp_2_mk = r42
+GR_e_exp_2_k = r43
+GR_e_ad_T1 = r44
+GR_e_ad_T2 = r45
+GR_e_N_fix = r46
+GR_e_one = r47
+GR_e_exp_bias = r48
+GR_e_sig_inv_ln2 = r49
+GR_e_rshf_2to51 = r50
+GR_e_exp_2tom51 = r51
+GR_e_rshf = r52
+
+// Floating Point Registers
+FR_e_RSHF_2TO51 = f10
+FR_e_INV_LN2_2TO63 = f11
+FR_e_W_2TO51_RSH = f12
+FR_e_2TOM51 = f13
+FR_e_RSHF = f14
+FR_e_Y_hi = f15
+FR_e_Y_lo = f32
+FR_e_scale = f33
+FR_e_float_N = f34
+FR_e_N_signif = f35
+FR_e_L_hi = f36
+FR_e_L_lo = f37
+FR_e_r = f38
+FR_e_W1 = f39
+FR_e_T1 = f40
+FR_e_W2 = f41
+FR_e_T2 = f42
+FR_e_W1_p1 = f43
+FR_e_rsq = f44
+FR_e_A2 = f45
+FR_e_r4 = f46
+FR_e_A3 = f47
+FR_e_poly = f48
+FR_e_T = f49
+FR_e_W = f50
+FR_e_Wp1 = f51
+FR_e_r6 = f52
+FR_e_2_mk = f53
+FR_e_A1 = f54
+FR_e_T_scale = f55
+FR_e_result_lo = f56
+FR_e_W_T_scale = f57
+FR_e_Wp1_T_scale = f58
+
+FR_e_expl_Input_X = f123
+FR_e_expl_Input_Y = f124
+FR_e_expl_Output_X = f123
+FR_e_expl_Output_Y = f124
+
+
+FR_e_expl_Input_AbsX = f122
+
+
+
+//=======================================================
+// Common registers
+//
+// The general registers r53-r58 are not assigned by any other section
+// visible here.  FR_c_PosOverflow (f123) and FR_c_XN (f124) use the same
+// registers as the FR_e_expl_Input/Output X and Y names.
+
+// General Purpose Registers
+GR_c_Table = r53
+GR_c_NegUnderflow = r54
+GR_c_NegSingularity = r55
+GR_c_X = r56
+GR_c_SignBit = r57
+GR_c_13 = r58
+
+
+// Floating Point Registers
+FR_c_PosOverflow = f123
+FR_c_XN = f124
+
+
+//=======================================================
+// Polynomial part registers
+//
+// Coefficients A20..A0 occupy f37-f63.  Many of the intermediate names
+// further down (FR_p_XR*, FR_p_Poly*, FR_p_Temp*, FR_p_PolyTail,
+// FR_p_OddPoly0*) are assigned registers that already hold a coefficient,
+// e.g. FR_p_XR8 = f37 = FR_p_A20 and FR_p_XR2L = f52 = FR_p_A5H.
+// FR_p_Temp2L and FR_p_0p25 also share f73.
+// NOTE(review): the evaluation order must consume a coefficient before its
+// register is reused -- confirm against the code before reordering.
+
+// General Purpose Registers
+GR_p_Table = r59
+GR_p_XN = r33
+GR_p_Table2 = r34
+GR_p_Int = r35
+GR_p_Offset = r36
+GR_p_Offset2 = r38
+GR_p_X_Sgnd = GR_l_signif_Z // = r37
+GR_p_Exp = r61
+GR_p_Bias = r62
+GR_p_0p75 = r63
+
+// Floating Point Registers
+FR_p_AbsX = FR_l_AbsX // = f127
+FR_p_IXN = FR_n_IXN // = f126
+FR_p_XN = f32
+FR_p_0p5 = f33
+FR_p_1p5 = f34
+FR_p_AbsXM1 = f35
+FR_p_2 = f36
+
+FR_p_A20 = f37
+FR_p_A19 = f38
+FR_p_A18 = f39
+FR_p_A17 = f40
+FR_p_A16 = f41
+FR_p_A15 = f42
+FR_p_A14 = f43
+FR_p_A13 = f44
+FR_p_A12 = f45
+FR_p_A11 = f46
+FR_p_A10 = f47
+FR_p_A9 = f48
+FR_p_A8 = f49
+FR_p_A7 = f50
+FR_p_A6 = f51
+FR_p_A5H = f52
+FR_p_A5L = f53
+FR_p_A4H = f54
+FR_p_A4L = f55
+FR_p_A3H = f56
+FR_p_A3L = f57
+FR_p_A2H = f58
+FR_p_A2L = f59
+FR_p_A1H = f60
+FR_p_A1L = f61
+FR_p_A0H = f62
+FR_p_A0L = f63
+
+FR_p_XR = f64
+FR_p_XR2 = f65
+FR_p_XR2L = f52
+
+FR_p_XR3 = f58
+FR_p_XR3L = f38
+
+FR_p_XR4 = f42
+FR_p_XR6 = f40
+FR_p_XR8 = f37
+
+FR_p_Poly5H = f66
+FR_p_Poly5L = f67
+FR_p_Poly4H = f53
+FR_p_Poly4L = f44
+FR_p_Poly3H = f41
+FR_p_Poly3L = f47
+FR_p_Poly2H = f68
+FR_p_Poly2L = f54
+FR_p_Poly1H = f55
+FR_p_Poly1L = f46
+FR_p_Poly0H = f39
+FR_p_Poly0L = f43
+
+FR_p_Temp5H = f69
+FR_p_Temp5L = f70
+FR_p_Temp4H = f71
+FR_p_Temp4L = f60
+FR_p_Temp2H = f72
+FR_p_Temp2L = f73
+FR_p_Temp1H = f59
+FR_p_Temp1L = f61
+FR_p_Temp0H = f49
+FR_p_Temp0L = f48
+FR_p_PolyTail = f45
+FR_p_OddPoly0H = f56
+FR_p_OddPoly0L = f51
+
+FR_p_0p25 = f73
+
+
+//=======================================================
+// Negative polynomial part registers
+// General Purpose Registers
+GR_r_sin_Table = r47
+GR_r_sin_Table2 = r60
+
+// Floating Point Registers
+FR_r_IXNS = FR_n_IXNS
+FR_r_IXN = FR_n_IXN
+
+FR_r_AbsX = FR_l_AbsX
+
+FR_r_A9 = f74
+FR_r_A8 = f75
+FR_r_A7 = f76
+FR_r_A6 = f77
+FR_r_A5 = f78
+FR_r_A4 = f79
+FR_r_A3 = f80
+FR_r_A2H = f81
+FR_r_A2L = f82
+FR_r_A1H = f83
+FR_r_A1L = f84
+
+FR_r_XNS = f85
+FR_r_XS = f86
+FR_r_XS2 = f87
+FR_r_XS2L = f88
+FR_r_XS4 = f89
+FR_r_XS7 = f90
+FR_r_XS8 = f91
+
+FR_r_Tail = f92
+
+FR_r_TT = f93
+FR_r_TH = f94
+FR_r_TL = f95
+
+FR_r_ResH = f96
+FR_r_ResL = f97
+
+FR_r_Res3H = f98
+FR_r_Res3L = f99
+
+FR_r_Res1H = f100
+FR_r_Res1L = f101
+
+
+
+FR_r_Y0 = f102
+FR_r_Q0 = f103
+FR_r_E0 = f104
+FR_r_E2 = f105
+FR_r_E1 = f106
+FR_r_Y1 = f107
+FR_r_E3 = f108
+FR_r_Y2 = f109
+FR_r_R0 = f110
+FR_r_E4 = f111
+FR_r_ZH = f112
+FR_r_Y3 = f113
+FR_r_R1 = f114
+FR_r_ZHN = f115 // NOTE(review): same register as FR_r_ZL; confirm both are never live together
+FR_r_ZL = f115
+FR_r_NegOne = f116
+
+FR_z_Y0 = f102
+FR_z_Q0 = f103
+FR_z_E0 = f104
+FR_z_E2 = f105
+FR_z_E1 = f106
+FR_z_Y1 = f107
+FR_z_E3 = f108
+FR_z_Y2 = f109
+FR_z_R0 = f110
+FR_z_E4 = f111
+FR_z_ZH = f112
+FR_z_Y3 = f113
+FR_z_R1 = f114
+FR_z_ZL = f115
+
+
+// General Purpose Registers
+GR_SAVE_PFS = r32
+GR_DenOverflow = r33
+GR_u_XN = r34
+
+GR_SAVE_B0 = r35
+GR_SAVE_GP = r36
+GR_SAVE_SP = r37
+
+// Floating Point Registers
+FR_u_IXN = f34
+
+
+// ERROR HANDLER REGISTERS
+GR_Parameter_X = r64
+GR_Parameter_Y = r65
+GR_Parameter_RESULT = r66
+GR_Parameter_TAG = r67
+
+FR_RESULT = f8
+FR_X = f32
+FR_Y = f1
+
+
+.section .text
+GLOBAL_LIBM_ENTRY(tgammal)
+{ .mfi
+ alloc r32 = ar.pfs,0,32,4,0
+ fabs FR_l_AbsX = f8 // Get absolute value of X
+ addl GR_n_sin_Table = @ltoff(Constants_Tgammal_sin), gp
+}
+{ .mfi
+ addl GR_l_Log_Table=@ltoff(Constants_Tgammal_log_80_Z_G_H_h1#),gp
+ nop.f 0
+ addl GR_l_Stirling_Table = @ltoff(Constants_Tgammal_stirling), gp
+};;
+
+{ .mfi
+ getf.sig GR_l_signif_Z = f8 // Significand of X
+ fcvt.fx.s1 FR_n_IXNS = f8 // Convert to fixed point
+ addl GR_c_Table = @ltoff(Constants_Tgammal_common), gp
+}
+{ .mfi
+ ld8 GR_l_Log_Table = [GR_l_Log_Table]
+ nop.f 0
+ addl GR_p_Table = @ltoff(Constants_Tgammal_poly), gp
+};;
+
+{ .mfi
+ ld8 GR_n_sin_Table = [GR_n_sin_Table]
+ fclass.m p6,p0 = f8,0x1EF // Check x for NaN, 0, INF, denorm
+ // NatVal.
+ addl GR_c_NegSingularity = 0x1003E, r0
+}
+{ .mlx
+ ld8 GR_l_Stirling_Table = [GR_l_Stirling_Table]
+ movl GR_c_13 = 0x402A000000000000 // 13.0
+};;
+
+{ .mfi
+ getf.d GR_c_X = f8 // Double prec. X to general register
+ frcpa.s1 FR_z_Y0,p0 = f1,f8 // y = frcpa(x) (for negatives)
+ extr.u GR_l_Index1 = GR_l_signif_Z, 59, 4 // = High 4 bits of Z
+}
+{ .mlx
+ ld8 GR_c_Table = [GR_c_Table]
+ movl GR_c_SignBit = 0x8000000000000000 // High bit (sign)
+};;
+
+{ .mfi
+ ld8 GR_p_Table = [GR_p_Table]
+ fcmp.lt.s1 p15, p14 = f8,f0 // p14 - positive arg, p15 - negative
+ shl GR_l_Index1 = GR_l_Index1,5 // Adjust Index1 ptr (x32)
+}
+{ .mfb
+ adds GR_c_NegUnderflow = 1765, r0
+ nop.f 0
+(p6) br.cond.spnt tgammal_spec // Spec. values processing branch ////////////
+ // (0s, INFs, NANs, NatVals, denormals) //////
+};;
+
+{ .mfi
+ ldfpd FR_l_CH,FR_l_CL= [GR_l_Stirling_Table], 16 // Load CH, CL
+ fcvt.fx.trunc.s1 FR_n_IXN = FR_l_AbsX // Abs arg to int by trunc
+ extr.u GR_l_X_0 = GR_l_signif_Z, 49, 15 // High 15 bit of Z
+}
+{ .mfi
+ add GR_l_Index1 = GR_l_Index1,GR_l_Log_Table // Add offset
+ fma.s1 FR_p_2 = f1, f1, f1 // 2.0
+ andcm GR_c_X = GR_c_X, GR_c_SignBit // Remove sign
+};;
+
+{ .mfi
+ addl GR_l_Log_Table = @ltoff(Constants_Tgammal_log_80_Z_G_H_h2#), gp
+ fcmp.lt.s1 p10, p0 = FR_l_AbsX, f1 // If |X|<1 then p10 = 1
+ nop.i 0
+}
+{ .mlx
+ ld2 GR_l_Z_1 = [GR_l_Index1],4 // load Z_1 from Index1
+ movl GR_l_BIAS = 0x000000000000FFFF // Bias for exponent
+};;
+
+{ .mfi
+ ld8 GR_l_Log_Table = [GR_l_Log_Table]
+ frcpa.s1 FR_l_Y0, p0 = f1, FR_l_AbsX // y = frcpa(x)
+ nop.i 0
+}
+{ .mfi
+ ldfs FR_l_G_1 = [GR_l_Index1],4 // Load G_1
+ fsub.s1 FR_l_W = FR_l_AbsX, f1 // W = |X|-1
+ nop.i 0
+};;
+
+{ .mfi
+ getf.exp GR_l_N_Unbiased= FR_l_AbsX // exponent of |X|
+ fmerge.se FR_l_S = f1, FR_l_AbsX // S = merging of X and 1.0
+ cmp.gtu p11, p0 = GR_c_13, GR_c_X // If 1 <= |X| < 13
+ // then p11 = 1
+}
+{ .mfb
+ ldfs FR_l_H_1 = [GR_l_Index1],8 // Load H_1
+ fcvt.xf FR_n_XNS = FR_n_IXNS // Convert to FP repr. of int X
+(p10) br.cond.spnt tgamma_lt_1 // Branch to |X| < 1 path ///////////////////
+};;
+
+{ .mfi
+ ldfpd FR_n_A2H, FR_n_A2L = [GR_n_sin_Table], 16 // Load A2H, A2L
+ nop.f 0
+ pmpyshr2.u GR_l_X_1 = GR_l_X_0,GR_l_Z_1,15 // X_1 = X_0 * Z_1 (shifted right 15)
+}
+{ .mfb
+ ldfe FR_l_B2 = [GR_l_Stirling_Table], 16 // Load B2
+ nop.f 0
+(p11) br.cond.spnt tgamma_lt_13 // Branch to 1 <= |X| < 13 path ///////////////
+};;
+
+{ .mfi
+ ldfe FR_l_h_1 = [GR_l_Index1],0 // Load h_1
+ nop.f 0
+ sub GR_l_N = GR_l_N_Unbiased, GR_l_BIAS // N - BIAS
+}
+{ .mib
+ ldfpd FR_l_B4,FR_l_B6= [GR_l_Stirling_Table], 16 // Load B4, B6
+(p15) cmp.geu.unc p8,p0 = GR_l_N_Unbiased, GR_c_NegSingularity // exp(|X|) >= 0x1003E
+(p8) br.cond.spnt tgammal_singularity // Singularity for arg <= -2^63 ////////
+};;
+
+{ .mmi
+(p15) ldfpd FR_n_A1H, FR_n_A1L = [GR_n_sin_Table], 16
+ ldfpd FR_l_B8, FR_l_B10 = [GR_l_Stirling_Table], 16
+ add GR_c_Table = 0x20, GR_c_Table
+};;
+
+{ .mfi
+(p15) ldfe FR_n_A9 = [GR_n_sin_Table], 16
+ fma.s1 FR_l_Q0 = f1,FR_l_Y0,f0 // Q0 = Y0
+ nop.i 0
+}
+{ .mfi
+ ldfpd FR_l_B12, FR_l_B14 = [GR_l_Stirling_Table], 16
+ fnma.s1 FR_l_E0 = FR_l_Y0,FR_l_AbsX,f1 // e = 1-b*y
+ nop.i 0
+};;
+
+{ .mfi
+(p15) ldfe FR_n_A8 = [GR_n_sin_Table], 16
+ fcvt.xf FR_c_XN = FR_n_IXN // Convert to FP repr. of int X
+ extr.u GR_l_Index2 = GR_l_X_1, 6, 4 // Extract Index2
+}
+{ .mfi
+ ldfpd FR_l_B16, FR_l_B18 = [GR_l_Stirling_Table], 16
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+(p15) ldfe FR_n_A7 = [GR_n_sin_Table], 16
+ fms.s1 FR_l_CXH = FR_l_CH, f1, FR_l_AbsX // CXH = CH-|X|
+ shl GR_l_Index2 = GR_l_Index2,5 // Index2 * 32
+}
+{ .mfi
+ ldfd FR_l_Half = [GR_l_Stirling_Table] // Load 0.5
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ add GR_l_Index2 = GR_l_Index2, GR_l_Log_Table // Add offset
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+(p15) ldfe FR_n_A6 = [GR_n_sin_Table], 16
+(p15) fma.s1 FR_n_XS = FR_l_AbsX , f1, FR_n_XNS // xs = |x| + xns (xns = x converted to int)
+ nop.i 0
+};;
+
+{ .mmi
+ ld2 GR_l_Z_2 = [GR_l_Index2],4
+ addl GR_l_Log_Table = @ltoff(Constants_Tgammal_log_80_h3_G_H#),gp
+ nop.i 0
+};;
+
+{ .mfi
+ ld8 GR_l_Log_Table = [GR_l_Log_Table]
+ fma.s1 FR_l_E2 = FR_l_E0,FR_l_E0,FR_l_E0 // e2 = e+e^2
+ nop.i 0
+}
+{ .mfi
+ ldfs FR_l_G_2 = [GR_l_Index2],4
+ fma.s1 FR_l_E1 = FR_l_E0,FR_l_E0,f0 // e1 = e^2
+ nop.i 0
+};;
+
+{ .mmi
+ ldfs FR_l_H_2 = [GR_l_Index2],8
+(p15) ldfe FR_n_A5 = [GR_n_sin_Table], 16
+ nop.i 0
+};;
+
+{ .mfi
+ setf.sig FR_l_float_N = GR_l_N // float_N = Make N a fp number
+ nop.f 0
+ pmpyshr2.u GR_l_X_2 = GR_l_X_1,GR_l_Z_2,15 // X_2 = X_1 * Z_2
+}
+{ .mfi
+ ldfe FR_l_h_2 = [GR_l_Index2],0
+ fma.s1 FR_l_CXL = FR_l_AbsX, f1, FR_l_CXH // CXL = |X|+CXH
+ add GR_l_Log_Table1= 0x200, GR_l_Log_Table
+};;
+
+{ .mfi
+(p15) ldfe FR_n_A4 = [GR_n_sin_Table], 16
+(p15) fcmp.eq.unc.s1 p9,p0 = FR_l_AbsX, FR_c_XN //if argument is integer
+ // and negative
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_c_PosOverflow = [GR_c_Table],16 //Load pos overflow value
+(p15) fma.s1 FR_n_XS2 = FR_n_XS, FR_n_XS, f0 // xs^2 = xs*xs
+ nop.i 0
+};;
+
+{ .mfi
+(p15) ldfe FR_n_A3 = [GR_n_sin_Table], 16
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+(p15) getf.sig GR_n_XN = FR_n_IXN // int(x) to general reg
+ fma.s1 FR_l_Y1 = FR_l_Y0,FR_l_E2,FR_l_Y0 // y1 = y+y*e2
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ fma.s1 FR_l_E3 = FR_l_E1,FR_l_E1,FR_l_E0 // e3 = e+e1^2
+(p9) br.cond.spnt tgammal_singularity // Singularity for integer /////////////
+ // and negative arguments //////////////
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_l_AbsX_m_Half = FR_l_AbsX, f1, FR_l_Half // |x|-0.5
+ extr.u GR_l_Index2 = GR_l_X_2, 1, 5 // Get Index3
+};;
+
+{ .mfi
+ shladd GR_l_Log_Table1= GR_l_Index2, 2, GR_l_Log_Table1
+ nop.f 0
+ shladd GR_l_Index3 = GR_l_Index2,4, GR_l_Log_Table // Index3
+}
+{ .mfb
+(p15) cmp.gtu.unc p11, p0 = GR_n_XN, GR_c_NegUnderflow // X < -1765
+ fms.s1 FR_l_CXL = FR_l_CH, f1, FR_l_CXL // CXL = CH - CXL
+(p11) br.cond.spnt tgammal_underflow // Underflow path for negative argument //
+ // (underflow domain, X < -1765) ///////
+};;
+
+{ .mfi
+ addl GR_l_Log_Table = @ltoff(Constants_Tgammal_log_80_Q#), gp
+(p15) fma.s1 FR_n_TT = FR_n_A2L, FR_n_XS2, f0 // T=A2L*x^2
+ tbit.nz.unc p13, p12 = GR_n_XN, 0x0 // whether [X] odd or even
+}
+{ .mfi
+ nop.m 0
+(p15) fms.s1 FR_n_XS2L = FR_n_XS, FR_n_XS, FR_n_XS2 // xs^2 Low part
+ nop.i 0
+};;
+
+{ .mfi
+ ld8 GR_l_Log_Table = [GR_l_Log_Table]
+(p15) fma.s1 FR_n_A7 = FR_n_A8, FR_n_XS2, FR_n_A7 // poly tail
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_l_h_3 = [GR_l_Index3],12
+(p15) fma.s1 FR_n_XS4 = FR_n_XS2, FR_n_XS2, f0 // xs^4 = xs^2*xs^2
+ nop.i 0
+};;
+
+{ .mfi
+ ldfs FR_l_H_3 = [GR_l_Log_Table1], 0
+ fma.s1 FR_l_Y2 = FR_l_Y1, FR_l_E3, FR_l_Y0 // y2 = y+y1*e3
+ nop.i 0
+}
+{ .mfi
+ ldfs FR_l_G_3 = [GR_l_Index3], 0
+ fnma.s1 FR_l_Z = FR_l_AbsX,FR_l_Q0,f1 // r = a-b*q
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_l_G = FR_l_G_1, FR_l_G_2 // G = G1 * G_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_l_H = FR_l_H_1, FR_l_H_2 // H = H_1 + H_2
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_l_log2_hi = [GR_l_Log_Table],16 // load log2_hi part
+ fadd.s1 FR_l_h = FR_l_h_1, FR_l_h_2 // h = h_1 + h_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fcvt.xf FR_l_float_N = FR_l_float_N // int(N)
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_l_log2_lo = [GR_l_Log_Table],16 // Load log2_lo part
+ fma.s1 FR_l_CXL = FR_l_CXL, f1, FR_l_CL
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_TT = FR_n_A2H, FR_n_XS2L, FR_n_TT // T=A2H*x2L+T
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_l_Q_6 = [GR_l_Log_Table],16
+(p15) fma.s1 FR_n_A3 = FR_n_A4, FR_n_XS2, FR_n_A3 // poly tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_A5 = FR_n_A6, FR_n_XS2, FR_n_A5 // poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_l_Q_5 = [GR_l_Log_Table],16
+(p15) fabs FR_n_XS = FR_n_XS // abs(xs)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_Z = FR_l_Z,FR_l_Y2,FR_l_Q0 // x_hi = q+r*y2
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_l_Q_4 = [GR_l_Log_Table],16
+(p15) fma.s1 FR_n_A7 = FR_n_A9, FR_n_XS4, FR_n_A7 // poly tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_XS7 = FR_n_XS4, FR_n_XS2, f0 // = x^4*x^2
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_l_Q_3 = [GR_l_Log_Table],16
+ fneg FR_n_NegOne = f1 // -1.0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_XS8 = FR_n_XS4, FR_n_XS4, f0 // xs^8 = xs^4*xs^4
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_l_Q_2 = [GR_l_Log_Table],16
+ fadd.s1 FR_l_h = FR_l_h, FR_l_h_3 // h = h_1 + h_2 + h_3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_TH = FR_n_A2H, FR_n_XS2, FR_n_TT // A2H*xs2+T
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_l_Q_1 = [GR_l_Log_Table],16
+ fmpy.s1 FR_l_G = FR_l_G, FR_l_G_3 // G = G_1 * G_2 * G_3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_l_H = FR_l_H, FR_l_H_3 // H = H_1 + H_2 + H_3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_Z2 = FR_l_Z, FR_l_Z, f0 // Z^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_A3 = FR_n_A5, FR_n_XS4, FR_n_A3 // poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p14) fcmp.gt.unc.s1 p7,p0 = FR_l_AbsX, FR_c_PosOverflow //X > 1755.5483
+ // (overflow domain, result cannot be represented by normal value)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_XS7 = FR_n_XS7, FR_n_XS, f0 // x^7 construction
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p15) fms.s1 FR_n_TL = FR_n_A2H, FR_n_XS2, FR_n_TH // TL = A2H*xs2-TH
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_PolyH = FR_n_TH, f1, FR_n_A1H // PolyH=TH+A1H
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_l_GS_hi = FR_l_G, FR_l_S // GS_hi = G*S
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ fms.s1 FR_l_r = FR_l_G, FR_l_S, f1 // r = G*S -1
+(p7) br.cond.spnt tgammal_overflow // Overflow path for arg > 1755.5483 //////
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_B14 = FR_l_B16, FR_l_Z2, FR_l_B14// Bernoulli tail: B16*Z^2+B14
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_Z4 = FR_l_Z2, FR_l_Z2, f0 // Z^4 = Z^2*Z^2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_B2 = FR_l_B4, FR_l_Z2, FR_l_B2 // Bernoulli tail: B4*Z^2+B2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_B6 = FR_l_B8, FR_l_Z2, FR_l_B6 // Bernoulli tail: B8*Z^2+B6
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_B10 = FR_l_B12, FR_l_Z2, FR_l_B10// Bernoulli tail: B12*Z^2+B10
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_Tail = FR_n_A7, FR_n_XS8, FR_n_A3 // poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_TL = FR_n_TL, f1, FR_n_TT // TL = TL+T
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fms.s1 FR_n_PolyL = FR_n_A1H, f1, FR_n_PolyH // PolyL = A1H-PolyH
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_poly_lo = FR_l_r, FR_l_Q_6, FR_l_Q_5 // Q_5+r*Q_6
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 FR_l_r_cor = FR_l_GS_hi, f1 // r_cor = GS_hi -1
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_l_GS_lo = FR_l_G, FR_l_S, FR_l_GS_hi // G*S-GS_hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_poly = FR_l_r, FR_l_Q_2, FR_l_Q_1 //poly=r*Q2+Q1
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_l_rsq = FR_l_r, FR_l_r // rsq = r * r
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_G = FR_l_float_N, FR_l_log2_hi, FR_l_H // Tbl =
+ // float_N*log2_hi + H
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_Y_lo = FR_l_float_N, FR_l_log2_lo, FR_l_h // Y_lo=
+ // float_N*log2_lo + h
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_B14 = FR_l_B18, FR_l_Z4, FR_l_B14 // Bernoulli tail: B18*Z^4+B14
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_B2 = FR_l_B6, FR_l_Z4, FR_l_B2 // Bernoulli tail: B6*Z^4+B2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_Z8 = FR_l_Z4, FR_l_Z4, f0 // Z^8 = Z^4*Z^4
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_poly_lo = FR_l_r, FR_l_poly_lo, FR_l_Q_4 // poly_lo =
+ // Q_4 + r * poly_lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 FR_l_r_cor = FR_l_r_cor, FR_l_r // r_cor = r_cor - r
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_PolyL = FR_n_PolyL, f1, FR_n_TH // polyL+TH
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_TT = FR_n_TL, f1, FR_n_A1L // TL+A1L
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_l_logl_YHi = FR_l_G, FR_l_r // Y_hi = Tbl + r
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_B10 = FR_l_B14, FR_l_Z4, FR_l_B10 // Bernoulli tail: B14*Z^4+B10
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_poly_lo = FR_l_r, FR_l_poly_lo, FR_l_Q_3 // poly_lo =
+ // Q_3 + r * poly_lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_l_r_cor = FR_l_r_cor, FR_l_GS_lo // r_cor=r_cor+GS_lo
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_PolyL = FR_n_PolyL, f1, FR_n_TT // polyL+TT
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fsub.s1 FR_l_Y_lo_res = FR_l_G, FR_l_logl_YHi // Y_lo = Tbl - Y_hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_XYH = FR_l_logl_YHi, FR_l_AbsX_m_Half, f0 // XYH=
+ // YHi*(|x|-0.5)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_SS = FR_l_B10, FR_l_Z8, FR_l_B2 // Bernoulli tail: SS = B10*Z^8+B2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_l_r_cor = FR_l_r_cor, FR_l_Y_lo // r_cor = r_cor+Y_lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_poly = FR_l_rsq, FR_l_poly_lo, FR_l_poly //poly=
+ // r^2*polyLo+poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_TT = FR_n_PolyL, FR_n_XS2, f0 // T=polyL*xs^2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_l_Y_lo = FR_l_Y_lo_res, FR_l_r // Y_lo = Y_lo + r
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 FR_l_XYL = FR_l_logl_YHi, FR_l_AbsX_m_Half, FR_l_XYH
+ // XYL = YHi*(|x|-0.5)-XYH
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_SSCXH = FR_l_SS, FR_l_Z, FR_l_CXH // SS*Z+CXH
+ nop.i 0
+}
+{ .mfi
+ mov GR_e_exp_2tom51= 0xffff-51 // 2^-51
+(p15) fma.s1 FR_l_SignedXYH = FR_l_XYH, FR_n_NegOne, f0 // XYH = -XYH
+ // for negatives
+ nop.i 0
+};;
+
+{ .mlx
+ nop.m 0
+ movl GR_e_rshf_2to51 = 0x4718000000000000 // 1.10000 2^(63+51)
+}
+{ .mlx
+ nop.m 0
+ movl GR_e_sig_inv_ln2 = 0xb8aa3b295c17f0bc //significand of 1/ln2
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_poly = FR_l_rsq, FR_l_poly, FR_l_r_cor // poly =
+ // rsq * poly + r_cor
+ nop.i 0
+};;
+
+{ .mfi
+ addl GR_e_ad_Arg = @ltoff(Constants_Tgammal_exp_64_Arg#),gp
+(p15) fma.s1 FR_n_TT = FR_n_PolyH, FR_n_XS2L, FR_n_TT
+ mov GR_e_exp_mask = 0x1FFFF // Form exponent mask
+}
+{ .mlx
+ nop.m 0
+ movl GR_e_rshf = 0x43e8000000000000 // 1.10000 2^63 rshift
+};;
+
+
+{ .mmi
+ setf.sig FR_e_INV_LN2_2TO63 = GR_e_sig_inv_ln2 // form 1/ln2 * 2^63
+ setf.d FR_e_RSHF_2TO51 = GR_e_rshf_2to51 // 1.1000 * 2^(63+51)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_l_SSCXL = FR_l_CXH, f1, FR_l_SSCXH // SSCXL = CXH-SSCXH
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_e_expl_Input_AbsX = FR_l_XYH, f1, FR_l_SSCXH // HI EXP
+ nop.i 0
+};;
+
+.pred.rel "mutex",p14,p15
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_e_expl_Input_X = FR_l_XYH, f1, FR_l_SSCXH // HI EXP
+ mov GR_e_exp_bias = 0x0FFFF // Set exponent bias
+}
+{ .mfi
+ ld8 GR_e_ad_Arg = [GR_e_ad_Arg] // Point to Arg table
+(p15) fms.s1 FR_e_expl_Input_X = FR_l_SignedXYH, f1, FR_l_SSCXH // HI EXP
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_l_logl_YLo = FR_l_Y_lo, FR_l_poly // YLo = YLo+poly
+ nop.i 0
+};;
+
+{ .mfi
+ setf.exp FR_e_2TOM51 = GR_e_exp_2tom51 //2^-51 for scaling float_N
+(p15) fma.s1 FR_n_TH = FR_n_PolyH, FR_n_XS2, FR_n_TT // TH=
+ // polyH*xs^2+T
+ nop.i 0
+}
+{ .mib
+ setf.d FR_e_RSHF = GR_e_rshf // Right shift const 1.1000*2^63
+ nop.i 0
+ nop.b 0
+};;
+
+{ .mfi
+ add GR_e_ad_A = 0x20, GR_e_ad_Arg // Point to A table
+ nop.f 0
+ add GR_e_ad_T1 = 0x50, GR_e_ad_Arg // Point to T1 table
+}
+{ .mfi
+ add GR_e_ad_T2 = 0x150, GR_e_ad_Arg // Point to T2 table
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_SSCXL = FR_l_SS, FR_l_Z, FR_l_SSCXL
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 FR_e_expl_Input_Y = FR_l_XYH, f1, FR_e_expl_Input_AbsX
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_e_L_hi = [GR_e_ad_Arg],16 // Get L_hi
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_XYL = FR_l_logl_YLo, FR_l_AbsX_m_Half, FR_l_XYL
+ // XYL = YLo*(|x|-0.5)+XYL
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_e_L_lo = [GR_e_ad_Arg],16 // Get L_lo
+(p15) fms.s1 FR_n_TL = FR_n_PolyH, FR_n_XS2, FR_n_TH // TL =
+ // = polyH*xs^2-TH
+ add GR_e_ad_W1 = 0x100, GR_e_ad_T2 // Point to W1 table
+}
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_Poly1H = FR_n_TH, f1, f1 // poly1H = TH+1
+ add GR_e_ad_W2 = 0x300, GR_e_ad_T2 // Point to W2 table
+};;
+
+{ .mmi
+ getf.exp GR_e_signexp_x = FR_e_expl_Input_X // Extract sign and exp
+ ldfe FR_e_A3 = [GR_e_ad_A],16 // Get A3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_SSCXL = FR_l_SSCXL, f1, FR_l_CXL
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_e_expl_Input_Y = FR_e_expl_Input_Y, f1, FR_l_SSCXH
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_e_N_signif=FR_e_expl_Input_X,FR_e_INV_LN2_2TO63,FR_e_RSHF_2TO51
+ and GR_e_exp_x = GR_e_signexp_x, GR_e_exp_mask
+};;
+
+{ .mmi
+ sub GR_e_exp_x = GR_e_exp_x, GR_e_exp_bias // Get exponent
+ ldfe FR_e_A2 = [GR_e_ad_A],16 // Get A2 for main path
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_PolyH = FR_n_Poly1H, FR_n_XS, f0//sin(Pi*x) poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fms.s1 FR_n_Poly1L = f1, f1, FR_n_Poly1H//sin(Pi*x) poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_TL = FR_n_TL, f1, FR_n_TT//sin(Pi*x) poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_l_Temp = FR_l_XYL, f1, FR_l_SSCXL // Temp = XYL+SSCXL
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_e_expl_Input_Y = FR_e_expl_Input_Y, FR_n_NegOne, f0
+ // Negate lo part of exp argument for negative input values
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_e_A1 = [GR_e_ad_A],16 // Get A1
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 FR_e_float_N = FR_e_N_signif, FR_e_2TOM51, FR_e_RSHF
+ // Get float N = signd*2^-51-RSHIFTER
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_Poly1L = FR_n_Poly1L, f1, FR_n_TH //sin(Pi*x) poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fms.s1 FR_n_PolyL = FR_n_Poly1H, FR_n_XS, FR_n_PolyH//sin(Pi*x)
+ nop.i 0
+};;
+
+{ .mfi
+ getf.sig GR_e_N_fix = FR_e_N_signif // Get N from significand
+ nop.f 0
+ nop.i 0
+};;
+
+.pred.rel "mutex",p14,p15
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_e_expl_Input_Y = FR_e_expl_Input_Y, f1, FR_l_Temp
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fms.s1 FR_e_expl_Input_Y = FR_e_expl_Input_Y, f1, FR_l_Temp
+ // arguments for exp computation
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_e_r = FR_e_L_hi, FR_e_float_N, FR_e_expl_Input_X
+ // r = -L_hi * float_N + x
+ extr.u GR_e_M1 = GR_e_N_fix, 6, 6 // Extract index M_1
+};;
+
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_Poly1L = FR_n_Poly1L, f1, FR_n_TL //sin(Pi*x) poly
+ nop.i 0
+};;
+
+
+{ .mmf
+ nop.m 0
+ nop.m 0
+ fma.s1 FR_e_r = FR_e_r, f1, FR_e_expl_Input_Y
+ // r = r + FR_e_expl_Input_Y
+};;
+
+{ .mmi
+ shladd GR_e_ad_W1 = GR_e_M1,3,GR_e_ad_W1 // Point to W1
+ shladd GR_e_ad_T1 = GR_e_M1,2,GR_e_ad_T1 // Point to T1
+ extr.u GR_e_M2 = GR_e_N_fix, 0, 6 // Extract index M_2
+};;
+
+
+{ .mfi
+ ldfs FR_e_T1 = [GR_e_ad_T1],0 // Get T1
+ nop.f 0
+ extr GR_e_K = GR_e_N_fix, 12, 32 //Extract limit range K
+}
+{ .mfi
+ shladd GR_e_ad_T2 = GR_e_M2,2,GR_e_ad_T2 // Point to T2
+(p15) fma.s1 FR_n_PolyL = FR_n_Poly1L, FR_n_XS, FR_n_PolyL
+ //sin(Pi*x) poly
+ shladd GR_e_ad_W2 = GR_e_M2,3,GR_e_ad_W2 // Point to W2
+};;
+
+{ .mfi
+ ldfs FR_e_T2 = [GR_e_ad_T2],0 // Get T2
+ nop.f 0
+ add GR_e_exp_2_k = GR_e_exp_bias, GR_e_K // exp of 2^k
+}
+{ .mfi
+ ldfd FR_e_W1 = [GR_e_ad_W1],0 // Get W1
+ nop.f 0
+ sub GR_e_exp_2_mk = GR_e_exp_bias, GR_e_K // exp of 2^-k
+};;
+
+{ .mmi
+ ldfd FR_e_W2 = [GR_e_ad_W2],0 // Get W2
+ nop.m 0
+ nop.i 0
+};;
+
+{ .mmf
+ setf.exp FR_e_scale = GR_e_exp_2_k // Set scale = 2^k
+ setf.exp FR_e_2_mk = GR_e_exp_2_mk // Form 2^-k
+ fnma.s1 FR_e_r = FR_e_L_lo, FR_e_float_N, FR_e_r
+ // r = -L_lo * float_N + r
+};;
+
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_PolyL = FR_n_Tail, FR_n_XS7, FR_n_PolyL
+ //sin(Pi*x) poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_e_poly = FR_e_r, FR_e_A3, FR_e_A2 // poly=r*A3+A2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_e_rsq = FR_e_r, FR_e_r // rsq = r * r
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_e_T = FR_e_T1, FR_e_T2 // T = T1 * T2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_e_W1_p1 = FR_e_W1, f1 // W1_p1 = W1 + 1.0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_TT = FR_n_PolyL, FR_l_AbsX, f0 //sin(Pi*x) poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_e_poly = FR_e_r, FR_e_poly, FR_e_A1
+ // poly = r * poly + A1
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_e_T_scale = FR_e_T, FR_e_scale, f0 // T_scale=T*scale
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_e_W = FR_e_W2, FR_e_W1_p1, FR_e_W1
+ // W = W2 * (W1+1.0) + W1
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_SinxH = FR_n_PolyH, FR_l_AbsX, FR_n_TT
+ // sin(Pi*x) poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ mov FR_e_Y_hi = FR_e_T // Assume Y_hi = T
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_e_poly = FR_e_rsq, FR_e_poly, FR_e_r
+ // poly = rsq * poly + r
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_e_Wp1_T_scale = FR_e_W, FR_e_T_scale, FR_e_T_scale
+ // (W+1)*T*scale
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_e_W_T_scale = FR_e_W, FR_e_T_scale, f0 // W*T*scale
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p15) fms.s1 FR_n_SinxL = FR_n_PolyH, FR_l_AbsX, FR_n_SinxH
+ // Low part of sin
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p15) frcpa.s1 FR_n_Y0, p0 = f1, FR_n_SinxH // y = frcpa(b)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_e_result_lo = FR_e_Wp1_T_scale, FR_e_poly, FR_e_W_T_scale
+ // Low part of exp result
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_SinxL = FR_n_SinxL, f1, FR_n_TT // sin low result
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p15) fma.s1 FR_n_Q0 = f1,FR_n_Y0,f0 // q = y
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p15) fnma.s1 FR_n_E0 = FR_n_Y0, FR_n_SinxH, f1 // e = 1-b*y
+ nop.i 0
+};;
+
+
+{ .mfb
+ nop.m 0
+(p14) fma.s0 f8 = FR_e_Y_hi, FR_e_scale, FR_e_result_lo
+(p14) br.ret.spnt b0 // Exit for positive Stirling path //////////////////////
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_e_expl_Output_X = FR_e_Y_hi, FR_e_scale, f0 // exp result
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_e_expl_Output_Y = FR_e_result_lo, f1, f0// exp lo result
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_n_E2 = FR_n_E0,FR_n_E0,FR_n_E0 // e2 = e+e^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_n_E1 = FR_n_E0,FR_n_E0,f0 // e1 = e^2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_n_Y1 = FR_n_Y0,FR_n_E2,FR_n_Y0 // y1 = y+y*e2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_n_E3 = FR_n_E1,FR_n_E1,FR_n_E0 // e3 = e+e1^2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_n_Y2 = FR_n_Y1,FR_n_E3,FR_n_Y0 // y2 = y+y1*e3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_n_R0 = FR_n_SinxH,FR_n_Q0,f1 // r = a-b*q
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_n_E4 = FR_n_SinxH,FR_n_Y2,f1 // e4 = 1-b*y2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_n_RcpResH = FR_n_R0,FR_n_Y2,FR_n_Q0 // x = q+r*y2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_n_Y3 = FR_n_Y2,FR_n_E4,FR_n_Y2 // y3 = y2+y2*e4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_n_R1 = FR_n_SinxH,FR_n_RcpResH,f1 // r1 = a-b*x
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_n_R1 = FR_n_SinxL,FR_n_RcpResH,FR_n_R1
+ // r1 = r1 - b_lo*X
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_n_RcpResL = FR_n_R1,FR_n_Y3,f0 // x_lo = r1*y3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_n_Temp = FR_n_RcpResH, FR_e_expl_Output_Y, f0
+ // Multiplying exp and sin result
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_n_Temp = FR_n_RcpResL, FR_e_expl_Output_X, FR_n_Temp
+ // Multiplying exp and sin result
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_n_ResH = FR_n_RcpResH, FR_e_expl_Output_X, FR_n_Temp
+ // Multiplying exp and sin result
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_n_ResL = FR_n_RcpResH, FR_e_expl_Output_X, FR_n_ResH
+ // Multiplying exp and sin result
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p12) fma.s1 FR_n_ResH = FR_n_ResH, FR_n_NegOne, f0 // Negate
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_n_ResL = FR_n_ResL, f1, FR_n_Temp
+ // Multiplying exp and sin result - low result obtained
+ nop.i 0
+};;
+
+.pred.rel "mutex",p12,p13
+{ .mfi
+ nop.m 0
+(p13) fma.s0 f8 = FR_n_ResH, f1, FR_n_ResL // For odd
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p12) fms.s0 f8 = FR_n_ResH, f1, FR_n_ResL // For even
+ br.ret.sptk b0 // Exit for negative Stirling path //////////////////////
+};;
+
+
+//////////// 1 <= |X| < 13 path ////////////////////////////////////////////////
+//------------------------------------------------------------------------------
+.align 64
+tgamma_lt_13:
+{ .mfi
+ getf.sig GR_p_XN = FR_p_IXN // Get significand
+ fcvt.xf FR_p_XN = FR_p_IXN // xn = [x]
+ add GR_r_sin_Table2= 0x40, GR_r_sin_Table // Shifted table addr.
+}
+{ .mfi
+ ldfpd FR_p_0p5, FR_p_1p5 = [GR_c_Table], 16 // 0.5 & 1.5
+ fms.s1 FR_p_AbsXM1 = FR_p_AbsX, f1, f1 // X-1
+ add GR_p_Table2 = 0xB0, GR_p_Table
+};;
+
+{ .mfi
+ add GR_r_sin_Table = -16, GR_r_sin_Table // For compensation
+ fcvt.xf FR_r_XNS = FR_r_IXNS // Convert int repr to float
+ shr.u GR_p_X_Sgnd = GR_p_X_Sgnd, 59 // Get only 5 bit of signd
+};;
+
+{ .mfi
+ ldfpd FR_r_A2H,FR_r_A2L = [GR_r_sin_Table], 16 // Load A2
+ nop.f 0
+ add GR_p_Int = -2, GR_p_XN // int = int - 2
+}
+{ .mfi
+ ldfe FR_r_A6 = [GR_r_sin_Table2], 16
+ nop.f 0
+ cmp.gtu p11, p12 = 0x2, GR_p_XN // p11: x < 2 (split intervals),
+ // p12: x >= 2 (base intervals)
+};;
+
+{ .mfi
+ ldfpd FR_r_A1H, FR_r_A1L = [GR_r_sin_Table], 16
+ nop.f 0
+ shr GR_p_Int = GR_p_Int, 1 // int/2
+}
+{ .mfi
+ ldfe FR_r_A5 = [GR_r_sin_Table2], 16
+ nop.f 0
+(p11) cmp.gtu.unc p10, p11 = 0x1C, GR_p_X_Sgnd // sgnd(x) < 0.75
+};;
+
+{ .mfi
+ ldfe FR_r_A9 = [GR_r_sin_Table], 16
+ nop.f 0
+ shl GR_p_Offset = GR_p_Int, 4 // offset = int*16
+}
+{ .mfi
+ ldfe FR_r_A4 = [GR_r_sin_Table2], 16
+ nop.f 0
+(p10) cmp.gtu.unc p9, p10 = 0x14, GR_p_X_Sgnd // sgnd(x) < 0.25
+};;
+
+
+{ .mfi
+ ldfe FR_r_A8 = [GR_r_sin_Table], 16
+ nop.f 0
+(p12) tbit.nz.unc p13, p12 = GR_p_XN, 0x0 // p13: recurrent computations
+ // ([x] is odd): X is at [3;4], [5;6], [7;8]... interval
+}
+{ .mfi
+ ldfe FR_r_A3 = [GR_r_sin_Table2], 16
+ nop.f 0
+ shladd GR_p_Offset = GR_p_Int, 2, GR_p_Offset // +int*4
+};;
+
+.pred.rel "mutex",p9,p11
+{ .mfi
+ add GR_p_Offset = GR_p_Int, GR_p_Offset
+ // +int, so offset = int*21
+(p9) fms.s1 FR_p_XR = FR_p_AbsX, f1, f1 // r = x-1
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_r_A7 = [GR_r_sin_Table], 16
+(p11) fms.s1 FR_p_XR = FR_p_2, f1, FR_p_AbsX
+ // r = 2-x for 1.75 < x < 2
+ nop.i 0
+};;
+
+.pred.rel "mutex",p9,p10
+.pred.rel "mutex",p10,p11
+.pred.rel "mutex",p9,p11
+{ .mfi
+(p9) add GR_p_Offset = 126, r0 // 1.0 < x < 1.25 table
+(p15) fcmp.eq.unc.s1 p7,p0 = FR_p_AbsX, FR_p_XN
+ // If arg is integer and negative - singularity branch
+ nop.i 0
+}
+{ .mfi
+(p10) add GR_p_Offset = 147, r0 // 1.25 < x < 1.75 table
+ nop.f 0
+(p11) add GR_p_Offset = 168, r0 // 1.75 < x < 2.0 table
+};;
+
+{ .mmf
+ shladd GR_p_Table = GR_p_Offset, 4, GR_p_Table
+ shladd GR_p_Table2 = GR_p_Offset, 4, GR_p_Table2
+ fma.s1 FR_r_XS = FR_r_AbsX , f1, FR_r_XNS // xs = x - [x]
+};;
+
+{ .mmb
+ ldfpd FR_p_A5H, FR_p_A5L = [GR_p_Table], 16
+ ldfpd FR_p_A2H, FR_p_A2L = [GR_p_Table2], 16
+(p7) br.cond.spnt tgammal_singularity // Singularity for integer /////////////
+ // and negative argument ///////////////
+};;
+
+{ .mfi
+ ldfpd FR_p_A4H, FR_p_A4L = [GR_p_Table], 16
+ fma.s1 FR_p_XN = FR_p_XN, f1, FR_p_0p5 // xn = xn+0.5
+ nop.i 0
+}
+{ .mfi
+ ldfpd FR_p_A1H, FR_p_A1L = [GR_p_Table2], 16
+(p10) fms.s1 FR_p_XR = FR_p_AbsX, f1, FR_p_1p5 // r = x - 1.5
+ nop.i 0
+};;
+
+{ .mmi
+ ldfpd FR_p_A3H, FR_p_A3L = [GR_p_Table], 16
+ ldfpd FR_p_A0H, FR_p_A0L = [GR_p_Table2], 16
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe FR_p_A20 = [GR_p_Table], 16
+ ldfe FR_p_A12 = [GR_p_Table2], 16
+ nop.i 0
+};;
+
+{ .mmf
+ ldfe FR_p_A19 = [GR_p_Table], 16
+ ldfe FR_p_A11 = [GR_p_Table2], 16
+ fma.s1 FR_r_XS2 = FR_r_XS, FR_r_XS, f0 // xs2 = xs*xs
+};;
+
+{ .mmi
+ ldfe FR_p_A18 = [GR_p_Table], 16
+ ldfe FR_p_A10 = [GR_p_Table2], 16
+ nop.i 0
+};;
+
+.pred.rel "mutex",p12,p13
+{ .mfi
+ ldfe FR_p_A17 = [GR_p_Table], 16
+(p12) fms.s1 FR_p_XR = FR_p_AbsX, f1, FR_p_XN // r = x - xn
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_p_A9 = [GR_p_Table2], 16
+(p13) fms.s1 FR_p_XR = FR_p_AbsX, f1, FR_p_XN
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe FR_p_A16 = [GR_p_Table], 16
+ ldfe FR_p_A8 = [GR_p_Table2], 16
+(p9) cmp.eq p12, p0 = r0, r0 // set p12 (r0 == r0 always holds)
+};;
+
+{ .mmi
+ ldfe FR_p_A15 = [GR_p_Table], 16
+ ldfe FR_p_A7 = [GR_p_Table2], 16
+(p10) cmp.eq p12, p0 = r0, r0 // set p12 (r0 == r0 always holds)
+};;
+
+{ .mfi
+ ldfe FR_p_A14 = [GR_p_Table], 16
+ fma.s1 FR_r_TH = FR_r_A2H, FR_r_XS2, f0 // sin for neg
+(p11) cmp.eq p12, p0 = r0, r0 // set p12 (r0 == r0 always holds)
+}
+{ .mfi
+ ldfe FR_p_A6 = [GR_p_Table2], 16
+ fma.s1 FR_r_TL = FR_r_A2L, FR_r_XS2, f0 // sin for neg
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_p_A13 = [GR_p_Table], 16
+ fms.s1 FR_r_XS2L = FR_r_XS, FR_r_XS, FR_r_XS2 // x2Lo part
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp5H = FR_p_A5H, FR_p_XR, f0 // A5H*r
+ // 'Low poly'
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_XR2 = FR_p_XR, FR_p_XR, f0 // r^2 = r*r
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fabs FR_r_XS = FR_r_XS // abs(xs)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp2H = FR_p_A2H, FR_p_XR, f0 // A2H*r
+ // 'High poly'
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r_TT = FR_r_A2H, FR_r_XS2, FR_r_TH // sin for neg
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_ResH = FR_r_TH, f1, FR_r_A1H // sin for neg
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_TL = FR_r_A2H, FR_r_XS2L, FR_r_TL // sin for neg
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_Temp5L = FR_p_A5H,FR_p_XR,FR_p_Temp5H //A5H*r delta
+ // 'Low poly'
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly5H = FR_p_Temp5H, f1, FR_p_A4H // A5H*r+A4H
+ // 'Low poly'
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_Temp2L = FR_p_A2H, FR_p_XR, FR_p_Temp2H//A2H*r delta
+ //'High poly'
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly2H = FR_p_Temp2H, f1, FR_p_A1H // A2H*r+A1H
+ //'High poly'
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_XR3 = FR_p_XR2, FR_p_XR, f0 // r^3 = r^2*r
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_XR2L = FR_p_XR, FR_p_XR, FR_p_XR2 // r^2 delta
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A18 = FR_p_A19, FR_p_XR, FR_p_A18 // Poly tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A14 = FR_p_A15, FR_p_XR, FR_p_A14 // Poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_XR4 = FR_p_XR2, FR_p_XR2, f0 // r^4 = r^2*r^2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp5L = FR_p_A5L, FR_p_XR, FR_p_Temp5L// Low part
+ // of A5*r+A4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_Poly5L = FR_p_A4H, f1, FR_p_Poly5H // Low part
+ // of A5*r+A4
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp4H = FR_p_Poly5H, FR_p_XR, f0 // (A5H*r+A4H)*r
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp2L = FR_p_A2L, FR_p_XR, FR_p_Temp2L // A2*r low
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_Poly2L = FR_p_A1H, f1, FR_p_Poly2H // High poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp1H = FR_p_Poly2H, FR_p_XR, f0 // High poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_XR3L = FR_p_XR2, FR_p_XR, FR_p_XR3 // x^3 delta
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A16 = FR_p_A17, FR_p_XR, FR_p_A16 // Poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r_ResL = FR_r_A1H, f1, FR_r_ResH // sin for neg
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_TL = FR_r_TL, f1, FR_r_TT // sin for neg
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp5L = FR_p_Temp5L, f1, FR_p_A4L // Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly5L = FR_p_Poly5L, f1, FR_p_Temp5H // Low poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_Temp4L = FR_p_Poly5H,FR_p_XR,FR_p_Temp4H //Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly4H = FR_p_Temp4H, f1, FR_p_A3H // Low poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp2L = FR_p_Temp2L, f1, FR_p_A1L // High poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly2L = FR_p_Poly2L, f1, FR_p_Temp2H // High poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_Temp1L = FR_p_Poly2H,FR_p_XR,FR_p_Temp1H //High poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly1H = FR_p_Temp1H, f1, FR_p_A0H // High poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A12 = FR_p_A13, FR_p_XR, FR_p_A12 // Poly tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_XR3L = FR_p_XR2L, FR_p_XR, FR_p_XR3L // x^3 low
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly5L = FR_p_Poly5L, f1, FR_p_Temp5L // Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A10 = FR_p_A11, FR_p_XR, FR_p_A10 // Poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_Poly4L = FR_p_A3H, f1, FR_p_Poly4H // Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A6 = FR_p_A7, FR_p_XR, FR_p_A6 // Poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A8 = FR_p_A9, FR_p_XR, FR_p_A8 // Poly tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_XR6 = FR_p_XR4, FR_p_XR2, f0 // Poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly2L = FR_p_Poly2L, f1, FR_p_Temp2L // High poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_Poly1L = FR_p_A0H, f1, FR_p_Poly1H // High poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TH // sin for neg
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_TT = FR_r_TL, f1, FR_r_A1L // sin for neg
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp4L = FR_p_Poly5L,FR_p_XR,FR_p_Temp4L // Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A18 = FR_p_A20, FR_p_XR2, FR_p_A18 // Poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly4L = FR_p_Poly4L, f1, FR_p_Temp4H // Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A14 = FR_p_A16, FR_p_XR2, FR_p_A14 // Poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A6 = FR_p_A8, FR_p_XR2, FR_p_A6 // Poly tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A10 = FR_p_A12, FR_p_XR2, FR_p_A10 // Poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp1L = FR_p_Poly2L,FR_p_XR,FR_p_Temp1L //High poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly1L = FR_p_Poly1L, f1, FR_p_Temp1H // High poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TT // sin for neg
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_TH = FR_r_ResH, FR_r_XS2, f0 // sin for neg
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp4L = FR_p_Temp4L, f1, FR_p_A3L // Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly3H = FR_p_Poly4H, FR_p_XR3, f0 // Low poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A14 = FR_p_A18, FR_p_XR4, FR_p_A14 // Poly tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_XR8 = FR_p_XR4, FR_p_XR4, f0 // Poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_TL = FR_r_ResH, FR_r_XS2L, f0 // sin for neg
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp1L = FR_p_Temp1L, f1, FR_p_A0L // High poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A6 = FR_p_A10, FR_p_XR4, FR_p_A6 // Poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r_TT = FR_r_ResH, FR_r_XS2, FR_r_TH // sin for neg
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_Res3H = FR_r_TH, f1, f1 // sin for neg
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly4L = FR_p_Poly4L, f1, FR_p_Temp4L // Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly3L = FR_p_Poly4H, FR_p_XR3L, f0 // Low poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly0H = FR_p_Poly3H,f1,FR_p_Poly1H //Low & High add
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_A7 = FR_r_A8, FR_r_XS2, FR_r_A7 // sin for neg
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_TL = FR_r_ResL, FR_r_XS2, FR_r_TL // sin for neg
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_XS4 = FR_r_XS2, FR_r_XS2, f0 // sin for neg
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly1L = FR_p_Poly1L, f1, FR_p_Temp1L // High poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_PolyTail = FR_p_A14, FR_p_XR8, FR_p_A6 // Poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r_Res3L = f1, f1, FR_r_Res3H // sin for neg
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_ResH = FR_r_Res3H, FR_r_XS, f0 // sin for neg
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_Temp0L = FR_p_Poly4H,FR_p_XR3,FR_p_Poly3H //Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly3L = FR_p_Poly4L,FR_p_XR3,FR_p_Poly3L //Low poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_Poly0L = FR_p_Poly1H,f1,FR_p_Poly0H //Low & High add
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_p_OddPoly0H = FR_p_Poly0H, FR_p_AbsXM1, f0
+ // Recurrent computations - multiplying by X-1
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_TL = FR_r_TL, f1, FR_r_TT // sin for neg
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_A3 = FR_r_A4, FR_r_XS2, FR_r_A3 // sin for neg
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly1L = FR_p_PolyTail,FR_p_XR6,FR_p_Poly1L//High
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_A5 = FR_r_A6, FR_r_XS2, FR_r_A5 // sin for neg
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_Res3L = FR_r_Res3L, f1, FR_r_TH // sin for neg
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r_ResL = FR_r_Res3H, FR_r_XS, FR_r_ResH//sin for neg
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly3L = FR_p_Poly3L, f1, FR_p_Temp0L // Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_A7 = FR_r_A9, FR_r_XS4, FR_r_A7 // sin for neg
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly0L = FR_p_Poly0L,f1,FR_p_Poly3H //Low & High add
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p13) fms.s1 FR_p_OddPoly0L = FR_p_Poly0H, FR_p_AbsXM1, FR_p_OddPoly0H
+ // Recurrent computations - multiplying by X-1 (low part)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_A3 = FR_r_A5, FR_r_XS4, FR_r_A3 // sin for neg
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_XS7 = FR_r_XS4, FR_r_XS2, f0 // xs^6
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_Res3L = FR_r_Res3L, f1, FR_r_TL // sin for neg
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_XS8 = FR_r_XS4, FR_r_XS4, f0 // sin for neg
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp0H = FR_p_Poly3L,f1,FR_p_Poly1L //Low & High add
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_XS7 = FR_r_XS7, FR_r_XS, f0 // xs^7
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_ResL = FR_r_Res3L, FR_r_XS, FR_r_ResL//sin for neg
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_Tail = FR_r_A7, FR_r_XS8, FR_r_A3 // sin tail res
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly0L = FR_p_Poly0L,f1,FR_p_Temp0H //Low & High add
+ nop.i 0
+};;
+
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_ResL = FR_r_Tail,FR_r_XS7,FR_r_ResL //sin for neg
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_p_OddPoly0L = FR_p_Poly0L, FR_p_AbsXM1, FR_p_OddPoly0L
+ // Recurrent computations - multiplying by X-1 (low part)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_TT = FR_r_ResL, FR_r_AbsX, f0 // X*sin
+ nop.i 0
+};;
+
+.pred.rel "mutex",p12,p13
+{ .mfi
+ nop.m 0
+(p12) fma.s0 f8 = FR_p_Poly0H, f1, FR_p_Poly0L // Even
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p13) fma.s0 f8 = FR_p_OddPoly0H, f1, FR_p_OddPoly0L // Odd
+(p14) br.ret.spnt b0 // Exit for 1 <= |X| < 13 path (positive arguments)/////
+};;
+
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_p_Poly0H = FR_p_OddPoly0H, f1, f0
+ // Recurrent computations
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p13) fma.s1 FR_p_Poly0L = FR_p_OddPoly0L, f1, f0
+ // Recurrent computations
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_Res1H = FR_r_ResH, FR_r_AbsX, FR_r_TT // X*sin
+(p11) cmp.eq p13, p12 = r0, r0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r_Res1L = FR_r_ResH,FR_r_AbsX,FR_r_Res1H// X*sin
+(p9) cmp.eq p13, p12 = r0, r0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_Res1L = FR_r_Res1L, f1, FR_r_TT // sin for neg
+(p10) cmp.eq p13, p12 = r0, r0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_TL = FR_p_Poly0L, FR_r_Res1H, f0 // mult by sin
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_TL = FR_p_Poly0H,FR_r_Res1L,FR_r_TL//mult by sin
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_ResH = FR_p_Poly0H,FR_r_Res1H,FR_r_TL//mult by sin
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r_ResL = FR_p_Poly0H,FR_r_Res1H,FR_r_ResH//sin mult
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ frcpa.s1 FR_r_Y0,p0 = f1,FR_r_ResH // y = frcpa(b)
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fneg FR_r_NegOne = f1 // Form -1.0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TL //Low result of mult
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_Q0 = f1,FR_r_Y0,f0 // q = a*y
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_r_E0 = FR_r_Y0,FR_r_ResH,f1 // e = 1-b*y
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_E2 = FR_r_E0,FR_r_E0,FR_r_E0 // e2 = e+e^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_E1 = FR_r_E0,FR_r_E0,f0 // e1 = e^2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_Y1 = FR_r_Y0,FR_r_E2,FR_r_Y0 // y1 = y+y*e2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_E3 = FR_r_E1,FR_r_E1,FR_r_E0 // e3 = e+e1^2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_Y2 = FR_r_Y1,FR_r_E3,FR_r_Y0 // y2 = y+y1*e3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_r_R0 = FR_r_ResH,FR_r_Q0,f1 // r = a-b*q
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_r_E4 = FR_r_ResH,FR_r_Y2,f1 // e4 = 1-b*y2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_ZH = FR_r_R0,FR_r_Y2,FR_r_Q0 // x = q+r*y2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_Y3 = FR_r_Y2,FR_r_E4,FR_r_Y2 // y3 = y2+y2*e4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_r_R1 = FR_r_ResH,FR_r_ZH,f1 // r1 = a-b*x
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_r_R1 = FR_r_ResL,FR_r_ZH,FR_r_R1 // r1=r1-b_lo*X
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+(p12) fma.s1 FR_r_ZHN = FR_r_ZH,FR_r_NegOne, f0 // Negate for evens
+ nop.i 0
+};;
+
+.pred.rel "mutex",p13,p12
+{ .mfi
+ nop.m 0
+(p13) fma.s0 f8 = FR_r_R1,FR_r_Y3,FR_r_ZH // Final result
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p12) fnma.s0 f8 = FR_r_R1,FR_r_Y3,FR_r_ZHN // Final result
+ br.ret.sptk b0 // Exit for 1 <= |X| < 13 path (negative arguments)//////
+};;
+
+
+//////////// |X| < 1 path: classify |x| into 4 intervals, address tables, form r
+//------------------------------------------------------------------------------
+.align 64
+tgamma_lt_1:
+{ .mfi
+ getf.exp GR_p_Exp = FR_p_AbsX // biased exponent of |x| (bias removed below)
+ fma.s1 FR_z_Q0 = f1,FR_z_Y0,f0 // q = a*y
+ add GR_r_sin_Table2= 0x50, GR_r_sin_Table // second pointer, 0x50 past the first
+}
+{ .mfi
+ ldfpd FR_p_0p5, FR_p_1p5 = [GR_c_Table], 16 // load pair, advance pointer by 16
+ fnma.s1 FR_z_E0 = FR_z_Y0,f8,f1 // e = 1-b*y
+ add GR_p_Table2 = 0xB0, GR_p_Table // second pointer, 0xB0 past the first
+};;
+
+{ .mfi
+ ldfd FR_p_0p25 = [GR_c_Table]
+ fcvt.xf FR_r_XNS = FR_r_IXNS // Convert int repr to float
+ shr.u GR_p_X_Sgnd = GR_p_X_Sgnd, 60
+ // Keep only the 4 highest bits of the significand
+}
+{ .mfi
+ nop.m 0
+ nop.f 0
+ add GR_p_Bias = 0xffff, r0 // Exponent bias
+};;
+
+{ .mfi
+ ldfpd FR_r_A2H, FR_r_A2L = [GR_r_sin_Table], 16
+ nop.f 0
+ shl GR_p_XN = GR_p_Exp, 4
+ // Shift exp left by 4 bits to make room for the significand bits
+}
+{ .mlx
+ ldfe FR_r_A6 = [GR_r_sin_Table2], 16
+ movl GR_p_0p75 = 0xfffec // 0.75 in the same (exp << 4) | 4-bit layout
+};;
+
+{ .mfi
+ ldfpd FR_r_A1H, FR_r_A1L = [GR_r_sin_Table], 16
+ nop.f 0
+ or GR_p_XN = GR_p_XN, GR_p_X_Sgnd
+ // Combine exp with 4 high bits of significand
+}
+{ .mfi
+ ldfe FR_r_A5 = [GR_r_sin_Table2], 16
+ nop.f 0
+ sub GR_p_Exp = GR_p_Exp, GR_p_Bias // Unbiased exp
+};;
+
+{ .mmi
+ ldfe FR_r_A9 = [GR_r_sin_Table], 16
+ ldfe FR_r_A4 = [GR_r_sin_Table2], 16
+ cmp.gtu.unc p10, p11 = GR_p_0p75, GR_p_XN // p10: |x| < 0.75, p11: |x| >= 0.75
+};;
+
+{ .mfi
+ ldfe FR_r_A8 = [GR_r_sin_Table], 16
+ fma.s1 FR_z_E2 = FR_z_E0,FR_z_E0,FR_z_E0 // e2 = e+e^2
+(p10) cmp.gt.unc p9, p10 = -2, GR_p_Exp // p9: |x| < 0.25, p10: otherwise
+}
+{ .mfi
+ ldfe FR_r_A3 = [GR_r_sin_Table2], 16
+ fma.s1 FR_z_E1 = FR_z_E0,FR_z_E0,f0 // e1 = e^2
+(p11) add GR_p_Offset = 168, r0 // [0.75;1] interval
+};;
+
+{ .mmi
+(p10) add GR_p_Offset = 147, r0 // [0.25;0.75] interval
+ ldfe FR_r_A7 = [GR_r_sin_Table], 16
+(p9) cmp.gt.unc p8, p9 = -3, GR_p_Exp // p8: |x| < 0.125, p9: otherwise
+};;
+
+.pred.rel "mutex",p9,p8
+{ .mmi
+(p9) add GR_p_Offset = 126, r0 // [0.125;0.25] interval
+(p8) add GR_p_Offset = 189, r0 // [0.;0.125] interval
+ nop.i 0
+};;
+
+{ .mmf
+ shladd GR_p_Table = GR_p_Offset, 4, GR_p_Table // Table += Offset*16
+ shladd GR_p_Table2 = GR_p_Offset, 4, GR_p_Table2 // Table2 += Offset*16
+ fma.s1 FR_r_XS = FR_r_AbsX , f1, FR_r_XNS // xs = |x|-[x]
+};;
+
+.pred.rel "mutex",p8,p11
+{ .mfi
+ ldfpd FR_p_A5H, FR_p_A5L = [GR_p_Table], 16
+(p11) fms.s1 FR_p_XR = f1, f1, FR_p_AbsX // r = 1 - |x|
+ // for [0.75;1] interval
+ nop.i 0
+}
+{ .mfi
+ ldfpd FR_p_A2H, FR_p_A2L = [GR_p_Table2], 16
+(p8) fms.s1 FR_p_XR = FR_p_AbsX, f1, f0 // r = |x|
+ // for [0.;0.125] interval
+ nop.i 0
+};;
+
+{ .mfi
+ ldfpd FR_p_A4H, FR_p_A4L = [GR_p_Table], 16
+ fma.s1 FR_z_Y1 = FR_z_Y0,FR_z_E2,FR_z_Y0 // y1 = y+y*e2
+ nop.i 0
+}
+{ .mfi
+ ldfpd FR_p_A1H, FR_p_A1L = [GR_p_Table2], 16
+ fma.s1 FR_z_E3 = FR_z_E1,FR_z_E1,FR_z_E0 // e3 = e+e1^2
+ nop.i 0
+};;
+
+.pred.rel "mutex",p9,p10
+{ .mfi
+ ldfpd FR_p_A3H, FR_p_A3L = [GR_p_Table], 16
+(p9) fms.s1 FR_p_XR = FR_p_AbsX, f1, f0 // r = |x|
+ // for [0.125;0.25] interval
+ nop.i 0
+}
+{ .mfi
+ ldfpd FR_p_A0H, FR_p_A0L = [GR_p_Table2], 16
+(p10) fms.s1 FR_p_XR = FR_p_AbsX, f1, FR_p_0p5 // r = |x| - 0.5
+ // for [0.25;0.75] interval
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe FR_p_A20 = [GR_p_Table], 16
+ ldfe FR_p_A12 = [GR_p_Table2], 16
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_p_A19 = [GR_p_Table], 16
+ fma.s1 FR_r_XS2 = FR_r_XS, FR_r_XS, f0 // xs^2
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_p_A11 = [GR_p_Table2], 16
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe FR_p_A18 = [GR_p_Table], 16
+ ldfe FR_p_A10 = [GR_p_Table2], 16
+ nop.i 0
+};;
+
+.pred.rel "mutex",p12,p13
+{ .mfi
+ ldfe FR_p_A17 = [GR_p_Table], 16
+ fma.s1 FR_z_Y2 = FR_z_Y1,FR_z_E3,FR_z_Y0 // y2 = y+y1*e3
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_p_A9 = [GR_p_Table2], 16
+ fnma.s1 FR_z_R0 = f8,FR_z_Q0,f1 // r = a-b*q
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe FR_p_A16 = [GR_p_Table], 16
+ ldfe FR_p_A8 = [GR_p_Table2], 16
+ nop.i 0
+};;
+
+{ .mmi
+ ldfe FR_p_A15 = [GR_p_Table], 16
+ ldfe FR_p_A7 = [GR_p_Table2], 16
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_p_A14 = [GR_p_Table], 16
+ fma.s1 FR_r_TH = FR_r_A2H, FR_r_XS2, f0 // neg sin
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_p_A6 = [GR_p_Table2], 16
+ fma.s1 FR_r_TL = FR_r_A2L, FR_r_XS2, f0 // neg sin
+ nop.i 0
+};;
+
+{ .mfi
+ ldfe FR_p_A13 = [GR_p_Table], 16
+ fms.s1 FR_r_XS2L = FR_r_XS, FR_r_XS, FR_r_XS2 // xs^2 delta
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp5H = FR_p_A5H, FR_p_XR, f0 // Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_XR2 = FR_p_XR, FR_p_XR, f0 // poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fabs FR_r_XS = FR_r_XS // Absolute value of xs
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp2H = FR_p_A2H, FR_p_XR, f0 // High poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_z_E4 = f8,FR_z_Y2,f1 // e4 = 1-b*y2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_z_ZH = FR_z_R0,FR_z_Y2,FR_z_Q0 // 1/x = q+r*y2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r_TT = FR_r_A2H, FR_r_XS2, FR_r_TH // neg sin
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_ResH = FR_r_TH, f1, FR_r_A1H // neg sin
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_TL = FR_r_A2H, FR_r_XS2L, FR_r_TL // neg sin
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_Temp5L = FR_p_A5H, FR_p_XR, FR_p_Temp5H // Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly5H = FR_p_Temp5H, f1, FR_p_A4H // Low poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_Temp2L = FR_p_A2H, FR_p_XR, FR_p_Temp2H // High poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly2H = FR_p_Temp2H, f1, FR_p_A1H // High poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_XR3 = FR_p_XR2, FR_p_XR, f0 // r^3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_XR2L = FR_p_XR, FR_p_XR, FR_p_XR2 // r^2 delta
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A18 = FR_p_A19, FR_p_XR, FR_p_A18 // poly tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A14 = FR_p_A15, FR_p_XR, FR_p_A14 // poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_XR4 = FR_p_XR2, FR_p_XR2, f0 // poly tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_z_Y3 = FR_z_Y2,FR_z_E4,FR_z_Y2 // y3 = y2+y2*e4
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp5L = FR_p_A5L, FR_p_XR, FR_p_Temp5L // Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_Poly5L = FR_p_A4H, f1, FR_p_Poly5H // Low poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp4H = FR_p_Poly5H, FR_p_XR, f0 // Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp2L = FR_p_A2L, FR_p_XR, FR_p_Temp2L // High poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_Poly2L = FR_p_A1H, f1, FR_p_Poly2H // High poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp1H = FR_p_Poly2H, FR_p_XR, f0 // High poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_XR3L = FR_p_XR2, FR_p_XR, FR_p_XR3 // x^3 delta
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A16 = FR_p_A17, FR_p_XR, FR_p_A16 //poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r_ResL = FR_r_A1H, f1, FR_r_ResH // neg sin
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_TL = FR_r_TL, f1, FR_r_TT // neg sin
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp5L = FR_p_Temp5L, f1, FR_p_A4L // Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly5L = FR_p_Poly5L, f1, FR_p_Temp5H //Low poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_Temp4L = FR_p_Poly5H, FR_p_XR, FR_p_Temp4H//Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly4H = FR_p_Temp4H, f1, FR_p_A3H // Low poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp2L = FR_p_Temp2L, f1, FR_p_A1L // High poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly2L = FR_p_Poly2L, f1, FR_p_Temp2H // High poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_Temp1L = FR_p_Poly2H,FR_p_XR,FR_p_Temp1H //High poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly1H = FR_p_Temp1H, f1, FR_p_A0H // High poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A12 = FR_p_A13, FR_p_XR, FR_p_A12 // poly tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_XR3L = FR_p_XR2L, FR_p_XR, FR_p_XR3L // x^3 low
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly5L = FR_p_Poly5L, f1, FR_p_Temp5L //Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A10 = FR_p_A11, FR_p_XR, FR_p_A10 //poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_Poly4L = FR_p_A3H, f1, FR_p_Poly4H /// Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A6 = FR_p_A7, FR_p_XR, FR_p_A6 // poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A8 = FR_p_A9, FR_p_XR, FR_p_A8 // poly tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_XR6 = FR_p_XR4, FR_p_XR2, f0 // r^6
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly2L = FR_p_Poly2L, f1, FR_p_Temp2L // High poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_Poly1L = FR_p_A0H, f1, FR_p_Poly1H // High poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TH // neg sin
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_TT = FR_r_TL, f1, FR_r_A1L // neg sin
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp4L = FR_p_Poly5L,FR_p_XR,FR_p_Temp4L //Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A18 = FR_p_A20, FR_p_XR2, FR_p_A18 // poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly4L = FR_p_Poly4L, f1, FR_p_Temp4H // Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A14 = FR_p_A16, FR_p_XR2, FR_p_A14 // poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A6 = FR_p_A8, FR_p_XR2, FR_p_A6 // poly tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A10 = FR_p_A12, FR_p_XR2, FR_p_A10 // poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp1L = FR_p_Poly2L,FR_p_XR,FR_p_Temp1L //High poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly1L = FR_p_Poly1L, f1, FR_p_Temp1H // High poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TT // neg sin
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_TH = FR_r_ResH, FR_r_XS2, f0 // neg sin
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp4L = FR_p_Temp4L, f1, FR_p_A3L // Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly3H = FR_p_Poly4H, FR_p_XR3, f0 // Low poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A14 = FR_p_A18, FR_p_XR4, FR_p_A14 // poly tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_XR8 = FR_p_XR4, FR_p_XR4, f0 // r^8
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_TL = FR_r_ResH, FR_r_XS2L, f0 // neg sin
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_z_R1 = f8,FR_z_ZH,f1 // r1 = a-b*x
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp1L = FR_p_Temp1L, f1, FR_p_A0L // High poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_A6 = FR_p_A10, FR_p_XR4, FR_p_A6 // poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r_TT = FR_r_ResH, FR_r_XS2, FR_r_TH // neg sin
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_Res3H = FR_r_TH, f1, f1 // neg sin
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly4L = FR_p_Poly4L, f1, FR_p_Temp4L // Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly3L = FR_p_Poly4H, FR_p_XR3L, f0 // Low poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly0H = FR_p_Poly3H, f1, FR_p_Poly1H // Result
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_A7 = FR_r_A8, FR_r_XS2, FR_r_A7 // neg sin
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_TL = FR_r_ResL, FR_r_XS2, FR_r_TL // neg sin
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_XS4 = FR_r_XS2, FR_r_XS2, f0 // xs^4
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly1L = FR_p_Poly1L, f1, FR_p_Temp1L // High poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_PolyTail = FR_p_A14, FR_p_XR8, FR_p_A6 // poly tail
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r_Res3L = f1, f1, FR_r_Res3H // neg sin
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_ResH = FR_r_Res3H, FR_r_XS, f0 // neg sin
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_Temp0L = FR_p_Poly4H,FR_p_XR3,FR_p_Poly3H //Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly3L = FR_p_Poly4L,FR_p_XR3,FR_p_Poly3L //Low poly
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_p_Poly0L = FR_p_Poly1H, f1, FR_p_Poly0H // Result
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_z_ZL = FR_z_R1,FR_z_Y3, f0 // x_lo = r1*y3
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_TL = FR_r_TL, f1, FR_r_TT // neg sin
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_A3 = FR_r_A4, FR_r_XS2, FR_r_A3 /// neg sin
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly1L = FR_p_PolyTail,FR_p_XR6,FR_p_Poly1L // High
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_A5 = FR_r_A6, FR_r_XS2, FR_r_A5 // neg sin
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_Res3L = FR_r_Res3L, f1, FR_r_TH // neg sin
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r_ResL = FR_r_Res3H, FR_r_XS, FR_r_ResH // neg sin
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly3L = FR_p_Poly3L, f1, FR_p_Temp0L // Low poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_A7 = FR_r_A9, FR_r_XS4, FR_r_A7 // neg sin
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly0L = FR_p_Poly0L, f1, FR_p_Poly3H // result
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p14) fma.s1 f8 = FR_p_Poly0H, FR_z_ZH, f0 // z*poly
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp1L = FR_p_Poly0H, FR_z_ZL, f0 // z*poly low
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_A3 = FR_r_A5, FR_r_XS4, FR_r_A3 // sin tail
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_XS7 = FR_r_XS4, FR_r_XS2, f0 // xs^6
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_Res3L = FR_r_Res3L, f1, FR_r_TL // sin low
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_XS8 = FR_r_XS4, FR_r_XS4, f0 // xs^8
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Temp0H = FR_p_Poly3L, f1, FR_p_Poly1L // result
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p14) fms.s1 FR_p_Temp1H = FR_p_Poly0H, FR_z_ZH, f8 // hi result
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_XS7 = FR_r_XS7, FR_r_XS, f0 // xs^7
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_ResL = FR_r_Res3L, FR_r_XS, FR_r_ResL // lo result
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_Tail = FR_r_A7, FR_r_XS8, FR_r_A3 // tail result
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_p_Poly0L = FR_p_Poly0L, f1, FR_p_Temp0H // lo result
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_ResL = FR_r_Tail, FR_r_XS7, FR_r_ResL // lo result
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_p_Temp1L = FR_p_Poly0L,FR_z_ZH,FR_p_Temp1L //hi result
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_TT = FR_r_ResL, f1, f0 // for low result
+ nop.i 0
+};;
+
+.pred.rel "mutex",p12,p13
+{ .mfi
+ nop.m 0
+(p14) fma.s1 FR_p_Temp1L = FR_p_Temp1L, f1, FR_p_Temp1H // lo += Temp1H
+ nop.i 0
+};;
+
+{ .mfi
+(p10) cmp.eq p13, p12 = r0, r0 // set p13, clear p12 (NOTE(review): no p12/p13 use is visible below on this path)
+ fma.s1 FR_r_Res1H = FR_r_ResH, f1, FR_r_TT // sin hi = ResH + TT
+ nop.i 0
+};;
+
+{ .mfb
+(p9) cmp.eq p13, p12 = r0, r0 // set p13, clear p12
+(p14) fma.s0 f8 = f8, f1, FR_p_Temp1L // Final result = hi + lo
+(p14) br.ret.spnt b0 // Exit for 0 < |X| < 1 path (positive arguments)///////
+};;
+
+{ .mfi
+(p11) cmp.eq p13, p12 = r0, r0 // set p13, clear p12
+ fms.s1 FR_r_Res1L = FR_r_ResH, f1, FR_r_Res1H // ResH - sin hi
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_Res1L = FR_r_Res1L, f1, FR_r_TT // sin lo = (ResH - sin hi) + TT
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_TL = FR_p_Poly0L,FR_r_Res1H,f0 // poly lo * sin hi
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_TL = FR_p_Poly0H, FR_r_Res1L, FR_r_TL // + poly hi * sin lo
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_ResH = FR_p_Poly0H, FR_r_Res1H, FR_r_TL // b hi = poly hi*sin hi + TL
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r_ResL = FR_p_Poly0H,FR_r_Res1H,FR_r_ResH // poly hi*sin hi - b hi
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ frcpa.s1 FR_r_Y0,p0 = f1,FR_r_ResH // y = frcpa(b), b = b hi
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fneg FR_r_NegOne = f1 // Construct -1.0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_ResL = FR_r_ResL, f1, FR_r_TL // b lo = (poly hi*sin hi - b hi) + TL
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_Q0 = f1,FR_r_Y0,f0 // q = a*y
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_r_E0 = FR_r_Y0,FR_r_ResH,f1 // e = 1-b*y
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_E2 = FR_r_E0,FR_r_E0,FR_r_E0 // e2 = e+e^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_E1 = FR_r_E0,FR_r_E0,f0 // e1 = e^2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_Y1 = FR_r_Y0,FR_r_E2,FR_r_Y0 // y1 = y+y*e2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_E3 = FR_r_E1,FR_r_E1,FR_r_E0 // e3 = e+e1^2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_Y2 = FR_r_Y1,FR_r_E3,FR_r_Y0 // y2 = y+y1*e3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_r_R0 = FR_r_ResH,FR_r_Q0,f1 // r = a-b*q
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_r_E4 = FR_r_ResH,FR_r_Y2,f1 // e4 = 1-b*y2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_ZH = FR_r_R0,FR_r_Y2,FR_r_Q0 // x = q+r*y2
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_Y3 = FR_r_Y2,FR_r_E4,FR_r_Y2 // y3 = y2+y2*e4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_r_R1 = FR_r_ResH,FR_r_ZH,f1 // r1 = a-b*x
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_r_R1 = FR_r_ResL,FR_r_ZH,FR_r_R1 // r1=r1 - b_lo*X
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_r_ZHN = FR_r_ZH,FR_r_NegOne, f0 // ZHN = -x (negated quotient)
+ nop.i 0
+};;
+
+.pred.rel "mutex",p13,p12
+{ .mfb
+ nop.m 0
+ fnma.s0 f8 = FR_r_R1,FR_r_Y3,FR_r_ZHN // f8 = -(x + r1*y3), result for neg
+ br.ret.sptk b0 // Exit for 0 < |X| < 1 path (negative arguments)//////
+};;
+
+
+
+
+// SPECIALS (x is natval, nan, +/-inf, +/-0 or a denormal) /////////////////////
+//------------------------------------------------------------------------------
+.align 32
+tgammal_spec:
+{ .mlx
+ nop.m 0
+ movl GR_DenOverflow = 0x2000000000000001 // denormal significand threshold
+}
+{ .mfi
+ nop.m 0
+ fclass.m p9,p0 = f8,0xB // +/-denormals
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fclass.m p6,p0 = f8,0x1E1 // Test x for natval, nan, +inf
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fclass.m p7,p8 = f8,0x7 // p7: x is +/-0, p8: anything else
+ nop.i 0
+}
+
+{ .mfi
+(p9) cmp.ltu.unc p10,p11 = GR_l_signif_Z, GR_DenOverflow // p10: below threshold
+(p9) fnorm.s0 f8 = f8 // normalize the denormal argument
+ nop.i 0
+};;
+
+{ .mfb
+ nop.m 0
+(p9) fcvt.fx.trunc.s1 FR_n_IXN = FR_l_AbsX // Round by truncate
+(p11) br.cond.sptk tgamma_lt_1 // Return to gamma ('good' denormal)////////////
+};;
+
+{ .mfb
+ nop.m 0
+ nop.f 0
+(p10) br.cond.spnt tgammal_overflow // "Bad" denormal - overflow! /////////////
+};;
+
+{ .mfi
+ nop.m 0
+ mov FR_X = f8 // for error handler
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p6) fma.s0 f8 = f8,f1,f8 // res = x + x
+(p6) br.ret.spnt b0 // Exit for NAN, +INF and NatVals ///////////////////////
+};;
+.pred.rel "mutex",p7,p8
+{ .mfi
+(p7) mov GR_Parameter_TAG = 256 // error tag used when x is +/-0
+(p7) frcpa.s0 f8,p0 = f1,f8 // f8 = 1/(+/-0); NOTE(review): expected to raise Z, not V - confirm
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ nop.f 0
+(p8) br.cond.spnt tgammal_singularity // p8: x is not +/-0 (presumably -INF) //
+};;
+
+{ .mfb
+ nop.m 0
+ nop.f 0
+ br.cond.spnt tgammal_libm_err // Reached only when x is +/-0 (p7) ////////
+};;
+
+
+
+
+// SINGULARITY (x is negative integer or 0) ////////////////////////////////////
+//------------------------------------------------------------------------------
+.align 32
+tgammal_singularity:
+{ .mfi
+ nop.m 0
+ mov FR_X = f8 // For error handler
+ mov GR_Parameter_TAG = 256 // error tag passed on to tgammal_libm_err
+}
+{ .mfb
+ nop.m 0
+ frcpa.s0 f8,p0 = f0,f0 // 0/0: raise V flag, result replaces f8
+ br.cond.sptk tgammal_libm_err // Call error handler /////////////////////
+ // with singularity error /////////////////
+};;
+
+
+
+
+// OVERFLOW (result is too big and cannot be represented by normal value) //////
+// ( X > 1755.54 and for denormals with abs value less than 0x2000000000000001 )
+//------------------------------------------------------------------------------
+.align 32
+tgammal_overflow:
+{ .mfi
+ addl r8 = 0x1FFFE, r0 // Largest finite biased exponent (not INF itself)
+ fcmp.lt.s1 p15,p14 = f8,f0 // p14 - pos arg, p15 - neg arg
+ nop.i 0
+};;
+
+{ .mfi
+ setf.exp f9 = r8 // f9 = huge finite value, squared below to overflow
+ mov FR_X = f8 // For error handler
+ mov GR_Parameter_TAG = 255 // overflow
+};;
+
+.pred.rel "mutex",p14,p15
+{ .mfi
+ nop.m 0
+(p14) fma.s0 f8 = f9,f9,f0 // Set I,O and +INF result
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p15) fnma.s0 f8 = f9,f9,f0 // Set I,O and -INF result
+ br.cond.sptk tgammal_libm_err // Call error handler /////////////////////
+ // with overflow error ////////////////////
+};;
+
+
+
+
+
+// UNDERFLOW (x is negative noninteger with big absolute value) ////////////////
+//------------------------------------------------------------------------------
+.align 32
+tgammal_underflow:
+{ .mfi
+ nop.m 0
+ fcvt.fx.trunc.s1 FR_u_IXN = f8 // Convert arg to int repres. in FR
+ nop.i 0
+};;
+
+{ .mmi
+ getf.sig GR_u_XN = FR_u_IXN // truncated integer part of x, in a GR
+ mov r11 = 0x00001 // smallest nonzero biased exponent
+ nop.i 0
+};;
+
+{ .mfi
+ setf.exp f9 = r11 // f9 = tiny positive value
+ nop.f 0
+ nop.i 0
+};;
+
+{ .mfi
+ nop.m 0
+ nop.f 0
+ tbit.z p6,p7 = GR_u_XN,0 // p6: integer part even, p7: odd
+};;
+
+.pred.rel "mutex",p6,p7
+{ .mfi
+ nop.m 0
+(p6) fms.s0 f8 = f9,f9,f9 // even: f9*f9 - f9, tiny negative result
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+(p7) fma.s0 f8 = f9,f9,f9 // odd: f9*f9 + f9, tiny positive result
+ br.ret.sptk b0 // Exit for underflow path //////////////////////////////
+};;
+
+
+GLOBAL_LIBM_END(tgammal)
+
+
+
+////////////////// Tgammal error handler: calls __libm_error_support ///////////
+//------------------------------------------------------------------------------
+LOCAL_LIBM_ENTRY(__libm_error_region)
+tgammal_libm_err:
+.prologue
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address (entry sp - 32)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Allocate a 64-byte frame
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+{ .mmi
+ stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack, advance by 16
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+.body
+{ .mib
+ stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // slot just after Parameter 2
+ nop.b 0 // Parameter 3 address
+}
+{ .mib
+ stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // point back at Parameter 2
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ nop.m 999
+ nop.m 999
+ add GR_Parameter_RESULT = 48,sp // result slot is at sp+48
+};;
+{ .mmi
+ ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region#)
+
+.type __libm_error_support#,@function
+.global __libm_error_support#
diff --git a/sysdeps/mips/Makefile b/sysdeps/mips/Makefile
index 849785a550..49ad3e1b91 100644
--- a/sysdeps/mips/Makefile
+++ b/sysdeps/mips/Makefile
@@ -6,3 +6,7 @@ endif
ifeq ($(subdir),setjmp)
sysdep_routines += setjmp_aux
endif
+
+ifeq ($(subdir),rt)
+librt-sysdep_routines += rt-sysdep
+endif
diff --git a/sysdeps/unix/alarm.c b/sysdeps/unix/alarm.c
index ae77782c54..84ab5a52fe 100644
--- a/sysdeps/unix/alarm.c
+++ b/sysdeps/unix/alarm.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1991,92,94,97,2002 Free Software Foundation, Inc.
+/* Copyright (C) 1991,1992,1994,1997,2002,2004 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -41,7 +41,10 @@ alarm (seconds)
return 0;
retval = old.it_value.tv_sec;
- if (old.it_value.tv_usec)
+ /* Round to the nearest second, but never report zero seconds when
+ the alarm is still set. */
+ if (old.it_value.tv_usec >= 500000
+ || (retval == 0 && old.it_value.tv_usec > 0))
++retval;
return retval;
}
diff --git a/sysdeps/unix/mips/rt-sysdep.S b/sysdeps/unix/mips/rt-sysdep.S
new file mode 100644
index 0000000000..f966bf1e59
--- /dev/null
+++ b/sysdeps/unix/mips/rt-sysdep.S
@@ -0,0 +1 @@
+#include <sysdep.S>
diff --git a/sysdeps/unix/sysv/linux/bits/waitflags.h b/sysdeps/unix/sysv/linux/bits/waitflags.h
index e3f80f6814..464cedb1fc 100644
--- a/sysdeps/unix/sysv/linux/bits/waitflags.h
+++ b/sysdeps/unix/sysv/linux/bits/waitflags.h
@@ -1,5 +1,5 @@
/* Definitions of flag bits for `waitpid' et al.
- Copyright (C) 1992, 1996, 1997, 2000, 2004 Free Software Foundation, Inc.
+ Copyright (C) 1992,1996,1997,2000,2004,2005 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -32,5 +32,7 @@
#define WCONTINUED 8 /* Report continued child. */
#define WNOWAIT 0x01000000 /* Don't reap, just poll status. */
+#define __WNOTHREAD 0x20000000 /* Don't wait on children of other threads
+ in this group */
#define __WALL 0x40000000 /* Wait for any child. */
#define __WCLONE 0x80000000 /* Wait for cloned process. */
diff --git a/sysdeps/unix/sysv/linux/i386/clone.S b/sysdeps/unix/sysv/linux/i386/clone.S
index acd43dfb0b..c7d31f7a32 100644
--- a/sysdeps/unix/sysv/linux/i386/clone.S
+++ b/sysdeps/unix/sysv/linux/i386/clone.S
@@ -67,7 +67,7 @@ ENTRY (BP_SYM (__clone))
/* Insert the argument onto the new stack. Make sure the new
thread is started with an alignment of (mod 16). */
andl $0xfffffff0, %ecx
- subl $24,%ecx
+ subl $28,%ecx
movl ARG(%esp),%eax /* no negative argument counts */
movl %eax,12(%ecx)
diff --git a/sysdeps/unix/sysv/linux/init-first.c b/sysdeps/unix/sysv/linux/init-first.c
index f00271255d..7f0b963cb7 100644
--- a/sysdeps/unix/sysv/linux/init-first.c
+++ b/sysdeps/unix/sysv/linux/init-first.c
@@ -1,5 +1,5 @@
/* Initialization code run first thing by the ELF startup code. Linux version.
- Copyright (C) 1995-1999,2000,01,02,03,2004 Free Software Foundation, Inc.
+ Copyright (C) 1995-2004, 2005 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -29,9 +29,6 @@
#include <ldsodefs.h>
-/* The function is called from assembly stubs the compiler can't see. */
-static void init (int, char **, char **) __attribute__ ((used));
-
/* Set nonzero if we have to be prepared for more then one libc being
used in the process. Safe assumption if initializer never runs. */
int __libc_multiple_libcs attribute_hidden = 1;
@@ -42,9 +39,19 @@ int __libc_argc attribute_hidden;
char **__libc_argv attribute_hidden;
-static void
-init (int argc, char **argv, char **envp)
+void
+attribute_hidden
+__libc_init_first (int argc, char **argv, char **envp)
+{
+#ifdef SHARED
+ /* For DSOs __libc_init_first does nothing; the work is done in _init. */
+}
+
+void
+attribute_hidden
+_init (int argc, char **argv, char **envp)
{
+#endif
#ifdef USE_NONOPTION_FLAGS
extern void __getopt_clean_environment (char **);
#endif
@@ -88,27 +95,6 @@ init (int argc, char **argv, char **envp)
#endif
}
-#ifdef SHARED
-
-strong_alias (init, _init);
-
-extern void __libc_init_first (void);
-
-void
-__libc_init_first (void)
-{
-}
-
-#else
-extern void __libc_init_first (int argc, char **argv, char **envp);
-
-void
-__libc_init_first (int argc, char **argv, char **envp)
-{
- init (argc, argv, envp);
-}
-#endif
-
/* This function is defined here so that if this file ever gets into
ld.so we will get a link error. Having this file silently included
diff --git a/version.h b/version.h
index 4eeb1cceb3..aec3a8f2b1 100644
--- a/version.h
+++ b/version.h
@@ -1,4 +1,4 @@
/* This file just defines the current version number of libc. */
-#define RELEASE "stable"
-#define VERSION "2.3.4"
+#define RELEASE "development"
+#define VERSION "2.3.90"