summary refs log tree commit diff
path: root/sysdeps/ia64/fpu/s_atanl.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/ia64/fpu/s_atanl.S')
-rw-r--r-- sysdeps/ia64/fpu/s_atanl.S 2157
1 files changed, 1077 insertions, 1080 deletions
diff --git a/sysdeps/ia64/fpu/s_atanl.S b/sysdeps/ia64/fpu/s_atanl.S
index 28d44c1850..bfd9f458f4 100644
--- a/sysdeps/ia64/fpu/s_atanl.S
+++ b/sysdeps/ia64/fpu/s_atanl.S
@@ -1,10 +1,10 @@
.file "atanl.s"
-// Copyright (C) 2000, 2001, Intel Corporation
+
+// Copyright (c) 2000 - 2003, Intel Corporation
// All rights reserved.
-//
-// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
-// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
+//
+// Contributed 2000 by the Intel Numerics Group, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
@@ -20,7 +20,7 @@
// * The name of Intel Corporation may not be used to endorse or promote
// products derived from this software without specific prior written
// permission.
-//
+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -35,41 +35,52 @@
//
// Intel Corporation is the author of this code, and requests that all
// problem reports or change requests be submitted to it directly at
-// http://developer.intel.com/opensource.
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
//
//
-// *********************************************************************
+//*********************************************************************
//
// History
-// 2/02/00 (hand-optimized)
-// 4/04/00 Unwind support added
-// 8/15/00 Bundle added after call to __libm_error_support to properly
+// 02/02/00 (hand-optimized)
+// 04/04/00 Unwind support added
+// 08/15/00 Bundle added after call to __libm_error_support to properly
// set [the previously overwritten] GR_Parameter_RESULT.
+// 03/13/01 Fixed flags when denormal raised on intermediate result
+// 01/08/02 Improved speed.
+// 02/06/02 Corrected .section statement
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 02/10/03 Reordered header: .section, .global, .proc, .align;
+// used data8 for long double table values
//
-// *********************************************************************
+//*********************************************************************
//
// Function: atanl(x) = inverse tangent(x), for double extended x values
-// Function: atan2l(y,x) = atan(y/x), for double extended x values
+// Function: atan2l(y,x) = atan(y/x), for double extended y, x values
+//
+// API
+//
+// long double atanl (long double x)
+// long double atan2l (long double y, long double x)
//
-// *********************************************************************
+//*********************************************************************
//
// Resources Used:
//
// Floating-Point Registers: f8 (Input and Return Value)
-// f9-f15
-// f32-f79
+// f9 (Input for atan2l)
+// f10-f15, f32-f83
//
// General Purpose Registers:
-// r32-r48
-// r49,r50,r51,r52 (Arguments to error support for 0,0 case)
+// r32-r51
+// r49-r52 (Arguments to error support for 0,0 case)
//
// Predicate Registers: p6-p15
//
-// *********************************************************************
+//*********************************************************************
//
// IEEE Special Conditions:
//
-// Denormal fault raised on denormal inputs
+// Denormal fault raised on denormal inputs
// Underflow exceptions may occur
// Special error handling for the y=0 and x=0 case
// Inexact raised when appropriate by algorithm
@@ -92,7 +103,7 @@
// atan2l(+/-Inf, Inf) = +/-pi/4
// atan2l(+/-Inf, -Inf) = +/-3pi/4
//
-// *********************************************************************
+//*********************************************************************
//
// Mathematical Description
// ---------------------------
@@ -108,16 +119,16 @@
//
//
// (Arg_X, Arg_Y) x
-// \
-// \
-// \
-// \
+// \
+// \
+// \
+// \
// \ angle between is ATANL(Arg_Y,Arg_X)
-// \
+// \
// ------------------> X-axis
// Origin
@@ -232,14 +243,14 @@
// z_hi = 2^k * 1.b_1 b_2 b_3 b_4 1
//
// then
-// / \
+// / \
// | (V/U) - z_hi |
// arctan(V/U) = arctan(z_hi) + arctan| -------------- |
// | 1 + (V/U)*z_hi |
// \ /
//
-// / \
+// / \
// | V - z_hi*U |
// = arctan(z_hi) + arctan| -------------- |
@@ -295,7 +306,7 @@
// U := max( |Arg_X|, |Arg_Y| )
// V := min( |Arg_X|, |Arg_Y| )
//
-// execute: frcap E, pred, V, U
+// execute: frcpa E, pred, V, U
// If pred is 0, go to Step 5 for special cases handling.
//
// Step 2. Decide on branch.
@@ -399,7 +410,7 @@
//
// z := V * E ...z approximates V/U to roughly working precision
// zsq := z * z
-// z8 := zsq * zsq; z8 := z8 * z8
+// z4 := zsq * zsq; z8 := z4 * z4
//
// poly1 := P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
// poly2 := zsq*(P_1 + zsq*(P_2 + zsq*P_3))
@@ -438,12 +449,11 @@
//
// Step 5. Special Cases
//
-// If pred is 0 where pred is obtained in
-// frcap E, pred, V, U
+// These are detected early in the function by fclass instructions.
//
-// we are in one of those special cases of 0,+-inf or NaN
+// We are in one of those special cases when X or Y is 0,+-inf or NaN
//
-// If one of U and V is NaN, return U+V (which will generate
+// If one of X and Y is NaN, return X+Y (which will generate
// invalid in case one is a signaling NaN). Otherwise,
// return the Result as described in the table
//
@@ -469,8 +479,6 @@
//
//
-#include "libm_support.h"
-
ArgY_orig = f8
Result = f8
FR_RESULT = f8
@@ -504,6 +512,7 @@ Res_hi = f49
Res_lo = f50
Z = f52
zsq = f53
+z4 = f54
z8 = f54
poly1 = f55
poly2 = f56
@@ -521,8 +530,8 @@ P_5 = f67
P_6 = f68
P_7 = f69
P_8 = f70
-TWO_TO_NEG3 = f71
-U_hold = f72
+U_hold = f71
+TWO_TO_NEG3 = f72
C_hi_hold = f73
E_hold = f74
M = f75
@@ -530,6 +539,11 @@ ArgX_abs = f76
ArgY_abs = f77
Result_lo = f78
A_temp = f79
+FR_temp = f80
+Xsq = f81
+Ysq = f82
+tmp_small = f83
+
GR_SAVE_PFS = r33
GR_SAVE_B0 = r34
GR_SAVE_GP = r35
@@ -545,1415 +559,1399 @@ exp_ArgY = r44
exponent_Q = r45
significand_Q = r46
special = r47
-special1 = r48
+sp_exp_Q = r48
+sp_exp_4sig_Q = r49
+table_base = r50
+int_temp = r51
+
GR_Parameter_X = r49
GR_Parameter_Y = r50
GR_Parameter_RESULT = r51
GR_Parameter_TAG = r52
-int_temp = r52
-
-#ifdef _LIBC
-.rodata
-#else
-.data
-#endif
-.align 64
-
-Constants_atan:
-ASM_TYPE_DIRECTIVE(Constants_atan,@object)
-data4 0x54442D18, 0x3FF921FB, 0x248D3132, 0x3E000000
-// double pi/2, single lo_pi/2, two**(-3)
-data4 0xAAAAAAA3, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // P_1
-data4 0xCCCC54B2, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // P_2
-data4 0x47E4D0C2, 0x92492492, 0x0000BFFC, 0x00000000 // P_3
-data4 0x58870889, 0xE38E38E0, 0x00003FFB, 0x00000000 // P_4
-data4 0x290149F8, 0xBA2E895B, 0x0000BFFB, 0x00000000 // P_5
-data4 0x250F733D, 0x9D88E6D4, 0x00003FFB, 0x00000000 // P_6
-data4 0xFB8745A0, 0x884E51FF, 0x0000BFFB, 0x00000000 // P_7
-data4 0x394396BD, 0xE1C7412B, 0x00003FFA, 0x00000000 // P_8
-data4 0xAAAAA52F, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // Q_1
-data4 0xC75B60D3, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // Q_2
-data4 0x011F1940, 0x924923AD, 0x0000BFFC, 0x00000000 // Q_3
-data4 0x2A5F89BD, 0xE36F716D, 0x00003FFB, 0x00000000 // Q_4
+GR_temp = r52
+
+RODATA
+.align 16
+
+LOCAL_OBJECT_START(Constants_atan)
+// double pi/2
+data8 0x3FF921FB54442D18
+// single lo_pi/2, two**(-3)
+data4 0x248D3132, 0x3E000000
+data8 0xAAAAAAAAAAAAAAA3, 0xBFFD // P_1
+data8 0xCCCCCCCCCCCC54B2, 0x3FFC // P_2
+data8 0x9249249247E4D0C2, 0xBFFC // P_3
+data8 0xE38E38E058870889, 0x3FFB // P_4
+data8 0xBA2E895B290149F8, 0xBFFB // P_5
+data8 0x9D88E6D4250F733D, 0x3FFB // P_6
+data8 0x884E51FFFB8745A0, 0xBFFB // P_7
+data8 0xE1C7412B394396BD, 0x3FFA // P_8
+data8 0xAAAAAAAAAAAAA52F, 0xBFFD // Q_1
+data8 0xCCCCCCCCC75B60D3, 0x3FFC // Q_2
+data8 0x924923AD011F1940, 0xBFFC // Q_3
+data8 0xE36F716D2A5F89BD, 0x3FFB // Q_4
//
// Entries Tbl_hi (double precision)
// B = 1+Index/16+1/32 Index = 0
// Entries Tbl_lo (single precision)
// B = 1+Index/16+1/32 Index = 0
//
-data4 0xA935BD8E, 0x3FE9A000, 0x23ACA08F, 0x00000000
+data8 0x3FE9A000A935BD8E
+data4 0x23ACA08F, 0x00000000
//
// Entries Tbl_hi (double precision) Index = 0,1,...,15
// B = 2^(-1)*(1+Index/16+1/32)
// Entries Tbl_lo (single precision)
// Index = 0,1,...,15 B = 2^(-1)*(1+Index/16+1/32)
//
-data4 0x7F175A34, 0x3FDE77EB, 0x238729EE, 0x00000000
-data4 0x73C1A40B, 0x3FE0039C, 0x249334DB, 0x00000000
-data4 0x5B5B43DA, 0x3FE0C614, 0x22CBA7D1, 0x00000000
-data4 0x88BE7C13, 0x3FE1835A, 0x246310E7, 0x00000000
-data4 0xE2CC9E6A, 0x3FE23B71, 0x236210E5, 0x00000000
-data4 0x8406CBCA, 0x3FE2EE62, 0x2462EAF5, 0x00000000
-data4 0x1CD41719, 0x3FE39C39, 0x24B73EF3, 0x00000000
-data4 0x5B795B55, 0x3FE44506, 0x24C11260, 0x00000000
-data4 0x5BB6EC04, 0x3FE4E8DE, 0x242519EE, 0x00000000
-data4 0x1F732FBA, 0x3FE587D8, 0x24D4346C, 0x00000000
-data4 0x115D7B8D, 0x3FE6220D, 0x24ED487B, 0x00000000
-data4 0x920B3D98, 0x3FE6B798, 0x2495FF1E, 0x00000000
-data4 0x8FBA8E0F, 0x3FE74897, 0x223D9531, 0x00000000
-data4 0x289FA093, 0x3FE7D528, 0x242B0411, 0x00000000
-data4 0x576CC2C5, 0x3FE85D69, 0x2335B374, 0x00000000
-data4 0xA99CC05D, 0x3FE8E17A, 0x24C27CFB, 0x00000000
+data8 0x3FDE77EB7F175A34
+data4 0x238729EE, 0x00000000
+data8 0x3FE0039C73C1A40B
+data4 0x249334DB, 0x00000000
+data8 0x3FE0C6145B5B43DA
+data4 0x22CBA7D1, 0x00000000
+data8 0x3FE1835A88BE7C13
+data4 0x246310E7, 0x00000000
+data8 0x3FE23B71E2CC9E6A
+data4 0x236210E5, 0x00000000
+data8 0x3FE2EE628406CBCA
+data4 0x2462EAF5, 0x00000000
+data8 0x3FE39C391CD41719
+data4 0x24B73EF3, 0x00000000
+data8 0x3FE445065B795B55
+data4 0x24C11260, 0x00000000
+data8 0x3FE4E8DE5BB6EC04
+data4 0x242519EE, 0x00000000
+data8 0x3FE587D81F732FBA
+data4 0x24D4346C, 0x00000000
+data8 0x3FE6220D115D7B8D
+data4 0x24ED487B, 0x00000000
+data8 0x3FE6B798920B3D98
+data4 0x2495FF1E, 0x00000000
+data8 0x3FE748978FBA8E0F
+data4 0x223D9531, 0x00000000
+data8 0x3FE7D528289FA093
+data4 0x242B0411, 0x00000000
+data8 0x3FE85D69576CC2C5
+data4 0x2335B374, 0x00000000
+data8 0x3FE8E17AA99CC05D
+data4 0x24C27CFB, 0x00000000
//
// Entries Tbl_hi (double precision) Index = 0,1,...,15
// B = 2^(-2)*(1+Index/16+1/32)
// Entries Tbl_lo (single precision)
// Index = 0,1,...,15 B = 2^(-2)*(1+Index/16+1/32)
//
-data4 0x510665B5, 0x3FD025FA, 0x24263482, 0x00000000
-data4 0x362431C9, 0x3FD1151A, 0x242C8DC9, 0x00000000
-data4 0x67E47C95, 0x3FD20255, 0x245CF9BA, 0x00000000
-data4 0x7A823CFE, 0x3FD2ED98, 0x235C892C, 0x00000000
-data4 0x29271134, 0x3FD3D6D1, 0x2389BE52, 0x00000000
-data4 0x586890E6, 0x3FD4BDEE, 0x24436471, 0x00000000
-data4 0x175E0F4E, 0x3FD5A2E0, 0x2389DBD4, 0x00000000
-data4 0x9F5FA6FD, 0x3FD68597, 0x2476D43F, 0x00000000
-data4 0x52817501, 0x3FD76607, 0x24711774, 0x00000000
-data4 0xB8DF95D7, 0x3FD84422, 0x23EBB501, 0x00000000
-data4 0x7CD0C662, 0x3FD91FDE, 0x23883A0C, 0x00000000
-data4 0x66168001, 0x3FD9F930, 0x240DF63F, 0x00000000
-data4 0x5422058B, 0x3FDAD00F, 0x23FE261A, 0x00000000
-data4 0x378624A5, 0x3FDBA473, 0x23A8CD0E, 0x00000000
-data4 0x0AAD71F8, 0x3FDC7655, 0x2422D1D0, 0x00000000
-data4 0xC9EC862B, 0x3FDD45AE, 0x2344A109, 0x00000000
+data8 0x3FD025FA510665B5
+data4 0x24263482, 0x00000000
+data8 0x3FD1151A362431C9
+data4 0x242C8DC9, 0x00000000
+data8 0x3FD2025567E47C95
+data4 0x245CF9BA, 0x00000000
+data8 0x3FD2ED987A823CFE
+data4 0x235C892C, 0x00000000
+data8 0x3FD3D6D129271134
+data4 0x2389BE52, 0x00000000
+data8 0x3FD4BDEE586890E6
+data4 0x24436471, 0x00000000
+data8 0x3FD5A2E0175E0F4E
+data4 0x2389DBD4, 0x00000000
+data8 0x3FD685979F5FA6FD
+data4 0x2476D43F, 0x00000000
+data8 0x3FD7660752817501
+data4 0x24711774, 0x00000000
+data8 0x3FD84422B8DF95D7
+data4 0x23EBB501, 0x00000000
+data8 0x3FD91FDE7CD0C662
+data4 0x23883A0C, 0x00000000
+data8 0x3FD9F93066168001
+data4 0x240DF63F, 0x00000000
+data8 0x3FDAD00F5422058B
+data4 0x23FE261A, 0x00000000
+data8 0x3FDBA473378624A5
+data4 0x23A8CD0E, 0x00000000
+data8 0x3FDC76550AAD71F8
+data4 0x2422D1D0, 0x00000000
+data8 0x3FDD45AEC9EC862B
+data4 0x2344A109, 0x00000000
//
// Entries Tbl_hi (double precision) Index = 0,1,...,15
// B = 2^(-3)*(1+Index/16+1/32)
// Entries Tbl_lo (single precision)
// Index = 0,1,...,15 B = 2^(-3)*(1+Index/16+1/32)
//
-data4 0x84212B3D, 0x3FC068D5, 0x239874B6, 0x00000000
-data4 0x41060850, 0x3FC16465, 0x2335E774, 0x00000000
-data4 0x171A535C, 0x3FC25F6E, 0x233E36BE, 0x00000000
-data4 0xEDEB99A3, 0x3FC359E8, 0x239680A3, 0x00000000
-data4 0xC6092A9E, 0x3FC453CE, 0x230FB29E, 0x00000000
-data4 0xBA11570A, 0x3FC54D18, 0x230C1418, 0x00000000
-data4 0xFFB3AA73, 0x3FC645BF, 0x23F0564A, 0x00000000
-data4 0xE8A7D201, 0x3FC73DBD, 0x23D4A5E1, 0x00000000
-data4 0xE398EBC7, 0x3FC8350B, 0x23D4ADDA, 0x00000000
-data4 0x7D050271, 0x3FC92BA3, 0x23BCB085, 0x00000000
-data4 0x601081A5, 0x3FCA217E, 0x23BC841D, 0x00000000
-data4 0x574D780B, 0x3FCB1696, 0x23CF4A8E, 0x00000000
-data4 0x4D768466, 0x3FCC0AE5, 0x23BECC90, 0x00000000
-data4 0x4E1D5395, 0x3FCCFE65, 0x2323DCD2, 0x00000000
-data4 0x864C9D9D, 0x3FCDF110, 0x23F53F3A, 0x00000000
-data4 0x451D980C, 0x3FCEE2E1, 0x23CCB11F, 0x00000000
-
-data4 0x54442D18, 0x400921FB, 0x33145C07, 0x3CA1A626 // PI two doubles
-data4 0x54442D18, 0x3FF921FB, 0x33145C07, 0x3C91A626 // PI_by_2 two dbles
-data4 0x54442D18, 0x3FE921FB, 0x33145C07, 0x3C81A626 // PI_by_4 two dbles
-data4 0x7F3321D2, 0x4002D97C, 0x4C9E8A0A, 0x3C9A7939 // 3PI_by_4 two dbles
-ASM_SIZE_DIRECTIVE(Constants_atan)
-
-
-.text
-.proc atanl#
-.global atanl#
-.align 64
-
-atanl:
-{ .mfb
- nop.m 999
-(p0) mov ArgX_orig = f1
-(p0) br.cond.sptk atan2l ;;
-}
-.endp atanl
-ASM_SIZE_DIRECTIVE(atanl)
-
-.text
-.proc atan2l#
-.global atan2l#
-#ifdef _LIBC
-.proc __atan2l#
-.global __atan2l#
-.proc __ieee754_atan2l#
-.global __ieee754_atan2l#
-#endif
-.align 64
-
-
-atan2l:
-#ifdef _LIBC
-__atan2l:
-__ieee754_atan2l:
-#endif
-{ .mfi
-alloc r32 = ar.pfs, 0, 17 , 4, 0
-(p0) mov ArgY = ArgY_orig
-}
-{ .mfi
- nop.m 999
-(p0) mov ArgX = ArgX_orig
- nop.i 999
-};;
+data8 0x3FC068D584212B3D
+data4 0x239874B6, 0x00000000
+data8 0x3FC1646541060850
+data4 0x2335E774, 0x00000000
+data8 0x3FC25F6E171A535C
+data4 0x233E36BE, 0x00000000
+data8 0x3FC359E8EDEB99A3
+data4 0x239680A3, 0x00000000
+data8 0x3FC453CEC6092A9E
+data4 0x230FB29E, 0x00000000
+data8 0x3FC54D18BA11570A
+data4 0x230C1418, 0x00000000
+data8 0x3FC645BFFFB3AA73
+data4 0x23F0564A, 0x00000000
+data8 0x3FC73DBDE8A7D201
+data4 0x23D4A5E1, 0x00000000
+data8 0x3FC8350BE398EBC7
+data4 0x23D4ADDA, 0x00000000
+data8 0x3FC92BA37D050271
+data4 0x23BCB085, 0x00000000
+data8 0x3FCA217E601081A5
+data4 0x23BC841D, 0x00000000
+data8 0x3FCB1696574D780B
+data4 0x23CF4A8E, 0x00000000
+data8 0x3FCC0AE54D768466
+data4 0x23BECC90, 0x00000000
+data8 0x3FCCFE654E1D5395
+data4 0x2323DCD2, 0x00000000
+data8 0x3FCDF110864C9D9D
+data4 0x23F53F3A, 0x00000000
+data8 0x3FCEE2E1451D980C
+data4 0x23CCB11F, 0x00000000
+//
+data8 0x400921FB54442D18, 0x3CA1A62633145C07 // PI two doubles
+data8 0x3FF921FB54442D18, 0x3C91A62633145C07 // PI_by_2 two dbles
+data8 0x3FE921FB54442D18, 0x3C81A62633145C07 // PI_by_4 two dbles
+data8 0x4002D97C7F3321D2, 0x3C9A79394C9E8A0A // 3PI_by_4 two dbles
+LOCAL_OBJECT_END(Constants_atan)
+
+
+.section .text
+GLOBAL_IEEE754_ENTRY(atanl)
+
+// Use common code with atan2l after setting x=1.0
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p7,p0 = ArgY_orig, 0x103
- nop.i 999
+ alloc r32 = ar.pfs, 0, 17, 4, 0
+ fma.s1 Ysq = ArgY_orig, ArgY_orig, f0 // Form y*y
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-//
-// Save original input args and load table ptr.
-//
-(p0) fclass.m.unc p6,p0 = ArgX_orig, 0x103
- nop.i 999
-};;
+ addl table_ptr1 = @ltoff(Constants_atan#), gp // Address of table pointer
+ fma.s1 Xsq = f1, f1, f0 // Form x*x
+ nop.i 999
+}
+;;
+
{ .mfi
-(p0) addl table_ptr1 = @ltoff(Constants_atan#), gp
-(p0) fclass.m.unc p0,p9 = ArgY_orig, 0x1FF
- nop.i 999 ;;
+ ld8 table_ptr1 = [table_ptr1] // Get table pointer
+ fnorm.s1 ArgY = ArgY_orig
+ nop.i 999
}
{ .mfi
- ld8 table_ptr1 = [table_ptr1]
-(p0) fclass.m.unc p0,p8 = ArgX_orig, 0x1FF
- nop.i 999
+ nop.m 999
+ fnorm.s1 ArgX = f1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p13,p0 = ArgY_orig, 0x0C3
- nop.i 999 ;;
+ getf.exp sign_X = f1 // Get signexp of x
+ fmerge.s ArgX_abs = f0, f1 // Form |x|
+ nop.i 999
}
{ .mfi
-(p0) fclass.m.unc p12,p0 = ArgX_orig, 0x0C3
- nop.i 999
+ nop.m 999
+ fnorm.s1 ArgX_orig = f1
+ nop.i 999
}
+;;
+{ .mfi
+ getf.exp sign_Y = ArgY_orig // Get signexp of y
+ fmerge.s ArgY_abs = f0, ArgY_orig // Form |y|
+ mov table_base = table_ptr1 // Save base pointer to tables
+}
+;;
-//
-// Check for NatVals.
-// Check for everything - if false, then must be pseudo-zero
-// or pseudo-nan (IA unsupporteds).
-//
-{ .mib
- nop.m 999
- nop.i 999
-(p6) br.cond.spnt L(ATANL_NATVAL) ;;
+{ .mfi
+ ldfd P_hi = [table_ptr1],8 // Load double precision hi part of pi
+ fclass.m p8,p0 = ArgY_orig, 0x1e7 // Test y natval, nan, inf, zero
+ nop.i 999
}
+;;
-{ .mib
- nop.m 999
- nop.i 999
-(p7) br.cond.spnt L(ATANL_NATVAL) ;;
+{ .mfi
+ ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3
+ nop.f 999
+ nop.i 999
}
-{ .mib
-(p0) ldfd P_hi = [table_ptr1],8
- nop.i 999
-(p8) br.cond.spnt L(ATANL_UNSUPPORTED) ;;
+{ .mfi
+ nop.m 999
+ fma.s1 M = f1, f1, f0 // Set M = 1.0
+ nop.i 999
}
-{ .mbb
-(p0) add table_ptr2 = 96, table_ptr1
-(p9) br.cond.spnt L(ATANL_UNSUPPORTED)
+;;
+
//
-// Load double precision high-order part of pi
+// Check for everything - if false, then must be pseudo-zero
+// or pseudo-nan (IA unsupporteds).
//
-(p12) br.cond.spnt L(ATANL_NAN) ;;
-}
{ .mfb
- nop.m 999
-(p0) fnorm.s1 ArgX = ArgX
-(p13) br.cond.spnt L(ATANL_NAN) ;;
-}
-//
-// Normalize the input argument.
-// Branch out if NaN inputs
-//
-{ .mmf
-(p0) ldfs P_lo = [table_ptr1], 4
- nop.m 999
-(p0) fnorm.s1 ArgY = ArgY ;;
+ nop.m 999
+ fclass.m p0,p12 = f1, 0x1FF // Test x unsupported
+(p8) br.cond.spnt ATANL_Y_SPECIAL // Branch if y natval, nan, inf, zero
}
-{ .mmf
- nop.m 999
-(p0) ldfs TWO_TO_NEG3 = [table_ptr1], 180
-//
+;;
+
// U = max(ArgX_abs,ArgY_abs)
// V = min(ArgX_abs,ArgY_abs)
-// if PR1, swap = 0
-// if PR2, swap = 1
-//
-(p0) mov M = f1 ;;
-}
{ .mfi
- nop.m 999
-//
-// Get exp and sign of ArgX
-// Get exp and sign of ArgY
-// Load 2**(-3) and increment ptr to Q_4.
-//
-(p0) fmerge.s ArgX_abs = f1, ArgX
- nop.i 999 ;;
+ nop.m 999
+ fcmp.ge.s1 p6,p7 = Xsq, Ysq // Test for |x| >= |y| using squares
+ nop.i 999
}
-//
-// load single precision low-order part of pi = P_lo
-//
+{ .mfb
+ nop.m 999
+ fma.s1 V = ArgX_abs, f1, f0 // Set V assuming |x| < |y|
+ br.cond.sptk ATANL_COMMON // Branch to common code
+}
+;;
+
+GLOBAL_IEEE754_END(atanl)
+GLOBAL_IEEE754_ENTRY(atan2l)
+
{ .mfi
-(p0) getf.exp sign_X = ArgX
-(p0) fmerge.s ArgY_abs = f1, ArgY
- nop.i 999 ;;
+ alloc r32 = ar.pfs, 0, 17, 4, 0
+ fma.s1 Ysq = ArgY_orig, ArgY_orig, f0 // Form y*y
+ nop.i 999
}
-{ .mii
-(p0) getf.exp sign_Y = ArgY
- nop.i 999 ;;
-(p0) shr sign_X = sign_X, 17 ;;
+{ .mfi
+ addl table_ptr1 = @ltoff(Constants_atan#), gp // Address of table pointer
+ fma.s1 Xsq = ArgX_orig, ArgX_orig, f0 // Form x*x
+ nop.i 999
}
-{ .mii
- nop.m 999
-(p0) shr sign_Y = sign_Y, 17 ;;
-(p0) cmp.eq.unc p8, p9 = 0x00000, sign_Y ;;
+;;
+
+{ .mfi
+ ld8 table_ptr1 = [table_ptr1] // Get table pointer
+ fnorm.s1 ArgY = ArgY_orig
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Is ArgX_abs >= ArgY_abs
-// Is sign_Y == 0?
-//
-(p0) fmax.s1 U = ArgX_abs, ArgY_abs
- nop.i 999
+ nop.m 999
+ fnorm.s1 ArgX = ArgX_orig
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// ArgX_abs = |ArgX|
-// ArgY_abs = |ArgY|
-// sign_X is sign bit of ArgX
-// sign_Y is sign bit of ArgY
-//
-(p0) fcmp.ge.s1 p6, p7 = ArgX_abs, ArgY_abs
- nop.i 999 ;;
+ getf.exp sign_X = ArgX_orig // Get signexp of x
+ fmerge.s ArgX_abs = f0, ArgX_orig // Form |x|
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fmin.s1 V = ArgX_abs, ArgY_abs
- nop.i 999 ;;
+ getf.exp sign_Y = ArgY_orig // Get signexp of y
+ fmerge.s ArgY_abs = f0, ArgY_orig // Form |y|
+ mov table_base = table_ptr1 // Save base pointer to tables
}
+;;
+
{ .mfi
- nop.m 999
-(p8) fadd.s1 s_Y = f0, f1
-(p6) cmp.eq.unc p10, p11 = 0x00000, sign_X
+ ldfd P_hi = [table_ptr1],8 // Load double precision hi part of pi
+ fclass.m p8,p0 = ArgY_orig, 0x1e7 // Test y natval, nan, inf, zero
+ nop.i 999
}
-{ .mii
-(p6) add swap = r0, r0
- nop.i 999 ;;
-(p7) add swap = 1, r0
+;;
+
+{ .mfi
+ ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3
+ fclass.m p9,p0 = ArgX_orig, 0x1e7 // Test x natval, nan, inf, zero
+ nop.i 999
}
{ .mfi
- nop.m 999
+ nop.m 999
+ fma.s1 M = f1, f1, f0 // Set M = 1.0
+ nop.i 999
+}
+;;
+
//
-// Let M = 1.0
-// if p8, s_Y = 1.0
-// if p9, s_Y = -1.0
+// Check for everything - if false, then must be pseudo-zero
+// or pseudo-nan (IA unsupporteds).
//
-(p10) fsub.s1 M = M, f1
- nop.i 999 ;;
+{ .mfb
+ nop.m 999
+ fclass.m p0,p12 = ArgX_orig, 0x1FF // Test x unsupported
+(p8) br.cond.spnt ATANL_Y_SPECIAL // Branch if y natval, nan, inf, zero
}
+;;
+
+// U = max(ArgX_abs,ArgY_abs)
+// V = min(ArgX_abs,ArgY_abs)
{ .mfi
- nop.m 999
-(p9) fsub.s1 s_Y = f0, f1
- nop.i 999 ;;
+ nop.m 999
+ fcmp.ge.s1 p6,p7 = Xsq, Ysq // Test for |x| >= |y| using squares
+ nop.i 999
}
+{ .mfb
+ nop.m 999
+ fma.s1 V = ArgX_abs, f1, f0 // Set V assuming |x| < |y|
+(p9) br.cond.spnt ATANL_X_SPECIAL // Branch if x natval, nan, inf, zero
+}
+;;
+
+// Now common code for atanl and atan2l
+ATANL_COMMON:
{ .mfi
- nop.m 999
-(p0) frcpa.s1 E, p6 = V, U
- nop.i 999 ;;
+ nop.m 999
+ fclass.m p0,p13 = ArgY_orig, 0x1FF // Test y unsupported
+ shr sign_X = sign_X, 17 // Get sign bit of x
+}
+{ .mfi
+ nop.m 999
+ fma.s1 U = ArgY_abs, f1, f0 // Set U assuming |x| < |y|
+ adds table_ptr1 = 176, table_ptr1 // Point to Q4
}
-{ .mbb
- nop.m 999
+;;
+
+{ .mfi
+(p6) add swap = r0, r0 // Set swap=0 if |x| >= |y|
+(p6) frcpa.s1 E, p0 = ArgY_abs, ArgX_abs // Compute E if |x| >= |y|
+ shr sign_Y = sign_Y, 17 // Get sign bit of y
+}
+{ .mfb
+ nop.m 999
+(p6) fma.s1 V = ArgY_abs, f1, f0 // Set V if |x| >= |y|
+(p12) br.cond.spnt ATANL_UNSUPPORTED // Branch if x unsupported
+}
+;;
+
+// Set p8 if y >=0
+// Set p9 if y < 0
+// Set p10 if |x| >= |y| and x >=0
+// Set p11 if |x| >= |y| and x < 0
+{ .mfi
+ cmp.eq p8, p9 = 0, sign_Y // Test for y >= 0
+(p7) frcpa.s1 E, p0 = ArgX_abs, ArgY_abs // Compute E if |x| < |y|
+(p7) add swap = 1, r0 // Set swap=1 if |x| < |y|
+}
+{ .mfb
+(p6) cmp.eq.unc p10, p11 = 0, sign_X // If |x| >= |y|, test for x >= 0
+(p6) fma.s1 U = ArgX_abs, f1, f0 // Set U if |x| >= |y|
+(p13) br.cond.spnt ATANL_UNSUPPORTED // Branch if y unsupported
+}
+;;
+
//
-// E = frcpa(V,U)
+// if p8, s_Y = 1.0
+// if p9, s_Y = -1.0
//
-(p6) br.cond.sptk L(ATANL_STEP2)
-(p0) br.cond.spnt L(ATANL_SPECIAL_HANDLING) ;;
+.pred.rel "mutex",p8,p9
+{ .mfi
+ nop.m 999
+(p8) fadd.s1 s_Y = f0, f1 // If y >= 0 set s_Y = 1.0
+ nop.i 999
}
-L(ATANL_STEP2):
{ .mfi
- nop.m 999
-(p0) fmpy.s1 Q = E, V
- nop.i 999
+ nop.m 999
+(p9) fsub.s1 s_Y = f0, f1 // If y < 0 set s_Y = -1.0
+ nop.i 999
}
+;;
+
+.pred.rel "mutex",p10,p11
{ .mfi
- nop.m 999
-(p0) fcmp.eq.s0 p0, p9 = f1, ArgY_orig
- nop.i 999 ;;
+ nop.m 999
+(p10) fsub.s1 M = M, f1 // If |x| >= |y| and x >=0, set M=0
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Is Q < 2**(-3)?
-//
-(p0) fcmp.eq.s0 p0, p8 = f1, ArgX_orig
- nop.i 999
+ nop.m 999
+(p11) fadd.s1 M = M, f1 // If |x| >= |y| and x < 0, set M=2.0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p11) fadd.s1 M = M, f1
- nop.i 999 ;;
+ nop.m 999
+ fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig // Dummy to set denormal flag
+ nop.i 999
}
-{ .mlx
- nop.m 999
// *************************************************
// ********************* STEP2 *********************
// *************************************************
-(p0) movl special = 0x8400000000000000
-}
-{ .mlx
- nop.m 999
//
-// lookup = b_1 b_2 b_3 B_4
+// Q = E * V
//
-(p0) movl special1 = 0x0000000000000100 ;;
+{ .mfi
+ nop.m 999
+ fmpy.s1 Q = E, V
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Do fnorms to raise any denormal operand
-// exceptions.
-//
-(p0) fmpy.s1 P_hi = M, P_hi
- nop.i 999
+ nop.m 999
+ fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (1) if POLY path
+ nop.i 999
}
+;;
+
+// Create a single precision representation of the signexp of Q with the
+// 4 most significant bits of the significand followed by a 1 and then 18 0's
{ .mfi
- nop.m 999
-(p0) fmpy.s1 P_lo = M, P_lo
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 P_hi = M, P_hi
+ dep.z special = 0x1, 18, 1 // Form 0x0000000000040000
}
{ .mfi
- nop.m 999
-//
-// Q = E * V
-//
-(p0) fcmp.lt.unc.s1 p6, p7 = Q, TWO_TO_NEG3
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 P_lo = M, P_lo
+ add table_ptr2 = 32, table_ptr1
}
-{ .mmb
-(p0) getf.sig significand_Q = Q
-(p0) getf.exp exponent_Q = Q
- nop.b 999 ;;
+;;
+
+{ .mfi
+ nop.m 999
+ fma.s1 A_temp = Q, f1, f0 // Set A_temp if POLY path
+ nop.i 999
}
-{ .mmi
- nop.m 999 ;;
-(p0) andcm k = 0x0003, exponent_Q
-(p0) extr.u lookup = significand_Q, 59, 4 ;;
+{ .mfi
+ nop.m 999
+ fma.s1 E = E, E_hold, E // E = E + E*E_hold (1) if POLY path
+ nop.i 999
}
-{ .mib
- nop.m 999
-(p0) dep special = lookup, special, 59, 4
+;;
+
//
-// Generate 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
+// Is Q < 2**(-3)?
+// swap = xor(swap,sign_X)
//
-(p6) br.cond.spnt L(ATANL_POLY) ;;
-}
{ .mfi
-(p0) cmp.eq.unc p8, p9 = 0x0000, k
-(p0) fmpy.s1 P_hi = s_Y, P_hi
+ nop.m 999
+ fcmp.lt.s1 p9, p0 = Q, TWO_TO_NEG3 // Test Q < 2^-3
+ xor swap = sign_X, swap
+}
+;;
+
+// P_hi = s_Y * P_hi
+{ .mmf
+ getf.exp exponent_Q = Q // Get signexp of Q
+ cmp.eq.unc p7, p6 = 0x00000, swap
+ fmpy.s1 P_hi = s_Y, P_hi
+}
+;;
+
//
-// We waited a few extra cycles so P_lo and P_hi could be calculated.
-// Load the constant 256 for loading up table entries.
+// if (PR_1) sigma = -1.0
+// if (PR_2) sigma = 1.0
+//
+{ .mfi
+ getf.sig significand_Q = Q // Get significand of Q
+(p6) fsub.s1 sigma = f0, f1
+ nop.i 999
+}
+{ .mfb
+(p9) add table_ptr1 = 128, table_base // Point to P8 if POLY path
+(p7) fadd.s1 sigma = f0, f1
+(p9) br.cond.spnt ATANL_POLY // Branch to POLY if 0 < Q < 2^-3
+}
+;;
+
//
// *************************************************
// ******************** STEP3 **********************
// *************************************************
-(p0) add table_ptr2 = 16, table_ptr1
-}
//
-// Let z_hi have exponent and sign of original Q
-// Load the Tbl_hi(0) else, increment pointer.
+// lookup = b_1 b_2 b_3 b_4
//
-{ .mii
-(p0) ldfe Q_4 = [table_ptr1], -16
-(p0) xor swap = sign_X, swap ;;
-(p9) sub k = k, r0, 1
-}
{ .mmi
-(p0) setf.sig z_hi = special
-(p0) ldfe Q_3 = [table_ptr1], -16
-(p9) add table_ptr2 = 16, table_ptr2 ;;
+ nop.m 999
+ nop.m 999
+ andcm k = 0x0003, exponent_Q // k=0,1,2,3 for exp_Q=0,-1,-2,-3
}
+;;
+
//
-// U_hold = U - U_prime_hi
-// k = k * 256 - Result can be 0, 256, or 512.
+// Generate sign_exp_Q b_1 b_2 b_3 b_4 1 0 0 0 ... 0 in single precision
+// representation. Note sign of Q is always 0.
//
-{ .mmb
-(p0) ldfe Q_2 = [table_ptr1], -16
-(p8) ldfd Tbl_hi = [table_ptr2], 8
- nop.b 999 ;;
+{ .mfi
+ cmp.eq p8, p9 = 0x0000, k // Test k=0
+ nop.f 999
+ extr.u lookup = significand_Q, 59, 4 // Extract b_1 b_2 b_3 b_4 for index
}
-//
-// U_prime_lo = U_hold + V * z_hi
-// lookup -> lookup * 16 + k
-//
-{ .mmi
-(p0) ldfe Q_1 = [table_ptr1], -16 ;;
-(p8) ldfs Tbl_lo = [table_ptr2], 8
-//
-// U_prime_hi = U + V * z_hi
-// Load the Tbl_lo(0)
-//
-(p9) pmpy2.r k = k, special1 ;;
+{ .mfi
+ sub sp_exp_Q = 0x7f, k // Form single prec biased exp of Q
+ nop.f 999
+ sub k = k, r0, 1 // Decrement k
}
-{ .mii
- nop.m 999
- nop.i 999
- nop.i 999 ;;
+;;
+
+// Form pointer to B index table
+{ .mfi
+ ldfe Q_4 = [table_ptr1], -16 // Load Q_4
+ nop.f 999
+(p9) shl k = k, 8 // k = 0, 256, or 512
}
-{ .mii
- nop.m 999
- nop.i 999
- nop.i 999 ;;
+{ .mfi
+(p9) shladd table_ptr2 = lookup, 4, table_ptr2
+ nop.f 999
+ shladd sp_exp_4sig_Q = sp_exp_Q, 4, lookup // Shift and add in 4 high bits
}
-{ .mii
- nop.m 999
- nop.i 999
- nop.i 999 ;;
+;;
+
+{ .mmi
+(p8) add table_ptr2 = -16, table_ptr2 // Pointer if original k was 0
+(p9) add table_ptr2 = k, table_ptr2 // Pointer if k was 1, 2, 3
+ dep special = sp_exp_4sig_Q, special, 19, 13 // Form z_hi as single prec
}
-{ .mii
- nop.m 999
- nop.i 999 ;;
-(p9) shladd lookup = lookup, 0x0004, k ;;
+;;
+
+// z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
+{ .mmi
+ ldfd Tbl_hi = [table_ptr2], 8 // Load Tbl_hi from index table
+;;
+ setf.s z_hi = special // Form z_hi
+ nop.i 999
}
{ .mmi
-(p9) add table_ptr2 = table_ptr2, lookup ;;
-//
-// V_prime = V - U * z_hi
-//
-(p9) ldfd Tbl_hi = [table_ptr2], 8
- nop.i 999 ;;
+ ldfs Tbl_lo = [table_ptr2], 8 // Load Tbl_lo from index table
+;;
+ ldfe Q_3 = [table_ptr1], -16 // Load Q_3
+ nop.i 999
}
+;;
+
+{ .mmi
+ ldfe Q_2 = [table_ptr1], -16 // Load Q_2
+ nop.m 999
+ nop.i 999
+}
+;;
+
{ .mmf
- nop.m 999
-//
-// C_hi = frcpa(1,U_prime_hi)
-//
-(p9) ldfs Tbl_lo = [table_ptr2], 8
-//
-// z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0
-// Point to beginning of Tbl_hi entries - k = 0.
-//
-(p0) fmerge.se z_hi = Q, z_hi ;;
+ ldfe Q_1 = [table_ptr1], -16 // Load Q_1
+ nop.m 999
+ nop.f 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fma.s1 U_prime_hi = V, z_hi, U
- nop.i 999
+ nop.m 999
+ fma.s1 U_prime_hi = V, z_hi, U // U_prime_hi = U + V * z_hi
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fnma.s1 V_prime = U, z_hi, V
- nop.i 999 ;;
+ nop.m 999
+ fnma.s1 V_prime = U, z_hi, V // V_prime = V - U * z_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) mov A_hi = Tbl_hi
- nop.i 999 ;;
+ nop.m 999
+ mov A_hi = Tbl_hi // Start with A_hi = Tbl_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fsub.s1 U_hold = U, U_prime_hi
- nop.i 999 ;;
+ nop.m 999
+ fsub.s1 U_hold = U, U_prime_hi // U_hold = U - U_prime_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) frcpa.s1 C_hi, p6 = f1, U_prime_hi
- nop.i 999 ;;
+ nop.m 999
+ frcpa.s1 C_hi, p0 = f1, U_prime_hi // C_hi = frcpa(1,U_prime_hi)
+ nop.i 999
}
+;;
+
{ .mfi
-(p0) cmp.eq.unc p7, p6 = 0x00000, swap
-(p0) fmpy.s1 A_hi = s_Y, A_hi
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 A_hi = s_Y, A_hi // A_hi = s_Y * A_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// poly = wsq * poly
-//
-(p7) fadd.s1 sigma = f0, f1
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 U_prime_lo = z_hi, V, U_hold // U_prime_lo = U_hold + V * z_hi
+ nop.i 999
}
+;;
+
+// C_hi_hold = 1 - C_hi * U_prime_hi (1)
{ .mfi
- nop.m 999
-(p0) fma.s1 U_prime_lo = z_hi, V, U_hold
- nop.i 999
+ nop.m 999
+ fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p6) fsub.s1 sigma = f0, f1
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 Res_hi = sigma, A_hi, P_hi // Res_hi = P_hi + sigma * A_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (1)
+ nop.i 999
}
+;;
+
+// C_hi_hold = 1 - C_hi * U_prime_hi (2)
{ .mfi
- nop.m 999
-//
-// A_lo = A_lo + w_hi
-// A_hi = s_Y * A_hi
-//
-(p0) fma.s1 Res_hi = sigma, A_hi, P_hi
- nop.i 999 ;;
+ nop.m 999
+ fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// C_hi_hold = 1 - C_hi * U_prime_hi (1)
-//
-(p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (2)
+ nop.i 999
}
+;;
+
+// C_hi_hold = 1 - C_hi * U_prime_hi (3)
{ .mfi
- nop.m 999
-//
-// C_hi = C_hi + C_hi * C_hi_hold (1)
-//
-(p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
- nop.i 999 ;;
+ nop.m 999
+ fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// C_hi_hold = 1 - C_hi * U_prime_hi (2)
-//
-(p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (3)
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// C_hi = C_hi + C_hi * C_hi_hold (2)
-//
-(p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 w_hi = V_prime, C_hi // w_hi = V_prime * C_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// C_hi_hold = 1 - C_hi * U_prime_hi (3)
-//
-(p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 wsq = w_hi, w_hi // wsq = w_hi * w_hi
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// C_hi = C_hi + C_hi * C_hi_hold (3)
-//
-(p0) fmpy.s1 w_hi = V_prime, C_hi
- nop.i 999 ;;
+ nop.m 999
+ fnma.s1 w_lo = w_hi, U_prime_hi, V_prime // w_lo = V_prime-w_hi*U_prime_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// w_hi = V_prime * C_hi
-//
-(p0) fmpy.s1 wsq = w_hi, w_hi
- nop.i 999
+ nop.m 999
+ fma.s1 poly = wsq, Q_4, Q_3 // poly = Q_3 + wsq * Q_4
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fnma.s1 w_lo = w_hi, U_prime_hi, V_prime
- nop.i 999 ;;
+ nop.m 999
+ fnma.s1 w_lo = w_hi, U_prime_lo, w_lo // w_lo = w_lo - w_hi * U_prime_lo
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// wsq = w_hi * w_hi
-// w_lo = = V_prime - w_hi * U_prime_hi
-//
-(p0) fma.s1 poly = wsq, Q_4, Q_3
- nop.i 999
+ nop.m 999
+ fma.s1 poly = wsq, poly, Q_2 // poly = Q_2 + wsq * poly
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fnma.s1 w_lo = w_hi, U_prime_lo, w_lo
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 w_lo = C_hi, w_lo // w_lo = w_lo * C_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// poly = Q_3 + wsq * Q_4
-// w_lo = = w_lo - w_hi * U_prime_lo
-//
-(p0) fma.s1 poly = wsq, poly, Q_2
- nop.i 999
+ nop.m 999
+ fma.s1 poly = wsq, poly, Q_1 // poly = Q_1 + wsq * poly
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fmpy.s1 w_lo = C_hi, w_lo
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 A_lo = Tbl_lo, w_lo // A_lo = Tbl_lo + w_lo
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// poly = Q_2 + wsq * poly
-// w_lo = = w_lo * C_hi
-//
-(p0) fma.s1 poly = wsq, poly, Q_1
- nop.i 999
+ nop.m 999
+ fmpy.s0 Q_1 = Q_1, Q_1 // Dummy operation to raise inexact
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fadd.s1 A_lo = Tbl_lo, w_lo
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 poly = wsq, poly // poly = wsq * poly
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Result = Res_hi + Res_lo * s_Y (User Supplied Rounding Mode)
-//
-(p0) fmpy.s0 Q_1 = Q_1, Q_1
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 poly = w_hi, poly // poly = w_hi * poly
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// poly = Q_1 + wsq * poly
-// A_lo = Tbl_lo + w_lo
-// swap = xor(swap,sign_X)
-//
-(p0) fmpy.s1 poly = wsq, poly
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 A_lo = A_lo, poly // A_lo = A_lo + poly
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// Is (swap) != 0 ?
-// poly = wsq * poly
-// A_hi = Tbl_hi
-//
-(p0) fmpy.s1 poly = w_hi, poly
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 A_lo = A_lo, w_hi // A_lo = A_lo + w_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// if (PR_1) sigma = -1.0
-// if (PR_2) sigma = 1.0
-//
-(p0) fadd.s1 A_lo = A_lo, poly
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 Res_lo = sigma, A_lo, P_lo // Res_lo = P_lo + sigma * A_lo
+ nop.i 999
}
-{ .mfi
- nop.m 999
+;;
+
//
-// P_hi = s_Y * P_hi
-// A_lo = A_lo + poly
+// Result = Res_hi + Res_lo * s_Y (User Supplied Rounding Mode)
//
-(p0) fadd.s1 A_lo = A_lo, w_hi
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p0) fma.s1 Res_lo = sigma, A_lo, P_lo
- nop.i 999 ;;
-}
{ .mfb
- nop.m 999
-//
-// Res_hi = P_hi + sigma * A_hi
-// Res_lo = P_lo + sigma * A_lo
-//
-(p0) fma.s0 Result = Res_lo, s_Y, Res_hi
-//
-// Raise inexact.
-//
-br.ret.sptk b0 ;;
-}
-//
-// poly1 = P_5 + zsq * poly1
-// poly2 = zsq * poly2
-//
-L(ATANL_POLY):
-{ .mmf
-(p0) xor swap = sign_X, swap
- nop.m 999
-(p0) fnma.s1 E_hold = E, U, f1 ;;
+ nop.m 999
+ fma.s0 Result = Res_lo, s_Y, Res_hi
+ br.ret.sptk b0 // Exit table path 2^-3 <= V/U < 1
}
-{ .mfi
- nop.m 999
-(p0) mov A_temp = Q
+;;
+
+
+ATANL_POLY:
+// Here if 0 < V/U < 2^-3
//
-// poly1 = P_4 + zsq * poly1
-// swap = xor(swap,sign_X)
+// ***********************************************
+// ******************** STEP4 ********************
+// ***********************************************
+
//
-// sign_X gr_002
-// swap gr_004
-// poly1 = poly1 <== Done with poly1
-// poly1 = P_4 + zsq * poly1
-// swap = xor(swap,sign_X)
+// Following:
+// Iterate 3 times E = E + E*(1.0 - E*U)
+// Also load P_8, P_7, P_6, P_5, P_4
//
-(p0) cmp.eq.unc p7, p6 = 0x00000, swap
-}
-{ .mfi
- nop.m 999
-(p0) fmpy.s1 P_hi = s_Y, P_hi
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-(p6) fsub.s1 sigma = f0, f1
- nop.i 999
+ ldfe P_8 = [table_ptr1], -16 // Load P_8
+ fnma.s1 z_lo = A_temp, U, V // z_lo = V - A_temp * U
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p7) fadd.s1 sigma = f0, f1
- nop.i 999 ;;
-}
-
-// ***********************************************
-// ******************** STEP4 ********************
-// ***********************************************
-
-{ .mmi
nop.m 999
-(p0) addl table_ptr1 = @ltoff(Constants_atan#), gp
+ fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (2)
nop.i 999
}
;;
{ .mmi
- ld8 table_ptr1 = [table_ptr1]
- nop.m 999
+ ldfe P_7 = [table_ptr1], -16 // Load P_7
+;;
+ ldfe P_6 = [table_ptr1], -16 // Load P_6
nop.i 999
}
;;
-
{ .mfi
- nop.m 999
-(p0) fma.s1 E = E, E_hold, E
-//
-// Following:
-// Iterate 3 times E = E + E*(1.0 - E*U)
-// Also load P_8, P_7, P_6, P_5, P_4
-// E_hold = 1.0 - E * U (1)
-// A_temp = Q
-//
-(p0) add table_ptr1 = 128, table_ptr1 ;;
-}
-{ .mmf
- nop.m 999
-//
-// E = E + E_hold*E (1)
-// Point to P_8.
-//
-(p0) ldfe P_8 = [table_ptr1], -16
-//
-// poly = z8*poly1 + poly2 (Typo in writeup)
-// Is (swap) != 0 ?
-//
-(p0) fnma.s1 z_lo = A_temp, U, V ;;
+ ldfe P_5 = [table_ptr1], -16 // Load P_5
+ fma.s1 E = E, E_hold, E // E = E + E_hold*E (2)
+ nop.i 999
}
-{ .mmb
- nop.m 999
-//
-// E_hold = 1.0 - E * U (2)
-//
-(p0) ldfe P_7 = [table_ptr1], -16
- nop.b 999 ;;
+;;
+
+{ .mmi
+ ldfe P_4 = [table_ptr1], -16 // Load P_4
+;;
+ ldfe P_3 = [table_ptr1], -16 // Load P_3
+ nop.i 999
}
-{ .mmb
- nop.m 999
-//
-// E = E + E_hold*E (2)
-//
-(p0) ldfe P_6 = [table_ptr1], -16
- nop.b 999 ;;
+;;
+
+{ .mfi
+ ldfe P_2 = [table_ptr1], -16 // Load P_2
+ fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (3)
+ nop.i 999
}
-{ .mmb
- nop.m 999
-//
-// E_hold = 1.0 - E * U (3)
-//
-(p0) ldfe P_5 = [table_ptr1], -16
- nop.b 999 ;;
+{ .mlx
+ nop.m 999
+ movl int_temp = 0x24005 // Signexp for small neg number
}
+;;
+
{ .mmf
- nop.m 999
-//
-// E = E + E_hold*E (3)
+ ldfe P_1 = [table_ptr1], -16 // Load P_1
+ setf.exp tmp_small = int_temp // Form small neg number
+ fma.s1 E = E, E_hold, E // E = E + E_hold*E (3)
+}
+;;
+
//
//
// At this point E approximates 1/U to roughly working precision
-// z = V*E approximates V/U
+// Z = V*E approximates V/U
//
-(p0) ldfe P_4 = [table_ptr1], -16
-(p0) fnma.s1 E_hold = E, U, f1 ;;
+{ .mfi
+ nop.m 999
+ fmpy.s1 Z = V, E // Z = V * E
+ nop.i 999
}
-{ .mmb
- nop.m 999
-//
-// Z = V * E
-//
-(p0) ldfe P_3 = [table_ptr1], -16
- nop.b 999 ;;
+{ .mfi
+ nop.m 999
+ fmpy.s1 z_lo = z_lo, E // z_lo = z_lo * E
+ nop.i 999
}
-{ .mmb
- nop.m 999
+;;
+
//
-// zsq = Z * Z
+// Now what we want to do is
+// poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
+// poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3))
//
-(p0) ldfe P_2 = [table_ptr1], -16
- nop.b 999 ;;
-}
-{ .mmb
- nop.m 999
//
-// z8 = zsq * zsq
+// Fixup added to force inexact later -
+// A_hi = A_temp + z_lo
+// z_lo = (A_temp - A_hi) + z_lo
//
-(p0) ldfe P_1 = [table_ptr1], -16
- nop.b 999 ;;
-}
-{ .mlx
- nop.m 999
-(p0) movl int_temp = 0x24005
-}
{ .mfi
- nop.m 999
-(p0) fma.s1 E = E, E_hold, E
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 zsq = Z, Z // zsq = Z * Z
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fnma.s1 E_hold = E, U, f1
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 A_hi = A_temp, z_lo // A_hi = A_temp + z_lo
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fma.s1 E = E, E_hold, E
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 poly1 = zsq, P_8, P_7 // poly1 = P_7 + zsq * P_8
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fmpy.s1 Z = V, E
- nop.i 999
+ nop.m 999
+ fma.s1 poly2 = zsq, P_3, P_2 // poly2 = P_2 + zsq * P_3
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// z_lo = V - A_temp * U
-// if (PR_2) sigma = 1.0
-//
-(p0) fmpy.s1 z_lo = z_lo, E
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 z4 = zsq, zsq // z4 = zsq * zsq
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fmpy.s1 zsq = Z, Z
- nop.i 999
+ nop.m 999
+ fsub.s1 A_temp = A_temp, A_hi // A_temp = A_temp - A_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// z_lo = z_lo * E
-// if (PR_1) sigma = -1.0
-//
-(p0) fadd.s1 A_hi = A_temp, z_lo
- nop.i 999 ;;
+ nop.m 999
+ fmerge.s tmp = A_hi, A_hi // Copy tmp = A_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// z8 = z8 * z8
-//
-//
-// Now what we want to do is
-// poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8)))
-// poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3))
-//
-(p0) fma.s1 poly1 = zsq, P_8, P_7
- nop.i 999
+ nop.m 999
+ fma.s1 poly1 = zsq, poly1, P_6 // poly1 = P_6 + zsq * poly1
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fma.s1 poly2 = zsq, P_3, P_2
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 poly2 = zsq, poly2, P_1 // poly2 = P_1 + zsq * poly2
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fmpy.s1 z8 = zsq, zsq
- nop.i 999
+ nop.m 999
+ fmpy.s1 z8 = z4, z4 // z8 = z4 * z4
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fsub.s1 A_temp = A_temp, A_hi
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 z_lo = A_temp, z_lo // z_lo = (A_temp - A_hi) + z_lo
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// A_lo = Z * poly + z_lo
-//
-(p0) fmerge.s tmp = A_hi, A_hi
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 poly1 = zsq, poly1, P_5 // poly1 = P_5 + zsq * poly1
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// poly1 = P_7 + zsq * P_8
-// poly2 = P_2 + zsq * P_3
-//
-(p0) fma.s1 poly1 = zsq, poly1, P_6
- nop.i 999
+ nop.m 999
+ fmpy.s1 poly2 = poly2, zsq // poly2 = zsq * poly2
+ nop.i 999
}
+;;
+
+// Create small GR double in case need to raise underflow
{ .mfi
- nop.m 999
-(p0) fma.s1 poly2 = zsq, poly2, P_1
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 poly1 = zsq, poly1, P_4 // poly1 = P_4 + zsq * poly1
+ dep GR_temp = -1,r0,0,53
}
+;;
+
+// Create small double in case need to raise underflow
{ .mfi
- nop.m 999
-(p0) fmpy.s1 z8 = z8, z8
- nop.i 999
+ setf.d FR_temp = GR_temp
+ fma.s1 poly = z8, poly1, poly2 // poly = poly2 + z8 * poly1
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fadd.s1 z_lo = A_temp, z_lo
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 A_lo = Z, poly, z_lo // A_lo = z_lo + Z * poly
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// poly1 = P_6 + zsq * poly1
-// poly2 = P_2 + zsq * poly2
-//
-(p0) fma.s1 poly1 = zsq, poly1, P_5
- nop.i 999
+ nop.m 999
+ fadd.s1 A_hi = tmp, A_lo // A_hi = tmp + A_lo
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fmpy.s1 poly2 = poly2, zsq
- nop.i 999 ;;
+ nop.m 999
+ fsub.s1 tmp = tmp, A_hi // tmp = tmp - A_hi
+ nop.i 999
}
{ .mfi
- nop.m 999
-//
-// Result = Res_hi + Res_lo (User Supplied Rounding Mode)
-//
-(p0) fmpy.s1 P_5 = P_5, P_5
- nop.i 999 ;;
+ nop.m 999
+ fmpy.s1 A_hi = s_Y, A_hi // A_hi = s_Y * A_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fma.s1 poly1 = zsq, poly1, P_4
- nop.i 999 ;;
+ nop.m 999
+ fadd.s1 A_lo = tmp, A_lo // A_lo = tmp + A_lo
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fma.s1 poly = z8, poly1, poly2
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 Res_hi = sigma, A_hi, P_hi // Res_hi = P_hi + sigma * A_hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
+ nop.m 999
+ fsub.s1 tmp = P_hi, Res_hi // tmp = P_hi - Res_hi
+ nop.i 999
+}
+;;
+
//
-// Fixup added to force inexact later -
-// A_hi = A_temp + z_lo
-// z_lo = (A_temp - A_hi) + z_lo
+// Test if A_lo is zero
//
-(p0) fma.s1 A_lo = Z, poly, z_lo
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-(p0) fadd.s1 A_hi = tmp, A_lo
- nop.i 999 ;;
+ nop.m 999
+ fclass.m p6,p0 = A_lo, 0x007 // Test A_lo = 0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fsub.s1 tmp = tmp, A_hi
- nop.i 999
+ nop.m 999
+(p6) mov A_lo = tmp_small // If A_lo zero, make very small
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fmpy.s1 A_hi = s_Y, A_hi
- nop.i 999 ;;
+ nop.m 999
+ fma.s1 tmp = A_hi, sigma, tmp // tmp = sigma * A_hi + tmp
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fadd.s1 A_lo = tmp, A_lo
- nop.i 999
+ nop.m 999
+ fma.s1 sigma = A_lo, sigma, P_lo // sigma = A_lo * sigma + P_lo
+ nop.i 999
}
+;;
+
{ .mfi
-(p0) setf.exp tmp = int_temp
+ nop.m 999
+ fma.s1 Res_lo = s_Y, sigma, tmp // Res_lo = s_Y * sigma + tmp
+ nop.i 999
+}
+;;
+
//
-// P_hi = s_Y * P_hi
-// A_hi = s_Y * A_hi
+// Test if Res_lo is denormal
//
-(p0) fma.s1 Res_hi = sigma, A_hi, P_hi
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p6,p0 = A_lo, 0x007
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-(p6) mov A_lo = tmp
- nop.i 999
+ nop.m 999
+ fclass.m p14, p15 = Res_lo, 0x0b
+ nop.i 999
}
-{ .mfi
- nop.m 999
+;;
+
//
-// Res_hi = P_hi + sigma * A_hi
+// Compute Result = Res_lo + Res_hi. Use s3 if Res_lo is denormal.
//
-(p0) fsub.s1 tmp = P_hi, Res_hi
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-//
-// tmp = P_hi - Res_hi
-//
-(p0) fma.s1 tmp = A_hi, sigma, tmp
- nop.i 999
+ nop.m 999
+(p14) fadd.s3 Result = Res_lo, Res_hi // Result for Res_lo denormal
+ nop.i 999
}
{ .mfi
- nop.m 999
-(p0) fma.s1 sigma = A_lo, sigma, P_lo
- nop.i 999 ;;
+ nop.m 999
+(p15) fadd.s0 Result = Res_lo, Res_hi // Result for Res_lo normal
+ nop.i 999
}
+;;
+
+//
+// If Res_lo is denormal test if Result equals zero
+//
{ .mfi
- nop.m 999
-//
-// tmp = sigma * A_hi + tmp
-// sigma = A_lo * sigma + P_lo
-//
-(p0) fma.s1 Res_lo = s_Y, sigma, tmp
- nop.i 999 ;;
+ nop.m 999
+(p14) fclass.m.unc p14, p0 = Result, 0x07
+ nop.i 999
}
-{ .mfb
- nop.m 999
+;;
+
//
-// Res_lo = s_Y * sigma + tmp
+// If Res_lo is denormal and Result equals zero, raise inexact, underflow
+// by squaring small double
//
-(p0) fadd.s0 Result = Res_lo, Res_hi
-br.ret.sptk b0 ;;
+{ .mfb
+ nop.m 999
+(p14) fmpy.d.s0 FR_temp = FR_temp, FR_temp
+ br.ret.sptk b0 // Exit POLY path, 0 < Q < 2^-3
}
-L(ATANL_NATVAL):
-L(ATANL_UNSUPPORTED):
-L(ATANL_NAN):
+;;
+
+
+ATANL_UNSUPPORTED:
{ .mfb
- nop.m 999
-(p0) fmpy.s0 Result = ArgX,ArgY
-(p0) br.ret.sptk b0 ;;
+ nop.m 999
+ fmpy.s0 Result = ArgX,ArgY
+ br.ret.sptk b0
}
-L(ATANL_SPECIAL_HANDLING):
+;;
+
+// Here if y natval, nan, inf, zero
+ATANL_Y_SPECIAL:
+// Here if x natval, nan, inf, zero
+ATANL_X_SPECIAL:
{ .mfi
- nop.m 999
-(p0) fcmp.eq.s0 p0, p6 = f1, ArgY_orig
- nop.i 999
+ nop.m 999
+ fclass.m p13,p12 = ArgY_orig, 0x0c3 // Test y nan
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fcmp.eq.s0 p0, p5 = f1, ArgX_orig
- nop.i 999 ;;
+ nop.m 999
+ fclass.m p15,p14 = ArgY_orig, 0x103 // Test y natval
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p6, p7 = ArgY, 0x007
- nop.i 999
-}
-{ .mlx
- nop.m 999
-(p0) movl special = 992
+ nop.m 999
+(p12) fclass.m p13,p0 = ArgX_orig, 0x0c3 // Test x nan
+ nop.i 999
}
;;
-
-{ .mmi
+{ .mfi
nop.m 999
-(p0) addl table_ptr1 = @ltoff(Constants_atan#), gp
+(p14) fclass.m p15,p0 = ArgX_orig, 0x103 // Test x natval
nop.i 999
}
;;
-{ .mmi
- ld8 table_ptr1 = [table_ptr1]
+{ .mfb
nop.m 999
- nop.i 999
+(p13) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result nan if x or y nan
+(p13) br.ret.spnt b0 // Exit if x or y nan
+}
+;;
+
+{ .mfb
+ nop.m 999
+(p15) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result natval if x or y natval
+(p15) br.ret.spnt b0 // Exit if x or y natval
}
;;
-{ .mib
-(p0) add table_ptr1 = table_ptr1, special
- nop.i 999
-(p7) br.cond.spnt L(ATANL_ArgY_Not_ZERO) ;;
+// Here if x or y inf or zero
+ATANL_SPECIAL_HANDLING:
+{ .mfi
+ nop.m 999
+ fclass.m p6, p7 = ArgY_orig, 0x007 // Test y zero
+ mov special = 992 // Offset to table
}
+;;
+
+{ .mfb
+ add table_ptr1 = table_base, special // Point to pi
+ fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig // Dummy to set denormal flag
+(p7) br.cond.spnt ATANL_ArgY_Not_ZERO // Branch if y not zero
+}
+;;
+
+// Here if y zero
{ .mmf
-(p0) ldfd Result = [table_ptr1], 8
- nop.m 999
-(p6) fclass.m.unc p14, p0 = ArgX, 0x035 ;;
+ ldfd Result = [table_ptr1], 8 // Get pi high
+ nop.m 999
+ fclass.m p14, p0 = ArgX, 0x035 // Test for x>=+0
}
+;;
+
{ .mmf
- nop.m 999
-(p0) ldfd Result_lo = [table_ptr1], -8
-(p6) fclass.m.unc p15, p0 = ArgX, 0x036 ;;
+ nop.m 999
+ ldfd Result_lo = [table_ptr1], -8 // Get pi lo
+ fclass.m p15, p0 = ArgX, 0x036 // Test for x<=-0
}
+;;
+
+//
+// Return sign_Y * 0 when ArgX > +0
+//
{ .mfi
- nop.m 999
-(p14) fmerge.s Result = ArgY, f0
- nop.i 999
+ nop.m 999
+(p14) fmerge.s Result = ArgY, f0 // If x>=+0, y=0, hi sgn(y)*0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p6) fclass.m.unc p13, p0 = ArgX, 0x007
- nop.i 999 ;;
+ nop.m 999
+ fclass.m p13, p0 = ArgX, 0x007 // Test for x=0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p14) fmerge.s Result_lo = ArgY, f0
- nop.i 999 ;;
+ nop.m 999
+(p14) fmerge.s Result_lo = ArgY, f0 // If x>=+0, y=0, lo sgn(y)*0
+ nop.i 999
}
+;;
+
{ .mfi
-(p13) mov GR_Parameter_TAG = 36
- nop.f 999
- nop.i 999 ;;
+(p13) mov GR_Parameter_TAG = 36 // Error tag for x=0, y=0
+ nop.f 999
+ nop.i 999
}
-{ .mfi
- nop.m 999
+;;
+
//
-// Return sign_Y * 0 when ArgX > +0
+// Return sign_Y * pi when ArgX < -0
//
-(p15) fmerge.s Result = ArgY, Result
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-(p15) fmerge.s Result_lo = ArgY, Result_lo
- nop.i 999 ;;
+ nop.m 999
+(p15) fmerge.s Result = ArgY, Result // If x<0, y=0, hi=sgn(y)*pi
+ nop.i 999
}
-{ .mfb
- nop.m 999
-//
-// Return sign_Y * 0 when ArgX < -0
-//
-(p0) fadd.s0 Result = Result, Result_lo
-(p13) br.cond.spnt __libm_error_region ;;
+;;
+
+{ .mfi
+ nop.m 999
+(p15) fmerge.s Result_lo = ArgY, Result_lo // If x<0, y=0, lo=sgn(y)*pi
+ nop.i 999
}
-{ .mib
- nop.m 999
- nop.i 999
+;;
+
//
-// Call error support funciton for atan(0,0)
+// Call error support function for atan(0,0)
//
-(p0) br.ret.sptk b0 ;;
-}
-L(ATANL_ArgY_Not_ZERO):
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p9, p10 = ArgY, 0x023
- nop.i 999 ;;
+{ .mfb
+ nop.m 999
+ fadd.s0 Result = Result, Result_lo
+(p13) br.cond.spnt __libm_error_region // Branch if atan(0,0)
}
+;;
+
{ .mib
- nop.m 999
- nop.i 999
-(p10) br.cond.spnt L(ATANL_ArgY_Not_INF) ;;
-}
-{ .mfi
- nop.m 999
-(p9) fclass.m.unc p6, p0 = ArgX, 0x017
- nop.i 999
-}
-{ .mfi
- nop.m 999
-(p9) fclass.m.unc p7, p0 = ArgX, 0x021
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p9) fclass.m.unc p8, p0 = ArgX, 0x022
- nop.i 999 ;;
-}
-{ .mmi
-(p6) add table_ptr1 = 16, table_ptr1 ;;
-(p0) ldfd Result = [table_ptr1], 8
- nop.i 999 ;;
-}
-{ .mfi
-(p0) ldfd Result_lo = [table_ptr1], -8
- nop.f 999
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p6) fmerge.s Result = ArgY, Result
- nop.i 999 ;;
+ nop.m 999
+ nop.i 999
+ br.ret.sptk b0 // Exit for y=0, x not 0
}
+;;
+
+// Here if y not zero
+ATANL_ArgY_Not_ZERO:
{ .mfi
- nop.m 999
-(p6) fmerge.s Result_lo = ArgY, Result_lo
- nop.i 999 ;;
+ nop.m 999
+ fclass.m p0, p10 = ArgY, 0x023 // Test y inf
+ nop.i 999
}
+;;
+
{ .mfb
- nop.m 999
-(p6) fadd.s0 Result = Result, Result_lo
-(p6) br.ret.sptk b0 ;;
+ nop.m 999
+ fclass.m p6, p0 = ArgX, 0x017 // Test for 0 <= |x| < inf
+(p10) br.cond.spnt ATANL_ArgY_Not_INF // Branch if 0 < |y| < inf
}
+;;
+
+// Here if y=inf
//
-// Load PI/2 and adjust its sign.
// Return +PI/2 when ArgY = +Inf and ArgX = +/-0 or normal
// Return -PI/2 when ArgY = -Inf and ArgX = +/-0 or normal
+// Return +PI/4 when ArgY = +Inf and ArgX = +Inf
+// Return -PI/4 when ArgY = -Inf and ArgX = +Inf
+// Return +3PI/4 when ArgY = +Inf and ArgX = -Inf
+// Return -3PI/4 when ArgY = -Inf and ArgX = -Inf
//
-{ .mmi
-(p7) add table_ptr1 = 32, table_ptr1 ;;
-(p7) ldfd Result = [table_ptr1], 8
- nop.i 999 ;;
-}
{ .mfi
-(p7) ldfd Result_lo = [table_ptr1], -8
- nop.f 999
- nop.i 999 ;;
-}
-{ .mfi
- nop.m 999
-(p7) fmerge.s Result = ArgY, Result
- nop.i 999 ;;
+ nop.m 999
+ fclass.m p7, p0 = ArgX, 0x021 // Test for x=+inf
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p7) fmerge.s Result_lo = ArgY, Result_lo
- nop.i 999 ;;
-}
-{ .mfb
- nop.m 999
-(p7) fadd.s0 Result = Result, Result_lo
-(p7) br.ret.sptk b0 ;;
+(p6) add table_ptr1 = 16, table_ptr1 // Point to pi/2, if x finite
+ fclass.m p8, p0 = ArgX, 0x022 // Test for x=-inf
+ nop.i 999
}
-//
-// Load PI/4 and adjust its sign.
-// Return +PI/4 when ArgY = +Inf and ArgX = +Inf
-// Return -PI/4 when ArgY = -Inf and ArgX = +Inf
-//
+;;
+
{ .mmi
-(p8) add table_ptr1 = 48, table_ptr1 ;;
-(p8) ldfd Result = [table_ptr1], 8
- nop.i 999 ;;
+(p7) add table_ptr1 = 32, table_ptr1 // Point to pi/4 if x=+inf
+;;
+(p8) add table_ptr1 = 48, table_ptr1 // Point to 3pi/4 if x=-inf
+
+ nop.i 999
}
-{ .mfi
-(p8) ldfd Result_lo = [table_ptr1], -8
- nop.f 999
- nop.i 999 ;;
+;;
+
+{ .mmi
+ ldfd Result = [table_ptr1], 8 // Load pi/2, pi/4, or 3pi/4 hi
+;;
+ ldfd Result_lo = [table_ptr1], -8 // Load pi/2, pi/4, or 3pi/4 lo
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p8) fmerge.s Result = ArgY, Result
- nop.i 999 ;;
+ nop.m 999
+ fmerge.s Result = ArgY, Result // Merge sgn(y) in hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p8) fmerge.s Result_lo = ArgY, Result_lo
- nop.i 999 ;;
+ nop.m 999
+ fmerge.s Result_lo = ArgY, Result_lo // Merge sgn(y) in lo
+ nop.i 999
}
+;;
+
{ .mfb
- nop.m 999
-(p8) fadd.s0 Result = Result, Result_lo
-(p8) br.ret.sptk b0 ;;
+ nop.m 999
+ fadd.s0 Result = Result, Result_lo // Compute complete result
+ br.ret.sptk b0 // Exit for y=inf
}
-L(ATANL_ArgY_Not_INF):
-{ .mfi
- nop.m 999
+;;
+
+// Here if y not INF, and x=0 or INF
+ATANL_ArgY_Not_INF:
//
-// Load PI/4 and adjust its sign.
-// Return +3PI/4 when ArgY = +Inf and ArgX = -Inf
-// Return -3PI/4 when ArgY = -Inf and ArgX = -Inf
+// Return +PI/2 when ArgY NOT Inf, ArgY > 0 and ArgX = +/-0
+// Return -PI/2 when ArgY NOT Inf, ArgY < 0 and ArgX = +/-0
+// Return +0 when ArgY NOT Inf, ArgY > 0 and ArgX = +Inf
+// Return -0 when ArgY NOT Inf, ArgY < 0 and ArgX = +Inf
+// Return +PI when ArgY NOT Inf, ArgY > 0 and ArgX = -Inf
+// Return -PI when ArgY NOT Inf, ArgY < 0 and ArgX = -Inf
//
-(p0) fclass.m.unc p6, p0 = ArgX, 0x007
- nop.i 999
-}
-{ .mfi
- nop.m 999
-(p0) fclass.m.unc p7, p0 = ArgX, 0x021
- nop.i 999 ;;
-}
{ .mfi
- nop.m 999
-(p0) fclass.m.unc p8, p0 = ArgX, 0x022
- nop.i 999 ;;
-}
-{ .mmi
-(p6) add table_ptr1 = 16, table_ptr1 ;;
-(p6) ldfd Result = [table_ptr1], 8
- nop.i 999 ;;
+ nop.m 999
+ fclass.m p7, p9 = ArgX, 0x021 // Test for x=+inf
+ nop.i 999
}
+;;
+
{ .mfi
-(p6) ldfd Result_lo = [table_ptr1], -8
- nop.f 999
- nop.i 999 ;;
+ nop.m 999
+ fclass.m p6, p0 = ArgX, 0x007 // Test for x=0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p6) fmerge.s Result = ArgY, Result
- nop.i 999 ;;
+(p6) add table_ptr1 = 16, table_ptr1 // Point to pi/2
+ fclass.m p8, p0 = ArgX, 0x022 // Test for x=-inf
+ nop.i 999
}
+;;
+
+.pred.rel "mutex",p7,p9
{ .mfi
- nop.m 999
-(p6) fmerge.s Result_lo = ArgY, Result_lo
- nop.i 999 ;;
-}
-{ .mfb
- nop.m 999
-(p6) fadd.s0 Result = Result, Result_lo
-(p6) br.ret.spnt b0 ;;
+(p9) ldfd Result = [table_ptr1], 8 // Load pi or pi/2 hi
+(p7) fmerge.s Result = ArgY, f0 // If y not inf, x=+inf, sgn(y)*0
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-//
-// return = sign_Y * PI/2 when ArgX = 0
-//
-(p7) fmerge.s Result = ArgY, f0
- nop.i 999 ;;
-}
-{ .mfb
- nop.m 999
-(p7) fnorm.s0 Result = Result
-(p7) br.ret.spnt b0 ;;
-}
-//
-// return = sign_Y * 0 when ArgX = Inf
-//
-{ .mmi
-(p8) ldfd Result = [table_ptr1], 8 ;;
-(p8) ldfd Result_lo = [table_ptr1], -8
- nop.i 999 ;;
+(p9) ldfd Result_lo = [table_ptr1], -8 // Load pi or pi/2 lo
+(p7) fnorm.s0 Result = Result // If y not inf, x=+inf normalize
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p8) fmerge.s Result = ArgY, Result
- nop.i 999 ;;
+ nop.m 999
+(p9) fmerge.s Result = ArgY, Result // Merge sgn(y) in hi
+ nop.i 999
}
+;;
+
{ .mfi
- nop.m 999
-(p8) fmerge.s Result_lo = ArgY, Result_lo
- nop.i 999 ;;
+ nop.m 999
+(p9) fmerge.s Result_lo = ArgY, Result_lo // Merge sgn(y) in lo
+ nop.i 999
}
+;;
+
{ .mfb
- nop.m 999
-(p8) fadd.s0 Result = Result, Result_lo
-(p8) br.ret.sptk b0 ;;
+ nop.m 999
+(p9) fadd.s0 Result = Result, Result_lo // Compute complete result
+ br.ret.spnt b0 // Exit for y not inf, x=0,inf
}
-//
-// return = sign_Y * PI when ArgX = -Inf
-//
-.endp atan2l
-ASM_SIZE_DIRECTIVE(atan2l)
-ASM_SIZE_DIRECTIVE(__atan2l)
-ASM_SIZE_DIRECTIVE(__ieee754_atan2l)
-
-.proc __libm_error_region
-__libm_error_region:
+;;
+
+GLOBAL_IEEE754_END(atan2l)
+LOCAL_LIBM_ENTRY(__libm_error_region)
.prologue
{ .mfi
add GR_Parameter_Y=-32,sp // Parameter 2 value
@@ -2001,7 +1999,6 @@ __libm_error_region:
br.ret.sptk b0 // Return
};;
-.endp __libm_error_region
-ASM_SIZE_DIRECTIVE(__libm_error_region)
+LOCAL_LIBM_END(__libm_error_region#)
.type __libm_error_support#,@function
.global __libm_error_support#