From 833861be818bb5d45ab0c47370b84068dfb2fedf Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 16 Feb 2005 12:31:10 +0000 Subject: import later fedora-branch tweaks --- sysdeps/ia64/fpu/s_atanl.S | 2167 ++++++++++++++++++++++---------------------- 1 file changed, 1085 insertions(+), 1082 deletions(-) (limited to 'sysdeps/ia64/fpu/s_atanl.S') diff --git a/sysdeps/ia64/fpu/s_atanl.S b/sysdeps/ia64/fpu/s_atanl.S index bfd9f458f4..28d44c1850 100644 --- a/sysdeps/ia64/fpu/s_atanl.S +++ b/sysdeps/ia64/fpu/s_atanl.S @@ -1,10 +1,10 @@ .file "atanl.s" - -// Copyright (c) 2000 - 2003, Intel Corporation +// Copyright (C) 2000, 2001, Intel Corporation // All rights reserved. -// -// Contributed 2000 by the Intel Numerics Group, Intel Corporation +// +// Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, +// and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -20,7 +20,7 @@ // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. - +// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -35,52 +35,41 @@ // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at -// http://www.intel.com/software/products/opensource/libraries/num.htm. +// http://developer.intel.com/opensource. // // -//********************************************************************* +// ********************************************************************* // // History -// 02/02/00 (hand-optimized) -// 04/04/00 Unwind support added -// 08/15/00 Bundle added after call to __libm_error_support to properly +// 2/02/00 (hand-optimized) +// 4/04/00 Unwind support added +// 8/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. -// 03/13/01 Fixed flags when denormal raised on intermediate result -// 01/08/02 Improved speed. -// 02/06/02 Corrected .section statement -// 05/20/02 Cleaned up namespace and sf0 syntax -// 02/10/03 Reordered header: .section, .global, .proc, .align; -// used data8 for long double table values // -//********************************************************************* +// ********************************************************************* // // Function: atanl(x) = inverse tangent(x), for double extended x values -// Function: atan2l(y,x) = atan(y/x), for double extended y, x values -// -// API +// Function: atan2l(y,x) = atan(y/x), for double extended x values // -// long double atanl (long double x) -// long double atan2l (long double y, long double x) -// -//********************************************************************* +// ********************************************************************* // // Resources Used: // // Floating-Point Registers: f8 (Input and Return Value) -// f9 (Input for atan2l) -// f10-f15, f32-f83 +// f9-f15 +// f32-f79 // // General Purpose Registers: -// r32-r51 -// r49-r52 (Arguments to error support for 0,0 case) +// r32-r48 +// r49,r50,r51,r52 (Arguments to error support for 0,0 case) // // Predicate Registers: p6-p15 // -//********************************************************************* +// ********************************************************************* // // IEEE Special Conditions: // -// Denormal fault raised on denormal inputs +// Denormal fault raised on denormal inputs // Underflow exceptions may occur // Special error handling for the y=0 and x=0 case // Inexact raised when appropriate by algorithm @@ -103,7 +92,7 @@ // atan2l(+/-Inf, Inf) = +/-pi/4 // atan2l(+/-Inf, -Inf) = +/-3pi/4 // -//********************************************************************* +// ********************************************************************* // // Mathematical Description // --------------------------- @@ -119,16 +108,16 @@ // // // (Arg_X, Arg_Y) x -// \ -// \ -// \ -// \ +// \ +// \ +// \ +// \ // \ angle between is ATANL(Arg_Y,Arg_X) -// \ +// \ // ------------------> X-axis // Origin @@ -243,14 +232,14 @@ // z_hi = 2^k * 1.b_1 b_2 b_3 b_4 1 // // then -// / \ +// / \ // | (V/U) - z_hi | // arctan(V/U) = arctan(z_hi) + acrtan| -------------- | // | 1 + (V/U)*z_hi | // \ / // -// / \ +// / \ // | V - z_hi*U | // = arctan(z_hi) + acrtan| -------------- | @@ -306,7 +295,7 @@ // U := max( |Arg_X|, |Arg_Y| ) // V := min( |Arg_X|, |Arg_Y| ) // -// execute: frcpa E, pred, V, U +// execute: frcap E, pred, V, U // If pred is 0, go to Step 5 for special cases handling. // // Step 2. Decide on branch. @@ -410,7 +399,7 @@ // // z := V * E ...z approximates V/U to roughly working precision // zsq := z * z -// z4 := zsq * zsq; z8 := z4 * z4 +// z8 := zsq * zsq; z8 := z8 * z8 // // poly1 := P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8))) // poly2 := zsq*(P_1 + zsq*(P_2 + zsq*P_3)) @@ -449,11 +438,12 @@ // // Step 5. Special Cases // -// These are detected early in the function by fclass instructions. +// If pred is 0 where pred is obtained in +// frcap E, pred, V, U // -// We are in one of those special cases when X or Y is 0,+-inf or NaN +// we are in one of those special cases of 0,+-inf or NaN // -// If one of X and Y is NaN, return X+Y (which will generate +// If one of U and V is NaN, return U+V (which will generate // invalid in case one is a signaling NaN). Otherwise, // return the Result as described in the table // @@ -479,6 +469,8 @@ // // +#include "libm_support.h" + ArgY_orig = f8 Result = f8 FR_RESULT = f8 @@ -512,7 +504,6 @@ Res_hi = f49 Res_lo = f50 Z = f52 zsq = f53 -z4 = f54 z8 = f54 poly1 = f55 poly2 = f56 @@ -530,8 +521,8 @@ P_5 = f67 P_6 = f68 P_7 = f69 P_8 = f70 -U_hold = f71 -TWO_TO_NEG3 = f72 +TWO_TO_NEG3 = f71 +U_hold = f72 C_hi_hold = f73 E_hold = f74 M = f75 @@ -539,11 +530,6 @@ ArgX_abs = f76 ArgY_abs = f77 Result_lo = f78 A_temp = f79 -FR_temp = f80 -Xsq = f81 -Ysq = f82 -tmp_small = f83 - GR_SAVE_PFS = r33 GR_SAVE_B0 = r34 GR_SAVE_GP = r35 @@ -559,1399 +545,1415 @@ exp_ArgY = r44 exponent_Q = r45 significand_Q = r46 special = r47 -sp_exp_Q = r48 -sp_exp_4sig_Q = r49 -table_base = r50 -int_temp = r51 - +special1 = r48 GR_Parameter_X = r49 GR_Parameter_Y = r50 GR_Parameter_RESULT = r51 GR_Parameter_TAG = r52 -GR_temp = r52 - -RODATA -.align 16 - -LOCAL_OBJECT_START(Constants_atan) -// double pi/2 -data8 0x3FF921FB54442D18 -// single lo_pi/2, two**(-3) -data4 0x248D3132, 0x3E000000 -data8 0xAAAAAAAAAAAAAAA3, 0xBFFD // P_1 -data8 0xCCCCCCCCCCCC54B2, 0x3FFC // P_2 -data8 0x9249249247E4D0C2, 0xBFFC // P_3 -data8 0xE38E38E058870889, 0x3FFB // P_4 -data8 0xBA2E895B290149F8, 0xBFFB // P_5 -data8 0x9D88E6D4250F733D, 0x3FFB // P_6 -data8 0x884E51FFFB8745A0, 0xBFFB // P_7 -data8 0xE1C7412B394396BD, 0x3FFA // P_8 -data8 0xAAAAAAAAAAAAA52F, 0xBFFD // Q_1 -data8 0xCCCCCCCCC75B60D3, 0x3FFC // Q_2 -data8 0x924923AD011F1940, 0xBFFC // Q_3 -data8 0xE36F716D2A5F89BD, 0x3FFB // Q_4 +int_temp = r52 + +#ifdef _LIBC +.rodata +#else +.data +#endif +.align 64 + +Constants_atan: +ASM_TYPE_DIRECTIVE(Constants_atan,@object) +data4 0x54442D18, 0x3FF921FB, 0x248D3132, 0x3E000000 +// double pi/2, single lo_pi/2, two**(-3) +data4 0xAAAAAAA3, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // P_1 +data4 0xCCCC54B2, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // P_2 +data4 0x47E4D0C2, 0x92492492, 0x0000BFFC, 0x00000000 // P_3 +data4 0x58870889, 0xE38E38E0, 0x00003FFB, 0x00000000 // P_4 +data4 0x290149F8, 0xBA2E895B, 0x0000BFFB, 0x00000000 // P_5 +data4 0x250F733D, 0x9D88E6D4, 0x00003FFB, 0x00000000 // P_6 +data4 0xFB8745A0, 0x884E51FF, 0x0000BFFB, 0x00000000 // P_7 +data4 0x394396BD, 0xE1C7412B, 0x00003FFA, 0x00000000 // P_8 +data4 0xAAAAA52F, 0xAAAAAAAA, 0x0000BFFD, 0x00000000 // Q_1 +data4 0xC75B60D3, 0xCCCCCCCC, 0x00003FFC, 0x00000000 // Q_2 +data4 0x011F1940, 0x924923AD, 0x0000BFFC, 0x00000000 // Q_3 +data4 0x2A5F89BD, 0xE36F716D, 0x00003FFB, 0x00000000 // Q_4 // // Entries Tbl_hi (double precision) // B = 1+Index/16+1/32 Index = 0 // Entries Tbl_lo (single precision) // B = 1+Index/16+1/32 Index = 0 // -data8 0x3FE9A000A935BD8E -data4 0x23ACA08F, 0x00000000 +data4 0xA935BD8E, 0x3FE9A000, 0x23ACA08F, 0x00000000 // // Entries Tbl_hi (double precision) Index = 0,1,...,15 // B = 2^(-1)*(1+Index/16+1/32) // Entries Tbl_lo (single precision) // Index = 0,1,...,15 B = 2^(-1)*(1+Index/16+1/32) // -data8 0x3FDE77EB7F175A34 -data4 0x238729EE, 0x00000000 -data8 0x3FE0039C73C1A40B -data4 0x249334DB, 0x00000000 -data8 0x3FE0C6145B5B43DA -data4 0x22CBA7D1, 0x00000000 -data8 0x3FE1835A88BE7C13 -data4 0x246310E7, 0x00000000 -data8 0x3FE23B71E2CC9E6A -data4 0x236210E5, 0x00000000 -data8 0x3FE2EE628406CBCA -data4 0x2462EAF5, 0x00000000 -data8 0x3FE39C391CD41719 -data4 0x24B73EF3, 0x00000000 -data8 0x3FE445065B795B55 -data4 0x24C11260, 0x00000000 -data8 0x3FE4E8DE5BB6EC04 -data4 0x242519EE, 0x00000000 -data8 0x3FE587D81F732FBA -data4 0x24D4346C, 0x00000000 -data8 0x3FE6220D115D7B8D -data4 0x24ED487B, 0x00000000 -data8 0x3FE6B798920B3D98 -data4 0x2495FF1E, 0x00000000 -data8 0x3FE748978FBA8E0F -data4 0x223D9531, 0x00000000 -data8 0x3FE7D528289FA093 -data4 0x242B0411, 0x00000000 -data8 0x3FE85D69576CC2C5 -data4 0x2335B374, 0x00000000 -data8 0x3FE8E17AA99CC05D -data4 0x24C27CFB, 0x00000000 +data4 0x7F175A34, 0x3FDE77EB, 0x238729EE, 0x00000000 +data4 0x73C1A40B, 0x3FE0039C, 0x249334DB, 0x00000000 +data4 0x5B5B43DA, 0x3FE0C614, 0x22CBA7D1, 0x00000000 +data4 0x88BE7C13, 0x3FE1835A, 0x246310E7, 0x00000000 +data4 0xE2CC9E6A, 0x3FE23B71, 0x236210E5, 0x00000000 +data4 0x8406CBCA, 0x3FE2EE62, 0x2462EAF5, 0x00000000 +data4 0x1CD41719, 0x3FE39C39, 0x24B73EF3, 0x00000000 +data4 0x5B795B55, 0x3FE44506, 0x24C11260, 0x00000000 +data4 0x5BB6EC04, 0x3FE4E8DE, 0x242519EE, 0x00000000 +data4 0x1F732FBA, 0x3FE587D8, 0x24D4346C, 0x00000000 +data4 0x115D7B8D, 0x3FE6220D, 0x24ED487B, 0x00000000 +data4 0x920B3D98, 0x3FE6B798, 0x2495FF1E, 0x00000000 +data4 0x8FBA8E0F, 0x3FE74897, 0x223D9531, 0x00000000 +data4 0x289FA093, 0x3FE7D528, 0x242B0411, 0x00000000 +data4 0x576CC2C5, 0x3FE85D69, 0x2335B374, 0x00000000 +data4 0xA99CC05D, 0x3FE8E17A, 0x24C27CFB, 0x00000000 // // Entries Tbl_hi (double precision) Index = 0,1,...,15 // B = 2^(-2)*(1+Index/16+1/32) // Entries Tbl_lo (single precision) // Index = 0,1,...,15 B = 2^(-2)*(1+Index/16+1/32) // -data8 0x3FD025FA510665B5 -data4 0x24263482, 0x00000000 -data8 0x3FD1151A362431C9 -data4 0x242C8DC9, 0x00000000 -data8 0x3FD2025567E47C95 -data4 0x245CF9BA, 0x00000000 -data8 0x3FD2ED987A823CFE -data4 0x235C892C, 0x00000000 -data8 0x3FD3D6D129271134 -data4 0x2389BE52, 0x00000000 -data8 0x3FD4BDEE586890E6 -data4 0x24436471, 0x00000000 -data8 0x3FD5A2E0175E0F4E -data4 0x2389DBD4, 0x00000000 -data8 0x3FD685979F5FA6FD -data4 0x2476D43F, 0x00000000 -data8 0x3FD7660752817501 -data4 0x24711774, 0x00000000 -data8 0x3FD84422B8DF95D7 -data4 0x23EBB501, 0x00000000 -data8 0x3FD91FDE7CD0C662 -data4 0x23883A0C, 0x00000000 -data8 0x3FD9F93066168001 -data4 0x240DF63F, 0x00000000 -data8 0x3FDAD00F5422058B -data4 0x23FE261A, 0x00000000 -data8 0x3FDBA473378624A5 -data4 0x23A8CD0E, 0x00000000 -data8 0x3FDC76550AAD71F8 -data4 0x2422D1D0, 0x00000000 -data8 0x3FDD45AEC9EC862B -data4 0x2344A109, 0x00000000 +data4 0x510665B5, 0x3FD025FA, 0x24263482, 0x00000000 +data4 0x362431C9, 0x3FD1151A, 0x242C8DC9, 0x00000000 +data4 0x67E47C95, 0x3FD20255, 0x245CF9BA, 0x00000000 +data4 0x7A823CFE, 0x3FD2ED98, 0x235C892C, 0x00000000 +data4 0x29271134, 0x3FD3D6D1, 0x2389BE52, 0x00000000 +data4 0x586890E6, 0x3FD4BDEE, 0x24436471, 0x00000000 +data4 0x175E0F4E, 0x3FD5A2E0, 0x2389DBD4, 0x00000000 +data4 0x9F5FA6FD, 0x3FD68597, 0x2476D43F, 0x00000000 +data4 0x52817501, 0x3FD76607, 0x24711774, 0x00000000 +data4 0xB8DF95D7, 0x3FD84422, 0x23EBB501, 0x00000000 +data4 0x7CD0C662, 0x3FD91FDE, 0x23883A0C, 0x00000000 +data4 0x66168001, 0x3FD9F930, 0x240DF63F, 0x00000000 +data4 0x5422058B, 0x3FDAD00F, 0x23FE261A, 0x00000000 +data4 0x378624A5, 0x3FDBA473, 0x23A8CD0E, 0x00000000 +data4 0x0AAD71F8, 0x3FDC7655, 0x2422D1D0, 0x00000000 +data4 0xC9EC862B, 0x3FDD45AE, 0x2344A109, 0x00000000 // // Entries Tbl_hi (double precision) Index = 0,1,...,15 // B = 2^(-3)*(1+Index/16+1/32) // Entries Tbl_lo (single precision) // Index = 0,1,...,15 B = 2^(-3)*(1+Index/16+1/32) // -data8 0x3FC068D584212B3D -data4 0x239874B6, 0x00000000 -data8 0x3FC1646541060850 -data4 0x2335E774, 0x00000000 -data8 0x3FC25F6E171A535C -data4 0x233E36BE, 0x00000000 -data8 0x3FC359E8EDEB99A3 -data4 0x239680A3, 0x00000000 -data8 0x3FC453CEC6092A9E -data4 0x230FB29E, 0x00000000 -data8 0x3FC54D18BA11570A -data4 0x230C1418, 0x00000000 -data8 0x3FC645BFFFB3AA73 -data4 0x23F0564A, 0x00000000 -data8 0x3FC73DBDE8A7D201 -data4 0x23D4A5E1, 0x00000000 -data8 0x3FC8350BE398EBC7 -data4 0x23D4ADDA, 0x00000000 -data8 0x3FC92BA37D050271 -data4 0x23BCB085, 0x00000000 -data8 0x3FCA217E601081A5 -data4 0x23BC841D, 0x00000000 -data8 0x3FCB1696574D780B -data4 0x23CF4A8E, 0x00000000 -data8 0x3FCC0AE54D768466 -data4 0x23BECC90, 0x00000000 -data8 0x3FCCFE654E1D5395 -data4 0x2323DCD2, 0x00000000 -data8 0x3FCDF110864C9D9D -data4 0x23F53F3A, 0x00000000 -data8 0x3FCEE2E1451D980C -data4 0x23CCB11F, 0x00000000 -// -data8 0x400921FB54442D18, 0x3CA1A62633145C07 // PI two doubles -data8 0x3FF921FB54442D18, 0x3C91A62633145C07 // PI_by_2 two dbles -data8 0x3FE921FB54442D18, 0x3C81A62633145C07 // PI_by_4 two dbles -data8 0x4002D97C7F3321D2, 0x3C9A79394C9E8A0A // 3PI_by_4 two dbles -LOCAL_OBJECT_END(Constants_atan) - - -.section .text -GLOBAL_IEEE754_ENTRY(atanl) - -// Use common code with atan2l after setting x=1.0 -{ .mfi - alloc r32 = ar.pfs, 0, 17, 4, 0 - fma.s1 Ysq = ArgY_orig, ArgY_orig, f0 // Form y*y - nop.i 999 -} -{ .mfi - addl table_ptr1 = @ltoff(Constants_atan#), gp // Address of table pointer - fma.s1 Xsq = f1, f1, f0 // Form x*x - nop.i 999 -} -;; - +data4 0x84212B3D, 0x3FC068D5, 0x239874B6, 0x00000000 +data4 0x41060850, 0x3FC16465, 0x2335E774, 0x00000000 +data4 0x171A535C, 0x3FC25F6E, 0x233E36BE, 0x00000000 +data4 0xEDEB99A3, 0x3FC359E8, 0x239680A3, 0x00000000 +data4 0xC6092A9E, 0x3FC453CE, 0x230FB29E, 0x00000000 +data4 0xBA11570A, 0x3FC54D18, 0x230C1418, 0x00000000 +data4 0xFFB3AA73, 0x3FC645BF, 0x23F0564A, 0x00000000 +data4 0xE8A7D201, 0x3FC73DBD, 0x23D4A5E1, 0x00000000 +data4 0xE398EBC7, 0x3FC8350B, 0x23D4ADDA, 0x00000000 +data4 0x7D050271, 0x3FC92BA3, 0x23BCB085, 0x00000000 +data4 0x601081A5, 0x3FCA217E, 0x23BC841D, 0x00000000 +data4 0x574D780B, 0x3FCB1696, 0x23CF4A8E, 0x00000000 +data4 0x4D768466, 0x3FCC0AE5, 0x23BECC90, 0x00000000 +data4 0x4E1D5395, 0x3FCCFE65, 0x2323DCD2, 0x00000000 +data4 0x864C9D9D, 0x3FCDF110, 0x23F53F3A, 0x00000000 +data4 0x451D980C, 0x3FCEE2E1, 0x23CCB11F, 0x00000000 + +data4 0x54442D18, 0x400921FB, 0x33145C07, 0x3CA1A626 // PI two doubles +data4 0x54442D18, 0x3FF921FB, 0x33145C07, 0x3C91A626 // PI_by_2 two dbles +data4 0x54442D18, 0x3FE921FB, 0x33145C07, 0x3C81A626 // PI_by_4 two dbles +data4 0x7F3321D2, 0x4002D97C, 0x4C9E8A0A, 0x3C9A7939 // 3PI_by_4 two dbles +ASM_SIZE_DIRECTIVE(Constants_atan) + + +.text +.proc atanl# +.global atanl# +.align 64 + +atanl: +{ .mfb + nop.m 999 +(p0) mov ArgX_orig = f1 +(p0) br.cond.sptk atan2l ;; +} +.endp atanl +ASM_SIZE_DIRECTIVE(atanl) + +.text +.proc atan2l# +.global atan2l# +#ifdef _LIBC +.proc __atan2l# +.global __atan2l# +.proc __ieee754_atan2l# +.global __ieee754_atan2l# +#endif +.align 64 + + +atan2l: +#ifdef _LIBC +__atan2l: +__ieee754_atan2l: +#endif +{ .mfi +alloc r32 = ar.pfs, 0, 17 , 4, 0 +(p0) mov ArgY = ArgY_orig +} +{ .mfi + nop.m 999 +(p0) mov ArgX = ArgX_orig + nop.i 999 +};; { .mfi - ld8 table_ptr1 = [table_ptr1] // Get table pointer - fnorm.s1 ArgY = ArgY_orig - nop.i 999 + nop.m 999 +(p0) fclass.m.unc p7,p0 = ArgY_orig, 0x103 + nop.i 999 } { .mfi - nop.m 999 - fnorm.s1 ArgX = f1 - nop.i 999 -} -;; - + nop.m 999 +// +// +// Save original input args and load table ptr. +// +(p0) fclass.m.unc p6,p0 = ArgX_orig, 0x103 + nop.i 999 +};; { .mfi - getf.exp sign_X = f1 // Get signexp of x - fmerge.s ArgX_abs = f0, f1 // Form |x| - nop.i 999 +(p0) addl table_ptr1 = @ltoff(Constants_atan#), gp +(p0) fclass.m.unc p0,p9 = ArgY_orig, 0x1FF + nop.i 999 ;; } { .mfi - nop.m 999 - fnorm.s1 ArgX_orig = f1 - nop.i 999 + ld8 table_ptr1 = [table_ptr1] +(p0) fclass.m.unc p0,p8 = ArgX_orig, 0x1FF + nop.i 999 } -;; - { .mfi - getf.exp sign_Y = ArgY_orig // Get signexp of y - fmerge.s ArgY_abs = f0, ArgY_orig // Form |y| - mov table_base = table_ptr1 // Save base pointer to tables + nop.m 999 +(p0) fclass.m.unc p13,p0 = ArgY_orig, 0x0C3 + nop.i 999 ;; } -;; - { .mfi - ldfd P_hi = [table_ptr1],8 // Load double precision hi part of pi - fclass.m p8,p0 = ArgY_orig, 0x1e7 // Test y natval, nan, inf, zero - nop.i 999 +(p0) fclass.m.unc p12,p0 = ArgX_orig, 0x0C3 + nop.i 999 } -;; -{ .mfi - ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3 - nop.f 999 - nop.i 999 -} -{ .mfi - nop.m 999 - fma.s1 M = f1, f1, f0 // Set M = 1.0 - nop.i 999 -} -;; // +// Check for NatVals. // Check for everything - if false, then must be pseudo-zero // or pseudo-nan (IA unsupporteds). // -{ .mfb - nop.m 999 - fclass.m p0,p12 = f1, 0x1FF // Test x unsupported -(p8) br.cond.spnt ATANL_Y_SPECIAL // Branch if y natval, nan, inf, zero +{ .mib + nop.m 999 + nop.i 999 +(p6) br.cond.spnt L(ATANL_NATVAL) ;; } -;; -// U = max(ArgX_abs,ArgY_abs) -// V = min(ArgX_abs,ArgY_abs) -{ .mfi - nop.m 999 - fcmp.ge.s1 p6,p7 = Xsq, Ysq // Test for |x| >= |y| using squares - nop.i 999 +{ .mib + nop.m 999 + nop.i 999 +(p7) br.cond.spnt L(ATANL_NATVAL) ;; } -{ .mfb - nop.m 999 - fma.s1 V = ArgX_abs, f1, f0 // Set V assuming |x| < |y| - br.cond.sptk ATANL_COMMON // Branch to common code +{ .mib +(p0) ldfd P_hi = [table_ptr1],8 + nop.i 999 +(p8) br.cond.spnt L(ATANL_UNSUPPORTED) ;; } -;; - -GLOBAL_IEEE754_END(atanl) -GLOBAL_IEEE754_ENTRY(atan2l) - -{ .mfi - alloc r32 = ar.pfs, 0, 17, 4, 0 - fma.s1 Ysq = ArgY_orig, ArgY_orig, f0 // Form y*y - nop.i 999 +{ .mbb +(p0) add table_ptr2 = 96, table_ptr1 +(p9) br.cond.spnt L(ATANL_UNSUPPORTED) +// +// Load double precision high-order part of pi +// +(p12) br.cond.spnt L(ATANL_NAN) ;; } -{ .mfi - addl table_ptr1 = @ltoff(Constants_atan#), gp // Address of table pointer - fma.s1 Xsq = ArgX_orig, ArgX_orig, f0 // Form x*x - nop.i 999 +{ .mfb + nop.m 999 +(p0) fnorm.s1 ArgX = ArgX +(p13) br.cond.spnt L(ATANL_NAN) ;; } -;; - -{ .mfi - ld8 table_ptr1 = [table_ptr1] // Get table pointer - fnorm.s1 ArgY = ArgY_orig - nop.i 999 +// +// Normalize the input argument. +// Branch out if NaN inputs +// +{ .mmf +(p0) ldfs P_lo = [table_ptr1], 4 + nop.m 999 +(p0) fnorm.s1 ArgY = ArgY ;; } -{ .mfi - nop.m 999 - fnorm.s1 ArgX = ArgX_orig - nop.i 999 +{ .mmf + nop.m 999 +(p0) ldfs TWO_TO_NEG3 = [table_ptr1], 180 +// +// U = max(ArgX_abs,ArgY_abs) +// V = min(ArgX_abs,ArgY_abs) +// if PR1, swap = 0 +// if PR2, swap = 1 +// +(p0) mov M = f1 ;; } -;; - { .mfi - getf.exp sign_X = ArgX_orig // Get signexp of x - fmerge.s ArgX_abs = f0, ArgX_orig // Form |x| - nop.i 999 + nop.m 999 +// +// Get exp and sign of ArgX +// Get exp and sign of ArgY +// Load 2**(-3) and increment ptr to Q_4. +// +(p0) fmerge.s ArgX_abs = f1, ArgX + nop.i 999 ;; } -;; - +// +// load single precision low-order part of pi = P_lo +// { .mfi - getf.exp sign_Y = ArgY_orig // Get signexp of y - fmerge.s ArgY_abs = f0, ArgY_orig // Form |y| - mov table_base = table_ptr1 // Save base pointer to tables +(p0) getf.exp sign_X = ArgX +(p0) fmerge.s ArgY_abs = f1, ArgY + nop.i 999 ;; } -;; - -{ .mfi - ldfd P_hi = [table_ptr1],8 // Load double precision hi part of pi - fclass.m p8,p0 = ArgY_orig, 0x1e7 // Test y natval, nan, inf, zero - nop.i 999 +{ .mii +(p0) getf.exp sign_Y = ArgY + nop.i 999 ;; +(p0) shr sign_X = sign_X, 17 ;; } -;; - -{ .mfi - ldfps P_lo, TWO_TO_NEG3 = [table_ptr1], 8 // Load P_lo and constant 2^-3 - fclass.m p9,p0 = ArgX_orig, 0x1e7 // Test x natval, nan, inf, zero - nop.i 999 +{ .mii + nop.m 999 +(p0) shr sign_Y = sign_Y, 17 ;; +(p0) cmp.eq.unc p8, p9 = 0x00000, sign_Y ;; } { .mfi - nop.m 999 - fma.s1 M = f1, f1, f0 // Set M = 1.0 - nop.i 999 -} -;; - + nop.m 999 // -// Check for everything - if false, then must be pseudo-zero -// or pseudo-nan (IA unsupporteds). +// Is ArgX_abs >= ArgY_abs +// Is sign_Y == 0? // -{ .mfb - nop.m 999 - fclass.m p0,p12 = ArgX_orig, 0x1FF // Test x unsupported -(p8) br.cond.spnt ATANL_Y_SPECIAL // Branch if y natval, nan, inf, zero -} -;; - -// U = max(ArgX_abs,ArgY_abs) -// V = min(ArgX_abs,ArgY_abs) -{ .mfi - nop.m 999 - fcmp.ge.s1 p6,p7 = Xsq, Ysq // Test for |x| >= |y| using squares - nop.i 999 +(p0) fmax.s1 U = ArgX_abs, ArgY_abs + nop.i 999 } -{ .mfb - nop.m 999 - fma.s1 V = ArgX_abs, f1, f0 // Set V assuming |x| < |y| -(p9) br.cond.spnt ATANL_X_SPECIAL // Branch if x natval, nan, inf, zero -} -;; - -// Now common code for atanl and atan2l -ATANL_COMMON: { .mfi - nop.m 999 - fclass.m p0,p13 = ArgY_orig, 0x1FF // Test y unsupported - shr sign_X = sign_X, 17 // Get sign bit of x + nop.m 999 +// +// ArgX_abs = |ArgX| +// ArgY_abs = |ArgY| +// sign_X is sign bit of ArgX +// sign_Y is sign bit of ArgY +// +(p0) fcmp.ge.s1 p6, p7 = ArgX_abs, ArgY_abs + nop.i 999 ;; } { .mfi - nop.m 999 - fma.s1 U = ArgY_abs, f1, f0 // Set U assuming |x| < |y| - adds table_ptr1 = 176, table_ptr1 // Point to Q4 + nop.m 999 +(p0) fmin.s1 V = ArgX_abs, ArgY_abs + nop.i 999 ;; } -;; - { .mfi -(p6) add swap = r0, r0 // Set swap=0 if |x| >= |y| -(p6) frcpa.s1 E, p0 = ArgY_abs, ArgX_abs // Compute E if |x| >= |y| - shr sign_Y = sign_Y, 17 // Get sign bit of y + nop.m 999 +(p8) fadd.s1 s_Y = f0, f1 +(p6) cmp.eq.unc p10, p11 = 0x00000, sign_X } -{ .mfb - nop.m 999 -(p6) fma.s1 V = ArgY_abs, f1, f0 // Set V if |x| >= |y| -(p12) br.cond.spnt ATANL_UNSUPPORTED // Branch if x unsupported +{ .mii +(p6) add swap = r0, r0 + nop.i 999 ;; +(p7) add swap = 1, r0 } -;; - -// Set p8 if y >=0 -// Set p9 if y < 0 -// Set p10 if |x| >= |y| and x >=0 -// Set p11 if |x| >= |y| and x < 0 { .mfi - cmp.eq p8, p9 = 0, sign_Y // Test for y >= 0 -(p7) frcpa.s1 E, p0 = ArgX_abs, ArgY_abs // Compute E if |x| < |y| -(p7) add swap = 1, r0 // Set swap=1 if |x| < |y| -} -{ .mfb -(p6) cmp.eq.unc p10, p11 = 0, sign_X // If |x| >= |y|, test for x >= 0 -(p6) fma.s1 U = ArgX_abs, f1, f0 // Set U if |x| >= |y| -(p13) br.cond.spnt ATANL_UNSUPPORTED // Branch if y unsupported -} -;; - + nop.m 999 // +// Let M = 1.0 // if p8, s_Y = 1.0 // if p9, s_Y = -1.0 // -.pred.rel "mutex",p8,p9 +(p10) fsub.s1 M = M, f1 + nop.i 999 ;; +} { .mfi - nop.m 999 -(p8) fadd.s1 s_Y = f0, f1 // If y >= 0 set s_Y = 1.0 - nop.i 999 + nop.m 999 +(p9) fsub.s1 s_Y = f0, f1 + nop.i 999 ;; } { .mfi - nop.m 999 -(p9) fsub.s1 s_Y = f0, f1 // If y < 0 set s_Y = -1.0 - nop.i 999 + nop.m 999 +(p0) frcpa.s1 E, p6 = V, U + nop.i 999 ;; } -;; - -.pred.rel "mutex",p10,p11 +{ .mbb + nop.m 999 +// +// E = frcpa(V,U) +// +(p6) br.cond.sptk L(ATANL_STEP2) +(p0) br.cond.spnt L(ATANL_SPECIAL_HANDLING) ;; +} +L(ATANL_STEP2): { .mfi - nop.m 999 -(p10) fsub.s1 M = M, f1 // If |x| >= |y| and x >=0, set M=0 - nop.i 999 + nop.m 999 +(p0) fmpy.s1 Q = E, V + nop.i 999 } { .mfi - nop.m 999 -(p11) fadd.s1 M = M, f1 // If |x| >= |y| and x < 0, set M=2.0 - nop.i 999 + nop.m 999 +(p0) fcmp.eq.s0 p0, p9 = f1, ArgY_orig + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig // Dummy to set denormal flag - nop.i 999 + nop.m 999 +// +// Is Q < 2**(-3)? +// +(p0) fcmp.eq.s0 p0, p8 = f1, ArgX_orig + nop.i 999 +} +{ .mfi + nop.m 999 +(p11) fadd.s1 M = M, f1 + nop.i 999 ;; } +{ .mlx + nop.m 999 // ************************************************* // ********************* STEP2 ********************* // ************************************************* +(p0) movl special = 0x8400000000000000 +} +{ .mlx + nop.m 999 // -// Q = E * V +// lookup = b_1 b_2 b_3 B_4 // -{ .mfi - nop.m 999 - fmpy.s1 Q = E, V - nop.i 999 +(p0) movl special1 = 0x0000000000000100 ;; } -;; - { .mfi - nop.m 999 - fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (1) if POLY path - nop.i 999 + nop.m 999 +// +// Do fnorms to raise any denormal operand +// exceptions. +// +(p0) fmpy.s1 P_hi = M, P_hi + nop.i 999 } -;; - -// Create a single precision representation of the signexp of Q with the -// 4 most significant bits of the significand followed by a 1 and then 18 0's { .mfi - nop.m 999 - fmpy.s1 P_hi = M, P_hi - dep.z special = 0x1, 18, 1 // Form 0x0000000000040000 + nop.m 999 +(p0) fmpy.s1 P_lo = M, P_lo + nop.i 999 ;; } { .mfi - nop.m 999 - fmpy.s1 P_lo = M, P_lo - add table_ptr2 = 32, table_ptr1 + nop.m 999 +// +// Q = E * V +// +(p0) fcmp.lt.unc.s1 p6, p7 = Q, TWO_TO_NEG3 + nop.i 999 ;; } -;; - -{ .mfi - nop.m 999 - fma.s1 A_temp = Q, f1, f0 // Set A_temp if POLY path - nop.i 999 +{ .mmb +(p0) getf.sig significand_Q = Q +(p0) getf.exp exponent_Q = Q + nop.b 999 ;; } -{ .mfi - nop.m 999 - fma.s1 E = E, E_hold, E // E = E + E*E_hold (1) if POLY path - nop.i 999 +{ .mmi + nop.m 999 ;; +(p0) andcm k = 0x0003, exponent_Q +(p0) extr.u lookup = significand_Q, 59, 4 ;; } -;; - +{ .mib + nop.m 999 +(p0) dep special = lookup, special, 59, 4 // -// Is Q < 2**(-3)? -// swap = xor(swap,sign_X) +// Generate 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0 // -{ .mfi - nop.m 999 - fcmp.lt.s1 p9, p0 = Q, TWO_TO_NEG3 // Test Q < 2^-3 - xor swap = sign_X, swap +(p6) br.cond.spnt L(ATANL_POLY) ;; } -;; - -// P_hi = s_Y * P_hi -{ .mmf - getf.exp exponent_Q = Q // Get signexp of Q - cmp.eq.unc p7, p6 = 0x00000, swap - fmpy.s1 P_hi = s_Y, P_hi +{ .mfi +(p0) cmp.eq.unc p8, p9 = 0x0000, k +(p0) fmpy.s1 P_hi = s_Y, P_hi +// +// We waited a few extra cycles so P_lo and P_hi could be calculated. +// Load the constant 256 for loading up table entries. +// +// ************************************************* +// ******************** STEP3 ********************** +// ************************************************* +(p0) add table_ptr2 = 16, table_ptr1 } -;; - // -// if (PR_1) sigma = -1.0 -// if (PR_2) sigma = 1.0 +// Let z_hi have exponent and sign of original Q +// Load the Tbl_hi(0) else, increment pointer. // -{ .mfi - getf.sig significand_Q = Q // Get significand of Q -(p6) fsub.s1 sigma = f0, f1 - nop.i 999 +{ .mii +(p0) ldfe Q_4 = [table_ptr1], -16 +(p0) xor swap = sign_X, swap ;; +(p9) sub k = k, r0, 1 } -{ .mfb -(p9) add table_ptr1 = 128, table_base // Point to P8 if POLY path -(p7) fadd.s1 sigma = f0, f1 -(p9) br.cond.spnt ATANL_POLY // Branch to POLY if 0 < Q < 2^-3 +{ .mmi +(p0) setf.sig z_hi = special +(p0) ldfe Q_3 = [table_ptr1], -16 +(p9) add table_ptr2 = 16, table_ptr2 ;; } -;; - // -// ************************************************* -// ******************** STEP3 ********************** -// ************************************************* +// U_hold = U - U_prime_hi +// k = k * 256 - Result can be 0, 256, or 512. // -// lookup = b_1 b_2 b_3 B_4 +{ .mmb +(p0) ldfe Q_2 = [table_ptr1], -16 +(p8) ldfd Tbl_hi = [table_ptr2], 8 + nop.b 999 ;; +} +// +// U_prime_lo = U_hold + V * z_hi +// lookup -> lookup * 16 + k // { .mmi - nop.m 999 - nop.m 999 - andcm k = 0x0003, exponent_Q // k=0,1,2,3 for exp_Q=0,-1,-2,-3 +(p0) ldfe Q_1 = [table_ptr1], -16 ;; +(p8) ldfs Tbl_lo = [table_ptr2], 8 +// +// U_prime_hi = U + V * z_hi +// Load the Tbl_lo(0) +// +(p9) pmpy2.r k = k, special1 ;; } -;; - +{ .mii + nop.m 999 + nop.i 999 + nop.i 999 ;; +} +{ .mii + nop.m 999 + nop.i 999 + nop.i 999 ;; +} +{ .mii + nop.m 999 + nop.i 999 + nop.i 999 ;; +} +{ .mii + nop.m 999 + nop.i 999 ;; +(p9) shladd lookup = lookup, 0x0004, k ;; +} +{ .mmi +(p9) add table_ptr2 = table_ptr2, lookup ;; +// +// V_prime = V - U * z_hi +// +(p9) ldfd Tbl_hi = [table_ptr2], 8 + nop.i 999 ;; +} +{ .mmf + nop.m 999 +// +// C_hi = frcpa(1,U_prime_hi) // -// Generate sign_exp_Q b_1 b_2 b_3 b_4 1 0 0 0 ... 0 in single precision -// representation. Note sign of Q is always 0. +(p9) ldfs Tbl_lo = [table_ptr2], 8 +// +// z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0 +// Point to beginning of Tbl_hi entries - k = 0. // +(p0) fmerge.se z_hi = Q, z_hi ;; +} { .mfi - cmp.eq p8, p9 = 0x0000, k // Test k=0 - nop.f 999 - extr.u lookup = significand_Q, 59, 4 // Extract b_1 b_2 b_3 b_4 for index + nop.m 999 +(p0) fma.s1 U_prime_hi = V, z_hi, U + nop.i 999 } { .mfi - sub sp_exp_Q = 0x7f, k // Form single prec biased exp of Q - nop.f 999 - sub k = k, r0, 1 // Decrement k + nop.m 999 +(p0) fnma.s1 V_prime = U, z_hi, V + nop.i 999 ;; } -;; - -// Form pointer to B index table { .mfi - ldfe Q_4 = [table_ptr1], -16 // Load Q_4 - nop.f 999 -(p9) shl k = k, 8 // k = 0, 256, or 512 + nop.m 999 +(p0) mov A_hi = Tbl_hi + nop.i 999 ;; } { .mfi -(p9) shladd table_ptr2 = lookup, 4, table_ptr2 - nop.f 999 - shladd sp_exp_4sig_Q = sp_exp_Q, 4, lookup // Shift and add in 4 high bits + nop.m 999 +(p0) fsub.s1 U_hold = U, U_prime_hi + nop.i 999 ;; } -;; - -{ .mmi -(p8) add table_ptr2 = -16, table_ptr2 // Pointer if original k was 0 -(p9) add table_ptr2 = k, table_ptr2 // Pointer if k was 1, 2, 3 - dep special = sp_exp_4sig_Q, special, 19, 13 // Form z_hi as single prec +{ .mfi + nop.m 999 +(p0) frcpa.s1 C_hi, p6 = f1, U_prime_hi + nop.i 999 ;; } -;; - -// z_hi = s exp 1.b_1 b_2 b_3 b_4 1 0 0 0 ... 0 -{ .mmi - ldfd Tbl_hi = [table_ptr2], 8 // Load Tbl_hi from index table -;; - setf.s z_hi = special // Form z_hi - nop.i 999 +{ .mfi +(p0) cmp.eq.unc p7, p6 = 0x00000, swap +(p0) fmpy.s1 A_hi = s_Y, A_hi + nop.i 999 ;; } -{ .mmi - ldfs Tbl_lo = [table_ptr2], 8 // Load Tbl_lo from index table -;; - ldfe Q_3 = [table_ptr1], -16 // Load Q_3 - nop.i 999 -} -;; - -{ .mmi - ldfe Q_2 = [table_ptr1], -16 // Load Q_2 - nop.m 999 - nop.i 999 -} -;; - -{ .mmf - ldfe Q_1 = [table_ptr1], -16 // Load Q_1 - nop.m 999 - nop.f 999 -} -;; - { .mfi - nop.m 999 - fma.s1 U_prime_hi = V, z_hi, U // U_prime_hi = U + V * z_hi - nop.i 999 + nop.m 999 +// +// poly = wsq * poly +// +(p7) fadd.s1 sigma = f0, f1 + nop.i 999 ;; } { .mfi - nop.m 999 - fnma.s1 V_prime = U, z_hi, V // V_prime = V - U * z_hi - nop.i 999 + nop.m 999 +(p0) fma.s1 U_prime_lo = z_hi, V, U_hold + nop.i 999 } -;; - { .mfi - nop.m 999 - mov A_hi = Tbl_hi // Start with A_hi = Tbl_hi - nop.i 999 + nop.m 999 +(p6) fsub.s1 sigma = f0, f1 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fsub.s1 U_hold = U, U_prime_hi // U_hold = U - U_prime_hi - nop.i 999 + nop.m 999 +(p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - frcpa.s1 C_hi, p0 = f1, U_prime_hi // C_hi = frcpa(1,U_prime_hi) - nop.i 999 + nop.m 999 +// +// A_lo = A_lo + w_hi +// A_hi = s_Y * A_hi +// +(p0) fma.s1 Res_hi = sigma, A_hi, P_hi + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fmpy.s1 A_hi = s_Y, A_hi // A_hi = s_Y * A_hi - nop.i 999 + nop.m 999 +// +// C_hi_hold = 1 - C_hi * U_prime_hi (1) +// +(p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fma.s1 U_prime_lo = z_hi, V, U_hold // U_prime_lo = U_hold + V * z_hi - nop.i 999 + nop.m 999 +// +// C_hi = C_hi + C_hi * C_hi_hold (1) +// +(p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 + nop.i 999 ;; } -;; - -// C_hi_hold = 1 - C_hi * U_prime_hi (1) { .mfi - nop.m 999 - fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 - nop.i 999 + nop.m 999 +// +// C_hi_hold = 1 - C_hi * U_prime_hi (2) +// +(p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fma.s1 Res_hi = sigma, A_hi, P_hi // Res_hi = P_hi + sigma * A_hi - nop.i 999 + nop.m 999 +// +// C_hi = C_hi + C_hi * C_hi_hold (2) +// +(p0) fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (1) - nop.i 999 + nop.m 999 +// +// C_hi_hold = 1 - C_hi * U_prime_hi (3) +// +(p0) fma.s1 C_hi = C_hi_hold, C_hi, C_hi + nop.i 999 ;; } -;; - -// C_hi_hold = 1 - C_hi * U_prime_hi (2) { .mfi - nop.m 999 - fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 - nop.i 999 + nop.m 999 +// +// C_hi = C_hi + C_hi * C_hi_hold (3) +// +(p0) fmpy.s1 w_hi = V_prime, C_hi + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (2) - nop.i 999 + nop.m 999 +// +// w_hi = V_prime * C_hi +// +(p0) fmpy.s1 wsq = w_hi, w_hi + nop.i 999 } -;; - -// C_hi_hold = 1 - C_hi * U_prime_hi (3) { .mfi - nop.m 999 - fnma.s1 C_hi_hold = C_hi, U_prime_hi, f1 - nop.i 999 + nop.m 999 +(p0) fnma.s1 w_lo = w_hi, U_prime_hi, V_prime + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fma.s1 C_hi = C_hi_hold, C_hi, C_hi // C_hi = C_hi + C_hi * C_hi_hold (3) - nop.i 999 + nop.m 999 +// +// wsq = w_hi * w_hi +// w_lo = = V_prime - w_hi * U_prime_hi +// +(p0) fma.s1 poly = wsq, Q_4, Q_3 + nop.i 999 } -;; - { .mfi - nop.m 999 - fmpy.s1 w_hi = V_prime, C_hi // w_hi = V_prime * C_hi - nop.i 999 + nop.m 999 +(p0) fnma.s1 w_lo = w_hi, U_prime_lo, w_lo + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fmpy.s1 wsq = w_hi, w_hi // wsq = w_hi * w_hi - nop.i 999 + nop.m 999 +// +// poly = Q_3 + wsq * Q_4 +// w_lo = = w_lo - w_hi * U_prime_lo +// +(p0) fma.s1 poly = wsq, poly, Q_2 + nop.i 999 } { .mfi - nop.m 999 - fnma.s1 w_lo = w_hi, U_prime_hi, V_prime // w_lo = V_prime-w_hi*U_prime_hi - nop.i 999 + nop.m 999 +(p0) fmpy.s1 w_lo = C_hi, w_lo + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fma.s1 poly = wsq, Q_4, Q_3 // poly = Q_3 + wsq * Q_4 - nop.i 999 + nop.m 999 +// +// poly = Q_2 + wsq * poly +// w_lo = = w_lo * C_hi +// +(p0) fma.s1 poly = wsq, poly, Q_1 + nop.i 999 } { .mfi - nop.m 999 - fnma.s1 w_lo = w_hi, U_prime_lo, w_lo // w_lo = w_lo - w_hi * U_prime_lo - nop.i 999 + nop.m 999 +(p0) fadd.s1 A_lo = Tbl_lo, w_lo + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fma.s1 poly = wsq, poly, Q_2 // poly = Q_2 + wsq * poly - nop.i 999 + nop.m 999 +// +// Result = Res_hi + Res_lo * s_Y (User Supplied Rounding Mode) +// +(p0) fmpy.s0 Q_1 = Q_1, Q_1 + nop.i 999 ;; } { .mfi - nop.m 999 - fmpy.s1 w_lo = C_hi, w_lo // w_lo = = w_lo * C_hi - nop.i 999 + nop.m 999 +// +// poly = Q_1 + wsq * poly +// A_lo = Tbl_lo + w_lo +// swap = xor(swap,sign_X) +// +(p0) fmpy.s1 poly = wsq, poly + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fma.s1 poly = wsq, poly, Q_1 // poly = Q_1 + wsq * poly - nop.i 999 + nop.m 999 +// +// Is (swap) != 0 ? +// poly = wsq * poly +// A_hi = Tbl_hi +// +(p0) fmpy.s1 poly = w_hi, poly + nop.i 999 ;; } { .mfi - nop.m 999 - fadd.s1 A_lo = Tbl_lo, w_lo // A_lo = Tbl_lo + w_lo - nop.i 999 + nop.m 999 +// +// if (PR_1) sigma = -1.0 +// if (PR_2) sigma = 1.0 +// +(p0) fadd.s1 A_lo = A_lo, poly + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fmpy.s0 Q_1 = Q_1, Q_1 // Dummy operation to raise inexact - nop.i 999 + nop.m 999 +// +// P_hi = s_Y * P_hi +// A_lo = A_lo + poly +// +(p0) fadd.s1 A_lo = A_lo, w_hi + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fmpy.s1 poly = wsq, poly // poly = wsq * poly - nop.i 999 + nop.m 999 +(p0) fma.s1 Res_lo = sigma, A_lo, P_lo + nop.i 999 ;; } -;; - -{ .mfi - nop.m 999 - fmpy.s1 poly = w_hi, poly // poly = w_hi * poly - nop.i 999 +{ .mfb + nop.m 999 +// +// Res_hi = P_hi + sigma * A_hi +// Res_lo = P_lo + sigma * A_lo +// +(p0) fma.s0 Result = Res_lo, s_Y, Res_hi +// +// Raise inexact. +// +br.ret.sptk b0 ;; +} +// +// poly1 = P_5 + zsq * poly1 +// poly2 = zsq * poly2 +// +L(ATANL_POLY): +{ .mmf +(p0) xor swap = sign_X, swap + nop.m 999 +(p0) fnma.s1 E_hold = E, U, f1 ;; } -;; - { .mfi - nop.m 999 - fadd.s1 A_lo = A_lo, poly // A_lo = A_lo + poly - nop.i 999 + nop.m 999 +(p0) mov A_temp = Q +// +// poly1 = P_4 + zsq * poly1 +// swap = xor(swap,sign_X) +// +// sign_X gr_002 +// swap gr_004 +// poly1 = poly1 <== Done with poly1 +// poly1 = P_4 + zsq * poly1 +// swap = xor(swap,sign_X) +// +(p0) cmp.eq.unc p7, p6 = 0x00000, swap } -;; - { .mfi - nop.m 999 - fadd.s1 A_lo = A_lo, w_hi // A_lo = A_lo + w_hi - nop.i 999 + nop.m 999 +(p0) fmpy.s1 P_hi = s_Y, P_hi + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fma.s1 Res_lo = sigma, A_lo, P_lo // Res_lo = P_lo + sigma * A_lo - nop.i 999 + nop.m 999 +(p6) fsub.s1 sigma = f0, f1 + nop.i 999 } -;; - -// -// Result = Res_hi + Res_lo * s_Y (User Supplied Rounding Mode) -// -{ .mfb - nop.m 999 - fma.s0 Result = Res_lo, s_Y, Res_hi - br.ret.sptk b0 // Exit table path 2^-3 <= V/U < 1 +{ .mfi + nop.m 999 +(p7) fadd.s1 sigma = f0, f1 + nop.i 999 ;; } -;; - -ATANL_POLY: -// Here if 0 < V/U < 2^-3 -// // *********************************************** // ******************** STEP4 ******************** // *********************************************** -// -// Following: -// Iterate 3 times E = E + E*(1.0 - E*U) -// Also load P_8, P_7, P_6, P_5, P_4 -// -{ .mfi - ldfe P_8 = [table_ptr1], -16 // Load P_8 - fnma.s1 z_lo = A_temp, U, V // z_lo = V - A_temp * U - nop.i 999 -} -{ .mfi +{ .mmi nop.m 999 - fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (2) +(p0) addl table_ptr1 = @ltoff(Constants_atan#), gp nop.i 999 } ;; { .mmi - ldfe P_7 = [table_ptr1], -16 // Load P_7 -;; - ldfe P_6 = [table_ptr1], -16 // Load P_6 + ld8 table_ptr1 = [table_ptr1] + nop.m 999 nop.i 999 } ;; + { .mfi - ldfe P_5 = [table_ptr1], -16 // Load P_5 - fma.s1 E = E, E_hold, E // E = E + E_hold*E (2) - nop.i 999 + nop.m 999 +(p0) fma.s1 E = E, E_hold, E +// +// Following: +// Iterate 3 times E = E + E*(1.0 - E*U) +// Also load P_8, P_7, P_6, P_5, P_4 +// E_hold = 1.0 - E * U (1) +// A_temp = Q +// +(p0) add table_ptr1 = 128, table_ptr1 ;; } -;; - -{ .mmi - ldfe P_4 = [table_ptr1], -16 // Load P_4 -;; - ldfe P_3 = [table_ptr1], -16 // Load P_3 - nop.i 999 +{ .mmf + nop.m 999 +// +// E = E + E_hold*E (1) +// Point to P_8. +// +(p0) ldfe P_8 = [table_ptr1], -16 +// +// poly = z8*poly1 + poly2 (Typo in writeup) +// Is (swap) != 0 ? +// +(p0) fnma.s1 z_lo = A_temp, U, V ;; } -;; - -{ .mfi - ldfe P_2 = [table_ptr1], -16 // Load P_2 - fnma.s1 E_hold = E, U, f1 // E_hold = 1.0 - E*U (3) - nop.i 999 +{ .mmb + nop.m 999 +// +// E_hold = 1.0 - E * U (2) +// +(p0) ldfe P_7 = [table_ptr1], -16 + nop.b 999 ;; } -{ .mlx - nop.m 999 - movl int_temp = 0x24005 // Signexp for small neg number +{ .mmb + nop.m 999 +// +// E = E + E_hold*E (2) +// +(p0) ldfe P_6 = [table_ptr1], -16 + nop.b 999 ;; } -;; - -{ .mmf - ldfe P_1 = [table_ptr1], -16 // Load P_1 - setf.exp tmp_small = int_temp // Form small neg number - fma.s1 E = E, E_hold, E // E = E + E_hold*E (3) +{ .mmb + nop.m 999 +// +// E_hold = 1.0 - E * U (3) +// +(p0) ldfe P_5 = [table_ptr1], -16 + nop.b 999 ;; } -;; - +{ .mmf + nop.m 999 +// +// E = E + E_hold*E (3) // // // At this point E approximates 1/U to roughly working precision -// Z = V*E approximates V/U +// z = V*E approximates V/U // -{ .mfi - nop.m 999 - fmpy.s1 Z = V, E // Z = V * E - nop.i 999 +(p0) ldfe P_4 = [table_ptr1], -16 +(p0) fnma.s1 E_hold = E, U, f1 ;; } -{ .mfi - nop.m 999 - fmpy.s1 z_lo = z_lo, E // z_lo = z_lo * E - nop.i 999 +{ .mmb + nop.m 999 +// +// Z = V * E +// +(p0) ldfe P_3 = [table_ptr1], -16 + nop.b 999 ;; } -;; - +{ .mmb + nop.m 999 // -// Now what we want to do is -// poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8))) -// poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3)) +// zsq = Z * Z // +(p0) ldfe P_2 = [table_ptr1], -16 + nop.b 999 ;; +} +{ .mmb + nop.m 999 // -// Fixup added to force inexact later - -// A_hi = A_temp + z_lo -// z_lo = (A_temp - A_hi) + z_lo +// z8 = zsq * zsq // -{ .mfi - nop.m 999 - fmpy.s1 zsq = Z, Z // zsq = Z * Z - nop.i 999 +(p0) ldfe P_1 = [table_ptr1], -16 + nop.b 999 ;; } -{ .mfi - nop.m 999 - fadd.s1 A_hi = A_temp, z_lo // A_hi = A_temp + z_lo - nop.i 999 +{ .mlx + nop.m 999 +(p0) movl int_temp = 0x24005 } -;; - { .mfi - nop.m 999 - fma.s1 poly1 = zsq, P_8, P_7 // poly1 = P_7 + zsq * P_8 - nop.i 999 + nop.m 999 +(p0) fma.s1 E = E, E_hold, E + nop.i 999 ;; } { .mfi - nop.m 999 - fma.s1 poly2 = zsq, P_3, P_2 // poly2 = P_2 + zsq * P_3 - nop.i 999 + nop.m 999 +(p0) fnma.s1 E_hold = E, U, f1 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fmpy.s1 z4 = zsq, zsq // z4 = zsq * zsq - nop.i 999 + nop.m 999 +(p0) fma.s1 E = E, E_hold, E + nop.i 999 ;; } { .mfi - nop.m 999 - fsub.s1 A_temp = A_temp, A_hi // A_temp = A_temp - A_hi - nop.i 999 + nop.m 999 +(p0) fmpy.s1 Z = V, E + nop.i 999 } -;; - { .mfi - nop.m 999 - fmerge.s tmp = A_hi, A_hi // Copy tmp = A_hi - nop.i 999 + nop.m 999 +// +// z_lo = V - A_temp * U +// if (PR_2) sigma = 1.0 +// +(p0) fmpy.s1 z_lo = z_lo, E + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fma.s1 poly1 = zsq, poly1, P_6 // poly1 = P_6 + zsq * poly1 - nop.i 999 + nop.m 999 +(p0) fmpy.s1 zsq = Z, Z + nop.i 999 } { .mfi - nop.m 999 - fma.s1 poly2 = zsq, poly2, P_1 // poly2 = P_2 + zsq * poly2 - nop.i 999 + nop.m 999 +// +// z_lo = z_lo * E +// if (PR_1) sigma = -1.0 +// +(p0) fadd.s1 A_hi = A_temp, z_lo + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fmpy.s1 z8 = z4, z4 // z8 = z4 * z4 - nop.i 999 + nop.m 999 +// +// z8 = z8 * z8 +// +// +// Now what we want to do is +// poly1 = P_4 + zsq*(P_5 + zsq*(P_6 + zsq*(P_7 + zsq*P_8))) +// poly2 = zsq*(P_1 + zsq*(P_2 + zsq*P_3)) +// +(p0) fma.s1 poly1 = zsq, P_8, P_7 + nop.i 999 } { .mfi - nop.m 999 - fadd.s1 z_lo = A_temp, z_lo // z_lo = (A_temp - A_hi) + z_lo - nop.i 999 + nop.m 999 +(p0) fma.s1 poly2 = zsq, P_3, P_2 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fma.s1 poly1 = zsq, poly1, P_5 // poly1 = P_5 + zsq * poly1 - nop.i 999 + nop.m 999 +(p0) fmpy.s1 z8 = zsq, zsq + nop.i 999 } { .mfi - nop.m 999 - fmpy.s1 poly2 = poly2, zsq // poly2 = zsq * poly2 - nop.i 999 + nop.m 999 +(p0) fsub.s1 A_temp = A_temp, A_hi + nop.i 999 ;; } -;; - -// Create small GR double in case need to raise underflow { .mfi - nop.m 999 - fma.s1 poly1 = zsq, poly1, P_4 // poly1 = P_4 + zsq * poly1 - dep GR_temp = -1,r0,0,53 + nop.m 999 +// +// A_lo = Z * poly + z_lo +// +(p0) fmerge.s tmp = A_hi, A_hi + nop.i 999 ;; } -;; - -// Create small double in case need to raise underflow { .mfi - setf.d FR_temp = GR_temp - fma.s1 poly = z8, poly1, poly2 // poly = poly2 + z8 * poly1 - nop.i 999 + nop.m 999 +// +// poly1 = P_7 + zsq * P_8 +// poly2 = P_2 + zsq * P_3 +// +(p0) fma.s1 poly1 = zsq, poly1, P_6 + nop.i 999 } -;; - { .mfi - nop.m 999 - fma.s1 A_lo = Z, poly, z_lo // A_lo = z_lo + Z * poly - nop.i 999 + nop.m 999 +(p0) fma.s1 poly2 = zsq, poly2, P_1 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fadd.s1 A_hi = tmp, A_lo // A_hi = tmp + A_lo - nop.i 999 + nop.m 999 +(p0) fmpy.s1 z8 = z8, z8 + nop.i 999 } -;; - { .mfi - nop.m 999 - fsub.s1 tmp = tmp, A_hi // tmp = tmp - A_hi - nop.i 999 + nop.m 999 +(p0) fadd.s1 z_lo = A_temp, z_lo + nop.i 999 ;; } { .mfi - nop.m 999 - fmpy.s1 A_hi = s_Y, A_hi // A_hi = s_Y * A_hi - nop.i 999 + nop.m 999 +// +// poly1 = P_6 + zsq * poly1 +// poly2 = P_2 + zsq * poly2 +// +(p0) fma.s1 poly1 = zsq, poly1, P_5 + nop.i 999 } -;; - { .mfi - nop.m 999 - fadd.s1 A_lo = tmp, A_lo // A_lo = tmp + A_lo - nop.i 999 + nop.m 999 +(p0) fmpy.s1 poly2 = poly2, zsq + nop.i 999 ;; } { .mfi - nop.m 999 - fma.s1 Res_hi = sigma, A_hi, P_hi // Res_hi = P_hi + sigma * A_hi - nop.i 999 + nop.m 999 +// +// Result = Res_hi + Res_lo (User Supplied Rounding Mode) +// +(p0) fmpy.s1 P_5 = P_5, P_5 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p0) fma.s1 poly1 = zsq, poly1, P_4 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fsub.s1 tmp = P_hi, Res_hi // tmp = P_hi - Res_hi - nop.i 999 + nop.m 999 +(p0) fma.s1 poly = z8, poly1, poly2 + nop.i 999 ;; } -;; - +{ .mfi + nop.m 999 // -// Test if A_lo is zero +// Fixup added to force inexact later - +// A_hi = A_temp + z_lo +// z_lo = (A_temp - A_hi) + z_lo // -{ .mfi - nop.m 999 - fclass.m p6,p0 = A_lo, 0x007 // Test A_lo = 0 - nop.i 999 +(p0) fma.s1 A_lo = Z, poly, z_lo + nop.i 999 ;; } -;; - { .mfi - nop.m 999 -(p6) mov A_lo = tmp_small // If A_lo zero, make very small - nop.i 999 + nop.m 999 +(p0) fadd.s1 A_hi = tmp, A_lo + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fma.s1 tmp = A_hi, sigma, tmp // tmp = sigma * A_hi + tmp - nop.i 999 + nop.m 999 +(p0) fsub.s1 tmp = tmp, A_hi + nop.i 999 } { .mfi - nop.m 999 - fma.s1 sigma = A_lo, sigma, P_lo // sigma = A_lo * sigma + P_lo - nop.i 999 + nop.m 999 +(p0) fmpy.s1 A_hi = s_Y, A_hi + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fma.s1 Res_lo = s_Y, sigma, tmp // Res_lo = s_Y * sigma + tmp - nop.i 999 + nop.m 999 +(p0) fadd.s1 A_lo = tmp, A_lo + nop.i 999 } -;; - +{ .mfi +(p0) setf.exp tmp = int_temp // -// Test if Res_lo is denormal +// P_hi = s_Y * P_hi +// A_hi = s_Y * A_hi // +(p0) fma.s1 Res_hi = sigma, A_hi, P_hi + nop.i 999 ;; +} { .mfi - nop.m 999 - fclass.m p14, p15 = Res_lo, 0x0b - nop.i 999 + nop.m 999 +(p0) fclass.m.unc p6,p0 = A_lo, 0x007 + nop.i 999 ;; } -;; - +{ .mfi + nop.m 999 +(p6) mov A_lo = tmp + nop.i 999 +} +{ .mfi + nop.m 999 // -// Compute Result = Res_lo + Res_hi. Use s3 if Res_lo is denormal. +// Res_hi = P_hi + sigma * A_hi // -{ .mfi - nop.m 999 -(p14) fadd.s3 Result = Res_lo, Res_hi // Result for Res_lo denormal - nop.i 999 +(p0) fsub.s1 tmp = P_hi, Res_hi + nop.i 999 ;; } { .mfi - nop.m 999 -(p15) fadd.s0 Result = Res_lo, Res_hi // Result for Res_lo normal - nop.i 999 + nop.m 999 +// +// tmp = P_hi - Res_hi +// +(p0) fma.s1 tmp = A_hi, sigma, tmp + nop.i 999 } -;; - -// -// If Res_lo is denormal test if Result equals zero -// { .mfi - nop.m 999 -(p14) fclass.m.unc p14, p0 = Result, 0x07 - nop.i 999 + nop.m 999 +(p0) fma.s1 sigma = A_lo, sigma, P_lo + nop.i 999 ;; } -;; - +{ .mfi + nop.m 999 // -// If Res_lo is denormal and Result equals zero, raise inexact, underflow -// by squaring small double +// tmp = sigma * A_hi + tmp +// sigma = A_lo * sigma + P_lo // +(p0) fma.s1 Res_lo = s_Y, sigma, tmp + nop.i 999 ;; +} { .mfb - nop.m 999 -(p14) fmpy.d.s0 FR_temp = FR_temp, FR_temp - br.ret.sptk b0 // Exit POLY path, 0 < Q < 2^-3 + nop.m 999 +// +// Res_lo = s_Y * sigma + tmp +// +(p0) fadd.s0 Result = Res_lo, Res_hi +br.ret.sptk b0 ;; } -;; - - -ATANL_UNSUPPORTED: +L(ATANL_NATVAL): +L(ATANL_UNSUPPORTED): +L(ATANL_NAN): { .mfb - nop.m 999 - fmpy.s0 Result = ArgX,ArgY - br.ret.sptk b0 + nop.m 999 +(p0) fmpy.s0 Result = ArgX,ArgY +(p0) br.ret.sptk b0 ;; } -;; - -// Here if y natval, nan, inf, zero -ATANL_Y_SPECIAL: -// Here if x natval, nan, inf, zero -ATANL_X_SPECIAL: +L(ATANL_SPECIAL_HANDLING): { .mfi - nop.m 999 - fclass.m p13,p12 = ArgY_orig, 0x0c3 // Test y nan - nop.i 999 + nop.m 999 +(p0) fcmp.eq.s0 p0, p6 = f1, ArgY_orig + nop.i 999 } -;; - { .mfi - nop.m 999 - fclass.m p15,p14 = ArgY_orig, 0x103 // Test y natval - nop.i 999 + nop.m 999 +(p0) fcmp.eq.s0 p0, p5 = f1, ArgX_orig + nop.i 999 ;; } -;; - { .mfi - nop.m 999 -(p12) fclass.m p13,p0 = ArgX_orig, 0x0c3 // Test x nan - nop.i 999 + nop.m 999 +(p0) fclass.m.unc p6, p7 = ArgY, 0x007 + nop.i 999 } -;; - -{ .mfi - nop.m 999 -(p14) fclass.m p15,p0 = ArgX_orig, 0x103 // Test x natval - nop.i 999 +{ .mlx + nop.m 999 +(p0) movl special = 992 } ;; -{ .mfb - nop.m 999 -(p13) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result nan if x or y nan -(p13) br.ret.spnt b0 // Exit if x or y nan -} -;; -{ .mfb +{ .mmi nop.m 999 -(p15) fmpy.s0 Result = ArgX_orig, ArgY_orig // Result natval if x or y natval -(p15) br.ret.spnt b0 // Exit if x or y natval +(p0) addl table_ptr1 = @ltoff(Constants_atan#), gp + nop.i 999 } ;; - -// Here if x or y inf or zero -ATANL_SPECIAL_HANDLING: -{ .mfi +{ .mmi + ld8 table_ptr1 = [table_ptr1] nop.m 999 - fclass.m p6, p7 = ArgY_orig, 0x007 // Test y zero - mov special = 992 // Offset to table + nop.i 999 } ;; -{ .mfb - add table_ptr1 = table_base, special // Point to 3pi/4 - fcmp.eq.s0 p0, p9 = ArgX_orig, ArgY_orig // Dummy to set denormal flag -(p7) br.cond.spnt ATANL_ArgY_Not_ZERO // Branch if y not zero -} -;; -// Here if y zero +{ .mib +(p0) add table_ptr1 = table_ptr1, special + nop.i 999 +(p7) br.cond.spnt L(ATANL_ArgY_Not_ZERO) ;; +} { .mmf - ldfd Result = [table_ptr1], 8 // Get pi high - nop.m 999 - fclass.m p14, p0 = ArgX, 0x035 // Test for x>=+0 +(p0) ldfd Result = [table_ptr1], 8 + nop.m 999 +(p6) fclass.m.unc p14, p0 = ArgX, 0x035 ;; } -;; - { .mmf - nop.m 999 - ldfd Result_lo = [table_ptr1], -8 // Get pi lo - fclass.m p15, p0 = ArgX, 0x036 // Test for x<=-0 + nop.m 999 +(p0) ldfd Result_lo = [table_ptr1], -8 +(p6) fclass.m.unc p15, p0 = ArgX, 0x036 ;; } -;; - -// -// Return sign_Y * 0 when ArgX > +0 -// { .mfi - nop.m 999 -(p14) fmerge.s Result = ArgY, f0 // If x>=+0, y=0, hi sgn(y)*0 - nop.i 999 + nop.m 999 +(p14) fmerge.s Result = ArgY, f0 + nop.i 999 } -;; - { .mfi - nop.m 999 - fclass.m p13, p0 = ArgX, 0x007 // Test for x=0 - nop.i 999 + nop.m 999 +(p6) fclass.m.unc p13, p0 = ArgX, 0x007 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 -(p14) fmerge.s Result_lo = ArgY, f0 // If x>=+0, y=0, lo sgn(y)*0 - nop.i 999 + nop.m 999 +(p14) fmerge.s Result_lo = ArgY, f0 + nop.i 999 ;; } -;; - { .mfi -(p13) mov GR_Parameter_TAG = 36 // Error tag for x=0, y=0 - nop.f 999 - nop.i 999 +(p13) mov GR_Parameter_TAG = 36 + nop.f 999 + nop.i 999 ;; } -;; - +{ .mfi + nop.m 999 // -// Return sign_Y * pi when ArgX < -0 +// Return sign_Y * 0 when ArgX > +0 // -{ .mfi - nop.m 999 -(p15) fmerge.s Result = ArgY, Result // If x<0, y=0, hi=sgn(y)*pi - nop.i 999 +(p15) fmerge.s Result = ArgY, Result + nop.i 999 ;; } -;; - { .mfi - nop.m 999 -(p15) fmerge.s Result_lo = ArgY, Result_lo // If x<0, y=0, lo=sgn(y)*pi - nop.i 999 + nop.m 999 +(p15) fmerge.s Result_lo = ArgY, Result_lo + nop.i 999 ;; } -;; - +{ .mfb + nop.m 999 // -// Call error support function for atan(0,0) +// Return sign_Y * 0 when ArgX < -0 // -{ .mfb - nop.m 999 - fadd.s0 Result = Result, Result_lo -(p13) br.cond.spnt __libm_error_region // Branch if atan(0,0) +(p0) fadd.s0 Result = Result, Result_lo +(p13) br.cond.spnt __libm_error_region ;; } -;; - { .mib - nop.m 999 - nop.i 999 - br.ret.sptk b0 // Exit for y=0, x not 0 + nop.m 999 + nop.i 999 +// +// Call error support funciton for atan(0,0) +// +(p0) br.ret.sptk b0 ;; } -;; - -// Here if y not zero -ATANL_ArgY_Not_ZERO: +L(ATANL_ArgY_Not_ZERO): { .mfi - nop.m 999 - fclass.m p0, p10 = ArgY, 0x023 // Test y inf - nop.i 999 + nop.m 999 +(p0) fclass.m.unc p9, p10 = ArgY, 0x023 + nop.i 999 ;; +} +{ .mib + nop.m 999 + nop.i 999 +(p10) br.cond.spnt L(ATANL_ArgY_Not_INF) ;; +} +{ .mfi + nop.m 999 +(p9) fclass.m.unc p6, p0 = ArgX, 0x017 + nop.i 999 +} +{ .mfi + nop.m 999 +(p9) fclass.m.unc p7, p0 = ArgX, 0x021 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p9) fclass.m.unc p8, p0 = ArgX, 0x022 + nop.i 999 ;; +} +{ .mmi +(p6) add table_ptr1 = 16, table_ptr1 ;; +(p0) ldfd Result = [table_ptr1], 8 + nop.i 999 ;; +} +{ .mfi +(p0) ldfd Result_lo = [table_ptr1], -8 + nop.f 999 + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p6) fmerge.s Result = ArgY, Result + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p6) fmerge.s Result_lo = ArgY, Result_lo + nop.i 999 ;; } -;; - { .mfb - nop.m 999 - fclass.m p6, p0 = ArgX, 0x017 // Test for 0 <= |x| < inf -(p10) br.cond.spnt ATANL_ArgY_Not_INF // Branch if 0 < |y| < inf + nop.m 999 +(p6) fadd.s0 Result = Result, Result_lo +(p6) br.ret.sptk b0 ;; } -;; - -// Here if y=inf // +// Load PI/2 and adjust its sign. // Return +PI/2 when ArgY = +Inf and ArgX = +/-0 or normal // Return -PI/2 when ArgY = -Inf and ArgX = +/-0 or normal -// Return +PI/4 when ArgY = +Inf and ArgX = +Inf -// Return -PI/4 when ArgY = -Inf and ArgX = +Inf -// Return +3PI/4 when ArgY = +Inf and ArgX = -Inf -// Return -3PI/4 when ArgY = -Inf and ArgX = -Inf // +{ .mmi +(p7) add table_ptr1 = 32, table_ptr1 ;; +(p7) ldfd Result = [table_ptr1], 8 + nop.i 999 ;; +} { .mfi - nop.m 999 - fclass.m p7, p0 = ArgX, 0x021 // Test for x=+inf - nop.i 999 +(p7) ldfd Result_lo = [table_ptr1], -8 + nop.f 999 + nop.i 999 ;; } -;; - { .mfi -(p6) add table_ptr1 = 16, table_ptr1 // Point to pi/2, if x finite - fclass.m p8, p0 = ArgX, 0x022 // Test for x=-inf - nop.i 999 + nop.m 999 +(p7) fmerge.s Result = ArgY, Result + nop.i 999 ;; } -;; - -{ .mmi -(p7) add table_ptr1 = 32, table_ptr1 // Point to pi/4 if x=+inf -;; -(p8) add table_ptr1 = 48, table_ptr1 // Point to 3pi/4 if x=-inf - - nop.i 999 +{ .mfi + nop.m 999 +(p7) fmerge.s Result_lo = ArgY, Result_lo + nop.i 999 ;; } -;; - +{ .mfb + nop.m 999 +(p7) fadd.s0 Result = Result, Result_lo +(p7) br.ret.sptk b0 ;; +} +// +// Load PI/4 and adjust its sign. +// Return +PI/4 when ArgY = +Inf and ArgX = +Inf +// Return -PI/4 when ArgY = -Inf and ArgX = +Inf +// { .mmi - ldfd Result = [table_ptr1], 8 // Load pi/2, pi/4, or 3pi/4 hi -;; - ldfd Result_lo = [table_ptr1], -8 // Load pi/2, pi/4, or 3pi/4 lo - nop.i 999 +(p8) add table_ptr1 = 48, table_ptr1 ;; +(p8) ldfd Result = [table_ptr1], 8 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fmerge.s Result = ArgY, Result // Merge sgn(y) in hi - nop.i 999 +(p8) ldfd Result_lo = [table_ptr1], -8 + nop.f 999 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fmerge.s Result_lo = ArgY, Result_lo // Merge sgn(y) in lo - nop.i 999 + nop.m 999 +(p8) fmerge.s Result = ArgY, Result + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p8) fmerge.s Result_lo = ArgY, Result_lo + nop.i 999 ;; } -;; - { .mfb - nop.m 999 - fadd.s0 Result = Result, Result_lo // Compute complete result - br.ret.sptk b0 // Exit for y=inf + nop.m 999 +(p8) fadd.s0 Result = Result, Result_lo +(p8) br.ret.sptk b0 ;; } -;; - -// Here if y not INF, and x=0 or INF -ATANL_ArgY_Not_INF: +L(ATANL_ArgY_Not_INF): +{ .mfi + nop.m 999 // -// Return +PI/2 when ArgY NOT Inf, ArgY > 0 and ArgX = +/-0 -// Return -PI/2 when ArgY NOT Inf, ArgY < 0 and ArgX = +/-0 -// Return +0 when ArgY NOT Inf, ArgY > 0 and ArgX = +Inf -// Return -0 when ArgY NOT Inf, ArgY > 0 and ArgX = +Inf -// Return +PI when ArgY NOT Inf, ArgY > 0 and ArgX = -Inf -// Return -PI when ArgY NOT Inf, ArgY > 0 and ArgX = -Inf +// Load PI/4 and adjust its sign. +// Return +3PI/4 when ArgY = +Inf and ArgX = -Inf +// Return -3PI/4 when ArgY = -Inf and ArgX = -Inf // +(p0) fclass.m.unc p6, p0 = ArgX, 0x007 + nop.i 999 +} { .mfi - nop.m 999 - fclass.m p7, p9 = ArgX, 0x021 // Test for x=+inf - nop.i 999 + nop.m 999 +(p0) fclass.m.unc p7, p0 = ArgX, 0x021 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 - fclass.m p6, p0 = ArgX, 0x007 // Test for x=0 - nop.i 999 + nop.m 999 +(p0) fclass.m.unc p8, p0 = ArgX, 0x022 + nop.i 999 ;; +} +{ .mmi +(p6) add table_ptr1 = 16, table_ptr1 ;; +(p6) ldfd Result = [table_ptr1], 8 + nop.i 999 ;; } -;; - { .mfi -(p6) add table_ptr1 = 16, table_ptr1 // Point to pi/2 - fclass.m p8, p0 = ArgX, 0x022 // Test for x=-inf - nop.i 999 +(p6) ldfd Result_lo = [table_ptr1], -8 + nop.f 999 + nop.i 999 ;; } -;; - -.pred.rel "mutex",p7,p9 { .mfi -(p9) ldfd Result = [table_ptr1], 8 // Load pi or pi/2 hi -(p7) fmerge.s Result = ArgY, f0 // If y not inf, x=+inf, sgn(y)*0 - nop.i 999 + nop.m 999 +(p6) fmerge.s Result = ArgY, Result + nop.i 999 ;; } -;; - { .mfi -(p9) ldfd Result_lo = [table_ptr1], -8 // Load pi or pi/2 lo -(p7) fnorm.s0 Result = Result // If y not inf, x=+inf normalize - nop.i 999 + nop.m 999 +(p6) fmerge.s Result_lo = ArgY, Result_lo + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p6) fadd.s0 Result = Result, Result_lo +(p6) br.ret.spnt b0 ;; } -;; - { .mfi - nop.m 999 -(p9) fmerge.s Result = ArgY, Result // Merge sgn(y) in hi - nop.i 999 + nop.m 999 +// +// return = sign_Y * PI/2 when ArgX = 0 +// +(p7) fmerge.s Result = ArgY, f0 + nop.i 999 ;; +} +{ .mfb + nop.m 999 +(p7) fnorm.s0 Result = Result +(p7) br.ret.spnt b0 ;; +} +// +// return = sign_Y * 0 when ArgX = Inf +// +{ .mmi +(p8) ldfd Result = [table_ptr1], 8 ;; +(p8) ldfd Result_lo = [table_ptr1], -8 + nop.i 999 ;; } -;; - { .mfi - nop.m 999 -(p9) fmerge.s Result_lo = ArgY, Result_lo // Merge sgn(y) in lo - nop.i 999 + nop.m 999 +(p8) fmerge.s Result = ArgY, Result + nop.i 999 ;; +} +{ .mfi + nop.m 999 +(p8) fmerge.s Result_lo = ArgY, Result_lo + nop.i 999 ;; } -;; - { .mfb - nop.m 999 -(p9) fadd.s0 Result = Result, Result_lo // Compute complete result - br.ret.spnt b0 // Exit for y not inf, x=0,inf + nop.m 999 +(p8) fadd.s0 Result = Result, Result_lo +(p8) br.ret.sptk b0 ;; } -;; - -GLOBAL_IEEE754_END(atan2l) -LOCAL_LIBM_ENTRY(__libm_error_region) +// +// return = sign_Y * PI when ArgX = -Inf +// +.endp atan2l +ASM_SIZE_DIRECTIVE(atan2l) +ASM_SIZE_DIRECTIVE(__atan2l) +ASM_SIZE_DIRECTIVE(__ieee754_atan2l) + +.proc __libm_error_region +__libm_error_region: .prologue { .mfi add GR_Parameter_Y=-32,sp // Parameter 2 value @@ -1999,6 +2001,7 @@ LOCAL_LIBM_ENTRY(__libm_error_region) br.ret.sptk b0 // Return };; -LOCAL_LIBM_END(__libm_error_region#) +.endp __libm_error_region +ASM_SIZE_DIRECTIVE(__libm_error_region) .type __libm_error_support#,@function .global __libm_error_support# -- cgit v1.2.3