.file "powl.s" // Copyright (C) 2000, 2001, Intel Corporation // All rights reserved. // // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story, // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // // * The name of Intel Corporation may not be used to endorse or promote // products derived from this software without specific prior written // permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Intel Corporation is the author of this code, and requests that all // problem reports or change requests be submitted to it directly at // http://developer.intel.com/opensource. 
// // ********************************************************************* // // Function: powl(x,y), where // powl(x,y) = x ** y, for double extended precision x and y values // // ********************************************************************* // // History: // 2/02/00 (Hand Optimized) // 4/04/00 Unwind support added // 8/15/00 Bundle added after call to __libm_error_support to properly // set [the previously overwritten] GR_Parameter_RESULT. // 1/22/01 Corrected results for powl(1,inf), powl(1,nan), and // powl(snan,0) to be 1 per C99, not nan. Fixed many flag settings. // 2/06/01 Call __libm_error_support if over/underflow when y=2. // // ********************************************************************* // // Resources Used: // // Floating-Point Registers: // f8 (Input and Return Value) // f9-f15,f32-f63,f99 // // General Purpose Registers: // Locals r32 - r61 // Parameters to __libm_error_support r62,r63,r64,r65 // // Predicate Registers: p6-p15 // // ********************************************************************* // // Special Cases and IEEE special conditions: // // Denormal fault raised on denormal inputs // Overflow exceptions raised when appropriate for pow // Underflow exceptions raised when appropriate for pow // (Error Handling Routine called for overflow and Underflow) // Inexact raised when appropriate by algorithm // // 1. (anything) ** NatVal or (NatVal) ** anything is NatVal // 2. X or Y unsupported or sNaN is qNaN/Invalid // 3. (anything) ** 0 is 1 // 4. (anything) ** 1 is itself // 5. (anything except 1) ** qNAN is qNAN // 6. qNAN ** (anything except 0) is qNAN // 7. +-(|x| > 1) ** +INF is +INF // 8. +-(|x| > 1) ** -INF is +0 // 9. +-(|x| < 1) ** +INF is +0 // 10. +-(|x| < 1) ** -INF is +INF // 11. +-1 ** +-INF is +1 // 12. +0 ** (+anything except 0, NAN) is +0 // 13. -0 ** (+anything except 0, NAN, odd integer) is +0 // 14. +0 ** (-anything except 0, NAN) is +INF/div_0 // 15. 
-0 ** (-anything except 0, NAN, odd integer) is +INF/div_0 // 16. -0 ** (odd integer) = -( +0 ** (odd integer) ) // 17. +INF ** (+anything except 0,NAN) is +INF // 18. +INF ** (-anything except 0,NAN) is +0 // 19. -INF ** (anything except NAN) = -0 ** (-anything) // 20. (-anything) ** (integer) is (-1)**(integer)*(+anything**integer) // 21. (-anything except 0 and inf) ** (non-integer) is qNAN/Invalid // 22. X or Y denorm/unorm and denorm/unorm operand trap is enabled, // generate denorm/unorm fault except if invalid or div_0 raised. // // ********************************************************************* // // Algorithm // ========= // // Special Cases // // If Y = 2, return X*X. // If Y = 0.5, return sqrt(X). // // Compute log(X) to extra precision. // // ker_log_80( X, logX_hi, logX_lo, Safe ); // // ...logX_hi + logX_lo approximates log(X) to roughly 80 // ...significant bits of accuracy. // // Compute Y*log(X) to extra precision. // // P_hi := Y * logX_hi // P_lo := Y * logX_hi - P_hi ...using FMA // P_lo := Y * logX_lo + P_lo ...using FMA // // Compute exp(P_hi + P_lo) // // Flag := 2; // Expo_Range := 2; (assuming double-extended power function) // ker_exp_64( P_hi, P_lo, Flag, Expo_Range, // Z_hi, Z_lo, scale, Safe ) // // scale := sgn * scale // // If (Safe) then ...result will not over/underflow // return scale*Z_hi + (scale*Z_lo) // quickly // Else // take necessary precaution in computing // scale*Z_hi + (scale*Z_lo) // to set possible exceptions correctly. // End If // // Case_Y_Special // // ...Follow the order of the case checks // // If Y is +-0, return +1 without raising any exception. // If Y is +1, return X without raising any exception. // If Y is qNaN, return Y without exception. // If X is qNaN, return X without exception. // // At this point, X is real and Y is +-inf. // Thus |X| can only be 1, strictly bigger than 1, or // strictly less than 1. // // If |X| < 1, then // return ( Y == +inf? 
+0 : +inf ) // elseif |X| > 1, then // return ( Y == +inf? +inf : +0 ) // else // return +1 ...|X| = 1, see special case 11 (+-1 ** +-INF is +1) // // Case_X_Special // // ...Follow the order of the case checks // ...Note that Y is real, finite, non-zero, and not +1. // // If X is qNaN, return X without exception. // // If X is +-0, // return ( Y > 0 ? +0 : +inf ) // // If X is +inf // return ( Y > 0 ? +inf : +0 ) // // If X is -inf // return -0 ** -Y // return ( Y > 0 ? +inf : +0 ) // // Case_Invalid // // Return 0 * inf to generate a quiet NaN together // with an invalid exception. // // Implementation // ============== // // We describe the quick branch since this part is important // in reaching the normal case efficiently. // // STAGE 1 // ------- // This stage contains two threads. // // Stage1.Thread1 // // fclass.m X_excep, X_ok = X, (NatVal or s/qNaN) or // +-0, +-infinity // // fclass.nm X_unsupp, X_supp = X, (NatVal or s/qNaN) or // +-(0, unnorm, norm, infinity) // // X_norm := fnorm( X ) with traps disabled // // If (X_excep) goto Filtering (Step 2) // If (X_unsupp) goto Filtering (Step 2) // // Stage1.Thread2 // .............. // // fclass.m Y_excep, Y_ok = Y, (NatVal or s/qNaN) or // +-0, +-infinity // // fclass.nm Y_unsupp, Y_supp = Y, (NatVal or s/qNaN) or // +-(0, unnorm, norm, infinity) // // Y_norm := fnorm( Y ) with traps disabled // // If (Y_excep) goto Filtering (Step 2) // If (Y_unsupp) goto Filtering (Step 2) // // // STAGE 2 // ------- // This stage contains two threads. // // Stage2.Thread1 // .............. // // Set X_lt_0 if X < 0 (using fcmp) // sgn := +1.0 // If (X_lt_0) goto Filtering (Step 2) // // Stage2.Thread2 // .............. // // Set Y_is_1 if Y = +1 (using fcmp) // If (Y_is_1) goto Filtering (Step 2) // // STAGE 3 // ------- // This stage contains two threads. // // // Stage3.Thread1 // .............. // // X := fnorm(X) in prevailing traps // // // Stage3.Thread2 // .............. 
//
//        Y := fnorm(Y) in prevailing traps
//
//   STAGE 4
//   -------
//
//   Go to Case_Normal.
//

#include "libm_support.h"

// Constant tables for powl.  They are placed in .rodata when _LIBC is
// defined and in .data otherwise.  Every table is 64-byte aligned and is
// bracketed by ASM_TYPE_DIRECTIVE / ASM_SIZE_DIRECTIVE.
// Layout notes below are stated only where the loads in powl make them
// visible; tables whose consumer is elsewhere in the file carry a
// NOTE(review) instead.

#ifdef _LIBC
.rodata
#else
.data
#endif

// Inv_L, L_hi, L_lo
// Three 16-byte entries.  powl reads them in this order with
// post-incrementing ldfe (stride 16) into FR_L_Inv, FR_L_hi and FR_L_lo.
.align 64
Constants_exp_64_Arg:
ASM_TYPE_DIRECTIVE(Constants_exp_64_Arg,@object)
data4 0x5C17F0BC,0xB8AA3B29,0x0000400B,0x00000000
data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000
data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000
ASM_SIZE_DIRECTIVE(Constants_exp_64_Arg)

// NOTE(review): the consumer of this table is not in the part of the file
// reviewed here.  Rows look like pairs of 64-bit integers stored low word
// first -- confirm against the exp kernel.
.align 64
Constants_exp_64_Exponents:
ASM_TYPE_DIRECTIVE(Constants_exp_64_Exponents,@object)
data4 0x0000007E,0x00000000,0xFFFFFF83,0xFFFFFFFF
data4 0x000003FE,0x00000000,0xFFFFFC03,0xFFFFFFFF
data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF
data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF
data4 0xFFFFFFE2,0xFFFFFFFF,0xFFFFFFC4,0xFFFFFFFF
data4 0xFFFFFFBA,0xFFFFFFFF,0xFFFFFFBA,0xFFFFFFFF
ASM_SIZE_DIRECTIVE(Constants_exp_64_Exponents)

// Three 16-byte entries (same row shape as the ldfe-loaded tables below).
// NOTE(review): "Reversed" presumably means highest-numbered coefficient
// first -- confirm against the loads in the exp kernel.
.align 64
Constants_exp_64_A:
ASM_TYPE_DIRECTIVE(Constants_exp_64_A,@object)
// Reversed
data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000
data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000
data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000
ASM_SIZE_DIRECTIVE(Constants_exp_64_A)

// Six 16-byte entries.
// NOTE(review): see the remark on "Reversed" at Constants_exp_64_A.
.align 64
Constants_exp_64_P:
ASM_TYPE_DIRECTIVE(Constants_exp_64_P,@object)
// Reversed
data4 0x43914A8A,0xD00D6C81,0x00003FF2,0x00000000
data4 0x30304B30,0xB60BC4AC,0x00003FF5,0x00000000
data4 0x7474C518,0x88888888,0x00003FF8,0x00000000
data4 0x8DAE729D,0xAAAAAAAA,0x00003FFA,0x00000000
data4 0xAAAAAF61,0xAAAAAAAA,0x00003FFC,0x00000000
data4 0x000004C7,0x80000000,0x00003FFE,0x00000000
ASM_SIZE_DIRECTIVE(Constants_exp_64_P)

// 64 32-bit words.
// NOTE(review): presumably IEEE single values (the first word, 0x3F800000,
// is the bit pattern of 1.0f) -- confirm against the loads that use
// GR_T1_ptr.
.align 64
Constants_exp_64_T1:
ASM_TYPE_DIRECTIVE(Constants_exp_64_T1,@object)
data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC
data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D
data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA
data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516
data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A
data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4
data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B
data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD
data4 0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15
data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B
data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5
data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A
data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177
data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C
ASM_SIZE_DIRECTIVE(Constants_exp_64_T1)

// 64 32-bit words.
// NOTE(review): same shape as Constants_exp_64_T1; presumably IEEE single
// values -- confirm against the loads that use GR_T2_ptr.
.align 64
Constants_exp_64_T2:
ASM_TYPE_DIRECTIVE(Constants_exp_64_T2,@object)
data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37
ASM_SIZE_DIRECTIVE(Constants_exp_64_T2)

// 128 32-bit words.
// NOTE(review): presumably 64 IEEE doubles stored low word first --
// confirm against the loads that use GR_W1_ptr.
.align 64
Constants_exp_64_W1:
ASM_TYPE_DIRECTIVE(Constants_exp_64_W1,@object)
data4 0x00000000,0x00000000,0x171EC4B4,0xBE384454
data4 0x4AA72766,0xBE694741,0xD42518F8,0xBE5D32B6
data4 0x3A319149,0x3E68D96D,0x62415F36,0xBE68F4DA
data4 0xC9C86A3B,0xBE6DDA2F,0xF49228FE,0x3E6B2E50
data4 0x1188B886,0xBE49C0C2,0x1A4C2F1F,0x3E64BFC2
data4 0x2CB98B54,0xBE6A2FBB,0x9A55D329,0x3E5DC5DE
data4 0x39A7AACE,0x3E696490,0x5C66DBA5,0x3E54728B
data4 0xBA1C7D7D,0xBE62B0DB,0x09F1AF5F,0x3E576E04
data4 0x1A0DD6A1,0x3E612500,0x795FBDEF,0xBE66A419
data4 0xE1BD41FC,0xBE5CDE8C,0xEA54964F,0xBE621376
data4 0x476E76EE,0x3E6370BE,0x3427EB92,0x3E390D1A
data4 0x2BF82BF8,0x3E1336DE,0xD0F7BD9E,0xBE5FF1CB
data4 0x0CEB09DD,0xBE60A355,0x0980F30D,0xBE5CA37E
data4 0x4C082D25,0xBE5C541B,0x3B467D29,0xBE5BBECA
data4 0xB9D946C5,0xBE400D8A,0x07ED374A,0xBE5E2A08
data4 0x365C8B0A,0xBE66CB28,0xD3403BCA,0x3E3AAD5B
data4 0xC7EA21E0,0x3E526055,0xE72880D6,0xBE442C75
data4 0x85222A43,0x3E58B2BB,0x522C42BF,0xBE5AAB79
data4 0x469DC2BC,0xBE605CB4,0xA48C40DC,0xBE589FA7
data4 0x1AA42614,0xBE51C214,0xC37293F4,0xBE48D087
data4 0xA2D673E0,0x3E367A1C,0x114F7A38,0xBE51BEBB
data4 0x661A4B48,0xBE6348E5,0x1D3B9962,0xBDF52643
data4 0x35A78A53,0x3E3A3B5E,0x1CECD788,0xBE46C46C
data4 0x7857D689,0xBE60B7EC,0xD14F1AD7,0xBE594D3D
data4 0x4C9A8F60,0xBE4F9C30,0x02DFF9D2,0xBE521873
data4 0x55E6D68F,0xBE5E4C88,0x667F3DC4,0xBE62140F
data4 0x3BF88747,0xBE36961B,0xC96EC6AA,0x3E602861
data4 0xD57FD718,0xBE3B5151,0xFC4A627B,0x3E561CD0
data4 0xCA913FEA,0xBE3A5217,0x9A5D193A,0x3E40A3CC
data4 0x10A9C312,0xBE5AB713,0xC5F57719,0x3E4FDADB
data4 0xDBDF59D5,0x3E361428,0x61B4180D,0x3E5DB5DB
data4 0x7408D856,0xBE42AD5F,0x31B2B707,0x3E2A3148
ASM_SIZE_DIRECTIVE(Constants_exp_64_W1)

// 128 32-bit words.
// NOTE(review): same shape as Constants_exp_64_W1; presumably 64 IEEE
// doubles stored low word first -- confirm against the loads that use
// GR_W2_ptr.
.align 64
Constants_exp_64_W2:
ASM_TYPE_DIRECTIVE(Constants_exp_64_W2,@object)
data4 0x00000000,0x00000000,0x37A3D7A2,0xBE641F25
data4 0xAD028C40,0xBE68DD57,0xF212B1B6,0xBE5C77D8
data4 0x1BA5B070,0x3E57878F,0x2ECAE6FE,0xBE55A36A
data4 0x569DFA3B,0xBE620608,0xA6D300A3,0xBE53B50E
data4 0x223F8F2C,0x3E5B5EF2,0xD6DE0DF4,0xBE56A0D9
data4 0xEAE28F51,0xBE64EEF3,0x367EA80B,0xBE5E5AE2
data4 0x5FCBC02D,0x3E47CB1A,0x9BDAFEB7,0xBE656BA0
data4 0x805AFEE7,0x3E6E70C6,0xA3415EBA,0xBE6E0509
data4 0x49BFF529,0xBE56856B,0x00508651,0x3E66DD33
data4 0xC114BC13,0x3E51165F,0xC453290F,0x3E53333D
data4 0x05539FDA,0x3E6A072B,0x7C0A7696,0xBE47CD87
data4 0xEB05C6D9,0xBE668BF4,0x6AE86C93,0xBE67C3E3
data4 0xD0B3E84B,0xBE533904,0x556B53CE,0x3E63E8D9
data4 0x63A98DC8,0x3E212C89,0x032A7A22,0xBE33138F
data4 0xBC584008,0x3E530FA9,0xCCB93C97,0xBE6ADF82
data4 0x8370EA39,0x3E5F9113,0xFB6A05D8,0x3E5443A4
data4 0x181FEE7A,0x3E63DACD,0xF0F67DEC,0xBE62B29D
data4 0x3DDE6307,0x3E65C483,0xD40A24C1,0x3E5BF030
data4 0x14E437BE,0x3E658B8F,0xED98B6C7,0xBE631C29
data4 0x04CF7C71,0x3E6335D2,0xE954A79D,0x3E529EED
data4 0xF64A2FB8,0x3E5D9257,0x854ED06C,0xBE6BED1B
data4 0xD71405CB,0x3E5096F6,0xACB9FDF5,0xBE3D4893
data4 0x01B68349,0xBDFEB158,0xC6A463B9,0x3E628D35
data4 0xADE45917,0xBE559725,0x042FC476,0xBE68C29C
data4 0x01E511FA,0xBE67593B,0x398801ED,0xBE4A4313
data4 0xDA7C3300,0x3E699571,0x08062A9E,0x3E5349BE
data4 0x755BB28E,0x3E5229C4,0x77A1F80D,0x3E67E426
data4 0x6B69C352,0xBE52B33F,0x084DA57F,0xBE6B3550
data4 0xD1D09A20,0xBE6DB03F,0x2161B2C1,0xBE60CBC4
data4 0x78A2B771,0x3E56ED9C,0x9D0FA795,0xBE508E31
data4 0xFD1A54E9,0xBE59482A,0xB07FD23E,0xBE2A17CE
data4 0x17365712,0x3E68BF5C,0xB3785569,0x3E3956F9
ASM_SIZE_DIRECTIVE(Constants_exp_64_W2)

// Coefficients for the LOGL80_NEAR path, 16-byte stride, read with ldfe.
// That path walks two pointers through the table: one from offset 0
// (1/2, P_8, P_7, P_6, P_5) and one from offset 0x50 (P_4, P_3, P_2, P_1).
// The final row carries only three words (no trailing pad word).
.align 64
Constants_log_80_P:
ASM_TYPE_DIRECTIVE(Constants_log_80_P,@object)
// 1/2, P_8, P_7, ..., P_1
data4 0x00000000, 0x80000000, 0x00003FFE, 0x00000000
data4 0x3B1042BC, 0xCCCE8B88, 0x0000BFFB, 0x00000000
data4 0xCADC2149, 0xE38997B7, 0x00003FFB, 0x00000000
data4 0xB1ACB090, 0xFFFFFFFE, 0x0000BFFB, 0x00000000
data4 0x06481C81, 0x92492498, 0x00003FFC, 0x00000000
data4 0xAAAAB0EF, 0xAAAAAAAA, 0x0000BFFC, 0x00000000
data4 0xCCC91416, 0xCCCCCCCC, 0x00003FFC, 0x00000000
data4 0x00000000, 0x80000000, 0x0000BFFD, 0x00000000
data4 0xAAAAAAAB, 0xAAAAAAAA, 0x00003FFD
ASM_SIZE_DIRECTIVE(Constants_log_80_P)

// Eight 16-byte entries, read front to back with post-incrementing ldfe
// (stride 16) into FR_log2_hi, FR_log2_lo, FR_Q_6 ... FR_Q_1, i.e. in
// exactly the order listed below.
.align 64
Constants_log_80_Q:
ASM_TYPE_DIRECTIVE(Constants_log_80_Q,@object)
// log2_hi, log2_lo, Q_6, Q_5, Q_4, Q_3, Q_2, Q_1
data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
data4 0xA51BE0AF,0x92492453,0x00003FFC,0x00000000
data4 0xA0CFD29F,0xAAAAAB73,0x0000BFFC,0x00000000
data4 0xCCCE3872,0xCCCCCCCC,0x00003FFC,0x00000000
data4 0xFFFFB4FB,0xFFFFFFFF,0x0000BFFC,0x00000000
data4 0xAAAAAAAB,0xAAAAAAAA,0x00003FFD,0x00000000
data4 0x00000000,0x80000000,0x0000BFFE,0x00000000
ASM_SIZE_DIRECTIVE(Constants_log_80_Q)

// First-stage log table: 16 entries of 32 bytes (two data4 rows each).
// powl forms the entry address as table + (Index1 << 5), where Index1 is
// the 4-bit field taken from the significand of Z by
// "extr.u GR_Index1 = GR_signif_Z, 59, 4".
// Entry layout as read by powl:
//   +0   Z1  (ld2)
//   +4   G1  (ldfs)
//   +8   H1  (ldfs)
//   +16  h1  (ldfe)
.align 64
Constants_log_80_Z_G_H_h1:
ASM_TYPE_DIRECTIVE(Constants_log_80_Z_G_H_h1,@object)
// Z1 - 16 bit fixed, G1 and H1 IEEE single, h1 IEEE double extended
data4 0x00008000,0x3F800000,0x00000000,0x00000000
data4 0x00000000,0x00000000,0x00000000,0x00000000
data4 0x00007879,0x3F70F0F0,0x3D785196,0x00000000
data4 0xEBA0E0D1,0x8B1D330B,0x00003FDA,0x00000000
data4 0x000071C8,0x3F638E38,0x3DF13843,0x00000000
data4 0x9EADD553,0xE2AF365E,0x00003FE2,0x00000000
data4 0x00006BCB,0x3F579430,0x3E2FF9A0,0x00000000
data4 0x752F34A2,0xF585FEC3,0x0000BFE3,0x00000000
data4 0x00006667,0x3F4CCCC8,0x3E647FD6,0x00000000
data4 0x893B03F3,0xF3546435,0x00003FE2,0x00000000
data4 0x00006187,0x3F430C30,0x3E8B3AE7,0x00000000
data4 0x39CDD2AC,0xBABA62E0,0x00003FE4,0x00000000
data4 0x00005D18,0x3F3A2E88,0x3EA30C68,0x00000000
data4 0x457978A1,0x8718789F,0x00003FE2,0x00000000
data4 0x0000590C,0x3F321640,0x3EB9CEC8,0x00000000
data4 0x3185E56A,0x9442DF96,0x0000BFE4,0x00000000
data4 0x00005556,0x3F2AAAA8,0x3ECF9927,0x00000000
data4 0x2BBE2CBD,0xCBF9A4BF,0x00003FE4,0x00000000
data4 0x000051EC,0x3F23D708,0x3EE47FC5,0x00000000
data4 0x852D5935,0xF3537535,0x00003FE3,0x00000000
data4 0x00004EC5,0x3F1D89D8,0x3EF8947D,0x00000000
data4 0x46CDF32F,0xA1F1E699,0x0000BFDF,0x00000000
data4 0x00004BDB,0x3F17B420,0x3F05F3A1,0x00000000
data4 0xD8484CE3,0x84A61856,0x00003FE4,0x00000000
data4 0x00004925,0x3F124920,0x3F0F4303,0x00000000
data4 0xFF28821B,0xC7DD97E0,0x0000BFE2,0x00000000
data4 0x0000469F,0x3F0D3DC8,0x3F183EBF,0x00000000
data4 0xEF1FD32F,0xD3C4A887,0x00003FE3,0x00000000
data4 0x00004445,0x3F088888,0x3F20EC80,0x00000000
data4 0x464C76DA,0x84672BE6,0x00003FE5,0x00000000
data4 0x00004211,0x3F042108,0x3F29516A,0x00000000
data4 0x18835FB9,0x9A43A511,0x0000BFE5,0x00000000
ASM_SIZE_DIRECTIVE(Constants_log_80_Z_G_H_h1)

// Second-stage log table: 16 entries of 32 bytes, same entry layout as
// Constants_log_80_Z_G_H_h1 (Z2 via ld2 at +0, G2 via ldfs at +4, H2 via
// ldfs at +8, h2 via ldfe at +16).  The entry address is
// table + (Index2 << 5), with Index2 taken by
// "extr.u GR_Index2 = GR_X_1, 6, 4".
.align 64
Constants_log_80_Z_G_H_h2:
ASM_TYPE_DIRECTIVE(Constants_log_80_Z_G_H_h2,@object)
// Z2 - 16 bit fixed, G2 and H2 IEEE single, h2 IEEE double extended
data4 0x00008000,0x3F800000,0x00000000,0x00000000
data4 0x00000000,0x00000000,0x00000000,0x00000000
data4 0x00007F81,0x3F7F00F8,0x3B7F875D,0x00000000
data4 0x211398BF,0xAD08B116,0x00003FDB,0x00000000
data4 0x00007F02,0x3F7E03F8,0x3BFF015B,0x00000000
data4 0xC376958E,0xB106790F,0x00003FDE,0x00000000
data4 0x00007E85,0x3F7D08E0,0x3C3EE393,0x00000000
data4 0x79A7679A,0xFD03F242,0x0000BFDA,0x00000000
data4 0x00007E08,0x3F7C0FC0,0x3C7E0586,0x00000000
data4 0x05E7AE08,0xF03F81C3,0x0000BFDF,0x00000000
data4 0x00007D8D,0x3F7B1880,0x3C9E75D2,0x00000000
data4 0x049EB22F,0xD1B87D3C,0x00003FDE,0x00000000
data4 0x00007D12,0x3F7A2328,0x3CBDC97A,0x00000000
data4 0x3A9E81E0,0xFABC8B95,0x00003FDF,0x00000000
data4 0x00007C98,0x3F792FB0,0x3CDCFE47,0x00000000
data4 0x7C4B5443,0xF5F3653F,0x00003FDF,0x00000000
data4 0x00007C20,0x3F783E08,0x3CFC15D0,0x00000000
data4 0xF65A1773,0xE78AB204,0x00003FE0,0x00000000
data4 0x00007BA8,0x3F774E38,0x3D0D874D,0x00000000
data4 0x7B8EF695,0xDB7CBFFF,0x0000BFE0,0x00000000
data4 0x00007B31,0x3F766038,0x3D1CF49B,0x00000000
data4 0xCF773FB3,0xC0241AEA,0x0000BFE0,0x00000000
data4 0x00007ABB,0x3F757400,0x3D2C531D,0x00000000
data4 0xC9539FDF,0xFC8F4D48,0x00003FE1,0x00000000
data4 0x00007A45,0x3F748988,0x3D3BA322,0x00000000
data4 0x954665C2,0x9CD035FB,0x0000BFE1,0x00000000
data4 0x000079D1,0x3F73A0D0,0x3D4AE46F,0x00000000
data4 0xDD367A30,0xEC9017C7,0x00003FE1,0x00000000
data4 0x0000795D,0x3F72B9D0,0x3D5A1756,0x00000000
data4 0xCB11189C,0xEE6625D3,0x0000BFE1,0x00000000
data4 0x000078EB,0x3F71D488,0x3D693B9D,0x00000000
data4 0xBE11C424,0xA49C8DB5,0x0000BFE0,0x00000000
ASM_SIZE_DIRECTIVE(Constants_log_80_Z_G_H_h2)

// Third-stage log table, in two parts.  Index3 is the 5-bit field taken by
// "extr.u GR_Index3 = GR_X_2, 1, 5".
//   Part 1 (offset 0): 32 entries of 16 bytes at table + Index3*16;
//                      h3 is read with ldfe at +0 and G3 with ldfs at +12.
//   Part 2 (offset 0x200): 32 single-precision H3 values, read with ldfs
//                      at table + 0x200 + Index3*4.
.align 64
Constants_log_80_h3_G_H:
ASM_TYPE_DIRECTIVE(Constants_log_80_h3_G_H,@object)
// h3 IEEE double extended, H3 and G3 IEEE single
data4 0x112666B0,0xAAACAAB1,0x00003FD3,0x3F7FFC00
data4 0x9B7FAD21,0x90051030,0x00003FD8,0x3F7FF400
data4 0xF4D783C4,0xA6B46F46,0x00003FDA,0x3F7FEC00
data4 0x11C6DDCA,0xDA148D88,0x0000BFD8,0x3F7FE400
data4 0xCA964D95,0xCE65C1D8,0x0000BFD8,0x3F7FDC00
data4 0x23412D13,0x883838EE,0x0000BFDB,0x3F7FD400
data4 0x983ED687,0xB7E5CFA1,0x00003FDB,0x3F7FCC08
data4 0xE3C3930B,0xDBE23B16,0x0000BFD9,0x3F7FC408
data4 0x48AA4DFC,0x9B92F1FC,0x0000BFDC,0x3F7FBC10
data4 0xCE9C8F7E,0x9A8CEB15,0x0000BFD9,0x3F7FB410
data4 0x0DECE74A,0x8C220879,0x00003FDC,0x3F7FAC18
data4 0x2F053150,0xB25CA912,0x0000BFDA,0x3F7FA420
data4 0xD9A5BE20,0xA5876555,0x00003FDB,0x3F7F9C20
data4 0x2053F087,0xC919BB6E,0x00003FD9,0x3F7F9428
data4 0x041E9A77,0xB70BDA79,0x00003FDC,0x3F7F8C30
data4 0xEA1C9C30,0xF18A5C08,0x00003FDA,0x3F7F8438
data4 0x796D89E5,0xA3790D84,0x0000BFDD,0x3F7F7C40
data4 0xA2915A3A,0xE1852369,0x0000BFDD,0x3F7F7448
data4 0xA39ED868,0xD803858F,0x00003FDC,0x3F7F6C50
data4 0x9417EBB7,0xB2EEE356,0x0000BFDD,0x3F7F6458
data4 0x9BB0D07F,0xED5C1F8A,0x0000BFDC,0x3F7F5C68
data4 0xE87C740A,0xD6D201A0,0x0000BFDD,0x3F7F5470
data4 0x1CA74025,0xE8DEBF5E,0x00003FDC,0x3F7F4C78
data4 0x1F34A7EB,0x9A995A97,0x0000BFDC,0x3F7F4488
data4 0x359EED97,0x9CB0F742,0x0000BFDA,0x3F7F3C90
data4 0xBBC6A1C8,0xD6F833C2,0x0000BFDD,0x3F7F34A0
data4 0xE71090EC,0xE1F68F2A,0x00003FDC,0x3F7F2CA8
data4 0xC160A74F,0xD1881CF1,0x0000BFDB,0x3F7F24B8
data4 0xD78CB5A4,0x9AD05AE2,0x00003FD6,0x3F7F1CC8
data4 0x9A77DC4B,0xE658CB8E,0x0000BFDD,0x3F7F14D8
data4 0x6BD6D312,0xBA281296,0x00003FDC,0x3F7F0CE0
data4 0xF95210D0,0xB478BBEB,0x0000BFDB,0x3F7F04F0
data4 0x38800100,0x39400480,0x39A00640,0x39E00C41 // H's start here
data4 0x3A100A21,0x3A300F22,0x3A4FF51C,0x3A6FFC1D
data4 0x3A87F20B,0x3A97F68B,0x3AA7EB86,0x3AB7E101
data4 0x3AC7E701,0x3AD7DD7B,0x3AE7D474,0x3AF7CBED
data4 0x3B03E1F3,0x3B0BDE2F,0x3B13DAAA,0x3B1BD766
data4 0x3B23CC5C,0x3B2BC997,0x3B33C711,0x3B3BBCC6
data4 0x3B43BAC0,0x3B4BB0F4,0x3B53AF6D,0x3B5BA620
data4 0x3B639D12,0x3B6B9444,0x3B7393BC,0x3B7B8B6D
ASM_SIZE_DIRECTIVE(Constants_log_80_h3_G_H)

.align 64
Constant_half:
// Payload of Constant_half: one three-word value that powl reads with
// "ldfe FR_Half = [GR_Table_Ptr],0".  Its words are identical to the first
// three words of the "1/2" row of Constants_log_80_P.
ASM_TYPE_DIRECTIVE(Constant_half,@object)
data4 0x00000000,0x80000000,0x00003FFE
ASM_SIZE_DIRECTIVE(Constant_half)

// ---------------------------------------------------------------------
// Symbolic register names.
//
// Several names deliberately map to the same physical register; the list
// is ordered by register number so the sharing is easy to see.  Names that
// share a register must not be live at the same time.
//
// General registers: powl's entry "alloc ... = ar.pfs,0,30,4,0" requests
// 30 locals and 4 outputs, which matches r32-r61 for the names below and
// r62-r65 for the GR_Parameter_* names.
// ---------------------------------------------------------------------
GR_Expo_Range = r32
GR_Flag = r33
GR_Table_Ptr = r34
GR_Table_Ptr1 = r35
GR_BIAS = r35
GR_Index1 = r36
GR_sign_mask = r36
GR_Index2 = r37
GR_Expo_X = r37
GR_signif_Z = r38
GR_M = r38
GR_X_0 = r39
GR_Mask = r39
GR_X_1 = r40
GR_W1_ptr = r40
GR_W2_ptr = r41
GR_X_2 = r41
GR_Z_1 = r42
GR_M2 = r42
GR_M1 = r43
GR_Z_2 = r43
GR_N = r44
GR_k = r44
GR_Big_Pos_Exp = r45
GR_BIAS_p_k = r47
GR_BIASed_exp_y = r47
GR_Big_Neg_Exp = r48
GR_Index3 = r48
GR_temp = r48
GR_vsm_expo = r49
GR_y_sign = r49
GR_T1_ptr = r50
GR_T2_ptr = r51
GR_N_fix = r52
GR_exp_y = r53
GR_signif_y = r54
GR_exp_and_sign_y = r55
GR_low_order_bit = r56
GR_get_exp_mask = r57
GR_exponent_zero = r58

// ** Registers for unwind support
GR_SAVE_PFS = r59
GR_SAVE_B0 = r60
GR_SAVE_GP = r61

// Output registers; the file header lists r62-r65 as the parameters to
// __libm_error_support.
GR_Parameter_X = r62
GR_Parameter_Y = r63
GR_Parameter_RESULT = r64
GR_Parameter_TAG = r65

// NOTE(review): FR_X is assigned f8 here and f10 a few lines further down.
// Presumably the later assignment is the one in effect wherever FR_X is
// used after this list -- confirm against the assembler's rules for
// re-assigning a symbol with "=", and check that any code using FR_X
// expects f10 (not the input register f8).
FR_X = f8
FR_Y = f9
FR_RESULT = f99

// **
// Floating-point names.  f8/f9 carry the inputs (FR_Input_X, FR_Input_Y);
// f8 also carries the return value (FR_Output).
FR_Input_X = f8
FR_Output = f8
FR_Input_Y = f9
FR_Neg = f10
FR_P_hi = f10
FR_X = f10
FR_Half = f11
FR_h_3 = f11
FR_poly_hi = f11
FR_Sgn = f12
FR_Neg_X = f13
FR_half_W = f13
FR_X_cor = f14
FR_P_lo = f14
FR_W = f15
FR_X_lo = f32
FR_S = f33
FR_W3 = f33
FR_Y_hi = f34
FR_logx_hi = f34
FR_Z = f35
FR_logx_lo = f35
FR_GS_hi = f35
FR_Y_lo = f35
FR_r_cor = f36
FR_Scale = f36
FR_G_1 = f37
FR_G = f37
FR_Wsq = f37
FR_L_Inv = f37
FR_temp = f37
FR_H_1 = f38
FR_H = f38
FR_W4 = f38
// NOTE(review): FR_float_N is assigned f38 here and f48 below.  f38 is
// also FR_H_1/FR_H, which powl loads and sums while FR_float_N holds N, so
// the code appears to rely on the later f48 assignment being the effective
// one -- confirm.
FR_float_N = f38
FR_h = f39
FR_h_1 = f39
FR_N = f39
FR_P_7 = f39
FR_G_2 = f40
// NOTE(review): FR_P_8 is assigned f40 here and f44 below; presumably the
// later assignment (f44) is the effective one -- confirm.
FR_P_8 = f40
FR_L_hi = f40
FR_H_2 = f41
FR_L_lo = f41
FR_A_1 = f41
FR_h_2 = f42
FR_P_6 = f42
FR_abs_W = f43
FR_W1 = f43
FR_G_3 = f44
FR_P_8 = f44
FR_T1 = f44
FR_log2_hi = f45
FR_W2 = f45
FR_GS_lo = f46
FR_T2 = f46
FR_W_1_p1 = f47
FR_H_3 = f47
FR_float_N = f48
FR_P_4 = f49
FR_A_2 = f49
FR_Q_4 = f50
FR_r4 = f50
FR_Q_3 = f51
FR_A_3 = f51
FR_Q_2 = f52
FR_P_2 = f52
FR_Q_1 = f53
FR_P_1 = f53
FR_T = f53
FR_Wp1 = f54
FR_Q_5 = f54
FR_P_3 = f54
FR_Q_6 = f55
FR_log2_lo = f56 FR_Two = f56 FR_Big = f57 FR_neg_2_mK = f58 FR_NBig = f58 FR_r = f59 FR_poly_lo = f60 FR_poly = f61 FR_P_5 = f62 FR_rsq = f63 FR_Result = f99 FR_Result_small = f100 FR_Result_big = f101 .section .text .proc powl# .global powl# .align 64 powl: { .mfi alloc GR_Expo_Range = ar.pfs,0,30,4,0 (p0) fclass.m.unc p7, p13 = FR_Input_Y, 0x1E7 nop.i 0 } { .mfi (p0) getf.exp GR_exp_and_sign_y = FR_Input_Y // // Save State // (p0) fclass.m.unc p6, p12 = FR_Input_X, 0x1E7 nop.i 0 };; { .mfi (p0) getf.sig GR_signif_y = FR_Input_Y (p0) fcmp.eq.unc.s1 p12, p13 = FR_Input_X, f1 nop.i 0 } { .mfi nop.m 999 // // Check for y = 1 // Identify EM unsupporteds. // Load FR_half = .5 // (p0) fadd.s1 FR_Two = f1, f1 // // Load 1/2 in GP register // nop.i 0 } ;; { .mmi nop.m 999 (p0) addl GR_Table_Ptr = @ltoff(Constant_half#), gp nop.i 999 } ;; { .mmi ld8 GR_Table_Ptr = [GR_Table_Ptr] nop.m 999 nop.i 999 } ;; { .mlx (p0) ldfe FR_Half =[GR_Table_Ptr],0 (p0) movl GR_get_exp_mask = 0x1FFFF ;; } { .mfi nop.m 999 (p0) fclass.nm.unc p9, p15 = FR_Input_Y, 0x1FF // // Create FR_Two = 2 // Get exp and significand of Y // Crate Masks // sgn = 1 // (p0) and GR_exp_y = GR_get_exp_mask,GR_exp_and_sign_y } { .mlx nop.m 999 (p0) movl GR_exponent_zero = 0xFFFF ;; } { .mfi nop.m 999 (p0) mov FR_Sgn = f1 nop.i 999 } { .mfi nop.m 999 (p0) fcmp.eq.unc.s1 p10, p11 = FR_Input_Y, f1 nop.i 999 ;; } { .mfb nop.m 999 // // Identify NatVals, NaNs, Infs, and Zeros. // Load Half // (p0) fclass.nm.unc p8, p14 = FR_Input_X, 0x1FF // // Remove sign bit from exponent of y. 
// Check for x = 1 // (p6) br.cond.spnt L(POWL_64_SPECIAL) ;; } { .mib nop.m 999 nop.i 999 (p7) br.cond.spnt L(POWL_64_SPECIAL) ;; } { .mib nop.m 999 nop.i 999 (p8) br.cond.spnt L(POWL_64_UNSUPPORT) ;; } { .mib nop.m 999 nop.i 999 (p9) br.cond.spnt L(POWL_64_UNSUPPORT) ;; } { .mfi (p0) cmp.lt.unc p9, p0 = GR_exp_y,GR_exponent_zero (p0) fcmp.lt.unc.s1 p6, p13 = FR_Input_X, f0 // // Branch on Infs, Nans, Zeros, and Natvals // Check to see that exponent < 0 // (p0) sub GR_exp_y = GR_exp_y,GR_exponent_zero } // x not zero, is y ==2? { .mfi nop.m 999 (p11) fcmp.eq.unc.s1 p7, p14 = FR_Input_Y, FR_Two nop.i 999 ;; } { .mfb nop.m 999 (p9) fcmp.lt.unc.s1 p9, p0 = FR_Input_X, f0 (p7) br.cond.spnt L(POWL_64_SQUARE) ;; // Branch if x not zero and y=2 } { .mfi nop.m 999 (p6) fmerge.ns FR_Neg_X = FR_Input_X, FR_Input_X nop.i 999 ;; } { .mfi nop.m 999 (p10) fmpy.s0 FR_Result = FR_Input_X, f1 // // For y = 1, compute result = x // For x = 1, compute 1 // When Y is one return X and possible raise // denormal operand exception. // Remove exponent BIAS // (p6) shl GR_exp_and_sign_y= GR_signif_y,GR_exp_y ;; } { .mfi (p9) or GR_exp_and_sign_y = 0xF,GR_signif_y (p12) fma.s0 FR_Result = FR_Input_Y, f0, f1 nop.i 999 ;; } { .mii nop.m 999 (p6) extr.u GR_exp_y = GR_exp_and_sign_y,63,1 ;; (p6) cmp.ne.unc p9, p0 = GR_exp_y, r0 } { .mii nop.m 999 // // Both predicates can be set. // Don't consider y's < 1. // (p6) shl GR_signif_y= GR_exp_and_sign_y,1 ;; // // Is shift off integer part of y. // Get y's even or odd bit. // (p6) cmp.ne.unc p8, p0 = GR_signif_y, r0 } { .mib nop.m 999 nop.i 999 // // Is the fractional part of the y = 0? // Is the integer even or odd. 
// (p10) br.cond.spnt L(POWL_64_RETURN) ;; } { .mib nop.m 999 nop.i 999 (p12) br.cond.spnt L(POWL_64_RETURN) ;; } { .mib nop.m 999 nop.i 999 (p8) br.cond.spnt L(POWL_64_XNEG) ;; } { .mfi nop.m 999 (p9) fmerge.ns FR_Sgn = FR_Sgn, FR_Sgn nop.i 999 } { .mfi nop.m 999 (p0) fcmp.eq.unc.s0 p11, p0 = FR_Input_Y, FR_Half nop.i 999 ;; } // // Raise possible denormal operand exception for both // X and Y. // { .mfb nop.m 999 // // Branch for (x < 0) and Y not an integer. // (p0) fcmp.eq.unc.s0 p12, p0 = FR_Input_X, f1 // // For x < 0 and y integer, make x positive // For x < 0 and y odd integer,, set sign = -1. // (p11) br.cond.spnt L(POWL_64_SQRT) ;; } { .mmf (p0) cmp.eq.unc p15, p14 = r0, r0 nop.m 999 (p13) fnorm.s1 FR_Z = FR_Input_X ;; } { .mfi nop.m 999 (p6) fnorm.s1 FR_Z = FR_Neg_X nop.i 999 } ;; // // Branch to embedded sqrt(x) // // // Computes ln( x ) to extra precision // Input FR 1: FR_X // Output FR 2: FR_Y_hi // Output FR 3: FR_Y_lo // Output PR 1: PR_Safe // { .mmi nop.m 999 (p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_Z_G_H_h1#), gp nop.i 999 } ;; { .mmi ld8 GR_Table_Ptr = [GR_Table_Ptr] nop.m 999 nop.i 999 } ;; { .mlx nop.m 999 (p0) movl GR_BIAS = 0x000000000000FFFF ;; } { .mfi nop.m 999 (p0) fsub.s1 FR_W = FR_Z, f1 nop.i 999 ;; } // // Z = Norm(X) - both + and - case // Set Safe = True // { .mmb (p0) getf.sig GR_signif_Z = FR_Z (p0) getf.exp GR_N = FR_Z nop.b 999 ;; } { .mii nop.m 999 // // Get significand of Z // W = Z - 1 // (p0) extr.u GR_Index1 = GR_signif_Z, 59, 4 ;; // // Index1 = High order 4 bits of Z // X_0 = High order 15 bit of Z // (p0) shl GR_Index1 = GR_Index1,5 ;; } { .mfi nop.m 999 // // Add offset to Index1 ptr. // (p0) fabs FR_abs_W = FR_W // // BIAS = 0x000...FFFF // Adjust Index1 ptr ( x 32) . 
// (p0) add GR_Index1 = GR_Index1,GR_Table_Ptr } { .mmi nop.m 999 ;; (p0) ld2 GR_Z_1 =[GR_Index1],4 (p0) extr.u GR_X_0 = GR_signif_Z, 49, 15 } ;; { .mmi nop.m 999 (p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_Z_G_H_h2#), gp nop.i 999 } ;; { .mmi ld8 GR_Table_Ptr = [GR_Table_Ptr] nop.m 999 nop.i 999 } ;; { .mmi (p0) ldfs FR_G_1 = [GR_Index1],4 ;; (p0) ldfs FR_H_1 = [GR_Index1],8 nop.i 999 ;; } // // Adjust Index2 (x 32). // { .mfi (p0) ldfe FR_h_1 = [GR_Index1],0 nop.f 999 (p0) pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 ;; } { .mmi nop.m 999 ;; // // load Z_1 from Index1 // abs_W = |W| // Point to Table2 // (p0) getf.exp GR_M = FR_abs_W // // M = M - BIAS // Load G_1 // N = exponent of Z // nop.i 999;; } { .mmi nop.m 999 nop.m 999 nop.i 999;; } { .mmi nop.m 999 nop.m 999 nop.i 999;; } { .mmi nop.m 999 nop.m 999 (p0) extr.u GR_Index2 = GR_X_1, 6, 4 ;; } { .mii nop.m 999 // // Extract Index2 // Load H_1 // Is -8 > M ? // (p0) shl GR_Index2=GR_Index2,5 ;; (p0) add GR_Index2 = GR_Index2, GR_Table_Ptr } // // M = exponent of abs_W // X_1 = X_0 * Z_1 // { .mii (p0) sub GR_M = GR_M, GR_BIAS nop.i 999 ;; (p0) cmp.gt.unc p7, p14 = -8, GR_M } { .mib nop.m 999 nop.i 999 (p7) br.cond.spnt L(LOGL80_NEAR) ;; } // // Load h_1 // Possible branch out. // Add offset of table to Index2 // { .mfi (p0) ld2 GR_Z_2 =[GR_Index2],4 (p0) fmerge.se FR_S = f1,FR_Z (p0) sub GR_N = GR_N, GR_BIAS } ;; { .mmi nop.m 999 (p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_h3_G_H#), gp nop.i 999 } ;; { .mmi ld8 GR_Table_Ptr = [GR_Table_Ptr] nop.m 999 nop.i 999 } ;; // // load Z_2 // N - BIAS // Point to Table 3. // S = merging of Z and 1.0 // { .mmi (p0) ldfs FR_G_2 = [GR_Index2],4 (p0) setf.sig FR_float_N = GR_N (p0) add GR_Table_Ptr1 = 0x200,GR_Table_Ptr ;; } // // load G_2 // X_2 = X_1 * Z_2 // Add offset to Table 2 ptr. 
// float_N = significand of N // { .mmi (p0) ldfs FR_H_2 = [GR_Index2],8 ;; // // load H_2 // G = G * G_2 // (p0) ldfe FR_h_2 = [GR_Index2],0 (p0) pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 ;; } { .mmi nop.m 999 nop.m 999 nop.i 999;; } { .mmi nop.m 999 nop.m 999 nop.i 999;; } { .mmi nop.m 999 nop.m 999 nop.i 999;; } { .mii nop.m 999 nop.i 999 ;; (p0) extr.u GR_Index3 = GR_X_2, 1, 5 ;; } { .mfi (p0) shladd GR_Table_Ptr1 = GR_Index3,2,GR_Table_Ptr1 nop.f 999 // // h = h_1 + h_2 // Adjust Index3 // (p0) shladd GR_Index3 = GR_Index3,4,GR_Table_Ptr ;; } { .mmb nop.m 999 (p0) ldfe FR_h_3 = [GR_Index3],12 nop.b 999 ;; } { .mmf (p0) ldfs FR_H_3 = [GR_Table_Ptr1],0 // // float_N = Make N a fp number // Load h_3 // Get pointer to Q table. // (p0) ldfs FR_G_3 = [GR_Index3],0 (p0) fmpy.s1 FR_G = FR_G_1, FR_G_2 } ;; { .mmi nop.m 999 (p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_Q#), gp nop.i 999 } ;; { .mmi ld8 GR_Table_Ptr = [GR_Table_Ptr] nop.m 999 nop.i 999 } ;; { .mfi (p0) ldfe FR_log2_hi = [GR_Table_Ptr],16 (p0) fadd.s1 FR_H = FR_H_1, FR_H_2 nop.i 999 ;; } { .mmf nop.m 999 // // G = G_1 * G_2 * G_3 // (p0) ldfe FR_log2_lo = [GR_Table_Ptr],16 // // load h_2 // H = H_1 + H_2 // Get Index3 // (p0) fadd.s1 FR_h = FR_h_1, FR_h_2 ;; } // // Load log2_lo part // r = G*S -1 // { .mfi (p0) ldfe FR_Q_6 = [GR_Table_Ptr],16 // // Load H_3 // (p0) fcvt.xf FR_float_N = FR_float_N nop.i 999 ;; } // // Load Q_6 // { .mmi (p0) ldfe FR_Q_5 = [GR_Table_Ptr],16 ;; (p0) ldfe FR_Q_4 = [GR_Table_Ptr],16 nop.i 999 ;; } { .mmi (p0) ldfe FR_Q_3 = [GR_Table_Ptr],16 ;; (p0) ldfe FR_Q_2 = [GR_Table_Ptr],16 nop.i 999 ;; } { .mmf nop.m 999 // // poly_lo = Q_5 + r * Q_6 // Load Q_2 // rsq = r * r // (p0) ldfe FR_Q_1 = [GR_Table_Ptr],16 // // h = h_1 + h_2 + h_3 // H = H_1 + H_2 + H_3 // Load G_3. 
// Begin Loading Q's - load log2_hi part // (p0) fmpy.s1 FR_G = FR_G, FR_G_3 } { .mfi nop.m 999 (p0) fadd.s1 FR_H = FR_H, FR_H_3 nop.i 999 } ;; // // Y_lo = poly + Y_lo // { .mmi nop.m 999 (p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_Arg#), gp nop.i 999 } ;; { .mmi ld8 GR_Table_Ptr = [GR_Table_Ptr] nop.m 999 nop.i 999 } ;; { .mfi nop.m 999 (p0) fadd.s1 FR_h = FR_h, FR_h_3 nop.i 999 ;; } { .mfi nop.m 999 // // Load Q_5 // (p0) fmpy.s1 FR_GS_hi = FR_G, FR_S nop.i 999 } { .mfi nop.m 999 (p0) fms.s1 FR_r = FR_G, FR_S, f1 nop.i 999 ;; } { .mfi nop.m 999 (p0) fma.s1 FR_poly_lo = FR_r, FR_Q_6, FR_Q_5 nop.i 999 } { .mfi nop.m 999 // // GS_hi = G*S // Load Q_4 // (p0) fsub.s1 FR_r_cor = FR_GS_hi, f1 nop.i 999 ;; } { .mfi nop.m 999 (p0) fms.s1 FR_GS_lo = FR_G, FR_S, FR_GS_hi nop.i 999 } { .mfi nop.m 999 (p0) fma.s1 FR_poly = FR_r, FR_Q_2, FR_Q_1 nop.i 999 ;; } { .mfi nop.m 999 // // Load Q_3 // r_cor = GS_hi -1 // GS_lo = G*S - GS_hi // (p0) fmpy.s1 FR_rsq = FR_r, FR_r nop.i 999 } { .mfi nop.m 999 (p0) fma.s1 FR_G = FR_float_N, FR_log2_hi, FR_H nop.i 999 ;; } { .mfi nop.m 999 // // poly = poly_hi + rsq * poly_lo // Tbl = float_N*log2_hi + H // (p0) fma.s1 FR_Y_lo = FR_float_N, FR_log2_lo, FR_h nop.i 999 ;; } { .mfi nop.m 999 // // r_cor = r_cor - r // poly_hi = r * Q_2 + Q_1 // (p0) fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_4 nop.i 999 } { .mfi nop.m 999 // // Load Q_1 // (p0) fsub.s1 FR_r_cor = FR_r_cor, FR_r nop.i 999 ;; } { .mfi nop.m 999 // // Y_lo = float_N*log2_lo + h // (p0) fadd.s1 FR_Y_hi = FR_G, FR_r nop.i 999 ;; } { .mfi nop.m 999 // // poly_lo = Q_4 + r * poly_lo;; // r_cor = r_cor + GS_lo;; // (p0) fma.s1 FR_poly_lo = FR_r, FR_poly_lo, FR_Q_3 nop.i 999 } { .mfi nop.m 999 (p0) fadd.s1 FR_r_cor = FR_r_cor, FR_GS_lo nop.i 999 ;; } { .mfi nop.m 999 (p0) fadd.s1 FR_r_cor = FR_r_cor, FR_Y_lo nop.i 999 } { .mfi nop.m 999 // // poly_lo = Q_3 + r * poly_lo;; // (p0) fma.s1 FR_poly = FR_rsq, FR_poly_lo, FR_poly nop.i 999 ;; } { .mfi nop.m 999 (p0) fsub.s1 FR_Y_lo = 
FR_G, FR_Y_hi nop.i 999 } { .mmi (p0) ldfe FR_L_Inv = [GR_Table_Ptr],16 ;; (p0) ldfe FR_L_hi = [GR_Table_Ptr],16 nop.i 999 ;; } { .mfi (p0) ldfe FR_L_lo = [GR_Table_Ptr],16 nop.f 999 nop.i 999 ;; } { .mfi nop.m 999 // // Y_hi = Tbl + r // r_cor = r_cor + Y_lo // (p0) fma.s1 FR_poly = FR_rsq, FR_poly, FR_r_cor nop.i 999 ;; } { .mfi nop.m 999 // Y_lo = Tbl - Y_hi // poly = rsq * poly + r_cor // (p0) fadd.s1 FR_Y_lo = FR_Y_lo, FR_r nop.i 999 ;; } { .mfb nop.m 999 // // Y_lo = Y_lo + r // (p0) fadd.s1 FR_Y_lo = FR_Y_lo, FR_poly // // Load L_Inv // Load L_hi // Load L_lo // all long before they are needed. // They are used in LOGL_RETURN PATH // br.cond.sptk L(LOGL_RETURN) ;; } L(LOGL80_NEAR): // // Branch LOGL80_NEAR // { .mmi nop.m 999 (p0) addl GR_Table_Ptr = @ltoff(Constants_log_80_P#), gp nop.i 999 } ;; { .mmi ld8 GR_Table_Ptr = [GR_Table_Ptr] nop.m 999 nop.i 999 } ;; { .mfi nop.m 999 (p0) fmpy.s1 FR_Wsq = FR_W, FR_W (p0) add GR_Table_Ptr1 = 0x50,GR_Table_Ptr } // // Adjust ptr to 1/2 // Adjust Ptr1 to P_4 // { .mmi (p0) ldfe FR_Half = [GR_Table_Ptr],16 ;; (p0) ldfe FR_P_4 = [GR_Table_Ptr1],16 nop.i 999 } // // Load 1/2 // { .mmi (p0) ldfe FR_P_8 = [GR_Table_Ptr],16 ;; (p0) ldfe FR_P_3 = [GR_Table_Ptr1],16 nop.i 999 } { .mmi (p0) ldfe FR_P_7 = [GR_Table_Ptr],16 ;; (p0) ldfe FR_P_2 = [GR_Table_Ptr1],16 nop.i 999 } // // Load P_7 // half_W = .5 * W // Load P_3 // { .mmi (p0) ldfe FR_P_6 = [GR_Table_Ptr],16 ;; (p0) ldfe FR_P_1 = [GR_Table_Ptr1],16 nop.i 999 ;; } // // Load P_6 // Wsq = w * w // poly = w*P_4 + P_3 // Load P_2 // { .mfi (p0) ldfe FR_P_5 = [GR_Table_Ptr],16 // // Load P_5 // poly_lo = w * P_8 + P_7 // Y_hi = w - (1/2)w*w // Load P_1 // (p0) fmpy.s1 FR_W4 = FR_Wsq, FR_Wsq nop.i 999 } { .mfi nop.m 999 (p0) fmpy.s1 FR_W3 = FR_Wsq, FR_W nop.i 999 } ;; // // Y_lo = W3 * poly + Y_lo // { .mmi nop.m 999 (p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_Arg#), gp nop.i 999 } ;; { .mmi ld8 GR_Table_Ptr = [GR_Table_Ptr] nop.m 999 nop.i 999 } ;; { .mmi (p0) ldfe 
FR_L_Inv = [GR_Table_Ptr],16 ;; (p0) ldfe FR_L_hi = [GR_Table_Ptr],16 nop.i 999 ;; } { .mfi (p0) ldfe FR_L_lo = [GR_Table_Ptr],16 // // Load P_8 // Load P_4 // (p0) fmpy.s1 FR_half_W = FR_Half, FR_W nop.i 999 ;; } { .mfi nop.m 999 (p0) fma.s1 FR_poly_lo = FR_W, FR_P_8,FR_P_7 nop.i 999 } { .mfi nop.m 999 (p0) fma.s1 FR_poly = FR_W, FR_P_4, FR_P_3 nop.i 999 ;; } { .mfi nop.m 999 (p0) fnma.s1 FR_Y_hi = FR_W, FR_half_W, FR_W nop.i 999 ;; } { .mfi nop.m 999 // // W4 = Wsq * Wsq // poly = w *poly + P_2 // (p0) fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_6 nop.i 999 } { .mfi nop.m 999 (p0) fma.s1 FR_poly = FR_W, FR_poly, FR_P_2 nop.i 999 ;; } { .mfi nop.m 999 (p0) fsub.s1 FR_Y_lo = FR_W, FR_Y_hi nop.i 999 ;; } { .mfi nop.m 999 // // poly = w * poly + P_1 // w3 = wsq * w // (p0) fma.s1 FR_poly_lo = FR_W, FR_poly_lo, FR_P_5 nop.i 999 } { .mfi nop.m 999 // // poly_lo = w * poly_lo + P_6 // Y_lo = W - Y_hi // (p0) fma.s1 FR_poly = FR_W, FR_poly, FR_P_1 nop.i 999 ;; } { .mfi nop.m 999 (p0) fnma.s1 FR_Y_lo = FR_W, FR_half_W, FR_Y_lo nop.i 999 ;; } { .mfi nop.m 999 // // poly_lo = w * poly_lo + // Y_lo = Y_lo - w * (1/2)w // (p0) fma.s1 FR_poly = FR_poly_lo, FR_W4, FR_poly nop.i 999 ;; } { .mfi nop.m 999 // // Y_lo = (W-Y_hi) - w * (1/2)w // poly = W4* poly_lo + poly // (p0) fma.s1 FR_Y_lo = FR_poly, FR_W3, FR_Y_lo nop.i 999 ;; } L(LOGL_RETURN): { .mfi (p0) add GR_Expo_Range = 0x2,r0 // // Load L_Inv // Load L_hi // Load L_lo // all long before they are needed. // // // kernel_log_80 computed ln(X) // and return logX_hi and logX_lo as results. // PR_pow_Safe set as well. 
// (p0) fmpy.s1 FR_X_lo = FR_Input_Y, FR_logx_lo // // Compute Y * (logX_hi + logX_lo) // P_hi -> X // P_lo -> X_cor // (Manipulate names so that inputs are in // the place kernel_exp expects them) // Set GR_Flag to 2 // Set GR_Expo_Range to Double // // This function computes exp( x + x_cor) // Input FR 1: FR_X // Input FR 2: FR_X_cor // Input GR 1: GR_Flag // Input GR 2: GR_Expo_Range // Output FR 3: FR_Y_hi // Output FR 4: FR_Y_lo // Output FR 5: FR_Scale // Output PR 1: PR_Safe // (p0) cmp.eq.unc p15, p0 = r0, r0 } ;; { .mmi (p0) addl GR_W1_ptr = @ltoff(Constants_exp_64_W1#), gp (p0) addl GR_W2_ptr = @ltoff(Constants_exp_64_W2#), gp (p0) add GR_Flag = 0x2,r0 } ;; { .mmi ld8 GR_W1_ptr = [GR_W1_ptr] ld8 GR_W2_ptr = [GR_W2_ptr] (p0) cmp.ne.unc p7, p0 = 0x1, GR_Flag } ;; { .mlx nop.m 999 (p0) movl GR_Mask = 0x1FFFF ;; } { .mlx nop.m 999 (p0) movl GR_BIAS = 0x0FFFF ;; } { .mfi nop.m 999 // // X_lo = Y * logX_lo // (p0) fma.s1 FR_P_hi = FR_Input_Y, FR_logx_hi,FR_X_lo nop.i 999 ;; } { .mfi nop.m 999 // // Set Safe=True // Flag is always 2 for this routine // (p0) fmpy.s1 FR_float_N = FR_X, FR_L_Inv nop.i 999 } { .mfi nop.m 999 // // X_hi = Y * logX_hi + X_lo // Set GR_Flag = 2 for exp(x + xcor) // (p0) fms.s1 FR_P_lo= FR_Input_Y, FR_logx_hi, FR_P_hi nop.i 999 ;; } { .mmi nop.m 999 ;; (p0) getf.exp GR_Expo_X = FR_X nop.i 999 ;; } { .mfi (p0) and GR_Expo_X = GR_Expo_X, GR_Mask // // Calculate unBIASed exponent of X // Point to Table of W1s // Point to Table of W2s // (p0) fcvt.fx.s1 FR_N = FR_float_N nop.i 999 ;; } { .mfi nop.m 999 (p0) fadd.s1 FR_P_lo = FR_P_lo, FR_X_lo // // Float_N = X * L_Inv // Create exponent BIAS // Get BIASed exponent of X // (p0) sub GR_Expo_X = GR_Expo_X, GR_BIAS ;; } { .mib (p0) cmp.gt.unc p9, p0 = -6, GR_Expo_X nop.i 999 // // N = fcvt.fx(float_N) // If -6 > Expo_X, set P9 // (p9) br.cond.spnt L(EXPL_SMALL) } ;; // // If expo_X < -6 goto exp_small // { .mmi nop.m 999 (p0) addl GR_T1_ptr = @ltoff(Constants_exp_64_T1#), gp (p0) cmp.lt.unc p10, 
p0 = 14, GR_Expo_X } ;; { .mmi ld8 GR_T1_ptr = [GR_T1_ptr] nop.m 999 nop.i 999 } ;; { .mib nop.m 999 nop.i 999 // // If 14 < Expo_X, set P10 // Create pointer to T1 table // (p10) br.cond.spnt L(EXPL_HUGE) ;; } { .mmi (p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_Exponents#), gp (p0) addl GR_T2_ptr = @ltoff(Constants_exp_64_T2#), gp nop.i 999 } ;; { .mmi ld8 GR_Table_Ptr = [GR_Table_Ptr] ld8 GR_T2_ptr = [GR_T2_ptr] nop.i 999 } ;; { .mmi (p0) shladd GR_Table_Ptr = GR_Expo_Range,4,GR_Table_Ptr ;; // // Adjust T1_ptr by x 4 for single-precision values // Adjust T2_ptr by x 4 for single-precision values // (p0) ld8 GR_Big_Pos_Exp = [GR_Table_Ptr],8 nop.i 999 ;; } // // Load double W1 // Load +max exponent // { .mfi (p0) ld8 GR_Big_Neg_Exp = [GR_Table_Ptr],0 // // If 14 < Expo_X, goto exp_huge // (p0) fcvt.xf FR_float_N = FR_N nop.i 999 } ;; // // Load double W2 // Load -max exponent // Load ptr to A's // { .mmi (p0) getf.sig GR_N_fix = FR_N (p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_A#), gp nop.i 999 } ;; { .mmi ld8 GR_Table_Ptr = [GR_Table_Ptr] nop.m 999 nop.i 999 } ;; // // Load single T1 // Load single T2 // W_1_p1 = W_1 + 1 // { .mmi (p0) ldfe FR_A_3 = [GR_Table_Ptr],16 ;; // // Load A_3 // if k > big_pos_exp, set p14 and Safe=False // (p0) ldfe FR_A_2 = [GR_Table_Ptr],16 (p0) extr.u GR_M1 = GR_N_fix, 6, 6 } { .mmi nop.m 999 ;; (p0) shladd GR_W1_ptr = GR_M1,3,GR_W1_ptr // // float_N = fcvt.xf(N) // N_fix = significand of N // Create pointer to T2 table // (p0) extr.u GR_M2 = GR_N_fix, 0, 6 } // // r = r + X_cor // Adjust W1_ptr by x 8 for double-precision values // Adjust W2_ptr by x 8 for double-precision values // Adjust Table_ptr by Expo_Rangex16 // { .mmi (p0) shladd GR_T1_ptr = GR_M1,2,GR_T1_ptr ;; (p0) ldfd FR_W1 = [GR_W1_ptr],0 (p0) shladd GR_W2_ptr = GR_M2,3,GR_W2_ptr } // // Load ptr to A's // { .mfi (p0) ldfs FR_T1 = [GR_T1_ptr],0 (p0) fnma.s1 FR_r = FR_L_hi, FR_float_N, FR_X (p0) shladd GR_T2_ptr = GR_M2,2,GR_T2_ptr ;; } { .mmi (p0) ldfd FR_W2 = 
[GR_W2_ptr],0 (p0) ldfs FR_T2 = [GR_T2_ptr],0 // // r = x - L_hi * float_N // M2 = extr.u(N_fix,0,6) // M1 = extr.u(N_fix,6,6) // (p0) extr GR_k = GR_N_fix, 12, 52 ;; } // // Load A_1 // poly = A_3 * r + A_2 // rsq = r*r // { .mii (p0) add GR_BIAS_p_k = GR_BIAS, GR_k (p0) cmp.gt.unc p14,p15 = GR_k,GR_Big_Pos_Exp ;; (p15) cmp.lt p14,p15 = GR_k,GR_Big_Neg_Exp } // // BIAS_p_K = BIAS + k // T = T1 * T2 // { .mfi (p0) setf.exp FR_Scale = GR_BIAS_p_k nop.f 999 nop.i 999 ;; } { .mfi nop.m 999 (p0) fnma.s1 FR_r = FR_L_lo, FR_float_N, FR_r nop.i 999 } // // W = W_1_p1 * W2 + W1 // { .mfi (p0) ldfe FR_A_1 = [GR_Table_Ptr],16 nop.f 999 nop.i 999 ;; } { .mfi nop.m 999 (p0) fadd.s1 FR_W_1_p1 = FR_W1, f1 nop.i 999 ;; } { .mfi nop.m 999 // // k = extr.u(N_fix,0,6) // r = r - N * L_lo // Load ptr to Table of exponent thresholds. // (p0) fadd.s1 FR_r = FR_r, FR_X_cor nop.i 999 } { .mfi nop.m 999 (p0) fmpy.s1 FR_T = FR_T1, FR_T2 nop.i 999 ;; } { .mfi nop.m 999 // // if k < big_neg_exp, set p14 and Safe=False // Load A_2 // (p0) fma.s1 FR_W = FR_W2, FR_W_1_p1, FR_W1 nop.i 999 ;; } { .mfi nop.m 999 (p0) fma.s1 FR_poly = FR_r, FR_A_3, FR_A_2 nop.i 999 } { .mfi nop.m 999 (p0) fmpy.s1 FR_rsq = FR_r, FR_r nop.i 999 ;; } { .mfi nop.m 999 (p0) mov FR_Y_hi = FR_T nop.i 999 ;; } { .mfi nop.m 999 // // Scale = set_exp(BIAS_p_k) // poly = r * poly + A_1 // (p0) fadd.s1 FR_Wp1 = FR_W, f1 nop.i 999 ;; } { .mfi nop.m 999 (p0) fma.s1 FR_poly = FR_r, FR_poly, FR_A_1 nop.i 999 ;; } { .mfi nop.m 999 (p0) fma.s1 FR_poly = FR_rsq, FR_poly,FR_r nop.i 999 ;; } { .mfi nop.m 999 // // Wp1 = W + 1 // poly = rsq * poly + rk // (p0) fma.s1 FR_Y_lo = FR_Wp1, FR_poly, FR_W nop.i 999 ;; } { .mfb nop.m 999 // // Y_lo = poly * Wp1 + W // Y_hi = T // (p0) fmpy.s1 FR_Y_lo = FR_Y_lo, FR_T // // Y_lo = T * Y_lo // (p0) br.cond.sptk L(EXPL_RETURN) ;; } L(EXPL_SMALL): // // r4 = rsq * rsq // { .mmi nop.m 999 (p0) addl GR_Table_Ptr1 = @ltoff(Constants_exp_64_P), gp nop.i 999 } ;; { .mmi ld8 GR_Table_Ptr1 = 
[GR_Table_Ptr1] nop.m 999 nop.i 999 } ;; { .mmf nop.m 999 (p0) ldfe FR_P_6 = [GR_Table_Ptr1],16 // // Return // (p0) fadd.s1 FR_r = FR_X,f0 ;; } { .mmi nop.m 999 (p0) addl GR_Table_Ptr = @ltoff(Constants_exp_64_Exponents#), gp nop.i 999 } ;; { .mmi ld8 GR_Table_Ptr = [GR_Table_Ptr] (p0) ldfe FR_P_5 = [GR_Table_Ptr1],16 nop.i 999 } ;; // // Is input very small? // Load P_5 // { .mii (p0) ldfe FR_P_4 = [GR_Table_Ptr1],16 (p0) add GR_Table_Ptr = 0x040,GR_Table_Ptr ;; (p0) shladd GR_Table_Ptr = GR_Expo_Range,3,GR_Table_Ptr ;; } { .mmb (p0) ldfe FR_P_3 = [GR_Table_Ptr1],16 // // Adjust ptr. // (p0) ld8 GR_vsm_expo = [GR_Table_Ptr],0 nop.b 999 ;; } { .mfi nop.m 999 // // r = X (don't seem to need X_Cor) // Load the threshold exponents // (p0) fmpy.s1 FR_rsq = FR_r, FR_r nop.i 999 ;; } // // Load the negative integer // Load P_5 // { .mfi (p0) cmp.lt.unc p12, p0 = GR_Expo_X, GR_vsm_expo nop.f 999 nop.i 999 ;; } { .mfb nop.m 999 // // rsq = r * r // Offset into exponents // (p0) fmpy.s1 FR_r4 = FR_rsq, FR_rsq (p12) br.cond.spnt L(EXPL_VERY_SMALL) ;; } { .mfi (p0) ldfe FR_P_2 = [GR_Table_Ptr1],16 // // Load p4,p3,p2,p1 // (p0) fma.s1 FR_poly_lo = FR_P_6, FR_r, FR_P_5 // // Y_lo = r4 * poly_lo + poly_hi // Scale = 1.0 // (p0) add GR_temp = 0x1,r0 ;; } { .mmf nop.m 999 (p0) ldfe FR_P_1 = [GR_Table_Ptr1],0 (p0) mov FR_Scale = f1 } // // Begin creating lsb to perturb final result // { .mfi (p0) setf.sig FR_temp = GR_temp (p0) mov FR_Y_hi = f1 nop.i 999 ;; } { .mfi nop.m 999 // // poly_lo = p_5 + p_6 * r // poly_hi = p_1 + p_2 * r // (p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_P_4 nop.i 999 ;; } { .mfi nop.m 999 // // poly_lo = p_4 + poly_lo * r // poly_hi = r + poly_hi * rsq // (p0) fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_P_3 nop.i 999 } { .mfi nop.m 999 (p0) fma.s1 FR_poly_hi = FR_P_2, FR_r, FR_P_1 nop.i 999 ;; } { .mfi nop.m 999 (p0) fma.s1 FR_poly_hi = FR_poly_hi, FR_rsq, FR_r nop.i 999 ;; } { .mfi nop.m 999 // // poly_lo = p_3 + poly_lo * r // Y_hi = 1, always // (p0) 
fma.s1 FR_Y_lo = FR_poly_lo, FR_r4, FR_poly_hi nop.i 999 ;; } { .mfi nop.m 999 // // Set lsb in fp register // (p0) for FR_temp = FR_Y_lo,FR_temp nop.i 999 ;; } { .mfb nop.m 999 // // Toggle on last bit of Y_lo // (p0) fmerge.se FR_Y_lo = FR_Y_lo,FR_temp // // Set lsb of Y_lo to 1 // (p0) br.cond.sptk L(EXPL_RETURN) ;; } L(EXPL_VERY_SMALL): { .mfi nop.m 999 (p0) mov FR_Y_lo = FR_r (p0) cmp.eq.unc p15, p0 = r0, r0 } { .mfi nop.m 999 (p0) mov FR_Scale = f1 nop.i 999 };; { .mfb nop.m 999 (p0) mov FR_Y_hi = f1 // // If flag_not_1, // Y_hi = 1.0 // Y_lo = X + X_cor // PR_Safe = true // (p0) br.cond.sptk L(EXPL_RETURN) ;; } L(EXPL_HUGE): { .mfi nop.m 999 // // Return for flag=2 // (p0) fcmp.gt.unc.s1 p12, p13 = FR_X, f0 (p0) cmp.eq.unc p14, p15 = r0, r0 ;; } { .mlx nop.m 999 // // Set Safe to false // Is x > 0 // (p12) movl GR_Mask = 0x15DC0 ;; } { .mlx (p12) setf.exp FR_Y_hi = GR_Mask (p13) movl GR_Mask = 0xA240 ;; } { .mlx (p13) setf.exp FR_Y_hi = GR_Mask // // x > 0: Create mask for Y_hi = 2**(24,000) // x <= 0: Create mask for Y_hi = 2**(-24,000) // (p13) movl GR_temp = 0xA1DC ;; } { .mfi (p13) setf.exp FR_Y_lo = GR_temp // // x < =0: Create mask for 2**(-24,100) // x <= 0: Y_lo = w**(-24,100) // (p12) mov FR_Y_lo = f1 nop.i 999 ;; } { .mfi nop.m 999 (p12) mov FR_Scale = FR_Y_hi nop.i 999 ;; } { .mfi nop.m 999 // // x > 0: Y_lo = 1.0 // x > 0: Scale = 2**(24,000) // (p13) mov FR_Scale = FR_Y_hi nop.i 999 ;; } L(EXPL_RETURN): { .mfi nop.m 999 // // Scale = 2**(24,000) // // // exp(y *ln(x)) almost complete // FR_Scale is Scale // f34 is Z_hi // f35 is Z_lo // (p0) fmpy.s1 FR_Sgn = FR_Scale, FR_Sgn nop.i 999 ;; } { .mfi nop.m 999 // // sgn * scale // (p0) fmpy.s1 FR_Y_lo = FR_Y_lo,FR_Sgn nop.i 999 ;; } { .mfb nop.m 999 // // Z_lo * (sgn * scale) // (p0) fma.s0 FR_Result = FR_Y_hi, FR_Sgn, FR_Y_lo // // Z_hi * (sgn * scale) + Z_lo // (p15) br.cond.sptk L(POWL_64_RETURN) ;; } { .mfi nop.m 999 (p0) fsetc.s3 0x7F,0x01 nop.i 999 } { .mlx nop.m 999 // // Z_hi * (sgn * scale) 
+ Z_lo with wre & td // Z_hi * (sgn * scale) + Z_lo with fz & td // (p0) movl GR_T1_ptr = 0x00000000013FFF ;; } { .mfi nop.m 999 (p0) fma.s3 FR_Result_small = FR_Y_hi, FR_Sgn, FR_Y_lo nop.i 999 } { .mfi nop.m 999 (p0) fsetc.s3 0x7F,0x40 nop.i 999 ;; } { .mfi nop.m 999 // // Return if no danger of over of underflow. // (p0) fsetc.s2 0x7F,0x42 nop.i 999;; } { .mfi nop.m 999 // // S0 user supplied status // S2 user supplied status + WRE + TD (Overflows) // S3 user supplied status + FZ + TD (Underflows) // (p0) fma.s2 FR_Result_big = FR_Y_hi, FR_Sgn, FR_Y_lo nop.i 999 ;; } // // S0 user supplied status // S2 user supplied status + WRE + TD (Overflows) // S3 user supplied status + FZ + TD (Underflows) // // // If (Safe) is true, then // Compute result using user supplied status field. // No overflow or underflow here, but perhaps inexact. // Return // Else // Determine if overflow or underflow was raised. // Fetch +/- overflow threshold for IEEE single, double, // double extended // { .mfi (p0) setf.exp FR_Big = GR_T1_ptr (p0) fsetc.s2 0x7F,0x40 nop.i 999 ;; } { .mfi nop.m 999 (p0) fclass.m.unc p11, p0 = FR_Result_small, 0x00F nop.i 999 ;; } { .mfi nop.m 999 (p0) fmerge.ns FR_NBig = FR_Big, FR_Big nop.i 999 } { .mfi nop.m 999 // // Create largest double exponent + 1. // Create smallest double exponent - 1. 
// Identify denormals // (p0) fcmp.ge.unc.s1 p8, p0 = FR_Result_big , FR_Big nop.i 999 ;; } { .mii nop.m 999 nop.i 999 ;; // // fcmp: resultS2 <= - overflow threshold // fclass: resultS3 is denorm/unorm/0 // (p8) mov GR_Parameter_TAG = 18 ;; } { .mfb nop.m 999 // // fcmp: resultS2 >= + overflow threshold // (p0) fcmp.le.unc.s1 p9, p0 = FR_Result_big, FR_NBig (p8) br.cond.spnt __libm_error_region ;; } { .mii nop.m 999 nop.i 999 ;; (p9) mov GR_Parameter_TAG = 18 } { .mib nop.m 999 nop.i 999 (p9) br.cond.spnt __libm_error_region ;; } // // Report that pow overflowed - either +Inf, or -Inf // { .mmb (p11) mov GR_Parameter_TAG = 19 nop.m 999 (p11) br.cond.spnt __libm_error_region ;; } { .mib nop.m 999 nop.i 999 // // Report that pow underflowed // (p0) br.cond.sptk L(POWL_64_RETURN) ;; } L(POWL_64_SQUARE): // Here if x not zero and y=2. // Must call __libm_error_support for overflow or underflow // // S0 user supplied status // S2 user supplied status + WRE + TD (Overflows) // S3 user supplied status + FZ + TD (Underflows) // { .mfi nop.m 999 (p0) fma.s0 FR_Result = FR_Input_X, FR_Input_X, f0 nop.i 999 } { .mfi nop.m 999 (p0) fsetc.s3 0x7F,0x01 nop.i 999 } { .mlx nop.m 999 (p0) movl GR_T1_ptr = 0x00000000013FFF ;; } { .mfi nop.m 999 (p0) fma.s3 FR_Result_small = FR_Input_X, FR_Input_X, f0 nop.i 999 } { .mfi nop.m 999 (p0) fsetc.s3 0x7F,0x40 nop.i 999 ;; } { .mfi nop.m 999 // // Return if no danger of over of underflow. // (p0) fsetc.s2 0x7F,0x42 nop.i 999;; } { .mfi nop.m 999 (p0) fma.s2 FR_Result_big = FR_Input_X, FR_Input_X, f0 nop.i 999 ;; } // // S0 user supplied status // S2 user supplied status + WRE + TD (Overflows) // S3 user supplied status + FZ + TD (Underflows) // // // If (Safe) is true, then // Compute result using user supplied status field. // No overflow or underflow here, but perhaps inexact. // Return // Else // Determine if overflow or underflow was raised. 
// Fetch +/- overflow threshold for IEEE single, double, // double extended // { .mfi (p0) setf.exp FR_Big = GR_T1_ptr (p0) fsetc.s2 0x7F,0x40 nop.i 999 ;; } { .mfi nop.m 999 (p0) fclass.m.unc p11, p0 = FR_Result_small, 0x00F nop.i 999 ;; } { .mfi nop.m 999 (p0) fmerge.ns FR_NBig = FR_Big, FR_Big nop.i 999 } { .mfi nop.m 999 // // Create largest double exponent + 1. // Create smallest double exponent - 1. // Identify denormals // (p0) fcmp.ge.unc.s1 p8, p0 = FR_Result_big , FR_Big nop.i 999 ;; } { .mii nop.m 999 nop.i 999 ;; // // fcmp: resultS2 <= - overflow threshold // fclass: resultS3 is denorm/unorm/0 // (p8) mov GR_Parameter_TAG = 18 ;; } { .mfb nop.m 999 // // fcmp: resultS2 >= + overflow threshold // (p0) fcmp.le.unc.s1 p9, p0 = FR_Result_big, FR_NBig (p8) br.cond.spnt __libm_error_region ;; } { .mii nop.m 999 nop.i 999 ;; (p9) mov GR_Parameter_TAG = 18 } { .mib nop.m 999 nop.i 999 (p9) br.cond.spnt __libm_error_region ;; } // // Report that pow overflowed - either +Inf, or -Inf // { .mmb (p11) mov GR_Parameter_TAG = 19 nop.m 999 (p11) br.cond.spnt __libm_error_region ;; } { .mib nop.m 999 nop.i 999 // // Report that pow underflowed // (p0) br.cond.sptk L(POWL_64_RETURN) ;; } L(POWL_64_SPECIAL): { .mfi nop.m 999 (p0) fcmp.eq.s1 p15, p0 = FR_Input_X, f1 // Is x=+1 nop.i 999 ;; } { .mfi nop.m 999 (p0) fclass.m.unc p14, p0 = FR_Input_Y, 0x023 nop.i 999 ;; } { .mfi nop.m 999 (p15) fcmp.eq.unc.s0 p6,p0 = FR_Input_Y, f0 // If x=1, flag invalid if y=SNaN nop.i 999 } { .mfb nop.m 999 (p15) fmpy.s0 FR_Result = f1,f1 // If x=1, result=1 (p15) br.cond.spnt L(POWL_64_RETURN) ;; // Exit if x=1 } { .mfi nop.m 999 (p0) fclass.m.unc p13, p0 = FR_Input_X, 0x023 nop.i 999 ;; } { .mfi nop.m 999 (p0) fclass.m.unc p8, p0 = FR_Input_X, 0x143 nop.i 999 } { .mfi nop.m 999 (p0) fclass.m.unc p9, p0 = FR_Input_Y, 0x143 nop.i 999 ;; } { .mfi nop.m 999 (p0) fclass.m.unc p10, p0 = FR_Input_X, 0x083 nop.i 999 } { .mfi nop.m 999 (p0) fclass.m.unc p11, p0 = FR_Input_Y, 0x083 nop.i 999 ;; } 
{ .mfi nop.m 999 (p0) fclass.m.unc p6, p0 = FR_Input_Y, 0x007 nop.i 999 } { .mfi nop.m 999 (p0) fcmp.eq.unc.s1 p7, p0 = FR_Input_Y, f1 nop.i 999 ;; } { .mfi nop.m 999 // // set p13 if x +/- Inf // set p14 if y +/- Inf // set p8 if x Natval or +/-SNaN // set p9 if y Natval or +/-SNaN // set p10 if x QNaN // set p11 if y QNaNs // set p6 if y is +/-0 // set p7 if y is 1 // (p8) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X (p6) cmp.ne p8,p0 = r0,r0 ;; // Don't exit if x=snan, y=0 ==> result=+1 } { .mfb nop.m 999 (p9) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_X (p8) br.cond.spnt L(POWL_64_RETURN) ;; } { .mfb nop.m 999 (p10) fmpy.s0 FR_Result = FR_Input_X, f0 (p9) br.cond.spnt L(POWL_64_RETURN) ;; } { .mfi nop.m 999 // // Produce result for SNaN and NatVals and return // (p6) fclass.m.unc p15, p0 = FR_Input_X,0x007 nop.i 999 } { .mfi nop.m 999 // // If Y +/- 0, set p15 if x +/- 0 // (p6) fclass.m.unc p8, p0 = FR_Input_X,0x0C3 nop.i 999 ;; } { .mfi nop.m 999 (p6) fcmp.eq.s0 p9,p0 = FR_Input_X, f0 // If y=0, flag if x denormal nop.i 999 } { .mfi nop.m 999 (p6) fadd.s0 FR_Result = f1, f0 nop.i 999 ;; } { .mfi nop.m 999 // // Set p8 if y = +/-0 and X is a QNaN/SNaN // If y = +/-0, let result = 1.0 // (p7) fmpy.s0 FR_Result = FR_Input_X,f1 // // If y == 1, result = x * 1 // (p15) mov GR_Parameter_TAG = 20 } { .mib nop.m 999 nop.i 999 (p15) br.cond.spnt __libm_error_region ;; } { .mib nop.m 999 // // If x and y are both zero, result = 1.0 and call error // support. // (p8) mov GR_Parameter_TAG = 23 (p8) br.cond.spnt __libm_error_region ;; } { .mib nop.m 999 nop.i 999 // // If y = +/-0 and x is a QNaN, result = 1.0 and call error // support. 
// (p6) br.cond.spnt L(POWL_64_RETURN) ;; } // If x=0, y=-inf, go to the X_IS_ZERO path { .mfb nop.m 999 (p14) fcmp.eq.unc.s1 p0,p14 = FR_Input_X,f0 (p7) br.cond.spnt L(POWL_64_RETURN) ;; } { .mfi nop.m 999 // // Produce all results for x**0 and x**1 // Let all the result x ** 0 == 1 and return // Let all x ** 1 == x and return // (p10) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X nop.i 999 ;; } { .mfb nop.m 999 (p11) fmpy.s0 FR_Result = FR_Input_Y,FR_Input_X (p10) br.cond.spnt L(POWL_64_RETURN) ;; } { .mib nop.m 999 nop.i 999 (p11) br.cond.spnt L(POWL_64_RETURN) ;; } { .mib nop.m 999 nop.i 999 // // Return result for x or y QNaN input with QNaN result // (p14) br.cond.spnt L(POWL_64_Y_IS_INF) ;; } { .mib nop.m 999 nop.i 999 (p13) br.cond.spnt L(POWL_64_X_IS_INF) ;; } L(POWL_64_X_IS_ZERO): { .mmb (p0) getf.sig GR_signif_y = FR_Input_Y (p0) getf.exp GR_BIASed_exp_y = FR_Input_Y nop.b 999 ;; } { .mlx nop.m 999 (p0) movl GR_Mask = 0x1FFFF } { .mlx nop.m 999 (p0) movl GR_y_sign = 0x20000 ;; } // // Get BIASed exp and significand of y // { .mfi (p0) and GR_exp_y = GR_Mask,GR_BIASed_exp_y nop.f 999 (p0) and GR_y_sign = GR_y_sign,GR_BIASed_exp_y } { .mlx nop.m 999 (p0) movl GR_BIAS = 0xFFFF ;; } { .mfi (p0) cmp.lt.unc p9, p8 = GR_exp_y,GR_BIAS nop.f 999 // // Maybe y is < 1 already, so // can never be an integer. // Remove sign bit from exponent. // (p0) sub GR_exp_y = GR_exp_y,GR_BIAS ;; } { .mii nop.m 999 nop.i 999 ;; // // Remove exponent BIAS // (p8) shl GR_exp_y= GR_signif_y,GR_exp_y ;; } { .mfi (p9) or GR_exp_y= 0xF,GR_signif_y nop.f 999 nop.i 999 ;; } { .mii nop.m 999 // // Shift significand of y looking for nonzero bits // For y > 1, shift signif_y exp_y bits to the left // For y < 1, turn on 4 low order bits of significand of y // so that the fraction will always be non-zero // (p0) shl GR_signif_y= GR_exp_y,1 ;; (p0) extr.u GR_low_order_bit = GR_exp_y,63,1 } // // Integer part of y shifted off. // Get y's low even or odd bit - y might not be an int. 
// { .mii (p0) cmp.eq.unc p13,p0 = GR_signif_y, r0 (p0) cmp.eq.unc p8,p9 = GR_y_sign, r0 ;; // // Is y an int? // Is y positive // (p13) cmp.ne.unc p13,p0 = GR_low_order_bit, r0 ;; } // // Is y and int and odd? // { .mfb (p13) cmp.eq.unc p13,p14 = GR_y_sign, r0 (p8) fcmp.eq.s0 p12,p0 = FR_Input_Y, f0 // If x=0 and y>0 flag if y denormal nop.b 999 ;; } { .mfb nop.m 999 // // Is y and int and odd and positive? // (p13) mov FR_Result = FR_Input_X (p13) br.cond.sptk L(POWL_64_RETURN) ;; } { .mfi nop.m 999 // // Return +/-0 when x=+/-0 and y is and odd pos. int // (p14) frcpa.s0 FR_Result, p10 = f1, FR_Input_X (p14) mov GR_Parameter_TAG = 21 } { .mib nop.m 999 nop.i 999 (p14) br.cond.spnt __libm_error_region ;; } { .mfb nop.m 999 // // Return +/-0 when x=+/-Inf and y is and odd neg int // and raise dz exception // (p8) mov FR_Result = f0 (p8) br.cond.sptk L(POWL_64_RETURN) ;; } { .mfi nop.m 999 // // Return +0 when x=+/-0 and y > 0 and not odd. // (p9) frcpa.s0 FR_Result, p10 = f1,f0 (p9) mov GR_Parameter_TAG = 21 } { .mib nop.m 999 nop.i 999 (p9) br.cond.sptk __libm_error_region ;; } L(POWL_64_X_IS_INF): { .mfi (p0) getf.exp GR_exp_y = FR_Input_Y (p0) fclass.m.unc p13, p0 = FR_Input_X,0x022 (p0) mov GR_Mask = 0x1FFFF ;; } { .mfi (p0) getf.sig GR_signif_y = FR_Input_Y (p0) fcmp.eq.s0 p9,p0 = FR_Input_Y, f0 // Flag if y denormal nop.i 999 ;; } // // Get exp and significand of y // Create exponent mask and sign mask // { .mlx (p0) and GR_low_order_bit = GR_Mask,GR_exp_y (p0) movl GR_BIAS = 0xFFFF } { .mmi nop.m 999 ;; // // Remove sign bit from exponent. // (p0) cmp.lt.unc p9, p8 = GR_low_order_bit,GR_BIAS // // Maybe y is < 1 already, so // isn't an int. // (p0) sub GR_low_order_bit = GR_low_order_bit,GR_BIAS } { .mlx nop.m 999 (p0) movl GR_sign_mask = 0x20000 ;; } { .mfi (p0) and GR_sign_mask = GR_sign_mask,GR_exp_y // // Return +Inf when x=+/-0 and y < 0 and not odd and raise // divide-by-zero exception. 
// (p0) fclass.m.unc p11, p0 = FR_Input_X,0x021 nop.i 999 ;; } { .mmi nop.m 999 ;; // // Is shift off integer part of y. // Get y's even or odd bit - y might not be an int. // (p11) cmp.eq.unc p11,p12 = GR_sign_mask, r0 // // Remove exponent BIAS // (p8) shl GR_exp_y = GR_signif_y,GR_low_order_bit ;; } { .mfi (p9) or GR_exp_y = 0xF,GR_signif_y // // Is y positive or negative when x is +Inf? // Is y and int when x = -Inf // (p11) mov FR_Result = FR_Input_X nop.i 999 ;; } { .mfi nop.m 999 (p12) mov FR_Result = f0 nop.i 999 ;; } { .mii nop.m 999 // // Shift signficand looking for nonzero bits // For y non-ints, upset the significand. // (p0) shl GR_signif_y = GR_exp_y,1 ;; (p13) cmp.eq.unc p13,p0 = GR_signif_y, r0 } { .mii nop.m 999 (p0) extr.u GR_low_order_bit = GR_exp_y,63,1 ;; (p13) cmp.ne.unc p13,p0 = GR_low_order_bit, r0 } { .mib nop.m 999 nop.i 999 (p11) br.cond.sptk L(POWL_64_RETURN) ;; } { .mib nop.m 999 nop.i 999 (p12) br.cond.sptk L(POWL_64_RETURN) ;; } // // Return Inf for y > 0 // Return +0 for y < 0 // Is y even or odd? // { .mii (p13) cmp.eq.unc p13,p10 = GR_sign_mask, r0 (p0) cmp.eq.unc p8,p9 = GR_sign_mask, r0 ;; nop.i 999 } { .mfi nop.m 999 // // For x = -inf, y is and int, positive // and odd // Is y positive in general? // (p13) mov FR_Result = FR_Input_X nop.i 999 ;; } { .mfb nop.m 999 (p10) fmerge.ns FR_Result = f0, f0 (p13) br.cond.sptk L(POWL_64_RETURN) ;; } { .mib nop.m 999 nop.i 999 (p10) br.cond.sptk L(POWL_64_RETURN) ;; } { .mfi nop.m 999 // // Return -Inf for x = -inf and y > 0 and odd int. // Return -0 for x = -inf and y < 0 and odd int. // (p8) fmerge.ns FR_Result = FR_Input_X, FR_Input_X nop.i 999 ;; } { .mfb nop.m 999 (p9) mov FR_Result = f0 (p8) br.cond.sptk L(POWL_64_RETURN) ;; } { .mib nop.m 999 nop.i 999 (p9) br.cond.sptk L(POWL_64_RETURN) ;; } L(POWL_64_Y_IS_INF): { .mfi nop.m 999 // // Return Inf for x = -inf and y > 0 not an odd int. // Return +0 for x = -inf and y < 0 and not an odd int. 
// (p0) fclass.m.unc p8, p0 = FR_Input_Y, 0x021 nop.i 999 } { .mfi nop.m 999 (p0) fclass.m.unc p9, p0 = FR_Input_Y, 0x022 nop.i 999 ;; } { .mfi nop.m 999 (p0) fabs FR_X = FR_Input_X nop.i 999 ;; } { .mfi nop.m 999 (p0) fcmp.eq.s0 p10,p0 = FR_Input_X, f0 // flag if x denormal nop.i 999 ;; } { .mfi nop.m 999 // // Find y = +/- Inf // Compute |x| // (p8) fcmp.lt.unc.s1 p6, p0 = FR_X, f1 nop.i 999 } { .mfi nop.m 999 (p8) fcmp.gt.unc.s1 p7, p0 = FR_X, f1 nop.i 999 ;; } { .mfi nop.m 999 (p9) fcmp.lt.unc.s1 p12, p0 = FR_X, f1 nop.i 999 } { .mfi nop.m 999 (p9) fcmp.gt.unc.s1 p13, p0 = FR_X, f1 nop.i 999 ;; } { .mfi nop.m 999 // // For y = +Inf and |x| < 1 returns 0 // For y = +Inf and |x| > 1 returns Inf // For y = -Inf and |x| < 1 returns Inf // For y = -Inf and |x| > 1 returns 0 // (p6) mov FR_Result = f0 nop.i 999 ;; } { .mfi nop.m 999 (p7) mov FR_Result = FR_Input_Y nop.i 999 ;; } { .mfi nop.m 999 (p12) fmpy.s0 FR_Result = FR_Input_Y, FR_Input_Y nop.i 999 ;; } { .mfb nop.m 999 (p13) mov FR_Result = f0 // // Produce x ** +/- Inf results // (p6) br.cond.spnt L(POWL_64_RETURN) ;; } { .mib nop.m 999 nop.i 999 (p7) br.cond.spnt L(POWL_64_RETURN) ;; } { .mib nop.m 999 nop.i 999 (p12) br.cond.spnt L(POWL_64_RETURN) ;; } { .mib nop.m 999 nop.i 999 (p13) br.cond.spnt L(POWL_64_RETURN) ;; } { .mfb nop.m 999 // // +/-1 ** +/-Inf, result is +1 // (p0) fmpy.s0 FR_Result = f1,f1 (p0) br.cond.sptk L(POWL_64_RETURN) ;; } L(POWL_64_UNSUPPORT): { .mfb nop.m 999 // // Return NaN and raise invalid // (p0) fmpy.s0 FR_Result = FR_Input_X,f0 // // Raise exceptions for specific // values - pseudo NaN and // infinities. 
// (p0) br.cond.sptk L(POWL_64_RETURN) ;; } L(POWL_64_XNEG): { .mfi nop.m 999 (p0) frcpa.s0 FR_Result, p8 = f0, f0 // // Raise invalid for x < 0 and // y not an integer and // (p0) mov GR_Parameter_TAG = 22 } { .mib nop.m 999 nop.i 999 (p0) br.cond.sptk __libm_error_region ;; } L(POWL_64_SQRT): { .mfi nop.m 999 (p0) frsqrta.s0 FR_Result,p10 = FR_Input_X nop.i 999 ;; } { .mfi nop.m 999 (p10) fma.s1 f62=FR_Half,FR_Input_X,f0 nop.i 999 ;; } { .mfi nop.m 999 // // Step (2) // h = 1/2 * a in f9 // (p10) fma.s1 f63=FR_Result,FR_Result,f0 nop.i 999 ;; } { .mfi nop.m 999 // // Step (3) // t1 = y0 * y0 in f10 // (p10) fnma.s1 f32=f63,f62,f11 nop.i 999 ;; } { .mfi nop.m 999 // // Step (4) // t2 = 1/2 - t1 * h in f10 // (p10) fma.s1 f33=f32,FR_Result,FR_Result nop.i 999 ;; } { .mfi nop.m 999 // // Step (5) // y1 = y0 + t2 * y0 in f13 // (p10) fma.s1 f34=f33,f62,f0 nop.i 999 ;; } { .mfi nop.m 999 // // Step (6) // t3 = y1 * h in f10 // (p10) fnma.s1 f35=f34,f33,f11 nop.i 999 ;; } { .mfi nop.m 999 // // Step (7) // t4 = 1/2 - t3 * y1 in f10 // (p10) fma.s1 f63=f35,f33,f33 nop.i 999 ;; } { .mfi nop.m 999 // // Step (8) // y2 = y1 + t4 * y1 in f13 // (p10) fma.s1 f32=FR_Input_X,f63,f0 nop.i 999 } { .mfi nop.m 999 // // Step (9) // S = a * y2 in f10 // (p10) fma.s1 FR_Result=f63,f62,f0 nop.i 999 ;; } { .mfi nop.m 999 // // Step (10) // t5 = y2 * h in f9 // (p10) fma.s1 f33=f11,f63,f0 nop.i 999 ;; } { .mfi nop.m 999 // // Step (11) // H = 1/2 * y2 in f11 // (p10) fnma.s1 f34=f32,f32,f8 nop.i 999 } { .mfi nop.m 999 // // Step (12) // d = a - S * S in f12 // (p10) fnma.s1 f35=FR_Result,f63,f11 nop.i 999 ;; } { .mfi nop.m 999 // // Step (13) // t6 = 1/2 - t5 * y2 in f7 // (p10) fma.s1 f62=f33,f34,f32 nop.i 999 } { .mfi nop.m 999 // // Step (14) // S1 = S + d * H in f13 // (p10) fma.s1 f63=f33,f35,f33 nop.i 999 ;; } { .mfi nop.m 999 // // Step (15) // H1 = H + t6 * h in f7 // (p10) fnma.s1 f32=f62,f62,FR_Input_X nop.i 999 ;; } { .mfb nop.m 999 // // Step (16) // d1 = a - S1 * S1 // 
(p10) fma.s0 FR_Result=f32,f63,f62
//
// Step (17)
// R = S1 + d1 * H1
// (this step comment describes the fma.s0 just above)
//
(p10) br.cond.sptk L(POWL_64_RETURN) ;;
}
{ .mib
        nop.m 999
        nop.i 999
//
// Do the Newton-Raphson iteration from the EAS.
//
// This bundle is reached only when p10 is clear, i.e. when the
// predicated branch above was not taken.
//
(p0) br.cond.sptk L(POWL_64_RETURN) ;;
}
//
// Take care of the degenerate cases.
//
// Common exit used by the paths above: copy FR_Result into FR_Output
// and return through b0.
//
L(POWL_64_RETURN):
{ .mfb
        nop.m 999
(p0) mov FR_Output = FR_Result
(p0) br.ret.sptk b0 ;;
}
.endp powl
ASM_SIZE_DIRECTIVE(powl)

//
// __libm_error_region
//
// Error-reporting tail used by the powl exit paths. The entries visible
// in powl reach it with a conditional branch (not a call) after loading
// GR_Parameter_TAG, so the final br.ret below returns through the b0
// value that was live on entry.
//
// What the code does:
//   1. Saves ar.pfs, gp and b0 in GR_SAVE_PFS, GR_SAVE_GP, GR_SAVE_B0.
//   2. Lowers sp by 64 bytes and spills three values into the new frame:
//        sp+16   FR_X       (address left in GR_Parameter_X)
//        sp+32   FR_Y       (address left in GR_Parameter_Y)
//        sp+48   FR_RESULT  (address left in GR_Parameter_RESULT)
//   3. Calls __libm_error_support.
//   4. Reloads f8 from sp+48, restores sp, b0, gp and ar.pfs, returns.
//
// The .prologue/.save/.fframe/.body/.restore directives are the unwind
// annotations for this frame; keep each one directly in front of the
// instruction it precedes.
//
.proc __libm_error_region
__libm_error_region:
.prologue
{ .mfi
        // Entry sp - 32, which is new sp + 32 once the frame is allocated.
        add GR_Parameter_Y=-32,sp // Parameter 2 value
        nop.f 0
.save ar.pfs,GR_SAVE_PFS
        mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
}
{ .mfi
.fframe 64
        add sp=-64,sp // Create new stack
        nop.f 0
        mov GR_SAVE_GP=gp // Save gp
};;
{ .mmi
        // The post-increment of 16 leaves GR_Parameter_Y at sp+48.
        stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
        add GR_Parameter_X = 16,sp // Parameter 1 address
.save b0, GR_SAVE_B0
        mov GR_SAVE_B0=b0 // Save b0
};;
.body
{ .mib
        stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
        // GR_Parameter_Y currently holds sp+48, the slot for the result.
        add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
        nop.b 0
}
{ .mib
        stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
        // Step GR_Parameter_Y back from sp+48 to sp+32, where FR_Y was stored.
        add GR_Parameter_Y = -16,GR_Parameter_Y
        br.call.sptk b0=__libm_error_support# // Call error handling function
};;
{ .mmi
        nop.m 0
        nop.m 0
        // Recompute the result address after the call; per the file's
        // history note (8/15/00) the earlier value of GR_Parameter_RESULT
        // was being overwritten.
        add GR_Parameter_RESULT = 48,sp
};;
{ .mmi
        ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
.restore sp
        add sp = 64,sp // Restore stack pointer
        mov b0 = GR_SAVE_B0 // Restore return address
};;
{ .mib
        mov gp = GR_SAVE_GP // Restore gp
        mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
        br.ret.sptk b0 // Return
};;
.endp __libm_error_region
ASM_SIZE_DIRECTIVE(__libm_error_region)

// External error handler called by __libm_error_region above.
.type __libm_error_support#,@function
.global __libm_error_support#