summaryrefslogtreecommitdiff
path: root/sysdeps/ia64/fpu/libm_lgammal.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/ia64/fpu/libm_lgammal.S')
-rw-r--r--sysdeps/ia64/fpu/libm_lgammal.S7676
1 files changed, 7676 insertions, 0 deletions
diff --git a/sysdeps/ia64/fpu/libm_lgammal.S b/sysdeps/ia64/fpu/libm_lgammal.S
new file mode 100644
index 0000000000..056171b7d2
--- /dev/null
+++ b/sysdeps/ia64/fpu/libm_lgammal.S
@@ -0,0 +1,7676 @@
+.file "libm_lgammal.s"
+
+
+// Copyright (c) 2002 - 2003, Intel Corporation
+// All rights reserved.
+//
+// Contributed 2002 by the Intel Numerics Group, Intel Corporation
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// * The name of Intel Corporation may not be used to endorse or promote
+// products derived from this software without specific prior written
+// permission.
+
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
+// LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,
+// EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code,and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+//
+//*********************************************************************
+//
+// History:
+// 03/28/02 Original version
+// 05/20/02 Cleaned up namespace and sf0 syntax
+// 08/21/02 Added support of SIGN(GAMMA(x)) calculation
+// 09/26/02 Algorithm description improved
+// 10/21/02 Now it returns SIGN(GAMMA(x))=-1 for negative zero
+// 02/10/03 Reordered header: .section, .global, .proc, .align
+//
+//*********************************************************************
+//
+// Function: __libm_lgammal(long double x, int* signgam, int szsigngam)
+// computes the principal value of the logarithm of the GAMMA function
+// of x. Signum of GAMMA(x) is stored to memory starting at the address
+// specified by the signgam.
+//
+//*********************************************************************
+//
+// Resources Used:
+//
+// Floating-Point Registers: f8 (Input and Return Value)
+// f9-f15
+// f32-f127
+//
+// General Purpose Registers:
+// r2, r3, r8-r11, r14-r31
+// r32-r65
+// r66-r69 (Used to pass arguments to error handling routine)
+//
+// Predicate Registers: p6-p15
+//
+//*********************************************************************
+//
+// IEEE Special Conditions:
+//
+// __libm_lgammal(+inf) = +inf
+// __libm_lgammal(-inf) = QNaN
+// __libm_lgammal(+/-0) = +inf
+// __libm_lgammal(x<0, x - integer) = QNaN
+// __libm_lgammal(SNaN) = QNaN
+// __libm_lgammal(QNaN) = QNaN
+//
+//*********************************************************************
+//
+// ALGORITHM DESCRIPTION
+//
+// Below we suppose that there is a log(z) function which takes a long
+// double argument and returns result as a pair of long double numbers
+// lnHi and lnLo (such that sum lnHi + lnLo provides ~80 correct bits
+// of significand). Algorithm description for such log(z) function
+// see below.
+// Also, in this algorithm description we use the following notational
+// conventions:
+// a) pair A = (Ahi, Alo) means number A represented as sum of Ahi and Alo
+// b) C = A + B = (Ahi, Alo) + (Bhi, Blo) means multi-precision addition.
+// The result would be C = (Chi, Clo). Notice, that Clo shouldn't be
+// equal to Alo + Blo
+// c) D = A*B = (Ahi, Alo)*(Bhi, Blo) = (Dhi, Dlo) multi-precision
+// multiplication.
+//
+// So, lgammal has the following computational paths:
+// 1) |x| < 0.5
+// P = A1*|x| + A2*|x|^2 + ... + A22*|x|^22
+// A1, A2, A3 represented as a sum of two double precision
+// numbers and multi-precision computations are used for 3 higher
+// terms of the polynomial. We get polynomial as a sum of two
+// double extended numbers: P = (Phi, Plo)
+// 1.1) x > 0
+// lgammal(x) = P - log(|x|) = (Phi, Plo) - (lnHi(|x|), lnLo(|x|))
+// 1.2) x < 0
+// lgammal(x) = -P - log(|x|) - log(sin(Pi*x)/(Pi*x))
+// P and log(|x|) are computed by the same way as in 1.1;
+// - log(sin(Pi*x)/(Pi*x)) is approximated by a polynomial Plnsin.
+// Plnsin:= fLnSin2*|x|^2 + fLnSin4*|x|^4 + ... + fLnSin36*|x|^36
+// The first coefficient of Plnsin is represented as sum of two
+// double precision numbers (fLnSin2, fLnSin2L). Multi-precision
+// computations for higher two terms of Plnsin are used.
+// So, the final result is reconstructed by the following formula
+// lgammal(x) = (-(Phi, Plo) - (lnHi(|x|), lnLo(|x|))) -
+// - (PlnsinHi,PlnsinLo)
+//
+// 2) 0.5 <= x < 0.75 -> t = x - 0.625
+// -0.75 < x <= -0.5 -> t = x + 0.625
+// 2.25 <= x < 4.0 -> t = x/2 - 1.5
+// 4.0 <= x < 8.0 -> t = x/4 - 1.5
+// -0.5 < x <= -0.40625 -> t = x + 0.5
+// -2.6005859375 < x <= -2.5 -> t = x + 2.5
+// 1.3125 <= x < 1.5625 -> t = x - LOC_MIN, where LOC_MIN is point in
+// which lgammal has local minimum. Exact
+// value can be found in the table below,
+// approximate value is ~1.46
+//
+// lgammal(x) is approximated by the polynomial of 25th degree: P25(t)
+// P25(t) = A0 + A1*t + ... + A25*t^25 = (Phi, Plo) + t^4*P21(t),
+// where
+// (Phi, Plo) is sum of four highest terms of the polynomial P25(t):
+// (Phi, Plo) = ((A0, A0L) + (A1, A1L)*t) + t^2 *((A2, A2L) + (A3, A3L)*t),
+// (Ai, AiL) - coefficients represented as pairs of DP numbers.
+//
+// P21(t) = (PolC(t)*t^8 + PolD(t))*t^8 + PolE(t),
+// where
+// PolC(t) = C21*t^5 + C20*t^4 + ... + C16,
+// C21 = A25, C20 = A24, ..., C16 = A20
+//
+// PolD(t) = D7*t^7 + D6*t^6 + ... + D0,
+// D7 = A19, D6 = A18, ..., D0 = A12
+//
+// PolE(t) = E7*t^7 + E6*t^6 + ... + E0,
+// E7 = A11, E6 = A10, ..., E0 = A4
+//
+// Cis and Dis are represented as double precision numbers,
+// Eis are represented as double extended numbers.
+//
+// 3) 0.75 <= x < 1.3125 -> t = x - 1.0
+// 1.5625 <= x < 2.25 -> t = x - 2.0
+// lgammal(x) is approximated by the polynomial of 25th degree: P25(t)
+// P25(t) = A1*t + ... + A25*t^25, and computations are carried out
+// by similar way as in the previous case
+//
+// 4) 10.0 < x <= Overflow Bound ("positive Stirling" range)
+// lgammal(x) is approximated using Stirling's formula:
+// lgammal(x) ~ ((x*(lnHi(x) - 1, lnLo(x))) - 0.5*(lnHi(x), lnLo(x))) +
+// + ((Chi, Clo) + S(1/x))
+// where
+// C = (Chi, Clo) - pair of double precision numbers representing constant
+// 0.5*ln(2*Pi);
+// S(1/x) = 1/x * (B2 + B4*(1/x)^2 + ... + B20*(1/x)^18), B2, ..., B20 are
+// Bernoulli numbers. S is computed in native precision and then added to
+// Clo;
+// lnHi(x) - 1 is computed in native precision and the multiprecision
+// multiplication (x, 0) *(lnHi(x) - 1, lnLo(x)) is used.
+//
+// 5) -INF < x <= -2^63, any negative integer < 0
+// All numbers in this range are integers -> error handler is called
+//
+// 6) -2^63 < x <= -0.75 ("negative Stirling" range), x is "far" from root,
+// lgammal(-t) for positive t is approximated using the following formula:
+// lgammal(-t) = -lgammal(t)-log(t)-log(|dT|)+log(sin(Pi*|dT|)/(Pi*|dT|))
+// where dT = -t -round_to_nearest_integer(-t)
+// Last item is approximated by the same polynomial as described in 1.2.
+// We split the whole range into three subranges due to different ways of
+// approximation of the first terms.
+// 6.1) -2^63 < x < -6.0 ("negative Stirling" range)
+// lgammal(t) is approximated exactly as in #4. The only difference is that
+// for -13.0 < x < -6.0 subrange instead of Bernoulli numbers we use their
+// minimax approximation on this range.
+// log(t), log(|dT|) are approximated by the log routine mentioned above.
+// 6.2) -6.0 < x <= -0.75, |x + 1|> 2^(-7)
+// log(t), log(|dT|) are approximated by the log routine mentioned above,
+// lgammal(t) is approximated by polynomials of the 25th degree similar
+// to ones from #2. Arguments z of the polynomials are as follows
+// a) 0.75 <= t < 1.0 - 2^(-7), z = 2*t - 1.5
+// b) 1.0 - 2^(-7) < t < 2.0, z = t - 1.5
+// c) 2.0 < t < 3.0, z = t/2 - 1.5
+// d) 3.0 < t < 4.0, z = t/2 - 1.5. Notice, that range reduction is
+// the same as in case c) but the set of coefficients is different
+// e) 4.0 < t < 6.0, z = t/4 - 1.5
+// 6.3) |x + 1| <= 2^(-7)
+// log(1 + (x-1)) is approximated by Taylor series,
+// log(sin(Pi*|dT|)/(Pi*|dT|)) is still approximated by polynomial but
+// it has just 4th degree.
+// log(|dT|) is approximated by the log routine mentioned above.
+// lgammal(-x) is approximated by polynomial of 8th degree from (-x + 1).
+//
+// 7) -20.0 < x < -2.0, x falls in root "neighbourhood".
+// "Neighbourhood" means that |lgammal(x)| < epsilon, where epsilon is
+// different for every root (and it is stored in the table), but typically
+// it is ~ 0.15. There are 35 roots significant from "double extended"
+// point of view. We split all the roots into two subsets: "left" and "right"
+// roots. Considering [-(N+1), -N] range we call root as "left" one if it
+// lies closer to -(N+1) and "right" otherwise. There is no "left" root in
+// the [-20, -19] range (it exists, but is insignificant for double extended
+// precision). To determine if x falls in root "neighbourhood" we store
+// significands of all the 35 roots as well as epsilon values (expressed
+// by the left and right bound).
+// In these ranges we approximate lgammal(x) by polynomial series of 19th
+// degree:
+// lgammal(x) = P19(t) = A0 + A1*t + ...+ A19*t^19, where t = x - EDP_Root,
+// EDP_Root is the exact value of the corresponding root rounded to double
+// extended precision. So, we have 35 different polynomials which make our
+// table rather big. We may hope that x falls in root "neighbourhood"
+// quite rarely -> there might be no need in frequent use of different
+// polynomials.
+// A0, A1, A2, A3 are represented as pairs of double precision numbers,
+// A4, A5 are long doubles, and to decrease the size of the table we
+// keep the rest of coefficients in just double precision
+//
+//*********************************************************************
+// Algorithm for log(X) = (lnHi(X), lnLo(X))
+//
+// ALGORITHM
+//
+// Here we use a table lookup method. The basic idea is that in
+// order to compute logl(Arg) for an argument Arg in [1,2), we
+// construct a value G such that G*Arg is close to 1 and that
+// logl(1/G) is obtainable easily from a table of values calculated
+// beforehand. Thus
+//
+// logl(Arg) = logl(1/G) + logl(G*Arg)
+// = logl(1/G) + logl(1 + (G*Arg - 1))
+//
+// Because |G*Arg - 1| is small, the second term on the right hand
+// side can be approximated by a short polynomial. We elaborate
+// this method in four steps.
+//
+// Step 0: Initialization
+//
+// We need to calculate logl( X ). Obtain N, S_hi such that
+//
+// X = 2^N * S_hi exactly
+//
+// where S_hi in [1,2)
+//
+// Step 1: Argument Reduction
+//
+// Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
+//
+// G := G_1 * G_2 * G_3
+// r := (G * S_hi - 1)
+//
+// These G_j's have the property that the product is exactly
+// representable and that |r| < 2^(-12) as a result.
+//
+// Step 2: Approximation
+//
+//
+// logl(1 + r) is approximated by a short polynomial poly(r).
+//
+// Step 3: Reconstruction
+//
+//
+// Finally, logl( X ) is given by
+//
+// logl( X ) = logl( 2^N * S_hi )
+// ~=~ N*logl(2) + logl(1/G) + logl(1 + r)
+// ~=~ N*logl(2) + logl(1/G) + poly(r).
+//
+// IMPLEMENTATION
+//
+// Step 0. Initialization
+// ----------------------
+//
+// Z := X
+// N := unbiased exponent of Z
+// S_hi := 2^(-N) * Z
+//
+// Step 1. Argument Reduction
+// --------------------------
+//
+// Let
+//
+// Z = 2^N * S_hi = 2^N * 1.d_1 d_2 d_3 ... d_63
+//
+// We obtain G_1, G_2, G_3 by the following steps.
+//
+//
+// Define X_0 := 1.d_1 d_2 ... d_14. This is extracted
+// from S_hi.
+//
+// Define A_1 := 1.d_1 d_2 d_3 d_4. This is X_0 truncated
+// to lsb = 2^(-4).
+//
+// Define index_1 := [ d_1 d_2 d_3 d_4 ].
+//
+// Fetch Z_1 := (1/A_1) rounded UP in fixed point with
+// fixed point lsb = 2^(-15).
+// Z_1 looks like z_0.z_1 z_2 ... z_15
+// Note that the fetching is done using index_1.
+// A_1 is actually not needed in the implementation
+// and is used here only to explain how the value
+// Z_1 is defined.
+//
+// Fetch G_1 := (1/A_1) truncated to 21 sig. bits.
+// floating pt. Again, fetching is done using index_1. A_1
+// explains how G_1 is defined.
+//
+// Calculate X_1 := X_0 * Z_1 truncated to lsb = 2^(-14)
+// = 1.0 0 0 0 d_5 ... d_14
+// This is accomplished by integer multiplication.
+// It is proved that X_1 indeed always begins
+// with 1.0000 in fixed point.
+//
+//
+// Define A_2 := 1.0 0 0 0 d_5 d_6 d_7 d_8. This is X_1
+// truncated to lsb = 2^(-8). Similar to A_1,
+// A_2 is not needed in actual implementation. It
+// helps explain how some of the values are defined.
+//
+// Define index_2 := [ d_5 d_6 d_7 d_8 ].
+//
+// Fetch Z_2 := (1/A_2) rounded UP in fixed point with
+// fixed point lsb = 2^(-15). Fetch done using index_2.
+// Z_2 looks like z_0.z_1 z_2 ... z_15
+//
+// Fetch G_2 := (1/A_2) truncated to 21 sig. bits.
+// floating pt.
+//
+// Calculate X_2 := X_1 * Z_2 truncated to lsb = 2^(-14)
+// = 1.0 0 0 0 0 0 0 0 d_9 d_10 ... d_14
+// This is accomplished by integer multiplication.
+// It is proved that X_2 indeed always begins
+// with 1.00000000 in fixed point.
+//
+//
+// Define A_3 := 1.0 0 0 0 0 0 0 0 d_9 d_10 d_11 d_12 d_13 1.
+// This is 2^(-14) + X_2 truncated to lsb = 2^(-13).
+//
+// Define index_3 := [ d_9 d_10 d_11 d_12 d_13 ].
+//
+// Fetch G_3 := (1/A_3) truncated to 21 sig. bits.
+// floating pt. Fetch is done using index_3.
+//
+// Compute G := G_1 * G_2 * G_3.
+//
+// This is done exactly since each of G_j only has 21 sig. bits.
+//
+// Compute
+//
+// r := (G*S_hi - 1)
+//
+//
+// Step 2. Approximation
+// ---------------------
+//
+// This step computes an approximation to logl( 1 + r ) where r is the
+// reduced argument just obtained. It is proved that |r| <= 1.9*2^(-13);
+// thus logl(1+r) can be approximated by a short polynomial:
+//
+// logl(1+r) ~=~ poly = r + Q1 r^2 + ... + Q4 r^5
+//
+//
+// Step 3. Reconstruction
+// ----------------------
+//
+// This step computes the desired result of logl(X):
+//
+// logl(X) = logl( 2^N * S_hi )
+// = N*logl(2) + logl( S_hi )
+// = N*logl(2) + logl(1/G) +
+// logl(1 + G*S_hi - 1 )
+//
+// logl(2), logl(1/G_j) are stored as pairs of (single,double) numbers:
+// log2_hi, log2_lo, log1byGj_hi, log1byGj_lo. The high parts are
+// single-precision numbers and the low parts are double precision
+// numbers. These have the property that
+//
+// N*log2_hi + SUM ( log1byGj_hi )
+//
+// is computable exactly in double-extended precision (64 sig. bits).
+// Finally
+//
+// lnHi(X) := N*log2_hi + SUM ( log1byGj_hi )
+// lnLo(X) := poly_hi + [ poly_lo +
+// ( SUM ( log1byGj_lo ) + N*log2_lo ) ]
+//
+//
+//*********************************************************************
+// General Purpose Registers (symbolic aliases; several names share one register)
+// scratch registers
+rPolDataPtr = r2
+rLnSinDataPtr = r3
+rExpX = r8
+rSignifX = r9
+rDelta = r10
+rSignExpX = r11
+GR_ad_z_1 = r14 // NOTE(review): GR_ad_z_1 is assigned again (to r30) below with no use in between; this binding looks dead - confirm
+r17Ones = r15
+GR_Index1 = r16
+rSignif1andQ = r17
+GR_X_0 = r18
+GR_X_1 = r19
+GR_X_2 = r20
+GR_Z_1 = r21
+GR_Z_2 = r22
+GR_N = r23
+rExpHalf = r24
+rExp8 = r25
+rX0Dx = r25 // same register as rExp8
+GR_ad_tbl_1 = r26
+GR_ad_tbl_2 = r27
+GR_ad_tbl_3 = r28
+GR_ad_q = r29
+GR_ad_z_1 = r30 // second assignment of GR_ad_z_1 (first was r14); presumably the one in effect for the code that follows
+GR_ad_z_2 = r31
+// stacked registers
+rPFS_SAVED = r32
+GR_ad_z_3 = r33
+rSgnGamAddr = r34 // presumably the signgam pointer argument of __libm_lgammal - confirm
+rSgnGamSize = r35 // presumably the szsigngam argument of __libm_lgammal - confirm
+rLogDataPtr = r36
+rZ1offsett = r37
+rTmpPtr = r38
+rTmpPtr2 = r39
+rTmpPtr3 = r40
+rExp2 = r41
+rExp2tom7 = r42
+rZ625 = r42 // same register as rExp2tom7
+rExpOne = r43
+rNegSingularity = r44
+rXint = r45
+rTbl1Addr = r46
+rTbl2Addr = r47
+rTbl3Addr = r48
+rZ2Addr = r49
+rRootsAddr = r50
+rRootsBndAddr = r51
+rRoot = r52
+rRightBound = r53
+rLeftBound = r54
+rSignifDx = r55
+rBernulliPtr = r56
+rLnSinTmpPtr = r56 // same register as rBernulliPtr
+rIndex1Dx = r57
+rIndexPol = r58
+GR_Index3 = r59
+GR_Index2 = r60
+rSgnGam = r61
+rXRnd = r62
+
+GR_SAVE_B0 = r63
+GR_SAVE_GP = r64
+GR_SAVE_PFS = r65
+// output parameters when calling error handling routine
+GR_Parameter_X = r66
+GR_Parameter_Y = r67
+GR_Parameter_RESULT = r68
+GR_Parameter_TAG = r69
+
+//********************************************************************
+// Floating Point Registers (symbolic aliases)
+// CAUTION: registers are scarce, so many names below alias the same register,
+// and the code (below) sometimes uses declared registers in "unconventional" ways
+//
+fAbsX = f6
+fDelX4 = f6
+fSignifX = f7
+// register aliases used when calling the error handling routine
+FR_X = f10 // first argument (f10 is also fB4 / fX4)
+FR_Y = f1 // second argument (lgammal has just one)
+FR_RESULT = f8 // result
+
+// First 7 Bernoulli numbers (each register is shared with other temporaries)
+fB2 = f9
+fLnDeltaL = f9
+fXSqr = f9
+fB4 = f10
+fX4 = f10
+fB6 = f11
+fX6 = f11
+fB8 = f12
+fXSqrL = f12
+fB10 = f13
+fRes7H = f13
+fB12 = f14
+fRes7L = f14
+fB14 = f15
+
+// stack registers (f32 and up)
+// Polynomial coefficients: A0, ..., A25 (the "L" names are the low parts of coefficient pairs)
+fA0 = f32
+fA0L = f33
+fInvXL = f33
+fA1 = f34
+fA1L = f35
+fA2 = f36
+fA2L = f37
+fA3 = f38
+fA3L = f39
+fA4 = f40
+fA4L = f41
+fRes6H = f41
+fA5 = f42
+fB2L = f42
+fA5L = f43
+fMinNegStir = f43
+fRes6L = f43
+fA6 = f44
+fMaxNegStir = f44
+fA7 = f45
+fLnDeltaH = f45
+fA8 = f46
+fBrnL = f46
+fA9 = f47
+fBrnH = f47
+fA10 = f48
+fRes5L = f48
+fA11 = f49
+fRes5H = f49
+fA12 = f50
+fDx6 = f50
+fA13 = f51
+fDx8 = f51
+fA14 = f52
+fDx4 = f52
+fA15 = f53
+fYL = f53
+fh3Dx = f53
+fA16 = f54
+fYH = f54
+fH3Dx = f54
+fA17 = f55
+fResLnDxL = f55
+fG3Dx = f55
+fA18 = f56
+fResLnDxH = f56
+fh2Dx = f56
+fA19 = f57
+fFloatNDx = f57
+fA20 = f58
+fPolyHiDx = f58
+fhDx = f58
+fA21 = f59
+fRDxCub = f59
+fHDx = f59
+fA22 = f60
+fRDxSq = f60
+fGDx = f60
+fA23 = f61
+fPolyLoDx = f61
+fInvX3 = f61
+fA24 = f62
+fRDx = f62
+fInvX8 = f62
+fA25 = f63
+fInvX4 = f63
+fPol = f64
+fPolL = f65
+// Coefficients of ln(sin(Pi*x)/(Pi*x))
+fLnSin2 = f66
+fLnSin2L = f67
+fLnSin4 = f68
+fLnSin6 = f69
+fLnSin8 = f70
+fLnSin10 = f71
+fLnSin12 = f72
+fLnSin14 = f73
+fLnSin16 = f74
+fLnSin18 = f75
+fDelX8 = f75
+fLnSin20 = f76
+fLnSin22 = f77
+fDelX6 = f77
+fLnSin24 = f78
+fLnSin26 = f79
+fLnSin28 = f80
+fLnSin30 = f81
+fhDelX = f81
+fLnSin32 = f82
+fLnSin34 = f83
+fLnSin36 = f84
+fXint = f85
+fDxSqr = f85
+fRes3L = f86
+fRes3H = f87
+fRes4H = f88
+fRes4L = f89
+fResH = f90
+fResL = f91
+fDx = f92
+FR_MHalf = f93
+fRes1H = f94
+fRes1L = f95
+fRes2H = f96
+fRes2L = f97
+FR_FracX = f98
+fRcpX = f99
+fLnSinH = f99
+fTwo = f100
+fMOne = f100 // same register as fTwo
+FR_G = f101
+FR_H = f102
+FR_h = f103
+FR_G2 = f104
+FR_H2 = f105
+FR_poly_lo = f106
+FR_poly_hi = f107
+FR_h2 = f108
+FR_rsq = f109
+FR_r = f110
+FR_log2_hi = f111
+FR_log2_lo = f112
+fFloatN = f113
+FR_Q4 = f114
+FR_G3 = f115
+FR_H3 = f116
+FR_h3 = f117
+FR_Q3 = f118
+FR_Q2 = f119
+FR_Q1 = f120
+fThirteen = f121
+fSix = f121 // same register as fThirteen
+FR_rcub = f121 // same register as fThirteen / fSix
+// Last three Bernoulli numbers
+fB16 = f122
+fB18 = f123
+fB20 = f124
+fInvX = f125
+fLnSinL = f125
+fDxSqrL = f126
+fFltIntX = f126
+fRoot = f127
+fNormDx = f127
+
+// Data tables
+//==============================================================
+RODATA
+// ************* DO NOT CHANGE THE ORDER OF THESE TABLES *************
+.align 16
+LOCAL_OBJECT_START(lgammal_right_roots_data)
+// List of all right roots themselves (each entry: 64-bit significand, then sign/exponent word).
+// NOTE: despite its name, this object also holds the left roots and their bounds further down.
+data8 0x9D3FE4B007C360AB, 0x0000C000 // Range [-3, -2]
+data8 0xC9306DE4F2CD7BEE, 0x0000C000 // Range [-4, -3]
+data8 0x814273C2CCAC0618, 0x0000C001 // Range [-5, -4]
+data8 0xA04352BF85B6C865, 0x0000C001 // Range [-6, -5]
+data8 0xC00B592C4BE4676C, 0x0000C001 // Range [-7, -6]
+data8 0xE0019FEF6FF0F5BF, 0x0000C001 // Range [-8, -7]
+data8 0x80001A01459FC9F6, 0x0000C002 // Range [-9, -8]
+data8 0x900002E3BB47D86D, 0x0000C002 // Range [-10, -9]
+data8 0xA0000049F93BB992, 0x0000C002 // Range [-11, -10]
+data8 0xB0000006B9915316, 0x0000C002 // Range [-12, -11]
+data8 0xC00000008F76C773, 0x0000C002 // Range [-13, -12]
+data8 0xD00000000B09230A, 0x0000C002 // Range [-14, -13]
+data8 0xE000000000C9CBA5, 0x0000C002 // Range [-15, -14]
+data8 0xF0000000000D73FA, 0x0000C002 // Range [-16, -15]
+data8 0x8000000000006BA0, 0x0000C003 // Range [-17, -16]
+data8 0x8800000000000655, 0x0000C003 // Range [-18, -17]
+data8 0x900000000000005A, 0x0000C003 // Range [-19, -18]
+data8 0x9800000000000005, 0x0000C003 // Range [-20, -19]
+// List of bounds of ranges with special polynomial approximation near root
+// Only significands of bounds are actually stored (two per root, larger significand first)
+data8 0xA000000000000000, 0x9800000000000000 // Bounds for root on [-3, -2]
+data8 0xCAB88035C5EFBB41, 0xC7E05E31F4B02115 // Bounds for root on [-4, -3]
+data8 0x817831B899735C72, 0x8114633941B8053A // Bounds for root on [-5, -4]
+data8 0xA04E8B34C6AA9476, 0xA039B4A42978197B // Bounds for root on [-6, -5]
+data8 0xC00D3D5E588A78A9, 0xC009BA25F7E858A6 // Bounds for root on [-7, -6]
+data8 0xE001E54202991EB4, 0xE001648416CE897F // Bounds for root on [-8, -7]
+data8 0x80001E56D13A6B9F, 0x8000164A3BAD888A // Bounds for root on [-9, -8]
+data8 0x9000035F0529272A, 0x9000027A0E3D94F0 // Bounds for root on [-10, -9]
+data8 0xA00000564D705880, 0xA000003F67EA0CC7 // Bounds for root on [-11, -10]
+data8 0xB0000007D87EE0EF, 0xB0000005C3A122A5 // Bounds for root on [-12, -11]
+data8 0xC0000000A75FE8B1, 0xC00000007AF818AC // Bounds for root on [-13, -12]
+data8 0xD00000000CDFFE36, 0xD000000009758BBF // Bounds for root on [-14, -13]
+data8 0xE000000000EB6D96, 0xE000000000ACF7B2 // Bounds for root on [-15, -14]
+data8 0xF0000000000FB1F9, 0xF0000000000B87FB // Bounds for root on [-16, -15]
+data8 0x8000000000007D90, 0x8000000000005C40 // Bounds for root on [-17, -16]
+data8 0x8800000000000763, 0x880000000000056D // Bounds for root on [-18, -17]
+data8 0x9000000000000069, 0x900000000000004D // Bounds for root on [-19, -18]
+data8 0x9800000000000006, 0x9800000000000005 // Bounds for root on [-20, -19]
+// List of all left roots themselves (same entry layout as the right roots)
+data8 0xAFDA0850DEC8065E, 0x0000C000 // Range [-3, -2]
+data8 0xFD238AA3E17F285C, 0x0000C000 // Range [-4, -3]
+data8 0x9FBABBD37757E6A2, 0x0000C001 // Range [-5, -4]
+data8 0xBFF497AC8FA06AFC, 0x0000C001 // Range [-6, -5]
+data8 0xDFFE5FBB5C377FE8, 0x0000C001 // Range [-7, -6]
+data8 0xFFFFCBFC0ACE7879, 0x0000C001 // Range [-8, -7]
+data8 0x8FFFFD1C425E8100, 0x0000C002 // Range [-9, -8]
+data8 0x9FFFFFB606BDFDCD, 0x0000C002 // Range [-10, -9]
+data8 0xAFFFFFF9466E9F1B, 0x0000C002 // Range [-11, -10]
+data8 0xBFFFFFFF70893874, 0x0000C002 // Range [-12, -11]
+data8 0xCFFFFFFFF4F6DCF6, 0x0000C002 // Range [-13, -12]
+data8 0xDFFFFFFFFF36345B, 0x0000C002 // Range [-14, -13]
+data8 0xEFFFFFFFFFF28C06, 0x0000C002 // Range [-15, -14]
+data8 0xFFFFFFFFFFFF28C0, 0x0000C002 // Range [-16, -15]
+data8 0x87FFFFFFFFFFF9AB, 0x0000C003 // Range [-17, -16]
+data8 0x8FFFFFFFFFFFFFA6, 0x0000C003 // Range [-18, -17]
+data8 0x97FFFFFFFFFFFFFB, 0x0000C003 // Range [-19, -18]
+data8 0x0000000000000000, 0x00000000 // pad (no significant left root on [-20, -19]) to keep logic in the main path
+// List of bounds of ranges with special polynomial approximation near root
+// Only significands of bounds are actually stored (no entry for [-20, -19])
+data8 0xB235880944CC758E, 0xADD2F1A9FBE76C8B // Bounds for root on [-3, -2]
+data8 0xFD8E7844F307B07C, 0xFCA655C2152BDE4D // Bounds for root on [-4, -3]
+data8 0x9FC4D876EE546967, 0x9FAEE4AF68BC4292 // Bounds for root on [-5, -4]
+data8 0xBFF641FFBFCC44F1, 0xBFF2A47919F4BA89 // Bounds for root on [-6, -5]
+data8 0xDFFE9C803DEFDD59, 0xDFFE18932EB723FE // Bounds for root on [-7, -6]
+data8 0xFFFFD393FA47AFC3, 0xFFFFC317CF638AE1 // Bounds for root on [-8, -7]
+data8 0x8FFFFD8840279925, 0x8FFFFC9DCECEEE92 // Bounds for root on [-9, -8]
+data8 0x9FFFFFC0D34E2AF8, 0x9FFFFFA9619AA3B7 // Bounds for root on [-10, -9]
+data8 0xAFFFFFFA41C18246, 0xAFFFFFF82025A23C // Bounds for root on [-11, -10]
+data8 0xBFFFFFFF857ACB4E, 0xBFFFFFFF58032378 // Bounds for root on [-12, -11]
+data8 0xCFFFFFFFF6934AB8, 0xCFFFFFFFF313EF0A // Bounds for root on [-13, -12]
+data8 0xDFFFFFFFFF53A9E9, 0xDFFFFFFFFF13B5A5 // Bounds for root on [-14, -13]
+data8 0xEFFFFFFFFFF482CB, 0xEFFFFFFFFFF03F4F // Bounds for root on [-15, -14]
+data8 0xFFFFFFFFFFFF482D, 0xFFFFFFFFFFFF03F5 // Bounds for root on [-16, -15]
+data8 0x87FFFFFFFFFFFA98, 0x87FFFFFFFFFFF896 // Bounds for root on [-17, -16]
+data8 0x8FFFFFFFFFFFFFB3, 0x8FFFFFFFFFFFFF97 // Bounds for root on [-18, -17]
+data8 0x97FFFFFFFFFFFFFC, 0x97FFFFFFFFFFFFFB // Bounds for root on [-19, -18]
+LOCAL_OBJECT_END(lgammal_right_roots_data)
+
+LOCAL_OBJECT_START(lgammal_0_Half_data)
+// Polynomial coefficients for the lgammal(x), 0.0 < |x| < 0.5; A1-A3 are (high, low) double pairs, A4-A22 are double extended
+data8 0xBFD9A4D55BEAB2D6, 0xBC8AA3C097746D1F //A3 (high, low)
+data8 0x3FEA51A6625307D3, 0x3C7180E7BD2D0DCC //A2 (high, low)
+data8 0xBFE2788CFC6FB618, 0xBC9E9346C4692BCC //A1 (high, low)
+data8 0x8A8991563EC1BD13, 0x00003FFD //A4
+data8 0xD45CE0BD52C27EF2, 0x0000BFFC //A5
+data8 0xADA06587FA2BBD47, 0x00003FFC //A6
+data8 0x9381D0ED2194902A, 0x0000BFFC //A7
+data8 0x80859B3CF92D4192, 0x00003FFC //A8
+data8 0xE4033517C622A946, 0x0000BFFB //A9
+data8 0xCD00CE67A51FC82A, 0x00003FFB //A10
+data8 0xBA44E2A96C3B5700, 0x0000BFFB //A11
+data8 0xAAAD008FA46DBD99, 0x00003FFB //A12
+data8 0x9D604AC65A41153D, 0x0000BFFB //A13
+data8 0x917CECB864B5A861, 0x00003FFB //A14
+data8 0x85A4810EB730FDE4, 0x0000BFFB //A15
+data8 0xEF2761C38BD21F77, 0x00003FFA //A16
+data8 0xC913043A128367DA, 0x0000BFFA //A17
+data8 0x96A29B71FF7AFFAA, 0x00003FFA //A18
+data8 0xBB9FFA1A5FE649BB, 0x0000BFF9 //A19
+data8 0xB17982CD2DAA0EE3, 0x00003FF8 //A20
+data8 0xDE1DDCBFFB9453F0, 0x0000BFF6 //A21
+data8 0x87FBF5D7ACD9FA9D, 0x00003FF4 //A22
+LOCAL_OBJECT_END(lgammal_0_Half_data)
+
+LOCAL_OBJECT_START(Constants_Q)
+// log2_hi, log2_lo, Q_4, Q_3, Q_2, and Q_1 (one 16-byte row each, in that order)
+data4 0x00000000,0xB1721800,0x00003FFE,0x00000000 // log2_hi
+data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 // log2_lo
+data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000 // Q_4
+data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000 // Q_3
+data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000 // Q_2
+data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000 // Q_1
+LOCAL_OBJECT_END(Constants_Q)
+
+LOCAL_OBJECT_START(Constants_Z_1)
+// Z1 - 16 bit fixed: (1/A_1) rounded up with lsb = 2^(-15); 16 entries, fetched using index_1
+data4 0x00008000
+data4 0x00007879
+data4 0x000071C8
+data4 0x00006BCB
+data4 0x00006667
+data4 0x00006187
+data4 0x00005D18
+data4 0x0000590C
+data4 0x00005556
+data4 0x000051EC
+data4 0x00004EC5
+data4 0x00004BDB
+data4 0x00004925
+data4 0x0000469F
+data4 0x00004445
+data4 0x00004211
+LOCAL_OBJECT_END(Constants_Z_1)
+
+LOCAL_OBJECT_START(Constants_G_H_h1)
+// G1 and H1 - IEEE single and h1 - IEEE double (stored as two 32-bit words); one 16-byte row per index_1 value
+data4 0x3F800000,0x00000000,0x00000000,0x00000000
+data4 0x3F70F0F0,0x3D785196,0x617D741C,0x3DA163A6
+data4 0x3F638E38,0x3DF13843,0xCBD3D5BB,0x3E2C55E6
+data4 0x3F579430,0x3E2FF9A0,0xD86EA5E7,0xBE3EB0BF
+data4 0x3F4CCCC8,0x3E647FD6,0x86B12760,0x3E2E6A8C
+data4 0x3F430C30,0x3E8B3AE7,0x5C0739BA,0x3E47574C
+data4 0x3F3A2E88,0x3EA30C68,0x13E8AF2F,0x3E20E30F
+data4 0x3F321640,0x3EB9CEC8,0xF2C630BD,0xBE42885B
+data4 0x3F2AAAA8,0x3ECF9927,0x97E577C6,0x3E497F34
+data4 0x3F23D708,0x3EE47FC5,0xA6B0A5AB,0x3E3E6A6E
+data4 0x3F1D89D8,0x3EF8947D,0xD328D9BE,0xBDF43E3C
+data4 0x3F17B420,0x3F05F3A1,0x0ADB090A,0x3E4094C3
+data4 0x3F124920,0x3F0F4303,0xFC1FE510,0xBE28FBB2
+data4 0x3F0D3DC8,0x3F183EBF,0x10FDE3FA,0x3E3A7895
+data4 0x3F088888,0x3F20EC80,0x7CC8C98F,0x3E508CE5
+data4 0x3F042108,0x3F29516A,0xA223106C,0xBE534874
+LOCAL_OBJECT_END(Constants_G_H_h1)
+
+LOCAL_OBJECT_START(Constants_Z_2)
+// Z2 - 16 bit fixed: (1/A_2) rounded up with lsb = 2^(-15); 16 entries, fetched using index_2
+data4 0x00008000
+data4 0x00007F81
+data4 0x00007F02
+data4 0x00007E85
+data4 0x00007E08
+data4 0x00007D8D
+data4 0x00007D12
+data4 0x00007C98
+data4 0x00007C20
+data4 0x00007BA8
+data4 0x00007B31
+data4 0x00007ABB
+data4 0x00007A45
+data4 0x000079D1
+data4 0x0000795D
+data4 0x000078EB
+LOCAL_OBJECT_END(Constants_Z_2)
+
+LOCAL_OBJECT_START(Constants_G_H_h2)
+// G2 and H2 - IEEE single and h2 - IEEE double (stored as two 32-bit words); one 16-byte row per index_2 value
+data4 0x3F800000,0x00000000,0x00000000,0x00000000
+data4 0x3F7F00F8,0x3B7F875D,0x22C42273,0x3DB5A116
+data4 0x3F7E03F8,0x3BFF015B,0x21F86ED3,0x3DE620CF
+data4 0x3F7D08E0,0x3C3EE393,0x484F34ED,0xBDAFA07E
+data4 0x3F7C0FC0,0x3C7E0586,0x3860BCF6,0xBDFE07F0
+data4 0x3F7B1880,0x3C9E75D2,0xA78093D6,0x3DEA370F
+data4 0x3F7A2328,0x3CBDC97A,0x72A753D0,0x3DFF5791
+data4 0x3F792FB0,0x3CDCFE47,0xA7EF896B,0x3DFEBE6C
+data4 0x3F783E08,0x3CFC15D0,0x409ECB43,0x3E0CF156
+data4 0x3F774E38,0x3D0D874D,0xFFEF71DF,0xBE0B6F97
+data4 0x3F766038,0x3D1CF49B,0x5D59EEE8,0xBE080483
+data4 0x3F757400,0x3D2C531D,0xA9192A74,0x3E1F91E9
+data4 0x3F748988,0x3D3BA322,0xBF72A8CD,0xBE139A06
+data4 0x3F73A0D0,0x3D4AE46F,0xF8FBA6CF,0x3E1D9202
+data4 0x3F72B9D0,0x3D5A1756,0xBA796223,0xBE1DCCC4
+data4 0x3F71D488,0x3D693B9D,0xB6B7C239,0xBE049391
+LOCAL_OBJECT_END(Constants_G_H_h2)
+
+LOCAL_OBJECT_START(Constants_G_H_h3)
+// G3 and H3 - IEEE single and h3 - IEEE double (stored as two 32-bit words); 32 rows, fetched using index_3
+data4 0x3F7FFC00,0x38800100,0x562224CD,0x3D355595
+data4 0x3F7FF400,0x39400480,0x06136FF6,0x3D8200A2
+data4 0x3F7FEC00,0x39A00640,0xE8DE9AF0,0x3DA4D68D
+data4 0x3F7FE400,0x39E00C41,0xB10238DC,0xBD8B4291
+data4 0x3F7FDC00,0x3A100A21,0x3B1952CA,0xBD89CCB8
+data4 0x3F7FD400,0x3A300F22,0x1DC46826,0xBDB10707
+data4 0x3F7FCC08,0x3A4FF51C,0xF43307DB,0x3DB6FCB9
+data4 0x3F7FC408,0x3A6FFC1D,0x62DC7872,0xBD9B7C47
+data4 0x3F7FBC10,0x3A87F20B,0x3F89154A,0xBDC3725E
+data4 0x3F7FB410,0x3A97F68B,0x62B9D392,0xBD93519D
+data4 0x3F7FAC18,0x3AA7EB86,0x0F21BD9D,0x3DC18441
+data4 0x3F7FA420,0x3AB7E101,0x2245E0A6,0xBDA64B95
+data4 0x3F7F9C20,0x3AC7E701,0xAABB34B8,0x3DB4B0EC
+data4 0x3F7F9428,0x3AD7DD7B,0x6DC40A7E,0x3D992337
+data4 0x3F7F8C30,0x3AE7D474,0x4F2083D3,0x3DC6E17B
+data4 0x3F7F8438,0x3AF7CBED,0x811D4394,0x3DAE314B
+data4 0x3F7F7C40,0x3B03E1F3,0xB08F2DB1,0xBDD46F21
+data4 0x3F7F7448,0x3B0BDE2F,0x6D34522B,0xBDDC30A4
+data4 0x3F7F6C50,0x3B13DAAA,0xB1F473DB,0x3DCB0070
+data4 0x3F7F6458,0x3B1BD766,0x6AD282FD,0xBDD65DDC
+data4 0x3F7F5C68,0x3B23CC5C,0xF153761A,0xBDCDAB83
+data4 0x3F7F5470,0x3B2BC997,0x341D0F8F,0xBDDADA40
+data4 0x3F7F4C78,0x3B33C711,0xEBC394E8,0x3DCD1BD7
+data4 0x3F7F4488,0x3B3BBCC6,0x52E3E695,0xBDC3532B
+data4 0x3F7F3C90,0x3B43BAC0,0xE846B3DE,0xBDA3961E
+data4 0x3F7F34A0,0x3B4BB0F4,0x785778D4,0xBDDADF06
+data4 0x3F7F2CA8,0x3B53AF6D,0xE55CE212,0x3DCC3ED1
+data4 0x3F7F24B8,0x3B5BA620,0x9E382C15,0xBDBA3103
+data4 0x3F7F1CC8,0x3B639D12,0x5C5AF197,0x3D635A0B
+data4 0x3F7F14D8,0x3B6B9444,0x71D34EFC,0xBDDCCB19
+data4 0x3F7F0CE0,0x3B7393BC,0x52CD7ADA,0x3DC74502
+data4 0x3F7F04F0,0x3B7B8B6D,0x7D7F2A42,0xBDB68F17
+LOCAL_OBJECT_END(Constants_G_H_h3)
+
+LOCAL_OBJECT_START(lgammal_data)
+// Positive overflow value (presumably the "Overflow Bound" on x; stored as significand, then exponent word)
+data8 0xB8D54C8BFFFDEBF4, 0x00007FF1
+LOCAL_OBJECT_END(lgammal_data)
+
+LOCAL_OBJECT_START(lgammal_Stirling)
+// Coefficients needed for Stirling's formula
+data8 0x3FED67F1C864BEB4 // High part of 0.5*ln(2*Pi)
+data8 0x3C94D252F2400510 // Low part of 0.5*ln(2*Pi)
+//
+// Bernoulli numbers used in Stirling's formula for -2^63 < x < -13.0
+//(B1H, B1L) = 8.3333333333333333333262747254e-02 (pair of doubles; B2-B10 below are double extended)
+data8 0x3FB5555555555555, 0x3C55555555555555
+data8 0xB60B60B60B60B60B, 0x0000BFF6 //B2 = -2.7777777777777777777777777778e-03
+data8 0xD00D00D00D00D00D, 0x00003FF4 //B3 = 7.9365079365079365079365079365e-04
+data8 0x9C09C09C09C09C0A, 0x0000BFF4 //B4 = -5.9523809523809523809523809524e-04
+data8 0xDCA8F158C7F91AB8, 0x00003FF4 //B5 = 8.4175084175084175084175084175e-04
+data8 0xFB5586CCC9E3E410, 0x0000BFF5 //B6 = -1.9175269175269175269175269175e-03
+data8 0xD20D20D20D20D20D, 0x00003FF7 //B7 = 6.4102564102564102564102564103e-03
+data8 0xF21436587A9CBEE1, 0x0000BFF9 //B8 = -2.9550653594771241830065359477e-02
+data8 0xB7F4B1C0F033FFD1, 0x00003FFC //B9 = 1.7964437236883057316493849002e-01
+data8 0xB23B3808C0F9CF6E, 0x0000BFFF //B10 = -1.3924322169059011164274322169e+00
+// Polynomial coefficients for Stirling's formula, -13.0 < x < -6.0 (minimax replacement for the Bernoulli numbers)
+data8 0x3FB5555555555555, 0x3C4D75060289C58B //A0 (high, low doubles)
+data8 0xB60B60B60B0F0876, 0x0000BFF6 //A1
+data8 0xD00D00CE54B1256C, 0x00003FF4 //A2
+data8 0x9C09BF46B58F75E1, 0x0000BFF4 //A3
+data8 0xDCA8483BC91ACC6D, 0x00003FF4 //A4
+data8 0xFB3965C939CC9FEE, 0x0000BFF5 //A5
+data8 0xD0723ADE3F0BC401, 0x00003FF7 //A6
+data8 0xE1ED7434E81F0B73, 0x0000BFF9 //A7
+data8 0x8069C6982F993283, 0x00003FFC //A8
+data8 0xC271F65BFA5BEE3F, 0x0000BFFD //A9
+LOCAL_OBJECT_END(lgammal_Stirling)
+
+LOCAL_OBJECT_START(lgammal_lnsin_data)
+// Polynomial approximation of -ln(sin(Pi*x)/(Pi*x)), 0 < x <= 0.5; only even-order coefficients A2..A36 are stored. A2 is a pair of doubles, A4..A16 are two-word entries (64-bit significand, sign/exponent word), A18..A36 are single doubles - layout inferred from the bit patterns, confirm against the loads
+data8 0x3FFA51A6625307D3, 0x3C81873332FAF94C //A2
+data8 0x8A8991563EC241C3, 0x00003FFE //A4
+data8 0xADA06588061805DF, 0x00003FFD //A6
+data8 0x80859B57C338D0F7, 0x00003FFD //A8
+data8 0xCD00F1C2D78754BD, 0x00003FFC //A10
+data8 0xAAB56B1D3A1F4655, 0x00003FFC //A12
+data8 0x924B6F2FBBED12B1, 0x00003FFC //A14
+data8 0x80008E58765F43FC, 0x00003FFC //A16
+data8 0x3FBC718EC115E429//A18
+data8 0x3FB99CE544FE183E//A20
+data8 0x3FB7251C09EAAD89//A22
+data8 0x3FB64A970733628C//A24
+data8 0x3FAC92D6802A3498//A26
+data8 0x3FC47E1165261586//A28
+data8 0xBFCA1BAA434750D4//A30
+data8 0x3FE460001C4D5961//A32
+data8 0xBFE6F06A3E4908AD//A34
+data8 0x3FE300889EBB203A//A36
+LOCAL_OBJECT_END(lgammal_lnsin_data)
+
+LOCAL_OBJECT_START(lgammal_half_3Q_data)
+// Polynomial coefficients for lgammal(x), 0.5 <= x < 0.75. Each line's trailing comment names the coefficient(s) it holds, in storage order; names ending in L (A0L..A3L) are presumably the low-order parts of A0..A3 - confirm against the code that consumes the table
+data8 0xBFF7A648EE90C62E, 0x3C713F326857E066 // A3, A0L
+data8 0xBFF73E4B8BA780AE, 0xBCA953BC788877EF // A1, A1L
+data8 0x403774DCD58D0291, 0xC0415254D5AE6623 // D0, D1
+data8 0x40B07213855CBFB0, 0xC0B8855E25D2D229 // C20, C21
+data8 0x3FFB359F85FF5000, 0x3C9BAECE6EF9EF3A // A2, A2L
+data8 0x3FD717D498A3A8CC, 0xBC9088E101CFEDFA // A0, A3L
+data8 0xAFEF36CC5AEC3FF0, 0x00004002 // E6
+data8 0xABE2054E1C34E791, 0x00004001 // E4
+data8 0xB39343637B2900D1, 0x00004000 // E2
+data8 0xD74FB710D53F58F6, 0x00003FFF // E0
+data8 0x4070655963BA4256, 0xC078DA9D263C4EA3 // D6, D7
+data8 0x405CD2B6A9B90978, 0xC065B3B9F4F4F171 // D4, D5
+data8 0x4049BC2204CF61FF, 0xC05337227E0BA152 // D2, D3
+data8 0x4095509A50C07A96, 0xC0A0747949D2FB45 // C18, C19
+data8 0x4082ECCBAD709414, 0xC08CD02FB088A702 // C16, C17
+data8 0xFFE4B2A61B508DD5, 0x0000C002 // E7
+data8 0xF461ADB8AE17E0A5, 0x0000C001 // E5
+data8 0xF5BE8B0B90325F20, 0x0000C000 // E3
+data8 0x877B275F3FB78DCA, 0x0000C000 // E1
+LOCAL_OBJECT_END(lgammal_half_3Q_data)
+
+LOCAL_OBJECT_START(lgammal_half_3Q_neg_data)
+// Polynomial coefficients for lgammal(x), -0.75 < x <= -0.5. Rows follow the same label order as lgammal_half_3Q_data (A3/A0L, A1/A1L, D0/D1, C20/C21, ...); the E rows carry a sign/exponent word as their second data8
+data8 0xC014836EFD94899C, 0x3C9835679663B44F // A3, A0L
+data8 0xBFF276C7B4FB1875, 0xBC92D3D9FA29A1C0 // A1, A1L
+data8 0x40C5178F24E1A435, 0xC0D9DE84FBC5D76A // D0, D1
+data8 0x41D4D1B236BF6E93, 0xC1EBB0445CE58550 // C20, C21
+data8 0x4015718CD67F63D3, 0x3CC5354B6F04B59C // A2, A2L
+data8 0x3FF554493087E1ED, 0xBCB72715E37B02B9 // A0, A3L
+data8 0xE4AC7E915FA72229, 0x00004009 // E6
+data8 0xA28244206395FCC6, 0x00004007 // E4
+data8 0xFB045F19C07B2544, 0x00004004 // E2
+data8 0xE5C8A6E6A9BA7D7B, 0x00004002 // E0
+data8 0x4143943B55BF5118, 0xC158AC05EA675406 // D6, D7
+data8 0x4118F6833D19717C, 0xC12F51A6F375CC80 // D4, D5
+data8 0x40F00C209483481C, 0xC103F1DABF750259 // D2, D3
+data8 0x4191038F2D8F9E40, 0xC1A413066DA8AE4A // C18, C19
+data8 0x4170B537EDD833DE, 0xC1857E79424C61CE // C16, C17
+data8 0x8941D8AB4855DB73, 0x0000C00B // E7
+data8 0xBB822B131BD2E813, 0x0000C008 // E5
+data8 0x852B4C03B83D2D4F, 0x0000C006 // E3
+data8 0xC754CA7E2DDC0F1F, 0x0000C003 // E1
+LOCAL_OBJECT_END(lgammal_half_3Q_neg_data)
+
+LOCAL_OBJECT_START(lgammal_2Q_4_data)
+// Polynomial coefficients for lgammal(x), 2.25 <= |x| < 4.0. The A, C and D rows hold two doubles each (A0 below is the double-precision bit pattern of ln 2); the E rows are two-word entries with a sign/exponent word second
+data8 0xBFCA4D55BEAB2D6F, 0x3C7ABC9DA14141F5 // A3, A0L
+data8 0x3FFD8773039049E7, 0x3C66CB7957A95BA4 // A1, A1L
+data8 0x3F45C3CC79E91E7D, 0xBF3A8E5005937E97 // D0, D1
+data8 0x3EC951E35E1C9203, 0xBEB030A90026C5DF // C20, C21
+data8 0x3FE94699894C1F4C, 0x3C91884D21D123F1 // A2, A2L
+data8 0x3FE62E42FEFA39EF, 0xBC66480CEB70870F // A0, A3L
+data8 0xF1C2EAFF0B3A7579, 0x00003FF5 // E6
+data8 0xB36AF863926B55A3, 0x00003FF7 // E4
+data8 0x9620656185BB44CA, 0x00003FF9 // E2
+data8 0xA264558FB0906AFF, 0x00003FFB // E0
+data8 0x3F03D59E9666C961, 0xBEF91115893D84A6 // D6, D7
+data8 0x3F19333611C46225, 0xBF0F89EB7D029870 // D4, D5
+data8 0x3F3055A96B347AFE, 0xBF243B5153E178A8 // D2, D3
+data8 0x3ED9A4AEF30C4BB2, 0xBED388138B1CEFF2 // C18, C19
+data8 0x3EEF7945A3C3A254, 0xBEE36F32A938EF11 // C16, C17
+data8 0x9028923F47C82118, 0x0000BFF5 // E7
+data8 0xCE0DAAFB6DC93B22, 0x0000BFF6 // E5
+data8 0xA0D0983B34AC4C8D, 0x0000BFF8 // E3
+data8 0x94D6C50FEB8B0CE7, 0x0000BFFA // E1
+LOCAL_OBJECT_END(lgammal_2Q_4_data)
+
+LOCAL_OBJECT_START(lgammal_4_8_data)
+// Polynomial coefficients for lgammal(x), 4.0 <= |x| < 8.0. Row order matches lgammal_2Q_4_data: A3/A0L, A1/A1L, D0/D1, C20/C21, A2/A2L, A0/A3L, even E terms, D and C pairs, odd E terms
+data8 0xBFD6626BC9B31B54, 0x3CAA53C82493A92B // A3, A0L
+data8 0x401B4C420A50AD7C, 0x3C8C6E9929F789A3 // A1, A1L
+data8 0x3F49410427E928C2, 0xBF3E312678F8C146 // D0, D1
+data8 0x3ED51065F7CD5848, 0xBED052782A03312F // C20, C21
+data8 0x3FF735973273D5EC, 0x3C831DFC65BF8CCF // A2, A2L
+data8 0x401326643C4479C9, 0xBC6FA0498C5548A6 // A0, A3L
+data8 0x9382D8B3CD4EB7E3, 0x00003FF6 // E6
+data8 0xE9F92CAD8A85CBCD, 0x00003FF7 // E4
+data8 0xD58389FE38258CEC, 0x00003FF9 // E2
+data8 0x81310136363AE8AA, 0x00003FFC // E0
+data8 0x3F04F0AE38E78570, 0xBEF9E2144BB8F03C // D6, D7
+data8 0x3F1B5E992A6CBC2A, 0xBF10F3F400113911 // D4, D5
+data8 0x3F323EE00AAB7DEE, 0xBF2640FDFA9FB637 // D2, D3
+data8 0x3ED2143EBAFF067A, 0xBEBBDEB92D6FF35D // C18, C19
+data8 0x3EF173A42B69AAA4, 0xBEE78B9951A2EAA5 // C16, C17
+data8 0xAB3CCAC6344E52AA, 0x0000BFF5 // E7
+data8 0x81ACCB8915B16508, 0x0000BFF7 // E5
+data8 0xDA62C7221102C426, 0x0000BFF8 // E3
+data8 0xDF1BD44C4083580A, 0x0000BFFA // E1
+LOCAL_OBJECT_END(lgammal_4_8_data)
+
+LOCAL_OBJECT_START(lgammal_loc_min_data)
+// Polynomial coefficients for lgammal(x), 1.3125 <= x < 1.5625 (the interval around the local minimum xMin stored in the first entry)
+data8 0xBB16C31AB5F1FB71, 0x00003FFF // xMin - point of local minimum (approximately 1.4616; significand word, then sign/exponent word)
+data8 0xBFC2E4278DC6BC23, 0xBC683DA8DDCA9650 // A3, A0L
+data8 0x3BD4DB7D0CA61D5F, 0x386E719EDD01D801 // A1, A1L
+data8 0x3F4CC72638E1D93F, 0xBF4228EC9953CCB9 // D0, D1
+data8 0x3ED222F97A04613E,0xBED3DDD58095CB6C // C20, C21
+data8 0x3FDEF72BC8EE38AB, 0x3C863AFF3FC48940 // A2, A2L
+data8 0xBFBF19B9BCC38A41, 0xBC7425F1BFFC1442// A0, A3L
+data8 0x941890032BEB34C3, 0x00003FF6 // E6
+data8 0xC7E701591CE534BC, 0x00003FF7 // E4
+data8 0x93373CBD05138DD4, 0x00003FF9 // E2
+data8 0x845A14A6A81C05D6, 0x00003FFB // E0
+data8 0x3F0F6C4DF6D47A13, 0xBF045DCDB5B49E19 // D6, D7
+data8 0x3F22E23345DDE59C, 0xBF1851159AFB1735 // D4, D5
+data8 0x3F37101EA4022B78, 0xBF2D721E6323AF13 // D2, D3
+data8 0x3EE691EBE82DF09D, 0xBEDD42550961F730 // C18, C19
+data8 0x3EFA793EDE99AD85, 0xBEF14000108E70BE // C16, C17
+data8 0xB7CBC033ACE0C99C, 0x0000BFF5 // E7
+data8 0xF178D1F7B1A45E27, 0x0000BFF6 // E5
+data8 0xA8FCFCA8106F471C, 0x0000BFF8 // E3
+data8 0x864D46FA898A9AD2, 0x0000BFFA // E1
+LOCAL_OBJECT_END(lgammal_loc_min_data)
+
+LOCAL_OBJECT_START(lgammal_03Q_1Q_data)
+// Polynomial coefficients for lgammal(x), 0.75 <= |x| < 1.3125. Unlike the A3/A0L-style tables, this one interleaves A, C, D and E entries and mixes row formats (note the "E7, C16" row holds two doubles while the other E rows are two-word entries); the per-line labels give the storage order
+data8 0x3FD151322AC7D848, 0x3C7184DE0DB7B4EE // A4, A2L
+data8 0x3FD9A4D55BEAB2D6, 0x3C9E934AAB10845F // A3, A1L
+data8 0x3FB111289C381259, 0x3FAFFFCFB32AE18D // D2, D3
+data8 0x3FB3B1D9E0E3E00D, 0x3FB2496F0D3768DF // D0, D1
+data8 0xBA461972C057D439, 0x00003FFB // E6
+data8 0x3FEA51A6625307D3, 0x3C76ABC886A72DA2 // A2, A4L
+data8 0x3FA8EFE46B32A70E, 0x3F8F31B3559576B6 // C17, C20
+data8 0xE403383700387D85, 0x00003FFB // E4
+data8 0x9381D0EE74BF7251, 0x00003FFC // E2
+data8 0x3FAA2177A6D28177, 0x3FA4895E65FBD995 // C18, C19
+data8 0x3FAAED2C77DBEE5D, 0x3FA94CA59385512C // D6, D7
+data8 0x3FAE1F522E8A5941, 0x3FAC785EF56DD87E // D4, D5
+data8 0x3FB556AD5FA56F0A, 0x3FA81F416E87C783 // E7, C16
+data8 0xCD00F1C2DC2C9F1E, 0x00003FFB // E5
+data8 0x3FE2788CFC6FB618, 0x3C8E52519B5B17CB // A1, A3L
+data8 0x80859B57C3E7F241, 0x00003FFC // E3
+data8 0xADA065880615F401, 0x00003FFC // E1
+data8 0xD45CE0BD530AB50E, 0x00003FFC // E0
+LOCAL_OBJECT_END(lgammal_03Q_1Q_data)
+
+LOCAL_OBJECT_START(lgammal_13Q_2Q_data)
+// Polynomial coefficients for lgammal(x), 1.5625 <= |x| < 2.25. Rows use the same label order as lgammal_03Q_1Q_data (A4/A2L, A3/A1L, D2/D3, D0/D1, E6, ...)
+data8 0x3F951322AC7D8483, 0x3C71873D88C6539D // A4, A2L
+data8 0xBFB13E001A557606, 0x3C56CB907018A101 // A3, A1L
+data8 0xBEC11B2EC1E7F6FC, 0x3EB0064ED9824CC7 // D2, D3
+data8 0xBEE3CBC963EC103A, 0x3ED2597A330C107D // D0, D1
+data8 0xBC6F2DEBDFE66F38, 0x0000BFF0 // E6
+data8 0x3FD4A34CC4A60FA6, 0x3C3AFC9BF775E8A0 // A2, A4L
+data8 0x3E48B0C542F85B32, 0xBE347F12EAF787AB // C17, C20
+data8 0xE9FEA63B6984FA1E, 0x0000BFF2 // E4
+data8 0x9C562E15FC703BBF, 0x0000BFF5 // E2
+data8 0xBE3C12A50AB0355E, 0xBE1C941626AE4717 // C18, C19
+data8 0xBE7AFA8714342BC4,0x3E69A12D2B7761CB // D6, D7
+data8 0xBE9E25EF1D526730, 0x3E8C762291889B99 // D4, D5
+data8 0x3EF580DCEE754733, 0xBE57C811D070549C // E7, C16
+data8 0xD093D878BE209C98, 0x00003FF1 // E5
+data8 0x3FDB0EE6072093CE, 0xBC6024B9E81281C4 // A1, A3L
+data8 0x859B57C31CB77D96, 0x00003FF4 // E3
+data8 0xBD6EB756DB617E8D, 0x00003FF6 // E1
+data8 0xF2027E10C7AF8C38, 0x0000BFF7 // E0
+LOCAL_OBJECT_END(lgammal_13Q_2Q_data)
+
+LOCAL_OBJECT_START(lgammal_8_10_data)
+// Polynomial coefficients for lgammal(x), 8.0 <= |x| < 10.0
+// Multi-precision terms: each of A1 and A0 is stored as two doubles (presumably high part then low part - confirm); note A1 precedes A0
+data8 0x40312008A3A23E5C, 0x3CE020B4F2E4083A //A1
+data8 0x4025358E82FCB70C, 0x3CD4A5A74AF7B99C //A0
+// Native-precision terms: A2..A15, one two-word entry each (64-bit significand, then sign/exponent word)
+data8 0xF0AA239FFBC616D2, 0x00004000 //A2
+data8 0x96A8EA798FE57D66, 0x0000BFFF //A3
+data8 0x8D501B7E3B9B9BDB, 0x00003FFE //A4
+data8 0x9EE062401F4B1DC2, 0x0000BFFD //A5
+data8 0xC63FD8CD31E93431, 0x00003FFC //A6
+data8 0x8461101709C23C30, 0x0000BFFC //A7
+data8 0xB96D7EA7EF3648B2, 0x00003FFB //A8
+data8 0x86886759D2ACC906, 0x0000BFFB //A9
+data8 0xC894B6E28265B183, 0x00003FFA //A10
+data8 0x98C4348CAD821662, 0x0000BFFA //A11
+data8 0xEC9B092226A94DF2, 0x00003FF9 //A12
+data8 0xB9F169FF9B98CDDC, 0x0000BFF9 //A13
+data8 0x9A3A32BB040894D3, 0x00003FF9 //A14
+data8 0xF9504CCC1003B3C3, 0x0000BFF8 //A15
+LOCAL_OBJECT_END(lgammal_8_10_data)
+
+LOCAL_OBJECT_START(lgammal_03Q_6_data)
+// Five coefficient sets of identical shape follow (A3, A0, A1, A2 as pairs of doubles; A4..A13 as two-word entries; A14..A25 as single doubles). First set: polynomial coefficients for lgammal(x), 0.75 <= |x| < 1.0
+data8 0xBFBC47DCA479E295, 0xBC607E6C1A379D55 //A3
+data8 0x3FCA051C372609ED, 0x3C7B02D73EB7D831 //A0
+data8 0xBFE15FAFA86B04DB, 0xBC3F52EE4A8945B5 //A1
+data8 0x3FD455C4FF28F0BF, 0x3C75F8C6C99F30BB //A2
+data8 0xD2CF04CD934F03E1, 0x00003FFA //A4
+data8 0xDB4ED667E29256E1, 0x0000BFF9 //A5
+data8 0xF155A33A5B6021BF, 0x00003FF8 //A6
+data8 0x895E9B9D386E0338, 0x0000BFF8 //A7
+data8 0xA001BE94B937112E, 0x00003FF7 //A8
+data8 0xBD82846E490ED048, 0x0000BFF6 //A9
+data8 0xE358D24EC30DBB5D, 0x00003FF5 //A10
+data8 0x89C4F3652446B78B, 0x0000BFF5 //A11
+data8 0xA86043E10280193D, 0x00003FF4 //A12
+data8 0xCF3A2FBA61EB7682, 0x0000BFF3 //A13
+data8 0x3F300900CC9200EC //A14
+data8 0xBF23F42264B94AE8 //A15
+data8 0x3F18EEF29895FE73 //A16
+data8 0xBF0F3C4563E3EDFB //A17
+data8 0x3F0387DBBC385056 //A18
+data8 0xBEF81B4004F92900 //A19
+data8 0x3EECA6692A9A5B81 //A20
+data8 0xBEDF61A0059C15D3 //A21
+data8 0x3ECDA9F40DCA0111 //A22
+data8 0xBEB60FE788217BAF //A23
+data8 0x3E9661D795DFC8C6 //A24
+data8 0xBE66C7756A4EDEE5 //A25
+// Polynomial coefficients for the lgammal(x), 1.0 <= |x| < 2.0
+data8 0xBFC1AE55B180726B, 0xBC7DE1BC478453F5 //A3
+data8 0xBFBEEB95B094C191, 0xBC53456FF6F1C9D9 //A0
+data8 0x3FA2AED059BD608A, 0x3C0B65CC647D557F //A1
+data8 0x3FDDE9E64DF22EF2, 0x3C8993939A8BA8E4 //A2
+data8 0xF07C206D6B100CFF, 0x00003FFA //A4
+data8 0xED2CEA9BA52FE7FB, 0x0000BFF9 //A5
+data8 0xFCE51CED52DF3602, 0x00003FF8 //A6
+data8 0x8D45D27872326619, 0x0000BFF8 //A7
+data8 0xA2B78D6BCEBE27F7, 0x00003FF7 //A8
+data8 0xBF6DC0996A895B6F, 0x0000BFF6 //A9
+data8 0xE4B9AD335AF82D79, 0x00003FF5 //A10
+data8 0x8A451880195362A1, 0x0000BFF5 //A11
+data8 0xA8BE35E63089A7A9, 0x00003FF4 //A12
+data8 0xCF7FA175FA11C40C, 0x0000BFF3 //A13
+data8 0x3F300C282FAA3B02 //A14
+data8 0xBF23F6AEBDA68B80 //A15
+data8 0x3F18F6860E2224DD //A16
+data8 0xBF0F542B3CE32F28 //A17
+data8 0x3F039436218C9BF8 //A18
+data8 0xBEF8AE6307677AEC //A19
+data8 0x3EF0B55527B3A211 //A20
+data8 0xBEE576AC995E7605 //A21
+data8 0x3ED102DDC1365D2D //A22
+data8 0xBEC442184F97EA54 //A23
+data8 0x3ED4D2283DFE5FC6 //A24
+data8 0xBECB9219A9B46787 //A25
+// Polynomial coefficients for the lgammal(x), 2.0 <= |x| < 3.0
+data8 0xBFCA4D55BEAB2D6F, 0xBC66F80E5BFD5AF5 //A3
+data8 0x3FE62E42FEFA39EF, 0x3C7ABC9E3B347E3D //A0
+data8 0x3FFD8773039049E7, 0x3C66CB9007C426EA //A1
+data8 0x3FE94699894C1F4C, 0x3C918726EB111663 //A2
+data8 0xA264558FB0906209, 0x00003FFB //A4
+data8 0x94D6C50FEB902ADC, 0x0000BFFA //A5
+data8 0x9620656184243D17, 0x00003FF9 //A6
+data8 0xA0D0983B8BCA910B, 0x0000BFF8 //A7
+data8 0xB36AF8559B222BD3, 0x00003FF7 //A8
+data8 0xCE0DACB3260AE6E5, 0x0000BFF6 //A9
+data8 0xF1C2C0BF0437C7DB, 0x00003FF5 //A10
+data8 0x902A2F2F3AB74A92, 0x0000BFF5 //A11
+data8 0xAE05009B1B2C6E4C, 0x00003FF4 //A12
+data8 0xD5B71F6456D7D4CB, 0x0000BFF3 //A13
+data8 0x3F2F0351D71BC9C6 //A14
+data8 0xBF2B53BC56A3B793 //A15
+data8 0xBF18B12DC6F6B861 //A16
+data8 0xBF43EE6EB5215C2F //A17
+data8 0xBF5474787CDD455E //A18
+data8 0xBF642B503C9C060A //A19
+data8 0xBF6E07D1AA254AA3 //A20
+data8 0xBF71C785443AAEE8 //A21
+data8 0xBF6F67BF81B71052 //A22
+data8 0xBF63E4BCCF4FFABF //A23
+data8 0xBF50067F8C671D5A //A24
+data8 0xBF29C770D680A5AC //A25
+// Polynomial coefficients for the lgammal(x), 4.0 <= |x| < 6.0 (note: this set is stored BEFORE the 3.0 <= |x| < 4.0 set, so the sets are not in ascending range order)
+data8 0xBFD6626BC9B31B54, 0xBC85AABE08680902 //A3
+data8 0x401326643C4479C9, 0x3CAA53C26F31E364 //A0
+data8 0x401B4C420A50AD7C, 0x3C8C76D55E57DD8D //A1
+data8 0x3FF735973273D5EC, 0x3C83A0B78E09188A //A2
+data8 0x81310136363AAB6D, 0x00003FFC //A4
+data8 0xDF1BD44C4075C0E6, 0x0000BFFA //A5
+data8 0xD58389FE38D8D664, 0x00003FF9 //A6
+data8 0xDA62C7221D5B5F87, 0x0000BFF8 //A7
+data8 0xE9F92CAD0263E157, 0x00003FF7 //A8
+data8 0x81ACCB8606C165FE, 0x0000BFF7 //A9
+data8 0x9382D8D263D1C2A3, 0x00003FF6 //A10
+data8 0xAB3CCBA4C853B12C, 0x0000BFF5 //A11
+data8 0xCA0818BBCCC59296, 0x00003FF4 //A12
+data8 0xF18912691CBB5BD0, 0x0000BFF3 //A13
+data8 0x3F323EF5D8330339 //A14
+data8 0xBF2641132EA571F7 //A15
+data8 0x3F1B5D9576175CA9 //A16
+data8 0xBF10F56A689C623D //A17
+data8 0x3F04CACA9141A18D //A18
+data8 0xBEFA307AC9B4E85D //A19
+data8 0x3EF4B625939FBE32 //A20
+data8 0xBECEE6AC1420F86F //A21
+data8 0xBE9A95AE2E485964 //A22
+data8 0xBF039EF47F8C09BB //A23
+data8 0xBF05345957F7B7A9 //A24
+data8 0xBEF85AE6385D4CCC //A25
+// Polynomial coefficients for the lgammal(x), 3.0 <= |x| < 4.0
+data8 0xBFCA4D55BEAB2D6F, 0xBC667B20FF46C6A8 //A3
+data8 0x3FE62E42FEFA39EF, 0x3C7ABC9E3B398012 //A0
+data8 0x3FFD8773039049E7, 0x3C66CB9070238D77 //A1
+data8 0x3FE94699894C1F4C, 0x3C91873D8839B1CD //A2
+data8 0xA264558FB0906D7E, 0x00003FFB //A4
+data8 0x94D6C50FEB8AFD72, 0x0000BFFA //A5
+data8 0x9620656185B68F14, 0x00003FF9 //A6
+data8 0xA0D0983B34B7088A, 0x0000BFF8 //A7
+data8 0xB36AF863964AA440, 0x00003FF7 //A8
+data8 0xCE0DAAFB5497AFB8, 0x0000BFF6 //A9
+data8 0xF1C2EAFA79CC2864, 0x00003FF5 //A10
+data8 0x9028922A839572B8, 0x0000BFF5 //A11
+data8 0xAE1E62F870BA0278, 0x00003FF4 //A12
+data8 0xD4726F681E2ABA29, 0x0000BFF3 //A13
+data8 0x3F30559B9A02FADF //A14
+data8 0xBF243ADEB1266CAE //A15
+data8 0x3F19303B6F552603 //A16
+data8 0xBF0F768C288EC643 //A17
+data8 0x3F039D5356C21DE1 //A18
+data8 0xBEF81BCA8168E6BE //A19
+data8 0x3EEC74A53A06AD54 //A20
+data8 0xBEDED52D1A5DACDF //A21
+data8 0x3ECCB4C2C7087342 //A22
+data8 0xBEB4F1FAFDFF5C2F //A23
+data8 0x3E94C80B52D58904 //A24
+data8 0xBE64A328CBE92A27 //A25
+LOCAL_OBJECT_END(lgammal_03Q_6_data)
+
+LOCAL_OBJECT_START(lgammal_1pEps_data)
+// Polynomial coefficients for the lgammal(x), 1 - 2^(-7) <= |x| < 1 + 2^(-7); A1..A4 are two-word entries, A5..A8 single doubles
+data8 0x93C467E37DB0C7A5, 0x00003FFE //A1
+data8 0xD28D3312983E9919, 0x00003FFE //A2
+data8 0xCD26AADF559A47E3, 0x00003FFD //A3
+data8 0x8A8991563EC22E81, 0x00003FFD //A4
+data8 0x3FCA8B9C168D52FE //A5
+data8 0x3FC5B40CB0696370 //A6
+data8 0x3FC270AC2229A65D //A7
+data8 0x3FC0110AF10FCBFC //A8
+// Polynomial coefficients for log1p(x), |x| < 2^(-7), stored from P8 down to P1; the values are Pk = (-1)^k/(k+1), i.e. 1/9, -1/8, 1/7, ..., 1/3, -1/2 as doubles
+data8 0x3FBC71C71C71C71C //P8
+data8 0xBFC0000000000000 //P7
+data8 0x3FC2492492492492 //P6
+data8 0xBFC5555555555555 //P5
+data8 0x3FC999999999999A //P4
+data8 0xBFD0000000000000 //P3
+data8 0x3FD5555555555555 //P2
+data8 0xBFE0000000000000 //P1
+// Short version of the "lnsin" polynomial: only the even-order terms A2..A8, as two-word entries
+data8 0xD28D3312983E9918, 0x00003FFF //A2
+data8 0x8A8991563EC241B6, 0x00003FFE //A4
+data8 0xADA06588061830A5, 0x00003FFD //A6
+data8 0x80859B57C31CB746, 0x00003FFD //A8
+LOCAL_OBJECT_END(lgammal_1pEps_data)
+
+LOCAL_OBJECT_START(lgammal_neg2andHalf_data)
+// Polynomial coefficients for the lgammal(x), x between -2.5 and -2.005859375 (the original comment read "-2.005859375 <= x < -2.5", which describes an empty interval; the bounds are presumably reversed - endpoint inclusivity not verified)
+data8 0xBF927781D4BB093A, 0xBC511D86D85B7045 // A3, A0L
+data8 0x3FF1A68793DEFC15, 0x3C9852AE2DA7DEEF // A1, A1L
+data8 0x408555562D45FAFD, 0xBF972CDAFE5FEFAD // D0, D1
+data8 0xC18682331EF492A5, 0xC1845E3E0D29606B // C20, C21
+data8 0x4013141822E16979, 0x3CCF8718B6E75F6C // A2, A2L
+data8 0xBFACCBF9F5ED0F15, 0xBBDD1AEB73297401 // A0, A3L
+data8 0xCCCDB17423046445, 0x00004006 // E6
+data8 0x800514E230A3A452, 0x00004005 // E4
+data8 0xAAE9A48EC162E76F, 0x00004003 // E2
+data8 0x81D4F88B3F3EA0FC, 0x00004002 // E0
+data8 0x40CF3F3E35238DA0, 0xC0F8B340945F1A7E // D6, D7
+data8 0x40BF89EC0BD609C6, 0xC095897242AEFEE2 // D4, D5
+data8 0x40A2482FF01DBC5C, 0xC02095E275FDCF62 // D2, D3
+data8 0xC1641354F2312A6A, 0xC17B3657F85258E9 // C18, C19
+data8 0xC11F964E9ECBE2C9, 0xC146D7A90F70696C // C16, C17
+data8 0xE7AECDE6AF8EA816, 0x0000BFEF // E7
+data8 0xD711252FEBBE1091, 0x0000BFEB // E5
+data8 0xE648BD10F8C43391, 0x0000BFEF // E3
+data8 0x948A1E78AA00A98D, 0x0000BFF4 // E1
+LOCAL_OBJECT_END(lgammal_neg2andHalf_data)
+
+LOCAL_OBJECT_START(lgammal_near_neg_half_data)
+// Polynomial coefficients for the lgammal(x), -0.5 < x < -0.40625. Row order matches lgammal_neg2andHalf_data (A3/A0L, A1/A1L, D0/D1, C20/C21, A2/A2L, A0/A3L, even E terms, D and C pairs, odd E terms)
+data8 0xBFC1AE55B180726C, 0x3C8053CD734E6A1D // A3, A0L
+data8 0x3FA2AED059BD608A, 0x3C0CD3D2CDBA17F4 // A1, A1L
+data8 0x40855554DBCD1E1E, 0x3F96C51AC2BEE9E1 // D0, D1
+data8 0xC18682331EF4927D, 0x41845E3E0D295DFC // C20, C21
+data8 0x4011DE9E64DF22EF, 0x3CA692B70DAD6B7B // A2, A2L
+data8 0x3FF43F89A3F0EDD6, 0xBC4955AED0FA087D // A0, A3L
+data8 0xCCCD3F1DF4A2C1DD, 0x00004006 // E6
+data8 0x80028ADE33C7FCD9, 0x00004005 // E4
+data8 0xAACA474E485507EF, 0x00004003 // E2
+data8 0x80F07C206D6B0ECD, 0x00004002 // E0
+data8 0x40CF3F3E33E83056, 0x40F8B340944633D9 // D6, D7
+data8 0x40BF89EC059931F0, 0x409589723307AD20 // D4, D5
+data8 0x40A2482FD0054824, 0x402095CE7F19D011 // D2, D3
+data8 0xC1641354F2313614, 0x417B3657F8525354 // C18, C19
+data8 0xC11F964E9ECFD21C, 0x4146D7A90F701836 // C16, C17
+data8 0x86A9C01F0EA11E5A, 0x0000BFF5 // E7
+data8 0xBF6D8469142881C0, 0x0000BFF6 // E5
+data8 0x8D45D277BA8255F1, 0x0000BFF8 // E3
+data8 0xED2CEA9BA528BCC3, 0x0000BFF9 // E1
+LOCAL_OBJECT_END(lgammal_near_neg_half_data)
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+////////////// POLYNOMIAL COEFFICIENTS FOR "NEAR ROOTS" RANGES /////////////
+////////////// THIS PART OF THE TABLE SHOULD BE ACCESSED ONLY RARELY //////////
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+LOCAL_OBJECT_START(lgammal_right_roots_polynomial_data)
+// Polynomial coefficients for right root on [-3, -2]
+// Lgammal is approximated by a polynomial within the [-.056244 ; .158208 ] range
+data8 0xBBBD5E9DCD11030B, 0xB867411D9FF87DD4 //A0
+data8 0x3FF83FE966AF535E, 0x3CAA21235B8A769A //A1
+data8 0x40136EEBB002F55C, 0x3CC3959A6029838E //A2
+data8 0xB4A5302C53C2BEDD, 0x00003FFF //A3
+data8 0x8B8C6BE504F2DA1C, 0x00004002 //A4
+data8 0xB99CFF02593B4D98, 0x00004001 //A5
+data8 0x4038D32F682AA1CF //A6
+data8 0x403809F04EE6C5B5 //A7
+data8 0x40548EAA81634CEE //A8
+data8 0x4059297ADB6BC03D //A9
+data8 0x407286FB8EC5C9DA //A10
+data8 0x407A92E05B744CFB //A11
+data8 0x4091A9D4144258CD //A12
+data8 0x409C4D01D24F367E //A13
+data8 0x40B1871B9A426A83 //A14
+data8 0x40BE51C48BD9A583 //A15
+data8 0x40D2140D0C6153E7 //A16
+data8 0x40E0FB2C989CE4A3 //A17
+data8 0x40E52739AB005641 //A18
+data8 0x41161E3E6DDF503A //A19
+// Polynomial coefficients for right root on [-4, -3]
+// Lgammal is approximated by a polynomial within the [-.172797 ; .171573 ] range
+data8 0x3C172712B248E42E, 0x38CB8D17801A5D67 //A0
+data8 0x401F20A65F2FAC54, 0x3CCB9EA1817A824E //A1
+data8 0x4039D4D2977150EF, 0x3CDA42E149B6276A //A2
+data8 0xE089B8926AE2D9CB, 0x00004005 //A3
+data8 0x933901EBBB586C37, 0x00004008 //A4
+data8 0xCCD319BED1CFA1CD, 0x0000400A //A5
+data8 0x40D293C3F78D3C37 //A6
+data8 0x40FBB97AA0B6DD02 //A7
+data8 0x41251EA3345E5EB9 //A8
+data8 0x415057F65C92E7B0 //A9
+data8 0x41799C865241B505 //A10
+data8 0x41A445209EFE896B //A11
+data8 0x41D02D21880C953B //A12
+data8 0x41F9FFDE8C63E16D //A13
+data8 0x422504DC8302D2BE //A14
+data8 0x425111BF18C95414 //A15
+data8 0x427BCBE74A2B8EF7 //A16
+data8 0x42A7256F59B286F7 //A17
+data8 0x42D462D1586DE61F //A18
+data8 0x42FBB1228D6C5118 //A19
+// Polynomial coefficients for right root on [-5, -4]
+// Lgammal is approximated by a polynomial within the [-.163171 ; .161988 ] range
+data8 0x3C5840FBAFDEE5BB, 0x38CAC0336E8C490A //A0
+data8 0x403ACA5CF4921642, 0x3CCEDCDDA5491E56 //A1
+data8 0x40744415CD813F8E, 0x3CFBFEBC17E39146 //A2
+data8 0xAACD88D954E3E1BD, 0x0000400B //A3
+data8 0xCB68C710D75ED802, 0x0000400F //A4
+data8 0x8130F5AB997277AC, 0x00004014 //A5
+data8 0x41855E3DBF99EBA7 //A6
+data8 0x41CD14FE49C49FC2 //A7
+data8 0x421433DCE281F07D //A8
+data8 0x425C8399C7A92B6F //A9
+data8 0x42A45FBE67840F1A //A10
+data8 0x42ED68D75F9E6C98 //A11
+data8 0x433567291C27E5BE //A12
+data8 0x437F5ED7A9D9FD28 //A13
+data8 0x43C720A65C8AB711 //A14
+data8 0x441120A6C1D40B9B //A15
+data8 0x44596F561F2D1CBE //A16
+data8 0x44A3507DA81D5C01 //A17
+data8 0x44EF06A31E39EEDF //A18
+data8 0x45333774C99F523F //A19
+// Polynomial coefficients for right root on [-6, -5]
+// Lgammal is approximated by a polynomial within the [-.156450 ; .156126 ] range
+data8 0x3C71B82D6B2B3304, 0x3917186E3C0DC231 //A0
+data8 0x405ED72E0829AE02, 0x3C960C25157980EB //A1
+data8 0x40BCECC32EC22F9B, 0x3D5D8335A32F019C //A2
+data8 0x929EC2B1FB931F17, 0x00004012 //A3
+data8 0xD112EF96D37316DE, 0x00004018 //A4
+data8 0x9F00BB9BB13416AB, 0x0000401F //A5
+data8 0x425F7D8D5BDCB223 //A6
+data8 0x42C9A8D00C776CC6 //A7
+data8 0x433557FD8C481424 //A8
+data8 0x43A209221A953EF0 //A9
+data8 0x440EDC98D5618AB7 //A10
+data8 0x447AABD25E367378 //A11
+data8 0x44E73DE20CC3B288 //A12
+data8 0x455465257B4E0BD8 //A13
+data8 0x45C2011532085353 //A14
+data8 0x462FEE4CC191945B //A15
+data8 0x469C63AEEFEF0A7F //A16
+data8 0x4709D045390A3810 //A17
+data8 0x4778D360873C9F64 //A18
+data8 0x47E26965BE9A682A //A19
+// Polynomial coefficients for right root on [-7, -6]
+// Lgammal is approximated by a polynomial within the [-.154582 ; .154521 ] range
+data8 0x3C75F103A1B00A48, 0x391C041C190C726D //A0
+data8 0x40869DE49E3AF2AA, 0x3D1C17E1F813063B //A1
+data8 0x410FCE23484CFD10, 0x3DB6F38C2F11DAB9 //A2
+data8 0xEF281D1E1BE2055A, 0x00004019 //A3
+data8 0xFCE3DA92AC55DFF8, 0x00004022 //A4
+data8 0x8E9EA838A20BD58E, 0x0000402C //A5
+data8 0x4354F21E2FB9E0C9 //A6
+data8 0x43E9500994CD4F09 //A7
+data8 0x447F3A2C23C033DF //A8
+data8 0x45139152656606D8 //A9
+data8 0x45A8D45F8D3BF2E8 //A10
+data8 0x463FD32110E5BFE5 //A11
+data8 0x46D490B3BDBAE0BE //A12
+data8 0x476AC3CAD905DD23 //A13
+data8 0x48018558217AD473 //A14
+data8 0x48970AF371D30585 //A15
+data8 0x492E6273A8BEFFE3 //A16
+data8 0x49C47CC9AE3F1073 //A17
+data8 0x4A5D38E8C35EFF45 //A18
+data8 0x4AF0123E89694CD8 //A19
+// Polynomial coefficients for right root on [-8, -7]
+// Lgammal is approximated by a polynomial within the [-.154217 ; .154208 ] range
+data8 0xBCD2507D818DDD68, 0xB97F6940EA2871A0 //A0
+data8 0x40B3B407AA387BCB, 0x3D6320238F2C43D1 //A1
+data8 0x41683E85DAAFBAC7, 0x3E148D085958EA3A //A2
+data8 0x9F2A95AF1E10A548, 0x00004022 //A3
+data8 0x92F21522F482300E, 0x0000402E //A4
+data8 0x90B51AB03A1F244D, 0x0000403A //A5
+data8 0x44628E1C70EF534F //A6
+data8 0x452393E2BC32D244 //A7
+data8 0x45E5164141F4BA0B //A8
+data8 0x46A712B3A8AF5808 //A9
+data8 0x47698FD36CEDD0F2 //A10
+data8 0x482C9AE6BBAA3637 //A11
+data8 0x48F023821857C8E9 //A12
+data8 0x49B2569053FC106F //A13
+data8 0x4A74F646D5C1604B //A14
+data8 0x4B3811CF5ABA4934 //A15
+data8 0x4BFBB5DD6C84E233 //A16
+data8 0x4CC05021086F637B //A17
+data8 0x4D8450A345B0FB49 //A18
+data8 0x4E43825848865DB2 //A19
+// Polynomial coefficients for right root on [-9, -8]
+// Lgammal is approximated by a polynomial within the [-.154160 ; .154158 ] range
+data8 0x3CDF4358564F2B46, 0x397969BEE6042F81 //A0
+data8 0x40E3B088FED67721, 0x3D82787BA937EE85 //A1
+data8 0x41C83A3893550EF4, 0x3E542ED57E244DA8 //A2
+data8 0x9F003C6DC56E0B8E, 0x0000402B //A3
+data8 0x92BDF64A3213A699, 0x0000403A //A4
+data8 0x9074F503AAD417AF, 0x00004049 //A5
+data8 0x4582843E1313C8CD //A6
+data8 0x467387BD6A7826C1 //A7
+data8 0x4765074E788CF440 //A8
+data8 0x4857004DD9D1E09D //A9
+data8 0x4949792ED7530EAF //A10
+data8 0x4A3C7F089A292ED3 //A11
+data8 0x4B30125BF0AABB86 //A12
+data8 0x4C224175195E307E //A13
+data8 0x4D14DC4C8B32C08D //A14
+data8 0x4E07F1DB2786197E //A15
+data8 0x4EFB8EA1C336DACB //A16
+data8 0x4FF03797EACD0F23 //A17
+data8 0x50E4304A8E68A730 //A18
+data8 0x51D3618FB2EC9F93 //A19
+// Polynomial coefficients for right root on [-10, -9]
+// Lgammal is approximated by a polynomial within the [-.154152 ; .154152 ] range
+data8 0x3D42F34DA97ECF0C, 0x39FD1256F345B0D0 //A0
+data8 0x4116261203919787, 0x3DC12D44055588EB //A1
+data8 0x422EA8F32FB7FE99, 0x3ED849CE4E7B2D77 //A2
+data8 0xE25BAF73477A57B5, 0x00004034 //A3
+data8 0xEB021FD10060504A, 0x00004046 //A4
+data8 0x8220A208EE206C5F, 0x00004059 //A5
+data8 0x46B2C3903EC9DA14 //A6
+data8 0x47D64393744B9C67 //A7
+data8 0x48FAF79CCDC604DD //A8
+data8 0x4A20975DB8061EBA //A9
+data8 0x4B44AB9CBB38DB21 //A10
+data8 0x4C6A032F60094FE9 //A11
+data8 0x4D908103927634B4 //A12
+data8 0x4EB516CA21D30861 //A13
+data8 0x4FDB1BF12C58D318 //A14
+data8 0x510180AAE094A553 //A15
+data8 0x5226A8F2A2D45D57 //A16
+data8 0x534E00B6B0C8B809 //A17
+data8 0x5475022FE21215B2 //A18
+data8 0x5596B02BF6C5E19B //A19
+// Polynomial coefficients for right root on [-11, -10]
+// Lgammal is approximated by a polynomial within the [-.154151 ; .154151 ] range
+data8 0x3D7AA9C2E2B1029C, 0x3A15FB37578544DB //A0
+data8 0x414BAF825A0C91D4, 0x3DFB9DA2CE398747 //A1
+data8 0x4297F3EC8AE0AF03, 0x3F34208B55FB8781 //A2
+data8 0xDD0C97D3197F56DE, 0x0000403E //A3
+data8 0x8F6F3AF7A5499674, 0x00004054 //A4
+data8 0xC68DA1AF6D878EEB, 0x00004069 //A5
+data8 0x47F1E4E1E2197CE0 //A6
+data8 0x494A8A28E597C3EB //A7
+data8 0x4AA4175D0D35D705 //A8
+data8 0x4BFEE6F0AF69E814 //A9
+data8 0x4D580FE7B3DBB3C6 //A10
+data8 0x4EB2ECE60E4608AF //A11
+data8 0x500E04BE3E2B4F24 //A12
+data8 0x5167F9450F0FB8FD //A13
+data8 0x52C342BDE747603F //A14
+data8 0x541F1699D557268C //A15
+data8 0x557927C5F079864E //A16
+data8 0x56D4D10FEEDB030C //A17
+data8 0x5832385DF86AD28A //A18
+data8 0x598898914B4D6523 //A19
+// Polynomial coefficients for right root on [-12, -11]
+// Lgammal is approximated by a polynomial within the [-.154151 ; .154151 ] range
+data8 0xBD96F61647C58B03, 0xBA3ABB0C2A6C755B //A0
+data8 0x418308A82714B70D, 0x3E1088FC6A104C39 //A1
+data8 0x4306A493DD613C39, 0x3FB2341ECBF85741 //A2
+data8 0x8FA8FE98339474AB, 0x00004049 //A3
+data8 0x802CCDF570BA7942, 0x00004062 //A4
+data8 0xF3F748AF11A32890, 0x0000407A //A5
+data8 0x493E3B567EF178CF //A6
+data8 0x4ACED38F651BA362 //A7
+data8 0x4C600B357337F946 //A8
+data8 0x4DF0F71A52B54CCF //A9
+data8 0x4F8229F3B9FA2C70 //A10
+data8 0x5113A4C4979B770E //A11
+data8 0x52A56BC367F298D5 //A12
+data8 0x543785CF31842DC0 //A13
+data8 0x55C9FC37E3E40896 //A14
+data8 0x575CD5D1BA556C82 //A15
+data8 0x58F00A7AD99A9E08 //A16
+data8 0x5A824088688B008D //A17
+data8 0x5C15F75EF7E08EBD //A18
+data8 0x5DA462EA902F0C90 //A19
+// Polynomial coefficients for right root on [-13, -12]
+// Lgammal is approximated by a polynomial within the [-.154151 ; .154151 ] range
+data8 0x3DC3191752ACFC9D, 0x3A26CB6629532DBF //A0
+data8 0x41BC8CFC051191BD, 0x3E68A84DA4E62AF2 //A1
+data8 0x43797926294A0148, 0x400F345FF3723CFF //A2
+data8 0xF26D2AF700B82625, 0x00004053 //A3
+data8 0xA238B24A4B1F7B15, 0x00004070 //A4
+data8 0xE793B5C0A41A264F, 0x0000408C //A5
+data8 0x4A9585BDDACE863D //A6
+data8 0x4C6075953448088A //A7
+data8 0x4E29B2F38D1FC670 //A8
+data8 0x4FF4619B079C440F //A9
+data8 0x51C05DAE118D8AD9 //A10
+data8 0x538A8C7F87326AD4 //A11
+data8 0x5555B6937588DAB3 //A12
+data8 0x5721E1F8B6E6A7DB //A13
+data8 0x58EDA1D7A77DD6E5 //A14
+data8 0x5AB8A9616B7DC9ED //A15
+data8 0x5C84942AA209ED17 //A16
+data8 0x5E518FC34C6F54EF //A17
+data8 0x601FB3F17BCCD9A0 //A18
+data8 0x61E61128D512FE97 //A19
+// Polynomial coefficients for right root on [-14, -13]
+// Lgammal is approximated by a polynomial within the [-.154151 ; .154151 ] range
+data8 0xBE170D646421B3F5, 0xBAAD95F79FCB5097 //A0
+data8 0x41F7328CBFCD9AC7, 0x3E743B8B1E8AEDB1 //A1
+data8 0x43F0D0FA2DBDA237, 0x40A0422D6A227B55 //A2
+data8 0x82082DF2D32686CC, 0x0000405F //A3
+data8 0x8D64EE9B42E68B43, 0x0000407F //A4
+data8 0xA3FFD82E08C5F1F1, 0x0000409F //A5
+data8 0x4BF8C49D99123454 //A6
+data8 0x4DFEC79DDF11342F //A7
+data8 0x50038615A892F6BD //A8
+data8 0x520929453DB32EF1 //A9
+data8 0x54106A7808189A7F //A10
+data8 0x5615A302D03C207B //A11
+data8 0x581CC175AA736F5E //A12
+data8 0x5A233E071147C017 //A13
+data8 0x5C29E81917243F22 //A14
+data8 0x5E3184B0B5AC4707 //A15
+data8 0x6037C11DE62D8388 //A16
+data8 0x6240787C4B1C9D6C //A17
+data8 0x6448289235E80977 //A18
+data8 0x664B5352C6C3449E //A19
+// Polynomial coefficients for right root on [-15, -14]
+// Lgammal is approximated by a polynomial within the [-.154151 ; .154151 ] range
+data8 0x3E562C2E34A9207D, 0x3ADC00DA3DFF7A83 //A0
+data8 0x42344C3B2F0D90AB, 0x3EB8A2E979F24536 //A1
+data8 0x4469BFFF28B50D07, 0x41181E3D05C1C294 //A2
+data8 0xAE38F64DCB24D9F8, 0x0000406A //A3
+data8 0xA5C3F52C1B350702, 0x0000408E //A4
+data8 0xA83BC857BCD67A1B, 0x000040B2 //A5
+data8 0x4D663B4727B4D80A //A6
+data8 0x4FA82C965B0F7788 //A7
+data8 0x51EAD58C02908D95 //A8
+data8 0x542E427970E073D8 //A9
+data8 0x56714644C558A818 //A10
+data8 0x58B3EC2040C77BAE //A11
+data8 0x5AF72AE6A83D45B1 //A12
+data8 0x5D3B214F611F5D12 //A13
+data8 0x5F7FF5E49C54E92A //A14
+data8 0x61C2E917AB765FB2 //A15
+data8 0x64066FD70907B4C1 //A16
+data8 0x664B3998D60D0F9B //A17
+data8 0x689178710782FA8B //A18
+data8 0x6AD14A66C1C7BEC3 //A19
+// Polynomial coefficients for right root on [-16, -15]
+// Lgammal is approximated by a polynomial within the [-.154151 ; .154151 ] range
+data8 0xBE6D7E7192615BAE, 0xBB0137677D7CC719 //A0
+data8 0x4273077763F6628C, 0x3F09250FB8FC8EC9 //A1
+data8 0x44E6A1BF095B1AB3, 0x4178D5A74F6CB3B3 //A2
+data8 0x8F8E0D5060FCC76E, 0x00004076 //A3
+data8 0x800CC1DCFF092A63, 0x0000409E //A4
+data8 0xF3AB0BA9D14CDA72, 0x000040C5 //A5
+data8 0x4EDE3000A2F6D54F //A6
+data8 0x515EC613B9C8E241 //A7
+data8 0x53E003309FEEEA96 //A8
+data8 0x5660ED908D7C9A90 //A9
+data8 0x58E21E9B517B1A50 //A10
+data8 0x5B639745E4374EE2 //A11
+data8 0x5DE55BB626B2075D //A12
+data8 0x606772B7506BA747 //A13
+data8 0x62E9E581AB2E057B //A14
+data8 0x656CBAD1CF85D396 //A15
+data8 0x67EFF4EBD7989872 //A16
+data8 0x6A722D2B19B7E2F9 //A17
+data8 0x6CF5DEB3073B0743 //A18
+data8 0x6F744AC11550B93A //A19
+// Polynomial coefficients for right root on [-17, -16]
+// Lgammal is approximated by a polynomial within the [-.154151 ; .154151 ] range
+data8 0xBEDCC6291188207E, 0xBB872E3FDD48F5B7 //A0
+data8 0x42B3076EE7525EF9, 0x3F6687A5038CA81C //A1
+data8 0x4566A1AAD96EBCB5, 0x421F0FEDFBF548D2 //A2
+data8 0x8F8D4D3DE9850DBA, 0x00004082 //A3
+data8 0x800BDD6DA2CE1859, 0x000040AE //A4
+data8 0xF3A8EC4C9CDC1CE5, 0x000040D9 //A5
+data8 0x505E2FAFDB812628 //A6
+data8 0x531EC5B3A7508719 //A7
+data8 0x55E002F77E99B628 //A8
+data8 0x58A0ED4C9B4DAE54 //A9
+data8 0x5B621E4A8240F90C //A10
+data8 0x5E2396E5C8849814 //A11
+data8 0x60E55B43D8C5CE71 //A12
+data8 0x63A7722F5D45D01D //A13
+data8 0x6669E4E010DCE45A //A14
+data8 0x692CBA120D5E78F6 //A15
+data8 0x6BEFF4045350B22E //A16
+data8 0x6EB22C9807C21819 //A17
+data8 0x7175DE20D04617C4 //A18
+data8 0x74344AB87C6D655F //A19
+// Polynomial coefficients for right root on [-18, -17]
+// Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
+data8 0xBF28AEEE7B61D77C, 0xBBDBBB5FC57ABF79 //A0
+data8 0x42F436F56B3B8A0C, 0x3FA43EE3C5C576E9 //A1
+data8 0x45E98A22535D115D, 0x42984678BE78CC48 //A2
+data8 0xAC176F3775E6FCFC, 0x0000408E //A3
+data8 0xA3114F53A9FEB922, 0x000040BE //A4
+data8 0xA4D168A8334ABF41, 0x000040EE //A5
+data8 0x51E5B0E7EC7182BB //A6
+data8 0x54E77D67B876EAB6 //A7
+data8 0x57E9F7C30C09C4B6 //A8
+data8 0x5AED29B0488614CA //A9
+data8 0x5DF09486F87E79F9 //A10
+data8 0x60F30B199979654E //A11
+data8 0x63F60E02C7DCCC5F //A12
+data8 0x66F9B8A00EB01684 //A13
+data8 0x69FE2D3ED0700044 //A14
+data8 0x6D01C8363C7DCC84 //A15
+data8 0x700502B29C2F06E3 //A16
+data8 0x730962B4500F4A61 //A17
+data8 0x76103C6ED099192A //A18
+data8 0x79100C7132CFD6E3 //A19
+// Polynomial coefficients for right root on [-19, -18]
+// Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
+data8 0x3F3C19A53328A0C3, 0x3BE04ADC3FBE1458 //A0
+data8 0x4336C16C16C16C19, 0x3FE58CE3AC4A7C28 //A1
+data8 0x46702E85C0898B70, 0x432C922E412CEC6E //A2
+data8 0xF57B99A1C034335D, 0x0000409A //A3
+data8 0x82EC9634223DF909, 0x000040CF //A4
+data8 0x94F66D7557E2EA60, 0x00004103 //A5
+data8 0x5376118B79AE34D0 //A6
+data8 0x56BAE7106D52E548 //A7
+data8 0x5A00BD48CC8E25AB //A8
+data8 0x5D4529722821B493 //A9
+data8 0x608B1654AF31BBC1 //A10
+data8 0x63D182CC98AEA859 //A11
+data8 0x6716D43D5EEB05E8 //A12
+data8 0x6A5DF884FC172E1C //A13
+data8 0x6DA3CA7EBB97976B //A14
+data8 0x70EA416D0BE6D2EF //A15
+data8 0x743176C31EBB65F2 //A16
+data8 0x7777C401A8715CF9 //A17
+data8 0x7AC1110C6D350440 //A18
+data8 0x7E02D0971CF84865 //A19
+// Polynomial coefficients for right root on [-20, -19]
+// Lgammal is approximated by polynomial within [-.154151 ; .154151 ] range
+data8 0xBFAB767F9BE21803, 0xBC5ACEF5BB1BD8B5 //A0
+data8 0x4379999999999999, 0x4029241C7F5914C8 //A1
+data8 0x46F47AE147AE147A, 0x43AC2979B64B9D7E //A2
+data8 0xAEC33E1F67152993, 0x000040A7 //A3
+data8 0xD1B71758E219616F, 0x000040DF //A4
+data8 0x8637BD05AF6CF468, 0x00004118 //A5
+data8 0x55065E9F80F293DE //A6
+data8 0x588EADA78C44EE66 //A7
+data8 0x5C15798EE22DEF09 //A8
+data8 0x5F9E8ABFD644FA63 //A9
+data8 0x6325FD7FE29BD7CD //A10
+data8 0x66AFFC5C57E1F802 //A11
+data8 0x6A3774CD7D5C0181 //A12
+data8 0x6DC152724DE2A6FE //A13
+data8 0x7149BB138EB3D0C2 //A14
+data8 0x74D32FF8A70896C2 //A15
+data8 0x785D3749F9C72BD7 //A16
+data8 0x7BE5CCF65EBC4E40 //A17
+data8 0x7F641A891B5FC652 //A18
+data8 0x7FEFFFFFFFFFFFFF //A19
+LOCAL_OBJECT_END(lgammal_right_roots_polynomial_data)
+
+LOCAL_OBJECT_START(lgammal_left_roots_polynomial_data)
+// Polynomial coefficients used near the "left" roots of lgammal on the unit
+// intervals [-3, -2] through [-19, -18]: 17 sub-tables, one per interval,
+// stored in that order.
+// Every sub-table holds coefficients A0..A19 in the same layout:
+//   A0..A2  - two data8 words each (presumably the high and low parts of the
+//             coefficient - confirm against the code that loads them)
+//   A3..A5  - two data8 words each (presumably a 64-bit significand followed
+//             by the sign/exponent of an extended-precision value - confirm)
+//   A6..A19 - one data8 word each
+// That is 26 data8 words (208 bytes) per sub-table.
+// Polynomial coefficients for left root on [-3, -2]
+// Lgammal is approximated by polynomial within [.084641 ; -.059553 ] range
+data8 0xBC0844590979B82E, 0xB8BC7CE8CE2ECC3B //A0
+data8 0xBFFEA12DA904B18C, 0xBC91A6B2BAD5EF6E //A1
+data8 0x4023267F3C265A51, 0x3CD7055481D03AED //A2
+data8 0xA0C2D618645F8E00, 0x0000C003 //A3
+data8 0xFA8256664F8CD2BE, 0x00004004 //A4
+data8 0xC2C422C103F57158, 0x0000C006 //A5
+data8 0x4084373F7CC70AF5 //A6
+data8 0xC0A12239BDD6BB95 //A7
+data8 0x40BDBA65E2709397 //A8
+data8 0xC0DA2D2504DFB085 //A9
+data8 0x40F758173CA5BF3C //A10
+data8 0xC11506C65C267E72 //A11
+data8 0x413318EE3A6B05FC //A12
+data8 0xC1517767F247DA98 //A13
+data8 0x41701237B4754D73 //A14
+data8 0xC18DB8A03BC5C3D8 //A15
+data8 0x41AB80953AC14A07 //A16
+data8 0xC1C9B7B76638D0A4 //A17
+data8 0x41EA727E3033E2D9 //A18
+data8 0xC20812C297729142 //A19
+//
+// Polynomial coefficients for left root on [-4, -3]
+// Lgammal is approximated by polynomial within [.147147 ; -.145158 ] range
+data8 0xBC3130AE5C4F54DB, 0xB8ED23294C13398A //A0
+data8 0xC034B99D966C5646, 0xBCE2E5FE3BC3DBB9 //A1
+data8 0x406F76DEAE0436BD, 0x3D14974DDEC057BD //A2
+data8 0xE929ACEA5979BE96, 0x0000C00A //A3
+data8 0xF47C14F8A0D52771, 0x0000400E //A4
+data8 0x88B7BC036937481C, 0x0000C013 //A5
+data8 0x4173E8F3AB9FC266 //A6
+data8 0xC1B7DBBE062FB11B //A7
+data8 0x41FD2F76DE7A47A7 //A8
+data8 0xC242225FE53B124D //A9
+data8 0x4286D12AE2FBFA30 //A10
+data8 0xC2CCFFC267A3C4C0 //A11
+data8 0x431294E10008E014 //A12
+data8 0xC357FAC8C9A2DF6A //A13
+data8 0x439F2190AB9FAE01 //A14
+data8 0xC3E44C1D8E8C67C3 //A15
+data8 0x442A8901105D5A38 //A16
+data8 0xC471C4421E908C3A //A17
+data8 0x44B92CD4D59D6D17 //A18
+data8 0xC4FB3A078B5247FA //A19
+// Polynomial coefficients for left root on [-5, -4]
+// Lgammal is approximated by polynomial within [.155671 ; -.155300 ] range
+data8 0xBC57BF3C6E8A94C1, 0xB902FB666934AC9E //A0
+data8 0xC05D224A3EF9E41F, 0xBCF6F5713913E440 //A1
+data8 0x40BB533C678A3955, 0x3D688E53E3C72538 //A2
+data8 0x869FBFF732E99B84, 0x0000C012 //A3
+data8 0xBA9537AD61392DEC, 0x00004018 //A4
+data8 0x89EAE8B1DEA06B05, 0x0000C01F //A5
+data8 0x425A8C5C53458D3C //A6
+data8 0xC2C5068B3ED6509B //A7
+data8 0x4330FFA575E99B4E //A8
+data8 0xC39BEC12DDDF7669 //A9
+data8 0x44073825725F74F9 //A10
+data8 0xC47380EBCA299047 //A11
+data8 0x44E084DD9B666437 //A12
+data8 0xC54C2DA6BF787ACF //A13
+data8 0x45B82D65C8D6FA42 //A14
+data8 0xC624D62113FE950A //A15
+data8 0x469200CC19B45016 //A16
+data8 0xC6FFDDC6DD938E2E //A17
+data8 0x476DD7C07184B9F9 //A18
+data8 0xC7D554A30085C052 //A19
+// Polynomial coefficients for left root on [-6, -5]
+// Lgammal is approximated by polynomial within [.157425 ; -.157360 ] range
+data8 0x3C9E20A87C8B79F1, 0x39488BE34B2427DB //A0
+data8 0xC08661F6A43A5E12, 0xBD3D912526D759CC //A1
+data8 0x410F79DCB794F270, 0x3DB9BEE7CD3C1BF5 //A2
+data8 0xEB7404450D0005DB, 0x0000C019 //A3
+data8 0xF7AE9846DFE4D4AB, 0x00004022 //A4
+data8 0x8AF535855A95B6DA, 0x0000C02C //A5
+data8 0x43544D54E9FE240E //A6
+data8 0xC3E8684E40CE6CFC //A7
+data8 0x447DF44C1D803454 //A8
+data8 0xC512AC305439B2BA //A9
+data8 0x45A79226AF79211A //A10
+data8 0xC63E0DFF7244893A //A11
+data8 0x46D35216C3A83AF3 //A12
+data8 0xC76903BE0C390E28 //A13
+data8 0x48004A4DECFA4FD5 //A14
+data8 0xC8954FBD243DB8BE //A15
+data8 0x492BF3A31EB18DDA //A16
+data8 0xC9C2C6A864521F3A //A17
+data8 0x4A5AB127C62E8DA1 //A18
+data8 0xCAECF60EF3183C57 //A19
+// Polynomial coefficients for left root on [-7, -6]
+// Lgammal is approximated by polynomial within [.157749 ; -.157739 ] range
+data8 0x3CC9B9E8B8D551D6, 0x3961813C8E1E10DB //A0
+data8 0xC0B3ABF7A5CEA91F, 0xBD55638D4BCB4CC4 //A1
+data8 0x4168349A25504236, 0x3E0287ECE50CCF76 //A2
+data8 0x9EC8ED6E4C219E67, 0x0000C022 //A3
+data8 0x9279EB1B799A3FF3, 0x0000402E //A4
+data8 0x90213EF8D9A5DBCF, 0x0000C03A //A5
+data8 0x4462775E857FB71C //A6
+data8 0xC52377E70B45FDBF //A7
+data8 0x45E4F3D28EDA8C28 //A8
+data8 0xC6A6E85571BD2D0B //A9
+data8 0x47695BB17E74DF74 //A10
+data8 0xC82C5AC0ED6A662F //A11
+data8 0x48EFF8159441C2E3 //A12
+data8 0xC9B22602C1B68AE5 //A13
+data8 0x4A74BA8CE7B34100 //A14
+data8 0xCB37C7E208482E4B //A15
+data8 0x4BFB5A1D57352265 //A16
+data8 0xCCC01CB3021212FF //A17
+data8 0x4D841613AC3431D1 //A18
+data8 0xCE431C9E9EE43AD9 //A19
+// Polynomial coefficients for left root on [-8, -7]
+// Lgammal is approximated by polynomial within [.157799 ; -.157798 ] range
+data8 0xBCF9C7A33AD9478C, 0xB995B0470F11E5ED //A0
+data8 0xC0E3AF76FE4C2F8B, 0xBD8DBCD503250511 //A1
+data8 0x41C838E76CAAF0D5, 0x3E5D79F5E2E069C3 //A2
+data8 0x9EF345992B262CE0, 0x0000C02B //A3
+data8 0x92AE0292985FD559, 0x0000403A //A4
+data8 0x90615420C08F7D8C, 0x0000C049 //A5
+data8 0x45828139342CEEB7 //A6
+data8 0xC67384066C31E2D3 //A7
+data8 0x476502BC4DAC2C35 //A8
+data8 0xC856FAADFF22ADC6 //A9
+data8 0x49497243255AB3CE //A10
+data8 0xCA3C768489520F6B //A11
+data8 0x4B300D1EA47AF838 //A12
+data8 0xCC223B0508AC620E //A13
+data8 0x4D14D46583338CD8 //A14
+data8 0xCE07E7A87AA068E4 //A15
+data8 0x4EFB811AD2F8BEAB //A16
+data8 0xCFF0351B51508523 //A17
+data8 0x50E4364CCBF53100 //A18
+data8 0xD1D33CFD0BF96FA6 //A19
+// Polynomial coefficients for left root on [-9, -8]
+// Lgammal is approximated by polynomial within [.157806 ; -.157806 ] range
+data8 0x3D333E4438B1B9D4, 0x39E7B956B83964C1 //A0
+data8 0xC11625EDFC63DCD8, 0xBDCF39625709EFAC //A1
+data8 0x422EA8C150480F16, 0x3EC16ED908AB7EDD //A2
+data8 0xE2598725E2E11646, 0x0000C034 //A3
+data8 0xEAFF2346DE3EBC98, 0x00004046 //A4
+data8 0x821E90DE12A0F05F, 0x0000C059 //A5
+data8 0x46B2C334AE5366FE //A6
+data8 0xC7D64314B43191B6 //A7
+data8 0x48FAF6ED5899E01B //A8
+data8 0xCA2096E4472AF37D //A9
+data8 0x4B44AAF49FB7E4C8 //A10
+data8 0xCC6A02469F2BD920 //A11
+data8 0x4D9080626D2EFC07 //A12
+data8 0xCEB515EDCF0695F7 //A13
+data8 0x4FDB1AC69BF36960 //A14
+data8 0xD1017F8274339270 //A15
+data8 0x5226A684961BAE2F //A16
+data8 0xD34E085C088404A5 //A17
+data8 0x547511892FF8960E //A18
+data8 0xD5968FA3B1ED67A9 //A19
+// Polynomial coefficients for left root on [-10, -9]
+// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
+data8 0xBD355818A2B42BA2, 0xB9B7320B6A0D61EA //A0
+data8 0xC14BAF7DA5F3770E, 0xBDE64AF9A868F719 //A1
+data8 0x4297F3E8791F9CD3, 0x3F2A553E59B4835E //A2
+data8 0xDD0C5F7E551BD13C, 0x0000C03E //A3
+data8 0x8F6F0A3B2EB08BBB, 0x00004054 //A4
+data8 0xC68D4D5AD230BA08, 0x0000C069 //A5
+data8 0x47F1E4D8C35D1A3E //A6
+data8 0xC94A8A191DB0A466 //A7
+data8 0x4AA4174F65FE6AE8 //A8
+data8 0xCBFEE6D90F94E9DD //A9
+data8 0x4D580FD3438BE16C //A10
+data8 0xCEB2ECD456D50224 //A11
+data8 0x500E049F7FE64546 //A12
+data8 0xD167F92D9600F378 //A13
+data8 0x52C342AE2B43261A //A14
+data8 0xD41F15DEEDA4B67E //A15
+data8 0x55792638748AFB7D //A16
+data8 0xD6D4D760074F6E6B //A17
+data8 0x5832469D58ED3FA9 //A18
+data8 0xD988769F3DC76642 //A19
+// Polynomial coefficients for left root on [-11, -10]
+// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
+data8 0xBDA050601F39778A, 0xBA0D4D1CE53E8241 //A0
+data8 0xC18308A7D8EA4039, 0xBE370C379D3EAD41 //A1
+data8 0x4306A49380644E6C, 0x3FBBB143C0E7B5C8 //A2
+data8 0x8FA8FB233E4AA6D2, 0x0000C049 //A3
+data8 0x802CC9D8AEAC207D, 0x00004062 //A4
+data8 0xF3F73EE651A37A13, 0x0000C07A //A5
+data8 0x493E3B550A7B9568 //A6
+data8 0xCACED38DAA060929 //A7
+data8 0x4C600B346BAB3BC6 //A8
+data8 0xCDF0F719193E3D26 //A9
+data8 0x4F8229F24528B151 //A10
+data8 0xD113A4C2D32FBBE2 //A11
+data8 0x52A56BC13DC4474D //A12
+data8 0xD43785CFAF5E3CE3 //A13
+data8 0x55C9FC3EA5941202 //A14
+data8 0xD75CD545A3341AF5 //A15
+data8 0x58F009911F77C282 //A16
+data8 0xDA8246294D210BEC //A17
+data8 0x5C1608AAC32C3A8E //A18
+data8 0xDDA446E570A397D5 //A19
+// Polynomial coefficients for left root on [-12, -11]
+// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
+data8 0x3DEACBB3081C502E, 0x3A8AA6F01DEDF745 //A0
+data8 0xC1BC8CFBFB0A9912, 0xBE6556B6504A2AE6 //A1
+data8 0x43797926206941D7, 0x40289A9644C2A216 //A2
+data8 0xF26D2A78446D0839, 0x0000C053 //A3
+data8 0xA238B1D937FFED38, 0x00004070 //A4
+data8 0xE793B4F6DE470538, 0x0000C08C //A5
+data8 0x4A9585BDC44DC45D //A6
+data8 0xCC60759520342C47 //A7
+data8 0x4E29B2F3694C0404 //A8
+data8 0xCFF4619AE7B6BBAB //A9
+data8 0x51C05DADF52B89E8 //A10
+data8 0xD38A8C7F48819A4A //A11
+data8 0x5555B6932D687860 //A12
+data8 0xD721E1FACB6C1B5B //A13
+data8 0x58EDA1E2677C8F91 //A14
+data8 0xDAB8A8EC523C1F71 //A15
+data8 0x5C84930133F30411 //A16
+data8 0xDE51952FDFD1EC49 //A17
+data8 0x601FCCEC1BBD25F1 //A18
+data8 0xE1E5F2D76B610920 //A19
+// Polynomial coefficients for left root on [-13, -12]
+// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
+data8 0xBE01612F373268ED, 0xBA97B7A18CDF103B //A0
+data8 0xC1F7328CBF7A4FAC, 0xBE89A25A6952F481 //A1
+data8 0x43F0D0FA2DBDA237, 0x40A0422EC1CE6084 //A2
+data8 0x82082DF2D32686C5, 0x0000C05F //A3
+data8 0x8D64EE9B42E68B36, 0x0000407F //A4
+data8 0xA3FFD82E08C630C9, 0x0000C09F //A5
+data8 0x4BF8C49D99123466 //A6
+data8 0xCDFEC79DDF1119ED //A7
+data8 0x50038615A892D242 //A8
+data8 0xD20929453DC8B537 //A9
+data8 0x54106A78083BA1EE //A10
+data8 0xD615A302C69E27B2 //A11
+data8 0x581CC175870FF16F //A12
+data8 0xDA233E0979E12B74 //A13
+data8 0x5C29E822BC568C80 //A14
+data8 0xDE31845DB5340FBC //A15
+data8 0x6037BFC6D498D5F9 //A16
+data8 0xE2407D92CD613E82 //A17
+data8 0x64483B9B62367EB7 //A18
+data8 0xE64B2DC830E8A799 //A19
+// Polynomial coefficients for left root on [-14, -13]
+// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
+data8 0x3E563D0B930B371F, 0x3AE779957E14F012 //A0
+data8 0xC2344C3B2F083767, 0xBEC0B7769AA3DD66 //A1
+data8 0x4469BFFF28B50D07, 0x41181E3F13ED2401 //A2
+data8 0xAE38F64DCB24D9EE, 0x0000C06A //A3
+data8 0xA5C3F52C1B3506F2, 0x0000408E //A4
+data8 0xA83BC857BCD6BA92, 0x0000C0B2 //A5
+data8 0x4D663B4727B4D81A //A6
+data8 0xCFA82C965B0F62E9 //A7
+data8 0x51EAD58C02905B71 //A8
+data8 0xD42E427970FA56AD //A9
+data8 0x56714644C57D8476 //A10
+data8 0xD8B3EC2037EC95F2 //A11
+data8 0x5AF72AE68BBA5B3D //A12
+data8 0xDD3B2152C67AA6B7 //A13
+data8 0x5F7FF5F082861B8B //A14
+data8 0xE1C2E8BE125A5B7A //A15
+data8 0x64066E92FE9EBE7D //A16
+data8 0xE64B4201CDF9F138 //A17
+data8 0x689186351E58AA88 //A18
+data8 0xEAD132A585DFC60A //A19
+// Polynomial coefficients for left root on [-15, -14]
+// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
+data8 0xBE6D7DDE12700AC1, 0xBB1E025BF1667FB5 //A0
+data8 0xC273077763F60AD5, 0xBF2A1698184C7A9A //A1
+data8 0x44E6A1BF095B1AB3, 0x4178D5AE8A4A2874 //A2
+data8 0x8F8E0D5060FCC767, 0x0000C076 //A3
+data8 0x800CC1DCFF092A57, 0x0000409E //A4
+data8 0xF3AB0BA9D14D37D1, 0x0000C0C5 //A5
+data8 0x4EDE3000A2F6D565 //A6
+data8 0xD15EC613B9C8C800 //A7
+data8 0x53E003309FEECCAA //A8
+data8 0xD660ED908D8B15C4 //A9
+data8 0x58E21E9B51A1C4AE //A10
+data8 0xDB639745DB82210D //A11
+data8 0x5DE55BB60C68FCF6 //A12
+data8 0xE06772BA3FCA23C6 //A13
+data8 0x62E9E58B4F702C31 //A14
+data8 0xE56CBA49B071ABE2 //A15
+data8 0x67EFF31E4F2BA36A //A16
+data8 0xEA7232C8804F32C3 //A17
+data8 0x6CF5EFEE929A0928 //A18
+data8 0xEF742EE03EC3E8FF //A19
+// Polynomial coefficients for left root on [-16, -15]
+// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
+data8 0xBEDCC628FEAC7A1B, 0xBB80582C8BEBB198 //A0
+data8 0xC2B3076EE752595E, 0xBF5388F55AFAE53E //A1
+data8 0x4566A1AAD96EBCB5, 0x421F0FEFE2444293 //A2
+data8 0x8F8D4D3DE9850DB2, 0x0000C082 //A3
+data8 0x800BDD6DA2CE184C, 0x000040AE //A4
+data8 0xF3A8EC4C9CDC7A43, 0x0000C0D9 //A5
+data8 0x505E2FAFDB81263F //A6
+data8 0xD31EC5B3A7506CD9 //A7
+data8 0x55E002F77E999810 //A8
+data8 0xD8A0ED4C9B5C2900 //A9
+data8 0x5B621E4A8267C401 //A10
+data8 0xDE2396E5BFCFDA7A //A11
+data8 0x60E55B43BE6F9A79 //A12
+data8 0xE3A772324C7405FA //A13
+data8 0x6669E4E9B7E57A2D //A14
+data8 0xE92CB989F8A8FB37 //A15
+data8 0x6BEFF2368849A36E //A16
+data8 0xEEB23234FE191D55 //A17
+data8 0x7175EF5D1080B105 //A18
+data8 0xF4342ED7B1B7BE31 //A19
+// Polynomial coefficients for left root on [-17, -16]
+// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
+data8 0xBF28AEEE7B58C790, 0xBBC4448DE371FA0A //A0
+data8 0xC2F436F56B3B89B1, 0xBF636755245AC63A //A1
+data8 0x45E98A22535D115D, 0x4298467DA93DB784 //A2
+data8 0xAC176F3775E6FCF2, 0x0000C08E //A3
+data8 0xA3114F53A9FEB908, 0x000040BE //A4
+data8 0xA4D168A8334AFE5A, 0x0000C0EE //A5
+data8 0x51E5B0E7EC7182CF //A6
+data8 0xD4E77D67B876D6B4 //A7
+data8 0x57E9F7C30C098C83 //A8
+data8 0xDAED29B0489EF7A7 //A9
+data8 0x5DF09486F8A524B8 //A10
+data8 0xE0F30B19910A2393 //A11
+data8 0x63F60E02AB3109F4 //A12
+data8 0xE6F9B8A3431854D5 //A13
+data8 0x69FE2D4A6D94218E //A14
+data8 0xED01C7E272A73560 //A15
+data8 0x7005017D82B186B6 //A16
+data8 0xF3096A81A69BD8AE //A17
+data8 0x76104951BAD67D5C //A18
+data8 0xF90FECC99786FD5B //A19
+// Polynomial coefficients for left root on [-18, -17]
+// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
+data8 0x3F3C19A53328E26A, 0x3BE238D7BA036B3B //A0
+data8 0xC336C16C16C16C13, 0xBFEACE245DEC56F3 //A1
+data8 0x46702E85C0898B70, 0x432C922B64FD1DA4 //A2
+data8 0xF57B99A1C0343350, 0x0000C09A //A3
+data8 0x82EC9634223DF90D, 0x000040CF //A4
+data8 0x94F66D7557E3237D, 0x0000C103 //A5
+data8 0x5376118B79AE34D6 //A6
+data8 0xD6BAE7106D52CE49 //A7
+data8 0x5A00BD48CC8E11AB //A8
+data8 0xDD4529722833E2DF //A9
+data8 0x608B1654AF5F46AF //A10
+data8 0xE3D182CC90D8723F //A11
+data8 0x6716D43D46706AA0 //A12
+data8 0xEA5DF888C5B428D3 //A13
+data8 0x6DA3CA85888931A6 //A14
+data8 0xF0EA40EF2AC7E070 //A15
+data8 0x743175D1A251AFCD //A16
+data8 0xF777CB6E2B550D73 //A17
+data8 0x7AC11E468A134A51 //A18
+data8 0xFE02B6BDD0FC40AA //A19
+// Polynomial coefficients for left root on [-19, -18]
+// Lgammal is approximated by polynomial within [.157807 ; -.157807 ] range
+data8 0xBFAB767F9BE217FC, 0xBC4A5541CE0D8D0D //A0
+data8 0xC379999999999999, 0xC01A84981B490BE8 //A1
+data8 0x46F47AE147AE147A, 0x43AC2987BBC466EB //A2
+data8 0xAEC33E1F67152987, 0x0000C0A7 //A3
+data8 0xD1B71758E2196153, 0x000040DF //A4
+data8 0x8637BD05AF6D420E, 0x0000C118 //A5
+data8 0x55065E9F80F293B2 //A6
+data8 0xD88EADA78C44BFA7 //A7
+data8 0x5C15798EE22EC6CD //A8
+data8 0xDF9E8ABFD67895CF //A9
+data8 0x6325FD7FE13B0DE0 //A10
+data8 0xE6AFFC5C3DE70858 //A11
+data8 0x6A3774CE81C70D43 //A12
+data8 0xEDC1527412D8129F //A13
+data8 0x7149BABCDA8B7A72 //A14
+data8 0xF4D330AD49071BB5 //A15
+data8 0x785D4046F4C5F1FD //A16
+data8 0xFBE59BFEDBA73FAF //A17
+data8 0x7F64BEF2B2EC8DA1 //A18
+// NOTE(review): read as an IEEE double, the A19 word below is the
+// largest-magnitude finite negative value; presumably the true coefficient
+// does not fit and was saturated - confirm.
+data8 0xFFEFFFFFFFFFFFFF //A19
+LOCAL_OBJECT_END(lgammal_left_roots_polynomial_data)
+
+
+//==============================================================
+// Code
+//==============================================================
+
+.section .text
+GLOBAL_LIBM_ENTRY(__libm_lgammal)
+{ .mfi
+ getf.exp rSignExpX = f8
+ // Test x for NaTVal, NaN, +/-0, +/-INF, denormals
+ fclass.m p6,p0 = f8,0x1EF
+ addl r17Ones = 0x1FFFF, r0 // exponent mask
+}
+{ .mfi
+ addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp
+ fcvt.fx.s1 fXint = f8 // Convert arg to int (int repres. in FR)
+ adds rDelta = 0x3FC, r0
+}
+;;
+{ .mfi
+ getf.sig rSignifX = f8
+ fcmp.lt.s1 p15, p14 = f8, f0
+ shl rDelta = rDelta, 20 // single precision 1.5
+}
+{ .mfi
+ ld8 GR_ad_z_1 = [GR_ad_z_1]// get pointer to Constants_Z_1
+ fma.s1 fTwo = f1, f1, f1 // 2.0
+ addl rExp8 = 0x10002, r0 // exponent of 8.0
+}
+;;
+{ .mfi
+ alloc rPFS_SAVED = ar.pfs, 0, 34, 4, 0 // get some registers
+ fmerge.s fAbsX = f1, f8 // |x|
+ and rExpX = rSignExpX, r17Ones // mask sign bit
+}
+{ .mib
+ addl rExpHalf = 0xFFFE, r0 // exponent of 0.5
+ addl rExp2 = 0x10000, r0 // exponent of 2.0
+ // branch out if x is NaTVal, NaN, +/-0, +/-INF, or denormalized number
+(p6) br.cond.spnt lgammal_spec
+}
+;;
+_deno_back_to_main_path:
+{ .mfi
+ // Point to Constants_G_H_h1
+ add rTbl1Addr = 0x040, GR_ad_z_1
+ frcpa.s1 fRcpX, p0 = f1, f8 // initial approximation of 1/x
+ extr.u GR_Index1 = rSignifX, 59, 4
+}
+{ .mib
+(p14) cmp.ge.unc p8, p0 = rExpX, rExp8 // p8 = 1 if x >= 8.0
+ adds rZ625 = 0x3F2, r0
+(p8) br.cond.spnt lgammal_big_positive // branch out if x >= 8.0
+}
+;;
+{ .mfi
+ shladd rZ1offsett = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
+ fmerge.se fSignifX = f1, f8 // sifnificand of x
+ // Get high 15 bits of significand
+ extr.u GR_X_0 = rSignifX, 49, 15
+}
+{ .mib
+ cmp.lt.unc p9, p0 = rExpX, rExpHalf // p9 = 1 if |x| < 0.5
+ // set p11 if 2 <= x < 4
+(p14) cmp.eq.unc p11, p0 = rExpX, rExp2
+(p9) br.cond.spnt lgammal_0_half // branch out if |x| < 0.5
+}
+;;
+{ .mfi
+ ld4 GR_Z_1 = [rZ1offsett] // Load Z_1
+ fms.s1 fA5L = f1, f1, f8 // for 0.75 <= x < 1.3125 path
+ shl rZ625 = rZ625, 20 // sinfle precision 0.625
+}
+{ .mib
+ setf.s FR_MHalf = rDelta
+ // set p10 if x >= 4.0
+(p14) cmp.gt.unc p10, p0 = rExpX, rExp2
+ // branch to special path for 4.0 <= x < 8
+(p10) br.cond.spnt lgammal_4_8
+}
+;;
+{ .mfi
+ // for 1.3125 <= x < 1.5625 path
+ addl rPolDataPtr= @ltoff(lgammal_loc_min_data),gp
+ // argument of polynomial approximation for 1.5625 <= x < 2.25
+ fms.s1 fB4 = f8, f1, fTwo
+ cmp.eq p12, p0 = rExpX, rExpHalf
+}
+{ .mib
+ addl rExpOne = 0xFFFF, r0 // exponent of 1.0
+ // set p11 if significand of x >= 1.125
+(p11) cmp.le p11, p0 = 2, GR_Index1
+(p11) br.cond.spnt lgammal_2Q_4
+}
+;;
+{ .mfi
+ // point to xMin for 1.3125 <= x < 1.5625 path
+ ld8 rPolDataPtr = [rPolDataPtr]
+ fcvt.xf fFltIntX = fXint // RTN(x)
+(p14) cmp.eq.unc p13, p7 = rExpX, rExpOne // p13 set if 1.0 <= x < 2.0
+}
+{ .mib
+ setf.s FR_FracX = rZ625
+ // set p12 if |x| < 0.75
+(p12) cmp.gt.unc p12, p0 = 8, GR_Index1
+ // branch out to special path for |x| < 0.75
+(p12) br.cond.spnt lgammal_half_3Q
+}
+;;
+.pred.rel "mutex", p7, p13
+{ .mfi
+ getf.sig rXRnd = fXint // integer part of the input value
+ fnma.s1 fInvX = f8, fRcpX, f1 // start of 1st NR iteration
+ // Get bits 30-15 of X_0 * Z_1
+ pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15
+}
+{ .mib
+(p7) cmp.eq p6, p0 = rExpX, rExp2 // p6 set if 2.0 <= x < 2.25
+(p13) cmp.le p6, p0 = 9, GR_Index1
+ // branch to special path 1.5625 <= x < 2.25
+(p6) br.cond.spnt lgammal_13Q_2Q
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ shladd GR_ad_tbl_1 = GR_Index1, 4, rTbl1Addr // Point to G_1
+ fma.s1 fSix = fTwo, fTwo, fTwo // 6.0
+ add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_Q
+}
+{ .mib
+ add rTmpPtr3 = -0x50, GR_ad_z_1
+(p13) cmp.gt p7, p0 = 5, GR_Index1
+ // branch to special path 0.75 <= x < 1.3125
+(p7) br.cond.spnt lgammal_03Q_1Q
+}
+;;
+{ .mfi
+ add rTmpPtr = 8, GR_ad_tbl_1
+ fma.s1 fRoot = f8, f1, f1 // x + 1
+ // Absolute value of int arg. Will be used as index in table with roots
+ sub rXRnd = r0, rXRnd
+}
+{ .mib
+ ldfe fA5L = [rPolDataPtr], 16 // xMin
+ addl rNegSingularity = 0x3003E, r0
+(p14) br.cond.spnt lgammal_loc_min
+}
+;;
+{ .mfi
+ ldfps FR_G, FR_H = [GR_ad_tbl_1], 8 // Load G_1, H_1
+ nop.f 0
+ add rZ2Addr = 0x140, GR_ad_z_1 // Point to Constants_Z_2
+}
+{ .mib
+ ldfd FR_h = [rTmpPtr] // Load h_1
+ // If arg is less or equal to -2^63
+ cmp.geu.unc p8,p0 = rSignExpX, rNegSingularity
+ // Singularity for x < -2^63 since all such arguments are integers
+ // branch to special code which deals with singularity
+(p8) br.cond.spnt lgammal_singularity
+}
+;;
+{ .mfi
+ ldfe FR_log2_hi = [GR_ad_q], 32 // Load log2_hi
+ nop.f 0
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+}
+{ .mfi
+ ldfe FR_log2_lo = [rTmpPtr3], 32 // Load log2_lo
+ fms.s1 fDx = f8, f1, fFltIntX // x - RTN(x)
+ // index in table with roots and bounds
+ adds rXint = -2, rXRnd
+}
+;;
+{ .mfi
+ ldfe FR_Q4 = [GR_ad_q], 32 // Load Q4
+ nop.f 0
+ // set p12 if x may be close to negative root: -19.5 < x < -2.0
+ cmp.gtu p12, p0 = 18, rXint
+}
+{ .mfi
+ shladd GR_ad_z_2 = GR_Index2, 2, rZ2Addr // Point to Z_2
+ fma.s1 fRcpX = fInvX, fRcpX, fRcpX // end of 1st NR iteration
+ // Point to Constants_G_H_h2
+ add rTbl2Addr = 0x180, GR_ad_z_1
+}
+;;
+{ .mfi
+ shladd GR_ad_tbl_2 = GR_Index2, 4, rTbl2Addr // Point to G_2
+ // set p9 if x is integer and negative
+ fcmp.eq.s1 p9, p0 = f8,fFltIntX
+ // Point to Constants_G_H_h3
+ add rTbl3Addr = 0x280, GR_ad_z_1
+}
+{ .mfi
+ ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
+ nop.f 0
+ sub GR_N = rExpX, rExpHalf, 1
+}
+;;
+{ .mfi
+ ldfe FR_Q3 = [rTmpPtr3], 32 // Load Q3
+ nop.f 0
+ // Point to lnsin polynomial coefficients
+ adds rLnSinDataPtr = 864, rTbl3Addr
+}
+{ .mfi
+ ldfe FR_Q2 = [GR_ad_q],32 // Load Q2
+ nop.f 0
+ add rTmpPtr = 8, GR_ad_tbl_2
+}
+;;
+{ .mfi
+ ldfe FR_Q1 = [rTmpPtr3] // Load Q1
+ fcmp.lt.s1 p0, p15 = fAbsX, fSix // p15 is set when x < -6.0
+ // point to table with roots and bounds
+ adds rRootsBndAddr = -1296, GR_ad_z_1
+}
+{ .mfb
+ // Put integer N into rightmost significand
+ setf.sig fFloatN = GR_N
+ fma.s1 fThirteen = fSix, fTwo, f1 // 13.0
+ // Singularity if -2^63 < x < 0 and x is integer
+ // branch to special code which deals with singularity
+(p9) br.cond.spnt lgammal_singularity
+}
+;;
+{ .mfi
+ ldfps FR_G2, FR_H2 = [GR_ad_tbl_2] // Load G_2, H_2
+ // y = |x|/2^(exponent(x)) - 1.5
+ fms.s1 FR_FracX = fSignifX, f1, FR_MHalf
+ // Get bits 30-15 of X_1 * Z_2
+ pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15
+}
+{ .mfi
+ ldfd FR_h2 = [rTmpPtr] // Load h_2
+ fma.s1 fDxSqr = fDx, fDx, f0 // deltaX^2
+ adds rTmpPtr3 = 128, rLnSinDataPtr
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ getf.exp rRoot = fRoot // sign and biased exponent of (x + 1)
+ nop.f 0
+ // set p6 if -4 < x <= -2
+ cmp.eq p6, p0 = rExpX, rExp2
+}
+{ .mfi
+ ldfpd fLnSin2, fLnSin2L = [rLnSinDataPtr], 16
+ fnma.s1 fInvX = f8, fRcpX, f1 // start of 2nd NR iteration
+ sub rIndexPol = rExpX, rExpHalf // index of polynom
+}
+;;
+{ .mfi
+ ldfe fLnSin4 = [rLnSinDataPtr], 96
+ // p10 is set if x is potential "right" root
+ // p11 set for possible "left" root
+ fcmp.lt.s1 p10, p11 = fDx, f0
+ shl rIndexPol = rIndexPol, 6 // (i*16)*4
+}
+{ .mfi
+ ldfpd fLnSin18, fLnSin20 = [rTmpPtr3], 16
+ nop.f 0
+ mov rExp2tom7 = 0x0fff8 // Exponent of 2^-7
+}
+;;
+{ .mfi
+ getf.sig rSignifDx = fDx // Get significand of RTN(x)
+ nop.f 0
+ // set p6 if -4 < x <= -3.0
+(p6) cmp.le.unc p6, p0 = 0x8, GR_Index1
+}
+{ .mfi
+ ldfpd fLnSin22, fLnSin24 = [rTmpPtr3], 16
+ nop.f 0
+ // mask sign bit in the exponent of (x + 1)
+ and rRoot = rRoot, r17Ones
+}
+;;
+{ .mfi
+ ldfe fLnSin16 = [rLnSinDataPtr], -80
+ nop.f 0
+ extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
+}
+{ .mfi
+ ldfpd fLnSin26, fLnSin28 = [rTmpPtr3], 16
+ nop.f 0
+ and rXRnd = 1, rXRnd
+}
+;;
+{ .mfi
+ shladd GR_ad_tbl_3 = GR_Index3, 4, rTbl3Addr // Point to G_3
+ fms.s1 fDxSqrL = fDx, fDx, fDxSqr // low part of deltaX^2
+ // potential "left" root
+(p11) adds rRootsBndAddr = 560, rRootsBndAddr
+}
+{ .mib
+ ldfpd fLnSin30, fLnSin32 = [rTmpPtr3], 16
+ // set p7 if |x+1| < 2^-7
+ cmp.lt p7, p0 = rRoot, rExp2tom7
+ // branch to special path for |x+1| < 2^-7
+(p7) br.cond.spnt _closeToNegOne
+}
+;;
+{ .mfi
+ ldfps FR_G3, FR_H3 = [GR_ad_tbl_3], 8 // Load G_3, H_3
+ fcmp.lt.s1 p14, p0 = fAbsX, fThirteen // set p14 if x > -13.0
+ // base address of polynomial on range [-6.0, -0.75]
+ adds rPolDataPtr = 3440, rTbl3Addr
+}
+{ .mfi
+ // (i*16)*4 + (i*16)*8 - offsett of polynomial on range [-6.0, -0.75]
+ shladd rTmpPtr = rIndexPol, 2, rIndexPol
+ fma.s1 fXSqr = FR_FracX, FR_FracX, f0 // y^2
+ // point to left "near root" bound
+(p12) shladd rRootsBndAddr = rXint, 4, rRootsBndAddr
+}
+;;
+{ .mfi
+ ldfpd fLnSin34, fLnSin36 = [rTmpPtr3], 16
+ fma.s1 fRcpX = fInvX, fRcpX, fRcpX // end of 2nd NR iteration
+ // add special offsett if -4 < x <= -3.0
+(p6) adds rPolDataPtr = 640, rPolDataPtr
+}
+{ .mfi
+ // point to right "near root" bound
+ adds rTmpPtr2 = 8, rRootsBndAddr
+ fnma.s1 fMOne = f1, f1, f0 // -1.0
+ // Point to Bernulli numbers
+ adds rBernulliPtr = 544, rTbl3Addr
+}
+;;
+{ .mfi
+ // left bound of "near root" range
+(p12) ld8 rLeftBound = [rRootsBndAddr]
+ fmerge.se fNormDx = f1, fDx // significand of DeltaX
+ // base address + offsett for polynomial coeff. on range [-6.0, -0.75]
+ add rPolDataPtr = rPolDataPtr, rTmpPtr
+}
+{ .mfi
+ // right bound of "near root" range
+(p12) ld8 rRightBound = [rTmpPtr2]
+ fcvt.xf fFloatN = fFloatN
+ // special "Bernulli" numbers for Stirling's formula for -13 < x < -6
+(p14) adds rBernulliPtr = 160, rBernulliPtr
+}
+;;
+{ .mfi
+ ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
+ fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
+ adds rTmpPtr3 = -160, rTmpPtr3
+}
+{ .mfb
+ adds rTmpPtr = 80, rPolDataPtr
+ fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
+ // p15 is set if -2^63 < x < -6.0 and x is not an integer
+ // branch to path with implementation using Stirling's formula for neg. x
+(p15) br.cond.spnt _negStirling
+}
+;;
+{ .mfi
+ ldfpd fA3, fA3L = [rPolDataPtr], 16 // A3
+ fma.s1 fDelX4 = fDxSqr, fDxSqr, f0 // deltaX^4
+ // Get high 4 bits of signif
+ extr.u rIndex1Dx = rSignifDx, 59, 4
+}
+{ .mfi
+ ldfe fA5 = [rTmpPtr], -16 // A5
+ fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
+ adds rLnSinTmpPtr = 16, rLnSinDataPtr
+}
+;;
+{ .mfi
+ ldfpd fA0, fA0L = [rPolDataPtr], 16 // A0
+ fma.s1 fLnSin20 = fLnSin20, fDxSqr, fLnSin18
+ // Get high 15 bits of significand
+ extr.u rX0Dx = rSignifDx, 49, 15
+}
+{ .mfi
+ ldfe fA4 = [rTmpPtr], 192 // A4
+ fms.s1 fXSqrL = FR_FracX, FR_FracX, fXSqr // low part of y^2
+ shladd GR_ad_z_1 = rIndex1Dx, 2, GR_ad_z_1 // Point to Z_1
+}
+;;
+{ .mfi
+ ldfpd fA1, fA1L = [rPolDataPtr], 16 // A1
+ fma.s1 fX4 = fXSqr, fXSqr, f0 // y^4
+ adds rTmpPtr2 = 32, rTmpPtr
+}
+{ .mfi
+ ldfpd fA18, fA19 = [rTmpPtr], 16 // A18, A19
+ fma.s1 fLnSin24 = fLnSin24, fDxSqr, fLnSin22
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fLnSin6 = [rLnSinDataPtr], 32
+ fma.s1 fLnSin28 = fLnSin28, fDxSqr, fLnSin26
+ nop.i 0
+}
+{ .mfi
+ ldfe fLnSin8 = [rLnSinTmpPtr], 32
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA20, fA21 = [rTmpPtr], 16 // A20, A21
+ fma.s1 fLnSin32 = fLnSin32, fDxSqr, fLnSin30
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA22, fA23 = [rTmpPtr2], 16 // A22, A23
+ fma.s1 fB20 = f1, f1, FR_MHalf // 2.5
+(p12) cmp.ltu.unc p6, p0 = rSignifX, rLeftBound
+}
+;;
+{ .mfi
+ ldfpd fA2, fA2L = [rPolDataPtr], 16 // A2
+ fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
+ // set p6 if x falls in "near root" range
+(p6) cmp.geu.unc p6, p0 = rSignifX, rRightBound
+}
+{ .mfb
+ adds rTmpPtr3 = -64, rTmpPtr
+ fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
+ // branch to special path if x falls in "near root" range
+(p6) br.cond.spnt _negRoots
+}
+;;
+{ .mfi
+ ldfpd fA24, fA25 = [rTmpPtr2], 16 // A24, A25
+ fma.s1 fLnSin36 = fLnSin36, fDxSqr, fLnSin34
+(p11) cmp.eq.unc p7, p0 = 1,rXint // p7 set if -3.0 < x < -2.5
+}
+{ .mfi
+ adds rTmpPtr = -48, rTmpPtr
+ fma.s1 fLnSin20 = fLnSin20, fDxSqr, fLnSin16
+ addl rDelta = 0x5338, r0 // significand of -2.605859375
+}
+;;
+{ .mfi
+ getf.exp GR_N = fDx // Get N = exponent of DeltaX
+ fma.s1 fX6 = fX4, fXSqr, f0 // y^6
+ // p7 set if -2.605859375 <= x < -2.5
+(p7) cmp.gt.unc p7, p0 = rDelta, GR_X_0
+}
+{ .mfb
+ ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
+ fma.s1 fDelX8 = fDelX4, fDelX4, f0 // deltaX^8
+ // branch to special path for -2.605859375 <= x < -2.5
+(p7) br.cond.spnt _neg2andHalf
+}
+;;
+{ .mfi
+ ldfpd fA14, fA15 = [rTmpPtr3], 16 // A14, A15
+ fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
+ adds rTmpPtr2 = 128 , rPolDataPtr
+}
+{ .mfi
+ ldfpd fA16, fA17 = [rTmpPtr], 16 // A16, A17
+ fma.s1 fLnSin28 = fLnSin28, fDelX4, fLnSin24
+ adds rPolDataPtr = 144 , rPolDataPtr
+}
+;;
+{ .mfi
+ ldfe fLnSin10 = [rLnSinDataPtr], 32
+ fma.s1 fRes1H = fA3, FR_FracX, f0 // (A3*y)hi
+ and GR_N = GR_N, r17Ones // mask sign bit
+}
+{ .mfi
+ ldfe fLnSin12 = [rLnSinTmpPtr]
+ fma.s1 fDelX6 = fDxSqr, fDelX4, f0 // DeltaX^6
+ shladd GR_ad_tbl_1 = rIndex1Dx, 4, rTbl1Addr // Point to G_1
+}
+;;
+{ .mfi
+ ldfe fA13 = [rPolDataPtr], -32 // A13
+ fma.s1 fA4 = fA5, FR_FracX, fA4 // A5*y + A4
+ // Get bits 30-15 of X_0 * Z_1
+ pmpyshr2.u GR_X_1 = rX0Dx, GR_Z_1, 15
+}
+{ .mfi
+ ldfe fA12 = [rTmpPtr2], -32 // A12
+ fms.s1 FR_r = FR_G, fSignifX, f1 // r = G * S_hi - 1
+ sub GR_N = GR_N, rExpHalf, 1 // unbisaed exponent of DeltaX
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+.pred.rel "mutex",p10,p11
+{ .mfi
+ ldfe fA11 = [rPolDataPtr], -32 // A11
+ // High part of log(|x|) = Y_hi = N * log2_hi + H
+ fma.s1 fResH = fFloatN, FR_log2_hi, FR_H
+(p10) cmp.eq p8, p9 = rXRnd, r0
+}
+{ .mfi
+ ldfe fA10 = [rTmpPtr2], -32 // A10
+ fma.s1 fRes6H = fA1, FR_FracX, f0 // (A1*y)hi
+(p11) cmp.eq p9, p8 = rXRnd, r0
+}
+;;
+{ .mfi
+ ldfe fA9 = [rPolDataPtr], -32 // A9
+ fma.s1 fB14 = fLnSin6, fDxSqr, f0 // (LnSin6*deltaX^2)hi
+ cmp.eq p6, p7 = 4, rSgnGamSize
+}
+{ .mfi
+ ldfe fA8 = [rTmpPtr2], -32 // A8
+ fma.s1 fA18 = fA19, FR_FracX, fA18
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fA7 = [rPolDataPtr] // A7
+ fma.s1 fA23 = fA23, FR_FracX, fA22
+ nop.i 0
+}
+{ .mfi
+ ldfe fA6 = [rTmpPtr2] // A6
+ fma.s1 fA21 = fA21, FR_FracX, fA20
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fLnSin14 = [rLnSinDataPtr]
+ fms.s1 fRes1L = fA3, FR_FracX, fRes1H // delta((A3*y)hi)
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+}
+{ .mfi
+ setf.sig fFloatNDx = GR_N
+ fadd.s1 fPol = fRes1H, fA2 // (A3*y + A2)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfps FR_G, FR_H = [GR_ad_tbl_1], 8 // Load G_1, H_1
+ fma.s1 fRes2H = fA4, fXSqr, f0 // ((A5 + A4*y)*y^2)hi
+ nop.i 0
+}
+{ .mfi
+ shladd GR_ad_z_2 = GR_Index2, 2, rZ2Addr // Point to Z_2
+ fma.s1 fA25 = fA25, FR_FracX, fA24
+ shladd GR_ad_tbl_2 = GR_Index2, 4, rTbl2Addr // Point to G_2
+}
+;;
+.pred.rel "mutex",p8,p9
+{ .mfi
+ ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
+ fms.s1 fRes6L = fA1, FR_FracX, fRes6H // delta((A1*y)hi)
+ // sign of GAMMA(x) is negative
+(p8) adds rSgnGam = -1, r0
+}
+{ .mfi
+ adds rTmpPtr = 8, GR_ad_tbl_2
+ fadd.s1 fRes3H = fRes6H, fA0 // (A1*y + A0)hi
+ // sign of GAMMA(x) is positive
+(p9) adds rSgnGam = 1, r0
+}
+;;
+{ .mfi
+ ldfps FR_G2, FR_H2 = [GR_ad_tbl_2] // Load G_2, H_2
+ // (LnSin6*deltaX^2 + LnSin4)hi
+ fadd.s1 fLnSinH = fB14, fLnSin4
+ nop.i 0
+}
+{ .mfi
+ ldfd FR_h2 = [rTmpPtr] // Load h_2
+ fms.s1 fB16 = fLnSin6, fDxSqr, fB14 // delta(LnSin6*deltaX^2)
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfd fhDelX = [GR_ad_tbl_1] // Load h_1
+ fma.s1 fA21 = fA21, fXSqr, fA18
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin36 = fLnSin36, fDelX4, fLnSin32
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1L = fA3L, FR_FracX, fRes1L // (A3*y)lo
+ // Get bits 30-15 of X_1 * Z_2
+ pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fPolL = fA2, fPol
+ nop.i 0
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ nop.m 0
+ // delta(((A5 + A4*y)*y^2)hi)
+ fms.s1 fRes2L = fA4, fXSqr, fRes2H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // (((A5 + A4*y)*y^2) + A3*y + A2)hi
+ fadd.s1 fRes4H = fRes2H, fPol
+ nop.i 0
+}
+;;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ fma.s1 fRes6L = fA1L, FR_FracX, fRes6L // (A1*y)lo
+ nop.i 0
+}
+{ .mfi
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ fsub.s1 fRes3L = fA0, fRes3H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fLnSinL = fLnSin4, fLnSinH
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2)hi
+ fma.s1 fB18 = fLnSinH, fDxSqr, f0
+ nop.i 0
+}
+;;
+{ .mfi
+ adds rTmpPtr = 8, rTbl3Addr
+ fma.s1 fB16 = fLnSin6, fDxSqrL, fB16 // (LnSin6*deltaX^2)lo
+ extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA25 = fA25, fXSqr, fA23
+ nop.i 0
+}
+;;
+{ .mfi
+ shladd GR_ad_tbl_3 = GR_Index3, 4, rTbl3Addr // Point to G_3
+ fadd.s1 fPolL = fPolL, fRes1H
+ nop.i 0
+}
+{ .mfi
+ shladd rTmpPtr = GR_Index3, 4, rTmpPtr // Point to G_3
+ fadd.s1 fRes1L = fRes1L, fA2L // (A3*y)lo + A2lo
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfps FR_G3, FR_H3 = [GR_ad_tbl_3] // Load G_3, H_3
+ fma.s1 fRes2L = fA4, fXSqrL, fRes2L // ((A5 + A4*y)*y^2)lo
+ nop.i 0
+}
+{ .mfi
+ ldfd FR_h3 = [rTmpPtr] // Load h_3
+ fsub.s1 fRes4L = fPol, fRes4H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // ((((A5 + A4*y)*y^2) + A3*y + A2)*y^2)hi
+ fma.s1 fRes7H = fRes4H, fXSqr, f0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, FR_FracX, fA14
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes3L = fRes3L, fRes6H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes6L = fRes6L, fA0L // (A1*y)lo + A0lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fLnSinL = fLnSinL, fB14
+
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // delta((LnSin6*deltaX^2 + LnSin4)*deltaX^2)
+ fms.s1 fB20 = fLnSinH, fDxSqr, fB18
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fPolL = fPolL, fRes1L // (A3*y + A2)lo
+
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2 + LnSin2)hi
+ fadd.s1 fLnSin6 = fB18, fLnSin2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes4L = fRes4L, fRes2H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA17 = fA17, FR_FracX, fA16
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // delta(((((A5 + A4*y)*y^2) + A3*y + A2)*y^2)
+ fms.s1 fRes7L = fRes4H, fXSqr, fRes7H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fPol = fRes7H, fRes3H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes3L = fRes3L, fRes6L // (A1*y + A0)lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA25 = fA25, fX4, fA21
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // (LnSin6*deltaX^2 + LnSin4)lo
+ fadd.s1 fLnSinL = fLnSinL, fB16
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB20 = fLnSinH, fDxSqrL, fB20
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fLnSin4 = fLnSin2, fLnSin6
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // (((LnSin6*deltaX^2 + LnSin4)*deltaX^2 + LnSin2)*DeltaX^2)hi
+ fma.s1 fLnSinH = fLnSin6, fDxSqr, f0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // ((A5 + A4*y)*y^2)lo + (A3*y + A2)lo
+ fadd.s1 fRes2L = fRes2L, fPolL
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA17 = fA17, fXSqr, fA15
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // ((((A5 + A4*y)*y^2) + A3*y + A2)*y^2)lo
+ fma.s1 fRes7L = fRes4H, fXSqrL, fRes7L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fPolL = fRes3H, fPol
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA13 = fA13, FR_FracX, fA12
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, FR_FracX, fA10
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2)lo
+ fma.s1 fB20 = fLnSinL, fDxSqr, fB20
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fLnSin4 = fLnSin4, fB18
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fLnSinL = fLnSin6, fDxSqr, fLnSinH
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // (((A5 + A4*y)*y^2) + A3*y + A2)lo
+ fadd.s1 fRes4L = fRes4L, fRes2L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fhDelX = fhDelX, FR_h2 // h = h_1 + h_2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes7L = fRes7L, fRes3L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fPolL = fPolL, fRes7H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fcvt.xf fFloatNDx = fFloatNDx
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2)lo + (LnSin2)lo
+ fadd.s1 fLnSin2L = fLnSin2L, fB20
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA25 = fA25, fX4, fA17
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA13 = fA13, fXSqr, fA11
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, FR_FracX, fA8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA7 = fA7, FR_FracX, fA6
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin36 = fLnSin36, fDelX8, fLnSin28
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin14 = fLnSin14, fDxSqr, fLnSin12
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin10 = fLnSin10, fDxSqr, fLnSin8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fRDx = FR_G, fNormDx, f1 // r = G * S_hi - 1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // poly_lo = r * Q4 + Q3
+ fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // ((((A5 + A4*y)*y^2) + A3*y + A2)*y^2)lo + (A1*y + A0)lo
+ fma.s1 fRes7L = fRes4L, fXSqr, fRes7L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA25 = fA25, fX4, fA13
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, fXSqr, fA7
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // h = N * log2_lo + h
+ fma.s1 FR_h = fFloatN, FR_log2_lo, FR_h
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fhDelX = fhDelX, FR_h3 // h = (h_1 + h_2) + h_3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin36 = fLnSin36, fDelX6, fLnSin20
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin14 = fLnSin14, fDelX4, fLnSin10
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = r * Q4 + Q3
+ fma.s1 fPolyLoDx = fRDx, FR_Q4, FR_Q3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 fRDxSq = fRDx, fRDx // rsq = r * r
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // Y_hi = N * log2_hi + H
+ fma.s1 fResLnDxH = fFloatNDx, FR_log2_hi, FR_H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA25, fX4, fA9
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fPolL = fPolL, fRes7L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fLnSin4 = fLnSin4, fLnSin2L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // h = N * log2_lo + h
+ fma.s1 fhDelX = fFloatNDx, FR_log2_lo, fhDelX
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin36 = fLnSin36, fDelX8, fLnSin14
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // ((LnSin6*deltaX^2 + LnSin4)*deltaX^2 + LnSin2)lo
+ fma.s1 fLnSinL = fLnSin6, fDxSqrL, fLnSinL
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo * r + Q2
+ fma.s1 fPolyLoDx = fPolyLoDx, fRDx, FR_Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fRDxCub = fRDxSq, fRDx, f0 // rcub = r^3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ famax.s0 fRes5H = fPol, fResH
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // High part of (lgammal(|x|) + log(|x|))
+ fadd.s1 fRes1H = fPol, fResH
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo * r + Q2
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPolL = fA9, fX6, fPolL // P25lo
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ famin.s0 fRes5L = fPol, fResH
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // High part of -(LnSin + log(|DeltaX|))
+ fnma.s1 fRes2H = fResLnDxH, f1, fLnSinH
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // (((LnSin6*deltaX^2 + LnSin4)*deltaX^2 + LnSin2)*DeltaX^2)lo
+ fma.s1 fLnSinL = fLnSin4, fDxSqr, fLnSinL
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin36 = fLnSin36, fDelX6, f0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_hi = Q1 * rsq + r
+ fma.s1 fPolyHiDx = FR_Q1, fRDxSq, fRDx
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo*r^3 + h
+ fma.s1 fPolyLoDx = fPolyLoDx, fRDxCub, fhDelX
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes1L = fRes5H, fRes1H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // -(lgammal(|x|) + log(|x|))hi
+ fnma.s1 fRes1H = fRes1H, f1, f0
+
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_hi = Q1 * rsq + r
+ fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo*r^3 + h
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fRes2L = fResLnDxH, fMOne, fRes2H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSinL = fLnSin36, fDxSqr, fLnSinL
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // Y_lo = poly_hi + poly_lo
+ fadd.s1 fResLnDxL = fPolyHiDx, fPolyLoDx
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, fRes5L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // high part of the final result
+ fadd.s1 fYH = fRes2H, fRes1H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // Y_lo = poly_hi + poly_lo
+ fadd.s1 fResL = FR_poly_hi, FR_poly_lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ famax.s0 fRes4H = fRes2H, fRes1H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ famin.s0 fRes4L = fRes2H, fRes1H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // (LnSin)lo + (log(|DeltaX|))lo
+ fsub.s1 fLnSinL = fLnSinL, fResLnDxL
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fLnSinH
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ //(lgammal(|x|))lo + (log(|x|))lo
+ fadd.s1 fPolL = fResL, fPolL
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fYL = fRes4H, fYH
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // Low part of -(LnSin + log(|DeltaX|))
+ fadd.s1 fRes2L = fRes2L, fLnSinL
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // Low part of (lgammal(|x|) + log(|x|))
+ fadd.s1 fRes1L = fRes1L, fPolL
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fYL = fYL, fRes4L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes2L = fRes2L, fRes1L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // low part of the final result
+ fadd.s1 fYL = fYL, fRes2L
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ // final result for -6.0 < x <= -0.75, non-integer, "far" from roots
+ fma.s0 f8 = fYH, f1, fYL
+ // exit here for -6.0 < x <= -0.75, non-integer, "far" from roots
+ br.ret.sptk b0
+}
+;;
+
+// here if |x+1| < 2^(-7)
+// deltaX (fDx) is used two ways on this path: short polynomials in deltaX
+// whose coefficients come from lgammal_1pEps_data, and a table-driven
+// log(|deltaX|) (Z/G/H/h tables followed by the Q1..Q4 polynomial in r).
+// signgam is stored as 4 or 8 bytes depending on rSgnGamSize and the
+// result is returned in f8.
+.align 32
+_closeToNegOne:
+{ .mfi
+ getf.exp GR_N = fDx // Get N = exponent of deltaX
+ // Significand of deltaX with the sign and exponent of 1.0 (S_hi below)
+ fmerge.se fAbsX = f1, fDx
+ // Get high 4 bits of significand of deltaX
+ extr.u rIndex1Dx = rSignifDx, 59, 4
+}
+{ .mfi
+ addl rPolDataPtr= @ltoff(lgammal_1pEps_data),gp
+ fma.s1 fA0L = fDxSqr, fDxSqr, f0 // deltaX^4
+ // sign of GAMMA is positive if p10 is set to 1
+(p10) adds rSgnGam = 1, r0
+}
+;;
+{ .mfi
+ shladd GR_ad_z_1 = rIndex1Dx, 2, GR_ad_z_1 // Point to Z_1
+ fnma.s1 fResL = fDx, f1, f0 // -(x+1)
+ // Get high 15 bits of significand
+ extr.u GR_X_0 = rSignifDx, 49, 15
+}
+{ .mfi
+ ld8 rPolDataPtr = [rPolDataPtr]
+ nop.f 0
+ shladd GR_ad_tbl_1 = rIndex1Dx, 4, rTbl1Addr // Point to G_1
+}
+;;
+{ .mfi
+ ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
+ nop.f 0
+ and GR_N = GR_N, r17Ones // mask sign bit
+}
+{ .mfi
+ adds rTmpPtr = 8, GR_ad_tbl_1
+ nop.f 0
+ cmp.eq p6, p7 = 4, rSgnGamSize
+}
+;;
+{ .mfi
+ ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
+ nop.f 0
+ adds rTmpPtr2 = 96, rPolDataPtr
+}
+{ .mfi
+ ldfd FR_h = [rTmpPtr] // Load h_1
+ nop.f 0
+ // unbiased exponent of deltaX
+ sub GR_N = GR_N, rExpHalf, 1
+}
+;;
+{ .mfi
+ adds rTmpPtr3 = 192, rPolDataPtr
+ nop.f 0
+ // sign of GAMMA is negative if p11 is set to 1
+(p11) adds rSgnGam = -1, r0
+}
+{ .mfi
+ ldfe fA1 = [rPolDataPtr], 16 // A1
+ nop.f 0
+ nop.i 0
+}
+;;
+{.mfi
+ ldfe fA2 = [rPolDataPtr], 16 // A2
+ nop.f 0
+ // Get bits 30-15 of X_0 * Z_1
+ pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15
+}
+{ .mfi
+ ldfpd fA20, fA19 = [rTmpPtr2], 16 // P8, P7
+ nop.f 0
+ nop.i 0
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ ldfe fA3 = [rPolDataPtr], 16 // A3
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA18, fA17 = [rTmpPtr2], 16 // P6, P5
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fA4 = [rPolDataPtr], 16 // A4
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA16, fA15 = [rTmpPtr2], 16 // P4, P3
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA5L, fA6 = [rPolDataPtr], 16 // A5, A6
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA14, fA13 = [rTmpPtr2], 16 // P2, P1
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA7, fA8 = [rPolDataPtr], 16 // A7, A8
+ nop.f 0
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+}
+{ .mfi
+ ldfe fLnSin2 = [rTmpPtr2], 16
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ shladd GR_ad_z_2 = GR_Index2, 2, rZ2Addr // Point to Z_2
+ nop.f 0
+ shladd GR_ad_tbl_2 = GR_Index2, 4, rTbl2Addr // Point to G_2
+}
+{ .mfi
+ ldfe fLnSin4 = [rTmpPtr2], 32
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
+ nop.f 0
+ adds rTmpPtr = 8, GR_ad_tbl_2
+}
+{ .mfi
+ // Put integer N into rightmost significand
+ setf.sig fFloatN = GR_N
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fLnSin6 = [rTmpPtr3]
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe fLnSin8 = [rTmpPtr2]
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfd FR_h2 = [rTmpPtr] // Load h_2
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ fma.s1 fResH = fA20, fResL, fA19 //polynomial for log(|x|)
+ // Get bits 30-15 of X_1 * Z_2
+ pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15
+}
+{ .mfi
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ fma.s1 fA2 = fA2, fDx, fA1 // polynomial for lgammal(|x|)
+ nop.i 0
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ nop.m 0
+ fma.s1 fA18 = fA18, fResL, fA17 //polynomial for log(|x|)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA16 = fA16, fResL, fA15 //polynomial for log(|x|)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA4 = fA4, fDx, fA3 // polynomial for lgammal(|x|)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA14 = fA14, fResL, fA13 //polynomial for log(|x|)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA6 = fA6, fDx, fA5L // polynomial for lgammal(|x|)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fA8, fDx, fA7 // polynomial for lgammal(|x|)
+ extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
+}
+;;
+{ .mfi
+ shladd GR_ad_tbl_3 = GR_Index3, 4, rTbl3Addr // Point to G_3
+ // low part of lnsin polynomial
+ fma.s1 fRes3L = fLnSin4, fDxSqr, fLnSin2
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfps FR_G3, FR_H3 = [GR_ad_tbl_3], 8 // Load G_3, H_3
+ fcvt.xf fFloatN = fFloatN // N as FP number
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fResH = fResH, fDxSqr, fA18 // High part of log(|x|)
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
+ fma.s1 fA4 = fA4, fDxSqr, fA2 // Low part of lgammal(|x|)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // high part of lnsin polynomial
+ fma.s1 fRes3H = fLnSin8, fDxSqr, fLnSin6
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA16 = fA16, fDxSqr, fA14 // Low part of log(|x|)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fPol, fDxSqr, fA6 // High part of lgammal(|x|)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fResH = fResH, fA0L, fA16 // log(|x|)/deltaX^2 - deltaX
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fResH = fResH, fDxSqr, fResL // log(|x|)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fPol, fA0L, fA4 // lgammal(|x|)/|x|
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r = FR_G, fAbsX, f1 // r = G * S_hi - 1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // high part of log(deltaX)= Y_hi = N * log2_hi + H
+ fma.s1 fRes4H = fFloatN, FR_log2_hi, FR_H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // h = N * log2_lo + h
+ fma.s1 FR_h = fFloatN, FR_log2_lo, FR_h
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fResH = fPol, fDx, fResH // lgammal(|x|) + log(|x|)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // lnsin/deltaX^2
+ fma.s1 fRes3H = fRes3H, fA0L, fRes3L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = r * Q4 + Q3
+ fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // lnSin - log(|x|) - lgammal(|x|)
+ fms.s1 fResH = fRes3H, fDxSqr, fResH
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo * r + Q2
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // poly_hi = Q1 * rsq + r
+ fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo*r^3 + h
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // low part of log(|deltaX|) = Y_lo = poly_hi + poly_lo
+ fadd.s1 fRes4L = FR_poly_hi, FR_poly_lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fResH = fResH, fRes4L
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ // final result for |x+1|< 2^(-7) path
+ fsub.s0 f8 = fResH, fRes4H
+ // exit for |x+1|< 2^(-7) path
+ br.ret.sptk b0
+}
+;;
+
+
+// here if -2^63 < x < -6.0 and x is not an integer
+// Also we are going to filter out cases when x falls in
+// range which is "close enough" to negative root. This case
+// may occur only for -19.5 < x since other roots of lgamma are
+// insignificant from double extended point of view (they are closer
+// to RTN(x) than one ulp(x)).
+// The result is assembled from hi/lo pairs of
+// (-|x|*(ln(|x|)-1) - 0.5ln(|x|) - C - S(1/x)) and (-ln(|DeltaX|) + LnSin),
+// and returned in f8. signgam is stored as 4 or 8 bytes depending on
+// rSgnGamSize.
+.align 32
+_negStirling:
+{ .mfi
+ ldfe fLnSin6 = [rLnSinDataPtr], 32
+ fnma.s1 fInvX = f8, fRcpX, f1 // start of 3rd NR iteration
+ // Get high 4 bits of significand of deltaX
+ extr.u rIndex1Dx = rSignifDx, 59, 4
+}
+{ .mfi
+ ldfe fLnSin8 = [rTmpPtr3], 32
+ fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
+(p12) cmp.ltu.unc p6, p0 = rSignifX, rLeftBound
+}
+;;
+{ .mfi
+ ldfe fLnSin10 = [rLnSinDataPtr], 32
+ fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
+ // Get high 15 bits of significand
+ extr.u GR_X_0 = rSignifDx, 49, 15
+}
+{ .mfi
+ shladd GR_ad_z_1 = rIndex1Dx, 2, GR_ad_z_1 // Point to Z_1
+ fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
+ // set p6 if x falls in "near root" range
+(p6) cmp.geu.unc p6, p0 = rSignifX, rRightBound
+}
+;;
+{ .mfi
+ getf.exp GR_N = fDx // Get N = exponent of deltaX
+ fma.s1 fDx4 = fDxSqr, fDxSqr, f0 // deltaX^4
+ adds rTmpPtr = 96, rBernulliPtr
+}
+{ .mfb
+ ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
+ fma.s1 fLnSin34 = fLnSin34, fDxSqr, fLnSin32
+ // branch to special path if x falls in "near root" range
+(p6) br.cond.spnt _negRoots
+}
+;;
+.pred.rel "mutex",p10,p11
+{ .mfi
+ ldfe fLnSin12 = [rTmpPtr3]
+ fma.s1 fLnSin26 = fLnSin26, fDxSqr, fLnSin24
+(p10) cmp.eq p8, p9 = rXRnd, r0
+}
+{ .mfi
+ ldfe fLnSin14 = [rLnSinDataPtr]
+ fma.s1 fLnSin30 = fLnSin30, fDxSqr, fLnSin28
+(p11) cmp.eq p9, p8 = rXRnd, r0
+}
+;;
+{ .mfi
+ ldfpd fB2, fB2L = [rBernulliPtr], 16
+ fma.s1 fLnSin18 = fLnSin18, fDxSqr, fLnSin16
+ shladd GR_ad_tbl_1 = rIndex1Dx, 4, rTbl1Addr // Point to G_1
+
+}
+{ .mfi
+ ldfe fB14 = [rTmpPtr], 16
+ fma.s1 fLnSin22 = fLnSin22, fDxSqr, fLnSin20
+ and GR_N = GR_N, r17Ones // mask sign bit
+}
+;;
+{ .mfi
+ ldfe fB4 = [rBernulliPtr], 16
+ fma.s1 fInvX = fInvX, fRcpX, fRcpX // end of 3rd NR iteration
+ // Get bits 30-15 of X_0 * Z_1
+ pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15
+}
+{ .mfi
+ ldfe fB16 = [rTmpPtr], 16
+ fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
+ adds rTmpPtr2 = 8, GR_ad_tbl_1
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ ldfe fB6 = [rBernulliPtr], 16
+ fms.s1 FR_r = FR_G, fSignifX, f1 // r = G * S_hi - 1
+ adds rTmpPtr3 = -48, rTmpPtr
+}
+{ .mfi
+ ldfe fB18 = [rTmpPtr], 16
+ // High part of the log(|x|) = Y_hi = N * log2_hi + H
+ fma.s1 fResH = fFloatN, FR_log2_hi, FR_H
+ sub GR_N = GR_N, rExpHalf, 1 // unbiased exponent of deltaX
+}
+;;
+.pred.rel "mutex",p8,p9
+{ .mfi
+ ldfe fB8 = [rBernulliPtr], 16
+ fma.s1 fLnSin36 = fLnSin36, fDx4, fLnSin34
+ // sign of GAMMA(x) is negative
+(p8) adds rSgnGam = -1, r0
+}
+{ .mfi
+ ldfe fB20 = [rTmpPtr], -160
+ fma.s1 fRes5H = fLnSin4, fDxSqr, f0
+ // sign of GAMMA(x) is positive
+(p9) adds rSgnGam = 1, r0
+
+}
+;;
+{ .mfi
+ ldfe fB10 = [rBernulliPtr], 16
+ fma.s1 fLnSin30 = fLnSin30, fDx4, fLnSin26
+(p14) adds rTmpPtr = -160, rTmpPtr
+}
+{ .mfi
+ ldfe fB12 = [rTmpPtr3], 16
+ fma.s1 fDx8 = fDx4, fDx4, f0 // deltaX^8
+ cmp.eq p6, p7 = 4, rSgnGamSize
+}
+;;
+{ .mfi
+ ldfps fGDx, fHDx = [GR_ad_tbl_1], 8 // Load G_1, H_1
+ fma.s1 fDx6 = fDx4, fDxSqr, f0 // deltaX^6
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+}
+{ .mfi
+ ldfd fhDx = [rTmpPtr2] // Load h_1
+ fma.s1 fLnSin22 = fLnSin22, fDx4, fLnSin18
+ nop.i 0
+}
+;;
+{ .mfi
+ // Load two parts of C
+ ldfpd fRes1H, fRes1L = [rTmpPtr], 16
+ fma.s1 fRcpX = fInvX, fInvX, f0 // (1/x)^2
+ shladd GR_ad_tbl_2 = GR_Index2, 4, rTbl2Addr // Point to G_2
+}
+{ .mfi
+ shladd GR_ad_z_2 = GR_Index2, 2, rZ2Addr // Point to Z_2
+ fma.s1 FR_h = fFloatN, FR_log2_lo, FR_h// h = N * log2_lo + h
+ nop.i 0
+}
+;;
+{ .mfi
+ ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
+ fnma.s1 fInvXL = f8, fInvX, f1 // relative error of 1/x
+ nop.i 0
+}
+{ .mfi
+ adds rTmpPtr2 = 8, GR_ad_tbl_2
+ fma.s1 fLnSin8 = fLnSin8, fDxSqr, fLnSin6
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
+ // poly_lo = r * Q4 + Q3
+ fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
+ nop.i 0
+}
+{ .mfi
+ ldfd fh2Dx = [rTmpPtr2] // Load h_2
+ fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA1L = fB2, fInvX, f0 // (B2*(1/x))hi
+ nop.i 0
+}
+{ .mfi
+ // Put integer N into rightmost significand
+ setf.sig fFloatNDx = GR_N
+ fms.s1 fRes4H = fResH, f1, f1 // ln(|x|)hi - 1
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2H = fRes5H, fLnSin2//(lnSin4*DeltaX^2 + lnSin2)hi
+ // Get bits 30-15 of X_1 * Z_2
+ pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fRes5L = fLnSin4, fDxSqr, fRes5H
+ nop.i 0
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ nop.m 0
+ fma.s1 fInvX4 = fRcpX, fRcpX, f0 // (1/x)^4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB6 = fB6, fRcpX, fB4
+ nop.i 0
+}
+;;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ fma.s1 fB18 = fB18, fRcpX, fB16
+ nop.i 0
+}
+{ .mfi
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ fma.s1 fInvXL = fInvXL, fInvX, f0 // low part of 1/x
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo * r + Q2
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes3H = fRes4H, f8, f0 // (-|x|*(ln(|x|)-1))hi
+ extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
+}
+{ .mfi
+ nop.m 0
+ // poly_hi = Q1 * rsq + r
+ fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
+ nop.i 0
+}
+;;
+{ .mfi
+ shladd GR_ad_tbl_3 = GR_Index3, 4, rTbl3Addr // Point to G_3
+ fms.s1 fA2L = fB2, fInvX, fA1L // delta(B2*(1/x))
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 fBrnH = fRes1H, f1, fA1L // (-C - S(1/x))hi
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfps fG3Dx, fH3Dx = [GR_ad_tbl_3],8 // Load G_3, H_3
+ fma.s1 fInvX8 = fInvX4, fInvX4, f0 // (1/x)^8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB10 = fB10, fRcpX, fB8
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfd fh3Dx = [GR_ad_tbl_3] // Load h_3
+ fma.s1 fB20 = fB20, fInvX4, fB18
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB14 = fB14, fRcpX, fB12
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin36 = fLnSin36, fDx8, fLnSin30
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin12 = fLnSin12, fDxSqr, fLnSin10
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes2L = fLnSin2, fRes2H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fRes2H, fDxSqr, f0 // high part of LnSin
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fnma.s1 fResH = fResH, FR_MHalf, fResH // -0.5*ln(|x|)hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 fGDx = fGDx, FR_G2 // G = G_1 * G_2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo*r^3 + h
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // B2lo*(1/x)hi+ delta(B2*(1/x))
+ fma.s1 fA2L = fB2L, fInvX, fA2L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB20 = fB20, fInvX4, fB14
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB10 = fB10, fInvX4, fB6
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fcvt.xf fFloatNDx = fFloatNDx
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin14 = fLnSin14, fDx4, fLnSin12
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin36 = fLnSin36, fDx8, fLnSin22
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fRes3L = fRes4H, f8, fRes3H // delta(-|x|*(ln(|x|)-1))
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fmpy.s1 fGDx = fGDx, fG3Dx // G = (G_1 * G_2) * G_3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // (-|x|*(ln(|x|)-1) - 0.5ln(|x|))hi
+ fadd.s1 fRes4H = fRes3H, fResH
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA2L = fInvXL, fB2, fA2L //(B2*(1/x))lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // low part of log(|x|) = Y_lo = poly_hi + poly_lo
+ fadd.s1 fResL = FR_poly_hi, FR_poly_lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB20 = fB20, fInvX8, fB10
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fInvX3 = fInvX, fRcpX, f0 // (1/x)^3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fHDx = fHDx, FR_H2 // H = H_1 + H_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes5L = fRes5L, fLnSin2L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fRes5H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fhDx = fhDx, fh2Dx // h = h_1 + h_2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fBrnL = fRes1H, fMOne, fBrnH
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r = fGDx, fNormDx, f1 // r = G * S_hi - 1
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes3L = fResL, f8 , fRes3L // (-|x|*(ln(|x|)-1))lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes4L = fRes3H, fRes4H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // low part of "Bernoulli" polynomial
+ fma.s1 fB20 = fB20, fInvX3, fA2L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 fResL = fResL, FR_MHalf, fResL // -0.5*ln(|x|)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fHDx = fHDx, fH3Dx // H = (H_1 + H_2) + H_3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fPolL = fRes2H, fDxSqr, fPol
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fhDx = fhDx, fh3Dx // h = (h_1 + h_2) + h_3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // (-|x|*(ln(|x|)-1) - 0.5ln(|x|) - C - S(1/x))hi
+ fadd.s1 fB14 = fRes4H, fBrnH
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = r * Q4 + Q3
+ fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes4L = fRes4L, fResH
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fBrnL = fBrnL, fA1L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // (-|x|*(ln(|x|)-1))lo + (-0.5ln(|x|))lo
+ fadd.s1 fRes3L = fRes3L, fResL
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 fB20 = fRes1L, f1, fB20 // -Clo - S(1/x)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fRes5L // (lnSin4*DeltaX^2 + lnSin2)lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPolL = fDxSqrL, fRes2H, fPolL
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin14 = fLnSin14, fDx4, fLnSin8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin36 = fLnSin36, fDx8, f0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo * r + Q2
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_hi = Q1 * rsq + r
+ fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fB12 = fRes4H, fB14
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // (-|x|*(ln(|x|)-1) - 0.5ln(|x|))lo
+ fadd.s1 fRes4L = fRes4L, fRes3L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fBrnL = fBrnL, fB20 // (-C - S(1/x))lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // high part of log(|DeltaX|) = Y_hi = N * log2_hi + H
+ fma.s1 fLnDeltaH = fFloatNDx, FR_log2_hi, fHDx
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // h = N * log2_lo + h
+ fma.s1 fhDx = fFloatNDx, FR_log2_lo, fhDx
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fPolL = fRes2L, fDxSqr, fPolL
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin14 = fLnSin36, fDxSqr, fLnSin14
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // (-|x|*(ln(|x|)-1) - 0.5ln(|x|))lo + (- C - S(1/x))lo
+ fadd.s1 fBrnL = fBrnL, fRes4L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fB12 = fB12, fBrnH
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo*r^3 + h
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, fhDx
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 fRes1H = fLnDeltaH, f1, fPol//(-ln(|DeltaX|) + LnSin)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fPolL = fDxSqrL, fRes2L, fPolL
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin36 = fLnSin14, fDx6, f0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // (-|x|*(ln(|x|)-1) - 0.5ln(|x|) - C - S(1/x))lo
+ fadd.s1 fB12 = fB12, fBrnL
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // low part of log(|DeltaX|) = Y_lo = poly_hi + poly_lo
+ fadd.s1 fLnDeltaL= FR_poly_hi, FR_poly_lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fRes1L = fLnDeltaH, fMOne, fRes1H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fPolL = fPolL, fLnSin36
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ //(-|x|*(ln(|x|)-1)-0.5ln(|x|) - C - S(1/x))hi + (-ln(|DeltaX|) + LnSin)hi
+ fadd.s1 f8 = fRes1H, fB14
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ //max((-|x|*(ln(|x|)-1)-0.5ln(|x|) - C - S(1/x))hi,
+ // (-ln(|DeltaX|) + LnSin)hi)
+ famax.s1 fMaxNegStir = fRes1H, fB14
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ //min((-|x|*(ln(|x|)-1)-0.5ln(|x|) - C - S(1/x))hi,
+ // (-ln(|DeltaX|) + LnSin)hi)
+ famin.s1 fMinNegStir = fRes1H, fB14
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, fPol
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // (-ln(|DeltaX|))lo + (LnSin)lo
+ fnma.s1 fPolL = fLnDeltaL, f1, fPolL
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 f9 = fMaxNegStir, f8 // delta1
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, fPolL // (-ln(|DeltaX|) + LnSin)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 f9 = f9, fMinNegStir
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, fB12
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // low part of the result
+ fadd.s1 f9 = f9, fRes1L
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ // final result for -2^63 < x < -6.0 path
+ fma.s0 f8 = f8, f1, f9
+ // exit here for -2^63 < x < -6.0 path
+ br.ret.sptk b0
+}
+;;
+
+// _negRoots: entered when x lies in the neighbourhood of a negative root
+// of lgammal ("neighbourhood" typically means |lgammal(x)| < 0.17; on the
+// [-3.0,-2.0] range |lgammal(x)| has even smaller magnitude).
+// Inputs read here: rXint = index of the root, rRootsBndAddr, rXRnd,
+// rSgnGamSize, rSgnGamAddr, f8 = x.
+// p10 is set if root belongs to "right" ones
+// p11 is set if root belongs to "left" ones
+// lgammal(x) is approximated by a polynomial of 19th degree in (x - root);
+// A0..A2 are kept as hi/lo pairs, A3..A19 are evaluated in plain precision.
+.align 32
+_negRoots:
+{ .mfi
+ addl rPolDataPtr= @ltoff(lgammal_right_roots_polynomial_data),gp
+ nop.f 0
+ shl rTmpPtr2 = rXint, 7 // (i*16)*8
+}
+{ .mfi
+ adds rRootsAddr = -288, rRootsBndAddr // root value is read from rRootsBndAddr - 288
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fRoot = [rRootsAddr] // FP representation of root
+ nop.f 0
+ shl rTmpPtr = rXint, 6 // (i*16)*4
+}
+{ .mfi
+(p11) adds rTmpPtr2 = 3536, rTmpPtr2 // left roots: extra 3536 bytes (= 17 records of 13*16 bytes); presumably skips the right-root records - TODO confirm
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ld8 rPolDataPtr = [rPolDataPtr]
+ nop.f 0
+ shladd rTmpPtr = rXint, 4, rTmpPtr // (i*16) + (i*16)*4
+}
+{ .mfi
+ adds rTmpPtr3 = 32, rTmpPtr2 // second pointer starts 32 bytes into the record (A2)
+ nop.f 0
+ nop.i 0
+}
+;;
+.pred.rel "mutex",p10,p11
+{ .mfi
+ add rTmpPtr3 = rTmpPtr, rTmpPtr3
+ nop.f 0
+(p10) cmp.eq p8, p9 = rXRnd, r0 // right root: p8 (negative GAMMA) when rXRnd == 0
+}
+{ .mfi
+ // (i*16) + (i*16)*4 + (i*16)*8
+ add rTmpPtr = rTmpPtr, rTmpPtr2
+ nop.f 0
+(p11) cmp.eq p9, p8 = rXRnd, r0 // left root: p9 (positive GAMMA) when rXRnd == 0
+}
+;;
+{ .mfi
+ add rTmpPtr2 = rPolDataPtr, rTmpPtr3
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ add rPolDataPtr = rPolDataPtr, rTmpPtr // begin + offset
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA0, fA0L = [rPolDataPtr], 16 // A0
+ nop.f 0
+ adds rTmpPtr = 112, rTmpPtr2 // -> A12, A13 (record offset 144)
+}
+{ .mfi
+ ldfpd fA2, fA2L = [rTmpPtr2], 16 // A2
+ nop.f 0
+ cmp.eq p12, p13 = 4, rSgnGamSize // p12: 4-byte signgam, p13: 8-byte signgam
+}
+;;
+{ .mfi
+ ldfpd fA1, fA1L = [rPolDataPtr], 16 // A1
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe fA3 = [rTmpPtr2], 128 // A3, then advance to A16, A17
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA12, fA13 = [rTmpPtr], 16 // A12, A13
+ nop.f 0
+ adds rTmpPtr3 = 64, rPolDataPtr // -> A6, A7
+}
+{ .mfi
+ ldfpd fA16, fA17 = [rTmpPtr2], 16 // A16, A17
+ nop.f 0
+ adds rPolDataPtr = 32, rPolDataPtr // -> A4
+}
+;;
+.pred.rel "mutex",p8,p9
+{ .mfi
+ ldfpd fA14, fA15 = [rTmpPtr], 16 // A14, A15
+ nop.f 0
+ // sign of GAMMA(x) is negative
+(p8) adds rSgnGam = -1, r0
+}
+{ .mfi
+ ldfpd fA18, fA19 = [rTmpPtr2], 16 // A18, A19
+ nop.f 0
+ // sign of GAMMA(x) is positive
+(p9) adds rSgnGam = 1, r0
+}
+;;
+{ .mfi
+ ldfe fA4 = [rPolDataPtr], 16 // A4
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA6, fA7 = [rTmpPtr3], 16 // A6, A7
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fA5 = [rPolDataPtr], 16 // A5
+ // if x equals to (rounded) root exactly
+ fcmp.eq.s1 p6, p0 = f8, fRoot
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA8, fA9 = [rTmpPtr3], 16 // A8, A9
+ fms.s1 FR_FracX = f8, f1, fRoot // polynomial argument: x - root
+ nop.i 0
+}
+;;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p12) st4 [rSgnGamAddr] = rSgnGam
+ nop.f 0
+ nop.i 0
+}
+{ .mfb
+ // store signgam if size of variable is 8 bytes
+(p13) st8 [rSgnGamAddr] = rSgnGam
+ // answer if x equals to (rounded) root exactly
+(p6) fadd.s0 f8 = fA0, fA0L
+ // exit if x equals to (rounded) root exactly
+(p6) br.ret.spnt b0
+}
+;;
+{ .mmf
+ ldfpd fA10, fA11 = [rTmpPtr3], 16 // A10, A11
+ nop.m 0
+ nop.f 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fResH = fA2, FR_FracX, f0 // (A2*x)hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA4L = FR_FracX, FR_FracX, f0 // x^2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA17 = fA17, FR_FracX, fA16 // A17*x + A16
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fma.s1 fA13 = fA13, FR_FracX, fA12 // A13*x + A12
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA19 = fA19, FR_FracX, fA18 // A19*x + A18
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, FR_FracX, fA14 // A15*x + A14
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fPol = fA7, FR_FracX, fA6 // A7*x + A6
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, FR_FracX, fA8 // A9*x + A8
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fResL = fA2, FR_FracX, fResH // delta(A2*x)
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fadd.s1 fRes1H = fResH, fA1 // (A2*x + A1)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, FR_FracX, fA10 // A11*x + A10
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fma.s1 fA5L = fA4L, fA4L, f0 // x^4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA19 = fA19, fA4L, fA17 // A16..A19 group
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, fA4L, fA13 // A12..A15 group
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fPol, FR_FracX, fA5 // Horner step down to A5
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fma.s1 fA3L = fA4L, FR_FracX, f0 // x^3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // delta(A2*x) + A2L*x = (A2*x)lo
+ fma.s1 fResL = fA2L, FR_FracX, fResL
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fsub.s1 fRes1L = fA1, fRes1H // start of rounding error of (A2*x)hi + A1
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, fA4L, fA9 // A8..A11 group
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fma.s1 fA19 = fA19, fA5L, fA15 // A12..A19 group
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fPol = fPol, FR_FracX, fA4 // Horner step down to A4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fResL = fResL, fA1L // (A2*x)lo + A1L
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, fResH // (A1 - fRes1H) + (A2*x)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes2H = fRes1H, FR_FracX, f0 // ((A2*x + A1)*x)hi
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fA19 = fA19, fA5L, fA11 // A8..A19 group
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fPol = fPol, FR_FracX, fA3 // A3 + A4*x + ... + A7*x^4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, fResL // (A2*x + A1)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // delta((A2*x + A1)*x)
+ fms.s1 fRes2L = fRes1H, FR_FracX, fRes2H
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fadd.s1 fRes3H = fRes2H, fA0 // ((A2*x + A1)*x + A0)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA19 = fA19, fA5L, f0 // (A8..A19 group) * x^4
+ nop.i 0
+}
+
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes2L = fRes1L, FR_FracX, fRes2L // ((A2*x + A1)*x)lo
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fsub.s1 fRes3L = fRes2H, fRes3H // start of rounding error of fRes2H + A0
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fPol = fA19, FR_FracX, fPol // add x^5*(A8..A19 group) to A3..A7 part
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes3L = fRes3L, fA0 // (fRes2H - fRes3H) + A0
+ nop.i 0
+}
+{.mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fA0L // ((A2*x + A1)*x)lo + A0L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes3L = fRes3L, fRes2L // (((A2*x + A1)*x) + A0)lo
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fRes3L = fPol, fA3L, fRes3L // low part += x^3 * (A3 + ... + A19*x^16)
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ // final result for arguments which are close to negative roots
+ fma.s0 f8 = fRes3H, f1, fRes3L
+ // exit here for arguments which are close to negative roots
+ br.ret.sptk b0
+}
+;;
+
+// here if |x| < 0.5: result is sign(x)*Pol(|x|) - log(|x|); negative x continues in _O_Half_neg, -0.5 < x <= -0.40625 branches to lgammal_near_neg_half
+.align 32
+lgammal_0_half:
+{ .mfi
+ ld4 GR_Z_1 = [rZ1offsett] // Load Z_1
+ fma.s1 fA4L = f8, f8, f0 // x^2
+ addl rPolDataPtr = @ltoff(lgammal_0_Half_data), gp
+}
+{ .mfi
+ shladd GR_ad_tbl_1 = GR_Index1, 4, rTbl1Addr// Point to G_1
+ nop.f 0
+ addl rLnSinDataPtr = @ltoff(lgammal_lnsin_data), gp
+}
+;;
+{ .mfi
+ ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
+ nop.f 0
+ // Point to Constants_Z_2
+ add GR_ad_z_2 = 0x140, GR_ad_z_1
+}
+{ .mfi
+ add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_Q
+ nop.f 0
+ // Point to Constants_G_H_h2
+ add GR_ad_tbl_2 = 0x180, GR_ad_z_1
+}
+;;
+{ .mfi
+ ld8 rPolDataPtr = [rPolDataPtr]
+ nop.f 0
+ // Point to Constants_G_H_h3
+ add GR_ad_tbl_3 = 0x280, GR_ad_z_1
+}
+{ .mfi
+ ldfd FR_h = [GR_ad_tbl_1] // Load h_1
+ nop.f 0
+ sub GR_N = rExpX, rExpHalf, 1 // rExpX - rExpHalf - 1
+}
+;;
+{ .mfi
+ ld8 rLnSinDataPtr = [rLnSinDataPtr]
+ nop.f 0
+ // Get bits 30-15 of X_0 * Z_1
+ pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15
+}
+{ .mfi
+ ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
+ nop.f 0
+ sub GR_N = r0, GR_N // negate N: this path accumulates -log(|x|)
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ ldfe FR_log2_lo = [GR_ad_q], 16 // Load log2_lo
+ nop.f 0
+ add rTmpPtr2 = 320, rPolDataPtr // -> A21
+}
+{ .mfi
+ add rTmpPtr = 32, rPolDataPtr // -> A1
+ nop.f 0
+ // exponent of 0.25
+ adds rExp2 = -1, rExpHalf
+}
+;;
+{ .mfi
+ ldfpd fA3, fA3L = [rPolDataPtr], 16 // A3
+ fma.s1 fA5L = fA4L, fA4L, f0 // x^4
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA1, fA1L = [rTmpPtr], 16 // A1
+ fms.s1 fB8 = f8, f8, fA4L // x^2 - <x^2>
+ // set p6 if -0.5 < x <= -0.25
+(p15) cmp.eq.unc p6, p0 = rExpX, rExp2
+}
+;;
+{ .mfi
+ ldfpd fA2, fA2L = [rPolDataPtr], 16 // A2
+ nop.f 0
+ // set p6 if -0.5 < x <= -0.40625
+(p6) cmp.le.unc p6, p0 = 10, GR_Index1
+}
+{ .mfi
+ ldfe fA21 = [rTmpPtr2], -16 // A21
+ // (the setf.sig in the next group puts integer N into rightmost significand)
+ nop.f 0
+ adds rTmpPtr = 240, rTmpPtr // -> A19
+}
+;;
+{ .mfi
+ setf.sig fFloatN = GR_N // integer N into significand of fFloatN
+ nop.f 0
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+}
+{ .mfi
+ ldfe FR_Q4 = [GR_ad_q], 16 // Load Q4
+ nop.f 0
+ adds rPolDataPtr = 304, rPolDataPtr // -> A22
+}
+;;
+{ .mfi
+ ldfe fA20 = [rTmpPtr2], -32 // A20
+ nop.f 0
+ shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
+}
+{ .mfi
+ ldfe fA19 = [rTmpPtr], -32 // A19
+ nop.f 0
+ shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2// Point to G_2
+}
+;;
+{ .mfi
+ ldfe fA17 = [rTmpPtr], -32 // A17
+ nop.f 0
+ adds rTmpPtr3 = 8, GR_ad_tbl_2 // -> h_2
+}
+{ .mfb
+ ldfe fA18 = [rTmpPtr2], -32 // A18
+ nop.f 0
+ // branch to special path for -0.5 < x <= -0.40625
+(p6) br.cond.spnt lgammal_near_neg_half
+}
+;;
+{ .mmf
+ ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
+ ldfe fA15 = [rTmpPtr], -32 // A15
+ fma.s1 fB20 = fA5L, fA5L, f0 // x^8
+}
+;;
+{ .mmf
+ ldfe fA16 = [rTmpPtr2], -32 // A16
+ ldfe fA13 = [rTmpPtr], -32 // A13
+ fms.s1 fB16 = fA4L, fA4L, fA5L // rounding error of x^4
+}
+;;
+{ .mmf
+ ldfps FR_G2, FR_H2 = [GR_ad_tbl_2], 8 // Load G_2, H_2
+ ldfd FR_h2 = [rTmpPtr3] // Load h_2
+ fmerge.s fB10 = f8, fA5L // sign(x) * x^4
+}
+;;
+{ .mmi
+ ldfe fA14 = [rTmpPtr2], -32 // A14
+ ldfe fA11 = [rTmpPtr], -32 // A11
+ // Get bits 30-15 of X_1 * Z_2
+ pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ ldfe fA12 = [rTmpPtr2], -32 // A12
+ fma.s1 fRes4H = fA3, fAbsX, f0 // (A3*|x|)hi
+ adds rTmpPtr3 = 16, GR_ad_q // GR_ad_q -> Q3 here, so this -> Q2
+}
+{ .mfi
+ ldfe fA9 = [rTmpPtr], -32 // A9
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mmf
+ ldfe fA10 = [rTmpPtr2], -32 // A10
+ ldfe fA7 = [rTmpPtr], -32 // A7
+ fma.s1 fB18 = fB20, fB20, f0 // x^16
+}
+;;
+{ .mmf
+ ldfe fA8 = [rTmpPtr2], -32 // A8
+ ldfe fA22 = [rPolDataPtr], 16 // A22
+ fcvt.xf fFloatN = fFloatN // integer N -> floating point
+}
+;;
+{ .mfi
+ ldfe fA5 = [rTmpPtr], -32 // A5
+ fma.s1 fA21 = fA21, fAbsX, fA20 // v16
+ extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
+}
+{ .mfi
+ ldfe fA6 = [rTmpPtr2], -32 // A6
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mmf
+ // Point to G_3
+ shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3
+ ldfe fA4 = [rTmpPtr2], -32 // A4
+ fma.s1 fA19 = fA19, fAbsX, fA18 // v13
+}
+;;
+.pred.rel "mutex",p14,p15
+{ .mfi
+ ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
+ fms.s1 fRes4L = fA3, fAbsX, fRes4H // delta(A3*|x|)
+(p14) adds rSgnGam = 1, r0 // p14: signgam = +1
+}
+{ .mfi
+ cmp.eq p6, p7 = 4, rSgnGamSize // p6: 4-byte signgam, p7: 8-byte signgam
+ fadd.s1 fRes2H = fRes4H, fA2 // (A3*|x| + A2)hi
+(p15) adds rSgnGam = -1, r0 // p15: signgam = -1
+}
+;;
+
+{ .mfi
+ ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
+ fma.s1 fA17 = fA17, fAbsX, fA16 // v12
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe FR_Q3 = [GR_ad_q], 32 // Load Q3
+ fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
+ nop.i 0
+}
+{ .mfi
+ ldfe FR_Q2 = [rTmpPtr3], 16 // Load Q2
+ fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe FR_Q1 = [GR_ad_q] // Load Q1
+ fma.s1 fA15 = fA15, fAbsX, fA14 // v8
+ nop.i 0
+}
+{ .mfi
+ adds rTmpPtr3 = 32, rLnSinDataPtr // -> LnSin6
+ fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
+ nop.i 0
+}
+;;
+{ .mmf
+ ldfpd fLnSin2, fLnSin2L = [rLnSinDataPtr], 16
+ ldfe fLnSin6 = [rTmpPtr3], 32
+ fma.s1 fA13 = fA13, fAbsX, fA12 // v7
+
+}
+;;
+{ .mfi
+ ldfe fLnSin4 = [rLnSinDataPtr], 32
+ fma.s1 fRes4L = fA3L, fAbsX, fRes4L // (A3*|x|)lo
+ nop.i 0
+}
+{ .mfi
+ ldfe fLnSin10 = [rTmpPtr3], 32
+ fsub.s1 fRes2L = fA2, fRes2H // start of rounding error of (A3*|x|)hi + A2
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fLnSin8 = [rLnSinDataPtr], 32
+ fma.s1 fResH = fRes2H, fAbsX, f0 // ((A3*|x| + A2)*|x|)hi
+ nop.i 0
+}
+{ .mfi
+ ldfe fLnSin14 = [rTmpPtr3], 32
+ fma.s1 fA22 = fA22, fA4L, fA21 // v15
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fLnSin12 = [rLnSinDataPtr], 32
+ fma.s1 fA9 = fA9, fAbsX, fA8 // v4
+ nop.i 0
+}
+{ .mfi
+ ldfd fLnSin18 = [rTmpPtr3], 16 // from LnSin18 on the entries are 8-byte doubles
+ fma.s1 fA11 = fA11, fAbsX, fA10 // v5
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fLnSin16 = [rLnSinDataPtr], 24 // 16-byte entry; +24 lands on LnSin20
+ fma.s1 fA19 = fA19, fA4L, fA17 // v11
+ nop.i 0
+}
+{ .mfi
+ ldfd fLnSin22 = [rTmpPtr3], 16
+ fma.s1 fPolL = fA7, fAbsX, fA6 // A7*|x| + A6
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfd fLnSin20 = [rLnSinDataPtr], 16
+ fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
+ nop.i 0
+}
+{ .mfi
+ ldfd fLnSin26 = [rTmpPtr3], 16
+ fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfd fLnSin24 = [rLnSinDataPtr], 16
+ fadd.s1 fRes2L = fRes2L, fRes4H // (A2 - fRes2H) + (A3*|x|)hi
+ nop.i 0
+}
+{ .mfi
+ ldfd fLnSin30 = [rTmpPtr3], 16
+ fadd.s1 fA2L = fA2L, fRes4L // A2L + (A3*|x|)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfd fLnSin28 = [rLnSinDataPtr], 16
+ fms.s1 fResL = fRes2H, fAbsX, fResH // delta((A3*|x| + A2)*|x|)
+ nop.i 0
+}
+{ .mfi
+ ldfd fLnSin34 = [rTmpPtr3], 8 // +8 lands on LnSin36
+ fadd.s1 fRes2H = fResH, fA1 // ((A3*|x| + A2)*|x| + A1)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfd fLnSin32 = [rLnSinDataPtr]
+ fma.s1 fA11 = fA11, fA4L, fA9 // v3
+ nop.i 0
+}
+{ .mfi
+ ldfd fLnSin36 = [rTmpPtr3]
+ fma.s1 fA15 = fA15, fA4L, fA13 // v6
+ nop.i 0
+}
+;;
+
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
+ nop.i 0
+}
+{ .mfi
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ fma.s1 fA5 = fA5, fAbsX, fA4 // A5*|x| + A4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r = FR_G, fSignifX, f1 // r = G * S_hi - 1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // High part of -log(|x|): N * log2_hi - H, with N already negated above
+ fms.s1 FR_log2_hi = fFloatN, FR_log2_hi, FR_H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fA3L = fRes2L, fA2L // (A3*|x| + A2)lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA22 = fA22, fA5L, fA19 // A16..A22 group
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes2L = fA1, fRes2H // start of rounding error of fResH + A1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fRes3H = fRes2H, f8, f0 // (x * (A1 + A2*|x| + A3*|x|^2))hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, fA5L, fA11 // v2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin18 = fLnSin18, fA4L, fLnSin16
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // h = N * log2_lo - h, with N already negated above
+ fms.s1 FR_h = fFloatN, FR_log2_lo, FR_h
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPolL = fPolL, fA4L, fA5 // A4..A7 group
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = r * Q4 + Q3
+ fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fResL = fA3L, fAbsX, fResL // ((A3*|x| + A2)*|x|)lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin30 = fLnSin30, fA4L, fLnSin28
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fResH // (A1 - fRes2H) + fResH
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fRes3L = fRes2H, f8, fRes3H // delta of the product in fRes3H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1H = fRes3H, FR_log2_hi // fRes3H + high part of -log(|x|)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fB20, fA22, fA15 // A8..A22 group
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin34 = fLnSin34, fA4L, fLnSin32
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin14 = fLnSin14, fA4L, fLnSin12
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo * r + Q2
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = -r^3 (fnma negates the product)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_hi = Q1 * rsq + r
+ fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fA1L = fA1L, fResL // A1L + ((A3*|x| + A2)*|x|)lo
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin22 = fLnSin22, fA4L, fLnSin20
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin26 = fLnSin26, fA4L, fLnSin24
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes1L = FR_log2_hi, fRes1H // start of rounding error of fRes3H + FR_log2_hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fPol, fA5L, fPolL // A4..A22 group
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin34 = fLnSin36, fA5L, fLnSin34
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin18 = fLnSin18, fA5L, fLnSin14
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin6 = fLnSin6, fA4L, fLnSin4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin10 = fLnSin10, fA4L, fLnSin8
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_hi = Q1 * rsq + r; NOTE(review): same operands as the earlier poly_hi fma, looks redundant
+ fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fA1L // (A1 + A2*|x| + A3*|x|^2)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo*rcub + h, where rcub = -r^3 and h is the negated h
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB2 = fLnSin2, fA4L, f0 // (LnSin2*x^2)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, fRes3H // (FR_log2_hi - fRes1H) + fRes3H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fPol, fB10, f0 // (A4..A22 group) * sign(x) * x^4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin26 = fLnSin26, fA5L, fLnSin22
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin34 = fLnSin34, fA5L, fLnSin30
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin10 = fLnSin10, fA5L, fLnSin6
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin2L = fLnSin2L, fA4L, f0 // LnSin2L*x^2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fRes3L = fRes2L, f8, fRes3L // low part of x * (A1 + A2*|x| + A3*|x|^2)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo - poly_hi: low part of -log(|x|), since poly_lo was built negated
+ fsub.s1 FR_log2_lo = FR_poly_lo, FR_poly_hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fB4 = fLnSin2, fA4L, fB2 // delta(LnSin2*x^2)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2H = fRes1H, fPol // high part of the result
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin34 = fLnSin34, fB20, fLnSin26
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin18 = fLnSin18, fB20, fLnSin10
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fLnSin2L = fB8, fLnSin2, fLnSin2L // add LnSin2 * (rounding error of x^2)
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_log2_lo = FR_log2_lo, fRes3L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes2L = fRes1H, fRes2H // start of rounding error of fRes1H + fPol
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB6 = fLnSin34, fB18, fLnSin18 // LnSin4 + LnSin6*x^2 + ... + LnSin36*x^32
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fB4 = fLnSin2L, fB4 // (LnSin2*x^2)lo
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, FR_log2_lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fPol // (fRes1H - fRes2H) + fPol
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fB12 = fB6, fA5L, f0 // (fB6*x^4)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fRes1L // low part of the result
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fms.s1 fB14 = fB6, fA5L, fB12 // delta(fB6*x^4)
+ nop.i 0
+}
+{ .mfb
+ nop.m 0
+ fadd.s1 fLnSin30 = fB2, fB12 // high part of the LnSin series (used by _O_Half_neg)
+ // branch out if x is negative
+(p15) br.cond.spnt _O_Half_neg
+}
+;;
+{ .mfb
+ nop.m 0
+ // sign(x)*Pol(|x|) - log(|x|)
+ fma.s0 f8 = fRes2H, f1, fRes2L
+ // it's an answer already for positive x
+ // exit if 0 < x < 0.5
+ br.ret.sptk b0
+}
+;;
+
+// here if x is negative and |x| < 0.5: adds the LnSin series (high part fLnSin30 = fB2 + fB12, low parts fB4/fB14) to the fRes2H/fRes2L pair left by lgammal_0_half
+.align 32
+_O_Half_neg:
+{ .mfi
+ nop.m 0
+ fma.s1 fB14 = fB16, fB6, fB14 // add fB6 * (rounding error of x^4) to delta(fB6*x^4)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fLnSin16 = fB2, fLnSin30 // start of rounding error of fB2 + fB12
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fResH = fLnSin30, fRes2H // high part of the final sum
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fLnSin16 = fLnSin16, fB12 // (fB2 - fLnSin30) + fB12
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fB4 = fB14, fB4 // collect low parts of the two LnSin products
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fLnSin16 = fB4, fLnSin16 // low part of the LnSin series
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fResL = fRes2H, fResH // start of rounding error of fLnSin30 + fRes2H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fResL = fResL, fLnSin30 // (fRes2H - fResH) + fLnSin30
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fLnSin16 = fLnSin16, fRes2L // add low part from lgammal_0_half
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fResL = fResL, fLnSin16 // low part of the final sum
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ // final result for -0.5 < x < 0
+ fma.s0 f8 = fResH, f1, fResL
+ // exit for -0.5 < x < 0
+ br.ret.sptk b0
+}
+;;
+
+// here if x >= 8.0
+// there are two computational paths:
+// 1) For x >10.0 Stirling's formula is used (x above the loaded overflow threshold branches to lgammal_overflow)
+// 2) Polynomial approximation for 8.0 <= x <= 10.0 (branches to lgammal_8_10)
+.align 32
+lgammal_big_positive:
+{ .mfi
+ addl rPolDataPtr = @ltoff(lgammal_data), gp
+ fmerge.se fSignifX = f1, f8 // significand of x with sign/exponent of 1.0
+ // Get high 15 bits of significand
+ extr.u GR_X_0 = rSignifX, 49, 15
+}
+{.mfi
+ shladd rZ1offsett = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
+ fnma.s1 fInvX = f8, fRcpX, f1 // start of 1st NR iteration
+ adds rSignif1andQ = 0x5, r0 // shifted left by 61 in the next group
+}
+;;
+{.mfi
+ ld4 GR_Z_1 = [rZ1offsett] // Load Z_1
+ nop.f 0
+ shl rSignif1andQ = rSignif1andQ, 61 // significand of 1.25
+}
+{ .mfi
+ cmp.eq p8, p0 = rExpX, rExp8 // p8 = 1 if 8.0 <= x < 16
+ nop.f 0
+ adds rSgnGam = 1, r0 // gamma is positive at this range
+}
+;;
+{ .mfi
+ shladd GR_ad_tbl_1 = GR_Index1, 4, rTbl1Addr// Point to G_1
+ nop.f 0
+ add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_Q
+}
+{ .mlx
+ ld8 rPolDataPtr = [rPolDataPtr]
+ movl rDelta = 0x3FF2000000000000 // 1.125 in DP
+}
+;;
+{ .mfi
+ ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
+ nop.f 0
+ add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
+}
+{ .mfi
+ // Point to Constants_G_H_h2
+ add GR_ad_tbl_2 = 0x180, GR_ad_z_1
+ nop.f 0
+ // p8 = 1 if 8.0 <= x <= 10.0
+(p8) cmp.leu.unc p8, p0 = rSignifX, rSignif1andQ
+}
+;;
+{ .mfi
+ ldfd FR_h = [GR_ad_tbl_1] // Load h_1
+ nop.f 0
+ // Get bits 30-15 of X_0 * Z_1
+ pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15
+}
+{ .mfb
+(p8) setf.d FR_MHalf = rDelta // FR_MHalf = 1.125 for the lgammal_8_10 path
+ nop.f 0
+(p8) br.cond.spnt lgammal_8_10 // branch out if 8.0 <= x <= 10.0
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ ldfe fA1 = [rPolDataPtr], 16 // Load overflow threshold
+ fma.s1 fRcpX = fInvX, fRcpX, fRcpX // end of 1st NR iteration
+ // Point to Constants_G_H_h3
+ add GR_ad_tbl_3 = 0x280, GR_ad_z_1
+}
+{ .mlx
+ nop.m 0
+ movl rDelta = 0xBFE0000000000000 // -0.5 in DP
+}
+;;
+{ .mfi
+ ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
+ nop.f 0
+ sub GR_N = rExpX, rExpHalf, 1 // unbiased exponent of x
+}
+;;
+{ .mfi
+ ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ setf.d FR_MHalf = rDelta // FR_MHalf = -0.5
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ // Put integer N into rightmost significand
+ setf.sig fFloatN = GR_N
+ nop.f 0
+ extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
+}
+{ .mfi
+ ldfe FR_Q4 = [GR_ad_q], 16 // Load Q4
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
+ nop.f 0
+ shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2// Point to G_2
+}
+{ .mfi
+ ldfe FR_Q3 = [GR_ad_q], 16 // Load Q3
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
+ fnma.s1 fInvX = f8, fRcpX, f1 // start of 2nd NR iteration
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfps FR_G2, FR_H2 = [GR_ad_tbl_2], 8 // Load G_2, H_2
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
+ nop.f 0
+ // Get bits 30-15 of X_1 * Z_2
+ pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15
+}
+;;
+//
+// For performance, don't use result of pmpyshr2.u for 4 cycles.
+//
+{ .mfi
+ ldfe FR_Q1 = [GR_ad_q] // Load Q1
+ fcmp.gt.s1 p7,p0 = f8, fA1 // check if x > overflow threshold
+ nop.i 0
+}
+;;
+{.mfi
+ ldfpd fA0, fA0L = [rPolDataPtr], 16 // Load two parts of C
+ fma.s1 fRcpX = fInvX, fRcpX, fRcpX // end of 2nd NR iteration
+ nop.i 0
+}
+;;
+{ .mfb
+ ldfpd fB2, fA1 = [rPolDataPtr], 16 // two doubles, summed into fB2 below
+ nop.f 0
+(p7) br.cond.spnt lgammal_overflow // branch if x > overflow threshold
+}
+;;
+{.mfi
+ ldfe fB4 = [rPolDataPtr], 16
+ fcvt.xf fFloatN = fFloatN // integer N -> floating point
+ extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
+}
+;;
+{ .mfi
+ shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3// Point to G_3
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe fB6 = [rPolDataPtr], 16
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfps FR_G3, FR_H3 = [GR_ad_tbl_3], 8 // Load G_3, H_3
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
+ fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe fB8 = [rPolDataPtr], 16
+ fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnma.s1 fInvX = f8, fRcpX, f1 // start of 3rd NR iteration
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fB10 = [rPolDataPtr], 16
+ nop.f 0
+ cmp.eq p6, p7 = 4, rSgnGamSize // p6: 4-byte signgam, p7: 8-byte signgam
+}
+;;
+{ .mfi
+ ldfe fB12 = [rPolDataPtr], 16
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fB14 = [rPolDataPtr], 16
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfi
+ ldfe fB16 = [rPolDataPtr], 16
+ // get double extended coefficients from two doubles
+ // two doubles are needed in Stirling's formula for negative x
+ fadd.s1 fB2 = fB2, fA1
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fB18 = [rPolDataPtr], 16
+ fma.s1 fInvX = fInvX, fRcpX, fRcpX // end of 3rd NR iteration
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fB20 = [rPolDataPtr], 16
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
+ nop.i 0
+}
+{ .mfi
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRcpX = fInvX, fInvX, f0 // 1/x^2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA0L = fB2, fInvX, fA0L // B2/x + Clo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 FR_r = fSignifX, FR_G, f1 // r = G * S_hi - 1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // High part of the log(x): Y_hi = N * log2_hi + H
+ fma.s1 fRes2H = fFloatN, FR_log2_hi, FR_H
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ // h = N * log2_lo + h
+ fma.s1 FR_h = fFloatN, FR_log2_lo, FR_h
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // High part of the log(x): Y_hi = N * log2_hi + H (second copy, scaled by -0.5 below)
+ fma.s1 fRes1H = fFloatN, FR_log2_hi, FR_H
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fPol = fB18, fRcpX, fB16 // v9
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA2L = fRcpX, fRcpX, f0 // v10
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fA3 = fB6, fRcpX, fB4 // v3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA4 = fB10, fRcpX, fB8 // v4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fRes2H =fRes2H, f1, f1 // log_Hi(x) -1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // poly_lo = r * Q4 + Q3
+ fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1H = fRes1H, FR_MHalf, f0 // -0.5*log_Hi(x)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA7 = fB14, fRcpX, fB12 // v7
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA8 = fA2L, fB20, fPol // v8
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA2 = fA4, fA2L, fA3 // v2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA4L = fA2L, fA2L, f0 // v5
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fResH = fRes2H, f8, f0 // (x*(ln(x)-1))hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo * r + Q2
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // poly_hi = Q1 * rsq + r
+ fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fRcpX, fInvX, f0 // 1/x^3
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA6 = fA8, fA2L, fA7 // v6
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fResL = fRes2H, f8, fResH // d(x*(ln(x)-1))
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes3H = fResH, fRes1H // (x*(ln(x)-1) -0.5ln(x))hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // poly_lo = poly_lo*r^3 + h
+ fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fA4L, fA6, fA2 // v1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // raise inexact exception
+ fma.s0 FR_log2_lo = FR_log2_lo, FR_log2_lo, f0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes4H = fRes3H, fA0 // (x*(ln(x)-1) -0.5ln(x))hi + Chi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes3L = fResH, fRes3H // start of rounding error of fResH + fRes1H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // Y_lo = poly_hi + poly_lo
+ fadd.s1 fRes2L = FR_poly_hi, FR_poly_lo
+ nop.i 0
+}
+;;
+
+{ .mfi
+ nop.m 0
+ fma.s1 fA0L = fPol, fA11, fA0L // S(1/x) + Clo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes3L = fRes3L, fRes1H // (fResH - fRes3H) + fRes1H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes4L = fRes3H, fRes4H // start of rounding error of fRes3H + Chi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fResL = fRes2L, f8 , fResL // lo part of x*(ln(x)-1)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // Clo + S(1/x) - 0.5*logLo(x)
+ fma.s1 fA0L = fRes2L, FR_MHalf, fA0L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes4L = fRes4L, fA0 // (fRes3H - fRes4H) + Chi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // Clo + S(1/x) - 0.5*logLo(x) + (x*(ln(x)-1))lo
+ fadd.s1 fA0L = fA0L, fResL
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes4L = fRes4L, fRes3L // collect rounding errors of both high sums
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes4L = fRes4L, fA0L // low part of the result
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ fma.s0 f8 = fRes4H, f1, fRes4L // final result: high part + low part
+ // exit for x > 10.0
+ br.ret.sptk b0
+}
+;;
+// here if 8.0 <= x <= 10.0
+// Result = P15(y), y = fSignifX - FR_MHalf; lgammal_big_positive sets FR_MHalf = 1.125 before branching here, i.e. y = x/8.0 - 1.125 (NOTE(review): earlier wording said 1.5 - confirm)
+.align 32
+lgammal_8_10:
+{ .mfi
+ addl rPolDataPtr = @ltoff(lgammal_8_10_data), gp
+ fms.s1 FR_FracX = fSignifX, f1, FR_MHalf // y = fSignifX - FR_MHalf
+ cmp.eq p6, p7 = 4, rSgnGamSize // p6: 4-byte signgam, p7: 8-byte signgam
+}
+;;
+{ .mfi
+ ld8 rLnSinDataPtr = [rPolDataPtr] // second copy of the table address (reused as a plain pointer)
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ld8 rPolDataPtr = [rPolDataPtr]
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ adds rZ1offsett = 32, rLnSinDataPtr // -> A2 (even coefficients, stride 32)
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ adds rLnSinDataPtr = 48, rLnSinDataPtr // -> A3 (odd coefficients, stride 32)
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA1, fA1L = [rPolDataPtr], 16 // A1
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe fA2 = [rZ1offsett], 32 // A2
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA0, fA0L = [rPolDataPtr], 16 // A0
+ fma.s1 FR_rsq = FR_FracX, FR_FracX, f0 // y^2
+ nop.i 0
+}
+{ .mfi
+ ldfe fA3 = [rLnSinDataPtr],32 // A3
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mmf
+ ldfe fA4 = [rZ1offsett], 32 // A4
+ ldfe fA5 = [rLnSinDataPtr], 32 // A5
+ nop.f 0
+}
+;;
+{ .mmf
+ ldfe fA6 = [rZ1offsett], 32 // A6
+ ldfe fA7 = [rLnSinDataPtr], 32 // A7
+ nop.f 0
+}
+;;
+{ .mmf
+ ldfe fA8 = [rZ1offsett], 32 // A8
+ ldfe fA9 = [rLnSinDataPtr], 32 // A9
+ nop.f 0
+}
+;;
+{ .mmf
+ ldfe fA10 = [rZ1offsett], 32 // A10
+ ldfe fA11 = [rLnSinDataPtr], 32 // A11
+ nop.f 0
+}
+;;
+{ .mmf
+ ldfe fA12 = [rZ1offsett], 32 // A12
+ ldfe fA13 = [rLnSinDataPtr], 32 // A13
+ fma.s1 FR_Q4 = FR_rsq, FR_rsq, f0 // y^4
+}
+;;
+{ .mmf
+ ldfe fA14 = [rZ1offsett], 32 // A14
+ ldfe fA15 = [rLnSinDataPtr], 32 // A15
+ nop.f 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1H = FR_FracX, fA1, f0 // (A1*y)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA3 = fA3, FR_FracX, fA2 // v4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA5 = fA5, FR_FracX, fA4 // v5
+ nop.i 0
+}
+;;
+{ .mfi
+ // store sign of GAMMA(x) if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ fma.s1 fA3L = FR_Q4, FR_Q4, f0 // v9 = y^8
+ nop.i 0
+}
+{ .mfi
+ // store sign of GAMMA(x) if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ fma.s1 fA7 = fA7, FR_FracX, fA6 // v7
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, FR_FracX, fA8 // v8
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fRes1L = FR_FracX, fA1, fRes1H // delta(A1*y)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, FR_FracX, fA10 // v12
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA13 = fA13, FR_FracX, fA12 // v13
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fRes2H = fRes1H, f1, fA0 // (A1*y + A0)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, FR_FracX, fA14 // v16
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA5 = fA5, FR_rsq, fA3 // v3
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, FR_rsq, fA7 // v6
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1L = FR_FracX, fA1L, fRes1L // (A1*y)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fRes2L = fA0, f1, fRes2H // start of rounding error of (A1*y)hi + A0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA13 = fA13, FR_rsq, fA11 // v11
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, FR_Q4, fA5 // v2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1L = fRes1L, f1, fA0L // (A1*y)lo + A0L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes2L = fRes2L, f1, fRes1H // (A0 - fRes2H) + (A1*y)hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, FR_Q4, fA13 // v10
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes2L = fRes1L, f1, fRes2L // (A1*y + A0)lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fA3L, fA15, fA9 // A2 + A3*y + ... + A15*y^13
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 f8 = FR_rsq , fPol, fRes2H // high part: y^2*fPol + (A1*y + A0)hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fPol, FR_rsq, f0 // y^2*fPol on its own, for the error term
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fRes1L = fRes2H, f1, f8 // start of rounding error of the high part
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes1L = fRes1L, f1, fPol // (fRes2H - f8) + y^2*fPol
+ nop.i 0
+}
+;;
+{.mfi
+ nop.m 0
+ fma.s1 fRes1L = fRes1L, f1, fRes2L // low part of the result
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ fma.s0 f8 = f8, f1, fRes1L // final result: high part + low part
+ // exit for 8.0 <= x <= 10.0
+ br.ret.sptk b0
+}
+;;
+
+// here if 4.0 <= x < 8.0: set up argument, sign and table pointers, then continue in lgamma_polynom25
+.align 32
+lgammal_4_8:
+{ .mfi
+ addl rPolDataPtr= @ltoff(lgammal_4_8_data),gp
+ fms.s1 FR_FracX = fSignifX, f1, FR_MHalf // polynomial argument: fSignifX - FR_MHalf (FR_MHalf is set before entry)
+ adds rSgnGam = 1, r0 // signgam = +1
+}
+;;
+{ .mfi
+ ld8 rPolDataPtr = [rPolDataPtr] // resolve table address from the @ltoff entry
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfb
+ adds rTmpPtr = 160, rPolDataPtr // second pointer, 160 bytes into the table
+ nop.f 0
+ // branch to special path which computes polynomial of 25th degree
+ br.sptk lgamma_polynom25
+}
+;;
+
+// here if 2.25 <= x < 4.0: set up argument, sign and table pointers, then continue in lgamma_polynom25
+.align 32
+lgammal_2Q_4:
+{ .mfi
+ addl rPolDataPtr= @ltoff(lgammal_2Q_4_data),gp
+ fms.s1 FR_FracX = fSignifX, f1, FR_MHalf // polynomial argument: fSignifX - FR_MHalf (FR_MHalf is set before entry)
+ adds rSgnGam = 1, r0 // signgam = +1
+}
+;;
+{ .mfi
+ ld8 rPolDataPtr = [rPolDataPtr] // resolve table address from the @ltoff entry
+ nop.f 0
+ nop.i 0
+}
+;;
+
+{ .mfb
+ adds rTmpPtr = 160, rPolDataPtr // second pointer, 160 bytes into the table
+ nop.f 0
+ // branch to special path which computes polynomial of 25th degree
+ br.sptk lgamma_polynom25
+}
+;;
+
+// here if 0.5 <= |x| < 0.75
+// p14 selects the positive-x variant, p15 the negative-x variant; the two
+// predicates are declared mutually exclusive so both bundles may write the
+// same registers in one instruction group.
+// On entry FR_FracX holds the shift constant (0.625 according to the comments
+// below); on exit it holds the polynomial argument.
+// signgam value is +1 for positive x and -1 for negative x.
+.align 32
+lgammal_half_3Q:
+.pred.rel "mutex", p14, p15
+{ .mfi
+(p14) addl rPolDataPtr= @ltoff(lgammal_half_3Q_data),gp
+ // FR_FracX = x - 0.625 for positive x
+(p14) fms.s1 FR_FracX = f8, f1, FR_FracX
+(p14) adds rSgnGam = 1, r0
+}
+{ .mfi
+(p15) addl rPolDataPtr= @ltoff(lgammal_half_3Q_neg_data),gp
+ // FR_FracX = x + 0.625 for negative x
+(p15) fma.s1 FR_FracX = f8, f1, FR_FracX
+(p15) adds rSgnGam = -1, r0
+}
+;;
+{ .mfi
+ // replace the linkage table slot address by the table address stored in it
+ ld8 rPolDataPtr = [rPolDataPtr]
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfb
+ // second pointer for the interleaved coefficient loads
+ adds rTmpPtr = 160, rPolDataPtr
+ nop.f 0
+ // branch to special path which computes polynomial of 25th degree
+ br.sptk lgamma_polynom25
+}
+;;
+// here if 1.3125 <= x < 1.5625
+// Sets rSgnGam = 1, FR_FracX = x - fA5L and rTmpPtr = rPolDataPtr + 160,
+// then continues at lgamma_polynom25.
+// NOTE(review): unlike the neighbouring range handlers, rPolDataPtr is not
+// loaded here, so it presumably already points at the coefficient table when
+// control arrives; fA5L is likewise prepared before this label - confirm both
+// at the branch site.
+.align 32
+lgammal_loc_min:
+{ .mfi
+ adds rSgnGam = 1, r0
+ nop.f 0
+ nop.i 0
+}
+{ .mfb
+ adds rTmpPtr = 160, rPolDataPtr
+ fms.s1 FR_FracX = f8, f1, fA5L
+ br.sptk lgamma_polynom25
+}
+;;
+// here if -2.605859375 <= x < -2.5
+// special polynomial approximation used since neither "near root"
+// approximation nor reflection formula give satisfactory accuracy on
+// this range
+// Sets FR_FracX = fB20 + x (fB20 is prepared before this label), rSgnGam = -1,
+// loads the address of lgammal_neg2andHalf_data and continues at
+// lgamma_polynom25.
+.align 32
+_neg2andHalf:
+{ .mfi
+ addl rPolDataPtr= @ltoff(lgammal_neg2andHalf_data),gp
+ fma.s1 FR_FracX = fB20, f1, f8 // 2.5 + x
+ adds rSgnGam = -1, r0
+}
+;;
+{.mfi
+ // replace the linkage table slot address by the table address stored in it
+ ld8 rPolDataPtr = [rPolDataPtr]
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfb
+ // second pointer for the interleaved coefficient loads
+ adds rTmpPtr = 160, rPolDataPtr
+ nop.f 0
+ // branch to special path which computes polynomial of 25th degree
+ br.sptk lgamma_polynom25
+}
+;;
+
+// here if -0.5 < x <= -0.40625
+// Builds a floating-point constant from the exponent held in rExpHalf
+// (presumably 0.5 - confirm where rExpHalf is set), adds x to it to form the
+// polynomial argument, sets rSgnGam = -1, loads the address of
+// lgammal_near_neg_half_data and continues at lgamma_polynom25.
+.align 32
+lgammal_near_neg_half:
+{ .mmf
+ addl rPolDataPtr= @ltoff(lgammal_near_neg_half_data),gp
+ setf.exp FR_FracX = rExpHalf
+ nop.f 0
+}
+;;
+{ .mfi
+ // replace the linkage table slot address by the table address stored in it
+ ld8 rPolDataPtr = [rPolDataPtr]
+ nop.f 0
+ adds rSgnGam = -1, r0
+}
+;;
+{ .mfb
+ // second pointer for the interleaved coefficient loads
+ adds rTmpPtr = 160, rPolDataPtr
+ // FR_FracX = constant built above + x
+ fma.s1 FR_FracX = FR_FracX, f1, f8
+ // branch to special path which computes polynomial of 25th degree
+ br.sptk lgamma_polynom25
+}
+;;
+
+// here if the answer is P25(x)
+// rPolDataPtr, rTmpPtr point to coefficients
+// x is in FR_FracX register
+//
+// Inputs:
+//   FR_FracX             - polynomial argument x
+//   rPolDataPtr, rTmpPtr - two pointers into the coefficient table; each load
+//                          below post-increments its pointer by 16
+//   rSgnGam, rSgnGamAddr, rSgnGamSize - value, address and size (4 or 8 bytes)
+//                          of the signgam variable to store
+// Output:
+//   f8 = P25(x), signgam stored, return through b0
+//
+// Evaluation scheme implemented below:
+//   P25(x) = (A3*x + A2)*x^2 + (A1*x + A0) + x^4*(E + x^8*(D + x^8*C))
+//   where C = C16..C21, D = D0..D7, E = E0..E7 are evaluated with plain fma
+//   in .s1, and the A0..A3 part is carried as hi/lo pairs (fAn / fAnL) with
+//   the rounding errors of the products and sums accumulated separately.
+//   <x^2> denotes the rounded product x*x, d(x^2) = x*x - <x^2>.
+.align 32
+lgamma_polynom25:
+{ .mfi
+ ldfpd fA3, fA0L = [rPolDataPtr], 16 // A3
+ nop.f 0
+ // p6: signgam is 4 bytes wide, p7: any other size (stored as 8 bytes)
+ cmp.eq p6, p7 = 4, rSgnGamSize
+}
+{ .mfi
+ ldfpd fA18, fA19 = [rTmpPtr], 16 // D6, D7
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA1, fA1L = [rPolDataPtr], 16 // A1
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA16, fA17 = [rTmpPtr], 16 // D4, D5
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA12, fA13 = [rPolDataPtr], 16 // D0, D1
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA14, fA15 = [rTmpPtr], 16 // D2, D3
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA24, fA25 = [rPolDataPtr], 16 // C20, C21
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA22, fA23 = [rTmpPtr], 16 // C18, C19
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA2, fA2L = [rPolDataPtr], 16 // A2
+ fma.s1 fA4L = FR_FracX, FR_FracX, f0 // x^2
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA20, fA21 = [rTmpPtr], 16 // C16, C17
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfe fA11 = [rTmpPtr], 16 // E7
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA0, fA3L = [rPolDataPtr], 16 // A0
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ ldfe fA10 = [rPolDataPtr], 16 // E6
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe fA9 = [rTmpPtr], 16 // E5
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mmf
+ ldfe fA8 = [rPolDataPtr], 16 // E4
+ ldfe fA7 = [rTmpPtr], 16 // E3
+ nop.f 0
+}
+;;
+{ .mmf
+ ldfe fA6 = [rPolDataPtr], 16 // E2
+ ldfe fA5 = [rTmpPtr], 16 // E1
+ nop.f 0
+}
+;;
+{ .mfi
+ ldfe fA4 = [rPolDataPtr], 16 // E0
+ fma.s1 fA5L = fA4L, fA4L, f0 // x^4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fB2 = FR_FracX, FR_FracX, fA4L // x^2 - <x^2>
+ nop.i 0
+}
+;;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ fma.s1 fRes4H = fA3, FR_FracX, f0 // (A3*x)hi
+ nop.i 0
+}
+{ .mfi
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ fma.s1 fA19 = fA19, FR_FracX, fA18 // D7*x + D6
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fResH = fA1, FR_FracX, f0 // (A1*x)hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB6 = fA1L, FR_FracX, fA0L // A1L*x + A0L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA17 = fA17, FR_FracX, fA16 // D5*x + D4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, FR_FracX, fA14 // D3*x + D2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA25 = fA25, FR_FracX, fA24 // C21*x + C20
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA13 = fA13, FR_FracX, fA12 // D1*x + D0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA23 = fA23, FR_FracX, fA22 // C19*x + C18
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA21 = fA21, FR_FracX, fA20 // C17*x + C16
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fRes4L = fA3, FR_FracX, fRes4H // delta((A3*x)hi)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2H = fRes4H, fA2 // (A3*x + A2)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fResL = fA1, FR_FracX, fResH // d(A1*x)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1H = fResH, fA0 // (A1*x + A0)hi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA19 = fA19, fA4L, fA17 // Dhi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, FR_FracX, fA10 // E7*x + E6
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // Doing this to raise inexact flag
+ // (fA10 = E6 was consumed in the previous group, so it is free as a target)
+ fma.s0 fA10 = fA0, fA0, f0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA15 = fA15, fA4L, fA13 // Dlo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // (C21*x + C20)*x^2 + C19*x + C18
+ fma.s1 fA25 = fA25, fA4L, fA23
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA9 = fA9, FR_FracX, fA8 // E5*x + E4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA7 = fA7, FR_FracX, fA6 // E3*x + E2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes4L = fA3L, FR_FracX, fRes4L // (A3*x)lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // first step of recovering the rounding error of (A3*x)hi + A2
+ fsub.s1 fRes2L = fA2, fRes2H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fResL = fResL, fB6 // (A1L*x + A0L) + d(A1*x)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // first step of recovering the rounding error of (A1*x)hi + A0
+ fsub.s1 fRes1L = fA0, fRes1H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA5 = fA5, FR_FracX, fA4 // E1*x + E0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB8 = fA5L, fA5L, f0 // x^8
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // ((C21*x + C20)*x^2 + C19*x + C18)*x^2 + C17*x + C16
+ fma.s1 fA25 = fA25, fA4L, fA21
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA19 = fA19, fA5L, fA15 // D
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, fA4L, fA9 // Ehi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // A2 - (A3*x + A2)hi + (A3*x)hi: rounding error of the hi sum
+ fadd.s1 fRes2L = fRes2L, fRes4H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes4L = fRes4L, fA2L // (A3*x)lo + A2L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes3H = fRes2H, fA4L, f0 // ((A3*x + A2)*x^2)hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // A0 - (A1*x + A0)hi + (A1*x)hi: rounding error of the hi sum
+ fadd.s1 fRes1L = fRes1L, fResH
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes3L = fRes2H, fB2, f0 // (A3*x + A2)hi*d(x^2)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA7 = fA7, fA4L, fA5 // Elo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA25 = fA25, fB8, fA19 // C*x^8 + D
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fRes4L // (A3*x + A2)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fB4 = fRes2H, fA4L, fRes3H // d((A3*x + A2)*x^2))
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, fResL // (A1*x + A0)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fB20 = fRes3H, fRes1H // Phi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA11 = fA11, fA5L, fA7 // E
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // ( (A3*x + A2)lo*<x^2> + (A3*x + A2)hi*d(x^2))
+ fma.s1 fRes3L = fRes2L, fA4L, fRes3L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // d((A3*x + A2)*x^2)) + (A1*x + A0)lo
+ fadd.s1 fRes1L = fRes1L, fB4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // first step of recovering the rounding error of Phi
+ fsub.s1 fB18 = fRes1H, fB20
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fA25, fB8, fA11 // (C*x^8 + D)*x^8 + E
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // all low-order terms of the A0..A3 part collected
+ fadd.s1 fRes1L = fRes1L, fRes3L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // (A1*x + A0)hi - Phi + ((A3*x + A2)*x^2)hi: rounding error of Phi
+ fadd.s1 fB18 = fB18, fRes3H
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fRes4H = fPol, fA5L, fB20 // result hi = fPol*x^4 + Phi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fPolL = fPol, fA5L, f0 // fPol*x^4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fB18 = fB18, fRes1L // Plo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // Phi - result hi
+ fsub.s1 fRes4L = fB20, fRes4H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // Plo + fPol*x^4
+ fadd.s1 fB18 = fB18, fPolL
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // result lo = (Phi - result hi) + Plo + fPol*x^4
+ fadd.s1 fRes4L = fRes4L, fB18
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ // final rounding in the user status field
+ fma.s0 f8 = fRes4H, f1, fRes4L
+ // P25(x) computed, exit here
+ br.ret.sptk b0
+}
+;;
+
+
+// here if 0.75 <= x < 1.3125
+// Prepares the inputs of lgamma_polynom24x for this range:
+//   FR_FracX    = fA5L (polynomial argument, prepared before this label)
+//   fB4         = fA5L*fA5L, the rounded square of the argument
+//   rSgnGam     = 1
+//   rPolDataPtr = address of lgammal_03Q_1Q_data, read from its linkage table entry
+//   rTmpPtr     = rPolDataPtr + 144, second pointer for the interleaved loads
+.align 32
+lgammal_03Q_1Q:
+{ .mfi
+ addl rPolDataPtr= @ltoff(lgammal_03Q_1Q_data),gp
+ fma.s1 FR_FracX = fA5L, f1, f0 // x
+ adds rSgnGam = 1, r0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB4 = fA5L, fA5L, f0 // x^2
+ nop.i 0
+}
+;;
+{ .mfi
+ // replace the linkage table slot address by the table address stored in it
+ ld8 rPolDataPtr = [rPolDataPtr]
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfb
+ adds rTmpPtr = 144, rPolDataPtr
+ nop.f 0
+ br.sptk lgamma_polynom24x
+}
+;;
+
+// here if 1.5625 <= x < 2.25
+// Prepares the inputs of lgamma_polynom24x for this range:
+//   FR_FracX    = fB4 on entry (polynomial argument, prepared before this label)
+//   fB4         = square of that argument; both bundles are in one instruction
+//                 group, so the first fma still reads the entry value of fB4
+//   rSgnGam     = 1
+//   rPolDataPtr = address of lgammal_13Q_2Q_data, read from its linkage table entry
+//   rTmpPtr     = rPolDataPtr + 144, second pointer for the interleaved loads
+.align 32
+lgammal_13Q_2Q:
+{ .mfi
+ addl rPolDataPtr= @ltoff(lgammal_13Q_2Q_data),gp
+ fma.s1 FR_FracX = fB4, f1, f0 // x
+ adds rSgnGam = 1, r0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB4 = fB4, fB4, f0 // x^2
+ nop.i 0
+}
+;;
+{ .mfi
+ // replace the linkage table slot address by the table address stored in it
+ ld8 rPolDataPtr = [rPolDataPtr]
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfb
+ adds rTmpPtr = 144, rPolDataPtr
+ nop.f 0
+ br.sptk lgamma_polynom24x
+}
+;;
+
+// here if result is Pol24(x)
+// x is in FR_FracX,
+// rPolDataPtr, rTmpPtr point to coefficients
+//
+// Further inputs used below:
+//   fB4 - <x^2>, the rounded square of x (set by the code branching here)
+//   rSgnGam, rSgnGamAddr, rSgnGamSize - value, address and size (4 or 8 bytes)
+//         of the signgam variable to store
+// Output: f8 = result, signgam stored, return through b0.
+//
+// Evaluation scheme implemented below:
+//   result = (A4*x^2 + A2)*x^2 + (A3*x^2 + A1)*x + x^5*(E + x^8*(D + x^8*C))
+//   where C = C16..C20, D = D0..D7, E = E0..E7 are evaluated with plain fma
+//   in .s1, and the A1..A4 part is carried as hi/lo pairs (fAn / fAnL) with
+//   the rounding errors of the products and sums accumulated separately.
+//   d(x^2) = x*x - <x^2>.
+.align 32
+lgamma_polynom24x:
+{ .mfi
+ ldfpd fA4, fA2L = [rPolDataPtr], 16
+ nop.f 0
+ // p6: signgam is 4 bytes wide, p7: any other size (stored as 8 bytes)
+ cmp.eq p6, p7 = 4, rSgnGamSize
+}
+{ .mfi
+ ldfpd fA23, fA24 = [rTmpPtr], 16 // C18, C19
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA3, fA1L = [rPolDataPtr], 16
+ fma.s1 fA5L = fB4, fB4, f0 // x^4
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA19, fA20 = [rTmpPtr], 16 // D6, D7
+ fms.s1 fB2 = FR_FracX, FR_FracX, fB4 // x^2 - <x^2>
+ nop.i 0
+}
+;;
+{ .mmf
+ ldfpd fA15, fA16 = [rPolDataPtr], 16 // D2, D3
+ ldfpd fA17, fA18 = [rTmpPtr], 16 // D4, D5
+ nop.f 0
+}
+;;
+{ .mmf
+ ldfpd fA13, fA14 = [rPolDataPtr], 16 // D0, D1
+ ldfpd fA12, fA21 = [rTmpPtr], 16 // E7, C16
+ nop.f 0
+}
+;;
+{ .mfi
+ ldfe fA11 = [rPolDataPtr], 16 // E6
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfe fA10 = [rTmpPtr], 16 // E5
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA2, fA4L = [rPolDataPtr], 16
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ ldfpd fA1, fA3L = [rTmpPtr], 16
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ ldfpd fA22, fA25 = [rPolDataPtr], 16 // C17, C20
+ fma.s1 fA0 = fA5L, fA5L, f0 // x^8
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA0L = fA5L, FR_FracX, f0 // x^5
+ nop.i 0
+}
+;;
+{ .mmf
+ ldfe fA9 = [rPolDataPtr], 16 // E4
+ ldfe fA8 = [rTmpPtr], 16 // E3
+ nop.f 0
+}
+;;
+{ .mmf
+ ldfe fA7 = [rPolDataPtr], 16 // E2
+ ldfe fA6 = [rTmpPtr], 16 // E1
+ nop.f 0
+}
+;;
+{ .mfi
+ // last load comes from rTmpPtr: rPolDataPtr has by now advanced by 9*16 = 144
+ // bytes, i.e. to the offset at which rTmpPtr started
+ ldfe fA5 = [rTmpPtr], 16 // E0
+ fma.s1 fRes4H = fA4, fB4, f0 // A4*<x^2>
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fA24, FR_FracX, fA23 // C19*x + C18
+ nop.i 0
+}
+;;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ fma.s1 fRes1H = fA3, fB4, f0 // A3*<x^2>
+ nop.i 0
+}
+{ .mfi
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ fma.s1 fA1L = fA3, fB2,fA1L // A3*d(x^2) + A1L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA20 = fA20, FR_FracX, fA19 // D7*x + D6
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA18 = fA18, FR_FracX, fA17 // D5*x + D4
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA16 = fA16, FR_FracX, fA15 // D3*x + D2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA14 = fA14, FR_FracX, fA13 // D1*x + D0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA2L = fA4, fB2,fA2L // A4*d(x^2) + A2L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA12 = fA12, FR_FracX, fA11 // E7*x + E6
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fRes2L = fA4, fB4, fRes4H // delta(A4*<x^2>)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2H = fRes4H, fA2 // A4*<x^2> + A2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fms.s1 fRes3L = fA3, fB4, fRes1H // delta(A3*<x^2>)
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes3H = fRes1H, fA1 // A3*<x^2> + A1
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA20 = fA20, fB4, fA18 // (D7*x + D6)*x^2 + D5*x + D4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA22 = fA22, FR_FracX, fA21 // C17*x + C16
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA16 = fA16, fB4, fA14 // (D3*x + D2)*x^2 + D1*x + D0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fA25, fB4, fPol // C20*x^2 + C19*x + C18
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA2L = fA4L, fB4, fA2L // A4L*<x^2> + A4*d(x^2) + A2L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA1L = fA3L, fB4, fA1L // A3L*<x^2> + A3*d(x^2) + A1L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes4L = fA2, fRes2H // d1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fResH = fRes2H, fB4, f0 // (A4*<x^2> + A2)*x^2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes1L = fA1, fRes3H // d1
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fB6 = fRes3H, FR_FracX, f0 // (A3*<x^2> + A1)*x
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fA10 = fA10, FR_FracX, fA9 // E5*x + E4
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA8 = fA8, FR_FracX, fA7 // E3*x + E2
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // (C20*x^2 + C19*x + C18)*x^2 + C17*x + C16
+ fma.s1 fPol = fPol, fB4, fA22
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA6 = fA6, FR_FracX, fA5 // E1*x + E0
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // A4L*<x^2> + A4*d(x^2) + A2L + delta(A4*<x^2>)
+ fadd.s1 fRes2L = fA2L, fRes2L
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // A3L*<x^2> + A3*d(x^2) + A1L + delta(A3*<x^2>)
+ fadd.s1 fRes3L = fA1L, fRes3L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes4L = fRes4L, fRes4H // d2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fResL = fRes2H, fB4, fResH // d((A4*<x^2> + A2)*x^2)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes1L = fRes1L, fRes1H // d2
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fms.s1 fB8 = fRes3H, FR_FracX, fB6 // d((A3*<x^2> + A1)*x)
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fB10 = fResH, fB6 // (A4*x^4 + .. + A1*x)hi
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA12 = fA12, fB4, fA10 // Ehi
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // ((D7*x + D6)*x^2 + D5*x + D4)*x^4 + (D3*x + D2)*x^2 + D1*x + D0
+ fma.s1 fA20 = fA20, fA5L, fA16
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fA8 = fA8, fB4, fA6 // Elo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes2L = fRes2L, fRes4L // (A4*<x^2> + A2)lo
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ // d((A4*<x^2> + A2)*x^2) + (A4*<x^2> + A2)*d(x^2)
+ fma.s1 fResL = fRes2H, fB2, fResL
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fadd.s1 fRes3L = fRes3L, fRes1L // (A3*<x^2> + A1)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // first step of recovering the rounding error of the hi sum in fB10
+ fsub.s1 fB12 = fB6, fB10
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fPol = fPol, fA0, fA20 // PolC*x^8 + PolD
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fma.s1 fPolL = fA12, fA5L, fA8 // E
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fResL = fB4, fRes2L, fResL // ((A4*<x^2> + A2)*x^2)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fma.s1 fRes3L = fRes3L, FR_FracX, fB8 // ((A3*<x^2> + A1)*x)lo
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // rounding error of the hi sum in fB10
+ fadd.s1 fB12 = fB12, fResH
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // (PolC*x^8 + PolD)*x^8 + E
+ fma.s1 fPol = fPol, fA0, fPolL
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // sum of the two low parts
+ fadd.s1 fRes3L = fRes3L, fResL
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // result hi = fPol*x^5 + (A4*x^4 + .. + A1*x)hi
+ fma.s1 fRes2H = fPol, fA0L, fB10
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // all low-order terms of the A1..A4 part collected
+ fadd.s1 fRes3L = fB12, fRes3L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ fsub.s1 fRes4L = fB10, fRes2H
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // fPol*x^5 + (fB10 - result hi): rounding error of result hi
+ fma.s1 fRes4L = fPol, fA0L, fRes4L
+ nop.i 0
+}
+;;
+{ .mfi
+ nop.m 0
+ // result lo
+ fadd.s1 fRes4L = fRes4L, fRes3L
+ nop.i 0
+}
+;;
+{ .mfb
+ nop.m 0
+ // final result for all paths for which the result is Pol24(x)
+ fma.s0 f8 = fRes2H, f1, fRes4L
+ // here is the exit for all paths for which the result is Pol24(x)
+ br.ret.sptk b0
+}
+;;
+
+
+// here if x is natval, nan, +/-inf, +/-0, or denormal
+// Classifies f8 with three fclass tests and dispatches:
+//   p9 (unnormalized, either sign)  -> lgammal_denormal_input
+//   p6 (NaTVal, NaN or +inf)        -> lgammal_nan_pinf
+//   p7 (+/-0)                       -> lgammal_singularity
+// What is left is -inf, handled inline at the end of this block.
+.align 32
+lgammal_spec:
+{ .mfi
+ nop.m 0
+ fclass.m p9, p0 = f8, 0xB // +/-denormals
+ nop.i 0
+};;
+{ .mfi
+ nop.m 0
+ fclass.m p6, p0 = f8, 0x1E1 // Test x for natval, nan, +inf
+ nop.i 0
+};;
+{ .mfb
+ nop.m 0
+ fclass.m p7, p0 = f8, 0x7 // +/-0
+(p9) br.cond.sptk lgammal_denormal_input
+};;
+{ .mfb
+ nop.m 0
+ nop.f 0
+ // branch out if x is natval, nan, +inf
+(p6) br.cond.spnt lgammal_nan_pinf
+};;
+{ .mfb
+ nop.m 0
+ nop.f 0
+ // branch out if x is +/-0
+(p7) br.cond.spnt lgammal_singularity
+};;
+// if we are still here then x = -inf
+{ .mfi
+ // p6: signgam is 4 bytes wide, p7: any other size (stored as 8 bytes)
+ cmp.eq p6, p7 = 4, rSgnGamSize
+ nop.f 0
+ adds rSgnGam = 1, r0
+};;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ nop.f 0
+ nop.i 0
+}
+{ .mfb
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ fma.s0 f8 = f8,f8,f0 // return +inf, no call to error support
+ br.ret.spnt b0
+};;
+
+// here if x is NaN, NatVal or +INF
+// Stores signgam = 1 and returns x + x computed in the user status field
+// (.s0); no call to error support is made on this path.
+.align 32
+lgammal_nan_pinf:
+{ .mfi
+ // p6: signgam is 4 bytes wide, p7: any other size (stored as 8 bytes)
+ cmp.eq p6, p7 = 4, rSgnGamSize
+ nop.f 0
+ adds rSgnGam = 1, r0
+}
+;;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ fma.s0 f8 = f8,f1,f8 // return x+x if x is natval, nan, +inf
+ nop.i 0
+}
+{ .mfb
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ nop.f 0
+ br.ret.sptk b0
+}
+;;
+
+// here if x denormal or unnormal
+// Normalizes f8 and recomputes every quantity derived from x below before
+// rejoining the main path at _deno_back_to_main_path:
+//   rSignifX  - significand bits of x
+//   rSignExpX - sign and exponent field of x
+//   rExpX     - rSignExpX masked with r17Ones
+//   fSignifX  - significand of x with the sign and exponent of 1.0
+//   fXint     - x converted to integer (kept in a floating-point register)
+//   fAbsX     - |x|
+//   p15/p14   - x < 0 / not (x < 0)
+.align 32
+lgammal_denormal_input:
+{ .mfi
+ nop.m 0
+ fma.s0 fResH = f1, f1, f8 // raise denormal exception
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ fnorm.s1 f8 = f8 // normalize input value
+ nop.i 0
+}
+;;
+{ .mfi
+ getf.sig rSignifX = f8
+ fmerge.se fSignifX = f1, f8
+ nop.i 0
+}
+{ .mfi
+ getf.exp rSignExpX = f8
+ fcvt.fx.s1 fXint = f8 // Convert arg to int (int repres. in FR)
+ nop.i 0
+}
+;;
+{ .mfi
+ // rSignExpX was already read from the normalized f8 in the previous group;
+ // f8 has not changed since, so no second getf.exp is needed here
+ nop.m 0
+ fcmp.lt.s1 p15, p14 = f8, f0
+ nop.i 0
+}
+;;
+{ .mfb
+ and rExpX = rSignExpX, r17Ones
+ fmerge.s fAbsX = f1, f8 // |x|
+ br.cond.sptk _deno_back_to_main_path
+}
+;;
+
+
+// here if overflow (x > overflow_bound)
+// Stores signgam = 1, copies x to FR_X, produces the overflowed result in
+// FR_RESULT by squaring a value with exponent field 0x1FFFE in the user status
+// field, and hands over to __libm_error_region with error tag 102.
+.align 32
+lgammal_overflow:
+{ .mfi
+ // exponent field for the huge operand built below
+ addl r8 = 0x1FFFE, r0
+ nop.f 0
+ // p6: signgam is 4 bytes wide, p7: any other size (stored as 8 bytes)
+ cmp.eq p6, p7 = 4, rSgnGamSize
+}
+{ .mfi
+ adds rSgnGam = 1, r0
+ nop.f 0
+ nop.i 0
+}
+;;
+{ .mfi
+ setf.exp f9 = r8
+ // keep the original argument for the error handler
+ fmerge.s FR_X = f8,f8
+ mov GR_Parameter_TAG = 102 // overflow
+};;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ nop.f 0
+ nop.i 0
+}
+{ .mfb
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ fma.s0 FR_RESULT = f9,f9,f0 // Set I,O and +INF result
+ br.cond.sptk __libm_error_region
+};;
+
+// here if x is negative integer or +/-0 (SINGULARITY)
+// Stores signgam (+1, or -1 when x is -0), copies x to FR_X, produces the
+// result with frcpa 1/0 in the user status field and hands over to
+// __libm_error_region with error tag 103.
+.align 32
+lgammal_singularity:
+{ .mfi
+ adds rSgnGam = 1, r0
+ fclass.m p8,p0 = f8,0x6 // is x -0?
+ mov GR_Parameter_TAG = 103 // negative
+}
+{ .mfi
+ // p6: signgam is 4 bytes wide, p7: any other size (stored as 8 bytes)
+ cmp.eq p6, p7 = 4, rSgnGamSize
+ // bit-exact copy of x for the error handler (same form as in
+ // lgammal_overflow); an arithmetic copy 0*0 + x would turn -0 into +0
+ fmerge.s FR_X = f8,f8
+ nop.i 0
+};;
+{ .mfi
+ // signgam = -1 for x = -0
+(p8) sub rSgnGam = r0, rSgnGam
+ nop.f 0
+ nop.i 0
+}
+{ .mfi
+ nop.m 0
+ nop.f 0
+ nop.i 0
+};;
+{ .mfi
+ // store signgam if size of variable is 4 bytes
+(p6) st4 [rSgnGamAddr] = rSgnGam
+ nop.f 0
+ nop.i 0
+}
+{ .mfb
+ // store signgam if size of variable is 8 bytes
+(p7) st8 [rSgnGamAddr] = rSgnGam
+ // 1/0 in the user status field yields the result passed to error support
+ frcpa.s0 FR_RESULT, p0 = f1, f0
+ br.cond.sptk __libm_error_region
+};;
+
+GLOBAL_LIBM_END(__libm_lgammal)
+
+
+// Common exit for the error paths: spills FR_X, FR_Y and FR_RESULT to a new
+// 64-byte stack frame, calls __libm_error_support with the addresses of the
+// three slots (GR_Parameter_X, GR_Parameter_Y, GR_Parameter_RESULT) and the
+// error tag set by the caller in GR_Parameter_TAG, then reloads f8 from the
+// result slot and returns to the caller of the math function.
+// Frame layout relative to the new sp:
+//   sp+16: FR_X    sp+32: FR_Y    sp+48: FR_RESULT
+LOCAL_LIBM_ENTRY(__libm_error_region)
+.prologue
+{ .mfi
+ add GR_Parameter_Y=-32,sp // Parameter 2 address (= new sp + 32)
+ nop.f 0
+.save ar.pfs,GR_SAVE_PFS
+ mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
+}
+{ .mfi
+.fframe 64
+ add sp=-64,sp // Create new stack
+ nop.f 0
+ mov GR_SAVE_GP=gp // Save gp
+};;
+{ .mmi
+ stfe [GR_Parameter_Y] = FR_Y,16 // Save Parameter 2 on stack
+ add GR_Parameter_X = 16,sp // Parameter 1 address
+.save b0, GR_SAVE_B0
+ mov GR_SAVE_B0=b0 // Save b0
+};;
+.body
+{ .mib
+ stfe [GR_Parameter_X] = FR_X // Store Parameter 1 on stack
+ add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
+ nop.b 0
+}
+{ .mib
+ stfe [GR_Parameter_Y] = FR_RESULT // Store Parameter 3 on stack
+ add GR_Parameter_Y = -16,GR_Parameter_Y // back to the Parameter 2 slot
+ br.call.sptk b0=__libm_error_support# // Call error handling function
+};;
+{ .mmi
+ // recompute the result slot address after the call
+ add GR_Parameter_RESULT = 48,sp
+ nop.m 999
+ nop.i 999
+};;
+{ .mmi
+ ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
+.restore sp
+ add sp = 64,sp // Restore stack pointer
+ mov b0 = GR_SAVE_B0 // Restore return address
+};;
+{ .mib
+ mov gp = GR_SAVE_GP // Restore gp
+ mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
+ br.ret.sptk b0 // Return
+};;
+
+LOCAL_LIBM_END(__libm_error_region#)
+
+.type __libm_error_support#,@function
+.global __libm_error_support#