//-------------------------------------------------------------------------------------
//
// Copyright 2009 Intel Corporation
// All Rights Reserved
//
// Permission is granted to use, copy, distribute and prepare derivative works of this
// software for any purpose and without fee, provided, that the above copyright notice
// and this statement appear in all copies.  Intel makes no representations about the
// suitability of this software for any purpose.  THIS SOFTWARE IS PROVIDED "AS IS."
// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY,
// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE,
// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  Intel does not
// assume any responsibility for any errors which may appear in this software nor any
// responsibility to update it.
//

/////////////////////////////////////////////////////////////////////////////
// Based upon:
//
// Approximate Math Library for SSE / SSE2
//  Header File
//  Version 2.0
//  Author Alex Klimovitski, Intel GmbH
/////////////////////////////////////////////////////////////////////////////
#include <emmintrin.h>

#include "AMaths.h"
#include "AMaths_internal.h"

#ifdef AMATHS_ASM

//-----------------------------------------------------------------------------
// am_pow_ps - packed single-precision approximation of pow(x, y), four lanes.
//
// Method (as implemented below):
//   1. Split x into exponent e and mantissa m, with t = 2*(m-1)/(m+1).
//   2. Evaluate a rational function in t*t (coefficients _ps_log_p0.._ps_log_q2)
//      and combine it with e to form v = y*e + y*_ps_log2_c0*(t + t*P/Q),
//      i.e. an approximation of y*log2(x) (assuming _ps_log2_c0 converts the
//      mantissa term to base 2 - confirm against the constant's definition).
//   3. Clamp v to [_ps_exp2_lo, _ps_exp2_hi], split it into an integer n and a
//      fraction f, evaluate a second rational function in f, and scale it by
//      2^n built directly in the float exponent field.
//   4. Lanes where (0 < x) is false (x <= 0 or NaN) are forced to +0.0.
//
// Inputs : the code reads the base from xmm0 and the exponent from xmm1 on
//          entry; neither is loaded from the stack.
// Output : xmm0.
// Writes : ecx, xmm0, xmm2-xmm7, mm0-mm3, mm5, mm7 (xmm1 is only read).
// Scratch: 32 bytes immediately below (esp & ~15); esp itself is not moved.
//          NOTE(review): this memory lies below the stack pointer, so it is
//          only safe while nothing else can push onto this stack during the
//          routine.
// Return : "ret 32" releases 32 bytes of argument space (callee-pop).
//          NOTE(review): arguments are consumed from registers while the
//          return still pops 32 stack bytes - confirm this matches the
//          argument-passing convention of the compiler in use.
//
// The MMX state is cleared with EMMS before returning so that callers may use
// x87 floating-point code without first calling _mm_empty().
//-----------------------------------------------------------------------------
__m128 __declspec(naked) __stdcall am_pow_ps(__m128 x, __m128 y)
{
	__asm
	{
		// ---- validity mask, clamp input, set up aligned scratch ----
		xorps	xmm4, xmm4
		cmpltps	xmm4, xmm0                 // xmm4 = (0 < x) lane mask, applied at the very end
		maxps	xmm0, _ps_am_min_norm_pos  // cut off denormalized stuff
		mov		ecx, esp
		movaps	xmm7, _ps_am_inv_mant_mask
		and		ecx, ~15                   // ecx = esp rounded down to 16 bytes (movaps needs alignment)
		movaps	xmm3, _ps_am_1
		movaps	[ecx - 16], xmm0           // spill clamped x so MMX can read its bit pattern

		// ---- mantissa: m = (x & inv_mant_mask) | _ps_am_1 ----
		andps	xmm0, xmm7
		orps	xmm0, xmm3
		movaps	xmm7, xmm0

		// ---- t = 2 * (m - 1) / (m + 1); exponent bits fetched in parallel ----
		subps	xmm0, xmm3                 // m - 1
		addps	xmm7, xmm3                 // m + 1
		movq	mm0, [ecx - 16]            // raw bits of lanes 0,1
		rcpps	xmm7, xmm7                 // approximate 1 / (m + 1)
		mulps	xmm0, xmm7
		movq	mm1, [ecx - 16 + 8]        // raw bits of lanes 2,3
		addps	xmm0, xmm0                 // t

		movq	mm7, _pi32_0x7f
		psrld	mm0, 23                    // shift the exponent field down to bit 0
		psrld	mm1, 23
		movaps	[ecx - 32], xmm4           // park the (0 < x) mask; xmm4 is reused below

		movaps	xmm2, xmm0
		psubd	mm0, mm7                   // e = field - _pi32_0x7f (lanes 0,1)
		mulps	xmm2, xmm2                 // t2 = t * t
		psubd	mm1, mm7                   // e (lanes 2,3)

		// ---- P = ((p0*t2 + p1)*t2 + p2)*t2,  Q = (q0*t2 + q1)*t2 + q2 ----
		movaps	xmm4, _ps_log_p0
		movaps	xmm6, _ps_log_q0

		mulps	xmm4, xmm2
		movaps	xmm5, _ps_log_p1
		mulps	xmm6, xmm2
		movaps	xmm7, _ps_log_q1

		addps	xmm4, xmm5
		addps	xmm6, xmm7

		movaps	xmm5, _ps_log_p2
		mulps	xmm4, xmm2
		cvtpi2ps	xmm3, mm1              // float(e) for lanes 2,3 lands in the low half
		movaps	xmm7, _ps_log_q2
		mulps	xmm6, xmm2

		ASM_MOVE_L2H(xmm3)                 // macro defined elsewhere; presumably copies low half to high half
		addps	xmm4, xmm5
		addps	xmm6, xmm7

		movaps	xmm5, _ps_log2_c0
		mulps	xmm4, xmm2                 // P
		cvtpi2ps	xmm3, mm0              // float(e) for lanes 0,1 -> xmm3 = float(e) for all lanes
		rcpps	xmm6, xmm6                 // approximate 1 / Q

		// ---- v = y*e + (y * _ps_log2_c0) * (t + t*P/Q) ----
		mulps	xmm5, xmm1                 // y * _ps_log2_c0
		mulps	xmm4, xmm6                 // P / Q
		movaps	xmm6, _ps_exp2_hi
		mulps	xmm4, xmm0                 // t * P / Q
		addps	xmm0, xmm4                 // t + t * P / Q
		movaps	xmm4, _ps_exp2_lo
		mulps	xmm3, xmm1                 // y * e
		mulps	xmm0, xmm5
		movaps	xmm5, _ps_am_1
		xorps	xmm7, xmm7

		addps	xmm0, xmm3                 // v
		movaps	xmm3, _ps_am_0p5

		// ---- clamp v to [_ps_exp2_lo, _ps_exp2_hi] ----
		minps	xmm0, xmm6
		maxps	xmm0, xmm4

		// ---- n = trunc(v + _ps_am_0p5), minus 1 in lanes where the sum is negative ----
		addps	xmm3, xmm0

		movaps	xmm2, xmm3

		cvttps2pi	mm0, xmm3              // truncate lanes 0,1
		cmpltps	xmm2, xmm7                 // mask of lanes where (v + _ps_am_0p5) < 0
		ASM_MOVE_H2L(xmm3)                 // macro defined elsewhere; presumably copies high half to low half
		andps	xmm2, xmm5                 // _ps_am_1 in negative lanes, 0 elsewhere
		cvttps2pi	mm1, xmm3              // truncate lanes 2,3
		movq	mm5, _pi32_0x7f

		cvtps2pi	mm2, xmm2  // needn't truncate
		ASM_MOVE_H2L(xmm2)
		cvtps2pi	mm3, xmm2  // needn't truncate

		psubd	mm0, mm2                   // apply the correction for negative lanes
		psubd	mm1, mm3

		// ---- f = v - float(n); in parallel add _pi32_0x7f to n for the exponent field ----
		cvtpi2ps	xmm3, mm1
		ASM_MOVE_L2H(xmm3)
		paddd	mm1, mm5
		cvtpi2ps	xmm3, mm0
		paddd	mm0, mm5

		subps	xmm0, xmm3                 // f

		movaps	xmm2, xmm0
		mulps	xmm2, xmm2                 // f2 = f * f

		// ---- Pe = ((p0*f2 + p1)*f2 + p2)*f,  Qe = q0*f2 + q1 ----
		movaps	xmm6, _ps_exp2_q0
		movaps	xmm4, _ps_exp2_p0

		mulps	xmm6, xmm2
		movaps	xmm7, _ps_exp2_q1
		mulps	xmm4, xmm2
		movaps	xmm5, _ps_exp2_p1

		addps	xmm6, xmm7                 // Qe
		pslld	mm0, 23                    // move (n + _pi32_0x7f) into the exponent field (lanes 0,1)
		addps	xmm4, xmm5

		movaps	xmm5, _ps_exp2_p2
		mulps	xmm4, xmm2
		pslld	mm1, 23                    // same for lanes 2,3
		movaps	xmm3, [ecx - 32]           // reload the (0 < x) mask

		addps	xmm4, xmm5
		movq	[ecx - 16], mm0            // write the 2^n bit patterns, lanes 0,1

		mulps	xmm4, xmm0                 // Pe
		movq	[ecx - 16 + 8], mm1        // lanes 2,3 - last MMX instruction before EMMS

		// ---- result = 2^n * (1 + 2*Pe/(Qe - Pe)) ----
		subps	xmm6, xmm4                 // Qe - Pe
		movaps	xmm7, _ps_am_1
		rcpps	xmm6, xmm6                 // approximate 1 / (Qe - Pe)
		mulps	xmm4, xmm6
		movaps	xmm0, [ecx - 16]           // 2^n reinterpreted as packed floats
		addps	xmm4, xmm4
		addps	xmm4, xmm7

		mulps	xmm0, xmm4
		andps	xmm0, xmm3                 // zero the lanes where (0 < x) was false

		emms                               // release MMX state so x87 code in the caller works

		ret		32
	}
}

#endif