#ifndef __VECMATH_H
#define __VECMATH_H

#include <math.h>
#include <xmmintrin.h>

// Two-component single-precision vector.
//
// The two floats can be addressed by index (v[0], v[1]) or by name (x, y);
// both views share the same storage through the anonymous union. Besides the
// component-wise constructor, a vec2 can be built from an __m128 register and
// updated in place with the compound assignment operators declared below
// (all of them are defined out of line, after the class).
class vec2
{
public:
	// Component storage, viewable as an array or as named fields.
	union
	{
		float v[2];
		struct
		{
			float x;
			float y;
		};
	};

	// Default construction performs no initialization of the components.
	vec2() {}

	// Builds a vector from its two components.
	vec2(float x, float y)
		: x(x)
		, y(y)
	{
	}

	// Conversion from an SSE register.
	inline vec2(__m128 r);
	// Copy construction.
	inline vec2(const vec2& a);

	// In-place addition of another vector or of a register value.
	inline vec2& operator+=(const vec2& b);
	inline vec2& operator+=(__m128 rb);
	// In-place subtraction of another vector or of a register value.
	inline vec2& operator-=(const vec2& b);
	inline vec2& operator-=(__m128 rb);
	// In-place scaling / division by a scalar.
	inline vec2& operator*=(float c);
	inline vec2& operator/=(float c);

	// Member forms of these helpers are disabled:
	//inline float square() const;
	//inline float length() const;
	//inline __m128 norm() const;
};
// Loads d[0] and d[1] into lanes 0 and 1 of an SSE register.
//
// Each element is fetched with its own scalar load, so exactly two floats are
// read from memory, and the two scalars are then interleaved. Because a
// scalar load clears the upper lanes, lanes 2 and 3 of the result are zero.
inline __m128 _loadl(const float* d)
{
	return _mm_unpacklo_ps(_mm_load_ss(&d[0]), _mm_load_ss(&d[1]));
}
// Writes lanes 0 and 1 of r to d[0] and d[1]; lanes 2 and 3 are not stored.
inline void _storel(float* d, __m128 r)
{
	// Swap lanes 0 and 1 so the second component can be written with a
	// scalar store as well.
	const __m128 swapped = _mm_shuffle_ps(r, r, _MM_SHUFFLE(3,2,0,1));
	_mm_store_ss(&d[0], r);
	_mm_store_ss(&d[1], swapped);
}
// Builds a vec2 from the two low lanes of r by storing them directly over
// this object's storage.
inline vec2::vec2(__m128 r)
{
	__m64* const dst = reinterpret_cast<__m64*>(this);
	_mm_storel_pi(dst, r);
}
// Copy constructor: moves both components of a through an SSE register.
// Two earlier variants are disabled in this file: plain member-wise
// initialization (x(a.x), y(a.y)) and an _mm_loadl_pi/_mm_storel_pi pair.
inline vec2::vec2(const vec2& a)
{
	const __m128 src = _loadl(a.v);
	_storel(v, src);
}
// Binary arithmetic on vec2 values and raw SSE registers.
//
// Every overload returns its result as an __m128 so that chained expressions
// stay in registers; vec2 operands are widened with _loadl and scalar
// operands are broadcast to all four lanes with _mm_set1_ps.
//
// The overloads whose parameters are only __m128 and/or float have no
// class-type parameter. C++ accepts such an operator overload only when
// __m128 itself is a class type, which is how MSVC declares it. GCC and Clang
// (including clang-cl) declare __m128 as a built-in vector type, reject these
// overloads outright, and provide the corresponding operators through their
// vector extensions instead. Those overloads are therefore compiled only
// where they are legal; add further compilers to the condition below if they
// also declare __m128 as a class type.
#if defined(_MSC_VER) && !defined(__clang__)
#	define VECMATH_RAW_M128_OPERATORS 1
#else
#	define VECMATH_RAW_M128_OPERATORS 0
#endif

// Component-wise addition.
inline __m128 operator+(const vec2& a, const vec2& b)	{ return _mm_add_ps(_loadl(a.v),_loadl(b.v)); }
inline __m128 operator+(const vec2& a, __m128 rb)		{ return _mm_add_ps(_loadl(a.v),rb); }
inline __m128 operator+(__m128 ra, const vec2& b)		{ return _mm_add_ps(ra,_loadl(b.v)); }
#if VECMATH_RAW_M128_OPERATORS
inline __m128 operator+(__m128 ra, __m128 rb)			{ return _mm_add_ps(ra,rb); }
#endif

// Component-wise subtraction.
inline __m128 operator-(const vec2& a, const vec2& b)	{ return _mm_sub_ps(_loadl(a.v),_loadl(b.v)); }
inline __m128 operator-(const vec2& a, __m128 rb)		{ return _mm_sub_ps(_loadl(a.v),rb); }
inline __m128 operator-(__m128 ra, const vec2& b)		{ return _mm_sub_ps(ra,_loadl(b.v)); }
#if VECMATH_RAW_M128_OPERATORS
inline __m128 operator-(__m128 ra, __m128 rb)			{ return _mm_sub_ps(ra,rb); }
#endif

// Multiplication of every lane by the scalar c.
inline __m128 operator*(const vec2& a, float c)			{ return _mm_mul_ps(_loadl(a.v), _mm_set1_ps(c)); }
inline __m128 operator*(float c, const vec2& a)			{ return _mm_mul_ps(_loadl(a.v), _mm_set1_ps(c)); }
#if VECMATH_RAW_M128_OPERATORS
inline __m128 operator*(__m128 ra, float c)				{ return _mm_mul_ps(ra, _mm_set1_ps(c)); }
inline __m128 operator*(float c, __m128 ra)				{ return _mm_mul_ps(ra, _mm_set1_ps(c)); }
#endif

// Division of every lane by the scalar c (no check for c == 0).
inline __m128 operator/(const vec2& a, float c)			{ return _mm_div_ps(_loadl(a.v), _mm_set1_ps(c)); }
#if VECMATH_RAW_M128_OPERATORS
inline __m128 operator/(__m128 ra, float c)				{ return _mm_div_ps(ra, _mm_set1_ps(c)); }
#endif

#undef VECMATH_RAW_M128_OPERATORS

// Compound assignment operators.
//
// Each one evaluates the matching binary operator, converts the resulting
// __m128 back into a vec2 through the converting constructor, assigns it to
// *this and returns *this.
inline vec2& vec2::operator+=(const vec2& b)
{
	return *this = *this + b;
}

inline vec2& vec2::operator+=(__m128 rb)
{
	return *this = *this + rb;
}

inline vec2& vec2::operator-=(const vec2& b)
{
	return *this = *this - b;
}

inline vec2& vec2::operator-=(__m128 rb)
{
	return *this = *this - rb;
}

inline vec2& vec2::operator*=(float c)
{
	return *this = *this * c;
}

inline vec2& vec2::operator/=(float c)
{
	return *this = *this / c;
}

// Unary minus: returns a vector with both components negated. Unlike the
// binary operators this uses plain scalar arithmetic and returns a vec2.
inline vec2 operator-(const vec2& a)
{
	const float negX = -a.x;
	const float negY = -a.y;
	return vec2(negX, negY);
}

// Returns x*x + y*y for the given vector.
inline float square(const vec2& a)
{
	const __m128 comps   = _loadl(a.v);
	const __m128 squares = _mm_mul_ps(comps, comps);
	// Bring y*y down to lane 0 so a scalar add can combine it with x*x.
	const __m128 swapped = _mm_shuffle_ps(squares, squares, _MM_SHUFFLE(3,2,0,1));

	float sum;
	_mm_store_ss(&sum, _mm_add_ss(squares, swapped));
	return sum;
}
// Returns the square root of square(a).
inline float length(const vec2& a)
{
	const float lengthSquared = square(a);
	return sqrt(lengthSquared);
}
// Returns a divided by its own length, as an SSE register.
//
// There is no guard for a zero-length input: in that case every lane is
// computed as 0/0.
inline __m128 unitlen(const vec2& a)
{
	const __m128 comps   = _loadl(a.v);
	const __m128 squares = _mm_mul_ps(comps, comps);
	const __m128 swapped = _mm_shuffle_ps(squares, squares, _MM_SHUFFLE(3,2,0,1));
	// Lane 0 holds sqrt(x*x + y*y); it is then copied into all four lanes.
	const __m128 len     = _mm_sqrt_ss(_mm_add_ss(swapped, squares));
	const __m128 lenAll  = _mm_shuffle_ps(len, len, _MM_SHUFFLE(0,0,0,0));
	return _mm_div_ps(comps, lenAll);
}

// Returns the dot product a.x*b.x + a.y*b.y.
inline float dot(const vec2& a, const vec2& b)
{
	const __m128 lhs      = _loadl(a.v);
	const __m128 rhs      = _loadl(b.v);
	const __m128 products = _mm_mul_ps(rhs, lhs);
	// Move the y product into lane 0 and add the x product to it.
	const __m128 swapped  = _mm_shuffle_ps(products, products, _MM_SHUFFLE(3,2,0,1));

	float sum;
	_mm_store_ss(&sum, _mm_add_ss(swapped, products));
	return sum;
}

#endif