GSVector: Add a bunch of fp64 operations

This commit is contained in:
Stenzek 2024-09-01 12:00:09 +10:00
parent b2e48ed5d8
commit 07d1c6ab14
No known key found for this signature in database
4 changed files with 610 additions and 342 deletions

View file

@ -1,6 +1,10 @@
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
//
// Lightweight wrapper over native SIMD types for cross-platform vector code.
//
#pragma once
#include "common/intrin.h"

View file

@ -828,12 +828,8 @@ public:
ALWAYS_INLINE operator float32x2_t() const { return v2s; }
ALWAYS_INLINE GSVector2 abs() const { return GSVector2(vabs_f32(v2s)); }
ALWAYS_INLINE GSVector2 neg() const { return GSVector2(vneg_f32(v2s)); }
ALWAYS_INLINE GSVector2 rcp() const { return GSVector2(vrecpe_f32(v2s)); }
ALWAYS_INLINE GSVector2 rcpnr() const
ALWAYS_INLINE GSVector2 rcp() const
{
float32x2_t recip = vrecpe_f32(v2s);
recip = vmul_f32(recip, vrecps_f32(recip, v2s));
@ -843,7 +839,6 @@ public:
#ifdef CPU_ARCH_ARM64
ALWAYS_INLINE GSVector2 floor() const { return GSVector2(vrndm_f32(v2s)); }
ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(vrndp_f32(v2s)); }
#else
@ -2004,50 +1999,6 @@ public:
ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(vmulq_s32(v4s, v.v4s)); }
template<int shift>
ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const
{
// (a - this) * f << shift + this
return add16(a.sub16(*this).modulate16<shift>(f));
}
template<int shift>
ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c)
{
// (a - b) * c << shift
return a.sub16(b).modulate16<shift>(c);
}
template<int shift>
ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c,
const GSVector4i& d)
{
// (a - b) * c << shift + d
return d.add16(a.sub16(b).modulate16<shift>(c));
}
ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a, const GSVector4i& f) const
{
// (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit)
return add16(a.sub16(*this).mul16l(f).sra16<4>());
}
template<int shift>
ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const
{
// a * f << shift
if (shift == 0)
{
return mul16hrs(f);
}
return sll16<shift + 1>().mul16hs(f);
}
ALWAYS_INLINE bool eq(const GSVector4i& v) const
{
const int32x4_t res = veorq_s32(v4s, v.v4s);
@ -2400,6 +2351,8 @@ class alignas(16) GSVector4
constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}
constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {}
public:
union
{
@ -2442,6 +2395,10 @@ public:
constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }
constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); }
constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); }
ALWAYS_INLINE GSVector4(float x, float y, float z, float w)
{
const float arr[4] = {x, y, z, w};
@ -2475,12 +2432,28 @@ public:
ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
#ifdef CPU_ARCH_ARM64
// Builds a vector holding two fp64 values in the low/high 64-bit lanes.
ALWAYS_INLINE static GSVector4 f64(double x, double y)
{
#ifdef CPU_ARCH_ARM64
  float64x2_t val = vdupq_n_f64(x);
  val = vsetq_lane_f64(y, val, 1);
  return GSVector4(vreinterpretq_f32_f64(val));
#else
  GSVector4 ret;
  ret.F64[0] = x;
  ret.F64[1] = y;
  return ret;
#endif
}
// Broadcasts a single fp64 value into both 64-bit lanes.
ALWAYS_INLINE static GSVector4 f64(double x)
{
#ifdef CPU_ARCH_ARM64
  const float64x2_t splat = vdupq_n_f64(x);
  return GSVector4(vreinterpretq_f32_f64(splat));
#else
  GSVector4 ret;
  ret.F64[0] = x;
  ret.F64[1] = x;
  return ret;
#endif
}
ALWAYS_INLINE void operator=(float f) { v4s = vdupq_n_f32(f); }
@ -2729,6 +2702,28 @@ public:
return vgetq_lane_s32(vreinterpretq_s32_f32(v4s), i);
}
// Returns a copy of this vector with 64-bit lane `dst` replaced by v.
// Fix: the scalar fallback previously returned an otherwise-uninitialized
// vector, losing the lane that was not inserted; the ARM64 path preserves
// it via vsetq_lane_f64, so the fallback must copy *this first.
template<int dst>
ALWAYS_INLINE GSVector4 insert64(double v) const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(v, vreinterpretq_f64_f32(v4s), dst)));
#else
GSVector4 ret(*this);
ret.F64[dst] = v;
return ret;
#endif
}
// Reads 64-bit lane `src` of this vector as an fp64 value.
template<int src>
ALWAYS_INLINE double extract64() const
{
#ifdef CPU_ARCH_ARM64
return vgetq_lane_f64(vreinterpretq_f64_f32(v4s), src);
#else
return F64[src];
#endif
}
ALWAYS_INLINE static GSVector4 zero() { return GSVector4(vdupq_n_f32(0.0f)); }
ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); }
@ -2903,43 +2898,182 @@ public:
return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s)));
}
#ifdef CPU_ARCH_ARM64
// Not in ARM32
ALWAYS_INLINE GSVector4 mul64(const GSVector4& v) const
{
return GSVector4(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)));
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
return GSVector4::f64(F64[0] * v.F64[0], F64[1] * v.F64[1]);
#endif
}
ALWAYS_INLINE GSVector4 add64(const GSVector4& v) const
{
return GSVector4(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)));
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
return GSVector4::f64(F64[0] + v.F64[0], F64[1] + v.F64[1]);
#endif
}
ALWAYS_INLINE GSVector4 sub64(const GSVector4& v) const
{
return GSVector4(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)));
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
return GSVector4::f64(F64[0] - v.F64[0], F64[1] - v.F64[1]);
#endif
}
// Lane-wise fp64 division: { F64[0] / v.F64[0], F64[1] / v.F64[1] }.
ALWAYS_INLINE GSVector4 div64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
  const float64x2_t num = vreinterpretq_f64_f32(v4s);
  const float64x2_t den = vreinterpretq_f64_f32(v.v4s);
  return GSVector4(vreinterpretq_f32_f64(vdivq_f64(num, den)));
#else
  return GSVector4::f64(F64[0] / v.F64[0], F64[1] / v.F64[1]);
#endif
}
// Lane-wise fp64 greater-than: each 64-bit lane becomes all-ones where
// this > v, zero otherwise (IEEE compare; NaN lanes compare false).
ALWAYS_INLINE GSVector4 gt64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
  const float64x2_t lhs = vreinterpretq_f64_f32(v4s);
  const float64x2_t rhs = vreinterpretq_f64_f32(v.v4s);
  return GSVector4(vreinterpretq_f32_f64(vcgtq_f64(lhs, rhs)));
#else
  GSVector4 res;
  for (int lane = 0; lane < 2; lane++)
    res.U64[lane] = (F64[lane] > v.F64[lane]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
  return res;
#endif
}
// Lane-wise fp64 equality: each 64-bit lane becomes all-ones where
// this == v, zero otherwise (IEEE compare; NaN lanes compare false).
ALWAYS_INLINE GSVector4 eq64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
// vceqq_f64 produces the per-lane 64-bit mask directly.
return GSVector4(vreinterpretq_f32_f64(vceqq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
GSVector4 ret;
ret.U64[0] = (F64[0] == v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
ret.U64[1] = (F64[1] == v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
return ret;
#endif
}
// Lane-wise fp64 less-than: each 64-bit lane becomes all-ones where
// this < v, zero otherwise.
// Fix: the ARM64 path used vcgtq_f64 (greater-than), inverting the
// comparison relative to the scalar fallback; vcltq_f64 is the correct
// intrinsic.
ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vcltq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
GSVector4 ret;
ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
ret.U64[1] = (F64[1] < v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
return ret;
#endif
}
// Lane-wise fp64 greater-or-equal: each 64-bit lane becomes all-ones where
// this >= v, zero otherwise.
ALWAYS_INLINE GSVector4 ge64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vcgeq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
GSVector4 ret;
ret.U64[0] = (F64[0] >= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
ret.U64[1] = (F64[1] >= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
return ret;
#endif
}
// Lane-wise fp64 less-or-equal: each 64-bit lane becomes all-ones where
// this <= v, zero otherwise.
ALWAYS_INLINE GSVector4 le64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vcleq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
#else
GSVector4 ret;
ret.U64[0] = (F64[0] <= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
ret.U64[1] = (F64[1] <= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
return ret;
#endif
}
// Lane-wise fp64 minimum of this and v.
ALWAYS_INLINE GSVector4 min64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
  const float64x2_t a = vreinterpretq_f64_f32(v4s);
  const float64x2_t b = vreinterpretq_f64_f32(v.v4s);
  return GSVector4(vreinterpretq_f32_f64(vminq_f64(a, b)));
#else
  const double lo = std::min(F64[0], v.F64[0]);
  const double hi = std::min(F64[1], v.F64[1]);
  return GSVector4::f64(lo, hi);
#endif
}
// Lane-wise fp64 maximum of this and v.
ALWAYS_INLINE GSVector4 max64(const GSVector4& v) const
{
#ifdef CPU_ARCH_ARM64
  const float64x2_t a = vreinterpretq_f64_f32(v4s);
  const float64x2_t b = vreinterpretq_f64_f32(v.v4s);
  return GSVector4(vreinterpretq_f32_f64(vmaxq_f64(a, b)));
#else
  const double lo = std::max(F64[0], v.F64[0]);
  const double hi = std::max(F64[1], v.F64[1]);
  return GSVector4::f64(lo, hi);
#endif
}
// Clears the fp64 sign bits -> absolute value of both double lanes.
ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); }
// Flips the fp64 sign bits -> negation of both double lanes.
ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); }
// Lane-wise fp64 square root.
ALWAYS_INLINE GSVector4 sqrt64() const
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vsqrtq_f64(vreinterpretq_f64_f32(v4s))));
#else
return GSVector4::f64(std::sqrt(F64[0]), std::sqrt(F64[1]));
#endif
}
// Lane-wise fp64 square (x * x).
// Fix: the ARM64 path used vsqrtq_f64, which computes a square ROOT, not a
// square — contradicting the scalar fallback below. Multiply the vector by
// itself instead.
ALWAYS_INLINE GSVector4 sqr64() const
{
#ifdef CPU_ARCH_ARM64
const float64x2_t d = vreinterpretq_f64_f32(v4s);
return GSVector4(vreinterpretq_f32_f64(vmulq_f64(d, d)));
#else
return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]);
#endif
}
// Lane-wise fp64 floor (round toward negative infinity).
ALWAYS_INLINE GSVector4 floor64() const
{
#ifdef CPU_ARCH_ARM64
// vrndmq_f64 = round toward minus infinity.
return GSVector4(vreinterpretq_f32_f64(vrndmq_f64(vreinterpretq_f64_f32(v4s))));
#else
return GSVector4::f64(std::floor(F64[0]), std::floor(F64[1]));
#endif
}
// Widens the two low f32 lanes of v to fp64 lanes.
ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v)
{
#ifdef CPU_ARCH_ARM64
return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vget_low_f32(v.v4s))));
#else
return GSVector4::f64(static_cast<double>(vgetq_lane_f32(v.v4s, 0)), static_cast<double>(vgetq_lane_f32(v.v4s, 1)));
#endif
}
// Loads two packed f32 values from p and widens them to fp64 lanes.
ALWAYS_INLINE static GSVector4 f32to64(const void* p)
{
#ifdef CPU_ARCH_ARM64
  const float32x2_t pair = vld1_f32(static_cast<const float*>(p));
  return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(pair)));
#else
  const float* src = static_cast<const float*>(p);
  return GSVector4::f64(static_cast<double>(src[0]), static_cast<double>(src[1]));
#endif
}
// Converts both fp64 lanes to s32 (truncation toward zero via static_cast)
// and packs them into the two low 32-bit lanes; upper lanes are zeroed.
ALWAYS_INLINE GSVector4i f64toi32() const
{
#ifdef CPU_ARCH_ARM64
const s32 low = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 0));
const s32 high = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 1));
#else
const s32 low = static_cast<s32>(F64[0]);
const s32 high = static_cast<s32>(F64[1]);
#endif
return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1));
}
#endif
// clang-format off
#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \

View file

@ -603,13 +603,6 @@ public:
GSVector2 rcp() const { return GSVector2(1.0f / x, 1.0f / y); }
GSVector2 rcpnr() const
{
GSVector2 v_ = rcp();
return (v_ + v_) - (v_ * v_) * *this;
}
GSVector2 floor() const { return GSVector2(std::floor(x), std::floor(y)); }
GSVector2 ceil() const { return GSVector2(std::ceil(x), std::ceil(y)); }
@ -1461,50 +1454,6 @@ public:
GSVector4i mul32l(const GSVector4i& v) const { ALL_LANES_32(ret.S32[i] = S32[i] * v.S32[i]); }
template<s32 shift>
ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const
{
// (a - this) * f << shift + this
return add16(a.sub16(*this).modulate16<shift>(f));
}
template<s32 shift>
ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c)
{
// (a - b) * c << shift
return a.sub16(b).modulate16<shift>(c);
}
template<s32 shift>
ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c,
const GSVector4i& d)
{
// (a - b) * c << shift + d
return d.add16(a.sub16(b).modulate16<shift>(c));
}
ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a_, const GSVector4i& f) const
{
// (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit)
return add16(a_.sub16(*this).mul16l(f).sra16<4>());
}
template<s32 shift>
ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const
{
// a * f << shift
if constexpr (shift == 0)
{
return mul16hrs(f);
}
return sll16<shift + 1>().mul16hs(f);
}
ALWAYS_INLINE bool eq(const GSVector4i& v) const { return (std::memcmp(S32, v.S32, sizeof(S32))) == 0; }
GSVector4i eq8(const GSVector4i& v) const { ALL_LANES_8(ret.S8[i] = (S8[i] == v.S8[i]) ? -1 : 0); }
@ -1791,6 +1740,8 @@ class alignas(16) GSVector4
constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}
constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {}
public:
union
{
@ -1832,6 +1783,10 @@ public:
constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }
constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); }
constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); }
ALWAYS_INLINE GSVector4(float x, float y, float z, float w)
{
this->x = x;
@ -1881,6 +1836,13 @@ public:
return ret;
}
// Broadcasts a single fp64 value into both 64-bit lanes.
ALWAYS_INLINE static GSVector4 f64(double x)
{
  GSVector4 ret;
  ret.F64[0] = x;
  ret.F64[1] = x;
  return ret;
}
ALWAYS_INLINE void operator=(float f) { x = y = z = w = f; }
u32 rgba32() const { return GSVector4i(*this).rgba32(); }
@ -2045,6 +2007,20 @@ public:
return I32[i];
}
// Returns a copy of this vector with 64-bit lane `dst` replaced by v.
// Fix: previously returned an otherwise-uninitialized vector, losing the
// lane that was not inserted; copy *this first so the other lane survives
// (matching the SIMD backends, which insert into the existing register).
template<int dst>
ALWAYS_INLINE GSVector4 insert64(double v) const
{
  GSVector4 ret(*this);
  ret.F64[dst] = v;
  return ret;
}
// Reads 64-bit lane `src` of this vector as an fp64 value.
template<int src>
ALWAYS_INLINE double extract64() const
{
return F64[src];
}
ALWAYS_INLINE static constexpr GSVector4 zero() { return GSVector4::cxpr(0.0f, 0.0f, 0.0f, 0.0f); }
ALWAYS_INLINE static constexpr GSVector4 xffffffff()
@ -2300,6 +2276,71 @@ public:
return ret;
}
// Lane-wise fp64 division: { F64[0] / v.F64[0], F64[1] / v.F64[1] }.
ALWAYS_INLINE GSVector4 div64(const GSVector4& v) const
{
  const double q0 = F64[0] / v.F64[0];
  const double q1 = F64[1] / v.F64[1];
  return GSVector4::f64(q0, q1);
}
// Lane-wise fp64 greater-than: each 64-bit lane becomes all-ones where
// this > v, zero otherwise.
ALWAYS_INLINE GSVector4 gt64(const GSVector4& v) const
{
  GSVector4 res;
  for (int lane = 0; lane < 2; lane++)
    res.U64[lane] = (F64[lane] > v.F64[lane]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
  return res;
}
// Lane-wise fp64 equality: each 64-bit lane becomes all-ones where
// this == v, zero otherwise (NaN lanes compare false).
ALWAYS_INLINE GSVector4 eq64(const GSVector4& v) const
{
  GSVector4 res;
  for (int lane = 0; lane < 2; lane++)
    res.U64[lane] = (F64[lane] == v.F64[lane]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
  return res;
}
// Lane-wise fp64 less-than: each 64-bit lane becomes all-ones where
// this < v, zero otherwise.
ALWAYS_INLINE GSVector4 lt64(const GSVector4& v) const
{
GSVector4 ret;
ret.U64[0] = (F64[0] < v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
ret.U64[1] = (F64[1] < v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
return ret;
}
// Lane-wise fp64 greater-or-equal: each 64-bit lane becomes all-ones where
// this >= v, zero otherwise.
ALWAYS_INLINE GSVector4 ge64(const GSVector4& v) const
{
GSVector4 ret;
ret.U64[0] = (F64[0] >= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
ret.U64[1] = (F64[1] >= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
return ret;
}
// Lane-wise fp64 less-or-equal: each 64-bit lane becomes all-ones where
// this <= v, zero otherwise.
ALWAYS_INLINE GSVector4 le64(const GSVector4& v) const
{
GSVector4 ret;
ret.U64[0] = (F64[0] <= v.F64[0]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
ret.U64[1] = (F64[1] <= v.F64[1]) ? 0xFFFFFFFFFFFFFFFFULL : 0;
return ret;
}
// Lane-wise fp64 minimum of this and v.
ALWAYS_INLINE GSVector4 min64(const GSVector4& v) const
{
  const double lo = std::min(F64[0], v.F64[0]);
  const double hi = std::min(F64[1], v.F64[1]);
  return GSVector4::f64(lo, hi);
}
// Lane-wise fp64 maximum of this and v.
ALWAYS_INLINE GSVector4 max64(const GSVector4& v) const
{
  const double lo = std::max(F64[0], v.F64[0]);
  const double hi = std::max(F64[1], v.F64[1]);
  return GSVector4::f64(lo, hi);
}
// Clears the fp64 sign bits -> absolute value of both double lanes.
ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); }
// Flips the fp64 sign bits -> negation of both double lanes.
// Fix: the expression had unbalanced/garbled parentheses ("...ULL(); }")
// and did not compile; closed the cxpr64 call correctly.
ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); }
// Lane-wise fp64 square root.
ALWAYS_INLINE GSVector4 sqrt64() const { return GSVector4::f64(std::sqrt(F64[0]), std::sqrt(F64[1])); }
// Lane-wise fp64 square (x * x).
ALWAYS_INLINE GSVector4 sqr64() const { return GSVector4::f64(F64[0] * F64[0], F64[1] * F64[1]); }
// Lane-wise fp64 floor (round toward negative infinity).
ALWAYS_INLINE GSVector4 floor64() const { return GSVector4::f64(std::floor(F64[0]), std::floor(F64[1])); }
ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_)
{
GSVector4 ret;

View file

@ -1,6 +1,7 @@
// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team, 2019-2024 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: LGPL-3.0+
//
// Lightweight wrapper over native SIMD types for cross-platform vector code.
// Rewritten and NEON+No-SIMD variants added for DuckStation.
//
@ -63,11 +64,9 @@ public:
GSVector2i() = default;
ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x, s32 y) { return GSVector2i(cxpr_init, x, y); }
ALWAYS_INLINE constexpr static GSVector2i cxpr(s32 x) { return GSVector2i(cxpr_init, x, x); }
ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 x) { return GSVector2i(cxpr_init, x, x, x, x); }
ALWAYS_INLINE constexpr static GSVector2i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3)
{
return GSVector2i(cxpr_init, s0, s1, s2, s3);
@ -79,26 +78,26 @@ public:
}
ALWAYS_INLINE GSVector2i(s32 x, s32 y) { m = _mm_set_epi32(0, 0, y, x); }
ALWAYS_INLINE GSVector2i(s16 s0, s16 s1, s16 s2, s16 s3) { m = _mm_set_epi16(0, 0, 0, 0, s3, s2, s1, s0); }
ALWAYS_INLINE constexpr GSVector2i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7)
: S8{b0, b1, b2, b3, b4, b5, b6, b7, 0, 0, 0, 0, 0, 0, 0, 0}
{
}
// MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7),
// so leave the non-constexpr version default
ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; }
ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);
ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);
ALWAYS_INLINE constexpr explicit GSVector2i(__m128i m) : m(m) {}
ALWAYS_INLINE void operator=(s32 i) { m = _mm_set1_epi32(i); }
ALWAYS_INLINE void operator=(__m128i m_) { m = m_; }
ALWAYS_INLINE GSVector2i& operator=(s32 i)
{
m = _mm_set1_epi32(i);
return *this;
}
ALWAYS_INLINE GSVector2i& operator=(__m128i m_)
{
m = m_;
return *this;
}
ALWAYS_INLINE operator __m128i() const { return m; }
@ -142,10 +141,7 @@ public:
ALWAYS_INLINE GSVector2i min_u32(const GSVector2i& v) const { return GSVector2i(_mm_min_epu32(m, v)); }
ALWAYS_INLINE GSVector2i max_u32(const GSVector2i& v) const { return GSVector2i(_mm_max_epu32(m, v)); }
ALWAYS_INLINE s32 addv_s32() const
{
return _mm_cvtsi128_si32(_mm_hadd_epi32(m, m));
}
ALWAYS_INLINE s32 addv_s32() const { return _mm_cvtsi128_si32(_mm_hadd_epi32(m, m)); }
ALWAYS_INLINE u8 minv_u8() const
{
@ -180,11 +176,8 @@ public:
}
ALWAYS_INLINE s32 minv_s32() const { return std::min<s32>(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); }
ALWAYS_INLINE u32 minv_u32() const { return std::min<u32>(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); }
ALWAYS_INLINE s32 maxv_s32() const { return std::max<s32>(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); }
ALWAYS_INLINE u32 maxv_u32() const { return std::max<u32>(_mm_extract_epi32(m, 0), _mm_extract_epi32(m, 1)); }
ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); }
@ -333,39 +326,25 @@ public:
#endif
ALWAYS_INLINE GSVector2i add8(const GSVector2i& v) const { return GSVector2i(_mm_add_epi8(m, v.m)); }
ALWAYS_INLINE GSVector2i add16(const GSVector2i& v) const { return GSVector2i(_mm_add_epi16(m, v.m)); }
ALWAYS_INLINE GSVector2i add32(const GSVector2i& v) const { return GSVector2i(_mm_add_epi32(m, v.m)); }
ALWAYS_INLINE GSVector2i adds8(const GSVector2i& v) const { return GSVector2i(_mm_adds_epi8(m, v.m)); }
ALWAYS_INLINE GSVector2i adds16(const GSVector2i& v) const { return GSVector2i(_mm_adds_epi16(m, v.m)); }
ALWAYS_INLINE GSVector2i addus8(const GSVector2i& v) const { return GSVector2i(_mm_adds_epu8(m, v.m)); }
ALWAYS_INLINE GSVector2i addus16(const GSVector2i& v) const { return GSVector2i(_mm_adds_epu16(m, v.m)); }
ALWAYS_INLINE GSVector2i sub8(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi8(m, v.m)); }
ALWAYS_INLINE GSVector2i sub16(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi16(m, v.m)); }
ALWAYS_INLINE GSVector2i sub32(const GSVector2i& v) const { return GSVector2i(_mm_sub_epi32(m, v.m)); }
ALWAYS_INLINE GSVector2i subs8(const GSVector2i& v) const { return GSVector2i(_mm_subs_epi8(m, v.m)); }
ALWAYS_INLINE GSVector2i subs16(const GSVector2i& v) const { return GSVector2i(_mm_subs_epi16(m, v.m)); }
ALWAYS_INLINE GSVector2i subus8(const GSVector2i& v) const { return GSVector2i(_mm_subs_epu8(m, v.m)); }
ALWAYS_INLINE GSVector2i subus16(const GSVector2i& v) const { return GSVector2i(_mm_subs_epu16(m, v.m)); }
ALWAYS_INLINE GSVector2i avg8(const GSVector2i& v) const { return GSVector2i(_mm_avg_epu8(m, v.m)); }
ALWAYS_INLINE GSVector2i avg16(const GSVector2i& v) const { return GSVector2i(_mm_avg_epu16(m, v.m)); }
ALWAYS_INLINE GSVector2i mul16l(const GSVector2i& v) const { return GSVector2i(_mm_mullo_epi16(m, v.m)); }
ALWAYS_INLINE GSVector2i mul32l(const GSVector2i& v) const { return GSVector2i(_mm_mullo_epi32(m, v.m)); }
ALWAYS_INLINE bool eq(const GSVector2i& v) const { return eq8(v).alltrue(); }
@ -399,7 +378,6 @@ public:
ALWAYS_INLINE s32 mask() const { return (_mm_movemask_epi8(m) & 0xff); }
ALWAYS_INLINE bool alltrue() const { return (mask() == 0xff); }
ALWAYS_INLINE bool allfalse() const { return (mask() == 0x00); }
template<s32 i>
@ -442,24 +420,35 @@ public:
}
ALWAYS_INLINE static GSVector2i load32(const void* p) { return GSVector2i(_mm_loadu_si32(p)); }
ALWAYS_INLINE static GSVector2i load(const void* p) { return GSVector2i(_mm_loadl_epi64((__m128i*)p)); }
ALWAYS_INLINE static GSVector2i load(const void* p)
{
return GSVector2i(_mm_loadl_epi64(static_cast<const __m128i*>(p)));
}
ALWAYS_INLINE static GSVector2i load(s32 i) { return GSVector2i(_mm_cvtsi32_si128(i)); }
ALWAYS_INLINE static GSVector2i loadq(s64 i) { return GSVector2i(_mm_cvtsi64_si128(i)); }
ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { _mm_storel_epi64((__m128i*)p, v.m); }
ALWAYS_INLINE static void store(void* p, const GSVector2i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); }
ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { _mm_storeu_si32(p, v); }
ALWAYS_INLINE static s32 store(const GSVector2i& v) { return _mm_cvtsi128_si32(v.m); }
ALWAYS_INLINE static s64 storeq(const GSVector2i& v) { return _mm_cvtsi128_si64(v.m); }
ALWAYS_INLINE void operator&=(const GSVector2i& v) { m = _mm_and_si128(m, v); }
ALWAYS_INLINE void operator|=(const GSVector2i& v) { m = _mm_or_si128(m, v); }
ALWAYS_INLINE void operator^=(const GSVector2i& v) { m = _mm_xor_si128(m, v); }
ALWAYS_INLINE GSVector2i& operator&=(const GSVector2i& v)
{
m = _mm_and_si128(m, v);
return *this;
}
ALWAYS_INLINE GSVector2i& operator|=(const GSVector2i& v)
{
m = _mm_or_si128(m, v);
return *this;
}
ALWAYS_INLINE GSVector2i& operator^=(const GSVector2i& v)
{
m = _mm_xor_si128(m, v);
return *this;
}
ALWAYS_INLINE friend GSVector2i operator&(const GSVector2i& v1, const GSVector2i& v2)
{
@ -485,6 +474,7 @@ public:
ALWAYS_INLINE friend GSVector2i operator~(const GSVector2i& v) { return v ^ v.eq32(v); }
ALWAYS_INLINE static GSVector2i zero() { return GSVector2i(_mm_setzero_si128()); }
ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);
ALWAYS_INLINE GSVector2i xy() const { return GSVector2i(m); }
ALWAYS_INLINE GSVector2i xx() const { return GSVector2i(_mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 0, 0))); }
@ -500,7 +490,6 @@ class alignas(16) GSVector2
static constexpr cxpr_init_tag cxpr_init{};
constexpr GSVector2(cxpr_init_tag, float x, float y) : F32{x, y} {}
constexpr GSVector2(cxpr_init_tag, int x, int y) : I32{x, y} {}
public:
@ -530,28 +519,20 @@ public:
GSVector2() = default;
constexpr static GSVector2 cxpr(float x, float y) { return GSVector2(cxpr_init, x, y); }
constexpr static GSVector2 cxpr(float x) { return GSVector2(cxpr_init, x, x); }
constexpr static GSVector2 cxpr(int x, int y) { return GSVector2(cxpr_init, x, y); }
constexpr static GSVector2 cxpr(int x) { return GSVector2(cxpr_init, x, x); }
ALWAYS_INLINE GSVector2(float x, float y) { m = _mm_set_ps(0, 0, y, x); }
ALWAYS_INLINE GSVector2(int x, int y)
{
GSVector2i v_(x, y);
m = _mm_cvtepi32_ps(v_.m);
}
ALWAYS_INLINE constexpr explicit GSVector2(__m128 m) : m(m) {}
ALWAYS_INLINE explicit GSVector2(__m128d m) : m(_mm_castpd_ps(m)) {}
ALWAYS_INLINE explicit GSVector2(float f) { *this = f; }
ALWAYS_INLINE explicit GSVector2(int i)
{
#ifdef CPU_ARCH_AVX2
@ -563,38 +544,23 @@ public:
ALWAYS_INLINE explicit GSVector2(const GSVector2i& v);
ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v);
ALWAYS_INLINE void operator=(float f)
ALWAYS_INLINE GSVector2& operator=(float f)
{
#if CPU_ARCH_AVX2
m = _mm_broadcastss_ps(_mm_load_ss(&f));
#else
m = _mm_set1_ps(f);
#endif
return *this;
}
ALWAYS_INLINE void operator=(__m128 m_) { this->m = m_; }
ALWAYS_INLINE GSVector2& operator=(__m128 m_)
{
m = m_;
return *this;
}
ALWAYS_INLINE operator __m128() const { return m; }
ALWAYS_INLINE GSVector2 abs() const { return *this & cast(GSVector2i::cxpr(0x7fffffff)); }
ALWAYS_INLINE GSVector2 neg() const { return *this ^ cast(GSVector2i::cxpr(0x80000000)); }
ALWAYS_INLINE GSVector2 rcp() const { return GSVector2(_mm_rcp_ps(m)); }
ALWAYS_INLINE GSVector2 rcpnr() const
{
GSVector2 v_ = rcp();
return (v_ + v_) - (v_ * v_) * *this;
}
ALWAYS_INLINE GSVector2 floor() const
{
return GSVector2(_mm_round_ps(m, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
@ -657,27 +623,77 @@ public:
ALWAYS_INLINE static GSVector2 xffffffff() { return zero() == zero(); }
ALWAYS_INLINE static GSVector2 load(const void* p) { return GSVector2(_mm_castpd_ps(_mm_load_sd((double*)p))); }
ALWAYS_INLINE static GSVector2 load(const void* p)
{
return GSVector2(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p))));
}
ALWAYS_INLINE static GSVector2 load(float f) { return GSVector2(_mm_load_ss(&f)); }
ALWAYS_INLINE static void store(void* p, const GSVector2& v) { _mm_store_sd((double*)p, _mm_castps_pd(v.m)); }
ALWAYS_INLINE static void store(void* p, const GSVector2& v)
{
_mm_store_sd(static_cast<double*>(p), _mm_castps_pd(v.m));
}
ALWAYS_INLINE GSVector2 operator-() const { return neg(); }
ALWAYS_INLINE void operator+=(const GSVector2& v_) { m = _mm_add_ps(m, v_); }
ALWAYS_INLINE void operator-=(const GSVector2& v_) { m = _mm_sub_ps(m, v_); }
ALWAYS_INLINE void operator*=(const GSVector2& v_) { m = _mm_mul_ps(m, v_); }
ALWAYS_INLINE void operator/=(const GSVector2& v_) { m = _mm_div_ps(m, v_); }
ALWAYS_INLINE GSVector2& operator+=(const GSVector2& v_)
{
m = _mm_add_ps(m, v_);
return *this;
}
ALWAYS_INLINE GSVector2& operator-=(const GSVector2& v_)
{
m = _mm_sub_ps(m, v_);
return *this;
}
ALWAYS_INLINE GSVector2& operator*=(const GSVector2& v_)
{
m = _mm_mul_ps(m, v_);
return *this;
}
ALWAYS_INLINE GSVector2& operator/=(const GSVector2& v_)
{
m = _mm_div_ps(m, v_);
return *this;
}
ALWAYS_INLINE void operator+=(float f) { *this += GSVector2(f); }
ALWAYS_INLINE void operator-=(float f) { *this -= GSVector2(f); }
ALWAYS_INLINE void operator*=(float f) { *this *= GSVector2(f); }
ALWAYS_INLINE void operator/=(float f) { *this /= GSVector2(f); }
ALWAYS_INLINE GSVector2& operator+=(float f)
{
*this += GSVector2(f);
return *this;
}
ALWAYS_INLINE GSVector2& operator-=(float f)
{
*this -= GSVector2(f);
return *this;
}
ALWAYS_INLINE GSVector2& operator*=(float f)
{
*this *= GSVector2(f);
return *this;
}
ALWAYS_INLINE GSVector2& operator/=(float f)
{
*this /= GSVector2(f);
return *this;
}
ALWAYS_INLINE void operator&=(const GSVector2& v_) { m = _mm_and_ps(m, v_); }
ALWAYS_INLINE void operator|=(const GSVector2& v_) { m = _mm_or_ps(m, v_); }
ALWAYS_INLINE void operator^=(const GSVector2& v_) { m = _mm_xor_ps(m, v_); }
ALWAYS_INLINE GSVector2& operator&=(const GSVector2& v_)
{
m = _mm_and_ps(m, v_);
return *this;
}
ALWAYS_INLINE GSVector2& operator|=(const GSVector2& v_)
{
m = _mm_or_ps(m, v_);
return *this;
}
ALWAYS_INLINE GSVector2& operator^=(const GSVector2& v_)
{
m = _mm_xor_ps(m, v_);
return *this;
}
ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v1, const GSVector2& v2)
{
@ -752,6 +768,8 @@ public:
return GSVector2(_mm_cmple_ps(v1, v2));
}
ALWAYS_INLINE static GSVector2 cast(const GSVector2i& v);
ALWAYS_INLINE GSVector2 xy() const { return *this; }
ALWAYS_INLINE GSVector2 xx() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 0, 0))); }
ALWAYS_INLINE GSVector2 yx() const { return GSVector2(_mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 0, 1))); }
@ -811,11 +829,9 @@ public:
{
return GSVector4i(cxpr_init, x, y, z, w);
}
ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); }
ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); }
ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
{
return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7);
@ -828,9 +844,7 @@ public:
}
ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w) { m = _mm_set_epi32(w, z, y, x); }
ALWAYS_INLINE GSVector4i(s32 x, s32 y) { *this = load(x).upl32(load(y)); }
ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
{
m = _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
@ -844,25 +858,27 @@ public:
ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) { m = v.m; }
// MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7),
// so leave the non-constexpr version default
ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; }
ALWAYS_INLINE explicit GSVector4i(const GSVector2& v);
ALWAYS_INLINE explicit GSVector4i(const GSVector4& v);
ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);
ALWAYS_INLINE constexpr explicit GSVector4i(__m128i m) : m(m) {}
ALWAYS_INLINE void operator=(s32 i) { m = _mm_set1_epi32(i); }
ALWAYS_INLINE void operator=(__m128i m_) { m = m_; }
ALWAYS_INLINE GSVector4i& operator=(s32 i)
{
m = _mm_set1_epi32(i);
return *this;
}
ALWAYS_INLINE GSVector4i& operator=(__m128i m_)
{
m = m_;
return *this;
}
ALWAYS_INLINE operator __m128i() const { return m; }
// rect
ALWAYS_INLINE s32 width() const { return right - left; }
ALWAYS_INLINE s32 height() const { return bottom - top; }
@ -882,8 +898,6 @@ public:
ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); }
ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); }
//
ALWAYS_INLINE u32 rgba32() const
{
GSVector4i v = *this;
@ -1237,99 +1251,34 @@ public:
#endif
ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const { return GSVector4i(_mm_add_epi8(m, v.m)); }
ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const { return GSVector4i(_mm_add_epi16(m, v.m)); }
ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return GSVector4i(_mm_add_epi32(m, v.m)); }
ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi8(m, v.m)); }
ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi16(m, v.m)); }
ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const { return GSVector4i(_mm_hadds_epi16(m, v.m)); }
ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu8(m, v.m)); }
ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu16(m, v.m)); }
ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi8(m, v.m)); }
ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi16(m, v.m)); }
ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi32(m, v.m)); }
ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const { return GSVector4i(_mm_subs_epi8(m, v.m)); }
ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epi16(m, v.m)); }
ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu8(m, v.m)); }
ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu16(m, v.m)); }
ALWAYS_INLINE GSVector4i avg8(const GSVector4i& v) const { return GSVector4i(_mm_avg_epu8(m, v.m)); }
ALWAYS_INLINE GSVector4i avg16(const GSVector4i& v) const { return GSVector4i(_mm_avg_epu16(m, v.m)); }
ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const { return GSVector4i(_mm_mulhi_epi16(m, v.m)); }
ALWAYS_INLINE GSVector4i mul16hu(const GSVector4i& v) const { return GSVector4i(_mm_mulhi_epu16(m, v.m)); }
ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi16(m, v.m)); }
ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const { return GSVector4i(_mm_mulhrs_epi16(m, v.m)); }
ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi32(m, v.m)); }
template<s32 shift>
ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const
{
// (a - this) * f << shift + this
return add16(a.sub16(*this).modulate16<shift>(f));
}
template<s32 shift>
ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c)
{
  // Scaled difference on 16-bit lanes: (a - b) * c << shift.
  const GSVector4i diff = a.sub16(b);
  return diff.modulate16<shift>(c);
}
template<s32 shift>
ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c,
                                       const GSVector4i& d)
{
  // Scaled difference plus bias on 16-bit lanes: ((a - b) * c << shift) + d.
  const GSVector4i scaled = a.sub16(b).modulate16<shift>(c);
  return d.add16(scaled);
}
ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a_, const GSVector4i& f) const
{
  // 4-bit fixed-point lerp: (((a - this) * f) >> 4) + this
  // (a and this hold 8-bit values widened to 16-bit lanes, f holds 4-bit weights).
  const GSVector4i diff = a_.sub16(*this);
  return add16(diff.mul16l(f).sra16<4>());
}
template<s32 shift>
ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const
{
  // Per-16-bit-lane modulate: a * f << shift, keeping the high half of the
  // widened signed product.
  // shift is a compile-time constant, so branch at compile time (matching the
  // if constexpr usage elsewhere in this file) instead of relying on the
  // optimizer to fold a runtime if.
  if constexpr (shift == 0)
  {
    // Rounding high multiply: (a * f + 0x4000) >> 15 per lane.
    return mul16hrs(f);
  }
  else
  {
    // Pre-shift so the wanted bits land in the high 16 bits of the product.
    return sll16<shift + 1>().mul16hs(f);
  }
}
// True when all 128 bits are equal: XOR yields zero iff the vectors match,
// which ptest checks without a register round-trip. (pxor, ptest, je)
// Fix: the diff residue left two declarations of 't' (old non-const + new
// const), which is a redefinition error; keep the const one.
ALWAYS_INLINE bool eq(const GSVector4i& v) const
{
  const GSVector4i t = *this ^ v;
  return _mm_testz_si128(t, t) != 0;
}
@ -1420,15 +1369,21 @@ public:
return _mm_extract_epi64(m, i);
}
// Non-temporal (cache-bypassing) 16-byte load; p must be 16-byte aligned.
// Fix: both the old one-liner and the new cast-style definition were present
// (diff residue) — a redefinition error; keep the static_cast version.
ALWAYS_INLINE static GSVector4i loadnt(const void* p)
{
  return GSVector4i(_mm_stream_load_si128(static_cast<const __m128i*>(p)));
}
// Loads 32 bits into lane x; the remaining lanes are zeroed.
ALWAYS_INLINE static GSVector4i load32(const void* p) { return GSVector4i(_mm_loadu_si32(p)); }
// Loads 64 bits into the low half; the high half is zeroed.
// Fix: old one-liner and new definition both present (diff residue); keep one.
ALWAYS_INLINE static GSVector4i loadl(const void* p)
{
  return GSVector4i(_mm_loadl_epi64(static_cast<const __m128i*>(p)));
}
// Loads 64 bits into the high half; the low half is zeroed.
// Fix: two stacked return statements (old C-cast + new static_cast, diff
// residue); keep the static_cast one.
ALWAYS_INLINE static GSVector4i loadh(const void* p)
{
  return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), static_cast<const __m64*>(p))));
}
ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v)
@ -1439,18 +1394,19 @@ public:
// 16-byte load; 'aligned' selects movdqa vs movdqu.
// Fix: old and new return statements both present (diff residue); keep the
// static_cast version.
template<bool aligned>
ALWAYS_INLINE static GSVector4i load(const void* p)
{
  return GSVector4i(aligned ? _mm_load_si128(static_cast<const __m128i*>(p)) :
                              _mm_loadu_si128(static_cast<const __m128i*>(p)));
}
// Loads a 32-bit scalar into lane x; remaining lanes are zeroed.
ALWAYS_INLINE static GSVector4i load(s32 i) { return GSVector4i(_mm_cvtsi32_si128(i)); }
// Loads a 64-bit scalar into the low half; the high half is zeroed.
ALWAYS_INLINE static GSVector4i loadq(s64 i) { return GSVector4i(_mm_cvtsi64_si128(i)); }
// Fix: the C-cast one-liners and the new static_cast versions of all three
// stores were both present (diff residue) — redefinitions; keep the new set.
// Non-temporal 16-byte store (bypasses the cache); p must be 16-byte aligned.
ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { _mm_stream_si128(static_cast<__m128i*>(p), v.m); }
// Stores the low 64 bits.
ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { _mm_storel_epi64(static_cast<__m128i*>(p), v.m); }
// Stores the high 64 bits.
ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
{
  _mm_storeh_pi(static_cast<__m64*>(p), _mm_castsi128_ps(v.m));
}
ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v)
{
@ -1462,20 +1418,30 @@ public:
// 16-byte store; 'aligned' selects movdqa vs movdqu.
// Fix: old C-cast and new static_cast store lines both present (diff residue);
// the `template<bool aligned>` header was also swallowed by the garbled span
// above — restore it, since the body reads `aligned`.
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
{
  if constexpr (aligned)
    _mm_store_si128(static_cast<__m128i*>(p), v.m);
  else
    _mm_storeu_si128(static_cast<__m128i*>(p), v.m);
}
// Stores the low 32 bits to p.
ALWAYS_INLINE static void store32(void* p, const GSVector4i& v) { _mm_storeu_si32(p, v); }
// Returns the low 32 bits as a scalar.
ALWAYS_INLINE static s32 store(const GSVector4i& v) { return _mm_cvtsi128_si32(v.m); }
// Returns the low 64 bits as a scalar.
ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return _mm_cvtsi128_si64(v.m); }
// Compound bitwise operators; return *this so they chain like the builtins.
// Fix: the old void-returning and new reference-returning overloads were both
// present (diff residue) — identical signatures, a redefinition error; keep
// the reference-returning set.
ALWAYS_INLINE GSVector4i& operator&=(const GSVector4i& v)
{
  m = _mm_and_si128(m, v);
  return *this;
}

ALWAYS_INLINE GSVector4i& operator|=(const GSVector4i& v)
{
  m = _mm_or_si128(m, v);
  return *this;
}

ALWAYS_INLINE GSVector4i& operator^=(const GSVector4i& v)
{
  m = _mm_xor_si128(m, v);
  return *this;
}
ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2)
{
@ -1493,14 +1459,12 @@ public:
}
// Scalar forms broadcast i to all 32-bit lanes first.
ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, s32 i) { return v & GSVector4i(i); }
ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, s32 i) { return v | GSVector4i(i); }
ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, s32 i) { return v ^ GSVector4i(i); }
// Bitwise NOT: XOR against an all-ones mask built by comparing v with itself.
ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return v ^ v.eq32(v); }
// All-zero vector.
ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(_mm_setzero_si128()); }
// Bitwise reinterpretation of a float vector (defined after GSVector4).
ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);
// this.xy in the low half, v.xy in the high half (low-64 interleave).
ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); }
@ -1555,6 +1519,8 @@ class alignas(16) GSVector4
// Tagged constexpr constructors backing the cxpr64() factories; they
// initialize the 64-bit union members directly.
constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}
constexpr GSVector4(cxpr_init_tag, double x, double y) : F64{x, y} {}
public:
union
{
@ -1586,35 +1552,29 @@ public:
GSVector4() = default;

// Compile-time constant factories; cxpr64 stores raw u64 bit patterns or f64
// values into the 128-bit register.
constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); }
constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); }
constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); }
constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); }
constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); }
constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }
constexpr static GSVector4 cxpr64(double x, double y) { return GSVector4(cxpr_init, x, y); }
constexpr static GSVector4 cxpr64(double x) { return GSVector4(cxpr_init, x, x); }

// Fix: the four-float constructor appeared twice (old position + new position
// from the diff residue) — a redefinition; keep a single copy.
ALWAYS_INLINE GSVector4(float x, float y, float z, float w) { m = _mm_set_ps(w, z, y, x); }
// Two floats into lanes x/y; z/w are zeroed.
ALWAYS_INLINE GSVector4(float x, float y) { m = _mm_unpacklo_ps(_mm_load_ss(&x), _mm_load_ss(&y)); }
// Converts four ints to floats (cvtdq2ps).
ALWAYS_INLINE GSVector4(int x, int y, int z, int w)
{
  GSVector4i v_(x, y, z, w);
  m = _mm_cvtepi32_ps(v_.m);
}
// Converts two ints to floats in lanes x/y; z/w are zeroed.
ALWAYS_INLINE GSVector4(int x, int y)
{
  m = _mm_cvtepi32_ps(_mm_unpacklo_epi32(_mm_cvtsi32_si128(x), _mm_cvtsi32_si128(y)));
}
// Widens a 2-float vector; the upper lanes are copied as-is from v.m.
ALWAYS_INLINE explicit GSVector4(const GSVector2& v) : m(v.m) {}
ALWAYS_INLINE explicit GSVector4(const GSVector2i& v)
: m(_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(_mm_cvtepi32_ps(v.m)), _mm_setzero_pd())))
{
@ -1637,24 +1597,20 @@ public:
// Converting constructor from an integer vector (defined out of line).
ALWAYS_INLINE explicit GSVector4(const GSVector4i& v);
// Bitwise reinterpretation of an integer vector (defined out of line).
ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
// Builds a vector holding two f64 values, bit-cast into the f32 register.
ALWAYS_INLINE static GSVector4 f64(double x, double y) { return GSVector4(_mm_castpd_ps(_mm_set_pd(y, x))); }
ALWAYS_INLINE static GSVector4 f64(double x) { return GSVector4(_mm_castpd_ps(_mm_set1_pd(x))); }
// Broadcast-assigns a scalar to all four lanes; returns *this for chaining.
// Fix: two conflicting signatures (old void-returning + new reference-
// returning, diff residue) were stacked on one body; keep the new one.
ALWAYS_INLINE GSVector4& operator=(float f)
{
#if CPU_ARCH_AVX2
  m = _mm_broadcastss_ps(_mm_load_ss(&f));
#else
  m = _mm_set1_ps(f);
#endif
  return *this;
}
// Assigns from a raw __m128; returns *this for chaining.
// Fix: old void one-liner and new definition both present (diff residue).
ALWAYS_INLINE GSVector4& operator=(__m128 m_)
{
  this->m = m_;
  return *this;
}
// Implicit conversion to the raw SSE register.
ALWAYS_INLINE operator __m128() const { return m; }
@ -1824,52 +1780,132 @@ public:
return _mm_extract_ps(m, i);
}
// Returns a copy with f64 lane 'dst' replaced by v (vector viewed as 2 x f64).
// Fix: _mm_load_pd(&v) performs a 16-byte, alignment-required load from the
// address of a lone double — it reads 8 bytes past the object and faults if
// the stack slot is not 16-byte aligned. The scalar _mm_load_sd loads exactly
// the one double and yields the same selected lane in both branches.
template<int dst>
ALWAYS_INLINE GSVector4 insert64(double v) const
{
  if constexpr (dst == 0)
    return GSVector4(_mm_move_sd(_mm_castps_pd(m), _mm_load_sd(&v)));
  else
    return GSVector4(_mm_shuffle_pd(_mm_castps_pd(m), _mm_load_sd(&v), 0));
}
// Extracts f64 lane 'src' (vector viewed as 2 x f64).
template<int src>
ALWAYS_INLINE double extract64() const
{
  const __m128d vd = _mm_castps_pd(m);
  if constexpr (src == 0)
    return _mm_cvtsd_f64(vd);
  else
    return _mm_cvtsd_f64(_mm_unpackhi_pd(vd, vd));
}
// All-zero vector.
ALWAYS_INLINE static GSVector4 zero() { return GSVector4(_mm_setzero_ps()); }
ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
// All bits set, via compare-equal of zero with itself.
ALWAYS_INLINE static GSVector4 xffffffff() { return zero() == zero(); }
// Loads 64 bits into the low half (xy); the high half is zeroed.
// Fix: old C-cast one-liner and new definition both present (diff residue).
ALWAYS_INLINE static GSVector4 loadl(const void* p)
{
  return GSVector4(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p))));
}
// Loads a scalar into lane x; remaining lanes are zeroed.
ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(_mm_load_ss(&f)); }
// 16-byte load; 'aligned' selects movaps vs movups.
// Fix: old and new return statements both present (diff residue); keep the
// static_cast version.
template<bool aligned>
ALWAYS_INLINE static GSVector4 load(const void* p)
{
  return GSVector4(aligned ? _mm_load_ps(static_cast<const float*>(p)) : _mm_loadu_ps(static_cast<const float*>(p)));
}
// Fix: the old C-cast one-liners and the new static_cast versions of all
// three stores were both present (diff residue) — redefinitions; keep the new set.
// Non-temporal 16-byte store (bypasses the cache); p must be 16-byte aligned.
ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { _mm_stream_ps(static_cast<float*>(p), v.m); }
// Stores the low 64 bits (xy).
ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
{
  _mm_store_sd(static_cast<double*>(p), _mm_castps_pd(v.m));
}
// Stores the high 64 bits (zw).
ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
{
  _mm_storeh_pd(static_cast<double*>(p), _mm_castps_pd(v.m));
}
// 16-byte store; 'aligned' selects movaps vs movups.
// Fix: old C-cast and new static_cast store lines both present (diff residue);
// keep the static_cast ones.
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector4& v)
{
  if constexpr (aligned)
    _mm_store_ps(static_cast<float*>(p), v.m);
  else
    _mm_storeu_ps(static_cast<float*>(p), v.m);
}
// Stores lane x to p.
ALWAYS_INLINE static void store(float* p, const GSVector4& v) { _mm_store_ss(p, v.m); }
// Unary negation; forwards to neg().
ALWAYS_INLINE GSVector4 operator-() const { return neg(); }
// Compound arithmetic and bitwise operators; all return *this so they chain
// like the builtins.
// Fix: the old void-returning overloads (+=, -=, *=, /= for vector and float,
// and &=, |=, ^=) were interleaved with the new reference-returning versions
// by the diff residue — identical signatures, redefinition errors; keep the
// reference-returning set only.
ALWAYS_INLINE GSVector4& operator+=(const GSVector4& v_)
{
  m = _mm_add_ps(m, v_);
  return *this;
}

ALWAYS_INLINE GSVector4& operator-=(const GSVector4& v_)
{
  m = _mm_sub_ps(m, v_);
  return *this;
}

ALWAYS_INLINE GSVector4& operator*=(const GSVector4& v_)
{
  m = _mm_mul_ps(m, v_);
  return *this;
}

ALWAYS_INLINE GSVector4& operator/=(const GSVector4& v_)
{
  m = _mm_div_ps(m, v_);
  return *this;
}

// Scalar forms broadcast f to all four lanes first.
ALWAYS_INLINE GSVector4& operator+=(float f)
{
  *this += GSVector4(f);
  return *this;
}

ALWAYS_INLINE GSVector4& operator-=(float f)
{
  *this -= GSVector4(f);
  return *this;
}

ALWAYS_INLINE GSVector4& operator*=(float f)
{
  *this *= GSVector4(f);
  return *this;
}

ALWAYS_INLINE GSVector4& operator/=(float f)
{
  *this /= GSVector4(f);
  return *this;
}

ALWAYS_INLINE GSVector4& operator&=(const GSVector4& v_)
{
  m = _mm_and_ps(m, v_);
  return *this;
}

ALWAYS_INLINE GSVector4& operator|=(const GSVector4& v_)
{
  m = _mm_or_ps(m, v_);
  return *this;
}

ALWAYS_INLINE GSVector4& operator^=(const GSVector4& v_)
{
  m = _mm_xor_ps(m, v_);
  return *this;
}
ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2)
{
@ -1959,6 +1995,59 @@ public:
return GSVector4(_mm_sub_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
}
// --- Double-precision operations: the register is reinterpreted as 2 x f64,
// --- operated on, and the result bit-cast back into the f32-typed wrapper.
ALWAYS_INLINE GSVector4 div64(const GSVector4& v_) const
{
  return GSVector4(_mm_div_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
}
// Comparisons produce a per-lane mask: all-ones when true, all-zeros when false.
ALWAYS_INLINE GSVector4 gt64(const GSVector4& v2) const
{
  return GSVector4(_mm_cmpgt_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
}
ALWAYS_INLINE GSVector4 eq64(const GSVector4& v2) const
{
  return GSVector4(_mm_cmpeq_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
}
ALWAYS_INLINE GSVector4 lt64(const GSVector4& v2) const
{
  return GSVector4(_mm_cmplt_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
}
ALWAYS_INLINE GSVector4 ge64(const GSVector4& v2) const
{
  return GSVector4(_mm_cmpge_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
}
ALWAYS_INLINE GSVector4 le64(const GSVector4& v2) const
{
  return GSVector4(_mm_cmple_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
}
ALWAYS_INLINE GSVector4 min64(const GSVector4& v2) const
{
  return GSVector4(_mm_min_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
}
ALWAYS_INLINE GSVector4 max64(const GSVector4& v2) const
{
  return GSVector4(_mm_max_pd(_mm_castps_pd(m), _mm_castps_pd(v2.m)));
}
// abs/neg work on the sign bit directly: clear it (AND) or flip it (XOR).
ALWAYS_INLINE GSVector4 abs64() const { return *this & GSVector4::cxpr64(static_cast<u64>(0x7FFFFFFFFFFFFFFFULL)); }
ALWAYS_INLINE GSVector4 neg64() const { return *this ^ GSVector4::cxpr64(static_cast<u64>(0x8000000000000000ULL)); }
ALWAYS_INLINE GSVector4 sqrt64() const { return GSVector4(_mm_sqrt_pd(_mm_castps_pd(m))); }
ALWAYS_INLINE GSVector4 sqr64() const { return GSVector4(_mm_mul_pd(_mm_castps_pd(m), _mm_castps_pd(m))); }
// Round both lanes toward negative infinity, without raising precision exceptions.
ALWAYS_INLINE GSVector4 floor64() const
{
  return GSVector4(_mm_round_pd(_mm_castps_pd(m), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
}
// Widens the two low f32 lanes of v_ to f64.
ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_) { return GSVector4(_mm_cvtps_pd(v_.m)); }
ALWAYS_INLINE static GSVector4 f32to64(const void* p)
@ -2007,7 +2096,7 @@ public:
// Broadcasts the float at f to all four lanes (AVX2 vbroadcastss path).
// Fix: old C-cast and new static_cast return statements both present (diff
// residue); keep the static_cast version.
ALWAYS_INLINE static GSVector4 broadcast32(const void* f)
{
  return GSVector4(_mm_broadcastss_ps(_mm_load_ss(static_cast<const float*>(f))));
}
#endif