diff --git a/CMakeModules/DuckStationUtils.cmake b/CMakeModules/DuckStationUtils.cmake
index b2dd91c6a..12d7a52ed 100644
--- a/CMakeModules/DuckStationUtils.cmake
+++ b/CMakeModules/DuckStationUtils.cmake
@@ -57,6 +57,8 @@ function(detect_architecture)
if("x86_64" IN_LIST CMAKE_OSX_ARCHITECTURES)
message(STATUS "Building x86_64 MacOS binaries.")
set(CPU_ARCH_X64 TRUE PARENT_SCOPE)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Xarch_x86_64 -msse4.1" PARENT_SCOPE)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xarch_x86_64 -msse4.1" PARENT_SCOPE)
endif()
if("arm64" IN_LIST CMAKE_OSX_ARCHITECTURES)
message(STATUS "Building ARM64 MacOS binaries.")
@@ -67,6 +69,8 @@ function(detect_architecture)
CMAKE_SIZEOF_VOID_P EQUAL 8)
message(STATUS "Building x86_64 binaries.")
set(CPU_ARCH_X64 TRUE PARENT_SCOPE)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1" PARENT_SCOPE)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1" PARENT_SCOPE)
elseif(("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64" OR "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm64") AND
CMAKE_SIZEOF_VOID_P EQUAL 8) # Might have an A64 kernel, e.g. Raspbian.
message(STATUS "Building ARM64 binaries.")
diff --git a/dep/msvc/vsprops/Base.props b/dep/msvc/vsprops/Base.props
index 9d485d4e4..81efe34c4 100644
--- a/dep/msvc/vsprops/Base.props
+++ b/dep/msvc/vsprops/Base.props
@@ -30,6 +30,7 @@
_HAS_EXCEPTIONS=0;_CRT_INTERNAL_NONSTDC_NAMES;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_WARNINGS;WIN32;%(PreprocessorDefinitions)
%(AdditionalIncludeDirectories);$(DepsIncludeDir)
/Zc:__cplusplus /Zo /utf-8 %(AdditionalOptions)
+ -msse4.1 %(AdditionalOptions)
-flto=thin %(AdditionalOptions)
false
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index bc532cf6c..c0c8566d5 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -21,6 +21,11 @@ add_library(common
fifo_queue.h
file_system.cpp
file_system.h
+ gsvector.h
+ gsvector_formatter.h
+ gsvector_neon.h
+ gsvector_nosimd.h
+ gsvector_sse.h
intrin.h
hash_combine.h
heap_array.h
diff --git a/src/common/common.vcxproj b/src/common/common.vcxproj
index 9d0feb068..aafeebe45 100644
--- a/src/common/common.vcxproj
+++ b/src/common/common.vcxproj
@@ -16,6 +16,11 @@
+    <ClInclude Include="gsvector.h" />
+    <ClInclude Include="gsvector_formatter.h" />
+    <ClInclude Include="gsvector_neon.h" />
+    <ClInclude Include="gsvector_nosimd.h" />
+    <ClInclude Include="gsvector_sse.h" />
@@ -70,6 +75,7 @@
+    <Natvis Include="gsvector.natvis" />
diff --git a/src/common/common.vcxproj.filters b/src/common/common.vcxproj.filters
index a36015e9f..2e280f9de 100644
--- a/src/common/common.vcxproj.filters
+++ b/src/common/common.vcxproj.filters
@@ -46,6 +46,11 @@
+    <ClInclude Include="gsvector.h" />
+    <ClInclude Include="gsvector_formatter.h" />
+    <ClInclude Include="gsvector_neon.h" />
+    <ClInclude Include="gsvector_nosimd.h" />
+    <ClInclude Include="gsvector_sse.h" />
@@ -80,6 +85,7 @@
       <Filter>thirdparty</Filter>
+    <Natvis Include="gsvector.natvis" />
diff --git a/src/common/gsvector.h b/src/common/gsvector.h
new file mode 100644
index 000000000..8dda3b369
--- /dev/null
+++ b/src/common/gsvector.h
@@ -0,0 +1,65 @@
+// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#pragma once
+
+#include "common/intrin.h"
+
+#include <cstring>
+
+enum Align_Mode
+{
+ Align_Outside,
+ Align_Inside,
+ Align_NegInf,
+ Align_PosInf
+};
+
+enum Round_Mode
+{
+ Round_NearestInt = 8,
+ Round_NegInf = 9,
+ Round_PosInf = 10,
+ Round_Truncate = 11
+};
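+
+// Note: these values match the SSE4.1 ROUNDPS immediates (_MM_FROUND_NO_EXC | rounding mode),
+// which lets the SSE backend pass them through to _mm_round_ps() unchanged.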
+
+template<class T>
+class GSVector2T
+{
+public:
+ union
+ {
+ struct
+ {
+ T x, y;
+ };
+ struct
+ {
+ T r, g;
+ };
+ struct
+ {
+ T v[2];
+ };
+ };
+
+ GSVector2T() = default;
+
+ ALWAYS_INLINE constexpr GSVector2T(T x) : x(x), y(x) {}
+ ALWAYS_INLINE constexpr GSVector2T(T x, T y) : x(x), y(y) {}
+ ALWAYS_INLINE bool operator==(const GSVector2T& v) const { return std::memcmp(this, &v, sizeof(*this)) == 0; }
+ ALWAYS_INLINE bool operator!=(const GSVector2T& v) const { return std::memcmp(this, &v, sizeof(*this)) != 0; }
+ ALWAYS_INLINE constexpr GSVector2T operator*(const GSVector2T& v) const { return {x * v.x, y * v.y}; }
+ ALWAYS_INLINE constexpr GSVector2T operator/(const GSVector2T& v) const { return {x / v.x, y / v.y}; }
+};
+
+using GSVector2 = GSVector2T<float>;
+using GSVector2i = GSVector2T<s32>;
+
+#if defined(CPU_ARCH_SSE)
+#include "common/gsvector_sse.h"
+#elif defined(CPU_ARCH_NEON)
+#include "common/gsvector_neon.h"
+#else
+#include "common/gsvector_nosimd.h"
+#endif
diff --git a/src/common/gsvector.natvis b/src/common/gsvector.natvis
new file mode 100644
index 000000000..077d748e0
--- /dev/null
+++ b/src/common/gsvector.natvis
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="utf-8"?>
+<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
+  <Type Name="GSVector2T&lt;*&gt;">
+    <DisplayString>{{ {x}, {y} }}</DisplayString>
+  </Type>
+
+  <Type Name="GSVector4i">
+    <DisplayString>{{ {I32[0]}, {I32[1]}, {I32[2]}, {I32[3]} }}</DisplayString>
+  </Type>
+</AutoVisualizer>
\ No newline at end of file
diff --git a/src/common/gsvector_formatter.h b/src/common/gsvector_formatter.h
new file mode 100644
index 000000000..5ed7c1665
--- /dev/null
+++ b/src/common/gsvector_formatter.h
@@ -0,0 +1,21 @@
+// SPDX-FileCopyrightText: 2024 Connor McLaughlin
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#pragma once
+
+#include "gsvector.h"
+#include "small_string.h"
+
+#include "fmt/format.h"
+
+template<>
+struct fmt::formatter<GSVector4i> : fmt::formatter<fmt::string_view>
+{
+ auto format(const GSVector4i& rc, format_context& ctx) const
+ {
+ const TinyString str =
+ TinyString::from_format("{},{} => {},{} ({}x{})", rc.left, rc.top, rc.right, rc.bottom, rc.width(), rc.height());
+
+ return fmt::formatter<fmt::string_view>::format(str.view(), ctx);
+ }
+};
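+
+// Usage sketch: any fmt-based formatting can now print rects directly, e.g.
+//   fmt::format("{}", GSVector4i(0, 0, 320, 240)); // "0,0 => 320,240 (320x240)"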
diff --git a/src/common/gsvector_neon.h b/src/common/gsvector_neon.h
new file mode 100644
index 000000000..1362b22aa
--- /dev/null
+++ b/src/common/gsvector_neon.h
@@ -0,0 +1,1712 @@
+// SPDX-FileCopyrightText: 2021-2024 Connor McLaughlin
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#define GSVECTOR_HAS_UNSIGNED 1
+#define GSVECTOR_HAS_SRLV 1
+
+class GSVector4;
+
+class alignas(16) GSVector4i
+{
+ struct cxpr_init_tag
+ {
+ };
+ static constexpr cxpr_init_tag cxpr_init{};
+
+ constexpr GSVector4i(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {}
+
+ constexpr GSVector4i(cxpr_init_tag, short s0, short s1, short s2, short s3, short s4, short s5, short s6, short s7)
+ : I16{s0, s1, s2, s3, s4, s5, s6, s7}
+ {
+ }
+
+ constexpr GSVector4i(cxpr_init_tag, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8,
+ char b9, char b10, char b11, char b12, char b13, char b14, char b15)
+#if !defined(__APPLE__) && !defined(_MSC_VER)
+ : U8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
+#else
+ : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
+#endif
+ {
+ }
+
+public:
+ union
+ {
+ struct
+ {
+ int x, y, z, w;
+ };
+ struct
+ {
+ int r, g, b, a;
+ };
+ struct
+ {
+ int left, top, right, bottom;
+ };
+ float F32[4];
+ s8 I8[16];
+ s16 I16[8];
+ s32 I32[4];
+ s64 I64[2];
+ u8 U8[16];
+ u16 U16[8];
+ u32 U32[4];
+ u64 U64[2];
+ int32x4_t v4s;
+ };
+
+ GSVector4i() = default;
+
+ ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w)
+ {
+ return GSVector4i(cxpr_init, x, y, z, w);
+ }
+
+ ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); }
+
+ ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); }
+
+ ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
+ {
+ return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7);
+ }
+
+ ALWAYS_INLINE constexpr static GSVector4i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9,
+ s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
+ {
+ return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);
+ }
+
+ ALWAYS_INLINE GSVector4i(int x, int y, int z, int w)
+ {
+ GSVector4i xz = load(x).upl32(load(z));
+ GSVector4i yw = load(y).upl32(load(w));
+
+ *this = xz.upl32(yw);
+ }
+
+ ALWAYS_INLINE GSVector4i(int x, int y) { *this = load(x).upl32(load(y)); }
+
+ ALWAYS_INLINE GSVector4i(short s0, short s1, short s2, short s3, short s4, short s5, short s6, short s7)
+ : I16{s0, s1, s2, s3, s4, s5, s6, s7}
+ {
+ }
+
+ constexpr GSVector4i(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9,
+ char b10, char b11, char b12, char b13, char b14, char b15)
+#if !defined(__APPLE__) && !defined(_MSC_VER)
+ : U8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
+#else
+ : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
+#endif
+ {
+ }
+
+ ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) { v4s = vcombine_s32(vld1_s32(v.v), vcreate_s32(0)); }
+
+ // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7),
+ // so leave the non-constexpr version default
+ ALWAYS_INLINE explicit GSVector4i(int i) { *this = i; }
+
+ ALWAYS_INLINE constexpr explicit GSVector4i(int32x4_t m) : v4s(m) {}
+
+ ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true);
+
+ ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);
+
+ ALWAYS_INLINE void operator=(int i) { v4s = vdupq_n_s32(i); }
+
+ ALWAYS_INLINE operator int32x4_t() const { return v4s; }
+
+ // rect
+
+ ALWAYS_INLINE int width() const { return right - left; }
+
+ ALWAYS_INLINE int height() const { return bottom - top; }
+
+ ALWAYS_INLINE GSVector4i rsize() const
+ {
+ return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height());
+ }
+
+ ALWAYS_INLINE s32 rarea() const { return width() * height(); }
+
+ ALWAYS_INLINE bool rempty() const { return (vminv_u32(vreinterpret_u32_s32(vget_low_s32(lt32(zwzw())))) == 0); }
+
+ ALWAYS_INLINE GSVector4i runion(const GSVector4i& a) const { return min_i32(a).upl64(max_i32(a).srl<8>()); }
+
+ ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& a) const { return sat_i32(a); }
+ ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); }
+ ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); }
+
+ template<Align_Mode mode>
+ GSVector4i _ralign_helper(const GSVector4i& mask) const
+ {
+ GSVector4i v;
+
+ switch (mode)
+ {
+ case Align_Inside:
+ v = add32(mask);
+ break;
+ case Align_Outside:
+ v = add32(mask.zwxy());
+ break;
+ case Align_NegInf:
+ v = *this;
+ break;
+ case Align_PosInf:
+ v = add32(mask.xyxy());
+ break;
+
+ default:
+ UnreachableCode();
+ break;
+ }
+
+ return v.andnot(mask.xyxy());
+ }
+
+ /// Align the rect using mask values that already have one subtracted ((1 << n) - 1 aligns to 1 << n)
+ template<Align_Mode mode>
+ GSVector4i ralign_presub(const GSVector2i& a) const
+ {
+ return _ralign_helper<mode>(GSVector4i(a));
+ }
+
+ template<Align_Mode mode>
+ GSVector4i ralign(const GSVector2i& a) const
+ {
+ // a must be 1 << n
+
+ return _ralign_helper<mode>(GSVector4i(a) - GSVector4i(1, 1));
+ }
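+
+ // Worked example: with *this = (3, 5, 13, 17) and a = (8, 8), the mask is (7, 7), so
+ // ralign<Align_Outside>(a) => (0, 0, 16, 24) (grow to alignment) and
+ // ralign<Align_Inside>(a) => (8, 8, 8, 16) (shrink to alignment).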
+
+ //
+
+ ALWAYS_INLINE u32 rgba32() const
+ {
+ GSVector4i v = *this;
+
+ v = v.ps32(v);
+ v = v.pu16(v);
+
+ return (u32)store(v);
+ }
+
+ ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& min, const GSVector4i& max) const
+ {
+ return max_i8(min).min_i8(max);
+ }
+ ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& minmax) const
+ {
+ return max_i8(minmax.xyxy()).min_i8(minmax.zwzw());
+ }
+ ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& min, const GSVector4i& max) const
+ {
+ return max_i16(min).min_i16(max);
+ }
+ ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& minmax) const
+ {
+ return max_i16(minmax.xyxy()).min_i16(minmax.zwzw());
+ }
+ ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& min, const GSVector4i& max) const
+ {
+ return max_i32(min).min_i32(max);
+ }
+ ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& minmax) const
+ {
+ return max_i32(minmax.xyxy()).min_i32(minmax.zwzw());
+ }
+
+ ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const
+ {
+ return max_u8(min).min_u8(max);
+ }
+ ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const
+ {
+ return max_u8(minmax.xyxy()).min_u8(minmax.zwzw());
+ }
+ ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const
+ {
+ return max_u16(min).min_u16(max);
+ }
+ ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const
+ {
+ return max_u16(minmax.xyxy()).min_u16(minmax.zwzw());
+ }
+ ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const
+ {
+ return max_u32(min).min_u32(max);
+ }
+ ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const
+ {
+ return max_u32(minmax.xyxy()).min_u32(minmax.zwzw());
+ }
+
+ ALWAYS_INLINE GSVector4i min_i8(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s8(vminq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i max_i8(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s8(vmaxq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i min_i16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s16(vminq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i max_i16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s16(vmaxq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i min_i32(const GSVector4i& v) const { return GSVector4i(vminq_s32(v4s, v.v4s)); }
+
+ ALWAYS_INLINE GSVector4i max_i32(const GSVector4i& v) const { return GSVector4i(vmaxq_s32(v4s, v.v4s)); }
+
+ ALWAYS_INLINE GSVector4i min_u8(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u8(vminq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i max_u8(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u8(vmaxq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i min_u16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u16(vminq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i max_u16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u16(vmaxq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i min_u32(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u32(vminq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i max_u32(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u32(vmaxq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const
+ {
+ int32x4_t acc =
+ vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
+ acc = vmlal_high_s16(acc, vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s));
+ return GSVector4i(acc);
+ }
+
+ ALWAYS_INLINE GSVector4i addp_s32() const { return GSVector4i(vpaddq_s32(v4s, v4s)); }
+
+ ALWAYS_INLINE s32 minv_s32() const { return vminvq_s32(v4s); }
+
+ ALWAYS_INLINE u32 minv_u32() const { return vminvq_u32(vreinterpretq_u32_s32(v4s)); }
+
+ ALWAYS_INLINE s32 maxv_s32() const { return vmaxvq_s32(v4s); }
+
+ ALWAYS_INLINE u32 maxv_u32() const { return vmaxvq_u32(vreinterpretq_u32_s32(v4s)); }
+
+ ALWAYS_INLINE static int min_i16(int a, int b) { return store(load(a).min_i16(load(b))); }
+
+ ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); }
+
+ ALWAYS_INLINE GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const
+ {
+ uint8x16_t mask2 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_s32(mask.v4s), 7));
+ return GSVector4i(vreinterpretq_s32_u8(vbslq_u8(mask2, vreinterpretq_u8_s32(a.v4s), vreinterpretq_u8_s32(v4s))));
+ }
+
+ template<int mask>
+ ALWAYS_INLINE GSVector4i blend16(const GSVector4i& a) const
+ {
+ static constexpr const uint16_t _mask[8] = {
+ ((mask) & (1 << 0)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 1)) ? (uint16_t)-1 : 0x0,
+ ((mask) & (1 << 2)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 3)) ? (uint16_t)-1 : 0x0,
+ ((mask) & (1 << 4)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 5)) ? (uint16_t)-1 : 0x0,
+ ((mask) & (1 << 6)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 7)) ? (uint16_t)-1 : 0x0};
+ return GSVector4i(
+ vreinterpretq_s32_u16(vbslq_u16(vld1q_u16(_mask), vreinterpretq_u16_s32(a.v4s), vreinterpretq_u16_s32(v4s))));
+ }
+
+ template<int mask>
+ ALWAYS_INLINE GSVector4i blend32(const GSVector4i& v) const
+ {
+ constexpr int bit3 = ((mask & 8) * 3) << 3;
+ constexpr int bit2 = ((mask & 4) * 3) << 2;
+ constexpr int bit1 = ((mask & 2) * 3) << 1;
+ constexpr int bit0 = (mask & 1) * 3;
+ return blend16<bit3 | bit2 | bit1 | bit0>(v);
+ }
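+
+ // e.g. blend32<0b0101>(v) expands the 4-bit lane mask to blend16<0x33>, taking 32-bit
+ // lanes 0 and 2 from v and lanes 1 and 3 from *this.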
+
+ ALWAYS_INLINE GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const
+ {
+ return GSVector4i(
+ vreinterpretq_s32_s8(vorrq_s8(vbicq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(mask.v4s)),
+ vandq_s8(vreinterpretq_s8_s32(mask.v4s), vreinterpretq_s8_s32(v.v4s)))));
+ }
+
+ ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); }
+
+ ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const
+ {
+ return GSVector4i(vreinterpretq_s32_s8(vqtbl1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_u8_s32(mask.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s8(
+ vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v.v4s)))));
+ }
+
+ ALWAYS_INLINE GSVector4i ps16() const
+ {
+ return GSVector4i(vreinterpretq_s32_s8(
+ vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v4s)))));
+ }
+
+ ALWAYS_INLINE GSVector4i pu16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u8(
+ vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v.v4s)))));
+ }
+
+ ALWAYS_INLINE GSVector4i pu16() const
+ {
+ return GSVector4i(vreinterpretq_s32_u8(
+ vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v4s)))));
+ }
+
+ ALWAYS_INLINE GSVector4i ps32(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i ps32() const
+ {
+ return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i pu32(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i pu32() const
+ {
+ return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(vzip1q_s32(v4s, v.v4s)); }
+
+ ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(vzip2q_s32(v4s, v.v4s)); }
+
+ ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s64(
+ vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s)))));
+ }
+
+ ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s64(
+ vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s)))));
+ }
+
+ ALWAYS_INLINE GSVector4i upl8() const
+ {
+ return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0))));
+ }
+
+ ALWAYS_INLINE GSVector4i uph8() const
+ {
+ return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0))));
+ }
+
+ ALWAYS_INLINE GSVector4i upl16() const
+ {
+ return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0))));
+ }
+
+ ALWAYS_INLINE GSVector4i uph16() const
+ {
+ return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0))));
+ }
+
+ ALWAYS_INLINE GSVector4i upl32() const { return GSVector4i(vzip1q_s32(v4s, vdupq_n_s32(0))); }
+
+ ALWAYS_INLINE GSVector4i uph32() const { return GSVector4i(vzip2q_s32(v4s, vdupq_n_s32(0))); }
+
+ ALWAYS_INLINE GSVector4i upl64() const
+ {
+ return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
+ }
+
+ ALWAYS_INLINE GSVector4i uph64() const
+ {
+ return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
+ }
+
+ ALWAYS_INLINE GSVector4i i8to16() const
+ {
+ return GSVector4i(vreinterpretq_s32_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))));
+ }
+
+ ALWAYS_INLINE GSVector4i u8to16() const
+ {
+ return GSVector4i(vreinterpretq_s32_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))));
+ }
+
+ ALWAYS_INLINE GSVector4i i8to32() const
+ {
+ return GSVector4i(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s))))));
+ }
+
+ ALWAYS_INLINE GSVector4i u8to32() const
+ {
+ return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))))));
+ }
+
+ ALWAYS_INLINE GSVector4i i8to64() const
+ {
+ return GSVector4i(vreinterpretq_s32_s64(
+ vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))))))));
+ }
+
+ ALWAYS_INLINE GSVector4i u8to64() const
+ {
+ return GSVector4i(vreinterpretq_s32_u64(
+ vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)))))))));
+ }
+
+ ALWAYS_INLINE GSVector4i i16to32() const { return GSVector4i(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)))); }
+
+ ALWAYS_INLINE GSVector4i u16to32() const
+ {
+ return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)))));
+ }
+
+ ALWAYS_INLINE GSVector4i i16to64() const
+ {
+ return GSVector4i(
+ vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)))))));
+ }
+
+ ALWAYS_INLINE GSVector4i u16to64() const
+ {
+ return GSVector4i(
+ vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)))))));
+ }
+
+ ALWAYS_INLINE GSVector4i i32to64() const { return GSVector4i(vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(v4s)))); }
+
+ ALWAYS_INLINE GSVector4i u32to64() const
+ {
+ return GSVector4i(vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)))));
+ }
+
+ template<int i>
+ ALWAYS_INLINE GSVector4i srl() const
+ {
+ return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0), i)));
+ }
+
+ template<int i>
+ ALWAYS_INLINE GSVector4i srl(const GSVector4i& v)
+ {
+ if constexpr (i >= 16)
+ return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v.v4s), vdupq_n_u8(0), i - 16)));
+ else
+ return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s), i)));
+ }
+
+ template<int i>
+ ALWAYS_INLINE GSVector4i sll() const
+ {
+ return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_s32(v4s), 16 - i)));
+ }
+
+ template<int i>
+ ALWAYS_INLINE GSVector4i sll16() const
+ {
+ return GSVector4i(vreinterpretq_s32_s16(vshlq_n_s16(vreinterpretq_s16_s32(v4s), i)));
+ }
+
+ ALWAYS_INLINE GSVector4i sll16(s32 i) const
+ {
+ return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(i))));
+ }
+
+ ALWAYS_INLINE GSVector4i sllv16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
+ }
+
+ template<int i>
+ ALWAYS_INLINE GSVector4i srl16() const
+ {
+ return GSVector4i(vreinterpretq_s32_u16(vshrq_n_u16(vreinterpretq_u16_s32(v4s), i)));
+ }
+
+ ALWAYS_INLINE GSVector4i srl16(s32 i) const
+ {
+ return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vdupq_n_s16(-i))));
+ }
+
+ ALWAYS_INLINE GSVector4i srlv16(const GSVector4i& v) const
+ {
+ return GSVector4i(
+ vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
+ }
+
+ template<int i>
+ ALWAYS_INLINE GSVector4i sra16() const
+ {
+ constexpr int count = (i & ~15) ? 15 : i;
+ return GSVector4i(vreinterpretq_s32_s16(vshrq_n_s16(vreinterpretq_s16_s32(v4s), count)));
+ }
+
+ ALWAYS_INLINE GSVector4i sra16(s32 i) const
+ {
+ return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(-i))));
+ }
+
+ ALWAYS_INLINE GSVector4i srav16(const GSVector4i& v) const
+ {
+ return GSVector4i(
+ vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s)))));
+ }
+
+ template<int i>
+ ALWAYS_INLINE GSVector4i sll32() const
+ {
+ return GSVector4i(vshlq_n_s32(v4s, i));
+ }
+
+ ALWAYS_INLINE GSVector4i sll32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(i))); }
+
+ ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const { return GSVector4i(vshlq_s32(v4s, v.v4s)); }
+
+ template<int i>
+ ALWAYS_INLINE GSVector4i srl32() const
+ {
+ return GSVector4i(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(v4s), i)));
+ }
+
+ ALWAYS_INLINE GSVector4i srl32(s32 i) const
+ {
+ return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vdupq_n_s32(-i))));
+ }
+
+ ALWAYS_INLINE GSVector4i srlv32(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vnegq_s32(v.v4s))));
+ }
+
+ template<int i>
+ ALWAYS_INLINE GSVector4i sra32() const
+ {
+ return GSVector4i(vshrq_n_s32(v4s, i));
+ }
+
+ ALWAYS_INLINE GSVector4i sra32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(-i))); }
+
+ ALWAYS_INLINE GSVector4i srav32(const GSVector4i& v) const
+ {
+ return GSVector4i(vshlq_s32(v4s, vnegq_s32(v.v4s)));
+ }
+
+ template<int i>
+ ALWAYS_INLINE GSVector4i sll64() const
+ {
+ return GSVector4i(vreinterpretq_s32_s64(vshlq_n_s64(vreinterpretq_s64_s32(v4s), i)));
+ }
+
+ ALWAYS_INLINE GSVector4i sll64(s32 i) const
+ {
+ return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s64(i))));
+ }
+
+ ALWAYS_INLINE GSVector4i sllv64(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
+ }
+
+ template<int i>
+ ALWAYS_INLINE GSVector4i sra64() const
+ {
+ return GSVector4i(vreinterpretq_s32_s64(vshrq_n_s64(vreinterpretq_s64_s32(v4s), i)));
+ }
+
+ ALWAYS_INLINE GSVector4i sra64(s32 i) const
+ {
+ return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s64(-i))));
+ }
+
+ ALWAYS_INLINE GSVector4i srav64(const GSVector4i& v) const
+ {
+ return GSVector4i(
+ vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s)))));
+ }
+
+ template<int i>
+ ALWAYS_INLINE GSVector4i srl64() const
+ {
+ return GSVector4i(vreinterpretq_s32_u64(vshrq_n_u64(vreinterpretq_u64_s32(v4s), i)));
+ }
+
+ ALWAYS_INLINE GSVector4i srl64(s32 i) const
+ {
+ return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vdupq_n_s64(-i))));
+ }
+
+ ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const
+ {
+ return GSVector4i(
+ vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s)))));
+ }
+
+ ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s8(vaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s16(vaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return GSVector4i(vaddq_s32(v4s, v.v4s)); }
+
+ ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s8(vqaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const
+ {
+ // can't use vpaddq_s16() here, because we need saturation.
+ // return GSVector4i(vreinterpretq_s32_s16(vpaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
+ const int16x8_t a = vreinterpretq_s16_s32(v4s);
+ const int16x8_t b = vreinterpretq_s16_s32(v.v4s);
+ return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))));
+ }
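+
+ // e.g. a lane pair of (0x7fff, 0x7fff) saturates to 0x7fff here, where a plain
+ // vpaddq_s16 would wrap around to -2.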
+
+ ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u8(vqaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u16(vqaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s8(vsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s16(vsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(vsubq_s32(v4s, v.v4s)); }
+
+ ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s8(vqsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s16(vqsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u8(vqsubq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u16(vqsubq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i avg8(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u8(vrhaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i avg16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u16(vrhaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const
+ {
+ // from sse2neon
+ int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_s32(v4s));
+ int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_s32(v.v4s));
+ int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
+ int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_s32(v4s));
+ int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_s32(v.v4s));
+ int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
+ uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
+ return GSVector4i(vreinterpretq_s32_u16(r.val[1]));
+ }
+
+ ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_s16(vmulq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const
+ {
+ int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
+ int32x4_t mul_hi =
+ vmull_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
+ int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
+ int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
+ return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(narrow_lo, narrow_hi)));
+ }
+
+ ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(vmulq_s32(v4s, v.v4s)); }
+
+ template<int shift>
+ ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const
+ {
+ // (a - this) * f << shift + this
+
+ return add16(a.sub16(*this).modulate16<shift>(f));
+ }
+
+ template<int shift>
+ ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c)
+ {
+ // (a - b) * c << shift
+
+ return a.sub16(b).modulate16<shift>(c);
+ }
+
+ template<int shift>
+ ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c,
+ const GSVector4i& d)
+ {
+ // (a - b) * c << shift + d
+
+ return d.add16(a.sub16(b).modulate16<shift>(c));
+ }
+
+ ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a, const GSVector4i& f) const
+ {
+ // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit)
+
+ return add16(a.sub16(*this).mul16l(f).sra16<4>());
+ }
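+
+ // e.g. with *this = 0, a = 255 and f = 8 (i.e. 8/16): 0 + ((255 * 8) >> 4) = 127.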
+
+ template<int shift>
+ ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const
+ {
+ // a * f << shift
+ if (shift == 0)
+ {
+ return mul16hrs(f);
+ }
+
+ return sll16<shift>().mul16hs(f);
+ }
+
+ ALWAYS_INLINE bool eq(const GSVector4i& v) const
+ {
+ return (vmaxvq_u32(vreinterpretq_u32_s32(veorq_s32(v4s, v.v4s))) == 0);
+ }
+
+ ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u8(vceqq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i eq16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u16(vceqq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i eq32(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u32(vceqq_s32(v4s, v.v4s)));
+ }
+
+ ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u64(vceqq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); }
+
+ ALWAYS_INLINE GSVector4i neq16(const GSVector4i& v) const { return ~eq16(v); }
+
+ ALWAYS_INLINE GSVector4i neq32(const GSVector4i& v) const { return ~eq32(v); }
+
+ ALWAYS_INLINE GSVector4i gt8(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u8(vcgtq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i gt16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u16(vcgtq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i gt32(const GSVector4i& v) const { return GSVector4i(vreinterpretq_s32_u32(vcgtq_s32(v4s, v.v4s))); }
+
+ ALWAYS_INLINE GSVector4i ge8(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u8(vcgeq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
+ }
+ ALWAYS_INLINE GSVector4i ge16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u16(vcgeq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
+ }
+ ALWAYS_INLINE GSVector4i ge32(const GSVector4i& v) const { return GSVector4i(vreinterpretq_s32_u32(vcgeq_s32(v4s, v.v4s))); }
+
+ ALWAYS_INLINE GSVector4i lt8(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u8(vcltq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i lt16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u16(vcltq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4i lt32(const GSVector4i& v) const { return GSVector4i(vreinterpretq_s32_u32(vcltq_s32(v4s, v.v4s))); }
+
+ ALWAYS_INLINE GSVector4i le8(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u8(vcleq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
+ }
+ ALWAYS_INLINE GSVector4i le16(const GSVector4i& v) const
+ {
+ return GSVector4i(vreinterpretq_s32_u16(vcleq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
+ }
+ ALWAYS_INLINE GSVector4i le32(const GSVector4i& v) const { return GSVector4i(vreinterpretq_s32_u32(vcleq_s32(v4s, v.v4s))); }
+
+ ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { return GSVector4i(vbicq_s32(v4s, v.v4s)); }
+
+ ALWAYS_INLINE int mask() const
+ {
+ // borrowed from sse2neon
+ const uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_s32(v4s), 7));
+ const uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+ const uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+ const uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+ return static_cast<int>(vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8));
+ }
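+
+ // Equivalent to SSE2's _mm_movemask_epi8: e.g. if only bytes 0 and 15 have their MSB set,
+ // mask() returns 0x8001.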
+
+ ALWAYS_INLINE bool alltrue() const
+ {
+ // MSB should be set in all 8-bit lanes.
+ return (vminvq_u8(vreinterpretq_u8_s32(v4s)) & 0x80) == 0x80;
+ }
+
+ ALWAYS_INLINE bool allfalse() const
+ {
+ // MSB should be clear in all 8-bit lanes.
+ return (vmaxvq_u8(vreinterpretq_u8_s32(v4s)) & 0x80) != 0x80;
+ }
+
+ template<int i>
+ ALWAYS_INLINE GSVector4i insert8(int a) const
+ {
+ return GSVector4i(vreinterpretq_s32_u8(vsetq_lane_u8(a, vreinterpretq_u8_s32(v4s), static_cast<int>(i))));
+ }
+
+ template<int i>
+ ALWAYS_INLINE int extract8() const
+ {
+ return vgetq_lane_u8(vreinterpretq_u8_s32(v4s), i);
+ }
+
+ template<int i>
+ ALWAYS_INLINE GSVector4i insert16(int a) const
+ {
+ return GSVector4i(vreinterpretq_s32_u16(vsetq_lane_u16(a, vreinterpretq_u16_s32(v4s), static_cast<int>(i))));
+ }
+
+ template<int i>
+ ALWAYS_INLINE int extract16() const
+ {
+ return vgetq_lane_u16(vreinterpretq_u16_s32(v4s), i);
+ }
+
+ template<int i>
+ ALWAYS_INLINE GSVector4i insert32(int a) const
+ {
+ return GSVector4i(vsetq_lane_s32(a, v4s, i));
+ }
+
+ template<int i>
+ ALWAYS_INLINE int extract32() const
+ {
+ return vgetq_lane_s32(v4s, i);
+ }
+
+ template<int i>
+ ALWAYS_INLINE GSVector4i insert64(s64 a) const
+ {
+ return GSVector4i(vreinterpretq_s32_s64(vsetq_lane_s64(a, vreinterpretq_s64_s32(v4s), i)));
+ }
+
+ template<int i>
+ ALWAYS_INLINE s64 extract64() const
+ {
+ return vgetq_lane_s64(vreinterpretq_s64_s32(v4s), i);
+ }
+
+ ALWAYS_INLINE static GSVector4i loadnt(const void* p)
+ {
+#if __has_builtin(__builtin_nontemporal_load)
+ return GSVector4i(__builtin_nontemporal_load((int32x4_t*)p));
+#else
+ return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p)));
+#endif
+ }
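+
+ // Note: the nontemporal hint is only advisory; on typical AArch64 targets this lowers to an
+ // ordinary 128-bit load.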
+
+ ALWAYS_INLINE static GSVector4i load32(const void* p)
+ {
+ // should be ldr s0, [x0]
+ u32 val;
+ std::memcpy(&val, p, sizeof(u32));
+ return GSVector4i(vsetq_lane_u32(val, vdupq_n_u32(0), 0));
+ }
+
+ ALWAYS_INLINE static GSVector4i loadl(const void* p)
+ {
+ return GSVector4i(vcombine_s32(vld1_s32((const int32_t*)p), vcreate_s32(0)));
+ }
+
+ ALWAYS_INLINE static GSVector4i loadh(const void* p)
+ {
+ return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vdup_n_s64(0), vld1_s64((int64_t*)p))));
+ }
+
+ ALWAYS_INLINE static GSVector4i loadh(const void* p, const GSVector4i& v)
+ {
+ return GSVector4i(
+ vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v.v4s)), vld1_s64((int64_t*)p))));
+ }
+
+ ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return loadh(&v); }
+
+ ALWAYS_INLINE static GSVector4i load(const void* pl, const void* ph)
+ {
+ return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vld1_s64((int64_t*)pl), vld1_s64((int64_t*)ph))));
+ }
+
+ template<bool aligned>
+ ALWAYS_INLINE static GSVector4i load(const void* p)
+ {
+ return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p)));
+ }
+
+ ALWAYS_INLINE static GSVector4i load(int i) { return GSVector4i(vsetq_lane_s32(i, vdupq_n_s32(0), 0)); }
+
+ ALWAYS_INLINE static GSVector4i loadq(s64 i)
+ {
+ return GSVector4i(vreinterpretq_s32_s64(vsetq_lane_s64(i, vdupq_n_s64(0), 0)));
+ }
+
+ ALWAYS_INLINE static void storent(void* p, const GSVector4i& v)
+ {
+#if __has_builtin(__builtin_nontemporal_store)
+ __builtin_nontemporal_store(v.v4s, ((int32x4_t*)p));
+#else
+ vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s));
+#endif
+ }
+
+ ALWAYS_INLINE static void store32(void* p, const GSVector4i& v)
+ {
+ u32 val = vgetq_lane_s32(v, 0);
+ std::memcpy(p, &val, sizeof(u32));
+ }
+
+ ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
+ {
+ vst1_s64((int64_t*)p, vget_low_s64(vreinterpretq_s64_s32(v.v4s)));
+ }
+
+ ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
+ {
+ vst1_s64((int64_t*)p, vget_high_s64(vreinterpretq_s64_s32(v.v4s)));
+ }
+
+ ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v)
+ {
+ GSVector4i::storel(pl, v);
+ GSVector4i::storeh(ph, v);
+ }
+
+ template<bool aligned>
+ ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
+ {
+ vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s));
+ }
+
+ ALWAYS_INLINE static int store(const GSVector4i& v) { return vgetq_lane_s32(v.v4s, 0); }
+
+ ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return vgetq_lane_s64(vreinterpretq_s64_s32(v.v4s), 0); }
+
+ ALWAYS_INLINE void operator&=(const GSVector4i& v)
+ {
+ v4s = vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
+ }
+
+ ALWAYS_INLINE void operator|=(const GSVector4i& v)
+ {
+ v4s = vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
+ }
+
+ ALWAYS_INLINE void operator^=(const GSVector4i& v)
+ {
+ v4s = vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)));
+ }
+
+ ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2)
+ {
+ return GSVector4i(vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
+ }
+
+ ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2)
+ {
+ return GSVector4i(vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
+ }
+
+ ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2)
+ {
+ return GSVector4i(vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s))));
+ }
+
+ ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, int i) { return v & GSVector4i(i); }
+
+ ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, int i) { return v | GSVector4i(i); }
+
+ ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, int i) { return v ^ GSVector4i(i); }
+
+ ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return GSVector4i(vmvnq_s32(v.v4s)); }
+
+ ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(0); }
+
+ ALWAYS_INLINE static GSVector4i xffffffff() { return GSVector4i(0xFFFFFFFF); }
+
+ ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); }
+
+ ALWAYS_INLINE GSVector2i xy() const
+ {
+ GSVector2i ret;
+ storel(&ret, *this);
+ return ret;
+ }
+
+ ALWAYS_INLINE GSVector2i zw() const
+ {
+ GSVector2i ret;
+ storeh(&ret, *this);
+ return ret;
+ }
+
+ // clang-format off
+
+
+#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
+ ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const { return GSVector4i(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); }
+
+ // ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const {return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn)));}
+ // ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const {return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));}
+ // ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const {return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));}
+ // ALWAYS_INLINE GSVector4i xs##ys##zs##ws##lh() const {return GSVector4i(_mm_shufflehi_epi16(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)), _MM_SHUFFLE(wn, zn, yn, xn)));}
+
+#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
+ VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
+ VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
+ VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
+ VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
+
+#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \
+ VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
+ VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
+ VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
+ VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
+
+#define VECTOR4i_SHUFFLE_1(xs, xn) \
+ VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \
+ VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \
+ VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \
+ VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \
+
+ VECTOR4i_SHUFFLE_1(x, 0)
+ VECTOR4i_SHUFFLE_1(y, 1)
+ VECTOR4i_SHUFFLE_1(z, 2)
+ VECTOR4i_SHUFFLE_1(w, 3)
+
+ // TODO: Make generic like above.
+ ALWAYS_INLINE GSVector4i xxzzlh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 0, 2, 2, 4, 4, 6, 6))); }
+ ALWAYS_INLINE GSVector4i yywwlh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 1, 1, 3, 3, 5, 5, 7, 7))); }
+ ALWAYS_INLINE GSVector4i yxwzlh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 1, 0, 3, 2, 5, 4, 7, 6))); }
+ ALWAYS_INLINE GSVector4i xxxxlh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 0, 0, 0, 4, 4, 4, 4))); }
+
+ ALWAYS_INLINE GSVector4i xxxxl() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 0, 0, 0, 4, 5, 6, 7))); }
+ ALWAYS_INLINE GSVector4i zwxyl() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 2, 3, 0, 1, 4, 5, 6, 7))); }
+ ALWAYS_INLINE GSVector4i yxwzl() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 1, 0, 3, 2, 4, 5, 6, 7))); }
+ ALWAYS_INLINE GSVector4i zwzwl() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 2, 3, 2, 3, 4, 5, 6, 7))); }
+
+ ALWAYS_INLINE GSVector4i zzzzh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 1, 2, 3, 6, 6, 6, 6))); }
+
+ // clang-format on
+};
+
+class alignas(16) GSVector4
+{
+ struct cxpr_init_tag
+ {
+ };
+ static constexpr cxpr_init_tag cxpr_init{};
+
+ constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {}
+
+ constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {}
+
+ constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}
+
+public:
+ union
+ {
+ struct
+ {
+ float x, y, z, w;
+ };
+ struct
+ {
+ float r, g, b, a;
+ };
+ struct
+ {
+ float left, top, right, bottom;
+ };
+ float F32[4];
+ double F64[2];
+ s8 I8[16];
+ s16 I16[8];
+ s32 I32[4];
+ s64 I64[2];
+ u8 U8[16];
+ u16 U16[8];
+ u32 U32[4];
+ u64 U64[2];
+ float32x4_t v4s;
+ };
+
+ GSVector4() = default;
+
+ constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); }
+
+ constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); }
+
+ constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); }
+
+ constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); }
+
+ constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); }
+
+ constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }
+
+ ALWAYS_INLINE GSVector4(float x, float y, float z, float w)
+ {
+ const float arr[4] = {x, y, z, w};
+ v4s = vld1q_f32(arr);
+ }
+
+ ALWAYS_INLINE GSVector4(float x, float y)
+ {
+ v4s = vzip1q_f32(vsetq_lane_f32(x, vdupq_n_f32(0.0f), 0), vsetq_lane_f32(y, vdupq_n_f32(0.0f), 0));
+ }
+
+ ALWAYS_INLINE GSVector4(int x, int y, int z, int w)
+ {
+ const int arr[4] = {x, y, z, w};
+ v4s = vcvtq_f32_s32(vld1q_s32(arr));
+ }
+
+ ALWAYS_INLINE GSVector4(int x, int y)
+ {
+ v4s = vcvtq_f32_s32(vzip1q_s32(vsetq_lane_s32(x, vdupq_n_s32(0), 0), vsetq_lane_s32(y, vdupq_n_s32(0), 0)));
+ }
+
+ ALWAYS_INLINE explicit GSVector4(const GSVector2& v) { v4s = vcombine_f32(vld1_f32(v.v), vcreate_f32(0)); }
+
+ ALWAYS_INLINE explicit GSVector4(const GSVector2i& v)
+ {
+ v4s = vcvtq_f32_s32(vcombine_s32(vld1_s32(v.v), vcreate_s32(0)));
+ }
+
+ ALWAYS_INLINE constexpr explicit GSVector4(float32x4_t m) : v4s(m) {}
+
+ ALWAYS_INLINE explicit GSVector4(float f) { v4s = vdupq_n_f32(f); }
+
+ ALWAYS_INLINE explicit GSVector4(int i) { v4s = vcvtq_f32_s32(vdupq_n_s32(i)); }
+
+ ALWAYS_INLINE explicit GSVector4(const GSVector4i& v);
+
+ ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
+
+ ALWAYS_INLINE static GSVector4 f64(double x, double y)
+ {
+ return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1)));
+ }
+
+ ALWAYS_INLINE void operator=(float f) { v4s = vdupq_n_f32(f); }
+
+ ALWAYS_INLINE void operator=(float32x4_t m) { v4s = m; }
+
+ ALWAYS_INLINE operator float32x4_t() const { return v4s; }
+
+ /// Makes Clang think that the whole vector is needed, preventing it from changing shuffles around because it thinks
+ /// we don't need the whole vector. Useful for e.g. preventing clang from optimizing away shuffles that remove
+ /// possibly-denormal garbage data from vectors before computing with them.
+ ALWAYS_INLINE GSVector4 noopt()
+ {
+ // Note: Clang is currently the only compiler that attempts to optimize vector intrinsics, if that changes in the
+ // future the implementation should be updated
+#ifdef __clang__
+ // __asm__("":"+x"(m)::);
+#endif
+ return *this;
+ }
+
+ ALWAYS_INLINE u32 rgba32() const { return GSVector4i(*this).rgba32(); }
+
+ ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); }
+
+ ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); }
+
+ ALWAYS_INLINE GSVector4 abs() const { return GSVector4(vabsq_f32(v4s)); }
+
+ ALWAYS_INLINE GSVector4 neg() const { return GSVector4(vnegq_f32(v4s)); }
+
+ ALWAYS_INLINE GSVector4 rcp() const { return GSVector4(vrecpeq_f32(v4s)); }
+
+ ALWAYS_INLINE GSVector4 rcpnr() const
+ {
+ float32x4_t recip = vrecpeq_f32(v4s);
+ recip = vmulq_f32(recip, vrecpsq_f32(recip, v4s));
+ return GSVector4(recip);
+ }
+
+ template<int mode>
+ ALWAYS_INLINE GSVector4 round() const
+ {
+ if constexpr (mode == Round_NegInf)
+ return floor();
+ else if constexpr (mode == Round_PosInf)
+ return ceil();
+ else if constexpr (mode == Round_NearestInt)
+ return GSVector4(vrndnq_f32(v4s));
+ else
+ return GSVector4(vrndq_f32(v4s));
+ }
+
+ ALWAYS_INLINE GSVector4 floor() const { return GSVector4(vrndmq_f32(v4s)); }
+
+ ALWAYS_INLINE GSVector4 ceil() const { return GSVector4(vrndpq_f32(v4s)); }
+
+ ALWAYS_INLINE GSVector4 madd(const GSVector4& a, const GSVector4& b) const
+ {
+ return GSVector4(vfmaq_f32(b.v4s, v4s, a.v4s));
+ }
+ ALWAYS_INLINE GSVector4 msub(const GSVector4& a, const GSVector4& b) const
+ {
+ return GSVector4(vnegq_f32(vfmsq_f32(b.v4s, v4s, a.v4s))); // *this * a - b
+ }
+ ALWAYS_INLINE GSVector4 nmadd(const GSVector4& a, const GSVector4& b) const { return b - *this * a; }
+ ALWAYS_INLINE GSVector4 nmsub(const GSVector4& a, const GSVector4& b) const { return -b - *this * a; }
+
+ ALWAYS_INLINE GSVector4 addm(const GSVector4& a, const GSVector4& b) const
+ {
+ return a.madd(b, *this); // *this + a * b
+ }
+
+ ALWAYS_INLINE GSVector4 subm(const GSVector4& a, const GSVector4& b) const
+ {
+ return a.nmadd(b, *this); // *this - a * b
+ }
+
+ ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(vpaddq_f32(v4s, v4s)); }
+
+ ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(vpaddq_f32(v4s, v.v4s)); }
+
+ ALWAYS_INLINE GSVector4 hsub() const { return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v4s), vuzp2q_f32(v4s, v4s))); }
+
+ ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const
+ {
+ return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v.v4s), vuzp2q_f32(v4s, v.v4s)));
+ }
+
+ ALWAYS_INLINE GSVector4 sat(const GSVector4& a, const GSVector4& b) const { return max(a).min(b); }
+
+ ALWAYS_INLINE GSVector4 sat(const GSVector4& a) const
+ {
+ const GSVector4 minv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 0)));
+ const GSVector4 maxv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 1)));
+ return sat(minv, maxv);
+ }
+
+ ALWAYS_INLINE GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); }
+
+ ALWAYS_INLINE GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); }
+
+ ALWAYS_INLINE GSVector4 min(const GSVector4& a) const { return GSVector4(vminq_f32(v4s, a.v4s)); }
+
+ ALWAYS_INLINE GSVector4 max(const GSVector4& a) const { return GSVector4(vmaxq_f32(v4s, a.v4s)); }
+
+ template<int mask>
+ ALWAYS_INLINE GSVector4 blend32(const GSVector4& a) const
+ {
+ return GSVector4(__builtin_shufflevector(v4s, a.v4s, (mask & 1) ? 4 : 0, (mask & 2) ? 5 : 1, (mask & 4) ? 6 : 2,
+ (mask & 8) ? 7 : 3));
+ }
+
+ ALWAYS_INLINE GSVector4 blend32(const GSVector4& a, const GSVector4& mask) const
+ {
+ // duplicate sign bit across and bit select
+ const uint32x4_t bitmask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(mask.v4s), 31));
+ return GSVector4(vbslq_f32(bitmask, a.v4s, v4s));
+ }
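+
+ // e.g. blend32(a, m) with m = (-1.0f, 2.0f, -0.0f, 4.0f) takes lanes 0 and 2 from a
+ // (sign bit set) and lanes 1 and 3 from *this.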
+
+ ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const { return GSVector4(vzip1q_f32(v4s, a.v4s)); }
+
+ ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const { return GSVector4(vzip2q_f32(v4s, a.v4s)); }
+
+ ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const
+ {
+ return GSVector4(vreinterpretq_f32_f64(vzip1q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const
+ {
+ return GSVector4(vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4 l2h(const GSVector4& a) const
+ {
+ return GSVector4(vcombine_f32(vget_low_f32(v4s), vget_low_f32(a.v4s)));
+ }
+
+ ALWAYS_INLINE GSVector4 h2l(const GSVector4& a) const
+ {
+ return GSVector4(vcombine_f32(vget_high_f32(v4s), vget_high_f32(a.v4s)));
+ }
+
+ ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const
+ {
+ return GSVector4(vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(v4s), vreinterpretq_s32_f32(v.v4s))));
+ }
+
+ ALWAYS_INLINE int mask() const
+ {
+ static constexpr const int32_t shifts[] = {0, 1, 2, 3};
+ return static_cast<int>(vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31), vld1q_s32(shifts))));
+ }
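+
+ // Sign-bit gather, equivalent to SSE's _mm_movemask_ps: e.g. (-1.0f, 2.0f, -3.0f, 4.0f).mask() == 0b0101.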
+
+ ALWAYS_INLINE bool alltrue() const
+ {
+ // return mask() == 0xf;
+ return ~(vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 0) & vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 1)) == 0;
+ }
+
+ ALWAYS_INLINE bool allfalse() const
+ {
+ return (vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 0) | vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 1)) == 0;
+ }
+
+ ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); }
+
+ template<int dst, int src>
+ ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const
+ {
+ return GSVector4(vcopyq_laneq_f32(v4s, dst, v.v4s, src));
+ }
+
+ template<int i>
+ ALWAYS_INLINE int extract32() const
+ {
+ return vgetq_lane_s32(vreinterpretq_s32_f32(v4s), i);
+ }
+
+ ALWAYS_INLINE static GSVector4 zero() { return GSVector4(vdupq_n_f32(0.0f)); }
+
+ ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); }
+
+ ALWAYS_INLINE static GSVector4 loadl(const void* p)
+ {
+ return GSVector4(vcombine_f32(vld1_f32((const float*)p), vcreate_f32(0)));
+ }
+
+ ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(vsetq_lane_f32(f, vmovq_n_f32(0.0f), 0)); }
+
+ template<bool aligned>
+ ALWAYS_INLINE static GSVector4 load(const void* p)
+ {
+ return GSVector4(vld1q_f32((const float*)p));
+ }
+
+ ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { vst1q_f32((float*)p, v.v4s); }
+
+ ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
+ {
+ vst1_f64((double*)p, vget_low_f64(vreinterpretq_f64_f32(v.v4s)));
+ }
+
+ ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
+ {
+ vst1_f64((double*)p, vget_high_f64(vreinterpretq_f64_f32(v.v4s)));
+ }
+
+ template<bool aligned>
+ ALWAYS_INLINE static void store(void* p, const GSVector4& v)
+ {
+ vst1q_f32((float*)p, v.v4s);
+ }
+
+ ALWAYS_INLINE static void store(float* p, const GSVector4& v) { vst1q_lane_f32(p, v.v4s, 0); }
+
+ ALWAYS_INLINE GSVector4 operator-() const { return neg(); }
+
+ ALWAYS_INLINE void operator+=(const GSVector4& v) { v4s = vaddq_f32(v4s, v.v4s); }
+ ALWAYS_INLINE void operator-=(const GSVector4& v) { v4s = vsubq_f32(v4s, v.v4s); }
+ ALWAYS_INLINE void operator*=(const GSVector4& v) { v4s = vmulq_f32(v4s, v.v4s); }
+ ALWAYS_INLINE void operator/=(const GSVector4& v) { v4s = vdivq_f32(v4s, v.v4s); }
+
+ ALWAYS_INLINE void operator+=(float f) { *this += GSVector4(f); }
+ ALWAYS_INLINE void operator-=(float f) { *this -= GSVector4(f); }
+ ALWAYS_INLINE void operator*=(float f) { *this *= GSVector4(f); }
+ ALWAYS_INLINE void operator/=(float f) { *this /= GSVector4(f); }
+
+ ALWAYS_INLINE void operator&=(const GSVector4& v)
+ {
+ v4s = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
+ }
+
+ ALWAYS_INLINE void operator|=(const GSVector4& v)
+ {
+ v4s = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
+ }
+
+ ALWAYS_INLINE void operator^=(const GSVector4& v)
+ {
+ v4s = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s)));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(vaddq_f32(v1.v4s, v2.v4s));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(vsubq_f32(v1.v4s, v2.v4s));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(vmulq_f32(v1.v4s, v2.v4s));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(vdivq_f32(v1.v4s, v2.v4s));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); }
+ ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); }
+ ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); }
+ ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f) { return v / GSVector4(f); }
+
+ ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s))));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(vreinterpretq_f32_u32(vceqq_f32(v1.v4s, v2.v4s)));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2)
+ {
+ // NEON has no !=
+ return GSVector4(vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(v1.v4s, v2.v4s))));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(vreinterpretq_f32_u32(vcgtq_f32(v1.v4s, v2.v4s)));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(vreinterpretq_f32_u32(vcltq_f32(v1.v4s, v2.v4s)));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(vreinterpretq_f32_u32(vcgeq_f32(v1.v4s, v2.v4s)));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s)));
+ }
+
+ ALWAYS_INLINE GSVector4 mul64(const GSVector4& v) const
+ {
+ return GSVector4(vreinterpretq_f32_f64(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4 add64(const GSVector4& v) const
+ {
+ return GSVector4(vreinterpretq_f32_f64(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+ }
+
+ ALWAYS_INLINE GSVector4 sub64(const GSVector4& v) const
+ {
+ return GSVector4(vreinterpretq_f32_f64(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))));
+ }
+
+ ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v)
+ {
+ return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vget_low_f32(v.v4s))));
+ }
+
+ ALWAYS_INLINE static GSVector4 f32to64(const void* p)
+ {
+ return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast<const float*>(p)))));
+ }
+
+ ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const
+ {
+ const float64x2_t r = truncate ? vreinterpretq_f64_f32(v4s) : vrndiq_f64(vreinterpretq_f64_f32(v4s));
+ const s32 low = static_cast<s32>(vgetq_lane_f64(r, 0));
+ const s32 high = static_cast<s32>(vgetq_lane_f64(r, 1));
+ return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1));
+ }
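+
+ // f64toi32() packs the two doubles into the low two s32 lanes; for example,
+ // (1.7, -2.3) converts to (1, -2, 0, 0) when truncating.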
+
+ // clang-format off
+
+#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
+ ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); } \
+ ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v) const { return GSVector4(__builtin_shufflevector(v4s, v.v4s, xn, yn, 4 + zn, 4 + wn)); }
+
+#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
+ VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
+ VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
+ VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
+ VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
+
+#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
+ VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
+ VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
+ VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
+ VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
+
+#define VECTOR4_SHUFFLE_1(xs, xn) \
+ VECTOR4_SHUFFLE_2(xs, xn, x, 0) \
+ VECTOR4_SHUFFLE_2(xs, xn, y, 1) \
+ VECTOR4_SHUFFLE_2(xs, xn, z, 2) \
+ VECTOR4_SHUFFLE_2(xs, xn, w, 3) \
+
+ VECTOR4_SHUFFLE_1(x, 0)
+ VECTOR4_SHUFFLE_1(y, 1)
+ VECTOR4_SHUFFLE_1(z, 2)
+ VECTOR4_SHUFFLE_1(w, 3)
+
+ // clang-format on
+
+ ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(vdupq_laneq_f32(v4s, 0)); }
+
+ ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v) { return GSVector4(vdupq_laneq_f32(v.v4s, 0)); }
+
+ ALWAYS_INLINE static GSVector4 broadcast32(const void* f) { return GSVector4(vld1q_dup_f32((const float*)f)); }
+
+ ALWAYS_INLINE static GSVector4 broadcast64(const void* f)
+ {
+ return GSVector4(vreinterpretq_f32_f64(vld1q_dup_f64((const double*)f)));
+ }
+};
+
+ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate)
+{
+ v4s = truncate ? vcvtq_s32_f32(v.v4s) : vcvtnq_s32_f32(v.v4s);
+}
+
+ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v)
+{
+ v4s = vcvtq_f32_s32(v.v4s);
+}
+
+ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v)
+{
+ return GSVector4i(vreinterpretq_s32_f32(v.v4s));
+}
+
+ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v)
+{
+ return GSVector4(vreinterpretq_f32_s32(v.v4s));
+}
diff --git a/src/common/gsvector_nosimd.h b/src/common/gsvector_nosimd.h
new file mode 100644
index 000000000..b718b8e3c
--- /dev/null
+++ b/src/common/gsvector_nosimd.h
@@ -0,0 +1,1612 @@
+// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team, 2019-2024 Connor McLaughlin
+// SPDX-License-Identifier: LGPL-3.0+
+
+// Implementation of GSVector4/GSVector4i when the host does not support any form of SIMD.
+
+#pragma once
+
+#include "common/assert.h"
+#include "common/types.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+
+#define GSVECTOR_HAS_UNSIGNED 1
+#define GSVECTOR_HAS_SRLV 1
+
+class GSVector4;
+
+#define ALL_LANES_8(expr) \
+ GSVector4i ret; \
+ for (size_t i = 0; i < 16; i++) \
+ expr; \
+ return ret;
+#define ALL_LANES_16(expr) \
+ GSVector4i ret; \
+ for (size_t i = 0; i < 8; i++) \
+ expr; \
+ return ret;
+#define ALL_LANES_32(expr) \
+ GSVector4i ret; \
+ for (size_t i = 0; i < 4; i++) \
+ expr; \
+ return ret;
+#define ALL_LANES_64(expr) \
+ GSVector4i ret; \
+ for (size_t i = 0; i < 2; i++) \
+ expr; \
+ return ret;
+#define SSATURATE8(expr) static_cast<s8>(std::clamp(expr, -128, 127))
+#define USATURATE8(expr) static_cast<u8>(std::clamp(expr, 0, 255))
+#define SSATURATE16(expr) static_cast<s16>(std::clamp(expr, -32768, 32767))
+#define USATURATE16(expr) static_cast<u16>(std::clamp(expr, 0, 65535))
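+
+// These helpers clamp the (int-promoted) expression into the destination
+// lane's range before narrowing: SSATURATE8(200) == 127, SSATURATE8(-200) ==
+// -128, and USATURATE16(70000) == 65535.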
+
+class alignas(16) GSVector4i
+{
+ struct cxpr_init_tag
+ {
+ };
+ static constexpr cxpr_init_tag cxpr_init{};
+
+ constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : I32{x, y, z, w} {}
+
+ constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
+ : I16{s0, s1, s2, s3, s4, s5, s6, s7}
+ {
+ }
+
+ constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10,
+ s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
+ : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
+ {
+ }
+
+public:
+ union
+ {
+ struct
+ {
+ s32 x, y, z, w;
+ };
+ struct
+ {
+ s32 r, g, b, a;
+ };
+ struct
+ {
+ s32 left, top, right, bottom;
+ };
+ float F32[4];
+ s8 I8[16];
+ s16 I16[8];
+ s32 I32[4];
+ s64 I64[2];
+ u8 U8[16];
+ u16 U16[8];
+ u32 U32[4];
+ u64 U64[2];
+ };
+
+ GSVector4i() = default;
+
+ ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w)
+ {
+ return GSVector4i(cxpr_init, x, y, z, w);
+ }
+
+ ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); }
+
+ ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); }
+
+ ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
+ {
+ return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7);
+ }
+
+ ALWAYS_INLINE constexpr static GSVector4i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9,
+ s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
+ {
+ return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);
+ }
+
+ ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w)
+ {
+ this->x = x;
+ this->y = y;
+ this->z = z;
+ this->w = w;
+ }
+
+ ALWAYS_INLINE GSVector4i(s32 x, s32 y) { *this = load(x).upl32(load(y)); }
+
+ ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
+ {
+ I16[0] = s0;
+ I16[1] = s1;
+ I16[2] = s2;
+ I16[3] = s3;
+ I16[4] = s4;
+ I16[5] = s5;
+ I16[6] = s6;
+ I16[7] = s7;
+ }
+
+ ALWAYS_INLINE constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10,
+ s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
+ : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
+ {
+ }
+
+ ALWAYS_INLINE GSVector4i(const GSVector4i& v) { std::memcpy(I32, v.I32, sizeof(I32)); }
+
+ ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v)
+ {
+ x = v.x;
+ y = v.y;
+ z = 0;
+ w = 0;
+ }
+
+ // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7),
+ // so leave the non-constexpr version default
+ ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; }
+
+ ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true);
+
+ ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);
+
+ ALWAYS_INLINE void operator=(const GSVector4i& v) { std::memcpy(I32, v.I32, sizeof(I32)); }
+ ALWAYS_INLINE void operator=(s32 i)
+ {
+ x = i;
+ y = i;
+ z = i;
+ w = i;
+ }
+
+ // rect
+
+ ALWAYS_INLINE s32 width() const { return right - left; }
+
+ ALWAYS_INLINE s32 height() const { return bottom - top; }
+
+ ALWAYS_INLINE GSVector4i rsize() const
+ {
+ return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height());
+ }
+
+ ALWAYS_INLINE s32 rarea() const { return width() * height(); }
+
+ ALWAYS_INLINE bool rempty() const { return lt32(zwzw()).mask() != 0x00ff; }
+
+ // TODO: Optimize for no-simd, this generates crap code.
+ ALWAYS_INLINE GSVector4i runion(const GSVector4i& v) const { return min_i32(v).upl64(max_i32(v).srl<8>()); }
+
+ ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& v) const { return sat_i32(v); }
+ ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); }
+ ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); }
+
+ template <Align_Mode mode>
+ GSVector4i _ralign_helper(const GSVector4i& mask) const
+ {
+ GSVector4i v;
+
+ switch (mode)
+ {
+ case Align_Inside:
+ v = add32(mask);
+ break;
+ case Align_Outside:
+ v = add32(mask.zwxy());
+ break;
+ case Align_NegInf:
+ v = *this;
+ break;
+ case Align_PosInf:
+ v = add32(mask.xyxy());
+ break;
+
+ default:
+ UnreachableCode();
+ break;
+ }
+
+ return v.andnot(mask.xyxy());
+ }
+
+ /// Align the rect using mask values that already have one subtracted (1 << n - 1 aligns to 1 << n)
+ template <Align_Mode mode>
+ GSVector4i ralign_presub(const GSVector2i& v) const
+ {
+ return _ralign_helper<mode>(GSVector4i(v));
+ }
+
+ template <Align_Mode mode>
+ GSVector4i ralign(const GSVector2i& v) const
+ {
+ // a must be 1 << n
+
+ return _ralign_helper<mode>(GSVector4i(v).sub32(GSVector4i(1, 1)));
+ }
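+
+ // Example: ralign<Align_Outside>(GSVector2i(16, 16)) grows (3, 5, 20, 33) to
+ // the block-aligned (0, 0, 32, 48), while Align_Inside shrinks a rect to the
+ // largest fully-covered block region.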
+
+ //
+
+ ALWAYS_INLINE u32 rgba32() const
+ {
+ GSVector4i v = *this;
+
+ v = v.ps32(v);
+ v = v.pu16(v);
+
+ return (u32)store(v);
+ }
+
+ ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& min, const GSVector4i& max) const
+ {
+ return max_i8(min).min_i8(max);
+ }
+ ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& minmax) const
+ {
+ return max_i8(minmax.xyxy()).min_i8(minmax.zwzw());
+ }
+ ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& min, const GSVector4i& max) const
+ {
+ return max_i16(min).min_i16(max);
+ }
+ ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& minmax) const
+ {
+ return max_i16(minmax.xyxy()).min_i16(minmax.zwzw());
+ }
+ ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& min, const GSVector4i& max) const
+ {
+ return max_i32(min).min_i32(max);
+ }
+ ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& minmax) const
+ {
+ return max_i32(minmax.xyxy()).min_i32(minmax.zwzw());
+ }
+
+ ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const
+ {
+ return max_u8(min).min_u8(max);
+ }
+ ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const
+ {
+ return max_u8(minmax.xyxy()).min_u8(minmax.zwzw());
+ }
+ ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const
+ {
+ return max_u16(min).min_u16(max);
+ }
+ ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const
+ {
+ return max_u16(minmax.xyxy()).min_u16(minmax.zwzw());
+ }
+ ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const
+ {
+ return max_u32(min).min_u32(max);
+ }
+ ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const
+ {
+ return max_u32(minmax.xyxy()).min_u32(minmax.zwzw());
+ }
+
+ GSVector4i min_i8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = std::min(I8[i], v.I8[i])); }
+ GSVector4i max_i8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = std::max(I8[i], v.I8[i])); }
+ GSVector4i min_i16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = std::min(I16[i], v.I16[i])); }
+ GSVector4i max_i16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = std::max(I16[i], v.I16[i])); }
+ GSVector4i min_i32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = std::min(I32[i], v.I32[i])); }
+ GSVector4i max_i32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = std::max(I32[i], v.I32[i])); }
+
+ GSVector4i min_u8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = std::min(U8[i], v.U8[i])); }
+ GSVector4i max_u8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = std::max(U8[i], v.U8[i])); }
+ GSVector4i min_u16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = std::min(U16[i], v.U16[i])); }
+ GSVector4i max_u16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = std::max(U16[i], v.U16[i])); }
+ GSVector4i min_u32(const GSVector4i& v) const { ALL_LANES_32(ret.U32[i] = std::min(U32[i], v.U32[i])); }
+ GSVector4i max_u32(const GSVector4i& v) const { ALL_LANES_32(ret.U32[i] = std::max(U32[i], v.U32[i])); }
+
+ GSVector4i madd_s16(const GSVector4i& v) const
+ {
+ ALL_LANES_32(ret.I32[i] = (I16[i * 2] * v.I16[i * 2]) + (I16[i * 2 + 1] * v.I16[i * 2 + 1]));
+ }
+
+ GSVector4i addp_s32() const { return GSVector4i(x + y, z + w, 0, 0); }
+
+ s32 minv_s32() const { return std::min(x, std::min(y, std::min(z, w))); }
+
+ u32 minv_u32() const { return std::min(U32[0], std::min(U32[1], std::min(U32[2], U32[3]))); }
+
+ s32 maxv_s32() const { return std::max(x, std::max(y, std::max(z, w))); }
+
+ u32 maxv_u32() const { return std::max(U32[0], std::max(U32[1], std::max(U32[2], U32[3]))); }
+
+ static s32 min_i16(s32 a, s32 b) { return store(load(a).min_i16(load(b))); }
+
+ ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); }
+
+ GSVector4i blend8(const GSVector4i& v, const GSVector4i& mask) const
+ {
+ GSVector4i ret;
+ for (size_t i = 0; i < 16; i++)
+ ret.U8[i] = (mask.U8[i] & 0x80) ? v.U8[i] : U8[i];
+ return ret;
+ }
+
+ template <s32 mask>
+ GSVector4i blend16(const GSVector4i& v) const
+ {
+ GSVector4i ret;
+ for (size_t i = 0; i < 8; i++)
+ ret.U16[i] = ((mask & (1 << i)) != 0) ? v.U16[i] : U16[i];
+ return ret;
+ }
+
+ template <s32 mask>
+ GSVector4i blend32(const GSVector4i& v) const
+ {
+ GSVector4i ret;
+ for (size_t i = 0; i < 4; i++)
+ ret.U32[i] = ((mask & (1 << i)) != 0) ? v.U32[i] : U32[i];
+ return ret;
+ }
+
+ GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const
+ {
+ GSVector4i ret;
+ for (size_t i = 0; i < 2; i++)
+ ret.U64[i] = (v.U64[i] & mask.U64[i]) | (U64[i] & ~mask.U64[i]);
+ return ret;
+ }
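+
+ // The blend family selects per lane between *this and v: blend8() keys off
+ // the sign bit of each mask byte, blend16/blend32 use compile-time mask bits
+ // (bit i set takes v's lane i), and blend() is a full bitwise select.
+ // mix16() below is simply blend16<0xaa>, i.e. odd 16-bit lanes come from v.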
+
+ ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); }
+
+ GSVector4i shuffle8(const GSVector4i& mask) const
+ {
+ ALL_LANES_8(ret.I8[i] = (mask.I8[i] & 0x80) ? 0 : (I8[mask.I8[i] & 0xf]));
+ }
+
+ GSVector4i ps16(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8((i < 8) ? I16[i] : v.I16[i - 8])); }
+ GSVector4i ps16() const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I16[i & 7])); }
+ GSVector4i pu16(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8((i < 8) ? U16[i] : v.U16[i - 8])); }
+ GSVector4i pu16() const { ALL_LANES_8(ret.U8[i] = USATURATE8(U16[i & 7])); }
+ GSVector4i ps32(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = SSATURATE16((i < 4) ? I32[i] : v.I32[i - 4])); }
+ GSVector4i ps32() const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I32[i & 3])); }
+ GSVector4i pu32(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16((i < 4) ? U32[i] : v.U32[i - 4])); }
+ GSVector4i pu32() const { ALL_LANES_16(ret.U16[i] = USATURATE16(U32[i & 3])); }
+
+ GSVector4i upl8(const GSVector4i& v) const
+ {
+ return GSVector4i(I8[0], v.I8[0], I8[1], v.I8[1], I8[2], v.I8[2], I8[3], v.I8[3], I8[4], v.I8[4], I8[5], v.I8[5],
+ I8[6], v.I8[6], I8[7], v.I8[7]);
+ }
+ GSVector4i uph8(const GSVector4i& v) const
+ {
+ return GSVector4i(I8[8], v.I8[8], I8[9], v.I8[9], I8[10], v.I8[10], I8[11], v.I8[11], I8[12], v.I8[12], I8[13],
+ v.I8[13], I8[14], v.I8[14], I8[15], v.I8[15]);
+ }
+ GSVector4i upl16(const GSVector4i& v) const
+ {
+ return GSVector4i(I16[0], v.I16[0], I16[1], v.I16[1], I16[2], v.I16[2], I16[3], v.I16[3]);
+ }
+ GSVector4i uph16(const GSVector4i& v) const
+ {
+ return GSVector4i(I16[4], v.I16[4], I16[5], v.I16[5], I16[6], v.I16[6], I16[7], v.I16[7]);
+ }
+ GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(I32[0], v.I32[0], I32[1], v.I32[1]); }
+ GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(I32[2], v.I32[2], I32[3], v.I32[3]); }
+ GSVector4i upl64(const GSVector4i& v) const
+ {
+ GSVector4i ret;
+ ret.I64[0] = I64[0];
+ ret.I64[1] = v.I64[0];
+ return ret;
+ }
+ GSVector4i uph64(const GSVector4i& v) const
+ {
+ GSVector4i ret;
+ ret.I64[0] = I64[1];
+ ret.I64[1] = v.I64[1];
+ return ret;
+ }
+
+ GSVector4i upl8() const
+ {
+ return GSVector4i(I8[0], 0, I8[1], 0, I8[2], 0, I8[3], 0, I8[4], 0, I8[5], 0, I8[6], 0, I8[7], 0);
+ }
+ GSVector4i uph8() const
+ {
+ return GSVector4i(I8[8], 0, I8[9], 0, I8[10], 0, I8[11], 0, I8[12], 0, I8[13], 0, I8[14], 0, I8[15], 0);
+ }
+
+ GSVector4i upl16() const { return GSVector4i(I16[0], 0, I16[1], 0, I16[2], 0, I16[3], 0); }
+ GSVector4i uph16() const { return GSVector4i(I16[4], 0, I16[5], 0, I16[6], 0, I16[7], 0); }
+
+ GSVector4i upl32() const { return GSVector4i(I32[0], 0, I32[1], 0); }
+ GSVector4i uph32() const { return GSVector4i(I32[2], 0, I32[3], 0); }
+ GSVector4i upl64() const
+ {
+ GSVector4i ret;
+ ret.I64[0] = I64[0];
+ ret.I64[1] = 0;
+ return ret;
+ }
+ GSVector4i uph64() const
+ {
+ GSVector4i ret;
+ ret.I64[0] = I64[1];
+ ret.I64[1] = 0;
+ return ret;
+ }
+
+ GSVector4i i8to16() const { ALL_LANES_16(ret.I16[i] = I8[i]); }
+ GSVector4i i8to32() const { ALL_LANES_32(ret.I32[i] = I8[i]); }
+ GSVector4i i8to64() const { ALL_LANES_64(ret.I64[i] = I8[i]); }
+
+ GSVector4i i16to32() const { ALL_LANES_32(ret.I32[i] = I16[i]); }
+ GSVector4i i16to64() const { ALL_LANES_64(ret.I64[i] = I16[i]); }
+ GSVector4i i32to64() const { ALL_LANES_64(ret.I64[i] = I32[i]); }
+ GSVector4i u8to16() const { ALL_LANES_16(ret.U16[i] = U8[i]); }
+ GSVector4i u8to32() const { ALL_LANES_32(ret.U32[i] = U8[i]); }
+ GSVector4i u8to64() const { ALL_LANES_64(ret.U64[i] = U8[i]); }
+ GSVector4i u16to32() const { ALL_LANES_32(ret.U32[i] = U16[i]); }
+ GSVector4i u16to64() const { ALL_LANES_64(ret.U64[i] = U16[i]); }
+ GSVector4i u32to64() const { ALL_LANES_64(ret.U64[i] = U32[i]); }
+
+ template <s32 v>
+ GSVector4i srl() const
+ {
+ GSVector4i ret = {};
+ if constexpr (v < 16)
+ {
+ for (s32 i = 0; i < (16 - v); i++)
+ ret.U8[i] = U8[v + i];
+ }
+ return ret;
+ }
+
+ template <s32 v>
+ GSVector4i srl(const GSVector4i& r) const
+ {
+ // This sucks. Hopefully it's never used.
+ u8 concat[32];
+ std::memcpy(concat, U8, sizeof(u8) * 16);
+ std::memcpy(concat + 16, r.U8, sizeof(u8) * 16);
+
+ GSVector4i ret;
+ std::memcpy(ret.U8, &concat[v], sizeof(u8) * 16);
+ return ret;
+ }
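+
+ // This is the two-register byte shift (cf. SSE PALIGNR): the 32-byte
+ // concatenation [*this | r] is shifted right by v bytes and the low 16 bytes
+ // are kept, so srl<0>(r) == *this and srl<16>(r) == r.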
+
+ template <s32 v>
+ GSVector4i sll() const
+ {
+ GSVector4i ret = {};
+ if constexpr (v < 16)
+ {
+ for (s32 i = 0; i < (16 - v); i++)
+ ret.U8[v + i] = U8[i];
+ }
+ return ret;
+ }
+
+ template <s32 v>
+ GSVector4i sll16() const
+ {
+ ALL_LANES_16(ret.U16[i] = U16[i] << v);
+ }
+
+ GSVector4i sll16(s32 v) const { ALL_LANES_16(ret.U16[i] = U16[i] << v); }
+
+ GSVector4i sllv16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = U16[i] << v.U16[i]); }
+
+ template <s32 v>
+ GSVector4i srl16() const
+ {
+ ALL_LANES_16(ret.U16[i] = U16[i] >> v);
+ }
+
+ GSVector4i srl16(s32 v) const { ALL_LANES_16(ret.U16[i] = U16[i] >> v); }
+
+ GSVector4i srlv16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = U16[i] >> v.U16[i]); }
+
+ template <s32 v>
+ GSVector4i sra16() const
+ {
+ ALL_LANES_16(ret.I16[i] = I16[i] >> v);
+ }
+
+ GSVector4i sra16(s32 v) const { ALL_LANES_16(ret.I16[i] = I16[i] >> v); }
+
+ GSVector4i srav16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] >> v.I16[i]); }
+
+ template <s32 v>
+ GSVector4i sll32() const
+ {
+ ALL_LANES_32(ret.U32[i] = U32[i] << v);
+ }
+
+ GSVector4i sll32(s32 v) const { ALL_LANES_32(ret.U32[i] = U32[i] << v); }
+
+ GSVector4i sllv32(const GSVector4i& v) const { ALL_LANES_32(ret.U32[i] = U32[i] << v.U32[i]); }
+
+ template <s32 v>
+ GSVector4i srl32() const
+ {
+ ALL_LANES_32(ret.U32[i] = U32[i] >> v);
+ }
+
+ GSVector4i srl32(s32 v) const { ALL_LANES_32(ret.U32[i] = U32[i] >> v); }
+
+ GSVector4i srlv32(const GSVector4i& v) const { ALL_LANES_32(ret.U32[i] = U32[i] >> v.U32[i]); }
+
+ template <s32 v>
+ GSVector4i sra32() const
+ {
+ ALL_LANES_32(ret.I32[i] = I32[i] >> v);
+ }
+
+ GSVector4i sra32(s32 v) const { ALL_LANES_32(ret.I32[i] = I32[i] >> v); }
+
+ GSVector4i srav32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] >> v.I32[i]); }
+
+ template <s32 v>
+ GSVector4i sll64() const
+ {
+ ALL_LANES_64(ret.U64[i] = U64[i] << v);
+ }
+
+ GSVector4i sll64(s32 v) const { ALL_LANES_64(ret.U64[i] = U64[i] << v); }
+
+ GSVector4i sllv64(const GSVector4i& v) const { ALL_LANES_64(ret.U64[i] = U64[i] << v.U64[i]); }
+
+ template <s32 v>
+ GSVector4i srl64() const
+ {
+ ALL_LANES_64(ret.U64[i] = U64[i] >> v);
+ }
+
+ GSVector4i srl64(s32 v) const { ALL_LANES_64(ret.U64[i] = U64[i] >> v); }
+
+ GSVector4i srlv64(const GSVector4i& v) const { ALL_LANES_64(ret.U64[i] = U64[i] >> v.U64[i]); }
+
+ template <s32 v>
+ GSVector4i sra64() const
+ {
+ ALL_LANES_64(ret.I64[i] = I64[i] >> v);
+ }
+
+ GSVector4i sra64(s32 v) const { ALL_LANES_64(ret.I64[i] = I64[i] >> v); }
+
+ GSVector4i srav64(const GSVector4i& v) const { ALL_LANES_64(ret.I64[i] = I64[i] >> v.I64[i]); }
+
+ GSVector4i add8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = I8[i] + v.I8[i]); }
+
+ GSVector4i add16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] + v.I16[i]); }
+
+ GSVector4i add32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] + v.I32[i]); }
+
+ GSVector4i adds8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I8[i] + v.I8[i])); }
+
+ GSVector4i adds16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I16[i] + v.I16[i])); }
+
+ GSVector4i hadds16(const GSVector4i& v) const
+ {
+ return GSVector4i(SSATURATE16(I16[0] + I16[1]), SSATURATE16(I16[2] + I16[3]), SSATURATE16(I16[4] + I16[5]),
+ SSATURATE16(I16[6] + I16[7]), SSATURATE16(v.I16[0] + v.I16[1]), SSATURATE16(v.I16[2] + v.I16[3]),
+ SSATURATE16(v.I16[4] + v.I16[5]), SSATURATE16(v.I16[6] + v.I16[7]));
+ }
+
+ GSVector4i addus8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8(U8[i] + v.U8[i])); }
+
+ GSVector4i addus16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16(U16[i] + v.U16[i])); }
+
+ GSVector4i sub8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = I8[i] - v.I8[i]); }
+
+ GSVector4i sub16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] - v.I16[i]); }
+
+ GSVector4i sub32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] - v.I32[i]); }
+
+ GSVector4i subs8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I8[i] - v.I8[i])); }
+
+ GSVector4i subs16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I16[i] - v.I16[i])); }
+
+ GSVector4i subus8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8(U8[i] - v.U8[i])); }
+
+ GSVector4i subus16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16(U16[i] - v.U16[i])); }
+
+ // Rounding averages, matching SSE PAVGB/PAVGW.
+ GSVector4i avg8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = (U8[i] + v.U8[i] + 1) >> 1); }
+
+ GSVector4i avg16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = (U16[i] + v.U16[i] + 1) >> 1); }
+
+ GSVector4i mul16hs(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] * v.I16[i]) >> 16); }
+
+ GSVector4i mul16hu(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = (U16[i] * v.U16[i]) >> 16); }
+
+ GSVector4i mul16l(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] * v.I16[i]); }
+
+ GSVector4i mul16hrs(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (((I16[i] * v.I16[i]) >> 14) + 1) >> 1); }
+
+ GSVector4i mul32l(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] * v.I32[i]); }
+
+ template <s32 shift>
+ ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const
+ {
+ // (a - this) * f << shift + this
+
+ return add16(a.sub16(*this).modulate16<shift>(f));
+ }
+
+ template <s32 shift>
+ ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c)
+ {
+ // (a - b) * c << shift
+
+ return a.sub16(b).modulate16<shift>(c);
+ }
+
+ template <s32 shift>
+ ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c,
+ const GSVector4i& d)
+ {
+ // (a - b) * c << shift + d
+
+ return d.add16(a.sub16(b).modulate16<shift>(c));
+ }
+
+ ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a_, const GSVector4i& f) const
+ {
+ // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit)
+
+ return add16(a_.sub16(*this).mul16l(f).sra16<4>());
+ }
+
+ template <s32 shift>
+ ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const
+ {
+ // a * f << shift
+ if constexpr (shift == 0)
+ {
+ return mul16hrs(f);
+ }
+
+ return sll16<shift>().mul16hs(f);
+ }
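+
+ // Worked example: with a lane value of 100 and f = 0x4000 (0.5 in Q15),
+ // modulate16<1>(f) computes ((100 << 1) * 0x4000) >> 16 == 50, and the
+ // shift == 0 path reaches the same result via the rounding high multiply.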
+
+ ALWAYS_INLINE bool eq(const GSVector4i& v) const { return (std::memcmp(I32, v.I32, sizeof(I32))) == 0; }
+
+ GSVector4i eq8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] == v.I8[i]) ? -1 : 0); }
+ GSVector4i eq16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] == v.I16[i]) ? -1 : 0); }
+ GSVector4i eq32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] == v.I32[i]) ? -1 : 0); }
+ GSVector4i eq64(const GSVector4i& v) const { ALL_LANES_64(ret.I64[i] = (I64[i] == v.I64[i]) ? -1 : 0); }
+
+ GSVector4i neq8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] != v.I8[i]) ? -1 : 0); }
+ GSVector4i neq16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] != v.I16[i]) ? -1 : 0); }
+ GSVector4i neq32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] != v.I32[i]) ? -1 : 0); }
+
+ GSVector4i gt8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] > v.I8[i]) ? -1 : 0); }
+ GSVector4i gt16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] > v.I16[i]) ? -1 : 0); }
+ GSVector4i gt32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] > v.I32[i]) ? -1 : 0); }
+
+ GSVector4i ge8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] >= v.I8[i]) ? -1 : 0); }
+ GSVector4i ge16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] >= v.I16[i]) ? -1 : 0); }
+ GSVector4i ge32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] >= v.I32[i]) ? -1 : 0); }
+
+ GSVector4i lt8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] < v.I8[i]) ? -1 : 0); }
+ GSVector4i lt16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] < v.I16[i]) ? -1 : 0); }
+ GSVector4i lt32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] < v.I32[i]) ? -1 : 0); }
+
+ GSVector4i le8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] <= v.I8[i]) ? -1 : 0); }
+ GSVector4i le16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] <= v.I16[i]) ? -1 : 0); }
+ GSVector4i le32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] <= v.I32[i]) ? -1 : 0); }
+
+ ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { ALL_LANES_64(ret.U64[i] = (~v.U64[i]) & U64[i]); }
+
+ s32 mask() const
+ {
+ return static_cast<s32>((static_cast<u32>(U8[0] >> 7) << 0) | (static_cast<u32>(U8[1] >> 7) << 1) |
+ (static_cast<u32>(U8[2] >> 7) << 2) | (static_cast<u32>(U8[3] >> 7) << 3) |
+ (static_cast<u32>(U8[4] >> 7) << 4) | (static_cast<u32>(U8[5] >> 7) << 5) |
+ (static_cast<u32>(U8[6] >> 7) << 6) | (static_cast<u32>(U8[7] >> 7) << 7) |
+ (static_cast<u32>(U8[8] >> 7) << 8) | (static_cast<u32>(U8[9] >> 7) << 9) |
+ (static_cast<u32>(U8[10] >> 7) << 10) | (static_cast<u32>(U8[11] >> 7) << 11) |
+ (static_cast<u32>(U8[12] >> 7) << 12) | (static_cast<u32>(U8[13] >> 7) << 13) |
+ (static_cast<u32>(U8[14] >> 7) << 14) | (static_cast<u32>(U8[15] >> 7) << 15));
+ }
+
+ ALWAYS_INLINE bool alltrue() const { return ((U64[0] & U64[1]) == 0xFFFFFFFFFFFFFFFFULL); }
+
+ ALWAYS_INLINE bool allfalse() const { return ((U64[0] | U64[1]) == 0); }
+
+ template <s32 i>
+ ALWAYS_INLINE GSVector4i insert8(s32 a) const
+ {
+ GSVector4i ret = *this;
+ ret.I8[i] = static_cast(a);
+ return ret;
+ }
+
+ template <s32 i>
+ ALWAYS_INLINE s32 extract8() const
+ {
+ return I8[i];
+ }
+
+ template <s32 i>
+ ALWAYS_INLINE GSVector4i insert16(s32 a) const
+ {
+ GSVector4i ret = *this;
+ ret.I16[i] = static_cast(a);
+ return ret;
+ }
+
+ template <s32 i>
+ ALWAYS_INLINE s32 extract16() const
+ {
+ return I16[i];
+ }
+
+ template <s32 i>
+ ALWAYS_INLINE GSVector4i insert32(s32 a) const
+ {
+ GSVector4i ret = *this;
+ ret.I32[i] = a;
+ return ret;
+ }
+
+ template <s32 i>
+ ALWAYS_INLINE s32 extract32() const
+ {
+ return I32[i];
+ }
+
+ template <s32 i>
+ ALWAYS_INLINE GSVector4i insert64(s64 a) const
+ {
+ GSVector4i ret = *this;
+ ret.I64[i] = a;
+ return ret;
+ }
+
+ template <s32 i>
+ ALWAYS_INLINE s64 extract64() const
+ {
+ return I64[i];
+ }
+
+ ALWAYS_INLINE static GSVector4i loadnt(const void* p)
+ {
+ GSVector4i ret;
+ std::memcpy(&ret, p, sizeof(ret.I32));
+ return ret;
+ }
+
+ ALWAYS_INLINE static GSVector4i load32(const void* p)
+ {
+ GSVector4i ret;
+ std::memcpy(&ret.x, p, sizeof(s32));
+ ret.y = 0;
+ ret.z = 0;
+ ret.w = 0;
+ return ret;
+ }
+
+ ALWAYS_INLINE static GSVector4i loadl(const void* p)
+ {
+ GSVector4i ret;
+ std::memcpy(&ret.U64[0], p, sizeof(ret.U64[0]));
+ ret.U64[1] = 0;
+ return ret;
+ }
+
+ ALWAYS_INLINE static GSVector4i loadh(const void* p)
+ {
+ GSVector4i ret;
+ ret.U64[0] = 0;
+ std::memcpy(&ret.U64[1], p, sizeof(ret.U64[1]));
+ return ret;
+ }
+
+ ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return loadh(&v); }
+
+ template <bool aligned>
+ ALWAYS_INLINE static GSVector4i load(const void* p)
+ {
+ GSVector4i ret;
+ std::memcpy(ret.I32, p, sizeof(ret.I32));
+ return ret;
+ }
+
+ ALWAYS_INLINE static GSVector4i load(s32 i)
+ {
+ GSVector4i ret;
+ ret.x = i;
+ ret.y = 0;
+ ret.z = 0;
+ ret.w = 0;
+ return ret;
+ }
+
+ ALWAYS_INLINE static GSVector4i loadq(s64 i)
+ {
+ GSVector4i ret;
+ ret.I64[0] = i;
+ ret.I64[1] = 0;
+ return ret;
+ }
+
+ ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { std::memcpy(p, v.I32, sizeof(v.I32)); }
+
+ ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { std::memcpy(p, &v.I32[0], sizeof(s32) * 2); }
+
+ ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) { std::memcpy(p, &v.I32[2], sizeof(s32) * 2); }
+
+ ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v)
+ {
+ GSVector4i::storel(pl, v);
+ GSVector4i::storeh(ph, v);
+ }
+
+ template <bool aligned>
+ ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
+ {
+ std::memcpy(p, v.I32, sizeof(I32));
+ }
+
+ ALWAYS_INLINE static void store32(void* p, const GSVector4i& v) { std::memcpy(p, &v.x, sizeof(s32)); }
+
+ ALWAYS_INLINE static s32 store(const GSVector4i& v) { return v.x; }
+
+ ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return v.I64[0]; }
+
+ ALWAYS_INLINE void operator&=(const GSVector4i& v)
+ {
+ U64[0] &= v.U64[0];
+ U64[1] &= v.U64[1];
+ }
+ ALWAYS_INLINE void operator|=(const GSVector4i& v)
+ {
+ U64[0] |= v.U64[0];
+ U64[1] |= v.U64[1];
+ }
+ ALWAYS_INLINE void operator^=(const GSVector4i& v)
+ {
+ U64[0] ^= v.U64[0];
+ U64[1] ^= v.U64[1];
+ }
+
+ ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2)
+ {
+ GSVector4i ret;
+ ret.U64[0] = v1.U64[0] & v2.U64[0];
+ ret.U64[1] = v1.U64[1] & v2.U64[1];
+ return ret;
+ }
+
+ ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2)
+ {
+ GSVector4i ret;
+ ret.U64[0] = v1.U64[0] | v2.U64[0];
+ ret.U64[1] = v1.U64[1] | v2.U64[1];
+ return ret;
+ }
+
+ ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2)
+ {
+ GSVector4i ret;
+ ret.U64[0] = v1.U64[0] ^ v2.U64[0];
+ ret.U64[1] = v1.U64[1] ^ v2.U64[1];
+ return ret;
+ }
+
+ ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, s32 i) { return v & GSVector4i(i); }
+
+ ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, s32 i) { return v | GSVector4i(i); }
+
+ ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, s32 i) { return v ^ GSVector4i(i); }
+
+ ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return v ^ v.eq32(v); }
+
+ ALWAYS_INLINE static constexpr GSVector4i zero() { return GSVector4i::cxpr(0, 0, 0, 0); }
+
+ ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); }
+
+ ALWAYS_INLINE GSVector2i xy() const
+ {
+ GSVector2i ret;
+ storel(&ret, *this);
+ return ret;
+ }
+
+ ALWAYS_INLINE GSVector2i zw() const
+ {
+ GSVector2i ret;
+ storeh(&ret, *this);
+ return ret;
+ }
+
+ // clang-format off
+ // l/h/lh not implemented until needed
+
+#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
+ ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const {return GSVector4i(I32[xn], I32[yn], I32[zn], I32[wn]);}
+
+#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
+ VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
+ VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
+ VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
+ VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
+
+#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \
+ VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
+ VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
+ VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
+ VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
+
+#define VECTOR4i_SHUFFLE_1(xs, xn) \
+ VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \
+ VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \
+ VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \
+ VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \
+
+ VECTOR4i_SHUFFLE_1(x, 0)
+ VECTOR4i_SHUFFLE_1(y, 1)
+ VECTOR4i_SHUFFLE_1(z, 2)
+ VECTOR4i_SHUFFLE_1(w, 3)
+
+ // clang-format on
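+
+ // The macros above expand to every four-component swizzle: xyxy() returns
+ // (x, y, x, y), zwxy() returns (z, w, x, y), and so on. The rect helpers
+ // rely on them, e.g. rsize() == sub32(xyxy()).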
+};
+
+class alignas(16) GSVector4
+{
+ struct cxpr_init_tag
+ {
+ };
+ static constexpr cxpr_init_tag cxpr_init{};
+
+ constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {}
+
+ constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {}
+
+ constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}
+
+public:
+ union
+ {
+ struct
+ {
+ float x, y, z, w;
+ };
+ struct
+ {
+ float r, g, b, a;
+ };
+ struct
+ {
+ float left, top, right, bottom;
+ };
+ float F32[4];
+ double F64[2];
+ s8 I8[16];
+ s16 I16[8];
+ s32 I32[4];
+ s64 I64[2];
+ u8 U8[16];
+ u16 U16[8];
+ u32 U32[4];
+ u64 U64[2];
+ };
+
+ GSVector4() = default;
+
+ constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); }
+
+ constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); }
+
+ constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); }
+
+ constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); }
+
+ constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); }
+
+ constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }
+
+ ALWAYS_INLINE GSVector4(float x, float y, float z, float w)
+ {
+ this->x = x;
+ this->y = y;
+ this->z = z;
+ this->w = w;
+ }
+
+ ALWAYS_INLINE GSVector4(float x, float y)
+ {
+ this->x = x;
+ this->y = y;
+ this->z = 0.0f;
+ this->w = 0.0f;
+ }
+
+ ALWAYS_INLINE GSVector4(int x, int y, int z, int w)
+ {
+ this->x = static_cast<float>(x);
+ this->y = static_cast<float>(y);
+ this->z = static_cast<float>(z);
+ this->w = static_cast<float>(w);
+ }
+
+ ALWAYS_INLINE GSVector4(int x, int y)
+ {
+ this->x = static_cast<float>(x);
+ this->y = static_cast<float>(y);
+ this->z = 0.0f;
+ this->w = 0.0f;
+ }
+
+ ALWAYS_INLINE explicit GSVector4(const GSVector2& v)
+ {
+ x = v.x;
+ y = v.y;
+ z = 0.0f;
+ w = 0.0f;
+ }
+
+ ALWAYS_INLINE explicit GSVector4(const GSVector2i& v)
+ {
+ x = static_cast<float>(v.x);
+ y = static_cast<float>(v.y);
+ z = 0.0f;
+ w = 0.0f;
+ }
+
+ ALWAYS_INLINE explicit GSVector4(float f) { x = y = z = w = f; }
+
+ ALWAYS_INLINE explicit GSVector4(int i) { x = y = z = w = static_cast<float>(i); }
+
+ ALWAYS_INLINE explicit GSVector4(const GSVector4i& v);
+
+ ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
+
+ ALWAYS_INLINE static GSVector4 f64(double x, double y)
+ {
+ GSVector4 ret;
+ ret.F64[0] = x;
+ ret.F64[1] = y;
+ return ret;
+ }
+
+ ALWAYS_INLINE void operator=(float f) { x = y = z = w = f; }
+
+ ALWAYS_INLINE GSVector4 noopt() { return *this; }
+
+ u32 rgba32() const { return GSVector4i(*this).rgba32(); }
+
+ ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); }
+
+ ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); }
+
+ GSVector4 abs() const { return GSVector4(std::fabs(x), std::fabs(y), std::fabs(z), std::fabs(w)); }
+
+ GSVector4 neg() const { return GSVector4(-x, -y, -z, -w); }
+
+ GSVector4 rcp() const { return GSVector4(1.0f / x, 1.0f / y, 1.0f / z, 1.0f / w); }
+
+ GSVector4 rcpnr() const
+ {
+ GSVector4 v_ = rcp();
+
+ return (v_ + v_) - (v_ * v_) * *this;
+ }
+
+ GSVector4 floor() const { return GSVector4(std::floor(x), std::floor(y), std::floor(z), std::floor(w)); }
+
+ GSVector4 ceil() const { return GSVector4(std::ceil(x), std::ceil(y), std::ceil(z), std::ceil(w)); }
+
+ GSVector4 madd(const GSVector4& a_, const GSVector4& b_) const { return *this * a_ + b_; }
+
+ GSVector4 msub(const GSVector4& a_, const GSVector4& b_) const { return *this * a_ - b_; }
+
+ GSVector4 nmadd(const GSVector4& a_, const GSVector4& b_) const { return b_ - *this * a_; }
+
+ GSVector4 nmsub(const GSVector4& a_, const GSVector4& b_) const { return -b_ - *this * a_; }
+
+ GSVector4 addm(const GSVector4& a_, const GSVector4& b_) const
+ {
+ return a_.madd(b_, *this); // *this + a * b
+ }
+
+ GSVector4 subm(const GSVector4& a_, const GSVector4& b_) const
+ {
+ return a_.nmadd(b_, *this); // *this - a * b
+ }
+
+ GSVector4 hadd() const { return GSVector4(x + y, z + w, x + y, z + w); }
+
+ GSVector4 hadd(const GSVector4& v) const { return GSVector4(x + y, z + w, v.x + v.y, v.z + v.w); }
+
+ GSVector4 hsub() const { return GSVector4(x - y, z - w, x - y, z - w); }
+
+ GSVector4 hsub(const GSVector4& v) const { return GSVector4(x - y, z - w, v.x - v.y, v.z - v.w); }
+
+ template <s32 i>
+ GSVector4 dp(const GSVector4& v) const
+ {
+ float res = 0.0f;
+ if constexpr (i & 0x10)
+ res += x * v.x;
+ if constexpr (i & 0x20)
+ res += y * v.y;
+ if constexpr (i & 0x40)
+ res += z * v.z;
+ if constexpr (i & 0x80)
+ res += w * v.w;
+ return GSVector4((i & 0x01) ? res : 0.0f, (i & 0x02) ? res : 0.0f, (i & 0x04) ? res : 0.0f,
+ (i & 0x08) ? res : 0.0f);
+ }
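+
+ // dp() mirrors the SSE4.1 DPPS immediate: the high nibble of i selects which
+ // products enter the sum and the low nibble selects the output lanes, so
+ // dp<0x71>(v) sums the x, y and z products and writes the result to x only.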
+
+ GSVector4 sat(const GSVector4& min, const GSVector4& max) const
+ {
+ return GSVector4(std::clamp(x, min.x, max.x), std::clamp(y, min.y, max.y), std::clamp(z, min.z, max.z),
+ std::clamp(w, min.w, max.w));
+ }
+
+ GSVector4 sat(const GSVector4& v) const
+ {
+ return GSVector4(std::clamp(x, v.x, v.z), std::clamp(y, v.y, v.w), std::clamp(z, v.x, v.z),
+ std::clamp(w, v.y, v.w));
+ }
+
+ GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); }
+
+ GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); }
+
+ GSVector4 min(const GSVector4& v) const
+ {
+ return GSVector4(std::min(x, v.x), std::min(y, v.y), std::min(z, v.z), std::min(w, v.w));
+ }
+
+ GSVector4 max(const GSVector4& v) const
+ {
+ return GSVector4(std::max(x, v.x), std::max(y, v.y), std::max(z, v.z), std::max(w, v.w));
+ }
+
+ template <s32 mask>
+ GSVector4 blend32(const GSVector4& v) const
+ {
+ return GSVector4((mask & 1) ? v.x : x, (mask & 2) ? v.y : y, (mask & 4) ? v.z : z, (mask & 8) ? v.w : w);
+ }
+
+ ALWAYS_INLINE GSVector4 blend32(const GSVector4& v, const GSVector4& mask) const
+ {
+ return GSVector4((mask.U32[0] & 0x80000000u) ? v.x : x, (mask.U32[1] & 0x80000000u) ? v.y : y,
+ (mask.U32[2] & 0x80000000u) ? v.z : z, (mask.U32[3] & 0x80000000u) ? v.w : w);
+ }
+
+ GSVector4 upl(const GSVector4& v) const { return GSVector4(x, y, v.x, v.y); }
+
+ GSVector4 uph(const GSVector4& v) const { return GSVector4(z, w, v.z, v.w); }
+
+ GSVector4 upld(const GSVector4& v) const
+ {
+ GSVector4 ret;
+ ret.U64[0] = U64[0];
+ ret.U64[1] = v.U64[0];
+ return ret;
+ }
+
+ GSVector4 uphd(const GSVector4& v) const
+ {
+ GSVector4 ret;
+ ret.U64[0] = U64[1];
+ ret.U64[1] = v.U64[1];
+ return ret;
+ }
+
+ ALWAYS_INLINE GSVector4 l2h(const GSVector4& v) const { return GSVector4(x, y, v.x, v.y); }
+
+ ALWAYS_INLINE GSVector4 h2l(const GSVector4& v) const { return GSVector4(v.z, v.w, z, w); }
+
+ ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const
+ {
+ GSVector4 ret;
+ ret.U32[0] = ((~v.U32[0]) & U32[0]);
+ ret.U32[1] = ((~v.U32[1]) & U32[1]);
+ ret.U32[2] = ((~v.U32[2]) & U32[2]);
+ ret.U32[3] = ((~v.U32[3]) & U32[3]);
+ return ret;
+ }
+
+ ALWAYS_INLINE int mask() const
+ {
+ return (U32[0] >> 31) | ((U32[1] >> 30) & 2) | ((U32[2] >> 29) & 4) | ((U32[3] >> 28) & 8);
+ }
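+
+ // As with GSVector4i::mask(), this packs the four sign bits into the low
+ // nibble; all-ones in lanes 0 and 2 gives 0b0101 == 5.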
+
+ ALWAYS_INLINE bool alltrue() const { return ((U64[0] & U64[1]) == 0xFFFFFFFFFFFFFFFFULL); }
+
+ ALWAYS_INLINE bool allfalse() const { return ((U64[0] | U64[1]) == 0); }
+
+ ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); }
+
+ template <s32 dst, s32 src>
+ ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const
+ {
+ GSVector4 ret = *this;
+ ret.F32[dst] = v.F32[src];
+ return ret;
+ }
+
+ template <s32 i>
+ ALWAYS_INLINE int extract32() const
+ {
+ return I32[i];
+ }
+
+ ALWAYS_INLINE static constexpr GSVector4 zero() { return GSVector4::cxpr(0.0f, 0.0f, 0.0f, 0.0f); }
+
+ ALWAYS_INLINE static constexpr GSVector4 xffffffff() { return cxpr64(0xFFFFFFFFFFFFFFFFULL); }
+
+ ALWAYS_INLINE static GSVector4 loadl(const void* p)
+ {
+ GSVector4 ret;
+ std::memcpy(&ret.x, p, sizeof(float) * 2);
+ ret.z = 0.0f;
+ ret.w = 0.0f;
+ return ret;
+ }
+
+ ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(f, f, f, f); }
+
+ template <bool aligned>
+ ALWAYS_INLINE static GSVector4 load(const void* p)
+ {
+ GSVector4 ret;
+ std::memcpy(&ret.x, p, sizeof(float) * 4);
+ return ret;
+ }
+
+ ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { std::memcpy(p, &v, sizeof(v)); }
+
+ ALWAYS_INLINE static void storel(void* p, const GSVector4& v) { std::memcpy(p, &v.x, sizeof(float) * 2); }
+
+ ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) { std::memcpy(p, &v.z, sizeof(float) * 2); }
+
+ template <bool aligned>
+ ALWAYS_INLINE static void store(void* p, const GSVector4& v)
+ {
+ std::memcpy(p, &v.x, sizeof(float) * 4);
+ }
+
+ ALWAYS_INLINE static void store(float* p, const GSVector4& v) { *p = v.x; }
+
+ ALWAYS_INLINE GSVector4 operator-() const { return neg(); }
+
+ void operator+=(const GSVector4& v_)
+ {
+ x = x + v_.x;
+ y = y + v_.y;
+ z = z + v_.z;
+ w = w + v_.w;
+ }
+ void operator-=(const GSVector4& v_)
+ {
+ x = x - v_.x;
+ y = y - v_.y;
+ z = z - v_.z;
+ w = w - v_.w;
+ }
+ void operator*=(const GSVector4& v_)
+ {
+ x = x * v_.x;
+ y = y * v_.y;
+ z = z * v_.z;
+ w = w * v_.w;
+ }
+ void operator/=(const GSVector4& v_)
+ {
+ x = x / v_.x;
+ y = y / v_.y;
+ z = z / v_.z;
+ w = w / v_.w;
+ }
+
+ void operator+=(const float v_)
+ {
+ x = x + v_;
+ y = y + v_;
+ z = z + v_;
+ w = w + v_;
+ }
+ void operator-=(const float v_)
+ {
+ x = x - v_;
+ y = y - v_;
+ z = z - v_;
+ w = w - v_;
+ }
+ void operator*=(const float v_)
+ {
+ x = x * v_;
+ y = y * v_;
+ z = z * v_;
+ w = w * v_;
+ }
+ void operator/=(const float v_)
+ {
+ x = x / v_;
+ y = y / v_;
+ z = z / v_;
+ w = w / v_;
+ }
+
+ void operator&=(const GSVector4& v_)
+ {
+ U64[0] &= v_.U64[0];
+ U64[1] &= v_.U64[1];
+ }
+ void operator|=(const GSVector4& v_)
+ {
+ U64[0] |= v_.U64[0];
+ U64[1] |= v_.U64[1];
+ }
+ void operator^=(const GSVector4& v_)
+ {
+ U64[0] ^= v_.U64[0];
+ U64[1] ^= v_.U64[1];
+ }
+
+ friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(v1.x + v2.x, v1.y + v2.y, v1.z + v2.z, v1.w + v2.w);
+ }
+
+ friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(v1.x - v2.x, v1.y - v2.y, v1.z - v2.z, v1.w - v2.w);
+ }
+
+ friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(v1.x * v2.x, v1.y * v2.y, v1.z * v2.z, v1.w * v2.w);
+ }
+
+ friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(v1.x / v2.x, v1.y / v2.y, v1.z / v2.z, v1.w / v2.w);
+ }
+
+ friend GSVector4 operator+(const GSVector4& v, float f) { return GSVector4(v.x + f, v.y + f, v.z + f, v.w + f); }
+
+ friend GSVector4 operator-(const GSVector4& v, float f) { return GSVector4(v.x - f, v.y - f, v.z - f, v.w - f); }
+
+ friend GSVector4 operator*(const GSVector4& v, float f) { return GSVector4(v.x * f, v.y * f, v.z * f, v.w * f); }
+
+ friend GSVector4 operator/(const GSVector4& v, float f) { return GSVector4(v.x / f, v.y / f, v.z / f, v.w / f); }
+
+ friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2)
+ {
+ GSVector4 ret;
+ ret.U64[0] = v1.U64[0] & v2.U64[0];
+ ret.U64[1] = v1.U64[1] & v2.U64[1];
+ return ret;
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2)
+ {
+ GSVector4 ret;
+ ret.U64[0] = v1.U64[0] | v2.U64[0];
+ ret.U64[1] = v1.U64[1] | v2.U64[1];
+ return ret;
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2)
+ {
+ GSVector4 ret;
+ ret.U64[0] = v1.U64[0] ^ v2.U64[0];
+ ret.U64[1] = v1.U64[1] ^ v2.U64[1];
+ return ret;
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2)
+ {
+ GSVector4 ret;
+ ret.I32[0] = (v1.x == v2.x) ? -1 : 0;
+ ret.I32[1] = (v1.y == v2.y) ? -1 : 0;
+ ret.I32[2] = (v1.z == v2.z) ? -1 : 0;
+ ret.I32[3] = (v1.w == v2.w) ? -1 : 0;
+ return ret;
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2)
+ {
+ GSVector4 ret;
+ ret.I32[0] = (v1.x != v2.x) ? -1 : 0;
+ ret.I32[1] = (v1.y != v2.y) ? -1 : 0;
+ ret.I32[2] = (v1.z != v2.z) ? -1 : 0;
+ ret.I32[3] = (v1.w != v2.w) ? -1 : 0;
+ return ret;
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2)
+ {
+ GSVector4 ret;
+ ret.I32[0] = (v1.x > v2.x) ? -1 : 0;
+ ret.I32[1] = (v1.y > v2.y) ? -1 : 0;
+ ret.I32[2] = (v1.z > v2.z) ? -1 : 0;
+ ret.I32[3] = (v1.w > v2.w) ? -1 : 0;
+ return ret;
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2)
+ {
+ GSVector4 ret;
+ ret.I32[0] = (v1.x < v2.x) ? -1 : 0;
+ ret.I32[1] = (v1.y < v2.y) ? -1 : 0;
+ ret.I32[2] = (v1.z < v2.z) ? -1 : 0;
+ ret.I32[3] = (v1.w < v2.w) ? -1 : 0;
+ return ret;
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2)
+ {
+ GSVector4 ret;
+ ret.I32[0] = (v1.x >= v2.x) ? -1 : 0;
+ ret.I32[1] = (v1.y >= v2.y) ? -1 : 0;
+ ret.I32[2] = (v1.z >= v2.z) ? -1 : 0;
+ ret.I32[3] = (v1.w >= v2.w) ? -1 : 0;
+ return ret;
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2)
+ {
+ GSVector4 ret;
+ ret.I32[0] = (v1.x <= v2.x) ? -1 : 0;
+ ret.I32[1] = (v1.y <= v2.y) ? -1 : 0;
+ ret.I32[2] = (v1.z <= v2.z) ? -1 : 0;
+ ret.I32[3] = (v1.w <= v2.w) ? -1 : 0;
+ return ret;
+ }
+
+ ALWAYS_INLINE GSVector4 mul64(const GSVector4& v_) const
+ {
+ GSVector4 ret;
+ ret.F64[0] = F64[0] * v_.F64[0];
+ ret.F64[1] = F64[1] * v_.F64[1];
+ return ret;
+ }
+
+ ALWAYS_INLINE GSVector4 add64(const GSVector4& v_) const
+ {
+ GSVector4 ret;
+ ret.F64[0] = F64[0] + v_.F64[0];
+ ret.F64[1] = F64[1] + v_.F64[1];
+ return ret;
+ }
+
+ ALWAYS_INLINE GSVector4 sub64(const GSVector4& v_) const
+ {
+ GSVector4 ret;
+ ret.F64[0] = F64[0] - v_.F64[0];
+ ret.F64[1] = F64[1] - v_.F64[1];
+ return ret;
+ }
+
+ ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_)
+ {
+ GSVector4 ret;
+ ret.F64[0] = v_.x;
+ ret.F64[1] = v_.y;
+ return ret;
+ }
+
+ ALWAYS_INLINE static GSVector4 f32to64(const void* p)
+ {
+ float f[2];
+ std::memcpy(f, p, sizeof(f));
+ GSVector4 ret;
+ ret.F64[0] = f[0];
+ ret.F64[1] = f[1];
+ return ret;
+ }
+
+ ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const
+ {
+ const double r0 = truncate ? F64[0] : std::nearbyint(F64[0]);
+ const double r1 = truncate ? F64[1] : std::nearbyint(F64[1]);
+ return GSVector4i(static_cast<s32>(r0), static_cast<s32>(r1), 0, 0);
+ }
+
+ // clang-format off
+
+#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
+ ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(F32[xn], F32[yn], F32[zn], F32[wn]); } \
+ ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const { return GSVector4(F32[xn], F32[yn], v_.F32[zn], v_.F32[wn]); }
+
+#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
+ VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
+ VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
+ VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
+ VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
+
+#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
+ VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
+ VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
+ VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
+ VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
+
+#define VECTOR4_SHUFFLE_1(xs, xn) \
+ VECTOR4_SHUFFLE_2(xs, xn, x, 0) \
+ VECTOR4_SHUFFLE_2(xs, xn, y, 1) \
+ VECTOR4_SHUFFLE_2(xs, xn, z, 2) \
+ VECTOR4_SHUFFLE_2(xs, xn, w, 3) \
+
+ VECTOR4_SHUFFLE_1(x, 0)
+ VECTOR4_SHUFFLE_1(y, 1)
+ VECTOR4_SHUFFLE_1(z, 2)
+ VECTOR4_SHUFFLE_1(w, 3)
+
+ // clang-format on
+
+ ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(x, x, x, x); }
+
+ ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v) { return GSVector4(v.x, v.x, v.x, v.x); }
+
+ ALWAYS_INLINE static GSVector4 broadcast32(const void* f)
+ {
+ float ff;
+ std::memcpy(&ff, f, sizeof(ff));
+ return GSVector4(ff, ff, ff, ff);
+ }
+
+ ALWAYS_INLINE static GSVector4 broadcast64(const void* d)
+ {
+ GSVector4 ret;
+ std::memcpy(&ret.F64[0], d, sizeof(ret.F64[0]));
+ ret.F64[1] = ret.F64[0];
+ return ret;
+ }
+};
+
+ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate)
+{
+ // static_cast truncates; round to nearest when truncation is not requested.
+ x = truncate ? static_cast<s32>(v.x) : static_cast<s32>(std::nearbyint(v.x));
+ y = truncate ? static_cast<s32>(v.y) : static_cast<s32>(std::nearbyint(v.y));
+ z = truncate ? static_cast<s32>(v.z) : static_cast<s32>(std::nearbyint(v.z));
+ w = truncate ? static_cast<s32>(v.w) : static_cast<s32>(std::nearbyint(v.w));
+}
+
+ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v)
+{
+ x = static_cast(v.x);
+ y = static_cast(v.y);
+ z = static_cast(v.z);
+ w = static_cast(v.w);
+}
+
+ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v)
+{
+ GSVector4i ret;
+ std::memcpy(&ret, &v, sizeof(ret));
+ return ret;
+}
+
+ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v)
+{
+ GSVector4 ret;
+ std::memcpy(&ret, &v, sizeof(ret));
+ return ret;
+}
+
+#undef SSATURATE8
+#undef USATURATE8
+#undef SSATURATE16
+#undef USATURATE16
+#undef ALL_LANES_8
+#undef ALL_LANES_16
+#undef ALL_LANES_32
+#undef ALL_LANES_64
diff --git a/src/common/gsvector_sse.h b/src/common/gsvector_sse.h
new file mode 100644
index 000000000..9975fbae8
--- /dev/null
+++ b/src/common/gsvector_sse.h
@@ -0,0 +1,1322 @@
+// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team, 2019-2024 Connor McLaughlin
+// SPDX-License-Identifier: LGPL-3.0+
+
+#pragma once
+
+#include "common/assert.h"
+#include "common/intrin.h"
+#include "common/types.h"
+
+#include <algorithm>
+
+#ifdef CPU_ARCH_AVX2
+#define GSVECTOR_HAS_UNSIGNED 1
+#define GSVECTOR_HAS_SRLV 1
+#endif
+
+class GSVector4;
+
+class alignas(16) GSVector4i
+{
+ struct cxpr_init_tag
+ {
+ };
+ static constexpr cxpr_init_tag cxpr_init{};
+
+ constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : I32{x, y, z, w} {}
+
+ constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
+ : I16{s0, s1, s2, s3, s4, s5, s6, s7}
+ {
+ }
+
+ constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10,
+ s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
+ : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
+ {
+ }
+
+public:
+ union
+ {
+ struct
+ {
+ s32 x, y, z, w;
+ };
+ struct
+ {
+ s32 r, g, b, a;
+ };
+ struct
+ {
+ s32 left, top, right, bottom;
+ };
+ float F32[4];
+ s8 I8[16];
+ s16 I16[8];
+ s32 I32[4];
+ s64 I64[2];
+ u8 U8[16];
+ u16 U16[8];
+ u32 U32[4];
+ u64 U64[2];
+ __m128i m;
+ };
+
+ GSVector4i() = default;
+
+ ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w)
+ {
+ return GSVector4i(cxpr_init, x, y, z, w);
+ }
+
+ ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); }
+
+ ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); }
+
+ ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
+ {
+ return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7);
+ }
+
+ ALWAYS_INLINE constexpr static GSVector4i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9,
+ s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
+ {
+ return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15);
+ }
+
+ ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w) { m = _mm_set_epi32(w, z, y, x); }
+
+ ALWAYS_INLINE GSVector4i(s32 x, s32 y) { *this = load(x).upl32(load(y)); }
+
+ ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7)
+ {
+ m = _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
+ }
+
+ ALWAYS_INLINE constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10,
+ s8 b11, s8 b12, s8 b13, s8 b14, s8 b15)
+ : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
+ {
+ }
+
+ ALWAYS_INLINE GSVector4i(const GSVector4i& v) { m = v.m; }
+
+ ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) { m = _mm_loadl_epi64((__m128i*)&v); }
+
+ // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7),
+ // so leave the non-constexpr version default
+ ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; }
+
+ ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true);
+
+ ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);
+
+ ALWAYS_INLINE constexpr explicit GSVector4i(__m128i m) : m(m) {}
+
+ ALWAYS_INLINE void operator=(const GSVector4i& v) { m = v.m; }
+ ALWAYS_INLINE void operator=(s32 i) { m = _mm_set1_epi32(i); }
+ ALWAYS_INLINE void operator=(__m128i m_) { m = m_; }
+
+ ALWAYS_INLINE operator __m128i() const { return m; }
+
+ // rect
+
+ ALWAYS_INLINE s32 width() const { return right - left; }
+
+ ALWAYS_INLINE s32 height() const { return bottom - top; }
+
+ ALWAYS_INLINE GSVector4i rsize() const
+ {
+ return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height());
+ }
+
+ ALWAYS_INLINE s32 rarea() const { return width() * height(); }
+
+ ALWAYS_INLINE bool rempty() const { return lt32(zwzw()).mask() != 0x00ff; }
+
+ ALWAYS_INLINE GSVector4i runion(const GSVector4i& v) const { return min_i32(v).upl64(max_i32(v).srl<8>()); }
+
+ ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& v) const { return sat_i32(v); }
+ ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); }
+ ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); }
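+
+ // Usage sketch: a rect is (left, top, right, bottom) in the four s32 lanes, e.g.
+ //   GSVector4i r1(0, 0, 64, 64), r2(32, 32, 128, 128);
+ //   GSVector4i ri = r1.rintersect(r2);   // (32, 32, 64, 64)
+ //   bool hit = r1.rintersects(r2);       // true
+ // rintersect() is just sat_i32(): clamping this rect's corners to the other's bounds yields
+ // max of (left, top) and min of (right, bottom), i.e. the intersection.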
+
+ template <Align_Mode mode>
+ GSVector4i _ralign_helper(const GSVector4i& mask) const
+ {
+ GSVector4i v;
+
+ switch (mode)
+ {
+ case Align_Inside:
+ v = add32(mask);
+ break;
+ case Align_Outside:
+ v = add32(mask.zwxy());
+ break;
+ case Align_NegInf:
+ v = *this;
+ break;
+ case Align_PosInf:
+ v = add32(mask.xyxy());
+ break;
+
+ default:
+ UnreachableCode();
+ break;
+ }
+
+ return v.andnot(mask.xyxy());
+ }
+
+ /// Align the rect using mask values that already have one subtracted ((1 << n) - 1 aligns to 1 << n)
+ template <Align_Mode mode>
+ GSVector4i ralign_presub(const GSVector2i& v) const
+ {
+ return _ralign_helper<mode>(GSVector4i(v));
+ }
+
+ template <Align_Mode mode>
+ GSVector4i ralign(const GSVector2i& v) const
+ {
+ // v must contain only power-of-two values (1 << n)
+
+ return _ralign_helper<mode>(GSVector4i(v).sub32(GSVector4i(1, 1)));
+ }
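+
+ // Worked example: to align to 32x32 blocks pass v = (32, 32), so the helper's mask is (31, 31).
+ // Align_PosInf then computes (x + 31) & ~31 per component, Align_NegInf computes x & ~31,
+ // Align_Outside grows the rect (left/top round down, right/bottom round up), and Align_Inside
+ // shrinks it the opposite way.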
+
+ //
+
+ ALWAYS_INLINE u32 rgba32() const
+ {
+ GSVector4i v = *this;
+
+ v = v.ps32(v);
+ v = v.pu16(v);
+
+ return (u32)store(v);
+ }
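+
+ // e.g. GSVector4i(0x12, 0x34, 0x56, 0x78).rgba32() == 0x78563412: ps32() packs s32 -> s16 with
+ // signed saturation, pu16() packs s16 -> u8 with unsigned saturation, and store() returns the
+ // low four bytes (little-endian).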
+
+ ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& min, const GSVector4i& max) const
+ {
+ return max_i8(min).min_i8(max);
+ }
+ ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& minmax) const
+ {
+ return max_i8(minmax.xyxy()).min_i8(minmax.zwzw());
+ }
+ ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& min, const GSVector4i& max) const
+ {
+ return max_i16(min).min_i16(max);
+ }
+ ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& minmax) const
+ {
+ return max_i16(minmax.xyxy()).min_i16(minmax.zwzw());
+ }
+ ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& min, const GSVector4i& max) const
+ {
+ return max_i32(min).min_i32(max);
+ }
+ ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& minmax) const
+ {
+ return max_i32(minmax.xyxy()).min_i32(minmax.zwzw());
+ }
+
+ ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const
+ {
+ return max_u8(min).min_u8(max);
+ }
+ ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const
+ {
+ return max_u8(minmax.xyxy()).min_u8(minmax.zwzw());
+ }
+ ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const
+ {
+ return max_u16(min).min_u16(max);
+ }
+ ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const
+ {
+ return max_u16(minmax.xyxy()).min_u16(minmax.zwzw());
+ }
+ ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const
+ {
+ return max_u32(min).min_u32(max);
+ }
+ ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const
+ {
+ return max_u32(minmax.xyxy()).min_u32(minmax.zwzw());
+ }
+
+ ALWAYS_INLINE GSVector4i min_i8(const GSVector4i& v) const { return GSVector4i(_mm_min_epi8(m, v)); }
+ ALWAYS_INLINE GSVector4i max_i8(const GSVector4i& v) const { return GSVector4i(_mm_max_epi8(m, v)); }
+ ALWAYS_INLINE GSVector4i min_i16(const GSVector4i& v) const { return GSVector4i(_mm_min_epi16(m, v)); }
+ ALWAYS_INLINE GSVector4i max_i16(const GSVector4i& v) const { return GSVector4i(_mm_max_epi16(m, v)); }
+ ALWAYS_INLINE GSVector4i min_i32(const GSVector4i& v) const { return GSVector4i(_mm_min_epi32(m, v)); }
+ ALWAYS_INLINE GSVector4i max_i32(const GSVector4i& v) const { return GSVector4i(_mm_max_epi32(m, v)); }
+
+ ALWAYS_INLINE GSVector4i min_u8(const GSVector4i& v) const { return GSVector4i(_mm_min_epu8(m, v)); }
+ ALWAYS_INLINE GSVector4i max_u8(const GSVector4i& v) const { return GSVector4i(_mm_max_epu8(m, v)); }
+ ALWAYS_INLINE GSVector4i min_u16(const GSVector4i& v) const { return GSVector4i(_mm_min_epu16(m, v)); }
+ ALWAYS_INLINE GSVector4i max_u16(const GSVector4i& v) const { return GSVector4i(_mm_max_epu16(m, v)); }
+ ALWAYS_INLINE GSVector4i min_u32(const GSVector4i& v) const { return GSVector4i(_mm_min_epu32(m, v)); }
+ ALWAYS_INLINE GSVector4i max_u32(const GSVector4i& v) const { return GSVector4i(_mm_max_epu32(m, v)); }
+
+ ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const { return GSVector4i(_mm_madd_epi16(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i addp_s32() const { return GSVector4i(_mm_hadd_epi32(m, m)); }
+
+ ALWAYS_INLINE s32 minv_s32() const
+ {
+ const __m128i vmin = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2)));
+ return std::min(_mm_extract_epi32(vmin, 0), _mm_extract_epi32(vmin, 1));
+ }
+
+ ALWAYS_INLINE u32 minv_u32() const
+ {
+ const __m128i vmin = _mm_min_epu32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2)));
+ return std::min<u32>(static_cast<u32>(_mm_extract_epi32(vmin, 0)), static_cast<u32>(_mm_extract_epi32(vmin, 1)));
+ }
+
+ ALWAYS_INLINE s32 maxv_s32() const
+ {
+ const __m128i vmax = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2)));
+ return std::max(_mm_extract_epi32(vmax, 0), _mm_extract_epi32(vmax, 1));
+ }
+
+ ALWAYS_INLINE u32 maxv_u32() const
+ {
+ const __m128i vmax = _mm_max_epu32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2)));
+ return std::max<u32>(static_cast<u32>(_mm_extract_epi32(vmax, 0)), static_cast<u32>(_mm_extract_epi32(vmax, 1)));
+ }
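+
+ // The minv/maxv reductions above fold the upper two lanes onto the lower two with one shuffle,
+ // leaving two candidates, then finish the reduction in scalar code on lanes 0 and 1. The
+ // unsigned variants must also compare the final pair as u32, not s32.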
+
+ ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); }
+
+ ALWAYS_INLINE GSVector4i blend8(const GSVector4i& v, const GSVector4i& mask) const
+ {
+ return GSVector4i(_mm_blendv_epi8(m, v, mask));
+ }
+
+ template <s32 mask>
+ ALWAYS_INLINE GSVector4i blend16(const GSVector4i& v) const
+ {
+ return GSVector4i(_mm_blend_epi16(m, v, mask));
+ }
+
+ template <s32 mask>
+ ALWAYS_INLINE GSVector4i blend32(const GSVector4i& v) const
+ {
+#if defined(__AVX2__)
+ return GSVector4i(_mm_blend_epi32(m, v.m, mask));
+#else
+ constexpr s32 bit3 = ((mask & 8) * 3) << 3;
+ constexpr s32 bit2 = ((mask & 4) * 3) << 2;
+ constexpr s32 bit1 = ((mask & 2) * 3) << 1;
+ constexpr s32 bit0 = (mask & 1) * 3;
+ return blend16<bit3 | bit2 | bit1 | bit0>(v);
+#endif
+#endif
+ }
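+
+ // The non-AVX2 fallback widens each bit of the 4-bit epi32 mask into two adjacent bits of an
+ // 8-bit epi16 mask (a set bit times 3 turns 0b1 into 0b11), e.g. blend32<0b0101>(v) lowers to
+ // blend16<0b00110011>(v).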
+
+ ALWAYS_INLINE GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const
+ {
+ return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, v)));
+ }
+
+ ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); }
+
+ ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const { return GSVector4i(_mm_shuffle_epi8(m, mask)); }
+
+ ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const { return GSVector4i(_mm_packs_epi16(m, v)); }
+ ALWAYS_INLINE GSVector4i ps16() const { return GSVector4i(_mm_packs_epi16(m, m)); }
+ ALWAYS_INLINE GSVector4i pu16(const GSVector4i& v) const { return GSVector4i(_mm_packus_epi16(m, v)); }
+ ALWAYS_INLINE GSVector4i pu16() const { return GSVector4i(_mm_packus_epi16(m, m)); }
+ ALWAYS_INLINE GSVector4i ps32(const GSVector4i& v) const { return GSVector4i(_mm_packs_epi32(m, v)); }
+ ALWAYS_INLINE GSVector4i ps32() const { return GSVector4i(_mm_packs_epi32(m, m)); }
+ ALWAYS_INLINE GSVector4i pu32(const GSVector4i& v) const { return GSVector4i(_mm_packus_epi32(m, v)); }
+ ALWAYS_INLINE GSVector4i pu32() const { return GSVector4i(_mm_packus_epi32(m, m)); }
+
+ ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi8(m, v)); }
+ ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi8(m, v)); }
+ ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi16(m, v)); }
+ ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi16(m, v)); }
+ ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi32(m, v)); }
+ ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi32(m, v)); }
+ ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi64(m, v)); }
+ ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi64(m, v)); }
+
+ ALWAYS_INLINE GSVector4i upl8() const { return GSVector4i(_mm_unpacklo_epi8(m, _mm_setzero_si128())); }
+ ALWAYS_INLINE GSVector4i uph8() const { return GSVector4i(_mm_unpackhi_epi8(m, _mm_setzero_si128())); }
+
+ ALWAYS_INLINE GSVector4i upl16() const { return GSVector4i(_mm_unpacklo_epi16(m, _mm_setzero_si128())); }
+ ALWAYS_INLINE GSVector4i uph16() const { return GSVector4i(_mm_unpackhi_epi16(m, _mm_setzero_si128())); }
+
+ ALWAYS_INLINE GSVector4i upl32() const { return GSVector4i(_mm_unpacklo_epi32(m, _mm_setzero_si128())); }
+
+ ALWAYS_INLINE GSVector4i uph32() const { return GSVector4i(_mm_unpackhi_epi32(m, _mm_setzero_si128())); }
+ ALWAYS_INLINE GSVector4i upl64() const { return GSVector4i(_mm_unpacklo_epi64(m, _mm_setzero_si128())); }
+ ALWAYS_INLINE GSVector4i uph64() const { return GSVector4i(_mm_unpackhi_epi64(m, _mm_setzero_si128())); }
+
+#ifdef CPU_ARCH_SSE41
+ ALWAYS_INLINE GSVector4i i8to16() const { return GSVector4i(_mm_cvtepi8_epi16(m)); }
+ ALWAYS_INLINE GSVector4i i8to32() const { return GSVector4i(_mm_cvtepi8_epi32(m)); }
+ ALWAYS_INLINE GSVector4i i8to64() const { return GSVector4i(_mm_cvtepi8_epi64(m)); }
+ ALWAYS_INLINE GSVector4i i16to32() const { return GSVector4i(_mm_cvtepi16_epi32(m)); }
+ ALWAYS_INLINE GSVector4i i16to64() const { return GSVector4i(_mm_cvtepi16_epi64(m)); }
+ ALWAYS_INLINE GSVector4i i32to64() const { return GSVector4i(_mm_cvtepi32_epi64(m)); }
+ ALWAYS_INLINE GSVector4i u8to16() const { return GSVector4i(_mm_cvtepu8_epi16(m)); }
+ ALWAYS_INLINE GSVector4i u8to32() const { return GSVector4i(_mm_cvtepu8_epi32(m)); }
+ ALWAYS_INLINE GSVector4i u8to64() const { return GSVector4i(_mm_cvtepu8_epi64(m)); }
+ ALWAYS_INLINE GSVector4i u16to32() const { return GSVector4i(_mm_cvtepu16_epi32(m)); }
+ ALWAYS_INLINE GSVector4i u16to64() const { return GSVector4i(_mm_cvtepu16_epi64(m)); }
+ ALWAYS_INLINE GSVector4i u32to64() const { return GSVector4i(_mm_cvtepu32_epi64(m)); }
+#endif
+
+ template <s32 i>
+ ALWAYS_INLINE GSVector4i srl() const
+ {
+ return GSVector4i(_mm_srli_si128(m, i));
+ }
+
+ template <s32 i>
+ ALWAYS_INLINE GSVector4i srl(const GSVector4i& v) const
+ {
+ return GSVector4i(_mm_alignr_epi8(v.m, m, i));
+ }
+
+ template <s32 i>
+ ALWAYS_INLINE GSVector4i sll() const
+ {
+ return GSVector4i(_mm_slli_si128(m, i));
+ }
+
+ template <s32 i>
+ ALWAYS_INLINE GSVector4i sll16() const
+ {
+ return GSVector4i(_mm_slli_epi16(m, i));
+ }
+
+ ALWAYS_INLINE GSVector4i sll16(s32 i) const { return GSVector4i(_mm_sll_epi16(m, _mm_cvtsi32_si128(i))); }
+
+#ifdef CPU_ARCH_AVX2
+ ALWAYS_INLINE GSVector4i sllv16(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi16(m, v.m)); }
+#endif
+
+ template <s32 i>
+ ALWAYS_INLINE GSVector4i srl16() const
+ {
+ return GSVector4i(_mm_srli_epi16(m, i));
+ }
+
+ ALWAYS_INLINE GSVector4i srl16(s32 i) const { return GSVector4i(_mm_srl_epi16(m, _mm_cvtsi32_si128(i))); }
+
+#ifdef CPU_ARCH_AVX2
+ ALWAYS_INLINE GSVector4i srlv16(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi16(m, v.m)); }
+#endif
+
+ template <s32 i>
+ ALWAYS_INLINE GSVector4i sra16() const
+ {
+ return GSVector4i(_mm_srai_epi16(m, i));
+ }
+
+ ALWAYS_INLINE GSVector4i sra16(s32 i) const { return GSVector4i(_mm_sra_epi16(m, _mm_cvtsi32_si128(i))); }
+
+#ifdef CPU_ARCH_AVX2
+ ALWAYS_INLINE GSVector4i srav16(const GSVector4i& v) const { return GSVector4i(_mm_srav_epi16(m, v.m)); }
+#endif
+
+ template <s32 i>
+ ALWAYS_INLINE GSVector4i sll32() const
+ {
+ return GSVector4i(_mm_slli_epi32(m, i));
+ }
+
+ ALWAYS_INLINE GSVector4i sll32(s32 i) const { return GSVector4i(_mm_sll_epi32(m, _mm_cvtsi32_si128(i))); }
+
+#ifdef CPU_ARCH_AVX2
+ ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi32(m, v.m)); }
+#endif
+
+ template <s32 i>
+ ALWAYS_INLINE GSVector4i srl32() const
+ {
+ return GSVector4i(_mm_srli_epi32(m, i));
+ }
+
+ ALWAYS_INLINE GSVector4i srl32(s32 i) const { return GSVector4i(_mm_srl_epi32(m, _mm_cvtsi32_si128(i))); }
+
+#ifdef CPU_ARCH_AVX2
+ ALWAYS_INLINE GSVector4i srlv32(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi32(m, v.m)); }
+#endif
+
+ template <s32 i>
+ ALWAYS_INLINE GSVector4i sra32() const
+ {
+ return GSVector4i(_mm_srai_epi32(m, i));
+ }
+
+ ALWAYS_INLINE GSVector4i sra32(s32 i) const { return GSVector4i(_mm_sra_epi32(m, _mm_cvtsi32_si128(i))); }
+
+#ifdef CPU_ARCH_AVX2
+ ALWAYS_INLINE GSVector4i srav32(const GSVector4i& v) const { return GSVector4i(_mm_srav_epi32(m, v.m)); }
+#endif
+
+ template <s32 i>
+ ALWAYS_INLINE GSVector4i sll64() const
+ {
+ return GSVector4i(_mm_slli_epi64(m, i));
+ }
+
+ ALWAYS_INLINE GSVector4i sll64(s32 i) const { return GSVector4i(_mm_sll_epi64(m, _mm_cvtsi32_si128(i))); }
+
+#ifdef CPU_ARCH_AVX2
+ ALWAYS_INLINE GSVector4i sllv64(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi64(m, v.m)); }
+#endif
+
+ template <s32 i>
+ ALWAYS_INLINE GSVector4i srl64() const
+ {
+ return GSVector4i(_mm_srli_epi64(m, i));
+ }
+
+ ALWAYS_INLINE GSVector4i srl64(s32 i) const { return GSVector4i(_mm_srl_epi64(m, _mm_cvtsi32_si128(i))); }
+
+#ifdef CPU_ARCH_AVX2
+ ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi64(m, v.m)); }
+#endif
+
+ template <s32 i>
+ ALWAYS_INLINE GSVector4i sra64() const
+ {
+ return GSVector4i(_mm_srai_epi64(m, i));
+ }
+
+ ALWAYS_INLINE GSVector4i sra64(s32 i) const { return GSVector4i(_mm_sra_epi64(m, _mm_cvtsi32_si128(i))); }
+
+#ifdef CPU_ARCH_AVX2
+ ALWAYS_INLINE GSVector4i srav64(const GSVector4i& v) const { return GSVector4i(_mm_srav_epi64(m, v.m)); }
+#endif
+
+ ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const { return GSVector4i(_mm_add_epi8(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const { return GSVector4i(_mm_add_epi16(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return GSVector4i(_mm_add_epi32(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi8(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi16(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const { return GSVector4i(_mm_hadds_epi16(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu8(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu16(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi8(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi16(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi32(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const { return GSVector4i(_mm_subs_epi8(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epi16(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu8(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu16(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i avg8(const GSVector4i& v) const { return GSVector4i(_mm_avg_epu8(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i avg16(const GSVector4i& v) const { return GSVector4i(_mm_avg_epu16(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const { return GSVector4i(_mm_mulhi_epi16(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i mul16hu(const GSVector4i& v) const { return GSVector4i(_mm_mulhi_epu16(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi16(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const { return GSVector4i(_mm_mulhrs_epi16(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi32(m, v.m)); }
+
+ template <s32 shift>
+ ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const
+ {
+ // (a - this) * f << shift + this
+
+ return add16(a.sub16(*this).modulate16(f));
+ }
+
+ template <s32 shift>
+ ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c)
+ {
+ // (a - b) * c << shift
+
+ return a.sub16(b).modulate16(c);
+ }
+
+ template <s32 shift>
+ ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c,
+ const GSVector4i& d)
+ {
+ // (a - b) * c << shift + d
+
+ return d.add16(a.sub16(b).modulate16(c));
+ }
+
+ ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a_, const GSVector4i& f) const
+ {
+ // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit)
+
+ return add16(a_.sub16(*this).mul16l(f).sra16<4>());
+ }
+
+ template <s32 shift>
+ ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const
+ {
+ // a * f << shift
+ if constexpr (shift == 0)
+ {
+ return mul16hrs(f);
+ }
+
+ return sll16<shift>().mul16hs(f);
+ }
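+
+ // Fixed-point sketch: modulate16<shift>(f) computes ((a << shift) * f) >> 16 per s16 lane, i.e.
+ // f is a factor with (16 - shift) fractional bits; the shift == 0 path uses the rounding Q15
+ // multiply mul16hrs. lerp16<shift>(a, f) is then this + (a - this) * f, a standard fixed-point
+ // linear interpolation.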
+
+ ALWAYS_INLINE bool eq(const GSVector4i& v) const
+ {
+ // pxor, ptest, je
+
+ GSVector4i t = *this ^ v;
+
+ return _mm_testz_si128(t, t) != 0;
+ }
+
+ ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi8(m, v.m)); }
+ ALWAYS_INLINE GSVector4i eq16(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi16(m, v.m)); }
+ ALWAYS_INLINE GSVector4i eq32(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi32(m, v.m)); }
+ ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi64(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); }
+ ALWAYS_INLINE GSVector4i neq16(const GSVector4i& v) const { return ~eq16(v); }
+ ALWAYS_INLINE GSVector4i neq32(const GSVector4i& v) const { return ~eq32(v); }
+
+ ALWAYS_INLINE GSVector4i gt8(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi8(m, v.m)); }
+ ALWAYS_INLINE GSVector4i gt16(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi16(m, v.m)); }
+ ALWAYS_INLINE GSVector4i gt32(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi32(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i ge8(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi8(m, v.m)); }
+ ALWAYS_INLINE GSVector4i ge16(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi16(m, v.m)); }
+ ALWAYS_INLINE GSVector4i ge32(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi32(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i lt8(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi8(m, v.m)); }
+ ALWAYS_INLINE GSVector4i lt16(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi16(m, v.m)); }
+ ALWAYS_INLINE GSVector4i lt32(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi32(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i le8(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi8(m, v.m)); }
+ ALWAYS_INLINE GSVector4i le16(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi16(m, v.m)); }
+ ALWAYS_INLINE GSVector4i le32(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi32(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { return GSVector4i(_mm_andnot_si128(v.m, m)); }
+
+ ALWAYS_INLINE s32 mask() const { return _mm_movemask_epi8(m); }
+
+ ALWAYS_INLINE bool alltrue() const { return mask() == 0xffff; }
+
+ ALWAYS_INLINE bool allfalse() const { return _mm_testz_si128(m, m) != 0; }
+
+ template <s32 i>
+ ALWAYS_INLINE GSVector4i insert8(s32 a) const
+ {
+ return GSVector4i(_mm_insert_epi8(m, a, i));
+ }
+
+ template <s32 i>
+ ALWAYS_INLINE s32 extract8() const
+ {
+ return _mm_extract_epi8(m, i);
+ }
+
+ template <s32 i>
+ ALWAYS_INLINE GSVector4i insert16(s32 a) const
+ {
+ return GSVector4i(_mm_insert_epi16(m, a, i));
+ }
+
+ template <s32 i>
+ ALWAYS_INLINE s32 extract16() const
+ {
+ return _mm_extract_epi16(m, i);
+ }
+
+ template <s32 i>
+ ALWAYS_INLINE GSVector4i insert32(s32 a) const
+ {
+ return GSVector4i(_mm_insert_epi32(m, a, i));
+ }
+
+ template <s32 i>
+ ALWAYS_INLINE s32 extract32() const
+ {
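+ // lane 0 can come straight from movd (store()) instead of the slower pextrd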
+ if constexpr (i == 0)
+ return GSVector4i::store(*this);
+
+ return _mm_extract_epi32(m, i);
+ }
+
+ template <s32 i>
+ ALWAYS_INLINE GSVector4i insert64(s64 a) const
+ {
+ return GSVector4i(_mm_insert_epi64(m, a, i));
+ }
+
+ template <s32 i>
+ ALWAYS_INLINE s64 extract64() const
+ {
+ if constexpr (i == 0)
+ return GSVector4i::storeq(*this);
+
+ return _mm_extract_epi64(m, i);
+ }
+
+ ALWAYS_INLINE static GSVector4i loadnt(const void* p) { return GSVector4i(_mm_stream_load_si128((__m128i*)p)); }
+
+ ALWAYS_INLINE static GSVector4i load32(const void* p) { return GSVector4i(_mm_loadu_si32(p)); }
+
+ ALWAYS_INLINE static GSVector4i loadl(const void* p) { return GSVector4i(_mm_loadl_epi64((__m128i*)p)); }
+
+ ALWAYS_INLINE static GSVector4i loadh(const void* p)
+ {
+ return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), (__m64*)p)));
+ }
+
+ ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return loadh(&v); }
+
+ template <bool aligned>
+ ALWAYS_INLINE static GSVector4i load(const void* p)
+ {
+ return GSVector4i(aligned ? _mm_load_si128((__m128i*)p) : _mm_loadu_si128((__m128i*)p));
+ }
+
+ ALWAYS_INLINE static GSVector4i load(s32 i) { return GSVector4i(_mm_cvtsi32_si128(i)); }
+
+ ALWAYS_INLINE static GSVector4i loadq(s64 i) { return GSVector4i(_mm_cvtsi64_si128(i)); }
+
+ ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { _mm_stream_si128((__m128i*)p, v.m); }
+
+ ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { _mm_storel_epi64((__m128i*)p, v.m); }
+
+ ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) { _mm_storeh_pi((__m64*)p, _mm_castsi128_ps(v.m)); }
+
+ ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v)
+ {
+ GSVector4i::storel(pl, v);
+ GSVector4i::storeh(ph, v);
+ }
+
+ template <bool aligned>
+ ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
+ {
+ if constexpr (aligned)
+ _mm_store_si128((__m128i*)p, v.m);
+ else
+ _mm_storeu_si128((__m128i*)p, v.m);
+ }
+
+ ALWAYS_INLINE static void store32(void* p, const GSVector4i& v) { _mm_storeu_si32(p, v); }
+
+ ALWAYS_INLINE static s32 store(const GSVector4i& v) { return _mm_cvtsi128_si32(v.m); }
+
+ ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return _mm_cvtsi128_si64(v.m); }
+
+ ALWAYS_INLINE void operator&=(const GSVector4i& v) { m = _mm_and_si128(m, v); }
+ ALWAYS_INLINE void operator|=(const GSVector4i& v) { m = _mm_or_si128(m, v); }
+ ALWAYS_INLINE void operator^=(const GSVector4i& v) { m = _mm_xor_si128(m, v); }
+
+ ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2)
+ {
+ return GSVector4i(_mm_and_si128(v1, v2));
+ }
+
+ ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2)
+ {
+ return GSVector4i(_mm_or_si128(v1, v2));
+ }
+
+ ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2)
+ {
+ return GSVector4i(_mm_xor_si128(v1, v2));
+ }
+
+ ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, s32 i) { return v & GSVector4i(i); }
+
+ ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, s32 i) { return v | GSVector4i(i); }
+
+ ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, s32 i) { return v ^ GSVector4i(i); }
+
+ ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return v ^ v.eq32(v); }
+
+ ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(_mm_setzero_si128()); }
+
+ ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); }
+
+ ALWAYS_INLINE GSVector2i xy() const
+ {
+ GSVector2i ret;
+ storel(&ret, *this);
+ return ret;
+ }
+
+ ALWAYS_INLINE GSVector2i zw() const
+ {
+ GSVector2i ret;
+ storeh(&ret, *this);
+ return ret;
+ }
+
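+ // The macros below stamp out every four-element swizzle as a member function: e.g. v.zwxy() is
+ // _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)), and the l/h/lh variants apply the same pattern
+ // to the low/high (or both) 4-lane groups of s16 elements via shufflelo/shufflehi.
+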
+ // clang-format off
+
+#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
+ ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const {return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+ ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const {return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+ ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const {return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \
+ ALWAYS_INLINE GSVector4i xs##ys##zs##ws##lh() const {return GSVector4i(_mm_shufflehi_epi16(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)), _MM_SHUFFLE(wn, zn, yn, xn)));} \
+
+#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
+ VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
+ VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
+ VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
+ VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
+
+#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \
+ VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
+ VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
+ VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
+ VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
+
+#define VECTOR4i_SHUFFLE_1(xs, xn) \
+ VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \
+ VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \
+ VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \
+ VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \
+
+ VECTOR4i_SHUFFLE_1(x, 0)
+ VECTOR4i_SHUFFLE_1(y, 1)
+ VECTOR4i_SHUFFLE_1(z, 2)
+ VECTOR4i_SHUFFLE_1(w, 3)
+
+ // clang-format on
+};
+
+class alignas(16) GSVector4
+{
+ struct cxpr_init_tag
+ {
+ };
+ static constexpr cxpr_init_tag cxpr_init{};
+
+ constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {}
+
+ constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {}
+
+ constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {}
+
+public:
+ union
+ {
+ struct
+ {
+ float x, y, z, w;
+ };
+ struct
+ {
+ float r, g, b, a;
+ };
+ struct
+ {
+ float left, top, right, bottom;
+ };
+ float F32[4];
+ double F64[2];
+ s8 I8[16];
+ s16 I16[8];
+ s32 I32[4];
+ s64 I64[2];
+ u8 U8[16];
+ u16 U16[8];
+ u32 U32[4];
+ u64 U64[2];
+ __m128 m;
+ };
+
+ GSVector4() = default;
+
+ constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); }
+
+ constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); }
+
+ constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); }
+
+ constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); }
+
+ constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); }
+
+ constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); }
+
+ ALWAYS_INLINE GSVector4(float x, float y, float z, float w) { m = _mm_set_ps(w, z, y, x); }
+
+ ALWAYS_INLINE GSVector4(float x, float y) { m = _mm_unpacklo_ps(_mm_load_ss(&x), _mm_load_ss(&y)); }
+
+ ALWAYS_INLINE GSVector4(int x, int y, int z, int w)
+ {
+ GSVector4i v_(x, y, z, w);
+
+ m = _mm_cvtepi32_ps(v_.m);
+ }
+
+ ALWAYS_INLINE GSVector4(int x, int y)
+ {
+ m = _mm_cvtepi32_ps(_mm_unpacklo_epi32(_mm_cvtsi32_si128(x), _mm_cvtsi32_si128(y)));
+ }
+
+ ALWAYS_INLINE explicit GSVector4(const GSVector2& v) { m = _mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&v)); }
+
+ ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) { m = _mm_cvtepi32_ps(_mm_loadl_epi64((__m128i*)&v)); }
+
+ ALWAYS_INLINE constexpr explicit GSVector4(__m128 m) : m(m) {}
+
+ ALWAYS_INLINE explicit GSVector4(__m128d m) : m(_mm_castpd_ps(m)) {}
+
+ ALWAYS_INLINE explicit GSVector4(float f) { *this = f; }
+
+ ALWAYS_INLINE explicit GSVector4(int i)
+ {
+#ifdef CPU_ARCH_AVX2
+ m = _mm_cvtepi32_ps(_mm_broadcastd_epi32(_mm_cvtsi32_si128(i)));
+#else
+ *this = GSVector4(GSVector4i(i));
+#endif
+ }
+
+ ALWAYS_INLINE explicit GSVector4(const GSVector4i& v);
+
+ ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
+
+ ALWAYS_INLINE static GSVector4 f64(double x, double y) { return GSVector4(_mm_castpd_ps(_mm_set_pd(y, x))); }
+
+ ALWAYS_INLINE void operator=(float f)
+ {
+#ifdef CPU_ARCH_AVX2
+
+ m = _mm_broadcastss_ps(_mm_load_ss(&f));
+
+#else
+
+ m = _mm_set1_ps(f);
+
+#endif
+ }
+
+ ALWAYS_INLINE void operator=(__m128 m_) { this->m = m_; }
+
+ ALWAYS_INLINE operator __m128() const { return m; }
+
+ /// Makes Clang think that the whole vector is needed, preventing it from rearranging shuffles because it thinks
+ /// we don't need the whole vector. Useful for e.g. stopping Clang from optimizing away shuffles that remove
+ /// possibly-denormal garbage data from vectors before computing with them.
+ ALWAYS_INLINE GSVector4 noopt()
+ {
+ // Note: Clang is currently the only compiler that attempts to optimize vector intrinsics; if that changes in
+ // the future, the implementation should be updated.
+#ifdef __clang__
+ __asm__("" : "+x"(m)::);
+#endif
+ return *this;
+ }
+
+ u32 rgba32() const { return GSVector4i(*this).rgba32(); }
+
+ ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); }
+
+ ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); }
+
+ ALWAYS_INLINE GSVector4 abs() const { return *this & cast(GSVector4i::cxpr(0x7fffffff)); }
+
+ ALWAYS_INLINE GSVector4 neg() const { return *this ^ cast(GSVector4i::cxpr(0x80000000)); }
+
+ ALWAYS_INLINE GSVector4 rcp() const { return GSVector4(_mm_rcp_ps(m)); }
+
+ ALWAYS_INLINE GSVector4 rcpnr() const
+ {
+ GSVector4 v_ = rcp();
+
+ return (v_ + v_) - (v_ * v_) * *this;
+ }
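+
+ // rcpnr() is one Newton-Raphson step refining the ~12-bit rcp() estimate x0 toward 1 / a:
+ // x1 = x0 * (2 - a * x0) = (x0 + x0) - (x0 * x0) * a, accurate to roughly 23 bits.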
+
+ template <int mode>
+ ALWAYS_INLINE GSVector4 round() const
+ {
+ return GSVector4(_mm_round_ps(m, mode));
+ }
+
+ ALWAYS_INLINE GSVector4 floor() const { return round<Round_NegInf>(); }
+
+ ALWAYS_INLINE GSVector4 ceil() const { return round<Round_PosInf>(); }
+
+ ALWAYS_INLINE GSVector4 madd(const GSVector4& a_, const GSVector4& b_) const
+ {
+#ifdef CPU_ARCH_AVX2
+ return GSVector4(_mm_fmadd_ps(m, a_, b_));
+#else
+ return *this * a_ + b_;
+#endif
+ }
+
+ ALWAYS_INLINE GSVector4 msub(const GSVector4& a_, const GSVector4& b_) const
+ {
+#ifdef CPU_ARCH_AVX2
+ return GSVector4(_mm_fmsub_ps(m, a_, b_));
+#else
+ return *this * a_ - b_;
+#endif
+ }
+
+ ALWAYS_INLINE GSVector4 nmadd(const GSVector4& a_, const GSVector4& b_) const
+ {
+#ifdef CPU_ARCH_AVX2
+ return GSVector4(_mm_fnmadd_ps(m, a_, b_));
+#else
+ return b_ - *this * a_;
+#endif
+ }
+
+ ALWAYS_INLINE GSVector4 nmsub(const GSVector4& a_, const GSVector4& b_) const
+ {
+#ifdef CPU_ARCH_AVX2
+ return GSVector4(_mm_fnmsub_ps(m, a_, b_));
+#else
+ return -b_ - *this * a_;
+#endif
+ }
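+
+ // Note: the non-FMA fallbacks above round after the multiply and again after the add, so their
+ // results can differ from the fused, singly-rounded FMA path by 1 ulp.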
+
+ ALWAYS_INLINE GSVector4 addm(const GSVector4& a_, const GSVector4& b_) const
+ {
+ return a_.madd(b_, *this); // *this + a * b
+ }
+
+ ALWAYS_INLINE GSVector4 subm(const GSVector4& a_, const GSVector4& b_) const
+ {
+ return a_.nmadd(b_, *this); // *this - a * b
+ }
+
+ ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(_mm_hadd_ps(m, m)); }
+
+ ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(_mm_hadd_ps(m, v.m)); }
+
+ ALWAYS_INLINE GSVector4 hsub() const { return GSVector4(_mm_hsub_ps(m, m)); }
+
+ ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const { return GSVector4(_mm_hsub_ps(m, v.m)); }
+
+ template <int i>
+ ALWAYS_INLINE GSVector4 dp(const GSVector4& v) const
+ {
+ return GSVector4(_mm_dp_ps(m, v.m, i));
+ }
+
+ ALWAYS_INLINE GSVector4 sat(const GSVector4& min, const GSVector4& max) const
+ {
+ return GSVector4(_mm_min_ps(_mm_max_ps(m, min), max));
+ }
+
+ ALWAYS_INLINE GSVector4 sat(const GSVector4& v) const
+ {
+ return GSVector4(_mm_min_ps(_mm_max_ps(m, v.xyxy()), v.zwzw()));
+ }
+
+ ALWAYS_INLINE GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); }
+
+ ALWAYS_INLINE GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); }
+
+ ALWAYS_INLINE GSVector4 min(const GSVector4& v) const { return GSVector4(_mm_min_ps(m, v)); }
+
+ ALWAYS_INLINE GSVector4 max(const GSVector4& v) const { return GSVector4(_mm_max_ps(m, v)); }
+
+ template <int mask>
+ ALWAYS_INLINE GSVector4 blend32(const GSVector4& v) const
+ {
+ return GSVector4(_mm_blend_ps(m, v, mask));
+ }
+
+ ALWAYS_INLINE GSVector4 blend32(const GSVector4& v, const GSVector4& mask) const
+ {
+ return GSVector4(_mm_blendv_ps(m, v, mask));
+ }
+
+ ALWAYS_INLINE GSVector4 upl(const GSVector4& v) const { return GSVector4(_mm_unpacklo_ps(m, v)); }
+
+ ALWAYS_INLINE GSVector4 uph(const GSVector4& v) const { return GSVector4(_mm_unpackhi_ps(m, v)); }
+
+ ALWAYS_INLINE GSVector4 upld(const GSVector4& v) const
+ {
+ return GSVector4(_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(m), _mm_castps_pd(v.m))));
+ }
+
+ ALWAYS_INLINE GSVector4 uphd(const GSVector4& v) const
+ {
+ return GSVector4(_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(m), _mm_castps_pd(v.m))));
+ }
+
+ ALWAYS_INLINE GSVector4 l2h(const GSVector4& v) const { return GSVector4(_mm_movelh_ps(m, v)); }
+
+ ALWAYS_INLINE GSVector4 h2l(const GSVector4& v) const { return GSVector4(_mm_movehl_ps(m, v)); }
+
+ ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const { return GSVector4(_mm_andnot_ps(v.m, m)); }
+
+ ALWAYS_INLINE int mask() const { return _mm_movemask_ps(m); }
+
+ ALWAYS_INLINE bool alltrue() const { return mask() == 0xf; }
+
+ ALWAYS_INLINE bool allfalse() const
+ {
+#ifdef CPU_ARCH_AVX2
+ return _mm_testz_ps(m, m) != 0;
+#else
+ const __m128i ii = _mm_castps_si128(m);
+ return _mm_testz_si128(ii, ii) != 0;
+#endif
+ }
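+
+ // (*this == *this) is false only for NaN lanes, so the blend below keeps non-NaN lanes of *this
+ // and takes the corresponding lane of v wherever *this holds a NaN.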
+
+ ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); }
+
+ template <int dst, int src>
+ ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const
+ {
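+ // when the source and destination lane match, a plain blend suffices; blendps is cheaper than
+ // insertps on most CPUs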
+ if constexpr (src == dst)
+ return GSVector4(_mm_blend_ps(m, v.m, 1 << src));
+ else
+ return GSVector4(_mm_insert_ps(m, v.m, _MM_MK_INSERTPS_NDX(src, dst, 0)));
+ }
+
+ template <int i>
+ ALWAYS_INLINE int extract32() const
+ {
+ return _mm_extract_ps(m, i);
+ }
+
+ ALWAYS_INLINE static GSVector4 zero() { return GSVector4(_mm_setzero_ps()); }
+
+ ALWAYS_INLINE static GSVector4 xffffffff() { return zero() == zero(); }
+
+ ALWAYS_INLINE static GSVector4 loadl(const void* p) { return GSVector4(_mm_castpd_ps(_mm_load_sd((double*)p))); }
+
+ ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(_mm_load_ss(&f)); }
+
+ template <bool aligned>
+ ALWAYS_INLINE static GSVector4 load(const void* p)
+ {
+ return GSVector4(aligned ? _mm_load_ps((const float*)p) : _mm_loadu_ps((const float*)p));
+ }
+
+ ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { _mm_stream_ps((float*)p, v.m); }
+
+ ALWAYS_INLINE static void storel(void* p, const GSVector4& v) { _mm_store_sd((double*)p, _mm_castps_pd(v.m)); }
+
+ ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) { _mm_storeh_pd((double*)p, _mm_castps_pd(v.m)); }
+
+ template <bool aligned>
+ ALWAYS_INLINE static void store(void* p, const GSVector4& v)
+ {
+ if constexpr (aligned)
+ _mm_store_ps((float*)p, v.m);
+ else
+ _mm_storeu_ps((float*)p, v.m);
+ }
+
+ ALWAYS_INLINE static void store(float* p, const GSVector4& v) { _mm_store_ss(p, v.m); }
+
+ ALWAYS_INLINE GSVector4 operator-() const { return neg(); }
+
+ ALWAYS_INLINE void operator+=(const GSVector4& v_) { m = _mm_add_ps(m, v_); }
+ ALWAYS_INLINE void operator-=(const GSVector4& v_) { m = _mm_sub_ps(m, v_); }
+ ALWAYS_INLINE void operator*=(const GSVector4& v_) { m = _mm_mul_ps(m, v_); }
+ ALWAYS_INLINE void operator/=(const GSVector4& v_) { m = _mm_div_ps(m, v_); }
+
+ ALWAYS_INLINE void operator+=(float f) { *this += GSVector4(f); }
+ ALWAYS_INLINE void operator-=(float f) { *this -= GSVector4(f); }
+ ALWAYS_INLINE void operator*=(float f) { *this *= GSVector4(f); }
+ ALWAYS_INLINE void operator/=(float f) { *this /= GSVector4(f); }
+
+ ALWAYS_INLINE void operator&=(const GSVector4& v_) { m = _mm_and_ps(m, v_); }
+ ALWAYS_INLINE void operator|=(const GSVector4& v_) { m = _mm_or_ps(m, v_); }
+ ALWAYS_INLINE void operator^=(const GSVector4& v_) { m = _mm_xor_ps(m, v_); }
+
+ ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(_mm_add_ps(v1, v2));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(_mm_sub_ps(v1, v2));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(_mm_mul_ps(v1, v2));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(_mm_div_ps(v1, v2));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); }
+
+ ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); }
+
+ ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); }
+
+ ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f) { return v / GSVector4(f); }
+
+ ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(_mm_and_ps(v1, v2));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(_mm_or_ps(v1, v2));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(_mm_xor_ps(v1, v2));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(_mm_cmpeq_ps(v1, v2));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(_mm_cmpneq_ps(v1, v2));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(_mm_cmpgt_ps(v1, v2));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(_mm_cmplt_ps(v1, v2));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(_mm_cmpge_ps(v1, v2));
+ }
+
+ ALWAYS_INLINE friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2)
+ {
+ return GSVector4(_mm_cmple_ps(v1, v2));
+ }
+
+ ALWAYS_INLINE GSVector4 mul64(const GSVector4& v_) const
+ {
+ return GSVector4(_mm_mul_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
+ }
+
+ ALWAYS_INLINE GSVector4 add64(const GSVector4& v_) const
+ {
+ return GSVector4(_mm_add_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
+ }
+
+ ALWAYS_INLINE GSVector4 sub64(const GSVector4& v_) const
+ {
+ return GSVector4(_mm_sub_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m)));
+ }
+
+ ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_) { return GSVector4(_mm_cvtps_pd(v_.m)); }
+
+ ALWAYS_INLINE static GSVector4 f32to64(const void* p)
+ {
+ return GSVector4(_mm_cvtps_pd(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p)))));
+ }
+
+ ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const
+ {
+ return GSVector4i(truncate ? _mm_cvttpd_epi32(_mm_castps_pd(m)) : _mm_cvtpd_epi32(_mm_castps_pd(m)));
+ }
+
+ // clang-format off
+
+#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
+ ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn))); } \
+ ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const { return GSVector4(_mm_shuffle_ps(m, v_.m, _MM_SHUFFLE(wn, zn, yn, xn))); } \
+
+#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \
+ VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \
+ VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \
+ VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \
+ VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \
+
+#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \
+ VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \
+ VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \
+ VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \
+ VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \
+
+#define VECTOR4_SHUFFLE_1(xs, xn) \
+ VECTOR4_SHUFFLE_2(xs, xn, x, 0) \
+ VECTOR4_SHUFFLE_2(xs, xn, y, 1) \
+ VECTOR4_SHUFFLE_2(xs, xn, z, 2) \
+ VECTOR4_SHUFFLE_2(xs, xn, w, 3) \
+
+ VECTOR4_SHUFFLE_1(x, 0)
+ VECTOR4_SHUFFLE_1(y, 1)
+ VECTOR4_SHUFFLE_1(z, 2)
+ VECTOR4_SHUFFLE_1(w, 3)
+
+ // clang-format on
+
+#ifdef CPU_ARCH_AVX2
+
+ ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(_mm_broadcastss_ps(m)); }
+
+ ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v) { return GSVector4(_mm_broadcastss_ps(v.m)); }
+
+ ALWAYS_INLINE static GSVector4 broadcast32(const void* f)
+ {
+ return GSVector4(_mm_broadcastss_ps(_mm_load_ss((const float*)f)));
+ }
+
+#endif
+
+ ALWAYS_INLINE static GSVector4 broadcast64(const void* d)
+ {
+ return GSVector4(_mm_loaddup_pd(static_cast<const double*>(d)));
+ }
+};
+
+ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate)
+{
+ m = truncate ? _mm_cvttps_epi32(v) : _mm_cvtps_epi32(v);
+}
+
+ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v)
+{
+ m = _mm_cvtepi32_ps(v);
+}
+
+ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v)
+{
+ return GSVector4i(_mm_castps_si128(v.m));
+}
+
+ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v)
+{
+ return GSVector4(_mm_castsi128_ps(v.m));
+}
diff --git a/src/common/intrin.h b/src/common/intrin.h
index 795a7f950..1b0587e7e 100644
--- a/src/common/intrin.h
+++ b/src/common/intrin.h
@@ -13,7 +13,21 @@
#if defined(CPU_ARCH_X86) || defined(CPU_ARCH_X64)
#define CPU_ARCH_SSE 1
#include <emmintrin.h>
-#elif defined(CPU_ARCH_ARM64)
+#include <immintrin.h>
+#include <smmintrin.h>
+#include <tmmintrin.h>
+
+#if defined(__AVX2__)
+#define CPU_ARCH_AVX 1
+#define CPU_ARCH_AVX2 1
+#define CPU_ARCH_SSE41 1
+#elif defined(__AVX__)
+#define CPU_ARCH_AVX 1
+#define CPU_ARCH_SSE41 1
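+// MSVC never defines __SSE4_1__ (only __AVX__/__AVX2__ via /arch:...), so SSE4.1 support is
+// assumed for any MSVC build.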
+#elif defined(__SSE4_1__) || defined(_MSC_VER)
+#define CPU_ARCH_SSE41 1
+#endif
+#elif defined(CPU_ARCH_ARM32) || defined(CPU_ARCH_ARM64)
#define CPU_ARCH_NEON 1
#if defined(_MSC_VER) && !defined(__clang__)
#include <arm64_neon.h>