diff --git a/CMakeModules/DuckStationUtils.cmake b/CMakeModules/DuckStationUtils.cmake index b2dd91c6a..12d7a52ed 100644 --- a/CMakeModules/DuckStationUtils.cmake +++ b/CMakeModules/DuckStationUtils.cmake @@ -57,6 +57,8 @@ function(detect_architecture) if("x86_64" IN_LIST CMAKE_OSX_ARCHITECTURES) message(STATUS "Building x86_64 MacOS binaries.") set(CPU_ARCH_X64 TRUE PARENT_SCOPE) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Xarch_x86_64 -msse4.1" PARENT_SCOPE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xarch_x86_64 -msse4.1" PARENT_SCOPE) endif() if("arm64" IN_LIST CMAKE_OSX_ARCHITECTURES) message(STATUS "Building ARM64 MacOS binaries.") @@ -67,6 +69,8 @@ function(detect_architecture) CMAKE_SIZEOF_VOID_P EQUAL 8) message(STATUS "Building x86_64 binaries.") set(CPU_ARCH_X64 TRUE PARENT_SCOPE) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1" PARENT_SCOPE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1" PARENT_SCOPE) elseif(("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64" OR "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm64") AND CMAKE_SIZEOF_VOID_P EQUAL 8) # Might have an A64 kernel, e.g. Raspbian. message(STATUS "Building ARM64 binaries.") diff --git a/dep/msvc/vsprops/Base.props b/dep/msvc/vsprops/Base.props index 9d485d4e4..81efe34c4 100644 --- a/dep/msvc/vsprops/Base.props +++ b/dep/msvc/vsprops/Base.props @@ -30,6 +30,7 @@ _HAS_EXCEPTIONS=0;_CRT_INTERNAL_NONSTDC_NAMES;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_WARNINGS;WIN32;%(PreprocessorDefinitions) %(AdditionalIncludeDirectories);$(DepsIncludeDir) /Zc:__cplusplus /Zo /utf-8 %(AdditionalOptions) + -msse4.1 %(AdditionalOptions) -flto=thin %(AdditionalOptions) false diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index bc532cf6c..c0c8566d5 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -21,6 +21,11 @@ add_library(common fifo_queue.h file_system.cpp file_system.h + gsvector.h + gsvector_formatter.h + gsvector_neon.h + gsvector_nosimd.h + gsvector_sse.h intrin.h hash_combine.h heap_array.h diff --git a/src/common/common.vcxproj b/src/common/common.vcxproj index 9d0feb068..aafeebe45 100644 --- a/src/common/common.vcxproj +++ b/src/common/common.vcxproj @@ -16,6 +16,11 @@ + + + + + @@ -70,6 +75,7 @@ + diff --git a/src/common/common.vcxproj.filters b/src/common/common.vcxproj.filters index a36015e9f..2e280f9de 100644 --- a/src/common/common.vcxproj.filters +++ b/src/common/common.vcxproj.filters @@ -46,6 +46,11 @@ + + + + + @@ -80,6 +85,7 @@ thirdparty + diff --git a/src/common/gsvector.h b/src/common/gsvector.h new file mode 100644 index 000000000..8dda3b369 --- /dev/null +++ b/src/common/gsvector.h @@ -0,0 +1,65 @@ +// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#pragma once + +#include "common/intrin.h" + +#include <cstring> + +enum Align_Mode +{ + Align_Outside, + Align_Inside, + Align_NegInf, + Align_PosInf +}; + +enum Round_Mode +{ + Round_NearestInt = 8, + Round_NegInf = 9, + Round_PosInf = 10, + Round_Truncate = 11 +}; + +template<class T> +class GSVector2T +{ +public: + union + { + struct + { + T x, y; + }; + struct + { + T r, g; + }; + struct + { + T v[2]; + }; + }; + + GSVector2T() = default; + + ALWAYS_INLINE constexpr GSVector2T(T x) : x(x), y(x) {} + ALWAYS_INLINE constexpr GSVector2T(T x, T y) : x(x), y(y) {} + ALWAYS_INLINE constexpr bool operator==(const GSVector2T& v) const { return std::memcmp(this, &v, sizeof(*this)) == 0; } + ALWAYS_INLINE constexpr bool operator!=(const GSVector2T& v) const { return std::memcmp(this, &v, sizeof(*this))
!= 0; } + ALWAYS_INLINE constexpr GSVector2T operator*(const GSVector2T& v) const { return {x * v.x, y * v.y}; } + ALWAYS_INLINE constexpr GSVector2T operator/(const GSVector2T& v) const { return {x / v.x, y / v.y}; } +}; + +using GSVector2 = GSVector2T; +using GSVector2i = GSVector2T; + +#if defined(CPU_ARCH_SSE) +#include "common/gsvector_sse.h" +#elif defined(CPU_ARCH_NEON) +#include "common/gsvector_neon.h" +#else +#include "common/gsvector_nosimd.h" +#endif diff --git a/src/common/gsvector.natvis b/src/common/gsvector.natvis new file mode 100644 index 000000000..077d748e0 --- /dev/null +++ b/src/common/gsvector.natvis @@ -0,0 +1,10 @@ + + + + {{ {x}, {y} }} + + + + {{ {I32[0]}, {I32[1]}, {I32[2]}, {I32[3]} }} + + \ No newline at end of file diff --git a/src/common/gsvector_formatter.h b/src/common/gsvector_formatter.h new file mode 100644 index 000000000..5ed7c1665 --- /dev/null +++ b/src/common/gsvector_formatter.h @@ -0,0 +1,21 @@ +// SPDX-FileCopyrightText: 2024 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#pragma once + +#include "gsvector.h" +#include "small_string.h" + +#include "fmt/format.h" + +template<> +struct fmt::formatter : formatter +{ + auto format(const GSVector4i& rc, format_context& ctx) const + { + const TinyString str = + TinyString::from_format("{},{} => {},{} ({}x{})", rc.left, rc.top, rc.right, rc.bottom, rc.width(), rc.height()); + + return fmt::formatter::format(str.view(), ctx); + } +}; diff --git a/src/common/gsvector_neon.h b/src/common/gsvector_neon.h new file mode 100644 index 000000000..1362b22aa --- /dev/null +++ b/src/common/gsvector_neon.h @@ -0,0 +1,1712 @@ +// SPDX-FileCopyrightText: 2021-2024 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#define GSVECTOR_HAS_UNSIGNED 1 +#define GSVECTOR_HAS_SRLV 1 + +class GSVector4; + +class alignas(16) GSVector4i +{ + struct cxpr_init_tag + { + }; + static constexpr cxpr_init_tag cxpr_init{}; + + constexpr GSVector4i(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {} + + constexpr GSVector4i(cxpr_init_tag, short s0, short s1, short s2, short s3, short s4, short s5, short s6, short s7) + : I16{s0, s1, s2, s3, s4, s5, s6, s7} + { + } + + constexpr GSVector4i(cxpr_init_tag, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, + char b9, char b10, char b11, char b12, char b13, char b14, char b15) +#if !defined(__APPLE__) && !defined(_MSC_VER) + : U8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} +#else + : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} +#endif + { + } + +public: + union + { + struct + { + int x, y, z, w; + }; + struct + { + int r, g, b, a; + }; + struct + { + int left, top, right, bottom; + }; + float F32[4]; + s8 I8[16]; + s16 I16[8]; + s32 I32[4]; + s64 I64[2]; + u8 U8[16]; + u16 U16[8]; + u32 U32[4]; + u64 U64[2]; + int32x4_t v4s; + }; + + GSVector4i() = default; + + ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w) + { + return GSVector4i(cxpr_init, x, y, z, w); + } + + ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); } + + ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); } + + ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) + { + return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7); + } + + ALWAYS_INLINE constexpr static GSVector4i 
cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, + s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) + { + return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15); + } + + ALWAYS_INLINE GSVector4i(int x, int y, int z, int w) + { + GSVector4i xz = load(x).upl32(load(z)); + GSVector4i yw = load(y).upl32(load(w)); + + *this = xz.upl32(yw); + } + + ALWAYS_INLINE GSVector4i(int x, int y) { *this = load(x).upl32(load(y)); } + + ALWAYS_INLINE GSVector4i(short s0, short s1, short s2, short s3, short s4, short s5, short s6, short s7) + : I16{s0, s1, s2, s3, s4, s5, s6, s7} + { + } + + constexpr GSVector4i(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, + char b10, char b11, char b12, char b13, char b14, char b15) +#if !defined(__APPLE__) && !defined(_MSC_VER) + : U8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} +#else + : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} +#endif + { + } + + ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) { v4s = vcombine_s32(vld1_s32(v.v), vcreate_s32(0)); } + + // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), + // so leave the non-constexpr version default + ALWAYS_INLINE explicit GSVector4i(int i) { *this = i; } + + ALWAYS_INLINE constexpr explicit GSVector4i(int32x4_t m) : v4s(m) {} + + ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true); + + ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); + + ALWAYS_INLINE void operator=(int i) { v4s = vdupq_n_s32(i); } + + ALWAYS_INLINE operator int32x4_t() const { return v4s; } + + // rect + + ALWAYS_INLINE int width() const { return right - left; } + + ALWAYS_INLINE int height() const { return bottom - top; } + + ALWAYS_INLINE GSVector4i rsize() const + { + return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height()); + } + + ALWAYS_INLINE s32 rarea() const { return width() * height(); } + + ALWAYS_INLINE bool rempty() const { return (vminv_u32(vreinterpret_u32_s32(vget_low_s32(lt32(zwzw())))) == 0); } + + ALWAYS_INLINE GSVector4i runion(const GSVector4i& a) const { return min_i32(a).upl64(max_i32(a).srl<8>()); } + + ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& a) const { return sat_i32(a); } + ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); } + ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); } + + template + GSVector4i _ralign_helper(const GSVector4i& mask) const + { + GSVector4i v; + + switch (mode) + { + case Align_Inside: + v = add32(mask); + break; + case Align_Outside: + v = add32(mask.zwxy()); + break; + case Align_NegInf: + v = *this; + break; + case Align_PosInf: + v = add32(mask.xyxy()); + break; + + default: + UnreachableCode(); + break; + } + + return v.andnot(mask.xyxy()); + } + + /// Align the rect using mask values that already have one subtracted (1 << n - 1 aligns to 1 << n) + template + GSVector4i ralign_presub(const GSVector2i& a) const + { + return _ralign_helper(GSVector4i(a)); + } + + template + GSVector4i ralign(const GSVector2i& a) const + { + // a must be 1 << n + + return _ralign_helper(GSVector4i(a) - GSVector4i(1, 1)); + } + + // + + ALWAYS_INLINE u32 rgba32() const + { + GSVector4i v = *this; + + v = v.ps32(v); + v = v.pu16(v); + + return (u32)store(v); + } + + ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& min, const GSVector4i& 
max) const + { + return max_i8(min).min_i8(max); + } + ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& minmax) const + { + return max_i8(minmax.xyxy()).min_i8(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& min, const GSVector4i& max) const + { + return max_i16(min).min_i16(max); + } + ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& minmax) const + { + return max_i16(minmax.xyxy()).min_i16(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& min, const GSVector4i& max) const + { + return max_i32(min).min_i32(max); + } + ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& minmax) const + { + return max_i32(minmax.xyxy()).min_i32(minmax.zwzw()); + } + + ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const + { + return max_u8(min).min_u8(max); + } + ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const + { + return max_u8(minmax.xyxy()).min_u8(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const + { + return max_u16(min).min_u16(max); + } + ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const + { + return max_u16(minmax.xyxy()).min_u16(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const + { + return max_u32(min).min_u32(max); + } + ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const + { + return max_u32(minmax.xyxy()).min_u32(minmax.zwzw()); + } + + ALWAYS_INLINE GSVector4i min_i8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vminq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i max_i8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vmaxq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i min_i16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vminq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i max_i16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vmaxq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i min_i32(const GSVector4i& v) const { return GSVector4i(vminq_s32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4i max_i32(const GSVector4i& v) const { return GSVector4i(vmaxq_s32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4i min_u8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u8(vminq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i max_u8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u8(vmaxq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i min_u16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u16(vminq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i max_u16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u16(vmaxq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i min_u32(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u32(vminq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i max_u32(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u32(vmaxq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(v.v4s)))); + } + + ALWAYS_INLINE 
GSVector4i madd_s16(const GSVector4i& v) const + { + int32x4_t acc = + vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s))); + acc = vmlal_high_s16(acc, vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)); + return GSVector4i(acc); + } + + ALWAYS_INLINE GSVector4i addp_s32() const { return GSVector4i(vpaddq_s32(v4s, v4s)); } + + ALWAYS_INLINE s32 minv_s32() const { return vminvq_s32(v4s); } + + ALWAYS_INLINE u32 minv_u32() const { return vminvq_u32(v4s); } + + ALWAYS_INLINE s32 maxv_s32() const { return vmaxvq_s32(v4s); } + + ALWAYS_INLINE u32 maxv_u32() const { return vmaxvq_u32(v4s); } + + ALWAYS_INLINE static int min_i16(int a, int b) { return store(load(a).min_i16(load(b))); } + + ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); } + + ALWAYS_INLINE GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const + { + uint8x16_t mask2 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_s32(mask.v4s), 7)); + return GSVector4i(vreinterpretq_s32_u8(vbslq_u8(mask2, vreinterpretq_u8_s32(a.v4s), vreinterpretq_u8_s32(v4s)))); + } + + template + ALWAYS_INLINE GSVector4i blend16(const GSVector4i& a) const + { + static constexpr const uint16_t _mask[8] = { + ((mask) & (1 << 0)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 1)) ? (uint16_t)-1 : 0x0, + ((mask) & (1 << 2)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 3)) ? (uint16_t)-1 : 0x0, + ((mask) & (1 << 4)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 5)) ? (uint16_t)-1 : 0x0, + ((mask) & (1 << 6)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 7)) ? (uint16_t)-1 : 0x0}; + return GSVector4i( + vreinterpretq_s32_u16(vbslq_u16(vld1q_u16(_mask), vreinterpretq_u16_s32(a.v4s), vreinterpretq_u16_s32(v4s)))); + } + + template + ALWAYS_INLINE GSVector4i blend32(const GSVector4i& v) const + { + constexpr int bit3 = ((mask & 8) * 3) << 3; + constexpr int bit2 = ((mask & 4) * 3) << 2; + constexpr int bit1 = ((mask & 2) * 3) << 1; + constexpr int bit0 = (mask & 1) * 3; + return blend16(v); + } + + ALWAYS_INLINE GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const + { + return GSVector4i( + vreinterpretq_s32_s8(vorrq_s8(vbicq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(mask.v4s)), + vandq_s8(vreinterpretq_s8_s32(mask.v4s), vreinterpretq_s8_s32(v.v4s))))); + } + + ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); } + + ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const + { + return GSVector4i(vreinterpretq_s32_s8(vqtbl1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_u8_s32(mask.v4s)))); + } + + ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8( + vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v.v4s))))); + } + + ALWAYS_INLINE GSVector4i ps16() const + { + return GSVector4i(vreinterpretq_s32_s8( + vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v4s))))); + } + + ALWAYS_INLINE GSVector4i pu16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u8( + vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v.v4s))))); + } + + ALWAYS_INLINE GSVector4i pu16() const + { + return GSVector4i(vreinterpretq_s32_u8( + vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v4s))))); + } + + ALWAYS_INLINE GSVector4i ps32(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v.v4s)))); + } + 
+ ALWAYS_INLINE GSVector4i ps32() const + { + return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v4s)))); + } + + ALWAYS_INLINE GSVector4i pu32(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i pu32() const + { + return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v4s)))); + } + + ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(vzip1q_s32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(vzip2q_s32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s64( + vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s))))); + } + + ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s64( + vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s))))); + } + + ALWAYS_INLINE GSVector4i upl8() const + { + return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0)))); + } + + ALWAYS_INLINE GSVector4i uph8() const + { + return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0)))); + } + + ALWAYS_INLINE GSVector4i upl16() const + { + return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0)))); + } + + ALWAYS_INLINE GSVector4i uph16() const + { + return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0)))); + } + + ALWAYS_INLINE GSVector4i upl32() const { return GSVector4i(vzip1q_s32(v4s, vdupq_n_s32(0))); } + + ALWAYS_INLINE GSVector4i uph32() const { return GSVector4i(vzip2q_s32(v4s, vdupq_n_s32(0))); } + + ALWAYS_INLINE GSVector4i upl64() const + { + return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0)))); + } + + ALWAYS_INLINE GSVector4i uph64() const + { + return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0)))); + } + + ALWAYS_INLINE GSVector4i i8to16() const + { + return GSVector4i(vreinterpretq_s32_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s))))); + } + + ALWAYS_INLINE GSVector4i u8to16() const + { + return GSVector4i(vreinterpretq_s32_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s))))); + } + + ALWAYS_INLINE GSVector4i i8to32() const + { + return GSVector4i(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))))); + } + + ALWAYS_INLINE GSVector4i u8to32() const + { + return 
GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s))))))); + } + + ALWAYS_INLINE GSVector4i i8to64() const + { + return GSVector4i(vreinterpretq_s32_s64( + vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s))))))))); + } + + ALWAYS_INLINE GSVector4i u8to64() const + { + return GSVector4i(vreinterpretq_s32_u64( + vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s))))))))); + } + + ALWAYS_INLINE GSVector4i i16to32() const { return GSVector4i(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)))); } + + ALWAYS_INLINE GSVector4i u16to32() const + { + return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s))))); + } + + ALWAYS_INLINE GSVector4i i16to64() const + { + return GSVector4i( + vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s))))))); + } + + ALWAYS_INLINE GSVector4i u16to64() const + { + return GSVector4i( + vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s))))))); + } + + ALWAYS_INLINE GSVector4i i32to64() const { return GSVector4i(vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(v4s)))); } + + ALWAYS_INLINE GSVector4i u32to64() const + { + return GSVector4i(vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vreinterpretq_u32_s32(v4s))))); + } + + template + ALWAYS_INLINE GSVector4i srl() const + { + return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0), i))); + } + + template + ALWAYS_INLINE GSVector4i srl(const GSVector4i& v) + { + if constexpr (i >= 16) + return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v.v4s), vdupq_n_u8(0), i - 16))); + else + return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s), i))); + } + + template + ALWAYS_INLINE GSVector4i sll() const + { + return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_s32(v4s), 16 - i))); + } + + template + ALWAYS_INLINE GSVector4i sll16() const + { + return GSVector4i(vreinterpretq_s32_s16(vshlq_n_s16(vreinterpretq_s16_s32(v4s), i))); + } + + ALWAYS_INLINE GSVector4i sll16(s32 i) const + { + return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(i)))); + } + + ALWAYS_INLINE GSVector4i sllv16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + template + ALWAYS_INLINE GSVector4i srl16() const + { + return GSVector4i(vreinterpretq_s32_u16(vshrq_n_u16(vreinterpretq_u16_s32(v4s), i))); + } + + ALWAYS_INLINE GSVector4i srl16(s32 i) const + { + return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vdupq_n_u16(-i)))); + } + + ALWAYS_INLINE GSVector4i srlv16(const GSVector4i& v) const + { + return GSVector4i( + vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s))))); + } + + template + ALWAYS_INLINE GSVector4i sra16() const + { + constexpr int count = (i & ~15) ? 
15 : i; + return GSVector4i(vreinterpretq_s32_s16(vshrq_n_s16(vreinterpretq_s16_s32(v4s), count))); + } + + ALWAYS_INLINE GSVector4i sra16(s32 i) const + { + return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(-i)))); + } + + ALWAYS_INLINE GSVector4i srav16(const GSVector4i& v) const + { + return GSVector4i( + vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s))))); + } + + template + ALWAYS_INLINE GSVector4i sll32() const + { + return GSVector4i(vshlq_n_s32(v4s, i)); + } + + ALWAYS_INLINE GSVector4i sll32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(i))); } + + ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const { return GSVector4i(vshlq_s32(v4s, v.v4s)); } + + template + ALWAYS_INLINE GSVector4i srl32() const + { + return GSVector4i(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(v4s), i))); + } + + ALWAYS_INLINE GSVector4i srl32(s32 i) const + { + return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vdupq_n_s32(-i)))); + } + + ALWAYS_INLINE GSVector4i srlv32(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vnegq_s32(v.v4s)))); + } + + template + ALWAYS_INLINE GSVector4i sra32() const + { + return GSVector4i(vshrq_n_s32(v4s, i)); + } + + ALWAYS_INLINE GSVector4i sra32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(-i))); } + + ALWAYS_INLINE GSVector4i srav32(const GSVector4i& v) const + { + return GSVector4i(vshlq_s32(vreinterpretq_u32_s32(v4s), vnegq_s32(v.v4s))); + } + + template + ALWAYS_INLINE GSVector4i sll64() const + { + return GSVector4i(vreinterpretq_s32_s64(vshlq_n_s64(vreinterpretq_s64_s32(v4s), i))); + } + + ALWAYS_INLINE GSVector4i sll64(s32 i) const + { + return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s16(i)))); + } + + ALWAYS_INLINE GSVector4i sllv64(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s)))); + } + + template + ALWAYS_INLINE GSVector4i sra64() const + { + return GSVector4i(vreinterpretq_s32_s64(vshrq_n_s64(vreinterpretq_s64_s32(v4s), i))); + } + + ALWAYS_INLINE GSVector4i sra64(s32 i) const + { + return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s16(-i)))); + } + + ALWAYS_INLINE GSVector4i srav64(const GSVector4i& v) const + { + return GSVector4i( + vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s))))); + } + + template + ALWAYS_INLINE GSVector4i srl64() const + { + return GSVector4i(vreinterpretq_s32_u64(vshrq_n_u64(vreinterpretq_u64_s32(v4s), i))); + } + + ALWAYS_INLINE GSVector4i srl64(s32 i) const + { + return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vdupq_n_u16(-i)))); + } + + ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const + { + return GSVector4i( + vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s))))); + } + + ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return 
GSVector4i(vaddq_s32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vqaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const + { + // can't use vpaddq_s16() here, because we need saturation. + // return GSVector4i(vreinterpretq_s32_s16(vpaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + const int16x8_t a = vreinterpretq_s16_s32(v4s); + const int16x8_t b = vreinterpretq_s16_s32(v.v4s); + return GSVector4i(vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); + } + + ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u8(vqaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u16(vqaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(vsubq_s32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vqsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vqsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u8(vqsubq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u16(vqsubq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i avg8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u8(vrhaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i avg16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u16(vrhaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const + { + // from sse2neon + int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_s32(v4s)); + int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_s32(v.v4s)); + int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ + int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_s32(v4s)); + int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_s32(v.v4s)); + int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ + uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); + return GSVector4i(vreinterpretq_s32_u16(r.val[1])); + } + + ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vmulq_s16(vreinterpretq_s16_s32(v4s), 
vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const + { + int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s))); + int32x4_t mul_hi = + vmull_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s))); + int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); + int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); + return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(narrow_lo, narrow_hi))); + } + + ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(vmulq_s32(v4s, v.v4s)); } + + template + ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const + { + // (a - this) * f << shift + this + + return add16(a.sub16(*this).modulate16(f)); + } + + template + ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c) + { + // (a - b) * c << shift + + return a.sub16(b).modulate16(c); + } + + template + ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c, + const GSVector4i& d) + { + // (a - b) * c << shift + d + + return d.add16(a.sub16(b).modulate16(c)); + } + + ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a, const GSVector4i& f) const + { + // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit) + + return add16(a.sub16(*this).mul16l(f).sra16<4>()); + } + + template + ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const + { + // a * f << shift + if (shift == 0) + { + return mul16hrs(f); + } + + return sll16().mul16hs(f); + } + + ALWAYS_INLINE bool eq(const GSVector4i& v) const + { + return (vmaxvq_u32(vreinterpretq_u32_s32(veorq_s32(v4s, v.v4s))) == 0); + } + + ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u8(vceqq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i eq16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u16(vceqq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i eq32(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u32(vceqq_s32(v4s, v.v4s))); + } + + ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u64(vceqq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); } + + ALWAYS_INLINE GSVector4i neq16(const GSVector4i& v) const { return ~eq16(v); } + + ALWAYS_INLINE GSVector4i neq32(const GSVector4i& v) const { return ~eq32(v); } + + ALWAYS_INLINE GSVector4i gt8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vcgtq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i gt16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vcgtq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i gt32(const GSVector4i& v) const { return GSVector4i(vcgtq_s32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4i ge8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vcgeq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + ALWAYS_INLINE GSVector4i ge16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vcgeq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + ALWAYS_INLINE GSVector4i ge32(const 
GSVector4i& v) const { return GSVector4i(vcgeq_s32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4i lt8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vcltq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i lt16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vcltq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i lt32(const GSVector4i& v) const { return GSVector4i(vcltq_s32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4i le8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vcleq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + ALWAYS_INLINE GSVector4i le16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vcleq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + ALWAYS_INLINE GSVector4i le32(const GSVector4i& v) const { return GSVector4i(vcleq_s32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { return GSVector4i(vbicq_s32(v4s, v.v4s)); } + + ALWAYS_INLINE int mask() const + { + // borrowed from sse2neon + const uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_s32(v4s), 7)); + const uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + const uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + const uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + return static_cast(vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8)); + } + + ALWAYS_INLINE bool alltrue() const + { + // MSB should be set in all 8-bit lanes. + return (vminvq_u8(vreinterpretq_u8_s32(v4s)) & 0x80) == 0x80; + } + + ALWAYS_INLINE bool allfalse() const + { + // MSB should be clear in all 8-bit lanes. 
+ return (vmaxvq_u32(vreinterpretq_u8_s32(v4s)) & 0x80) != 0x80; + } + + template + ALWAYS_INLINE GSVector4i insert8(int a) const + { + return GSVector4i(vreinterpretq_s32_u8(vsetq_lane_u8(a, vreinterpretq_u8_s32(v4s), static_cast(i)))); + } + + template + ALWAYS_INLINE int extract8() const + { + return vgetq_lane_u8(vreinterpretq_u8_s32(v4s), i); + } + + template + ALWAYS_INLINE GSVector4i insert16(int a) const + { + return GSVector4i(vreinterpretq_s32_u16(vsetq_lane_u16(a, vreinterpretq_u16_s32(v4s), static_cast(i)))); + } + + template + ALWAYS_INLINE int extract16() const + { + return vgetq_lane_u16(vreinterpretq_u16_s32(v4s), i); + } + + template + ALWAYS_INLINE GSVector4i insert32(int a) const + { + return GSVector4i(vsetq_lane_s32(a, v4s, i)); + } + + template + ALWAYS_INLINE int extract32() const + { + return vgetq_lane_s32(v4s, i); + } + + template + ALWAYS_INLINE GSVector4i insert64(s64 a) const + { + return GSVector4i(vreinterpretq_s32_s64(vsetq_lane_s64(a, vreinterpretq_s64_s32(v4s), i))); + } + + template + ALWAYS_INLINE s64 extract64() const + { + return vgetq_lane_s64(vreinterpretq_s64_s32(v4s), i); + } + + ALWAYS_INLINE static GSVector4i loadnt(const void* p) + { +#if __has_builtin(__builtin_nontemporal_store) + return GSVector4i(__builtin_nontemporal_load((int32x4_t*)p)); +#else + return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p))); +#endif + } + + ALWAYS_INLINE static GSVector4i load32(const void* p) + { + // should be ldr s0, [x0] + u32 val; + std::memcpy(&val, p, sizeof(u32)); + return GSVector4i(vsetq_lane_u32(val, vdupq_n_u32(0), 0)); + } + + ALWAYS_INLINE static GSVector4i loadl(const void* p) + { + return GSVector4i(vcombine_s32(vld1_s32((const int32_t*)p), vcreate_s32(0))); + } + + ALWAYS_INLINE static GSVector4i loadh(const void* p) + { + return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vdup_n_s64(0), vld1_s64((int64_t*)p)))); + } + + ALWAYS_INLINE static GSVector4i loadh(const void* p, const GSVector4i& v) + { + return GSVector4i( + vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v.v4s)), vld1_s64((int64_t*)p)))); + } + + ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return loadh(&v); } + + ALWAYS_INLINE static GSVector4i load(const void* pl, const void* ph) + { + return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vld1_s64((int64_t*)pl), vld1_s64((int64_t*)ph)))); + } + + template + ALWAYS_INLINE static GSVector4i load(const void* p) + { + return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p))); + } + + ALWAYS_INLINE static GSVector4i load(int i) { return GSVector4i(vsetq_lane_s32(i, vdupq_n_s32(0), 0)); } + + ALWAYS_INLINE static GSVector4i loadq(s64 i) + { + return GSVector4i(vreinterpretq_s32_s64(vsetq_lane_s64(i, vdupq_n_s64(0), 0))); + } + + ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) + { +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(v.v4s, ((int32x4_t*)p)); +#else + vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s)); +#endif + } + + ALWAYS_INLINE static void store32(void* p, const GSVector4i& v) + { + u32 val = vgetq_lane_s32(v, 0); + std::memcpy(p, &val, sizeof(u32)); + } + + ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) + { + vst1_s64((int64_t*)p, vget_low_s64(vreinterpretq_s64_s32(v.v4s))); + } + + ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) + { + vst1_s64((int64_t*)p, vget_high_s64(vreinterpretq_s64_s32(v.v4s))); + } + + ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v) + { + 
GSVector4i::storel(pl, v); + GSVector4i::storeh(ph, v); + } + + template + ALWAYS_INLINE static void store(void* p, const GSVector4i& v) + { + vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s)); + } + + ALWAYS_INLINE static int store(const GSVector4i& v) { return vgetq_lane_s32(v.v4s, 0); } + + ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return vgetq_lane_s64(vreinterpretq_s64_s32(v.v4s), 0); } + + ALWAYS_INLINE void operator&=(const GSVector4i& v) + { + v4s = vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))); + } + + ALWAYS_INLINE void operator|=(const GSVector4i& v) + { + v4s = vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))); + } + + ALWAYS_INLINE void operator^=(const GSVector4i& v) + { + v4s = vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))); + } + + ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2) + { + return GSVector4i(vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s)))); + } + + ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2) + { + return GSVector4i(vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s)))); + } + + ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2) + { + return GSVector4i(vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s)))); + } + + ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, int i) { return v & GSVector4i(i); } + + ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, int i) { return v | GSVector4i(i); } + + ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, int i) { return v ^ GSVector4i(i); } + + ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return GSVector4i(vmvnq_s32(v.v4s)); } + + ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(0); } + + ALWAYS_INLINE static GSVector4i xffffffff() { return GSVector4i(0xFFFFFFFF); } + + ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); } + + ALWAYS_INLINE GSVector2i xy() const + { + GSVector2i ret; + storel(&ret, *this); + return ret; + } + + ALWAYS_INLINE GSVector2i zw() const + { + GSVector2i ret; + storeh(&ret, *this); + return ret; + } + + // clang-format off + + +#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const { return GSVector4i(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); } + + // ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const {return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn)));} + // ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const {return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} + // ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const {return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} + // ALWAYS_INLINE GSVector4i xs##ys##zs##ws##lh() const {return GSVector4i(_mm_shufflehi_epi16(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)), _MM_SHUFFLE(wn, zn, yn, xn)));} + +#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ + +#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ + VECTOR4i_SHUFFLE_3(xs, xn, 
ys, yn, y, 1) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ + +#define VECTOR4i_SHUFFLE_1(xs, xn) \ + VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \ + VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \ + VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \ + VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \ + + VECTOR4i_SHUFFLE_1(x, 0) + VECTOR4i_SHUFFLE_1(y, 1) + VECTOR4i_SHUFFLE_1(z, 2) + VECTOR4i_SHUFFLE_1(w, 3) + + // TODO: Make generic like above. + ALWAYS_INLINE GSVector4i xxzzlh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 0, 2, 2, 4, 4, 6, 6))); } + ALWAYS_INLINE GSVector4i yywwlh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 1, 1, 3, 3, 5, 5, 7, 7))); } + ALWAYS_INLINE GSVector4i yxwzlh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 1, 0, 3, 2, 5, 4, 7, 6))); } + ALWAYS_INLINE GSVector4i xxxxlh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 0, 0, 0, 4, 4, 4, 4))); } + + ALWAYS_INLINE GSVector4i xxxxl() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 0, 0, 0, 4, 5, 6, 7))); } + ALWAYS_INLINE GSVector4i zwxyl() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 2, 3, 0, 1, 4, 5, 6, 7))); } + ALWAYS_INLINE GSVector4i yxwzl() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 1, 0, 3, 2, 4, 5, 6, 7))); } + ALWAYS_INLINE GSVector4i zwzwl() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 2, 3, 2, 3, 4, 5, 6, 7))); } + + ALWAYS_INLINE GSVector4i zzzzh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 1, 2, 3, 6, 6, 6, 6))); } + + // clang-format on +}; + +class alignas(16) GSVector4 +{ + struct cxpr_init_tag + { + }; + static constexpr cxpr_init_tag cxpr_init{}; + + constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {} + + constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {} + + constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {} + +public: + union + { + struct + { + float x, y, z, w; + }; + struct + { + float r, g, b, a; + }; + struct + { + float left, top, right, bottom; + }; + float F32[4]; + double F64[2]; + s8 I8[16]; + s16 I16[8]; + s32 I32[4]; + s64 I64[2]; + u8 U8[16]; + u16 U16[8]; + u32 U32[4]; + u64 U64[2]; + float32x4_t v4s; + }; + + GSVector4() = default; + + constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); } + + constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); } + + constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); } + + constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); } + + constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); } + + constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); } + + ALWAYS_INLINE GSVector4(float x, float y, float z, float 
w) + { + const float arr[4] = {x, y, z, w}; + v4s = vld1q_f32(arr); + } + + ALWAYS_INLINE GSVector4(float x, float y) + { + v4s = vzip1q_f32(vsetq_lane_f32(x, vdupq_n_f32(0.0f), 0), vsetq_lane_f32(y, vdupq_n_f32(0.0f), 0)); + } + + ALWAYS_INLINE GSVector4(int x, int y, int z, int w) + { + const int arr[4] = {x, y, z, w}; + v4s = vcvtq_f32_s32(vld1q_s32(arr)); + } + + ALWAYS_INLINE GSVector4(int x, int y) + { + v4s = vcvtq_f32_s32(vzip1q_s32(vsetq_lane_s32(x, vdupq_n_s32(0), 0), vsetq_lane_s32(y, vdupq_n_s32(0), 0))); + } + + ALWAYS_INLINE explicit GSVector4(const GSVector2& v) { v4s = vcombine_f32(vld1_f32(v.v), vcreate_f32(0)); } + + ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) + { + v4s = vcvtq_f32_s32(vcombine_s32(vld1_s32(v.v), vcreate_s32(0))); + } + + ALWAYS_INLINE constexpr explicit GSVector4(float32x4_t m) : v4s(m) {} + + ALWAYS_INLINE explicit GSVector4(float f) { v4s = vdupq_n_f32(f); } + + ALWAYS_INLINE explicit GSVector4(int i) { v4s = vcvtq_f32_s32(vdupq_n_s32(i)); } + + ALWAYS_INLINE explicit GSVector4(const GSVector4i& v); + + ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v); + + ALWAYS_INLINE static GSVector4 f64(double x, double y) + { + return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1))); + } + + ALWAYS_INLINE void operator=(float f) { v4s = vdupq_n_f32(f); } + + ALWAYS_INLINE void operator=(float32x4_t m) { v4s = m; } + + ALWAYS_INLINE operator float32x4_t() const { return v4s; } + + /// Makes Clang think that the whole vector is needed, preventing it from changing shuffles around because it thinks + /// we don't need the whole vector Useful for e.g. preventing clang from optimizing shuffles that remove + /// possibly-denormal garbage data from vectors before computing with them + ALWAYS_INLINE GSVector4 noopt() + { + // Note: Clang is currently the only compiler that attempts to optimize vector intrinsics, if that changes in the + // future the implementation should be updated +#ifdef __clang__ + // __asm__("":"+x"(m)::); +#endif + return *this; + } + + ALWAYS_INLINE u32 rgba32() const { return GSVector4i(*this).rgba32(); } + + ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); } + + ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); } + + ALWAYS_INLINE GSVector4 abs() const { return GSVector4(vabsq_f32(v4s)); } + + ALWAYS_INLINE GSVector4 neg() const { return GSVector4(vnegq_f32(v4s)); } + + ALWAYS_INLINE GSVector4 rcp() const { return GSVector4(vrecpeq_f32(v4s)); } + + ALWAYS_INLINE GSVector4 rcpnr() const + { + float32x4_t recip = vrecpeq_f32(v4s); + recip = vmulq_f32(recip, vrecpsq_f32(recip, v4s)); + return GSVector4(recip); + } + + template + ALWAYS_INLINE GSVector4 round() const + { + if constexpr (mode == Round_NegInf) + return floor(); + else if constexpr (mode == Round_PosInf) + return ceil(); + else if constexpr (mode == Round_NearestInt) + return GSVector4(vrndnq_f32(v4s)); + else + return GSVector4(vrndq_f32(v4s)); + } + + ALWAYS_INLINE GSVector4 floor() const { return GSVector4(vrndmq_f32(v4s)); } + + ALWAYS_INLINE GSVector4 ceil() const { return GSVector4(vrndpq_f32(v4s)); } + + ALWAYS_INLINE GSVector4 madd(const GSVector4& a, const GSVector4& b) const + { + return GSVector4(vfmaq_f32(b.v4s, v4s, a.v4s)); + } + ALWAYS_INLINE GSVector4 msub(const GSVector4& a, const GSVector4& b) const + { + return GSVector4(vfmsq_f32(b.v4s, v4s, a.v4s)); + } + ALWAYS_INLINE GSVector4 nmadd(const GSVector4& a, const 
GSVector4& b) const { return b - *this * a; } + ALWAYS_INLINE GSVector4 nmsub(const GSVector4& a, const GSVector4& b) const { return -b - *this * a; } + + ALWAYS_INLINE GSVector4 addm(const GSVector4& a, const GSVector4& b) const + { + return a.madd(b, *this); // *this + a * b + } + + ALWAYS_INLINE GSVector4 subm(const GSVector4& a, const GSVector4& b) const + { + return a.nmadd(b, *this); // *this - a * b + } + + ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(vpaddq_f32(v4s, v4s)); } + + ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(vpaddq_f32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4 hsub() const { return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v4s), vuzp2q_f32(v4s, v4s))); } + + ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const + { + return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v.v4s), vuzp2q_f32(v4s, v.v4s))); + } + + ALWAYS_INLINE GSVector4 sat(const GSVector4& a, const GSVector4& b) const { return max(a).min(b); } + + ALWAYS_INLINE GSVector4 sat(const GSVector4& a) const + { + const GSVector4 minv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 0))); + const GSVector4 maxv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 1))); + return sat(minv, maxv); + } + + ALWAYS_INLINE GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); } + + ALWAYS_INLINE GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); } + + ALWAYS_INLINE GSVector4 min(const GSVector4& a) const { return GSVector4(vminq_f32(v4s, a.v4s)); } + + ALWAYS_INLINE GSVector4 max(const GSVector4& a) const { return GSVector4(vmaxq_f32(v4s, a.v4s)); } + + template + ALWAYS_INLINE GSVector4 blend32(const GSVector4& a) const + { + return GSVector4(__builtin_shufflevector(v4s, a.v4s, (mask & 1) ? 4 : 0, (mask & 2) ? 5 : 1, (mask & 4) ? 6 : 2, + (mask & 8) ? 
7 : 3)); + } + + ALWAYS_INLINE GSVector4 blend32(const GSVector4& a, const GSVector4& mask) const + { + // duplicate sign bit across and bit select + const uint32x4_t bitmask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(mask.v4s), 31)); + return GSVector4(vbslq_f32(bitmask, a.v4s, v4s)); + } + + ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const { return GSVector4(vzip1q_f32(v4s, a.v4s)); } + + ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const { return GSVector4(vzip2q_f32(v4s, a.v4s)); } + + ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const + { + return GSVector4(vreinterpretq_f32_f64(vzip1q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s)))); + } + + ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const + { + return GSVector4(vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s)))); + } + + ALWAYS_INLINE GSVector4 l2h(const GSVector4& a) const + { + return GSVector4(vcombine_f32(vget_low_f32(v4s), vget_low_f32(a.v4s))); + } + + ALWAYS_INLINE GSVector4 h2l(const GSVector4& a) const + { + return GSVector4(vcombine_f32(vget_high_f32(v4s), vget_high_f32(a.v4s))); + } + + ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const + { + return GSVector4(vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(v4s), vreinterpretq_s32_f32(v.v4s)))); + } + + ALWAYS_INLINE int mask() const + { + static constexpr const int32_t shifts[] = {0, 1, 2, 3}; + return static_cast(vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31), vld1q_s32(shifts)))); + } + + ALWAYS_INLINE bool alltrue() const + { + // return mask() == 0xf; + return ~(vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 0) & vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 1)) == 0; + } + + ALWAYS_INLINE bool allfalse() const + { + return (vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 0) | vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 1)) == 0; + } + + ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); } + + template + ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const + { + return GSVector4(vcopyq_laneq_f32(v4s, dst, v.v4s, src)); + } + + template + ALWAYS_INLINE int extract32() const + { + return vgetq_lane_s32(vreinterpretq_s32_f32(v4s), i); + } + + ALWAYS_INLINE static GSVector4 zero() { return GSVector4(vdupq_n_f32(0.0f)); } + + ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); } + + ALWAYS_INLINE static GSVector4 loadl(const void* p) + { + return GSVector4(vcombine_f32(vld1_f32((const float*)p), vcreate_f32(0))); + } + + ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(vsetq_lane_f32(f, vmovq_n_f32(0.0f), 0)); } + + template + ALWAYS_INLINE static GSVector4 load(const void* p) + { + return GSVector4(vld1q_f32((const float*)p)); + } + + ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { vst1q_f32((float*)p, v.v4s); } + + ALWAYS_INLINE static void storel(void* p, const GSVector4& v) + { + vst1_f64((double*)p, vget_low_f64(vreinterpretq_f64_f32(v.v4s))); + } + + ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) + { + vst1_f64((double*)p, vget_high_f64(vreinterpretq_f64_f32(v.v4s))); + } + + template + ALWAYS_INLINE static void store(void* p, const GSVector4& v) + { + vst1q_f32((float*)p, v.v4s); + } + + ALWAYS_INLINE static void store(float* p, const GSVector4& v) { vst1q_lane_f32(p, v.v4s, 0); } + + ALWAYS_INLINE GSVector4 operator-() const { return neg(); } + + ALWAYS_INLINE void operator+=(const GSVector4& v) { 
v4s = vaddq_f32(v4s, v.v4s); } + ALWAYS_INLINE void operator-=(const GSVector4& v) { v4s = vsubq_f32(v4s, v.v4s); } + ALWAYS_INLINE void operator*=(const GSVector4& v) { v4s = vmulq_f32(v4s, v.v4s); } + ALWAYS_INLINE void operator/=(const GSVector4& v) { v4s = vdivq_f32(v4s, v.v4s); } + + ALWAYS_INLINE void operator+=(float f) { *this += GSVector4(f); } + ALWAYS_INLINE void operator-=(float f) { *this -= GSVector4(f); } + ALWAYS_INLINE void operator*=(float f) { *this *= GSVector4(f); } + ALWAYS_INLINE void operator/=(float f) { *this /= GSVector4(f); } + + ALWAYS_INLINE void operator&=(const GSVector4& v) + { + v4s = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s))); + } + + ALWAYS_INLINE void operator|=(const GSVector4& v) + { + v4s = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s))); + } + + ALWAYS_INLINE void operator^=(const GSVector4& v) + { + v4s = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s))); + } + + ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vaddq_f32(v1.v4s, v2.v4s)); + } + + ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vsubq_f32(v1.v4s, v2.v4s)); + } + + ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vmulq_f32(v1.v4s, v2.v4s)); + } + + ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vdivq_f32(v1.v4s, v2.v4s)); + } + + ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); } + ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); } + ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); } + ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f) { return v / GSVector4(f); } + + ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s)))); + } + + ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s)))); + } + + ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s)))); + } + + ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(vceqq_f32(v1.v4s, v2.v4s))); + } + + ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2) + { + // NEON has no != + return GSVector4(vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(v1.v4s, v2.v4s)))); + } + + ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(vcgtq_f32(v1.v4s, v2.v4s))); + } + + ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(vcltq_f32(v1.v4s, v2.v4s))); + } + + ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(vcgeq_f32(v1.v4s, v2.v4s))); + } + + ALWAYS_INLINE friend GSVector4 
operator<=(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s))); + } + + ALWAYS_INLINE GSVector4 mul64(const GSVector4& v) const + { + return GSVector4(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))); + } + + ALWAYS_INLINE GSVector4 add64(const GSVector4& v) const + { + return GSVector4(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))); + } + + ALWAYS_INLINE GSVector4 sub64(const GSVector4& v) const + { + return GSVector4(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))); + } + + ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v) + { + return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vget_low_f32(v.v4s)))); + } + + ALWAYS_INLINE static GSVector4 f32to64(const void* p) + { + return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast(p))))); + } + + ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const + { + const float64x2_t r = truncate ? v4s : vrndiq_f64(vreinterpretq_f64_f32(v4s)); + const s32 low = static_cast(vgetq_lane_f64(r, 0)); + const s32 high = static_cast(vgetq_lane_f64(r, 1)); + return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1)); + } + + // clang-format off + +#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); } \ + ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v) const { return GSVector4(__builtin_shufflevector(v4s, v.v4s, xn, yn, 4 + zn, 4 + wn)); } + +#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ + +#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ + +#define VECTOR4_SHUFFLE_1(xs, xn) \ + VECTOR4_SHUFFLE_2(xs, xn, x, 0) \ + VECTOR4_SHUFFLE_2(xs, xn, y, 1) \ + VECTOR4_SHUFFLE_2(xs, xn, z, 2) \ + VECTOR4_SHUFFLE_2(xs, xn, w, 3) \ + + VECTOR4_SHUFFLE_1(x, 0) + VECTOR4_SHUFFLE_1(y, 1) + VECTOR4_SHUFFLE_1(z, 2) + VECTOR4_SHUFFLE_1(w, 3) + + // clang-format on + + ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(vdupq_laneq_f32(v4s, 0)); } + + ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v) { return GSVector4(vdupq_laneq_f32(v.v4s, 0)); } + + ALWAYS_INLINE static GSVector4 broadcast32(const void* f) { return GSVector4(vld1q_dup_f32((const float*)f)); } + + ALWAYS_INLINE static GSVector4 broadcast64(const void* f) + { + return GSVector4(vreinterpretq_f64_f32(vld1q_dup_f64((const double*)f))); + } +}; + +ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate) +{ + v4s = truncate ? 
vcvtq_s32_f32(v.v4s) : vcvtnq_s32_f32(v.v4s); +} + +ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v) +{ + v4s = vcvtq_f32_s32(v.v4s); +} + +ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v) +{ + return GSVector4i(vreinterpretq_s32_f32(v.v4s)); +} + +ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v) +{ + return GSVector4(vreinterpretq_f32_s32(v.v4s)); +} diff --git a/src/common/gsvector_nosimd.h b/src/common/gsvector_nosimd.h new file mode 100644 index 000000000..b718b8e3c --- /dev/null +++ b/src/common/gsvector_nosimd.h @@ -0,0 +1,1612 @@ +// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team, 2019-2024 Connor McLaughlin +// SPDX-License-Identifier: LGPL-3.0+ + +// Implementation of GSVector4/GSVector4i when the host does not support any form of SIMD. + +#pragma once + +#include "common/assert.h" +#include "common/types.h" + +#include <algorithm> +#include <cmath> +#include <cstring> + +#define GSVECTOR_HAS_UNSIGNED 1 +#define GSVECTOR_HAS_SRLV 1 + +class GSVector4; + +#define ALL_LANES_8(expr) \ + GSVector4i ret; \ + for (size_t i = 0; i < 16; i++) \ + expr; \ + return ret; +#define ALL_LANES_16(expr) \ + GSVector4i ret; \ + for (size_t i = 0; i < 8; i++) \ + expr; \ + return ret; +#define ALL_LANES_32(expr) \ + GSVector4i ret; \ + for (size_t i = 0; i < 4; i++) \ + expr; \ + return ret; +#define ALL_LANES_64(expr) \ + GSVector4i ret; \ + for (size_t i = 0; i < 2; i++) \ + expr; \ + return ret; +#define SSATURATE8(expr) static_cast<s8>(std::clamp(expr, -128, 127)) +#define USATURATE8(expr) static_cast<u8>(std::clamp(expr, 0, 255)) +#define SSATURATE16(expr) static_cast<s16>(std::clamp(expr, -32768, 32767)) +#define USATURATE16(expr) static_cast<u16>(std::clamp(expr, 0, 65535)) + +class alignas(16) GSVector4i +{ + struct cxpr_init_tag + { + }; + static constexpr cxpr_init_tag cxpr_init{}; + + constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : I32{x, y, z, w} {} + + constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) + : I16{s0, s1, s2, s3, s4, s5, s6, s7} + { + } + + constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, + s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) + : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} + { + } + +public: + union + { + struct + { + s32 x, y, z, w; + }; + struct + { + s32 r, g, b, a; + }; + struct + { + s32 left, top, right, bottom; + }; + float F32[4]; + s8 I8[16]; + s16 I16[8]; + s32 I32[4]; + s64 I64[2]; + u8 U8[16]; + u16 U16[8]; + u32 U32[4]; + u64 U64[2]; + }; + + GSVector4i() = default; + + ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w) + { + return GSVector4i(cxpr_init, x, y, z, w); + } + + ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); } + + ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); } + + ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) + { + return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7); + } + + ALWAYS_INLINE constexpr static GSVector4i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, + s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) + { + return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15); + } + + ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w) + { + this->x = x; + this->y = y; + this->z = z; + this->w = w; + } + +
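// Editor's aside (illustrative sketch, not part of the patch): the no-SIMD header above routes every
// operation through the ALL_LANES_* macros, which expand one scalar expression per lane and return the
// accumulated result. Below is a minimal standalone example of that same pattern, using hypothetical
// names (Vec4, add32_scalar) rather than the real GSVector4i API:
#include <cstdio>

struct Vec4
{
  int I32[4]; // stand-in for the four s32 lanes of GSVector4i
};

// Hand-expanded equivalent of ALL_LANES_32(ret.I32[i] = I32[i] + v.I32[i]),
// i.e. the shape GSVector4i::add32() takes on the no-SIMD path.
static Vec4 add32_scalar(const Vec4& a, const Vec4& b)
{
  Vec4 ret;
  for (int i = 0; i < 4; i++)
    ret.I32[i] = a.I32[i] + b.I32[i];
  return ret;
}

int main()
{
  const Vec4 a{{1, 2, 3, 4}};
  const Vec4 b{{10, 20, 30, 40}};
  const Vec4 r = add32_scalar(a, b);
  std::printf("%d %d %d %d\n", r.I32[0], r.I32[1], r.I32[2], r.I32[3]); // prints 11 22 33 44
  return 0;
}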
ALWAYS_INLINE GSVector4i(s32 x, s32 y) { *this = load(x).upl32(load(y)); } + + ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) + { + I16[0] = s0; + I16[1] = s1; + I16[2] = s2; + I16[3] = s3; + I16[4] = s4; + I16[5] = s5; + I16[6] = s6; + I16[7] = s7; + } + + ALWAYS_INLINE constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, + s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) + : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} + { + } + + ALWAYS_INLINE GSVector4i(const GSVector4i& v) { std::memcpy(I32, v.I32, sizeof(I32)); } + + ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) + { + x = v.x; + y = v.y; + z = 0; + w = 0; + } + + // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), + // so leave the non-constexpr version default + ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; } + + ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true); + + ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); + + ALWAYS_INLINE void operator=(const GSVector4i& v) { std::memcpy(I32, v.I32, sizeof(I32)); } + ALWAYS_INLINE void operator=(s32 i) + { + x = i; + y = i; + z = i; + w = i; + } + + // rect + + ALWAYS_INLINE s32 width() const { return right - left; } + + ALWAYS_INLINE s32 height() const { return bottom - top; } + + ALWAYS_INLINE GSVector4i rsize() const + { + return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height()); + } + + ALWAYS_INLINE s32 rarea() const { return width() * height(); } + + ALWAYS_INLINE bool rempty() const { return lt32(zwzw()).mask() != 0x00ff; } + + // TODO: Optimize for no-simd, this generates crap code. + ALWAYS_INLINE GSVector4i runion(const GSVector4i& v) const { return min_i32(v).upl64(max_i32(v).srl<8>()); } + + ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& v) const { return sat_i32(v); } + ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); } + ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); } + + template + GSVector4i _ralign_helper(const GSVector4i& mask) const + { + GSVector4i v; + + switch (mode) + { + case Align_Inside: + v = add32(mask); + break; + case Align_Outside: + v = add32(mask.zwxy()); + break; + case Align_NegInf: + v = *this; + break; + case Align_PosInf: + v = add32(mask.xyxy()); + break; + + default: + UnreachableCode(); + break; + } + + return v.andnot(mask.xyxy()); + } + + /// Align the rect using mask values that already have one subtracted (1 << n - 1 aligns to 1 << n) + template + GSVector4i ralign_presub(const GSVector2i& v) const + { + return _ralign_helper(GSVector4i(v)); + } + + template + GSVector4i ralign(const GSVector2i& v) const + { + // a must be 1 << n + + return _ralign_helper(GSVector4i(v).sub32(GSVector4i(1, 1))); + } + + // + + ALWAYS_INLINE u32 rgba32() const + { + GSVector4i v = *this; + + v = v.ps32(v); + v = v.pu16(v); + + return (u32)store(v); + } + + ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& min, const GSVector4i& max) const + { + return max_i8(min).min_i8(max); + } + ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& minmax) const + { + return max_i8(minmax.xyxy()).min_i8(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& min, const GSVector4i& max) const + { + return max_i16(min).min_i16(max); + } + ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& minmax) const + { + return 
max_i16(minmax.xyxy()).min_i16(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& min, const GSVector4i& max) const + { + return max_i32(min).min_i32(max); + } + ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& minmax) const + { + return max_i32(minmax.xyxy()).min_i32(minmax.zwzw()); + } + + ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const + { + return max_u8(min).min_u8(max); + } + ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const + { + return max_u8(minmax.xyxy()).min_u8(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const + { + return max_u16(min).min_u16(max); + } + ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const + { + return max_u16(minmax.xyxy()).min_u16(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const + { + return max_u32(min).min_u32(max); + } + ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const + { + return max_u32(minmax.xyxy()).min_u32(minmax.zwzw()); + } + + GSVector4i min_i8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = std::min(I8[i], v.I8[i])); } + GSVector4i max_i8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = std::max(I8[i], v.I8[i])); } + GSVector4i min_i16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = std::min(I16[i], v.I16[i])); } + GSVector4i max_i16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = std::max(I16[i], v.I16[i])); } + GSVector4i min_i32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = std::min(I32[i], v.I32[i])); } + GSVector4i max_i32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = std::max(I32[i], v.I32[i])); } + + GSVector4i min_u8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = std::min(U8[i], v.U8[i])); } + GSVector4i max_u8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = std::max(U8[i], v.U8[i])); } + GSVector4i min_u16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = std::min(U16[i], v.U16[i])); } + GSVector4i max_u16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = std::max(U16[i], v.U16[i])); } + GSVector4i min_u32(const GSVector4i& v) const { ALL_LANES_32(ret.U32[i] = std::min(U32[i], v.U32[i])); } + GSVector4i max_u32(const GSVector4i& v) const { ALL_LANES_32(ret.U32[i] = std::max(U32[i], v.U32[i])); } + + GSVector4i madd_s16(const GSVector4i& v) const + { + ALL_LANES_32(ret.I32[i] = (I16[i * 2] * v.I16[i * 2]) + (I16[i * 2 + 1] * v.I16[i * 2 + 1])); + } + + GSVector4i addp_s32() const { return GSVector4i(x + y, z + w, 0, 0); } + + s32 minv_s32() const { return std::min(x, std::min(y, std::min(z, w))); } + + u32 minv_u32() const { return std::min(U32[0], std::min(U32[1], std::min(U32[2], U32[3]))); } + + s32 maxv_s32() const { return std::max(x, std::max(y, std::max(z, w))); } + + u32 maxv_u32() const { return std::max(U32[0], std::max(U32[1], std::max(U32[2], U32[3]))); } + + static s32 min_i16(s32 a, s32 b) { return store(load(a).min_i16(load(b))); } + + ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); } + + GSVector4i blend8(const GSVector4i& v, const GSVector4i& mask) const + { + GSVector4i ret; + for (size_t i = 0; i < 16; i++) + ret.U8[i] = (mask.U8[i] & 0x80) ? v.U8[i] : U8[i]; + return ret; + } + + template + GSVector4i blend16(const GSVector4i& v) const + { + GSVector4i ret; + for (size_t i = 0; i < 8; i++) + ret.U16[i] = ((mask & (1 << i)) != 0) ? 
v.U16[i] : U16[i]; + return ret; + } + + template + GSVector4i blend32(const GSVector4i& v) const + { + GSVector4i ret; + for (size_t i = 0; i < 4; i++) + ret.U32[i] = ((mask & (1 << i)) != 0) ? v.U32[i] : U32[i]; + return ret; + } + + GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const + { + GSVector4i ret; + for (size_t i = 0; i < 2; i++) + ret.U64[0] = (v.U64[i] & mask.U64[i]) | (U64[i] & ~mask.U64[i]); + return ret; + } + + ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); } + + GSVector4i shuffle8(const GSVector4i& mask) const + { + ALL_LANES_8(ret.I8[i] = (mask.I8[i] & 0x80) ? 0 : (I8[mask.I8[i] & 0xf])); + } + + GSVector4i ps16(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8((i < 8) ? I16[i] : v.I16[i])); } + GSVector4i ps16() const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I16[i])); } + GSVector4i pu16(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE16((i < 8) ? U16[i] : v.U16[i])); } + GSVector4i pu16() const { ALL_LANES_8(ret.U8[i] = USATURATE8(U16[i])); } + GSVector4i ps32(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = SSATURATE16((i < 8) ? I32[i] : v.I32[i])); } + GSVector4i ps32() const { ALL_LANES_16(ret.I16[i] = SSATURATE8(I32[i])); } + GSVector4i pu32(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16((i < 8) ? U32[i] : v.U32[i])); } + GSVector4i pu32() const { ALL_LANES_16(ret.U16[i] = USATURATE8(U32[i])); } + + GSVector4i upl8(const GSVector4i& v) const + { + return GSVector4i(I8[0], v.I8[0], I8[1], v.I8[1], I8[2], v.I8[2], I8[3], v.I8[3], I8[4], v.I8[4], I8[5], v.I8[5], + I8[6], v.I8[6], I8[7], v.I8[7]); + } + GSVector4i uph8(const GSVector4i& v) const + { + return GSVector4i(I8[8], v.I8[8], I8[9], v.I8[9], I8[10], v.I8[10], I8[11], v.I8[11], I8[12], v.I8[12], I8[13], + v.I8[13], I8[14], v.I8[14], I8[15], v.I8[15]); + } + GSVector4i upl16(const GSVector4i& v) const + { + return GSVector4i(I16[0], v.I16[0], I16[1], v.I16[1], I16[2], v.I16[2], I16[3], v.I16[3]); + } + GSVector4i uph16(const GSVector4i& v) const + { + return GSVector4i(I16[4], v.I16[4], I16[5], v.I16[5], I16[6], v.I16[6], I16[7], v.I16[7]); + } + GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(I32[0], v.I32[0], I32[1], v.I32[1]); } + GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(I32[2], v.I32[2], I32[3], v.I32[3]); } + GSVector4i upl64(const GSVector4i& v) const + { + GSVector4i ret; + ret.I64[0] = I64[0]; + ret.I64[1] = v.I64[0]; + return ret; + } + GSVector4i uph64(const GSVector4i& v) const + { + GSVector4i ret; + ret.I64[0] = I64[1]; + ret.I64[1] = v.I64[1]; + return ret; + } + + GSVector4i upl8() const + { + return GSVector4i(I8[0], 0, I8[1], 0, I8[2], 0, I8[3], 0, I8[4], 0, I8[5], 0, I8[6], 0, I8[7], 0); + } + GSVector4i uph8() const + { + return GSVector4i(I8[8], 0, I8[9], 0, I8[10], 0, I8[11], 0, I8[12], 0, I8[13], 0, I8[14], 0, I8[15], 0); + } + + GSVector4i upl16() const { return GSVector4i(I16[0], 0, I16[1], 0, I16[2], 0, I16[3], 0); } + GSVector4i uph16() const { return GSVector4i(I16[4], 0, I16[5], 0, I16[6], 0, I16[7], 0); } + + GSVector4i upl32() const { return GSVector4i(I32[0], 0, I32[1], 0); } + GSVector4i uph32() const { return GSVector4i(I32[2], 0, I32[3], 0); } + GSVector4i upl64() const + { + GSVector4i ret; + ret.I64[0] = I64[0]; + ret.I64[1] = 0; + return ret; + } + GSVector4i uph64() const + { + GSVector4i ret; + ret.I64[0] = I64[1]; + ret.I64[1] = 0; + return ret; + } + + GSVector4i i8to16() const { ALL_LANES_16(ret.I16[i] = I8[i]); } + GSVector4i 
i8to32() const { ALL_LANES_32(ret.I32[i] = I8[i]); } + GSVector4i i8to64() const { ALL_LANES_64(ret.I64[i] = I8[i]); } + + GSVector4i i16to32() const { ALL_LANES_32(ret.I32[i] = I16[i]); } + GSVector4i i16to64() const { ALL_LANES_64(ret.I64[i] = I16[i]); } + GSVector4i i32to64() const { ALL_LANES_64(ret.I64[i] = I32[i]); } + GSVector4i u8to16() const { ALL_LANES_64(ret.U16[i] = U8[i]); } + GSVector4i u8to32() const { ALL_LANES_32(ret.U32[i] = U8[i]); } + GSVector4i u8to64() const { ALL_LANES_64(ret.U64[i] = U8[i]); } + GSVector4i u16to32() const { ALL_LANES_32(ret.U32[i] = U16[i]); } + GSVector4i u16to64() const { ALL_LANES_64(ret.U64[i] = U16[i]); } + GSVector4i u32to64() const { ALL_LANES_64(ret.U64[i] = U32[i]); } + + template + GSVector4i srl() const + { + GSVector4i ret = {}; + if constexpr (v < 16) + { + for (s32 i = 0; i < (16 - v); i++) + ret.U8[i] = U8[v + i]; + } + return ret; + } + + template + GSVector4i srl(const GSVector4i& r) + { + // This sucks. Hopefully it's never used. + u8 concat[32]; + std::memcpy(concat, U8, sizeof(u8) * 16); + std::memcpy(concat + 16, r.U8, sizeof(u8) * 16); + + GSVector4i ret; + std::memcpy(ret.U8, &concat[v], sizeof(u8) * 16); + return ret; + } + + template + GSVector4i sll() const + { + GSVector4i ret = {}; + if constexpr (v < 16) + { + for (s32 i = 0; i < (16 - v); i++) + ret.U8[v + i] = U8[i]; + } + return ret; + } + + template + GSVector4i sll16() const + { + ALL_LANES_16(ret.U16[i] = U16[i] << v); + } + + GSVector4i sll16(s32 v) const { ALL_LANES_16(ret.U16[i] = U16[i] << v); } + + GSVector4i sllv16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = U16[i] << v.U16[i]); } + + template + GSVector4i srl16() const + { + ALL_LANES_16(ret.U16[i] = U16[i] >> v); + } + + GSVector4i srl16(s32 v) const { ALL_LANES_16(ret.U16[i] = U16[i] >> v); } + + GSVector4i srlv16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = U16[i] >> v.U16[i]); } + + template + GSVector4i sra16() const + { + ALL_LANES_16(ret.I16[i] = I16[i] >> v); + } + + GSVector4i sra16(s32 v) const { ALL_LANES_16(ret.I16[i] = I16[i] >> v); } + + GSVector4i srav16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] >> v.I16[i]); } + + template + GSVector4i sll32() const + { + ALL_LANES_32(ret.U32[i] = U32[i] << v); + } + + GSVector4i sll32(s32 v) const { ALL_LANES_32(ret.U32[i] = U32[i] << v); } + + GSVector4i sllv32(const GSVector4i& v) const { ALL_LANES_32(ret.U32[i] = U32[i] << v.U32[i]); } + + template + GSVector4i srl32() const + { + ALL_LANES_32(ret.U32[i] = U32[i] >> v); + } + + GSVector4i srl32(s32 v) const { ALL_LANES_32(ret.U32[i] = U32[i] >> v); } + + GSVector4i srlv32(const GSVector4i& v) const { ALL_LANES_32(ret.U32[i] = U32[i] >> v.U32[i]); } + + template + GSVector4i sra32() const + { + ALL_LANES_32(ret.I32[i] = I32[i] >> v); + } + + GSVector4i sra32(s32 v) const { ALL_LANES_32(ret.I32[i] = I32[i] >> v); } + + GSVector4i srav32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] >> v.I32[i]); } + + template + GSVector4i sll64() const + { + ALL_LANES_64(ret.U64[i] = U64[i] << v); + } + + GSVector4i sll64(s32 v) const { ALL_LANES_64(ret.U64[i] = U64[i] << v); } + + GSVector4i sllv64(const GSVector4i& v) const { ALL_LANES_64(ret.U64[i] = U64[i] << v.U64[i]); } + + template + GSVector4i srl64() const + { + ALL_LANES_64(ret.U64[i] = U64[i] >> v); + } + + GSVector4i srl64(s32 v) const { ALL_LANES_64(ret.U64[i] = U64[i] >> v); } + + GSVector4i srlv64(const GSVector4i& v) const { ALL_LANES_64(ret.U64[i] = U64[i] >> v.U64[i]); } + + template + GSVector4i 
sra64() const + { + ALL_LANES_64(ret.I64[i] = I64[i] >> v); + } + + GSVector4i sra64(s32 v) const { ALL_LANES_64(ret.I64[i] = I64[i] >> v); } + + GSVector4i srav64(const GSVector4i& v) const { ALL_LANES_64(ret.I64[i] = I64[i] >> v.I64[i]); } + + GSVector4i add8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = I8[i] + v.I8[i]); } + + GSVector4i add16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] + v.I16[i]); } + + GSVector4i add32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] + v.I32[i]); } + + GSVector4i adds8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I8[i] + v.I8[i])); } + + GSVector4i adds16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I16[i] + v.I16[i])); } + + GSVector4i hadds16(const GSVector4i& v) const + { + return GSVector4i(SSATURATE16(I16[0] + I16[1]), SSATURATE16(I16[2] + I16[3]), SSATURATE16(I16[4] + I16[5]), + SSATURATE16(I16[6] + I16[7]), SSATURATE16(v.I16[0] + v.I16[1]), SSATURATE16(v.I16[2] + v.I16[3]), + SSATURATE16(v.I16[4] + v.I16[5]), SSATURATE16(v.I16[6] + v.I16[7])); + } + + GSVector4i addus8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8(U8[i] + v.U8[i])); } + + GSVector4i addus16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16(U16[i] + v.U16[i])); } + + GSVector4i sub8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = I8[i] - v.I8[i]); } + + GSVector4i sub16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] - v.I16[i]); } + + GSVector4i sub32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] - v.I32[i]); } + + GSVector4i subs8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I8[i] - v.I8[i])); } + + GSVector4i subs16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I16[i] - v.I16[i])); } + + GSVector4i subus8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8(U8[i] - v.U8[i])); } + + GSVector4i subus16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16(U16[i] - v.U16[i])); } + + GSVector4i avg8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = (U8[i] + v.U8[i]) >> 1); } + + GSVector4i avg16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = (U16[i] + v.U16[i]) >> 1); } + + GSVector4i mul16hs(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] * v.I16[i]) >> 16); } + + GSVector4i mul16hu(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = (U16[i] * v.U16[i]) >> 16); } + + GSVector4i mul16l(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] * v.I16[i]); } + + GSVector4i mul16hrs(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = ((I16[i] * v.I16[i]) >> 14) + 1); } + + GSVector4i mul32l(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] * v.I32[i]); } + + template + ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const + { + // (a - this) * f << shift + this + + return add16(a.sub16(*this).modulate16(f)); + } + + template + ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c) + { + // (a - b) * c << shift + + return a.sub16(b).modulate16(c); + } + + template + ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c, + const GSVector4i& d) + { + // (a - b) * c << shift + d + + return d.add16(a.sub16(b).modulate16(c)); + } + + ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a_, const GSVector4i& f) const + { + // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit) + + return 
add16(a_.sub16(*this).mul16l(f).sra16<4>()); + } + + template + ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const + { + // a * f << shift + if constexpr (shift == 0) + { + return mul16hrs(f); + } + + return sll16().mul16hs(f); + } + + ALWAYS_INLINE bool eq(const GSVector4i& v) const { return (std::memcmp(I32, v.I32, sizeof(I32))) == 0; } + + GSVector4i eq8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] == v.I8[i]) ? -1 : 0); } + GSVector4i eq16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] == v.I16[i]) ? -1 : 0); } + GSVector4i eq32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] == v.I32[i]) ? -1 : 0); } + GSVector4i eq64(const GSVector4i& v) const { ALL_LANES_64(ret.I64[i] = (I64[i] == v.I64[i]) ? -1 : 0); } + + GSVector4i neq8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] != v.I8[i]) ? -1 : 0); } + GSVector4i neq16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] != v.I16[i]) ? -1 : 0); } + GSVector4i neq32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] != v.I32[i]) ? -1 : 0); } + + GSVector4i gt8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] > v.I8[i]) ? -1 : 0); } + GSVector4i gt16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] > v.I16[i]) ? -1 : 0); } + GSVector4i gt32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] > v.I32[i]) ? -1 : 0); } + + GSVector4i ge8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] >= v.I8[i]) ? -1 : 0); } + GSVector4i ge16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] >= v.I16[i]) ? -1 : 0); } + GSVector4i ge32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] >= v.I32[i]) ? -1 : 0); } + + GSVector4i lt8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] < v.I8[i]) ? -1 : 0); } + GSVector4i lt16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] < v.I16[i]) ? -1 : 0); } + GSVector4i lt32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] < v.I32[i]) ? -1 : 0); } + + GSVector4i le8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] <= v.I8[i]) ? -1 : 0); } + GSVector4i le16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] <= v.I16[i]) ? -1 : 0); } + GSVector4i le32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] <= v.I32[i]) ? 
-1 : 0); } + + ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { ALL_LANES_64(ret.U64[i] = (~v.U64[i]) & U64[i]); } + + s32 mask() const + { + return static_cast((static_cast(U8[0] >> 7) << 0) | (static_cast(U8[1] >> 7) << 1) | + (static_cast(U8[2] >> 7) << 2) | (static_cast(U8[3] >> 7) << 3) | + (static_cast(U8[4] >> 7) << 4) | (static_cast(U8[5] >> 7) << 5) | + (static_cast(U8[6] >> 7) << 6) | (static_cast(U8[7] >> 7) << 7) | + (static_cast(U8[8] >> 7) << 8) | (static_cast(U8[9] >> 7) << 9) | + (static_cast(U8[10] >> 7) << 10) | (static_cast(U8[11] >> 7) << 11) | + (static_cast(U8[12] >> 7) << 12) | (static_cast(U8[13] >> 7) << 13) | + (static_cast(U8[14] >> 7) << 14) | (static_cast(U8[15] >> 7) << 15)); + } + + ALWAYS_INLINE bool alltrue() const { return ((U64[0] & U64[1]) == 0xFFFFFFFFFFFFFFFFULL); } + + ALWAYS_INLINE bool allfalse() const { return ((U64[0] | U64[1]) == 0); } + + template + ALWAYS_INLINE GSVector4i insert8(s32 a) const + { + GSVector4i ret = *this; + ret.I8[i] = static_cast(a); + return ret; + } + + template + ALWAYS_INLINE s32 extract8() const + { + return I8[i]; + } + + template + ALWAYS_INLINE GSVector4i insert16(s32 a) const + { + GSVector4i ret = *this; + ret.I16[i] = static_cast(a); + return ret; + } + + template + ALWAYS_INLINE s32 extract16() const + { + return I16[i]; + } + + template + ALWAYS_INLINE GSVector4i insert32(s32 a) const + { + GSVector4i ret = *this; + ret.I32[i] = a; + return ret; + } + + template + ALWAYS_INLINE s32 extract32() const + { + return I32[i]; + } + + template + ALWAYS_INLINE GSVector4i insert64(s64 a) const + { + GSVector4i ret = *this; + ret.I64[i] = a; + return ret; + } + + template + ALWAYS_INLINE s64 extract64() const + { + return I64[i]; + } + + ALWAYS_INLINE static GSVector4i loadnt(const void* p) + { + GSVector4i ret; + std::memcpy(&ret, p, sizeof(ret.I32)); + return ret; + } + + ALWAYS_INLINE static GSVector4i load32(const void* p) + { + GSVector4i ret; + std::memcpy(&ret.x, p, sizeof(s32)); + ret.y = 0; + ret.z = 0; + ret.w = 0; + return ret; + } + + ALWAYS_INLINE static GSVector4i loadl(const void* p) + { + GSVector4i ret; + std::memcpy(&ret.U64[0], p, sizeof(ret.U64[0])); + ret.U64[1] = 0; + return ret; + } + + ALWAYS_INLINE static GSVector4i loadh(const void* p) + { + GSVector4i ret; + ret.U64[0] = 0; + std::memcpy(&ret.U64[1], p, sizeof(ret.U64[1])); + return ret; + } + + ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return loadh(&v); } + + template + ALWAYS_INLINE static GSVector4i load(const void* p) + { + GSVector4i ret; + std::memcpy(ret.I32, p, sizeof(ret.I32)); + return ret; + } + + ALWAYS_INLINE static GSVector4i load(s32 i) + { + GSVector4i ret; + ret.x = i; + ret.y = 0; + ret.z = 0; + ret.w = 0; + return ret; + } + + ALWAYS_INLINE static GSVector4i loadq(s64 i) + { + GSVector4i ret; + ret.I64[0] = i; + ret.I64[1] = 0; + return ret; + } + + ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { std::memcpy(p, v.I32, sizeof(v.I32)); } + + ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { std::memcpy(p, &v.I32[0], sizeof(s32) * 2); } + + ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) { std::memcpy(p, &v.I32[2], sizeof(s32) * 2); } + + ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v) + { + GSVector4i::storel(pl, v); + GSVector4i::storeh(ph, v); + } + + template + ALWAYS_INLINE static void store(void* p, const GSVector4i& v) + { + std::memcpy(p, v.I32, sizeof(I32)); + } + + ALWAYS_INLINE static void store32(void* p, const 
GSVector4i& v) { std::memcpy(p, &v.x, sizeof(s32)); } + + ALWAYS_INLINE static s32 store(const GSVector4i& v) { return v.x; } + + ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return v.I64[0]; } + + ALWAYS_INLINE void operator&=(const GSVector4i& v) + { + U64[0] &= v.U64[0]; + U64[1] &= v.U64[1]; + } + ALWAYS_INLINE void operator|=(const GSVector4i& v) + { + U64[0] |= v.U64[0]; + U64[1] |= v.U64[1]; + } + ALWAYS_INLINE void operator^=(const GSVector4i& v) + { + U64[0] ^= v.U64[0]; + U64[1] ^= v.U64[1]; + } + + ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2) + { + GSVector4i ret; + ret.U64[0] = v1.U64[0] & v2.U64[0]; + ret.U64[1] = v1.U64[1] & v2.U64[1]; + return ret; + } + + ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2) + { + GSVector4i ret; + ret.U64[0] = v1.U64[0] | v2.U64[0]; + ret.U64[1] = v1.U64[1] | v2.U64[1]; + return ret; + } + + ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2) + { + GSVector4i ret; + ret.U64[0] = v1.U64[0] ^ v2.U64[0]; + ret.U64[1] = v1.U64[1] ^ v2.U64[1]; + return ret; + } + + ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, s32 i) { return v & GSVector4i(i); } + + ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, s32 i) { return v | GSVector4i(i); } + + ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, s32 i) { return v ^ GSVector4i(i); } + + ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return v ^ v.eq32(v); } + + ALWAYS_INLINE static constexpr GSVector4i zero() { return GSVector4i::cxpr(0, 0, 0, 0); } + + ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); } + + ALWAYS_INLINE GSVector2i xy() const + { + GSVector2i ret; + storel(&ret, *this); + return ret; + } + + ALWAYS_INLINE GSVector2i zw() const + { + GSVector2i ret; + storeh(&ret, *this); + return ret; + } + + // clang-format off + // l/h/lh not implemented until needed + +#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const {return GSVector4i(I32[xn], I32[yn], I32[zn], I32[wn]);} + +#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ + +#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ + +#define VECTOR4i_SHUFFLE_1(xs, xn) \ + VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \ + VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \ + VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \ + VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \ + + VECTOR4i_SHUFFLE_1(x, 0) + VECTOR4i_SHUFFLE_1(y, 1) + VECTOR4i_SHUFFLE_1(z, 2) + VECTOR4i_SHUFFLE_1(w, 3) + + // clang-format on +}; + +class alignas(16) GSVector4 +{ + struct cxpr_init_tag + { + }; + static constexpr cxpr_init_tag cxpr_init{}; + + constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {} + + constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {} + + constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {} + +public: + union + { + struct + { + float x, y, z, w; + }; + struct + { + float r, g, b, a; + }; + struct + { + float left, top, right, bottom; + }; + float F32[4]; + double F64[2]; + s8 I8[16]; + s16 
I16[8]; + s32 I32[4]; + s64 I64[2]; + u8 U8[16]; + u16 U16[8]; + u32 U32[4]; + u64 U64[2]; + __m128 m; + }; + + GSVector4() = default; + + constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); } + + constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); } + + constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); } + + constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); } + + constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); } + + constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); } + + ALWAYS_INLINE GSVector4(float x, float y, float z, float w) + { + this->x = x; + this->y = y; + this->z = z; + this->w = w; + } + + ALWAYS_INLINE GSVector4(float x, float y) + { + this->x = x; + this->y = y; + this->z = 0.0f; + this->w = 0.0f; + } + + ALWAYS_INLINE GSVector4(int x, int y, int z, int w) + { + this->x = static_cast(x); + this->y = static_cast(y); + this->z = static_cast(z); + this->w = static_cast(w); + } + + ALWAYS_INLINE GSVector4(int x, int y) + { + this->x = static_cast(x); + this->y = static_cast(y); + this->z = 0.0f; + this->w = 0.0f; + } + + ALWAYS_INLINE explicit GSVector4(const GSVector2& v) + { + x = v.x; + y = v.y; + z = 0.0f; + w = 0.0f; + } + + ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) + { + x = static_cast(v.x); + y = static_cast(v.y); + z = 0.0f; + w = 0.0f; + } + + ALWAYS_INLINE explicit GSVector4(float f) { x = y = z = w = f; } + + ALWAYS_INLINE explicit GSVector4(int i) { x = y = z = w = static_cast(i); } + + ALWAYS_INLINE explicit GSVector4(const GSVector4i& v); + + ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v); + + ALWAYS_INLINE static GSVector4 f64(double x, double y) + { + GSVector4 ret; + ret.F64[0] = x; + ret.F64[1] = y; + return ret; + } + + ALWAYS_INLINE void operator=(float f) { x = y = z = w = f; } + + ALWAYS_INLINE GSVector4 noopt() { return *this; } + + u32 rgba32() const { return GSVector4i(*this).rgba32(); } + + ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); } + + ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); } + + GSVector4 abs() const { return GSVector4(std::fabs(x), std::fabs(y), std::fabs(z), std::fabs(w)); } + + GSVector4 neg() const { return GSVector4(-x, -y, -z, -w); } + + GSVector4 rcp() const { return GSVector4(1.0f / x, 1.0f / y, 1.0f / z, 1.0f / w); } + + GSVector4 rcpnr() const + { + GSVector4 v_ = rcp(); + + return (v_ + v_) - (v_ * v_) * *this; + } + + GSVector4 floor() const { return GSVector4(std::floor(x), std::floor(y), std::floor(z), std::floor(w)); } + + GSVector4 ceil() const { return GSVector4(std::ceil(x), std::ceil(y), std::ceil(z), std::ceil(w)); } + + GSVector4 madd(const GSVector4& a_, const GSVector4& b_) const { return *this * a_ + b_; } + + GSVector4 msub(const GSVector4& a_, const GSVector4& b_) const { return *this * a_ - b_; } + + GSVector4 nmadd(const GSVector4& a_, const GSVector4& b_) const { return b_ - *this * a_; } + + GSVector4 nmsub(const GSVector4& a_, const GSVector4& b_) const { return -b_ - *this * a_; } + + GSVector4 addm(const GSVector4& a_, const GSVector4& b_) const + { + return a_.madd(b_, *this); // *this + a * b + } + + GSVector4 subm(const GSVector4& a_, const GSVector4& b_) const + { + return a_.nmadd(b_, *this); // *this - a * b + } + + GSVector4 
hadd() const { return GSVector4(x + y, z + w, x + y, z + w); } + + GSVector4 hadd(const GSVector4& v) const { return GSVector4(x + y, z + w, v.x + v.y, v.z + v.w); } + + GSVector4 hsub() const { return GSVector4(x - y, z - w, x - y, z - w); } + + GSVector4 hsub(const GSVector4& v) const { return GSVector4(x - y, z - w, v.x - v.y, v.z - v.w); } + + template + GSVector4 dp(const GSVector4& v) const + { + float res = 0.0f; + if constexpr (i & 0x10) + res += x * v.x; + if constexpr (i & 0x20) + res += y * v.y; + if constexpr (i & 0x40) + res += z * v.z; + if constexpr (i & 0x80) + res += w * v.w; + return GSVector4((i & 0x01) ? res : 0.0f, (i & 0x02) ? res : 0.0f, (i & 0x04) ? res : 0.0f, + (i & 0x08) ? res : 0.0f); + } + + GSVector4 sat(const GSVector4& min, const GSVector4& max) const + { + return GSVector4(std::clamp(x, min.x, max.x), std::clamp(y, min.y, max.y), std::clamp(z, min.z, max.z), + std::clamp(w, min.w, max.w)); + } + + GSVector4 sat(const GSVector4& v) const + { + return GSVector4(std::clamp(x, v.x, v.z), std::clamp(y, v.y, v.w), std::clamp(z, v.x, v.z), + std::clamp(w, v.y, v.w)); + } + + GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); } + + GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); } + + GSVector4 min(const GSVector4& v) const + { + return GSVector4(std::min(x, v.x), std::min(y, v.y), std::min(z, v.z), std::min(w, v.w)); + } + + GSVector4 max(const GSVector4& v) const + { + return GSVector4(std::max(x, v.x), std::max(y, v.y), std::max(z, v.z), std::max(w, v.w)); + } + + template + GSVector4 blend32(const GSVector4& v) const + { + return GSVector4(v.F32[mask & 1], v.F32[(mask >> 1) & 1], v.F32[(mask >> 2) & 1], v.F32[(mask >> 3) & 1]); + } + + ALWAYS_INLINE GSVector4 blend32(const GSVector4& v, const GSVector4& mask) const + { + return GSVector4((mask.U32[0] & 0x80000000u) ? v.x : x, (mask.U32[1] & 0x80000000u) ? v.y : y, + (mask.U32[2] & 0x80000000u) ? v.z : z, (mask.U32[3] & 0x80000000u) ? 
v.w : w); + } + + GSVector4 upl(const GSVector4& v) const { return GSVector4(x, y, v.x, v.y); } + + GSVector4 uph(const GSVector4& v) const { return GSVector4(z, w, v.z, v.w); } + + GSVector4 upld(const GSVector4& v) const + { + GSVector4 ret; + ret.U64[0] = U64[0]; + ret.U64[1] = v.U64[0]; + return ret; + } + + GSVector4 uphd(const GSVector4& v) const + { + GSVector4 ret; + ret.U64[0] = U64[1]; + ret.U64[1] = v.U64[1]; + return ret; + } + + ALWAYS_INLINE GSVector4 l2h(const GSVector4& v) const { return GSVector4(x, y, v.x, v.y); } + + ALWAYS_INLINE GSVector4 h2l(const GSVector4& v) const { return GSVector4(v.z, v.w, z, w); } + + ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const + { + GSVector4 ret; + ret.U32[0] = ((~v.U32[0]) & U32[0]); + ret.U32[1] = ((~v.U32[1]) & U32[1]); + ret.U32[2] = ((~v.U32[2]) & U32[2]); + ret.U32[3] = ((~v.U32[3]) & U32[3]); + return ret; + } + + ALWAYS_INLINE int mask() const + { + return (U32[0] >> 31) | ((U32[1] >> 30) & 2) | ((U32[2] >> 29) & 4) | ((U32[3] >> 28) & 8); + } + + ALWAYS_INLINE bool alltrue() const { return ((U64[0] & U64[1]) == 0xFFFFFFFFFFFFFFFFULL); } + + ALWAYS_INLINE bool allfalse() const { return ((U64[0] | U64[1]) == 0); } + + ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); } + + template + ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const + { + GSVector4 ret = *this; + ret.F32[dst] = v.F32[src]; + return ret; + } + + template + ALWAYS_INLINE int extract32() const + { + return I32[i]; + } + + ALWAYS_INLINE static constexpr GSVector4 zero() { return GSVector4::cxpr(0.0f, 0.0f, 0.0f, 0.0f); } + + ALWAYS_INLINE static constexpr GSVector4 xffffffff() + { + GSVector4 ret = zero(); + ret.U64[0] = ~ret.U64[0]; + ret.U64[1] = ~ret.U64[1]; + return ret; + } + + ALWAYS_INLINE static GSVector4 loadl(const void* p) + { + GSVector4 ret; + std::memcpy(&ret.x, p, sizeof(float) * 2); + ret.z = 0.0f; + ret.w = 0.0f; + return ret; + } + + ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(f, f, f, f); } + + template + ALWAYS_INLINE static GSVector4 load(const void* p) + { + GSVector4 ret; + std::memcpy(&ret.x, p, sizeof(float) * 4); + return ret; + } + + ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { std::memcpy(p, &v, sizeof(v)); } + + ALWAYS_INLINE static void storel(void* p, const GSVector4& v) { std::memcpy(p, &v.x, sizeof(float) * 2); } + + ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) { std::memcpy(p, &v.z, sizeof(float) * 2); } + + template + ALWAYS_INLINE static void store(void* p, const GSVector4& v) + { + std::memcpy(p, &v.x, sizeof(float)); + } + + ALWAYS_INLINE static void store(float* p, const GSVector4& v) { *p = v.x; } + + ALWAYS_INLINE GSVector4 operator-() const { return neg(); } + + void operator+=(const GSVector4& v_) + { + x = x + v_.x; + y = y + v_.y; + z = z + v_.z; + w = w + v_.w; + } + void operator-=(const GSVector4& v_) + { + x = x - v_.x; + y = y - v_.y; + z = z - v_.z; + w = w - v_.w; + } + void operator*=(const GSVector4& v_) + { + x = x * v_.x; + y = y * v_.y; + z = z * v_.z; + w = w * v_.w; + } + void operator/=(const GSVector4& v_) + { + x = x / v_.x; + y = y / v_.y; + z = z / v_.z; + w = w / v_.w; + } + + void operator+=(const float v_) + { + x = x + v_; + y = y + v_; + z = z + v_; + w = w + v_; + } + void operator-=(const float v_) + { + x = x - v_; + y = y - v_; + z = z - v_; + w = w - v_; + } + void operator*=(const float v_) + { + x = x * v_; + y = y * v_; + z = z * v_; + w = w * v_; + } + void 
operator/=(const float v_) + { + x = x / v_; + y = y / v_; + z = z / v_; + w = w / v_; + } + + void operator&=(const GSVector4& v_) + { + U64[0] &= v_.U64[0]; + U64[1] &= v_.U64[1]; + } + void operator|=(const GSVector4& v_) + { + U64[0] |= v_.U64[0]; + U64[1] |= v_.U64[1]; + } + void operator^=(const GSVector4& v_) + { + U64[0] ^= v_.U64[0]; + U64[1] ^= v_.U64[1]; + } + + friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(v1.x + v2.x, v1.y + v2.y, v1.z + v2.z, v1.w + v2.w); + } + + friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(v1.x - v2.x, v1.y - v2.y, v1.z - v2.z, v1.w - v2.w); + } + + friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(v1.x * v2.x, v1.y * v2.y, v1.z * v2.z, v1.w * v2.w); + } + + friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(v1.x / v2.x, v1.y / v2.y, v1.z / v2.z, v1.w / v2.w); + } + + friend GSVector4 operator+(const GSVector4& v, float f) { return GSVector4(v.x + f, v.y + f, v.z + f, v.w + f); } + + friend GSVector4 operator-(const GSVector4& v, float f) { return GSVector4(v.x - f, v.y - f, v.z - f, v.w - f); } + + friend GSVector4 operator*(const GSVector4& v, float f) { return GSVector4(v.x * f, v.y * f, v.z * f, v.w * f); } + + friend GSVector4 operator/(const GSVector4& v, float f) { return GSVector4(v.x / f, v.y / f, v.z / f, v.w / f); } + + friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2) + { + GSVector4 ret; + ret.U64[0] = v1.U64[0] & v2.U64[0]; + ret.U64[1] = v1.U64[1] & v2.U64[1]; + return ret; + } + + ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2) + { + GSVector4 ret; + ret.U64[0] = v1.U64[0] | v2.U64[0]; + ret.U64[1] = v1.U64[1] | v2.U64[1]; + return ret; + } + + ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2) + { + GSVector4 ret; + ret.U64[0] = v1.U64[0] ^ v2.U64[0]; + ret.U64[1] = v1.U64[1] ^ v2.U64[1]; + return ret; + } + + ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2) + { + GSVector4 ret; + ret.I32[0] = (v1.x == v2.x) ? -1 : 0; + ret.I32[1] = (v1.y == v2.y) ? -1 : 0; + ret.I32[2] = (v1.z == v2.z) ? -1 : 0; + ret.I32[3] = (v1.w == v2.w) ? -1 : 0; + return ret; + } + + ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2) + { + GSVector4 ret; + ret.I32[0] = (v1.x != v2.x) ? -1 : 0; + ret.I32[1] = (v1.y != v2.y) ? -1 : 0; + ret.I32[2] = (v1.z != v2.z) ? -1 : 0; + ret.I32[3] = (v1.w != v2.w) ? -1 : 0; + return ret; + } + + ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2) + { + GSVector4 ret; + ret.I32[0] = (v1.x > v2.x) ? -1 : 0; + ret.I32[1] = (v1.y > v2.y) ? -1 : 0; + ret.I32[2] = (v1.z > v2.z) ? -1 : 0; + ret.I32[3] = (v1.w > v2.w) ? -1 : 0; + return ret; + } + + ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2) + { + GSVector4 ret; + ret.I32[0] = (v1.x < v2.x) ? -1 : 0; + ret.I32[1] = (v1.y < v2.y) ? -1 : 0; + ret.I32[2] = (v1.z < v2.z) ? -1 : 0; + ret.I32[3] = (v1.w < v2.w) ? -1 : 0; + return ret; + } + + ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2) + { + GSVector4 ret; + ret.I32[0] = (v1.x >= v2.x) ? -1 : 0; + ret.I32[1] = (v1.y >= v2.y) ? -1 : 0; + ret.I32[2] = (v1.z >= v2.z) ? -1 : 0; + ret.I32[3] = (v1.w >= v2.w) ? 
-1 : 0; + return ret; + } + + ALWAYS_INLINE friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2) + { + GSVector4 ret; + ret.I32[0] = (v1.x <= v2.x) ? -1 : 0; + ret.I32[1] = (v1.y <= v2.y) ? -1 : 0; + ret.I32[2] = (v1.z <= v2.z) ? -1 : 0; + ret.I32[3] = (v1.w <= v2.w) ? -1 : 0; + return ret; + } + + ALWAYS_INLINE GSVector4 mul64(const GSVector4& v_) const + { + GSVector4 ret; + ret.F64[0] = F64[0] * v_.F64[0]; + ret.F64[1] = F64[1] * v_.F64[1]; + return ret; + } + + ALWAYS_INLINE GSVector4 add64(const GSVector4& v_) const + { + GSVector4 ret; + ret.F64[0] = F64[0] + v_.F64[0]; + ret.F64[1] = F64[1] + v_.F64[1]; + return ret; + } + + ALWAYS_INLINE GSVector4 sub64(const GSVector4& v_) const + { + GSVector4 ret; + ret.F64[0] = F64[0] - v_.F64[0]; + ret.F64[1] = F64[1] - v_.F64[1]; + return ret; + } + + ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_) + { + GSVector4 ret; + ret.F64[0] = v_.x; + ret.F64[1] = v_.y; + return ret; + } + + ALWAYS_INLINE static GSVector4 f32to64(const void* p) + { + float f[2]; + std::memcpy(f, p, sizeof(f)); + GSVector4 ret; + ret.F64[0] = f[0]; + ret.F64[1] = f[1]; + return ret; + } + + ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const + { + return GSVector4i(static_cast(F64[0]), static_cast(F64[1]), 0, 0); + } + + // clang-format off + +#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(F32[xn], F32[yn], F32[zn], F32[wn]); } \ + ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const { return GSVector4(F32[xn], F32[yn], v_.F32[zn], v_.F32[wn]); } + +#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ + +#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ + +#define VECTOR4_SHUFFLE_1(xs, xn) \ + VECTOR4_SHUFFLE_2(xs, xn, x, 0) \ + VECTOR4_SHUFFLE_2(xs, xn, y, 1) \ + VECTOR4_SHUFFLE_2(xs, xn, z, 2) \ + VECTOR4_SHUFFLE_2(xs, xn, w, 3) \ + + VECTOR4_SHUFFLE_1(x, 0) + VECTOR4_SHUFFLE_1(y, 1) + VECTOR4_SHUFFLE_1(z, 2) + VECTOR4_SHUFFLE_1(w, 3) + + // clang-format on + + ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(x, x, x, x); } + + ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v) { return GSVector4(v.x, v.x, v.x, v.x); } + + ALWAYS_INLINE static GSVector4 broadcast32(const void* f) + { + float ff; + std::memcpy(&ff, f, sizeof(ff)); + return GSVector4(ff, ff, ff, ff); + } + + ALWAYS_INLINE static GSVector4 broadcast64(const void* d) + { + GSVector4 ret; + std::memcpy(&ret.F64[0], d, sizeof(ret.F64[0])); + ret.F64[1] = ret.F64[0]; + return ret; + } +}; + +ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate) +{ + // TODO: Truncation vs rounding... 
+ x = static_cast(v.x); + y = static_cast(v.y); + z = static_cast(v.z); + w = static_cast(v.w); +} + +ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v) +{ + x = static_cast(v.x); + y = static_cast(v.y); + z = static_cast(v.z); + w = static_cast(v.w); +} + +ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v) +{ + GSVector4i ret; + std::memcpy(&ret, &v, sizeof(ret)); + return ret; +} + +ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v) +{ + GSVector4 ret; + std::memcpy(&ret, &v, sizeof(ret)); + return ret; +} + +#undef SSATURATE8 +#undef USATURATE8 +#undef SSATURATE16 +#undef USATURATE16 +#undef ALL_LANES_8 +#undef ALL_LANES_16 +#undef ALL_LANES_32 +#undef ALL_LANES_64 diff --git a/src/common/gsvector_sse.h b/src/common/gsvector_sse.h new file mode 100644 index 000000000..9975fbae8 --- /dev/null +++ b/src/common/gsvector_sse.h @@ -0,0 +1,1322 @@ +// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team, 2019-2024 Connor McLaughlin +// SPDX-License-Identifier: LGPL-3.0+ + +#pragma once + +#include "common/assert.h" +#include "common/intrin.h" +#include "common/types.h" + +#include + +#ifdef CPU_ARCH_AVX2 +#define GSVECTOR_HAS_UNSIGNED 1 +#define GSVECTOR_HAS_SRLV 1 +#endif + +class GSVector4; + +class alignas(16) GSVector4i +{ + struct cxpr_init_tag + { + }; + static constexpr cxpr_init_tag cxpr_init{}; + + constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : I32{x, y, z, w} {} + + constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) + : I16{s0, s1, s2, s3, s4, s5, s6, s7} + { + } + + constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, + s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) + : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} + { + } + +public: + union + { + struct + { + s32 x, y, z, w; + }; + struct + { + s32 r, g, b, a; + }; + struct + { + s32 left, top, right, bottom; + }; + float F32[4]; + s8 I8[16]; + s16 I16[8]; + s32 I32[4]; + s64 I64[2]; + u8 U8[16]; + u16 U16[8]; + u32 U32[4]; + u64 U64[2]; + __m128i m; + }; + + GSVector4i() = default; + + ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w) + { + return GSVector4i(cxpr_init, x, y, z, w); + } + + ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); } + + ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); } + + ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) + { + return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7); + } + + ALWAYS_INLINE constexpr static GSVector4i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, + s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) + { + return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15); + } + + ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w) { m = _mm_set_epi32(w, z, y, x); } + + ALWAYS_INLINE GSVector4i(s32 x, s32 y) { *this = load(x).upl32(load(y)); } + + ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) + { + m = _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0); + } + + ALWAYS_INLINE constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, + s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) + : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} + { + } + + 
ALWAYS_INLINE GSVector4i(const GSVector4i& v) { m = v.m; } + + ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) { m = _mm_loadl_epi64((__m128i*)&v); } + + // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), + // so leave the non-constexpr version default + ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; } + + ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true); + + ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); + + ALWAYS_INLINE constexpr explicit GSVector4i(__m128i m) : m(m) {} + + ALWAYS_INLINE void operator=(const GSVector4i& v) { m = v.m; } + ALWAYS_INLINE void operator=(s32 i) { m = _mm_set1_epi32(i); } + ALWAYS_INLINE void operator=(__m128i m_) { m = m_; } + + ALWAYS_INLINE operator __m128i() const { return m; } + + // rect + + ALWAYS_INLINE s32 width() const { return right - left; } + + ALWAYS_INLINE s32 height() const { return bottom - top; } + + ALWAYS_INLINE GSVector4i rsize() const + { + return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height()); + } + + ALWAYS_INLINE s32 rarea() const { return width() * height(); } + + ALWAYS_INLINE bool rempty() const { return lt32(zwzw()).mask() != 0x00ff; } + + ALWAYS_INLINE GSVector4i runion(const GSVector4i& v) const { return min_i32(v).upl64(max_i32(v).srl<8>()); } + + ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& v) const { return sat_i32(v); } + ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); } + ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); } + + template + GSVector4i _ralign_helper(const GSVector4i& mask) const + { + GSVector4i v; + + switch (mode) + { + case Align_Inside: + v = add32(mask); + break; + case Align_Outside: + v = add32(mask.zwxy()); + break; + case Align_NegInf: + v = *this; + break; + case Align_PosInf: + v = add32(mask.xyxy()); + break; + + default: + UnreachableCode(); + break; + } + + return v.andnot(mask.xyxy()); + } + + /// Align the rect using mask values that already have one subtracted (1 << n - 1 aligns to 1 << n) + template + GSVector4i ralign_presub(const GSVector2i& v) const + { + return _ralign_helper(GSVector4i(v)); + } + + template + GSVector4i ralign(const GSVector2i& v) const + { + // a must be 1 << n + + return _ralign_helper(GSVector4i(v).sub32(GSVector4i(1, 1))); + } + + // + + ALWAYS_INLINE u32 rgba32() const + { + GSVector4i v = *this; + + v = v.ps32(v); + v = v.pu16(v); + + return (u32)store(v); + } + + ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& min, const GSVector4i& max) const + { + return max_i8(min).min_i8(max); + } + ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& minmax) const + { + return max_i8(minmax.xyxy()).min_i8(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& min, const GSVector4i& max) const + { + return max_i16(min).min_i16(max); + } + ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& minmax) const + { + return max_i16(minmax.xyxy()).min_i16(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& min, const GSVector4i& max) const + { + return max_i32(min).min_i32(max); + } + ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& minmax) const + { + return max_i32(minmax.xyxy()).min_i32(minmax.zwzw()); + } + + ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const + { + return max_u8(min).min_u8(max); + } + ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const + { + return 
max_u8(minmax.xyxy()).min_u8(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const + { + return max_u16(min).min_u16(max); + } + ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const + { + return max_u16(minmax.xyxy()).min_u16(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const + { + return max_u32(min).min_u32(max); + } + ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const + { + return max_u32(minmax.xyxy()).min_u32(minmax.zwzw()); + } + + ALWAYS_INLINE GSVector4i min_i8(const GSVector4i& v) const { return GSVector4i(_mm_min_epi8(m, v)); } + ALWAYS_INLINE GSVector4i max_i8(const GSVector4i& v) const { return GSVector4i(_mm_max_epi8(m, v)); } + ALWAYS_INLINE GSVector4i min_i16(const GSVector4i& v) const { return GSVector4i(_mm_min_epi16(m, v)); } + ALWAYS_INLINE GSVector4i max_i16(const GSVector4i& v) const { return GSVector4i(_mm_max_epi16(m, v)); } + ALWAYS_INLINE GSVector4i min_i32(const GSVector4i& v) const { return GSVector4i(_mm_min_epi32(m, v)); } + ALWAYS_INLINE GSVector4i max_i32(const GSVector4i& v) const { return GSVector4i(_mm_max_epi32(m, v)); } + + ALWAYS_INLINE GSVector4i min_u8(const GSVector4i& v) const { return GSVector4i(_mm_min_epu8(m, v)); } + ALWAYS_INLINE GSVector4i max_u8(const GSVector4i& v) const { return GSVector4i(_mm_max_epu8(m, v)); } + ALWAYS_INLINE GSVector4i min_u16(const GSVector4i& v) const { return GSVector4i(_mm_min_epu16(m, v)); } + ALWAYS_INLINE GSVector4i max_u16(const GSVector4i& v) const { return GSVector4i(_mm_max_epu16(m, v)); } + ALWAYS_INLINE GSVector4i min_u32(const GSVector4i& v) const { return GSVector4i(_mm_min_epu32(m, v)); } + ALWAYS_INLINE GSVector4i max_u32(const GSVector4i& v) const { return GSVector4i(_mm_max_epu32(m, v)); } + + ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const { return GSVector4i(_mm_madd_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector4i addp_s32() const { return GSVector4i(_mm_hadd_epi32(m, m)); } + + ALWAYS_INLINE s32 minv_s32() const + { + const __m128i vmin = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); + return std::min(_mm_extract_epi32(vmin, 0), _mm_extract_epi32(vmin, 1)); + } + + ALWAYS_INLINE u32 minv_u32() const + { + const __m128i vmin = _mm_min_epu32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); + return std::min(_mm_extract_epi32(vmin, 0), _mm_extract_epi32(vmin, 1)); + } + + ALWAYS_INLINE s32 maxv_s32() const + { + const __m128i vmax = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); + return std::max(_mm_extract_epi32(vmax, 0), _mm_extract_epi32(vmax, 1)); + } + + ALWAYS_INLINE u32 maxv_u32() const + { + const __m128i vmax = _mm_max_epu32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); + return std::max(_mm_extract_epi32(vmax, 0), _mm_extract_epi32(vmax, 1)); + } + + ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); } + + ALWAYS_INLINE GSVector4i blend8(const GSVector4i& v, const GSVector4i& mask) const + { + return GSVector4i(_mm_blendv_epi8(m, v, mask)); + } + + template + ALWAYS_INLINE GSVector4i blend16(const GSVector4i& v) const + { + return GSVector4i(_mm_blend_epi16(m, v, mask)); + } + + template + ALWAYS_INLINE GSVector4i blend32(const GSVector4i& v) const + { +#if defined(__AVX2__) + return GSVector4i(_mm_blend_epi32(m, v.m, mask)); +#else + constexpr s32 bit3 = ((mask & 8) * 3) << 3; + constexpr s32 bit2 = ((mask & 4) * 3) << 2; + constexpr s32 bit1 = ((mask & 2) * 3) << 1; + constexpr s32 bit0 = 
(mask & 1) * 3; + return blend16(v); +#endif + } + + ALWAYS_INLINE GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const + { + return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, v))); + } + + ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); } + + ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const { return GSVector4i(_mm_shuffle_epi8(m, mask)); } + + ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const { return GSVector4i(_mm_packs_epi16(m, v)); } + ALWAYS_INLINE GSVector4i ps16() const { return GSVector4i(_mm_packs_epi16(m, m)); } + ALWAYS_INLINE GSVector4i pu16(const GSVector4i& v) const { return GSVector4i(_mm_packus_epi16(m, v)); } + ALWAYS_INLINE GSVector4i pu16() const { return GSVector4i(_mm_packus_epi16(m, m)); } + ALWAYS_INLINE GSVector4i ps32(const GSVector4i& v) const { return GSVector4i(_mm_packs_epi32(m, v)); } + ALWAYS_INLINE GSVector4i ps32() const { return GSVector4i(_mm_packs_epi32(m, m)); } + ALWAYS_INLINE GSVector4i pu32(const GSVector4i& v) const { return GSVector4i(_mm_packus_epi32(m, v)); } + ALWAYS_INLINE GSVector4i pu32() const { return GSVector4i(_mm_packus_epi32(m, m)); } + + ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi8(m, v)); } + ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi8(m, v)); } + ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi16(m, v)); } + ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi16(m, v)); } + ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi32(m, v)); } + ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi32(m, v)); } + ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi64(m, v)); } + ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi64(m, v)); } + + ALWAYS_INLINE GSVector4i upl8() const { return GSVector4i(_mm_unpacklo_epi8(m, _mm_setzero_si128())); } + ALWAYS_INLINE GSVector4i uph8() const { return GSVector4i(_mm_unpackhi_epi8(m, _mm_setzero_si128())); } + + ALWAYS_INLINE GSVector4i upl16() const { return GSVector4i(_mm_unpacklo_epi16(m, _mm_setzero_si128())); } + ALWAYS_INLINE GSVector4i uph16() const { return GSVector4i(_mm_unpackhi_epi16(m, _mm_setzero_si128())); } + + ALWAYS_INLINE GSVector4i upl32() const { return GSVector4i(_mm_unpacklo_epi32(m, _mm_setzero_si128())); } + + ALWAYS_INLINE GSVector4i uph32() const { return GSVector4i(_mm_unpackhi_epi32(m, _mm_setzero_si128())); } + ALWAYS_INLINE GSVector4i upl64() const { return GSVector4i(_mm_unpacklo_epi64(m, _mm_setzero_si128())); } + ALWAYS_INLINE GSVector4i uph64() const { return GSVector4i(_mm_unpackhi_epi64(m, _mm_setzero_si128())); } + + ALWAYS_INLINE GSVector4i i8to16() const { return GSVector4i(_mm_cvtepi8_epi16(m)); } + ALWAYS_INLINE GSVector4i i8to32() const { return GSVector4i(_mm_cvtepi8_epi32(m)); } + ALWAYS_INLINE GSVector4i i8to64() const { return GSVector4i(_mm_cvtepi8_epi64(m)); } + +#ifdef CPU_ARCH_SSE41 + ALWAYS_INLINE GSVector4i i16to32() const { return GSVector4i(_mm_cvtepi16_epi32(m)); } + ALWAYS_INLINE GSVector4i i16to64() const { return GSVector4i(_mm_cvtepi16_epi64(m)); } + ALWAYS_INLINE GSVector4i i32to64() const { return GSVector4i(_mm_cvtepi32_epi64(m)); } + ALWAYS_INLINE GSVector4i u8to16() const { return 
GSVector4i(_mm_cvtepu8_epi16(m)); } + ALWAYS_INLINE GSVector4i u8to32() const { return GSVector4i(_mm_cvtepu8_epi32(m)); } + ALWAYS_INLINE GSVector4i u8to64() const { return GSVector4i(_mm_cvtepu16_epi64(m)); } + ALWAYS_INLINE GSVector4i u16to32() const { return GSVector4i(_mm_cvtepu16_epi32(m)); } + ALWAYS_INLINE GSVector4i u16to64() const { return GSVector4i(_mm_cvtepu16_epi64(m)); } + ALWAYS_INLINE GSVector4i u32to64() const { return GSVector4i(_mm_cvtepu32_epi64(m)); } +#endif + + template + ALWAYS_INLINE GSVector4i srl() const + { + return GSVector4i(_mm_srli_si128(m, i)); + } + + template + ALWAYS_INLINE GSVector4i srl(const GSVector4i& v) + { + return GSVector4i(_mm_alignr_epi8(v.m, m, i)); + } + + template + ALWAYS_INLINE GSVector4i sll() const + { + return GSVector4i(_mm_slli_si128(m, i)); + } + + template + ALWAYS_INLINE GSVector4i sll16() const + { + return GSVector4i(_mm_slli_epi16(m, i)); + } + + ALWAYS_INLINE GSVector4i sll16(s32 i) const { return GSVector4i(_mm_sll_epi16(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector4i sllv16(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi16(m, v.m)); } +#endif + + template + ALWAYS_INLINE GSVector4i srl16() const + { + return GSVector4i(_mm_srli_epi16(m, i)); + } + + ALWAYS_INLINE GSVector4i srl16(s32 i) const { return GSVector4i(_mm_srl_epi16(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector4i srlv16(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi16(m, v.m)); } +#endif + + template + ALWAYS_INLINE GSVector4i sra16() const + { + return GSVector4i(_mm_srai_epi16(m, i)); + } + + ALWAYS_INLINE GSVector4i sra16(s32 i) const { return GSVector4i(_mm_sra_epi16(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector4i srav16(const GSVector4i& v) const { return GSVector4i(_mm_srav_epi16(m, v.m)); } +#endif + + template + ALWAYS_INLINE GSVector4i sll32() const + { + return GSVector4i(_mm_slli_epi32(m, i)); + } + + ALWAYS_INLINE GSVector4i sll32(s32 i) const { return GSVector4i(_mm_sll_epi32(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi32(m, v.m)); } +#endif + + template + ALWAYS_INLINE GSVector4i srl32() const + { + return GSVector4i(_mm_srli_epi32(m, i)); + } + + ALWAYS_INLINE GSVector4i srl32(s32 i) const { return GSVector4i(_mm_srl_epi32(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector4i srlv32(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi32(m, v.m)); } +#endif + + template + ALWAYS_INLINE GSVector4i sra32() const + { + return GSVector4i(_mm_srai_epi32(m, i)); + } + + ALWAYS_INLINE GSVector4i sra32(s32 i) const { return GSVector4i(_mm_sra_epi32(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector4i srav32(const GSVector4i& v) const { return GSVector4i(_mm_srav_epi32(m, v.m)); } +#endif + + template + ALWAYS_INLINE GSVector4i sll64() const + { + return GSVector4i(_mm_slli_epi64(m, i)); + } + + ALWAYS_INLINE GSVector4i sll64(s32 i) const { return GSVector4i(_mm_sll_epi64(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector4i sllv64(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi64(m, v.m)); } +#endif + + template + ALWAYS_INLINE GSVector4i srl64() const + { + return GSVector4i(_mm_srli_epi64(m, i)); + } + + ALWAYS_INLINE GSVector4i srl64(s32 i) const { return GSVector4i(_mm_srl_epi64(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + 
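A note on the CPU_ARCH_AVX2-guarded sllv/srlv/srav helpers in these blocks (editor's illustration, not part of the patch): unlike sll16/srl32/sra16 above, which shift every lane by the same count, the v-suffixed forms shift each lane by the count held in the matching lane of the argument. A minimal sketch, assuming small non-negative per-lane counts:

  // Per-lane variable shift, available only on the AVX2 path (CPU_ARCH_AVX2 defined).
  const GSVector4i values = GSVector4i(0x100, 0x100, 0x100, 0x100);
  const GSVector4i counts = GSVector4i(0, 1, 2, 3);
  const GSVector4i result = values.srlv32(counts); // {0x100, 0x80, 0x40, 0x20}
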
ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi64(m, v.m)); } +#endif + + template + ALWAYS_INLINE GSVector4i sra64() const + { + return GSVector4i(_mm_srai_epi64(m, i)); + } + + ALWAYS_INLINE GSVector4i sra64(s32 i) const { return GSVector4i(_mm_sra_epi64(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector4i srav64(const GSVector4i& v) const { return GSVector4i(_mm_srav_epi64(m, v.m)); } +#endif + + ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const { return GSVector4i(_mm_add_epi8(m, v.m)); } + + ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const { return GSVector4i(_mm_add_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return GSVector4i(_mm_add_epi32(m, v.m)); } + + ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi8(m, v.m)); } + + ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const { return GSVector4i(_mm_hadds_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu8(m, v.m)); } + + ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu16(m, v.m)); } + + ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi8(m, v.m)); } + + ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi32(m, v.m)); } + + ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const { return GSVector4i(_mm_subs_epi8(m, v.m)); } + + ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu8(m, v.m)); } + + ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu16(m, v.m)); } + + ALWAYS_INLINE GSVector4i avg8(const GSVector4i& v) const { return GSVector4i(_mm_avg_epu8(m, v.m)); } + + ALWAYS_INLINE GSVector4i avg16(const GSVector4i& v) const { return GSVector4i(_mm_avg_epu16(m, v.m)); } + + ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const { return GSVector4i(_mm_mulhi_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector4i mul16hu(const GSVector4i& v) const { return GSVector4i(_mm_mulhi_epu16(m, v.m)); } + + ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const { return GSVector4i(_mm_mulhrs_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi32(m, v.m)); } + + template + ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const + { + // (a - this) * f << shift + this + + return add16(a.sub16(*this).modulate16(f)); + } + + template + ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c) + { + // (a - b) * c << shift + + return a.sub16(b).modulate16(c); + } + + template + ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c, + const GSVector4i& d) + { + // (a - b) * c << shift + d + + return d.add16(a.sub16(b).modulate16(c)); + } + + ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a_, const GSVector4i& 
f) const + { + // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit) + + return add16(a_.sub16(*this).mul16l(f).sra16<4>()); + } + + template + ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const + { + // a * f << shift + if (shift == 0) + { + return mul16hrs(f); + } + + return sll16().mul16hs(f); + } + + ALWAYS_INLINE bool eq(const GSVector4i& v) const + { + // pxor, ptest, je + + GSVector4i t = *this ^ v; + + return _mm_testz_si128(t, t) != 0; + } + + ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi8(m, v.m)); } + ALWAYS_INLINE GSVector4i eq16(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi16(m, v.m)); } + ALWAYS_INLINE GSVector4i eq32(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi32(m, v.m)); } + ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi64(m, v.m)); } + + ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); } + ALWAYS_INLINE GSVector4i neq16(const GSVector4i& v) const { return ~eq16(v); } + ALWAYS_INLINE GSVector4i neq32(const GSVector4i& v) const { return ~eq32(v); } + + ALWAYS_INLINE GSVector4i gt8(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi8(m, v.m)); } + ALWAYS_INLINE GSVector4i gt16(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi16(m, v.m)); } + ALWAYS_INLINE GSVector4i gt32(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi32(m, v.m)); } + + ALWAYS_INLINE GSVector4i ge8(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi8(m, v.m)); } + ALWAYS_INLINE GSVector4i ge16(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi16(m, v.m)); } + ALWAYS_INLINE GSVector4i ge32(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi32(m, v.m)); } + + ALWAYS_INLINE GSVector4i lt8(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi8(m, v.m)); } + ALWAYS_INLINE GSVector4i lt16(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi16(m, v.m)); } + ALWAYS_INLINE GSVector4i lt32(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi32(m, v.m)); } + + ALWAYS_INLINE GSVector4i le8(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi8(m, v.m)); } + ALWAYS_INLINE GSVector4i le16(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi16(m, v.m)); } + ALWAYS_INLINE GSVector4i le32(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi32(m, v.m)); } + + ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { return GSVector4i(_mm_andnot_si128(v.m, m)); } + + ALWAYS_INLINE s32 mask() const { return _mm_movemask_epi8(m); } + + ALWAYS_INLINE bool alltrue() const { return mask() == 0xffff; } + + ALWAYS_INLINE bool allfalse() const { return _mm_testz_si128(m, m) != 0; } + + template + ALWAYS_INLINE GSVector4i insert8(s32 a) const + { + return GSVector4i(_mm_insert_epi8(m, a, i)); + } + + template + ALWAYS_INLINE s32 extract8() const + { + return _mm_extract_epi8(m, i); + } + + template + ALWAYS_INLINE GSVector4i insert16(s32 a) const + { + return GSVector4i(_mm_insert_epi16(m, a, i)); + } + + template + ALWAYS_INLINE s32 extract16() const + { + return _mm_extract_epi16(m, i); + } + + template + ALWAYS_INLINE GSVector4i insert32(s32 a) const + { + return GSVector4i(_mm_insert_epi32(m, a, i)); + } + + template + ALWAYS_INLINE s32 extract32() const + { + if constexpr (i == 0) + return GSVector4i::store(*this); + + return _mm_extract_epi32(m, i); + } + + template + ALWAYS_INLINE GSVector4i insert64(s64 a) const + { + return GSVector4i(_mm_insert_epi64(m, a, 
i)); + } + + template + ALWAYS_INLINE s64 extract64() const + { + if (i == 0) + return GSVector4i::storeq(*this); + + return _mm_extract_epi64(m, i); + } + + ALWAYS_INLINE static GSVector4i loadnt(const void* p) { return GSVector4i(_mm_stream_load_si128((__m128i*)p)); } + + ALWAYS_INLINE static GSVector4i load32(const void* p) { return GSVector4i(_mm_loadu_si32(p)); } + + ALWAYS_INLINE static GSVector4i loadl(const void* p) { return GSVector4i(_mm_loadl_epi64((__m128i*)p)); } + + ALWAYS_INLINE static GSVector4i loadh(const void* p) + { + return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), (__m64*)p))); + } + + ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return loadh(&v); } + + template + ALWAYS_INLINE static GSVector4i load(const void* p) + { + return GSVector4i(aligned ? _mm_load_si128((__m128i*)p) : _mm_loadu_si128((__m128i*)p)); + } + + ALWAYS_INLINE static GSVector4i load(s32 i) { return GSVector4i(_mm_cvtsi32_si128(i)); } + + ALWAYS_INLINE static GSVector4i loadq(s64 i) { return GSVector4i(_mm_cvtsi64_si128(i)); } + + ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { _mm_stream_si128((__m128i*)p, v.m); } + + ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { _mm_storel_epi64((__m128i*)p, v.m); } + + ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) { _mm_storeh_pi((__m64*)p, _mm_castsi128_ps(v.m)); } + + ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v) + { + GSVector4i::storel(pl, v); + GSVector4i::storeh(ph, v); + } + + template + ALWAYS_INLINE static void store(void* p, const GSVector4i& v) + { + if constexpr (aligned) + _mm_store_si128((__m128i*)p, v.m); + else + _mm_storeu_si128((__m128i*)p, v.m); + } + + ALWAYS_INLINE static void store32(void* p, const GSVector4i& v) { _mm_storeu_si32(p, v); } + + ALWAYS_INLINE static s32 store(const GSVector4i& v) { return _mm_cvtsi128_si32(v.m); } + + ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return _mm_cvtsi128_si64(v.m); } + + ALWAYS_INLINE void operator&=(const GSVector4i& v) { m = _mm_and_si128(m, v); } + ALWAYS_INLINE void operator|=(const GSVector4i& v) { m = _mm_or_si128(m, v); } + ALWAYS_INLINE void operator^=(const GSVector4i& v) { m = _mm_xor_si128(m, v); } + + ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2) + { + return GSVector4i(_mm_and_si128(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2) + { + return GSVector4i(_mm_or_si128(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2) + { + return GSVector4i(_mm_xor_si128(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, s32 i) { return v & GSVector4i(i); } + + ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, s32 i) { return v | GSVector4i(i); } + + ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, s32 i) { return v ^ GSVector4i(i); } + + ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return v ^ v.eq32(v); } + + ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(_mm_setzero_si128()); } + + ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); } + + ALWAYS_INLINE GSVector2i xy() const + { + GSVector2i ret; + storel(&ret, *this); + return ret; + } + + ALWAYS_INLINE GSVector2i zw() const + { + GSVector2i ret; + storeh(&ret, *this); + return ret; + } + + // clang-format off + +#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ 
+ ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const {return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const {return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const {return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws##lh() const {return GSVector4i(_mm_shufflehi_epi16(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)), _MM_SHUFFLE(wn, zn, yn, xn)));} \ + +#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ + +#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ + +#define VECTOR4i_SHUFFLE_1(xs, xn) \ + VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \ + VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \ + VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \ + VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \ + + VECTOR4i_SHUFFLE_1(x, 0) + VECTOR4i_SHUFFLE_1(y, 1) + VECTOR4i_SHUFFLE_1(z, 2) + VECTOR4i_SHUFFLE_1(w, 3) + + // clang-format on +}; + +class alignas(16) GSVector4 +{ + struct cxpr_init_tag + { + }; + static constexpr cxpr_init_tag cxpr_init{}; + + constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {} + + constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {} + + constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {} + +public: + union + { + struct + { + float x, y, z, w; + }; + struct + { + float r, g, b, a; + }; + struct + { + float left, top, right, bottom; + }; + float F32[4]; + double F64[2]; + s8 I8[16]; + s16 I16[8]; + s32 I32[4]; + s64 I64[2]; + u8 U8[16]; + u16 U16[8]; + u32 U32[4]; + u64 U64[2]; + __m128 m; + }; + + GSVector4() = default; + + constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); } + + constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); } + + constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); } + + constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); } + + constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); } + + constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); } + + ALWAYS_INLINE GSVector4(float x, float y, float z, float w) { m = _mm_set_ps(w, z, y, x); } + + ALWAYS_INLINE GSVector4(float x, float y) { m = _mm_unpacklo_ps(_mm_load_ss(&x), _mm_load_ss(&y)); } + + ALWAYS_INLINE GSVector4(int x, int y, int z, int w) + { + GSVector4i v_(x, y, z, w); + + m = _mm_cvtepi32_ps(v_.m); + } + + ALWAYS_INLINE GSVector4(int x, int y) + { + m = _mm_cvtepi32_ps(_mm_unpacklo_epi32(_mm_cvtsi32_si128(x), _mm_cvtsi32_si128(y))); + } + + ALWAYS_INLINE explicit GSVector4(const GSVector2& v) { m = _mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&v)); } + + ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) { m = _mm_cvtepi32_ps(_mm_loadl_epi64((__m128i*)&v)); } + + ALWAYS_INLINE constexpr explicit GSVector4(__m128 m) : m(m) {} + + ALWAYS_INLINE explicit GSVector4(__m128d m) : m(_mm_castpd_ps(m)) {} + + ALWAYS_INLINE explicit GSVector4(float f) { *this = f; } 
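The scalar constructors here and just below broadcast a single value to all four lanes; the integer forms convert to float after broadcasting. For example (editor's sketch, not part of the patch):

  const GSVector4 half = GSVector4(0.5f);   // {0.5f, 0.5f, 0.5f, 0.5f}
  const GSVector4 full = GSVector4(255);    // broadcast + cvtepi32_ps -> {255.0f, 255.0f, 255.0f, 255.0f}
  const GSVector4 size = GSVector4(64, 32); // {64.0f, 32.0f, 0.0f, 0.0f}, upper lanes zeroed
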
+ + ALWAYS_INLINE explicit GSVector4(int i) + { +#ifdef CPU_ARCH_AVX2 + m = _mm_cvtepi32_ps(_mm_broadcastd_epi32(_mm_cvtsi32_si128(i))); +#else + *this = GSVector4(GSVector4i(i)); +#endif + } + + ALWAYS_INLINE explicit GSVector4(const GSVector4i& v); + + ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v); + + ALWAYS_INLINE static GSVector4 f64(double x, double y) { return GSVector4(_mm_castpd_ps(_mm_set_pd(y, x))); } + + ALWAYS_INLINE void operator=(float f) + { +#if CPU_ARCH_AVX2 + + m = _mm_broadcastss_ps(_mm_load_ss(&f)); + +#else + + m = _mm_set1_ps(f); + +#endif + } + + ALWAYS_INLINE void operator=(__m128 m_) { this->m = m_; } + + ALWAYS_INLINE operator __m128() const { return m; } + + /// Makes Clang think that the whole vector is needed, preventing it from changing shuffles around because it thinks + /// we don't need the whole vector Useful for e.g. preventing clang from optimizing shuffles that remove + /// possibly-denormal garbage data from vectors before computing with them + ALWAYS_INLINE GSVector4 noopt() + { + // Note: Clang is currently the only compiler that attempts to optimize vector intrinsics, if that changes in the + // future the implementation should be updated +#ifdef __clang__ + __asm__("" : "+x"(m)::); +#endif + return *this; + } + + u32 rgba32() const { return GSVector4i(*this).rgba32(); } + + ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); } + + ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); } + + ALWAYS_INLINE GSVector4 abs() const { return *this & cast(GSVector4i::cxpr(0x7fffffff)); } + + ALWAYS_INLINE GSVector4 neg() const { return *this ^ cast(GSVector4i::cxpr(0x80000000)); } + + ALWAYS_INLINE GSVector4 rcp() const { return GSVector4(_mm_rcp_ps(m)); } + + ALWAYS_INLINE GSVector4 rcpnr() const + { + GSVector4 v_ = rcp(); + + return (v_ + v_) - (v_ * v_) * *this; + } + + template + ALWAYS_INLINE GSVector4 round() const + { + return GSVector4(_mm_round_ps(m, mode)); + } + + ALWAYS_INLINE GSVector4 floor() const { return round(); } + + ALWAYS_INLINE GSVector4 ceil() const { return round(); } + + ALWAYS_INLINE GSVector4 madd(const GSVector4& a_, const GSVector4& b_) const + { +#ifdef CPU_ARCH_AVX2 + return GSVector4(_mm_fmadd_ps(m, a_, b_)); +#else + return *this * a_ + b_; +#endif + } + + ALWAYS_INLINE GSVector4 msub(const GSVector4& a_, const GSVector4& b_) const + { +#ifdef CPU_ARCH_AVX2 + return GSVector4(_mm_fmsub_ps(m, a_, b_)); +#else + return *this * a_ - b_; +#endif + } + + ALWAYS_INLINE GSVector4 nmadd(const GSVector4& a_, const GSVector4& b_) const + { +#ifdef CPU_ARCH_AVX2 + return GSVector4(_mm_fnmadd_ps(m, a_, b_)); +#else + return b_ - *this * a_; +#endif + } + + ALWAYS_INLINE GSVector4 nmsub(const GSVector4& a_, const GSVector4& b_) const + { +#ifdef CPU_ARCH_AVX2 + return GSVector4(_mm_fnmsub_ps(m, a_, b_)); +#else + return -b_ - *this * a_; +#endif + } + + ALWAYS_INLINE GSVector4 addm(const GSVector4& a_, const GSVector4& b_) const + { + return a_.madd(b_, *this); // *this + a * b + } + + ALWAYS_INLINE GSVector4 subm(const GSVector4& a_, const GSVector4& b_) const + { + return a_.nmadd(b_, *this); // *this - a * b + } + + ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(_mm_hadd_ps(m, m)); } + + ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(_mm_hadd_ps(m, v.m)); } + + ALWAYS_INLINE GSVector4 hsub() const { return GSVector4(_mm_hsub_ps(m, m)); } + + ALWAYS_INLINE GSVector4 hsub(const 
GSVector4& v) const { return GSVector4(_mm_hsub_ps(m, v.m)); } + + template + ALWAYS_INLINE GSVector4 dp(const GSVector4& v) const + { + return GSVector4(_mm_dp_ps(m, v.m, i)); + } + + ALWAYS_INLINE GSVector4 sat(const GSVector4& min, const GSVector4& max) const + { + return GSVector4(_mm_min_ps(_mm_max_ps(m, min), max)); + } + + ALWAYS_INLINE GSVector4 sat(const GSVector4& v) const + { + return GSVector4(_mm_min_ps(_mm_max_ps(m, v.xyxy()), v.zwzw())); + } + + ALWAYS_INLINE GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); } + + ALWAYS_INLINE GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); } + + ALWAYS_INLINE GSVector4 min(const GSVector4& v) const { return GSVector4(_mm_min_ps(m, v)); } + + ALWAYS_INLINE GSVector4 max(const GSVector4& v) const { return GSVector4(_mm_max_ps(m, v)); } + + template + ALWAYS_INLINE GSVector4 blend32(const GSVector4& v) const + { + return GSVector4(_mm_blend_ps(m, v, mask)); + } + + ALWAYS_INLINE GSVector4 blend32(const GSVector4& v, const GSVector4& mask) const + { + return GSVector4(_mm_blendv_ps(m, v, mask)); + } + + ALWAYS_INLINE GSVector4 upl(const GSVector4& v) const { return GSVector4(_mm_unpacklo_ps(m, v)); } + + ALWAYS_INLINE GSVector4 uph(const GSVector4& v) const { return GSVector4(_mm_unpackhi_ps(m, v)); } + + ALWAYS_INLINE GSVector4 upld(const GSVector4& v) const + { + return GSVector4(_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(m), _mm_castps_pd(v.m)))); + } + + ALWAYS_INLINE GSVector4 uphd(const GSVector4& v) const + { + return GSVector4(_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(m), _mm_castps_pd(v.m)))); + } + + ALWAYS_INLINE GSVector4 l2h(const GSVector4& v) const { return GSVector4(_mm_movelh_ps(m, v)); } + + ALWAYS_INLINE GSVector4 h2l(const GSVector4& v) const { return GSVector4(_mm_movehl_ps(m, v)); } + + ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const { return GSVector4(_mm_andnot_ps(v.m, m)); } + + ALWAYS_INLINE int mask() const { return _mm_movemask_ps(m); } + + ALWAYS_INLINE bool alltrue() const { return mask() == 0xf; } + + ALWAYS_INLINE bool allfalse() const + { +#ifdef CPU_ARCH_AVX2 + return _mm_testz_ps(m, m) != 0; +#else + const __m128i ii = _mm_castps_si128(m); + return _mm_testz_si128(ii, ii) != 0; +#endif + } + + ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); } + + template + ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const + { + if constexpr (src == dst) + return GSVector4(_mm_blend_ps(m, v.m, 1 << src)); + else + return GSVector4(_mm_insert_ps(m, v.m, _MM_MK_INSERTPS_NDX(src, dst, 0))); + } + + template + ALWAYS_INLINE int extract32() const + { + return _mm_extract_ps(m, i); + } + + ALWAYS_INLINE static GSVector4 zero() { return GSVector4(_mm_setzero_ps()); } + + ALWAYS_INLINE static GSVector4 xffffffff() { return zero() == zero(); } + + ALWAYS_INLINE static GSVector4 loadl(const void* p) { return GSVector4(_mm_castpd_ps(_mm_load_sd((double*)p))); } + + ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(_mm_load_ss(&f)); } + + template + ALWAYS_INLINE static GSVector4 load(const void* p) + { + return GSVector4(aligned ? 
_mm_load_ps((const float*)p) : _mm_loadu_ps((const float*)p)); + } + + ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { _mm_stream_ps((float*)p, v.m); } + + ALWAYS_INLINE static void storel(void* p, const GSVector4& v) { _mm_store_sd((double*)p, _mm_castps_pd(v.m)); } + + ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) { _mm_storeh_pd((double*)p, _mm_castps_pd(v.m)); } + + template + ALWAYS_INLINE static void store(void* p, const GSVector4& v) + { + if constexpr (aligned) + _mm_store_ps((float*)p, v.m); + else + _mm_storeu_ps((float*)p, v.m); + } + + ALWAYS_INLINE static void store(float* p, const GSVector4& v) { _mm_store_ss(p, v.m); } + + ALWAYS_INLINE GSVector4 operator-() const { return neg(); } + + ALWAYS_INLINE void operator+=(const GSVector4& v_) { m = _mm_add_ps(m, v_); } + ALWAYS_INLINE void operator-=(const GSVector4& v_) { m = _mm_sub_ps(m, v_); } + ALWAYS_INLINE void operator*=(const GSVector4& v_) { m = _mm_mul_ps(m, v_); } + ALWAYS_INLINE void operator/=(const GSVector4& v_) { m = _mm_div_ps(m, v_); } + + ALWAYS_INLINE void operator+=(float f) { *this += GSVector4(f); } + ALWAYS_INLINE void operator-=(float f) { *this -= GSVector4(f); } + ALWAYS_INLINE void operator*=(float f) { *this *= GSVector4(f); } + ALWAYS_INLINE void operator/=(float f) { *this /= GSVector4(f); } + + ALWAYS_INLINE void operator&=(const GSVector4& v_) { m = _mm_and_ps(m, v_); } + ALWAYS_INLINE void operator|=(const GSVector4& v_) { m = _mm_or_ps(m, v_); } + ALWAYS_INLINE void operator^=(const GSVector4& v_) { m = _mm_xor_ps(m, v_); } + + ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_add_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_sub_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_mul_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_div_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); } + + ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); } + + ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); } + + ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f) { return v / GSVector4(f); } + + ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_and_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_or_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_xor_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_cmpeq_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_cmpneq_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_cmpgt_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_cmplt_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, 
const GSVector4& v2) + { + return GSVector4(_mm_cmpge_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_cmple_ps(v1, v2)); + } + + ALWAYS_INLINE GSVector4 mul64(const GSVector4& v_) const + { + return GSVector4(_mm_mul_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m))); + } + + ALWAYS_INLINE GSVector4 add64(const GSVector4& v_) const + { + return GSVector4(_mm_add_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m))); + } + + ALWAYS_INLINE GSVector4 sub64(const GSVector4& v_) const + { + return GSVector4(_mm_sub_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m))); + } + + ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_) { return GSVector4(_mm_cvtps_pd(v_.m)); } + + ALWAYS_INLINE static GSVector4 f32to64(const void* p) + { + return GSVector4(_mm_cvtps_pd(_mm_castpd_ps(_mm_load_sd(static_cast(p))))); + } + + ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const + { + return GSVector4i(truncate ? _mm_cvttpd_epi32(_mm_castps_pd(m)) : _mm_cvtpd_epi32(_mm_castps_pd(m))); + } + + // clang-format off + +#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn))); } \ + ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const { return GSVector4(_mm_shuffle_ps(m, v_.m, _MM_SHUFFLE(wn, zn, yn, xn))); } \ + +#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ + +#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ + +#define VECTOR4_SHUFFLE_1(xs, xn) \ + VECTOR4_SHUFFLE_2(xs, xn, x, 0) \ + VECTOR4_SHUFFLE_2(xs, xn, y, 1) \ + VECTOR4_SHUFFLE_2(xs, xn, z, 2) \ + VECTOR4_SHUFFLE_2(xs, xn, w, 3) \ + + VECTOR4_SHUFFLE_1(x, 0) + VECTOR4_SHUFFLE_1(y, 1) + VECTOR4_SHUFFLE_1(z, 2) + VECTOR4_SHUFFLE_1(w, 3) + + // clang-format on + +#if CPU_ARCH_AVX2 + + ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(_mm_broadcastss_ps(m)); } + + ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v) { return GSVector4(_mm_broadcastss_ps(v.m)); } + + ALWAYS_INLINE static GSVector4 broadcast32(const void* f) + { + return GSVector4(_mm_broadcastss_ps(_mm_load_ss((const float*)f))); + } + +#endif + + ALWAYS_INLINE static GSVector4 broadcast64(const void* d) + { + return GSVector4(_mm_loaddup_pd(static_cast(d))); + } +}; + +ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate) +{ + m = truncate ? 
_mm_cvttps_epi32(v) : _mm_cvtps_epi32(v);
+}
+
+ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v)
+{
+  m = _mm_cvtepi32_ps(v);
+}
+
+ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v)
+{
+  return GSVector4i(_mm_castps_si128(v.m));
+}
+
+ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v)
+{
+  return GSVector4(_mm_castsi128_ps(v.m));
+}
diff --git a/src/common/intrin.h b/src/common/intrin.h
index 795a7f950..1b0587e7e 100644
--- a/src/common/intrin.h
+++ b/src/common/intrin.h
@@ -13,7 +13,21 @@
 #if defined(CPU_ARCH_X86) || defined(CPU_ARCH_X64)
 #define CPU_ARCH_SSE 1
 #include
-#elif defined(CPU_ARCH_ARM64)
+#include
+#include
+#include
+
+#if defined(__AVX2__)
+#define CPU_ARCH_AVX 1
+#define CPU_ARCH_AVX2 1
+#define CPU_ARCH_SSE41 1
+#elif defined(__AVX__)
+#define CPU_ARCH_AVX 1
+#define CPU_ARCH_SSE41 1
+#elif defined(__SSE4_1__) || defined(_MSC_VER)
+#define CPU_ARCH_SSE41 1
+#endif
+#elif defined(CPU_ARCH_ARM32) || defined(CPU_ARCH_ARM64)
 #define CPU_ARCH_NEON 1
 #if defined(_MSC_VER) && !defined(__clang__)
 #include
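Since CPU_ARCH_SSE41 is now defined unconditionally for x86 builds (matching the -msse4.1 / SSE4.1 baseline added to the compiler flags), callers can use the new wrappers without runtime dispatch. A minimal consumer, as an editor's sketch (AverageRGBA is a hypothetical helper, not part of the patch; u32/s32 are the usual common/types.h typedefs):

  #include "common/gsvector.h"
  #include "common/types.h"

  // Rounding average of two packed RGBA8888 pixels using the new wrappers.
  static u32 AverageRGBA(u32 a, u32 b)
  {
    const GSVector4i va = GSVector4i::load(static_cast<s32>(a)); // pixel in lane 0
    const GSVector4i vb = GSVector4i::load(static_cast<s32>(b));
    return static_cast<u32>(GSVector4i::store(va.avg8(vb)));     // _mm_avg_epu8, rounds halves up
  }
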