diff --git a/CMakeModules/DuckStationUtils.cmake b/CMakeModules/DuckStationUtils.cmake
index 066d3d832..af9be60f7 100644
--- a/CMakeModules/DuckStationUtils.cmake
+++ b/CMakeModules/DuckStationUtils.cmake
@@ -83,8 +83,8 @@ function(detect_architecture)
           AND CMAKE_SIZEOF_VOID_P EQUAL 4))
     message(STATUS "Building ARM32 binaries.")
     set(CPU_ARCH_ARM32 TRUE PARENT_SCOPE)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -marm -march=armv7-a" PARENT_SCOPE)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -marm -march=armv7-a" PARENT_SCOPE)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -marm -march=armv7-a -mfpu=neon-vfpv4" PARENT_SCOPE)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -marm -march=armv7-a -mfpu=neon-vfpv4" PARENT_SCOPE)
   elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "riscv64")
     message(STATUS "Building RISC-V 64 binaries.")
     set(CPU_ARCH_RISCV64 TRUE PARENT_SCOPE)
diff --git a/src/common-tests/gsvector_yuvtorgb_test.cpp b/src/common-tests/gsvector_yuvtorgb_test.cpp
index 684c14806..c4f60caed 100644
--- a/src/common-tests/gsvector_yuvtorgb_test.cpp
+++ b/src/common-tests/gsvector_yuvtorgb_test.cpp
@@ -108,7 +108,7 @@ TEST(GSVector, YUVToRGB)
 
 #if 0
 // Performance test
-u32 g_gsvector_yuvtorgb_temp[64];
+alignas(VECTOR_ALIGNMENT) u32 g_gsvector_yuvtorgb_temp[64];
 
 TEST(GSVector, YUVToRGB_Scalar)
 {
diff --git a/src/common/align.h b/src/common/align.h
index 6374a99f2..4414b4247 100644
--- a/src/common/align.h
+++ b/src/common/align.h
@@ -92,8 +92,8 @@ ALWAYS_INLINE static void* AlignedMalloc(size_t size, size_t alignment)
 #else
   // Unaligned sizes are slow on macOS.
 #ifdef __APPLE__
-    if (IsPow2(alignment))
-      size = (size + alignment - 1) & ~(alignment - 1);
+  if (IsPow2(alignment))
+    size = (size + alignment - 1) & ~(alignment - 1);
 #endif
   void* ret = nullptr;
   return (posix_memalign(&ret, alignment, size) == 0) ? ret : nullptr;
diff --git a/src/common/gsvector_neon.h b/src/common/gsvector_neon.h
index 7ffe8c552..9b3cd6a70 100644
--- a/src/common/gsvector_neon.h
+++ b/src/common/gsvector_neon.h
@@ -5,6 +5,7 @@
 #include "common/types.h"
 
 #include <algorithm>
+#include <cmath>
 
 #define GSVECTOR_HAS_UNSIGNED 1
 #define GSVECTOR_HAS_SRLV 1
@@ -86,7 +87,7 @@ public:
 
   ALWAYS_INLINE constexpr explicit GSVector2i(int32x2_t m) : v2s(m) {}
 
-  ALWAYS_INLINE explicit GSVector2i(const GSVector2& v, bool truncate = true);
+  ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);
 
   ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);
 
@@ -174,6 +175,8 @@ public:
     return GSVector2i(vreinterpret_s32_u32(vmax_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s))));
   }
 
+#ifdef CPU_ARCH_ARM64
+
   ALWAYS_INLINE u8 minv_u8() const { return vminv_u8(vreinterpret_u8_s32(v2s)); }
 
   ALWAYS_INLINE u16 maxv_u8() const { return vmaxv_u8(vreinterpret_u8_s32(v2s)); }
@@ -190,6 +193,56 @@ public:
 
   ALWAYS_INLINE u32 maxv_u32() const { return vmaxv_u32(v2s); }
 
+#else
+
+  ALWAYS_INLINE u8 minv_u8() const
+  {
+    uint8x8_t vmin = vmin_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1)));
+    return static_cast<u8>(
+      std::min(static_cast<u32>(vget_lane_u8(vmin, 0)),
+               std::min(static_cast<u32>(vget_lane_u8(vmin, 1)),
+                        std::min(static_cast<u32>(vget_lane_u8(vmin, 2)), static_cast<u32>(vget_lane_u8(vmin, 3))))));
+  }
+
+  ALWAYS_INLINE u16 maxv_u8() const
+  {
+    uint8x8_t vmax = vmax_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1)));
+    return static_cast<u8>(
+      std::max(static_cast<u32>(vget_lane_u8(vmax, 0)),
+               std::max(static_cast<u32>(vget_lane_u8(vmax, 1)),
+                        std::max(static_cast<u32>(vget_lane_u8(vmax, 2)), static_cast<u32>(vget_lane_u8(vmax, 3))))));
+  }
+
+  ALWAYS_INLINE u16 minv_u16() const
+  {
+    uint16x4_t vmin = vmin_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1)));
+    return static_cast<u16>(
+      std::min(static_cast<u32>(vget_lane_u16(vmin, 0)), static_cast<u32>(vget_lane_u16(vmin, 1))));
+  }
+
+  ALWAYS_INLINE u16 maxv_u16() const
+  {
+    uint16x4_t vmax = vmax_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1)));
+    return static_cast<u16>(
+      std::max<u32>(static_cast<u32>(vget_lane_u16(vmax, 0)), static_cast<u32>(vget_lane_u16(vmax, 1))));
+  }
+
+  ALWAYS_INLINE s32 minv_s32() const { return std::min<s32>(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); }
+
+  ALWAYS_INLINE u32 minv_u32() const
+  {
+    return std::min<u32>(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1));
+  }
+
+  ALWAYS_INLINE s32 maxv_s32() const { return std::max<s32>(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); }
+
+  ALWAYS_INLINE u32 maxv_u32() const
+  {
+    return std::max<u32>(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1));
+  }
+
+#endif
+
   ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); }
 
   ALWAYS_INLINE GSVector2i blend8(const GSVector2i& a, const GSVector2i& mask) const
@@ -249,6 +302,8 @@ public:
     return GSVector2i(vreinterpret_s32_u16(vqmovn_u32(vcombine_u32(vreinterpret_u32_s32(v2s), vcreate_u32(0)))));
   }
 
+#ifdef CPU_ARCH_ARM64
+
   ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
   {
     return GSVector2i(vreinterpret_s32_s8(vzip1_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s))));
@@ -272,6 +327,33 @@ public:
 
   ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip1_s32(v2s, vdup_n_s32(0))); }
 
+#else
+
+  ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)).val[0]));
+  }
+
+  ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const
+  {
+    return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)).val[0]));
+  }
+  ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(vzip_s32(v2s, v.v2s).val[0]); }
+
+  ALWAYS_INLINE GSVector2i upl8() const
+  {
+    return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0)).val[0]));
+  }
+
+  ALWAYS_INLINE GSVector2i upl16() const
+  {
+    return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(0)).val[0]));
+  }
+
+  ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip_s32(v2s, vdup_n_s32(0)).val[0]); }
+
+#endif
+
   ALWAYS_INLINE GSVector2i i8to16() const
   {
     return GSVector2i(vreinterpret_s32_s16(vget_low_s8(vmovl_s8(vreinterpret_s8_s32(v2s)))));
@@ -465,7 +547,7 @@ public:
 
   ALWAYS_INLINE bool eq(const GSVector2i& v) const
   {
-    return (vmaxv_u32(vreinterpret_u32_s32(veor_s32(v2s, v.v2s))) == 0);
+    return (vget_lane_u64(vreinterpret_u64_s32(veor_s32(v2s, v.v2s)), 0) == 0);
   }
 
   ALWAYS_INLINE GSVector2i eq8(const GSVector2i& v) const
@@ -483,11 +565,6 @@ public:
     return GSVector2i(vreinterpret_s32_u32(vceq_s32(v2s, v.v2s)));
   }
 
-  ALWAYS_INLINE GSVector2i eq64(const GSVector2i& v) const
-  {
-    return GSVector2i(vreinterpret_s32_u64(vceq_s64(vreinterpret_s64_s32(v2s), vreinterpret_s64_s32(v.v2s))));
-  }
-
   ALWAYS_INLINE GSVector2i neq8(const GSVector2i& v) const { return ~eq8(v); }
 
   ALWAYS_INLINE GSVector2i neq16(const GSVector2i& v) const { return ~eq16(v); }
@@ -553,13 +630,23 @@ public:
   ALWAYS_INLINE bool alltrue() const
   {
     // MSB should be set in all 8-bit lanes.
+#ifdef CPU_ARCH_ARM64
     return (vminv_u8(vreinterpret_u8_s32(v2s)) & 0x80) == 0x80;
+#else
+    return ((vget_lane_u32(vreinterpret_u32_s32(v2s), 0) & vget_lane_u32(vreinterpret_u32_s32(v2s), 1) & 0x80808080u) ==
+            0x80808080u);
+#endif
   }
 
   ALWAYS_INLINE bool allfalse() const
   {
     // MSB should be clear in all 8-bit lanes.
+#ifdef CPU_ARCH_ARM64
     return (vmaxv_u32(vreinterpret_u8_s32(v2s)) & 0x80) != 0x80;
+#else
+    return ((vget_lane_u32(vreinterpret_u32_s32(v2s), 0) & vget_lane_u32(vreinterpret_u32_s32(v2s), 1) & 0x80808080u) ==
+            0);
+#endif
   }
 
   template<int i>
@@ -744,10 +831,26 @@ public:
     return GSVector2(recip);
   }
 
+#ifdef CPU_ARCH_ARM64
+
   ALWAYS_INLINE GSVector2 floor() const { return GSVector2(vrndm_f32(v2s)); }
 
   ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(vrndp_f32(v2s)); }
 
+#else
+
+  ALWAYS_INLINE GSVector2 floor() const
+  {
+    return GSVector2(std::floor(vget_lane_f32(v2s, 0)), std::floor(vget_lane_f32(v2s, 1)));
+  }
+
+  ALWAYS_INLINE GSVector2 ceil() const
+  {
+    return GSVector2(std::ceil(vget_lane_f32(v2s, 0)), std::ceil(vget_lane_f32(v2s, 1)));
+  }
+
+#endif
+
   ALWAYS_INLINE GSVector2 sat(const GSVector2& a, const GSVector2& b) const { return max(a).min(b); }
 
   ALWAYS_INLINE GSVector2 sat(const float scale = 255) const { return sat(zero(), GSVector2(scale)); }
@@ -791,7 +894,11 @@ public:
   template<int src, int dst>
   ALWAYS_INLINE GSVector2 insert32(const GSVector2& v) const
   {
+#ifdef CPU_ARCH_ARM64
     return GSVector2(vcopy_lane_f32(v2s, dst, v.v2s, src));
+#else
+    return GSVector2(vset_lane_f32(vget_lane_f32(v.v2s, src), v2s, dst));
+#endif
   }
 
   template<int i>
@@ -800,7 +907,15 @@ public:
     return vget_lane_s32(vreinterpret_s32_f32(v2s), i);
   }
 
-  ALWAYS_INLINE float dot(const GSVector2& v) const { return vaddv_f32(vmul_f32(v2s, v.v2s)); }
+  ALWAYS_INLINE float dot(const GSVector2& v) const
+  {
+#ifdef CPU_ARCH_ARM64
+    return vaddv_f32(vmul_f32(v2s, v.v2s));
+#else
+    const float32x2_t dp = vmul_f32(v2s, v.v2s);
+    return vget_lane_f32(vadd_f32(dp, vdup_lane_f32(dp, 1)), 0);
+#endif
+  }
 
   ALWAYS_INLINE static GSVector2 zero() { return GSVector2(vdup_n_f32(0.0f)); }
 
@@ -817,7 +932,14 @@ public:
   ALWAYS_INLINE void operator+=(const GSVector2& v) { v2s = vadd_f32(v2s, v.v2s); }
   ALWAYS_INLINE void operator-=(const GSVector2& v) { v2s = vsub_f32(v2s, v.v2s); }
   ALWAYS_INLINE void operator*=(const GSVector2& v) { v2s = vmul_f32(v2s, v.v2s); }
-  ALWAYS_INLINE void operator/=(const GSVector2& v) { v2s = vdiv_f32(v2s, v.v2s); }
+  ALWAYS_INLINE void operator/=(const GSVector2& v)
+  {
+#ifdef CPU_ARCH_ARM64
+    v2s = vdiv_f32(v2s, v.v2s);
+#else
+    *this = GSVector2(vget_lane_f32(v2s, 0) / vget_lane_f32(v.v2s, 0), vget_lane_f32(v2s, 1) / vget_lane_f32(v.v2s, 1));
+#endif
+  }
 
   ALWAYS_INLINE void operator+=(float f) { *this += GSVector2(f); }
   ALWAYS_INLINE void operator-=(float f) { *this -= GSVector2(f); }
@@ -856,7 +978,12 @@ public:
 
   ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v1, const GSVector2& v2)
   {
+#ifdef CPU_ARCH_ARM64
     return GSVector2(vdiv_f32(v1.v2s, v2.v2s));
+#else
+    return GSVector2(vget_lane_f32(v1.v2s, 0) / vget_lane_f32(v2.v2s, 0),
+                     vget_lane_f32(v1.v2s, 1) / vget_lane_f32(v2.v2s, 1));
+#endif
   }
 
   ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v, float f) { return v + GSVector2(f); }
@@ -1013,8 +1140,8 @@ public:
   ALWAYS_INLINE explicit GSVector4i(int32x2_t m) : v4s(vcombine_s32(m, vcreate_s32(0))) {}
   ALWAYS_INLINE constexpr explicit GSVector4i(int32x4_t m) : v4s(m) {}
 
-  ALWAYS_INLINE explicit GSVector4i(const GSVector2& v, bool truncate = true);
-  ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true);
+  ALWAYS_INLINE explicit GSVector4i(const GSVector2& v);
+  ALWAYS_INLINE explicit GSVector4i(const GSVector4& v);
 
   ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);
 
@@ -1035,7 +1162,14 @@ public:
 
   ALWAYS_INLINE s32 rarea() const { return width() * height(); }
 
-  ALWAYS_INLINE bool rempty() const { return (vminv_u32(vreinterpret_u32_s32(vget_low_s32(lt32(zwzw())))) == 0); }
+  ALWAYS_INLINE bool rempty() const
+  {
+#ifdef CPU_ARCH_ARM64
+    return (vminv_u32(vreinterpret_u32_s32(vget_low_s32(lt32(zwzw())))) == 0);
+#else
+    return (vget_lane_u64(vreinterpret_u64_u32(vreinterpret_u32_s32(vget_low_s32(lt32(zwzw())))), 0) == 0);
+#endif
+  }
 
   ALWAYS_INLINE GSVector4i runion(const GSVector4i& a) const { return min_i32(a).upl64(max_i32(a).srl<8>()); }
 
@@ -1159,13 +1293,32 @@ public:
 
   ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const
   {
-    int32x4_t acc =
+#ifdef CPU_ARCH_ARM64
+    const int32x4_t acc =
+      vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
+    return GSVector4i(vmlal_high_s16(acc, vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)));
+#else
+    // borrowed from sse2neon
+    const int32x4_t low =
       vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
-    acc = vmlal_high_s16(acc, vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s));
-    return GSVector4i(acc);
+    const int32x4_t high =
+      vmlal_s16(vdupq_n_s32(0), vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
+    return GSVector4i(vcombine_s32(vpadd_s32(vget_low_s32(low), vget_high_s32(low)),
+                                   vpadd_s32(vget_low_s32(high), vget_high_s32(high))));
+#endif
   }
 
-  ALWAYS_INLINE GSVector4i addp_s32() const { return GSVector4i(vpaddq_s32(v4s, v4s)); }
+  ALWAYS_INLINE GSVector4i addp_s32() const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4i(vpaddq_s32(v4s, v4s));
+#else
+    const int32x2_t res = vpadd_s32(vget_low_s32(v4s), vget_high_s32(v4s));
+    return GSVector4i(vcombine_s32(res, res));
+#endif
+  }
+
+#ifdef CPU_ARCH_ARM64
 
   ALWAYS_INLINE u8 minv_u8() const { return vminvq_u8(vreinterpretq_u8_s32(v4s)); }
 
@@ -1183,6 +1336,70 @@ public:
 
   ALWAYS_INLINE u32 maxv_u32() const { return vmaxvq_u32(v4s); }
 
+#else
+
+  ALWAYS_INLINE u8 minv_u8() const
+  {
+    uint8x8_t vmin = vmin_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s)));
+    vmin = vmin_u8(vmin, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmin), 1)));
+    return static_cast<u8>(
+      std::min(static_cast<u32>(vget_lane_u8(vmin, 0)),
+               std::min(static_cast<u32>(vget_lane_u8(vmin, 1)),
+                        std::min(static_cast<u32>(vget_lane_u8(vmin, 2)), static_cast<u32>(vget_lane_u8(vmin, 3))))));
+  }
+
+  ALWAYS_INLINE u16 maxv_u8() const
+  {
+    uint8x8_t vmax = vmax_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s)));
+    vmax = vmax_u8(vmax, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmax), 1)));
+    return static_cast<u8>(
+      std::max(static_cast<u32>(vget_lane_u8(vmax, 0)),
+               std::max(static_cast<u32>(vget_lane_u8(vmax, 1)),
+                        std::max(static_cast<u32>(vget_lane_u8(vmax, 2)), static_cast<u32>(vget_lane_u8(vmax, 3))))));
+  }
+
+  ALWAYS_INLINE u16 minv_u16() const
+  {
+    uint16x4_t vmin = vmin_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s)));
+    vmin = vmin_u16(vmin, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmin), 1)));
+    return static_cast<u16>(
+      std::min(static_cast<u32>(vget_lane_u16(vmin, 0)), static_cast<u32>(vget_lane_u16(vmin, 1))));
+  }
+
+  ALWAYS_INLINE u16 maxv_u16() const
+  {
+    uint16x4_t vmax = vmax_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s)));
+    vmax = vmax_u16(vmax, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmax), 1)));
+    return static_cast<u16>(
+      std::max<u32>(static_cast<u32>(vget_lane_u16(vmax, 0)), static_cast<u32>(vget_lane_u16(vmax, 1))));
+  }
+
+  ALWAYS_INLINE s32 minv_s32() const
+  {
+    int32x2_t vmin = vmin_s32(vget_low_s32(v4s), vget_high_s32(v4s));
+    return std::min<s32>(vget_lane_s32(vmin, 0), vget_lane_s32(vmin, 1));
+  }
+
+  ALWAYS_INLINE u32 minv_u32() const
+  {
+    uint32x2_t vmin = vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s)));
+    return std::min<u32>(vget_lane_u32(vreinterpret_u32_s32(vmin), 0), vget_lane_u32(vreinterpret_u32_s32(vmin), 1));
+  }
+
+  ALWAYS_INLINE s32 maxv_s32() const
+  {
+    int32x2_t vmax = vmax_s32(vget_low_s32(v4s), vget_high_s32(v4s));
+    return std::max<s32>(vget_lane_s32(vmax, 0), vget_lane_s32(vmax, 1));
+  }
+
+  ALWAYS_INLINE u32 maxv_u32() const
+  {
+    uint32x2_t vmax = vmax_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s)));
+    return std::max<u32>(vget_lane_u32(vreinterpret_u32_s32(vmax), 0), vget_lane_u32(vreinterpret_u32_s32(vmax), 1));
+  }
+
+#endif
+
   ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); }
 
   ALWAYS_INLINE GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const
@@ -1224,7 +1441,13 @@ public:
 
   ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const
   {
+#ifdef CPU_ARCH_ARM64
     return GSVector4i(vreinterpretq_s32_s8(vqtbl1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_u8_s32(mask.v4s))));
+#else
+    int8x8x2_t split = {vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v4s))};
+    return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vtbl2_s8(split, vget_low_s8(vreinterpretq_s8_s32(mask.v4s))),
+                                                       vtbl2_s8(split, vget_high_s8(vreinterpretq_s8_s32(mask.v4s))))));
+#endif
   }
 
   ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const
@@ -1271,6 +1494,8 @@ public:
     return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v4s))));
   }
 
+#ifdef CPU_ARCH_ARM64
+
   ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const
   {
     return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))));
@@ -1341,6 +1566,81 @@ public:
     return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
   }
 
+#else
+
+  ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const
+  {
+    const int8x8x2_t res = vzip_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_low_s8(vreinterpretq_s8_s32(v.v4s)));
+    return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1])));
+  }
+
+  ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const
+  {
+    const int8x8x2_t res = vzip_s8(vget_high_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v.v4s)));
+    return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1])));
+  }
+
+  ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const
+  {
+    const int16x4x2_t res =
+      vzip_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s)));
+    return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1])));
+  }
+
+  ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const
+  {
+    const int16x4x2_t res =
+      vzip_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s)));
+    return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1])));
+  }
+
+  ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const
+  {
+    const int32x2x2_t res = vzip_s32(vget_low_s32(v4s), vget_low_s32(v.v4s));
+    return GSVector4i(vcombine_s32(res.val[0], res.val[1]));
+  }
+
+  ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const
+  {
+    const int32x2x2_t res = vzip_s32(vget_high_s32(v4s), vget_high_s32(v.v4s));
+    return GSVector4i(vcombine_s32(res.val[0], res.val[1]));
+  }
+
+  ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const
+  {
+    return GSVector4i(vreinterpretq_s32_s64(
+      vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s)))));
+  }
+
+  ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const
+  {
+    return GSVector4i(vreinterpretq_s32_s64(
+      vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s)))));
+  }
+
+  ALWAYS_INLINE GSVector4i upl8() const { return upl8(GSVector4i(vdupq_n_s32(0))); }
+
+  ALWAYS_INLINE GSVector4i uph8() const { return uph8(GSVector4i(vdupq_n_s32(0))); }
+
+  ALWAYS_INLINE GSVector4i upl16() const { return upl16(GSVector4i(vdupq_n_s32(0))); }
+
+  ALWAYS_INLINE GSVector4i uph16() const { return uph16(GSVector4i(vdupq_n_s32(0))); }
+
+  ALWAYS_INLINE GSVector4i upl32() const { return upl32(GSVector4i(vdupq_n_s32(0))); }
+
+  ALWAYS_INLINE GSVector4i uph32() const { return uph32(GSVector4i(vdupq_n_s32(0))); }
+
+  ALWAYS_INLINE GSVector4i upl64() const
+  {
+    return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
+  }
+
+  ALWAYS_INLINE GSVector4i uph64() const
+  {
+    return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0))));
+  }
+#endif
+
   ALWAYS_INLINE GSVector4i i8to16() const
   {
     return GSVector4i(vreinterpretq_s32_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))));
@@ -1537,11 +1837,14 @@ public:
     return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s16(-i))));
   }
 
+#ifdef CPU_ARCH_ARM64
+  // not on arm32, hopefully we can do without
   ALWAYS_INLINE GSVector4i srav64(const GSVector4i& v) const
   {
     return GSVector4i(
       vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s)))));
   }
+#endif
 
   template<int i>
   ALWAYS_INLINE GSVector4i srl64() const
@@ -1554,11 +1857,13 @@ public:
     return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vdupq_n_u16(-i))));
   }
 
+#ifdef CPU_ARCH_ARM64
   ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const
   {
     return GSVector4i(
       vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s)))));
   }
+#endif
 
   ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const
   {
@@ -1588,7 +1893,14 @@ public:
     // return GSVector4i(vreinterpretq_s32_s16(vpaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))));
     const int16x8_t a = vreinterpretq_s16_s32(v4s);
     const int16x8_t b = vreinterpretq_s16_s32(v.v4s);
+#ifdef CPU_ARCH_ARM64
     return GSVector4i(vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    // sse2neon again
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(ab0246, ab1357)));
+#endif
   }
 
   ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const
@@ -1719,7 +2031,13 @@ public:
 
   ALWAYS_INLINE bool eq(const GSVector4i& v) const
   {
-    return (vmaxvq_u32(vreinterpretq_u32_s32(veorq_s32(v4s, v.v4s))) == 0);
+    const int32x4_t res = veorq_s32(v4s, v.v4s);
+#ifdef CPU_ARCH_ARM64
+    return (vmaxvq_u32(vreinterpretq_u32_s32(res)) == 0);
+#else
+    const int32x2_t paired = vorr_s32(vget_low_s32(res), vget_high_s32(res));
+    return (vget_lane_u64(vreinterpret_u64_s32(paired), 0) == 0);
+#endif
   }
 
   ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const
@@ -1737,10 +2055,12 @@ public:
     return GSVector4i(vreinterpretq_s32_u32(vceqq_s32(v4s, v.v4s)));
   }
 
+#ifdef CPU_ARCH_ARM64
   ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const
   {
     return GSVector4i(vreinterpretq_s32_u64(vceqq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s))));
   }
+#endif
 
   ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); }
 
@@ -1807,13 +2127,23 @@ public:
   ALWAYS_INLINE bool alltrue() const
   {
     // MSB should be set in all 8-bit lanes.
+#ifdef CPU_ARCH_ARM64
     return (vminvq_u8(vreinterpretq_u8_s32(v4s)) & 0x80) == 0x80;
+#else
+    const uint32x2_t res = vreinterpret_u32_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s)));
+    return ((vget_lane_u32(res, 0) & vget_lane_u32(res, 1) & 0x80808080u) == 0x80808080u);
+#endif
   }
 
   ALWAYS_INLINE bool allfalse() const
   {
     // MSB should be clear in all 8-bit lanes.
+#ifdef CPU_ARCH_ARM64
     return (vmaxvq_u32(vreinterpretq_u8_s32(v4s)) & 0x80) != 0x80;
+#else
+    const uint32x2_t res = vreinterpret_u32_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s)));
+    return ((vget_lane_u32(res, 0) & vget_lane_u32(res, 1) & 0x80808080u) == 0);
+#endif
   }
 
   template<int i>
@@ -2099,10 +2429,7 @@ public:
     v4s = vld1q_f32(arr);
   }
 
-  ALWAYS_INLINE GSVector4(float x, float y)
-  {
-    v4s = vzip1q_f32(vsetq_lane_f32(x, vdupq_n_f32(0.0f), 0), vsetq_lane_f32(y, vdupq_n_f32(0.0f), 0));
-  }
+  ALWAYS_INLINE GSVector4(float x, float y) { v4s = vsetq_lane_f32(x, vsetq_lane_f32(y, vdupq_n_f32(0.0f), 1), 0); }
 
   ALWAYS_INLINE GSVector4(int x, int y, int z, int w)
   {
@@ -2112,7 +2439,7 @@ public:
 
   ALWAYS_INLINE GSVector4(int x, int y)
   {
-    v4s = vcvtq_f32_s32(vzip1q_s32(vsetq_lane_s32(x, vdupq_n_s32(0), 0), vsetq_lane_s32(y, vdupq_n_s32(0), 0)));
+    v4s = vcvtq_f32_s32(vsetq_lane_s32(x, vsetq_lane_s32(y, vdupq_n_s32(0), 0), 0));
   }
 
   ALWAYS_INLINE explicit GSVector4(const GSVector2& v) { v4s = vcombine_f32(v.v2s, vcreate_f32(0)); }
@@ -2129,10 +2456,12 @@ public:
 
   ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v);
 
+#ifdef CPU_ARCH_ARM64
   ALWAYS_INLINE static GSVector4 f64(double x, double y)
   {
     return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1)));
   }
+#endif
 
   ALWAYS_INLINE void operator=(float f) { v4s = vdupq_n_f32(f); }
 
@@ -2140,19 +2469,6 @@ public:
 
   ALWAYS_INLINE operator float32x4_t() const { return v4s; }
 
-  /// Makes Clang think that the whole vector is needed, preventing it from changing shuffles around because it thinks
-  /// we don't need the whole vector Useful for e.g. preventing clang from optimizing shuffles that remove
-  /// possibly-denormal garbage data from vectors before computing with them
-  ALWAYS_INLINE GSVector4 noopt()
-  {
-    // Note: Clang is currently the only compiler that attempts to optimize vector intrinsics, if that changes in the
-    // future the implementation should be updated
-#ifdef __clang__
-    // __asm__("":"+x"(m)::);
-#endif
-    return *this;
-  }
-
   ALWAYS_INLINE u32 rgba32() const { return GSVector4i(*this).rgba32(); }
 
   ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); }
@@ -2172,10 +2488,28 @@ public:
     return GSVector4(recip);
   }
 
+#ifdef _M_ARM64
+
   ALWAYS_INLINE GSVector4 floor() const { return GSVector4(vrndmq_f32(v4s)); }
 
   ALWAYS_INLINE GSVector4 ceil() const { return GSVector4(vrndpq_f32(v4s)); }
 
+#else
+
+  ALWAYS_INLINE GSVector4 floor() const
+  {
+    return GSVector4(std::floor(vgetq_lane_f32(v4s, 0)), std::floor(vgetq_lane_f32(v4s, 1)),
+                     std::floor(vgetq_lane_f32(v4s, 2)), std::floor(vgetq_lane_f32(v4s, 3)));
+  }
+
+  ALWAYS_INLINE GSVector4 ceil() const
+  {
+    return GSVector4(std::ceil(vgetq_lane_f32(v4s, 0)), std::ceil(vgetq_lane_f32(v4s, 1)),
+                     std::ceil(vgetq_lane_f32(v4s, 2)), std::ceil(vgetq_lane_f32(v4s, 3)));
+  }
+
+#endif
+
   ALWAYS_INLINE GSVector4 madd(const GSVector4& a, const GSVector4& b) const
   {
     return GSVector4(vfmaq_f32(b.v4s, v4s, a.v4s));
@@ -2197,6 +2531,8 @@ public:
     return a.nmadd(b, *this); // *this - a * b
   }
 
+#ifdef CPU_ARCH_ARM64
+
   ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(vpaddq_f32(v4s, v4s)); }
 
   ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(vpaddq_f32(v4s, v.v4s)); }
@@ -2208,12 +2544,46 @@ public:
     return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v.v4s), vuzp2q_f32(v4s, v.v4s)));
   }
 
+#else
+
+  ALWAYS_INLINE GSVector4 hadd() const
+  {
+    const float32x2_t res = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s));
+    return GSVector4(vcombine_f32(res, res));
+  }
+
+  ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const
+  {
+    const float32x2_t res1 = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s));
+    const float32x2_t res2 = vpadd_f32(vget_low_f32(v.v4s), vget_high_f32(v.v4s));
+    return GSVector4(vcombine_f32(res1, res2));
+  }
+
+  ALWAYS_INLINE GSVector4 hsub() const
+  {
+    const float32x4x2_t res = vuzpq_f32(v4s, v4s);
+    return GSVector4(vsubq_f32(res.val[0], res.val[0]));
+  }
+
+  ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const
+  {
+    const float32x4x2_t res = vuzpq_f32(v4s, v.v4s);
+    return GSVector4(vsubq_f32(res.val[0], res.val[1]));
+  }
+
+#endif
+
   ALWAYS_INLINE GSVector4 sat(const GSVector4& a, const GSVector4& b) const { return max(a).min(b); }
 
   ALWAYS_INLINE GSVector4 sat(const GSVector4& a) const
   {
+#ifdef CPU_ARCH_ARM64
     const GSVector4 minv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 0)));
     const GSVector4 maxv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 1)));
+#else
+    const GSVector4 minv(a.xyxy());
+    const GSVector4 maxv(a.zwzw());
+#endif
     return sat(minv, maxv);
   }
 
@@ -2239,6 +2609,8 @@ public:
     return GSVector4(vbslq_f32(bitmask, a.v4s, v4s));
   }
 
+#ifdef CPU_ARCH_ARM64
+
   ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const { return GSVector4(vzip1q_f32(v4s, a.v4s)); }
 
   ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const { return GSVector4(vzip2q_f32(v4s, a.v4s)); }
@@ -2253,6 +2625,34 @@ public:
     return GSVector4(vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s))));
   }
 
+#else
+
+  ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const
+  {
+    const float32x2x2_t res = vzip_f32(vget_low_f32(v4s), vget_low_f32(a.v4s));
+    return GSVector4(vcombine_f32(res.val[0], res.val[1]));
+  }
+
+  ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const
+  {
+    const float32x2x2_t res = vzip_f32(vget_high_f32(v4s), vget_high_f32(a.v4s));
+    return GSVector4(vcombine_f32(res.val[0], res.val[1]));
+  }
+
+  ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const
+  {
+    return GSVector4(vreinterpretq_f32_s64(
+      vcombine_s64(vget_low_s64(vreinterpretq_s64_f32(v4s)), vget_low_s64(vreinterpretq_s64_f32(a.v4s)))));
+  }
+
+  ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const
+  {
+    return GSVector4(vreinterpretq_f32_s64(
+      vcombine_s64(vget_high_s64(vreinterpretq_s64_f32(v4s)), vget_high_s64(vreinterpretq_s64_f32(a.v4s)))));
+  }
+
+#endif
+
   ALWAYS_INLINE GSVector4 l2h(const GSVector4& a) const
   {
     return GSVector4(vcombine_f32(vget_low_f32(v4s), vget_low_f32(a.v4s)));
@@ -2270,8 +2670,15 @@ public:
 
   ALWAYS_INLINE int mask() const
   {
+#ifdef CPU_ARCH_ARM64
     static constexpr const int32_t shifts[] = {0, 1, 2, 3};
     return static_cast<int>(vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31), vld1q_s32(shifts))));
+#else
+    // sse2neon again
+    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31));
+    uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
+    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
+#endif
   }
 
   ALWAYS_INLINE bool alltrue() const
@@ -2290,7 +2697,11 @@ public:
   template<int src, int dst>
   ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const
   {
+#ifdef CPU_ARCH_ARM64
     return GSVector4(vcopyq_laneq_f32(v4s, dst, v.v4s, src));
+#else
+    return GSVector4(vsetq_lane_f32(vgetq_lane_f32(v.v4s, src), v4s, dst));
+#endif
   }
 
   template<int i>
@@ -2320,12 +2731,20 @@ public:
 
   ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
   {
+#ifdef CPU_ARCH_ARM64
     vst1_f64((double*)p, vget_low_f64(vreinterpretq_f64_f32(v.v4s)));
+#else
+    vst1_s64((s64*)p, vget_low_s64(vreinterpretq_s64_f32(v.v4s)));
+#endif
   }
 
   ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
   {
+#ifdef CPU_ARCH_ARM64
     vst1_f64((double*)p, vget_high_f64(vreinterpretq_f64_f32(v.v4s)));
+#else
+    vst1_s64((s64*)p, vget_high_s64(vreinterpretq_s64_f32(v.v4s)));
+#endif
   }
 
   template<bool aligned>
@@ -2341,12 +2760,29 @@ public:
   ALWAYS_INLINE void operator+=(const GSVector4& v) { v4s = vaddq_f32(v4s, v.v4s); }
   ALWAYS_INLINE void operator-=(const GSVector4& v) { v4s = vsubq_f32(v4s, v.v4s); }
   ALWAYS_INLINE void operator*=(const GSVector4& v) { v4s = vmulq_f32(v4s, v.v4s); }
-  ALWAYS_INLINE void operator/=(const GSVector4& v) { v4s = vdivq_f32(v4s, v.v4s); }
+  ALWAYS_INLINE void operator/=(const GSVector4& v)
+  {
+#ifdef CPU_ARCH_ARM64
+    v4s = vdivq_f32(v4s, v.v4s);
+#else
+    *this =
+      GSVector4(vgetq_lane_f32(v4s, 0) / vgetq_lane_f32(v.v4s, 0), vgetq_lane_f32(v4s, 1) / vgetq_lane_f32(v.v4s, 1),
+                vgetq_lane_f32(v4s, 2) / vgetq_lane_f32(v.v4s, 2), vgetq_lane_f32(v4s, 3) / vgetq_lane_f32(v.v4s, 3));
+#endif
+  }
 
   ALWAYS_INLINE void operator+=(float f) { *this += GSVector4(f); }
   ALWAYS_INLINE void operator-=(float f) { *this -= GSVector4(f); }
   ALWAYS_INLINE void operator*=(float f) { *this *= GSVector4(f); }
-  ALWAYS_INLINE void operator/=(float f) { *this /= GSVector4(f); }
+  ALWAYS_INLINE void operator/=(float f)
+  {
+#ifdef CPU_ARCH_ARM64
+    *this /= GSVector4(f);
+#else
+    *this = GSVector4(vgetq_lane_f32(v4s, 0) / f, vgetq_lane_f32(v4s, 1) / f, vgetq_lane_f32(v4s, 2) / f,
+                      vgetq_lane_f32(v4s, 3) / f);
+#endif
+  }
 
   ALWAYS_INLINE void operator&=(const GSVector4& v)
   {
@@ -2380,13 +2816,27 @@ public:
 
   ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2)
   {
+#ifdef CPU_ARCH_ARM64
     return GSVector4(vdivq_f32(v1.v4s, v2.v4s));
+#else
+    return GSVector4(
+      vgetq_lane_f32(v1.v4s, 0) / vgetq_lane_f32(v2.v4s, 0), vgetq_lane_f32(v1.v4s, 1) / vgetq_lane_f32(v2.v4s, 1),
+      vgetq_lane_f32(v1.v4s, 2) / vgetq_lane_f32(v2.v4s, 2), vgetq_lane_f32(v1.v4s, 3) / vgetq_lane_f32(v2.v4s, 3));
+#endif
   }
 
   ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); }
   ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); }
   ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); }
-  ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f) { return v / GSVector4(f); }
+  ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f)
+  {
+#ifdef CPU_ARCH_ARM64
+    return v / GSVector4(f);
+#else
+    return GSVector4(vgetq_lane_f32(v.v4s, 0) / f, vgetq_lane_f32(v.v4s, 1) / f, vgetq_lane_f32(v.v4s, 2) / f,
+                     vgetq_lane_f32(v.v4s, 3) / f);
+#endif
+  }
 
   ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2)
   {
@@ -2434,6 +2884,9 @@ public:
     return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s)));
   }
 
+#ifdef CPU_ARCH_ARM64
+  // Not in ARM32
+
   ALWAYS_INLINE GSVector4 mul64(const GSVector4& v) const
   {
     return GSVector4(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s)));
@@ -2459,14 +2912,15 @@ public:
     return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast<const float*>(p)))));
   }
 
-  ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const
+  ALWAYS_INLINE GSVector4i f64toi32() const
   {
-    const float64x2_t r = truncate ? v4s : vrndiq_f64(vreinterpretq_f64_f32(v4s));
-    const s32 low = static_cast<s32>(vgetq_lane_f64(r, 0));
-    const s32 high = static_cast<s32>(vgetq_lane_f64(r, 1));
+    const s32 low = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 0));
+    const s32 high = static_cast<s32>(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 1));
     return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1));
   }
 
+#endif
+
   // clang-format off
 
 #define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \
@@ -2498,21 +2952,39 @@ public:
 
   // clang-format on
 
-  ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(vdupq_laneq_f32(v4s, 0)); }
+  ALWAYS_INLINE GSVector4 broadcast32() const
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vdupq_laneq_f32(v4s, 0));
+#else
+    return xxxx();
+#endif
+  }
 
-  ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v) { return GSVector4(vdupq_laneq_f32(v.v4s, 0)); }
+  ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v)
+  {
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vdupq_laneq_f32(v.v4s, 0));
+#else
+    return v.xxxx();
+#endif
+  }
 
   ALWAYS_INLINE static GSVector4 broadcast32(const void* f) { return GSVector4(vld1q_dup_f32((const float*)f)); }
 
   ALWAYS_INLINE static GSVector4 broadcast64(const void* f)
   {
-    return GSVector4(vreinterpretq_f64_f32(vld1q_dup_f64((const double*)f)));
+#ifdef CPU_ARCH_ARM64
+    return GSVector4(vreinterpretq_f32_f64(vld1q_dup_f64((const double*)f)));
+#else
+    return GSVector4(vreinterpretq_f32_s64(vld1q_dup_s64((const s64*)f)));
+#endif
   }
 };
 
-ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v, bool truncate)
+ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v)
 {
-  v2s = truncate ? vcvt_s32_f32(v.v2s) : vcvtn_u32_f32(v.v2s);
+  v2s = vcvt_s32_f32(v.v2s);
 }
 
 ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v)
@@ -2530,9 +3002,9 @@ ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
   return GSVector2(vreinterpret_f32_s32(v.v2s));
 }
 
-ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate)
+ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v)
 {
-  v4s = truncate ? vcvtq_s32_f32(v.v4s) : vcvtnq_u32_f32(v.v4s);
+  v4s = vcvtq_s32_f32(v.v4s);
 }
 
 ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v)
diff --git a/src/common/gsvector_nosimd.h b/src/common/gsvector_nosimd.h
index 02d200b51..b82e77485 100644
--- a/src/common/gsvector_nosimd.h
+++ b/src/common/gsvector_nosimd.h
@@ -121,7 +121,7 @@ public:
   // so leave the non-constexpr version default
   ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; }
 
-  ALWAYS_INLINE explicit GSVector2i(const GSVector2& v, bool truncate = true);
+  ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);
 
   ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);
 
@@ -955,7 +955,7 @@ public:
   // so leave the non-constexpr version default
   ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; }
 
-  ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true);
+  ALWAYS_INLINE explicit GSVector4i(const GSVector4& v);
 
   ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);
 
@@ -1879,8 +1879,6 @@ public:
 
   ALWAYS_INLINE void operator=(float f) { x = y = z = w = f; }
 
-  ALWAYS_INLINE GSVector4 noopt() { return *this; }
-
   u32 rgba32() const { return GSVector4i(*this).rgba32(); }
 
   ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); }
@@ -2316,7 +2314,7 @@ public:
     return ret;
   }
 
-  ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const
+  ALWAYS_INLINE GSVector4i f64toi32() const
   {
     return GSVector4i(static_cast<s32>(F64[0]), static_cast<s32>(F64[1]), 0, 0);
   }
@@ -2372,9 +2370,8 @@ public:
   }
 };
 
-ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v, bool truncate)
+ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v)
 {
-  // TODO: Truncation vs rounding...
   x = static_cast<s32>(v.x);
   y = static_cast<s32>(v.y);
 }
@@ -2399,9 +2396,8 @@ ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
   return ret;
 }
 
-ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate)
+ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v)
 {
-  // TODO: Truncation vs rounding...
   x = static_cast<s32>(v.x);
   y = static_cast<s32>(v.y);
   z = static_cast<s32>(v.z);
diff --git a/src/common/gsvector_sse.h b/src/common/gsvector_sse.h
index a876b2acf..8d9501bb7 100644
--- a/src/common/gsvector_sse.h
+++ b/src/common/gsvector_sse.h
@@ -88,7 +88,7 @@ public:
   // so leave the non-constexpr version default
   ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; }
 
-  ALWAYS_INLINE explicit GSVector2i(const GSVector2& v, bool truncate = true);
+  ALWAYS_INLINE explicit GSVector2i(const GSVector2& v);
 
   ALWAYS_INLINE static GSVector2i cast(const GSVector2& v);
 
@@ -840,9 +840,9 @@ public:
   // so leave the non-constexpr version default
   ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; }
 
-  ALWAYS_INLINE explicit GSVector4i(const GSVector2& v, bool truncate = true);
+  ALWAYS_INLINE explicit GSVector4i(const GSVector2& v);
 
-  ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true);
+  ALWAYS_INLINE explicit GSVector4i(const GSVector4& v);
 
   ALWAYS_INLINE static GSVector4i cast(const GSVector4& v);
 
@@ -1952,9 +1952,9 @@ public:
     return GSVector4(_mm_cvtps_pd(_mm_castpd_ps(_mm_load_sd(static_cast<const double*>(p)))));
   }
 
-  ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const
+  ALWAYS_INLINE GSVector4i f64toi32() const
   {
-    return GSVector4i(truncate ? _mm_cvttpd_epi32(_mm_castps_pd(m)) : _mm_cvtpd_epi32(_mm_castps_pd(m)));
+    return GSVector4i(_mm_cvttpd_epi32(_mm_castps_pd(m)));
   }
 
   // clang-format off
@@ -2007,9 +2007,9 @@ public:
   }
 };
 
-ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v, bool truncate)
+ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v)
 {
-  m = truncate ? _mm_cvttps_epi32(v) : _mm_cvtps_epi32(v);
+  m = _mm_cvttps_epi32(v);
 }
 
 ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v)
@@ -2027,9 +2027,9 @@ ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v)
   return GSVector2(_mm_castsi128_ps(v.m));
 }
 
-ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate)
+ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v)
 {
-  m = truncate ? _mm_cvttps_epi32(v) : _mm_cvtps_epi32(v);
+  m = _mm_cvttps_epi32(v);
 }
 
 ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v)
diff --git a/src/common/intrin.h b/src/common/intrin.h
index 1b0587e7e..225cd4d97 100644
--- a/src/common/intrin.h
+++ b/src/common/intrin.h
@@ -66,8 +66,10 @@ ALWAYS_INLINE_RELEASE static void MemsetPtrs(T* ptr, T value, u32 count)
 
 #if defined(CPU_ARCH_SSE)
   const __m128i svalue = _mm_set1_epi64x(reinterpret_cast<intptr_t>(value));
-#elif defined(CPU_ARCH_NEON)
+#elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM64)
   const uint64x2_t svalue = vdupq_n_u64(reinterpret_cast<uintptr_t>(value));
+#elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM32)
+  const uint32x4_t svalue = vdupq_n_u32(reinterpret_cast<uintptr_t>(value));
 #endif
 
   // Clang gets way too eager and tries to unroll these, emitting thousands of instructions.
@@ -78,8 +80,10 @@ ALWAYS_INLINE_RELEASE static void MemsetPtrs(T* ptr, T value, u32 count)
   {
 #if defined(CPU_ARCH_SSE)
     _mm_store_si128(reinterpret_cast<__m128i*>(dest), svalue);
-#elif defined(CPU_ARCH_NEON)
+#elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM64)
     vst1q_u64(reinterpret_cast<u64*>(dest), svalue);
+#elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM32)
+    vst1q_u32(reinterpret_cast<u32*>(dest), svalue);
 #endif
     dest += PTRS_PER_VECTOR;
   }
diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp
index 68ab18330..47b92e544 100644
--- a/src/core/cpu_code_cache.cpp
+++ b/src/core/cpu_code_cache.cpp
@@ -14,6 +14,7 @@
 
 #include "util/page_fault_handler.h"
 
+#include "common/align.h"
 #include "common/assert.h"
 #include "common/error.h"
 #include "common/intrin.h"
@@ -456,15 +457,15 @@ CPU::CodeCache::Block* CPU::CodeCache::CreateBlock(u32 pc, const BlockInstructio
       s_blocks.erase(it);
 
       block->~Block();
-      std::free(block);
+      Common::AlignedFree(block);
       block = nullptr;
     }
   }
 
   if (!block)
   {
-    block =
-      static_cast<Block*>(std::malloc(sizeof(Block) + (sizeof(Instruction) * size) + (sizeof(InstructionInfo) * size)));
+    block = static_cast<Block*>(Common::AlignedMalloc(
+      sizeof(Block) + (sizeof(Instruction) * size) + (sizeof(InstructionInfo) * size), alignof(Block)));
     Assert(block);
     new (block) Block();
     s_blocks.push_back(block);
@@ -734,7 +735,7 @@ void CPU::CodeCache::ClearBlocks()
   for (Block* block : s_blocks)
   {
     block->~Block();
-    std::free(block);
+    Common::AlignedFree(block);
   }
   s_blocks.clear();
 
diff --git a/src/core/cpu_recompiler_code_generator_aarch32.cpp b/src/core/cpu_recompiler_code_generator_aarch32.cpp
index f95140713..c9a53236c 100644
--- a/src/core/cpu_recompiler_code_generator_aarch32.cpp
+++ b/src/core/cpu_recompiler_code_generator_aarch32.cpp
@@ -148,7 +148,7 @@ void CPU::Recompiler::armEmitFarLoad(vixl::aarch32::Assembler* armAsm, const vix
 }
 
 void CPU::Recompiler::armEmitFarStore(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg,
-                                      const void* addr, const vixl::aarch64::Register& tempreg)
+                                      const void* addr, const vixl::aarch32::Register& tempreg)
 {
   armMoveAddressToReg(armAsm, tempreg, addr);
   armAsm->str(reg, vixl::aarch32::MemOperand(tempreg));
@@ -1931,12 +1931,12 @@ void CodeGenerator::EmitICacheCheckAndUpdate()
   {
     if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks))
     {
-      armEmitFarLoad(m_emit, RARG2, GetFetchMemoryAccessTimePtr());
-      m_emit->ldr(RARG1, a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks)));
-      m_emit->Mov(RARG3, m_block->size);
-      m_emit->mul(RARG2, RARG2, RARG3);
-      m_emit->add(RARG1, RARG1, RARG2);
-      m_emit->str(RARG1, a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks)));
+      armEmitFarLoad(m_emit, GetHostReg32(RARG2), GetFetchMemoryAccessTimePtr());
+      m_emit->ldr(GetHostReg32(RARG1), a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks)));
+      m_emit->Mov(GetHostReg32(RARG3), m_block->size);
+      m_emit->mul(GetHostReg32(RARG2), GetHostReg32(RARG2), GetHostReg32(RARG3));
+      m_emit->add(GetHostReg32(RARG1), GetHostReg32(RARG1), GetHostReg32(RARG2));
+      m_emit->str(GetHostReg32(RARG1), a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks)));
     }
     else
     {