diff --git a/CMakeModules/DuckStationUtils.cmake b/CMakeModules/DuckStationUtils.cmake index 066d3d832..af9be60f7 100644 --- a/CMakeModules/DuckStationUtils.cmake +++ b/CMakeModules/DuckStationUtils.cmake @@ -83,8 +83,8 @@ function(detect_architecture) AND CMAKE_SIZEOF_VOID_P EQUAL 4)) message(STATUS "Building ARM32 binaries.") set(CPU_ARCH_ARM32 TRUE PARENT_SCOPE) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -marm -march=armv7-a" PARENT_SCOPE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -marm -march=armv7-a" PARENT_SCOPE) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -marm -march=armv7-a -mfpu=neon-vfpv4" PARENT_SCOPE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -marm -march=armv7-a -mfpu=neon-vfpv4" PARENT_SCOPE) elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "riscv64") message(STATUS "Building RISC-V 64 binaries.") set(CPU_ARCH_RISCV64 TRUE PARENT_SCOPE) diff --git a/src/common-tests/gsvector_yuvtorgb_test.cpp b/src/common-tests/gsvector_yuvtorgb_test.cpp index 684c14806..c4f60caed 100644 --- a/src/common-tests/gsvector_yuvtorgb_test.cpp +++ b/src/common-tests/gsvector_yuvtorgb_test.cpp @@ -108,7 +108,7 @@ TEST(GSVector, YUVToRGB) #if 0 // Performance test -u32 g_gsvector_yuvtorgb_temp[64]; +alignas(VECTOR_ALIGNMENT) u32 g_gsvector_yuvtorgb_temp[64]; TEST(GSVector, YUVToRGB_Scalar) { diff --git a/src/common/align.h b/src/common/align.h index 6374a99f2..4414b4247 100644 --- a/src/common/align.h +++ b/src/common/align.h @@ -92,8 +92,8 @@ ALWAYS_INLINE static void* AlignedMalloc(size_t size, size_t alignment) #else // Unaligned sizes are slow on macOS. #ifdef __APPLE__ - if (IsPow2(alignment)) - size = (size + alignment - 1) & ~(alignment - 1); + if (IsPow2(alignment)) + size = (size + alignment - 1) & ~(alignment - 1); #endif void* ret = nullptr; return (posix_memalign(&ret, alignment, size) == 0) ? ret : nullptr; diff --git a/src/common/gsvector_neon.h b/src/common/gsvector_neon.h index 7ffe8c552..9b3cd6a70 100644 --- a/src/common/gsvector_neon.h +++ b/src/common/gsvector_neon.h @@ -5,6 +5,7 @@ #include "common/types.h" #include +#include #define GSVECTOR_HAS_UNSIGNED 1 #define GSVECTOR_HAS_SRLV 1 @@ -86,7 +87,7 @@ public: ALWAYS_INLINE constexpr explicit GSVector2i(int32x2_t m) : v2s(m) {} - ALWAYS_INLINE explicit GSVector2i(const GSVector2& v, bool truncate = true); + ALWAYS_INLINE explicit GSVector2i(const GSVector2& v); ALWAYS_INLINE static GSVector2i cast(const GSVector2& v); @@ -174,6 +175,8 @@ public: return GSVector2i(vreinterpret_s32_u32(vmax_u32(vreinterpret_u32_s32(v2s), vreinterpret_u32_s32(v.v2s)))); } +#ifdef CPU_ARCH_ARM64 + ALWAYS_INLINE u8 minv_u8() const { return vminv_u8(vreinterpret_u8_s32(v2s)); } ALWAYS_INLINE u16 maxv_u8() const { return vmaxv_u8(vreinterpret_u8_s32(v2s)); } @@ -190,6 +193,56 @@ public: ALWAYS_INLINE u32 maxv_u32() const { return vmaxv_u32(v2s); } +#else + + ALWAYS_INLINE u8 minv_u8() const + { + uint8x8_t vmin = vmin_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1))); + return static_cast( + std::min(static_cast(vget_lane_u8(vmin, 0)), + std::min(static_cast(vget_lane_u8(vmin, 1)), + std::min(static_cast(vget_lane_u8(vmin, 2)), static_cast(vget_lane_u8(vmin, 3)))))); + } + + ALWAYS_INLINE u16 maxv_u8() const + { + uint8x8_t vmax = vmax_u8(vreinterpret_u8_s32(v2s), vreinterpret_u8_s32(vdup_lane_s32(v2s, 1))); + return static_cast( + std::max(static_cast(vget_lane_u8(vmax, 0)), + std::max(static_cast(vget_lane_u8(vmax, 1)), + std::max(static_cast(vget_lane_u8(vmax, 2)), static_cast(vget_lane_u8(vmax, 3)))))); + } + + ALWAYS_INLINE u16 minv_u16() const + { + uint16x4_t vmin = vmin_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1))); + return static_cast( + std::min(static_cast(vget_lane_u16(vmin, 0)), static_cast(vget_lane_u16(vmin, 1)))); + } + + ALWAYS_INLINE u16 maxv_u16() const + { + uint16x4_t vmax = vmax_u16(vreinterpret_u16_s32(v2s), vreinterpret_u16_s32(vdup_lane_s32(v2s, 1))); + return static_cast( + std::max(static_cast(vget_lane_u16(vmax, 0)), static_cast(vget_lane_u16(vmax, 1)))); + } + + ALWAYS_INLINE s32 minv_s32() const { return std::min(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); } + + ALWAYS_INLINE u32 minv_u32() const + { + return std::min(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1)); + } + + ALWAYS_INLINE s32 maxv_s32() const { return std::max(vget_lane_s32(v2s, 0), vget_lane_s32(v2s, 1)); } + + ALWAYS_INLINE u32 maxv_u32() const + { + return std::max(vget_lane_u32(vreinterpret_u32_s32(v2s), 0), vget_lane_u32(vreinterpret_u32_s32(v2s), 1)); + } + +#endif + ALWAYS_INLINE GSVector2i clamp8() const { return pu16().upl8(); } ALWAYS_INLINE GSVector2i blend8(const GSVector2i& a, const GSVector2i& mask) const @@ -249,6 +302,8 @@ public: return GSVector2i(vreinterpret_s32_u16(vqmovn_u32(vcombine_u32(vreinterpret_u32_s32(v2s), vcreate_u32(0))))); } +#ifdef CPU_ARCH_ARM64 + ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const { return GSVector2i(vreinterpret_s32_s8(vzip1_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)))); @@ -272,6 +327,33 @@ public: ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip1_s32(v2s, vdup_n_s32(0))); } +#else + + ALWAYS_INLINE GSVector2i upl8(const GSVector2i& v) const + { + return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vreinterpret_s8_s32(v.v2s)).val[0])); + } + + ALWAYS_INLINE GSVector2i upl16(const GSVector2i& v) const + { + return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vreinterpret_s16_s32(v.v2s)).val[0])); + } + ALWAYS_INLINE GSVector2i upl32(const GSVector2i& v) const { return GSVector2i(vzip_s32(v2s, v.v2s).val[0]); } + + ALWAYS_INLINE GSVector2i upl8() const + { + return GSVector2i(vreinterpret_s32_s8(vzip_s8(vreinterpret_s8_s32(v2s), vdup_n_s8(0)).val[0])); + } + + ALWAYS_INLINE GSVector2i upl16() const + { + return GSVector2i(vreinterpret_s32_s16(vzip_s16(vreinterpret_s16_s32(v2s), vdup_n_s16(0)).val[0])); + } + + ALWAYS_INLINE GSVector2i upl32() const { return GSVector2i(vzip_s32(v2s, vdup_n_s32(0)).val[0]); } + +#endif + ALWAYS_INLINE GSVector2i i8to16() const { return GSVector2i(vreinterpret_s32_s16(vget_low_s8(vmovl_s8(vreinterpret_s8_s32(v2s))))); @@ -465,7 +547,7 @@ public: ALWAYS_INLINE bool eq(const GSVector2i& v) const { - return (vmaxv_u32(vreinterpret_u32_s32(veor_s32(v2s, v.v2s))) == 0); + return (vget_lane_u64(vreinterpret_u64_s32(veor_s32(v2s, v.v2s)), 0) == 0); } ALWAYS_INLINE GSVector2i eq8(const GSVector2i& v) const @@ -483,11 +565,6 @@ public: return GSVector2i(vreinterpret_s32_u32(vceq_s32(v2s, v.v2s))); } - ALWAYS_INLINE GSVector2i eq64(const GSVector2i& v) const - { - return GSVector2i(vreinterpret_s32_u64(vceq_s64(vreinterpret_s64_s32(v2s), vreinterpret_s64_s32(v.v2s)))); - } - ALWAYS_INLINE GSVector2i neq8(const GSVector2i& v) const { return ~eq8(v); } ALWAYS_INLINE GSVector2i neq16(const GSVector2i& v) const { return ~eq16(v); } @@ -553,13 +630,23 @@ public: ALWAYS_INLINE bool alltrue() const { // MSB should be set in all 8-bit lanes. +#ifdef CPU_ARCH_ARM64 return (vminv_u8(vreinterpret_u8_s32(v2s)) & 0x80) == 0x80; +#else + return ((vget_lane_u32(vreinterpret_u32_s32(v2s), 0) & vget_lane_u32(vreinterpret_u32_s32(v2s), 1) & 0x80808080u) == + 0x80808080u); +#endif } ALWAYS_INLINE bool allfalse() const { // MSB should be clear in all 8-bit lanes. +#ifdef CPU_ARCH_ARM64 return (vmaxv_u32(vreinterpret_u8_s32(v2s)) & 0x80) != 0x80; +#else + return ((vget_lane_u32(vreinterpret_u32_s32(v2s), 0) & vget_lane_u32(vreinterpret_u32_s32(v2s), 1) & 0x80808080u) == + 0); +#endif } template @@ -744,10 +831,26 @@ public: return GSVector2(recip); } +#ifdef CPU_ARCH_ARM64 + ALWAYS_INLINE GSVector2 floor() const { return GSVector2(vrndm_f32(v2s)); } ALWAYS_INLINE GSVector2 ceil() const { return GSVector2(vrndp_f32(v2s)); } +#else + + ALWAYS_INLINE GSVector2 floor() const + { + return GSVector2(std::floor(vget_lane_f32(v2s, 0)), std::floor(vget_lane_f32(v2s, 1))); + } + + ALWAYS_INLINE GSVector2 ceil() const + { + return GSVector2(std::ceil(vget_lane_f32(v2s, 0)), std::ceil(vget_lane_f32(v2s, 1))); + } + +#endif + ALWAYS_INLINE GSVector2 sat(const GSVector2& a, const GSVector2& b) const { return max(a).min(b); } ALWAYS_INLINE GSVector2 sat(const float scale = 255) const { return sat(zero(), GSVector2(scale)); } @@ -791,7 +894,11 @@ public: template ALWAYS_INLINE GSVector2 insert32(const GSVector2& v) const { +#ifdef CPU_ARCH_ARM64 return GSVector2(vcopy_lane_f32(v2s, dst, v.v2s, src)); +#else + return GSVector2(vset_lane_f32(vget_lane_f32(v.v2s, src), v2s, dst)); +#endif } template @@ -800,7 +907,15 @@ public: return vget_lane_s32(vreinterpret_s32_f32(v2s), i); } - ALWAYS_INLINE float dot(const GSVector2& v) const { return vaddv_f32(vmul_f32(v2s, v.v2s)); } + ALWAYS_INLINE float dot(const GSVector2& v) const + { +#ifdef CPU_ARCH_ARM64 + return vaddv_f32(vmul_f32(v2s, v.v2s)); +#else + const float32x2_t dp = vmul_f32(v2s, v.v2s); + return vget_lane_f32(vadd_f32(dp, vdup_lane_f32(dp, 1)), 0); +#endif + } ALWAYS_INLINE static GSVector2 zero() { return GSVector2(vdup_n_f32(0.0f)); } @@ -817,7 +932,14 @@ public: ALWAYS_INLINE void operator+=(const GSVector2& v) { v2s = vadd_f32(v2s, v.v2s); } ALWAYS_INLINE void operator-=(const GSVector2& v) { v2s = vsub_f32(v2s, v.v2s); } ALWAYS_INLINE void operator*=(const GSVector2& v) { v2s = vmul_f32(v2s, v.v2s); } - ALWAYS_INLINE void operator/=(const GSVector2& v) { v2s = vdiv_f32(v2s, v.v2s); } + ALWAYS_INLINE void operator/=(const GSVector2& v) + { +#ifdef CPU_ARCH_ARM64 + v2s = vdiv_f32(v2s, v.v2s); +#else + *this = GSVector2(vget_lane_f32(v2s, 0) / vget_lane_f32(v.v2s, 0), vget_lane_f32(v2s, 1) / vget_lane_f32(v.v2s, 1)); +#endif + } ALWAYS_INLINE void operator+=(float f) { *this += GSVector2(f); } ALWAYS_INLINE void operator-=(float f) { *this -= GSVector2(f); } @@ -856,7 +978,12 @@ public: ALWAYS_INLINE friend GSVector2 operator/(const GSVector2& v1, const GSVector2& v2) { +#ifdef CPU_ARCH_ARM64 return GSVector2(vdiv_f32(v1.v2s, v2.v2s)); +#else + return GSVector2(vget_lane_f32(v1.v2s, 0) / vget_lane_f32(v2.v2s, 0), + vget_lane_f32(v1.v2s, 1) / vget_lane_f32(v2.v2s, 1)); +#endif } ALWAYS_INLINE friend GSVector2 operator+(const GSVector2& v, float f) { return v + GSVector2(f); } @@ -1013,8 +1140,8 @@ public: ALWAYS_INLINE explicit GSVector4i(int32x2_t m) : v4s(vcombine_s32(m, vcreate_s32(0))) {} ALWAYS_INLINE constexpr explicit GSVector4i(int32x4_t m) : v4s(m) {} - ALWAYS_INLINE explicit GSVector4i(const GSVector2& v, bool truncate = true); - ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true); + ALWAYS_INLINE explicit GSVector4i(const GSVector2& v); + ALWAYS_INLINE explicit GSVector4i(const GSVector4& v); ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); @@ -1035,7 +1162,14 @@ public: ALWAYS_INLINE s32 rarea() const { return width() * height(); } - ALWAYS_INLINE bool rempty() const { return (vminv_u32(vreinterpret_u32_s32(vget_low_s32(lt32(zwzw())))) == 0); } + ALWAYS_INLINE bool rempty() const + { +#ifdef CPU_ARCH_ARM64 + return (vminv_u32(vreinterpret_u32_s32(vget_low_s32(lt32(zwzw())))) == 0); +#else + return (vget_lane_u64(vreinterpret_u64_u32(vreinterpret_u32_s32(vget_low_s32(lt32(zwzw())))), 0) == 0); +#endif + } ALWAYS_INLINE GSVector4i runion(const GSVector4i& a) const { return min_i32(a).upl64(max_i32(a).srl<8>()); } @@ -1159,13 +1293,32 @@ public: ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const { - int32x4_t acc = +#ifdef CPU_ARCH_ARM64 + const int32x4_t acc = + vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s))); + return GSVector4i(vmlal_high_s16(acc, vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s))); +#else + // borrowed from sse2neon + const int32x4_t low = vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s))); - acc = vmlal_high_s16(acc, vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)); - return GSVector4i(acc); + const int32x4_t high = + vmlal_s16(vdupq_n_s32(0), vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s))); + return GSVector4i(vcombine_s32(vpadd_s32(vget_low_s32(low), vget_high_s32(low)), + vpadd_s32(vget_low_s32(high), vget_high_s32(high)))); +#endif } - ALWAYS_INLINE GSVector4i addp_s32() const { return GSVector4i(vpaddq_s32(v4s, v4s)); } + ALWAYS_INLINE GSVector4i addp_s32() const + { +#ifdef CPU_ARCH_ARM64 + return GSVector4i(vpaddq_s32(v4s, v4s)); +#else + const int32x2_t res = vpadd_s32(vget_low_s32(v4s), vget_high_s32(v4s)); + return GSVector4i(vcombine_s32(res, res)); +#endif + } + +#ifdef CPU_ARCH_ARM64 ALWAYS_INLINE u8 minv_u8() const { return vminvq_u8(vreinterpretq_u8_s32(v4s)); } @@ -1183,6 +1336,70 @@ public: ALWAYS_INLINE u32 maxv_u32() const { return vmaxvq_u32(v4s); } +#else + + ALWAYS_INLINE u8 minv_u8() const + { + uint8x8_t vmin = vmin_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s))); + vmin = vmin_u8(vmin, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmin), 1))); + return static_cast( + std::min(static_cast(vget_lane_u8(vmin, 0)), + std::min(static_cast(vget_lane_u8(vmin, 1)), + std::min(static_cast(vget_lane_u8(vmin, 2)), static_cast(vget_lane_u8(vmin, 3)))))); + } + + ALWAYS_INLINE u16 maxv_u8() const + { + uint8x8_t vmax = vmax_u8(vget_low_u8(vreinterpretq_u8_s32(v4s)), vget_high_u8(vreinterpretq_u8_s32(v4s))); + vmax = vmax_u8(vmax, vreinterpret_u8_s32(vdup_lane_s32(vreinterpret_s32_u8(vmax), 1))); + return static_cast( + std::max(static_cast(vget_lane_u8(vmax, 0)), + std::max(static_cast(vget_lane_u8(vmax, 1)), + std::max(static_cast(vget_lane_u8(vmax, 2)), static_cast(vget_lane_u8(vmax, 3)))))); + } + + ALWAYS_INLINE u16 minv_u16() const + { + uint16x4_t vmin = vmin_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s))); + vmin = vmin_u16(vmin, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmin), 1))); + return static_cast( + std::min(static_cast(vget_lane_u16(vmin, 0)), static_cast(vget_lane_u16(vmin, 1)))); + } + + ALWAYS_INLINE u16 maxv_u16() const + { + uint16x4_t vmax = vmax_u16(vget_low_u16(vreinterpretq_u16_s32(v4s)), vget_high_u16(vreinterpretq_u16_s32(v4s))); + vmax = vmax_u16(vmax, vreinterpret_u16_s32(vdup_lane_s32(vreinterpret_s32_u16(vmax), 1))); + return static_cast( + std::max(static_cast(vget_lane_u16(vmax, 0)), static_cast(vget_lane_u16(vmax, 1)))); + } + + ALWAYS_INLINE s32 minv_s32() const + { + int32x2_t vmin = vmin_s32(vget_low_s32(v4s), vget_high_s32(v4s)); + return std::min(vget_lane_s32(vmin, 0), vget_lane_s32(vmin, 1)); + } + + ALWAYS_INLINE u32 minv_u32() const + { + uint32x2_t vmin = vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s))); + return std::min(vget_lane_u32(vreinterpret_u32_s32(vmin), 0), vget_lane_u32(vreinterpret_u32_s32(vmin), 1)); + } + + ALWAYS_INLINE s32 maxv_s32() const + { + int32x2_t vmax = vmax_s32(vget_low_s32(v4s), vget_high_s32(v4s)); + return std::max(vget_lane_s32(vmax, 0), vget_lane_s32(vmax, 1)); + } + + ALWAYS_INLINE u32 maxv_u32() const + { + uint32x2_t vmax = vmax_u32(vget_low_u32(vreinterpretq_u32_s32(v4s)), vget_high_u32(vreinterpretq_u32_s32(v4s))); + return std::max(vget_lane_u32(vreinterpret_u32_s32(vmax), 0), vget_lane_u32(vreinterpret_u32_s32(vmax), 1)); + } + +#endif + ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); } ALWAYS_INLINE GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const @@ -1224,7 +1441,13 @@ public: ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const { +#ifdef CPU_ARCH_ARM64 return GSVector4i(vreinterpretq_s32_s8(vqtbl1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_u8_s32(mask.v4s)))); +#else + int8x8x2_t split = {vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v4s))}; + return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(vtbl2_s8(split, vget_low_s8(vreinterpretq_s8_s32(mask.v4s))), + vtbl2_s8(split, vget_high_s8(vreinterpretq_s8_s32(mask.v4s)))))); +#endif } ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const @@ -1271,6 +1494,8 @@ public: return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v4s)))); } +#ifdef CPU_ARCH_ARM64 + ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const { return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); @@ -1341,6 +1566,81 @@ public: return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0)))); } +#else + + ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const + { + const int8x8x2_t res = vzip_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)), vget_low_s8(vreinterpretq_s8_s32(v.v4s))); + return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1]))); + } + + ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const + { + const int8x8x2_t res = vzip_s8(vget_high_s8(vreinterpretq_s8_s32(v4s)), vget_high_s8(vreinterpretq_s8_s32(v.v4s))); + return GSVector4i(vreinterpretq_s32_s8(vcombine_s8(res.val[0], res.val[1]))); + } + + ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const + { + const int16x4x2_t res = + vzip_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s))); + return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1]))); + } + + ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const + { + const int16x4x2_t res = + vzip_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s))); + return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(res.val[0], res.val[1]))); + } + + ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const + { + const int32x2x2_t res = vzip_s32(vget_low_s32(v4s), vget_low_s32(v.v4s)); + return GSVector4i(vcombine_s32(res.val[0], res.val[1])); + } + + ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const + { + const int32x2x2_t res = vzip_s32(vget_high_s32(v4s), vget_high_s32(v.v4s)); + return GSVector4i(vcombine_s32(res.val[0], res.val[1])); + } + + ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s64( + vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s))))); + } + + ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s64( + vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s))))); + } + + ALWAYS_INLINE GSVector4i upl8() const { return upl8(GSVector4i(vdupq_n_s32(0))); } + + ALWAYS_INLINE GSVector4i uph8() const { return uph8(GSVector4i(vdupq_n_s32(0))); } + + ALWAYS_INLINE GSVector4i upl16() const { return upl16(GSVector4i(vdupq_n_s32(0))); } + + ALWAYS_INLINE GSVector4i uph16() const { return uph16(GSVector4i(vdupq_n_s32(0))); } + + ALWAYS_INLINE GSVector4i upl32() const { return upl32(GSVector4i(vdupq_n_s32(0))); } + + ALWAYS_INLINE GSVector4i uph32() const { return uph32(GSVector4i(vdupq_n_s32(0))); } + + ALWAYS_INLINE GSVector4i upl64() const + { + return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0)))); + } + + ALWAYS_INLINE GSVector4i uph64() const + { + return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0)))); + } +#endif + ALWAYS_INLINE GSVector4i i8to16() const { return GSVector4i(vreinterpretq_s32_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s))))); @@ -1537,11 +1837,14 @@ public: return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s16(-i)))); } +#ifdef CPU_ARCH_ARM64 + // not on arm32, hopefully we can do without ALWAYS_INLINE GSVector4i srav64(const GSVector4i& v) const { return GSVector4i( vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s))))); } +#endif template ALWAYS_INLINE GSVector4i srl64() const @@ -1554,11 +1857,13 @@ public: return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vdupq_n_u16(-i)))); } +#ifdef CPU_ARCH_ARM64 ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const { return GSVector4i( vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s))))); } +#endif ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const { @@ -1588,7 +1893,14 @@ public: // return GSVector4i(vreinterpretq_s32_s16(vpaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); const int16x8_t a = vreinterpretq_s16_s32(v4s); const int16x8_t b = vreinterpretq_s16_s32(v.v4s); +#ifdef CPU_ARCH_ARM64 return GSVector4i(vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + // sse2neon again + int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); + int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); + return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(ab0246, ab1357))); +#endif } ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const @@ -1719,7 +2031,13 @@ public: ALWAYS_INLINE bool eq(const GSVector4i& v) const { - return (vmaxvq_u32(vreinterpretq_u32_s32(veorq_s32(v4s, v.v4s))) == 0); + const int32x4_t res = veorq_s32(v4s, v.v4s); +#ifdef CPU_ARCH_ARM64 + return (vmaxvq_u32(vreinterpretq_u32_s32(res)) == 0); +#else + const int32x2_t paired = vorr_s32(vget_low_s32(res), vget_high_s32(res)); + return (vget_lane_u64(vreinterpret_u64_s32(paired), 0) == 0); +#endif } ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const @@ -1737,10 +2055,12 @@ public: return GSVector4i(vreinterpretq_s32_u32(vceqq_s32(v4s, v.v4s))); } +#ifdef CPU_ARCH_ARM64 ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const { return GSVector4i(vreinterpretq_s32_u64(vceqq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s)))); } +#endif ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); } @@ -1807,13 +2127,23 @@ public: ALWAYS_INLINE bool alltrue() const { // MSB should be set in all 8-bit lanes. +#ifdef CPU_ARCH_ARM64 return (vminvq_u8(vreinterpretq_u8_s32(v4s)) & 0x80) == 0x80; +#else + const uint32x2_t res = vreinterpret_u32_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s))); + return ((vget_lane_u32(res, 0) & vget_lane_u32(res, 1) & 0x80808080u) == 0x80808080u); +#endif } ALWAYS_INLINE bool allfalse() const { // MSB should be clear in all 8-bit lanes. +#ifdef CPU_ARCH_ARM64 return (vmaxvq_u32(vreinterpretq_u8_s32(v4s)) & 0x80) != 0x80; +#else + const uint32x2_t res = vreinterpret_u32_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s))); + return ((vget_lane_u32(res, 0) & vget_lane_u32(res, 1) & 0x80808080u) == 0); +#endif } template @@ -2099,10 +2429,7 @@ public: v4s = vld1q_f32(arr); } - ALWAYS_INLINE GSVector4(float x, float y) - { - v4s = vzip1q_f32(vsetq_lane_f32(x, vdupq_n_f32(0.0f), 0), vsetq_lane_f32(y, vdupq_n_f32(0.0f), 0)); - } + ALWAYS_INLINE GSVector4(float x, float y) { v4s = vsetq_lane_f32(x, vsetq_lane_f32(y, vdupq_n_f32(0.0f), 1), 0); } ALWAYS_INLINE GSVector4(int x, int y, int z, int w) { @@ -2112,7 +2439,7 @@ public: ALWAYS_INLINE GSVector4(int x, int y) { - v4s = vcvtq_f32_s32(vzip1q_s32(vsetq_lane_s32(x, vdupq_n_s32(0), 0), vsetq_lane_s32(y, vdupq_n_s32(0), 0))); + v4s = vcvtq_f32_s32(vsetq_lane_s32(x, vsetq_lane_s32(y, vdupq_n_s32(0), 0), 0)); } ALWAYS_INLINE explicit GSVector4(const GSVector2& v) { v4s = vcombine_f32(v.v2s, vcreate_f32(0)); } @@ -2129,10 +2456,12 @@ public: ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v); +#ifdef CPU_ARCH_ARM64 ALWAYS_INLINE static GSVector4 f64(double x, double y) { return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1))); } +#endif ALWAYS_INLINE void operator=(float f) { v4s = vdupq_n_f32(f); } @@ -2140,19 +2469,6 @@ public: ALWAYS_INLINE operator float32x4_t() const { return v4s; } - /// Makes Clang think that the whole vector is needed, preventing it from changing shuffles around because it thinks - /// we don't need the whole vector Useful for e.g. preventing clang from optimizing shuffles that remove - /// possibly-denormal garbage data from vectors before computing with them - ALWAYS_INLINE GSVector4 noopt() - { - // Note: Clang is currently the only compiler that attempts to optimize vector intrinsics, if that changes in the - // future the implementation should be updated -#ifdef __clang__ - // __asm__("":"+x"(m)::); -#endif - return *this; - } - ALWAYS_INLINE u32 rgba32() const { return GSVector4i(*this).rgba32(); } ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); } @@ -2172,10 +2488,28 @@ public: return GSVector4(recip); } +#ifdef _M_ARM64 + ALWAYS_INLINE GSVector4 floor() const { return GSVector4(vrndmq_f32(v4s)); } ALWAYS_INLINE GSVector4 ceil() const { return GSVector4(vrndpq_f32(v4s)); } +#else + + ALWAYS_INLINE GSVector4 floor() const + { + return GSVector4(std::floor(vgetq_lane_f32(v4s, 0)), std::floor(vgetq_lane_f32(v4s, 1)), + std::floor(vgetq_lane_f32(v4s, 2)), std::floor(vgetq_lane_f32(v4s, 3))); + } + + ALWAYS_INLINE GSVector4 ceil() const + { + return GSVector4(std::ceil(vgetq_lane_f32(v4s, 0)), std::ceil(vgetq_lane_f32(v4s, 1)), + std::ceil(vgetq_lane_f32(v4s, 2)), std::ceil(vgetq_lane_f32(v4s, 3))); + } + +#endif + ALWAYS_INLINE GSVector4 madd(const GSVector4& a, const GSVector4& b) const { return GSVector4(vfmaq_f32(b.v4s, v4s, a.v4s)); @@ -2197,6 +2531,8 @@ public: return a.nmadd(b, *this); // *this - a * b } +#ifdef CPU_ARCH_ARM64 + ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(vpaddq_f32(v4s, v4s)); } ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(vpaddq_f32(v4s, v.v4s)); } @@ -2208,12 +2544,46 @@ public: return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v.v4s), vuzp2q_f32(v4s, v.v4s))); } +#else + + ALWAYS_INLINE GSVector4 hadd() const + { + const float32x2_t res = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s)); + return GSVector4(vcombine_f32(res, res)); + } + + ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const + { + const float32x2_t res1 = vpadd_f32(vget_low_f32(v4s), vget_high_f32(v4s)); + const float32x2_t res2 = vpadd_f32(vget_low_f32(v.v4s), vget_high_f32(v.v4s)); + return GSVector4(vcombine_f32(res1, res2)); + } + + ALWAYS_INLINE GSVector4 hsub() const + { + const float32x4x2_t res = vuzpq_f32(v4s, v4s); + return GSVector4(vsubq_f32(res.val[0], res.val[0])); + } + + ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const + { + const float32x4x2_t res = vuzpq_f32(v4s, v.v4s); + return GSVector4(vsubq_f32(res.val[0], res.val[1])); + } + +#endif + ALWAYS_INLINE GSVector4 sat(const GSVector4& a, const GSVector4& b) const { return max(a).min(b); } ALWAYS_INLINE GSVector4 sat(const GSVector4& a) const { +#ifdef CPU_ARCH_ARM64 const GSVector4 minv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 0))); const GSVector4 maxv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 1))); +#else + const GSVector4 minv(a.xyxy()); + const GSVector4 maxv(a.zwzw()); +#endif return sat(minv, maxv); } @@ -2239,6 +2609,8 @@ public: return GSVector4(vbslq_f32(bitmask, a.v4s, v4s)); } +#ifdef CPU_ARCH_ARM64 + ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const { return GSVector4(vzip1q_f32(v4s, a.v4s)); } ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const { return GSVector4(vzip2q_f32(v4s, a.v4s)); } @@ -2253,6 +2625,34 @@ public: return GSVector4(vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s)))); } +#else + + ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const + { + const float32x2x2_t res = vzip_f32(vget_low_f32(v4s), vget_low_f32(a.v4s)); + return GSVector4(vcombine_f32(res.val[0], res.val[1])); + } + + ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const + { + const float32x2x2_t res = vzip_f32(vget_high_f32(v4s), vget_high_f32(a.v4s)); + return GSVector4(vcombine_f32(res.val[0], res.val[1])); + } + + ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const + { + return GSVector4(vreinterpretq_f32_s64( + vcombine_s64(vget_low_s64(vreinterpretq_s64_f32(v4s)), vget_low_s64(vreinterpretq_s64_f32(a.v4s))))); + } + + ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const + { + return GSVector4(vreinterpretq_f32_s64( + vcombine_s64(vget_high_s64(vreinterpretq_s64_f32(v4s)), vget_high_s64(vreinterpretq_s64_f32(a.v4s))))); + } + +#endif + ALWAYS_INLINE GSVector4 l2h(const GSVector4& a) const { return GSVector4(vcombine_f32(vget_low_f32(v4s), vget_low_f32(a.v4s))); @@ -2270,8 +2670,15 @@ public: ALWAYS_INLINE int mask() const { +#ifdef CPU_ARCH_ARM64 static constexpr const int32_t shifts[] = {0, 1, 2, 3}; return static_cast(vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31), vld1q_s32(shifts)))); +#else + // sse2neon again + uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31)); + uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); + return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); +#endif } ALWAYS_INLINE bool alltrue() const @@ -2290,7 +2697,11 @@ public: template ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const { +#ifdef CPU_ARCH_ARM64 return GSVector4(vcopyq_laneq_f32(v4s, dst, v.v4s, src)); +#else + return GSVector4(vsetq_lane_f32(vgetq_lane_f32(v.v4s, src), v4s, dst)); +#endif } template @@ -2320,12 +2731,20 @@ public: ALWAYS_INLINE static void storel(void* p, const GSVector4& v) { +#ifdef CPU_ARCH_ARM64 vst1_f64((double*)p, vget_low_f64(vreinterpretq_f64_f32(v.v4s))); +#else + vst1_s64((s64*)p, vget_low_s64(vreinterpretq_s64_f32(v.v4s))); +#endif } ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) { +#ifdef CPU_ARCH_ARM64 vst1_f64((double*)p, vget_high_f64(vreinterpretq_f64_f32(v.v4s))); +#else + vst1_s64((s64*)p, vget_high_s64(vreinterpretq_s64_f32(v.v4s))); +#endif } template @@ -2341,12 +2760,29 @@ public: ALWAYS_INLINE void operator+=(const GSVector4& v) { v4s = vaddq_f32(v4s, v.v4s); } ALWAYS_INLINE void operator-=(const GSVector4& v) { v4s = vsubq_f32(v4s, v.v4s); } ALWAYS_INLINE void operator*=(const GSVector4& v) { v4s = vmulq_f32(v4s, v.v4s); } - ALWAYS_INLINE void operator/=(const GSVector4& v) { v4s = vdivq_f32(v4s, v.v4s); } + ALWAYS_INLINE void operator/=(const GSVector4& v) + { +#ifdef CPU_ARCH_ARM64 + v4s = vdivq_f32(v4s, v.v4s); +#else + *this = + GSVector4(vgetq_lane_f32(v4s, 0) / vgetq_lane_f32(v.v4s, 0), vgetq_lane_f32(v4s, 1) / vgetq_lane_f32(v.v4s, 1), + vgetq_lane_f32(v4s, 2) / vgetq_lane_f32(v.v4s, 2), vgetq_lane_f32(v4s, 3) / vgetq_lane_f32(v.v4s, 3)); +#endif + } ALWAYS_INLINE void operator+=(float f) { *this += GSVector4(f); } ALWAYS_INLINE void operator-=(float f) { *this -= GSVector4(f); } ALWAYS_INLINE void operator*=(float f) { *this *= GSVector4(f); } - ALWAYS_INLINE void operator/=(float f) { *this /= GSVector4(f); } + ALWAYS_INLINE void operator/=(float f) + { +#ifdef CPU_ARCH_ARM64 + *this /= GSVector4(f); +#else + *this = GSVector4(vgetq_lane_f32(v4s, 0) / f, vgetq_lane_f32(v4s, 1) / f, vgetq_lane_f32(v4s, 2) / f, + vgetq_lane_f32(v4s, 3) / f); +#endif + } ALWAYS_INLINE void operator&=(const GSVector4& v) { @@ -2380,13 +2816,27 @@ public: ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2) { +#ifdef CPU_ARCH_ARM64 return GSVector4(vdivq_f32(v1.v4s, v2.v4s)); +#else + return GSVector4( + vgetq_lane_f32(v1.v4s, 0) / vgetq_lane_f32(v2.v4s, 0), vgetq_lane_f32(v1.v4s, 1) / vgetq_lane_f32(v2.v4s, 1), + vgetq_lane_f32(v1.v4s, 2) / vgetq_lane_f32(v2.v4s, 2), vgetq_lane_f32(v1.v4s, 3) / vgetq_lane_f32(v2.v4s, 3)); +#endif } ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); } ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); } ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); } - ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f) { return v / GSVector4(f); } + ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f) + { +#ifdef CPU_ARCH_ARM64 + return v / GSVector4(f); +#else + return GSVector4(vgetq_lane_f32(v.v4s, 0) / f, vgetq_lane_f32(v.v4s, 1) / f, vgetq_lane_f32(v.v4s, 2) / f, + vgetq_lane_f32(v.v4s, 3) / f); +#endif + } ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2) { @@ -2434,6 +2884,9 @@ public: return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s))); } +#ifdef CPU_ARCH_ARM64 + // Not in ARM32 + ALWAYS_INLINE GSVector4 mul64(const GSVector4& v) const { return GSVector4(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))); @@ -2459,14 +2912,15 @@ public: return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast(p))))); } - ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const + ALWAYS_INLINE GSVector4i f64toi32() const { - const float64x2_t r = truncate ? v4s : vrndiq_f64(vreinterpretq_f64_f32(v4s)); - const s32 low = static_cast(vgetq_lane_f64(r, 0)); - const s32 high = static_cast(vgetq_lane_f64(r, 1)); + const s32 low = static_cast(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 0)); + const s32 high = static_cast(vgetq_lane_f64(vreinterpretq_f64_f32(v4s), 1)); return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1)); } +#endif + // clang-format off #define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ @@ -2498,21 +2952,39 @@ public: // clang-format on - ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(vdupq_laneq_f32(v4s, 0)); } + ALWAYS_INLINE GSVector4 broadcast32() const + { +#ifdef CPU_ARCH_ARM64 + return GSVector4(vdupq_laneq_f32(v4s, 0)); +#else + return xxxx(); +#endif + } - ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v) { return GSVector4(vdupq_laneq_f32(v.v4s, 0)); } + ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v) + { +#ifdef CPU_ARCH_ARM64 + return GSVector4(vdupq_laneq_f32(v.v4s, 0)); +#else + return v.xxxx(); +#endif + } ALWAYS_INLINE static GSVector4 broadcast32(const void* f) { return GSVector4(vld1q_dup_f32((const float*)f)); } ALWAYS_INLINE static GSVector4 broadcast64(const void* f) { - return GSVector4(vreinterpretq_f64_f32(vld1q_dup_f64((const double*)f))); +#ifdef CPU_ARCH_ARM64 + return GSVector4(vreinterpretq_f32_f64(vld1q_dup_f64((const double*)f))); +#else + return GSVector4(vreinterpretq_f32_s64(vld1q_dup_s64((const s64*)f))); +#endif } }; -ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v, bool truncate) +ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v) { - v2s = truncate ? vcvt_s32_f32(v.v2s) : vcvtn_u32_f32(v.v2s); + v2s = vcvt_s32_f32(v.v2s); } ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v) @@ -2530,9 +3002,9 @@ ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v) return GSVector2(vreinterpret_f32_s32(v.v2s)); } -ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate) +ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v) { - v4s = truncate ? vcvtq_s32_f32(v.v4s) : vcvtnq_u32_f32(v.v4s); + v4s = vcvtq_s32_f32(v.v4s); } ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v) diff --git a/src/common/gsvector_nosimd.h b/src/common/gsvector_nosimd.h index 02d200b51..b82e77485 100644 --- a/src/common/gsvector_nosimd.h +++ b/src/common/gsvector_nosimd.h @@ -121,7 +121,7 @@ public: // so leave the non-constexpr version default ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; } - ALWAYS_INLINE explicit GSVector2i(const GSVector2& v, bool truncate = true); + ALWAYS_INLINE explicit GSVector2i(const GSVector2& v); ALWAYS_INLINE static GSVector2i cast(const GSVector2& v); @@ -955,7 +955,7 @@ public: // so leave the non-constexpr version default ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; } - ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true); + ALWAYS_INLINE explicit GSVector4i(const GSVector4& v); ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); @@ -1879,8 +1879,6 @@ public: ALWAYS_INLINE void operator=(float f) { x = y = z = w = f; } - ALWAYS_INLINE GSVector4 noopt() { return *this; } - u32 rgba32() const { return GSVector4i(*this).rgba32(); } ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); } @@ -2316,7 +2314,7 @@ public: return ret; } - ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const + ALWAYS_INLINE GSVector4i f64toi32() const { return GSVector4i(static_cast(F64[0]), static_cast(F64[1]), 0, 0); } @@ -2372,9 +2370,8 @@ public: } }; -ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v, bool truncate) +ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v) { - // TODO: Truncation vs rounding... x = static_cast(v.x); y = static_cast(v.y); } @@ -2399,9 +2396,8 @@ ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v) return ret; } -ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate) +ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v) { - // TODO: Truncation vs rounding... x = static_cast(v.x); y = static_cast(v.y); z = static_cast(v.z); diff --git a/src/common/gsvector_sse.h b/src/common/gsvector_sse.h index a876b2acf..8d9501bb7 100644 --- a/src/common/gsvector_sse.h +++ b/src/common/gsvector_sse.h @@ -88,7 +88,7 @@ public: // so leave the non-constexpr version default ALWAYS_INLINE explicit GSVector2i(s32 i) { *this = i; } - ALWAYS_INLINE explicit GSVector2i(const GSVector2& v, bool truncate = true); + ALWAYS_INLINE explicit GSVector2i(const GSVector2& v); ALWAYS_INLINE static GSVector2i cast(const GSVector2& v); @@ -840,9 +840,9 @@ public: // so leave the non-constexpr version default ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; } - ALWAYS_INLINE explicit GSVector4i(const GSVector2& v, bool truncate = true); + ALWAYS_INLINE explicit GSVector4i(const GSVector2& v); - ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true); + ALWAYS_INLINE explicit GSVector4i(const GSVector4& v); ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); @@ -1952,9 +1952,9 @@ public: return GSVector4(_mm_cvtps_pd(_mm_castpd_ps(_mm_load_sd(static_cast(p))))); } - ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const + ALWAYS_INLINE GSVector4i f64toi32() const { - return GSVector4i(truncate ? _mm_cvttpd_epi32(_mm_castps_pd(m)) : _mm_cvtpd_epi32(_mm_castps_pd(m))); + return GSVector4i(_mm_cvttpd_epi32(_mm_castps_pd(m))); } // clang-format off @@ -2007,9 +2007,9 @@ public: } }; -ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v, bool truncate) +ALWAYS_INLINE GSVector2i::GSVector2i(const GSVector2& v) { - m = truncate ? _mm_cvttps_epi32(v) : _mm_cvtps_epi32(v); + m = _mm_cvttps_epi32(v); } ALWAYS_INLINE GSVector2::GSVector2(const GSVector2i& v) @@ -2027,9 +2027,9 @@ ALWAYS_INLINE GSVector2 GSVector2::cast(const GSVector2i& v) return GSVector2(_mm_castsi128_ps(v.m)); } -ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate) +ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v) { - m = truncate ? _mm_cvttps_epi32(v) : _mm_cvtps_epi32(v); + m = _mm_cvttps_epi32(v); } ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v) diff --git a/src/common/intrin.h b/src/common/intrin.h index 1b0587e7e..225cd4d97 100644 --- a/src/common/intrin.h +++ b/src/common/intrin.h @@ -66,8 +66,10 @@ ALWAYS_INLINE_RELEASE static void MemsetPtrs(T* ptr, T value, u32 count) #if defined(CPU_ARCH_SSE) const __m128i svalue = _mm_set1_epi64x(reinterpret_cast(value)); -#elif defined(CPU_ARCH_NEON) +#elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM64) const uint64x2_t svalue = vdupq_n_u64(reinterpret_cast(value)); +#elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM32) + const uint32x4_t svalue = vdupq_n_u32(reinterpret_cast(value)); #endif // Clang gets way too eager and tries to unroll these, emitting thousands of instructions. @@ -78,8 +80,10 @@ ALWAYS_INLINE_RELEASE static void MemsetPtrs(T* ptr, T value, u32 count) { #if defined(CPU_ARCH_SSE) _mm_store_si128(reinterpret_cast<__m128i*>(dest), svalue); -#elif defined(CPU_ARCH_NEON) +#elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM64) vst1q_u64(reinterpret_cast(dest), svalue); +#elif defined(CPU_ARCH_NEON) && defined(CPU_ARCH_ARM32) + vst1q_u32(reinterpret_cast(dest), svalue); #endif dest += PTRS_PER_VECTOR; } diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp index 68ab18330..47b92e544 100644 --- a/src/core/cpu_code_cache.cpp +++ b/src/core/cpu_code_cache.cpp @@ -14,6 +14,7 @@ #include "util/page_fault_handler.h" +#include "common/align.h" #include "common/assert.h" #include "common/error.h" #include "common/intrin.h" @@ -456,15 +457,15 @@ CPU::CodeCache::Block* CPU::CodeCache::CreateBlock(u32 pc, const BlockInstructio s_blocks.erase(it); block->~Block(); - std::free(block); + Common::AlignedFree(block); block = nullptr; } } if (!block) { - block = - static_cast(std::malloc(sizeof(Block) + (sizeof(Instruction) * size) + (sizeof(InstructionInfo) * size))); + block = static_cast(Common::AlignedMalloc( + sizeof(Block) + (sizeof(Instruction) * size) + (sizeof(InstructionInfo) * size), alignof(Block))); Assert(block); new (block) Block(); s_blocks.push_back(block); @@ -734,7 +735,7 @@ void CPU::CodeCache::ClearBlocks() for (Block* block : s_blocks) { block->~Block(); - std::free(block); + Common::AlignedFree(block); } s_blocks.clear(); diff --git a/src/core/cpu_recompiler_code_generator_aarch32.cpp b/src/core/cpu_recompiler_code_generator_aarch32.cpp index f95140713..c9a53236c 100644 --- a/src/core/cpu_recompiler_code_generator_aarch32.cpp +++ b/src/core/cpu_recompiler_code_generator_aarch32.cpp @@ -148,7 +148,7 @@ void CPU::Recompiler::armEmitFarLoad(vixl::aarch32::Assembler* armAsm, const vix } void CPU::Recompiler::armEmitFarStore(vixl::aarch32::Assembler* armAsm, const vixl::aarch32::Register& reg, - const void* addr, const vixl::aarch64::Register& tempreg) + const void* addr, const vixl::aarch32::Register& tempreg) { armMoveAddressToReg(armAsm, tempreg, addr); armAsm->str(reg, vixl::aarch32::MemOperand(tempreg)); @@ -1931,12 +1931,12 @@ void CodeGenerator::EmitICacheCheckAndUpdate() { if (m_block->HasFlag(CodeCache::BlockFlags::NeedsDynamicFetchTicks)) { - armEmitFarLoad(m_emit, RARG2, GetFetchMemoryAccessTimePtr()); - m_emit->ldr(RARG1, a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks))); - m_emit->Mov(RARG3, m_block->size); - m_emit->mul(RARG2, RARG2, RARG3); - m_emit->add(RARG1, RARG1, RARG2); - m_emit->str(RARG1, a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks))); + armEmitFarLoad(m_emit, GetHostReg32(RARG2), GetFetchMemoryAccessTimePtr()); + m_emit->ldr(GetHostReg32(RARG1), a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks))); + m_emit->Mov(GetHostReg32(RARG3), m_block->size); + m_emit->mul(GetHostReg32(RARG2), GetHostReg32(RARG2), GetHostReg32(RARG3)); + m_emit->add(GetHostReg32(RARG1), GetHostReg32(RARG1), GetHostReg32(RARG2)); + m_emit->str(GetHostReg32(RARG1), a32::MemOperand(GetCPUPtrReg(), OFFSETOF(State, pending_ticks))); } else {