diff --git a/CMakeModules/DuckStationUtils.cmake b/CMakeModules/DuckStationUtils.cmake index b2dd91c6a..12d7a52ed 100644 --- a/CMakeModules/DuckStationUtils.cmake +++ b/CMakeModules/DuckStationUtils.cmake @@ -57,6 +57,8 @@ function(detect_architecture) if("x86_64" IN_LIST CMAKE_OSX_ARCHITECTURES) message(STATUS "Building x86_64 MacOS binaries.") set(CPU_ARCH_X64 TRUE PARENT_SCOPE) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Xarch_x86_64 -msse4.1" PARENT_SCOPE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xarch_x86_64 -msse4.1" PARENT_SCOPE) endif() if("arm64" IN_LIST CMAKE_OSX_ARCHITECTURES) message(STATUS "Building ARM64 MacOS binaries.") @@ -67,6 +69,8 @@ function(detect_architecture) CMAKE_SIZEOF_VOID_P EQUAL 8) message(STATUS "Building x86_64 binaries.") set(CPU_ARCH_X64 TRUE PARENT_SCOPE) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1" PARENT_SCOPE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.1" PARENT_SCOPE) elseif(("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64" OR "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm64") AND CMAKE_SIZEOF_VOID_P EQUAL 8) # Might have an A64 kernel, e.g. Raspbian. message(STATUS "Building ARM64 binaries.") diff --git a/dep/msvc/vsprops/Base.props b/dep/msvc/vsprops/Base.props index 9d485d4e4..81efe34c4 100644 --- a/dep/msvc/vsprops/Base.props +++ b/dep/msvc/vsprops/Base.props @@ -30,6 +30,7 @@ _HAS_EXCEPTIONS=0;_CRT_INTERNAL_NONSTDC_NAMES;_CRT_NONSTDC_NO_DEPRECATE;_CRT_SECURE_NO_WARNINGS;WIN32;%(PreprocessorDefinitions) %(AdditionalIncludeDirectories);$(DepsIncludeDir) /Zc:__cplusplus /Zo /utf-8 %(AdditionalOptions) + -msse4.1 %(AdditionalOptions) -flto=thin %(AdditionalOptions) false diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index bc532cf6c..c0c8566d5 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -21,6 +21,11 @@ add_library(common fifo_queue.h file_system.cpp file_system.h + gsvector.h + gsvector_formatter.h + gsvector_neon.h + gsvector_nosimd.h + gsvector_sse.h intrin.h hash_combine.h heap_array.h diff --git a/src/common/common.vcxproj b/src/common/common.vcxproj index 9d0feb068..aafeebe45 100644 --- a/src/common/common.vcxproj +++ b/src/common/common.vcxproj @@ -16,6 +16,11 @@ + + + + + @@ -70,6 +75,7 @@ + diff --git a/src/common/common.vcxproj.filters b/src/common/common.vcxproj.filters index a36015e9f..2e280f9de 100644 --- a/src/common/common.vcxproj.filters +++ b/src/common/common.vcxproj.filters @@ -46,6 +46,11 @@ + + + + + @@ -80,6 +85,7 @@ thirdparty + diff --git a/src/common/gsvector.h b/src/common/gsvector.h new file mode 100644 index 000000000..8dda3b369 --- /dev/null +++ b/src/common/gsvector.h @@ -0,0 +1,65 @@ +// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#pragma once + +#include "common/intrin.h" + +#include <cstring> + +enum Align_Mode +{ + Align_Outside, + Align_Inside, + Align_NegInf, + Align_PosInf +}; + +enum Round_Mode +{ + Round_NearestInt = 8, + Round_NegInf = 9, + Round_PosInf = 10, + Round_Truncate = 11 +}; + +template<class T> +class GSVector2T +{ +public: + union + { + struct + { + T x, y; + }; + struct + { + T r, g; + }; + struct + { + T v[2]; + }; + }; + + GSVector2T() = default; + + ALWAYS_INLINE constexpr GSVector2T(T x) : x(x), y(x) {} + ALWAYS_INLINE constexpr GSVector2T(T x, T y) : x(x), y(y) {} + ALWAYS_INLINE constexpr bool operator==(const GSVector2T& v) const { return std::memcmp(this, &v, sizeof(*this)) == 0; } + ALWAYS_INLINE constexpr bool operator!=(const GSVector2T& v) const { return std::memcmp(this, &v, sizeof(*this))
!= 0; } + ALWAYS_INLINE constexpr GSVector2T operator*(const GSVector2T& v) const { return {x * v.x, y * v.y}; } + ALWAYS_INLINE constexpr GSVector2T operator/(const GSVector2T& v) const { return {x / v.x, y / v.y}; } +}; + +using GSVector2 = GSVector2T; +using GSVector2i = GSVector2T; + +#if defined(CPU_ARCH_SSE) +#include "common/gsvector_sse.h" +#elif defined(CPU_ARCH_NEON) +#include "common/gsvector_neon.h" +#else +#include "common/gsvector_nosimd.h" +#endif diff --git a/src/common/gsvector.natvis b/src/common/gsvector.natvis new file mode 100644 index 000000000..077d748e0 --- /dev/null +++ b/src/common/gsvector.natvis @@ -0,0 +1,10 @@ + + + + {{ {x}, {y} }} + + + + {{ {I32[0]}, {I32[1]}, {I32[2]}, {I32[3]} }} + + \ No newline at end of file diff --git a/src/common/gsvector_formatter.h b/src/common/gsvector_formatter.h new file mode 100644 index 000000000..5ed7c1665 --- /dev/null +++ b/src/common/gsvector_formatter.h @@ -0,0 +1,21 @@ +// SPDX-FileCopyrightText: 2024 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#pragma once + +#include "gsvector.h" +#include "small_string.h" + +#include "fmt/format.h" + +template<> +struct fmt::formatter : formatter +{ + auto format(const GSVector4i& rc, format_context& ctx) const + { + const TinyString str = + TinyString::from_format("{},{} => {},{} ({}x{})", rc.left, rc.top, rc.right, rc.bottom, rc.width(), rc.height()); + + return fmt::formatter::format(str.view(), ctx); + } +}; diff --git a/src/common/gsvector_neon.h b/src/common/gsvector_neon.h new file mode 100644 index 000000000..1362b22aa --- /dev/null +++ b/src/common/gsvector_neon.h @@ -0,0 +1,1712 @@ +// SPDX-FileCopyrightText: 2021-2024 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#define GSVECTOR_HAS_UNSIGNED 1 +#define GSVECTOR_HAS_SRLV 1 + +class GSVector4; + +class alignas(16) GSVector4i +{ + struct cxpr_init_tag + { + }; + static constexpr cxpr_init_tag cxpr_init{}; + + constexpr GSVector4i(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {} + + constexpr GSVector4i(cxpr_init_tag, short s0, short s1, short s2, short s3, short s4, short s5, short s6, short s7) + : I16{s0, s1, s2, s3, s4, s5, s6, s7} + { + } + + constexpr GSVector4i(cxpr_init_tag, char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, + char b9, char b10, char b11, char b12, char b13, char b14, char b15) +#if !defined(__APPLE__) && !defined(_MSC_VER) + : U8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} +#else + : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} +#endif + { + } + +public: + union + { + struct + { + int x, y, z, w; + }; + struct + { + int r, g, b, a; + }; + struct + { + int left, top, right, bottom; + }; + float F32[4]; + s8 I8[16]; + s16 I16[8]; + s32 I32[4]; + s64 I64[2]; + u8 U8[16]; + u16 U16[8]; + u32 U32[4]; + u64 U64[2]; + int32x4_t v4s; + }; + + GSVector4i() = default; + + ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w) + { + return GSVector4i(cxpr_init, x, y, z, w); + } + + ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); } + + ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); } + + ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) + { + return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7); + } + + ALWAYS_INLINE constexpr static GSVector4i 
cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, + s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) + { + return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15); + } + + ALWAYS_INLINE GSVector4i(int x, int y, int z, int w) + { + GSVector4i xz = load(x).upl32(load(z)); + GSVector4i yw = load(y).upl32(load(w)); + + *this = xz.upl32(yw); + } + + ALWAYS_INLINE GSVector4i(int x, int y) { *this = load(x).upl32(load(y)); } + + ALWAYS_INLINE GSVector4i(short s0, short s1, short s2, short s3, short s4, short s5, short s6, short s7) + : I16{s0, s1, s2, s3, s4, s5, s6, s7} + { + } + + constexpr GSVector4i(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, + char b10, char b11, char b12, char b13, char b14, char b15) +#if !defined(__APPLE__) && !defined(_MSC_VER) + : U8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} +#else + : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} +#endif + { + } + + ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) { v4s = vcombine_s32(vld1_s32(v.v), vcreate_s32(0)); } + + // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), + // so leave the non-constexpr version default + ALWAYS_INLINE explicit GSVector4i(int i) { *this = i; } + + ALWAYS_INLINE constexpr explicit GSVector4i(int32x4_t m) : v4s(m) {} + + ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true); + + ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); + + ALWAYS_INLINE void operator=(int i) { v4s = vdupq_n_s32(i); } + + ALWAYS_INLINE operator int32x4_t() const { return v4s; } + + // rect + + ALWAYS_INLINE int width() const { return right - left; } + + ALWAYS_INLINE int height() const { return bottom - top; } + + ALWAYS_INLINE GSVector4i rsize() const + { + return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height()); + } + + ALWAYS_INLINE s32 rarea() const { return width() * height(); } + + ALWAYS_INLINE bool rempty() const { return (vminv_u32(vreinterpret_u32_s32(vget_low_s32(lt32(zwzw())))) == 0); } + + ALWAYS_INLINE GSVector4i runion(const GSVector4i& a) const { return min_i32(a).upl64(max_i32(a).srl<8>()); } + + ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& a) const { return sat_i32(a); } + ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); } + ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); } + + template + GSVector4i _ralign_helper(const GSVector4i& mask) const + { + GSVector4i v; + + switch (mode) + { + case Align_Inside: + v = add32(mask); + break; + case Align_Outside: + v = add32(mask.zwxy()); + break; + case Align_NegInf: + v = *this; + break; + case Align_PosInf: + v = add32(mask.xyxy()); + break; + + default: + UnreachableCode(); + break; + } + + return v.andnot(mask.xyxy()); + } + + /// Align the rect using mask values that already have one subtracted (1 << n - 1 aligns to 1 << n) + template + GSVector4i ralign_presub(const GSVector2i& a) const + { + return _ralign_helper(GSVector4i(a)); + } + + template + GSVector4i ralign(const GSVector2i& a) const + { + // a must be 1 << n + + return _ralign_helper(GSVector4i(a) - GSVector4i(1, 1)); + } + + // + + ALWAYS_INLINE u32 rgba32() const + { + GSVector4i v = *this; + + v = v.ps32(v); + v = v.pu16(v); + + return (u32)store(v); + } + + ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& min, const GSVector4i& 
max) const + { + return max_i8(min).min_i8(max); + } + ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& minmax) const + { + return max_i8(minmax.xyxy()).min_i8(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& min, const GSVector4i& max) const + { + return max_i16(min).min_i16(max); + } + ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& minmax) const + { + return max_i16(minmax.xyxy()).min_i16(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& min, const GSVector4i& max) const + { + return max_i32(min).min_i32(max); + } + ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& minmax) const + { + return max_i32(minmax.xyxy()).min_i32(minmax.zwzw()); + } + + ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const + { + return max_u8(min).min_u8(max); + } + ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const + { + return max_u8(minmax.xyxy()).min_u8(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const + { + return max_u16(min).min_u16(max); + } + ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const + { + return max_u16(minmax.xyxy()).min_u16(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const + { + return max_u32(min).min_u32(max); + } + ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const + { + return max_u32(minmax.xyxy()).min_u32(minmax.zwzw()); + } + + ALWAYS_INLINE GSVector4i min_i8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vminq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i max_i8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vmaxq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i min_i16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vminq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i max_i16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vmaxq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i min_i32(const GSVector4i& v) const { return GSVector4i(vminq_s32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4i max_i32(const GSVector4i& v) const { return GSVector4i(vmaxq_s32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4i min_u8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u8(vminq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i max_u8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u8(vmaxq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i min_u16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u16(vminq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i max_u16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u16(vmaxq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i min_u32(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u32(vminq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i max_u32(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u32(vmaxq_u32(vreinterpretq_u32_s32(v4s), vreinterpretq_u32_s32(v.v4s)))); + } + + ALWAYS_INLINE 
GSVector4i madd_s16(const GSVector4i& v) const + { + int32x4_t acc = + vmlal_s16(vdupq_n_s32(0), vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s))); + acc = vmlal_high_s16(acc, vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)); + return GSVector4i(acc); + } + + ALWAYS_INLINE GSVector4i addp_s32() const { return GSVector4i(vpaddq_s32(v4s, v4s)); } + + ALWAYS_INLINE s32 minv_s32() const { return vminvq_s32(v4s); } + + ALWAYS_INLINE u32 minv_u32() const { return vminvq_u32(v4s); } + + ALWAYS_INLINE s32 maxv_s32() const { return vmaxvq_s32(v4s); } + + ALWAYS_INLINE u32 maxv_u32() const { return vmaxvq_u32(v4s); } + + ALWAYS_INLINE static int min_i16(int a, int b) { return store(load(a).min_i16(load(b))); } + + ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); } + + ALWAYS_INLINE GSVector4i blend8(const GSVector4i& a, const GSVector4i& mask) const + { + uint8x16_t mask2 = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_s32(mask.v4s), 7)); + return GSVector4i(vreinterpretq_s32_u8(vbslq_u8(mask2, vreinterpretq_u8_s32(a.v4s), vreinterpretq_u8_s32(v4s)))); + } + + template + ALWAYS_INLINE GSVector4i blend16(const GSVector4i& a) const + { + static constexpr const uint16_t _mask[8] = { + ((mask) & (1 << 0)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 1)) ? (uint16_t)-1 : 0x0, + ((mask) & (1 << 2)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 3)) ? (uint16_t)-1 : 0x0, + ((mask) & (1 << 4)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 5)) ? (uint16_t)-1 : 0x0, + ((mask) & (1 << 6)) ? (uint16_t)-1 : 0x0, ((mask) & (1 << 7)) ? (uint16_t)-1 : 0x0}; + return GSVector4i( + vreinterpretq_s32_u16(vbslq_u16(vld1q_u16(_mask), vreinterpretq_u16_s32(a.v4s), vreinterpretq_u16_s32(v4s)))); + } + + template + ALWAYS_INLINE GSVector4i blend32(const GSVector4i& v) const + { + constexpr int bit3 = ((mask & 8) * 3) << 3; + constexpr int bit2 = ((mask & 4) * 3) << 2; + constexpr int bit1 = ((mask & 2) * 3) << 1; + constexpr int bit0 = (mask & 1) * 3; + return blend16(v); + } + + ALWAYS_INLINE GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const + { + return GSVector4i( + vreinterpretq_s32_s8(vorrq_s8(vbicq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(mask.v4s)), + vandq_s8(vreinterpretq_s8_s32(mask.v4s), vreinterpretq_s8_s32(v.v4s))))); + } + + ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); } + + ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const + { + return GSVector4i(vreinterpretq_s32_s8(vqtbl1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_u8_s32(mask.v4s)))); + } + + ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8( + vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v.v4s))))); + } + + ALWAYS_INLINE GSVector4i ps16() const + { + return GSVector4i(vreinterpretq_s32_s8( + vcombine_s8(vqmovn_s16(vreinterpretq_s16_s32(v4s)), vqmovn_s16(vreinterpretq_s16_s32(v4s))))); + } + + ALWAYS_INLINE GSVector4i pu16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u8( + vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v.v4s))))); + } + + ALWAYS_INLINE GSVector4i pu16() const + { + return GSVector4i(vreinterpretq_s32_u8( + vcombine_u8(vqmovun_s16(vreinterpretq_s16_s32(v4s)), vqmovun_s16(vreinterpretq_s16_s32(v4s))))); + } + + ALWAYS_INLINE GSVector4i ps32(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v.v4s)))); + } + 
+ ALWAYS_INLINE GSVector4i ps32() const + { + return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(vqmovn_s32(v4s), vqmovn_s32(v4s)))); + } + + ALWAYS_INLINE GSVector4i pu32(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i pu32() const + { + return GSVector4i(vreinterpretq_s32_u16(vcombine_u16(vqmovun_s32(v4s), vqmovun_s32(v4s)))); + } + + ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(vzip1q_s32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(vzip2q_s32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s64( + vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vget_low_s64(vreinterpretq_s64_s32(v.v4s))))); + } + + ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s64( + vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vget_high_s64(vreinterpretq_s64_s32(v.v4s))))); + } + + ALWAYS_INLINE GSVector4i upl8() const + { + return GSVector4i(vreinterpretq_s32_s8(vzip1q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0)))); + } + + ALWAYS_INLINE GSVector4i uph8() const + { + return GSVector4i(vreinterpretq_s32_s8(vzip2q_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0)))); + } + + ALWAYS_INLINE GSVector4i upl16() const + { + return GSVector4i(vreinterpretq_s32_s16(vzip1q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0)))); + } + + ALWAYS_INLINE GSVector4i uph16() const + { + return GSVector4i(vreinterpretq_s32_s16(vzip2q_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(0)))); + } + + ALWAYS_INLINE GSVector4i upl32() const { return GSVector4i(vzip1q_s32(v4s, vdupq_n_s32(0))); } + + ALWAYS_INLINE GSVector4i uph32() const { return GSVector4i(vzip2q_s32(v4s, vdupq_n_s32(0))); } + + ALWAYS_INLINE GSVector4i upl64() const + { + return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0)))); + } + + ALWAYS_INLINE GSVector4i uph64() const + { + return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(v4s)), vdup_n_s64(0)))); + } + + ALWAYS_INLINE GSVector4i i8to16() const + { + return GSVector4i(vreinterpretq_s32_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s))))); + } + + ALWAYS_INLINE GSVector4i u8to16() const + { + return GSVector4i(vreinterpretq_s32_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s))))); + } + + ALWAYS_INLINE GSVector4i i8to32() const + { + return GSVector4i(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s)))))); + } + + ALWAYS_INLINE GSVector4i u8to32() const + { + return 
GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s))))))); + } + + ALWAYS_INLINE GSVector4i i8to64() const + { + return GSVector4i(vreinterpretq_s32_s64( + vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(vreinterpretq_s8_s32(v4s))))))))); + } + + ALWAYS_INLINE GSVector4i u8to64() const + { + return GSVector4i(vreinterpretq_s32_u64( + vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s32(v4s))))))))); + } + + ALWAYS_INLINE GSVector4i i16to32() const { return GSVector4i(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)))); } + + ALWAYS_INLINE GSVector4i u16to32() const + { + return GSVector4i(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s))))); + } + + ALWAYS_INLINE GSVector4i i16to64() const + { + return GSVector4i( + vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vreinterpretq_s16_s32(v4s))))))); + } + + ALWAYS_INLINE GSVector4i u16to64() const + { + return GSVector4i( + vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vreinterpretq_u16_s32(v4s))))))); + } + + ALWAYS_INLINE GSVector4i i32to64() const { return GSVector4i(vreinterpretq_s32_s64(vmovl_s32(vget_low_s32(v4s)))); } + + ALWAYS_INLINE GSVector4i u32to64() const + { + return GSVector4i(vreinterpretq_s32_u64(vmovl_u32(vget_low_u32(vreinterpretq_u32_s32(v4s))))); + } + + template + ALWAYS_INLINE GSVector4i srl() const + { + return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vreinterpretq_s8_s32(v4s), vdupq_n_s8(0), i))); + } + + template + ALWAYS_INLINE GSVector4i srl(const GSVector4i& v) + { + if constexpr (i >= 16) + return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v.v4s), vdupq_n_u8(0), i - 16))); + else + return GSVector4i(vreinterpretq_s32_u8(vextq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s), i))); + } + + template + ALWAYS_INLINE GSVector4i sll() const + { + return GSVector4i(vreinterpretq_s32_s8(vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_s32(v4s), 16 - i))); + } + + template + ALWAYS_INLINE GSVector4i sll16() const + { + return GSVector4i(vreinterpretq_s32_s16(vshlq_n_s16(vreinterpretq_s16_s32(v4s), i))); + } + + ALWAYS_INLINE GSVector4i sll16(s32 i) const + { + return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(i)))); + } + + ALWAYS_INLINE GSVector4i sllv16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + template + ALWAYS_INLINE GSVector4i srl16() const + { + return GSVector4i(vreinterpretq_s32_u16(vshrq_n_u16(vreinterpretq_u16_s32(v4s), i))); + } + + ALWAYS_INLINE GSVector4i srl16(s32 i) const + { + return GSVector4i(vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vdupq_n_u16(-i)))); + } + + ALWAYS_INLINE GSVector4i srlv16(const GSVector4i& v) const + { + return GSVector4i( + vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s))))); + } + + template + ALWAYS_INLINE GSVector4i sra16() const + { + constexpr int count = (i & ~15) ? 
15 : i; + return GSVector4i(vreinterpretq_s32_s16(vshrq_n_s16(vreinterpretq_s16_s32(v4s), count))); + } + + ALWAYS_INLINE GSVector4i sra16(s32 i) const + { + return GSVector4i(vreinterpretq_s32_s16(vshlq_s16(vreinterpretq_s16_s32(v4s), vdupq_n_s16(-i)))); + } + + ALWAYS_INLINE GSVector4i srav16(const GSVector4i& v) const + { + return GSVector4i( + vreinterpretq_s32_u16(vshlq_u16(vreinterpretq_u16_s32(v4s), vnegq_s16(vreinterpretq_s16_s32(v.v4s))))); + } + + template + ALWAYS_INLINE GSVector4i sll32() const + { + return GSVector4i(vshlq_n_s32(v4s, i)); + } + + ALWAYS_INLINE GSVector4i sll32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(i))); } + + ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const { return GSVector4i(vshlq_s32(v4s, v.v4s)); } + + template + ALWAYS_INLINE GSVector4i srl32() const + { + return GSVector4i(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(v4s), i))); + } + + ALWAYS_INLINE GSVector4i srl32(s32 i) const + { + return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vdupq_n_s32(-i)))); + } + + ALWAYS_INLINE GSVector4i srlv32(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u32(vshlq_u32(vreinterpretq_u32_s32(v4s), vnegq_s32(v.v4s)))); + } + + template + ALWAYS_INLINE GSVector4i sra32() const + { + return GSVector4i(vshrq_n_s32(v4s, i)); + } + + ALWAYS_INLINE GSVector4i sra32(s32 i) const { return GSVector4i(vshlq_s32(v4s, vdupq_n_s32(-i))); } + + ALWAYS_INLINE GSVector4i srav32(const GSVector4i& v) const + { + return GSVector4i(vshlq_s32(vreinterpretq_u32_s32(v4s), vnegq_s32(v.v4s))); + } + + template + ALWAYS_INLINE GSVector4i sll64() const + { + return GSVector4i(vreinterpretq_s32_s64(vshlq_n_s64(vreinterpretq_s64_s32(v4s), i))); + } + + ALWAYS_INLINE GSVector4i sll64(s32 i) const + { + return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s16(i)))); + } + + ALWAYS_INLINE GSVector4i sllv64(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s)))); + } + + template + ALWAYS_INLINE GSVector4i sra64() const + { + return GSVector4i(vreinterpretq_s32_s64(vshrq_n_s64(vreinterpretq_s64_s32(v4s), i))); + } + + ALWAYS_INLINE GSVector4i sra64(s32 i) const + { + return GSVector4i(vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vdupq_n_s16(-i)))); + } + + ALWAYS_INLINE GSVector4i srav64(const GSVector4i& v) const + { + return GSVector4i( + vreinterpretq_s32_s64(vshlq_s64(vreinterpretq_s64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s))))); + } + + template + ALWAYS_INLINE GSVector4i srl64() const + { + return GSVector4i(vreinterpretq_s32_u64(vshrq_n_u64(vreinterpretq_u64_s32(v4s), i))); + } + + ALWAYS_INLINE GSVector4i srl64(s32 i) const + { + return GSVector4i(vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vdupq_n_u16(-i)))); + } + + ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const + { + return GSVector4i( + vreinterpretq_s32_u64(vshlq_u64(vreinterpretq_u64_s32(v4s), vnegq_s64(vreinterpretq_s64_s32(v.v4s))))); + } + + ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return 
GSVector4i(vaddq_s32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vqaddq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vqaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const + { + // can't use vpaddq_s16() here, because we need saturation. + // return GSVector4i(vreinterpretq_s32_s16(vpaddq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + const int16x8_t a = vreinterpretq_s16_s32(v4s); + const int16x8_t b = vreinterpretq_s16_s32(v.v4s); + return GSVector4i(vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); + } + + ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u8(vqaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u16(vqaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(vsubq_s32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vqsubq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vqsubq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u8(vqsubq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u16(vqsubq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i avg8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u8(vrhaddq_u8(vreinterpretq_u8_s32(v4s), vreinterpretq_u8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i avg16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u16(vrhaddq_u16(vreinterpretq_u16_s32(v4s), vreinterpretq_u16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const + { + // from sse2neon + int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_s32(v4s)); + int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_s32(v.v4s)); + int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ + int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_s32(v4s)); + int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_s32(v.v4s)); + int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ + uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); + return GSVector4i(vreinterpretq_s32_u16(r.val[1])); + } + + ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vmulq_s16(vreinterpretq_s16_s32(v4s), 
vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const + { + int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_s32(v4s)), vget_low_s16(vreinterpretq_s16_s32(v.v4s))); + int32x4_t mul_hi = + vmull_s16(vget_high_s16(vreinterpretq_s16_s32(v4s)), vget_high_s16(vreinterpretq_s16_s32(v.v4s))); + int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); + int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); + return GSVector4i(vreinterpretq_s32_s16(vcombine_s16(narrow_lo, narrow_hi))); + } + + ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(vmulq_s32(v4s, v.v4s)); } + + template + ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const + { + // (a - this) * f << shift + this + + return add16(a.sub16(*this).modulate16(f)); + } + + template + ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c) + { + // (a - b) * c << shift + + return a.sub16(b).modulate16(c); + } + + template + ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c, + const GSVector4i& d) + { + // (a - b) * c << shift + d + + return d.add16(a.sub16(b).modulate16(c)); + } + + ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a, const GSVector4i& f) const + { + // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit) + + return add16(a.sub16(*this).mul16l(f).sra16<4>()); + } + + template + ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const + { + // a * f << shift + if (shift == 0) + { + return mul16hrs(f); + } + + return sll16().mul16hs(f); + } + + ALWAYS_INLINE bool eq(const GSVector4i& v) const + { + return (vmaxvq_u32(vreinterpretq_u32_s32(veorq_s32(v4s, v.v4s))) == 0); + } + + ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u8(vceqq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i eq16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u16(vceqq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i eq32(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u32(vceqq_s32(v4s, v.v4s))); + } + + ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_u64(vceqq_s64(vreinterpretq_s64_s32(v4s), vreinterpretq_s64_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); } + + ALWAYS_INLINE GSVector4i neq16(const GSVector4i& v) const { return ~eq16(v); } + + ALWAYS_INLINE GSVector4i neq32(const GSVector4i& v) const { return ~eq32(v); } + + ALWAYS_INLINE GSVector4i gt8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vcgtq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i gt16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vcgtq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i gt32(const GSVector4i& v) const { return GSVector4i(vcgtq_s32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4i ge8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vcgeq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + ALWAYS_INLINE GSVector4i ge16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vcgeq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + ALWAYS_INLINE GSVector4i ge32(const 
GSVector4i& v) const { return GSVector4i(vcgeq_s32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4i lt8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vcltq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i lt16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vcltq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + + ALWAYS_INLINE GSVector4i lt32(const GSVector4i& v) const { return GSVector4i(vcltq_s32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4i le8(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s8(vcleq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s)))); + } + ALWAYS_INLINE GSVector4i le16(const GSVector4i& v) const + { + return GSVector4i(vreinterpretq_s32_s16(vcleq_s16(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v.v4s)))); + } + ALWAYS_INLINE GSVector4i le32(const GSVector4i& v) const { return GSVector4i(vcleq_s32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { return GSVector4i(vbicq_s32(v4s, v.v4s)); } + + ALWAYS_INLINE int mask() const + { + // borrowed from sse2neon + const uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_s32(v4s), 7)); + const uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + const uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + const uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + return static_cast(vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8)); + } + + ALWAYS_INLINE bool alltrue() const + { + // MSB should be set in all 8-bit lanes. + return (vminvq_u8(vreinterpretq_u8_s32(v4s)) & 0x80) == 0x80; + } + + ALWAYS_INLINE bool allfalse() const + { + // MSB should be clear in all 8-bit lanes. 
+ return (vmaxvq_u32(vreinterpretq_u8_s32(v4s)) & 0x80) != 0x80; + } + + template + ALWAYS_INLINE GSVector4i insert8(int a) const + { + return GSVector4i(vreinterpretq_s32_u8(vsetq_lane_u8(a, vreinterpretq_u8_s32(v4s), static_cast(i)))); + } + + template + ALWAYS_INLINE int extract8() const + { + return vgetq_lane_u8(vreinterpretq_u8_s32(v4s), i); + } + + template + ALWAYS_INLINE GSVector4i insert16(int a) const + { + return GSVector4i(vreinterpretq_s32_u16(vsetq_lane_u16(a, vreinterpretq_u16_s32(v4s), static_cast(i)))); + } + + template + ALWAYS_INLINE int extract16() const + { + return vgetq_lane_u16(vreinterpretq_u16_s32(v4s), i); + } + + template + ALWAYS_INLINE GSVector4i insert32(int a) const + { + return GSVector4i(vsetq_lane_s32(a, v4s, i)); + } + + template + ALWAYS_INLINE int extract32() const + { + return vgetq_lane_s32(v4s, i); + } + + template + ALWAYS_INLINE GSVector4i insert64(s64 a) const + { + return GSVector4i(vreinterpretq_s32_s64(vsetq_lane_s64(a, vreinterpretq_s64_s32(v4s), i))); + } + + template + ALWAYS_INLINE s64 extract64() const + { + return vgetq_lane_s64(vreinterpretq_s64_s32(v4s), i); + } + + ALWAYS_INLINE static GSVector4i loadnt(const void* p) + { +#if __has_builtin(__builtin_nontemporal_store) + return GSVector4i(__builtin_nontemporal_load((int32x4_t*)p)); +#else + return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p))); +#endif + } + + ALWAYS_INLINE static GSVector4i load32(const void* p) + { + // should be ldr s0, [x0] + u32 val; + std::memcpy(&val, p, sizeof(u32)); + return GSVector4i(vsetq_lane_u32(val, vdupq_n_u32(0), 0)); + } + + ALWAYS_INLINE static GSVector4i loadl(const void* p) + { + return GSVector4i(vcombine_s32(vld1_s32((const int32_t*)p), vcreate_s32(0))); + } + + ALWAYS_INLINE static GSVector4i loadh(const void* p) + { + return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vdup_n_s64(0), vld1_s64((int64_t*)p)))); + } + + ALWAYS_INLINE static GSVector4i loadh(const void* p, const GSVector4i& v) + { + return GSVector4i( + vreinterpretq_s32_s64(vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(v.v4s)), vld1_s64((int64_t*)p)))); + } + + ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return loadh(&v); } + + ALWAYS_INLINE static GSVector4i load(const void* pl, const void* ph) + { + return GSVector4i(vreinterpretq_s32_s64(vcombine_s64(vld1_s64((int64_t*)pl), vld1_s64((int64_t*)ph)))); + } + + template + ALWAYS_INLINE static GSVector4i load(const void* p) + { + return GSVector4i(vreinterpretq_s32_s64(vld1q_s64((int64_t*)p))); + } + + ALWAYS_INLINE static GSVector4i load(int i) { return GSVector4i(vsetq_lane_s32(i, vdupq_n_s32(0), 0)); } + + ALWAYS_INLINE static GSVector4i loadq(s64 i) + { + return GSVector4i(vreinterpretq_s32_s64(vsetq_lane_s64(i, vdupq_n_s64(0), 0))); + } + + ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) + { +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(v.v4s, ((int32x4_t*)p)); +#else + vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s)); +#endif + } + + ALWAYS_INLINE static void store32(void* p, const GSVector4i& v) + { + u32 val = vgetq_lane_s32(v, 0); + std::memcpy(p, &val, sizeof(u32)); + } + + ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) + { + vst1_s64((int64_t*)p, vget_low_s64(vreinterpretq_s64_s32(v.v4s))); + } + + ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) + { + vst1_s64((int64_t*)p, vget_high_s64(vreinterpretq_s64_s32(v.v4s))); + } + + ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v) + { + 
GSVector4i::storel(pl, v); + GSVector4i::storeh(ph, v); + } + + template + ALWAYS_INLINE static void store(void* p, const GSVector4i& v) + { + vst1q_s64((int64_t*)p, vreinterpretq_s64_s32(v.v4s)); + } + + ALWAYS_INLINE static int store(const GSVector4i& v) { return vgetq_lane_s32(v.v4s, 0); } + + ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return vgetq_lane_s64(vreinterpretq_s64_s32(v.v4s), 0); } + + ALWAYS_INLINE void operator&=(const GSVector4i& v) + { + v4s = vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))); + } + + ALWAYS_INLINE void operator|=(const GSVector4i& v) + { + v4s = vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))); + } + + ALWAYS_INLINE void operator^=(const GSVector4i& v) + { + v4s = vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v4s), vreinterpretq_s8_s32(v.v4s))); + } + + ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2) + { + return GSVector4i(vreinterpretq_s32_s8(vandq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s)))); + } + + ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2) + { + return GSVector4i(vreinterpretq_s32_s8(vorrq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s)))); + } + + ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2) + { + return GSVector4i(vreinterpretq_s32_s8(veorq_s8(vreinterpretq_s8_s32(v1.v4s), vreinterpretq_s8_s32(v2.v4s)))); + } + + ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, int i) { return v & GSVector4i(i); } + + ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, int i) { return v | GSVector4i(i); } + + ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, int i) { return v ^ GSVector4i(i); } + + ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return GSVector4i(vmvnq_s32(v.v4s)); } + + ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(0); } + + ALWAYS_INLINE static GSVector4i xffffffff() { return GSVector4i(0xFFFFFFFF); } + + ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); } + + ALWAYS_INLINE GSVector2i xy() const + { + GSVector2i ret; + storel(&ret, *this); + return ret; + } + + ALWAYS_INLINE GSVector2i zw() const + { + GSVector2i ret; + storeh(&ret, *this); + return ret; + } + + // clang-format off + + +#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const { return GSVector4i(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); } + + // ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const {return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn)));} + // ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const {return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} + // ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const {return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} + // ALWAYS_INLINE GSVector4i xs##ys##zs##ws##lh() const {return GSVector4i(_mm_shufflehi_epi16(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)), _MM_SHUFFLE(wn, zn, yn, xn)));} + +#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ + +#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ + VECTOR4i_SHUFFLE_3(xs, xn, 
ys, yn, y, 1) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ + +#define VECTOR4i_SHUFFLE_1(xs, xn) \ + VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \ + VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \ + VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \ + VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \ + + VECTOR4i_SHUFFLE_1(x, 0) + VECTOR4i_SHUFFLE_1(y, 1) + VECTOR4i_SHUFFLE_1(z, 2) + VECTOR4i_SHUFFLE_1(w, 3) + + // TODO: Make generic like above. + ALWAYS_INLINE GSVector4i xxzzlh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 0, 2, 2, 4, 4, 6, 6))); } + ALWAYS_INLINE GSVector4i yywwlh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 1, 1, 3, 3, 5, 5, 7, 7))); } + ALWAYS_INLINE GSVector4i yxwzlh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 1, 0, 3, 2, 5, 4, 7, 6))); } + ALWAYS_INLINE GSVector4i xxxxlh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 0, 0, 0, 4, 4, 4, 4))); } + + ALWAYS_INLINE GSVector4i xxxxl() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 0, 0, 0, 4, 5, 6, 7))); } + ALWAYS_INLINE GSVector4i zwxyl() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 2, 3, 0, 1, 4, 5, 6, 7))); } + ALWAYS_INLINE GSVector4i yxwzl() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 1, 0, 3, 2, 4, 5, 6, 7))); } + ALWAYS_INLINE GSVector4i zwzwl() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 2, 3, 2, 3, 4, 5, 6, 7))); } + + ALWAYS_INLINE GSVector4i zzzzh() const { return GSVector4i(vreinterpretq_s32_s16(__builtin_shufflevector(vreinterpretq_s16_s32(v4s), vreinterpretq_s16_s32(v4s), 0, 1, 2, 3, 6, 6, 6, 6))); } + + // clang-format on +}; + +class alignas(16) GSVector4 +{ + struct cxpr_init_tag + { + }; + static constexpr cxpr_init_tag cxpr_init{}; + + constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {} + + constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {} + + constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {} + +public: + union + { + struct + { + float x, y, z, w; + }; + struct + { + float r, g, b, a; + }; + struct + { + float left, top, right, bottom; + }; + float F32[4]; + double F64[2]; + s8 I8[16]; + s16 I16[8]; + s32 I32[4]; + s64 I64[2]; + u8 U8[16]; + u16 U16[8]; + u32 U32[4]; + u64 U64[2]; + float32x4_t v4s; + }; + + GSVector4() = default; + + constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); } + + constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); } + + constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); } + + constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); } + + constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); } + + constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); } + + ALWAYS_INLINE GSVector4(float x, float y, float z, float 
w) + { + const float arr[4] = {x, y, z, w}; + v4s = vld1q_f32(arr); + } + + ALWAYS_INLINE GSVector4(float x, float y) + { + v4s = vzip1q_f32(vsetq_lane_f32(x, vdupq_n_f32(0.0f), 0), vsetq_lane_f32(y, vdupq_n_f32(0.0f), 0)); + } + + ALWAYS_INLINE GSVector4(int x, int y, int z, int w) + { + const int arr[4] = {x, y, z, w}; + v4s = vcvtq_f32_s32(vld1q_s32(arr)); + } + + ALWAYS_INLINE GSVector4(int x, int y) + { + v4s = vcvtq_f32_s32(vzip1q_s32(vsetq_lane_s32(x, vdupq_n_s32(0), 0), vsetq_lane_s32(y, vdupq_n_s32(0), 0))); + } + + ALWAYS_INLINE explicit GSVector4(const GSVector2& v) { v4s = vcombine_f32(vld1_f32(v.v), vcreate_f32(0)); } + + ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) + { + v4s = vcvtq_f32_s32(vcombine_s32(vld1_s32(v.v), vcreate_s32(0))); + } + + ALWAYS_INLINE constexpr explicit GSVector4(float32x4_t m) : v4s(m) {} + + ALWAYS_INLINE explicit GSVector4(float f) { v4s = vdupq_n_f32(f); } + + ALWAYS_INLINE explicit GSVector4(int i) { v4s = vcvtq_f32_s32(vdupq_n_s32(i)); } + + ALWAYS_INLINE explicit GSVector4(const GSVector4i& v); + + ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v); + + ALWAYS_INLINE static GSVector4 f64(double x, double y) + { + return GSVector4(vreinterpretq_f32_f64(vsetq_lane_f64(y, vdupq_n_f64(x), 1))); + } + + ALWAYS_INLINE void operator=(float f) { v4s = vdupq_n_f32(f); } + + ALWAYS_INLINE void operator=(float32x4_t m) { v4s = m; } + + ALWAYS_INLINE operator float32x4_t() const { return v4s; } + + /// Makes Clang think that the whole vector is needed, preventing it from changing shuffles around because it thinks + /// we don't need the whole vector Useful for e.g. preventing clang from optimizing shuffles that remove + /// possibly-denormal garbage data from vectors before computing with them + ALWAYS_INLINE GSVector4 noopt() + { + // Note: Clang is currently the only compiler that attempts to optimize vector intrinsics, if that changes in the + // future the implementation should be updated +#ifdef __clang__ + // __asm__("":"+x"(m)::); +#endif + return *this; + } + + ALWAYS_INLINE u32 rgba32() const { return GSVector4i(*this).rgba32(); } + + ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); } + + ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); } + + ALWAYS_INLINE GSVector4 abs() const { return GSVector4(vabsq_f32(v4s)); } + + ALWAYS_INLINE GSVector4 neg() const { return GSVector4(vnegq_f32(v4s)); } + + ALWAYS_INLINE GSVector4 rcp() const { return GSVector4(vrecpeq_f32(v4s)); } + + ALWAYS_INLINE GSVector4 rcpnr() const + { + float32x4_t recip = vrecpeq_f32(v4s); + recip = vmulq_f32(recip, vrecpsq_f32(recip, v4s)); + return GSVector4(recip); + } + + template + ALWAYS_INLINE GSVector4 round() const + { + if constexpr (mode == Round_NegInf) + return floor(); + else if constexpr (mode == Round_PosInf) + return ceil(); + else if constexpr (mode == Round_NearestInt) + return GSVector4(vrndnq_f32(v4s)); + else + return GSVector4(vrndq_f32(v4s)); + } + + ALWAYS_INLINE GSVector4 floor() const { return GSVector4(vrndmq_f32(v4s)); } + + ALWAYS_INLINE GSVector4 ceil() const { return GSVector4(vrndpq_f32(v4s)); } + + ALWAYS_INLINE GSVector4 madd(const GSVector4& a, const GSVector4& b) const + { + return GSVector4(vfmaq_f32(b.v4s, v4s, a.v4s)); + } + ALWAYS_INLINE GSVector4 msub(const GSVector4& a, const GSVector4& b) const + { + return GSVector4(vfmsq_f32(b.v4s, v4s, a.v4s)); + } + ALWAYS_INLINE GSVector4 nmadd(const GSVector4& a, const 
GSVector4& b) const { return b - *this * a; } + ALWAYS_INLINE GSVector4 nmsub(const GSVector4& a, const GSVector4& b) const { return -b - *this * a; } + + ALWAYS_INLINE GSVector4 addm(const GSVector4& a, const GSVector4& b) const + { + return a.madd(b, *this); // *this + a * b + } + + ALWAYS_INLINE GSVector4 subm(const GSVector4& a, const GSVector4& b) const + { + return a.nmadd(b, *this); // *this - a * b + } + + ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(vpaddq_f32(v4s, v4s)); } + + ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(vpaddq_f32(v4s, v.v4s)); } + + ALWAYS_INLINE GSVector4 hsub() const { return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v4s), vuzp2q_f32(v4s, v4s))); } + + ALWAYS_INLINE GSVector4 hsub(const GSVector4& v) const + { + return GSVector4(vsubq_f32(vuzp1q_f32(v4s, v.v4s), vuzp2q_f32(v4s, v.v4s))); + } + + ALWAYS_INLINE GSVector4 sat(const GSVector4& a, const GSVector4& b) const { return max(a).min(b); } + + ALWAYS_INLINE GSVector4 sat(const GSVector4& a) const + { + const GSVector4 minv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 0))); + const GSVector4 maxv(vreinterpretq_f32_f64(vdupq_laneq_f64(vreinterpretq_f64_f32(a.v4s), 1))); + return sat(minv, maxv); + } + + ALWAYS_INLINE GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); } + + ALWAYS_INLINE GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); } + + ALWAYS_INLINE GSVector4 min(const GSVector4& a) const { return GSVector4(vminq_f32(v4s, a.v4s)); } + + ALWAYS_INLINE GSVector4 max(const GSVector4& a) const { return GSVector4(vmaxq_f32(v4s, a.v4s)); } + + template + ALWAYS_INLINE GSVector4 blend32(const GSVector4& a) const + { + return GSVector4(__builtin_shufflevector(v4s, a.v4s, (mask & 1) ? 4 : 0, (mask & 2) ? 5 : 1, (mask & 4) ? 6 : 2, + (mask & 8) ? 
7 : 3)); + } + + ALWAYS_INLINE GSVector4 blend32(const GSVector4& a, const GSVector4& mask) const + { + // duplicate sign bit across and bit select + const uint32x4_t bitmask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(mask.v4s), 31)); + return GSVector4(vbslq_f32(bitmask, a.v4s, v4s)); + } + + ALWAYS_INLINE GSVector4 upl(const GSVector4& a) const { return GSVector4(vzip1q_f32(v4s, a.v4s)); } + + ALWAYS_INLINE GSVector4 uph(const GSVector4& a) const { return GSVector4(vzip2q_f32(v4s, a.v4s)); } + + ALWAYS_INLINE GSVector4 upld(const GSVector4& a) const + { + return GSVector4(vreinterpretq_f32_f64(vzip1q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s)))); + } + + ALWAYS_INLINE GSVector4 uphd(const GSVector4& a) const + { + return GSVector4(vreinterpretq_f32_f64(vzip2q_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(a.v4s)))); + } + + ALWAYS_INLINE GSVector4 l2h(const GSVector4& a) const + { + return GSVector4(vcombine_f32(vget_low_f32(v4s), vget_low_f32(a.v4s))); + } + + ALWAYS_INLINE GSVector4 h2l(const GSVector4& a) const + { + return GSVector4(vcombine_f32(vget_high_f32(v4s), vget_high_f32(a.v4s))); + } + + ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const + { + return GSVector4(vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(v4s), vreinterpretq_s32_f32(v.v4s)))); + } + + ALWAYS_INLINE int mask() const + { + static constexpr const int32_t shifts[] = {0, 1, 2, 3}; + return static_cast(vaddvq_u32(vshlq_u32(vshrq_n_u32(vreinterpretq_u32_f32(v4s), 31), vld1q_s32(shifts)))); + } + + ALWAYS_INLINE bool alltrue() const + { + // return mask() == 0xf; + return ~(vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 0) & vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 1)) == 0; + } + + ALWAYS_INLINE bool allfalse() const + { + return (vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 0) | vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 1)) == 0; + } + + ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); } + + template + ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const + { + return GSVector4(vcopyq_laneq_f32(v4s, dst, v.v4s, src)); + } + + template + ALWAYS_INLINE int extract32() const + { + return vgetq_lane_s32(vreinterpretq_s32_f32(v4s), i); + } + + ALWAYS_INLINE static GSVector4 zero() { return GSVector4(vdupq_n_f32(0.0f)); } + + ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); } + + ALWAYS_INLINE static GSVector4 loadl(const void* p) + { + return GSVector4(vcombine_f32(vld1_f32((const float*)p), vcreate_f32(0))); + } + + ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(vsetq_lane_f32(f, vmovq_n_f32(0.0f), 0)); } + + template + ALWAYS_INLINE static GSVector4 load(const void* p) + { + return GSVector4(vld1q_f32((const float*)p)); + } + + ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { vst1q_f32((float*)p, v.v4s); } + + ALWAYS_INLINE static void storel(void* p, const GSVector4& v) + { + vst1_f64((double*)p, vget_low_f64(vreinterpretq_f64_f32(v.v4s))); + } + + ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) + { + vst1_f64((double*)p, vget_high_f64(vreinterpretq_f64_f32(v.v4s))); + } + + template + ALWAYS_INLINE static void store(void* p, const GSVector4& v) + { + vst1q_f32((float*)p, v.v4s); + } + + ALWAYS_INLINE static void store(float* p, const GSVector4& v) { vst1q_lane_f32(p, v.v4s, 0); } + + ALWAYS_INLINE GSVector4 operator-() const { return neg(); } + + ALWAYS_INLINE void operator+=(const GSVector4& v) { 
v4s = vaddq_f32(v4s, v.v4s); } + ALWAYS_INLINE void operator-=(const GSVector4& v) { v4s = vsubq_f32(v4s, v.v4s); } + ALWAYS_INLINE void operator*=(const GSVector4& v) { v4s = vmulq_f32(v4s, v.v4s); } + ALWAYS_INLINE void operator/=(const GSVector4& v) { v4s = vdivq_f32(v4s, v.v4s); } + + ALWAYS_INLINE void operator+=(float f) { *this += GSVector4(f); } + ALWAYS_INLINE void operator-=(float f) { *this -= GSVector4(f); } + ALWAYS_INLINE void operator*=(float f) { *this *= GSVector4(f); } + ALWAYS_INLINE void operator/=(float f) { *this /= GSVector4(f); } + + ALWAYS_INLINE void operator&=(const GSVector4& v) + { + v4s = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s))); + } + + ALWAYS_INLINE void operator|=(const GSVector4& v) + { + v4s = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s))); + } + + ALWAYS_INLINE void operator^=(const GSVector4& v) + { + v4s = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v4s), vreinterpretq_u32_f32(v.v4s))); + } + + ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vaddq_f32(v1.v4s, v2.v4s)); + } + + ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vsubq_f32(v1.v4s, v2.v4s)); + } + + ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vmulq_f32(v1.v4s, v2.v4s)); + } + + ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vdivq_f32(v1.v4s, v2.v4s)); + } + + ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); } + ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); } + ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); } + ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f) { return v / GSVector4(f); } + + ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s)))); + } + + ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s)))); + } + + ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v1.v4s), vreinterpretq_u32_f32(v2.v4s)))); + } + + ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(vceqq_f32(v1.v4s, v2.v4s))); + } + + ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2) + { + // NEON has no != + return GSVector4(vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(v1.v4s, v2.v4s)))); + } + + ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(vcgtq_f32(v1.v4s, v2.v4s))); + } + + ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(vcltq_f32(v1.v4s, v2.v4s))); + } + + ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(vcgeq_f32(v1.v4s, v2.v4s))); + } + + ALWAYS_INLINE friend GSVector4 
operator<=(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(vreinterpretq_f32_u32(vcleq_f32(v1.v4s, v2.v4s))); + } + + ALWAYS_INLINE GSVector4 mul64(const GSVector4& v) const + { + return GSVector4(vmulq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))); + } + + ALWAYS_INLINE GSVector4 add64(const GSVector4& v) const + { + return GSVector4(vaddq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))); + } + + ALWAYS_INLINE GSVector4 sub64(const GSVector4& v) const + { + return GSVector4(vsubq_f64(vreinterpretq_f64_f32(v4s), vreinterpretq_f64_f32(v.v4s))); + } + + ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v) + { + return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vget_low_f32(v.v4s)))); + } + + ALWAYS_INLINE static GSVector4 f32to64(const void* p) + { + return GSVector4(vreinterpretq_f32_f64(vcvt_f64_f32(vld1_f32(static_cast(p))))); + } + + ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const + { + const float64x2_t r = truncate ? v4s : vrndiq_f64(vreinterpretq_f64_f32(v4s)); + const s32 low = static_cast(vgetq_lane_f64(r, 0)); + const s32 high = static_cast(vgetq_lane_f64(r, 1)); + return GSVector4i(vsetq_lane_s32(high, vsetq_lane_s32(low, vdupq_n_s32(0), 0), 1)); + } + + // clang-format off + +#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(__builtin_shufflevector(v4s, v4s, xn, yn, zn, wn)); } \ + ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v) const { return GSVector4(__builtin_shufflevector(v4s, v.v4s, xn, yn, 4 + zn, 4 + wn)); } + +#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ + +#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ + +#define VECTOR4_SHUFFLE_1(xs, xn) \ + VECTOR4_SHUFFLE_2(xs, xn, x, 0) \ + VECTOR4_SHUFFLE_2(xs, xn, y, 1) \ + VECTOR4_SHUFFLE_2(xs, xn, z, 2) \ + VECTOR4_SHUFFLE_2(xs, xn, w, 3) \ + + VECTOR4_SHUFFLE_1(x, 0) + VECTOR4_SHUFFLE_1(y, 1) + VECTOR4_SHUFFLE_1(z, 2) + VECTOR4_SHUFFLE_1(w, 3) + + // clang-format on + + ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(vdupq_laneq_f32(v4s, 0)); } + + ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v) { return GSVector4(vdupq_laneq_f32(v.v4s, 0)); } + + ALWAYS_INLINE static GSVector4 broadcast32(const void* f) { return GSVector4(vld1q_dup_f32((const float*)f)); } + + ALWAYS_INLINE static GSVector4 broadcast64(const void* f) + { + return GSVector4(vreinterpretq_f64_f32(vld1q_dup_f64((const double*)f))); + } +}; + +ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate) +{ + v4s = truncate ? 
vcvtq_s32_f32(v.v4s) : vcvtnq_s32_f32(v.v4s); +} + +ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v) +{ + v4s = vcvtq_f32_s32(v.v4s); +} + +ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v) +{ + return GSVector4i(vreinterpretq_s32_f32(v.v4s)); +} + +ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v) +{ + return GSVector4(vreinterpretq_f32_s32(v.v4s)); +} diff --git a/src/common/gsvector_nosimd.h b/src/common/gsvector_nosimd.h new file mode 100644 index 000000000..b718b8e3c --- /dev/null +++ b/src/common/gsvector_nosimd.h @@ -0,0 +1,1612 @@ +// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team, 2019-2024 Connor McLaughlin +// SPDX-License-Identifier: LGPL-3.0+ + +// Implementation of GSVector4/GSVector4i when the host does not support any form of SIMD. + +#pragma once + +#include "common/assert.h" +#include "common/types.h" + +#include <algorithm> +#include <cmath> +#include <cstring> + +#define GSVECTOR_HAS_UNSIGNED 1 +#define GSVECTOR_HAS_SRLV 1 + +class GSVector4; + +#define ALL_LANES_8(expr) \ + GSVector4i ret; \ + for (size_t i = 0; i < 16; i++) \ + expr; \ + return ret; +#define ALL_LANES_16(expr) \ + GSVector4i ret; \ + for (size_t i = 0; i < 8; i++) \ + expr; \ + return ret; +#define ALL_LANES_32(expr) \ + GSVector4i ret; \ + for (size_t i = 0; i < 4; i++) \ + expr; \ + return ret; +#define ALL_LANES_64(expr) \ + GSVector4i ret; \ + for (size_t i = 0; i < 2; i++) \ + expr; \ + return ret; +#define SSATURATE8(expr) static_cast<s8>(std::clamp(expr, -128, 127)) +#define USATURATE8(expr) static_cast<u8>(std::clamp(expr, 0, 255)) +#define SSATURATE16(expr) static_cast<s16>(std::clamp(expr, -32768, 32767)) +#define USATURATE16(expr) static_cast<u16>(std::clamp(expr, 0, 65535)) + +class alignas(16) GSVector4i +{ + struct cxpr_init_tag + { + }; + static constexpr cxpr_init_tag cxpr_init{}; + + constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : I32{x, y, z, w} {} + + constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) + : I16{s0, s1, s2, s3, s4, s5, s6, s7} + { + } + + constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, + s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) + : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} + { + } + +public: + union + { + struct + { + s32 x, y, z, w; + }; + struct + { + s32 r, g, b, a; + }; + struct + { + s32 left, top, right, bottom; + }; + float F32[4]; + s8 I8[16]; + s16 I16[8]; + s32 I32[4]; + s64 I64[2]; + u8 U8[16]; + u16 U16[8]; + u32 U32[4]; + u64 U64[2]; + }; + + GSVector4i() = default; + + ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w) + { + return GSVector4i(cxpr_init, x, y, z, w); + } + + ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); } + + ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); } + + ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) + { + return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7); + } + + ALWAYS_INLINE constexpr static GSVector4i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, + s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) + { + return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15); + } + + ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w) + { + this->x = x; + this->y = y; + this->z = z; + this->w = w; + } + +
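// Editor's aside (illustrative sketch, not part of the patch): the no-SIMD header above routes every
// operation through the ALL_LANES_* macros, which expand one scalar expression per lane and return the
// accumulated result. Below is a minimal standalone example of that same pattern, using hypothetical
// names (Vec4, add32_scalar) rather than the real GSVector4i API:
#include <cstdio>

struct Vec4
{
  int I32[4]; // stand-in for the four s32 lanes of GSVector4i
};

// Hand-expanded equivalent of ALL_LANES_32(ret.I32[i] = I32[i] + v.I32[i]),
// i.e. the shape GSVector4i::add32() takes on the no-SIMD path.
static Vec4 add32_scalar(const Vec4& a, const Vec4& b)
{
  Vec4 ret;
  for (int i = 0; i < 4; i++)
    ret.I32[i] = a.I32[i] + b.I32[i];
  return ret;
}

int main()
{
  const Vec4 a{{1, 2, 3, 4}};
  const Vec4 b{{10, 20, 30, 40}};
  const Vec4 r = add32_scalar(a, b);
  std::printf("%d %d %d %d\n", r.I32[0], r.I32[1], r.I32[2], r.I32[3]); // prints 11 22 33 44
  return 0;
}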
ALWAYS_INLINE GSVector4i(s32 x, s32 y) { *this = load(x).upl32(load(y)); } + + ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) + { + I16[0] = s0; + I16[1] = s1; + I16[2] = s2; + I16[3] = s3; + I16[4] = s4; + I16[5] = s5; + I16[6] = s6; + I16[7] = s7; + } + + ALWAYS_INLINE constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, + s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) + : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} + { + } + + ALWAYS_INLINE GSVector4i(const GSVector4i& v) { std::memcpy(I32, v.I32, sizeof(I32)); } + + ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) + { + x = v.x; + y = v.y; + z = 0; + w = 0; + } + + // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), + // so leave the non-constexpr version default + ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; } + + ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true); + + ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); + + ALWAYS_INLINE void operator=(const GSVector4i& v) { std::memcpy(I32, v.I32, sizeof(I32)); } + ALWAYS_INLINE void operator=(s32 i) + { + x = i; + y = i; + z = i; + w = i; + } + + // rect + + ALWAYS_INLINE s32 width() const { return right - left; } + + ALWAYS_INLINE s32 height() const { return bottom - top; } + + ALWAYS_INLINE GSVector4i rsize() const + { + return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height()); + } + + ALWAYS_INLINE s32 rarea() const { return width() * height(); } + + ALWAYS_INLINE bool rempty() const { return lt32(zwzw()).mask() != 0x00ff; } + + // TODO: Optimize for no-simd, this generates crap code. + ALWAYS_INLINE GSVector4i runion(const GSVector4i& v) const { return min_i32(v).upl64(max_i32(v).srl<8>()); } + + ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& v) const { return sat_i32(v); } + ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); } + ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); } + + template + GSVector4i _ralign_helper(const GSVector4i& mask) const + { + GSVector4i v; + + switch (mode) + { + case Align_Inside: + v = add32(mask); + break; + case Align_Outside: + v = add32(mask.zwxy()); + break; + case Align_NegInf: + v = *this; + break; + case Align_PosInf: + v = add32(mask.xyxy()); + break; + + default: + UnreachableCode(); + break; + } + + return v.andnot(mask.xyxy()); + } + + /// Align the rect using mask values that already have one subtracted (1 << n - 1 aligns to 1 << n) + template + GSVector4i ralign_presub(const GSVector2i& v) const + { + return _ralign_helper(GSVector4i(v)); + } + + template + GSVector4i ralign(const GSVector2i& v) const + { + // a must be 1 << n + + return _ralign_helper(GSVector4i(v).sub32(GSVector4i(1, 1))); + } + + // + + ALWAYS_INLINE u32 rgba32() const + { + GSVector4i v = *this; + + v = v.ps32(v); + v = v.pu16(v); + + return (u32)store(v); + } + + ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& min, const GSVector4i& max) const + { + return max_i8(min).min_i8(max); + } + ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& minmax) const + { + return max_i8(minmax.xyxy()).min_i8(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& min, const GSVector4i& max) const + { + return max_i16(min).min_i16(max); + } + ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& minmax) const + { + return 
max_i16(minmax.xyxy()).min_i16(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& min, const GSVector4i& max) const + { + return max_i32(min).min_i32(max); + } + ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& minmax) const + { + return max_i32(minmax.xyxy()).min_i32(minmax.zwzw()); + } + + ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const + { + return max_u8(min).min_u8(max); + } + ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const + { + return max_u8(minmax.xyxy()).min_u8(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const + { + return max_u16(min).min_u16(max); + } + ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const + { + return max_u16(minmax.xyxy()).min_u16(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const + { + return max_u32(min).min_u32(max); + } + ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const + { + return max_u32(minmax.xyxy()).min_u32(minmax.zwzw()); + } + + GSVector4i min_i8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = std::min(I8[i], v.I8[i])); } + GSVector4i max_i8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = std::max(I8[i], v.I8[i])); } + GSVector4i min_i16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = std::min(I16[i], v.I16[i])); } + GSVector4i max_i16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = std::max(I16[i], v.I16[i])); } + GSVector4i min_i32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = std::min(I32[i], v.I32[i])); } + GSVector4i max_i32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = std::max(I32[i], v.I32[i])); } + + GSVector4i min_u8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = std::min(U8[i], v.U8[i])); } + GSVector4i max_u8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = std::max(U8[i], v.U8[i])); } + GSVector4i min_u16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = std::min(U16[i], v.U16[i])); } + GSVector4i max_u16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = std::max(U16[i], v.U16[i])); } + GSVector4i min_u32(const GSVector4i& v) const { ALL_LANES_32(ret.U32[i] = std::min(U32[i], v.U32[i])); } + GSVector4i max_u32(const GSVector4i& v) const { ALL_LANES_32(ret.U32[i] = std::max(U32[i], v.U32[i])); } + + GSVector4i madd_s16(const GSVector4i& v) const + { + ALL_LANES_32(ret.I32[i] = (I16[i * 2] * v.I16[i * 2]) + (I16[i * 2 + 1] * v.I16[i * 2 + 1])); + } + + GSVector4i addp_s32() const { return GSVector4i(x + y, z + w, 0, 0); } + + s32 minv_s32() const { return std::min(x, std::min(y, std::min(z, w))); } + + u32 minv_u32() const { return std::min(U32[0], std::min(U32[1], std::min(U32[2], U32[3]))); } + + s32 maxv_s32() const { return std::max(x, std::max(y, std::max(z, w))); } + + u32 maxv_u32() const { return std::max(U32[0], std::max(U32[1], std::max(U32[2], U32[3]))); } + + static s32 min_i16(s32 a, s32 b) { return store(load(a).min_i16(load(b))); } + + ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); } + + GSVector4i blend8(const GSVector4i& v, const GSVector4i& mask) const + { + GSVector4i ret; + for (size_t i = 0; i < 16; i++) + ret.U8[i] = (mask.U8[i] & 0x80) ? v.U8[i] : U8[i]; + return ret; + } + + template + GSVector4i blend16(const GSVector4i& v) const + { + GSVector4i ret; + for (size_t i = 0; i < 8; i++) + ret.U16[i] = ((mask & (1 << i)) != 0) ? 
v.U16[i] : U16[i]; + return ret; + } + + template + GSVector4i blend32(const GSVector4i& v) const + { + GSVector4i ret; + for (size_t i = 0; i < 4; i++) + ret.U32[i] = ((mask & (1 << i)) != 0) ? v.U32[i] : U32[i]; + return ret; + } + + GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const + { + GSVector4i ret; + for (size_t i = 0; i < 2; i++) + ret.U64[0] = (v.U64[i] & mask.U64[i]) | (U64[i] & ~mask.U64[i]); + return ret; + } + + ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); } + + GSVector4i shuffle8(const GSVector4i& mask) const + { + ALL_LANES_8(ret.I8[i] = (mask.I8[i] & 0x80) ? 0 : (I8[mask.I8[i] & 0xf])); + } + + GSVector4i ps16(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8((i < 8) ? I16[i] : v.I16[i])); } + GSVector4i ps16() const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I16[i])); } + GSVector4i pu16(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE16((i < 8) ? U16[i] : v.U16[i])); } + GSVector4i pu16() const { ALL_LANES_8(ret.U8[i] = USATURATE8(U16[i])); } + GSVector4i ps32(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = SSATURATE16((i < 8) ? I32[i] : v.I32[i])); } + GSVector4i ps32() const { ALL_LANES_16(ret.I16[i] = SSATURATE8(I32[i])); } + GSVector4i pu32(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16((i < 8) ? U32[i] : v.U32[i])); } + GSVector4i pu32() const { ALL_LANES_16(ret.U16[i] = USATURATE8(U32[i])); } + + GSVector4i upl8(const GSVector4i& v) const + { + return GSVector4i(I8[0], v.I8[0], I8[1], v.I8[1], I8[2], v.I8[2], I8[3], v.I8[3], I8[4], v.I8[4], I8[5], v.I8[5], + I8[6], v.I8[6], I8[7], v.I8[7]); + } + GSVector4i uph8(const GSVector4i& v) const + { + return GSVector4i(I8[8], v.I8[8], I8[9], v.I8[9], I8[10], v.I8[10], I8[11], v.I8[11], I8[12], v.I8[12], I8[13], + v.I8[13], I8[14], v.I8[14], I8[15], v.I8[15]); + } + GSVector4i upl16(const GSVector4i& v) const + { + return GSVector4i(I16[0], v.I16[0], I16[1], v.I16[1], I16[2], v.I16[2], I16[3], v.I16[3]); + } + GSVector4i uph16(const GSVector4i& v) const + { + return GSVector4i(I16[4], v.I16[4], I16[5], v.I16[5], I16[6], v.I16[6], I16[7], v.I16[7]); + } + GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(I32[0], v.I32[0], I32[1], v.I32[1]); } + GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(I32[2], v.I32[2], I32[3], v.I32[3]); } + GSVector4i upl64(const GSVector4i& v) const + { + GSVector4i ret; + ret.I64[0] = I64[0]; + ret.I64[1] = v.I64[0]; + return ret; + } + GSVector4i uph64(const GSVector4i& v) const + { + GSVector4i ret; + ret.I64[0] = I64[1]; + ret.I64[1] = v.I64[1]; + return ret; + } + + GSVector4i upl8() const + { + return GSVector4i(I8[0], 0, I8[1], 0, I8[2], 0, I8[3], 0, I8[4], 0, I8[5], 0, I8[6], 0, I8[7], 0); + } + GSVector4i uph8() const + { + return GSVector4i(I8[8], 0, I8[9], 0, I8[10], 0, I8[11], 0, I8[12], 0, I8[13], 0, I8[14], 0, I8[15], 0); + } + + GSVector4i upl16() const { return GSVector4i(I16[0], 0, I16[1], 0, I16[2], 0, I16[3], 0); } + GSVector4i uph16() const { return GSVector4i(I16[4], 0, I16[5], 0, I16[6], 0, I16[7], 0); } + + GSVector4i upl32() const { return GSVector4i(I32[0], 0, I32[1], 0); } + GSVector4i uph32() const { return GSVector4i(I32[2], 0, I32[3], 0); } + GSVector4i upl64() const + { + GSVector4i ret; + ret.I64[0] = I64[0]; + ret.I64[1] = 0; + return ret; + } + GSVector4i uph64() const + { + GSVector4i ret; + ret.I64[0] = I64[1]; + ret.I64[1] = 0; + return ret; + } + + GSVector4i i8to16() const { ALL_LANES_16(ret.I16[i] = I8[i]); } + GSVector4i 
i8to32() const { ALL_LANES_32(ret.I32[i] = I8[i]); } + GSVector4i i8to64() const { ALL_LANES_64(ret.I64[i] = I8[i]); } + + GSVector4i i16to32() const { ALL_LANES_32(ret.I32[i] = I16[i]); } + GSVector4i i16to64() const { ALL_LANES_64(ret.I64[i] = I16[i]); } + GSVector4i i32to64() const { ALL_LANES_64(ret.I64[i] = I32[i]); } + GSVector4i u8to16() const { ALL_LANES_64(ret.U16[i] = U8[i]); } + GSVector4i u8to32() const { ALL_LANES_32(ret.U32[i] = U8[i]); } + GSVector4i u8to64() const { ALL_LANES_64(ret.U64[i] = U8[i]); } + GSVector4i u16to32() const { ALL_LANES_32(ret.U32[i] = U16[i]); } + GSVector4i u16to64() const { ALL_LANES_64(ret.U64[i] = U16[i]); } + GSVector4i u32to64() const { ALL_LANES_64(ret.U64[i] = U32[i]); } + + template + GSVector4i srl() const + { + GSVector4i ret = {}; + if constexpr (v < 16) + { + for (s32 i = 0; i < (16 - v); i++) + ret.U8[i] = U8[v + i]; + } + return ret; + } + + template + GSVector4i srl(const GSVector4i& r) + { + // This sucks. Hopefully it's never used. + u8 concat[32]; + std::memcpy(concat, U8, sizeof(u8) * 16); + std::memcpy(concat + 16, r.U8, sizeof(u8) * 16); + + GSVector4i ret; + std::memcpy(ret.U8, &concat[v], sizeof(u8) * 16); + return ret; + } + + template + GSVector4i sll() const + { + GSVector4i ret = {}; + if constexpr (v < 16) + { + for (s32 i = 0; i < (16 - v); i++) + ret.U8[v + i] = U8[i]; + } + return ret; + } + + template + GSVector4i sll16() const + { + ALL_LANES_16(ret.U16[i] = U16[i] << v); + } + + GSVector4i sll16(s32 v) const { ALL_LANES_16(ret.U16[i] = U16[i] << v); } + + GSVector4i sllv16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = U16[i] << v.U16[i]); } + + template + GSVector4i srl16() const + { + ALL_LANES_16(ret.U16[i] = U16[i] >> v); + } + + GSVector4i srl16(s32 v) const { ALL_LANES_16(ret.U16[i] = U16[i] >> v); } + + GSVector4i srlv16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = U16[i] >> v.U16[i]); } + + template + GSVector4i sra16() const + { + ALL_LANES_16(ret.I16[i] = I16[i] >> v); + } + + GSVector4i sra16(s32 v) const { ALL_LANES_16(ret.I16[i] = I16[i] >> v); } + + GSVector4i srav16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] >> v.I16[i]); } + + template + GSVector4i sll32() const + { + ALL_LANES_32(ret.U32[i] = U32[i] << v); + } + + GSVector4i sll32(s32 v) const { ALL_LANES_32(ret.U32[i] = U32[i] << v); } + + GSVector4i sllv32(const GSVector4i& v) const { ALL_LANES_32(ret.U32[i] = U32[i] << v.U32[i]); } + + template + GSVector4i srl32() const + { + ALL_LANES_32(ret.U32[i] = U32[i] >> v); + } + + GSVector4i srl32(s32 v) const { ALL_LANES_32(ret.U32[i] = U32[i] >> v); } + + GSVector4i srlv32(const GSVector4i& v) const { ALL_LANES_32(ret.U32[i] = U32[i] >> v.U32[i]); } + + template + GSVector4i sra32() const + { + ALL_LANES_32(ret.I32[i] = I32[i] >> v); + } + + GSVector4i sra32(s32 v) const { ALL_LANES_32(ret.I32[i] = I32[i] >> v); } + + GSVector4i srav32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] >> v.I32[i]); } + + template + GSVector4i sll64() const + { + ALL_LANES_64(ret.U64[i] = U64[i] << v); + } + + GSVector4i sll64(s32 v) const { ALL_LANES_64(ret.U64[i] = U64[i] << v); } + + GSVector4i sllv64(const GSVector4i& v) const { ALL_LANES_64(ret.U64[i] = U64[i] << v.U64[i]); } + + template + GSVector4i srl64() const + { + ALL_LANES_64(ret.U64[i] = U64[i] >> v); + } + + GSVector4i srl64(s32 v) const { ALL_LANES_64(ret.U64[i] = U64[i] >> v); } + + GSVector4i srlv64(const GSVector4i& v) const { ALL_LANES_64(ret.U64[i] = U64[i] >> v.U64[i]); } + + template + GSVector4i 
sra64() const + { + ALL_LANES_64(ret.I64[i] = I64[i] >> v); + } + + GSVector4i sra64(s32 v) const { ALL_LANES_64(ret.I64[i] = I64[i] >> v); } + + GSVector4i srav64(const GSVector4i& v) const { ALL_LANES_64(ret.I64[i] = I64[i] >> v.I64[i]); } + + GSVector4i add8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = I8[i] + v.I8[i]); } + + GSVector4i add16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] + v.I16[i]); } + + GSVector4i add32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] + v.I32[i]); } + + GSVector4i adds8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I8[i] + v.I8[i])); } + + GSVector4i adds16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I16[i] + v.I16[i])); } + + GSVector4i hadds16(const GSVector4i& v) const + { + return GSVector4i(SSATURATE16(I16[0] + I16[1]), SSATURATE16(I16[2] + I16[3]), SSATURATE16(I16[4] + I16[5]), + SSATURATE16(I16[6] + I16[7]), SSATURATE16(v.I16[0] + v.I16[1]), SSATURATE16(v.I16[2] + v.I16[3]), + SSATURATE16(v.I16[4] + v.I16[5]), SSATURATE16(v.I16[6] + v.I16[7])); + } + + GSVector4i addus8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8(U8[i] + v.U8[i])); } + + GSVector4i addus16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16(U16[i] + v.U16[i])); } + + GSVector4i sub8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = I8[i] - v.I8[i]); } + + GSVector4i sub16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] - v.I16[i]); } + + GSVector4i sub32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] - v.I32[i]); } + + GSVector4i subs8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = SSATURATE8(I8[i] - v.I8[i])); } + + GSVector4i subs16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = SSATURATE16(I16[i] - v.I16[i])); } + + GSVector4i subus8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = USATURATE8(U8[i] - v.U8[i])); } + + GSVector4i subus16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = USATURATE16(U16[i] - v.U16[i])); } + + GSVector4i avg8(const GSVector4i& v) const { ALL_LANES_8(ret.U8[i] = (U8[i] + v.U8[i]) >> 1); } + + GSVector4i avg16(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = (U16[i] + v.U16[i]) >> 1); } + + GSVector4i mul16hs(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] * v.I16[i]) >> 16); } + + GSVector4i mul16hu(const GSVector4i& v) const { ALL_LANES_16(ret.U16[i] = (U16[i] * v.U16[i]) >> 16); } + + GSVector4i mul16l(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = I16[i] * v.I16[i]); } + + GSVector4i mul16hrs(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = ((I16[i] * v.I16[i]) >> 14) + 1); } + + GSVector4i mul32l(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = I32[i] * v.I32[i]); } + + template + ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const + { + // (a - this) * f << shift + this + + return add16(a.sub16(*this).modulate16(f)); + } + + template + ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c) + { + // (a - b) * c << shift + + return a.sub16(b).modulate16(c); + } + + template + ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c, + const GSVector4i& d) + { + // (a - b) * c << shift + d + + return d.add16(a.sub16(b).modulate16(c)); + } + + ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a_, const GSVector4i& f) const + { + // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit) + + return 
add16(a_.sub16(*this).mul16l(f).sra16<4>()); + } + + template + ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const + { + // a * f << shift + if constexpr (shift == 0) + { + return mul16hrs(f); + } + + return sll16().mul16hs(f); + } + + ALWAYS_INLINE bool eq(const GSVector4i& v) const { return (std::memcmp(I32, v.I32, sizeof(I32))) == 0; } + + GSVector4i eq8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] == v.I8[i]) ? -1 : 0); } + GSVector4i eq16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] == v.I16[i]) ? -1 : 0); } + GSVector4i eq32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] == v.I32[i]) ? -1 : 0); } + GSVector4i eq64(const GSVector4i& v) const { ALL_LANES_64(ret.I64[i] = (I64[i] == v.I64[i]) ? -1 : 0); } + + GSVector4i neq8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] != v.I8[i]) ? -1 : 0); } + GSVector4i neq16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] != v.I16[i]) ? -1 : 0); } + GSVector4i neq32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] != v.I32[i]) ? -1 : 0); } + + GSVector4i gt8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] > v.I8[i]) ? -1 : 0); } + GSVector4i gt16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] > v.I16[i]) ? -1 : 0); } + GSVector4i gt32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] > v.I32[i]) ? -1 : 0); } + + GSVector4i ge8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] >= v.I8[i]) ? -1 : 0); } + GSVector4i ge16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] >= v.I16[i]) ? -1 : 0); } + GSVector4i ge32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] >= v.I32[i]) ? -1 : 0); } + + GSVector4i lt8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] < v.I8[i]) ? -1 : 0); } + GSVector4i lt16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] < v.I16[i]) ? -1 : 0); } + GSVector4i lt32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] < v.I32[i]) ? -1 : 0); } + + GSVector4i le8(const GSVector4i& v) const { ALL_LANES_8(ret.I8[i] = (I8[i] <= v.I8[i]) ? -1 : 0); } + GSVector4i le16(const GSVector4i& v) const { ALL_LANES_16(ret.I16[i] = (I16[i] <= v.I16[i]) ? -1 : 0); } + GSVector4i le32(const GSVector4i& v) const { ALL_LANES_32(ret.I32[i] = (I32[i] <= v.I32[i]) ? 
-1 : 0); } + + ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { ALL_LANES_64(ret.U64[i] = (~v.U64[i]) & U64[i]); } + + s32 mask() const + { + return static_cast((static_cast(U8[0] >> 7) << 0) | (static_cast(U8[1] >> 7) << 1) | + (static_cast(U8[2] >> 7) << 2) | (static_cast(U8[3] >> 7) << 3) | + (static_cast(U8[4] >> 7) << 4) | (static_cast(U8[5] >> 7) << 5) | + (static_cast(U8[6] >> 7) << 6) | (static_cast(U8[7] >> 7) << 7) | + (static_cast(U8[8] >> 7) << 8) | (static_cast(U8[9] >> 7) << 9) | + (static_cast(U8[10] >> 7) << 10) | (static_cast(U8[11] >> 7) << 11) | + (static_cast(U8[12] >> 7) << 12) | (static_cast(U8[13] >> 7) << 13) | + (static_cast(U8[14] >> 7) << 14) | (static_cast(U8[15] >> 7) << 15)); + } + + ALWAYS_INLINE bool alltrue() const { return ((U64[0] & U64[1]) == 0xFFFFFFFFFFFFFFFFULL); } + + ALWAYS_INLINE bool allfalse() const { return ((U64[0] | U64[1]) == 0); } + + template + ALWAYS_INLINE GSVector4i insert8(s32 a) const + { + GSVector4i ret = *this; + ret.I8[i] = static_cast(a); + return ret; + } + + template + ALWAYS_INLINE s32 extract8() const + { + return I8[i]; + } + + template + ALWAYS_INLINE GSVector4i insert16(s32 a) const + { + GSVector4i ret = *this; + ret.I16[i] = static_cast(a); + return ret; + } + + template + ALWAYS_INLINE s32 extract16() const + { + return I16[i]; + } + + template + ALWAYS_INLINE GSVector4i insert32(s32 a) const + { + GSVector4i ret = *this; + ret.I32[i] = a; + return ret; + } + + template + ALWAYS_INLINE s32 extract32() const + { + return I32[i]; + } + + template + ALWAYS_INLINE GSVector4i insert64(s64 a) const + { + GSVector4i ret = *this; + ret.I64[i] = a; + return ret; + } + + template + ALWAYS_INLINE s64 extract64() const + { + return I64[i]; + } + + ALWAYS_INLINE static GSVector4i loadnt(const void* p) + { + GSVector4i ret; + std::memcpy(&ret, p, sizeof(ret.I32)); + return ret; + } + + ALWAYS_INLINE static GSVector4i load32(const void* p) + { + GSVector4i ret; + std::memcpy(&ret.x, p, sizeof(s32)); + ret.y = 0; + ret.z = 0; + ret.w = 0; + return ret; + } + + ALWAYS_INLINE static GSVector4i loadl(const void* p) + { + GSVector4i ret; + std::memcpy(&ret.U64[0], p, sizeof(ret.U64[0])); + ret.U64[1] = 0; + return ret; + } + + ALWAYS_INLINE static GSVector4i loadh(const void* p) + { + GSVector4i ret; + ret.U64[0] = 0; + std::memcpy(&ret.U64[1], p, sizeof(ret.U64[1])); + return ret; + } + + ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return loadh(&v); } + + template + ALWAYS_INLINE static GSVector4i load(const void* p) + { + GSVector4i ret; + std::memcpy(ret.I32, p, sizeof(ret.I32)); + return ret; + } + + ALWAYS_INLINE static GSVector4i load(s32 i) + { + GSVector4i ret; + ret.x = i; + ret.y = 0; + ret.z = 0; + ret.w = 0; + return ret; + } + + ALWAYS_INLINE static GSVector4i loadq(s64 i) + { + GSVector4i ret; + ret.I64[0] = i; + ret.I64[1] = 0; + return ret; + } + + ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { std::memcpy(p, v.I32, sizeof(v.I32)); } + + ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { std::memcpy(p, &v.I32[0], sizeof(s32) * 2); } + + ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) { std::memcpy(p, &v.I32[2], sizeof(s32) * 2); } + + ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v) + { + GSVector4i::storel(pl, v); + GSVector4i::storeh(ph, v); + } + + template + ALWAYS_INLINE static void store(void* p, const GSVector4i& v) + { + std::memcpy(p, v.I32, sizeof(I32)); + } + + ALWAYS_INLINE static void store32(void* p, const 
GSVector4i& v) { std::memcpy(p, &v.x, sizeof(s32)); } + + ALWAYS_INLINE static s32 store(const GSVector4i& v) { return v.x; } + + ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return v.I64[0]; } + + ALWAYS_INLINE void operator&=(const GSVector4i& v) + { + U64[0] &= v.U64[0]; + U64[1] &= v.U64[1]; + } + ALWAYS_INLINE void operator|=(const GSVector4i& v) + { + U64[0] |= v.U64[0]; + U64[1] |= v.U64[1]; + } + ALWAYS_INLINE void operator^=(const GSVector4i& v) + { + U64[0] ^= v.U64[0]; + U64[1] ^= v.U64[1]; + } + + ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2) + { + GSVector4i ret; + ret.U64[0] = v1.U64[0] & v2.U64[0]; + ret.U64[1] = v1.U64[1] & v2.U64[1]; + return ret; + } + + ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2) + { + GSVector4i ret; + ret.U64[0] = v1.U64[0] | v2.U64[0]; + ret.U64[1] = v1.U64[1] | v2.U64[1]; + return ret; + } + + ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2) + { + GSVector4i ret; + ret.U64[0] = v1.U64[0] ^ v2.U64[0]; + ret.U64[1] = v1.U64[1] ^ v2.U64[1]; + return ret; + } + + ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, s32 i) { return v & GSVector4i(i); } + + ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, s32 i) { return v | GSVector4i(i); } + + ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, s32 i) { return v ^ GSVector4i(i); } + + ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return v ^ v.eq32(v); } + + ALWAYS_INLINE static constexpr GSVector4i zero() { return GSVector4i::cxpr(0, 0, 0, 0); } + + ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); } + + ALWAYS_INLINE GSVector2i xy() const + { + GSVector2i ret; + storel(&ret, *this); + return ret; + } + + ALWAYS_INLINE GSVector2i zw() const + { + GSVector2i ret; + storeh(&ret, *this); + return ret; + } + + // clang-format off + // l/h/lh not implemented until needed + +#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const {return GSVector4i(I32[xn], I32[yn], I32[zn], I32[wn]);} + +#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ + +#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ + +#define VECTOR4i_SHUFFLE_1(xs, xn) \ + VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \ + VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \ + VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \ + VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \ + + VECTOR4i_SHUFFLE_1(x, 0) + VECTOR4i_SHUFFLE_1(y, 1) + VECTOR4i_SHUFFLE_1(z, 2) + VECTOR4i_SHUFFLE_1(w, 3) + + // clang-format on +}; + +class alignas(16) GSVector4 +{ + struct cxpr_init_tag + { + }; + static constexpr cxpr_init_tag cxpr_init{}; + + constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {} + + constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {} + + constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {} + +public: + union + { + struct + { + float x, y, z, w; + }; + struct + { + float r, g, b, a; + }; + struct + { + float left, top, right, bottom; + }; + float F32[4]; + double F64[2]; + s8 I8[16]; + s16 
I16[8]; + s32 I32[4]; + s64 I64[2]; + u8 U8[16]; + u16 U16[8]; + u32 U32[4]; + u64 U64[2]; + __m128 m; + }; + + GSVector4() = default; + + constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); } + + constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); } + + constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); } + + constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); } + + constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); } + + constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); } + + ALWAYS_INLINE GSVector4(float x, float y, float z, float w) + { + this->x = x; + this->y = y; + this->z = z; + this->w = w; + } + + ALWAYS_INLINE GSVector4(float x, float y) + { + this->x = x; + this->y = y; + this->z = 0.0f; + this->w = 0.0f; + } + + ALWAYS_INLINE GSVector4(int x, int y, int z, int w) + { + this->x = static_cast(x); + this->y = static_cast(y); + this->z = static_cast(z); + this->w = static_cast(w); + } + + ALWAYS_INLINE GSVector4(int x, int y) + { + this->x = static_cast(x); + this->y = static_cast(y); + this->z = 0.0f; + this->w = 0.0f; + } + + ALWAYS_INLINE explicit GSVector4(const GSVector2& v) + { + x = v.x; + y = v.y; + z = 0.0f; + w = 0.0f; + } + + ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) + { + x = static_cast(v.x); + y = static_cast(v.y); + z = 0.0f; + w = 0.0f; + } + + ALWAYS_INLINE explicit GSVector4(float f) { x = y = z = w = f; } + + ALWAYS_INLINE explicit GSVector4(int i) { x = y = z = w = static_cast(i); } + + ALWAYS_INLINE explicit GSVector4(const GSVector4i& v); + + ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v); + + ALWAYS_INLINE static GSVector4 f64(double x, double y) + { + GSVector4 ret; + ret.F64[0] = x; + ret.F64[1] = y; + return ret; + } + + ALWAYS_INLINE void operator=(float f) { x = y = z = w = f; } + + ALWAYS_INLINE GSVector4 noopt() { return *this; } + + u32 rgba32() const { return GSVector4i(*this).rgba32(); } + + ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); } + + ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); } + + GSVector4 abs() const { return GSVector4(std::fabs(x), std::fabs(y), std::fabs(z), std::fabs(w)); } + + GSVector4 neg() const { return GSVector4(-x, -y, -z, -w); } + + GSVector4 rcp() const { return GSVector4(1.0f / x, 1.0f / y, 1.0f / z, 1.0f / w); } + + GSVector4 rcpnr() const + { + GSVector4 v_ = rcp(); + + return (v_ + v_) - (v_ * v_) * *this; + } + + GSVector4 floor() const { return GSVector4(std::floor(x), std::floor(y), std::floor(z), std::floor(w)); } + + GSVector4 ceil() const { return GSVector4(std::ceil(x), std::ceil(y), std::ceil(z), std::ceil(w)); } + + GSVector4 madd(const GSVector4& a_, const GSVector4& b_) const { return *this * a_ + b_; } + + GSVector4 msub(const GSVector4& a_, const GSVector4& b_) const { return *this * a_ - b_; } + + GSVector4 nmadd(const GSVector4& a_, const GSVector4& b_) const { return b_ - *this * a_; } + + GSVector4 nmsub(const GSVector4& a_, const GSVector4& b_) const { return -b_ - *this * a_; } + + GSVector4 addm(const GSVector4& a_, const GSVector4& b_) const + { + return a_.madd(b_, *this); // *this + a * b + } + + GSVector4 subm(const GSVector4& a_, const GSVector4& b_) const + { + return a_.nmadd(b_, *this); // *this - a * b + } + + GSVector4 
hadd() const { return GSVector4(x + y, z + w, x + y, z + w); } + + GSVector4 hadd(const GSVector4& v) const { return GSVector4(x + y, z + w, v.x + v.y, v.z + v.w); } + + GSVector4 hsub() const { return GSVector4(x - y, z - w, x - y, z - w); } + + GSVector4 hsub(const GSVector4& v) const { return GSVector4(x - y, z - w, v.x - v.y, v.z - v.w); } + + template + GSVector4 dp(const GSVector4& v) const + { + float res = 0.0f; + if constexpr (i & 0x10) + res += x * v.x; + if constexpr (i & 0x20) + res += y * v.y; + if constexpr (i & 0x40) + res += z * v.z; + if constexpr (i & 0x80) + res += w * v.w; + return GSVector4((i & 0x01) ? res : 0.0f, (i & 0x02) ? res : 0.0f, (i & 0x04) ? res : 0.0f, + (i & 0x08) ? res : 0.0f); + } + + GSVector4 sat(const GSVector4& min, const GSVector4& max) const + { + return GSVector4(std::clamp(x, min.x, max.x), std::clamp(y, min.y, max.y), std::clamp(z, min.z, max.z), + std::clamp(w, min.w, max.w)); + } + + GSVector4 sat(const GSVector4& v) const + { + return GSVector4(std::clamp(x, v.x, v.z), std::clamp(y, v.y, v.w), std::clamp(z, v.x, v.z), + std::clamp(w, v.y, v.w)); + } + + GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); } + + GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); } + + GSVector4 min(const GSVector4& v) const + { + return GSVector4(std::min(x, v.x), std::min(y, v.y), std::min(z, v.z), std::min(w, v.w)); + } + + GSVector4 max(const GSVector4& v) const + { + return GSVector4(std::max(x, v.x), std::max(y, v.y), std::max(z, v.z), std::max(w, v.w)); + } + + template + GSVector4 blend32(const GSVector4& v) const + { + return GSVector4(v.F32[mask & 1], v.F32[(mask >> 1) & 1], v.F32[(mask >> 2) & 1], v.F32[(mask >> 3) & 1]); + } + + ALWAYS_INLINE GSVector4 blend32(const GSVector4& v, const GSVector4& mask) const + { + return GSVector4((mask.U32[0] & 0x80000000u) ? v.x : x, (mask.U32[1] & 0x80000000u) ? v.y : y, + (mask.U32[2] & 0x80000000u) ? v.z : z, (mask.U32[3] & 0x80000000u) ? 
v.w : w); + } + + GSVector4 upl(const GSVector4& v) const { return GSVector4(x, y, v.x, v.y); } + + GSVector4 uph(const GSVector4& v) const { return GSVector4(z, w, v.z, v.w); } + + GSVector4 upld(const GSVector4& v) const + { + GSVector4 ret; + ret.U64[0] = U64[0]; + ret.U64[1] = v.U64[0]; + return ret; + } + + GSVector4 uphd(const GSVector4& v) const + { + GSVector4 ret; + ret.U64[0] = U64[1]; + ret.U64[1] = v.U64[1]; + return ret; + } + + ALWAYS_INLINE GSVector4 l2h(const GSVector4& v) const { return GSVector4(x, y, v.x, v.y); } + + ALWAYS_INLINE GSVector4 h2l(const GSVector4& v) const { return GSVector4(v.z, v.w, z, w); } + + ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const + { + GSVector4 ret; + ret.U32[0] = ((~v.U32[0]) & U32[0]); + ret.U32[1] = ((~v.U32[1]) & U32[1]); + ret.U32[2] = ((~v.U32[2]) & U32[2]); + ret.U32[3] = ((~v.U32[3]) & U32[3]); + return ret; + } + + ALWAYS_INLINE int mask() const + { + return (U32[0] >> 31) | ((U32[1] >> 30) & 2) | ((U32[2] >> 29) & 4) | ((U32[3] >> 28) & 8); + } + + ALWAYS_INLINE bool alltrue() const { return ((U64[0] & U64[1]) == 0xFFFFFFFFFFFFFFFFULL); } + + ALWAYS_INLINE bool allfalse() const { return ((U64[0] | U64[1]) == 0); } + + ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); } + + template + ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const + { + GSVector4 ret = *this; + ret.F32[dst] = v.F32[src]; + return ret; + } + + template + ALWAYS_INLINE int extract32() const + { + return I32[i]; + } + + ALWAYS_INLINE static constexpr GSVector4 zero() { return GSVector4::cxpr(0.0f, 0.0f, 0.0f, 0.0f); } + + ALWAYS_INLINE static constexpr GSVector4 xffffffff() + { + GSVector4 ret = zero(); + ret.U64[0] = ~ret.U64[0]; + ret.U64[1] = ~ret.U64[1]; + return ret; + } + + ALWAYS_INLINE static GSVector4 loadl(const void* p) + { + GSVector4 ret; + std::memcpy(&ret.x, p, sizeof(float) * 2); + ret.z = 0.0f; + ret.w = 0.0f; + return ret; + } + + ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(f, f, f, f); } + + template + ALWAYS_INLINE static GSVector4 load(const void* p) + { + GSVector4 ret; + std::memcpy(&ret.x, p, sizeof(float) * 4); + return ret; + } + + ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { std::memcpy(p, &v, sizeof(v)); } + + ALWAYS_INLINE static void storel(void* p, const GSVector4& v) { std::memcpy(p, &v.x, sizeof(float) * 2); } + + ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) { std::memcpy(p, &v.z, sizeof(float) * 2); } + + template + ALWAYS_INLINE static void store(void* p, const GSVector4& v) + { + std::memcpy(p, &v.x, sizeof(float)); + } + + ALWAYS_INLINE static void store(float* p, const GSVector4& v) { *p = v.x; } + + ALWAYS_INLINE GSVector4 operator-() const { return neg(); } + + void operator+=(const GSVector4& v_) + { + x = x + v_.x; + y = y + v_.y; + z = z + v_.z; + w = w + v_.w; + } + void operator-=(const GSVector4& v_) + { + x = x - v_.x; + y = y - v_.y; + z = z - v_.z; + w = w - v_.w; + } + void operator*=(const GSVector4& v_) + { + x = x * v_.x; + y = y * v_.y; + z = z * v_.z; + w = w * v_.w; + } + void operator/=(const GSVector4& v_) + { + x = x / v_.x; + y = y / v_.y; + z = z / v_.z; + w = w / v_.w; + } + + void operator+=(const float v_) + { + x = x + v_; + y = y + v_; + z = z + v_; + w = w + v_; + } + void operator-=(const float v_) + { + x = x - v_; + y = y - v_; + z = z - v_; + w = w - v_; + } + void operator*=(const float v_) + { + x = x * v_; + y = y * v_; + z = z * v_; + w = w * v_; + } + void 
operator/=(const float v_) + { + x = x / v_; + y = y / v_; + z = z / v_; + w = w / v_; + } + + void operator&=(const GSVector4& v_) + { + U64[0] &= v_.U64[0]; + U64[1] &= v_.U64[1]; + } + void operator|=(const GSVector4& v_) + { + U64[0] |= v_.U64[0]; + U64[1] |= v_.U64[1]; + } + void operator^=(const GSVector4& v_) + { + U64[0] ^= v_.U64[0]; + U64[1] ^= v_.U64[1]; + } + + friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(v1.x + v2.x, v1.y + v2.y, v1.z + v2.z, v1.w + v2.w); + } + + friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(v1.x - v2.x, v1.y - v2.y, v1.z - v2.z, v1.w - v2.w); + } + + friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(v1.x * v2.x, v1.y * v2.y, v1.z * v2.z, v1.w * v2.w); + } + + friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(v1.x / v2.x, v1.y / v2.y, v1.z / v2.z, v1.w / v2.w); + } + + friend GSVector4 operator+(const GSVector4& v, float f) { return GSVector4(v.x + f, v.y + f, v.z + f, v.w + f); } + + friend GSVector4 operator-(const GSVector4& v, float f) { return GSVector4(v.x - f, v.y - f, v.z - f, v.w - f); } + + friend GSVector4 operator*(const GSVector4& v, float f) { return GSVector4(v.x * f, v.y * f, v.z * f, v.w * f); } + + friend GSVector4 operator/(const GSVector4& v, float f) { return GSVector4(v.x / f, v.y / f, v.z / f, v.w / f); } + + friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2) + { + GSVector4 ret; + ret.U64[0] = v1.U64[0] & v2.U64[0]; + ret.U64[1] = v1.U64[1] & v2.U64[1]; + return ret; + } + + ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2) + { + GSVector4 ret; + ret.U64[0] = v1.U64[0] | v2.U64[0]; + ret.U64[1] = v1.U64[1] | v2.U64[1]; + return ret; + } + + ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2) + { + GSVector4 ret; + ret.U64[0] = v1.U64[0] ^ v2.U64[0]; + ret.U64[1] = v1.U64[1] ^ v2.U64[1]; + return ret; + } + + ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2) + { + GSVector4 ret; + ret.I32[0] = (v1.x == v2.x) ? -1 : 0; + ret.I32[1] = (v1.y == v2.y) ? -1 : 0; + ret.I32[2] = (v1.z == v2.z) ? -1 : 0; + ret.I32[3] = (v1.w == v2.w) ? -1 : 0; + return ret; + } + + ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2) + { + GSVector4 ret; + ret.I32[0] = (v1.x != v2.x) ? -1 : 0; + ret.I32[1] = (v1.y != v2.y) ? -1 : 0; + ret.I32[2] = (v1.z != v2.z) ? -1 : 0; + ret.I32[3] = (v1.w != v2.w) ? -1 : 0; + return ret; + } + + ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2) + { + GSVector4 ret; + ret.I32[0] = (v1.x > v2.x) ? -1 : 0; + ret.I32[1] = (v1.y > v2.y) ? -1 : 0; + ret.I32[2] = (v1.z > v2.z) ? -1 : 0; + ret.I32[3] = (v1.w > v2.w) ? -1 : 0; + return ret; + } + + ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2) + { + GSVector4 ret; + ret.I32[0] = (v1.x < v2.x) ? -1 : 0; + ret.I32[1] = (v1.y < v2.y) ? -1 : 0; + ret.I32[2] = (v1.z < v2.z) ? -1 : 0; + ret.I32[3] = (v1.w < v2.w) ? -1 : 0; + return ret; + } + + ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, const GSVector4& v2) + { + GSVector4 ret; + ret.I32[0] = (v1.x >= v2.x) ? -1 : 0; + ret.I32[1] = (v1.y >= v2.y) ? -1 : 0; + ret.I32[2] = (v1.z >= v2.z) ? -1 : 0; + ret.I32[3] = (v1.w >= v2.w) ? 
-1 : 0; + return ret; + } + + ALWAYS_INLINE friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2) + { + GSVector4 ret; + ret.I32[0] = (v1.x <= v2.x) ? -1 : 0; + ret.I32[1] = (v1.y <= v2.y) ? -1 : 0; + ret.I32[2] = (v1.z <= v2.z) ? -1 : 0; + ret.I32[3] = (v1.w <= v2.w) ? -1 : 0; + return ret; + } + + ALWAYS_INLINE GSVector4 mul64(const GSVector4& v_) const + { + GSVector4 ret; + ret.F64[0] = F64[0] * v_.F64[0]; + ret.F64[1] = F64[1] * v_.F64[1]; + return ret; + } + + ALWAYS_INLINE GSVector4 add64(const GSVector4& v_) const + { + GSVector4 ret; + ret.F64[0] = F64[0] + v_.F64[0]; + ret.F64[1] = F64[1] + v_.F64[1]; + return ret; + } + + ALWAYS_INLINE GSVector4 sub64(const GSVector4& v_) const + { + GSVector4 ret; + ret.F64[0] = F64[0] - v_.F64[0]; + ret.F64[1] = F64[1] - v_.F64[1]; + return ret; + } + + ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_) + { + GSVector4 ret; + ret.F64[0] = v_.x; + ret.F64[1] = v_.y; + return ret; + } + + ALWAYS_INLINE static GSVector4 f32to64(const void* p) + { + float f[2]; + std::memcpy(f, p, sizeof(f)); + GSVector4 ret; + ret.F64[0] = f[0]; + ret.F64[1] = f[1]; + return ret; + } + + ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const + { + return GSVector4i(static_cast(F64[0]), static_cast(F64[1]), 0, 0); + } + + // clang-format off + +#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(F32[xn], F32[yn], F32[zn], F32[wn]); } \ + ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const { return GSVector4(F32[xn], F32[yn], v_.F32[zn], v_.F32[wn]); } + +#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ + +#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ + +#define VECTOR4_SHUFFLE_1(xs, xn) \ + VECTOR4_SHUFFLE_2(xs, xn, x, 0) \ + VECTOR4_SHUFFLE_2(xs, xn, y, 1) \ + VECTOR4_SHUFFLE_2(xs, xn, z, 2) \ + VECTOR4_SHUFFLE_2(xs, xn, w, 3) \ + + VECTOR4_SHUFFLE_1(x, 0) + VECTOR4_SHUFFLE_1(y, 1) + VECTOR4_SHUFFLE_1(z, 2) + VECTOR4_SHUFFLE_1(w, 3) + + // clang-format on + + ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(x, x, x, x); } + + ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v) { return GSVector4(v.x, v.x, v.x, v.x); } + + ALWAYS_INLINE static GSVector4 broadcast32(const void* f) + { + float ff; + std::memcpy(&ff, f, sizeof(ff)); + return GSVector4(ff, ff, ff, ff); + } + + ALWAYS_INLINE static GSVector4 broadcast64(const void* d) + { + GSVector4 ret; + std::memcpy(&ret.F64[0], d, sizeof(ret.F64[0])); + ret.F64[1] = ret.F64[0]; + return ret; + } +}; + +ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate) +{ + // TODO: Truncation vs rounding... 
+ x = static_cast(v.x); + y = static_cast(v.y); + z = static_cast(v.z); + w = static_cast(v.w); +} + +ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v) +{ + x = static_cast(v.x); + y = static_cast(v.y); + z = static_cast(v.z); + w = static_cast(v.w); +} + +ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v) +{ + GSVector4i ret; + std::memcpy(&ret, &v, sizeof(ret)); + return ret; +} + +ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v) +{ + GSVector4 ret; + std::memcpy(&ret, &v, sizeof(ret)); + return ret; +} + +#undef SSATURATE8 +#undef USATURATE8 +#undef SSATURATE16 +#undef USATURATE16 +#undef ALL_LANES_8 +#undef ALL_LANES_16 +#undef ALL_LANES_32 +#undef ALL_LANES_64 diff --git a/src/common/gsvector_sse.h b/src/common/gsvector_sse.h new file mode 100644 index 000000000..9975fbae8 --- /dev/null +++ b/src/common/gsvector_sse.h @@ -0,0 +1,1322 @@ +// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team, 2019-2024 Connor McLaughlin +// SPDX-License-Identifier: LGPL-3.0+ + +#pragma once + +#include "common/assert.h" +#include "common/intrin.h" +#include "common/types.h" + +#include + +#ifdef CPU_ARCH_AVX2 +#define GSVECTOR_HAS_UNSIGNED 1 +#define GSVECTOR_HAS_SRLV 1 +#endif + +class GSVector4; + +class alignas(16) GSVector4i +{ + struct cxpr_init_tag + { + }; + static constexpr cxpr_init_tag cxpr_init{}; + + constexpr GSVector4i(cxpr_init_tag, s32 x, s32 y, s32 z, s32 w) : I32{x, y, z, w} {} + + constexpr GSVector4i(cxpr_init_tag, s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) + : I16{s0, s1, s2, s3, s4, s5, s6, s7} + { + } + + constexpr GSVector4i(cxpr_init_tag, s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, + s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) + : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} + { + } + +public: + union + { + struct + { + s32 x, y, z, w; + }; + struct + { + s32 r, g, b, a; + }; + struct + { + s32 left, top, right, bottom; + }; + float F32[4]; + s8 I8[16]; + s16 I16[8]; + s32 I32[4]; + s64 I64[2]; + u8 U8[16]; + u16 U16[8]; + u32 U32[4]; + u64 U64[2]; + __m128i m; + }; + + GSVector4i() = default; + + ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x, s32 y, s32 z, s32 w) + { + return GSVector4i(cxpr_init, x, y, z, w); + } + + ALWAYS_INLINE constexpr static GSVector4i cxpr(s32 x) { return GSVector4i(cxpr_init, x, x, x, x); } + + ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 x) { return GSVector4i(cxpr_init, x, x, x, x, x, x, x, x); } + + ALWAYS_INLINE constexpr static GSVector4i cxpr16(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) + { + return GSVector4i(cxpr_init, s0, s1, s2, s3, s4, s5, s6, s7); + } + + ALWAYS_INLINE constexpr static GSVector4i cxpr8(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, + s8 b10, s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) + { + return GSVector4i(cxpr_init, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15); + } + + ALWAYS_INLINE GSVector4i(s32 x, s32 y, s32 z, s32 w) { m = _mm_set_epi32(w, z, y, x); } + + ALWAYS_INLINE GSVector4i(s32 x, s32 y) { *this = load(x).upl32(load(y)); } + + ALWAYS_INLINE GSVector4i(s16 s0, s16 s1, s16 s2, s16 s3, s16 s4, s16 s5, s16 s6, s16 s7) + { + m = _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0); + } + + ALWAYS_INLINE constexpr GSVector4i(s8 b0, s8 b1, s8 b2, s8 b3, s8 b4, s8 b5, s8 b6, s8 b7, s8 b8, s8 b9, s8 b10, + s8 b11, s8 b12, s8 b13, s8 b14, s8 b15) + : I8{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} + { + } + + 
ALWAYS_INLINE GSVector4i(const GSVector4i& v) { m = v.m; } + + ALWAYS_INLINE explicit GSVector4i(const GSVector2i& v) { m = _mm_loadl_epi64((__m128i*)&v); } + + // MSVC has bad codegen for the constexpr version when applied to non-constexpr things (https://godbolt.org/z/h8qbn7), + // so leave the non-constexpr version default + ALWAYS_INLINE explicit GSVector4i(s32 i) { *this = i; } + + ALWAYS_INLINE explicit GSVector4i(const GSVector4& v, bool truncate = true); + + ALWAYS_INLINE static GSVector4i cast(const GSVector4& v); + + ALWAYS_INLINE constexpr explicit GSVector4i(__m128i m) : m(m) {} + + ALWAYS_INLINE void operator=(const GSVector4i& v) { m = v.m; } + ALWAYS_INLINE void operator=(s32 i) { m = _mm_set1_epi32(i); } + ALWAYS_INLINE void operator=(__m128i m_) { m = m_; } + + ALWAYS_INLINE operator __m128i() const { return m; } + + // rect + + ALWAYS_INLINE s32 width() const { return right - left; } + + ALWAYS_INLINE s32 height() const { return bottom - top; } + + ALWAYS_INLINE GSVector4i rsize() const + { + return sub32(xyxy()); // same as GSVector4i(0, 0, width(), height()); + } + + ALWAYS_INLINE s32 rarea() const { return width() * height(); } + + ALWAYS_INLINE bool rempty() const { return lt32(zwzw()).mask() != 0x00ff; } + + ALWAYS_INLINE GSVector4i runion(const GSVector4i& v) const { return min_i32(v).upl64(max_i32(v).srl<8>()); } + + ALWAYS_INLINE GSVector4i rintersect(const GSVector4i& v) const { return sat_i32(v); } + ALWAYS_INLINE bool rintersects(const GSVector4i& v) const { return !rintersect(v).rempty(); } + ALWAYS_INLINE bool rcontains(const GSVector4i& v) const { return rintersect(v).eq(v); } + + template + GSVector4i _ralign_helper(const GSVector4i& mask) const + { + GSVector4i v; + + switch (mode) + { + case Align_Inside: + v = add32(mask); + break; + case Align_Outside: + v = add32(mask.zwxy()); + break; + case Align_NegInf: + v = *this; + break; + case Align_PosInf: + v = add32(mask.xyxy()); + break; + + default: + UnreachableCode(); + break; + } + + return v.andnot(mask.xyxy()); + } + + /// Align the rect using mask values that already have one subtracted (1 << n - 1 aligns to 1 << n) + template + GSVector4i ralign_presub(const GSVector2i& v) const + { + return _ralign_helper(GSVector4i(v)); + } + + template + GSVector4i ralign(const GSVector2i& v) const + { + // a must be 1 << n + + return _ralign_helper(GSVector4i(v).sub32(GSVector4i(1, 1))); + } + + // + + ALWAYS_INLINE u32 rgba32() const + { + GSVector4i v = *this; + + v = v.ps32(v); + v = v.pu16(v); + + return (u32)store(v); + } + + ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& min, const GSVector4i& max) const + { + return max_i8(min).min_i8(max); + } + ALWAYS_INLINE GSVector4i sat_i8(const GSVector4i& minmax) const + { + return max_i8(minmax.xyxy()).min_i8(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& min, const GSVector4i& max) const + { + return max_i16(min).min_i16(max); + } + ALWAYS_INLINE GSVector4i sat_i16(const GSVector4i& minmax) const + { + return max_i16(minmax.xyxy()).min_i16(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& min, const GSVector4i& max) const + { + return max_i32(min).min_i32(max); + } + ALWAYS_INLINE GSVector4i sat_i32(const GSVector4i& minmax) const + { + return max_i32(minmax.xyxy()).min_i32(minmax.zwzw()); + } + + ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& min, const GSVector4i& max) const + { + return max_u8(min).min_u8(max); + } + ALWAYS_INLINE GSVector4i sat_u8(const GSVector4i& minmax) const + { + return 
max_u8(minmax.xyxy()).min_u8(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& min, const GSVector4i& max) const + { + return max_u16(min).min_u16(max); + } + ALWAYS_INLINE GSVector4i sat_u16(const GSVector4i& minmax) const + { + return max_u16(minmax.xyxy()).min_u16(minmax.zwzw()); + } + ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& min, const GSVector4i& max) const + { + return max_u32(min).min_u32(max); + } + ALWAYS_INLINE GSVector4i sat_u32(const GSVector4i& minmax) const + { + return max_u32(minmax.xyxy()).min_u32(minmax.zwzw()); + } + + ALWAYS_INLINE GSVector4i min_i8(const GSVector4i& v) const { return GSVector4i(_mm_min_epi8(m, v)); } + ALWAYS_INLINE GSVector4i max_i8(const GSVector4i& v) const { return GSVector4i(_mm_max_epi8(m, v)); } + ALWAYS_INLINE GSVector4i min_i16(const GSVector4i& v) const { return GSVector4i(_mm_min_epi16(m, v)); } + ALWAYS_INLINE GSVector4i max_i16(const GSVector4i& v) const { return GSVector4i(_mm_max_epi16(m, v)); } + ALWAYS_INLINE GSVector4i min_i32(const GSVector4i& v) const { return GSVector4i(_mm_min_epi32(m, v)); } + ALWAYS_INLINE GSVector4i max_i32(const GSVector4i& v) const { return GSVector4i(_mm_max_epi32(m, v)); } + + ALWAYS_INLINE GSVector4i min_u8(const GSVector4i& v) const { return GSVector4i(_mm_min_epu8(m, v)); } + ALWAYS_INLINE GSVector4i max_u8(const GSVector4i& v) const { return GSVector4i(_mm_max_epu8(m, v)); } + ALWAYS_INLINE GSVector4i min_u16(const GSVector4i& v) const { return GSVector4i(_mm_min_epu16(m, v)); } + ALWAYS_INLINE GSVector4i max_u16(const GSVector4i& v) const { return GSVector4i(_mm_max_epu16(m, v)); } + ALWAYS_INLINE GSVector4i min_u32(const GSVector4i& v) const { return GSVector4i(_mm_min_epu32(m, v)); } + ALWAYS_INLINE GSVector4i max_u32(const GSVector4i& v) const { return GSVector4i(_mm_max_epu32(m, v)); } + + ALWAYS_INLINE GSVector4i madd_s16(const GSVector4i& v) const { return GSVector4i(_mm_madd_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector4i addp_s32() const { return GSVector4i(_mm_hadd_epi32(m, m)); } + + ALWAYS_INLINE s32 minv_s32() const + { + const __m128i vmin = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); + return std::min(_mm_extract_epi32(vmin, 0), _mm_extract_epi32(vmin, 1)); + } + + ALWAYS_INLINE u32 minv_u32() const + { + const __m128i vmin = _mm_min_epu32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); + return std::min(_mm_extract_epi32(vmin, 0), _mm_extract_epi32(vmin, 1)); + } + + ALWAYS_INLINE s32 maxv_s32() const + { + const __m128i vmax = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); + return std::max(_mm_extract_epi32(vmax, 0), _mm_extract_epi32(vmax, 1)); + } + + ALWAYS_INLINE u32 maxv_u32() const + { + const __m128i vmax = _mm_max_epu32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(3, 2, 3, 2))); + return std::max(_mm_extract_epi32(vmax, 0), _mm_extract_epi32(vmax, 1)); + } + + ALWAYS_INLINE GSVector4i clamp8() const { return pu16().upl8(); } + + ALWAYS_INLINE GSVector4i blend8(const GSVector4i& v, const GSVector4i& mask) const + { + return GSVector4i(_mm_blendv_epi8(m, v, mask)); + } + + template + ALWAYS_INLINE GSVector4i blend16(const GSVector4i& v) const + { + return GSVector4i(_mm_blend_epi16(m, v, mask)); + } + + template + ALWAYS_INLINE GSVector4i blend32(const GSVector4i& v) const + { +#if defined(__AVX2__) + return GSVector4i(_mm_blend_epi32(m, v.m, mask)); +#else + constexpr s32 bit3 = ((mask & 8) * 3) << 3; + constexpr s32 bit2 = ((mask & 4) * 3) << 2; + constexpr s32 bit1 = ((mask & 2) * 3) << 1; + constexpr s32 bit0 = 
(mask & 1) * 3; + return blend16(v); +#endif + } + + ALWAYS_INLINE GSVector4i blend(const GSVector4i& v, const GSVector4i& mask) const + { + return GSVector4i(_mm_or_si128(_mm_andnot_si128(mask, m), _mm_and_si128(mask, v))); + } + + ALWAYS_INLINE GSVector4i mix16(const GSVector4i& v) const { return blend16<0xaa>(v); } + + ALWAYS_INLINE GSVector4i shuffle8(const GSVector4i& mask) const { return GSVector4i(_mm_shuffle_epi8(m, mask)); } + + ALWAYS_INLINE GSVector4i ps16(const GSVector4i& v) const { return GSVector4i(_mm_packs_epi16(m, v)); } + ALWAYS_INLINE GSVector4i ps16() const { return GSVector4i(_mm_packs_epi16(m, m)); } + ALWAYS_INLINE GSVector4i pu16(const GSVector4i& v) const { return GSVector4i(_mm_packus_epi16(m, v)); } + ALWAYS_INLINE GSVector4i pu16() const { return GSVector4i(_mm_packus_epi16(m, m)); } + ALWAYS_INLINE GSVector4i ps32(const GSVector4i& v) const { return GSVector4i(_mm_packs_epi32(m, v)); } + ALWAYS_INLINE GSVector4i ps32() const { return GSVector4i(_mm_packs_epi32(m, m)); } + ALWAYS_INLINE GSVector4i pu32(const GSVector4i& v) const { return GSVector4i(_mm_packus_epi32(m, v)); } + ALWAYS_INLINE GSVector4i pu32() const { return GSVector4i(_mm_packus_epi32(m, m)); } + + ALWAYS_INLINE GSVector4i upl8(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi8(m, v)); } + ALWAYS_INLINE GSVector4i uph8(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi8(m, v)); } + ALWAYS_INLINE GSVector4i upl16(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi16(m, v)); } + ALWAYS_INLINE GSVector4i uph16(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi16(m, v)); } + ALWAYS_INLINE GSVector4i upl32(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi32(m, v)); } + ALWAYS_INLINE GSVector4i uph32(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi32(m, v)); } + ALWAYS_INLINE GSVector4i upl64(const GSVector4i& v) const { return GSVector4i(_mm_unpacklo_epi64(m, v)); } + ALWAYS_INLINE GSVector4i uph64(const GSVector4i& v) const { return GSVector4i(_mm_unpackhi_epi64(m, v)); } + + ALWAYS_INLINE GSVector4i upl8() const { return GSVector4i(_mm_unpacklo_epi8(m, _mm_setzero_si128())); } + ALWAYS_INLINE GSVector4i uph8() const { return GSVector4i(_mm_unpackhi_epi8(m, _mm_setzero_si128())); } + + ALWAYS_INLINE GSVector4i upl16() const { return GSVector4i(_mm_unpacklo_epi16(m, _mm_setzero_si128())); } + ALWAYS_INLINE GSVector4i uph16() const { return GSVector4i(_mm_unpackhi_epi16(m, _mm_setzero_si128())); } + + ALWAYS_INLINE GSVector4i upl32() const { return GSVector4i(_mm_unpacklo_epi32(m, _mm_setzero_si128())); } + + ALWAYS_INLINE GSVector4i uph32() const { return GSVector4i(_mm_unpackhi_epi32(m, _mm_setzero_si128())); } + ALWAYS_INLINE GSVector4i upl64() const { return GSVector4i(_mm_unpacklo_epi64(m, _mm_setzero_si128())); } + ALWAYS_INLINE GSVector4i uph64() const { return GSVector4i(_mm_unpackhi_epi64(m, _mm_setzero_si128())); } + + ALWAYS_INLINE GSVector4i i8to16() const { return GSVector4i(_mm_cvtepi8_epi16(m)); } + ALWAYS_INLINE GSVector4i i8to32() const { return GSVector4i(_mm_cvtepi8_epi32(m)); } + ALWAYS_INLINE GSVector4i i8to64() const { return GSVector4i(_mm_cvtepi8_epi64(m)); } + +#ifdef CPU_ARCH_SSE41 + ALWAYS_INLINE GSVector4i i16to32() const { return GSVector4i(_mm_cvtepi16_epi32(m)); } + ALWAYS_INLINE GSVector4i i16to64() const { return GSVector4i(_mm_cvtepi16_epi64(m)); } + ALWAYS_INLINE GSVector4i i32to64() const { return GSVector4i(_mm_cvtepi32_epi64(m)); } + ALWAYS_INLINE GSVector4i u8to16() const { return 
GSVector4i(_mm_cvtepu8_epi16(m)); } + ALWAYS_INLINE GSVector4i u8to32() const { return GSVector4i(_mm_cvtepu8_epi32(m)); } + ALWAYS_INLINE GSVector4i u8to64() const { return GSVector4i(_mm_cvtepu16_epi64(m)); } + ALWAYS_INLINE GSVector4i u16to32() const { return GSVector4i(_mm_cvtepu16_epi32(m)); } + ALWAYS_INLINE GSVector4i u16to64() const { return GSVector4i(_mm_cvtepu16_epi64(m)); } + ALWAYS_INLINE GSVector4i u32to64() const { return GSVector4i(_mm_cvtepu32_epi64(m)); } +#endif + + template + ALWAYS_INLINE GSVector4i srl() const + { + return GSVector4i(_mm_srli_si128(m, i)); + } + + template + ALWAYS_INLINE GSVector4i srl(const GSVector4i& v) + { + return GSVector4i(_mm_alignr_epi8(v.m, m, i)); + } + + template + ALWAYS_INLINE GSVector4i sll() const + { + return GSVector4i(_mm_slli_si128(m, i)); + } + + template + ALWAYS_INLINE GSVector4i sll16() const + { + return GSVector4i(_mm_slli_epi16(m, i)); + } + + ALWAYS_INLINE GSVector4i sll16(s32 i) const { return GSVector4i(_mm_sll_epi16(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector4i sllv16(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi16(m, v.m)); } +#endif + + template + ALWAYS_INLINE GSVector4i srl16() const + { + return GSVector4i(_mm_srli_epi16(m, i)); + } + + ALWAYS_INLINE GSVector4i srl16(s32 i) const { return GSVector4i(_mm_srl_epi16(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector4i srlv16(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi16(m, v.m)); } +#endif + + template + ALWAYS_INLINE GSVector4i sra16() const + { + return GSVector4i(_mm_srai_epi16(m, i)); + } + + ALWAYS_INLINE GSVector4i sra16(s32 i) const { return GSVector4i(_mm_sra_epi16(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector4i srav16(const GSVector4i& v) const { return GSVector4i(_mm_srav_epi16(m, v.m)); } +#endif + + template + ALWAYS_INLINE GSVector4i sll32() const + { + return GSVector4i(_mm_slli_epi32(m, i)); + } + + ALWAYS_INLINE GSVector4i sll32(s32 i) const { return GSVector4i(_mm_sll_epi32(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector4i sllv32(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi32(m, v.m)); } +#endif + + template + ALWAYS_INLINE GSVector4i srl32() const + { + return GSVector4i(_mm_srli_epi32(m, i)); + } + + ALWAYS_INLINE GSVector4i srl32(s32 i) const { return GSVector4i(_mm_srl_epi32(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector4i srlv32(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi32(m, v.m)); } +#endif + + template + ALWAYS_INLINE GSVector4i sra32() const + { + return GSVector4i(_mm_srai_epi32(m, i)); + } + + ALWAYS_INLINE GSVector4i sra32(s32 i) const { return GSVector4i(_mm_sra_epi32(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector4i srav32(const GSVector4i& v) const { return GSVector4i(_mm_srav_epi32(m, v.m)); } +#endif + + template + ALWAYS_INLINE GSVector4i sll64() const + { + return GSVector4i(_mm_slli_epi64(m, i)); + } + + ALWAYS_INLINE GSVector4i sll64(s32 i) const { return GSVector4i(_mm_sll_epi64(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector4i sllv64(const GSVector4i& v) const { return GSVector4i(_mm_sllv_epi64(m, v.m)); } +#endif + + template + ALWAYS_INLINE GSVector4i srl64() const + { + return GSVector4i(_mm_srli_epi64(m, i)); + } + + ALWAYS_INLINE GSVector4i srl64(s32 i) const { return GSVector4i(_mm_srl_epi64(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + 
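A note on the CPU_ARCH_AVX2-guarded sllv/srlv/srav helpers in these blocks (editor's illustration, not part of the patch): unlike sll16/srl32/sra16 above, which shift every lane by the same count, the v-suffixed forms shift each lane by the count held in the matching lane of the argument. A minimal sketch, assuming small non-negative per-lane counts:

  // Per-lane variable shift, available only on the AVX2 path (CPU_ARCH_AVX2 defined).
  const GSVector4i values = GSVector4i(0x100, 0x100, 0x100, 0x100);
  const GSVector4i counts = GSVector4i(0, 1, 2, 3);
  const GSVector4i result = values.srlv32(counts); // {0x100, 0x80, 0x40, 0x20}
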
ALWAYS_INLINE GSVector4i srlv64(const GSVector4i& v) const { return GSVector4i(_mm_srlv_epi64(m, v.m)); } +#endif + + template + ALWAYS_INLINE GSVector4i sra64() const + { + return GSVector4i(_mm_srai_epi64(m, i)); + } + + ALWAYS_INLINE GSVector4i sra64(s32 i) const { return GSVector4i(_mm_sra_epi64(m, _mm_cvtsi32_si128(i))); } + +#ifdef CPU_ARCH_AVX2 + ALWAYS_INLINE GSVector4i srav64(const GSVector4i& v) const { return GSVector4i(_mm_srav_epi64(m, v.m)); } +#endif + + ALWAYS_INLINE GSVector4i add8(const GSVector4i& v) const { return GSVector4i(_mm_add_epi8(m, v.m)); } + + ALWAYS_INLINE GSVector4i add16(const GSVector4i& v) const { return GSVector4i(_mm_add_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector4i add32(const GSVector4i& v) const { return GSVector4i(_mm_add_epi32(m, v.m)); } + + ALWAYS_INLINE GSVector4i adds8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi8(m, v.m)); } + + ALWAYS_INLINE GSVector4i adds16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector4i hadds16(const GSVector4i& v) const { return GSVector4i(_mm_hadds_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector4i addus8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu8(m, v.m)); } + + ALWAYS_INLINE GSVector4i addus16(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu16(m, v.m)); } + + ALWAYS_INLINE GSVector4i sub8(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi8(m, v.m)); } + + ALWAYS_INLINE GSVector4i sub16(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector4i sub32(const GSVector4i& v) const { return GSVector4i(_mm_sub_epi32(m, v.m)); } + + ALWAYS_INLINE GSVector4i subs8(const GSVector4i& v) const { return GSVector4i(_mm_subs_epi8(m, v.m)); } + + ALWAYS_INLINE GSVector4i subs16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector4i subus8(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu8(m, v.m)); } + + ALWAYS_INLINE GSVector4i subus16(const GSVector4i& v) const { return GSVector4i(_mm_subs_epu16(m, v.m)); } + + ALWAYS_INLINE GSVector4i avg8(const GSVector4i& v) const { return GSVector4i(_mm_avg_epu8(m, v.m)); } + + ALWAYS_INLINE GSVector4i avg16(const GSVector4i& v) const { return GSVector4i(_mm_avg_epu16(m, v.m)); } + + ALWAYS_INLINE GSVector4i mul16hs(const GSVector4i& v) const { return GSVector4i(_mm_mulhi_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector4i mul16hu(const GSVector4i& v) const { return GSVector4i(_mm_mulhi_epu16(m, v.m)); } + + ALWAYS_INLINE GSVector4i mul16l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector4i mul16hrs(const GSVector4i& v) const { return GSVector4i(_mm_mulhrs_epi16(m, v.m)); } + + ALWAYS_INLINE GSVector4i mul32l(const GSVector4i& v) const { return GSVector4i(_mm_mullo_epi32(m, v.m)); } + + template + ALWAYS_INLINE GSVector4i lerp16(const GSVector4i& a, const GSVector4i& f) const + { + // (a - this) * f << shift + this + + return add16(a.sub16(*this).modulate16(f)); + } + + template + ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c) + { + // (a - b) * c << shift + + return a.sub16(b).modulate16(c); + } + + template + ALWAYS_INLINE static GSVector4i lerp16(const GSVector4i& a, const GSVector4i& b, const GSVector4i& c, + const GSVector4i& d) + { + // (a - b) * c << shift + d + + return d.add16(a.sub16(b).modulate16(c)); + } + + ALWAYS_INLINE GSVector4i lerp16_4(const GSVector4i& a_, const GSVector4i& 
f) const + { + // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit) + + return add16(a_.sub16(*this).mul16l(f).sra16<4>()); + } + + template + ALWAYS_INLINE GSVector4i modulate16(const GSVector4i& f) const + { + // a * f << shift + if (shift == 0) + { + return mul16hrs(f); + } + + return sll16().mul16hs(f); + } + + ALWAYS_INLINE bool eq(const GSVector4i& v) const + { + // pxor, ptest, je + + GSVector4i t = *this ^ v; + + return _mm_testz_si128(t, t) != 0; + } + + ALWAYS_INLINE GSVector4i eq8(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi8(m, v.m)); } + ALWAYS_INLINE GSVector4i eq16(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi16(m, v.m)); } + ALWAYS_INLINE GSVector4i eq32(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi32(m, v.m)); } + ALWAYS_INLINE GSVector4i eq64(const GSVector4i& v) const { return GSVector4i(_mm_cmpeq_epi64(m, v.m)); } + + ALWAYS_INLINE GSVector4i neq8(const GSVector4i& v) const { return ~eq8(v); } + ALWAYS_INLINE GSVector4i neq16(const GSVector4i& v) const { return ~eq16(v); } + ALWAYS_INLINE GSVector4i neq32(const GSVector4i& v) const { return ~eq32(v); } + + ALWAYS_INLINE GSVector4i gt8(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi8(m, v.m)); } + ALWAYS_INLINE GSVector4i gt16(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi16(m, v.m)); } + ALWAYS_INLINE GSVector4i gt32(const GSVector4i& v) const { return GSVector4i(_mm_cmpgt_epi32(m, v.m)); } + + ALWAYS_INLINE GSVector4i ge8(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi8(m, v.m)); } + ALWAYS_INLINE GSVector4i ge16(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi16(m, v.m)); } + ALWAYS_INLINE GSVector4i ge32(const GSVector4i& v) const { return ~GSVector4i(_mm_cmplt_epi32(m, v.m)); } + + ALWAYS_INLINE GSVector4i lt8(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi8(m, v.m)); } + ALWAYS_INLINE GSVector4i lt16(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi16(m, v.m)); } + ALWAYS_INLINE GSVector4i lt32(const GSVector4i& v) const { return GSVector4i(_mm_cmplt_epi32(m, v.m)); } + + ALWAYS_INLINE GSVector4i le8(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi8(m, v.m)); } + ALWAYS_INLINE GSVector4i le16(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi16(m, v.m)); } + ALWAYS_INLINE GSVector4i le32(const GSVector4i& v) const { return ~GSVector4i(_mm_cmpgt_epi32(m, v.m)); } + + ALWAYS_INLINE GSVector4i andnot(const GSVector4i& v) const { return GSVector4i(_mm_andnot_si128(v.m, m)); } + + ALWAYS_INLINE s32 mask() const { return _mm_movemask_epi8(m); } + + ALWAYS_INLINE bool alltrue() const { return mask() == 0xffff; } + + ALWAYS_INLINE bool allfalse() const { return _mm_testz_si128(m, m) != 0; } + + template + ALWAYS_INLINE GSVector4i insert8(s32 a) const + { + return GSVector4i(_mm_insert_epi8(m, a, i)); + } + + template + ALWAYS_INLINE s32 extract8() const + { + return _mm_extract_epi8(m, i); + } + + template + ALWAYS_INLINE GSVector4i insert16(s32 a) const + { + return GSVector4i(_mm_insert_epi16(m, a, i)); + } + + template + ALWAYS_INLINE s32 extract16() const + { + return _mm_extract_epi16(m, i); + } + + template + ALWAYS_INLINE GSVector4i insert32(s32 a) const + { + return GSVector4i(_mm_insert_epi32(m, a, i)); + } + + template + ALWAYS_INLINE s32 extract32() const + { + if constexpr (i == 0) + return GSVector4i::store(*this); + + return _mm_extract_epi32(m, i); + } + + template + ALWAYS_INLINE GSVector4i insert64(s64 a) const + { + return GSVector4i(_mm_insert_epi64(m, a, 
i)); + } + + template + ALWAYS_INLINE s64 extract64() const + { + if (i == 0) + return GSVector4i::storeq(*this); + + return _mm_extract_epi64(m, i); + } + + ALWAYS_INLINE static GSVector4i loadnt(const void* p) { return GSVector4i(_mm_stream_load_si128((__m128i*)p)); } + + ALWAYS_INLINE static GSVector4i load32(const void* p) { return GSVector4i(_mm_loadu_si32(p)); } + + ALWAYS_INLINE static GSVector4i loadl(const void* p) { return GSVector4i(_mm_loadl_epi64((__m128i*)p)); } + + ALWAYS_INLINE static GSVector4i loadh(const void* p) + { + return GSVector4i(_mm_castps_si128(_mm_loadh_pi(_mm_setzero_ps(), (__m64*)p))); + } + + ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v) { return loadh(&v); } + + template + ALWAYS_INLINE static GSVector4i load(const void* p) + { + return GSVector4i(aligned ? _mm_load_si128((__m128i*)p) : _mm_loadu_si128((__m128i*)p)); + } + + ALWAYS_INLINE static GSVector4i load(s32 i) { return GSVector4i(_mm_cvtsi32_si128(i)); } + + ALWAYS_INLINE static GSVector4i loadq(s64 i) { return GSVector4i(_mm_cvtsi64_si128(i)); } + + ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { _mm_stream_si128((__m128i*)p, v.m); } + + ALWAYS_INLINE static void storel(void* p, const GSVector4i& v) { _mm_storel_epi64((__m128i*)p, v.m); } + + ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v) { _mm_storeh_pi((__m64*)p, _mm_castsi128_ps(v.m)); } + + ALWAYS_INLINE static void store(void* pl, void* ph, const GSVector4i& v) + { + GSVector4i::storel(pl, v); + GSVector4i::storeh(ph, v); + } + + template + ALWAYS_INLINE static void store(void* p, const GSVector4i& v) + { + if constexpr (aligned) + _mm_store_si128((__m128i*)p, v.m); + else + _mm_storeu_si128((__m128i*)p, v.m); + } + + ALWAYS_INLINE static void store32(void* p, const GSVector4i& v) { _mm_storeu_si32(p, v); } + + ALWAYS_INLINE static s32 store(const GSVector4i& v) { return _mm_cvtsi128_si32(v.m); } + + ALWAYS_INLINE static s64 storeq(const GSVector4i& v) { return _mm_cvtsi128_si64(v.m); } + + ALWAYS_INLINE void operator&=(const GSVector4i& v) { m = _mm_and_si128(m, v); } + ALWAYS_INLINE void operator|=(const GSVector4i& v) { m = _mm_or_si128(m, v); } + ALWAYS_INLINE void operator^=(const GSVector4i& v) { m = _mm_xor_si128(m, v); } + + ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v1, const GSVector4i& v2) + { + return GSVector4i(_mm_and_si128(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v1, const GSVector4i& v2) + { + return GSVector4i(_mm_or_si128(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v1, const GSVector4i& v2) + { + return GSVector4i(_mm_xor_si128(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4i operator&(const GSVector4i& v, s32 i) { return v & GSVector4i(i); } + + ALWAYS_INLINE friend GSVector4i operator|(const GSVector4i& v, s32 i) { return v | GSVector4i(i); } + + ALWAYS_INLINE friend GSVector4i operator^(const GSVector4i& v, s32 i) { return v ^ GSVector4i(i); } + + ALWAYS_INLINE friend GSVector4i operator~(const GSVector4i& v) { return v ^ v.eq32(v); } + + ALWAYS_INLINE static GSVector4i zero() { return GSVector4i(_mm_setzero_si128()); } + + ALWAYS_INLINE GSVector4i xyxy(const GSVector4i& v) const { return upl64(v); } + + ALWAYS_INLINE GSVector2i xy() const + { + GSVector2i ret; + storel(&ret, *this); + return ret; + } + + ALWAYS_INLINE GSVector2i zw() const + { + GSVector2i ret; + storeh(&ret, *this); + return ret; + } + + // clang-format off + +#define VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ 
+ ALWAYS_INLINE GSVector4i xs##ys##zs##ws() const {return GSVector4i(_mm_shuffle_epi32(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws##l() const {return GSVector4i(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws##h() const {return GSVector4i(_mm_shufflehi_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)));} \ + ALWAYS_INLINE GSVector4i xs##ys##zs##ws##lh() const {return GSVector4i(_mm_shufflehi_epi16(_mm_shufflelo_epi16(m, _MM_SHUFFLE(wn, zn, yn, xn)), _MM_SHUFFLE(wn, zn, yn, xn)));} \ + +#define VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ + VECTOR4i_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ + +#define VECTOR4i_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ + VECTOR4i_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ + +#define VECTOR4i_SHUFFLE_1(xs, xn) \ + VECTOR4i_SHUFFLE_2(xs, xn, x, 0) \ + VECTOR4i_SHUFFLE_2(xs, xn, y, 1) \ + VECTOR4i_SHUFFLE_2(xs, xn, z, 2) \ + VECTOR4i_SHUFFLE_2(xs, xn, w, 3) \ + + VECTOR4i_SHUFFLE_1(x, 0) + VECTOR4i_SHUFFLE_1(y, 1) + VECTOR4i_SHUFFLE_1(z, 2) + VECTOR4i_SHUFFLE_1(w, 3) + + // clang-format on +}; + +class alignas(16) GSVector4 +{ + struct cxpr_init_tag + { + }; + static constexpr cxpr_init_tag cxpr_init{}; + + constexpr GSVector4(cxpr_init_tag, float x, float y, float z, float w) : F32{x, y, z, w} {} + + constexpr GSVector4(cxpr_init_tag, int x, int y, int z, int w) : I32{x, y, z, w} {} + + constexpr GSVector4(cxpr_init_tag, u64 x, u64 y) : U64{x, y} {} + +public: + union + { + struct + { + float x, y, z, w; + }; + struct + { + float r, g, b, a; + }; + struct + { + float left, top, right, bottom; + }; + float F32[4]; + double F64[2]; + s8 I8[16]; + s16 I16[8]; + s32 I32[4]; + s64 I64[2]; + u8 U8[16]; + u16 U16[8]; + u32 U32[4]; + u64 U64[2]; + __m128 m; + }; + + GSVector4() = default; + + constexpr static GSVector4 cxpr(float x, float y, float z, float w) { return GSVector4(cxpr_init, x, y, z, w); } + + constexpr static GSVector4 cxpr(float x) { return GSVector4(cxpr_init, x, x, x, x); } + + constexpr static GSVector4 cxpr(int x, int y, int z, int w) { return GSVector4(cxpr_init, x, y, z, w); } + + constexpr static GSVector4 cxpr(int x) { return GSVector4(cxpr_init, x, x, x, x); } + + constexpr static GSVector4 cxpr64(u64 x, u64 y) { return GSVector4(cxpr_init, x, y); } + + constexpr static GSVector4 cxpr64(u64 x) { return GSVector4(cxpr_init, x, x); } + + ALWAYS_INLINE GSVector4(float x, float y, float z, float w) { m = _mm_set_ps(w, z, y, x); } + + ALWAYS_INLINE GSVector4(float x, float y) { m = _mm_unpacklo_ps(_mm_load_ss(&x), _mm_load_ss(&y)); } + + ALWAYS_INLINE GSVector4(int x, int y, int z, int w) + { + GSVector4i v_(x, y, z, w); + + m = _mm_cvtepi32_ps(v_.m); + } + + ALWAYS_INLINE GSVector4(int x, int y) + { + m = _mm_cvtepi32_ps(_mm_unpacklo_epi32(_mm_cvtsi32_si128(x), _mm_cvtsi32_si128(y))); + } + + ALWAYS_INLINE explicit GSVector4(const GSVector2& v) { m = _mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&v)); } + + ALWAYS_INLINE explicit GSVector4(const GSVector2i& v) { m = _mm_cvtepi32_ps(_mm_loadl_epi64((__m128i*)&v)); } + + ALWAYS_INLINE constexpr explicit GSVector4(__m128 m) : m(m) {} + + ALWAYS_INLINE explicit GSVector4(__m128d m) : m(_mm_castpd_ps(m)) {} + + ALWAYS_INLINE explicit GSVector4(float f) { *this = f; } 
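The scalar constructors here and just below broadcast a single value to all four lanes; the integer forms convert to float after broadcasting. For example (editor's sketch, not part of the patch):

  const GSVector4 half = GSVector4(0.5f);   // {0.5f, 0.5f, 0.5f, 0.5f}
  const GSVector4 full = GSVector4(255);    // broadcast + cvtepi32_ps -> {255.0f, 255.0f, 255.0f, 255.0f}
  const GSVector4 size = GSVector4(64, 32); // {64.0f, 32.0f, 0.0f, 0.0f}, upper lanes zeroed
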
+ + ALWAYS_INLINE explicit GSVector4(int i) + { +#ifdef CPU_ARCH_AVX2 + m = _mm_cvtepi32_ps(_mm_broadcastd_epi32(_mm_cvtsi32_si128(i))); +#else + *this = GSVector4(GSVector4i(i)); +#endif + } + + ALWAYS_INLINE explicit GSVector4(const GSVector4i& v); + + ALWAYS_INLINE static GSVector4 cast(const GSVector4i& v); + + ALWAYS_INLINE static GSVector4 f64(double x, double y) { return GSVector4(_mm_castpd_ps(_mm_set_pd(y, x))); } + + ALWAYS_INLINE void operator=(float f) + { +#if CPU_ARCH_AVX2 + + m = _mm_broadcastss_ps(_mm_load_ss(&f)); + +#else + + m = _mm_set1_ps(f); + +#endif + } + + ALWAYS_INLINE void operator=(__m128 m_) { this->m = m_; } + + ALWAYS_INLINE operator __m128() const { return m; } + + /// Makes Clang think that the whole vector is needed, preventing it from changing shuffles around because it thinks + /// we don't need the whole vector Useful for e.g. preventing clang from optimizing shuffles that remove + /// possibly-denormal garbage data from vectors before computing with them + ALWAYS_INLINE GSVector4 noopt() + { + // Note: Clang is currently the only compiler that attempts to optimize vector intrinsics, if that changes in the + // future the implementation should be updated +#ifdef __clang__ + __asm__("" : "+x"(m)::); +#endif + return *this; + } + + u32 rgba32() const { return GSVector4i(*this).rgba32(); } + + ALWAYS_INLINE static GSVector4 rgba32(u32 rgba) { return GSVector4(GSVector4i::load((int)rgba).u8to32()); } + + ALWAYS_INLINE static GSVector4 unorm8(u32 rgba) { return rgba32(rgba) * GSVector4::cxpr(1.0f / 255.0f); } + + ALWAYS_INLINE GSVector4 abs() const { return *this & cast(GSVector4i::cxpr(0x7fffffff)); } + + ALWAYS_INLINE GSVector4 neg() const { return *this ^ cast(GSVector4i::cxpr(0x80000000)); } + + ALWAYS_INLINE GSVector4 rcp() const { return GSVector4(_mm_rcp_ps(m)); } + + ALWAYS_INLINE GSVector4 rcpnr() const + { + GSVector4 v_ = rcp(); + + return (v_ + v_) - (v_ * v_) * *this; + } + + template + ALWAYS_INLINE GSVector4 round() const + { + return GSVector4(_mm_round_ps(m, mode)); + } + + ALWAYS_INLINE GSVector4 floor() const { return round(); } + + ALWAYS_INLINE GSVector4 ceil() const { return round(); } + + ALWAYS_INLINE GSVector4 madd(const GSVector4& a_, const GSVector4& b_) const + { +#ifdef CPU_ARCH_AVX2 + return GSVector4(_mm_fmadd_ps(m, a_, b_)); +#else + return *this * a_ + b_; +#endif + } + + ALWAYS_INLINE GSVector4 msub(const GSVector4& a_, const GSVector4& b_) const + { +#ifdef CPU_ARCH_AVX2 + return GSVector4(_mm_fmsub_ps(m, a_, b_)); +#else + return *this * a_ - b_; +#endif + } + + ALWAYS_INLINE GSVector4 nmadd(const GSVector4& a_, const GSVector4& b_) const + { +#ifdef CPU_ARCH_AVX2 + return GSVector4(_mm_fnmadd_ps(m, a_, b_)); +#else + return b_ - *this * a_; +#endif + } + + ALWAYS_INLINE GSVector4 nmsub(const GSVector4& a_, const GSVector4& b_) const + { +#ifdef CPU_ARCH_AVX2 + return GSVector4(_mm_fnmsub_ps(m, a_, b_)); +#else + return -b_ - *this * a_; +#endif + } + + ALWAYS_INLINE GSVector4 addm(const GSVector4& a_, const GSVector4& b_) const + { + return a_.madd(b_, *this); // *this + a * b + } + + ALWAYS_INLINE GSVector4 subm(const GSVector4& a_, const GSVector4& b_) const + { + return a_.nmadd(b_, *this); // *this - a * b + } + + ALWAYS_INLINE GSVector4 hadd() const { return GSVector4(_mm_hadd_ps(m, m)); } + + ALWAYS_INLINE GSVector4 hadd(const GSVector4& v) const { return GSVector4(_mm_hadd_ps(m, v.m)); } + + ALWAYS_INLINE GSVector4 hsub() const { return GSVector4(_mm_hsub_ps(m, m)); } + + ALWAYS_INLINE GSVector4 hsub(const 
GSVector4& v) const { return GSVector4(_mm_hsub_ps(m, v.m)); } + + template + ALWAYS_INLINE GSVector4 dp(const GSVector4& v) const + { + return GSVector4(_mm_dp_ps(m, v.m, i)); + } + + ALWAYS_INLINE GSVector4 sat(const GSVector4& min, const GSVector4& max) const + { + return GSVector4(_mm_min_ps(_mm_max_ps(m, min), max)); + } + + ALWAYS_INLINE GSVector4 sat(const GSVector4& v) const + { + return GSVector4(_mm_min_ps(_mm_max_ps(m, v.xyxy()), v.zwzw())); + } + + ALWAYS_INLINE GSVector4 sat(const float scale = 255) const { return sat(zero(), GSVector4(scale)); } + + ALWAYS_INLINE GSVector4 clamp(const float scale = 255) const { return min(GSVector4(scale)); } + + ALWAYS_INLINE GSVector4 min(const GSVector4& v) const { return GSVector4(_mm_min_ps(m, v)); } + + ALWAYS_INLINE GSVector4 max(const GSVector4& v) const { return GSVector4(_mm_max_ps(m, v)); } + + template + ALWAYS_INLINE GSVector4 blend32(const GSVector4& v) const + { + return GSVector4(_mm_blend_ps(m, v, mask)); + } + + ALWAYS_INLINE GSVector4 blend32(const GSVector4& v, const GSVector4& mask) const + { + return GSVector4(_mm_blendv_ps(m, v, mask)); + } + + ALWAYS_INLINE GSVector4 upl(const GSVector4& v) const { return GSVector4(_mm_unpacklo_ps(m, v)); } + + ALWAYS_INLINE GSVector4 uph(const GSVector4& v) const { return GSVector4(_mm_unpackhi_ps(m, v)); } + + ALWAYS_INLINE GSVector4 upld(const GSVector4& v) const + { + return GSVector4(_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(m), _mm_castps_pd(v.m)))); + } + + ALWAYS_INLINE GSVector4 uphd(const GSVector4& v) const + { + return GSVector4(_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(m), _mm_castps_pd(v.m)))); + } + + ALWAYS_INLINE GSVector4 l2h(const GSVector4& v) const { return GSVector4(_mm_movelh_ps(m, v)); } + + ALWAYS_INLINE GSVector4 h2l(const GSVector4& v) const { return GSVector4(_mm_movehl_ps(m, v)); } + + ALWAYS_INLINE GSVector4 andnot(const GSVector4& v) const { return GSVector4(_mm_andnot_ps(v.m, m)); } + + ALWAYS_INLINE int mask() const { return _mm_movemask_ps(m); } + + ALWAYS_INLINE bool alltrue() const { return mask() == 0xf; } + + ALWAYS_INLINE bool allfalse() const + { +#ifdef CPU_ARCH_AVX2 + return _mm_testz_ps(m, m) != 0; +#else + const __m128i ii = _mm_castps_si128(m); + return _mm_testz_si128(ii, ii) != 0; +#endif + } + + ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); } + + template + ALWAYS_INLINE GSVector4 insert32(const GSVector4& v) const + { + if constexpr (src == dst) + return GSVector4(_mm_blend_ps(m, v.m, 1 << src)); + else + return GSVector4(_mm_insert_ps(m, v.m, _MM_MK_INSERTPS_NDX(src, dst, 0))); + } + + template + ALWAYS_INLINE int extract32() const + { + return _mm_extract_ps(m, i); + } + + ALWAYS_INLINE static GSVector4 zero() { return GSVector4(_mm_setzero_ps()); } + + ALWAYS_INLINE static GSVector4 xffffffff() { return zero() == zero(); } + + ALWAYS_INLINE static GSVector4 loadl(const void* p) { return GSVector4(_mm_castpd_ps(_mm_load_sd((double*)p))); } + + ALWAYS_INLINE static GSVector4 load(float f) { return GSVector4(_mm_load_ss(&f)); } + + template + ALWAYS_INLINE static GSVector4 load(const void* p) + { + return GSVector4(aligned ? 
_mm_load_ps((const float*)p) : _mm_loadu_ps((const float*)p)); + } + + ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { _mm_stream_ps((float*)p, v.m); } + + ALWAYS_INLINE static void storel(void* p, const GSVector4& v) { _mm_store_sd((double*)p, _mm_castps_pd(v.m)); } + + ALWAYS_INLINE static void storeh(void* p, const GSVector4& v) { _mm_storeh_pd((double*)p, _mm_castps_pd(v.m)); } + + template + ALWAYS_INLINE static void store(void* p, const GSVector4& v) + { + if constexpr (aligned) + _mm_store_ps((float*)p, v.m); + else + _mm_storeu_ps((float*)p, v.m); + } + + ALWAYS_INLINE static void store(float* p, const GSVector4& v) { _mm_store_ss(p, v.m); } + + ALWAYS_INLINE GSVector4 operator-() const { return neg(); } + + ALWAYS_INLINE void operator+=(const GSVector4& v_) { m = _mm_add_ps(m, v_); } + ALWAYS_INLINE void operator-=(const GSVector4& v_) { m = _mm_sub_ps(m, v_); } + ALWAYS_INLINE void operator*=(const GSVector4& v_) { m = _mm_mul_ps(m, v_); } + ALWAYS_INLINE void operator/=(const GSVector4& v_) { m = _mm_div_ps(m, v_); } + + ALWAYS_INLINE void operator+=(float f) { *this += GSVector4(f); } + ALWAYS_INLINE void operator-=(float f) { *this -= GSVector4(f); } + ALWAYS_INLINE void operator*=(float f) { *this *= GSVector4(f); } + ALWAYS_INLINE void operator/=(float f) { *this /= GSVector4(f); } + + ALWAYS_INLINE void operator&=(const GSVector4& v_) { m = _mm_and_ps(m, v_); } + ALWAYS_INLINE void operator|=(const GSVector4& v_) { m = _mm_or_ps(m, v_); } + ALWAYS_INLINE void operator^=(const GSVector4& v_) { m = _mm_xor_ps(m, v_); } + + ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_add_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_sub_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_mul_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_div_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator+(const GSVector4& v, float f) { return v + GSVector4(f); } + + ALWAYS_INLINE friend GSVector4 operator-(const GSVector4& v, float f) { return v - GSVector4(f); } + + ALWAYS_INLINE friend GSVector4 operator*(const GSVector4& v, float f) { return v * GSVector4(f); } + + ALWAYS_INLINE friend GSVector4 operator/(const GSVector4& v, float f) { return v / GSVector4(f); } + + ALWAYS_INLINE friend GSVector4 operator&(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_and_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator|(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_or_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator^(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_xor_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator==(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_cmpeq_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator!=(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_cmpneq_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator>(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_cmpgt_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator<(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_cmplt_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator>=(const GSVector4& v1, 
const GSVector4& v2) + { + return GSVector4(_mm_cmpge_ps(v1, v2)); + } + + ALWAYS_INLINE friend GSVector4 operator<=(const GSVector4& v1, const GSVector4& v2) + { + return GSVector4(_mm_cmple_ps(v1, v2)); + } + + ALWAYS_INLINE GSVector4 mul64(const GSVector4& v_) const + { + return GSVector4(_mm_mul_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m))); + } + + ALWAYS_INLINE GSVector4 add64(const GSVector4& v_) const + { + return GSVector4(_mm_add_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m))); + } + + ALWAYS_INLINE GSVector4 sub64(const GSVector4& v_) const + { + return GSVector4(_mm_sub_pd(_mm_castps_pd(m), _mm_castps_pd(v_.m))); + } + + ALWAYS_INLINE static GSVector4 f32to64(const GSVector4& v_) { return GSVector4(_mm_cvtps_pd(v_.m)); } + + ALWAYS_INLINE static GSVector4 f32to64(const void* p) + { + return GSVector4(_mm_cvtps_pd(_mm_castpd_ps(_mm_load_sd(static_cast(p))))); + } + + ALWAYS_INLINE GSVector4i f64toi32(bool truncate = true) const + { + return GSVector4i(truncate ? _mm_cvttpd_epi32(_mm_castps_pd(m)) : _mm_cvtpd_epi32(_mm_castps_pd(m))); + } + + // clang-format off + +#define VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, ws, wn) \ + ALWAYS_INLINE GSVector4 xs##ys##zs##ws() const { return GSVector4(_mm_shuffle_ps(m, m, _MM_SHUFFLE(wn, zn, yn, xn))); } \ + ALWAYS_INLINE GSVector4 xs##ys##zs##ws(const GSVector4& v_) const { return GSVector4(_mm_shuffle_ps(m, v_.m, _MM_SHUFFLE(wn, zn, yn, xn))); } \ + +#define VECTOR4_SHUFFLE_3(xs, xn, ys, yn, zs, zn) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, x, 0) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, y, 1) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, z, 2) \ + VECTOR4_SHUFFLE_4(xs, xn, ys, yn, zs, zn, w, 3) \ + +#define VECTOR4_SHUFFLE_2(xs, xn, ys, yn) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, x, 0) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, y, 1) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, z, 2) \ + VECTOR4_SHUFFLE_3(xs, xn, ys, yn, w, 3) \ + +#define VECTOR4_SHUFFLE_1(xs, xn) \ + VECTOR4_SHUFFLE_2(xs, xn, x, 0) \ + VECTOR4_SHUFFLE_2(xs, xn, y, 1) \ + VECTOR4_SHUFFLE_2(xs, xn, z, 2) \ + VECTOR4_SHUFFLE_2(xs, xn, w, 3) \ + + VECTOR4_SHUFFLE_1(x, 0) + VECTOR4_SHUFFLE_1(y, 1) + VECTOR4_SHUFFLE_1(z, 2) + VECTOR4_SHUFFLE_1(w, 3) + + // clang-format on + +#if CPU_ARCH_AVX2 + + ALWAYS_INLINE GSVector4 broadcast32() const { return GSVector4(_mm_broadcastss_ps(m)); } + + ALWAYS_INLINE static GSVector4 broadcast32(const GSVector4& v) { return GSVector4(_mm_broadcastss_ps(v.m)); } + + ALWAYS_INLINE static GSVector4 broadcast32(const void* f) + { + return GSVector4(_mm_broadcastss_ps(_mm_load_ss((const float*)f))); + } + +#endif + + ALWAYS_INLINE static GSVector4 broadcast64(const void* d) + { + return GSVector4(_mm_loaddup_pd(static_cast(d))); + } +}; + +ALWAYS_INLINE GSVector4i::GSVector4i(const GSVector4& v, bool truncate) +{ + m = truncate ? 
_mm_cvttps_epi32(v) : _mm_cvtps_epi32(v);
+}
+
+ALWAYS_INLINE GSVector4::GSVector4(const GSVector4i& v)
+{
+  m = _mm_cvtepi32_ps(v);
+}
+
+ALWAYS_INLINE GSVector4i GSVector4i::cast(const GSVector4& v)
+{
+  return GSVector4i(_mm_castps_si128(v.m));
+}
+
+ALWAYS_INLINE GSVector4 GSVector4::cast(const GSVector4i& v)
+{
+  return GSVector4(_mm_castsi128_ps(v.m));
+}
diff --git a/src/common/intrin.h b/src/common/intrin.h
index 795a7f950..1b0587e7e 100644
--- a/src/common/intrin.h
+++ b/src/common/intrin.h
@@ -13,7 +13,21 @@
 #if defined(CPU_ARCH_X86) || defined(CPU_ARCH_X64)
 #define CPU_ARCH_SSE 1
 #include
-#elif defined(CPU_ARCH_ARM64)
+#include
+#include
+#include
+
+#if defined(__AVX2__)
+#define CPU_ARCH_AVX 1
+#define CPU_ARCH_AVX2 1
+#define CPU_ARCH_SSE41 1
+#elif defined(__AVX__)
+#define CPU_ARCH_AVX 1
+#define CPU_ARCH_SSE41 1
+#elif defined(__SSE4_1__) || defined(_MSC_VER)
+#define CPU_ARCH_SSE41 1
+#endif
+#elif defined(CPU_ARCH_ARM32) || defined(CPU_ARCH_ARM64)
 #define CPU_ARCH_NEON 1
 #if defined(_MSC_VER) && !defined(__clang__)
 #include
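Since CPU_ARCH_SSE41 is now defined unconditionally for x86 builds (matching the -msse4.1 / SSE4.1 baseline added to the compiler flags), callers can use the new wrappers without runtime dispatch. A minimal consumer, as an editor's sketch (AverageRGBA is a hypothetical helper, not part of the patch; u32/s32 are the usual common/types.h typedefs):

  #include "common/gsvector.h"
  #include "common/types.h"

  // Rounding average of two packed RGBA8888 pixels using the new wrappers.
  static u32 AverageRGBA(u32 a, u32 b)
  {
    const GSVector4i va = GSVector4i::load(static_cast<s32>(a)); // pixel in lane 0
    const GSVector4i vb = GSVector4i::load(static_cast<s32>(b));
    return static_cast<u32>(GSVector4i::store(va.avg8(vb)));     // _mm_avg_epu8, rounds halves up
  }
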