Common: Add MemsetPtrs()

This commit is contained in:
Stenzek 2023-10-01 15:03:56 +10:00
parent e2efec12b7
commit 0ee6712499

View file

@ -7,6 +7,8 @@
#include "types.h" #include "types.h"
#include <type_traits>
#if defined(CPU_ARCH_X86) || defined(CPU_ARCH_X64) #if defined(CPU_ARCH_X86) || defined(CPU_ARCH_X64)
#define CPU_ARCH_SSE 1 #define CPU_ARCH_SSE 1
#include <emmintrin.h> #include <emmintrin.h>
@ -24,3 +26,42 @@
#else #else
#include <malloc.h> // alloca #include <malloc.h> // alloca
#endif #endif
/// Fills `count` consecutive pointer slots starting at `ptr` with `value`.
///
/// On SSE/NEON builds the range is written 16 bytes at a time, and whatever does not fill
/// a whole vector is written slot by slot afterwards. Other builds use the scalar loop for
/// the entire range.
///
/// @param ptr   First slot to write. It does not need to be 16-byte aligned.
/// @param value Pointer value stored into every slot.
/// @param count Number of slots (not bytes) to write; zero writes nothing.
template<typename T>
static inline void MemsetPtrs(T* ptr, T value, u32 count)
{
  static_assert(std::is_pointer_v<T>, "T is pointer type");
  static_assert(sizeof(T) == sizeof(void*), "T isn't a fat pointer");

  T* dest = ptr;

#if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
  static_assert(sizeof(T) == 4 || sizeof(T) == 8, "Vector path only handles 32-bit and 64-bit pointers");

  static constexpr u32 PTRS_PER_VECTOR = (16 / sizeof(T));
  const u32 vector_count = count / PTRS_PER_VECTOR;
  const u32 remaining_count = count % PTRS_PER_VECTOR;

  // Broadcast with a lane width equal to the pointer width. Broadcasting 64-bit lanes when
  // pointers are 32-bit would fill every second slot with the extension bits, not `value`.
#if defined(CPU_ARCH_SSE)
  __m128i svalue;
  if constexpr (sizeof(T) == 8)
    svalue = _mm_set1_epi64x(static_cast<long long>(reinterpret_cast<intptr_t>(value)));
  else
    svalue = _mm_set1_epi32(static_cast<int>(reinterpret_cast<intptr_t>(value)));
#elif defined(CPU_ARCH_NEON)
  uint8x16_t svalue;
  if constexpr (sizeof(T) == 8)
    svalue = vreinterpretq_u8_u64(vdupq_n_u64(static_cast<uint64_t>(reinterpret_cast<uintptr_t>(value))));
  else
    svalue = vreinterpretq_u8_u32(vdupq_n_u32(static_cast<uint32_t>(reinterpret_cast<uintptr_t>(value))));
#endif

  // Clang gets way too eager and tries to unroll these, emitting thousands of instructions.
#ifdef __clang__
#pragma clang loop unroll(disable)
#endif
  for (u32 i = 0; i < vector_count; i++)
  {
#if defined(CPU_ARCH_SSE)
    // Unaligned store: `ptr` is only guaranteed to be pointer-aligned, and the aligned
    // store variant faults on addresses that are not a multiple of 16.
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dest), svalue);
#elif defined(CPU_ARCH_NEON)
    // Byte-element store, so no alignment beyond that of `dest` itself is assumed.
    vst1q_u8(reinterpret_cast<uint8_t*>(dest), svalue);
#endif
    dest += PTRS_PER_VECTOR;
  }
#else
  const u32 remaining_count = count;
#endif

  // Scalar tail (or the whole range when no vector path is compiled in).
  for (u32 i = 0; i < remaining_count; i++)
    *(dest++) = value;
}