From 0ee67124999e876128361154ea218e9d426a4657 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Sun, 1 Oct 2023 15:03:56 +1000 Subject: [PATCH] Common: Add MemsetPtrs() --- src/common/intrin.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/common/intrin.h b/src/common/intrin.h index 61595d297..2c730f058 100644 --- a/src/common/intrin.h +++ b/src/common/intrin.h @@ -7,6 +7,8 @@ #include "types.h" +#include + #if defined(CPU_ARCH_X86) || defined(CPU_ARCH_X64) #define CPU_ARCH_SSE 1 #include @@ -24,3 +26,42 @@ #else #include // alloca #endif + +template +static inline void MemsetPtrs(T* ptr, T value, u32 count) +{ + static_assert(std::is_pointer_v, "T is pointer type"); + static_assert(sizeof(T) == sizeof(void*), "T isn't a fat pointer"); + T* dest = ptr; + +#if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON) + static constexpr u32 PTRS_PER_VECTOR = (16 / sizeof(T)); + const u32 aligned_count = count / PTRS_PER_VECTOR; + const u32 remaining_count = count % PTRS_PER_VECTOR; + +#if defined(CPU_ARCH_SSE) + const __m128i svalue = _mm_set1_epi64x(reinterpret_cast(value)); +#elif defined(CPU_ARCH_NEON) + const uint64x2_t svalue = vdupq_n_u64(reinterpret_cast(value)); +#endif + + // Clang gets way too eager and tries to unroll these, emitting thousands of instructions. +#ifdef __clang__ +#pragma clang loop unroll(disable) +#endif + for (u32 i = 0; i < aligned_count; i++) + { +#if defined(CPU_ARCH_SSE) + _mm_store_si128(reinterpret_cast<__m128i*>(dest), svalue); +#elif defined(CPU_ARCH_NEON) + vst1q_u64(reinterpret_cast(dest), svalue); +#endif + dest += PTRS_PER_VECTOR; + } +#else + const u32 remaining_count = count; +#endif + + for (u32 i = 0; i < remaining_count; i++) + *(dest++) = value; +}