mirror of
https://github.com/RetroDECK/Duckstation.git
synced 2024-11-29 09:05:41 +00:00
CDImageCHD: Add SSE/NEON implementation of CopyAndSwap()
This commit is contained in:
parent
ecb082b672
commit
e2efec12b7
|
@ -11,7 +11,7 @@
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <type_traits>
|
#include <type_traits>
|
||||||
|
|
||||||
template<typename T, std::size_t SIZE>
|
template<typename T, std::size_t SIZE, std::size_t ALIGNMENT = 0>
|
||||||
class FixedHeapArray
|
class FixedHeapArray
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
@ -24,11 +24,11 @@ public:
|
||||||
using const_pointer = const T*;
|
using const_pointer = const T*;
|
||||||
using this_type = FixedHeapArray<T, SIZE>;
|
using this_type = FixedHeapArray<T, SIZE>;
|
||||||
|
|
||||||
FixedHeapArray() { m_data = new T[SIZE]; }
|
FixedHeapArray() { allocate(); }
|
||||||
|
|
||||||
FixedHeapArray(const this_type& copy)
|
FixedHeapArray(const this_type& copy)
|
||||||
{
|
{
|
||||||
m_data = new T[SIZE];
|
allocate();
|
||||||
std::copy(copy.cbegin(), copy.cend(), begin());
|
std::copy(copy.cbegin(), copy.cend(), begin());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -38,7 +38,7 @@ public:
|
||||||
move.m_data = nullptr;
|
move.m_data = nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
~FixedHeapArray() { delete[] m_data; }
|
~FixedHeapArray() { deallocate(); }
|
||||||
|
|
||||||
size_type size() const { return SIZE; }
|
size_type size() const { return SIZE; }
|
||||||
size_type capacity() const { return SIZE; }
|
size_type capacity() const { return SIZE; }
|
||||||
|
@ -107,6 +107,42 @@ public:
|
||||||
#undef RELATIONAL_OPERATOR
|
#undef RELATIONAL_OPERATOR
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
void allocate()
|
||||||
|
{
|
||||||
|
if constexpr (ALIGNMENT > 0)
|
||||||
|
{
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
m_data = static_cast<T*>(_aligned_malloc(SIZE * sizeof(T), ALIGNMENT));
|
||||||
|
if (!m_data)
|
||||||
|
Panic("Memory allocation failed.");
|
||||||
|
#else
|
||||||
|
if (posix_memalign(reinterpret_cast<void**>(&m_data), ALIGNMENT, SIZE * sizeof(T)) != 0)
|
||||||
|
Panic("Memory allocation failed.");
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
m_data = static_cast<T*>(std::malloc(SIZE * sizeof(T)));
|
||||||
|
if (!m_data)
|
||||||
|
Panic("Memory allocation failed.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void deallocate()
|
||||||
|
{
|
||||||
|
if constexpr (ALIGNMENT > 0)
|
||||||
|
{
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
_aligned_free(m_data);
|
||||||
|
#else
|
||||||
|
std::free(m_data);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
std::free(m_data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
T* m_data;
|
T* m_data;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -313,23 +349,23 @@ private:
|
||||||
if constexpr (alignment > 0)
|
if constexpr (alignment > 0)
|
||||||
{
|
{
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
m_data = _aligned_realloc(prev_ptr, size, alignment);
|
m_data = static_cast<T*>(_aligned_realloc(prev_ptr, size * sizeof(T), alignment));
|
||||||
if (!m_data)
|
if (!m_data)
|
||||||
Panic("Memory allocation failed.");
|
Panic("Memory allocation failed.");
|
||||||
#else
|
#else
|
||||||
if (posix_memalign(reinterpret_cast<void**>(&m_data), alignment, size) != 0)
|
if (posix_memalign(reinterpret_cast<void**>(&m_data), alignment, size * sizeof(T)) != 0)
|
||||||
Panic("Memory allocation failed.");
|
Panic("Memory allocation failed.");
|
||||||
|
|
||||||
if (prev_ptr)
|
if (prev_ptr)
|
||||||
{
|
{
|
||||||
std::memcpy(m_data, prev_ptr, prev_size);
|
std::memcpy(m_data, prev_ptr, prev_size * sizeof(T));
|
||||||
std::free(prev_ptr);
|
std::free(prev_ptr);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
m_data = static_cast<T*>(std::realloc(prev_ptr, size));
|
m_data = static_cast<T*>(std::realloc(prev_ptr, size * sizeof(T)));
|
||||||
if (!m_data)
|
if (!m_data)
|
||||||
Panic("Memory allocation failed.");
|
Panic("Memory allocation failed.");
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,6 +9,8 @@
|
||||||
#include "common/error.h"
|
#include "common/error.h"
|
||||||
#include "common/file_system.h"
|
#include "common/file_system.h"
|
||||||
#include "common/hash_combine.h"
|
#include "common/hash_combine.h"
|
||||||
|
#include "common/heap_array.h"
|
||||||
|
#include "common/intrin.h"
|
||||||
#include "common/log.h"
|
#include "common/log.h"
|
||||||
#include "common/path.h"
|
#include "common/path.h"
|
||||||
#include "common/string_util.h"
|
#include "common/string_util.h"
|
||||||
|
@ -76,11 +78,13 @@ private:
|
||||||
chd_file* OpenCHD(std::string_view filename, FileSystem::ManagedCFilePtr fp, Error* error, u32 recursion_level);
|
chd_file* OpenCHD(std::string_view filename, FileSystem::ManagedCFilePtr fp, Error* error, u32 recursion_level);
|
||||||
bool ReadHunk(u32 hunk_index);
|
bool ReadHunk(u32 hunk_index);
|
||||||
|
|
||||||
|
static void CopyAndSwap(void* dst_ptr, const u8* src_ptr);
|
||||||
|
|
||||||
chd_file* m_chd = nullptr;
|
chd_file* m_chd = nullptr;
|
||||||
u32 m_hunk_size = 0;
|
u32 m_hunk_size = 0;
|
||||||
u32 m_sectors_per_hunk = 0;
|
u32 m_sectors_per_hunk = 0;
|
||||||
|
|
||||||
std::vector<u8> m_hunk_buffer;
|
DynamicHeapArray<u8, 16> m_hunk_buffer;
|
||||||
u32 m_current_hunk_index = static_cast<u32>(-1);
|
u32 m_current_hunk_index = static_cast<u32>(-1);
|
||||||
bool m_precached = false;
|
bool m_precached = false;
|
||||||
|
|
||||||
|
@ -443,12 +447,39 @@ bool CDImageCHD::IsPrecached() const
|
||||||
return m_precached;
|
return m_precached;
|
||||||
}
|
}
|
||||||
|
|
||||||
// There's probably a more efficient way of doing this with vectorization...
|
ALWAYS_INLINE_RELEASE void CDImageCHD::CopyAndSwap(void* dst_ptr, const u8* src_ptr)
|
||||||
ALWAYS_INLINE static void CopyAndSwap(void* dst_ptr, const u8* src_ptr, u32 data_size)
|
|
||||||
{
|
{
|
||||||
|
constexpr u32 data_size = RAW_SECTOR_SIZE;
|
||||||
|
|
||||||
u8* dst_ptr_byte = static_cast<u8*>(dst_ptr);
|
u8* dst_ptr_byte = static_cast<u8*>(dst_ptr);
|
||||||
#if defined(CPU_ARCH_X64) || defined(CPU_ARCH_ARM64)
|
#if defined(CPU_ARCH_SSE) || defined(CPU_ARCH_NEON)
|
||||||
const u32 num_values = data_size / 8;
|
static_assert((data_size % 16) == 0);
|
||||||
|
constexpr u32 num_values = data_size / 16;
|
||||||
|
|
||||||
|
#if defined(CPU_ARCH_SSE)
|
||||||
|
// Requires SSSE3.
|
||||||
|
//const __m128i mask = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
|
||||||
|
for (u32 i = 0; i < num_values; i++)
|
||||||
|
{
|
||||||
|
__m128i value = _mm_load_si128(reinterpret_cast<const __m128i*>(src_ptr));
|
||||||
|
//value = _mm_shuffle_epi8(value, mask);
|
||||||
|
value = _mm_or_si128(_mm_slli_epi16(value, 8), _mm_srli_epi16(value, 8));
|
||||||
|
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst_ptr_byte), value);
|
||||||
|
src_ptr += sizeof(value);
|
||||||
|
dst_ptr_byte += sizeof(value);
|
||||||
|
}
|
||||||
|
#elif defined(CPU_ARCH_NEON)
|
||||||
|
for (u32 i = 0; i < num_values; i++)
|
||||||
|
{
|
||||||
|
uint16x8_t value = vld1q_u16(reinterpret_cast<const u16*>(src_ptr));
|
||||||
|
value = vorrq_u16(vshlq_n_u16(value, 8), vshrq_n_u16(value, 8));
|
||||||
|
vst1q_u16(reinterpret_cast<u16*>(dst_ptr_byte), value);
|
||||||
|
src_ptr += sizeof(value);
|
||||||
|
dst_ptr_byte += sizeof(value);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#elif defined(CPU_ARCH_RISCV64)
|
||||||
|
constexpr u32 num_values = data_size / 8;
|
||||||
for (u32 i = 0; i < num_values; i++)
|
for (u32 i = 0; i < num_values; i++)
|
||||||
{
|
{
|
||||||
u64 value;
|
u64 value;
|
||||||
|
@ -458,8 +489,8 @@ ALWAYS_INLINE static void CopyAndSwap(void* dst_ptr, const u8* src_ptr, u32 data
|
||||||
src_ptr += sizeof(value);
|
src_ptr += sizeof(value);
|
||||||
dst_ptr_byte += sizeof(value);
|
dst_ptr_byte += sizeof(value);
|
||||||
}
|
}
|
||||||
#elif defined(CPU_ARCH_X86) || defined(CPU_ARCH_ARM32)
|
#else
|
||||||
const u32 num_values = data_size / 4;
|
constexpr u32 num_values = data_size / 4;
|
||||||
for (u32 i = 0; i < num_values; i++)
|
for (u32 i = 0; i < num_values; i++)
|
||||||
{
|
{
|
||||||
u32 value;
|
u32 value;
|
||||||
|
@ -469,17 +500,6 @@ ALWAYS_INLINE static void CopyAndSwap(void* dst_ptr, const u8* src_ptr, u32 data
|
||||||
src_ptr += sizeof(value);
|
src_ptr += sizeof(value);
|
||||||
dst_ptr_byte += sizeof(value);
|
dst_ptr_byte += sizeof(value);
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
const u32 num_values = data_size / sizeof(u16);
|
|
||||||
for (u32 i = 0; i < num_values; i++)
|
|
||||||
{
|
|
||||||
u16 value;
|
|
||||||
std::memcpy(&value, src_ptr, sizeof(value));
|
|
||||||
value = (value << 8) | (value >> 8);
|
|
||||||
std::memcpy(dst_ptr_byte, &value, sizeof(value));
|
|
||||||
src_ptr += sizeof(value);
|
|
||||||
dst_ptr_byte += sizeof(value);
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -495,7 +515,7 @@ bool CDImageCHD::ReadSectorFromIndex(void* buffer, const Index& index, LBA lba_i
|
||||||
|
|
||||||
// Audio data is in big-endian, so we have to swap it for little endian hosts...
|
// Audio data is in big-endian, so we have to swap it for little endian hosts...
|
||||||
if (index.mode == TrackMode::Audio)
|
if (index.mode == TrackMode::Audio)
|
||||||
CopyAndSwap(buffer, &m_hunk_buffer[hunk_offset], RAW_SECTOR_SIZE);
|
CopyAndSwap(buffer, &m_hunk_buffer[hunk_offset]);
|
||||||
else
|
else
|
||||||
std::memcpy(buffer, &m_hunk_buffer[hunk_offset], RAW_SECTOR_SIZE);
|
std::memcpy(buffer, &m_hunk_buffer[hunk_offset], RAW_SECTOR_SIZE);
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue