From f5ddd7ba326ceaeacb8d3a193dc86f93f3a189cd Mon Sep 17 00:00:00 2001 From: Stenzek Date: Tue, 19 Dec 2023 00:55:04 +1000 Subject: [PATCH] DMA: Template transfer functions ~20% speedup in FMV playback on a Ryzen 9 7950X3D. CPUs hate branches. --- src/core/dma.cpp | 100 +++++++++++++++++++++++++++++++++------------- src/core/mdec.cpp | 4 +- src/core/spu.cpp | 2 +- 3 files changed, 75 insertions(+), 31 deletions(-) diff --git a/src/core/dma.cpp b/src/core/dma.cpp index 9161b6340..debd1860b 100644 --- a/src/core/dma.cpp +++ b/src/core/dma.cpp @@ -166,15 +166,20 @@ static void UpdateIRQ(); // returns false if the DMA should now be halted static TickCount GetTransferSliceTicks(); static TickCount GetTransferHaltTicks(); -static bool TransferChannel(Channel channel); + static void HaltTransfer(TickCount duration); static void UnhaltTransfer(void*, TickCount ticks, TickCount ticks_late); +template +static bool TransferChannel(); + // from device -> memory -static TickCount TransferDeviceToMemory(Channel channel, u32 address, u32 increment, u32 word_count); +template +static TickCount TransferDeviceToMemory(u32 address, u32 increment, u32 word_count); // from memory -> device -static TickCount TransferMemoryToDevice(Channel channel, u32 address, u32 increment, u32 word_count); +template +static TickCount TransferMemoryToDevice(u32 address, u32 increment, u32 word_count); // configuration static TickCount s_max_slice_ticks = 1000; @@ -187,6 +192,17 @@ static TickCount s_halt_ticks_remaining = 0; static std::array s_state; static DPCR s_DPCR = {}; static DICR s_DICR = {}; + +static constexpr std::array s_channel_transfer_functions = {{ + &TransferChannel, + &TransferChannel, + &TransferChannel, + &TransferChannel, + &TransferChannel, + &TransferChannel, + &TransferChannel, +}}; + }; // namespace DMA u32 DMA::GetAddressMask() @@ -343,7 +359,7 @@ void DMA::WriteRegister(u32 offset, u32 value) SetRequest(static_cast(channel_index), state.channel_control.start_trigger); if (CanTransferChannel(static_cast(channel_index), ignore_halt)) - TransferChannel(static_cast(channel_index)); + s_channel_transfer_functions[channel_index](); return; } @@ -364,7 +380,7 @@ void DMA::WriteRegister(u32 offset, u32 value) { if (CanTransferChannel(static_cast(i), false)) { - if (!TransferChannel(static_cast(i))) + if (!s_channel_transfer_functions[i]()) break; } } @@ -397,7 +413,7 @@ void DMA::SetRequest(Channel channel, bool request) cs.request = request; if (CanTransferChannel(channel, false)) - TransferChannel(channel); + s_channel_transfer_functions[static_cast(channel)](); } void DMA::SetMaxSliceTicks(TickCount ticks) @@ -410,7 +426,7 @@ void DMA::SetHaltTicks(TickCount ticks) s_halt_ticks = ticks; } -bool DMA::CanTransferChannel(Channel channel, bool ignore_halt) +ALWAYS_INLINE_RELEASE bool DMA::CanTransferChannel(Channel channel, bool ignore_halt) { if (!s_DPCR.GetMasterEnable(channel)) return false; @@ -468,7 +484,8 @@ TickCount DMA::GetTransferHaltTicks() return Pad::IsTransmitting() ? HALT_TICKS_WHEN_TRANSMITTING_PAD : s_halt_ticks; } -bool DMA::TransferChannel(Channel channel) +template +bool DMA::TransferChannel() { ChannelState& cs = s_state[static_cast(channel)]; const u32 mask = GetAddressMask(); @@ -490,9 +507,9 @@ bool DMA::TransferChannel(Channel channel) TickCount used_ticks; if (copy_to_device) - used_ticks = TransferMemoryToDevice(channel, current_address & mask, increment, word_count); + used_ticks = TransferMemoryToDevice(current_address & mask, increment, word_count); else - used_ticks = TransferDeviceToMemory(channel, current_address & mask, increment, word_count); + used_ticks = TransferDeviceToMemory(current_address & mask, increment, word_count); CPU::AddPendingTicks(used_ticks); } @@ -528,7 +545,7 @@ bool DMA::TransferChannel(Channel channel) remaining_ticks -= 5; const TickCount block_ticks = - TransferMemoryToDevice(channel, (current_address + sizeof(header)) & mask, 4, word_count); + TransferMemoryToDevice((current_address + sizeof(header)) & mask, 4, word_count); CPU::AddPendingTicks(block_ticks); remaining_ticks -= block_ticks; } @@ -574,7 +591,7 @@ bool DMA::TransferChannel(Channel channel) { blocks_remaining--; - const TickCount ticks = TransferMemoryToDevice(channel, current_address & mask, increment, block_size); + const TickCount ticks = TransferMemoryToDevice(current_address & mask, increment, block_size); CPU::AddPendingTicks(ticks); ticks_remaining -= ticks; @@ -587,7 +604,7 @@ bool DMA::TransferChannel(Channel channel) { blocks_remaining--; - const TickCount ticks = TransferDeviceToMemory(channel, current_address & mask, increment, block_size); + const TickCount ticks = TransferDeviceToMemory(current_address & mask, increment, block_size); CPU::AddPendingTicks(ticks); ticks_remaining -= ticks; @@ -655,7 +672,7 @@ void DMA::UnhaltTransfer(void*, TickCount ticks, TickCount ticks_late) { if (CanTransferChannel(static_cast(i), false)) { - if (!TransferChannel(static_cast(i))) + if (!s_channel_transfer_functions[i]()) return; } } @@ -664,23 +681,26 @@ void DMA::UnhaltTransfer(void*, TickCount ticks, TickCount ticks_late) s_halt_ticks_remaining = 0; } -TickCount DMA::TransferMemoryToDevice(Channel channel, u32 address, u32 increment, u32 word_count) +template +TickCount DMA::TransferMemoryToDevice(u32 address, u32 increment, u32 word_count) { const u32* src_pointer = reinterpret_cast(Bus::g_ram + address); const u32 mask = GetAddressMask(); - if (channel != Channel::GPU && - (static_cast(increment) < 0 || ((address + (increment * word_count)) & mask) <= address)) + if constexpr (channel != Channel::GPU) { - // Use temp buffer if it's wrapping around - if (s_transfer_buffer.size() < word_count) - s_transfer_buffer.resize(word_count); - src_pointer = s_transfer_buffer.data(); - - u8* ram_pointer = Bus::g_ram; - for (u32 i = 0; i < word_count; i++) + if (static_cast(increment) < 0 || ((address + (increment * word_count)) & mask) <= address) { - std::memcpy(&s_transfer_buffer[i], &ram_pointer[address], sizeof(u32)); - address = (address + increment) & mask; + // Use temp buffer if it's wrapping around + if (s_transfer_buffer.size() < word_count) + s_transfer_buffer.resize(word_count); + src_pointer = s_transfer_buffer.data(); + + u8* ram_pointer = Bus::g_ram; + for (u32 i = 0; i < word_count; i++) + { + std::memcpy(&s_transfer_buffer[i], &ram_pointer[address], sizeof(u32)); + address = (address + increment) & mask; + } } } @@ -722,11 +742,12 @@ TickCount DMA::TransferMemoryToDevice(Channel channel, u32 address, u32 incremen return Bus::GetDMARAMTickCount(word_count); } -TickCount DMA::TransferDeviceToMemory(Channel channel, u32 address, u32 increment, u32 word_count) +template +TickCount DMA::TransferDeviceToMemory(u32 address, u32 increment, u32 word_count) { const u32 mask = GetAddressMask(); - if (channel == Channel::OTC) + if constexpr (channel == Channel::OTC) { // clear ordering table u8* ram_pointer = Bus::g_ram; @@ -868,3 +889,26 @@ void DMA::DrawDebugStateWindow() ImGui::Columns(1); ImGui::End(); } + +// Instantiate channel functions. +template TickCount DMA::TransferDeviceToMemory(u32 address, u32 increment, u32 word_count); +template TickCount DMA::TransferMemoryToDevice(u32 address, u32 increment, u32 word_count); +template bool DMA::TransferChannel(); +template TickCount DMA::TransferDeviceToMemory(u32 address, u32 increment, u32 word_count); +template TickCount DMA::TransferMemoryToDevice(u32 address, u32 increment, u32 word_count); +template bool DMA::TransferChannel(); +template TickCount DMA::TransferDeviceToMemory(u32 address, u32 increment, u32 word_count); +template TickCount DMA::TransferMemoryToDevice(u32 address, u32 increment, u32 word_count); +template bool DMA::TransferChannel(); +template TickCount DMA::TransferDeviceToMemory(u32 address, u32 increment, u32 word_count); +template TickCount DMA::TransferMemoryToDevice(u32 address, u32 increment, u32 word_count); +template bool DMA::TransferChannel(); +template TickCount DMA::TransferDeviceToMemory(u32 address, u32 increment, u32 word_count); +template TickCount DMA::TransferMemoryToDevice(u32 address, u32 increment, u32 word_count); +template bool DMA::TransferChannel(); +template TickCount DMA::TransferDeviceToMemory(u32 address, u32 increment, u32 word_count); +template TickCount DMA::TransferMemoryToDevice(u32 address, u32 increment, u32 word_count); +template bool DMA::TransferChannel(); +template TickCount DMA::TransferDeviceToMemory(u32 address, u32 increment, u32 word_count); +template TickCount DMA::TransferMemoryToDevice(u32 address, u32 increment, u32 word_count); +template bool DMA::TransferChannel(); diff --git a/src/core/mdec.cpp b/src/core/mdec.cpp index 08aadca1f..3f964fcb5 100644 --- a/src/core/mdec.cpp +++ b/src/core/mdec.cpp @@ -248,7 +248,7 @@ void MDEC::WriteRegister(u32 offset, u32 value) void MDEC::DMARead(u32* words, u32 word_count) { - if (s_data_out_fifo.GetSize() < word_count) + if (s_data_out_fifo.GetSize() < word_count) [[unlikely]] { Log_WarningPrintf("Insufficient data in output FIFO (requested %u, have %u)", word_count, s_data_out_fifo.GetSize()); @@ -269,7 +269,7 @@ void MDEC::DMARead(u32* words, u32 word_count) void MDEC::DMAWrite(const u32* words, u32 word_count) { - if (s_data_in_fifo.GetSpace() < (word_count * 2)) + if (s_data_in_fifo.GetSpace() < (word_count * 2)) [[unlikely]] { Log_WarningPrintf("Input FIFO overflow (writing %u, space %u)", word_count * 2, s_data_in_fifo.GetSpace()); } diff --git a/src/core/spu.cpp b/src/core/spu.cpp index a74cc3e4e..690b248e9 100644 --- a/src/core/spu.cpp +++ b/src/core/spu.cpp @@ -1445,7 +1445,7 @@ void SPU::DMAWrite(const u32* words, u32 word_count) const u32 words_to_transfer = std::min(s_transfer_fifo.GetSpace(), halfword_count); s_transfer_fifo.PushRange(halfwords, words_to_transfer); - if (words_to_transfer != halfword_count) + if (words_to_transfer != halfword_count) [[unlikely]] Log_WarningPrintf("Transfer FIFO overflow, dropping %u halfwords", halfword_count - words_to_transfer); UpdateDMARequest();