mirror of
https://github.com/RetroDECK/Duckstation.git
synced 2025-01-19 06:45:39 +00:00
DMA: Elide intermediate copy where possible
Easy 5% performance improvement.
This commit is contained in:
parent
d80aaf3880
commit
07e8ddcae2
|
@ -138,12 +138,8 @@ TickCount Bus::ReadWords(PhysicalMemoryAddress address, u32* words, u32 word_cou
|
||||||
return total_ticks;
|
return total_ticks;
|
||||||
}
|
}
|
||||||
|
|
||||||
// DMA is using DRAM Hyper Page mode, allowing it to access DRAM rows at 1 clock cycle per word (effectively around 17
|
|
||||||
// clks per 16 words, due to required row address loading, probably plus some further minimal overload due to refresh
|
|
||||||
// cycles). This is making DMA much faster than CPU memory accesses (CPU DRAM access takes 1 opcode cycle plus 6
|
|
||||||
// waitstates, ie. 7 cycles in total).
|
|
||||||
std::memcpy(words, &m_ram[address], sizeof(u32) * word_count);
|
std::memcpy(words, &m_ram[address], sizeof(u32) * word_count);
|
||||||
return static_cast<TickCount>(word_count + ((word_count + 15) / 16));
|
return GetDMARAMTickCount(word_count);
|
||||||
}
|
}
|
||||||
|
|
||||||
TickCount Bus::WriteWords(PhysicalMemoryAddress address, const u32* words, u32 word_count)
|
TickCount Bus::WriteWords(PhysicalMemoryAddress address, const u32* words, u32 word_count)
|
||||||
|
@ -166,16 +162,9 @@ TickCount Bus::WriteWords(PhysicalMemoryAddress address, const u32* words, u32 w
|
||||||
return total_ticks;
|
return total_ticks;
|
||||||
}
|
}
|
||||||
|
|
||||||
const u32 start_page = address / CPU_CODE_CACHE_PAGE_SIZE;
|
|
||||||
const u32 end_page = (address + word_count * sizeof(u32)) / CPU_CODE_CACHE_PAGE_SIZE;
|
|
||||||
for (u32 page = start_page; page <= end_page; page++)
|
|
||||||
{
|
|
||||||
if (m_ram_code_bits[page])
|
|
||||||
DoInvalidateCodeCache(page);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::memcpy(&m_ram[address], words, sizeof(u32) * word_count);
|
std::memcpy(&m_ram[address], words, sizeof(u32) * word_count);
|
||||||
return static_cast<TickCount>(word_count + ((word_count + 15) / 16));
|
InvalidateCodePages(address, word_count);
|
||||||
|
return GetDMARAMTickCount(word_count);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Bus::SetExpansionROM(std::vector<u8> data)
|
void Bus::SetExpansionROM(std::vector<u8> data)
|
||||||
|
|
|
@ -26,6 +26,8 @@ class System;
|
||||||
|
|
||||||
class Bus
|
class Bus
|
||||||
{
|
{
|
||||||
|
friend DMA;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
Bus();
|
Bus();
|
||||||
~Bus();
|
~Bus();
|
||||||
|
@ -243,6 +245,31 @@ private:
|
||||||
|
|
||||||
void DoInvalidateCodeCache(u32 page_index);
|
void DoInvalidateCodeCache(u32 page_index);
|
||||||
|
|
||||||
|
/// Direct access to RAM - used by DMA.
|
||||||
|
ALWAYS_INLINE u8* GetRAM()
{
  // Raw, unchecked pointer to the first byte of the RAM backing store; the DMA code indexes it with an already-masked
  // address.
  return m_ram.data();
}
|
||||||
|
|
||||||
|
/// Returns the number of cycles stolen by DMA RAM access.
|
||||||
|
ALWAYS_INLINE static TickCount GetDMARAMTickCount(u32 word_count)
{
  // DMA accesses DRAM in Hyper Page mode: one clock per word, plus one extra clock for every (started) run of 16
  // words to load the row address - i.e. roughly 17 clocks per 16 words, probably with some further minimal overhead
  // from refresh cycles. That makes DMA far cheaper than CPU DRAM accesses, which cost 1 opcode cycle plus 6
  // waitstates (7 cycles in total).
  const u32 row_load_ticks = (word_count + 15) / 16;
  return static_cast<TickCount>(word_count + row_load_ticks);
}
|
||||||
|
|
||||||
|
/// Invalidates any code pages which overlap the specified range.
|
||||||
|
ALWAYS_INLINE void InvalidateCodePages(PhysicalMemoryAddress address, u32 word_count)
{
  // Invalidates every code-cache page overlapping the word_count words starting at address.
  // An empty range touches no page; bailing out here also keeps the "last byte" computation below from underflowing.
  if (word_count == 0)
    return;

  const u32 start_page = address / CPU_CODE_CACHE_PAGE_SIZE;

  // Use the address of the LAST byte written, not one-past-the-end. With the exclusive end, a range finishing exactly
  // on a page boundary would also invalidate the following (untouched) page, and a range finishing at the top of RAM
  // would index m_ram_code_bits one page past the last RAM page (presumably out of bounds - confirm against its
  // declared size).
  const u32 end_page = (address + word_count * sizeof(u32) - 1) / CPU_CODE_CACHE_PAGE_SIZE;

  for (u32 page = start_page; page <= end_page; page++)
  {
    // Only pages flagged as containing compiled code need to be flushed.
    if (m_ram_code_bits[page])
      DoInvalidateCodeCache(page);
  }
}
|
||||||
|
|
||||||
CPU::Core* m_cpu = nullptr;
|
CPU::Core* m_cpu = nullptr;
|
||||||
CPU::CodeCache* m_cpu_code_cache = nullptr;
|
CPU::CodeCache* m_cpu_code_cache = nullptr;
|
||||||
DMA* m_dma = nullptr;
|
DMA* m_dma = nullptr;
|
||||||
|
|
125
src/core/dma.cpp
125
src/core/dma.cpp
|
@ -228,11 +228,11 @@ void DMA::TransferChannel(Channel channel)
|
||||||
{
|
{
|
||||||
const u32 word_count = cs.block_control.manual.GetWordCount();
|
const u32 word_count = cs.block_control.manual.GetWordCount();
|
||||||
Log_DebugPrintf("DMA%u: Copying %u words %s 0x%08X", static_cast<u32>(channel), word_count,
|
Log_DebugPrintf("DMA%u: Copying %u words %s 0x%08X", static_cast<u32>(channel), word_count,
|
||||||
copy_to_device ? "from" : "to", current_address);
|
copy_to_device ? "from" : "to", current_address & ADDRESS_MASK);
|
||||||
if (copy_to_device)
|
if (copy_to_device)
|
||||||
TransferMemoryToDevice(channel, current_address, increment, word_count);
|
TransferMemoryToDevice(channel, current_address & ADDRESS_MASK, increment, word_count);
|
||||||
else
|
else
|
||||||
TransferDeviceToMemory(channel, current_address, increment, word_count);
|
TransferDeviceToMemory(channel, current_address & ADDRESS_MASK, increment, word_count);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -245,17 +245,18 @@ void DMA::TransferChannel(Channel channel)
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
Log_DebugPrintf("DMA%u: Copying linked list starting at 0x%08X to device", static_cast<u32>(channel),
|
Log_DebugPrintf("DMA%u: Copying linked list starting at 0x%08X to device", static_cast<u32>(channel),
|
||||||
current_address);
|
current_address & ADDRESS_MASK);
|
||||||
|
|
||||||
|
u8* ram_pointer = m_bus->GetRAM();
|
||||||
while (cs.request)
|
while (cs.request)
|
||||||
{
|
{
|
||||||
u32 header;
|
u32 header;
|
||||||
m_bus->DispatchAccess<MemoryAccessType::Read, MemoryAccessSize::Word>(current_address & ADDRESS_MASK, header);
|
std::memcpy(&header, &ram_pointer[current_address & ADDRESS_MASK], sizeof(header));
|
||||||
|
|
||||||
const u32 word_count = header >> 24;
|
const u32 word_count = header >> 24;
|
||||||
const u32 next_address = header & UINT32_C(0x00FFFFFF);
|
const u32 next_address = header & UINT32_C(0x00FFFFFF);
|
||||||
Log_TracePrintf(" .. linked list entry at 0x%08X size=%u(%u words) next=0x%08X", current_address,
|
Log_TracePrintf(" .. linked list entry at 0x%08X size=%u(%u words) next=0x%08X",
|
||||||
word_count * UINT32_C(4), word_count, next_address);
|
current_address & ADDRESS_MASK, word_count * UINT32_C(4), word_count, next_address);
|
||||||
if (word_count > 0)
|
if (word_count > 0)
|
||||||
TransferMemoryToDevice(channel, (current_address + sizeof(header)) & ADDRESS_MASK, 4, word_count);
|
TransferMemoryToDevice(channel, (current_address + sizeof(header)) & ADDRESS_MASK, 4, word_count);
|
||||||
|
|
||||||
|
@ -280,7 +281,7 @@ void DMA::TransferChannel(Channel channel)
|
||||||
Log_DebugPrintf("DMA%u: Copying %u blocks of size %u (%u total words) %s 0x%08X", static_cast<u32>(channel),
|
Log_DebugPrintf("DMA%u: Copying %u blocks of size %u (%u total words) %s 0x%08X", static_cast<u32>(channel),
|
||||||
cs.block_control.request.GetBlockCount(), cs.block_control.request.GetBlockSize(),
|
cs.block_control.request.GetBlockCount(), cs.block_control.request.GetBlockSize(),
|
||||||
cs.block_control.request.GetBlockCount() * cs.block_control.request.GetBlockSize(),
|
cs.block_control.request.GetBlockCount() * cs.block_control.request.GetBlockSize(),
|
||||||
copy_to_device ? "from" : "to", current_address);
|
copy_to_device ? "from" : "to", current_address & ADDRESS_MASK);
|
||||||
|
|
||||||
const u32 block_size = cs.block_control.request.GetBlockSize();
|
const u32 block_size = cs.block_control.request.GetBlockSize();
|
||||||
u32 blocks_remaining = cs.block_control.request.GetBlockCount();
|
u32 blocks_remaining = cs.block_control.request.GetBlockCount();
|
||||||
|
@ -330,35 +331,28 @@ void DMA::TransferChannel(Channel channel)
|
||||||
|
|
||||||
void DMA::TransferMemoryToDevice(Channel channel, u32 address, u32 increment, u32 word_count)
|
void DMA::TransferMemoryToDevice(Channel channel, u32 address, u32 increment, u32 word_count)
|
||||||
{
|
{
|
||||||
// Read from memory. Wrap-around?
|
const u32* src_pointer = reinterpret_cast<u32*>(m_bus->GetRAM() + address);
|
||||||
if (m_transfer_buffer.size() < word_count)
|
if (static_cast<s32>(increment) < 0 || ((address + (increment * word_count)) & ADDRESS_MASK) <= address)
|
||||||
m_transfer_buffer.resize(word_count);
|
|
||||||
|
|
||||||
if (increment > 0 && ((address + (increment * word_count)) & ADDRESS_MASK) > address)
|
|
||||||
{
|
{
|
||||||
|
// Use temp buffer if the transfer runs backwards (negative increment) or wraps around
|
||||||
|
if (m_transfer_buffer.size() < word_count)
|
||||||
|
m_transfer_buffer.resize(word_count);
|
||||||
|
src_pointer = m_transfer_buffer.data();
|
||||||
m_bus->ReadWords(address, m_transfer_buffer.data(), word_count);
|
m_bus->ReadWords(address, m_transfer_buffer.data(), word_count);
|
||||||
}
|
}
|
||||||
else
|
|
||||||
{
|
|
||||||
for (u32 i = 0; i < word_count; i++)
|
|
||||||
{
|
|
||||||
m_bus->DispatchAccess<MemoryAccessType::Read, MemoryAccessSize::Word>(address, m_transfer_buffer[i]);
|
|
||||||
address = (address + increment) & ADDRESS_MASK;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
switch (channel)
|
switch (channel)
|
||||||
{
|
{
|
||||||
case Channel::GPU:
|
case Channel::GPU:
|
||||||
m_gpu->DMAWrite(m_transfer_buffer.data(), word_count);
|
m_gpu->DMAWrite(src_pointer, word_count);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case Channel::SPU:
|
case Channel::SPU:
|
||||||
m_spu->DMAWrite(m_transfer_buffer.data(), word_count);
|
m_spu->DMAWrite(src_pointer, word_count);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case Channel::MDECin:
|
case Channel::MDECin:
|
||||||
m_mdec->DMAWrite(m_transfer_buffer.data(), word_count);
|
m_mdec->DMAWrite(src_pointer, word_count);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case Channel::CDROM:
|
case Channel::CDROM:
|
||||||
|
@ -372,78 +366,67 @@ void DMA::TransferMemoryToDevice(Channel channel, u32 address, u32 increment, u3
|
||||||
|
|
||||||
void DMA::TransferDeviceToMemory(Channel channel, u32 address, u32 increment, u32 word_count)
|
void DMA::TransferDeviceToMemory(Channel channel, u32 address, u32 increment, u32 word_count)
|
||||||
{
|
{
|
||||||
if (m_transfer_buffer.size() < word_count)
|
if (channel == Channel::OTC)
|
||||||
m_transfer_buffer.resize(word_count);
|
{
|
||||||
|
// clear ordering table
|
||||||
|
u8* ram_pointer = m_bus->GetRAM();
|
||||||
|
const u32 word_count_less_1 = word_count - 1;
|
||||||
|
for (u32 i = 0; i < word_count_less_1; i++)
|
||||||
|
{
|
||||||
|
u32 value = ((address - 4) & ADDRESS_MASK);
|
||||||
|
std::memcpy(&ram_pointer[address], &value, sizeof(value));
|
||||||
|
address = (address - 4) & ADDRESS_MASK;
|
||||||
|
}
|
||||||
|
|
||||||
|
const u32 terminator = UINT32_C(0xFFFFFFF);
|
||||||
|
std::memcpy(&ram_pointer[address], &terminator, sizeof(terminator));
|
||||||
|
m_bus->InvalidateCodePages(address, word_count);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
u32* dest_pointer = reinterpret_cast<u32*>(&m_bus->m_ram[address]);
|
||||||
|
if (static_cast<s32>(increment) < 0 || ((address + (increment * word_count)) & ADDRESS_MASK) <= address)
|
||||||
|
{
|
||||||
|
// Use temp buffer if the transfer runs backwards (negative increment) or wraps around
|
||||||
|
if (m_transfer_buffer.size() < word_count)
|
||||||
|
m_transfer_buffer.resize(word_count);
|
||||||
|
dest_pointer = m_transfer_buffer.data();
|
||||||
|
}
|
||||||
|
|
||||||
// Read from device.
|
// Read from device.
|
||||||
switch (channel)
|
switch (channel)
|
||||||
{
|
{
|
||||||
case Channel::OTC:
|
|
||||||
{
|
|
||||||
// clear ordering table
|
|
||||||
// this always goes in reverse, so we can generate values in reverse order and write it forwards
|
|
||||||
if (((address - (4 * word_count)) & ADDRESS_MASK) < address)
|
|
||||||
{
|
|
||||||
const u32 end_address = (address - (4 * (word_count - 1))) & ADDRESS_MASK;
|
|
||||||
|
|
||||||
u32 value = end_address;
|
|
||||||
m_transfer_buffer[0] = UINT32_C(0xFFFFFF);
|
|
||||||
for (u32 i = 1; i < word_count; i++)
|
|
||||||
{
|
|
||||||
m_transfer_buffer[i] = value;
|
|
||||||
value = (value + 4) & ADDRESS_MASK;
|
|
||||||
}
|
|
||||||
|
|
||||||
m_bus->WriteWords(end_address, m_transfer_buffer.data(), word_count);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
for (u32 i = 0; i < word_count; i++)
|
|
||||||
{
|
|
||||||
u32 value = (i == word_count - 1) ? UINT32_C(0xFFFFFFF) : ((address - 4) & ADDRESS_MASK);
|
|
||||||
m_bus->DispatchAccess<MemoryAccessType::Write, MemoryAccessSize::Word>(address, value);
|
|
||||||
address = (address - 4) & ADDRESS_MASK;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
case Channel::GPU:
|
case Channel::GPU:
|
||||||
m_gpu->DMARead(m_transfer_buffer.data(), word_count);
|
m_gpu->DMARead(dest_pointer, word_count);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case Channel::CDROM:
|
case Channel::CDROM:
|
||||||
m_cdrom->DMARead(m_transfer_buffer.data(), word_count);
|
m_cdrom->DMARead(dest_pointer, word_count);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case Channel::SPU:
|
case Channel::SPU:
|
||||||
m_spu->DMARead(m_transfer_buffer.data(), word_count);
|
m_spu->DMARead(dest_pointer, word_count);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case Channel::MDECout:
|
case Channel::MDECout:
|
||||||
m_mdec->DMARead(m_transfer_buffer.data(), word_count);
|
m_mdec->DMARead(dest_pointer, word_count);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case Channel::MDECin:
|
|
||||||
case Channel::PIO:
|
|
||||||
default:
|
default:
|
||||||
Panic("Unhandled DMA channel for device read");
|
Panic("Unhandled DMA channel for device read");
|
||||||
std::fill_n(m_transfer_buffer.begin(), word_count, UINT32_C(0xFFFFFFFF));
|
std::fill_n(dest_pointer, word_count, UINT32_C(0xFFFFFFFF));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (increment > 0 && ((address + (increment * word_count)) & ADDRESS_MASK) > address)
|
if (dest_pointer == m_transfer_buffer.data())
|
||||||
{
|
|
||||||
m_bus->WriteWords(address, m_transfer_buffer.data(), word_count);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
|
u8* ram_pointer = m_bus->m_ram.data();
|
||||||
for (u32 i = 0; i < word_count; i++)
|
for (u32 i = 0; i < word_count; i++)
|
||||||
{
|
{
|
||||||
m_bus->DispatchAccess<MemoryAccessType::Write, MemoryAccessSize::Word>(address, m_transfer_buffer[i]);
|
std::memcpy(&ram_pointer[address], &m_transfer_buffer[i], sizeof(u32));
|
||||||
address = (address + increment) & ADDRESS_MASK;
|
address = (address + increment) & ADDRESS_MASK;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
m_bus->InvalidateCodePages(address, word_count);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue