From 2595e31575c17663707d6a64c867364b23a2a045 Mon Sep 17 00:00:00 2001
From: Connor McLaughlin <stenzek@gmail.com>
Date: Sat, 21 Nov 2020 13:32:58 +1000
Subject: [PATCH] GPU: Split software to frontend/backend

---
 src/core/CMakeLists.txt                       |    4 +
 src/core/core.vcxproj                         |    4 +
 src/core/core.vcxproj.filters                 |    4 +
 src/core/gpu.cpp                              |   16 +-
 src/core/gpu.h                                |   66 +-
 src/core/gpu_backend.cpp                      |  327 +++++
 src/core/gpu_backend.h                        |   91 ++
 src/core/gpu_hw.cpp                           |    8 +-
 src/core/gpu_sw.cpp                           | 1067 +++++------------
 src/core/gpu_sw.h                             |   99 +-
 src/core/gpu_sw_backend.cpp                   |  928 ++++++++++++++
 src/core/gpu_sw_backend.h                     |  174 +++
 src/core/gpu_types.h                          |  174 +++
 src/core/host_interface.cpp                   |    2 +
 src/core/settings.cpp                         |    2 +
 src/core/settings.h                           |    1 +
 .../libretro_host_interface.cpp               |   16 +-
 src/duckstation-qt/displaysettingswidget.cpp  |   11 +
 src/duckstation-qt/displaysettingswidget.ui   |    9 +-
 src/duckstation-sdl/sdl_host_interface.cpp    |    2 +
 20 files changed, 2035 insertions(+), 970 deletions(-)
 create mode 100644 src/core/gpu_backend.cpp
 create mode 100644 src/core/gpu_backend.h
 create mode 100644 src/core/gpu_sw_backend.cpp
 create mode 100644 src/core/gpu_sw_backend.h

diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 086a5a76a..5da789715 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -30,6 +30,8 @@ add_library(core
     dma.h
     gpu.cpp
     gpu.h
+    gpu_backend.cpp
+    gpu_backend.h
     gpu_commands.cpp
     gpu_hw.cpp
     gpu_hw.h
@@ -41,6 +43,8 @@ add_library(core
     gpu_hw_vulkan.h
     gpu_sw.cpp
     gpu_sw.h
+    gpu_sw_backend.cpp
+    gpu_sw_backend.h
     gpu_types.h
     gte.cpp
     gte.h
diff --git a/src/core/core.vcxproj b/src/core/core.vcxproj
index bd9dce6df..a0fc019ce 100644
--- a/src/core/core.vcxproj
+++ b/src/core/core.vcxproj
@@ -115,11 +115,13 @@
     </ClCompile>
     <ClCompile Include="cpu_types.cpp" />
     <ClCompile Include="digital_controller.cpp" />
+    <ClCompile Include="gpu_backend.cpp" />
     <ClCompile Include="gpu_commands.cpp" />
     <ClCompile Include="gpu_hw_d3d11.cpp" />
     <ClCompile Include="gpu_hw_shadergen.cpp" />
     <ClCompile Include="gpu_hw_vulkan.cpp" />
     <ClCompile Include="gpu_sw.cpp" />
+    <ClCompile Include="gpu_sw_backend.cpp" />
     <ClCompile Include="gte.cpp" />
     <ClCompile Include="dma.cpp" />
     <ClCompile Include="gpu.cpp" />
@@ -185,10 +187,12 @@
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
     </ClInclude>
     <ClInclude Include="digital_controller.h" />
+    <ClInclude Include="gpu_backend.h" />
     <ClInclude Include="gpu_hw_d3d11.h" />
     <ClInclude Include="gpu_hw_shadergen.h" />
     <ClInclude Include="gpu_hw_vulkan.h" />
     <ClInclude Include="gpu_sw.h" />
+    <ClInclude Include="gpu_sw_backend.h" />
     <ClInclude Include="gpu_types.h" />
     <ClInclude Include="gte.h" />
     <ClInclude Include="cpu_types.h" />
diff --git a/src/core/core.vcxproj.filters b/src/core/core.vcxproj.filters
index bd9fa7850..d2c2ce6d5 100644
--- a/src/core/core.vcxproj.filters
+++ b/src/core/core.vcxproj.filters
@@ -51,6 +51,8 @@
     <ClCompile Include="memory_card_image.cpp" />
     <ClCompile Include="analog_joystick.cpp" />
     <ClCompile Include="cpu_recompiler_code_generator_aarch32.cpp" />
+    <ClCompile Include="gpu_backend.cpp" />
+    <ClCompile Include="gpu_sw_backend.cpp" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="types.h" />
@@ -105,5 +107,7 @@
     <ClInclude Include="memory_card_image.h" />
     <ClInclude Include="analog_joystick.h" />
     <ClInclude Include="gpu_types.h" />
+    <ClInclude Include="gpu_backend.h" />
+    <ClInclude Include="gpu_sw_backend.h" />
   </ItemGroup>
 </Project>
diff --git a/src/core/gpu.cpp b/src/core/gpu.cpp
index 05cb6ae99..c4f5e920c 100644
--- a/src/core/gpu.cpp
+++ b/src/core/gpu.cpp
@@ -129,10 +129,10 @@ bool GPU::DoState(StateWrapper& sw)
   sw.Do(&m_draw_mode.texture_page_y);
   sw.Do(&m_draw_mode.texture_palette_x);
   sw.Do(&m_draw_mode.texture_palette_y);
-  sw.Do(&m_draw_mode.texture_window_and_x);
-  sw.Do(&m_draw_mode.texture_window_and_y);
-  sw.Do(&m_draw_mode.texture_window_or_x);
-  sw.Do(&m_draw_mode.texture_window_or_y);
+  sw.Do(&m_draw_mode.texture_window.and_x);
+  sw.Do(&m_draw_mode.texture_window.and_y);
+  sw.Do(&m_draw_mode.texture_window.or_x);
+  sw.Do(&m_draw_mode.texture_window.or_y);
   sw.Do(&m_draw_mode.texture_x_flip);
   sw.Do(&m_draw_mode.texture_y_flip);
 
@@ -1358,10 +1358,10 @@ void GPU::SetTextureWindow(u32 value)
   const u8 offset_y = Truncate8((value >> 15) & UINT32_C(0x1F));
   Log_DebugPrintf("Set texture window %02X %02X %02X %02X", mask_x, mask_y, offset_x, offset_y);
 
-  m_draw_mode.texture_window_and_x = ~(mask_x * 8);
-  m_draw_mode.texture_window_and_y = ~(mask_y * 8);
-  m_draw_mode.texture_window_or_x = (offset_x & mask_x) * 8u;
-  m_draw_mode.texture_window_or_y = (offset_y & mask_y) * 8u;
+  m_draw_mode.texture_window.and_x = ~(mask_x * 8);
+  m_draw_mode.texture_window.and_y = ~(mask_y * 8);
+  m_draw_mode.texture_window.or_x = (offset_x & mask_x) * 8u;
+  m_draw_mode.texture_window.or_y = (offset_y & mask_y) * 8u;
   m_draw_mode.texture_window_value = value;
   m_draw_mode.texture_window_changed = true;
 }
diff --git a/src/core/gpu.h b/src/core/gpu.h
index 979318326..459b2fbff 100644
--- a/src/core/gpu.h
+++ b/src/core/gpu.h
@@ -159,9 +159,6 @@ protected:
   ALWAYS_INLINE static constexpr TickCount SystemTicksToGPUTicks(TickCount sysclk_ticks) { return sysclk_ticks << 1; }
 
   // Helper/format conversion functions.
-  static constexpr u8 Convert5To8(u8 x5) { return (x5 << 3) | (x5 & 7); }
-  static constexpr u8 Convert8To5(u8 x8) { return (x8 >> 3); }
-
   static constexpr u32 RGBA5551ToRGBA8888(u16 color)
   {
     u8 r = Truncate8(color & 31);
@@ -197,68 +194,10 @@ protected:
   {
     return std::make_tuple(static_cast<u8>(rgb24), static_cast<u8>(rgb24 >> 8), static_cast<u8>(rgb24 >> 16));
   }
-  static constexpr u32 PackColorRGB24(u8 r, u8 g, u8 b)
-  {
-    return ZeroExtend32(r) | (ZeroExtend32(g) << 8) | (ZeroExtend32(b) << 16);
-  }
 
   static bool DumpVRAMToFile(const char* filename, u32 width, u32 height, u32 stride, const void* buffer,
                              bool remove_alpha);
 
-  union VRAMPixel
-  {
-    u16 bits;
-
-    BitField<u16, u8, 0, 5> r;
-    BitField<u16, u8, 5, 5> g;
-    BitField<u16, u8, 10, 5> b;
-    BitField<u16, bool, 15, 1> c;
-
-    u8 GetR8() const { return Convert5To8(r); }
-    u8 GetG8() const { return Convert5To8(g); }
-    u8 GetB8() const { return Convert5To8(b); }
-
-    void Set(u8 r_, u8 g_, u8 b_, bool c_ = false)
-    {
-      bits = (ZeroExtend16(r_)) | (ZeroExtend16(g_) << 5) | (ZeroExtend16(b_) << 10) | (static_cast<u16>(c_) << 15);
-    }
-
-    void ClampAndSet(u8 r_, u8 g_, u8 b_, bool c_ = false)
-    {
-      Set(std::min<u8>(r_, 0x1F), std::min<u8>(g_, 0x1F), std::min<u8>(b_, 0x1F), c_);
-    }
-
-    void SetRGB24(u32 rgb24, bool c_ = false)
-    {
-      bits = Truncate16(((rgb24 >> 3) & 0x1F) | (((rgb24 >> 11) & 0x1F) << 5) | (((rgb24 >> 19) & 0x1F) << 10)) |
-             (static_cast<u16>(c_) << 15);
-    }
-
-    void SetRGB24(u8 r8, u8 g8, u8 b8, bool c_ = false)
-    {
-      bits = (ZeroExtend16(r8 >> 3)) | (ZeroExtend16(g8 >> 3) << 5) | (ZeroExtend16(b8 >> 3) << 10) |
-             (static_cast<u16>(c_) << 15);
-    }
-
-    void SetRGB24Dithered(u32 x, u32 y, u8 r8, u8 g8, u8 b8, bool c_ = false)
-    {
-      const s32 offset = DITHER_MATRIX[y & 3][x & 3];
-      r8 = static_cast<u8>(std::clamp<s32>(static_cast<s32>(ZeroExtend32(r8)) + offset, 0, 255));
-      g8 = static_cast<u8>(std::clamp<s32>(static_cast<s32>(ZeroExtend32(g8)) + offset, 0, 255));
-      b8 = static_cast<u8>(std::clamp<s32>(static_cast<s32>(ZeroExtend32(b8)) + offset, 0, 255));
-      SetRGB24(r8, g8, b8, c_);
-    }
-
-    u32 ToRGB24() const
-    {
-      const u32 r_ = ZeroExtend32(r.GetValue());
-      const u32 g_ = ZeroExtend32(g.GetValue());
-      const u32 b_ = ZeroExtend32(b.GetValue());
-
-      return ((r_ << 3) | (r_ & 7)) | (((g_ << 3) | (g_ & 7)) << 8) | (((b_ << 3) | (b_ & 7)) << 16);
-    }
-  };
-
   void SoftReset();
 
   // Sets dots per scanline
@@ -464,10 +403,7 @@ protected:
     u32 texture_page_y;
     u32 texture_palette_x;
     u32 texture_palette_y;
-    u8 texture_window_and_x;
-    u8 texture_window_and_y;
-    u8 texture_window_or_x;
-    u8 texture_window_or_y;
+    GPUTextureWindow texture_window;
     bool texture_x_flip;
     bool texture_y_flip;
     bool texture_page_changed;
diff --git a/src/core/gpu_backend.cpp b/src/core/gpu_backend.cpp
new file mode 100644
index 000000000..7bb67df8a
--- /dev/null
+++ b/src/core/gpu_backend.cpp
@@ -0,0 +1,327 @@
+#include "gpu_backend.h"
+#include "common/log.h"
+#include "common/state_wrapper.h"
+#include "settings.h"
+Log_SetChannel(GPUBackend);
+
+std::unique_ptr<GPUBackend> g_gpu_backend; // Global backend handle. NOTE(review): nothing in gpu_backend.cpp assigns it.
+
+GPUBackend::GPUBackend() = default; // Members rely on their in-class initializers.
+
+GPUBackend::~GPUBackend() = default; // Does not stop the worker itself; Shutdown() is what joins the thread.
+
+bool GPUBackend::Initialize() // Base-class setup for the backend.
+{
+  if (g_settings.gpu_use_thread) // Threaded rendering requested: launch the worker now.
+    StartGPUThread();
+
+  return true; // The base implementation has no failure path.
+}
+
+void GPUBackend::Reset()
+{
+  Sync(); // Wait for queued commands first; the worker also writes m_drawing_area (see HandleCommand).
+  m_drawing_area = {}; // Back to a value-initialized drawing area.
+}
+
+void GPUBackend::UpdateSettings()
+{
+  // Drain queued work first, then switch threading mode only if the setting actually changed.
+  Sync();
+  if (m_use_gpu_thread == g_settings.gpu_use_thread)
+    return;
+
+  if (g_settings.gpu_use_thread)
+    StartGPUThread();
+  else
+    StopGPUThread();
+}
+
+void GPUBackend::Shutdown()
+{
+  StopGPUThread(); // Returns immediately when the worker thread is not in use.
+}
+
+GPUBackendFillVRAMCommand* GPUBackend::NewFillVRAMCommand()
+{
+  // Fixed-size command: reserve FIFO space and stamp type/size. No other field is initialized here.
+  auto* fill_cmd = static_cast<GPUBackendFillVRAMCommand*>(AllocateCommand(sizeof(GPUBackendFillVRAMCommand)));
+  fill_cmd->type = GPUBackendCommandType::FillVRAM;
+  fill_cmd->size = fill_cmd->Size();
+  return fill_cmd;
+}
+
+GPUBackendUpdateVRAMCommand* GPUBackend::NewUpdateVRAMCommand(u32 num_words)
+{
+  const u32 bytes = sizeof(GPUBackendUpdateVRAMCommand) + (num_words * sizeof(u16)); // header + 16-bit payload words
+  auto* const update_cmd = static_cast<GPUBackendUpdateVRAMCommand*>(AllocateCommand(bytes));
+  update_cmd->type = GPUBackendCommandType::UpdateVRAM;
+  update_cmd->size = bytes; // total size, so the consumer can step over the variable-length payload
+  return update_cmd;
+}
+
+GPUBackendCopyVRAMCommand* GPUBackend::NewCopyVRAMCommand()
+{
+  // Fixed-size command: reserve FIFO space and stamp type/size. No other field is initialized here.
+  auto* copy_cmd = static_cast<GPUBackendCopyVRAMCommand*>(AllocateCommand(sizeof(GPUBackendCopyVRAMCommand)));
+  copy_cmd->type = GPUBackendCommandType::CopyVRAM;
+  copy_cmd->size = copy_cmd->Size();
+  return copy_cmd;
+}
+
+GPUBackendSetDrawingAreaCommand* GPUBackend::NewSetDrawingAreaCommand()
+{
+  // Fixed-size command: reserve FIFO space and stamp type/size. The new area itself is not set here.
+  auto* cmd = static_cast<GPUBackendSetDrawingAreaCommand*>(AllocateCommand(sizeof(GPUBackendSetDrawingAreaCommand)));
+  cmd->type = GPUBackendCommandType::SetDrawingArea;
+  cmd->size = cmd->Size();
+  return cmd;
+}
+
+GPUBackendDrawPolygonCommand* GPUBackend::NewDrawPolygonCommand(u32 num_vertices)
+{
+  const u32 len = sizeof(GPUBackendDrawPolygonCommand) + (num_vertices * sizeof(GPUBackendDrawPolygonCommand::Vertex));
+  auto* const polygon = static_cast<GPUBackendDrawPolygonCommand*>(AllocateCommand(len)); // header + vertex array
+  polygon->type = GPUBackendCommandType::DrawPolygon;
+  polygon->size = len;
+  polygon->num_vertices = Truncate16(num_vertices); // count is kept in 16 bits
+  return polygon;
+}
+
+GPUBackendDrawRectangleCommand* GPUBackend::NewDrawRectangleCommand()
+{
+  // Fixed-size command: reserve FIFO space and stamp type/size. No other field is initialized here.
+  auto* cmd = static_cast<GPUBackendDrawRectangleCommand*>(AllocateCommand(sizeof(GPUBackendDrawRectangleCommand)));
+  cmd->type = GPUBackendCommandType::DrawRectangle;
+  cmd->size = cmd->Size();
+  return cmd;
+}
+
+GPUBackendDrawLineCommand* GPUBackend::NewDrawLineCommand(u32 num_vertices)
+{
+  const u32 bytes = sizeof(GPUBackendDrawLineCommand) + (num_vertices * sizeof(GPUBackendDrawLineCommand::Vertex));
+  auto* const line_cmd = static_cast<GPUBackendDrawLineCommand*>(AllocateCommand(bytes)); // header + vertex array
+  line_cmd->type = GPUBackendCommandType::DrawLine;
+  line_cmd->size = bytes;
+  line_cmd->num_vertices = Truncate16(num_vertices); // count is kept in 16 bits
+  return line_cmd;
+}
+
+void* GPUBackend::AllocateCommand(u32 size)
+{
+  // Returns `size` contiguous bytes at the FIFO write pointer, waiting for the consumer if the queue is full.
+  // One spare command header is always kept free so that a Wraparound marker fits at the end of the buffer.
+  const u32 required_size = size + static_cast<u32>(sizeof(GPUBackendCommand));
+  for (;;)
+  {
+    const u32 read_ptr = m_command_fifo_read_ptr.load();
+    const u32 write_ptr = m_command_fifo_write_ptr.load();
+    if (read_ptr > write_ptr)
+    {
+      // The writer has wrapped but the reader has not, so only the gap up to the read pointer is free.
+      if ((read_ptr - write_ptr) < required_size)
+      {
+        // Retry from the top instead of spinning here: once the reader wraps it sits behind us, and a tail
+        // that is too small has to go through the wraparound branch below rather than wait forever.
+        WakeGPUThread();
+        continue;
+      }
+    }
+    else if (required_size > (COMMAND_QUEUE_SIZE - write_ptr))
+    {
+      // Tail too small: leave a marker sending the reader to offset 0, then re-check the space there.
+      // NOTE(review): if read_ptr is still 0 here the queue would look empty after the wrap - confirm.
+      GPUBackendCommand* dummy_cmd = reinterpret_cast<GPUBackendCommand*>(&m_command_fifo_data[write_ptr]);
+      dummy_cmd->type = GPUBackendCommandType::Wraparound;
+      dummy_cmd->size = COMMAND_QUEUE_SIZE - write_ptr;
+      dummy_cmd->params.bits = 0;
+      m_command_fifo_write_ptr.store(0);
+      continue;
+    }
+    return &m_command_fifo_data[write_ptr];
+  }
+}
+
+u32 GPUBackend::GetPendingCommandSize() const
+{
+  const u32 tail = m_command_fifo_read_ptr.load();  // consumer position
+  const u32 head = m_command_fifo_write_ptr.load(); // producer position
+  return (head < tail) ? ((COMMAND_QUEUE_SIZE - tail) + head) : (head - tail); // wrapped : contiguous
+}
+
+void GPUBackend::PushCommand(GPUBackendCommand* cmd)
+{
+  if (!m_use_gpu_thread)
+  {
+    // No worker thread: run the command right away on the calling thread. Sync markers are simply dropped.
+    if (cmd->type != GPUBackendCommandType::Sync)
+      HandleCommand(cmd);
+    return;
+  }
+
+  // Threaded mode: publish the command by moving the write pointer past it, waking the worker once enough is queued.
+  const u32 new_write_ptr = m_command_fifo_write_ptr.fetch_add(cmd->size) + cmd->size;
+  DebugAssert(new_write_ptr <= COMMAND_QUEUE_SIZE);
+  if (GetPendingCommandSize() >= THRESHOLD_TO_WAKE_GPU)
+    WakeGPUThread();
+}
+
+void GPUBackend::WakeGPUThread()
+{
+  // Holding the mutex orders this check against the worker, which sets its sleeping flag under the same lock.
+  std::unique_lock<std::mutex> guard(m_sync_mutex);
+  // Skip the notify unless the worker is actually parked in its idle wait.
+  if (m_gpu_thread_sleeping.load())
+    m_wake_gpu_thread_cv.notify_one();
+}
+
+void GPUBackend::StartGPUThread()
+{
+  m_gpu_loop_done.store(false); // Clear any earlier exit request before the worker can observe it.
+  m_use_gpu_thread = true; // From here on PushCommand() queues instead of executing inline.
+  m_gpu_thread = std::thread(&GPUBackend::RunGPULoop, this); // Worker runs RunGPULoop() on this object.
+  Log_InfoPrint("GPU thread started.");
+}
+
+void GPUBackend::StopGPUThread()
+{
+  if (!m_use_gpu_thread) // Nothing to stop when commands are being executed inline.
+    return;
+
+  m_gpu_loop_done.store(true); // Ask RunGPULoop() to exit.
+  WakeGPUThread(); // The exit flag is only examined in the worker's idle wait, so nudge it if it is parked.
+  m_gpu_thread.join();
+  m_use_gpu_thread = false; // Back to inline execution in PushCommand().
+  Log_InfoPrint("GPU thread stopped.");
+}
+
+void GPUBackend::Sync()
+{
+  if (!m_use_gpu_thread) // Inline mode executes commands as they are pushed, so nothing is outstanding.
+    return;
+
+  GPUBackendSyncCommand* cmd = static_cast<GPUBackendSyncCommand*>(AllocateCommand(sizeof(GPUBackendSyncCommand)));
+  cmd->type = GPUBackendCommandType::Sync;
+  cmd->size = sizeof(GPUBackendSyncCommand);
+  PushCommand(cmd); // Queue a marker behind everything submitted so far.
+  WakeGPUThread(); // PushCommand() only wakes above a size threshold; force it here.
+
+  m_sync_event.Wait(); // Signalled by RunGPULoop() when it reaches the marker.
+  m_sync_event.Reset(); // Re-arm the event for the next Sync().
+}
+
+void GPUBackend::RunGPULoop() // Worker-thread body: drains the command FIFO until m_gpu_loop_done is set.
+{
+  for (;;)
+  {
+    u32 write_ptr = m_command_fifo_write_ptr.load();
+    u32 read_ptr = m_command_fifo_read_ptr.load();
+    if (read_ptr == write_ptr) // Queue is empty: park until there is work or a stop request.
+    {
+      std::unique_lock<std::mutex> lock(m_sync_mutex);
+      m_gpu_thread_sleeping.store(true); // Tells WakeGPUThread() that a notify is needed.
+      m_wake_gpu_thread_cv.wait(lock, [this]() { return m_gpu_loop_done.load() || GetPendingCommandSize() > 0; });
+      m_gpu_thread_sleeping.store(false);
+
+      if (m_gpu_loop_done.load())
+        break;
+      else
+        continue; // Re-read both pointers at the top of the loop.
+    }
+
+    if (write_ptr < read_ptr) // Writer has wrapped: consume up to the end of the buffer first.
+      write_ptr = COMMAND_QUEUE_SIZE;
+
+    while (read_ptr < write_ptr)
+    {
+      const GPUBackendCommand* cmd = reinterpret_cast<const GPUBackendCommand*>(&m_command_fifo_data[read_ptr]);
+      read_ptr += cmd->size; // Step the local cursor past this command before dispatching it.
+
+      switch (cmd->type)
+      {
+        case GPUBackendCommandType::Wraparound:
+        {
+          DebugAssert(read_ptr == COMMAND_QUEUE_SIZE);
+          write_ptr = m_command_fifo_write_ptr.load(); // Pick up whatever was queued after the wrap.
+          read_ptr = 0;
+        }
+        break;
+
+        case GPUBackendCommandType::Sync:
+        {
+          DebugAssert(read_ptr == write_ptr);
+          m_sync_event.Signal(); // Releases the thread blocked in Sync().
+        }
+        break;
+
+        default:
+          HandleCommand(cmd);
+          break;
+      }
+    }
+
+    m_command_fifo_read_ptr.store(read_ptr); // Publish progress only after the whole batch has been consumed.
+  }
+}
+
+void GPUBackend::HandleCommand(const GPUBackendCommand* cmd) // Decodes one queued command and calls the matching virtual.
+{
+  switch (cmd->type)
+  {
+    case GPUBackendCommandType::FillVRAM:
+    {
+      FlushRender(); // Called before every VRAM/drawing-area change below, but not before draw commands.
+      const GPUBackendFillVRAMCommand* ccmd = static_cast<const GPUBackendFillVRAMCommand*>(cmd);
+      FillVRAM(ZeroExtend32(ccmd->x), ZeroExtend32(ccmd->y), ZeroExtend32(ccmd->width), ZeroExtend32(ccmd->height),
+               ccmd->color, ccmd->params);
+    }
+    break;
+
+    case GPUBackendCommandType::UpdateVRAM:
+    {
+      FlushRender();
+      const GPUBackendUpdateVRAMCommand* ccmd = static_cast<const GPUBackendUpdateVRAMCommand*>(cmd);
+      UpdateVRAM(ZeroExtend32(ccmd->x), ZeroExtend32(ccmd->y), ZeroExtend32(ccmd->width), ZeroExtend32(ccmd->height),
+                 ccmd->data, ccmd->params); // ccmd->data is the payload carried with the command.
+    }
+    break;
+
+    case GPUBackendCommandType::CopyVRAM:
+    {
+      FlushRender();
+      const GPUBackendCopyVRAMCommand* ccmd = static_cast<const GPUBackendCopyVRAMCommand*>(cmd);
+      CopyVRAM(ZeroExtend32(ccmd->src_x), ZeroExtend32(ccmd->src_y), ZeroExtend32(ccmd->dst_x),
+               ZeroExtend32(ccmd->dst_y), ZeroExtend32(ccmd->width), ZeroExtend32(ccmd->height), ccmd->params);
+    }
+    break;
+
+    case GPUBackendCommandType::SetDrawingArea:
+    {
+      FlushRender();
+      m_drawing_area = static_cast<const GPUBackendSetDrawingAreaCommand*>(cmd)->new_area;
+      DrawingAreaChanged(); // Notify the implementation after m_drawing_area holds the new value.
+    }
+    break;
+
+    case GPUBackendCommandType::DrawPolygon:
+    {
+      DrawPolygon(static_cast<const GPUBackendDrawPolygonCommand*>(cmd));
+    }
+    break;
+
+    case GPUBackendCommandType::DrawRectangle:
+    {
+      DrawRectangle(static_cast<const GPUBackendDrawRectangleCommand*>(cmd));
+    }
+    break;
+
+    case GPUBackendCommandType::DrawLine:
+    {
+      DrawLine(static_cast<const GPUBackendDrawLineCommand*>(cmd));
+    }
+    break;
+
+    default: // Any other type (e.g. Sync/Wraparound markers) is ignored here.
+      break;
+  }
+}
diff --git a/src/core/gpu_backend.h b/src/core/gpu_backend.h
new file mode 100644
index 000000000..2590b76b9
--- /dev/null
+++ b/src/core/gpu_backend.h
@@ -0,0 +1,91 @@
+#pragma once
+#include "common/event.h"
+#include "common/heap_array.h"
+#include "gpu_types.h"
+#include <atomic>
+#include <condition_variable>
+#include <memory>
+#include <mutex>
+#include <thread>
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4324) // warning C4324: 'GPUBackend': structure was padded due to alignment specifier
+#endif
+
+class GPUBackend // Abstract base for rendering backends: owns a command FIFO and an optional worker thread that drains it.
+{
+public:
+  GPUBackend();
+  virtual ~GPUBackend();
+
+  ALWAYS_INLINE u16* GetVRAM() const { return m_vram_ptr; } // Null unless something outside gpu_backend.cpp sets it.
+
+  virtual bool Initialize(); // Starts the worker thread when g_settings.gpu_use_thread is set.
+  virtual void UpdateSettings(); // Syncs, then starts/stops the worker if the threading setting changed.
+  virtual void Reset(); // Syncs, then clears the drawing area.
+  virtual void Shutdown(); // Stops and joins the worker thread.
+
+  // Each New*Command() reserves FIFO space and sets only the header; submit the result with PushCommand().
+  GPUBackendFillVRAMCommand* NewFillVRAMCommand();
+  GPUBackendUpdateVRAMCommand* NewUpdateVRAMCommand(u32 num_words);
+  GPUBackendCopyVRAMCommand* NewCopyVRAMCommand();
+  GPUBackendSetDrawingAreaCommand* NewSetDrawingAreaCommand();
+  GPUBackendDrawPolygonCommand* NewDrawPolygonCommand(u32 num_vertices);
+  GPUBackendDrawRectangleCommand* NewDrawRectangleCommand();
+  GPUBackendDrawLineCommand* NewDrawLineCommand(u32 num_vertices);
+
+  void PushCommand(GPUBackendCommand* cmd); // Executes inline, or publishes to the worker in threaded mode.
+  void Sync(); // Blocks until the worker has consumed everything queued so far (no-op in inline mode).
+
+  /// Worker-thread entry point: processes queued GPU commands until asked to stop.
+  void RunGPULoop();
+
+protected:
+  void* AllocateCommand(u32 size); // Reserves `size` bytes at the FIFO write pointer.
+  u32 GetPendingCommandSize() const; // Bytes queued but not yet consumed.
+  void WakeGPUThread(); // Notifies the worker if it is parked.
+  void StartGPUThread();
+  void StopGPUThread();
+
+  // Operations a concrete backend must implement; invoked from HandleCommand().
+  virtual void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, GPUBackendCommandParameters params) = 0;
+  virtual void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data,
+                          GPUBackendCommandParameters params) = 0;
+  virtual void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height,
+                        GPUBackendCommandParameters params) = 0;
+  virtual void DrawPolygon(const GPUBackendDrawPolygonCommand* cmd) = 0;
+  virtual void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd) = 0;
+  virtual void DrawLine(const GPUBackendDrawLineCommand* cmd) = 0;
+  virtual void FlushRender() = 0; // Called before VRAM fill/update/copy and drawing-area changes.
+  virtual void DrawingAreaChanged() = 0; // Called after m_drawing_area has been updated.
+
+  void HandleCommand(const GPUBackendCommand* cmd); // Dispatches one command to the virtuals above.
+
+  u16* m_vram_ptr = nullptr;
+
+  Common::Rectangle<u32> m_drawing_area{};
+
+  Common::Event m_sync_event; // Signalled by the worker when it reaches a Sync command.
+  std::atomic_bool m_gpu_thread_sleeping{false}; // True while the worker waits on m_wake_gpu_thread_cv.
+  std::atomic_bool m_gpu_loop_done{false}; // Exit request for RunGPULoop().
+  std::thread m_gpu_thread;
+  bool m_use_gpu_thread = false; // Selects queued (true) versus inline (false) execution in PushCommand().
+
+  std::mutex m_sync_mutex; // Guards the sleeping flag hand-off between WakeGPUThread() and RunGPULoop().
+  std::condition_variable m_sync_cpu_thread_cv; // NOTE(review): not referenced in gpu_backend.cpp.
+  std::condition_variable m_wake_gpu_thread_cv;
+  bool m_sync_done = false; // NOTE(review): not referenced in gpu_backend.cpp.
+
+  enum : u32
+  {
+    COMMAND_QUEUE_SIZE = 4 * 1024 * 1024, // FIFO capacity in bytes.
+    THRESHOLD_TO_WAKE_GPU = 256 // Pending bytes at which PushCommand() wakes the worker.
+  };
+
+  HeapArray<u8, COMMAND_QUEUE_SIZE> m_command_fifo_data;
+  alignas(64) std::atomic<u32> m_command_fifo_read_ptr{0}; // Byte offset of the next command to consume.
+  alignas(64) std::atomic<u32> m_command_fifo_write_ptr{0}; // Byte offset where the next command is written.
+};
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp
index 0145a8da0..63b007cf6 100644
--- a/src/core/gpu_hw.cpp
+++ b/src/core/gpu_hw.cpp
@@ -1004,10 +1004,10 @@ void GPU_HW::DispatchRenderCommand()
   {
     m_draw_mode.ClearTextureWindowChangedFlag();
 
-    m_batch_ubo_data.u_texture_window_and[0] = ZeroExtend32(m_draw_mode.texture_window_and_x);
-    m_batch_ubo_data.u_texture_window_and[1] = ZeroExtend32(m_draw_mode.texture_window_and_y);
-    m_batch_ubo_data.u_texture_window_or[0] = ZeroExtend32(m_draw_mode.texture_window_or_x);
-    m_batch_ubo_data.u_texture_window_or[1] = ZeroExtend32(m_draw_mode.texture_window_or_y);
+    m_batch_ubo_data.u_texture_window_and[0] = ZeroExtend32(m_draw_mode.texture_window.and_x);
+    m_batch_ubo_data.u_texture_window_and[1] = ZeroExtend32(m_draw_mode.texture_window.and_y);
+    m_batch_ubo_data.u_texture_window_or[0] = ZeroExtend32(m_draw_mode.texture_window.or_x);
+    m_batch_ubo_data.u_texture_window_or[1] = ZeroExtend32(m_draw_mode.texture_window.or_y);
     m_batch_ubo_dirty = true;
   }
 
diff --git a/src/core/gpu_sw.cpp b/src/core/gpu_sw.cpp
index 3d383f053..5a69c1cb8 100644
--- a/src/core/gpu_sw.cpp
+++ b/src/core/gpu_sw.cpp
@@ -19,14 +19,23 @@ Log_SetChannel(GPU_SW);
 #endif
 #endif
 
+template<typename T>
+ALWAYS_INLINE static constexpr std::tuple<T, T> MinMax(T v1, T v2)
+{
+  // Orders the pair as (smaller, larger); inputs that are not strictly descending keep their order.
+  const T lower = (v1 > v2) ? v2 : v1;
+  const T upper = (v1 > v2) ? v1 : v2;
+  return std::tuple<T, T>(lower, upper);
+}
+
 GPU_SW::GPU_SW()
 {
-  m_vram.fill(0);
-  m_vram_ptr = m_vram.data();
+  m_vram_ptr = m_backend.GetVRAM();
 }
 
 GPU_SW::~GPU_SW()
 {
+  m_backend.Shutdown();
   if (m_host_display)
     m_host_display->ClearDisplayTexture();
 }
@@ -38,7 +47,7 @@ bool GPU_SW::IsHardwareRenderer() const
 
 bool GPU_SW::Initialize(HostDisplay* host_display)
 {
-  if (!GPU::Initialize(host_display))
+  if (!GPU::Initialize(host_display) || !m_backend.Initialize())
     return false;
 
   static constexpr auto formats_for_16bit = make_array(HostDisplayPixelFormat::RGB565, HostDisplayPixelFormat::RGBA5551,
@@ -70,7 +79,13 @@ void GPU_SW::Reset()
 {
   GPU::Reset();
 
-  m_vram.fill(0);
+  m_backend.Reset();
+}
+
+void GPU_SW::UpdateSettings()
+{
+  GPU::UpdateSettings(); // Base-class settings first.
+  m_backend.UpdateSettings(); // Then the backend, which may start or stop its worker thread.
 }
 
 template<HostDisplayPixelFormat out_format, typename out_type>
@@ -248,7 +263,7 @@ void GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32 width, u32 height, u32 field
     const u32 rows = height >> interlaced_shift;
     dst_stride <<= interlaced_shift;
 
-    const u16* src_ptr = &m_vram[src_y * VRAM_WIDTH + src_x];
+    const u16* src_ptr = &m_vram_ptr[src_y * VRAM_WIDTH + src_x];
     const u32 src_step = VRAM_WIDTH << interleaved_shift;
     for (u32 row = 0; row < rows; row++)
     {
@@ -265,7 +280,7 @@ void GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32 width, u32 height, u32 field
     const u32 end_x = src_x + width;
     for (u32 row = 0; row < rows; row++)
     {
-      const u16* src_row_ptr = &m_vram[(src_y % VRAM_HEIGHT) * VRAM_WIDTH];
+      const u16* src_row_ptr = &m_vram_ptr[(src_y % VRAM_HEIGHT) * VRAM_WIDTH];
       OutputPixelType* dst_row_ptr = reinterpret_cast<OutputPixelType*>(dst_ptr);
       for (u32 col = src_x; col < end_x; col++)
       {
@@ -340,7 +355,7 @@ void GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 heigh
 
   if ((src_x + width) <= VRAM_WIDTH && (src_y + (rows << interleaved_shift)) <= VRAM_HEIGHT)
   {
-    const u8* src_ptr = reinterpret_cast<const u8*>(&m_vram[src_y * VRAM_WIDTH + src_x]) + (skip_x * 3);
+    const u8* src_ptr = reinterpret_cast<const u8*>(&m_vram_ptr[src_y * VRAM_WIDTH + src_x]) + (skip_x * 3);
     const u32 src_stride = (VRAM_WIDTH << interleaved_shift) * sizeof(u16);
     for (u32 row = 0; row < rows; row++)
     {
@@ -400,7 +415,7 @@ void GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32 skip_x, u32 width, u32 heigh
   {
     for (u32 row = 0; row < rows; row++)
     {
-      const u16* src_row_ptr = &m_vram[(src_y % VRAM_HEIGHT) * VRAM_WIDTH];
+      const u16* src_row_ptr = &m_vram_ptr[(src_y % VRAM_HEIGHT) * VRAM_WIDTH];
       OutputPixelType* dst_row_ptr = reinterpret_cast<OutputPixelType*>(dst_ptr);
 
       for (u32 col = 0; col < width; col++)
@@ -475,6 +490,8 @@ void GPU_SW::ClearDisplay()
 void GPU_SW::UpdateDisplay()
 {
   // fill display texture
+  m_backend.Sync();
+
   if (!g_settings.debugging.show_vram)
   {
     if (IsDisplayDisabled())
@@ -530,8 +547,34 @@ void GPU_SW::UpdateDisplay()
   }
 }
 
+void GPU_SW::FillBackendCommandParameters(GPUBackendCommand* cmd) // Snapshots mask/interlace state into cmd->params.
+{
+  cmd->params.bits = 0; // Start from all-zero so any field not set below is cleared.
+  cmd->params.check_mask_before_draw = m_GPUSTAT.check_mask_before_draw;
+  cmd->params.set_mask_while_drawing = m_GPUSTAT.set_mask_while_drawing;
+  cmd->params.active_line_lsb = m_crtc_state.active_line_lsb;
+  cmd->params.interlaced_rendering = IsInterlacedRenderingEnabled(); // Evaluated now, at submission time.
+}
+
+void GPU_SW::FillDrawCommand(GPUBackendDrawCommand* cmd, GPURenderCommand rc) // Fills fields shared by all draw commands.
+{
+  FillBackendCommandParameters(cmd); // Common mask/interlace parameters.
+  cmd->rc.bits = rc.bits; // Raw render command word.
+  cmd->draw_mode.bits = m_draw_mode.mode_reg.bits;
+  cmd->palette.bits = m_draw_mode.palette_reg; // The rectangle path in DispatchRenderCommand() overwrites this.
+  cmd->window = m_draw_mode.texture_window; // Current texture-window AND/OR masks.
+}
+
 void GPU_SW::DispatchRenderCommand()
 {
+  if (m_drawing_area_changed)
+  {
+    GPUBackendSetDrawingAreaCommand* cmd = m_backend.NewSetDrawingAreaCommand();
+    cmd->new_area = m_drawing_area;
+    m_backend.PushCommand(cmd);
+    m_drawing_area_changed = false;
+  }
+
   const GPURenderCommand rc{m_render_command.bits};
   const bool dithering_enable = rc.IsDitheringEnabled() && m_GPUSTAT.dither_enable;
 
@@ -539,80 +582,119 @@ void GPU_SW::DispatchRenderCommand()
   {
     case GPUPrimitive::Polygon:
     {
+      const u32 num_vertices = rc.quad_polygon ? 4 : 3;
+      GPUBackendDrawPolygonCommand* cmd = m_backend.NewDrawPolygonCommand(num_vertices);
+      FillDrawCommand(cmd, rc);
+
       const u32 first_color = rc.color_for_first_vertex;
       const bool shaded = rc.shading_enable;
       const bool textured = rc.texture_enable;
-
-      const u32 num_vertices = rc.quad_polygon ? 4 : 3;
-      std::array<SWVertex, 4> vertices;
       for (u32 i = 0; i < num_vertices; i++)
       {
-        SWVertex& vert = vertices[i];
-        const u32 color_rgb = (shaded && i > 0) ? (FifoPop() & UINT32_C(0x00FFFFFF)) : first_color;
-        vert.r = Truncate8(color_rgb);
-        vert.g = Truncate8(color_rgb >> 8);
-        vert.b = Truncate8(color_rgb >> 16);
-
-        const GPUVertexPosition vp{FifoPop()};
-        vert.x = m_drawing_offset.x + vp.x;
-        vert.y = m_drawing_offset.y + vp.y;
-
-        if (textured)
-        {
-          std::tie(vert.u, vert.v) = UnpackTexcoord(Truncate16(FifoPop()));
-        }
-        else
-        {
-          vert.u = 0;
-          vert.v = 0;
-        }
+        GPUBackendDrawPolygonCommand::Vertex* vert = &cmd->vertices[i];
+        vert->color = (shaded && i > 0) ? (FifoPop() & UINT32_C(0x00FFFFFF)) : first_color;
+        const u64 maddr_and_pos = m_fifo.Pop();
+        const GPUVertexPosition vp{Truncate32(maddr_and_pos)};
+        vert->x = m_drawing_offset.x + vp.x;
+        vert->y = m_drawing_offset.y + vp.y;
+        vert->texcoord = textured ? Truncate16(FifoPop()) : 0;
       }
 
       if (!IsDrawingAreaIsValid())
         return;
 
-      const DrawTriangleFunction DrawFunction = GetDrawTriangleFunction(
-        rc.shading_enable, rc.texture_enable, rc.raw_texture_enable, rc.transparency_enable, dithering_enable);
+      // Cull polygons which are too large.
+      const auto [min_x_12, max_x_12] = MinMax(cmd->vertices[1].x, cmd->vertices[2].x);
+      const auto [min_y_12, max_y_12] = MinMax(cmd->vertices[1].y, cmd->vertices[2].y);
+      const s32 min_x = std::min(min_x_12, cmd->vertices[0].x);
+      const s32 max_x = std::max(max_x_12, cmd->vertices[0].x);
+      const s32 min_y = std::min(min_y_12, cmd->vertices[0].y);
+      const s32 max_y = std::max(max_y_12, cmd->vertices[0].y);
 
-      (this->*DrawFunction)(&vertices[0], &vertices[1], &vertices[2]);
-      if (num_vertices > 3)
-        (this->*DrawFunction)(&vertices[2], &vertices[1], &vertices[3]);
+      if ((max_x - min_x) >= MAX_PRIMITIVE_WIDTH || (max_y - min_y) >= MAX_PRIMITIVE_HEIGHT)
+      {
+        Log_DebugPrintf("Culling too-large polygon: %d,%d %d,%d %d,%d", cmd->vertices[0].x, cmd->vertices[0].y,
+                        cmd->vertices[1].x, cmd->vertices[1].y, cmd->vertices[2].x, cmd->vertices[2].y);
+      }
+      else
+      {
+        AddDrawTriangleTicks(cmd->vertices[0].x, cmd->vertices[0].y, cmd->vertices[1].x, cmd->vertices[1].y,
+                             cmd->vertices[2].x, cmd->vertices[2].y, rc.shading_enable, rc.texture_enable,
+                             rc.transparency_enable);
+      }
+
+      // quads
+      if (rc.quad_polygon)
+      {
+        const s32 min_x_123 = std::min(min_x_12, cmd->vertices[3].x);
+        const s32 max_x_123 = std::max(max_x_12, cmd->vertices[3].x);
+        const s32 min_y_123 = std::min(min_y_12, cmd->vertices[3].y);
+        const s32 max_y_123 = std::max(max_y_12, cmd->vertices[3].y);
+
+        // Cull polygons which are too large.
+        if ((max_x_123 - min_x_123) >= MAX_PRIMITIVE_WIDTH || (max_y_123 - min_y_123) >= MAX_PRIMITIVE_HEIGHT)
+        {
+          Log_DebugPrintf("Culling too-large polygon (quad second half): %d,%d %d,%d %d,%d", cmd->vertices[2].x,
+                          cmd->vertices[2].y, cmd->vertices[1].x, cmd->vertices[1].y, cmd->vertices[0].x,
+                          cmd->vertices[0].y);
+        }
+        else
+        {
+          AddDrawTriangleTicks(cmd->vertices[2].x, cmd->vertices[2].y, cmd->vertices[1].x, cmd->vertices[1].y,
+                               cmd->vertices[3].x, cmd->vertices[3].y, rc.shading_enable, rc.texture_enable,
+                               rc.transparency_enable);
+        }
+      }
+
+      m_backend.PushCommand(cmd);
     }
     break;
 
     case GPUPrimitive::Rectangle:
     {
-      const auto [r, g, b] = UnpackColorRGB24(rc.color_for_first_vertex);
-      const GPUVertexPosition vp{FifoPop()};
-      const u32 texcoord_and_palette = rc.texture_enable ? FifoPop() : 0;
-      const auto [texcoord_x, texcoord_y] = UnpackTexcoord(Truncate16(texcoord_and_palette));
+      GPUBackendDrawRectangleCommand* cmd = m_backend.NewDrawRectangleCommand();
+      FillDrawCommand(cmd, rc);
+      cmd->color = rc.color_for_first_vertex;
+
+      const GPUVertexPosition vp{FifoPop()};
+      cmd->x = TruncateGPUVertexPosition(m_drawing_offset.x + vp.x);
+      cmd->y = TruncateGPUVertexPosition(m_drawing_offset.y + vp.y);
+
+      if (rc.texture_enable)
+      {
+        const u32 texcoord_and_palette = FifoPop();
+        cmd->palette.bits = Truncate16(texcoord_and_palette >> 16);
+        cmd->texcoord = Truncate16(texcoord_and_palette);
+      }
+      else
+      {
+        cmd->palette.bits = 0;
+        cmd->texcoord = 0;
+      }
 
-      u32 width;
-      u32 height;
       switch (rc.rectangle_size)
       {
         case GPUDrawRectangleSize::R1x1:
-          width = 1;
-          height = 1;
+          cmd->width = 1;
+          cmd->height = 1;
           break;
         case GPUDrawRectangleSize::R8x8:
-          width = 8;
-          height = 8;
+          cmd->width = 8;
+          cmd->height = 8;
           break;
         case GPUDrawRectangleSize::R16x16:
-          width = 16;
-          height = 16;
+          cmd->width = 16;
+          cmd->height = 16;
           break;
         default:
         {
           const u32 width_and_height = FifoPop();
-          width = static_cast<u32>(width_and_height & VRAM_WIDTH_MASK);
-          height = static_cast<u32>((width_and_height >> 16) & VRAM_HEIGHT_MASK);
+          cmd->width = static_cast<u16>(width_and_height & VRAM_WIDTH_MASK);
+          cmd->height = static_cast<u16>((width_and_height >> 16) & VRAM_HEIGHT_MASK);
 
-          if (width >= MAX_PRIMITIVE_WIDTH || height >= MAX_PRIMITIVE_HEIGHT)
+          if (cmd->width >= MAX_PRIMITIVE_WIDTH || cmd->height >= MAX_PRIMITIVE_HEIGHT)
           {
-            Log_DebugPrintf("Culling too-large rectangle: %d,%d %dx%d", vp.x.GetValue(), vp.y.GetValue(), width,
-                            height);
+            Log_DebugPrintf("Culling too-large rectangle: %d,%d %dx%d", cmd->x, cmd->y, cmd->width, cmd->height);
             return;
           }
         }
@@ -622,51 +704,123 @@ void GPU_SW::DispatchRenderCommand()
       if (!IsDrawingAreaIsValid())
         return;
 
-      const DrawRectangleFunction DrawFunction =
-        GetDrawRectangleFunction(rc.texture_enable, rc.raw_texture_enable, rc.transparency_enable);
+      const u32 clip_left = static_cast<u32>(std::clamp<s32>(cmd->x, m_drawing_area.left, m_drawing_area.right));
+      const u32 clip_right =
+        static_cast<u32>(std::clamp<s32>(cmd->x + cmd->width, m_drawing_area.left, m_drawing_area.right)) + 1u;
+      const u32 clip_top = static_cast<u32>(std::clamp<s32>(cmd->y, m_drawing_area.top, m_drawing_area.bottom));
+      const u32 clip_bottom =
+        static_cast<u32>(std::clamp<s32>(cmd->y + cmd->height, m_drawing_area.top, m_drawing_area.bottom)) + 1u;
 
-      (this->*DrawFunction)(vp.x, vp.y, width, height, r, g, b, texcoord_x, texcoord_y);
+      // cmd->bounds.Set(Truncate16(clip_left), Truncate16(clip_top), Truncate16(clip_right), Truncate16(clip_bottom));
+      AddDrawRectangleTicks(clip_right - clip_left, clip_bottom - clip_top, rc.texture_enable, rc.transparency_enable);
+
+      m_backend.PushCommand(cmd);
     }
     break;
 
     case GPUPrimitive::Line:
     {
-      const u32 first_color = rc.color_for_first_vertex;
-      const bool shaded = rc.shading_enable;
-
-      const DrawLineFunction DrawFunction = GetDrawLineFunction(shaded, rc.transparency_enable, dithering_enable);
-
-      std::array<SWVertex, 2> vertices = {};
-      u32 buffer_pos = 0;
-
-      // first vertex
-      SWVertex* p0 = &vertices[0];
-      SWVertex* p1 = &vertices[1];
-      p0->SetPosition(GPUVertexPosition{rc.polyline ? m_blit_buffer[buffer_pos++] : Truncate32(FifoPop())},
-                      m_drawing_offset.x, m_drawing_offset.y);
-      p0->SetColorRGB24(first_color);
-
-      // remaining vertices in line strip
-      const u32 num_vertices = rc.polyline ? GetPolyLineVertexCount() : 2;
-      for (u32 i = 1; i < num_vertices; i++)
+      if (!rc.polyline)
       {
-        if (rc.polyline)
+        GPUBackendDrawLineCommand* cmd = m_backend.NewDrawLineCommand(2);
+        FillDrawCommand(cmd, rc);
+        cmd->palette.bits = 0;
+
+        if (rc.shading_enable)
         {
-          p1->SetColorRGB24(shaded ? (m_blit_buffer[buffer_pos++] & UINT32_C(0x00FFFFFF)) : first_color);
-          p1->SetPosition(GPUVertexPosition{m_blit_buffer[buffer_pos++]}, m_drawing_offset.x, m_drawing_offset.y);
+          cmd->vertices[0].color = rc.color_for_first_vertex;
+          const GPUVertexPosition start_pos{FifoPop()};
+          cmd->vertices[0].x = m_drawing_offset.x + start_pos.x;
+          cmd->vertices[0].y = m_drawing_offset.y + start_pos.y;
+
+          cmd->vertices[1].color = FifoPop() & UINT32_C(0x00FFFFFF);
+          const GPUVertexPosition end_pos{FifoPop()};
+          cmd->vertices[1].x = m_drawing_offset.x + end_pos.x;
+          cmd->vertices[1].y = m_drawing_offset.y + end_pos.y;
         }
         else
         {
-          p1->SetColorRGB24(shaded ? (FifoPop() & UINT32_C(0x00FFFFFF)) : first_color);
-          p1->SetPosition(GPUVertexPosition{Truncate32(FifoPop())}, m_drawing_offset.x, m_drawing_offset.y);
+          cmd->vertices[0].color = rc.color_for_first_vertex;
+          cmd->vertices[1].color = rc.color_for_first_vertex;
+
+          const GPUVertexPosition start_pos{FifoPop()};
+          cmd->vertices[0].x = m_drawing_offset.x + start_pos.x;
+          cmd->vertices[0].y = m_drawing_offset.y + start_pos.y;
+
+          const GPUVertexPosition end_pos{FifoPop()};
+          cmd->vertices[1].x = m_drawing_offset.x + end_pos.x;
+          cmd->vertices[1].y = m_drawing_offset.y + end_pos.y;
         }
 
-        // down here because of the FIFO pops
-        if (IsDrawingAreaIsValid())
-          (this->*DrawFunction)(p0, p1);
+        if (!IsDrawingAreaIsValid())
+          return;
 
-        // swap p0/p1 so that the last vertex is used as the first for the next line
-        std::swap(p0, p1);
+        const auto [min_x, max_x] = MinMax(cmd->vertices[0].x, cmd->vertices[1].x);
+        const auto [min_y, max_y] = MinMax(cmd->vertices[0].y, cmd->vertices[1].y);
+        if ((max_x - min_x) >= MAX_PRIMITIVE_WIDTH || (max_y - min_y) >= MAX_PRIMITIVE_HEIGHT)
+        {
+          Log_DebugPrintf("Culling too-large line: %d,%d - %d,%d", cmd->vertices[0].x, cmd->vertices[0].y,
+                          cmd->vertices[1].x, cmd->vertices[1].y);
+          return;
+        }
+        // Clamp the line's bounding box to the drawing area; the clamped extent only feeds the tick count below.
+        const u32 clip_left = static_cast<u32>(std::clamp<s32>(min_x, m_drawing_area.left, m_drawing_area.right));
+        const u32 clip_right = static_cast<u32>(std::clamp<s32>(max_x, m_drawing_area.left, m_drawing_area.right)) + 1u;
+        const u32 clip_top = static_cast<u32>(std::clamp<s32>(min_y, m_drawing_area.top, m_drawing_area.bottom));
+        const u32 clip_bottom =
+          static_cast<u32>(std::clamp<s32>(max_y, m_drawing_area.top, m_drawing_area.bottom)) + 1u;
+        // cmd->bounds.Set(Truncate16(clip_left), Truncate16(clip_top), Truncate16(clip_right),
+        // Truncate16(clip_bottom));
+        AddDrawLineTicks(clip_right - clip_left, clip_bottom - clip_top, rc.shading_enable);
+
+        m_backend.PushCommand(cmd);
+      }
+      else
+      {
+        const u32 num_vertices = GetPolyLineVertexCount();
+
+        GPUBackendDrawLineCommand* cmd = m_backend.NewDrawLineCommand(num_vertices);
+        FillDrawCommand(cmd, m_render_command);
+
+        u32 buffer_pos = 0;
+        const GPUVertexPosition start_vp{m_blit_buffer[buffer_pos++]};
+        cmd->vertices[0].x = start_vp.x + m_drawing_offset.x;
+        cmd->vertices[0].y = start_vp.y + m_drawing_offset.y;
+        cmd->vertices[0].color = m_render_command.color_for_first_vertex;
+        // cmd->bounds.SetInvalid();
+
+        const bool shaded = m_render_command.shading_enable;
+        for (u32 i = 1; i < num_vertices; i++)
+        {
+          cmd->vertices[i].color =
+            shaded ? (m_blit_buffer[buffer_pos++] & UINT32_C(0x00FFFFFF)) : m_render_command.color_for_first_vertex;
+          const GPUVertexPosition vp{m_blit_buffer[buffer_pos++]};
+          cmd->vertices[i].x = m_drawing_offset.x + vp.x;
+          cmd->vertices[i].y = m_drawing_offset.y + vp.y;
+          // Bounding box of the segment from the previous vertex to this one (x against x, y against y).
+          const auto [min_x, max_x] = MinMax(cmd->vertices[i - 1].x, cmd->vertices[i].x);
+          const auto [min_y, max_y] = MinMax(cmd->vertices[i - 1].y, cmd->vertices[i].y);
+          if ((max_x - min_x) >= MAX_PRIMITIVE_WIDTH || (max_y - min_y) >= MAX_PRIMITIVE_HEIGHT)
+          {
+            Log_DebugPrintf("Culling too-large line: %d,%d - %d,%d", cmd->vertices[i - 1].x, cmd->vertices[i - 1].y,
+                            cmd->vertices[i].x, cmd->vertices[i].y);
+          }
+          else
+          {
+            const u32 clip_left = static_cast<u32>(std::clamp<s32>(min_x, m_drawing_area.left, m_drawing_area.right));
+            const u32 clip_right =
+              static_cast<u32>(std::clamp<s32>(max_x, m_drawing_area.left, m_drawing_area.right)) + 1u;
+            const u32 clip_top = static_cast<u32>(std::clamp<s32>(min_y, m_drawing_area.top, m_drawing_area.bottom));
+            const u32 clip_bottom =
+              static_cast<u32>(std::clamp<s32>(max_y, m_drawing_area.top, m_drawing_area.bottom)) + 1u;
+            // Over-sized segments (logged above) add no ticks; drawn segments are charged their clamped extent.
+            // cmd->bounds.Include(Truncate16(clip_left), Truncate16(clip_right), Truncate16(clip_top),
+            // Truncate16(clip_bottom));
+            AddDrawLineTicks(clip_right - clip_left, clip_bottom - clip_top, m_render_command.shading_enable);
+          }
+        }
+
+        m_backend.PushCommand(cmd);
       }
     }
     break;
@@ -677,731 +831,50 @@ void GPU_SW::DispatchRenderCommand()
   }
 }
 
-constexpr GPU_SW::DitherLUT GPU_SW::ComputeDitherLUT()
+void GPU_SW::ReadVRAM(u32 x, u32 y, u32 width, u32 height)
 {
-  DitherLUT lut = {};
-  for (u32 i = 0; i < DITHER_MATRIX_SIZE; i++)
-  {
-    for (u32 j = 0; j < DITHER_MATRIX_SIZE; j++)
-    {
-      for (s32 value = 0; value < DITHER_LUT_SIZE; value++)
-      {
-        const s32 dithered_value = (value + DITHER_MATRIX[i][j]) >> 3;
-        lut[i][j][value] = static_cast<u8>((dithered_value < 0) ? 0 : ((dithered_value > 31) ? 31 : dithered_value));
-      }
-    }
-  }
-  return lut;
+  m_backend.Sync();
 }
 
-static constexpr GPU_SW::DitherLUT s_dither_lut = GPU_SW::ComputeDitherLUT();
-
-template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
-void ALWAYS_INLINE_RELEASE GPU_SW::ShadePixel(u32 x, u32 y, u8 color_r, u8 color_g, u8 color_b, u8 texcoord_x,
-                                              u8 texcoord_y)
+void GPU_SW::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color)
 {
-  VRAMPixel color;
-  bool transparent;
-  if constexpr (texture_enable)
-  {
-    // Apply texture window
-    // TODO: Precompute the second half
-    texcoord_x = (texcoord_x & m_draw_mode.texture_window_and_x) | m_draw_mode.texture_window_or_x;
-    texcoord_y = (texcoord_y & m_draw_mode.texture_window_and_y) | m_draw_mode.texture_window_or_y;
-
-    VRAMPixel texture_color;
-    switch (m_draw_mode.mode_reg.texture_mode)
-    {
-      case GPUTextureMode::Palette4Bit:
-      {
-        const u16 palette_value = GetPixel((m_draw_mode.texture_page_x + ZeroExtend32(texcoord_x / 4)) % VRAM_WIDTH,
-                                           (m_draw_mode.texture_page_y + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
-        const u16 palette_index = (palette_value >> ((texcoord_x % 4) * 4)) & 0x0Fu;
-        texture_color.bits = GetPixel((m_draw_mode.texture_palette_x + ZeroExtend32(palette_index)) % VRAM_WIDTH,
-                                      m_draw_mode.texture_palette_y);
-      }
-      break;
-
-      case GPUTextureMode::Palette8Bit:
-      {
-        const u16 palette_value = GetPixel((m_draw_mode.texture_page_x + ZeroExtend32(texcoord_x / 2)) % VRAM_WIDTH,
-                                           (m_draw_mode.texture_page_y + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
-        const u16 palette_index = (palette_value >> ((texcoord_x % 2) * 8)) & 0xFFu;
-        texture_color.bits = GetPixel((m_draw_mode.texture_palette_x + ZeroExtend32(palette_index)) % VRAM_WIDTH,
-                                      m_draw_mode.texture_palette_y);
-      }
-      break;
-
-      default:
-      {
-        texture_color.bits = GetPixel((m_draw_mode.texture_page_x + ZeroExtend32(texcoord_x)) % VRAM_WIDTH,
-                                      (m_draw_mode.texture_page_y + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
-      }
-      break;
-    }
-
-    if (texture_color.bits == 0)
-      return;
-
-    transparent = texture_color.c;
-
-    if constexpr (raw_texture_enable)
-    {
-      color.bits = texture_color.bits;
-    }
-    else
-    {
-      const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u;
-      const u32 dither_x = (dithering_enable) ? (x & 3u) : 3u;
-
-      color.bits = (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.r) * u16(color_r)) >> 4]) << 0) |
-                   (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.g) * u16(color_g)) >> 4]) << 5) |
-                   (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.b) * u16(color_b)) >> 4]) << 10) |
-                   (texture_color.bits & 0x8000u);
-    }
-  }
-  else
-  {
-    transparent = true;
-
-    const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u;
-    const u32 dither_x = (dithering_enable) ? (x & 3u) : 3u;
-
-    color.bits = (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_r]) << 0) |
-                 (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_g]) << 5) |
-                 (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_b]) << 10);
-  }
-
-  const VRAMPixel bg_color{GetPixel(static_cast<u32>(x), static_cast<u32>(y))};
-  if constexpr (transparency_enable)
-  {
-    if (transparent)
-    {
-#define BLEND_AVERAGE(bg, fg) Truncate8(std::min<u32>((ZeroExtend32(bg) / 2) + (ZeroExtend32(fg) / 2), 0x1F))
-#define BLEND_ADD(bg, fg) Truncate8(std::min<u32>(ZeroExtend32(bg) + ZeroExtend32(fg), 0x1F))
-#define BLEND_SUBTRACT(bg, fg) Truncate8((bg > fg) ? ((bg) - (fg)) : 0)
-#define BLEND_QUARTER(bg, fg) Truncate8(std::min<u32>(ZeroExtend32(bg) + ZeroExtend32(fg / 4), 0x1F))
-
-#define BLEND_RGB(func)                                                                                                \
-  color.Set(func(bg_color.r.GetValue(), color.r.GetValue()), func(bg_color.g.GetValue(), color.g.GetValue()),          \
-            func(bg_color.b.GetValue(), color.b.GetValue()), color.c.GetValue())
-
-      switch (m_draw_mode.mode_reg.transparency_mode)
-      {
-        case GPUTransparencyMode::HalfBackgroundPlusHalfForeground:
-          BLEND_RGB(BLEND_AVERAGE);
-          break;
-        case GPUTransparencyMode::BackgroundPlusForeground:
-          BLEND_RGB(BLEND_ADD);
-          break;
-        case GPUTransparencyMode::BackgroundMinusForeground:
-          BLEND_RGB(BLEND_SUBTRACT);
-          break;
-        case GPUTransparencyMode::BackgroundPlusQuarterForeground:
-          BLEND_RGB(BLEND_QUARTER);
-          break;
-        default:
-          break;
-      }
-
-#undef BLEND_RGB
-
-#undef BLEND_QUARTER
-#undef BLEND_SUBTRACT
-#undef BLEND_ADD
-#undef BLEND_AVERAGE
-    }
-  }
-  else
-  {
-    UNREFERENCED_VARIABLE(transparent);
-  }
-
-  const u16 mask_and = m_GPUSTAT.GetMaskAND();
-  if ((bg_color.bits & mask_and) != 0)
-    return;
-
-  SetPixel(static_cast<u32>(x), static_cast<u32>(y), color.bits | m_GPUSTAT.GetMaskOR());
+  GPUBackendFillVRAMCommand* cmd = m_backend.NewFillVRAMCommand();
+  FillBackendCommandParameters(cmd);
+  cmd->x = static_cast<u16>(x);
+  cmd->y = static_cast<u16>(y);
+  cmd->width = static_cast<u16>(width);
+  cmd->height = static_cast<u16>(height);
+  cmd->color = color;
+  m_backend.PushCommand(cmd);
 }
 
-template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
-void GPU_SW::DrawRectangle(s32 origin_x, s32 origin_y, u32 width, u32 height, u8 r, u8 g, u8 b, u8 origin_texcoord_x,
-                           u8 origin_texcoord_y)
+void GPU_SW::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data)
 {
-  const s32 start_x = TruncateGPUVertexPosition(m_drawing_offset.x + origin_x);
-  const s32 start_y = TruncateGPUVertexPosition(m_drawing_offset.y + origin_y);
+  const u32 num_words = width * height;
+  GPUBackendUpdateVRAMCommand* cmd = m_backend.NewUpdateVRAMCommand(num_words);
+  FillBackendCommandParameters(cmd);
+  cmd->x = static_cast<u16>(x);
+  cmd->y = static_cast<u16>(y);
+  cmd->width = static_cast<u16>(width);
+  cmd->height = static_cast<u16>(height);
+  std::memcpy(cmd->data, data, sizeof(u16) * num_words);
+  m_backend.PushCommand(cmd);
+}
 
-  {
-    const u32 clip_left = static_cast<u32>(std::clamp<s32>(start_x, m_drawing_area.left, m_drawing_area.right));
-    const u32 clip_right =
-      static_cast<u32>(std::clamp<s32>(start_x + static_cast<s32>(width), m_drawing_area.left, m_drawing_area.right)) +
-      1u;
-    const u32 clip_top = static_cast<u32>(std::clamp<s32>(start_y, m_drawing_area.top, m_drawing_area.bottom));
-    const u32 clip_bottom =
-      static_cast<u32>(std::clamp<s32>(start_y + static_cast<s32>(height), m_drawing_area.top, m_drawing_area.bottom)) +
-      1u;
-    AddDrawRectangleTicks(clip_right - clip_left, clip_bottom - clip_top, texture_enable, transparency_enable);
-  }
-
-  for (u32 offset_y = 0; offset_y < height; offset_y++)
-  {
-    const s32 y = start_y + static_cast<s32>(offset_y);
-    if (y < static_cast<s32>(m_drawing_area.top) || y > static_cast<s32>(m_drawing_area.bottom) ||
-        (IsInterlacedRenderingEnabled() && GetActiveLineLSB() == (static_cast<u32>(y) & 1u)))
-    {
-      continue;
-    }
-
-    const u8 texcoord_y = Truncate8(ZeroExtend32(origin_texcoord_y) + offset_y);
-
-    for (u32 offset_x = 0; offset_x < width; offset_x++)
-    {
-      const s32 x = start_x + static_cast<s32>(offset_x);
-      if (x < static_cast<s32>(m_drawing_area.left) || x > static_cast<s32>(m_drawing_area.right))
-        continue;
-
-      const u8 texcoord_x = Truncate8(ZeroExtend32(origin_texcoord_x) + offset_x);
-
-      ShadePixel<texture_enable, raw_texture_enable, transparency_enable, false>(
-        static_cast<u32>(x), static_cast<u32>(y), r, g, b, texcoord_x, texcoord_y);
-    }
-  }
+void GPU_SW::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height)
+{
+  GPUBackendCopyVRAMCommand* cmd = m_backend.NewCopyVRAMCommand();
+  FillBackendCommandParameters(cmd);
+  cmd->src_x = static_cast<u16>(src_x);
+  cmd->src_y = static_cast<u16>(src_y);
+  cmd->dst_x = static_cast<u16>(dst_x);
+  cmd->dst_y = static_cast<u16>(dst_y);
+  cmd->width = static_cast<u16>(width);
+  cmd->height = static_cast<u16>(height);
+  m_backend.PushCommand(cmd);
 }
 
 std::unique_ptr<GPU> GPU::CreateSoftwareRenderer()
 {
   return std::make_unique<GPU_SW>();
 }
-
-//////////////////////////////////////////////////////////////////////////
-// Polygon and line rasterization ported from Mednafen
-//////////////////////////////////////////////////////////////////////////
-
-#define COORD_FBS 12
-#define COORD_MF_INT(n) ((n) << COORD_FBS)
-#define COORD_POST_PADDING 12
-
-static ALWAYS_INLINE_RELEASE s64 MakePolyXFP(s32 x)
-{
-  return ((u64)x << 32) + ((1ULL << 32) - (1 << 11));
-}
-
-static ALWAYS_INLINE_RELEASE s64 MakePolyXFPStep(s32 dx, s32 dy)
-{
-  s64 ret;
-  s64 dx_ex = (u64)dx << 32;
-
-  if (dx_ex < 0)
-    dx_ex -= dy - 1;
-
-  if (dx_ex > 0)
-    dx_ex += dy - 1;
-
-  ret = dx_ex / dy;
-
-  return (ret);
-}
-
-static ALWAYS_INLINE_RELEASE s32 GetPolyXFP_Int(s64 xfp)
-{
-  return (xfp >> 32);
-}
-
-template<bool shading_enable, bool texture_enable>
-bool ALWAYS_INLINE_RELEASE GPU_SW::CalcIDeltas(i_deltas& idl, const SWVertex* A, const SWVertex* B, const SWVertex* C)
-{
-#define CALCIS(x, y) (((B->x - A->x) * (C->y - B->y)) - ((C->x - B->x) * (B->y - A->y)))
-
-  s32 denom = CALCIS(x, y);
-
-  if (!denom)
-    return false;
-
-  if constexpr (shading_enable)
-  {
-    idl.dr_dx = (u32)(CALCIS(r, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
-    idl.dr_dy = (u32)(CALCIS(x, r) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
-
-    idl.dg_dx = (u32)(CALCIS(g, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
-    idl.dg_dy = (u32)(CALCIS(x, g) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
-
-    idl.db_dx = (u32)(CALCIS(b, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
-    idl.db_dy = (u32)(CALCIS(x, b) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
-  }
-
-  if constexpr (texture_enable)
-  {
-    idl.du_dx = (u32)(CALCIS(u, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
-    idl.du_dy = (u32)(CALCIS(x, u) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
-
-    idl.dv_dx = (u32)(CALCIS(v, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
-    idl.dv_dy = (u32)(CALCIS(x, v) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
-  }
-
-  return true;
-
-#undef CALCIS
-}
-
-template<bool shading_enable, bool texture_enable>
-void ALWAYS_INLINE_RELEASE GPU_SW::AddIDeltas_DX(i_group& ig, const i_deltas& idl, u32 count /*= 1*/)
-{
-  if constexpr (shading_enable)
-  {
-    ig.r += idl.dr_dx * count;
-    ig.g += idl.dg_dx * count;
-    ig.b += idl.db_dx * count;
-  }
-
-  if constexpr (texture_enable)
-  {
-    ig.u += idl.du_dx * count;
-    ig.v += idl.dv_dx * count;
-  }
-}
-
-template<bool shading_enable, bool texture_enable>
-void ALWAYS_INLINE_RELEASE GPU_SW::AddIDeltas_DY(i_group& ig, const i_deltas& idl, u32 count /*= 1*/)
-{
-  if constexpr (shading_enable)
-  {
-    ig.r += idl.dr_dy * count;
-    ig.g += idl.dg_dy * count;
-    ig.b += idl.db_dy * count;
-  }
-
-  if constexpr (texture_enable)
-  {
-    ig.u += idl.du_dy * count;
-    ig.v += idl.dv_dy * count;
-  }
-}
-
-template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
-         bool dithering_enable>
-void GPU_SW::DrawSpan(s32 y, s32 x_start, s32 x_bound, i_group ig, const i_deltas& idl)
-{
-  if (IsInterlacedRenderingEnabled() && GetActiveLineLSB() == (static_cast<u32>(y) & 1u))
-    return;
-
-  s32 x_ig_adjust = x_start;
-  s32 w = x_bound - x_start;
-  s32 x = TruncateGPUVertexPosition(x_start);
-
-  if (x < static_cast<s32>(m_drawing_area.left))
-  {
-    s32 delta = static_cast<s32>(m_drawing_area.left) - x;
-    x_ig_adjust += delta;
-    x += delta;
-    w -= delta;
-  }
-
-  if ((x + w) > (static_cast<s32>(m_drawing_area.right) + 1))
-    w = static_cast<s32>(m_drawing_area.right) + 1 - x;
-
-  if (w <= 0)
-    return;
-
-  AddIDeltas_DX<shading_enable, texture_enable>(ig, idl, x_ig_adjust);
-  AddIDeltas_DY<shading_enable, texture_enable>(ig, idl, y);
-
-  do
-  {
-    const u32 r = ig.r >> (COORD_FBS + COORD_POST_PADDING);
-    const u32 g = ig.g >> (COORD_FBS + COORD_POST_PADDING);
-    const u32 b = ig.b >> (COORD_FBS + COORD_POST_PADDING);
-    const u32 u = ig.u >> (COORD_FBS + COORD_POST_PADDING);
-    const u32 v = ig.v >> (COORD_FBS + COORD_POST_PADDING);
-
-    ShadePixel<texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
-      static_cast<u32>(x), static_cast<u32>(y), Truncate8(r), Truncate8(g), Truncate8(b), Truncate8(u), Truncate8(v));
-
-    x++;
-    AddIDeltas_DX<shading_enable, texture_enable>(ig, idl);
-  } while (--w > 0);
-}
-
-template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
-         bool dithering_enable>
-void GPU_SW::DrawTriangle(const SWVertex* v0, const SWVertex* v1, const SWVertex* v2)
-{
-  u32 core_vertex;
-  {
-    u32 cvtemp = 0;
-
-    if (v1->x <= v0->x)
-    {
-      if (v2->x <= v1->x)
-        cvtemp = (1 << 2);
-      else
-        cvtemp = (1 << 1);
-    }
-    else if (v2->x < v0->x)
-      cvtemp = (1 << 2);
-    else
-      cvtemp = (1 << 0);
-
-    if (v2->y < v1->y)
-    {
-      std::swap(v2, v1);
-      cvtemp = ((cvtemp >> 1) & 0x2) | ((cvtemp << 1) & 0x4) | (cvtemp & 0x1);
-    }
-
-    if (v1->y < v0->y)
-    {
-      std::swap(v1, v0);
-      cvtemp = ((cvtemp >> 1) & 0x1) | ((cvtemp << 1) & 0x2) | (cvtemp & 0x4);
-    }
-
-    if (v2->y < v1->y)
-    {
-      std::swap(v2, v1);
-      cvtemp = ((cvtemp >> 1) & 0x2) | ((cvtemp << 1) & 0x4) | (cvtemp & 0x1);
-    }
-
-    core_vertex = cvtemp >> 1;
-  }
-
-  if (v0->y == v2->y)
-    return;
-
-  if (static_cast<u32>(std::abs(v2->x - v0->x)) >= MAX_PRIMITIVE_WIDTH ||
-      static_cast<u32>(std::abs(v2->x - v1->x)) >= MAX_PRIMITIVE_WIDTH ||
-      static_cast<u32>(std::abs(v1->x - v0->x)) >= MAX_PRIMITIVE_WIDTH ||
-      static_cast<u32>(v2->y - v0->y) >= MAX_PRIMITIVE_HEIGHT)
-  {
-    return;
-  }
-
-  AddDrawTriangleTicks(v0->x, v0->y, v1->x, v1->y, v2->x, v2->y, shading_enable, texture_enable, transparency_enable);
-
-  s64 base_coord = MakePolyXFP(v0->x);
-  s64 base_step = MakePolyXFPStep((v2->x - v0->x), (v2->y - v0->y));
-  s64 bound_coord_us;
-  s64 bound_coord_ls;
-  bool right_facing;
-
-  if (v1->y == v0->y)
-  {
-    bound_coord_us = 0;
-    right_facing = (bool)(v1->x > v0->x);
-  }
-  else
-  {
-    bound_coord_us = MakePolyXFPStep((v1->x - v0->x), (v1->y - v0->y));
-    right_facing = (bool)(bound_coord_us > base_step);
-  }
-
-  if (v2->y == v1->y)
-    bound_coord_ls = 0;
-  else
-    bound_coord_ls = MakePolyXFPStep((v2->x - v1->x), (v2->y - v1->y));
-
-  i_deltas idl;
-  if (!CalcIDeltas<shading_enable, texture_enable>(idl, v0, v1, v2))
-    return;
-
-  const SWVertex* vertices[3] = {v0, v1, v2};
-
-  i_group ig;
-  if constexpr (texture_enable)
-  {
-    ig.u = (COORD_MF_INT(vertices[core_vertex]->u) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
-    ig.v = (COORD_MF_INT(vertices[core_vertex]->v) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
-  }
-
-  ig.r = (COORD_MF_INT(vertices[core_vertex]->r) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
-  ig.g = (COORD_MF_INT(vertices[core_vertex]->g) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
-  ig.b = (COORD_MF_INT(vertices[core_vertex]->b) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
-
-  AddIDeltas_DX<shading_enable, texture_enable>(ig, idl, -vertices[core_vertex]->x);
-  AddIDeltas_DY<shading_enable, texture_enable>(ig, idl, -vertices[core_vertex]->y);
-
-  struct TriangleHalf
-  {
-    u64 x_coord[2];
-    u64 x_step[2];
-
-    s32 y_coord;
-    s32 y_bound;
-
-    bool dec_mode;
-  } tripart[2];
-
-  u32 vo = 0;
-  u32 vp = 0;
-  if (core_vertex != 0)
-    vo = 1;
-  if (core_vertex == 2)
-    vp = 3;
-
-  {
-    TriangleHalf* tp = &tripart[vo];
-    tp->y_coord = vertices[0 ^ vo]->y;
-    tp->y_bound = vertices[1 ^ vo]->y;
-    tp->x_coord[right_facing] = MakePolyXFP(vertices[0 ^ vo]->x);
-    tp->x_step[right_facing] = bound_coord_us;
-    tp->x_coord[!right_facing] = base_coord + ((vertices[vo]->y - vertices[0]->y) * base_step);
-    tp->x_step[!right_facing] = base_step;
-    tp->dec_mode = vo;
-  }
-
-  {
-    TriangleHalf* tp = &tripart[vo ^ 1];
-    tp->y_coord = vertices[1 ^ vp]->y;
-    tp->y_bound = vertices[2 ^ vp]->y;
-    tp->x_coord[right_facing] = MakePolyXFP(vertices[1 ^ vp]->x);
-    tp->x_step[right_facing] = bound_coord_ls;
-    tp->x_coord[!right_facing] =
-      base_coord + ((vertices[1 ^ vp]->y - vertices[0]->y) *
-                    base_step); // base_coord + ((vertices[1].y - vertices[0].y) * base_step);
-    tp->x_step[!right_facing] = base_step;
-    tp->dec_mode = vp;
-  }
-
-  for (u32 i = 0; i < 2; i++)
-  {
-    s32 yi = tripart[i].y_coord;
-    s32 yb = tripart[i].y_bound;
-
-    u64 lc = tripart[i].x_coord[0];
-    u64 ls = tripart[i].x_step[0];
-
-    u64 rc = tripart[i].x_coord[1];
-    u64 rs = tripart[i].x_step[1];
-
-    if (tripart[i].dec_mode)
-    {
-      while (yi > yb)
-      {
-        yi--;
-        lc -= ls;
-        rc -= rs;
-
-        s32 y = TruncateGPUVertexPosition(yi);
-
-        if (y < static_cast<s32>(m_drawing_area.top))
-          break;
-
-        if (y > static_cast<s32>(m_drawing_area.bottom))
-          continue;
-
-        DrawSpan<shading_enable, texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
-          yi, GetPolyXFP_Int(lc), GetPolyXFP_Int(rc), ig, idl);
-      }
-    }
-    else
-    {
-      while (yi < yb)
-      {
-        s32 y = TruncateGPUVertexPosition(yi);
-
-        if (y > static_cast<s32>(m_drawing_area.bottom))
-          break;
-
-        if (y >= static_cast<s32>(m_drawing_area.top))
-        {
-
-          DrawSpan<shading_enable, texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
-            yi, GetPolyXFP_Int(lc), GetPolyXFP_Int(rc), ig, idl);
-        }
-
-        yi++;
-        lc += ls;
-        rc += rs;
-      }
-    }
-  }
-}
-
-GPU_SW::DrawTriangleFunction GPU_SW::GetDrawTriangleFunction(bool shading_enable, bool texture_enable,
-                                                             bool raw_texture_enable, bool transparency_enable,
-                                                             bool dithering_enable)
-{
-#define F(SHADING, TEXTURE, RAW_TEXTURE, TRANSPARENCY, DITHERING)                                                      \
-  &GPU_SW::DrawTriangle<SHADING, TEXTURE, RAW_TEXTURE, TRANSPARENCY, DITHERING>
-
-  static constexpr DrawTriangleFunction funcs[2][2][2][2][2] = {
-    {{{{F(false, false, false, false, false), F(false, false, false, false, true)},
-       {F(false, false, false, true, false), F(false, false, false, true, true)}},
-      {{F(false, false, true, false, false), F(false, false, true, false, true)},
-       {F(false, false, true, true, false), F(false, false, true, true, true)}}},
-     {{{F(false, true, false, false, false), F(false, true, false, false, true)},
-       {F(false, true, false, true, false), F(false, true, false, true, true)}},
-      {{F(false, true, true, false, false), F(false, true, true, false, true)},
-       {F(false, true, true, true, false), F(false, true, true, true, true)}}}},
-    {{{{F(true, false, false, false, false), F(true, false, false, false, true)},
-       {F(true, false, false, true, false), F(true, false, false, true, true)}},
-      {{F(true, false, true, false, false), F(true, false, true, false, true)},
-       {F(true, false, true, true, false), F(true, false, true, true, true)}}},
-     {{{F(true, true, false, false, false), F(true, true, false, false, true)},
-       {F(true, true, false, true, false), F(true, true, false, true, true)}},
-      {{F(true, true, true, false, false), F(true, true, true, false, true)},
-       {F(true, true, true, true, false), F(true, true, true, true, true)}}}}};
-
-#undef F
-
-  return funcs[u8(shading_enable)][u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)]
-              [u8(dithering_enable)];
-}
-
-enum
-{
-  Line_XY_FractBits = 32
-};
-enum
-{
-  Line_RGB_FractBits = 12
-};
-
-struct line_fxp_coord
-{
-  u64 x, y;
-  u32 r, g, b;
-};
-
-struct line_fxp_step
-{
-  s64 dx_dk, dy_dk;
-  s32 dr_dk, dg_dk, db_dk;
-};
-
-static ALWAYS_INLINE_RELEASE s64 LineDivide(s64 delta, s32 dk)
-{
-  delta = (u64)delta << Line_XY_FractBits;
-
-  if (delta < 0)
-    delta -= dk - 1;
-  if (delta > 0)
-    delta += dk - 1;
-
-  return (delta / dk);
-}
-
-template<bool shading_enable, bool transparency_enable, bool dithering_enable>
-void GPU_SW::DrawLine(const SWVertex* p0, const SWVertex* p1)
-{
-  const s32 i_dx = std::abs(p1->x - p0->x);
-  const s32 i_dy = std::abs(p1->y - p0->y);
-  const s32 k = (i_dx > i_dy) ? i_dx : i_dy;
-  if (i_dx >= MAX_PRIMITIVE_WIDTH || i_dy >= MAX_PRIMITIVE_HEIGHT)
-    return;
-
-  {
-    // TODO: Move to base class
-    const u32 clip_left =
-      static_cast<u32>(std::clamp<s32>(std::min(p0->x, p1->x), m_drawing_area.left, m_drawing_area.left));
-    const u32 clip_right =
-      static_cast<u32>(std::clamp<s32>(std::max(p0->x, p1->x), m_drawing_area.left, m_drawing_area.right)) + 1u;
-    const u32 clip_top =
-      static_cast<u32>(std::clamp<s32>(std::min(p0->y, p1->y), m_drawing_area.top, m_drawing_area.bottom));
-    const u32 clip_bottom =
-      static_cast<u32>(std::clamp<s32>(std::max(p0->y, p1->y), m_drawing_area.top, m_drawing_area.bottom)) + 1u;
-
-    AddDrawLineTicks(clip_right - clip_left, clip_bottom - clip_top, shading_enable);
-  }
-
-  if (p0->x >= p1->x && k > 0)
-    std::swap(p0, p1);
-
-  line_fxp_step step;
-  if (k == 0)
-  {
-    step.dx_dk = 0;
-    step.dy_dk = 0;
-
-    if constexpr (shading_enable)
-    {
-      step.dr_dk = 0;
-      step.dg_dk = 0;
-      step.db_dk = 0;
-    }
-  }
-  else
-  {
-    step.dx_dk = LineDivide(p1->x - p0->x, k);
-    step.dy_dk = LineDivide(p1->y - p0->y, k);
-
-    if constexpr (shading_enable)
-    {
-      step.dr_dk = (s32)((u32)(p1->r - p0->r) << Line_RGB_FractBits) / k;
-      step.dg_dk = (s32)((u32)(p1->g - p0->g) << Line_RGB_FractBits) / k;
-      step.db_dk = (s32)((u32)(p1->b - p0->b) << Line_RGB_FractBits) / k;
-    }
-  }
-
-  line_fxp_coord cur_point;
-  cur_point.x = ((u64)p0->x << Line_XY_FractBits) | (1ULL << (Line_XY_FractBits - 1));
-  cur_point.y = ((u64)p0->y << Line_XY_FractBits) | (1ULL << (Line_XY_FractBits - 1));
-
-  cur_point.x -= 1024;
-
-  if (step.dy_dk < 0)
-    cur_point.y -= 1024;
-
-  if constexpr (shading_enable)
-  {
-    cur_point.r = (p0->r << Line_RGB_FractBits) | (1 << (Line_RGB_FractBits - 1));
-    cur_point.g = (p0->g << Line_RGB_FractBits) | (1 << (Line_RGB_FractBits - 1));
-    cur_point.b = (p0->b << Line_RGB_FractBits) | (1 << (Line_RGB_FractBits - 1));
-  }
-
-  for (s32 i = 0; i <= k; i++)
-  {
-    // Sign extension is not necessary here for x and y, due to the maximum values that ClipX1 and ClipY1 can contain.
-    const s32 x = (cur_point.x >> Line_XY_FractBits) & 2047;
-    const s32 y = (cur_point.y >> Line_XY_FractBits) & 2047;
-
-    if (!IsInterlacedRenderingEnabled() || GetActiveLineLSB() != (static_cast<u32>(y) & 1u))
-    {
-      const u8 r = shading_enable ? static_cast<u8>(cur_point.r >> Line_RGB_FractBits) : p0->r;
-      const u8 g = shading_enable ? static_cast<u8>(cur_point.g >> Line_RGB_FractBits) : p0->g;
-      const u8 b = shading_enable ? static_cast<u8>(cur_point.b >> Line_RGB_FractBits) : p0->b;
-
-      if (x >= static_cast<s32>(m_drawing_area.left) && x <= static_cast<s32>(m_drawing_area.right) &&
-          y >= static_cast<s32>(m_drawing_area.top) && y <= static_cast<s32>(m_drawing_area.bottom))
-      {
-        ShadePixel<false, false, transparency_enable, dithering_enable>(static_cast<u32>(x), static_cast<u32>(y), r, g,
-                                                                        b, 0, 0);
-      }
-    }
-
-    cur_point.x += step.dx_dk;
-    cur_point.y += step.dy_dk;
-
-    if constexpr (shading_enable)
-    {
-      cur_point.r += step.dr_dk;
-      cur_point.g += step.dg_dk;
-      cur_point.b += step.db_dk;
-    }
-  }
-}
-
-GPU_SW::DrawLineFunction GPU_SW::GetDrawLineFunction(bool shading_enable, bool transparency_enable,
-                                                     bool dithering_enable)
-{
-#define F(SHADING, TRANSPARENCY, DITHERING) &GPU_SW::DrawLine<SHADING, TRANSPARENCY, DITHERING>
-
-  static constexpr DrawLineFunction funcs[2][2][2] = {
-    {{F(false, false, false), F(false, false, true)}, {F(false, true, false), F(false, true, true)}},
-    {{F(true, false, false), F(true, false, true)}, {F(true, true, false), F(true, true, true)}}};
-
-#undef F
-
-  return funcs[u8(shading_enable)][u8(transparency_enable)][u8(dithering_enable)];
-}
-
-GPU_SW::DrawRectangleFunction GPU_SW::GetDrawRectangleFunction(bool texture_enable, bool raw_texture_enable,
-                                                               bool transparency_enable)
-{
-#define F(TEXTURE, RAW_TEXTURE, TRANSPARENCY) &GPU_SW::DrawRectangle<TEXTURE, RAW_TEXTURE, TRANSPARENCY>
-
-  static constexpr DrawRectangleFunction funcs[2][2][2] = {
-    {{F(false, false, false), F(false, false, true)}, {F(false, true, false), F(false, true, true)}},
-    {{F(true, false, false), F(true, false, true)}, {F(true, true, false), F(true, true, true)}}};
-
-#undef F
-
-  return funcs[u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)];
-}
diff --git a/src/core/gpu_sw.h b/src/core/gpu_sw.h
index 0b9326db4..f9b910154 100644
--- a/src/core/gpu_sw.h
+++ b/src/core/gpu_sw.h
@@ -1,6 +1,7 @@
 #pragma once
 #include "common/heap_array.h"
 #include "gpu.h"
+#include "gpu_sw_backend.h"
 #include "host_display.h"
 #include <array>
 #include <memory>
@@ -18,37 +19,14 @@ public:
 
   bool Initialize(HostDisplay* host_display) override;
   void Reset() override;
-
-  ALWAYS_INLINE_RELEASE u16 GetPixel(const u32 x, const u32 y) const { return m_vram[VRAM_WIDTH * y + x]; }
-  ALWAYS_INLINE_RELEASE const u16* GetPixelPtr(const u32 x, const u32 y) const { return &m_vram[VRAM_WIDTH * y + x]; }
-  ALWAYS_INLINE_RELEASE u16* GetPixelPtr(const u32 x, const u32 y) { return &m_vram[VRAM_WIDTH * y + x]; }
-  ALWAYS_INLINE_RELEASE void SetPixel(const u32 x, const u32 y, const u16 value) { m_vram[VRAM_WIDTH * y + x] = value; }
-
-  // this is actually (31 * 255) >> 4) == 494, but to simplify addressing we use the next power of two (512)
-  static constexpr u32 DITHER_LUT_SIZE = 512;
-  using DitherLUT = std::array<std::array<std::array<u8, 512>, DITHER_MATRIX_SIZE>, DITHER_MATRIX_SIZE>;
-  static constexpr DitherLUT ComputeDitherLUT();
+  void UpdateSettings() override;
 
 protected:
-  struct SWVertex
-  {
-    s32 x, y;
-    u8 r, g, b;
-    u8 u, v;
+  void ReadVRAM(u32 x, u32 y, u32 width, u32 height) override;
+  void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color) override;
+  void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data) override;
+  void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) override;
 
-    ALWAYS_INLINE void SetPosition(GPUVertexPosition p, s32 offset_x, s32 offset_y)
-    {
-      x = TruncateGPUVertexPosition(offset_x + p.x);
-      y = TruncateGPUVertexPosition(offset_y + p.y);
-    }
-
-    ALWAYS_INLINE void SetColorRGB24(u32 color) { std::tie(r, g, b) = UnpackColorRGB24(color); }
-    ALWAYS_INLINE void SetTexcoord(u16 value) { std::tie(u, v) = UnpackTexcoord(value); }
-  };
-
-  //////////////////////////////////////////////////////////////////////////
-  // Scanout
-  //////////////////////////////////////////////////////////////////////////
   template<HostDisplayPixelFormat display_format>
   void CopyOut15Bit(u32 src_x, u32 src_y, u32 width, u32 height, u32 field, bool interlaced, bool interleaved);
   void CopyOut15Bit(HostDisplayPixelFormat display_format, u32 src_x, u32 src_y, u32 width, u32 height, u32 field,
@@ -63,71 +41,14 @@ protected:
   void ClearDisplay() override;
   void UpdateDisplay() override;
 
-  //////////////////////////////////////////////////////////////////////////
-  // Rasterization
-  //////////////////////////////////////////////////////////////////////////
-
   void DispatchRenderCommand() override;
 
-  template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
-  void ShadePixel(u32 x, u32 y, u8 color_r, u8 color_g, u8 color_b, u8 texcoord_x, u8 texcoord_y);
+  void FillBackendCommandParameters(GPUBackendCommand* cmd);
+  void FillDrawCommand(GPUBackendDrawCommand* cmd, GPURenderCommand rc);
 
-  template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
-  void DrawRectangle(s32 origin_x, s32 origin_y, u32 width, u32 height, u8 r, u8 g, u8 b, u8 origin_texcoord_x,
-                     u8 origin_texcoord_y);
-
-  using DrawRectangleFunction = void (GPU_SW::*)(s32 origin_x, s32 origin_y, u32 width, u32 height, u8 r, u8 g, u8 b,
-                                                 u8 origin_texcoord_x, u8 origin_texcoord_y);
-  DrawRectangleFunction GetDrawRectangleFunction(bool texture_enable, bool raw_texture_enable,
-                                                 bool transparency_enable);
-
-  //////////////////////////////////////////////////////////////////////////
-  // Polygon and line rasterization ported from Mednafen
-  //////////////////////////////////////////////////////////////////////////
-  struct i_deltas
-  {
-    u32 du_dx, dv_dx;
-    u32 dr_dx, dg_dx, db_dx;
-
-    u32 du_dy, dv_dy;
-    u32 dr_dy, dg_dy, db_dy;
-  };
-
-  struct i_group
-  {
-    u32 u, v;
-    u32 r, g, b;
-  };
-
-  template<bool shading_enable, bool texture_enable>
-  bool CalcIDeltas(i_deltas& idl, const SWVertex* A, const SWVertex* B, const SWVertex* C);
-
-  template<bool shading_enable, bool texture_enable>
-  void AddIDeltas_DX(i_group& ig, const i_deltas& idl, u32 count = 1);
-
-  template<bool shading_enable, bool texture_enable>
-  void AddIDeltas_DY(i_group& ig, const i_deltas& idl, u32 count = 1);
-
-  template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
-           bool dithering_enable>
-  void DrawSpan(s32 y, s32 x_start, s32 x_bound, i_group ig, const i_deltas& idl);
-
-  template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
-           bool dithering_enable>
-  void DrawTriangle(const SWVertex* v0, const SWVertex* v1, const SWVertex* v2);
-
-  using DrawTriangleFunction = void (GPU_SW::*)(const SWVertex* v0, const SWVertex* v1, const SWVertex* v2);
-  DrawTriangleFunction GetDrawTriangleFunction(bool shading_enable, bool texture_enable, bool raw_texture_enable,
-                                               bool transparency_enable, bool dithering_enable);
-
-  template<bool shading_enable, bool transparency_enable, bool dithering_enable>
-  void DrawLine(const SWVertex* p0, const SWVertex* p1);
-
-  using DrawLineFunction = void (GPU_SW::*)(const SWVertex* p0, const SWVertex* p1);
-  DrawLineFunction GetDrawLineFunction(bool shading_enable, bool transparency_enable, bool dithering_enable);
-
-  std::array<u16, VRAM_WIDTH * VRAM_HEIGHT> m_vram;
   HeapArray<u8, VRAM_WIDTH * VRAM_HEIGHT * sizeof(u32)> m_display_texture_buffer;
   HostDisplayPixelFormat m_16bit_display_format = HostDisplayPixelFormat::RGB565;
   HostDisplayPixelFormat m_24bit_display_format = HostDisplayPixelFormat::RGBA8;
+
+  GPU_SW_Backend m_backend;
 };
diff --git a/src/core/gpu_sw_backend.cpp b/src/core/gpu_sw_backend.cpp
new file mode 100644
index 000000000..fc4d2d2f6
--- /dev/null
+++ b/src/core/gpu_sw_backend.cpp
@@ -0,0 +1,928 @@
+#include "gpu_sw_backend.h"
+#include "common/assert.h"
+#include "common/log.h"
+#include "gpu_sw_backend.h"
+#include "host_display.h"
+#include "system.h"
+#include <algorithm>
+Log_SetChannel(GPU_SW_Backend);
+
+GPU_SW_Backend::GPU_SW_Backend() : GPUBackend()
+{
+  m_vram_ptr = m_vram.data(); // m_vram_ptr refers to the storage held by m_vram
+  m_vram.fill(0);             // every VRAM word starts out as zero
+}
+
+GPU_SW_Backend::~GPU_SW_Backend() = default; // compiler-generated destructor; no custom teardown here
+
+bool GPU_SW_Backend::Initialize()
+{
+  return GPUBackend::Initialize(); // no backend-specific setup; the result comes straight from the base class
+}
+
+void GPU_SW_Backend::Reset()
+{
+  GPUBackend::Reset();
+  // The base class is reset first (above); then the whole VRAM array is zeroed.
+  m_vram.fill(0);
+}
+
+void GPU_SW_Backend::DrawPolygon(const GPUBackendDrawPolygonCommand* cmd)
+{
+  // Pick the triangle rasterizer specialization matching this command's render flags.
+  const GPURenderCommand rc{cmd->rc.bits};
+  const bool use_dither = rc.IsDitheringEnabled() && cmd->draw_mode.dither_enable;
+  const DrawTriangleFunction draw_triangle = GetDrawTriangleFunction(
+    rc.shading_enable, rc.texture_enable, rc.raw_texture_enable, rc.transparency_enable, use_dither);
+  // Vertices 0-1-2 form the first triangle; quads add a second one built from vertices 2-1-3.
+  (this->*draw_triangle)(cmd, &cmd->vertices[0], &cmd->vertices[1], &cmd->vertices[2]);
+  if (rc.quad_polygon)
+    (this->*draw_triangle)(cmd, &cmd->vertices[2], &cmd->vertices[1], &cmd->vertices[3]);
+}
+
+void GPU_SW_Backend::DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
+{
+  // The rectangle rasterizer always calls ShadePixel with dithering disabled, so no dithering flag is derived
+  // here; only the texture / raw-texture / transparency bits select the specialization.
+  const GPURenderCommand rc{cmd->rc.bits};
+
+  const DrawRectangleFunction DrawFunction =
+    GetDrawRectangleFunction(rc.texture_enable, rc.raw_texture_enable, rc.transparency_enable);
+  (this->*DrawFunction)(cmd);
+}
+
+void GPU_SW_Backend::DrawLine(const GPUBackendDrawLineCommand* cmd)
+{
+  // Resolve the line rasterizer specialization once; every segment between consecutive vertices shares it.
+  const DrawLineFunction draw_segment =
+    GetDrawLineFunction(cmd->rc.shading_enable, cmd->rc.transparency_enable, cmd->IsDitheringEnabled());
+  for (u32 seg = 0; (seg + 1) < cmd->num_vertices; seg++)
+    (this->*draw_segment)(cmd, &cmd->vertices[seg], &cmd->vertices[seg + 1]);
+}
+
+constexpr GPU_SW_Backend::DitherLUT GPU_SW_Backend::ComputeDitherLUT()
+{
+  // lut[row][col][value] = clamp((value + DITHER_MATRIX[row][col]) >> 3, 0, 31)
+  DitherLUT lut = {};
+  for (u32 row = 0; row < DITHER_MATRIX_SIZE; row++)
+  {
+    for (u32 col = 0; col < DITHER_MATRIX_SIZE; col++)
+    {
+      for (s32 value = 0; value < DITHER_LUT_SIZE; value++)
+      {
+        lut[row][col][value] = static_cast<u8>(std::clamp<s32>((value + DITHER_MATRIX[row][col]) >> 3, 0, 31));
+      }
+    }
+  }
+  return lut;
+}
+// Built at compile time; ShadePixel indexes it as s_dither_lut[dither_y][dither_x][value].
+static constexpr GPU_SW_Backend::DitherLUT s_dither_lut = GPU_SW_Backend::ComputeDitherLUT();
+
+template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
+void ALWAYS_INLINE_RELEASE GPU_SW_Backend::ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y, u8 color_r,
+                                                      u8 color_g, u8 color_b, u8 texcoord_x, u8 texcoord_y)
+{
+  // Shades one pixel at (x, y): optional texture fetch and modulation, blending, then the mask test and VRAM write.
+  VRAMPixel color;
+  bool transparent;
+  if constexpr (texture_enable)
+  {
+    // Apply texture window
+    // TODO: Precompute the second half
+    texcoord_x = (texcoord_x & cmd->window.and_x) | cmd->window.or_x;
+    texcoord_y = (texcoord_y & cmd->window.and_y) | cmd->window.or_y;
+
+    VRAMPixel texture_color;
+    switch (cmd->draw_mode.texture_mode)
+    {
+      case GPUTextureMode::Palette4Bit:
+      {
+        const u16 palette_value =
+          GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x / 4)) % VRAM_WIDTH,
+                   (cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
+        const u16 palette_index = (palette_value >> ((texcoord_x % 4) * 4)) & 0x0Fu;
+
+        const u32 px = (cmd->palette.GetXBase() + ZeroExtend32(palette_index)) % VRAM_WIDTH;
+        const u32 py = cmd->palette.GetYBase();
+        texture_color.bits = GetPixel(px, py);
+      }
+      break;
+
+      case GPUTextureMode::Palette8Bit:
+      {
+        const u16 palette_value =
+          GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x / 2)) % VRAM_WIDTH,
+                   (cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
+        const u16 palette_index = (palette_value >> ((texcoord_x % 2) * 8)) & 0xFFu;
+        texture_color.bits =
+          GetPixel((cmd->palette.GetXBase() + ZeroExtend32(palette_index)) % VRAM_WIDTH, cmd->palette.GetYBase());
+      }
+      break;
+
+      default:
+      {
+        texture_color.bits = GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x)) % VRAM_WIDTH,
+                                      (cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
+      }
+      break;
+    }
+    // A texel whose 16 bits are all zero is never drawn.
+    if (texture_color.bits == 0)
+      return;
+
+    transparent = texture_color.c; // textured pixels are blended only when the texel's c flag is set
+
+    if constexpr (raw_texture_enable)
+    {
+      color.bits = texture_color.bits;
+    }
+    else
+    {
+      const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u;
+      const u32 dither_x = (dithering_enable) ? (x & 3u) : 3u;
+
+      color.bits = (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.r) * u16(color_r)) >> 4]) << 0) |
+                   (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.g) * u16(color_g)) >> 4]) << 5) |
+                   (ZeroExtend16(s_dither_lut[dither_y][dither_x][(u16(texture_color.b) * u16(color_b)) >> 4]) << 10) |
+                   (texture_color.bits & 0x8000u);
+    }
+  }
+  else
+  {
+    transparent = true; // untextured pixels are always blended when transparency_enable is set
+
+    const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u;
+    const u32 dither_x = (dithering_enable) ? (x & 3u) : 3u;
+
+    color.bits = (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_r]) << 0) |
+                 (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_g]) << 5) |
+                 (ZeroExtend16(s_dither_lut[dither_y][dither_x][color_b]) << 10);
+  }
+
+  const VRAMPixel bg_color{GetPixel(static_cast<u32>(x), static_cast<u32>(y))};
+  if constexpr (transparency_enable)
+  {
+    if (transparent)
+    {
+#define BLEND_AVERAGE(bg, fg) Truncate8(std::min<u32>((ZeroExtend32(bg) / 2) + (ZeroExtend32(fg) / 2), 0x1F))
+#define BLEND_ADD(bg, fg) Truncate8(std::min<u32>(ZeroExtend32(bg) + ZeroExtend32(fg), 0x1F))
+#define BLEND_SUBTRACT(bg, fg) Truncate8((bg > fg) ? ((bg) - (fg)) : 0)
+#define BLEND_QUARTER(bg, fg) Truncate8(std::min<u32>(ZeroExtend32(bg) + ZeroExtend32(fg / 4), 0x1F))
+
+#define BLEND_RGB(func)                                                                                                \
+  color.Set(func(bg_color.r.GetValue(), color.r.GetValue()), func(bg_color.g.GetValue(), color.g.GetValue()),          \
+            func(bg_color.b.GetValue(), color.b.GetValue()), color.c.GetValue())
+
+      switch (cmd->draw_mode.transparency_mode)
+      {
+        case GPUTransparencyMode::HalfBackgroundPlusHalfForeground:
+          BLEND_RGB(BLEND_AVERAGE);
+          break;
+        case GPUTransparencyMode::BackgroundPlusForeground:
+          BLEND_RGB(BLEND_ADD);
+          break;
+        case GPUTransparencyMode::BackgroundMinusForeground:
+          BLEND_RGB(BLEND_SUBTRACT);
+          break;
+        case GPUTransparencyMode::BackgroundPlusQuarterForeground:
+          BLEND_RGB(BLEND_QUARTER);
+          break;
+        default:
+          break;
+      }
+
+#undef BLEND_RGB
+
+#undef BLEND_QUARTER
+#undef BLEND_SUBTRACT
+#undef BLEND_ADD
+#undef BLEND_AVERAGE
+    }
+  }
+  else
+  {
+    UNREFERENCED_VARIABLE(transparent);
+  }
+  // Mask test: the existing pixel is kept when it has any GetMaskAND() bit set; GetMaskOR() is OR'd into the write.
+  const u16 mask_and = cmd->params.GetMaskAND();
+  if ((bg_color.bits & mask_and) != 0)
+    return;
+
+  SetPixel(static_cast<u32>(x), static_cast<u32>(y), color.bits | cmd->params.GetMaskOR());
+}
+
+template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
+void GPU_SW_Backend::DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
+{
+  // Rasterizes a rectangle of one colour: each pixel inside the drawing area goes through ShadePixel with dithering
+  // off, and the texture coordinate grows by one per pixel, truncated to 8 bits.
+  const auto [r, g, b] = UnpackColorRGB24(cmd->color);
+  const auto [base_u, base_v] = UnpackTexcoord(cmd->texcoord);
+  const s32 left_edge = cmd->x;
+  const s32 top_edge = cmd->y;
+  const s32 clip_left = static_cast<s32>(m_drawing_area.left);
+  const s32 clip_right = static_cast<s32>(m_drawing_area.right);
+  const s32 clip_top = static_cast<s32>(m_drawing_area.top);
+  const s32 clip_bottom = static_cast<s32>(m_drawing_area.bottom);
+
+  for (u32 row = 0; row < cmd->height; row++)
+  {
+    const s32 y = top_edge + static_cast<s32>(row);
+    if (y < clip_top || y > clip_bottom ||
+        (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u)))
+      continue;
+    const u8 texcoord_y = Truncate8(ZeroExtend32(base_v) + row);
+    for (u32 col = 0; col < cmd->width; col++)
+    {
+      const s32 x = left_edge + static_cast<s32>(col);
+      if (x >= clip_left && x <= clip_right)
+      {
+        ShadePixel<texture_enable, raw_texture_enable, transparency_enable, false>(
+          cmd, static_cast<u32>(x), static_cast<u32>(y), r, g, b, Truncate8(ZeroExtend32(base_u) + col), texcoord_y);
+      }
+    }
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Polygon and line rasterization ported from Mednafen
+//////////////////////////////////////////////////////////////////////////
+
+#define COORD_FBS 12 // shift used by COORD_MF_INT and as the (1 << COORD_FBS) scale in CalcIDeltas
+#define COORD_MF_INT(n) ((n) << COORD_FBS) // shifts n left by COORD_FBS bits
+#define COORD_POST_PADDING 12 // extra left shift applied on top of COORD_FBS; DrawSpan shifts both back out
+
+static ALWAYS_INLINE_RELEASE s64 MakePolyXFP(s32 x)
+{
+  return ((u64)x << 32) + ((1ULL << 32) - (1 << 11)); // x moved into the upper 32 bits plus a (2^32 - 2^11) bias
+}
+
+static ALWAYS_INLINE_RELEASE s64 MakePolyXFPStep(s32 dx, s32 dy)
+{
+  // Edge step per unit of Y: dx is shifted into the upper 32 bits (as in MakePolyXFP) and divided by dy.
+  s64 scaled_dx = (u64)dx << 32;
+
+  // Bias the numerator by (dy - 1) in the direction of its sign before the truncating division.
+  if (scaled_dx < 0)
+    scaled_dx -= dy - 1;
+
+  if (scaled_dx > 0)
+    scaled_dx += dy - 1;
+
+  // Callers in this file only pass a non-zero dy.
+  return scaled_dx / dy;
+}
+
+static ALWAYS_INLINE_RELEASE s32 GetPolyXFP_Int(s64 xfp)
+{
+  return (xfp >> 32); // keeps the upper 32 bits, i.e. drops the part below the << 32 used by MakePolyXFP
+}
+
+template<bool shading_enable, bool texture_enable>
+bool ALWAYS_INLINE_RELEASE GPU_SW_Backend::CalcIDeltas(i_deltas& idl, const GPUBackendDrawPolygonCommand::Vertex* A,
+                                                       const GPUBackendDrawPolygonCommand::Vertex* B,
+                                                       const GPUBackendDrawPolygonCommand::Vertex* C)
+{
+#define CALCIS(x, y) (((B->x - A->x) * (C->y - B->y)) - ((C->x - B->x) * (B->y - A->y)))
+  // CALCIS(p, q): cross product of edges A->B and B->C over vertex members p and q; all gradients derive from it.
+  s32 denom = CALCIS(x, y);
+  // Returns false, leaving idl untouched, when the (x, y) cross product is zero.
+  if (!denom)
+    return false;
+  // Each gradient is scaled by (1 << COORD_FBS) before the division, then shifted left by COORD_POST_PADDING.
+  if constexpr (shading_enable)
+  {
+    idl.dr_dx = (u32)(CALCIS(r, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
+    idl.dr_dy = (u32)(CALCIS(x, r) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
+
+    idl.dg_dx = (u32)(CALCIS(g, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
+    idl.dg_dy = (u32)(CALCIS(x, g) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
+
+    idl.db_dx = (u32)(CALCIS(b, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
+    idl.db_dy = (u32)(CALCIS(x, b) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
+  }
+  // Texture gradients are only written when texture_enable is set; otherwise du_*/dv_* stay as they were.
+  if constexpr (texture_enable)
+  {
+    idl.du_dx = (u32)(CALCIS(u, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
+    idl.du_dy = (u32)(CALCIS(x, u) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
+
+    idl.dv_dx = (u32)(CALCIS(v, y) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
+    idl.dv_dy = (u32)(CALCIS(x, v) * (1 << COORD_FBS) / denom) << COORD_POST_PADDING;
+  }
+
+  return true;
+
+#undef CALCIS
+}
+
+template<bool shading_enable, bool texture_enable>
+void ALWAYS_INLINE_RELEASE GPU_SW_Backend::AddIDeltas_DX(i_group& ig, const i_deltas& idl, u32 count /*= 1*/)
+{
+  if constexpr (shading_enable) // colour interpolants advance by `count` X steps
+  {
+    ig.r += idl.dr_dx * count;
+    ig.g += idl.dg_dx * count;
+    ig.b += idl.db_dx * count;
+  }
+  // Same `count`-step advance for the texture coordinates; all math here is unsigned 32-bit and wraps modulo 2^32.
+  if constexpr (texture_enable)
+  {
+    ig.u += idl.du_dx * count;
+    ig.v += idl.dv_dx * count;
+  }
+}
+
+template<bool shading_enable, bool texture_enable>
+void ALWAYS_INLINE_RELEASE GPU_SW_Backend::AddIDeltas_DY(i_group& ig, const i_deltas& idl, u32 count /*= 1*/)
+{
+  if constexpr (shading_enable) // colour interpolants advance by `count` Y steps
+  {
+    ig.r += idl.dr_dy * count;
+    ig.g += idl.dg_dy * count;
+    ig.b += idl.db_dy * count;
+  }
+  // Same `count`-step advance for the texture coordinates; all math here is unsigned 32-bit and wraps modulo 2^32.
+  if constexpr (texture_enable)
+  {
+    ig.u += idl.du_dy * count;
+    ig.v += idl.dv_dy * count;
+  }
+}
+
+template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
+         bool dithering_enable>
+void GPU_SW_Backend::DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound, i_group ig,
+                              const i_deltas& idl)
+{
+  if (cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u))
+    return;
+  // x_ig_adjust is the untruncated X used to advance the interpolants; x is the truncated position that gets shaded.
+  s32 x_ig_adjust = x_start;
+  s32 w = x_bound - x_start;
+  s32 x = TruncateGPUVertexPosition(x_start);
+  // Clip the left end of the span to the drawing area, moving the interpolant offset by the same amount.
+  if (x < static_cast<s32>(m_drawing_area.left))
+  {
+    s32 delta = static_cast<s32>(m_drawing_area.left) - x;
+    x_ig_adjust += delta;
+    x += delta;
+    w -= delta;
+  }
+  // Clip the right end; the drawing area's right column is inclusive, x_bound is exclusive.
+  if ((x + w) > (static_cast<s32>(m_drawing_area.right) + 1))
+    w = static_cast<s32>(m_drawing_area.right) + 1 - x;
+
+  if (w <= 0)
+    return;
+  // ig was passed by value, so advancing it to this span's first pixel does not affect the caller's copy.
+  AddIDeltas_DX<shading_enable, texture_enable>(ig, idl, x_ig_adjust);
+  AddIDeltas_DY<shading_enable, texture_enable>(ig, idl, y);
+
+  do
+  {
+    const u32 r = ig.r >> (COORD_FBS + COORD_POST_PADDING);
+    const u32 g = ig.g >> (COORD_FBS + COORD_POST_PADDING);
+    const u32 b = ig.b >> (COORD_FBS + COORD_POST_PADDING);
+    const u32 u = ig.u >> (COORD_FBS + COORD_POST_PADDING);
+    const u32 v = ig.v >> (COORD_FBS + COORD_POST_PADDING);
+    // Note: u and v are extracted from ig unconditionally, even when texture_enable is false.
+    ShadePixel<texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
+      cmd, static_cast<u32>(x), static_cast<u32>(y), Truncate8(r), Truncate8(g), Truncate8(b), Truncate8(u),
+      Truncate8(v));
+
+    x++;
+    AddIDeltas_DX<shading_enable, texture_enable>(ig, idl);
+  } while (--w > 0);
+}
+
+template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
+         bool dithering_enable>
+void GPU_SW_Backend::DrawTriangle(const GPUBackendDrawPolygonCommand* cmd,
+                                  const GPUBackendDrawPolygonCommand::Vertex* v0,
+                                  const GPUBackendDrawPolygonCommand::Vertex* v1,
+                                  const GPUBackendDrawPolygonCommand::Vertex* v2)
+{
+  u32 core_vertex;
+  {
+    u32 cvtemp = 0;
+    // cvtemp is a one-hot mask picking one of v0/v1/v2 from their X ordering; each swap below permutes its bits.
+    if (v1->x <= v0->x)
+    {
+      if (v2->x <= v1->x)
+        cvtemp = (1 << 2);
+      else
+        cvtemp = (1 << 1);
+    }
+    else if (v2->x < v0->x)
+      cvtemp = (1 << 2);
+    else
+      cvtemp = (1 << 0);
+    // Sort the vertices by ascending y (v0 ends with the smallest y), keeping the mask bit with its vertex.
+    if (v2->y < v1->y)
+    {
+      std::swap(v2, v1);
+      cvtemp = ((cvtemp >> 1) & 0x2) | ((cvtemp << 1) & 0x4) | (cvtemp & 0x1);
+    }
+
+    if (v1->y < v0->y)
+    {
+      std::swap(v1, v0);
+      cvtemp = ((cvtemp >> 1) & 0x1) | ((cvtemp << 1) & 0x2) | (cvtemp & 0x4);
+    }
+
+    if (v2->y < v1->y)
+    {
+      std::swap(v2, v1);
+      cvtemp = ((cvtemp >> 1) & 0x2) | ((cvtemp << 1) & 0x4) | (cvtemp & 0x1);
+    }
+
+    core_vertex = cvtemp >> 1;
+  }
+  // Triangles whose first and last vertex share the same y are skipped.
+  if (v0->y == v2->y)
+    return;
+  // Primitives reaching the maximum width/height are dropped entirely.
+  if (static_cast<u32>(std::abs(v2->x - v0->x)) >= MAX_PRIMITIVE_WIDTH ||
+      static_cast<u32>(std::abs(v2->x - v1->x)) >= MAX_PRIMITIVE_WIDTH ||
+      static_cast<u32>(std::abs(v1->x - v0->x)) >= MAX_PRIMITIVE_WIDTH ||
+      static_cast<u32>(v2->y - v0->y) >= MAX_PRIMITIVE_HEIGHT)
+  {
+    return;
+  }
+  // base_* follows the v0->v2 edge; bound_coord_us / bound_coord_ls are the steps of the v0->v1 and v1->v2 edges.
+  s64 base_coord = MakePolyXFP(v0->x);
+  s64 base_step = MakePolyXFPStep((v2->x - v0->x), (v2->y - v0->y));
+  s64 bound_coord_us;
+  s64 bound_coord_ls;
+  bool right_facing;
+
+  if (v1->y == v0->y)
+  {
+    bound_coord_us = 0;
+    right_facing = (bool)(v1->x > v0->x);
+  }
+  else
+  {
+    bound_coord_us = MakePolyXFPStep((v1->x - v0->x), (v1->y - v0->y));
+    right_facing = (bool)(bound_coord_us > base_step);
+  }
+
+  if (v2->y == v1->y)
+    bound_coord_ls = 0;
+  else
+    bound_coord_ls = MakePolyXFPStep((v2->x - v1->x), (v2->y - v1->y));
+
+  i_deltas idl;
+  if (!CalcIDeltas<shading_enable, texture_enable>(idl, v0, v1, v2))
+    return;
+
+  const GPUBackendDrawPolygonCommand::Vertex* vertices[3] = {v0, v1, v2};
+  // Value-initialised: u/v are only assigned when texturing, yet DrawSpan copies ig and reads every field of it.
+  i_group ig = {};
+  if constexpr (texture_enable)
+  {
+    ig.u = (COORD_MF_INT(vertices[core_vertex]->u) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
+    ig.v = (COORD_MF_INT(vertices[core_vertex]->v) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
+  }
+
+  ig.r = (COORD_MF_INT(vertices[core_vertex]->r) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
+  ig.g = (COORD_MF_INT(vertices[core_vertex]->g) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
+  ig.b = (COORD_MF_INT(vertices[core_vertex]->b) + (1 << (COORD_FBS - 1))) << COORD_POST_PADDING;
+  // Rebase the interpolants from the core vertex to coordinate (0, 0); the negated values wrap as u32 counts.
+  AddIDeltas_DX<shading_enable, texture_enable>(ig, idl, -vertices[core_vertex]->x);
+  AddIDeltas_DY<shading_enable, texture_enable>(ig, idl, -vertices[core_vertex]->y);
+
+  struct TriangleHalf
+  {
+    u64 x_coord[2];
+    u64 x_step[2];
+
+    s32 y_coord;
+    s32 y_bound;
+
+    bool dec_mode;
+  } tripart[2];
+
+  u32 vo = 0;
+  u32 vp = 0;
+  if (core_vertex != 0)
+    vo = 1;
+  if (core_vertex == 2)
+    vp = 3;
+
+  {
+    TriangleHalf* tp = &tripart[vo];
+    tp->y_coord = vertices[0 ^ vo]->y;
+    tp->y_bound = vertices[1 ^ vo]->y;
+    tp->x_coord[right_facing] = MakePolyXFP(vertices[0 ^ vo]->x);
+    tp->x_step[right_facing] = bound_coord_us;
+    tp->x_coord[!right_facing] = base_coord + ((vertices[vo]->y - vertices[0]->y) * base_step);
+    tp->x_step[!right_facing] = base_step;
+    tp->dec_mode = vo;
+  }
+
+  {
+    TriangleHalf* tp = &tripart[vo ^ 1];
+    tp->y_coord = vertices[1 ^ vp]->y;
+    tp->y_bound = vertices[2 ^ vp]->y;
+    tp->x_coord[right_facing] = MakePolyXFP(vertices[1 ^ vp]->x);
+    tp->x_step[right_facing] = bound_coord_ls;
+    tp->x_coord[!right_facing] =
+      base_coord + ((vertices[1 ^ vp]->y - vertices[0]->y) *
+                    base_step); // base_coord + ((vertices[1].y - vertices[0].y) * base_step);
+    tp->x_step[!right_facing] = base_step;
+    tp->dec_mode = vp;
+  }
+  // Walk both halves: dec_mode halves decrement Y from y_coord towards y_bound, the others increment it.
+  for (u32 i = 0; i < 2; i++)
+  {
+    s32 yi = tripart[i].y_coord;
+    s32 yb = tripart[i].y_bound;
+
+    u64 lc = tripart[i].x_coord[0];
+    u64 ls = tripart[i].x_step[0];
+
+    u64 rc = tripart[i].x_coord[1];
+    u64 rs = tripart[i].x_step[1];
+
+    if (tripart[i].dec_mode)
+    {
+      while (yi > yb)
+      {
+        yi--;
+        lc -= ls;
+        rc -= rs;
+
+        s32 y = TruncateGPUVertexPosition(yi);
+
+        if (y < static_cast<s32>(m_drawing_area.top))
+          break;
+
+        if (y > static_cast<s32>(m_drawing_area.bottom))
+          continue;
+
+        DrawSpan<shading_enable, texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
+          cmd, yi, GetPolyXFP_Int(lc), GetPolyXFP_Int(rc), ig, idl);
+      }
+    }
+    else
+    {
+      while (yi < yb)
+      {
+        s32 y = TruncateGPUVertexPosition(yi);
+
+        if (y > static_cast<s32>(m_drawing_area.bottom))
+          break;
+
+        if (y >= static_cast<s32>(m_drawing_area.top))
+        {
+
+          DrawSpan<shading_enable, texture_enable, raw_texture_enable, transparency_enable, dithering_enable>(
+            cmd, yi, GetPolyXFP_Int(lc), GetPolyXFP_Int(rc), ig, idl);
+        }
+
+        yi++;
+        lc += ls;
+        rc += rs;
+      }
+    }
+  }
+}
+
+GPU_SW_Backend::DrawTriangleFunction GPU_SW_Backend::GetDrawTriangleFunction(bool shading_enable, bool texture_enable,
+                                                                             bool raw_texture_enable,
+                                                                             bool transparency_enable,
+                                                                             bool dithering_enable)
+{
+#define F(SHADING, TEXTURE, RAW_TEXTURE, TRANSPARENCY, DITHERING)                                                      \
+  &GPU_SW_Backend::DrawTriangle<SHADING, TEXTURE, RAW_TEXTURE, TRANSPARENCY, DITHERING>
+  // One DrawTriangle instantiation per flag combination: [shading][texture][raw_texture][transparency][dithering].
+  static constexpr DrawTriangleFunction funcs[2][2][2][2][2] = {
+    {{{{F(false, false, false, false, false), F(false, false, false, false, true)},
+       {F(false, false, false, true, false), F(false, false, false, true, true)}},
+      {{F(false, false, true, false, false), F(false, false, true, false, true)},
+       {F(false, false, true, true, false), F(false, false, true, true, true)}}},
+     {{{F(false, true, false, false, false), F(false, true, false, false, true)},
+       {F(false, true, false, true, false), F(false, true, false, true, true)}},
+      {{F(false, true, true, false, false), F(false, true, true, false, true)},
+       {F(false, true, true, true, false), F(false, true, true, true, true)}}}},
+    {{{{F(true, false, false, false, false), F(true, false, false, false, true)},
+       {F(true, false, false, true, false), F(true, false, false, true, true)}},
+      {{F(true, false, true, false, false), F(true, false, true, false, true)},
+       {F(true, false, true, true, false), F(true, false, true, true, true)}}},
+     {{{F(true, true, false, false, false), F(true, true, false, false, true)},
+       {F(true, true, false, true, false), F(true, true, false, true, true)}},
+      {{F(true, true, true, false, false), F(true, true, true, false, true)},
+       {F(true, true, true, true, false), F(true, true, true, true, true)}}}}};
+
+#undef F
+  // Each bool is converted to a 0/1 index, in the same order as the table dimensions above.
+  return funcs[u8(shading_enable)][u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)]
+              [u8(dithering_enable)];
+}
+
+enum
+{
+  Line_XY_FractBits = 32 // shift applied to line X/Y positions and steps (see LineDivide and DrawLine)
+};
+enum
+{
+  Line_RGB_FractBits = 12 // shift applied to line colour values and steps in DrawLine
+};
+
+struct line_fxp_coord
+{
+  u64 x, y;    // current position, scaled by Line_XY_FractBits
+  u32 r, g, b; // current colour, scaled by Line_RGB_FractBits
+};
+
+struct line_fxp_step
+{
+  s64 dx_dk, dy_dk;        // signed position increment added per step
+  s32 dr_dk, dg_dk, db_dk; // signed colour increment added per step
+};
+
+static ALWAYS_INLINE_RELEASE s64 LineDivide(s64 delta, s32 dk)
+{
+  // Shift delta left by Line_XY_FractBits, bias the numerator by (dk - 1) in its sign's direction, divide by dk.
+  s64 scaled = (u64)delta << Line_XY_FractBits;
+  if (scaled < 0)
+    scaled -= dk - 1;
+  if (scaled > 0)
+    scaled += dk - 1;
+
+  return (scaled / dk);
+}
+
+template<bool shading_enable, bool transparency_enable, bool dithering_enable>
+void GPU_SW_Backend::DrawLine(const GPUBackendDrawLineCommand* cmd, const GPUBackendDrawLineCommand::Vertex* p0,
+                              const GPUBackendDrawLineCommand::Vertex* p1)
+{
+  const s32 i_dx = std::abs(p1->x - p0->x);
+  const s32 i_dy = std::abs(p1->y - p0->y);
+  const s32 k = (i_dx > i_dy) ? i_dx : i_dy; // number of steps: the larger of the two axis extents
+  if (i_dx >= MAX_PRIMITIVE_WIDTH || i_dy >= MAX_PRIMITIVE_HEIGHT)
+    return;
+  // Swap the endpoints so that p0->x <= p1->x afterwards; a single point (k == 0) is left as is.
+  if (p0->x >= p1->x && k > 0)
+    std::swap(p0, p1);
+  // Per-step increments: each of the k steps moves by delta / k in fixed point.
+  line_fxp_step step;
+  if (k == 0)
+  {
+    step.dx_dk = 0;
+    step.dy_dk = 0;
+
+    if constexpr (shading_enable)
+    {
+      step.dr_dk = 0;
+      step.dg_dk = 0;
+      step.db_dk = 0;
+    }
+  }
+  else
+  {
+    step.dx_dk = LineDivide(p1->x - p0->x, k);
+    step.dy_dk = LineDivide(p1->y - p0->y, k);
+
+    if constexpr (shading_enable)
+    {
+      step.dr_dk = (s32)((u32)(p1->r - p0->r) << Line_RGB_FractBits) / k;
+      step.dg_dk = (s32)((u32)(p1->g - p0->g) << Line_RGB_FractBits) / k;
+      step.db_dk = (s32)((u32)(p1->b - p0->b) << Line_RGB_FractBits) / k;
+    }
+  }
+  // Start at p0 with the bit just below the integer part set (a half-unit offset).
+  line_fxp_coord cur_point;
+  cur_point.x = ((u64)p0->x << Line_XY_FractBits) | (1ULL << (Line_XY_FractBits - 1));
+  cur_point.y = ((u64)p0->y << Line_XY_FractBits) | (1ULL << (Line_XY_FractBits - 1));
+  // A small bias of 1024 is taken off x always, and off y only when dy_dk is negative.
+  cur_point.x -= 1024;
+
+  if (step.dy_dk < 0)
+    cur_point.y -= 1024;
+
+  if constexpr (shading_enable)
+  {
+    cur_point.r = (p0->r << Line_RGB_FractBits) | (1 << (Line_RGB_FractBits - 1));
+    cur_point.g = (p0->g << Line_RGB_FractBits) | (1 << (Line_RGB_FractBits - 1));
+    cur_point.b = (p0->b << Line_RGB_FractBits) | (1 << (Line_RGB_FractBits - 1));
+  }
+  // k + 1 positions are visited, so both endpoints are included.
+  for (s32 i = 0; i <= k; i++)
+  {
+    // Sign extension is not necessary here for x and y, due to the maximum values that ClipX1 and ClipY1 can contain.
+    const s32 x = (cur_point.x >> Line_XY_FractBits) & 2047;
+    const s32 y = (cur_point.y >> Line_XY_FractBits) & 2047;
+
+    if ((!cmd->params.interlaced_rendering || cmd->params.active_line_lsb != (Truncate8(static_cast<u32>(y)) & 1u)) &&
+        x >= static_cast<s32>(m_drawing_area.left) && x <= static_cast<s32>(m_drawing_area.right) &&
+        y >= static_cast<s32>(m_drawing_area.top) && y <= static_cast<s32>(m_drawing_area.bottom))
+    {
+      const u8 r = shading_enable ? static_cast<u8>(cur_point.r >> Line_RGB_FractBits) : p0->r;
+      const u8 g = shading_enable ? static_cast<u8>(cur_point.g >> Line_RGB_FractBits) : p0->g;
+      const u8 b = shading_enable ? static_cast<u8>(cur_point.b >> Line_RGB_FractBits) : p0->b;
+      // Lines are never textured: the texture flags are false and the texcoords are passed as 0.
+      ShadePixel<false, false, transparency_enable, dithering_enable>(cmd, static_cast<u32>(x), static_cast<u32>(y), r,
+                                                                      g, b, 0, 0);
+    }
+
+    cur_point.x += step.dx_dk;
+    cur_point.y += step.dy_dk;
+
+    if constexpr (shading_enable)
+    {
+      cur_point.r += step.dr_dk;
+      cur_point.g += step.dg_dk;
+      cur_point.b += step.db_dk;
+    }
+  }
+}
+
+GPU_SW_Backend::DrawLineFunction GPU_SW_Backend::GetDrawLineFunction(bool shading_enable, bool transparency_enable,
+                                                                     bool dithering_enable)
+{
+#define F(SHADING, TRANSPARENCY, DITHERING) &GPU_SW_Backend::DrawLine<SHADING, TRANSPARENCY, DITHERING>
+  // One DrawLine instantiation per flag combination, indexed [shading][transparency][dithering].
+  static constexpr DrawLineFunction funcs[2][2][2] = {
+    {{F(false, false, false), F(false, false, true)}, {F(false, true, false), F(false, true, true)}},
+    {{F(true, false, false), F(true, false, true)}, {F(true, true, false), F(true, true, true)}}};
+
+#undef F
+
+  return funcs[u8(shading_enable)][u8(transparency_enable)][u8(dithering_enable)];
+}
+
+GPU_SW_Backend::DrawRectangleFunction
+GPU_SW_Backend::GetDrawRectangleFunction(bool texture_enable, bool raw_texture_enable, bool transparency_enable)
+{
+#define F(TEXTURE, RAW_TEXTURE, TRANSPARENCY) &GPU_SW_Backend::DrawRectangle<TEXTURE, RAW_TEXTURE, TRANSPARENCY>
+  // One DrawRectangle instantiation per flag combination, indexed [texture][raw_texture][transparency].
+  static constexpr DrawRectangleFunction funcs[2][2][2] = {
+    {{F(false, false, false), F(false, false, true)}, {F(false, true, false), F(false, true, true)}},
+    {{F(true, false, false), F(true, false, true)}, {F(true, true, false), F(true, true, true)}}};
+
+#undef F
+
+  return funcs[u8(texture_enable)][u8(raw_texture_enable)][u8(transparency_enable)];
+}
+
+void GPU_SW_Backend::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, GPUBackendCommandParameters params)
+{ // Fills the width x height rectangle at (x, y) in VRAM with `color`; rows always wrap at VRAM_HEIGHT.
+  const u16 color16 = RGBA8888ToRGBA5551(color); // 16-bit value that is actually stored in VRAM
+  if ((x + width) <= VRAM_WIDTH && !params.interlaced_rendering)
+  { // Fast path: no horizontal wrap and no field skipping, so every row is one contiguous fill.
+    for (u32 yoffs = 0; yoffs < height; yoffs++)
+    {
+      const u32 row = (y + yoffs) % VRAM_HEIGHT;
+      std::fill_n(&m_vram_ptr[row * VRAM_WIDTH + x], width, color16);
+    }
+  }
+  else if (params.interlaced_rendering)
+  {
+    // Hardware tests show that fills seem to break on the first two lines when the offset matches the displayed field.
+    const u32 active_field = params.active_line_lsb;
+    for (u32 yoffs = 0; yoffs < height; yoffs++)
+    {
+      const u32 row = (y + yoffs) % VRAM_HEIGHT;
+      if ((row & u32(1)) == active_field) // rows whose LSB matches the active field are left untouched
+        continue;
+
+      u16* row_ptr = &m_vram_ptr[row * VRAM_WIDTH];
+      for (u32 xoffs = 0; xoffs < width; xoffs++)
+      {
+        const u32 col = (x + xoffs) % VRAM_WIDTH; // columns wrap at VRAM_WIDTH
+        row_ptr[col] = color16;
+      }
+    }
+  }
+  else
+  { // Non-interlaced fill that crosses the right edge: write pixel by pixel so columns wrap.
+    for (u32 yoffs = 0; yoffs < height; yoffs++)
+    {
+      const u32 row = (y + yoffs) % VRAM_HEIGHT;
+      u16* row_ptr = &m_vram_ptr[row * VRAM_WIDTH];
+      for (u32 xoffs = 0; xoffs < width; xoffs++)
+      {
+        const u32 col = (x + xoffs) % VRAM_WIDTH;
+        row_ptr[col] = color16;
+      }
+    }
+  }
+}
+
+void GPU_SW_Backend::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data,
+                                GPUBackendCommandParameters params)
+{ // Writes width x height 16-bit pixels from `data` (tightly packed, row-major) into VRAM at (x, y).
+  // Fast path when the copy is not oversized and neither mask flag is set: plain row-by-row copy.
+  if ((x + width) <= VRAM_WIDTH && (y + height) <= VRAM_HEIGHT && !params.IsMaskingEnabled())
+  {
+    const u16* src_ptr = static_cast<const u16*>(data);
+    u16* dst_ptr = &m_vram_ptr[y * VRAM_WIDTH + x];
+    for (u32 yoffs = 0; yoffs < height; yoffs++)
+    {
+      std::copy_n(src_ptr, width, dst_ptr);
+      src_ptr += width;
+      dst_ptr += VRAM_WIDTH;
+    }
+  }
+  else
+  {
+    // Slow path when we need to handle wrap-around and/or the mask bit.
+    const u16* src_ptr = static_cast<const u16*>(data);
+    const u16 mask_and = params.GetMaskAND(); // destination bits that forbid the write when set
+    const u16 mask_or = params.GetMaskOR();   // bits forced on in every pixel that is written
+
+    for (u32 row = 0; row < height; row++)
+    {
+      u16* dst_row_ptr = &m_vram_ptr[((y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
+      for (u32 col = 0; col < width; col++, src_ptr++)
+      {
+        // TODO: Handle unaligned reads... The source advances for every pixel, even when the write is masked off.
+        u16* pixel_ptr = &dst_row_ptr[(x + col) % VRAM_WIDTH];
+        if (((*pixel_ptr) & mask_and) == 0)
+          *pixel_ptr = *src_ptr | mask_or;
+      }
+    }
+  }
+}
+
+void GPU_SW_Backend::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height,
+                              GPUBackendCommandParameters params)
+{ // Copies a width x height rectangle inside VRAM from (src_x, src_y) to (dst_x, dst_y), honouring the mask flags.
+  // Break up oversized copies. This behavior has not been verified on console.
+  if ((src_x + width) > VRAM_WIDTH || (dst_x + width) > VRAM_WIDTH)
+  {
+    u32 remaining_rows = height;
+    u32 current_src_y = src_y;
+    u32 current_dst_y = dst_y;
+    while (remaining_rows > 0)
+    {
+      const u32 rows_to_copy = // rows left before either rectangle reaches the bottom edge
+        std::min<u32>(remaining_rows, std::min<u32>(VRAM_HEIGHT - current_src_y, VRAM_HEIGHT - current_dst_y));
+
+      u32 remaining_columns = width;
+      u32 current_src_x = src_x;
+      u32 current_dst_x = dst_x;
+      while (remaining_columns > 0)
+      {
+        const u32 columns_to_copy = // columns left before either rectangle reaches the right edge
+          std::min<u32>(remaining_columns, std::min<u32>(VRAM_WIDTH - current_src_x, VRAM_WIDTH - current_dst_x));
+        CopyVRAM(current_src_x, current_src_y, current_dst_x, current_dst_y, columns_to_copy, rows_to_copy, params);
+        current_src_x = (current_src_x + columns_to_copy) % VRAM_WIDTH; // wrap to column 0 past the right edge
+        current_dst_x = (current_dst_x + columns_to_copy) % VRAM_WIDTH;
+        remaining_columns -= columns_to_copy;
+      }
+
+      current_src_y = (current_src_y + rows_to_copy) % VRAM_HEIGHT; // wrap to row 0 past the bottom edge
+      current_dst_y = (current_dst_y + rows_to_copy) % VRAM_HEIGHT;
+      remaining_rows -= rows_to_copy;
+    }
+
+    return; // every piece was copied by the recursive calls above
+  }
+
+  // This doesn't have a fast path, but do we really need one? It's not common.
+  const u16 mask_and = params.GetMaskAND(); // destination bits that forbid the write when set
+  const u16 mask_or = params.GetMaskOR();   // bits forced on in every pixel that is written
+
+  // Copy in reverse when src_x < dst_x, this is verified on console.
+  if (src_x < dst_x || ((src_x + width - 1) % VRAM_WIDTH) < ((dst_x + width - 1) % VRAM_WIDTH))
+  {
+    for (u32 row = 0; row < height; row++)
+    {
+      const u16* src_row_ptr = &m_vram_ptr[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
+      u16* dst_row_ptr = &m_vram_ptr[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
+
+      for (s32 col = static_cast<s32>(width - 1); col >= 0; col--) // signed index so the loop ends after col 0
+      {
+        const u16 src_pixel = src_row_ptr[(src_x + static_cast<u32>(col)) % VRAM_WIDTH];
+        u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + static_cast<u32>(col)) % VRAM_WIDTH];
+        if ((*dst_pixel_ptr & mask_and) == 0)
+          *dst_pixel_ptr = src_pixel | mask_or;
+      }
+    }
+  }
+  else
+  { // Otherwise copy each row left to right.
+    for (u32 row = 0; row < height; row++)
+    {
+      const u16* src_row_ptr = &m_vram_ptr[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
+      u16* dst_row_ptr = &m_vram_ptr[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
+
+      for (u32 col = 0; col < width; col++)
+      {
+        const u16 src_pixel = src_row_ptr[(src_x + col) % VRAM_WIDTH];
+        u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + col) % VRAM_WIDTH];
+        if ((*dst_pixel_ptr & mask_and) == 0)
+          *dst_pixel_ptr = src_pixel | mask_or;
+      }
+    }
+  }
+}
+
+void GPU_SW_Backend::FlushRender() {} // Intentionally empty: this backend does no work when asked to flush.
+
+void GPU_SW_Backend::DrawingAreaChanged() {} // Intentionally empty: no extra work is done on a drawing area change.
diff --git a/src/core/gpu_sw_backend.h b/src/core/gpu_sw_backend.h
new file mode 100644
index 000000000..a7105ecc2
--- /dev/null
+++ b/src/core/gpu_sw_backend.h
@@ -0,0 +1,174 @@
+#pragma once
+#include "gpu_backend.h"
+#include <array>
+#include <memory>
+#include <vector>
+
+class GPU_SW_Backend final : public GPUBackend
+{ // Software GPUBackend: overrides the VRAM transfer and draw hooks and declares the rasterizer helpers they use.
+public:
+  GPU_SW_Backend();
+  ~GPU_SW_Backend() override;
+
+  bool Initialize() override;
+  void Reset() override;
+
+  ALWAYS_INLINE_RELEASE u16 GetPixel(const u32 x, const u32 y) const { return m_vram[VRAM_WIDTH * y + x]; }
+  ALWAYS_INLINE_RELEASE const u16* GetPixelPtr(const u32 x, const u32 y) const { return &m_vram[VRAM_WIDTH * y + x]; }
+  ALWAYS_INLINE_RELEASE u16* GetPixelPtr(const u32 x, const u32 y) { return &m_vram[VRAM_WIDTH * y + x]; }
+  ALWAYS_INLINE_RELEASE void SetPixel(const u32 x, const u32 y, const u16 value) { m_vram[VRAM_WIDTH * y + x] = value; }
+
+  // this is actually ((31 * 255) >> 4) == 494, but to simplify addressing we use the next power of two (512)
+  static constexpr u32 DITHER_LUT_SIZE = 512;
+  using DitherLUT = std::array<std::array<std::array<u8, DITHER_LUT_SIZE>, DITHER_MATRIX_SIZE>, DITHER_MATRIX_SIZE>;
+  static constexpr DitherLUT ComputeDitherLUT();
+
+protected:
+  // NOTE(review): fills the low bits with (x5 & 7) rather than the more usual (x5 >> 2) - confirm this is intended.
+  static constexpr u8 Convert5To8(u8 x5) { return (x5 << 3) | (x5 & 7); }
+  static constexpr u8 Convert8To5(u8 x8) { return (x8 >> 3); } // keeps the top five bits
+
+  union VRAMPixel // 16-bit pixel: r in bits 0-4, g in bits 5-9, b in bits 10-14, flag c in bit 15
+  {
+    u16 bits;
+
+    BitField<u16, u8, 0, 5> r;
+    BitField<u16, u8, 5, 5> g;
+    BitField<u16, u8, 10, 5> b;
+    BitField<u16, bool, 15, 1> c;
+
+    u8 GetR8() const { return Convert5To8(r); }
+    u8 GetG8() const { return Convert5To8(g); }
+    u8 GetB8() const { return Convert5To8(b); }
+
+    void Set(u8 r_, u8 g_, u8 b_, bool c_ = false) // components are packed as given; callers must pass 5-bit values
+    {
+      bits = (ZeroExtend16(r_)) | (ZeroExtend16(g_) << 5) | (ZeroExtend16(b_) << 10) | (static_cast<u16>(c_) << 15);
+    }
+
+    void ClampAndSet(u8 r_, u8 g_, u8 b_, bool c_ = false) // like Set(), but each component saturates at 0x1F
+    {
+      Set(std::min<u8>(r_, 0x1F), std::min<u8>(g_, 0x1F), std::min<u8>(b_, 0x1F), c_);
+    }
+
+    void SetRGB24(u32 rgb24, bool c_ = false) // rgb24 holds r in bits 0-7, g in bits 8-15, b in bits 16-23
+    {
+      bits = Truncate16(((rgb24 >> 3) & 0x1F) | (((rgb24 >> 11) & 0x1F) << 5) | (((rgb24 >> 19) & 0x1F) << 10)) |
+             (static_cast<u16>(c_) << 15);
+    }
+
+    void SetRGB24(u8 r8, u8 g8, u8 b8, bool c_ = false) // truncates each 8-bit component to its top five bits
+    {
+      bits = (ZeroExtend16(r8 >> 3)) | (ZeroExtend16(g8 >> 3) << 5) | (ZeroExtend16(b8 >> 3) << 10) |
+             (static_cast<u16>(c_) << 15);
+    }
+
+    void SetRGB24Dithered(u32 x, u32 y, u8 r8, u8 g8, u8 b8, bool c_ = false)
+    { // Adds the DITHER_MATRIX offset for (x & 3, y & 3) to each component, clamps to 0..255, then packs.
+      const s32 offset = DITHER_MATRIX[y & 3][x & 3];
+      r8 = static_cast<u8>(std::clamp<s32>(static_cast<s32>(ZeroExtend32(r8)) + offset, 0, 255));
+      g8 = static_cast<u8>(std::clamp<s32>(static_cast<s32>(ZeroExtend32(g8)) + offset, 0, 255));
+      b8 = static_cast<u8>(std::clamp<s32>(static_cast<s32>(ZeroExtend32(b8)) + offset, 0, 255));
+      SetRGB24(r8, g8, b8, c_);
+    }
+
+    u32 ToRGB24() const // same per-component expansion as Convert5To8; r ends up in the low byte
+    {
+      const u32 r_ = ZeroExtend32(r.GetValue());
+      const u32 g_ = ZeroExtend32(g.GetValue());
+      const u32 b_ = ZeroExtend32(b.GetValue());
+
+      return ((r_ << 3) | (r_ & 7)) | (((g_ << 3) | (g_ & 7)) << 8) | (((b_ << 3) | (b_ & 7)) << 16);
+    }
+  };
+
+  static constexpr std::tuple<u8, u8> UnpackTexcoord(u16 texcoord) // returns (low byte, high byte)
+  {
+    return std::make_tuple(static_cast<u8>(texcoord), static_cast<u8>(texcoord >> 8));
+  }
+
+  static constexpr std::tuple<u8, u8, u8> UnpackColorRGB24(u32 rgb24) // returns bytes 0, 1 and 2, lowest first
+  {
+    return std::make_tuple(static_cast<u8>(rgb24), static_cast<u8>(rgb24 >> 8), static_cast<u8>(rgb24 >> 16));
+  }
+
+  void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, GPUBackendCommandParameters params) override;
+  void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, GPUBackendCommandParameters params) override;
+  void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height,
+                GPUBackendCommandParameters params) override;
+
+  void DrawPolygon(const GPUBackendDrawPolygonCommand* cmd) override;
+  void DrawLine(const GPUBackendDrawLineCommand* cmd) override;
+  void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd) override;
+  void FlushRender() override;
+  void DrawingAreaChanged() override;
+
+  //////////////////////////////////////////////////////////////////////////
+  // Rasterization
+  //////////////////////////////////////////////////////////////////////////
+  template<bool texture_enable, bool raw_texture_enable, bool transparency_enable, bool dithering_enable>
+  void ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y, u8 color_r, u8 color_g, u8 color_b, u8 texcoord_x,
+                  u8 texcoord_y);
+
+  template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
+  void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd);
+
+  using DrawRectangleFunction = void (GPU_SW_Backend::*)(const GPUBackendDrawRectangleCommand* cmd);
+  DrawRectangleFunction GetDrawRectangleFunction(bool texture_enable, bool raw_texture_enable,
+                                                 bool transparency_enable);
+
+  //////////////////////////////////////////////////////////////////////////
+  // Polygon and line rasterization ported from Mednafen
+  //////////////////////////////////////////////////////////////////////////
+  struct i_deltas // per-x and per-y increments for the texcoord (u, v) and colour (r, g, b) interpolants
+  {
+    u32 du_dx, dv_dx;
+    u32 dr_dx, dg_dx, db_dx;
+
+    u32 du_dy, dv_dy;
+    u32 dr_dy, dg_dy, db_dy;
+  };
+
+  struct i_group // current values of the interpolants that i_deltas steps
+  {
+    u32 u, v;
+    u32 r, g, b;
+  };
+
+  template<bool shading_enable, bool texture_enable>
+  bool CalcIDeltas(i_deltas& idl, const GPUBackendDrawPolygonCommand::Vertex* A,
+                   const GPUBackendDrawPolygonCommand::Vertex* B, const GPUBackendDrawPolygonCommand::Vertex* C);
+
+  template<bool shading_enable, bool texture_enable>
+  void AddIDeltas_DX(i_group& ig, const i_deltas& idl, u32 count = 1);
+
+  template<bool shading_enable, bool texture_enable>
+  void AddIDeltas_DY(i_group& ig, const i_deltas& idl, u32 count = 1);
+
+  template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
+           bool dithering_enable>
+  void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound, i_group ig,
+                const i_deltas& idl);
+
+  template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable,
+           bool dithering_enable>
+  void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBackendDrawPolygonCommand::Vertex* v0,
+                    const GPUBackendDrawPolygonCommand::Vertex* v1, const GPUBackendDrawPolygonCommand::Vertex* v2);
+
+  using DrawTriangleFunction = void (GPU_SW_Backend::*)(const GPUBackendDrawPolygonCommand* cmd,
+                                                        const GPUBackendDrawPolygonCommand::Vertex* v0,
+                                                        const GPUBackendDrawPolygonCommand::Vertex* v1,
+                                                        const GPUBackendDrawPolygonCommand::Vertex* v2);
+  DrawTriangleFunction GetDrawTriangleFunction(bool shading_enable, bool texture_enable, bool raw_texture_enable,
+                                               bool transparency_enable, bool dithering_enable);
+
+  template<bool shading_enable, bool transparency_enable, bool dithering_enable>
+  void DrawLine(const GPUBackendDrawLineCommand* cmd, const GPUBackendDrawLineCommand::Vertex* p0,
+                const GPUBackendDrawLineCommand::Vertex* p1);
+
+  using DrawLineFunction = void (GPU_SW_Backend::*)(const GPUBackendDrawLineCommand* cmd,
+                                                    const GPUBackendDrawLineCommand::Vertex* p0,
+                                                    const GPUBackendDrawLineCommand::Vertex* p1);
+  DrawLineFunction GetDrawLineFunction(bool shading_enable, bool transparency_enable, bool dithering_enable);
+
+  std::array<u16, VRAM_WIDTH * VRAM_HEIGHT> m_vram; // row-major pixel store read and written by GetPixel/SetPixel
+};
diff --git a/src/core/gpu_types.h b/src/core/gpu_types.h
index 1df6cbc8b..4fd0472e7 100644
--- a/src/core/gpu_types.h
+++ b/src/core/gpu_types.h
@@ -222,3 +222,177 @@ static constexpr s32 DITHER_MATRIX[DITHER_MATRIX_SIZE][DITHER_MATRIX_SIZE] = { {
                                                                               {+2, -2, +3, -1},  // row 1
                                                                               {-3, +1, -4, +0},  // row 2
                                                                               {+4, -1, +2, -2} }; // row 3
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable:4200) // warning C4200: nonstandard extension used: zero-sized array in struct/union
+#endif
+
+enum class GPUBackendCommandType : u8 // Tag stored in GPUBackendCommand::type.
+{ // Presumably each value pairs with the like-named GPUBackend*Command struct - confirm against the dispatcher.
+  Wraparound,
+  Sync,
+  FillVRAM,
+  UpdateVRAM,
+  CopyVRAM,
+  SetDrawingArea,
+  DrawPolygon,
+  DrawRectangle,
+  DrawLine
+};
+
+union GPUBackendCommandParameters // Per-command flags packed into one byte; the bit fields below alias `bits`.
+{
+  u8 bits;
+
+  BitField<u8, bool, 0, 1> interlaced_rendering;
+
+  /// Returns 0 if the currently-displayed field is on an even line in VRAM, otherwise 1.
+  BitField<u8, u8, 1, 1> active_line_lsb;
+
+  BitField<u8, bool, 2, 1> set_mask_while_drawing;
+  BitField<u8, bool, 3, 1> check_mask_before_draw;
+
+  ALWAYS_INLINE bool IsMaskingEnabled() const { return (bits & 12u) != 0u; } // 12 == bits 2 and 3, the mask flags
+
+  // During transfer/render operations, if ((dst_pixel & mask_and) == 0) { pixel = src_pixel | mask_or }
+  u16 GetMaskAND() const
+  {
+    // Same result as (check_mask_before_draw ? 0x8000 : 0x0000): bit 3 is shifted up to bit 15.
+    return Truncate16((bits << 12) & 0x8000);
+  }
+  u16 GetMaskOR() const
+  {
+    // Same result as (set_mask_while_drawing ? 0x8000 : 0x0000): bit 2 is shifted up to bit 15.
+    return Truncate16((bits << 13) & 0x8000);
+  }
+};
+
+struct GPUBackendCommand // Common header that every GPUBackend*Command struct below derives from.
+{
+  GPUBackendCommandType type;
+  GPUBackendCommandParameters params;
+  u32 size; // presumably the byte size reported by the derived Size() helpers - confirm where it is assigned
+};
+
+struct GPUBackendSyncCommand : public GPUBackendCommand // Carries no payload beyond the common header.
+{
+  ALWAYS_INLINE u32 Size() const { return sizeof(GPUBackendSyncCommand); }
+};
+
+struct GPUBackendFillVRAMCommand : public GPUBackendCommand // Rectangle and colour for a VRAM fill.
+{
+  u16 x;
+  u16 y;
+  u16 width;
+  u16 height;
+  u32 color; // presumably the RGBA8888 value GPU_SW_Backend::FillVRAM converts to 16 bits - confirm at the call site
+
+  ALWAYS_INLINE u32 Size() const { return sizeof(GPUBackendFillVRAMCommand); }
+};
+
+struct GPUBackendUpdateVRAMCommand : public GPUBackendCommand // Destination rectangle followed by the pixel data.
+{
+  u16 x;
+  u16 y;
+  u16 width;
+  u16 height;
+  u16 data[0]; // zero-sized trailing array (compiler extension): the pixels follow the struct in memory
+
+  ALWAYS_INLINE u32 Size() const { return sizeof(GPUBackendUpdateVRAMCommand) + (sizeof(u16) * width * height); }
+};
+
+struct GPUBackendCopyVRAMCommand : public GPUBackendCommand // Source/destination origin and size of a VRAM copy.
+{
+  u16 src_x;
+  u16 src_y;
+  u16 dst_x;
+  u16 dst_y;
+  u16 width;
+  u16 height;
+
+  ALWAYS_INLINE u32 Size() const { return sizeof(GPUBackendCopyVRAMCommand); }
+};
+
+struct GPUBackendSetDrawingAreaCommand : public GPUBackendCommand // Carries the replacement drawing area.
+{
+  Common::Rectangle<u32> new_area;
+
+  ALWAYS_INLINE u32 Size() const { return sizeof(GPUBackendSetDrawingAreaCommand); }
+};
+
+struct GPUBackendDrawCommand : public GPUBackendCommand // Register state shared by the polygon/rectangle/line draws.
+{
+  GPURenderCommand rc;
+  GPUDrawModeReg draw_mode;
+  GPUTexturePaletteReg palette;
+  GPUTextureWindow window;
+
+  ALWAYS_INLINE bool IsDitheringEnabled() const { return rc.IsDitheringEnabled() && draw_mode.dither_enable; }
+}; // IsDitheringEnabled() is true only when both the render command and the draw mode enable dithering.
+
+struct GPUBackendDrawPolygonCommand : public GPUBackendDrawCommand // Draw state plus a trailing vertex list.
+{
+  u16 num_vertices; // number of entries that follow in `vertices`
+
+  struct Vertex
+  {
+    s32 x, y;
+    union // colour, accessible per component or as one packed 32-bit value
+    {
+      struct
+      {
+        u8 r, g, b, a;
+      };
+      u32 color;
+    };
+    union // texture coordinate, accessible per component or as one packed 16-bit value
+    {
+      struct
+      {
+        u8 u, v;
+      };
+      u16 texcoord;
+    };
+  };
+
+  Vertex vertices[0]; // zero-sized trailing array (compiler extension): the vertices follow the struct in memory
+
+  ALWAYS_INLINE u32 Size() const { return sizeof(GPUBackendDrawPolygonCommand) + sizeof(Vertex) * num_vertices; }
+};
+
+struct GPUBackendDrawRectangleCommand : public GPUBackendDrawCommand // Draw state plus one rectangle.
+{
+  s32 x, y;
+  u16 width, height;
+  u16 texcoord; // packed pair; presumably split with UnpackTexcoord (low byte, high byte) - confirm in the rasterizer
+  u32 color;
+
+  ALWAYS_INLINE u32 Size() const { return sizeof(GPUBackendDrawRectangleCommand); }
+};
+
+struct GPUBackendDrawLineCommand : public GPUBackendDrawCommand // Draw state plus a trailing vertex list.
+{
+  u16 num_vertices; // number of entries that follow in `vertices`
+
+  struct Vertex
+  {
+    s32 x, y;
+    union // colour, accessible per component or as one packed 32-bit value
+    {
+      struct
+      {
+        u8 r, g, b, a;
+      };
+      u32 color;
+    };
+  };
+
+  Vertex vertices[0]; // zero-sized trailing array (compiler extension): the vertices follow the struct in memory
+
+  ALWAYS_INLINE u32 Size() const { return sizeof(GPUBackendDrawLineCommand) + sizeof(Vertex) * num_vertices; }
+};
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
diff --git a/src/core/host_interface.cpp b/src/core/host_interface.cpp
index c49c84c69..b7d8d48a7 100644
--- a/src/core/host_interface.cpp
+++ b/src/core/host_interface.cpp
@@ -432,6 +432,7 @@ void HostInterface::SetDefaultSettings(SettingsInterface& si)
   si.SetIntValue("GPU", "Multisamples", 1);
   si.SetBoolValue("GPU", "UseDebugDevice", false);
   si.SetBoolValue("GPU", "PerSampleShading", false);
+  si.SetBoolValue("GPU", "UseThread", true);
   si.SetBoolValue("GPU", "TrueColor", false);
   si.SetBoolValue("GPU", "ScaledDithering", true);
   si.SetStringValue("GPU", "TextureFilter", Settings::GetTextureFilterName(Settings::DEFAULT_GPU_TEXTURE_FILTER));
@@ -629,6 +630,7 @@ void HostInterface::CheckForSettingsChanges(const Settings& old_settings)
     if (g_settings.gpu_resolution_scale != old_settings.gpu_resolution_scale ||
         g_settings.gpu_multisamples != old_settings.gpu_multisamples ||
         g_settings.gpu_per_sample_shading != old_settings.gpu_per_sample_shading ||
+        g_settings.gpu_use_thread != old_settings.gpu_use_thread ||
         g_settings.gpu_fifo_size != old_settings.gpu_fifo_size ||
         g_settings.gpu_max_run_ahead != old_settings.gpu_max_run_ahead ||
         g_settings.gpu_true_color != old_settings.gpu_true_color ||
diff --git a/src/core/settings.cpp b/src/core/settings.cpp
index c2bb9fe8b..15bdbcb48 100644
--- a/src/core/settings.cpp
+++ b/src/core/settings.cpp
@@ -139,6 +139,7 @@ void Settings::Load(SettingsInterface& si)
   gpu_multisamples = static_cast<u32>(si.GetIntValue("GPU", "Multisamples", 1));
   gpu_use_debug_device = si.GetBoolValue("GPU", "UseDebugDevice", false);
   gpu_per_sample_shading = si.GetBoolValue("GPU", "PerSampleShading", false);
+  gpu_use_thread = si.GetBoolValue("GPU", "UseThread", true);
   gpu_true_color = si.GetBoolValue("GPU", "TrueColor", true);
   gpu_scaled_dithering = si.GetBoolValue("GPU", "ScaledDithering", false);
   gpu_texture_filter =
@@ -273,6 +274,7 @@ void Settings::Save(SettingsInterface& si) const
   si.SetIntValue("GPU", "Multisamples", static_cast<long>(gpu_multisamples));
   si.SetBoolValue("GPU", "UseDebugDevice", gpu_use_debug_device);
   si.SetBoolValue("GPU", "PerSampleShading", gpu_per_sample_shading);
+  si.SetBoolValue("GPU", "UseThread", gpu_use_thread);
   si.SetBoolValue("GPU", "TrueColor", gpu_true_color);
   si.SetBoolValue("GPU", "ScaledDithering", gpu_scaled_dithering);
   si.SetStringValue("GPU", "TextureFilter", GetTextureFilterName(gpu_texture_filter));
diff --git a/src/core/settings.h b/src/core/settings.h
index 32f31fc6c..00214b055 100644
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -94,6 +94,7 @@ struct Settings
   std::string display_post_process_chain;
   u32 gpu_resolution_scale = 1;
   u32 gpu_multisamples = 1;
+  bool gpu_use_thread = true;
   bool gpu_use_debug_device = false;
   bool gpu_per_sample_shading = false;
   bool gpu_true_color = true;
diff --git a/src/duckstation-libretro/libretro_host_interface.cpp b/src/duckstation-libretro/libretro_host_interface.cpp
index 7ccfd64b7..99d7ae72f 100644
--- a/src/duckstation-libretro/libretro_host_interface.cpp
+++ b/src/duckstation-libretro/libretro_host_interface.cpp
@@ -461,7 +461,7 @@ void LibretroHostInterface::OnSystemDestroyed()
   m_using_hardware_renderer = false;
 }
 
-static std::array<retro_core_option_definition, 44> s_option_definitions = {{
+static std::array<retro_core_option_definition, 45> s_option_definitions = {{
   {"duckstation_Console.Region",
    "Console Region",
    "Determines which region/hardware to emulate. Auto-Detect will use the region of the disc inserted.",
@@ -542,6 +542,12 @@ static std::array<retro_core_option_definition, 44> s_option_definitions = {{
    "OpenGL"
 #endif
   },
+  {"duckstation_GPU.UseThread",
+   "Threaded Rendering (Software)",
+   "Uses a second thread for drawing graphics. Currently only available for the software renderer, but can provide a "
+   "significant speed improvement, and is safe to use.",
+   {{"true", "Enabled"}, {"false", "Disabled"}},
+   "true"},
   {"duckstation_GPU.ResolutionScale",
    "Internal Resolution Scale",
    "Scales internal VRAM resolution by the specified multiplier. Larger values are slower. Some games require "
@@ -731,8 +737,7 @@ static std::array<retro_core_option_definition, 44> s_option_definitions = {{
    "Controller 1 Analog Axis Scale",
    "Sets the analog stick axis scaling factor.",
    {{"1.00f", "1.00"}, {"1.40f", "1.40"}},
-   "1.00f"
-   },
+   "1.00f"},
   {"duckstation_Controller2.Type",
    "Controller 2 Type",
    "Sets the type of controller for Slot 2.",
@@ -753,12 +758,11 @@ static std::array<retro_core_option_definition, 44> s_option_definitions = {{
    "Allows you to use the analog sticks to control the d-pad in digital mode, as well as the buttons.",
    {{"true", "Enabled"}, {"false", "Disabled"}},
    "false"},
-   {"duckstation_Controller2.AxisScale",
+  {"duckstation_Controller2.AxisScale",
    "Controller 2 Analog Axis Scale",
    "Sets the analog stick axis scaling factor.",
    {{"1.00f", "1.00"}, {"1.40f", "1.40"}},
-   "1.00f"
-   },
+   "1.00f"},
   {"duckstation_Display.ShowOSDMessages",
    "Display OSD Messages",
    "Shows on-screen messages generated by the core.",
diff --git a/src/duckstation-qt/displaysettingswidget.cpp b/src/duckstation-qt/displaysettingswidget.cpp
index 92d9f563d..a92524475 100644
--- a/src/duckstation-qt/displaysettingswidget.cpp
+++ b/src/duckstation-qt/displaysettingswidget.cpp
@@ -33,6 +33,7 @@ DisplaySettingsWidget::DisplaySettingsWidget(QtHostInterface* host_interface, QW
   SettingWidgetBinder::BindWidgetToBoolSetting(m_host_interface, m_ui.displayIntegerScaling, "Display",
                                                "IntegerScaling");
   SettingWidgetBinder::BindWidgetToBoolSetting(m_host_interface, m_ui.vsync, "Display", "VSync");
+  SettingWidgetBinder::BindWidgetToBoolSetting(m_host_interface, m_ui.gpuThread, "GPU", "UseThread");
   SettingWidgetBinder::BindWidgetToBoolSetting(m_host_interface, m_ui.showOSDMessages, "Display", "ShowOSDMessages",
                                                true);
   SettingWidgetBinder::BindWidgetToBoolSetting(m_host_interface, m_ui.showFPS, "Display", "ShowFPS", false);
@@ -84,6 +85,9 @@ DisplaySettingsWidget::DisplaySettingsWidget(QtHostInterface* host_interface, QW
     m_ui.vsync, tr("VSync"), tr("Checked"),
     tr("Enable this option to match DuckStation's refresh rate with your current monitor or screen. "
        "VSync is automatically disabled when it is not possible (e.g. running at non-100% speed)."));
+  dialog->registerWidgetHelp(m_ui.gpuThread, tr("Threaded Rendering"), tr("Checked"),
+                             tr("Uses a second thread for drawing graphics. Currently only available for the software "
+                                "renderer, but can provide a significant speed improvement, and is safe to use."));
   dialog->registerWidgetHelp(m_ui.showOSDMessages, tr("Show OSD Messages"), tr("Checked"),
                              tr("Shows on-screen-display messages when events occur such as save states being "
                                 "created/loaded, screenshots being taken, etc."));
@@ -124,6 +128,7 @@ void DisplaySettingsWidget::populateGPUAdaptersAndResolutions()
 {
   std::vector<std::string> adapter_names;
   std::vector<std::string> fullscreen_modes;
+  bool thread_supported = false;
   switch (static_cast<GPURenderer>(m_ui.renderer->currentIndex()))
   {
 #ifdef WIN32
@@ -140,6 +145,10 @@ void DisplaySettingsWidget::populateGPUAdaptersAndResolutions()
       adapter_names = FrontendCommon::VulkanHostDisplay::EnumerateAdapterNames();
       break;
 
+    case GPURenderer::Software:
+      thread_supported = true;
+      break;
+
     default:
       break;
   }
@@ -184,6 +193,8 @@ void DisplaySettingsWidget::populateGPUAdaptersAndResolutions()
     // disable it if we don't have a choice
     m_ui.fullscreenMode->setEnabled(!fullscreen_modes.empty());
   }
+
+  m_ui.gpuThread->setEnabled(thread_supported);
 }
 
 void DisplaySettingsWidget::onGPUAdapterIndexChanged()
diff --git a/src/duckstation-qt/displaysettingswidget.ui b/src/duckstation-qt/displaysettingswidget.ui
index 5ae78f14e..658e8a02b 100644
--- a/src/duckstation-qt/displaysettingswidget.ui
+++ b/src/duckstation-qt/displaysettingswidget.ui
@@ -62,7 +62,14 @@
       <item row="2" column="1">
        <widget class="QComboBox" name="fullscreenMode"/>
       </item>
-      <item row="3" column="0">
+      <item row="3" column="0" colspan="2">
+       <widget class="QCheckBox" name="gpuThread">
+        <property name="text">
+         <string>Threaded Rendering</string>
+        </property>
+       </widget>
+      </item>
+      <item row="4" column="0" colspan="2">
        <widget class="QCheckBox" name="vsync">
         <property name="text">
          <string>VSync</string>
diff --git a/src/duckstation-sdl/sdl_host_interface.cpp b/src/duckstation-sdl/sdl_host_interface.cpp
index c9899586e..65b5abc0a 100644
--- a/src/duckstation-sdl/sdl_host_interface.cpp
+++ b/src/duckstation-sdl/sdl_host_interface.cpp
@@ -931,6 +931,8 @@ void SDLHostInterface::DrawQuickSettingsMenu()
       }
     }
 
+    settings_changed |= ImGui::MenuItem("GPU on Thread", nullptr, &m_settings_copy.gpu_use_thread);
+
     ImGui::EndMenu();
   }