GPU/HW: Switch to indexed draws

Stenzek 2024-03-01 15:33:00 +10:00
parent b34742cdd2
commit 250fb56838
2 changed files with 153 additions and 121 deletions
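
In practice the switch means a quad no longer costs six full BatchVertex entries (two standalone triangles): it now costs four unique vertices plus six 16-bit indices. A self-contained back-of-the-envelope comparison, using an illustrative 32-byte vertex rather than the real sizeof(BatchVertex):

    #include <cstdio>

    int main()
    {
      constexpr unsigned kVertexSize = 32; // illustrative only; not the actual sizeof(BatchVertex)
      constexpr unsigned kIndexSize = 2;   // u16 indices, matching the index buffer below

      constexpr unsigned non_indexed_quad = 6 * kVertexSize;              // two independent triangles
      constexpr unsigned indexed_quad = 4 * kVertexSize + 6 * kIndexSize; // shared vertices + indices

      std::printf("per quad: %u -> %u bytes\n", non_indexed_quad, indexed_quad); // 192 -> 140
      return 0;
    }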

src/core/gpu_hw.cpp

@@ -174,19 +174,6 @@ ALWAYS_INLINE void GPU_HW::BatchVertex::SetUVLimits(u32 min_u, u32 max_u, u32 mi
   uv_limits = PackUVLimits(min_u, max_u, min_v, max_v);
 }
 
-ALWAYS_INLINE void GPU_HW::AddVertex(const BatchVertex& v)
-{
-  std::memcpy(m_batch_current_vertex_ptr, &v, sizeof(BatchVertex));
-  m_batch_current_vertex_ptr++;
-}
-
-template<typename... Args>
-ALWAYS_INLINE void GPU_HW::AddNewVertex(Args&&... args)
-{
-  m_batch_current_vertex_ptr->Set(std::forward<Args>(args)...);
-  m_batch_current_vertex_ptr++;
-}
-
 const Threading::Thread* GPU_HW::GetSWThread() const
 {
   return m_sw_renderer ? m_sw_renderer->GetThread() : nullptr;
@@ -247,7 +234,8 @@ void GPU_HW::Reset(bool clear_vram)
 {
   GPU::Reset(clear_vram);
 
-  m_batch_current_vertex_ptr = m_batch_start_vertex_ptr;
+  if (m_batch_vertex_ptr)
+    UnmapGPUBuffer(0, 0);
 
   if (m_sw_renderer)
     m_sw_renderer->Reset();
@@ -305,7 +293,8 @@ bool GPU_HW::DoState(StateWrapper& sw, GPUTexture** host_texture, bool update_di
   // invalidate the whole VRAM read texture when loading state
   if (sw.IsReading())
   {
-    m_batch_current_vertex_ptr = m_batch_start_vertex_ptr;
+    if (m_batch_vertex_ptr)
+      UnmapGPUBuffer(0, 0);
     SetFullVRAMDirtyRectangle();
     ResetBatchVertexDepth();
   }
@@ -1373,29 +1362,36 @@ void GPU_HW::SetScissor()
   g_gpu_device->SetScissor(left, top, right - left, bottom - top);
 }
 
-void GPU_HW::MapBatchVertexPointer(u32 required_vertices)
+void GPU_HW::MapGPUBuffer(u32 required_vertices, u32 required_indices)
 {
-  DebugAssert(!m_batch_start_vertex_ptr);
+  DebugAssert(!m_batch_vertex_ptr && !m_batch_index_ptr);
 
-  void* map;
-  u32 space;
-  g_gpu_device->MapVertexBuffer(sizeof(BatchVertex), required_vertices, &map, &space, &m_batch_base_vertex);
+  void* vb_map;
+  u32 vb_space;
+  g_gpu_device->MapVertexBuffer(sizeof(BatchVertex), required_vertices, &vb_map, &vb_space, &m_batch_base_vertex);
+  m_batch_vertex_ptr = static_cast<BatchVertex*>(vb_map);
+  m_batch_vertex_space = Truncate16(std::min<u32>(vb_space, std::numeric_limits<u16>::max()));
 
-  m_batch_start_vertex_ptr = static_cast<BatchVertex*>(map);
-  m_batch_current_vertex_ptr = m_batch_start_vertex_ptr;
-  m_batch_end_vertex_ptr = m_batch_start_vertex_ptr + space;
+  u32 ib_space;
+  g_gpu_device->MapIndexBuffer(required_indices, &m_batch_index_ptr, &ib_space, &m_batch_base_index);
+  m_batch_index_space = Truncate16(std::min<u32>(ib_space, std::numeric_limits<u16>::max()));
 }
 
-void GPU_HW::UnmapBatchVertexPointer(u32 used_vertices)
+void GPU_HW::UnmapGPUBuffer(u32 used_vertices, u32 used_indices)
 {
-  DebugAssert(m_batch_start_vertex_ptr);
+  DebugAssert(m_batch_vertex_ptr && m_batch_index_ptr);
   g_gpu_device->UnmapVertexBuffer(sizeof(BatchVertex), used_vertices);
-  m_batch_start_vertex_ptr = nullptr;
-  m_batch_end_vertex_ptr = nullptr;
-  m_batch_current_vertex_ptr = nullptr;
+  g_gpu_device->UnmapIndexBuffer(used_indices);
+  m_batch_vertex_ptr = nullptr;
+  m_batch_vertex_count = 0;
+  m_batch_vertex_space = 0;
+
+  m_batch_index_ptr = nullptr;
+  m_batch_index_count = 0;
+  m_batch_index_space = 0;
 }
 
-void GPU_HW::DrawBatchVertices(BatchRenderMode render_mode, u32 num_vertices, u32 base_vertex)
+ALWAYS_INLINE_RELEASE void GPU_HW::DrawBatchVertices(BatchRenderMode render_mode, u32 num_indices, u32 base_index,
+                                                     u32 base_vertex)
 {
   // [depth_test][render_mode][texture_mode][transparency_mode][dithering][interlacing]
   const u8 depth_test = m_batch.use_depth_buffer ? static_cast<u8>(2) : BoolToUInt8(m_batch.check_mask_before_draw);
@@ -1403,7 +1399,7 @@ void GPU_HW::DrawBatchVertices(BatchRenderMode render_mode, u32 num_vertices, u3
     m_batch_pipelines[depth_test][static_cast<u8>(render_mode)][static_cast<u8>(m_batch.texture_mode)][static_cast<u8>(
       m_batch.transparency_mode)][BoolToUInt8(m_batch.dithering)][BoolToUInt8(m_batch.interlacing)]
       .get());
-  g_gpu_device->Draw(num_vertices, base_vertex);
+  g_gpu_device->DrawIndexed(num_indices, base_index, base_vertex);
 }
 
 void GPU_HW::ClearDisplay()
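
Together these three functions form the batch lifecycle: MapGPUBuffer reserves space in both buffers and caches write pointers plus the base offsets the device hands back, geometry is appended through those pointers while the space counters shrink, and UnmapGPUBuffer commits the used counts before DrawBatchVertices issues the indexed draw. A standalone model of that bookkeeping, with hypothetical names standing in for the GPU_HW members:

    #include <cassert>
    #include <cstdint>

    // Hypothetical mirror of the batch counters managed by MapGPUBuffer,
    // the append paths, and UnmapGPUBuffer.
    struct BatchCounters
    {
      uint16_t vertex_count = 0, index_count = 0;
      uint16_t vertex_space = 0, index_space = 0;
      bool mapped = false;

      void Map(uint16_t vertex_capacity, uint16_t index_capacity)
      {
        assert(!mapped); // mirrors DebugAssert(!m_batch_vertex_ptr && !m_batch_index_ptr)
        vertex_space = vertex_capacity;
        index_space = index_capacity;
        mapped = true;
      }

      void Append(uint16_t vertices, uint16_t indices)
      {
        assert(mapped && vertex_space >= vertices && index_space >= indices);
        vertex_count += vertices; vertex_space -= vertices;
        index_count += indices;   index_space -= indices;
      }

      void Unmap() // commit used counts to the device, then reset everything
      {
        assert(mapped);
        vertex_count = index_count = vertex_space = index_space = 0;
        mapped = false;
      }
    };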
@@ -1529,7 +1525,7 @@ void GPU_HW::SetBatchDepthBuffer(bool enabled)
   if (m_batch.use_depth_buffer == enabled)
     return;
 
-  if (GetBatchVertexCount() > 0)
+  if (m_batch_index_count > 0)
   {
     FlushRender();
     EnsureVertexBufferSpaceForCurrentCommand();
@@ -1549,7 +1545,7 @@ void GPU_HW::CheckForDepthClear(const BatchVertex* vertices, u32 num_vertices)
 
   if ((average_z - m_last_depth_z) >= g_settings.gpu_pgxp_depth_clear_threshold)
   {
-    if (GetBatchVertexCount() > 0)
+    if (m_batch_index_count > 0)
    {
      FlushRender();
      EnsureVertexBufferSpaceForCurrentCommand();
@@ -1576,16 +1572,17 @@ u32 GPU_HW::GetAdaptiveDownsamplingMipLevels() const
 
 void GPU_HW::DrawLine(float x0, float y0, u32 col0, float x1, float y1, u32 col1, float depth)
 {
+  DebugAssert(m_batch_vertex_space >= 4 && m_batch_index_space >= 6);
+
   const float dx = x1 - x0;
   const float dy = y1 - y0;
 
-  std::array<BatchVertex, 4> output;
   if (dx == 0.0f && dy == 0.0f)
   {
     // Degenerate, render a point.
-    output[0].Set(x0, y0, depth, 1.0f, col0, 0, 0, 0);
-    output[1].Set(x0 + 1.0f, y0, depth, 1.0f, col0, 0, 0, 0);
-    output[2].Set(x1, y1 + 1.0f, depth, 1.0f, col0, 0, 0, 0);
-    output[3].Set(x1 + 1.0f, y1 + 1.0f, depth, 1.0f, col0, 0, 0, 0);
+    (m_batch_vertex_ptr++)->Set(x0, y0, depth, 1.0f, col0, 0, 0, 0);
+    (m_batch_vertex_ptr++)->Set(x0 + 1.0f, y0, depth, 1.0f, col0, 0, 0, 0);
+    (m_batch_vertex_ptr++)->Set(x1, y1 + 1.0f, depth, 1.0f, col0, 0, 0, 0);
+    (m_batch_vertex_ptr++)->Set(x1 + 1.0f, y1 + 1.0f, depth, 1.0f, col0, 0, 0, 0);
   }
   else
   {
@@ -1649,18 +1646,24 @@ void GPU_HW::DrawLine(float x0, float y0, u32 col0, float x1, float y1, u32 col1
     const float ox1 = x1 + pad_x1;
     const float oy1 = y1 + pad_y1;
 
-    output[0].Set(ox0, oy0, depth, 1.0f, col0, 0, 0, 0);
-    output[1].Set(ox0 + fill_dx, oy0 + fill_dy, depth, 1.0f, col0, 0, 0, 0);
-    output[2].Set(ox1, oy1, depth, 1.0f, col1, 0, 0, 0);
-    output[3].Set(ox1 + fill_dx, oy1 + fill_dy, depth, 1.0f, col1, 0, 0, 0);
+    (m_batch_vertex_ptr++)->Set(ox0, oy0, depth, 1.0f, col0, 0, 0, 0);
+    (m_batch_vertex_ptr++)->Set(ox0 + fill_dx, oy0 + fill_dy, depth, 1.0f, col0, 0, 0, 0);
+    (m_batch_vertex_ptr++)->Set(ox1, oy1, depth, 1.0f, col1, 0, 0, 0);
+    (m_batch_vertex_ptr++)->Set(ox1 + fill_dx, oy1 + fill_dy, depth, 1.0f, col1, 0, 0, 0);
   }
 
-  AddVertex(output[0]);
-  AddVertex(output[1]);
-  AddVertex(output[2]);
-  AddVertex(output[3]);
-  AddVertex(output[2]);
-  AddVertex(output[1]);
+  const u32 start_index = m_batch_vertex_count;
+  m_batch_vertex_count += 4;
+  m_batch_vertex_space -= 4;
+
+  *(m_batch_index_ptr++) = Truncate16(start_index + 0);
+  *(m_batch_index_ptr++) = Truncate16(start_index + 1);
+  *(m_batch_index_ptr++) = Truncate16(start_index + 2);
+  *(m_batch_index_ptr++) = Truncate16(start_index + 3);
+  *(m_batch_index_ptr++) = Truncate16(start_index + 2);
+  *(m_batch_index_ptr++) = Truncate16(start_index + 1);
+  m_batch_index_count += 6;
+  m_batch_index_space -= 6;
 }
 
 void GPU_HW::LoadVertices()
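
DrawLine expands every line to a screen-space quad: four corner vertices are written in place, then a fixed index pattern relative to start_index emits the triangles (0,1,2) and (3,2,1), which share vertices 1 and 2. A standalone sketch of that index emission, with a plain pointer standing in for the mapped index buffer:

    #include <cstdint>

    // Emits the six indices used per expanded line above: triangles
    // (s+0, s+1, s+2) and (s+3, s+2, s+1) over vertices s..s+3.
    static uint16_t* EmitLineQuadIndices(uint16_t* out, uint32_t s)
    {
      static constexpr uint32_t kPattern[6] = {0, 1, 2, 3, 2, 1};
      for (const uint32_t offset : kPattern)
        *(out++) = static_cast<uint16_t>(s + offset);
      return out;
    }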
@@ -1676,8 +1679,6 @@ void GPU_HW::LoadVertices()
   {
     case GPUPrimitive::Polygon:
     {
-      DebugAssert(GetBatchVertexSpace() >= (rc.quad_polygon ? 6u : 3u));
-
       const u32 first_color = rc.color_for_first_vertex;
       const bool shaded = rc.shading_enable;
       const bool textured = rc.texture_enable;
@@ -1732,9 +1733,27 @@ void GPU_HW::LoadVertices()
       if (m_compute_uv_range && textured)
         ComputePolygonUVLimits(texpage, vertices.data(), num_vertices);
 
-      if (!IsDrawingAreaIsValid())
+      if (!IsDrawingAreaIsValid()) [[unlikely]]
         return;
 
+      const u32 start_index = m_batch_vertex_count;
+      if (rc.quad_polygon)
+      {
+        DebugAssert(m_batch_vertex_space >= 4);
+        std::memcpy(m_batch_vertex_ptr, vertices.data(), sizeof(BatchVertex) * 4);
+        m_batch_vertex_ptr += 4;
+        m_batch_vertex_count += 4;
+        m_batch_vertex_space -= 4;
+      }
+      else
+      {
+        DebugAssert(m_batch_vertex_space >= 3);
+        std::memcpy(m_batch_vertex_ptr, vertices.data(), sizeof(BatchVertex) * 3);
+        m_batch_vertex_ptr += 3;
+        m_batch_vertex_count += 3;
+        m_batch_vertex_space -= 3;
+      }
+
       // Cull polygons which are too large.
       const auto [min_x_12, max_x_12] = MinMax(native_vertex_positions[1][0], native_vertex_positions[2][0]);
       const auto [min_y_12, max_y_12] = MinMax(native_vertex_positions[1][1], native_vertex_positions[2][1]);
@@ -1763,8 +1782,12 @@ void GPU_HW::LoadVertices()
                    native_vertex_positions[2][0], native_vertex_positions[2][1], rc.shading_enable,
                    rc.texture_enable, rc.transparency_enable);
 
-      std::memcpy(m_batch_current_vertex_ptr, vertices.data(), sizeof(BatchVertex) * 3);
-      m_batch_current_vertex_ptr += 3;
+      DebugAssert(m_batch_index_space >= 3);
+      *(m_batch_index_ptr++) = Truncate16(start_index);
+      *(m_batch_index_ptr++) = Truncate16(start_index + 1);
+      *(m_batch_index_ptr++) = Truncate16(start_index + 2);
+      m_batch_index_count += 3;
+      m_batch_index_space -= 3;
     }
 
     // quads
@@ -1797,9 +1820,12 @@ void GPU_HW::LoadVertices()
                      native_vertex_positions[3][0], native_vertex_positions[3][1], rc.shading_enable,
                      rc.texture_enable, rc.transparency_enable);
 
-        AddVertex(vertices[2]);
-        AddVertex(vertices[1]);
-        AddVertex(vertices[3]);
+        DebugAssert(m_batch_index_space >= 3);
+        *(m_batch_index_ptr++) = Truncate16(start_index + 2);
+        *(m_batch_index_ptr++) = Truncate16(start_index + 1);
+        *(m_batch_index_ptr++) = Truncate16(start_index + 3);
+        m_batch_index_count += 3;
+        m_batch_index_space -= 3;
       }
     }
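
The quad path writes its four vertices exactly once (the memcpy earlier in this hunk) and references them as triangles (s, s+1, s+2) and (s+2, s+1, s+3); the old code re-pushed copies of vertices 1 and 2 for the second triangle. The resulting pattern, as a standalone constant with the vertex slots it assumes spelled out:

    #include <cstdint>

    // Assuming the quad's vertices are stored as 0=top-left, 1=top-right,
    // 2=bottom-left, 3=bottom-right, these indices describe two triangles
    // that share the 1-2 diagonal while each vertex is stored only once.
    static constexpr uint16_t kQuadIndexPattern[6] = {0, 1, 2, 2, 1, 3};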
@@ -1808,7 +1834,8 @@ void GPU_HW::LoadVertices()
         GPUBackendDrawPolygonCommand* cmd = m_sw_renderer->NewDrawPolygonCommand(num_vertices);
         FillDrawCommand(cmd, rc);
 
-        for (u32 i = 0; i < num_vertices; i++)
+        const u32 sw_num_vertices = rc.quad_polygon ? 4 : 3;
+        for (u32 i = 0; i < sw_num_vertices; i++)
         {
           GPUBackendDrawPolygonCommand::Vertex* vert = &cmd->vertices[i];
           vert->x = native_vertex_positions[i][0];
@@ -1863,12 +1890,13 @@ void GPU_HW::LoadVertices()
          break;
      }
 
-      if (!IsDrawingAreaIsValid())
+      if (!IsDrawingAreaIsValid()) [[unlikely]]
        return;
 
      // we can split the rectangle up into potentially 8 quads
      SetBatchDepthBuffer(false);
-      DebugAssert(GetBatchVertexSpace() >= MAX_VERTICES_FOR_RECTANGLE);
+      DebugAssert(m_batch_vertex_space >= MAX_VERTICES_FOR_RECTANGLE &&
+                  m_batch_index_space >= MAX_VERTICES_FOR_RECTANGLE);
 
      // Split the rectangle into multiple quads if it's greater than 256x256, as the texture page should repeat.
      u16 tex_top = orig_tex_top;
@@ -1890,13 +1918,26 @@ void GPU_HW::LoadVertices()
        CheckForTexPageOverlap(texpage, tex_left, tex_top, tex_right - 1, tex_bottom - 1);
 
-        AddNewVertex(quad_start_x, quad_start_y, depth, 1.0f, color, texpage, tex_left, tex_top, uv_limits);
-        AddNewVertex(quad_end_x, quad_start_y, depth, 1.0f, color, texpage, tex_right, tex_top, uv_limits);
-        AddNewVertex(quad_start_x, quad_end_y, depth, 1.0f, color, texpage, tex_left, tex_bottom, uv_limits);
-
-        AddNewVertex(quad_start_x, quad_end_y, depth, 1.0f, color, texpage, tex_left, tex_bottom, uv_limits);
-        AddNewVertex(quad_end_x, quad_start_y, depth, 1.0f, color, texpage, tex_right, tex_top, uv_limits);
-        AddNewVertex(quad_end_x, quad_end_y, depth, 1.0f, color, texpage, tex_right, tex_bottom, uv_limits);
+        const u32 base_vertex = m_batch_vertex_count;
+        (m_batch_vertex_ptr++)
+          ->Set(quad_start_x, quad_start_y, depth, 1.0f, color, texpage, tex_left, tex_top, uv_limits);
+        (m_batch_vertex_ptr++)
+          ->Set(quad_end_x, quad_start_y, depth, 1.0f, color, texpage, tex_right, tex_top, uv_limits);
+        (m_batch_vertex_ptr++)
+          ->Set(quad_start_x, quad_end_y, depth, 1.0f, color, texpage, tex_left, tex_bottom, uv_limits);
+        (m_batch_vertex_ptr++)
+          ->Set(quad_end_x, quad_end_y, depth, 1.0f, color, texpage, tex_right, tex_bottom, uv_limits);
+        m_batch_vertex_count += 4;
+        m_batch_vertex_space -= 4;
+
+        *(m_batch_index_ptr++) = Truncate16(base_vertex + 0);
+        *(m_batch_index_ptr++) = Truncate16(base_vertex + 1);
+        *(m_batch_index_ptr++) = Truncate16(base_vertex + 2);
+        *(m_batch_index_ptr++) = Truncate16(base_vertex + 2);
+        *(m_batch_index_ptr++) = Truncate16(base_vertex + 1);
+        *(m_batch_index_ptr++) = Truncate16(base_vertex + 3);
+        m_batch_index_count += 6;
+        m_batch_index_space -= 6;
 
        x_offset += quad_width;
        tex_left = 0;
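
The loop above walks the rectangle in steps of up to 256×256 so each emitted quad stays within one texture page repeat; a 512×384 rectangle, for example, becomes 2 × 2 = 4 quads. A sketch of that quad-count arithmetic under the stated page size, ignoring the extra split a nonzero starting offset inside a page can add:

    #include <cstdint>

    // Quads needed when each one may cover at most a 256x256 page repeat.
    static constexpr uint32_t RectangleQuadCount(uint32_t width, uint32_t height)
    {
      constexpr uint32_t kPageSize = 256;
      return ((width + kPageSize - 1) / kPageSize) * ((height + kPageSize - 1) / kPageSize);
    }

    static_assert(RectangleQuadCount(256, 256) == 1);
    static_assert(RectangleQuadCount(512, 384) == 4);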
@@ -1937,7 +1978,7 @@ void GPU_HW::LoadVertices()
       if (!rc.polyline)
       {
-        DebugAssert(GetBatchVertexSpace() >= 2);
+        DebugAssert(m_batch_vertex_space >= 4 && m_batch_index_space >= 6);
 
         u32 start_color, end_color;
         GPUVertexPosition start_pos, end_pos;
@@ -1955,7 +1996,7 @@ void GPU_HW::LoadVertices()
           end_pos.bits = FifoPop();
         }
 
-        if (!IsDrawingAreaIsValid())
+        if (!IsDrawingAreaIsValid()) [[unlikely]]
           return;
 
         s32 start_x = start_pos.x + m_drawing_offset.x;
@@ -1996,9 +2037,9 @@ void GPU_HW::LoadVertices()
       {
         // Multiply by two because we don't use line strips.
         const u32 num_vertices = GetPolyLineVertexCount();
-        DebugAssert(GetBatchVertexSpace() >= (num_vertices * 2));
+        DebugAssert(m_batch_vertex_space >= (num_vertices * 4) && m_batch_index_space >= (num_vertices * 6));
 
-        if (!IsDrawingAreaIsValid())
+        if (!IsDrawingAreaIsValid()) [[unlikely]]
          return;
 
         const bool shaded = rc.shading_enable;
@@ -2181,7 +2222,7 @@ ALWAYS_INLINE_RELEASE void GPU_HW::CheckForTexPageOverlap(u32 texpage, u32 min_u
 
   if (update_drawn || update_written)
   {
-    if (GetBatchVertexCount() > 0)
+    if (m_batch_index_count > 0)
     {
       FlushRender();
       EnsureVertexBufferSpaceForCurrentCommand();
@@ -2194,7 +2235,7 @@ ALWAYS_INLINE_RELEASE void GPU_HW::CheckForTexPageOverlap(u32 texpage, u32 min_u
 
 ALWAYS_INLINE bool GPU_HW::IsFlushed() const
 {
-  return m_batch_current_vertex_ptr == m_batch_start_vertex_ptr;
+  return (m_batch_index_count == 0);
 }
 
 GPU_HW::InterlacedRenderMode GPU_HW::GetInterlacedRenderMode() const
@@ -2228,44 +2269,44 @@ ALWAYS_INLINE_RELEASE bool GPU_HW::NeedsShaderBlending(GPUTransparencyMode trans
           (transparency != GPUTransparencyMode::Disabled || IsBlendedTextureFiltering(m_texture_filtering)))));
 }
 
-ALWAYS_INLINE u32 GPU_HW::GetBatchVertexSpace() const
-{
-  return static_cast<u32>(m_batch_end_vertex_ptr - m_batch_current_vertex_ptr);
-}
-
-ALWAYS_INLINE u32 GPU_HW::GetBatchVertexCount() const
-{
-  return static_cast<u32>(m_batch_current_vertex_ptr - m_batch_start_vertex_ptr);
-}
-
-void GPU_HW::EnsureVertexBufferSpace(u32 required_vertices)
+void GPU_HW::EnsureVertexBufferSpace(u32 required_vertices, u32 required_indices)
 {
-  if (m_batch_current_vertex_ptr)
+  if (m_batch_vertex_ptr)
   {
-    if (GetBatchVertexSpace() >= required_vertices)
+    if (m_batch_vertex_space >= required_vertices && m_batch_index_space >= required_indices)
       return;
 
     FlushRender();
   }
 
-  MapBatchVertexPointer(required_vertices);
+  MapGPUBuffer(required_vertices, required_indices);
 }
 
 void GPU_HW::EnsureVertexBufferSpaceForCurrentCommand()
 {
   u32 required_vertices;
+  u32 required_indices;
   switch (m_render_command.primitive)
   {
     case GPUPrimitive::Polygon:
-      required_vertices = m_render_command.quad_polygon ? 6 : 3;
+      required_vertices = 4; // assume quad, in case of expansion
+      required_indices = 6;
       break;
     case GPUPrimitive::Rectangle:
-      required_vertices = MAX_VERTICES_FOR_RECTANGLE;
+      required_vertices = MAX_VERTICES_FOR_RECTANGLE; // TODO: Wrong
+      required_indices = MAX_VERTICES_FOR_RECTANGLE;
       break;
     case GPUPrimitive::Line:
-    default:
-      required_vertices = m_render_command.polyline ? (GetPolyLineVertexCount() * 6u) : 6u;
+    {
+      // assume expansion
+      const u32 vert_count = m_render_command.polyline ? GetPolyLineVertexCount() : 2;
+      required_vertices = vert_count * 4;
+      required_indices = vert_count * 6;
+    }
       break;
+
+    default:
+      UnreachableCode();
   }
 
   // can we fit these vertices in the current depth buffer range?
@@ -2273,16 +2314,11 @@ void GPU_HW::EnsureVertexBufferSpaceForCurrentCommand()
   {
     // implies FlushRender()
     ResetBatchVertexDepth();
-  }
-  else if (m_batch_current_vertex_ptr)
-  {
-    if (GetBatchVertexSpace() >= required_vertices)
-      return;
-
-    FlushRender();
+    MapGPUBuffer(required_vertices, required_indices);
+    return;
   }
 
-  MapBatchVertexPointer(required_vertices);
+  EnsureVertexBufferSpace(required_vertices, required_indices);
 }
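
The reworked reservation always budgets for worst-case expansion before the command is parsed: a triangle still reserves a quad's four vertices and six indices, and each polyline point is budgeted as a full quad even though only the segments between points draw. A condensed, standalone model of that sizing logic (the enum and parameters are stand-ins for the real members):

    #include <cstdint>

    enum class Primitive { Polygon, Rectangle, Line };

    struct Requirement { uint32_t vertices, indices; };

    // 'polyline_points' stands in for GetPolyLineVertexCount(), and
    // 'max_rect' for the MAX_VERTICES_FOR_RECTANGLE constant used above.
    static Requirement ComputeRequirement(Primitive prim, bool polyline,
                                          uint32_t polyline_points, uint32_t max_rect)
    {
      switch (prim)
      {
        case Primitive::Polygon:
          return {4, 6}; // assume a quad even for triangle commands
        case Primitive::Rectangle:
          return {max_rect, max_rect}; // worst-case page-split reservation
        case Primitive::Line:
        default:
        {
          // every point budgeted as a full 4-vertex/6-index quad
          const uint32_t points = polyline ? polyline_points : 2;
          return {points * 4, points * 6};
        }
      }
    }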
void GPU_HW::ResetBatchVertexDepth() void GPU_HW::ResetBatchVertexDepth()
@ -2769,7 +2805,7 @@ void GPU_HW::DispatchRenderCommand()
EnsureVertexBufferSpaceForCurrentCommand(); EnsureVertexBufferSpaceForCurrentCommand();
if (GetBatchVertexCount() == 0) if (m_batch_index_count == 0)
{ {
// transparency mode change // transparency mode change
if (transparency_mode != GPUTransparencyMode::Disabled && if (transparency_mode != GPUTransparencyMode::Disabled &&
@@ -2842,14 +2878,13 @@ void GPU_HW::DispatchRenderCommand()
 
 void GPU_HW::FlushRender()
 {
-  if (!m_batch_current_vertex_ptr)
+  if (m_batch_index_count == 0)
     return;
 
-  const u32 vertex_count = GetBatchVertexCount();
-  UnmapBatchVertexPointer(vertex_count);
-
-  if (vertex_count == 0)
-    return;
+  const u32 base_vertex = m_batch_base_vertex;
+  const u32 base_index = m_batch_base_index;
+  const u32 index_count = m_batch_index_count;
+  UnmapGPUBuffer(m_batch_vertex_count, m_batch_index_count);
 
 #ifdef _DEBUG
   GL_SCOPE_FMT("Hardware Draw {}", ++s_draw_number);
@@ -2870,19 +2905,19 @@ void GPU_HW::FlushRender()
   {
     if (NeedsTwoPassRendering())
     {
-      DrawBatchVertices(BatchRenderMode::OnlyOpaque, vertex_count, m_batch_base_vertex);
-      DrawBatchVertices(BatchRenderMode::OnlyTransparent, vertex_count, m_batch_base_vertex);
+      DrawBatchVertices(BatchRenderMode::OnlyOpaque, index_count, base_index, base_vertex);
+      DrawBatchVertices(BatchRenderMode::OnlyTransparent, index_count, base_index, base_vertex);
     }
     else
     {
-      DrawBatchVertices(m_batch.GetRenderMode(), vertex_count, m_batch_base_vertex);
+      DrawBatchVertices(m_batch.GetRenderMode(), index_count, base_index, base_vertex);
     }
   }
 
   if (m_wireframe_mode != GPUWireframeMode::Disabled)
   {
     g_gpu_device->SetPipeline(m_wireframe_pipeline.get());
-    g_gpu_device->Draw(vertex_count, m_batch_base_vertex);
+    g_gpu_device->DrawIndexed(index_count, base_index, base_vertex);
   }
 }

src/core/gpu_hw.h

@@ -132,11 +132,6 @@ private:
 
   void LoadVertices();
 
-  void AddVertex(const BatchVertex& v);
-
-  template<typename... Args>
-  void AddNewVertex(Args&&... args);
-
   void PrintSettingsToLog();
   void CheckSettings();
@@ -144,9 +139,9 @@ private:
   void UpdateDepthBufferFromMaskBit();
   void ClearDepthBuffer();
   void SetScissor();
-  void MapBatchVertexPointer(u32 required_vertices);
-  void UnmapBatchVertexPointer(u32 used_vertices);
-  void DrawBatchVertices(BatchRenderMode render_mode, u32 num_vertices, u32 base_vertex);
+  void MapGPUBuffer(u32 required_vertices, u32 required_indices);
+  void UnmapGPUBuffer(u32 used_vertices, u32 used_indices);
+  void DrawBatchVertices(BatchRenderMode render_mode, u32 num_indices, u32 base_index, u32 base_vertex);
 
   u32 CalculateResolutionScale() const;
   GPUDownsampleMode GetDownsampleMode(u32 resolution_scale) const;
@@ -160,9 +155,7 @@ private:
   void CheckForTexPageOverlap(u32 texpage, u32 min_u, u32 min_v, u32 max_u, u32 max_v);
 
   bool IsFlushed() const;
-  u32 GetBatchVertexSpace() const;
-  u32 GetBatchVertexCount() const;
-  void EnsureVertexBufferSpace(u32 required_vertices);
+  void EnsureVertexBufferSpace(u32 required_vertices, u32 required_indices);
   void EnsureVertexBufferSpaceForCurrentCommand();
   void ResetBatchVertexDepth();
@@ -225,10 +218,14 @@ private:
   std::unique_ptr<GPU_SW_Backend> m_sw_renderer;
 
-  BatchVertex* m_batch_start_vertex_ptr = nullptr;
-  BatchVertex* m_batch_end_vertex_ptr = nullptr;
-  BatchVertex* m_batch_current_vertex_ptr = nullptr;
+  BatchVertex* m_batch_vertex_ptr = nullptr;
+  u16* m_batch_index_ptr = nullptr;
   u32 m_batch_base_vertex = 0;
+  u32 m_batch_base_index = 0;
+  u16 m_batch_vertex_count = 0;
+  u16 m_batch_index_count = 0;
+  u16 m_batch_vertex_space = 0;
+  u16 m_batch_index_space = 0;
 
   s32 m_current_depth = 0;
   float m_last_depth_z = 1.0f;
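
The counters and space fields shrink to u16 because the index buffer is itself 16-bit: an index can only address vertices 0 through 65535, so clamping the free-space counters to that range at map time loses nothing. A standalone sketch of the clamp, assuming Truncate16 is the plain narrowing cast its name suggests:

    #include <algorithm>
    #include <cstdint>
    #include <limits>

    // Mirrors the Truncate16(std::min<u32>(space, u16 max)) pattern that
    // MapGPUBuffer applies to both buffers' free space.
    static uint16_t ClampToIndexableSpace(uint32_t space)
    {
      return static_cast<uint16_t>(
        std::min<uint32_t>(space, std::numeric_limits<uint16_t>::max()));
    }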