GPU: Add base functionality for GL streaming buffers

2025-04-10 19:15:14 +00:00 · 2019-11-02 22:21:56 +10:00 · 2019-11-02 22:21:56 +10:00 · 407fee9ec3
parent c52c0608ae
commit 407fee9ec3
8 changed files with 189 additions and 59 deletions
--- a/src/common/common.vcxproj
+++ b/src/common/common.vcxproj
@ -40,6 +40,7 @@
    <ClInclude Include="cd_image.h" />
    <ClInclude Include="fifo_queue.h" />
    <ClInclude Include="gl_program.h" />
+    <ClInclude Include="gl_stream_buffer.h" />
    <ClInclude Include="gl_texture.h" />
    <ClInclude Include="heap_array.h" />
    <ClInclude Include="jit_code_buffer.h" />
@ -53,6 +54,7 @@
    <ClCompile Include="cd_image_bin.cpp" />
    <ClCompile Include="cd_image_cue.cpp" />
    <ClCompile Include="gl_program.cpp" />
+    <ClCompile Include="gl_stream_buffer.cpp" />
    <ClCompile Include="gl_texture.cpp" />
    <ClCompile Include="jit_code_buffer.cpp" />
    <ClCompile Include="state_wrapper.cpp" />
--- a/src/common/common.vcxproj.filters
+++ b/src/common/common.vcxproj.filters
@ -12,6 +12,7 @@
    <ClInclude Include="audio_stream.h" />
    <ClInclude Include="cd_xa.h" />
    <ClInclude Include="heap_array.h" />
+    <ClInclude Include="gl_stream_buffer.h" />
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="jit_code_buffer.cpp" />
@ -23,6 +24,7 @@
    <ClCompile Include="cd_xa.cpp" />
    <ClCompile Include="cd_image_cue.cpp" />
    <ClCompile Include="cd_image_bin.cpp" />
+    <ClCompile Include="gl_stream_buffer.cpp" />
  </ItemGroup>
  <ItemGroup>
    <Natvis Include="bitfield.natvis" />
--- a/src/common/gl_stream_buffer.cpp
+++ b/src/common/gl_stream_buffer.cpp
@ -0,0 +1,53 @@
+#include "gl_stream_buffer.h"
+
+namespace GL {
+
+// Records the GL target and buffer name, and allocates a CPU staging vector of `size` elements.
+StreamBuffer::StreamBuffer(GLenum target, GLuint buffer_id, u32 size)
+  : m_target{target}
+  , m_buffer_id{buffer_id}
+  , m_size{size}
+  , m_cpu_buffer(size) // parentheses on purpose: this is the vector's size constructor
+{
+}
+
+// Deletes the GL buffer name held by this instance.
+StreamBuffer::~StreamBuffer()
+{
+  const GLuint doomed_buffers[1] = {m_buffer_id};
+  glDeleteBuffers(1, doomed_buffers);
+}
+
+// Binds the GL buffer to the target stored at construction.
+void StreamBuffer::Bind()
+{
+  const GLenum bind_point = m_target;
+  const GLuint buffer_name = m_buffer_id;
+  glBindBuffer(bind_point, buffer_name);
+}
+
+// Hands out the CPU staging buffer for writing.
+//   alignment: element size in bytes, used to express the available space in elements.
+//              A value of zero is treated as one so the division below stays defined.
+//   min_size:  requested byte count. This implementation does not consult it; the whole
+//              buffer is always offered, starting at index 0.
+// Returns the staging pointer, an aligned start index of 0, and the buffer size in elements.
+StreamBuffer::MappingResult StreamBuffer::Map(u32 alignment, u32 min_size)
+{
+  // Dividing by a zero alignment would be undefined behaviour.
+  const u32 element_size = (alignment != 0) ? alignment : 1;
+  return MappingResult{static_cast<void*>(m_cpu_buffer.data()), 0, m_size / element_size};
+}
+
+// Copies the first used_size bytes of the CPU staging buffer to offset 0 of the GL buffer.
+// A used_size of zero does nothing and leaves the current GL binding untouched.
+void StreamBuffer::Unmap(u32 used_size)
+{
+  if (used_size != 0)
+  {
+    const void* const staged_bytes = m_cpu_buffer.data();
+    glBindBuffer(m_target, m_buffer_id);
+    glBufferSubData(m_target, 0, used_size, staged_bytes);
+  }
+}
+
+// Creates a GL buffer of `size` bytes with GL_STREAM_DRAW usage on `target` and wraps it.
+// Returns an empty pointer when GL reports an error while creating or allocating the buffer.
+// The new buffer is left bound to `target` on success.
+std::unique_ptr<StreamBuffer> StreamBuffer::Create(GLenum target, u32 size)
+{
+  // Clear a stale error flag so the check below reflects the calls made here.
+  glGetError();
+
+  // Zero-initialised so the failure path never passes an indeterminate name to glDeleteBuffers.
+  GLuint buffer_id = 0;
+  glGenBuffers(1, &buffer_id);
+  glBindBuffer(target, buffer_id);
+  glBufferData(target, size, nullptr, GL_STREAM_DRAW);
+
+  if (glGetError() != GL_NO_ERROR)
+  {
+    glDeleteBuffers(1, &buffer_id);
+    return {};
+  }
+
+  // The constructor is private, so std::make_unique cannot be used here.
+  return std::unique_ptr<StreamBuffer>(new StreamBuffer(target, buffer_id, size));
+}
+
+} // namespace GL
--- a/src/common/gl_stream_buffer.h
+++ b/src/common/gl_stream_buffer.h
@ -0,0 +1,42 @@
+#pragma once
+#include "types.h"
+#include <glad.h>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+namespace GL {
+// Streaming buffer backed by a single GL buffer object. Writes are staged in a CPU-side
+// byte vector handed out by Map() and pushed to the GL buffer by Unmap().
+// Instances are obtained through Create(); the constructor is private.
+// TODO: Persistent mapping-based implementation
+class StreamBuffer
+{
+public:
+  ~StreamBuffer();
+
+  // Non-copyable: the destructor deletes the GL buffer name, so a copy would delete it a second time.
+  StreamBuffer(const StreamBuffer&) = delete;
+  StreamBuffer& operator=(const StreamBuffer&) = delete;
+
+  ALWAYS_INLINE GLuint GetGLBufferId() const { return m_buffer_id; }
+  ALWAYS_INLINE GLenum GetGLTarget() const { return m_target; }
+  ALWAYS_INLINE u32 GetSize() const { return m_size; }
+
+  // Binds the GL buffer to its target.
+  void Bind();
+
+  struct MappingResult
+  {
+    void* pointer;
+    u32 index_aligned; // offset / alignment, suitable for base vertex
+    u32 space_aligned; // remaining space / alignment
+  };
+
+  // Map() returns a pointer to write into; Unmap() uploads the first used_size bytes written.
+  MappingResult Map(u32 alignment, u32 min_size);
+  void Unmap(u32 used_size);
+
+  // Returns an empty pointer if the GL buffer could not be created.
+  static std::unique_ptr<StreamBuffer> Create(GLenum target, u32 size);
+
+private:
+  StreamBuffer(GLenum target, GLuint buffer_id, u32 size);
+
+  GLenum m_target;
+  GLuint m_buffer_id;
+  u32 m_size;
+
+  // CPU-side staging storage handed out by Map().
+  std::vector<u8> m_cpu_buffer;
+};
+} // namespace GL
--- a/src/core/gpu_hw.cpp
+++ b/src/core/gpu_hw.cpp
@ -49,9 +49,9 @@ void GPU_HW::LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command
    case Primitive::Polygon:
    {
      // if we're drawing quads, we need to create a degenerate triangle to restart the triangle strip
-      bool restart_strip = (rc.quad_polygon && !m_batch.vertices.empty());
+      bool restart_strip = (rc.quad_polygon && !IsFlushed());
      if (restart_strip)
-        m_batch.vertices.push_back(m_batch.vertices.back());
+        AddDuplicateVertex();

      const u32 first_color = rc.color_for_first_vertex;
      const bool shaded = rc.shading_enable;
@ -60,28 +60,15 @@ void GPU_HW::LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command
      u32 buffer_pos = 1;
      for (u32 i = 0; i < num_vertices; i++)
      {
-        HWVertex hw_vert;
-        hw_vert.color = (shaded && i > 0) ? (command_ptr[buffer_pos++] & UINT32_C(0x00FFFFFF)) : first_color;
-
+        const u32 color = (shaded && i > 0) ? (command_ptr[buffer_pos++] & UINT32_C(0x00FFFFFF)) : first_color;
        const VertexPosition vp{command_ptr[buffer_pos++]};
-        hw_vert.x = vp.x;
-        hw_vert.y = vp.y;
-        hw_vert.texpage = texpage;
+        const u16 packed_texcoord = textured ? Truncate16(command_ptr[buffer_pos++]) : 0;

-        if (textured)
-        {
-          const auto [texcoord_x, texcoord_y] = UnpackTexcoord(Truncate16(command_ptr[buffer_pos++]));
-          hw_vert.texcoord = HWVertex::PackTexcoord(texcoord_x, texcoord_y);
-        }
-        else
-        {
-          hw_vert.texcoord = 0;
-        }
+        (m_batch_current_vertex_ptr++)->Set(vp.x, vp.y, color, texpage, packed_texcoord);

-        m_batch.vertices.push_back(hw_vert);
        if (restart_strip)
        {
-          m_batch.vertices.push_back(m_batch.vertices.back());
+          AddDuplicateVertex();
          restart_strip = false;
        }
      }
@ -91,9 +78,9 @@ void GPU_HW::LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command
    case Primitive::Rectangle:
    {
      // if we're drawing quads, we need to create a degenerate triangle to restart the triangle strip
-      const bool restart_strip = !m_batch.vertices.empty();
+      const bool restart_strip = !IsFlushed();
      if (restart_strip)
-        m_batch.vertices.push_back(m_batch.vertices.back());
+        AddDuplicateVertex();

      u32 buffer_pos = 1;
      const u32 color = rc.color_for_first_vertex;
@ -132,16 +119,13 @@ void GPU_HW::LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command
      const u16 tex_right = tex_left + static_cast<u16>(rectangle_width);
      const u16 tex_bottom = tex_top + static_cast<u16>(rectangle_height);

-      m_batch.vertices.push_back(
-        HWVertex{pos_left, pos_top, color, texpage, HWVertex::PackTexcoord(tex_left, tex_top)});
+      (m_batch_current_vertex_ptr++)->Set(pos_left, pos_top, color, texpage, tex_left, tex_top);
      if (restart_strip)
-        m_batch.vertices.push_back(m_batch.vertices.back());
-      m_batch.vertices.push_back(
-        HWVertex{pos_right, pos_top, color, texpage, HWVertex::PackTexcoord(tex_right, tex_top)});
-      m_batch.vertices.push_back(
-        HWVertex{pos_left, pos_bottom, color, texpage, HWVertex::PackTexcoord(tex_left, tex_bottom)});
-      m_batch.vertices.push_back(
-        HWVertex{pos_right, pos_bottom, color, texpage, HWVertex::PackTexcoord(tex_right, tex_bottom)});
+        AddDuplicateVertex();
+
+      (m_batch_current_vertex_ptr++)->Set(pos_right, pos_top, color, texpage, tex_right, tex_top);
+      (m_batch_current_vertex_ptr++)->Set(pos_left, pos_bottom, color, texpage, tex_left, tex_bottom);
+      (m_batch_current_vertex_ptr++)->Set(pos_right, pos_bottom, color, texpage, tex_right, tex_bottom);
    }
    break;

@ -155,7 +139,7 @@ void GPU_HW::LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command
      {
        const u32 color = (shaded && i > 0) ? (command_ptr[buffer_pos++] & UINT32_C(0x00FFFFFF)) : first_color;
        const VertexPosition vp{command_ptr[buffer_pos++]};
-        m_batch.vertices.push_back(HWVertex{vp.x.GetValue(), vp.y.GetValue(), color});
+        (m_batch_current_vertex_ptr++)->Set(vp.x, vp.y, color, 0, 0);
      }
    }
    break;
@ -166,6 +150,12 @@ void GPU_HW::LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command
  }
 }

+// Appends a copy of the most recently written vertex to the current batch.
+// Reads the slot just before the write cursor, so a vertex must already precede it.
+void GPU_HW::AddDuplicateVertex()
+{
+  HWVertex* const dest = m_batch_current_vertex_ptr++;
+  std::memcpy(dest, dest - 1, sizeof(HWVertex));
+}
+
 void GPU_HW::CalcScissorRect(int* left, int* top, int* right, int* bottom)
 {
  *left = m_drawing_area.left * m_resolution_scale;
@ -567,8 +557,6 @@ GPU_HW::HWPrimitive GPU_HW::GetPrimitiveForCommand(RenderCommand rc)
    return HWPrimitive::Triangles;
 }

-void GPU_HW::InvalidateVRAMReadCache() {}
-
 void GPU_HW::DispatchRenderCommand(RenderCommand rc, u32 num_vertices, const u32* command_ptr)
 {
  TextureMode texture_mode;
@ -612,10 +600,10 @@ void GPU_HW::DispatchRenderCommand(RenderCommand rc, u32 num_vertices, const u32
    rc.transparency_enable ? m_render_state.transparency_mode : TransparencyMode::Disabled;
  const HWPrimitive rc_primitive = GetPrimitiveForCommand(rc);
  const bool dithering_enable = (!m_true_color && rc.IsDitheringEnabled()) ? m_GPUSTAT.dither_enable : false;
+  const u32 max_added_vertices = num_vertices + 2;
  if (!IsFlushed())
  {
-    const u32 max_added_vertices = num_vertices + 2;
-    const bool buffer_overflow = (m_batch.vertices.size() + max_added_vertices) >= MAX_BATCH_VERTEX_COUNT;
+    const bool buffer_overflow = GetBatchVertexSpace() < max_added_vertices;
    if (buffer_overflow || rc_primitive == HWPrimitive::LineStrip || m_batch.texture_mode != texture_mode ||
        m_batch.transparency_mode != transparency_mode || m_batch.primitive != rc_primitive ||
        dithering_enable != m_batch.dithering || m_render_state.IsTexturePageChanged() ||
@ -625,6 +613,10 @@ void GPU_HW::DispatchRenderCommand(RenderCommand rc, u32 num_vertices, const u32
    }
  }

+  // map buffer if it's not already done
+  if (!m_batch_current_vertex_ptr)
+    MapBatchVertexPointer(max_added_vertices);
+
  // update state
  m_batch.primitive = rc_primitive;
  m_batch.texture_mode = texture_mode;
--- a/src/core/gpu_hw.h
+++ b/src/core/gpu_hw.h
@ -38,13 +38,24 @@ protected:
    s32 y;
    u32 color;
    u32 texpage;
-    u32 texcoord;
+    u32 texcoord; // 16-bit texcoords are needed for 256 extent rectangles

-    // 16-bit texcoords are needed for 256 extent rectangles
-    static u32 PackTexcoord(u16 x, u16 y) { return ZeroExtend32(x) | (ZeroExtend32(y) << 16); }
+    // Fills the vertex from a packed texcoord: the low byte is X, the remaining upper bits are Y.
+    ALWAYS_INLINE void Set(s32 x_, s32 y_, u32 color_, u32 texpage_, u16 packed_texcoord)
+    {
+      const u16 unpacked_x = static_cast<u16>(packed_texcoord & 0xFF);
+      const u16 unpacked_y = static_cast<u16>(packed_texcoord >> 8);
+      Set(x_, y_, color_, texpage_, unpacked_x, unpacked_y);
+    }
+
+    // Assigns every field; the texcoord pair is stored with X in the low 16 bits and Y in the high 16 bits.
+    ALWAYS_INLINE void Set(s32 x_, s32 y_, u32 color_, u32 texpage_, u16 texcoord_x, u16 texcoord_y)
+    {
+      const u32 texcoord_low = ZeroExtend32(texcoord_x);
+      const u32 texcoord_high = ZeroExtend32(texcoord_y) << 16;
+
+      x = x_;
+      y = y_;
+      color = color_;
+      texpage = texpage_;
+      texcoord = texcoord_low | texcoord_high;
+    }
  };

-  struct HWRenderBatch
+  struct HWBatchConfig
  {
    u32 texture_page_x;
    u32 texture_page_y;
@ -56,8 +67,6 @@ protected:
    std::array<u8, 4> texture_window_values;
    bool dithering;

-    std::vector<HWVertex> vertices;
-
    // We need two-pass rendering when using BG-FG blending and texturing, as the transparency can be enabled
    // on a per-pixel basis, and the opaque pixels shouldn't be blended at all.
    bool NeedsTwoPassRendering() const
@ -75,6 +84,7 @@ protected:
  };

  static constexpr u32 VERTEX_BUFFER_SIZE = 1 * 1024 * 1024;
+  static constexpr u32 MIN_BATCH_VERTEX_COUNT = 6;
  static constexpr u32 MAX_BATCH_VERTEX_COUNT = VERTEX_BUFFER_SIZE / sizeof(HWVertex);
  static constexpr u32 TEXTURE_TILE_SIZE = 256;
  static constexpr u32 TEXTURE_TILE_X_COUNT = VRAM_WIDTH / TEXTURE_TILE_SIZE;
@ -89,9 +99,14 @@ protected:
                           static_cast<float>(rgba >> 24) * (1.0f / 255.0f));
  }

-  virtual void InvalidateVRAMReadCache();
+  virtual void InvalidateVRAMReadCache() = 0;

-  bool IsFlushed() const { return m_batch.vertices.empty(); }
+  virtual void MapBatchVertexPointer(u32 required_vertices) = 0;
+
+  // Number of vertices between the write cursor and the end pointer of the batch.
+  u32 GetBatchVertexSpace() const
+  {
+    const auto remaining = m_batch_end_vertex_ptr - m_batch_current_vertex_ptr;
+    return static_cast<u32>(remaining);
+  }
+  // Number of vertices between the start pointer of the batch and the write cursor.
+  u32 GetBatchVertexCount() const
+  {
+    const auto written = m_batch_current_vertex_ptr - m_batch_start_vertex_ptr;
+    return static_cast<u32>(written);
+  }
+
+  // True when the write cursor sits at the batch start, which includes the case where both are null.
+  bool IsFlushed() const
+  {
+    const bool cursor_at_start = (m_batch_start_vertex_ptr == m_batch_current_vertex_ptr);
+    return cursor_at_start;
+  }

  void DispatchRenderCommand(RenderCommand rc, u32 num_vertices, const u32* command_ptr) override;

@ -108,7 +123,13 @@ protected:
  std::string GenerateFillFragmentShader();
  std::string GenerateDisplayFragmentShader(bool depth_24bit, bool interlaced);

-  HWRenderBatch m_batch = {};
+  HWBatchConfig m_batch = {};
+
+  HWVertex* m_batch_start_vertex_ptr = nullptr;
+  HWVertex* m_batch_end_vertex_ptr = nullptr;
+  HWVertex* m_batch_current_vertex_ptr = nullptr;
+  u32 m_batch_base_vertex = 0;
+
  u32 m_resolution_scale = 1;
  u32 m_max_resolution_scale = 1;
  bool m_true_color = false;
@ -119,4 +140,5 @@ private:
  void GenerateShaderHeader(std::stringstream& ss);

  void LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command_ptr);
+  void AddDuplicateVertex();
 };
--- a/src/core/gpu_hw_opengl.cpp
+++ b/src/core/gpu_hw_opengl.cpp
@ -62,7 +62,6 @@ void GPU_HW_OpenGL::RestoreGraphicsAPIState()
  glLineWidth(static_cast<float>(m_resolution_scale));
  UpdateDrawingArea();

-  glBindBuffer(GL_ARRAY_BUFFER, m_vertex_buffer);
  glBindVertexArray(m_vao_id);
 }

@ -132,6 +131,19 @@ void GPU_HW_OpenGL::InvalidateVRAMReadCache()
  m_vram_read_texture_dirty = true;
 }

+// Maps the vertex stream buffer and points the batch start/current/end pointers at the mapping.
+// Asserts that no batch is currently mapped (the start pointer must be null).
+void GPU_HW_OpenGL::MapBatchVertexPointer(u32 required_vertices)
+{
+  Assert(!m_batch_start_vertex_ptr);
+
+  const GL::StreamBuffer::MappingResult mapping =
+    m_vertex_stream_buffer->Map(sizeof(HWVertex), required_vertices * sizeof(HWVertex));
+  HWVertex* const mapped_begin = static_cast<HWVertex*>(mapping.pointer);
+
+  m_batch_base_vertex = mapping.index_aligned;
+  m_batch_start_vertex_ptr = mapped_begin;
+  m_batch_current_vertex_ptr = mapped_begin;
+  m_batch_end_vertex_ptr = mapped_begin + mapping.space_aligned;
+}
+
 std::tuple<s32, s32> GPU_HW_OpenGL::ConvertToFramebufferCoordinates(s32 x, s32 y)
 {
  return std::make_tuple(x, static_cast<s32>(static_cast<s32>(VRAM_HEIGHT) - y));
@ -217,9 +229,11 @@ void GPU_HW_OpenGL::DestroyFramebuffer()

 void GPU_HW_OpenGL::CreateVertexBuffer()
 {
-  glGenBuffers(1, &m_vertex_buffer);
-  glBindBuffer(GL_ARRAY_BUFFER, m_vertex_buffer);
-  glBufferData(GL_ARRAY_BUFFER, VERTEX_BUFFER_SIZE, nullptr, GL_STREAM_DRAW);
+  m_vertex_stream_buffer = GL::StreamBuffer::Create(GL_ARRAY_BUFFER, VERTEX_BUFFER_SIZE);
+  if (!m_vertex_stream_buffer)
+    Panic("Failed to create vertex streaming buffer");
+
+  m_vertex_stream_buffer->Bind();

  glGenVertexArrays(1, &m_vao_id);
  glBindVertexArray(m_vao_id);
@ -638,35 +652,36 @@ void GPU_HW_OpenGL::UpdateVRAMReadTexture()

 void GPU_HW_OpenGL::FlushRender() // uploads the vertices written to the current batch and issues its draw call(s)
 {
-  if (m_batch.vertices.empty())
+  const u32 vertex_count = GetBatchVertexCount();
+  if (vertex_count == 0) // nothing was written into the batch, so there is nothing to draw
    return;

  if (m_vram_read_texture_dirty)
    UpdateVRAMReadTexture();

  m_stats.num_batches++;
-  m_stats.num_vertices += static_cast<u32>(m_batch.vertices.size());
+  m_stats.num_vertices += vertex_count;

-  Assert((m_batch.vertices.size() * sizeof(HWVertex)) <= VERTEX_BUFFER_SIZE);
-  glBufferSubData(GL_ARRAY_BUFFER, 0, static_cast<GLsizei>(sizeof(HWVertex) * m_batch.vertices.size()),
-                  m_batch.vertices.data());
+  m_vertex_stream_buffer->Unmap(vertex_count * sizeof(HWVertex)); // upload only the bytes that were written
+  m_vertex_stream_buffer->Bind();
+  m_batch_start_vertex_ptr = nullptr;
+  m_batch_end_vertex_ptr = nullptr;
+  m_batch_current_vertex_ptr = nullptr; // a null cursor makes the next render command map the buffer again

  static constexpr std::array<GLenum, 4> gl_primitives = {{GL_LINES, GL_LINE_STRIP, GL_TRIANGLES, GL_TRIANGLE_STRIP}};

  if (m_batch.NeedsTwoPassRendering()) // same vertices drawn twice: transparent pass first, then opaque pass
  {
    SetDrawState(HWBatchRenderMode::OnlyTransparent);
-    glDrawArrays(gl_primitives[static_cast<u8>(m_batch.primitive)], 0, static_cast<GLsizei>(m_batch.vertices.size()));
+    glDrawArrays(gl_primitives[static_cast<u8>(m_batch.primitive)], 0, vertex_count); // draws from vertex 0; m_batch_base_vertex is not consulted here
    SetDrawState(HWBatchRenderMode::OnlyOpaque);
-    glDrawArrays(gl_primitives[static_cast<u8>(m_batch.primitive)], 0, static_cast<GLsizei>(m_batch.vertices.size()));
+    glDrawArrays(gl_primitives[static_cast<u8>(m_batch.primitive)], 0, vertex_count);
  }
  else
  {
    SetDrawState(m_batch.GetRenderMode());
-    glDrawArrays(gl_primitives[static_cast<u8>(m_batch.primitive)], 0, static_cast<GLsizei>(m_batch.vertices.size()));
+    glDrawArrays(gl_primitives[static_cast<u8>(m_batch.primitive)], 0, vertex_count);
  }
-
-  m_batch.vertices.clear();
 }

 std::unique_ptr<GPU> GPU::CreateHardwareOpenGLRenderer()
--- a/src/core/gpu_hw_opengl.h
+++ b/src/core/gpu_hw_opengl.h
@ -1,5 +1,6 @@
 #pragma once
 #include "common/gl_program.h"
+#include "common/gl_stream_buffer.h"
 #include "common/gl_texture.h"
 #include "glad.h"
 #include "gpu_hw.h"
@ -31,6 +32,7 @@ protected:
  void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) override;
  void FlushRender() override;
  void InvalidateVRAMReadCache() override;
+  void MapBatchVertexPointer(u32 required_vertices) override;

 private:
  struct GLStats
@ -62,7 +64,7 @@ private:
  std::unique_ptr<GL::Texture> m_vram_downsample_texture;
  std::unique_ptr<GL::Texture> m_display_texture;

-  GLuint m_vertex_buffer = 0;
+  std::unique_ptr<GL::StreamBuffer> m_vertex_stream_buffer;
  GLuint m_vao_id = 0;
  GLuint m_attributeless_vao_id = 0;