diff --git a/src/common/common.vcxproj b/src/common/common.vcxproj index 5b647cf64..43cec14de 100644 --- a/src/common/common.vcxproj +++ b/src/common/common.vcxproj @@ -40,6 +40,7 @@ + @@ -53,6 +54,7 @@ + diff --git a/src/common/common.vcxproj.filters b/src/common/common.vcxproj.filters index a7bf0f289..a7918e047 100644 --- a/src/common/common.vcxproj.filters +++ b/src/common/common.vcxproj.filters @@ -12,6 +12,7 @@ + @@ -23,6 +24,7 @@ + diff --git a/src/common/gl_stream_buffer.cpp b/src/common/gl_stream_buffer.cpp new file mode 100644 index 000000000..65a32fb58 --- /dev/null +++ b/src/common/gl_stream_buffer.cpp @@ -0,0 +1,53 @@ +#include "gl_stream_buffer.h" + +namespace GL { + +StreamBuffer::StreamBuffer(GLenum target, GLuint buffer_id, u32 size) + : m_target(target), m_buffer_id(buffer_id), m_size(size), m_cpu_buffer(size) +{ +} + +StreamBuffer::~StreamBuffer() +{ + glDeleteBuffers(1, &m_buffer_id); +} + +void StreamBuffer::Bind() +{ + glBindBuffer(m_target, m_buffer_id); +} + +StreamBuffer::MappingResult StreamBuffer::Map(u32 alignment, u32 min_size) +{ + return MappingResult{static_cast(m_cpu_buffer.data()), 0, m_size / alignment}; +} + +void StreamBuffer::Unmap(u32 used_size) +{ + if (used_size == 0) + return; + + glBindBuffer(m_target, m_buffer_id); + glBufferSubData(m_target, 0, used_size, m_cpu_buffer.data()); +} + +std::unique_ptr StreamBuffer::Create(GLenum target, u32 size) +{ + glGetError(); + + GLuint buffer_id; + glGenBuffers(1, &buffer_id); + glBindBuffer(target, buffer_id); + glBufferData(target, size, nullptr, GL_STREAM_DRAW); + + GLenum err = glGetError(); + if (err != GL_NO_ERROR) + { + glDeleteBuffers(1, &buffer_id); + return {}; + } + + return std::unique_ptr(new StreamBuffer(target, buffer_id, size)); +} + +} // namespace GL \ No newline at end of file diff --git a/src/common/gl_stream_buffer.h b/src/common/gl_stream_buffer.h new file mode 100644 index 000000000..5c1876bed --- /dev/null +++ b/src/common/gl_stream_buffer.h @@ -0,0 +1,42 @@ +#pragma once +#include "types.h" +#include +#include +#include +#include + +namespace GL { +// TODO: Persistent mapping-based implementation +class StreamBuffer +{ +public: + ~StreamBuffer(); + + ALWAYS_INLINE GLuint GetGLBufferId() const { return m_buffer_id; } + ALWAYS_INLINE GLenum GetGLTarget() const { return m_target; } + ALWAYS_INLINE u32 GetSize() const { return m_size; } + + void Bind(); + + struct MappingResult + { + void* pointer; + u32 index_aligned; // offset / alignment, suitable for base vertex + u32 space_aligned; // remaining space / alignment + }; + + MappingResult Map(u32 alignment, u32 min_size); + void Unmap(u32 used_size); + + static std::unique_ptr Create(GLenum target, u32 size); + +private: + StreamBuffer(GLenum target, GLuint buffer_id, u32 size); + + GLenum m_target; + GLuint m_buffer_id; + u32 m_size; + + std::vector m_cpu_buffer; +}; +} // namespace GL \ No newline at end of file diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp index 1ae3d370a..3c48e4eb9 100644 --- a/src/core/gpu_hw.cpp +++ b/src/core/gpu_hw.cpp @@ -49,9 +49,9 @@ void GPU_HW::LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command case Primitive::Polygon: { // if we're drawing quads, we need to create a degenerate triangle to restart the triangle strip - bool restart_strip = (rc.quad_polygon && !m_batch.vertices.empty()); + bool restart_strip = (rc.quad_polygon && !IsFlushed()); if (restart_strip) - m_batch.vertices.push_back(m_batch.vertices.back()); + AddDuplicateVertex(); const u32 first_color = rc.color_for_first_vertex; const bool shaded = rc.shading_enable; @@ -60,28 +60,15 @@ void GPU_HW::LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command u32 buffer_pos = 1; for (u32 i = 0; i < num_vertices; i++) { - HWVertex hw_vert; - hw_vert.color = (shaded && i > 0) ? (command_ptr[buffer_pos++] & UINT32_C(0x00FFFFFF)) : first_color; - + const u32 color = (shaded && i > 0) ? (command_ptr[buffer_pos++] & UINT32_C(0x00FFFFFF)) : first_color; const VertexPosition vp{command_ptr[buffer_pos++]}; - hw_vert.x = vp.x; - hw_vert.y = vp.y; - hw_vert.texpage = texpage; + const u16 packed_texcoord = textured ? Truncate16(command_ptr[buffer_pos++]) : 0; - if (textured) - { - const auto [texcoord_x, texcoord_y] = UnpackTexcoord(Truncate16(command_ptr[buffer_pos++])); - hw_vert.texcoord = HWVertex::PackTexcoord(texcoord_x, texcoord_y); - } - else - { - hw_vert.texcoord = 0; - } + (m_batch_current_vertex_ptr++)->Set(vp.x, vp.y, color, texpage, packed_texcoord); - m_batch.vertices.push_back(hw_vert); if (restart_strip) { - m_batch.vertices.push_back(m_batch.vertices.back()); + AddDuplicateVertex(); restart_strip = false; } } @@ -91,9 +78,9 @@ void GPU_HW::LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command case Primitive::Rectangle: { // if we're drawing quads, we need to create a degenerate triangle to restart the triangle strip - const bool restart_strip = !m_batch.vertices.empty(); + const bool restart_strip = !IsFlushed(); if (restart_strip) - m_batch.vertices.push_back(m_batch.vertices.back()); + AddDuplicateVertex(); u32 buffer_pos = 1; const u32 color = rc.color_for_first_vertex; @@ -132,16 +119,13 @@ void GPU_HW::LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command const u16 tex_right = tex_left + static_cast(rectangle_width); const u16 tex_bottom = tex_top + static_cast(rectangle_height); - m_batch.vertices.push_back( - HWVertex{pos_left, pos_top, color, texpage, HWVertex::PackTexcoord(tex_left, tex_top)}); + (m_batch_current_vertex_ptr++)->Set(pos_left, pos_top, color, texpage, tex_left, tex_top); if (restart_strip) - m_batch.vertices.push_back(m_batch.vertices.back()); - m_batch.vertices.push_back( - HWVertex{pos_right, pos_top, color, texpage, HWVertex::PackTexcoord(tex_right, tex_top)}); - m_batch.vertices.push_back( - HWVertex{pos_left, pos_bottom, color, texpage, HWVertex::PackTexcoord(tex_left, tex_bottom)}); - m_batch.vertices.push_back( - HWVertex{pos_right, pos_bottom, color, texpage, HWVertex::PackTexcoord(tex_right, tex_bottom)}); + AddDuplicateVertex(); + + (m_batch_current_vertex_ptr++)->Set(pos_right, pos_top, color, texpage, tex_right, tex_top); + (m_batch_current_vertex_ptr++)->Set(pos_left, pos_bottom, color, texpage, tex_left, tex_bottom); + (m_batch_current_vertex_ptr++)->Set(pos_right, pos_bottom, color, texpage, tex_right, tex_bottom); } break; @@ -155,7 +139,7 @@ void GPU_HW::LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command { const u32 color = (shaded && i > 0) ? (command_ptr[buffer_pos++] & UINT32_C(0x00FFFFFF)) : first_color; const VertexPosition vp{command_ptr[buffer_pos++]}; - m_batch.vertices.push_back(HWVertex{vp.x.GetValue(), vp.y.GetValue(), color}); + (m_batch_current_vertex_ptr++)->Set(vp.x, vp.y, color, 0, 0); } } break; @@ -166,6 +150,12 @@ void GPU_HW::LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command } } +void GPU_HW::AddDuplicateVertex() +{ + std::memcpy(m_batch_current_vertex_ptr, m_batch_current_vertex_ptr - 1, sizeof(HWVertex)); + m_batch_current_vertex_ptr++; +} + void GPU_HW::CalcScissorRect(int* left, int* top, int* right, int* bottom) { *left = m_drawing_area.left * m_resolution_scale; @@ -567,8 +557,6 @@ GPU_HW::HWPrimitive GPU_HW::GetPrimitiveForCommand(RenderCommand rc) return HWPrimitive::Triangles; } -void GPU_HW::InvalidateVRAMReadCache() {} - void GPU_HW::DispatchRenderCommand(RenderCommand rc, u32 num_vertices, const u32* command_ptr) { TextureMode texture_mode; @@ -612,10 +600,10 @@ void GPU_HW::DispatchRenderCommand(RenderCommand rc, u32 num_vertices, const u32 rc.transparency_enable ? m_render_state.transparency_mode : TransparencyMode::Disabled; const HWPrimitive rc_primitive = GetPrimitiveForCommand(rc); const bool dithering_enable = (!m_true_color && rc.IsDitheringEnabled()) ? m_GPUSTAT.dither_enable : false; + const u32 max_added_vertices = num_vertices + 2; if (!IsFlushed()) { - const u32 max_added_vertices = num_vertices + 2; - const bool buffer_overflow = (m_batch.vertices.size() + max_added_vertices) >= MAX_BATCH_VERTEX_COUNT; + const bool buffer_overflow = GetBatchVertexSpace() < max_added_vertices; if (buffer_overflow || rc_primitive == HWPrimitive::LineStrip || m_batch.texture_mode != texture_mode || m_batch.transparency_mode != transparency_mode || m_batch.primitive != rc_primitive || dithering_enable != m_batch.dithering || m_render_state.IsTexturePageChanged() || @@ -625,6 +613,10 @@ void GPU_HW::DispatchRenderCommand(RenderCommand rc, u32 num_vertices, const u32 } } + // map buffer if it's not already done + if (!m_batch_current_vertex_ptr) + MapBatchVertexPointer(max_added_vertices); + // update state m_batch.primitive = rc_primitive; m_batch.texture_mode = texture_mode; diff --git a/src/core/gpu_hw.h b/src/core/gpu_hw.h index 586c70e5a..7828ad714 100644 --- a/src/core/gpu_hw.h +++ b/src/core/gpu_hw.h @@ -38,13 +38,24 @@ protected: s32 y; u32 color; u32 texpage; - u32 texcoord; + u32 texcoord; // 16-bit texcoords are needed for 256 extent rectangles - // 16-bit texcoords are needed for 256 extent rectangles - static u32 PackTexcoord(u16 x, u16 y) { return ZeroExtend32(x) | (ZeroExtend32(y) << 16); } + ALWAYS_INLINE void Set(s32 x_, s32 y_, u32 color_, u32 texpage_, u16 packed_texcoord) + { + Set(x_, y_, color_, texpage_, packed_texcoord & 0xFF, (packed_texcoord >> 8)); + } + + ALWAYS_INLINE void Set(s32 x_, s32 y_, u32 color_, u32 texpage_, u16 texcoord_x, u16 texcoord_y) + { + x = x_; + y = y_; + color = color_; + texpage = texpage_; + texcoord = ZeroExtend32(texcoord_x) | (ZeroExtend32(texcoord_y) << 16); + } }; - struct HWRenderBatch + struct HWBatchConfig { u32 texture_page_x; u32 texture_page_y; @@ -56,8 +67,6 @@ protected: std::array texture_window_values; bool dithering; - std::vector vertices; - // We need two-pass rendering when using BG-FG blending and texturing, as the transparency can be enabled // on a per-pixel basis, and the opaque pixels shouldn't be blended at all. bool NeedsTwoPassRendering() const @@ -75,6 +84,7 @@ protected: }; static constexpr u32 VERTEX_BUFFER_SIZE = 1 * 1024 * 1024; + static constexpr u32 MIN_BATCH_VERTEX_COUNT = 6; static constexpr u32 MAX_BATCH_VERTEX_COUNT = VERTEX_BUFFER_SIZE / sizeof(HWVertex); static constexpr u32 TEXTURE_TILE_SIZE = 256; static constexpr u32 TEXTURE_TILE_X_COUNT = VRAM_WIDTH / TEXTURE_TILE_SIZE; @@ -89,9 +99,14 @@ protected: static_cast(rgba >> 24) * (1.0f / 255.0f)); } - virtual void InvalidateVRAMReadCache(); + virtual void InvalidateVRAMReadCache() = 0; - bool IsFlushed() const { return m_batch.vertices.empty(); } + virtual void MapBatchVertexPointer(u32 required_vertices) = 0; + + u32 GetBatchVertexSpace() const { return static_cast(m_batch_end_vertex_ptr - m_batch_current_vertex_ptr); } + u32 GetBatchVertexCount() const { return static_cast(m_batch_current_vertex_ptr - m_batch_start_vertex_ptr); } + + bool IsFlushed() const { return m_batch_current_vertex_ptr == m_batch_start_vertex_ptr; } void DispatchRenderCommand(RenderCommand rc, u32 num_vertices, const u32* command_ptr) override; @@ -108,7 +123,13 @@ protected: std::string GenerateFillFragmentShader(); std::string GenerateDisplayFragmentShader(bool depth_24bit, bool interlaced); - HWRenderBatch m_batch = {}; + HWBatchConfig m_batch = {}; + + HWVertex* m_batch_start_vertex_ptr = nullptr; + HWVertex* m_batch_end_vertex_ptr = nullptr; + HWVertex* m_batch_current_vertex_ptr = nullptr; + u32 m_batch_base_vertex = 0; + u32 m_resolution_scale = 1; u32 m_max_resolution_scale = 1; bool m_true_color = false; @@ -119,4 +140,5 @@ private: void GenerateShaderHeader(std::stringstream& ss); void LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command_ptr); + void AddDuplicateVertex(); }; diff --git a/src/core/gpu_hw_opengl.cpp b/src/core/gpu_hw_opengl.cpp index 483cbbf90..3003d7521 100644 --- a/src/core/gpu_hw_opengl.cpp +++ b/src/core/gpu_hw_opengl.cpp @@ -62,7 +62,6 @@ void GPU_HW_OpenGL::RestoreGraphicsAPIState() glLineWidth(static_cast(m_resolution_scale)); UpdateDrawingArea(); - glBindBuffer(GL_ARRAY_BUFFER, m_vertex_buffer); glBindVertexArray(m_vao_id); } @@ -132,6 +131,19 @@ void GPU_HW_OpenGL::InvalidateVRAMReadCache() m_vram_read_texture_dirty = true; } +void GPU_HW_OpenGL::MapBatchVertexPointer(u32 required_vertices) +{ + Assert(!m_batch_start_vertex_ptr); + + const GL::StreamBuffer::MappingResult res = + m_vertex_stream_buffer->Map(sizeof(HWVertex), required_vertices * sizeof(HWVertex)); + + m_batch_start_vertex_ptr = static_cast(res.pointer); + m_batch_current_vertex_ptr = m_batch_start_vertex_ptr; + m_batch_end_vertex_ptr = m_batch_start_vertex_ptr + res.space_aligned; + m_batch_base_vertex = res.index_aligned; +} + std::tuple GPU_HW_OpenGL::ConvertToFramebufferCoordinates(s32 x, s32 y) { return std::make_tuple(x, static_cast(static_cast(VRAM_HEIGHT) - y)); @@ -217,9 +229,11 @@ void GPU_HW_OpenGL::DestroyFramebuffer() void GPU_HW_OpenGL::CreateVertexBuffer() { - glGenBuffers(1, &m_vertex_buffer); - glBindBuffer(GL_ARRAY_BUFFER, m_vertex_buffer); - glBufferData(GL_ARRAY_BUFFER, VERTEX_BUFFER_SIZE, nullptr, GL_STREAM_DRAW); + m_vertex_stream_buffer = GL::StreamBuffer::Create(GL_ARRAY_BUFFER, VERTEX_BUFFER_SIZE); + if (!m_vertex_stream_buffer) + Panic("Failed to create vertex streaming buffer"); + + m_vertex_stream_buffer->Bind(); glGenVertexArrays(1, &m_vao_id); glBindVertexArray(m_vao_id); @@ -638,35 +652,36 @@ void GPU_HW_OpenGL::UpdateVRAMReadTexture() void GPU_HW_OpenGL::FlushRender() { - if (m_batch.vertices.empty()) + const u32 vertex_count = GetBatchVertexCount(); + if (vertex_count == 0) return; if (m_vram_read_texture_dirty) UpdateVRAMReadTexture(); m_stats.num_batches++; - m_stats.num_vertices += static_cast(m_batch.vertices.size()); + m_stats.num_vertices += vertex_count; - Assert((m_batch.vertices.size() * sizeof(HWVertex)) <= VERTEX_BUFFER_SIZE); - glBufferSubData(GL_ARRAY_BUFFER, 0, static_cast(sizeof(HWVertex) * m_batch.vertices.size()), - m_batch.vertices.data()); + m_vertex_stream_buffer->Unmap(vertex_count * sizeof(HWVertex)); + m_vertex_stream_buffer->Bind(); + m_batch_start_vertex_ptr = nullptr; + m_batch_end_vertex_ptr = nullptr; + m_batch_current_vertex_ptr = nullptr; static constexpr std::array gl_primitives = {{GL_LINES, GL_LINE_STRIP, GL_TRIANGLES, GL_TRIANGLE_STRIP}}; if (m_batch.NeedsTwoPassRendering()) { SetDrawState(HWBatchRenderMode::OnlyTransparent); - glDrawArrays(gl_primitives[static_cast(m_batch.primitive)], 0, static_cast(m_batch.vertices.size())); + glDrawArrays(gl_primitives[static_cast(m_batch.primitive)], 0, vertex_count); SetDrawState(HWBatchRenderMode::OnlyOpaque); - glDrawArrays(gl_primitives[static_cast(m_batch.primitive)], 0, static_cast(m_batch.vertices.size())); + glDrawArrays(gl_primitives[static_cast(m_batch.primitive)], 0, vertex_count); } else { SetDrawState(m_batch.GetRenderMode()); - glDrawArrays(gl_primitives[static_cast(m_batch.primitive)], 0, static_cast(m_batch.vertices.size())); + glDrawArrays(gl_primitives[static_cast(m_batch.primitive)], 0, vertex_count); } - - m_batch.vertices.clear(); } std::unique_ptr GPU::CreateHardwareOpenGLRenderer() diff --git a/src/core/gpu_hw_opengl.h b/src/core/gpu_hw_opengl.h index 59ecbb173..81236a943 100644 --- a/src/core/gpu_hw_opengl.h +++ b/src/core/gpu_hw_opengl.h @@ -1,5 +1,6 @@ #pragma once #include "common/gl_program.h" +#include "common/gl_stream_buffer.h" #include "common/gl_texture.h" #include "glad.h" #include "gpu_hw.h" @@ -31,6 +32,7 @@ protected: void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) override; void FlushRender() override; void InvalidateVRAMReadCache() override; + void MapBatchVertexPointer(u32 required_vertices) override; private: struct GLStats @@ -62,7 +64,7 @@ private: std::unique_ptr m_vram_downsample_texture; std::unique_ptr m_display_texture; - GLuint m_vertex_buffer = 0; + std::unique_ptr m_vertex_stream_buffer; GLuint m_vao_id = 0; GLuint m_attributeless_vao_id = 0;