diff --git a/src/common/common.vcxproj b/src/common/common.vcxproj
index 5b647cf64..43cec14de 100644
--- a/src/common/common.vcxproj
+++ b/src/common/common.vcxproj
@@ -40,6 +40,7 @@
     <ClInclude Include="cd_image.h" />
     <ClInclude Include="fifo_queue.h" />
     <ClInclude Include="gl_program.h" />
+    <ClInclude Include="gl_stream_buffer.h" />
     <ClInclude Include="gl_texture.h" />
     <ClInclude Include="heap_array.h" />
     <ClInclude Include="jit_code_buffer.h" />
@@ -53,6 +54,7 @@
     <ClCompile Include="cd_image_bin.cpp" />
     <ClCompile Include="cd_image_cue.cpp" />
     <ClCompile Include="gl_program.cpp" />
+    <ClCompile Include="gl_stream_buffer.cpp" />
     <ClCompile Include="gl_texture.cpp" />
     <ClCompile Include="jit_code_buffer.cpp" />
     <ClCompile Include="state_wrapper.cpp" />
diff --git a/src/common/common.vcxproj.filters b/src/common/common.vcxproj.filters
index a7bf0f289..a7918e047 100644
--- a/src/common/common.vcxproj.filters
+++ b/src/common/common.vcxproj.filters
@@ -12,6 +12,7 @@
     <ClInclude Include="audio_stream.h" />
     <ClInclude Include="cd_xa.h" />
     <ClInclude Include="heap_array.h" />
+    <ClInclude Include="gl_stream_buffer.h" />
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="jit_code_buffer.cpp" />
@@ -23,6 +24,7 @@
     <ClCompile Include="cd_xa.cpp" />
     <ClCompile Include="cd_image_cue.cpp" />
     <ClCompile Include="cd_image_bin.cpp" />
+    <ClCompile Include="gl_stream_buffer.cpp" />
   </ItemGroup>
   <ItemGroup>
     <Natvis Include="bitfield.natvis" />
diff --git a/src/common/gl_stream_buffer.cpp b/src/common/gl_stream_buffer.cpp
new file mode 100644
index 000000000..65a32fb58
--- /dev/null
+++ b/src/common/gl_stream_buffer.cpp
@@ -0,0 +1,71 @@
+#include "gl_stream_buffer.h"
+
+namespace GL {
+
+// Wraps an already-created GL buffer object and allocates a CPU staging buffer of the same size.
+StreamBuffer::StreamBuffer(GLenum target, GLuint buffer_id, u32 size)
+  : m_target(target), m_buffer_id(buffer_id), m_size(size), m_cpu_buffer(size)
+{
+}
+
+// Releases the GL buffer object this instance was constructed with.
+StreamBuffer::~StreamBuffer()
+{
+  glDeleteBuffers(1, &m_buffer_id);
+}
+
+// Binds the buffer to the target it was created for.
+void StreamBuffer::Bind()
+{
+  glBindBuffer(m_target, m_buffer_id);
+}
+
+// Hands out the CPU staging buffer for writing. The mapping always starts at offset zero, so index_aligned is 0
+// and space_aligned is the number of alignment-sized elements that fit in the whole buffer.
+// Returns a null mapping with no space when alignment is zero or min_size bytes cannot fit in the buffer.
+StreamBuffer::MappingResult StreamBuffer::Map(u32 alignment, u32 min_size)
+{
+  // A zero alignment would divide by zero below, and an oversized request would let the caller overrun the buffer.
+  if (alignment == 0 || min_size > m_size)
+    return MappingResult{nullptr, 0, 0};
+
+  return MappingResult{static_cast<void*>(m_cpu_buffer.data()), 0, m_size / alignment};
+}
+
+// Uploads the first used_size bytes of the staging buffer to the start of the GL buffer.
+// Leaves the buffer bound to its target; does nothing (not even bind) when used_size is zero.
+void StreamBuffer::Unmap(u32 used_size)
+{
+  if (used_size == 0)
+    return;
+
+  glBindBuffer(m_target, m_buffer_id);
+  glBufferSubData(m_target, 0, used_size, m_cpu_buffer.data());
+}
+
+// Creates a GL buffer of size bytes with GL_STREAM_DRAW usage and wraps it in a StreamBuffer.
+// Returns null if GL reports an error; on success the new buffer is left bound to target.
+std::unique_ptr<StreamBuffer> StreamBuffer::Create(GLenum target, u32 size)
+{
+  // Discard a stale error so the check below only reflects the calls made here.
+  glGetError();
+
+  // Zero-initialised so the cleanup path never passes an indeterminate name to glDeleteBuffers,
+  // which silently ignores zero.
+  GLuint buffer_id = 0;
+  glGenBuffers(1, &buffer_id);
+  glBindBuffer(target, buffer_id);
+  glBufferData(target, size, nullptr, GL_STREAM_DRAW);
+
+  const GLenum err = glGetError();
+  if (err != GL_NO_ERROR)
+  {
+    glDeleteBuffers(1, &buffer_id);
+    return {};
+  }
+
+  // The constructor is private, so std::make_unique cannot be used here.
+  return std::unique_ptr<StreamBuffer>(new StreamBuffer(target, buffer_id, size));
+}
+
+} // namespace GL
diff --git a/src/common/gl_stream_buffer.h b/src/common/gl_stream_buffer.h
new file mode 100644
index 000000000..5c1876bed
--- /dev/null
+++ b/src/common/gl_stream_buffer.h
@@ -0,0 +1,55 @@
+#pragma once
+#include "types.h"
+#include <glad.h>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+namespace GL {
+// Streams data into an OpenGL buffer object. Data is written into a CPU-side staging buffer obtained from Map()
+// and uploaded to the GL buffer by Unmap().
+// TODO: Persistent mapping-based implementation
+class StreamBuffer
+{
+public:
+  ~StreamBuffer();
+
+  // Not copyable: the destructor deletes the GL buffer, so a copy would delete the same buffer name twice.
+  StreamBuffer(const StreamBuffer&) = delete;
+  StreamBuffer& operator=(const StreamBuffer&) = delete;
+
+  ALWAYS_INLINE GLuint GetGLBufferId() const { return m_buffer_id; }
+  ALWAYS_INLINE GLenum GetGLTarget() const { return m_target; }
+  ALWAYS_INLINE u32 GetSize() const { return m_size; }
+
+  // Binds the buffer to the target it was created with.
+  void Bind();
+
+  struct MappingResult
+  {
+    void* pointer;     // where the caller writes its data
+    u32 index_aligned; // offset / alignment, suitable for base vertex
+    u32 space_aligned; // remaining space / alignment
+  };
+
+  // Returns a region to write into. alignment is the unit, in bytes, that index_aligned and space_aligned are
+  // expressed in and must be non-zero; min_size is the number of bytes needed and must not exceed GetSize().
+  MappingResult Map(u32 alignment, u32 min_size);
+
+  // Uploads the first used_size bytes written through the last mapping. used_size must not exceed GetSize().
+  void Unmap(u32 used_size);
+
+  // Creates a buffer of size bytes for the given target, or returns null if GL reports an error.
+  static std::unique_ptr<StreamBuffer> Create(GLenum target, u32 size);
+
+private:
+  StreamBuffer(GLenum target, GLuint buffer_id, u32 size);
+
+  GLenum m_target;
+  GLuint m_buffer_id;
+  u32 m_size;
+
+  // CPU-side staging copy, m_size bytes.
+  std::vector<u8> m_cpu_buffer;
+};
+} // namespace GL
diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp
index 1ae3d370a..3c48e4eb9 100644
--- a/src/core/gpu_hw.cpp
+++ b/src/core/gpu_hw.cpp
@@ -49,9 +49,9 @@ void GPU_HW::LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command
     case Primitive::Polygon:
     {
       // if we're drawing quads, we need to create a degenerate triangle to restart the triangle strip
-      bool restart_strip = (rc.quad_polygon && !m_batch.vertices.empty());
+      bool restart_strip = (rc.quad_polygon && !IsFlushed());
       if (restart_strip)
-        m_batch.vertices.push_back(m_batch.vertices.back());
+        AddDuplicateVertex();
 
       const u32 first_color = rc.color_for_first_vertex;
       const bool shaded = rc.shading_enable;
@@ -60,28 +60,15 @@ void GPU_HW::LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command
       u32 buffer_pos = 1;
       for (u32 i = 0; i < num_vertices; i++)
       {
-        HWVertex hw_vert;
-        hw_vert.color = (shaded && i > 0) ? (command_ptr[buffer_pos++] & UINT32_C(0x00FFFFFF)) : first_color;
-
+        const u32 color = (shaded && i > 0) ? (command_ptr[buffer_pos++] & UINT32_C(0x00FFFFFF)) : first_color;
         const VertexPosition vp{command_ptr[buffer_pos++]};
-        hw_vert.x = vp.x;
-        hw_vert.y = vp.y;
-        hw_vert.texpage = texpage;
+        const u16 packed_texcoord = textured ? Truncate16(command_ptr[buffer_pos++]) : 0;
 
-        if (textured)
-        {
-          const auto [texcoord_x, texcoord_y] = UnpackTexcoord(Truncate16(command_ptr[buffer_pos++]));
-          hw_vert.texcoord = HWVertex::PackTexcoord(texcoord_x, texcoord_y);
-        }
-        else
-        {
-          hw_vert.texcoord = 0;
-        }
+        (m_batch_current_vertex_ptr++)->Set(vp.x, vp.y, color, texpage, packed_texcoord);
 
-        m_batch.vertices.push_back(hw_vert);
         if (restart_strip)
         {
-          m_batch.vertices.push_back(m_batch.vertices.back());
+          AddDuplicateVertex();
           restart_strip = false;
         }
       }
@@ -91,9 +78,9 @@ void GPU_HW::LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command
     case Primitive::Rectangle:
     {
       // if we're drawing quads, we need to create a degenerate triangle to restart the triangle strip
-      const bool restart_strip = !m_batch.vertices.empty();
+      const bool restart_strip = !IsFlushed();
       if (restart_strip)
-        m_batch.vertices.push_back(m_batch.vertices.back());
+        AddDuplicateVertex();
 
       u32 buffer_pos = 1;
       const u32 color = rc.color_for_first_vertex;
@@ -132,16 +119,13 @@ void GPU_HW::LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command
       const u16 tex_right = tex_left + static_cast<u16>(rectangle_width);
       const u16 tex_bottom = tex_top + static_cast<u16>(rectangle_height);
 
-      m_batch.vertices.push_back(
-        HWVertex{pos_left, pos_top, color, texpage, HWVertex::PackTexcoord(tex_left, tex_top)});
+      (m_batch_current_vertex_ptr++)->Set(pos_left, pos_top, color, texpage, tex_left, tex_top);
       if (restart_strip)
-        m_batch.vertices.push_back(m_batch.vertices.back());
-      m_batch.vertices.push_back(
-        HWVertex{pos_right, pos_top, color, texpage, HWVertex::PackTexcoord(tex_right, tex_top)});
-      m_batch.vertices.push_back(
-        HWVertex{pos_left, pos_bottom, color, texpage, HWVertex::PackTexcoord(tex_left, tex_bottom)});
-      m_batch.vertices.push_back(
-        HWVertex{pos_right, pos_bottom, color, texpage, HWVertex::PackTexcoord(tex_right, tex_bottom)});
+        AddDuplicateVertex();
+
+      (m_batch_current_vertex_ptr++)->Set(pos_right, pos_top, color, texpage, tex_right, tex_top);
+      (m_batch_current_vertex_ptr++)->Set(pos_left, pos_bottom, color, texpage, tex_left, tex_bottom);
+      (m_batch_current_vertex_ptr++)->Set(pos_right, pos_bottom, color, texpage, tex_right, tex_bottom);
     }
     break;
 
@@ -155,7 +139,7 @@ void GPU_HW::LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command
       {
         const u32 color = (shaded && i > 0) ? (command_ptr[buffer_pos++] & UINT32_C(0x00FFFFFF)) : first_color;
         const VertexPosition vp{command_ptr[buffer_pos++]};
-        m_batch.vertices.push_back(HWVertex{vp.x.GetValue(), vp.y.GetValue(), color});
+        (m_batch_current_vertex_ptr++)->Set(vp.x, vp.y, color, 0, 0);
       }
     }
     break;
@@ -166,6 +150,12 @@ void GPU_HW::LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command
   }
 }
 
+void GPU_HW::AddDuplicateVertex()
+{
+  HWVertex* const slot = m_batch_current_vertex_ptr++; // next free slot; receives a copy of the previous vertex
+  std::memcpy(slot, slot - 1, sizeof(HWVertex));
+}
+
 void GPU_HW::CalcScissorRect(int* left, int* top, int* right, int* bottom)
 {
   *left = m_drawing_area.left * m_resolution_scale;
@@ -567,8 +557,6 @@ GPU_HW::HWPrimitive GPU_HW::GetPrimitiveForCommand(RenderCommand rc)
     return HWPrimitive::Triangles;
 }
 
-void GPU_HW::InvalidateVRAMReadCache() {}
-
 void GPU_HW::DispatchRenderCommand(RenderCommand rc, u32 num_vertices, const u32* command_ptr)
 {
   TextureMode texture_mode;
@@ -612,10 +600,10 @@ void GPU_HW::DispatchRenderCommand(RenderCommand rc, u32 num_vertices, const u32
     rc.transparency_enable ? m_render_state.transparency_mode : TransparencyMode::Disabled;
   const HWPrimitive rc_primitive = GetPrimitiveForCommand(rc);
   const bool dithering_enable = (!m_true_color && rc.IsDitheringEnabled()) ? m_GPUSTAT.dither_enable : false;
+  const u32 max_added_vertices = num_vertices + 2;
   if (!IsFlushed())
   {
-    const u32 max_added_vertices = num_vertices + 2;
-    const bool buffer_overflow = (m_batch.vertices.size() + max_added_vertices) >= MAX_BATCH_VERTEX_COUNT;
+    const bool buffer_overflow = GetBatchVertexSpace() < max_added_vertices;
     if (buffer_overflow || rc_primitive == HWPrimitive::LineStrip || m_batch.texture_mode != texture_mode ||
         m_batch.transparency_mode != transparency_mode || m_batch.primitive != rc_primitive ||
         dithering_enable != m_batch.dithering || m_render_state.IsTexturePageChanged() ||
@@ -625,6 +613,10 @@ void GPU_HW::DispatchRenderCommand(RenderCommand rc, u32 num_vertices, const u32
     }
   }
 
+  // map buffer if it's not already done
+  if (!m_batch_current_vertex_ptr)
+    MapBatchVertexPointer(max_added_vertices);
+
   // update state
   m_batch.primitive = rc_primitive;
   m_batch.texture_mode = texture_mode;
diff --git a/src/core/gpu_hw.h b/src/core/gpu_hw.h
index 586c70e5a..7828ad714 100644
--- a/src/core/gpu_hw.h
+++ b/src/core/gpu_hw.h
@@ -38,13 +38,24 @@ protected:
     s32 y;
     u32 color;
     u32 texpage;
-    u32 texcoord;
+    u32 texcoord; // 16-bit texcoords are needed for 256 extent rectangles
 
-    // 16-bit texcoords are needed for 256 extent rectangles
-    static u32 PackTexcoord(u16 x, u16 y) { return ZeroExtend32(x) | (ZeroExtend32(y) << 16); }
+    ALWAYS_INLINE void Set(s32 x_, s32 y_, u32 color_, u32 texpage_, u16 packed_texcoord)
+    {
+      Set(x_, y_, color_, texpage_, packed_texcoord & 0xFF, (packed_texcoord >> 8)); // low byte = x, high byte = y
+    }
+
+    ALWAYS_INLINE void Set(s32 x_, s32 y_, u32 color_, u32 texpage_, u16 texcoord_x, u16 texcoord_y)
+    {
+      x = x_;
+      y = y_;
+      color = color_;
+      texpage = texpage_;
+      texcoord = ZeroExtend32(texcoord_x) | (ZeroExtend32(texcoord_y) << 16); // x in bits 0-15, y in bits 16-31
+    }
   };
 
-  struct HWRenderBatch
+  struct HWBatchConfig
   {
     u32 texture_page_x;
     u32 texture_page_y;
@@ -56,8 +67,6 @@ protected:
     std::array<u8, 4> texture_window_values;
     bool dithering;
 
-    std::vector<HWVertex> vertices;
-
     // We need two-pass rendering when using BG-FG blending and texturing, as the transparency can be enabled
     // on a per-pixel basis, and the opaque pixels shouldn't be blended at all.
     bool NeedsTwoPassRendering() const
@@ -75,6 +84,7 @@ protected:
   };
 
   static constexpr u32 VERTEX_BUFFER_SIZE = 1 * 1024 * 1024;
+  static constexpr u32 MIN_BATCH_VERTEX_COUNT = 6;
   static constexpr u32 MAX_BATCH_VERTEX_COUNT = VERTEX_BUFFER_SIZE / sizeof(HWVertex);
   static constexpr u32 TEXTURE_TILE_SIZE = 256;
   static constexpr u32 TEXTURE_TILE_X_COUNT = VRAM_WIDTH / TEXTURE_TILE_SIZE;
@@ -89,9 +99,14 @@ protected:
                            static_cast<float>(rgba >> 24) * (1.0f / 255.0f));
   }
 
-  virtual void InvalidateVRAMReadCache();
+  virtual void InvalidateVRAMReadCache() = 0;
 
-  bool IsFlushed() const { return m_batch.vertices.empty(); }
+  virtual void MapBatchVertexPointer(u32 required_vertices) = 0;
+  // Vertex slots still free before the end of the mapped range, and slots already written since its start.
+  u32 GetBatchVertexSpace() const { return static_cast<u32>(m_batch_end_vertex_ptr - m_batch_current_vertex_ptr); }
+  u32 GetBatchVertexCount() const { return static_cast<u32>(m_batch_current_vertex_ptr - m_batch_start_vertex_ptr); }
+  // True when no vertices are pending. Also true while nothing is mapped: all three pointers are null then.
+  bool IsFlushed() const { return m_batch_current_vertex_ptr == m_batch_start_vertex_ptr; }
 
   void DispatchRenderCommand(RenderCommand rc, u32 num_vertices, const u32* command_ptr) override;
 
@@ -108,7 +123,13 @@ protected:
   std::string GenerateFillFragmentShader();
   std::string GenerateDisplayFragmentShader(bool depth_24bit, bool interlaced);
 
-  HWRenderBatch m_batch = {};
+  HWBatchConfig m_batch = {};
+
+  HWVertex* m_batch_start_vertex_ptr = nullptr;
+  HWVertex* m_batch_end_vertex_ptr = nullptr;
+  HWVertex* m_batch_current_vertex_ptr = nullptr;
+  u32 m_batch_base_vertex = 0;
+
   u32 m_resolution_scale = 1;
   u32 m_max_resolution_scale = 1;
   bool m_true_color = false;
@@ -119,4 +140,5 @@ private:
   void GenerateShaderHeader(std::stringstream& ss);
 
   void LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command_ptr);
+  void AddDuplicateVertex();
 };
diff --git a/src/core/gpu_hw_opengl.cpp b/src/core/gpu_hw_opengl.cpp
index 483cbbf90..3003d7521 100644
--- a/src/core/gpu_hw_opengl.cpp
+++ b/src/core/gpu_hw_opengl.cpp
@@ -62,7 +62,6 @@ void GPU_HW_OpenGL::RestoreGraphicsAPIState()
   glLineWidth(static_cast<float>(m_resolution_scale));
   UpdateDrawingArea();
 
-  glBindBuffer(GL_ARRAY_BUFFER, m_vertex_buffer);
   glBindVertexArray(m_vao_id);
 }
 
@@ -132,6 +131,19 @@ void GPU_HW_OpenGL::InvalidateVRAMReadCache()
   m_vram_read_texture_dirty = true;
 }
 
+void GPU_HW_OpenGL::MapBatchVertexPointer(u32 required_vertices)
+{
+  // A new range may only be mapped once the previous batch has been flushed and its pointers cleared.
+  Assert(!m_batch_start_vertex_ptr);
+  const auto mapping = m_vertex_stream_buffer->Map(sizeof(HWVertex), required_vertices * sizeof(HWVertex));
+
+  HWVertex* const first_vertex = static_cast<HWVertex*>(mapping.pointer);
+  m_batch_base_vertex = mapping.index_aligned;
+  m_batch_start_vertex_ptr = first_vertex;
+  m_batch_current_vertex_ptr = first_vertex;
+  m_batch_end_vertex_ptr = first_vertex + mapping.space_aligned;
+}
+
 std::tuple<s32, s32> GPU_HW_OpenGL::ConvertToFramebufferCoordinates(s32 x, s32 y)
 {
   return std::make_tuple(x, static_cast<s32>(static_cast<s32>(VRAM_HEIGHT) - y));
@@ -217,9 +229,11 @@ void GPU_HW_OpenGL::DestroyFramebuffer()
 
 void GPU_HW_OpenGL::CreateVertexBuffer()
 {
-  glGenBuffers(1, &m_vertex_buffer);
-  glBindBuffer(GL_ARRAY_BUFFER, m_vertex_buffer);
-  glBufferData(GL_ARRAY_BUFFER, VERTEX_BUFFER_SIZE, nullptr, GL_STREAM_DRAW);
+  m_vertex_stream_buffer = GL::StreamBuffer::Create(GL_ARRAY_BUFFER, VERTEX_BUFFER_SIZE);
+  if (!m_vertex_stream_buffer)
+    Panic("Failed to create vertex streaming buffer");
+
+  m_vertex_stream_buffer->Bind();
 
   glGenVertexArrays(1, &m_vao_id);
   glBindVertexArray(m_vao_id);
@@ -638,35 +652,36 @@ void GPU_HW_OpenGL::UpdateVRAMReadTexture()
 
 void GPU_HW_OpenGL::FlushRender()
 {
-  if (m_batch.vertices.empty())
+  const u32 vertex_count = GetBatchVertexCount(); // drawn from m_batch_base_vertex, where the mapping placed them
+  if (vertex_count == 0)
     return;
 
   if (m_vram_read_texture_dirty)
     UpdateVRAMReadTexture();
 
   m_stats.num_batches++;
-  m_stats.num_vertices += static_cast<u32>(m_batch.vertices.size());
+  m_stats.num_vertices += vertex_count;
 
-  Assert((m_batch.vertices.size() * sizeof(HWVertex)) <= VERTEX_BUFFER_SIZE);
-  glBufferSubData(GL_ARRAY_BUFFER, 0, static_cast<GLsizei>(sizeof(HWVertex) * m_batch.vertices.size()),
-                  m_batch.vertices.data());
+  m_vertex_stream_buffer->Unmap(vertex_count * sizeof(HWVertex)); // upload only the vertices written so far
+  m_vertex_stream_buffer->Bind();
+  m_batch_start_vertex_ptr = nullptr;
+  m_batch_end_vertex_ptr = nullptr;
+  m_batch_current_vertex_ptr = nullptr; // null pointers make the next render command map the buffer again
 
   static constexpr std::array<GLenum, 4> gl_primitives = {{GL_LINES, GL_LINE_STRIP, GL_TRIANGLES, GL_TRIANGLE_STRIP}};
 
   if (m_batch.NeedsTwoPassRendering())
   {
     SetDrawState(HWBatchRenderMode::OnlyTransparent);
-    glDrawArrays(gl_primitives[static_cast<u8>(m_batch.primitive)], 0, static_cast<GLsizei>(m_batch.vertices.size()));
+    glDrawArrays(gl_primitives[static_cast<u8>(m_batch.primitive)], m_batch_base_vertex, vertex_count);
     SetDrawState(HWBatchRenderMode::OnlyOpaque);
-    glDrawArrays(gl_primitives[static_cast<u8>(m_batch.primitive)], 0, static_cast<GLsizei>(m_batch.vertices.size()));
+    glDrawArrays(gl_primitives[static_cast<u8>(m_batch.primitive)], m_batch_base_vertex, vertex_count);
   }
   else
   {
     SetDrawState(m_batch.GetRenderMode());
-    glDrawArrays(gl_primitives[static_cast<u8>(m_batch.primitive)], 0, static_cast<GLsizei>(m_batch.vertices.size()));
+    glDrawArrays(gl_primitives[static_cast<u8>(m_batch.primitive)], m_batch_base_vertex, vertex_count);
   }
-
-  m_batch.vertices.clear();
 }
 
 std::unique_ptr<GPU> GPU::CreateHardwareOpenGLRenderer()
diff --git a/src/core/gpu_hw_opengl.h b/src/core/gpu_hw_opengl.h
index 59ecbb173..81236a943 100644
--- a/src/core/gpu_hw_opengl.h
+++ b/src/core/gpu_hw_opengl.h
@@ -1,5 +1,6 @@
 #pragma once
 #include "common/gl_program.h"
+#include "common/gl_stream_buffer.h"
 #include "common/gl_texture.h"
 #include "glad.h"
 #include "gpu_hw.h"
@@ -31,6 +32,7 @@ protected:
   void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height) override;
   void FlushRender() override;
   void InvalidateVRAMReadCache() override;
+  void MapBatchVertexPointer(u32 required_vertices) override;
 
 private:
   struct GLStats
@@ -62,7 +64,7 @@ private:
   std::unique_ptr<GL::Texture> m_vram_downsample_texture;
   std::unique_ptr<GL::Texture> m_display_texture;
 
-  GLuint m_vertex_buffer = 0;
+  std::unique_ptr<GL::StreamBuffer> m_vertex_stream_buffer;
   GLuint m_vao_id = 0;
   GLuint m_attributeless_vao_id = 0;