GPU/HW: Split shadergen to separate class

2025-04-10 19:15:14 +00:00 · 2019-11-03 13:36:54 +10:00 · 2019-11-03 13:36:54 +10:00 · be81d08109
parent 91c99f0226
commit be81d08109
10 changed files with 624 additions and 590 deletions
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@ -21,6 +21,8 @@ add_library(core
    gpu_hw.h
    gpu_hw_opengl.cpp
    gpu_hw_opengl.h
    gpu_hw_shadergen.cpp
    gpu_hw_shadergen.h
    gpu_sw.cpp
    gpu_sw.h
    gte.cpp
--- a/src/core/core.vcxproj
+++ b/src/core/core.vcxproj
@ -41,6 +41,7 @@
    <ClCompile Include="cpu_disasm.cpp" />
    <ClCompile Include="digital_controller.cpp" />
    <ClCompile Include="gpu_commands.cpp" />
    <ClCompile Include="gpu_hw_shadergen.cpp" />
    <ClCompile Include="gpu_sw.cpp" />
    <ClCompile Include="gte.cpp" />
    <ClCompile Include="dma.cpp" />
@ -64,6 +65,7 @@
    <ClInclude Include="cpu_core.h" />
    <ClInclude Include="cpu_disasm.h" />
    <ClInclude Include="digital_controller.h" />
    <ClInclude Include="gpu_hw_shadergen.h" />
    <ClInclude Include="gpu_sw.h" />
    <ClInclude Include="gte.h" />
    <ClInclude Include="cpu_types.h" />
--- a/src/core/core.vcxproj.filters
+++ b/src/core/core.vcxproj.filters
@ -23,6 +23,7 @@
    <ClCompile Include="settings.cpp" />
    <ClCompile Include="gpu_commands.cpp" />
    <ClCompile Include="gpu_sw.cpp" />
    <ClCompile Include="gpu_hw_shadergen.cpp" />
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="types.h" />
@ -50,6 +51,7 @@
    <ClInclude Include="memory_card.h" />
    <ClInclude Include="settings.h" />
    <ClInclude Include="gpu_sw.h" />
    <ClInclude Include="gpu_hw_shadergen.h" />
  </ItemGroup>
  <ItemGroup>
    <None Include="cpu_core.inl" />
--- a/src/core/gpu.h
+++ b/src/core/gpu.h
@ -18,6 +18,57 @@ class Timers;
 class GPU
 {
 public:
  // DMA direction selector, stored as a 32-bit value in the range 0-3.
  // NOTE(review): presumably mirrors the two-bit DMA direction field of the GPU status
  // register - confirm against the register definition.
  enum class DMADirection : u32
  {
    Off = 0,
    FIFO = 1,
    CPUtoGP0 = 2,
    GPUREADtoCPU = 3
  };
  // Kind of primitive a render command draws. Renderers compare a command's `primitive`
  // field against these values to decide how the vertices are assembled.
  enum class Primitive : u8
  {
    Reserved = 0,
    Polygon = 1,
    Line = 2,
    Rectangle = 3
  };
  // Size selector for rectangle primitives: either a variable size or one of three fixed
  // sizes, as spelled out by the enumerator names (1x1, 8x8, 16x16).
  enum class DrawRectangleSize : u8
  {
    Variable = 0,
    R1x1 = 1,
    R8x8 = 2,
    R16x16 = 3
  };
  // Texture sampling mode. Values 0-3 are the modes that can appear in a register; the
  // remaining values are internal: RawTextureBit is a flag that is OR'd onto one of the
  // base modes, and Disabled marks untextured drawing.
  enum class TextureMode : u8
  {
    Palette4Bit = 0,
    Palette8Bit = 1,
    Direct16Bit = 2,
    Reserved_Direct16Bit = 3,
    // Not register values.
    RawTextureBit = 4, // flag bit; the Raw* values below are (flag | base mode)
    RawPalette4Bit = RawTextureBit | Palette4Bit,
    RawPalette8Bit = RawTextureBit | Palette8Bit,
    RawDirect16Bit = RawTextureBit | Direct16Bit,
    Reserved_RawDirect16Bit = RawTextureBit | Reserved_Direct16Bit,
    Disabled = 8 // Not a register value
  };
  // Semi-transparency blend mode. Each of the first four enumerators names the equation
  // used to combine the background (existing pixel) with the foreground (drawn pixel);
  // Disabled is an internal value meaning no blending is requested.
  enum class TransparencyMode : u8
  {
    HalfBackgroundPlusHalfForeground = 0,
    BackgroundPlusForeground = 1,
    BackgroundMinusForeground = 2,
    BackgroundPlusQuarterForeground = 3,
    Disabled = 4 // Not a register value
  };
  enum : u32
  {
    VRAM_WIDTH = 1024,
@ -29,6 +80,13 @@ public:
    HBLANK_TIMER_INDEX = 1
  };
  // 4x4 ordered-dither offset matrix, indexed as DITHER_MATRIX[row][column].
  // The selected offset is added to every colour channel before the colour is truncated
  // to 15-bit precision. The values follow the hardware dither table: rows 2 and 3 are
  // rows 0 and 1 shifted two pixels horizontally, so row 3 must start with +3 (not +4).
  static constexpr s32 DITHER_MATRIX[4][4] = {{-4, +0, -3, +1},  // row 0
                                              {+2, -2, +3, -1},  // row 1
                                              {-3, +1, -4, +0},  // row 2
                                              {+3, -1, +2, -2}}; // row 3
  // Base class constructor.
  GPU();
  virtual ~GPU();
@ -112,57 +170,6 @@ protected:
  static bool DumpVRAMToFile(const char* filename, u32 width, u32 height, u32 stride, const void* buffer,
                             bool remove_alpha);
  enum class DMADirection : u32
  {
    Off = 0,
    FIFO = 1,
    CPUtoGP0 = 2,
    GPUREADtoCPU = 3
  };
  enum class Primitive : u8
  {
    Reserved = 0,
    Polygon = 1,
    Line = 2,
    Rectangle = 3
  };
  enum class DrawRectangleSize : u8
  {
    Variable = 0,
    R1x1 = 1,
    R8x8 = 2,
    R16x16 = 3
  };
  enum class TextureMode : u8
  {
    Palette4Bit = 0,
    Palette8Bit = 1,
    Direct16Bit = 2,
    Reserved_Direct16Bit = 3,
    // Not register values.
    RawTextureBit = 4,
    RawPalette4Bit = RawTextureBit | Palette4Bit,
    RawPalette8Bit = RawTextureBit | Palette8Bit,
    RawDirect16Bit = RawTextureBit | Direct16Bit,
    Reserved_RawDirect16Bit = RawTextureBit | Reserved_Direct16Bit,
    Disabled = 8 // Not a register value
  };
  enum class TransparencyMode : u8
  {
    HalfBackgroundPlusHalfForeground = 0,
    BackgroundPlusForeground = 1,
    BackgroundMinusForeground = 2,
    BackgroundPlusQuarterForeground = 3,
    Disabled = 4 // Not a register value
  };
  union RenderCommand
  {
    u32 bits;
@ -258,12 +265,6 @@ protected:
    }
  };
  // 4x4 dither matrix.
  static constexpr s32 DITHER_MATRIX[4][4] = {{-4, +0, -3, +1},  // row 0
                                              {+2, -2, +3, -1},  // row 1
                                              {-3, +1, -4, +0},  // row 2
                                              {+4, -1, +2, -2}}; // row 3
  void SoftReset();
  // Sets dots per scanline
@ -464,3 +465,5 @@ private:
  static const GP0CommandHandlerTable s_GP0_command_handler_table;
 };
 IMPLEMENT_ENUM_CLASS_BITWISE_OPERATORS(GPU::TextureMode);
--- a/src/core/gpu_hw.cpp
+++ b/src/core/gpu_hw.cpp
@ -163,7 +163,7 @@ void GPU_HW::LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command
 void GPU_HW::AddDuplicateVertex()
 {
-  std::memcpy(m_batch_current_vertex_ptr, m_batch_current_vertex_ptr - 1, sizeof(HWVertex));
+  std::memcpy(m_batch_current_vertex_ptr, m_batch_current_vertex_ptr - 1, sizeof(BatchVertex));
  m_batch_current_vertex_ptr++;
 }
@ -175,443 +175,14 @@ void GPU_HW::CalcScissorRect(int* left, int* top, int* right, int* bottom)
  *bottom = std::max<u32>((m_drawing_area.bottom + 1) * m_resolution_scale, *top + 1);
 }
-static void DefineMacro(std::stringstream& ss, const char* name, bool enabled)
+GPU_HW::BatchPrimitive GPU_HW::GetPrimitiveForCommand(RenderCommand rc)
 {
  if (enabled)
    ss << "#define " << name << " 1\n";
  else
    ss << "/* #define " << name << " 0 */\n";
 }
 void GPU_HW::GenerateShaderHeader(std::stringstream& ss)
 {
  ss << "#version 330 core\n\n";
  ss << "const int RESOLUTION_SCALE = " << m_resolution_scale << ";\n";
  ss << "const ivec2 VRAM_SIZE = ivec2(" << VRAM_WIDTH << ", " << VRAM_HEIGHT << ") * RESOLUTION_SCALE;\n";
  ss << "const vec2 RCP_VRAM_SIZE = vec2(1.0, 1.0) / vec2(VRAM_SIZE);\n";
  ss << R"(
 float fixYCoord(float y)
 {
  return 1.0 - RCP_VRAM_SIZE.y - y;
 }
 int fixYCoord(int y)
 {
  return VRAM_SIZE.y - y - 1;
 }
 uint RGBA8ToRGBA5551(vec4 v)
 {
  uint r = uint(v.r * 255.0) >> 3;
  uint g = uint(v.g * 255.0) >> 3;
  uint b = uint(v.b * 255.0) >> 3;
  uint a = (v.a != 0.0) ? 1u : 0u;
  return (r) | (g << 5) | (b << 10) | (a << 15);
 }
 vec4 RGBA5551ToRGBA8(uint v)
 {
  uint r = (v & 31u);
  uint g = ((v >> 5) & 31u);
  uint b = ((v >> 10) & 31u);
  uint a = ((v >> 15) & 1u);
  // repeat lower bits
  r = (r << 3) | (r & 7u);
  g = (g << 3) | (g & 7u);
  b = (b << 3) | (b & 7u);
  return vec4(float(r) / 255.0, float(g) / 255.0, float(b) / 255.0, float(a));
 }
 )";
 }
 void GPU_HW::GenerateBatchUniformBuffer(std::stringstream& ss)
 {
  ss << R"(
 uniform UBOBlock {
  ivec2 u_pos_offset;
  uvec2 u_texture_window_mask;
  uvec2 u_texture_window_offset;
  float u_src_alpha_factor;
  float u_dst_alpha_factor;
 };
 )";
 }
 std::string GPU_HW::GenerateVertexShader(bool textured)
 {
  std::stringstream ss;
  GenerateShaderHeader(ss);
  DefineMacro(ss, "TEXTURED", textured);
  GenerateBatchUniformBuffer(ss);
  ss << R"(
 in ivec2 a_pos;
 in vec4 a_col0;
 in int a_texcoord;
 in int a_texpage;
 out vec3 v_col0;
 #if TEXTURED
  out vec2 v_tex0;
  flat out ivec4 v_texpage;
 #endif
 void main()
 {
  // 0..+1023 -> -1..1
  float pos_x = (float(a_pos.x + u_pos_offset.x) / 512.0) - 1.0;
  float pos_y = (float(a_pos.y + u_pos_offset.y) / -256.0) + 1.0;
  gl_Position = vec4(pos_x, pos_y, 0.0, 1.0);
  v_col0 = a_col0.rgb;
  #if TEXTURED
    v_tex0 = vec2(float(a_texcoord & 0xFFFF), float(a_texcoord >> 16)) / vec2(255.0);
    // base_x,base_y,palette_x,palette_y
    v_texpage.x = (a_texpage & 15) * 64 * RESOLUTION_SCALE;
    v_texpage.y = ((a_texpage >> 4) & 1) * 256 * RESOLUTION_SCALE;
    v_texpage.z = ((a_texpage >> 16) & 63) * 16 * RESOLUTION_SCALE;
    v_texpage.w = ((a_texpage >> 22) & 511) * RESOLUTION_SCALE;
  #endif
 }
 )";
  return ss.str();
 }
 std::string GPU_HW::GenerateFragmentShader(HWBatchRenderMode transparency, TextureMode texture_mode, bool dithering)
 {
  const TextureMode actual_texture_mode =
    static_cast<TextureMode>(static_cast<u8>(texture_mode) & ~static_cast<u8>(TextureMode::RawTextureBit));
  const bool raw_texture = (static_cast<u8>(texture_mode) & static_cast<u8>(TextureMode::RawTextureBit)) ==
                           static_cast<u8>(TextureMode::RawTextureBit);
  std::stringstream ss;
  GenerateShaderHeader(ss);
  GenerateBatchUniformBuffer(ss);
  DefineMacro(ss, "TRANSPARENCY", transparency != HWBatchRenderMode::TransparencyDisabled);
  DefineMacro(ss, "TRANSPARENCY_ONLY_OPAQUE", transparency == HWBatchRenderMode::OnlyOpaque);
  DefineMacro(ss, "TRANSPARENCY_ONLY_TRANSPARENCY", transparency == HWBatchRenderMode::OnlyTransparent);
  DefineMacro(ss, "TEXTURED", actual_texture_mode != TextureMode::Disabled);
  DefineMacro(ss, "PALETTE",
              actual_texture_mode == GPU::TextureMode::Palette4Bit ||
                actual_texture_mode == GPU::TextureMode::Palette8Bit);
  DefineMacro(ss, "PALETTE_4_BIT", actual_texture_mode == GPU::TextureMode::Palette4Bit);
  DefineMacro(ss, "PALETTE_8_BIT", actual_texture_mode == GPU::TextureMode::Palette8Bit);
  DefineMacro(ss, "RAW_TEXTURE", raw_texture);
  DefineMacro(ss, "DITHERING", dithering);
  DefineMacro(ss, "TRUE_COLOR", m_true_color);
  ss << "const int[16] s_dither_values = int[16]( ";
  for (u32 i = 0; i < 16; i++)
  {
    if (i > 0)
      ss << ", ";
    ss << DITHER_MATRIX[i / 4][i % 4];
  }
  ss << " );\n";
  ss << R"(
 in vec3 v_col0;
 #if TEXTURED
  in vec2 v_tex0;
  flat in ivec4 v_texpage;
  uniform sampler2D samp0;
 #endif
 out vec4 o_col0;
 ivec3 ApplyDithering(ivec3 icol)
 {
  ivec2 fc = (ivec2(gl_FragCoord.xy) / ivec2(RESOLUTION_SCALE, RESOLUTION_SCALE)) & ivec2(3, 3);
  int offset = s_dither_values[fc.y * 4 + fc.x];
  return icol + ivec3(offset, offset, offset);
 }
 ivec3 TruncateTo15Bit(ivec3 icol)
 {
  icol = clamp(icol, ivec3(0, 0, 0), ivec3(255, 255, 255));
  return (icol & ivec3(~7, ~7, ~7)) | ((icol >> 3) & ivec3(7, 7, 7));
 }
 #if TEXTURED
 ivec2 ApplyNativeTextureWindow(ivec2 coords)
 {
  uint x = (uint(coords.x) & ~(u_texture_window_mask.x * 8u)) | ((u_texture_window_offset.x & u_texture_window_mask.x) * 8u);
  uint y = (uint(coords.y) & ~(u_texture_window_mask.y * 8u)) | ((u_texture_window_offset.y & u_texture_window_mask.y) * 8u);
  return ivec2(int(x), int(y));
 }  
 ivec2 ApplyTextureWindow(ivec2 coords)
 {
  if (RESOLUTION_SCALE == 1)
    return ApplyNativeTextureWindow(coords);
  ivec2 downscaled_coords = coords / ivec2(RESOLUTION_SCALE);
  ivec2 coords_offset = coords % ivec2(RESOLUTION_SCALE);
  return (ApplyNativeTextureWindow(downscaled_coords) * ivec2(RESOLUTION_SCALE)) + coords_offset;
 }
 ivec4 SampleFromVRAM(vec2 coord)
 {
  // from 0..1 to 0..255
  ivec2 icoord = ivec2(coord * vec2(255 * RESOLUTION_SCALE));
  icoord = ApplyTextureWindow(icoord);
  // adjust for tightly packed palette formats
  ivec2 index_coord = icoord;
  #if PALETTE_4_BIT
    index_coord.x /= 4;
  #elif PALETTE_8_BIT
    index_coord.x /= 2;
  #endif
  // fixup coords
  ivec2 vicoord = ivec2(v_texpage.x + index_coord.x, fixYCoord(v_texpage.y + index_coord.y));
  // load colour/palette
  vec4 color = texelFetch(samp0, vicoord, 0);
  // apply palette
  #if PALETTE
    #if PALETTE_4_BIT
      int subpixel = int(icoord.x / RESOLUTION_SCALE) & 3;
      uint vram_value = RGBA8ToRGBA5551(color);
      int palette_index = int((vram_value >> (subpixel * 4)) & 0x0Fu);
    #elif PALETTE_8_BIT
      int subpixel = int(icoord.x / RESOLUTION_SCALE) & 1;
      uint vram_value = RGBA8ToRGBA5551(color);
      int palette_index = int((vram_value >> (subpixel * 8)) & 0xFFu);
    #endif
    ivec2 palette_icoord = ivec2(v_texpage.z + (palette_index * RESOLUTION_SCALE), fixYCoord(v_texpage.w));
    color = texelFetch(samp0, palette_icoord, 0);
  #endif
  return ivec4(color * vec4(255.0, 255.0, 255.0, 255.0));
 }
 #endif
 void main()
 {
  ivec3 vertcol = ivec3(v_col0 * vec3(255.0, 255.0, 255.0));
  bool semitransparent;
  bool new_mask_bit;
  ivec3 icolor;
  #if TEXTURED
    ivec4 texcol = SampleFromVRAM(v_tex0);
    if (texcol == ivec4(0.0, 0.0, 0.0, 0.0))
      discard;
    // Grab semitransparent bit from the texture color.
    semitransparent = (texcol.a != 0);
    #if RAW_TEXTURE
      icolor = texcol.rgb;
    #else
      icolor = (vertcol * texcol.rgb) >> 7;
    #endif
  #else
    // All pixels are semitransparent for untextured polygons.
    semitransparent = true;
    icolor = vertcol;
  #endif
  // Apply dithering
  #if DITHERING
    icolor = ApplyDithering(icolor);
  #endif
  // Clip to 15-bit range
  #if !TRUE_COLOR
    icolor = TruncateTo15Bit(icolor);
  #endif
  // Normalize
  vec3 color = vec3(icolor) / vec3(255.0, 255.0, 255.0);
  #if TRANSPARENCY
    // Apply semitransparency. If not a semitransparent texel, destination alpha is ignored.
    if (semitransparent)
    {
      #if TRANSPARENCY_ONLY_OPAQUE
        discard;
      #endif
      o_col0 = vec4(color * u_src_alpha_factor, u_dst_alpha_factor);
    }
    else
    {
      #if TRANSPARENCY_ONLY_TRANSPARENCY
        discard;
      #endif
      o_col0 = vec4(color, 0.0);
    }
  #else
    o_col0 = vec4(color, 0.0);
  #endif
 }
 )";
  return ss.str();
 }
 std::string GPU_HW::GenerateScreenQuadVertexShader()
 {
  std::stringstream ss;
  GenerateShaderHeader(ss);
  ss << R"(
 out vec2 v_tex0;
 void main()
 {
  v_tex0 = vec2(float((gl_VertexID << 1) & 2), float(gl_VertexID & 2));
  gl_Position = vec4(v_tex0 * vec2(2.0f, -2.0f) + vec2(-1.0f, 1.0f), 0.0f, 1.0f);
  gl_Position.y = -gl_Position.y;
 }
 )";
  return ss.str();
 }
 std::string GPU_HW::GenerateFillFragmentShader()
 {
  std::stringstream ss;
  GenerateShaderHeader(ss);
  ss << R"(
 uniform vec4 fill_color;
 out vec4 o_col0;
 void main()
 {
  o_col0 = fill_color;
 }
 )";
  return ss.str();
 }
 std::string GPU_HW::GenerateDisplayFragmentShader(bool depth_24bit, bool interlaced)
 {
  std::stringstream ss;
  GenerateShaderHeader(ss);
  DefineMacro(ss, "DEPTH_24BIT", depth_24bit);
  DefineMacro(ss, "INTERLACED", interlaced);
  ss << R"(
 in vec2 v_tex0;
 out vec4 o_col0;
 uniform sampler2D samp0;
 uniform ivec3 u_base_coords;
 ivec2 GetCoords(vec2 fragcoord)
 {
  ivec2 icoords = ivec2(fragcoord);
  #if INTERLACED
    if ((((icoords.y - u_base_coords.z) / RESOLUTION_SCALE) & 1) != 0)
      discard;
  #endif
  return icoords;
 }
 void main()
 {
  ivec2 icoords = GetCoords(gl_FragCoord.xy);
  #if DEPTH_24BIT
    // compute offset in dwords from the start of the 24-bit values
    ivec2 base = ivec2(u_base_coords.x, u_base_coords.y + icoords.y);
    int xoff = int(icoords.x);
    int dword_index = (xoff / 2) + (xoff / 4);
    // sample two adjacent dwords, or four 16-bit values as the 24-bit value will lie somewhere between these
    uint s0 = RGBA8ToRGBA5551(texelFetch(samp0, ivec2(base.x + dword_index * 2 + 0, base.y), 0));
    uint s1 = RGBA8ToRGBA5551(texelFetch(samp0, ivec2(base.x + dword_index * 2 + 1, base.y), 0));
    uint s2 = RGBA8ToRGBA5551(texelFetch(samp0, ivec2(base.x + (dword_index + 1) * 2 + 0, base.y), 0));
    uint s3 = RGBA8ToRGBA5551(texelFetch(samp0, ivec2(base.x + (dword_index + 1) * 2 + 1, base.y), 0));
    // select the bit for this pixel depending on its offset in the 4-pixel block
    uint r, g, b;
    int block_offset = xoff & 3;
    if (block_offset == 0)
    {
      r = s0 & 0xFFu;
      g = s0 >> 8;
      b = s1 & 0xFFu;
    }
    else if (block_offset == 1)
    {
      r = s1 >> 8;
      g = s2 & 0xFFu;
      b = s2 >> 8;
    }
    else if (block_offset == 2)
    {
      r = s1 & 0xFFu;
      g = s1 >> 8;
      b = s2 & 0xFFu;
    }
    else
    {
      r = s2 >> 8;
      g = s3 & 0xFFu;
      b = s3 >> 8;
    }
    // and normalize
    o_col0 = vec4(float(r) / 255.0, float(g) / 255.0, float(b) / 255.0, 1.0);
  #else
    // load and return
    o_col0 = texelFetch(samp0, u_base_coords.xy + icoords, 0);
  #endif
 }
 )";
  return ss.str();
 }
 std::string GPU_HW::GenerateVRAMWriteFragmentShader()
 {
  std::stringstream ss;
  GenerateShaderHeader(ss);
  ss << R"(
 uniform ivec2 u_base_coords;
 uniform ivec2 u_size;
 uniform usamplerBuffer samp0;
 out vec4 o_col0;
 void main()
 {
  ivec2 coords = ivec2(gl_FragCoord.xy) / ivec2(RESOLUTION_SCALE, RESOLUTION_SCALE);
  ivec2 offset = coords - u_base_coords;
  offset.y = u_size.y - offset.y - 1;
  int buffer_offset = offset.y * u_size.x + offset.x;
  uint value = texelFetch(samp0, buffer_offset).r;
  o_col0 = RGBA5551ToRGBA8(value);
 })";
  return ss.str();
 }
 GPU_HW::HWPrimitive GPU_HW::GetPrimitiveForCommand(RenderCommand rc)
 {
  if (rc.primitive == Primitive::Line)
-    return rc.polyline ? HWPrimitive::LineStrip : HWPrimitive::Lines;
+    return rc.polyline ? BatchPrimitive::LineStrip : BatchPrimitive::Lines;
  else if ((rc.primitive == Primitive::Polygon && rc.quad_polygon) || rc.primitive == Primitive::Rectangle)
-    return HWPrimitive::TriangleStrip;
+    return BatchPrimitive::TriangleStrip;
  else
-    return HWPrimitive::Triangles;
+    return BatchPrimitive::Triangles;
 }
 void GPU_HW::DispatchRenderCommand(RenderCommand rc, u32 num_vertices, const u32* command_ptr)
@ -687,13 +258,13 @@ void GPU_HW::DispatchRenderCommand(RenderCommand rc, u32 num_vertices, const u32
  // has any state changed which requires a new batch?
  const TransparencyMode transparency_mode =
    rc.transparency_enable ? m_render_state.transparency_mode : TransparencyMode::Disabled;
-  const HWPrimitive rc_primitive = GetPrimitiveForCommand(rc);
+  const BatchPrimitive rc_primitive = GetPrimitiveForCommand(rc);
  const bool dithering_enable = (!m_true_color && rc.IsDitheringEnabled()) ? m_GPUSTAT.dither_enable : false;
  const u32 max_added_vertices = num_vertices + 2;
  if (!IsFlushed())
  {
    const bool buffer_overflow = GetBatchVertexSpace() < max_added_vertices;
-    if (buffer_overflow || rc_primitive == HWPrimitive::LineStrip || m_batch.texture_mode != texture_mode ||
+    if (buffer_overflow || rc_primitive == BatchPrimitive::LineStrip || m_batch.texture_mode != texture_mode ||
        m_batch.transparency_mode != transparency_mode || m_batch.primitive != rc_primitive ||
        dithering_enable != m_batch.dithering || m_render_state.IsTextureWindowChanged())
    {
--- a/src/core/gpu_hw.h
+++ b/src/core/gpu_hw.h
@ -8,6 +8,22 @@
 class GPU_HW : public GPU
 {
 public:
  // Topology used to draw a batch of vertices. The numeric values are used as array
  // indices by the backend when it looks up the matching API primitive, so they must
  // stay contiguous and in this order.
  enum class BatchPrimitive : u8
  {
    Lines = 0,
    LineStrip = 1,
    Triangles = 2,
    TriangleStrip = 3
  };
  // Selects which pixels of a batch a shader variant keeps. The values are implicitly
  // numbered 0-3 and are used as the first index into the per-mode shader program table.
  enum class BatchRenderMode : u8
  {
    TransparencyDisabled, // no semi-transparency handling
    TransparentAndOpaque, // single pass handling both kinds of pixel
    OnlyOpaque,           // pass that discards semi-transparent pixels
    OnlyTransparent       // pass that discards non-semi-transparent pixels
  };
  GPU_HW();
  virtual ~GPU_HW();
@ -16,23 +32,7 @@ public:
  virtual void UpdateSettings() override;
 protected:
-  enum class HWPrimitive : u8
+  struct BatchVertex
  {
    Lines = 0,
    LineStrip = 1,
    Triangles = 2,
    TriangleStrip = 3
  };
  enum class HWBatchRenderMode : u8
  {
    TransparencyDisabled,
    TransparentAndOpaque,
    OnlyOpaque,
    OnlyTransparent
  };
  struct HWVertex
  {
    s32 x;
    s32 y;
@ -55,9 +55,9 @@ protected:
    }
  };
-  struct HWBatchConfig
+  struct BatchConfig
  {
-    HWPrimitive primitive;
+    BatchPrimitive primitive;
    TextureMode texture_mode;
    TransparencyMode transparency_mode;
    bool dithering;
@ -71,14 +71,14 @@ protected:
    }
    // Returns the render mode for this batch.
-    HWBatchRenderMode GetRenderMode() const
+    BatchRenderMode GetRenderMode() const
    {
-      return transparency_mode == TransparencyMode::Disabled ? HWBatchRenderMode::TransparencyDisabled :
+      return transparency_mode == TransparencyMode::Disabled ? BatchRenderMode::TransparencyDisabled :
-                                                               HWBatchRenderMode::TransparentAndOpaque;
+                                                               BatchRenderMode::TransparentAndOpaque;
    }
  };
-  struct HWBatchUBOData
+  struct BatchUBOData
  {
    s32 u_pos_offset[2];
    u32 u_texture_window_mask[2];
@ -90,7 +90,7 @@ protected:
  static constexpr u32 VRAM_UPDATE_TEXTURE_BUFFER_SIZE = VRAM_WIDTH * VRAM_HEIGHT * sizeof(u32);
  static constexpr u32 VERTEX_BUFFER_SIZE = 1 * 1024 * 1024;
  static constexpr u32 MIN_BATCH_VERTEX_COUNT = 6;
-  static constexpr u32 MAX_BATCH_VERTEX_COUNT = VERTEX_BUFFER_SIZE / sizeof(HWVertex);
+  static constexpr u32 MAX_BATCH_VERTEX_COUNT = VERTEX_BUFFER_SIZE / sizeof(BatchVertex);
  static constexpr u32 UNIFORM_BUFFER_SIZE = 512 * 1024;
  static constexpr std::tuple<float, float, float, float> RGBA8ToFloat(u32 rgba)
@ -121,31 +121,21 @@ protected:
    return std::make_tuple(x * s32(m_resolution_scale), y * s32(m_resolution_scale));
  }
-  std::string GenerateVertexShader(bool textured);
+  BatchVertex* m_batch_start_vertex_ptr = nullptr;
-  std::string GenerateFragmentShader(HWBatchRenderMode transparency, TextureMode texture_mode, bool dithering);
+  BatchVertex* m_batch_end_vertex_ptr = nullptr;
-  std::string GenerateScreenQuadVertexShader();
+  BatchVertex* m_batch_current_vertex_ptr = nullptr;
  std::string GenerateFillFragmentShader();
  std::string GenerateDisplayFragmentShader(bool depth_24bit, bool interlaced);
  std::string GenerateVRAMWriteFragmentShader();
  HWVertex* m_batch_start_vertex_ptr = nullptr;
  HWVertex* m_batch_end_vertex_ptr = nullptr;
  HWVertex* m_batch_current_vertex_ptr = nullptr;
  u32 m_batch_base_vertex = 0;
  u32 m_resolution_scale = 1;
  u32 m_max_resolution_scale = 1;
  bool m_true_color = false;
-  HWBatchConfig m_batch = {};
+  BatchConfig m_batch = {};
-  HWBatchUBOData m_batch_ubo_data = {};
+  BatchUBOData m_batch_ubo_data = {};
  bool m_batch_ubo_dirty = true;
 private:
-  static HWPrimitive GetPrimitiveForCommand(RenderCommand rc);
+  static BatchPrimitive GetPrimitiveForCommand(RenderCommand rc);
  void GenerateShaderHeader(std::stringstream& ss);
  void GenerateBatchUniformBuffer(std::stringstream& ss);
  void LoadVertices(RenderCommand rc, u32 num_vertices, const u32* command_ptr);
  void AddDuplicateVertex();
--- a/src/core/gpu_hw_opengl.cpp
+++ b/src/core/gpu_hw_opengl.cpp
@ -2,6 +2,7 @@
 #include "YBaseLib/Assert.h"
 #include "YBaseLib/Log.h"
 #include "YBaseLib/String.h"
 #include "gpu_hw_shadergen.h"
 #include "host_interface.h"
 #include "imgui.h"
 #include "system.h"
@ -138,9 +139,9 @@ void GPU_HW_OpenGL::MapBatchVertexPointer(u32 required_vertices)
  Assert(!m_batch_start_vertex_ptr);
  const GL::StreamBuffer::MappingResult res =
-    m_vertex_stream_buffer->Map(sizeof(HWVertex), required_vertices * sizeof(HWVertex));
+    m_vertex_stream_buffer->Map(sizeof(BatchVertex), required_vertices * sizeof(BatchVertex));
-  m_batch_start_vertex_ptr = static_cast<HWVertex*>(res.pointer);
+  m_batch_start_vertex_ptr = static_cast<BatchVertex*>(res.pointer);
  m_batch_current_vertex_ptr = m_batch_start_vertex_ptr;
  m_batch_end_vertex_ptr = m_batch_start_vertex_ptr + res.space_aligned;
  m_batch_base_vertex = res.index_aligned;
@ -246,11 +247,11 @@ void GPU_HW_OpenGL::CreateVertexBuffer()
  glEnableVertexAttribArray(1);
  glEnableVertexAttribArray(2);
  glEnableVertexAttribArray(3);
-  glVertexAttribIPointer(0, 2, GL_INT, sizeof(HWVertex), reinterpret_cast<void*>(offsetof(HWVertex, x)));
+  glVertexAttribIPointer(0, 2, GL_INT, sizeof(BatchVertex), reinterpret_cast<void*>(offsetof(BatchVertex, x)));
-  glVertexAttribPointer(1, 4, GL_UNSIGNED_BYTE, true, sizeof(HWVertex),
+  glVertexAttribPointer(1, 4, GL_UNSIGNED_BYTE, true, sizeof(BatchVertex),
-                        reinterpret_cast<void*>(offsetof(HWVertex, color)));
+                        reinterpret_cast<void*>(offsetof(BatchVertex, color)));
-  glVertexAttribIPointer(2, 2, GL_INT, sizeof(HWVertex), reinterpret_cast<void*>(offsetof(HWVertex, texcoord)));
+  glVertexAttribIPointer(2, 2, GL_INT, sizeof(BatchVertex), reinterpret_cast<void*>(offsetof(BatchVertex, texcoord)));
-  glVertexAttribIPointer(3, 1, GL_INT, sizeof(HWVertex), reinterpret_cast<void*>(offsetof(HWVertex, texpage)));
+  glVertexAttribIPointer(3, 1, GL_INT, sizeof(BatchVertex), reinterpret_cast<void*>(offsetof(BatchVertex, texpage)));
  glBindVertexArray(0);
  glGenVertexArrays(1, &m_attributeless_vao_id);
@ -280,31 +281,56 @@ void GPU_HW_OpenGL::CreateTextureBuffer()
 bool GPU_HW_OpenGL::CompilePrograms()
 {
  GPU_HW_ShaderGen shadergen(GPU_HW_ShaderGen::Backend::OpenGL, m_resolution_scale, m_true_color);
  for (u32 render_mode = 0; render_mode < 4; render_mode++)
  {
    for (u32 texture_mode = 0; texture_mode < 9; texture_mode++)
    {
      for (u8 dithering = 0; dithering < 2; dithering++)
      {
-        if (!CompileProgram(m_render_programs[render_mode][texture_mode][dithering],
+        const bool textured = (static_cast<TextureMode>(texture_mode) != TextureMode::Disabled);
-                            static_cast<HWBatchRenderMode>(render_mode), static_cast<TextureMode>(texture_mode),
+        const std::string vs = shadergen.GenerateBatchVertexShader(textured);
-                            ConvertToBoolUnchecked(dithering)))
+        const std::string fs = shadergen.GenerateBatchFragmentShader(static_cast<BatchRenderMode>(render_mode),
-        {
+                                                                     static_cast<TextureMode>(texture_mode),
                                                                     ConvertToBoolUnchecked(dithering));
        GL::Program& prog = m_render_programs[render_mode][texture_mode][dithering];
        if (!prog.Compile(vs, fs))
          return false;
        prog.BindAttribute(0, "a_pos");
        prog.BindAttribute(1, "a_col0");
        if (textured)
        {
          prog.BindAttribute(2, "a_texcoord");
          prog.BindAttribute(3, "a_texpage");
        }
        prog.BindFragData(0, "o_col0");
        if (!prog.Link())
          return false;
        prog.BindUniformBlock("UBOBlock", 1);
        if (textured)
        {
          prog.Bind();
          prog.RegisterUniform("samp0");
          prog.Uniform1i(0, 0);
        }
      }
    }
  }
  // TODO: Use string_view
  for (u8 depth_24bit = 0; depth_24bit < 2; depth_24bit++)
  {
    for (u8 interlaced = 0; interlaced < 2; interlaced++)
    {
      GL::Program& prog = m_display_programs[depth_24bit][interlaced];
-      const std::string vs = GenerateScreenQuadVertexShader();
+      const std::string vs = shadergen.GenerateScreenQuadVertexShader();
-      const std::string fs =
+      const std::string fs = shadergen.GenerateDisplayFragmentShader(ConvertToBoolUnchecked(depth_24bit),
-        GenerateDisplayFragmentShader(ConvertToBoolUnchecked(depth_24bit), ConvertToBoolUnchecked(interlaced));
+                                                                     ConvertToBoolUnchecked(interlaced));
      if (!prog.Compile(vs, fs))
        return false;
@ -319,8 +345,11 @@ bool GPU_HW_OpenGL::CompilePrograms()
    }
  }
-  if (!m_vram_write_program.Compile(GenerateScreenQuadVertexShader(), GenerateVRAMWriteFragmentShader()))
+  if (!m_vram_write_program.Compile(shadergen.GenerateScreenQuadVertexShader(),
                                    shadergen.GenerateVRAMWriteFragmentShader()))
  {
    return false;
  }
  m_vram_write_program.BindFragData(0, "o_col0");
  if (!m_vram_write_program.Link())
@ -335,41 +364,7 @@ bool GPU_HW_OpenGL::CompilePrograms()
  return true;
 }
-bool GPU_HW_OpenGL::CompileProgram(GL::Program& prog, HWBatchRenderMode render_mode, TextureMode texture_mode,
+void GPU_HW_OpenGL::SetDrawState(BatchRenderMode render_mode)
                                   bool dithering)
 {
  const bool textured = texture_mode != TextureMode::Disabled;
  const std::string vs = GenerateVertexShader(textured);
  const std::string fs = GenerateFragmentShader(render_mode, texture_mode, dithering);
  if (!prog.Compile(vs, fs))
    return false;
  prog.BindAttribute(0, "a_pos");
  prog.BindAttribute(1, "a_col0");
  if (textured)
  {
    prog.BindAttribute(2, "a_texcoord");
    prog.BindAttribute(3, "a_texpage");
  }
  prog.BindFragData(0, "o_col0");
  if (!prog.Link())
    return false;
  prog.BindUniformBlock("UBOBlock", 1);
  if (textured)
  {
    prog.Bind();
    prog.RegisterUniform("samp0");
    prog.Uniform1i(0, 0);
  }
  return true;
 }
 void GPU_HW_OpenGL::SetDrawState(HWBatchRenderMode render_mode)
 {
  const GL::Program& prog = m_render_programs[static_cast<u8>(render_mode)][static_cast<u8>(m_batch.texture_mode)]
                                             [BoolToUInt8(m_batch.dithering)];
@ -378,7 +373,7 @@ void GPU_HW_OpenGL::SetDrawState(HWBatchRenderMode render_mode)
  if (m_batch.texture_mode != TextureMode::Disabled)
    m_vram_read_texture->Bind();
-  if (m_batch.transparency_mode == TransparencyMode::Disabled || render_mode == HWBatchRenderMode::OnlyOpaque)
+  if (m_batch.transparency_mode == TransparencyMode::Disabled || render_mode == BatchRenderMode::OnlyOpaque)
  {
    glDisable(GL_BLEND);
  }
@ -732,7 +727,7 @@ void GPU_HW_OpenGL::FlushRender()
  m_stats.num_batches++;
  m_stats.num_vertices += vertex_count;
-  m_vertex_stream_buffer->Unmap(vertex_count * sizeof(HWVertex));
+  m_vertex_stream_buffer->Unmap(vertex_count * sizeof(BatchVertex));
  m_vertex_stream_buffer->Bind();
  m_batch_start_vertex_ptr = nullptr;
  m_batch_end_vertex_ptr = nullptr;
@ -742,9 +737,9 @@ void GPU_HW_OpenGL::FlushRender()
  if (m_batch.NeedsTwoPassRendering())
  {
-    SetDrawState(HWBatchRenderMode::OnlyTransparent);
+    SetDrawState(BatchRenderMode::OnlyTransparent);
    glDrawArrays(gl_primitives[static_cast<u8>(m_batch.primitive)], 0, vertex_count);
-    SetDrawState(HWBatchRenderMode::OnlyOpaque);
+    SetDrawState(BatchRenderMode::OnlyOpaque);
    glDrawArrays(gl_primitives[static_cast<u8>(m_batch.primitive)], 0, vertex_count);
  }
  else
--- a/src/core/gpu_hw_opengl.h
+++ b/src/core/gpu_hw_opengl.h
@ -58,8 +58,7 @@ private:
  void CreateTextureBuffer();
  bool CompilePrograms();
-  bool CompileProgram(GL::Program& prog, HWBatchRenderMode render_mode, TextureMode texture_mode, bool dithering);
+  void SetDrawState(BatchRenderMode render_mode);
  void SetDrawState(HWBatchRenderMode render_mode);
  void UploadUniformBlock(const void* data, u32 data_size);
  // downsample texture - used for readbacks at >1xIR.
--- a/src/core/gpu_hw_shadergen.cpp
+++ b/src/core/gpu_hw_shadergen.cpp
@ -0,0 +1,436 @@
 #include "gpu_hw_shadergen.h"
 #include <locale>
 GPU_HW_ShaderGen::GPU_HW_ShaderGen(Backend backend, u32 resolution_scale, bool true_color)
  : m_backend(backend), m_resolution_scale(resolution_scale), m_true_color(true_color)
 {
 }
 GPU_HW_ShaderGen::~GPU_HW_ShaderGen() = default;
 // Emits a preprocessor switch for the generated shader.
 // An enabled switch is written as "#define NAME 1"; a disabled one is written as a comment only, so the
 // name stays undefined in the shader source.
 static void DefineMacro(std::stringstream& ss, const char* name, bool enabled)
 {
  if (!enabled)
  {
    ss << "/* #define " << name << " 0 */\n";
    return;
  }
  ss << "#define " << name << " 1\n";
 }
 // Writes the preamble placed at the top of every generated shader: the GLSL version directive, the
 // RESOLUTION_SCALE / VRAM_SIZE / RCP_VRAM_SIZE constants, and helper functions for flipping Y
 // coordinates and converting between RGBA8 and RGBA5551 colours.
 void GPU_HW_ShaderGen::GenerateShaderHeader(std::stringstream& ss)
 {
  // Integers are written straight into GLSL source text. A stream picks up the global C++ locale when it
  // is constructed, and a locale with digit grouping would format e.g. 1024 as "1,024", producing source
  // that does not compile. Pin the classic locale; it stays in effect for everything streamed afterwards.
  ss.imbue(std::locale::classic());
  ss << "#version 330 core\n\n";
  ss << "const int RESOLUTION_SCALE = " << m_resolution_scale << ";\n";
  ss << "const ivec2 VRAM_SIZE = ivec2(" << GPU::VRAM_WIDTH << ", " << GPU::VRAM_HEIGHT << ") * RESOLUTION_SCALE;\n";
  ss << "const vec2 RCP_VRAM_SIZE = vec2(1.0, 1.0) / vec2(VRAM_SIZE);\n";
  ss << R"(
 float fixYCoord(float y)
 {
  return 1.0 - RCP_VRAM_SIZE.y - y;
 }
 int fixYCoord(int y)
 {
  return VRAM_SIZE.y - y - 1;
 }
 uint RGBA8ToRGBA5551(vec4 v)
 {
  uint r = uint(v.r * 255.0) >> 3;
  uint g = uint(v.g * 255.0) >> 3;
  uint b = uint(v.b * 255.0) >> 3;
  uint a = (v.a != 0.0) ? 1u : 0u;
  return (r) | (g << 5) | (b << 10) | (a << 15);
 }
 vec4 RGBA5551ToRGBA8(uint v)
 {
  uint r = (v & 31u);
  uint g = ((v >> 5) & 31u);
  uint b = ((v >> 10) & 31u);
  uint a = ((v >> 15) & 1u);
  // repeat lower bits
  r = (r << 3) | (r & 7u);
  g = (g << 3) | (g & 7u);
  b = (b << 3) | (b & 7u);
  return vec4(float(r) / 255.0, float(g) / 255.0, float(b) / 255.0, float(a));
 }
 )";
 }
 // Appends the declaration of the uniform block read by the batch shaders (position offset, texture
 // window mask/offset and the two alpha factors).
 // NOTE(review): the block has no explicit layout qualifier - confirm the host-side upload matches the
 // member offsets the driver assigns.
 void GPU_HW_ShaderGen::GenerateBatchUniformBuffer(std::stringstream& ss)
 {
  static constexpr char uniform_block_source[] = R"(
 uniform UBOBlock {
  ivec2 u_pos_offset;
  uvec2 u_texture_window_mask;
  uvec2 u_texture_window_offset;
  float u_src_alpha_factor;
  float u_dst_alpha_factor;
 };
 )";
  ss << uniform_block_source;
 }
 // Builds the vertex shader used for batched primitives.
 // When `textured` is true the TEXTURED macro is defined, which makes the shader additionally output the
 // normalised texture coordinate and the decoded texture page / palette location.
 std::string GPU_HW_ShaderGen::GenerateBatchVertexShader(bool textured)
 {
  static constexpr char vertex_body[] = R"(
 in ivec2 a_pos;
 in vec4 a_col0;
 in int a_texcoord;
 in int a_texpage;
 out vec3 v_col0;
 #if TEXTURED
  out vec2 v_tex0;
  flat out ivec4 v_texpage;
 #endif
 void main()
 {
  // 0..+1023 -> -1..1
  float pos_x = (float(a_pos.x + u_pos_offset.x) / 512.0) - 1.0;
  float pos_y = (float(a_pos.y + u_pos_offset.y) / -256.0) + 1.0;
  gl_Position = vec4(pos_x, pos_y, 0.0, 1.0);
  v_col0 = a_col0.rgb;
  #if TEXTURED
    v_tex0 = vec2(float(a_texcoord & 0xFFFF), float(a_texcoord >> 16)) / vec2(255.0);
    // base_x,base_y,palette_x,palette_y
    v_texpage.x = (a_texpage & 15) * 64 * RESOLUTION_SCALE;
    v_texpage.y = ((a_texpage >> 4) & 1) * 256 * RESOLUTION_SCALE;
    v_texpage.z = ((a_texpage >> 16) & 63) * 16 * RESOLUTION_SCALE;
    v_texpage.w = ((a_texpage >> 22) & 511) * RESOLUTION_SCALE;
  #endif
 }
 )";
  std::stringstream source;
  // Order matters: common header, then the feature switch, then the uniform block, then the body.
  GenerateShaderHeader(source);
  DefineMacro(source, "TEXTURED", textured);
  GenerateBatchUniformBuffer(source);
  source << vertex_body;
  return source.str();
 }
 // Builds the fragment shader used for batched primitives.
 //   transparency - selects the TRANSPARENCY* switches (disabled, opaque-only pass, transparent-only pass).
 //   texture_mode - texture format; the RawTextureBit flag is split off and exposed as RAW_TEXTURE.
 //   dithering    - defines DITHERING, which applies the dither matrix before output.
 // TRUE_COLOR is taken from the generator's own setting; when it is off the colour is truncated to 15 bits.
 std::string GPU_HW_ShaderGen::GenerateBatchFragmentShader(GPU_HW::BatchRenderMode transparency,
                                                           GPU::TextureMode texture_mode, bool dithering)
 {
  using RenderMode = GPU_HW::BatchRenderMode;
  using TexMode = GPU::TextureMode;
  // Separate the base texture format from the raw-texture flag.
  const TexMode base_texture_mode = texture_mode & ~TexMode::RawTextureBit;
  const bool use_raw_texture = (texture_mode & TexMode::RawTextureBit) == TexMode::RawTextureBit;
  const bool palette_4bit = (base_texture_mode == TexMode::Palette4Bit);
  const bool palette_8bit = (base_texture_mode == TexMode::Palette8Bit);
  std::stringstream source;
  GenerateShaderHeader(source);
  GenerateBatchUniformBuffer(source);
  DefineMacro(source, "TRANSPARENCY", transparency != RenderMode::TransparencyDisabled);
  DefineMacro(source, "TRANSPARENCY_ONLY_OPAQUE", transparency == RenderMode::OnlyOpaque);
  DefineMacro(source, "TRANSPARENCY_ONLY_TRANSPARENCY", transparency == RenderMode::OnlyTransparent);
  DefineMacro(source, "TEXTURED", base_texture_mode != TexMode::Disabled);
  DefineMacro(source, "PALETTE", palette_4bit || palette_8bit);
  DefineMacro(source, "PALETTE_4_BIT", palette_4bit);
  DefineMacro(source, "PALETTE_8_BIT", palette_8bit);
  DefineMacro(source, "RAW_TEXTURE", use_raw_texture);
  DefineMacro(source, "DITHERING", dithering);
  DefineMacro(source, "TRUE_COLOR", m_true_color);
  // Flatten the 4x4 dither matrix row by row into a GLSL constant array.
  source << "const int[16] s_dither_values = int[16]( ";
  for (u32 row = 0; row < 4; row++)
  {
    for (u32 col = 0; col < 4; col++)
    {
      if (row != 0 || col != 0)
        source << ", ";
      source << GPU::DITHER_MATRIX[row][col];
    }
  }
  source << " );\n";
  source << R"(
 in vec3 v_col0;
 #if TEXTURED
  in vec2 v_tex0;
  flat in ivec4 v_texpage;
  uniform sampler2D samp0;
 #endif
 out vec4 o_col0;
 ivec3 ApplyDithering(ivec3 icol)
 {
  ivec2 fc = (ivec2(gl_FragCoord.xy) / ivec2(RESOLUTION_SCALE, RESOLUTION_SCALE)) & ivec2(3, 3);
  int offset = s_dither_values[fc.y * 4 + fc.x];
  return icol + ivec3(offset, offset, offset);
 }
 ivec3 TruncateTo15Bit(ivec3 icol)
 {
  icol = clamp(icol, ivec3(0, 0, 0), ivec3(255, 255, 255));
  return (icol & ivec3(~7, ~7, ~7)) | ((icol >> 3) & ivec3(7, 7, 7));
 }
 #if TEXTURED
 ivec2 ApplyNativeTextureWindow(ivec2 coords)
 {
  uint x = (uint(coords.x) & ~(u_texture_window_mask.x * 8u)) | ((u_texture_window_offset.x & u_texture_window_mask.x) * 8u);
  uint y = (uint(coords.y) & ~(u_texture_window_mask.y * 8u)) | ((u_texture_window_offset.y & u_texture_window_mask.y) * 8u);
  return ivec2(int(x), int(y));
 }  
 ivec2 ApplyTextureWindow(ivec2 coords)
 {
  if (RESOLUTION_SCALE == 1)
    return ApplyNativeTextureWindow(coords);
  ivec2 downscaled_coords = coords / ivec2(RESOLUTION_SCALE);
  ivec2 coords_offset = coords % ivec2(RESOLUTION_SCALE);
  return (ApplyNativeTextureWindow(downscaled_coords) * ivec2(RESOLUTION_SCALE)) + coords_offset;
 }
 ivec4 SampleFromVRAM(vec2 coord)
 {
  // from 0..1 to 0..255
  ivec2 icoord = ivec2(coord * vec2(255 * RESOLUTION_SCALE));
  icoord = ApplyTextureWindow(icoord);
  // adjust for tightly packed palette formats
  ivec2 index_coord = icoord;
  #if PALETTE_4_BIT
    index_coord.x /= 4;
  #elif PALETTE_8_BIT
    index_coord.x /= 2;
  #endif
  // fixup coords
  ivec2 vicoord = ivec2(v_texpage.x + index_coord.x, fixYCoord(v_texpage.y + index_coord.y));
  // load colour/palette
  vec4 color = texelFetch(samp0, vicoord, 0);
  // apply palette
  #if PALETTE
    #if PALETTE_4_BIT
      int subpixel = int(icoord.x / RESOLUTION_SCALE) & 3;
      uint vram_value = RGBA8ToRGBA5551(color);
      int palette_index = int((vram_value >> (subpixel * 4)) & 0x0Fu);
    #elif PALETTE_8_BIT
      int subpixel = int(icoord.x / RESOLUTION_SCALE) & 1;
      uint vram_value = RGBA8ToRGBA5551(color);
      int palette_index = int((vram_value >> (subpixel * 8)) & 0xFFu);
    #endif
    ivec2 palette_icoord = ivec2(v_texpage.z + (palette_index * RESOLUTION_SCALE), fixYCoord(v_texpage.w));
    color = texelFetch(samp0, palette_icoord, 0);
  #endif
  return ivec4(color * vec4(255.0, 255.0, 255.0, 255.0));
 }
 #endif
 void main()
 {
  ivec3 vertcol = ivec3(v_col0 * vec3(255.0, 255.0, 255.0));
  bool semitransparent;
  bool new_mask_bit;
  ivec3 icolor;
  #if TEXTURED
    ivec4 texcol = SampleFromVRAM(v_tex0);
    if (texcol == ivec4(0.0, 0.0, 0.0, 0.0))
      discard;
    // Grab semitransparent bit from the texture color.
    semitransparent = (texcol.a != 0);
    #if RAW_TEXTURE
      icolor = texcol.rgb;
    #else
      icolor = (vertcol * texcol.rgb) >> 7;
    #endif
  #else
    // All pixels are semitransparent for untextured polygons.
    semitransparent = true;
    icolor = vertcol;
  #endif
  // Apply dithering
  #if DITHERING
    icolor = ApplyDithering(icolor);
  #endif
  // Clip to 15-bit range
  #if !TRUE_COLOR
    icolor = TruncateTo15Bit(icolor);
  #endif
  // Normalize
  vec3 color = vec3(icolor) / vec3(255.0, 255.0, 255.0);
  #if TRANSPARENCY
    // Apply semitransparency. If not a semitransparent texel, destination alpha is ignored.
    if (semitransparent)
    {
      #if TRANSPARENCY_ONLY_OPAQUE
        discard;
      #endif
      o_col0 = vec4(color * u_src_alpha_factor, u_dst_alpha_factor);
    }
    else
    {
      #if TRANSPARENCY_ONLY_TRANSPARENCY
        discard;
      #endif
      o_col0 = vec4(color, 0.0);
    }
  #else
    o_col0 = vec4(color, 0.0);
  #endif
 }
 )";
  return source.str();
 }
 // Builds a vertex shader that declares no vertex inputs: both the texture coordinate and the clip-space
 // position are computed from gl_VertexID.
 std::string GPU_HW_ShaderGen::GenerateScreenQuadVertexShader()
 {
  static constexpr char quad_body[] = R"(
 out vec2 v_tex0;
 void main()
 {
  v_tex0 = vec2(float((gl_VertexID << 1) & 2), float(gl_VertexID & 2));
  gl_Position = vec4(v_tex0 * vec2(2.0f, -2.0f) + vec2(-1.0f, 1.0f), 0.0f, 1.0f);
  gl_Position.y = -gl_Position.y;
 }
 )";
  std::stringstream source;
  GenerateShaderHeader(source);
  source << quad_body;
  return source.str();
 }
 // Builds a fragment shader that writes the `fill_color` uniform to every fragment.
 std::string GPU_HW_ShaderGen::GenerateFillFragmentShader()
 {
  static constexpr char fill_body[] = R"(
 uniform vec4 fill_color;
 out vec4 o_col0;
 void main()
 {
  o_col0 = fill_color;
 }
 )";
  std::stringstream source;
  GenerateShaderHeader(source);
  source << fill_body;
  return source.str();
 }
 // Builds the fragment shader that reads the display area out of the VRAM texture.
 //   depth_24bit - defines DEPTH_24BIT: each output pixel is reassembled from the bytes of adjacent
 //                 16-bit VRAM values instead of being fetched directly.
 //   interlaced  - defines INTERLACED: fragments on odd native-resolution rows, measured relative to
 //                 u_base_coords.z, are discarded.
 std::string GPU_HW_ShaderGen::GenerateDisplayFragmentShader(bool depth_24bit, bool interlaced)
 {
  std::stringstream source;
  GenerateShaderHeader(source);
  // Feature switches must precede the body that tests them.
  DefineMacro(source, "DEPTH_24BIT", depth_24bit);
  DefineMacro(source, "INTERLACED", interlaced);
  source << R"(
 in vec2 v_tex0;
 out vec4 o_col0;
 uniform sampler2D samp0;
 uniform ivec3 u_base_coords;
 ivec2 GetCoords(vec2 fragcoord)
 {
  ivec2 icoords = ivec2(fragcoord);
  #if INTERLACED
    if ((((icoords.y - u_base_coords.z) / RESOLUTION_SCALE) & 1) != 0)
      discard;
  #endif
  return icoords;
 }
 void main()
 {
  ivec2 icoords = GetCoords(gl_FragCoord.xy);
  #if DEPTH_24BIT
    // compute offset in dwords from the start of the 24-bit values
    ivec2 base = ivec2(u_base_coords.x, u_base_coords.y + icoords.y);
    int xoff = int(icoords.x);
    int dword_index = (xoff / 2) + (xoff / 4);
    // sample two adjacent dwords, or four 16-bit values as the 24-bit value will lie somewhere between these
    uint s0 = RGBA8ToRGBA5551(texelFetch(samp0, ivec2(base.x + dword_index * 2 + 0, base.y), 0));
    uint s1 = RGBA8ToRGBA5551(texelFetch(samp0, ivec2(base.x + dword_index * 2 + 1, base.y), 0));
    uint s2 = RGBA8ToRGBA5551(texelFetch(samp0, ivec2(base.x + (dword_index + 1) * 2 + 0, base.y), 0));
    uint s3 = RGBA8ToRGBA5551(texelFetch(samp0, ivec2(base.x + (dword_index + 1) * 2 + 1, base.y), 0));
    // select the bit for this pixel depending on its offset in the 4-pixel block
    uint r, g, b;
    int block_offset = xoff & 3;
    if (block_offset == 0)
    {
      r = s0 & 0xFFu;
      g = s0 >> 8;
      b = s1 & 0xFFu;
    }
    else if (block_offset == 1)
    {
      r = s1 >> 8;
      g = s2 & 0xFFu;
      b = s2 >> 8;
    }
    else if (block_offset == 2)
    {
      r = s1 & 0xFFu;
      g = s1 >> 8;
      b = s2 & 0xFFu;
    }
    else
    {
      r = s2 >> 8;
      g = s3 & 0xFFu;
      b = s3 >> 8;
    }
    // and normalize
    o_col0 = vec4(float(r) / 255.0, float(g) / 255.0, float(b) / 255.0, 1.0);
  #else
    // load and return
    o_col0 = texelFetch(samp0, u_base_coords.xy + icoords, 0);
  #endif
 }
 )";
  return source.str();
 }
 // Builds the fragment shader that copies pixel data from a buffer texture into the render target.
 // Each fragment maps its native-resolution coordinate to an index into `samp0` (rows are read
 // bottom-up relative to u_size.y) and converts the fetched RGBA5551 value to RGBA8.
 std::string GPU_HW_ShaderGen::GenerateVRAMWriteFragmentShader()
 {
  static constexpr char write_body[] = R"(
 uniform ivec2 u_base_coords;
 uniform ivec2 u_size;
 uniform usamplerBuffer samp0;
 out vec4 o_col0;
 void main()
 {
  ivec2 coords = ivec2(gl_FragCoord.xy) / ivec2(RESOLUTION_SCALE, RESOLUTION_SCALE);
  ivec2 offset = coords - u_base_coords;
  offset.y = u_size.y - offset.y - 1;
  int buffer_offset = offset.y * u_size.x + offset.x;
  uint value = texelFetch(samp0, buffer_offset).r;
  o_col0 = RGBA5551ToRGBA8(value);
 })";
  std::stringstream source;
  GenerateShaderHeader(source);
  source << write_body;
  return source.str();
 }
--- a/src/core/gpu_hw_shadergen.h
+++ b/src/core/gpu_hw_shadergen.h
@ -0,0 +1,34 @@
 #pragma once
 #include <sstream>
 #include <string>
 #include "gpu_hw.h"
 // Produces GLSL source text for the shaders declared below. Every Generate*() call returns a complete
 // shader as a string; compiling and linking is left to the caller.
 class GPU_HW_ShaderGen
 {
 public:
  // Graphics API the generated source is written for.
  enum class Backend { OpenGL };
  GPU_HW_ShaderGen(Backend backend, u32 resolution_scale, bool true_color);
  ~GPU_HW_ShaderGen();
  // NOTE(review): gpu_hw_shadergen.cpp contains no definition for Init() - confirm whether this
  // declaration is still needed.
  void Init(Backend backend, u32 resolution_scale, bool true_color);
  // Shaders for batched primitive rendering.
  std::string GenerateBatchVertexShader(bool textured);
  std::string GenerateBatchFragmentShader(GPU_HW::BatchRenderMode transparency, GPU::TextureMode texture_mode,
                                          bool dithering);
  // Vertex shader that needs no vertex attributes (positions come from gl_VertexID).
  std::string GenerateScreenQuadVertexShader();
  // Fragment shaders for filling, displaying and writing VRAM.
  std::string GenerateFillFragmentShader();
  std::string GenerateDisplayFragmentShader(bool depth_24bit, bool interlaced);
  std::string GenerateVRAMWriteFragmentShader();
  // Settings captured by the constructor; these are publicly accessible.
  Backend m_backend;
  u32 m_resolution_scale;
  bool m_true_color;
 private:
  // Shared preamble and uniform-block writers used by the generators above.
  void GenerateShaderHeader(std::stringstream& ss);
  void GenerateBatchUniformBuffer(std::stringstream& ss);
 };