From 72ab669e700eda8a1fb24f1b6696a04a0f0ba5ad Mon Sep 17 00:00:00 2001
From: Stenzek <stenzek@gmail.com>
Date: Fri, 8 Mar 2024 17:55:02 +1000
Subject: [PATCH] GPUDevice: Add support for feedback loops

---
 src/core/gpu.cpp                        |  10 +-
 src/core/gpu.h                          |   1 +
 src/core/gpu_hw.cpp                     |   3 +-
 src/util/d3d11_device.cpp               |   9 +-
 src/util/d3d11_device.h                 |   4 +-
 src/util/d3d12_device.cpp               |  10 +-
 src/util/d3d12_device.h                 |   4 +-
 src/util/gpu_device.cpp                 |   5 +-
 src/util/gpu_device.h                   |  37 ++-
 src/util/metal_device.mm                |   1 +
 src/util/opengl_device.cpp              |  15 +-
 src/util/opengl_device.h                |   4 +-
 src/util/postprocessing_shader_fx.cpp   |   1 +
 src/util/postprocessing_shader_glsl.cpp |   1 +
 src/util/shadergen.cpp                  |  35 ++-
 src/util/shadergen.h                    |   2 +-
 src/util/vulkan_builders.cpp            |  16 ++
 src/util/vulkan_builders.h              |   4 +
 src/util/vulkan_device.cpp              | 356 ++++++++++++++++--------
 src/util/vulkan_device.h                |  37 ++-
 src/util/vulkan_pipeline.cpp            |  35 ++-
 src/util/vulkan_pipeline.h              |   5 +-
 src/util/vulkan_texture.cpp             |  22 +-
 23 files changed, 426 insertions(+), 191 deletions(-)

diff --git a/src/core/gpu.cpp b/src/core/gpu.cpp
index 3d87e9f1a..2698454ee 100644
--- a/src/core/gpu.cpp
+++ b/src/core/gpu.cpp
@@ -1615,10 +1615,11 @@ bool GPU::CompileDisplayPipelines(bool display, bool deinterlace, bool chroma_sm
   plconfig.rasterization = GPUPipeline::RasterizationState::GetNoCullState();
   plconfig.depth = GPUPipeline::DepthState::GetNoTestsState();
   plconfig.blend = GPUPipeline::BlendState::GetNoBlendingState();
+  plconfig.geometry_shader = nullptr;
   plconfig.depth_format = GPUTexture::Format::Unknown;
   plconfig.samples = 1;
   plconfig.per_sample_shading = false;
-  plconfig.geometry_shader = nullptr;
+  plconfig.render_pass_flags = GPUPipeline::NoRenderPassFlags;
 
   if (display)
   {
@@ -2707,10 +2708,10 @@ void GPU::GetStatsString(SmallStringBase& str)
 {
   if (IsHardwareRenderer())
   {
-    str.format("{} HW | {} P | {} DC | {} RP | {} RB | {} C | {} W",
+    str.format("{} HW | {} P | {} DC | {} B | {} RP | {} RB | {} C | {} W",
                GPUDevice::RenderAPIToString(g_gpu_device->GetRenderAPI()), m_stats.num_primitives,
-               m_stats.host_num_draws, m_stats.host_num_render_passes, m_stats.host_num_downloads, m_stats.num_copies,
-               m_stats.num_writes);
+               m_stats.host_num_draws, m_stats.host_num_barriers, m_stats.host_num_render_passes,
+               m_stats.host_num_downloads, m_stats.num_copies, m_stats.num_writes);
   }
   else
   {
@@ -2753,6 +2754,7 @@ void GPU::UpdateStatistics(u32 frame_count)
 
   UPDATE_GPU_STAT(buffer_streamed);
   UPDATE_GPU_STAT(num_draws);
+  UPDATE_GPU_STAT(num_barriers);
   UPDATE_GPU_STAT(num_render_passes);
   UPDATE_GPU_STAT(num_copies);
   UPDATE_GPU_STAT(num_downloads);
diff --git a/src/core/gpu.h b/src/core/gpu.h
index a4f516a5c..bbfe4e6eb 100644
--- a/src/core/gpu.h
+++ b/src/core/gpu.h
@@ -625,6 +625,7 @@ protected:
   {
     size_t host_buffer_streamed;
     u32 host_num_draws;
+    u32 host_num_barriers;
     u32 host_num_render_passes;
     u32 host_num_copies;
     u32 host_num_downloads;
diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp
index f272fc057..98a4bb255 100644
--- a/src/core/gpu_hw.cpp
+++ b/src/core/gpu_hw.cpp
@@ -819,10 +819,11 @@ bool GPU_HW::CompilePipelines()
   plconfig.input_layout.vertex_stride = sizeof(BatchVertex);
   plconfig.rasterization = GPUPipeline::RasterizationState::GetNoCullState();
   plconfig.primitive = GPUPipeline::Primitive::Triangles;
+  plconfig.geometry_shader = nullptr;
   plconfig.SetTargetFormats(VRAM_RT_FORMAT, VRAM_DS_FORMAT);
   plconfig.samples = m_multisamples;
   plconfig.per_sample_shading = m_per_sample_shading;
-  plconfig.geometry_shader = nullptr;
+  plconfig.render_pass_flags = GPUPipeline::NoRenderPassFlags;
 
   // [depth_test][render_mode][texture_mode][transparency_mode][dithering][interlacing]
   for (u8 depth_test = 0; depth_test < 3; depth_test++)
diff --git a/src/util/d3d11_device.cpp b/src/util/d3d11_device.cpp
index 1c346558c..1b24fecc5 100644
--- a/src/util/d3d11_device.cpp
+++ b/src/util/d3d11_device.cpp
@@ -186,6 +186,7 @@ void D3D11Device::SetFeatures(FeatureMask disabled_features)
   m_features.texture_copy_to_self = false;
   m_features.supports_texture_buffers = !(disabled_features & FEATURE_MASK_TEXTURE_BUFFERS);
   m_features.texture_buffers_emulated_with_ssbo = false;
+  m_features.feedback_loops = false;
   m_features.geometry_shaders = !(disabled_features & FEATURE_MASK_GEOMETRY_SHADERS);
   m_features.partial_msaa_resolve = false;
   m_features.memory_import = false;
@@ -935,9 +936,10 @@ void D3D11Device::UnmapUniformBuffer(u32 size)
   }
 }
 
-void D3D11Device::SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds)
+void D3D11Device::SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds, GPUPipeline::RenderPassFlag feedback_loop)
 {
   ID3D11RenderTargetView* rtvs[MAX_RENDER_TARGETS];
+  DebugAssert(!feedback_loop);
 
   bool changed = (m_num_current_render_targets != num_rts || m_current_depth_target != ds);
   m_current_depth_target = static_cast<D3D11Texture*>(ds);
@@ -1083,3 +1085,8 @@ void D3D11Device::DrawIndexed(u32 index_count, u32 base_index, u32 base_vertex)
   s_stats.num_draws++;
   m_context->DrawIndexed(index_count, base_index, base_vertex);
 }
+
+void D3D11Device::DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 base_vertex, DrawBarrier type)
+{
+  Panic("Barriers are not supported"); // SetFeatures() sets feedback_loops = false, so callers should never get here.
+}
diff --git a/src/util/d3d11_device.h b/src/util/d3d11_device.h
index 73c9a53bf..3e35ced45 100644
--- a/src/util/d3d11_device.h
+++ b/src/util/d3d11_device.h
@@ -85,7 +85,8 @@ public:
   void PushUniformBuffer(const void* data, u32 data_size) override;
   void* MapUniformBuffer(u32 size) override;
   void UnmapUniformBuffer(u32 size) override;
-  void SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds) override;
+  void SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds,
+                        GPUPipeline::RenderPassFlag feedback_loop = GPUPipeline::NoRenderPassFlags) override;
   void SetPipeline(GPUPipeline* pipeline) override;
   void SetTextureSampler(u32 slot, GPUTexture* texture, GPUSampler* sampler) override;
   void SetTextureBuffer(u32 slot, GPUTextureBuffer* buffer) override;
@@ -93,6 +94,7 @@ public:
   void SetScissor(s32 x, s32 y, s32 width, s32 height) override;
   void Draw(u32 vertex_count, u32 base_vertex) override;
   void DrawIndexed(u32 index_count, u32 base_index, u32 base_vertex) override;
+  void DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 base_vertex, DrawBarrier type) override;
 
   bool GetHostRefreshRate(float* refresh_rate) override;
 
diff --git a/src/util/d3d12_device.cpp b/src/util/d3d12_device.cpp
index 76637f77c..2ffc1276c 100644
--- a/src/util/d3d12_device.cpp
+++ b/src/util/d3d12_device.cpp
@@ -1190,6 +1190,7 @@ void D3D12Device::SetFeatures(FeatureMask disabled_features)
     /*!(disabled_features & FEATURE_MASK_TEXTURE_COPY_TO_SELF)*/ false; // TODO: Support with Enhanced Barriers
   m_features.supports_texture_buffers = !(disabled_features & FEATURE_MASK_TEXTURE_BUFFERS);
   m_features.texture_buffers_emulated_with_ssbo = false;
+  m_features.feedback_loops = false;
   m_features.geometry_shaders = !(disabled_features & FEATURE_MASK_GEOMETRY_SHADERS);
   m_features.partial_msaa_resolve = true;
   m_features.memory_import = false;
@@ -1548,8 +1549,10 @@ void D3D12Device::DestroyRootSignatures()
     it->Reset();
 }
 
-void D3D12Device::SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds)
+void D3D12Device::SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds,
+                                   GPUPipeline::RenderPassFlag feedback_loop)
 {
+  DebugAssert(!feedback_loop);
   if (InRenderPass())
     EndRenderPass();
 
@@ -2140,3 +2143,8 @@ void D3D12Device::DrawIndexed(u32 index_count, u32 base_index, u32 base_vertex)
   s_stats.num_draws++;
   GetCommandList()->DrawIndexedInstanced(index_count, 1, base_index, base_vertex, 0);
 }
+
+void D3D12Device::DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 base_vertex, DrawBarrier type)
+{
+  Panic("Barriers are not supported"); // SetFeatures() sets feedback_loops = false, so callers should never get here.
+}
diff --git a/src/util/d3d12_device.h b/src/util/d3d12_device.h
index e3776fb64..0d6fc7652 100644
--- a/src/util/d3d12_device.h
+++ b/src/util/d3d12_device.h
@@ -107,7 +107,8 @@ public:
   void PushUniformBuffer(const void* data, u32 data_size) override;
   void* MapUniformBuffer(u32 size) override;
   void UnmapUniformBuffer(u32 size) override;
-  void SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds) override;
+  void SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds,
+                        GPUPipeline::RenderPassFlag feedback_loop = GPUPipeline::NoRenderPassFlags) override;
   void SetPipeline(GPUPipeline* pipeline) override;
   void SetTextureSampler(u32 slot, GPUTexture* texture, GPUSampler* sampler) override;
   void SetTextureBuffer(u32 slot, GPUTextureBuffer* buffer) override;
@@ -115,6 +116,7 @@ public:
   void SetScissor(s32 x, s32 y, s32 width, s32 height) override;
   void Draw(u32 vertex_count, u32 base_vertex) override;
   void DrawIndexed(u32 index_count, u32 base_index, u32 base_vertex) override;
+  void DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 base_vertex, DrawBarrier type) override;
 
   bool SetGPUTimingEnabled(bool enabled) override;
   float GetAndResetAccumulatedGPUTime() override;
diff --git a/src/util/gpu_device.cpp b/src/util/gpu_device.cpp
index 187610214..7443af9f7 100644
--- a/src/util/gpu_device.cpp
+++ b/src/util/gpu_device.cpp
@@ -492,6 +492,7 @@ bool GPUDevice::CreateResources()
   plconfig.SetTargetFormats(HasSurface() ? m_window_info.surface_format : GPUTexture::Format::RGBA8);
   plconfig.samples = 1;
   plconfig.per_sample_shading = false;
+  plconfig.render_pass_flags = GPUPipeline::NoRenderPassFlags;
   plconfig.vertex_shader = imgui_vs.get();
   plconfig.geometry_shader = nullptr;
   plconfig.fragment_shader = imgui_fs.get();
@@ -615,9 +616,9 @@ void GPUDevice::UploadUniformBuffer(const void* data, u32 data_size)
   UnmapUniformBuffer(data_size);
 }
 
-void GPUDevice::SetRenderTarget(GPUTexture* rt, GPUTexture* ds /*= nullptr*/)
+void GPUDevice::SetRenderTarget(GPUTexture* rt, GPUTexture* ds, GPUPipeline::RenderPassFlag render_pass_flags)
 {
-  SetRenderTargets(rt ? &rt : nullptr, rt ? 1 : 0, ds);
+  SetRenderTargets(rt ? &rt : nullptr, rt ? 1 : 0, ds, render_pass_flags); // A null rt binds zero color targets.
 }
 
 void GPUDevice::SetViewportAndScissor(s32 x, s32 y, s32 width, s32 height)
diff --git a/src/util/gpu_device.h b/src/util/gpu_device.h
index d77eb6a53..6fd3e9ccf 100644
--- a/src/util/gpu_device.h
+++ b/src/util/gpu_device.h
@@ -133,6 +133,13 @@ public:
     MaxCount
   };
 
+  enum RenderPassFlag : u8 // Bit flags carried in render_pass_flags and passed to SetRenderTargets().
+  {
+    NoRenderPassFlags = 0,
+    ColorFeedbackLoop = (1 << 0), // NOTE(review): assumed to pair with shaders that read LAST_FRAG_COLOR - confirm.
+    SampleDepthBuffer = (1 << 1), // NOTE(review): presumably the depth target is also sampled - confirm.
+  };
+
   enum class Primitive : u8
   {
     Points,
@@ -369,8 +376,9 @@ public:
 
     GPUTexture::Format color_formats[4];
     GPUTexture::Format depth_format;
-    u32 samples;
+    u8 samples;
     bool per_sample_shading;
+    RenderPassFlag render_pass_flags;
 
     void SetTargetFormats(GPUTexture::Format color_format,
                           GPUTexture::Format depth_format_ = GPUTexture::Format::Unknown);
@@ -425,11 +433,19 @@ public:
   enum FeatureMask : u32
   {
     FEATURE_MASK_DUAL_SOURCE_BLEND = (1 << 0),
-    FEATURE_MASK_FRAMEBUFFER_FETCH = (1 << 1),
-    FEATURE_MASK_TEXTURE_BUFFERS = (1 << 2),
-    FEATURE_MASK_GEOMETRY_SHADERS = (1 << 3),
-    FEATURE_MASK_TEXTURE_COPY_TO_SELF = (1 << 4),
-    FEATURE_MASK_MEMORY_IMPORT = (1 << 5),
+    FEATURE_MASK_FEEDBACK_LOOPS = (1 << 1), // NOTE(review): shifts the flags below; confirm masks are never saved.
+    FEATURE_MASK_FRAMEBUFFER_FETCH = (1 << 2),
+    FEATURE_MASK_TEXTURE_BUFFERS = (1 << 3),
+    FEATURE_MASK_GEOMETRY_SHADERS = (1 << 4),
+    FEATURE_MASK_TEXTURE_COPY_TO_SELF = (1 << 5),
+    FEATURE_MASK_MEMORY_IMPORT = (1 << 6),
+  };
+
+  enum class DrawBarrier : u32 // Barrier mode argument for DrawIndexedWithBarrier().
+  {
+    None, // NOTE(review): presumably a plain draw with no barrier - confirm against the backend.
+    One,  // NOTE(review): presumably a single barrier for the whole draw - confirm.
+    Full  // NOTE(review): presumably a barrier between primitives - confirm.
   };
 
   struct Features
@@ -441,6 +457,7 @@ public:
     bool texture_copy_to_self : 1;
     bool supports_texture_buffers : 1;
     bool texture_buffers_emulated_with_ssbo : 1;
+    bool feedback_loops : 1;
     bool geometry_shaders : 1;
     bool partial_msaa_resolve : 1;
     bool memory_import : 1;
@@ -454,6 +471,7 @@ public:
   {
     size_t buffer_streamed;
     u32 num_draws;
+    u32 num_barriers;
     u32 num_render_passes;
     u32 num_copies;
     u32 num_downloads;
@@ -616,18 +634,21 @@ public:
   void UploadUniformBuffer(const void* data, u32 data_size);
 
   /// Drawing setup abstraction.
-  virtual void SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds) = 0;
+  virtual void SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds,
+                                GPUPipeline::RenderPassFlag render_pass_flags = GPUPipeline::NoRenderPassFlags) = 0;
   virtual void SetPipeline(GPUPipeline* pipeline) = 0;
   virtual void SetTextureSampler(u32 slot, GPUTexture* texture, GPUSampler* sampler) = 0;
   virtual void SetTextureBuffer(u32 slot, GPUTextureBuffer* buffer) = 0;
   virtual void SetViewport(s32 x, s32 y, s32 width, s32 height) = 0; // TODO: Rectangle
   virtual void SetScissor(s32 x, s32 y, s32 width, s32 height) = 0;
-  void SetRenderTarget(GPUTexture* rt, GPUTexture* ds = nullptr);
+  void SetRenderTarget(GPUTexture* rt, GPUTexture* ds = nullptr,
+                       GPUPipeline::RenderPassFlag render_pass_flags = GPUPipeline::NoRenderPassFlags);
   void SetViewportAndScissor(s32 x, s32 y, s32 width, s32 height);
 
   // Drawing abstraction.
   virtual void Draw(u32 vertex_count, u32 base_vertex) = 0;
   virtual void DrawIndexed(u32 index_count, u32 base_index, u32 base_vertex) = 0;
+  virtual void DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 base_vertex, DrawBarrier type) = 0;
 
   /// Returns false if the window was completely occluded.
   virtual bool BeginPresent(bool skip_present) = 0;
diff --git a/src/util/metal_device.mm b/src/util/metal_device.mm
index 2039dbe24..9c28066e2 100644
--- a/src/util/metal_device.mm
+++ b/src/util/metal_device.mm
@@ -235,6 +235,7 @@ void MetalDevice::SetFeatures(FeatureMask disabled_features)
   m_features.texture_copy_to_self = !(disabled_features & FEATURE_MASK_TEXTURE_COPY_TO_SELF);
   m_features.supports_texture_buffers = !(disabled_features & FEATURE_MASK_TEXTURE_BUFFERS);
   m_features.texture_buffers_emulated_with_ssbo = true;
+  m_features.feedback_loops = false;
   m_features.geometry_shaders = false;
   m_features.partial_msaa_resolve = false;
   m_features.memory_import = true;
diff --git a/src/util/opengl_device.cpp b/src/util/opengl_device.cpp
index 18b612c8e..313386a1d 100644
--- a/src/util/opengl_device.cpp
+++ b/src/util/opengl_device.cpp
@@ -403,8 +403,9 @@ bool OpenGLDevice::CheckFeatures(FeatureMask disabled_features)
     !(disabled_features & FEATURE_MASK_DUAL_SOURCE_BLEND) && (max_dual_source_draw_buffers > 0) &&
     (GLAD_GL_VERSION_3_3 || GLAD_GL_ARB_blend_func_extended || GLAD_GL_EXT_blend_func_extended);
 
-  m_features.framebuffer_fetch = !(disabled_features & FEATURE_MASK_FRAMEBUFFER_FETCH) &&
-                                 (GLAD_GL_EXT_shader_framebuffer_fetch || GLAD_GL_ARM_shader_framebuffer_fetch);
+  m_features.framebuffer_fetch =
+    !(disabled_features & (FEATURE_MASK_FEEDBACK_LOOPS | FEATURE_MASK_FRAMEBUFFER_FETCH)) &&
+    (GLAD_GL_EXT_shader_framebuffer_fetch || GLAD_GL_ARM_shader_framebuffer_fetch);
 
 #ifdef __APPLE__
   // Partial texture buffer uploads appear to be broken in macOS's OpenGL driver.
@@ -469,6 +470,8 @@ bool OpenGLDevice::CheckFeatures(FeatureMask disabled_features)
   // So, blit from the shadow texture, like in the other renderers.
   m_features.texture_copy_to_self = !vendor_id_arm && !(disabled_features & FEATURE_MASK_TEXTURE_COPY_TO_SELF);
 
+  m_features.feedback_loops = m_features.framebuffer_fetch;
+
   m_features.geometry_shaders =
     !(disabled_features & FEATURE_MASK_GEOMETRY_SHADERS) && (GLAD_GL_VERSION_3_2 || GLAD_GL_ES_VERSION_3_2);
 
@@ -1035,6 +1038,11 @@ void OpenGLDevice::DrawIndexed(u32 index_count, u32 base_index, u32 base_vertex)
   glDrawElements(m_current_pipeline->GetTopology(), index_count, GL_UNSIGNED_SHORT, indices);
 }
 
+void OpenGLDevice::DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 base_vertex, DrawBarrier type)
+{
+  Panic("Barriers are not supported"); // NOTE(review): feedback_loops may be true on GL; confirm no caller lands here.
+}
+
 void OpenGLDevice::MapVertexBuffer(u32 vertex_size, u32 vertex_count, void** map_ptr, u32* map_space,
                                    u32* map_base_vertex)
 {
@@ -1088,8 +1096,9 @@ void OpenGLDevice::UnmapUniformBuffer(u32 size)
   glBindBufferRange(GL_UNIFORM_BUFFER, 1, m_uniform_buffer->GetGLBufferId(), pos, size);
 }
 
-void OpenGLDevice::SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds)
+void OpenGLDevice::SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds, GPUPipeline::RenderPassFlag feedback_loop)
 {
+  // TODO: DebugAssert(!feedback_loop);
   bool changed = (m_num_current_render_targets != num_rts || m_current_depth_target != ds);
   bool needs_ds_clear = (ds && ds->IsClearedOrInvalidated());
   bool needs_rt_clear = false;
diff --git a/src/util/opengl_device.h b/src/util/opengl_device.h
index df67e8c4c..de56f7fa0 100644
--- a/src/util/opengl_device.h
+++ b/src/util/opengl_device.h
@@ -89,7 +89,8 @@ public:
   void PushUniformBuffer(const void* data, u32 data_size) override;
   void* MapUniformBuffer(u32 size) override;
   void UnmapUniformBuffer(u32 size) override;
-  void SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds) override;
+  void SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds,
+                        GPUPipeline::RenderPassFlag feedback_loop = GPUPipeline::NoRenderPassFlags) override;
   void SetPipeline(GPUPipeline* pipeline) override;
   void SetTextureSampler(u32 slot, GPUTexture* texture, GPUSampler* sampler) override;
   void SetTextureBuffer(u32 slot, GPUTextureBuffer* buffer) override;
@@ -97,6 +98,7 @@ public:
   void SetScissor(s32 x, s32 y, s32 width, s32 height) override;
   void Draw(u32 vertex_count, u32 base_vertex) override;
   void DrawIndexed(u32 index_count, u32 base_index, u32 base_vertex) override;
+  void DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 base_vertex, DrawBarrier type) override;
 
   void SetSyncMode(DisplaySyncMode mode) override;
 
diff --git a/src/util/postprocessing_shader_fx.cpp b/src/util/postprocessing_shader_fx.cpp
index 9569a1ae0..4d43521a5 100644
--- a/src/util/postprocessing_shader_fx.cpp
+++ b/src/util/postprocessing_shader_fx.cpp
@@ -1222,6 +1222,7 @@ bool PostProcessing::ReShadeFXShader::CompilePipeline(GPUTexture::Format format,
   plconfig.blend = GPUPipeline::BlendState::GetNoBlendingState();
   plconfig.samples = 1;
   plconfig.per_sample_shading = false;
+  plconfig.render_pass_flags = GPUPipeline::NoRenderPassFlags;
 
   progress->PushState();
 
diff --git a/src/util/postprocessing_shader_glsl.cpp b/src/util/postprocessing_shader_glsl.cpp
index a3f90c7af..0fa20fcb8 100644
--- a/src/util/postprocessing_shader_glsl.cpp
+++ b/src/util/postprocessing_shader_glsl.cpp
@@ -136,6 +136,7 @@ bool PostProcessing::GLSLShader::CompilePipeline(GPUTexture::Format format, u32
   plconfig.blend = GPUPipeline::BlendState::GetNoBlendingState();
   plconfig.samples = 1;
   plconfig.per_sample_shading = false;
+  plconfig.render_pass_flags = GPUPipeline::NoRenderPassFlags;
   plconfig.vertex_shader = vs.get();
   plconfig.fragment_shader = fs.get();
   plconfig.geometry_shader = nullptr;
diff --git a/src/util/shadergen.cpp b/src/util/shadergen.cpp
index 777ff5471..407c28911 100644
--- a/src/util/shadergen.cpp
+++ b/src/util/shadergen.cpp
@@ -505,7 +505,7 @@ void ShaderGen::DeclareFragmentEntryPoint(
   const std::initializer_list<std::pair<const char*, const char*>>& additional_inputs,
   bool declare_fragcoord /* = false */, u32 num_color_outputs /* = 1 */, bool depth_output /* = false */,
   bool msaa /* = false */, bool ssaa /* = false */, bool declare_sample_id /* = false */,
-  bool noperspective_color /* = false */, bool framebuffer_fetch /* = false */)
+  bool noperspective_color /* = false */, bool feedback_loop /* = false */)
 {
   if (m_glsl)
   {
@@ -560,21 +560,32 @@ void ShaderGen::DeclareFragmentEntryPoint(
       ss << "#define o_depth gl_FragDepth\n";
 
     const char* target_0_qualifier = "out";
-#ifdef ENABLE_OPENGL
-    if ((m_render_api == RenderAPI::OpenGL || m_render_api == RenderAPI::OpenGLES) && m_supports_framebuffer_fetch &&
-        framebuffer_fetch)
+
+    if (feedback_loop)
     {
-      if (GLAD_GL_EXT_shader_framebuffer_fetch)
+#ifdef ENABLE_OPENGL
+      if (m_render_api == RenderAPI::OpenGL || m_render_api == RenderAPI::OpenGLES)
       {
-        target_0_qualifier = "inout";
-        ss << "#define LAST_FRAG_COLOR o_col0\n";
+        Assert(m_supports_framebuffer_fetch);
+        if (GLAD_GL_EXT_shader_framebuffer_fetch)
+        {
+          target_0_qualifier = "inout";
+          ss << "#define LAST_FRAG_COLOR o_col0\n";
+        }
+        else if (GLAD_GL_ARM_shader_framebuffer_fetch)
+        {
+          ss << "#define LAST_FRAG_COLOR gl_LastFragColorARM\n";
+        }
       }
-      else if (GLAD_GL_ARM_shader_framebuffer_fetch)
-      {
-        ss << "#define LAST_FRAG_COLOR gl_LastFragColorARM\n";
-      }
-    }
 #endif
+#ifdef ENABLE_VULKAN
+      if (m_render_api == RenderAPI::Vulkan)
+      {
+        ss << "layout(input_attachment_index = 0, set = 2, binding = 0) uniform subpassInput u_input_rt;\n";
+        ss << "#define LAST_FRAG_COLOR subpassLoad(u_input_rt)\n";
+      }
+#endif
+    }
 
     if (m_use_glsl_binding_layout)
     {
diff --git a/src/util/shadergen.h b/src/util/shadergen.h
index 0e88b0c6f..0108d0c26 100644
--- a/src/util/shadergen.h
+++ b/src/util/shadergen.h
@@ -53,7 +53,7 @@ protected:
                                  const std::initializer_list<std::pair<const char*, const char*>>& additional_inputs,
                                  bool declare_fragcoord = false, u32 num_color_outputs = 1, bool depth_output = false,
                                  bool msaa = false, bool ssaa = false, bool declare_sample_id = false,
-                                 bool noperspective_color = false, bool framebuffer_fetch = false);
+                                 bool noperspective_color = false, bool feedback_loop = false);
 
   RenderAPI m_render_api;
   bool m_glsl;
diff --git a/src/util/vulkan_builders.cpp b/src/util/vulkan_builders.cpp
index 92fb6f97f..825a717f0 100644
--- a/src/util/vulkan_builders.cpp
+++ b/src/util/vulkan_builders.cpp
@@ -267,6 +267,9 @@ void Vulkan::GraphicsPipelineBuilder::Clear()
   m_rendering = {};
   m_rendering.sType = VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO_KHR;
 
+  m_rendering_input_attachment_locations = {};
+  m_rendering_input_attachment_locations.sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR;
+
   // set defaults
   SetNoCullRasterizationState();
   SetNoDepthTestState();
@@ -595,6 +598,19 @@ void Vulkan::GraphicsPipelineBuilder::SetDynamicRenderingDepthAttachment(VkForma
   m_rendering.stencilAttachmentFormat = stencil_format;
 }
 
+void Vulkan::GraphicsPipelineBuilder::AddDynamicRenderingInputAttachment(u32 color_attachment_index)
+{
+  // Hook the attachment-location struct into the pipeline create-info chain and point it at our index storage.
+  AddPointerToChain(&m_ci, &m_rendering_input_attachment_locations);
+  m_rendering_input_attachment_locations.pColorAttachmentLocations = m_rendering_input_attachment_indices.data();
+
+  // Debug checks: the index must name an existing color attachment, and a free slot must remain.
+  auto& slot_count = m_rendering_input_attachment_locations.colorAttachmentCount;
+  DebugAssert(color_attachment_index < m_rendering.colorAttachmentCount);
+  DebugAssert(slot_count < MAX_INPUT_ATTACHMENTS);
+  m_rendering_input_attachment_indices[slot_count++] = color_attachment_index;
+}
+
 Vulkan::ComputePipelineBuilder::ComputePipelineBuilder()
 {
   Clear();
diff --git a/src/util/vulkan_builders.h b/src/util/vulkan_builders.h
index a7ddf21b7..fbcf7fd83 100644
--- a/src/util/vulkan_builders.h
+++ b/src/util/vulkan_builders.h
@@ -81,6 +81,7 @@ public:
     MAX_VERTEX_ATTRIBUTES = 16,
     MAX_VERTEX_BUFFERS = 8,
     MAX_ATTACHMENTS = GPUDevice::MAX_RENDER_TARGETS + 1,
+    MAX_INPUT_ATTACHMENTS = 1,
     MAX_DYNAMIC_STATE = 8
   };
 
@@ -144,6 +145,7 @@ public:
   void SetDynamicRendering();
   void AddDynamicRenderingColorAttachment(VkFormat format);
   void SetDynamicRenderingDepthAttachment(VkFormat depth_format, VkFormat stencil_format);
+  void AddDynamicRenderingInputAttachment(u32 color_attachment_index);
 
 private:
   VkGraphicsPipelineCreateInfo m_ci;
@@ -174,7 +176,9 @@ private:
   VkPipelineRasterizationLineStateCreateInfoEXT m_line_rasterization_state;
 
   VkPipelineRenderingCreateInfoKHR m_rendering;
+  VkRenderingAttachmentLocationInfoKHR m_rendering_input_attachment_locations;
   std::array<VkFormat, MAX_ATTACHMENTS> m_rendering_color_formats;
+  std::array<u32, MAX_INPUT_ATTACHMENTS> m_rendering_input_attachment_indices;
 };
 
 class ComputePipelineBuilder
diff --git a/src/util/vulkan_device.cpp b/src/util/vulkan_device.cpp
index 13abf3bed..68c345ecd 100644
--- a/src/util/vulkan_device.cpp
+++ b/src/util/vulkan_device.cpp
@@ -53,6 +53,7 @@ enum : u32
 {
   MAX_DRAW_CALLS_PER_FRAME = 2048,
   MAX_COMBINED_IMAGE_SAMPLER_DESCRIPTORS_PER_FRAME = GPUDevice::MAX_TEXTURE_SAMPLERS * MAX_DRAW_CALLS_PER_FRAME,
+  MAX_INPUT_ATTACHMENT_DESCRIPTORS_PER_FRAME = MAX_DRAW_CALLS_PER_FRAME,
   MAX_DESCRIPTOR_SETS_PER_FRAME = MAX_DRAW_CALLS_PER_FRAME,
   MAX_SAMPLER_DESCRIPTORS = 8192,
 
@@ -380,8 +381,6 @@ bool VulkanDevice::SelectDeviceExtensions(ExtensionList* extension_list, bool en
   m_optional_extensions.vk_ext_rasterization_order_attachment_access =
     SupportsExtension(VK_EXT_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_EXTENSION_NAME, false) ||
     SupportsExtension(VK_ARM_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_EXTENSION_NAME, false);
-  m_optional_extensions.vk_ext_attachment_feedback_loop_layout =
-    SupportsExtension(VK_EXT_ATTACHMENT_FEEDBACK_LOOP_LAYOUT_EXTENSION_NAME, false);
   m_optional_extensions.vk_khr_get_memory_requirements2 =
     SupportsExtension(VK_KHR_GET_MEMORY_REQUIREMENTS_2_EXTENSION_NAME, false);
   m_optional_extensions.vk_khr_bind_memory2 = SupportsExtension(VK_KHR_BIND_MEMORY_2_EXTENSION_NAME, false);
@@ -392,6 +391,9 @@ bool VulkanDevice::SelectDeviceExtensions(ExtensionList* extension_list, bool en
     SupportsExtension(VK_KHR_DEPTH_STENCIL_RESOLVE_EXTENSION_NAME, false) &&
     SupportsExtension(VK_KHR_CREATE_RENDERPASS_2_EXTENSION_NAME, false) &&
     SupportsExtension(VK_KHR_DYNAMIC_RENDERING_EXTENSION_NAME, false);
+  m_optional_extensions.vk_khr_dynamic_rendering_local_read =
+    m_optional_extensions.vk_khr_dynamic_rendering &&
+    SupportsExtension(VK_KHR_DYNAMIC_RENDERING_LOCAL_READ_EXTENSION_NAME, false);
   m_optional_extensions.vk_khr_push_descriptor = SupportsExtension(VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME, false);
   m_optional_extensions.vk_ext_external_memory_host =
     SupportsExtension(VK_EXT_EXTERNAL_MEMORY_HOST_EXTENSION_NAME, false);
@@ -538,17 +540,19 @@ bool VulkanDevice::CreateDevice(VkSurfaceKHR surface, bool enable_validation_lay
   VkPhysicalDeviceRasterizationOrderAttachmentAccessFeaturesEXT rasterization_order_access_feature = {
     VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_FEATURES_EXT, nullptr, VK_TRUE, VK_FALSE,
     VK_FALSE};
-  VkPhysicalDeviceAttachmentFeedbackLoopLayoutFeaturesEXT attachment_feedback_loop_feature = {
-    VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ATTACHMENT_FEEDBACK_LOOP_LAYOUT_FEATURES_EXT, nullptr, VK_TRUE};
   VkPhysicalDeviceDynamicRenderingFeatures dynamic_rendering_feature = {
     VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DYNAMIC_RENDERING_FEATURES, nullptr, VK_TRUE};
+  VkPhysicalDeviceDynamicRenderingLocalReadFeaturesKHR dynamic_rendering_local_read_feature = {
+    VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DYNAMIC_RENDERING_LOCAL_READ_FEATURES_KHR, nullptr, VK_TRUE};
 
   if (m_optional_extensions.vk_ext_rasterization_order_attachment_access)
     Vulkan::AddPointerToChain(&device_info, &rasterization_order_access_feature);
-  if (m_optional_extensions.vk_ext_attachment_feedback_loop_layout)
-    Vulkan::AddPointerToChain(&device_info, &attachment_feedback_loop_feature);
   if (m_optional_extensions.vk_khr_dynamic_rendering)
+  {
     Vulkan::AddPointerToChain(&device_info, &dynamic_rendering_feature);
+    if (m_optional_extensions.vk_khr_dynamic_rendering_local_read)
+      Vulkan::AddPointerToChain(&device_info, &dynamic_rendering_local_read_feature);
+  }
 
   VkResult res = vkCreateDevice(m_physical_device, &device_info, nullptr, &m_device);
   if (res != VK_SUCCESS)
@@ -586,18 +590,20 @@ void VulkanDevice::ProcessDeviceExtensions()
   VkPhysicalDeviceRasterizationOrderAttachmentAccessFeaturesEXT rasterization_order_access_feature = {
     VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_FEATURES_EXT, nullptr, VK_FALSE, VK_FALSE,
     VK_FALSE};
-  VkPhysicalDeviceAttachmentFeedbackLoopLayoutFeaturesEXT attachment_feedback_loop_feature = {
-    VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ATTACHMENT_FEEDBACK_LOOP_LAYOUT_FEATURES_EXT, nullptr, VK_FALSE};
   VkPhysicalDeviceDynamicRenderingFeatures dynamic_rendering_feature = {
     VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DYNAMIC_RENDERING_FEATURES, nullptr, VK_FALSE};
+  VkPhysicalDeviceDynamicRenderingLocalReadFeaturesKHR dynamic_rendering_local_read_feature = {
+    VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DYNAMIC_RENDERING_LOCAL_READ_FEATURES_KHR, nullptr, VK_FALSE};
 
   // add in optional feature structs
   if (m_optional_extensions.vk_ext_rasterization_order_attachment_access)
     Vulkan::AddPointerToChain(&features2, &rasterization_order_access_feature);
-  if (m_optional_extensions.vk_ext_attachment_feedback_loop_layout)
-    Vulkan::AddPointerToChain(&features2, &attachment_feedback_loop_feature);
   if (m_optional_extensions.vk_khr_dynamic_rendering)
+  {
     Vulkan::AddPointerToChain(&features2, &dynamic_rendering_feature);
+    if (m_optional_extensions.vk_khr_dynamic_rendering_local_read)
+      Vulkan::AddPointerToChain(&features2, &dynamic_rendering_local_read_feature);
+  }
 
   // we might not have VK_KHR_get_physical_device_properties2...
   if (!vkGetPhysicalDeviceFeatures2 || !vkGetPhysicalDeviceProperties2 || !vkGetPhysicalDeviceMemoryProperties2)
@@ -627,9 +633,9 @@ void VulkanDevice::ProcessDeviceExtensions()
   // confirm we actually support it
   m_optional_extensions.vk_ext_rasterization_order_attachment_access &=
     (rasterization_order_access_feature.rasterizationOrderColorAttachmentAccess == VK_TRUE);
-  m_optional_extensions.vk_ext_attachment_feedback_loop_layout &=
-    (attachment_feedback_loop_feature.attachmentFeedbackLoopLayout == VK_TRUE);
   m_optional_extensions.vk_khr_dynamic_rendering &= (dynamic_rendering_feature.dynamicRendering == VK_TRUE);
+  m_optional_extensions.vk_khr_dynamic_rendering_local_read &=
+    (dynamic_rendering_local_read_feature.dynamicRenderingLocalRead == VK_TRUE);
 
   VkPhysicalDeviceProperties2 properties2 = {VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2, nullptr, {}};
   VkPhysicalDevicePushDescriptorPropertiesKHR push_descriptor_properties = {
@@ -664,6 +670,7 @@ void VulkanDevice::ProcessDeviceExtensions()
     if (m_optional_extensions.vk_khr_dynamic_rendering)
     {
       m_optional_extensions.vk_khr_dynamic_rendering = false;
+      m_optional_extensions.vk_khr_dynamic_rendering_local_read = false;
       Log_WarningPrint("Disabling VK_KHR_dynamic_rendering on broken mobile driver.");
     }
     if (m_optional_extensions.vk_khr_push_descriptor)
@@ -673,26 +680,24 @@ void VulkanDevice::ProcessDeviceExtensions()
     }
   }
 
-  Log_InfoPrintf("VK_EXT_memory_budget is %s",
-                 m_optional_extensions.vk_ext_memory_budget ? "supported" : "NOT supported");
-  Log_InfoPrintf("VK_EXT_rasterization_order_attachment_access is %s",
-                 m_optional_extensions.vk_ext_rasterization_order_attachment_access ? "supported" : "NOT supported");
-  Log_InfoPrintf("VK_EXT_attachment_feedback_loop_layout is %s",
-                 m_optional_extensions.vk_ext_attachment_feedback_loop_layout ? "supported" : "NOT supported");
-  Log_InfoPrintf("VK_KHR_get_memory_requirements2 is %s",
-                 m_optional_extensions.vk_khr_get_memory_requirements2 ? "supported" : "NOT supported");
-  Log_InfoPrintf("VK_KHR_bind_memory2 is %s",
-                 m_optional_extensions.vk_khr_bind_memory2 ? "supported" : "NOT supported");
-  Log_InfoPrintf("VK_KHR_get_physical_device_properties2 is %s",
-                 m_optional_extensions.vk_khr_get_physical_device_properties2 ? "supported" : "NOT supported");
-  Log_InfoPrintf("VK_KHR_dedicated_allocation is %s",
-                 m_optional_extensions.vk_khr_dedicated_allocation ? "supported" : "NOT supported");
-  Log_InfoPrintf("VK_KHR_dynamic_rendering is %s",
-                 m_optional_extensions.vk_khr_dynamic_rendering ? "supported" : "NOT supported");
-  Log_InfoPrintf("VK_KHR_push_descriptor is %s",
-                 m_optional_extensions.vk_khr_push_descriptor ? "supported" : "NOT supported");
-  Log_InfoPrintf("VK_EXT_external_memory_host is %s",
-                 m_optional_extensions.vk_ext_external_memory_host ? "supported" : "NOT supported");
+  Log_InfoFmt("VK_EXT_memory_budget is {}", m_optional_extensions.vk_ext_memory_budget ? "supported" : "NOT supported");
+  Log_InfoFmt("VK_EXT_rasterization_order_attachment_access is {}",
+              m_optional_extensions.vk_ext_rasterization_order_attachment_access ? "supported" : "NOT supported");
+  Log_InfoFmt("VK_KHR_get_memory_requirements2 is {}",
+              m_optional_extensions.vk_khr_get_memory_requirements2 ? "supported" : "NOT supported");
+  Log_InfoFmt("VK_KHR_bind_memory2 is {}", m_optional_extensions.vk_khr_bind_memory2 ? "supported" : "NOT supported");
+  Log_InfoFmt("VK_KHR_get_physical_device_properties2 is {}",
+              m_optional_extensions.vk_khr_get_physical_device_properties2 ? "supported" : "NOT supported");
+  Log_InfoFmt("VK_KHR_dedicated_allocation is {}",
+              m_optional_extensions.vk_khr_dedicated_allocation ? "supported" : "NOT supported");
+  Log_InfoFmt("VK_KHR_dynamic_rendering is {}",
+              m_optional_extensions.vk_khr_dynamic_rendering ? "supported" : "NOT supported");
+  Log_InfoFmt("VK_KHR_dynamic_rendering_local_read is {}",
+              m_optional_extensions.vk_khr_dynamic_rendering_local_read ? "supported" : "NOT supported");
+  Log_InfoFmt("VK_KHR_push_descriptor is {}",
+              m_optional_extensions.vk_khr_push_descriptor ? "supported" : "NOT supported");
+  Log_InfoFmt("VK_EXT_external_memory_host is {}",
+              m_optional_extensions.vk_ext_external_memory_host ? "supported" : "NOT supported");
 }
 
 bool VulkanDevice::CreateAllocator()
@@ -834,25 +839,27 @@ bool VulkanDevice::CreateCommandBuffers()
     }
     Vulkan::SetObjectName(m_device, resources.fence, TinyString::from_format("Frame Fence {}", frame_index));
 
+    u32 num_pools = 0;
+    VkDescriptorPoolSize pool_sizes[2];
     if (!m_optional_extensions.vk_khr_push_descriptor)
     {
-      VkDescriptorPoolSize pool_sizes[] = {
-        {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, MAX_COMBINED_IMAGE_SAMPLER_DESCRIPTORS_PER_FRAME},
-      };
-
-      VkDescriptorPoolCreateInfo pool_create_info = {
-        VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, nullptr,   0, MAX_DESCRIPTOR_SETS_PER_FRAME,
-        static_cast<u32>(std::size(pool_sizes)),       pool_sizes};
-
-      res = vkCreateDescriptorPool(m_device, &pool_create_info, nullptr, &resources.descriptor_pool);
-      if (res != VK_SUCCESS)
-      {
-        LOG_VULKAN_ERROR(res, "vkCreateDescriptorPool failed: ");
-        return false;
-      }
-      Vulkan::SetObjectName(m_device, resources.descriptor_pool,
-                            TinyString::from_format("Frame Descriptor Pool {}", frame_index));
+      pool_sizes[num_pools++] = {VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+                                 MAX_COMBINED_IMAGE_SAMPLER_DESCRIPTORS_PER_FRAME};
     }
+    pool_sizes[num_pools++] = {VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT, MAX_INPUT_ATTACHMENT_DESCRIPTORS_PER_FRAME};
+
+    VkDescriptorPoolCreateInfo pool_create_info = {
+      VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, nullptr,   0, MAX_DESCRIPTOR_SETS_PER_FRAME,
+      num_pools,                                     pool_sizes}; // count only the entries filled in above
+
+    res = vkCreateDescriptorPool(m_device, &pool_create_info, nullptr, &resources.descriptor_pool);
+    if (res != VK_SUCCESS)
+    {
+      LOG_VULKAN_ERROR(res, "vkCreateDescriptorPool failed: ");
+      return false;
+    }
+    Vulkan::SetObjectName(m_device, resources.descriptor_pool,
+                          TinyString::from_format("Frame Descriptor Pool {}", frame_index));
 
     ++frame_index;
   }
@@ -970,17 +977,15 @@ VkRenderPass VulkanDevice::GetRenderPass(const GPUPipeline::GraphicsConfig& conf
     key.stencil_store_op = stencil ? VK_ATTACHMENT_STORE_OP_STORE : VK_ATTACHMENT_STORE_OP_DONT_CARE;
   }
 
-  // key.color_feedback_loop = false;
-  // key.depth_sampling = false;
-
   key.samples = static_cast<u8>(config.samples);
+  key.feedback_loop = config.render_pass_flags;
 
   const auto it = m_render_pass_cache.find(key);
   return (it != m_render_pass_cache.end()) ? it->second : CreateCachedRenderPass(key);
 }
 
-VkRenderPass VulkanDevice::GetRenderPass(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds,
-                                         bool color_feedback_loop /* = false */, bool depth_sampling /* = false */)
+VkRenderPass VulkanDevice::GetRenderPass(VulkanTexture* const* rts, u32 num_rts, VulkanTexture* ds,
+                                         GPUPipeline::RenderPassFlag feedback_loop)
 {
   RenderPassCacheKey key;
   std::memset(&key, 0, sizeof(key));
@@ -1009,8 +1014,7 @@ VkRenderPass VulkanDevice::GetRenderPass(GPUTexture* const* rts, u32 num_rts, GP
     key.samples = static_cast<u8>(ds->GetSamples());
   }
 
-  key.color_feedback_loop = color_feedback_loop;
-  key.depth_sampling = depth_sampling;
+  key.feedback_loop = feedback_loop;
 
   const auto it = m_render_pass_cache.find(key);
   return (it != m_render_pass_cache.end()) ? it->second : CreateCachedRenderPass(key);
@@ -1674,8 +1678,9 @@ VkRenderPass VulkanDevice::CreateCachedRenderPass(RenderPassCacheKey key)
       break;
 
     const VkImageLayout layout =
-      key.color_feedback_loop ?
-        (UseFeedbackLoopLayout() ? VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT : VK_IMAGE_LAYOUT_GENERAL) :
+      (key.feedback_loop & GPUPipeline::ColorFeedbackLoop) ?
+        (m_optional_extensions.vk_khr_dynamic_rendering_local_read ? VK_IMAGE_LAYOUT_RENDERING_LOCAL_READ_KHR :
+                                                                     VK_IMAGE_LAYOUT_GENERAL) :
         VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
 
     const RenderPassCacheKey::RenderTarget key_rt = key.color[i];
@@ -1692,15 +1697,12 @@ VkRenderPass VulkanDevice::CreateCachedRenderPass(RenderPassCacheKey key)
     color_references[num_attachments].layout = layout;
     color_reference_ptr = color_references.data();
 
-    if (key.color_feedback_loop)
+    if (key.feedback_loop & GPUPipeline::ColorFeedbackLoop)
     {
       DebugAssert(i == 0);
-      if (!UseFeedbackLoopLayout())
-      {
-        input_reference.attachment = num_attachments;
-        input_reference.layout = layout;
-        input_reference_ptr = &input_reference;
-      }
+      input_reference.attachment = num_attachments;
+      input_reference.layout = layout;
+      input_reference_ptr = &input_reference;
 
       if (!m_optional_extensions.vk_ext_rasterization_order_attachment_access)
       {
@@ -1710,11 +1712,8 @@ VkRenderPass VulkanDevice::CreateCachedRenderPass(RenderPassCacheKey key)
         subpass_dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
         subpass_dependency.dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
         subpass_dependency.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
-        subpass_dependency.dstAccessMask =
-          UseFeedbackLoopLayout() ? VK_ACCESS_SHADER_READ_BIT : VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
-        subpass_dependency.dependencyFlags = UseFeedbackLoopLayout() ?
-                                               (VK_DEPENDENCY_BY_REGION_BIT | VK_DEPENDENCY_FEEDBACK_LOOP_BIT_EXT) :
-                                               VK_DEPENDENCY_BY_REGION_BIT;
+        subpass_dependency.dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
+        subpass_dependency.dependencyFlags = VK_DEPENDENCY_BY_REGION_BIT;
         subpass_dependency_ptr = &subpass_dependency;
       }
     }
@@ -1726,10 +1725,9 @@ VkRenderPass VulkanDevice::CreateCachedRenderPass(RenderPassCacheKey key)
 
   if (key.depth_format != static_cast<u8>(GPUTexture::Format::Unknown))
   {
-    const VkImageLayout layout =
-      key.depth_sampling ?
-        (UseFeedbackLoopLayout() ? VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT : VK_IMAGE_LAYOUT_GENERAL) :
-        VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
+    const VkImageLayout layout = (key.feedback_loop & GPUPipeline::SampleDepthBuffer) ?
+                                   VK_IMAGE_LAYOUT_GENERAL :
+                                   VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
     attachments[num_attachments] = {0,
                                     static_cast<VkFormat>(TEXTURE_FORMAT_MAPPING[key.depth_format]),
                                     static_cast<VkSampleCountFlagBits>(key.samples),
@@ -1746,7 +1744,8 @@ VkRenderPass VulkanDevice::CreateCachedRenderPass(RenderPassCacheKey key)
   }
 
   const VkSubpassDescriptionFlags subpass_flags =
-    (key.color_feedback_loop && m_optional_extensions.vk_ext_rasterization_order_attachment_access) ?
+    ((key.feedback_loop & GPUPipeline::ColorFeedbackLoop) &&
+     m_optional_extensions.vk_ext_rasterization_order_attachment_access) ?
       VK_SUBPASS_DESCRIPTION_RASTERIZATION_ORDER_ATTACHMENT_COLOR_ACCESS_BIT_EXT :
       0;
   const VkSubpassDescription subpass = {subpass_flags,
@@ -1784,7 +1783,9 @@ VkRenderPass VulkanDevice::CreateCachedRenderPass(RenderPassCacheKey key)
 VkFramebuffer VulkanDevice::CreateFramebuffer(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds, u32 flags)
 {
   VulkanDevice& dev = VulkanDevice::GetInstance();
-  VkRenderPass render_pass = dev.GetRenderPass(rts, num_rts, ds, false, false);
+  VkRenderPass render_pass =
+    dev.GetRenderPass(reinterpret_cast<VulkanTexture* const*>(rts), num_rts, static_cast<VulkanTexture*>(ds),
+                      static_cast<GPUPipeline::RenderPassFlag>(flags));
 
   const GPUTexture* rt_or_ds = (num_rts > 0) ? rts[0] : ds;
   DebugAssert(rt_or_ds);
@@ -2510,7 +2511,9 @@ bool VulkanDevice::CheckFeatures(FeatureMask disabled_features)
 
   m_features.dual_source_blend =
     !(disabled_features & FEATURE_MASK_DUAL_SOURCE_BLEND) && m_device_features.dualSrcBlend;
-  m_features.framebuffer_fetch = /*!(disabled_features & FEATURE_MASK_FRAMEBUFFER_FETCH) && */ false;
+  m_features.framebuffer_fetch =
+    !(disabled_features & (FEATURE_MASK_FEEDBACK_LOOPS | FEATURE_MASK_FRAMEBUFFER_FETCH)) &&
+    m_optional_extensions.vk_ext_rasterization_order_attachment_access;
 
   if (!m_features.dual_source_blend)
     Log_WarningPrintf("Vulkan driver is missing dual-source blending. This will have an impact on performance.");
@@ -2519,6 +2522,7 @@ bool VulkanDevice::CheckFeatures(FeatureMask disabled_features)
   m_features.texture_copy_to_self = !(disabled_features & FEATURE_MASK_TEXTURE_COPY_TO_SELF);
   m_features.per_sample_shading = m_device_features.sampleRateShading;
   m_features.supports_texture_buffers = !(disabled_features & FEATURE_MASK_TEXTURE_BUFFERS);
+  m_features.feedback_loops = !(disabled_features & FEATURE_MASK_FEEDBACK_LOOPS);
 
 #ifdef __APPLE__
   // Partial texture buffer uploads appear to be broken in macOS/MoltenVK.
@@ -2874,10 +2878,22 @@ bool VulkanDevice::CreatePipelineLayouts()
     Vulkan::SetObjectName(m_device, m_multi_texture_ds_layout, "Multi Texture Descriptor Set Layout");
   }
 
+  if (m_features.feedback_loops)
+  {
+    // TODO: This isn't ideal, since we can't push the RT descriptors.
+    dslb.AddBinding(0, VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT, 1, VK_SHADER_STAGE_FRAGMENT_BIT);
+    if ((m_feedback_loop_ds_layout = dslb.Create(m_device)) == VK_NULL_HANDLE)
+      return false;
+    Vulkan::SetObjectName(m_device, m_feedback_loop_ds_layout, "Feedback Loop Descriptor Set Layout");
+  }
+
   {
     VkPipelineLayout& pl = m_pipeline_layouts[static_cast<u8>(GPUPipeline::Layout::SingleTextureAndUBO)];
     plb.AddDescriptorSet(m_ubo_ds_layout);
     plb.AddDescriptorSet(m_single_texture_ds_layout);
+    // TODO: REMOVE ME
+    if (m_features.feedback_loops)
+      plb.AddDescriptorSet(m_feedback_loop_ds_layout);
     if ((pl = plb.Create(m_device)) == VK_NULL_HANDLE)
       return false;
     Vulkan::SetObjectName(m_device, pl, "Single Texture + UBO Pipeline Layout");
@@ -2886,6 +2902,9 @@ bool VulkanDevice::CreatePipelineLayouts()
   {
     VkPipelineLayout& pl = m_pipeline_layouts[static_cast<u8>(GPUPipeline::Layout::SingleTextureAndPushConstants)];
     plb.AddDescriptorSet(m_single_texture_ds_layout);
+    // TODO: REMOVE ME
+    if (m_features.feedback_loops)
+      plb.AddDescriptorSet(m_feedback_loop_ds_layout);
     plb.AddPushConstants(UNIFORM_PUSH_CONSTANTS_STAGES, 0, UNIFORM_PUSH_CONSTANTS_SIZE);
     if ((pl = plb.Create(m_device)) == VK_NULL_HANDLE)
       return false;
@@ -2896,6 +2915,9 @@ bool VulkanDevice::CreatePipelineLayouts()
     VkPipelineLayout& pl =
       m_pipeline_layouts[static_cast<u8>(GPUPipeline::Layout::SingleTextureBufferAndPushConstants)];
     plb.AddDescriptorSet(m_single_texture_buffer_ds_layout);
+    // TODO: REMOVE ME
+    if (m_features.feedback_loops)
+      plb.AddDescriptorSet(m_feedback_loop_ds_layout);
     plb.AddPushConstants(UNIFORM_PUSH_CONSTANTS_STAGES, 0, UNIFORM_PUSH_CONSTANTS_SIZE);
     if ((pl = plb.Create(m_device)) == VK_NULL_HANDLE)
       return false;
@@ -2941,6 +2963,7 @@ void VulkanDevice::DestroyPipelineLayouts()
       l = VK_NULL_HANDLE;
     }
   };
+  destroy_dsl(m_feedback_loop_ds_layout);
   destroy_dsl(m_multi_texture_ds_layout);
   destroy_dsl(m_single_texture_buffer_ds_layout);
   destroy_dsl(m_single_texture_ds_layout);
@@ -3080,13 +3103,15 @@ bool VulkanDevice::TryImportHostMemory(void* data, size_t data_size, VkBufferUsa
   return true;
 }
 
-void VulkanDevice::SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds)
+void VulkanDevice::SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds,
+                                    GPUPipeline::RenderPassFlag feedback_loop)
 {
-  bool changed = (m_num_current_render_targets != num_rts || m_current_depth_target != ds);
+  bool changed = (m_num_current_render_targets != num_rts || m_current_depth_target != ds ||
+                  m_current_feedback_loop != feedback_loop);
   bool needs_ds_clear = (ds && ds->IsClearedOrInvalidated());
   bool needs_rt_clear = false;
 
-  m_current_depth_target = ds;
+  m_current_depth_target = static_cast<VulkanTexture*>(ds);
   for (u32 i = 0; i < num_rts; i++)
   {
     VulkanTexture* const RT = static_cast<VulkanTexture*>(rts[i]);
@@ -3096,7 +3121,8 @@ void VulkanDevice::SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUText
   }
   for (u32 i = num_rts; i < m_num_current_render_targets; i++)
     m_current_render_targets[i] = nullptr;
-  m_num_current_render_targets = num_rts;
+  m_num_current_render_targets = Truncate8(num_rts);
+  m_current_feedback_loop = feedback_loop;
 
   if (changed)
   {
@@ -3109,17 +3135,21 @@ void VulkanDevice::SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUText
       return;
     }
 
-    if (!m_optional_extensions.vk_khr_dynamic_rendering)
+    if (!m_optional_extensions.vk_khr_dynamic_rendering || ((feedback_loop & GPUPipeline::ColorFeedbackLoop) &&
+                                                            !m_optional_extensions.vk_khr_dynamic_rendering_local_read))
     {
-      m_current_framebuffer =
-        m_framebuffer_manager.Lookup((m_num_current_render_targets > 0) ? m_current_render_targets.data() : nullptr,
-                                     m_num_current_render_targets, m_current_depth_target, 0);
+      m_current_framebuffer = m_framebuffer_manager.Lookup(
+        (m_num_current_render_targets > 0) ? reinterpret_cast<GPUTexture**>(m_current_render_targets.data()) : nullptr,
+        m_num_current_render_targets, m_current_depth_target, feedback_loop);
       if (m_current_framebuffer == VK_NULL_HANDLE)
       {
         Log_ErrorPrint("Failed to create framebuffer");
         return;
       }
     }
+
+    m_dirty_flags = (m_dirty_flags & ~DIRTY_FLAG_INPUT_ATTACHMENT) |
+                    ((feedback_loop & GPUPipeline::ColorFeedbackLoop) ? DIRTY_FLAG_INPUT_ATTACHMENT : 0);
   }
 
   // TODO: This could use vkCmdClearAttachments() instead.
@@ -3140,7 +3170,8 @@ void VulkanDevice::BeginRenderPass()
   for (u32 i = 0; i < num_textures; i++)
     m_current_textures[i]->TransitionToLayout(VulkanTexture::Layout::ShaderReadOnly);
 
-  if (m_optional_extensions.vk_khr_dynamic_rendering)
+  if (m_optional_extensions.vk_khr_dynamic_rendering && (m_optional_extensions.vk_khr_dynamic_rendering_local_read ||
+                                                         !(m_current_feedback_loop & GPUPipeline::ColorFeedbackLoop)))
   {
     VkRenderingInfoKHR ri = {
       VK_STRUCTURE_TYPE_RENDERING_INFO_KHR, nullptr, 0u, {}, 1u, 0u, 0u, nullptr, nullptr, nullptr};
@@ -3157,7 +3188,9 @@ void VulkanDevice::BeginRenderPass()
       for (u32 i = 0; i < m_num_current_render_targets; i++)
       {
         VulkanTexture* const rt = static_cast<VulkanTexture*>(m_current_render_targets[i]);
-        rt->TransitionToLayout(VulkanTexture::Layout::ColorAttachment);
+        rt->TransitionToLayout((m_current_feedback_loop & GPUPipeline::ColorFeedbackLoop) ?
+                                 VulkanTexture::Layout::FeedbackLoop :
+                                 VulkanTexture::Layout::ColorAttachment);
         rt->SetUseFenceCounter(GetCurrentFenceCounter());
 
         VkRenderingAttachmentInfo& ai = attachments[i];
@@ -3179,7 +3212,7 @@ void VulkanDevice::BeginRenderPass()
         rt->SetState(GPUTexture::State::Dirty);
       }
 
-      if (VulkanTexture* const ds = static_cast<VulkanTexture*>(m_current_depth_target))
+      if (VulkanTexture* const ds = m_current_depth_target)
       {
         ds->TransitionToLayout(VulkanTexture::Layout::DepthStencilAttachment);
         ds->SetUseFenceCounter(GetCurrentFenceCounter());
@@ -3201,8 +3234,8 @@ void VulkanDevice::BeginRenderPass()
         ds->SetState(GPUTexture::State::Dirty);
       }
 
-      const VulkanTexture* const rt_or_ds = static_cast<const VulkanTexture*>(
-        (m_num_current_render_targets > 0) ? m_current_render_targets[0] : m_current_depth_target);
+      const VulkanTexture* const rt_or_ds =
+        (m_num_current_render_targets > 0) ? m_current_render_targets[0] : m_current_depth_target;
       ri.renderArea = {{}, {rt_or_ds->GetWidth(), rt_or_ds->GetHeight()}};
     }
     else
@@ -3236,7 +3269,7 @@ void VulkanDevice::BeginRenderPass()
     {
       bi.framebuffer = m_current_framebuffer;
       bi.renderPass = m_current_render_pass = GetRenderPass(
-        m_current_render_targets.data(), m_num_current_render_targets, m_current_depth_target, false, false);
+        m_current_render_targets.data(), m_num_current_render_targets, m_current_depth_target, m_current_feedback_loop);
       if (bi.renderPass == VK_NULL_HANDLE)
       {
         Log_ErrorPrint("Failed to create render pass");
@@ -3255,7 +3288,9 @@ void VulkanDevice::BeginRenderPass()
           bi.clearValueCount = i + 1;
         }
         rt->SetState(GPUTexture::State::Dirty);
-        rt->TransitionToLayout(VulkanTexture::Layout::ColorAttachment);
+        rt->TransitionToLayout((m_current_feedback_loop & GPUPipeline::ColorFeedbackLoop) ?
+                                 VulkanTexture::Layout::FeedbackLoop :
+                                 VulkanTexture::Layout::ColorAttachment);
         rt->SetUseFenceCounter(GetCurrentFenceCounter());
       }
       if (VulkanTexture* const ds = static_cast<VulkanTexture*>(m_current_depth_target))
@@ -3357,6 +3392,7 @@ void VulkanDevice::BeginSwapChainRenderPass()
 
   s_stats.num_render_passes++;
   m_num_current_render_targets = 0;
+  m_current_feedback_loop = GPUPipeline::NoRenderPassFlags;
   std::memset(m_current_render_targets.data(), 0, sizeof(m_current_render_targets));
   m_current_depth_target = nullptr;
   m_current_framebuffer = VK_NULL_HANDLE;
@@ -3420,7 +3456,8 @@ void VulkanDevice::UnbindPipeline(VulkanPipeline* pl)
 
 void VulkanDevice::InvalidateCachedState()
 {
-  m_dirty_flags = ALL_DIRTY_STATE;
+  m_dirty_flags = // the input attachment flag is added only while a colour feedback loop is current
+    ALL_DIRTY_STATE | ((m_current_feedback_loop & GPUPipeline::ColorFeedbackLoop) ? DIRTY_FLAG_INPUT_ATTACHMENT : 0);
   m_current_render_pass = VK_NULL_HANDLE;
   m_current_pipeline = nullptr;
 }
@@ -3584,11 +3621,20 @@ void VulkanDevice::SetScissor(s32 x, s32 y, s32 width, s32 height)
 
 void VulkanDevice::PreDrawCheck()
 {
+  if (!InRenderPass())
+    BeginRenderPass();
+
   DebugAssert(!(m_dirty_flags & DIRTY_FLAG_INITIAL));
-  const u32 dirty = std::exchange(m_dirty_flags, 0);
+  const u32 update_mask = (m_current_feedback_loop ? ~0u : ~DIRTY_FLAG_INPUT_ATTACHMENT);
+  const u32 dirty = m_dirty_flags & update_mask;
+  m_dirty_flags = m_dirty_flags & ~update_mask;
+  if (dirty & DIRTY_FLAG_PIPELINE_LAYOUT && !(dirty & DIRTY_FLAG_INPUT_ATTACHMENT))
+    m_dirty_flags |= DIRTY_FLAG_INPUT_ATTACHMENT; // TODO: FOR NEXT TIME
+
   if (dirty != 0)
   {
-    if (dirty & (DIRTY_FLAG_PIPELINE_LAYOUT | DIRTY_FLAG_DYNAMIC_OFFSETS | DIRTY_FLAG_TEXTURES_OR_SAMPLERS))
+    if (dirty & (DIRTY_FLAG_PIPELINE_LAYOUT | DIRTY_FLAG_DYNAMIC_OFFSETS | DIRTY_FLAG_TEXTURES_OR_SAMPLERS |
+                 DIRTY_FLAG_INPUT_ATTACHMENT))
     {
       if (!UpdateDescriptorSets(dirty))
       {
@@ -3598,21 +3644,22 @@ void VulkanDevice::PreDrawCheck()
       }
     }
   }
-
-  if (!InRenderPass())
-    BeginRenderPass();
 }
 
 template<GPUPipeline::Layout layout>
-bool VulkanDevice::UpdateDescriptorSetsForLayout(bool new_layout, bool new_dynamic_offsets)
+bool VulkanDevice::UpdateDescriptorSetsForLayout(u32 dirty)
 {
-  std::array<VkDescriptorSet, 2> ds;
+  [[maybe_unused]] bool new_dynamic_offsets = false;
+
+  std::array<VkDescriptorSet, 3> ds;
   u32 first_ds = 0;
   u32 num_ds = 0;
 
   if constexpr (layout == GPUPipeline::Layout::SingleTextureAndUBO || layout == GPUPipeline::Layout::MultiTextureAndUBO)
   {
-    if (new_layout || new_dynamic_offsets)
+    new_dynamic_offsets = ((dirty & DIRTY_FLAG_DYNAMIC_OFFSETS) != 0);
+
+    if (dirty & (DIRTY_FLAG_PIPELINE_LAYOUT | DIRTY_FLAG_DYNAMIC_OFFSETS))
     {
       ds[num_ds++] = m_ubo_descriptor_set;
       new_dynamic_offsets = true;
@@ -3645,7 +3692,7 @@ bool VulkanDevice::UpdateDescriptorSetsForLayout(bool new_layout, bool new_dynam
       {
         DebugAssert(m_current_textures[i] && m_current_samplers[i] != VK_NULL_HANDLE);
         dsub.AddCombinedImageSamplerDescriptorWrite(VK_NULL_HANDLE, i, m_current_textures[i]->GetView(),
-                                                    m_current_samplers[i], VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
+                                                    m_current_samplers[i], m_current_textures[i]->GetVkLayout());
       }
 
       const u32 set = (layout == GPUPipeline::Layout::MultiTextureAndUBO) ? 1 : 0;
@@ -3666,13 +3713,32 @@ bool VulkanDevice::UpdateDescriptorSetsForLayout(bool new_layout, bool new_dynam
       {
         DebugAssert(m_current_textures[i] && m_current_samplers[i] != VK_NULL_HANDLE);
         dsub.AddCombinedImageSamplerDescriptorWrite(tds, i, m_current_textures[i]->GetView(), m_current_samplers[i],
-                                                    VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
+                                                    m_current_textures[i]->GetVkLayout());
       }
 
       dsub.Update(m_device, false);
     }
   }
 
+  if constexpr (layout == GPUPipeline::Layout::SingleTextureAndUBO ||
+                layout == GPUPipeline::Layout::SingleTextureAndPushConstants ||
+                layout == GPUPipeline::Layout::SingleTextureBufferAndPushConstants)
+  {
+    if (dirty & DIRTY_FLAG_INPUT_ATTACHMENT)
+    {
+      VkDescriptorSet ids = AllocateDescriptorSet(m_feedback_loop_ds_layout);
+      if (ids == VK_NULL_HANDLE)
+        return false;
+
+      ds[num_ds++] = ids;
+
+      Vulkan::DescriptorSetUpdateBuilder dsub;
+      dsub.AddInputAttachmentDescriptorWrite(ids, 0, m_current_render_targets[0]->GetView(),
+                                             m_current_render_targets[0]->GetVkLayout());
+      dsub.Update(m_device, false);
+    }
+  }
+
   DebugAssert(num_ds > 0);
   vkCmdBindDescriptorSets(GetCurrentCommandBuffer(), VK_PIPELINE_BIND_POINT_GRAPHICS,
                           m_pipeline_layouts[static_cast<u8>(m_current_pipeline_layout)], first_ds, num_ds, ds.data(),
@@ -3684,25 +3750,22 @@ bool VulkanDevice::UpdateDescriptorSetsForLayout(bool new_layout, bool new_dynam
 
 bool VulkanDevice::UpdateDescriptorSets(u32 dirty)
 {
-  const bool new_layout = (dirty & DIRTY_FLAG_PIPELINE_LAYOUT) != 0;
-  const bool new_dynamic_offsets = (dirty & DIRTY_FLAG_DYNAMIC_OFFSETS) != 0;
-
   switch (m_current_pipeline_layout)
   {
     case GPUPipeline::Layout::SingleTextureAndUBO:
-      return UpdateDescriptorSetsForLayout<GPUPipeline::Layout::SingleTextureAndUBO>(new_layout, new_dynamic_offsets);
+      return UpdateDescriptorSetsForLayout<GPUPipeline::Layout::SingleTextureAndUBO>(dirty);
 
     case GPUPipeline::Layout::SingleTextureAndPushConstants:
-      return UpdateDescriptorSetsForLayout<GPUPipeline::Layout::SingleTextureAndPushConstants>(new_layout, false);
+      return UpdateDescriptorSetsForLayout<GPUPipeline::Layout::SingleTextureAndPushConstants>(dirty);
 
     case GPUPipeline::Layout::SingleTextureBufferAndPushConstants:
-      return UpdateDescriptorSetsForLayout<GPUPipeline::Layout::SingleTextureBufferAndPushConstants>(new_layout, false);
+      return UpdateDescriptorSetsForLayout<GPUPipeline::Layout::SingleTextureBufferAndPushConstants>(dirty);
 
     case GPUPipeline::Layout::MultiTextureAndUBO:
-      return UpdateDescriptorSetsForLayout<GPUPipeline::Layout::MultiTextureAndUBO>(new_layout, new_dynamic_offsets);
+      return UpdateDescriptorSetsForLayout<GPUPipeline::Layout::MultiTextureAndUBO>(dirty);
 
     case GPUPipeline::Layout::MultiTextureAndPushConstants:
-      return UpdateDescriptorSetsForLayout<GPUPipeline::Layout::MultiTextureAndPushConstants>(new_layout, false);
+      return UpdateDescriptorSetsForLayout<GPUPipeline::Layout::MultiTextureAndPushConstants>(dirty);
 
     default:
       UnreachableCode();
@@ -3722,3 +3785,76 @@ void VulkanDevice::DrawIndexed(u32 index_count, u32 base_index, u32 base_vertex)
   s_stats.num_draws++;
   vkCmdDrawIndexed(GetCurrentCommandBuffer(), index_count, 1, base_index, base_vertex, 0);
 }
+
+VkImageMemoryBarrier VulkanDevice::GetColorBufferBarrier(const VulkanTexture* rt) const
+{ // Builds (does not record) a barrier on rt's image: colour attachment access -> input attachment read.
+  const VkImageLayout vk_layout = m_optional_extensions.vk_khr_dynamic_rendering_local_read ?
+                                    VK_IMAGE_LAYOUT_RENDERING_LOCAL_READ_KHR :
+                                    VK_IMAGE_LAYOUT_GENERAL;
+  DebugAssert(rt->GetLayout() == VulkanTexture::Layout::FeedbackLoop); // caller must already be in the feedback layout
+
+  return {VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
+          nullptr,
+          VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, // srcAccessMask
+          VK_ACCESS_INPUT_ATTACHMENT_READ_BIT,                                        // dstAccessMask
+          vk_layout,                                                                  // oldLayout
+          vk_layout,                                                                  // newLayout (same: no transition)
+          VK_QUEUE_FAMILY_IGNORED,                                                    // srcQueueFamilyIndex
+          VK_QUEUE_FAMILY_IGNORED,                                                    // dstQueueFamilyIndex
+          rt->GetImage(),
+          {VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u}}; // colour aspect, mip 0 (1 level), layer 0 (1 layer)
+}
+
+void VulkanDevice::DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 base_vertex, DrawBarrier type)
+{ // Indexed draw; `type` selects no barrier, one barrier before the batch, or one barrier before every primitive.
+  PreDrawCheck();
+
+  // TODO: The first barrier is unnecessary if we're starting the render pass.
+
+  switch (type)
+  {
+    case GPUDevice::DrawBarrier::None: // plain indexed draw, nothing extra recorded
+    {
+      s_stats.num_draws++;
+      vkCmdDrawIndexed(GetCurrentCommandBuffer(), index_count, 1, base_index, base_vertex, 0);
+    }
+    break;
+
+    case GPUDevice::DrawBarrier::One: // a single barrier, then the whole batch in one draw
+    {
+      DebugAssert(m_num_current_render_targets == 1);
+      s_stats.num_barriers++;
+      s_stats.num_draws++;
+
+      const VkImageMemoryBarrier barrier =
+        GetColorBufferBarrier(static_cast<VulkanTexture*>(m_current_render_targets[0]));
+      vkCmdPipelineBarrier(m_current_command_buffer, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
+                           VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, VK_DEPENDENCY_BY_REGION_BIT, 0, nullptr, 0, nullptr,
+                           1, &barrier);
+      vkCmdDrawIndexed(GetCurrentCommandBuffer(), index_count, 1, base_index, base_vertex, 0);
+    }
+    break;
+
+    case GPUDevice::DrawBarrier::Full: // barrier + draw repeated for each primitive in the batch
+    {
+      DebugAssert(m_num_current_render_targets == 1);
+
+      const VkImageMemoryBarrier barrier =
+        GetColorBufferBarrier(static_cast<VulkanTexture*>(m_current_render_targets[0]));
+      const u32 indices_per_primitive = m_current_pipeline->GetVerticesPerPrimitive();
+      const u32 end_batch = base_index + index_count; // one past the last index of the batch
+
+      for (; base_index < end_batch; base_index += indices_per_primitive) // advance one primitive per iteration
+      {
+        s_stats.num_barriers++;
+        s_stats.num_draws++;
+
+        vkCmdPipelineBarrier(m_current_command_buffer, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
+                             VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, VK_DEPENDENCY_BY_REGION_BIT, 0, nullptr, 0, nullptr,
+                             1, &barrier);
+        vkCmdDrawIndexed(GetCurrentCommandBuffer(), indices_per_primitive, 1, base_index, base_vertex, 0);
+      }
+    }
+    break;
+  }
+}
diff --git a/src/util/vulkan_device.h b/src/util/vulkan_device.h
index 38517b5e4..308b6497b 100644
--- a/src/util/vulkan_device.h
+++ b/src/util/vulkan_device.h
@@ -44,7 +44,6 @@ public:
   {
     bool vk_ext_memory_budget : 1;
     bool vk_ext_rasterization_order_attachment_access : 1;
-    bool vk_ext_attachment_feedback_loop_layout : 1;
     bool vk_ext_full_screen_exclusive : 1;
     bool vk_khr_get_memory_requirements2 : 1;
     bool vk_khr_bind_memory2 : 1;
@@ -52,6 +51,7 @@ public:
     bool vk_khr_dedicated_allocation : 1;
     bool vk_khr_driver_properties : 1;
     bool vk_khr_dynamic_rendering : 1;
+    bool vk_khr_dynamic_rendering_local_read : 1;
     bool vk_khr_push_descriptor : 1;
     bool vk_ext_external_memory_host : 1;
   };
@@ -114,7 +114,8 @@ public:
   void PushUniformBuffer(const void* data, u32 data_size) override;
   void* MapUniformBuffer(u32 size) override;
   void UnmapUniformBuffer(u32 size) override;
-  void SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds) override;
+  void SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds,
+                        GPUPipeline::RenderPassFlag feedback_loop = GPUPipeline::NoRenderPassFlags) override;
   void SetPipeline(GPUPipeline* pipeline) override;
   void SetTextureSampler(u32 slot, GPUTexture* texture, GPUSampler* sampler) override;
   void SetTextureBuffer(u32 slot, GPUTextureBuffer* buffer) override;
@@ -122,6 +123,7 @@ public:
   void SetScissor(s32 x, s32 y, s32 width, s32 height) override;
   void Draw(u32 vertex_count, u32 base_vertex) override;
   void DrawIndexed(u32 index_count, u32 base_index, u32 base_vertex) override;
+  void DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 base_vertex, DrawBarrier type) override;
 
   bool SetGPUTimingEnabled(bool enabled) override;
   float GetAndResetAccumulatedGPUTime() override;
@@ -144,13 +146,6 @@ public:
   /// Returns true if Vulkan is suitable as a default for the devices in the system.
   static bool IsSuitableDefaultRenderer();
 
-  // The interaction between raster order attachment access and fbfetch is unclear.
-  ALWAYS_INLINE bool UseFeedbackLoopLayout() const
-  {
-    return (m_optional_extensions.vk_ext_attachment_feedback_loop_layout &&
-            !m_optional_extensions.vk_ext_rasterization_order_attachment_access);
-  }
-
   // Helpers for getting constants
   ALWAYS_INLINE u32 GetBufferCopyOffsetAlignment() const
   {
@@ -165,8 +160,8 @@ public:
 
   // Creates a simple render pass.
   VkRenderPass GetRenderPass(const GPUPipeline::GraphicsConfig& config);
-  VkRenderPass GetRenderPass(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds, bool color_feedback_loop = false,
-                             bool depth_sampling = false);
+  VkRenderPass GetRenderPass(VulkanTexture* const* rts, u32 num_rts, VulkanTexture* ds,
+                             GPUPipeline::RenderPassFlag render_pass_flags);
   VkRenderPass GetSwapChainRenderPass(GPUTexture::Format format, VkAttachmentLoadOp load_op);
 
   // Gets a non-clearing version of the specified render pass. Slow, don't call in hot path.
@@ -239,9 +234,10 @@ private:
     DIRTY_FLAG_PIPELINE_LAYOUT = (1 << 1),
     DIRTY_FLAG_DYNAMIC_OFFSETS = (1 << 2),
     DIRTY_FLAG_TEXTURES_OR_SAMPLERS = (1 << 3),
+    DIRTY_FLAG_INPUT_ATTACHMENT = (1 << 4),
 
-    ALL_DIRTY_STATE =
-      DIRTY_FLAG_INITIAL | DIRTY_FLAG_PIPELINE_LAYOUT | DIRTY_FLAG_DYNAMIC_OFFSETS | DIRTY_FLAG_TEXTURES_OR_SAMPLERS,
+    ALL_DIRTY_STATE = DIRTY_FLAG_INITIAL | DIRTY_FLAG_PIPELINE_LAYOUT | DIRTY_FLAG_DYNAMIC_OFFSETS |
+                      DIRTY_FLAG_TEXTURES_OR_SAMPLERS | DIRTY_FLAG_INPUT_ATTACHMENT,
   };
 
   struct RenderPassCacheKey
@@ -259,8 +255,7 @@ private:
     u8 depth_store_op : 1;
     u8 stencil_load_op : 2;
     u8 stencil_store_op : 1;
-    u8 depth_sampling : 1;
-    u8 color_feedback_loop : 1;
+    u8 feedback_loop : 2;
     u8 samples;
 
     bool operator==(const RenderPassCacheKey& rhs) const;
@@ -361,7 +356,7 @@ private:
   void PreDrawCheck();
 
   template<GPUPipeline::Layout layout>
-  bool UpdateDescriptorSetsForLayout(bool new_layout, bool new_dynamic_offsets);
+  bool UpdateDescriptorSetsForLayout(u32 dirty);
   bool UpdateDescriptorSets(u32 dirty);
 
   // Ends a render pass if we're currently in one.
@@ -375,6 +370,8 @@ private:
   static VkFramebuffer CreateFramebuffer(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds, u32 flags);
   static void DestroyFramebuffer(VkFramebuffer fbo);
 
+  VkImageMemoryBarrier GetColorBufferBarrier(const VulkanTexture* rt) const; // image barrier for barrier draws
+
   void BeginCommandBuffer(u32 index);
   void WaitForCommandBufferCompletion(u32 index);
 
@@ -445,6 +442,7 @@ private:
   VkDescriptorSetLayout m_single_texture_ds_layout = VK_NULL_HANDLE;
   VkDescriptorSetLayout m_single_texture_buffer_ds_layout = VK_NULL_HANDLE;
   VkDescriptorSetLayout m_multi_texture_ds_layout = VK_NULL_HANDLE;
+  VkDescriptorSetLayout m_feedback_loop_ds_layout = VK_NULL_HANDLE;
   std::array<VkPipelineLayout, static_cast<u8>(GPUPipeline::Layout::MaxCount)> m_pipeline_layouts = {};
 
   VulkanStreamBuffer m_vertex_buffer;
@@ -460,9 +458,10 @@ private:
   // Which bindings/state has to be updated before the next draw.
   u32 m_dirty_flags = ALL_DIRTY_STATE;
 
-  u32 m_num_current_render_targets = 0;
-  std::array<GPUTexture*, MAX_RENDER_TARGETS> m_current_render_targets = {};
-  GPUTexture* m_current_depth_target = nullptr;
+  u8 m_num_current_render_targets = 0;
+  GPUPipeline::RenderPassFlag m_current_feedback_loop = GPUPipeline::NoRenderPassFlags;
+  std::array<VulkanTexture*, MAX_RENDER_TARGETS> m_current_render_targets = {};
+  VulkanTexture* m_current_depth_target = nullptr;
   VkFramebuffer m_current_framebuffer = VK_NULL_HANDLE;
   VkRenderPass m_current_render_pass = VK_NULL_HANDLE;
 
diff --git a/src/util/vulkan_pipeline.cpp b/src/util/vulkan_pipeline.cpp
index 657e50a8a..cc0bacfd1 100644
--- a/src/util/vulkan_pipeline.cpp
+++ b/src/util/vulkan_pipeline.cpp
@@ -72,8 +72,10 @@ std::unique_ptr<GPUShader> VulkanDevice::CreateShaderFromSource(GPUShaderStage s
 
 //////////////////////////////////////////////////////////////////////////
 
-VulkanPipeline::VulkanPipeline(VkPipeline pipeline, Layout layout)
-  : GPUPipeline(), m_pipeline(pipeline), m_layout(layout)
+VulkanPipeline::VulkanPipeline(VkPipeline pipeline, Layout layout, u8 vertices_per_primitive,
+                               RenderPassFlag render_pass_flags)
+  : GPUPipeline(), m_pipeline(pipeline), m_layout(layout), m_vertices_per_primitive(vertices_per_primitive),
+    m_render_pass_flags(render_pass_flags) // members are plain copies of the arguments; the body is empty
 {
 }
 
@@ -89,12 +91,13 @@ void VulkanPipeline::SetDebugName(const std::string_view& name)
 
 std::unique_ptr<GPUPipeline> VulkanDevice::CreatePipeline(const GPUPipeline::GraphicsConfig& config)
 {
-  static constexpr std::array<VkPrimitiveTopology, static_cast<u32>(GPUPipeline::Primitive::MaxCount)> primitives = {{
-    VK_PRIMITIVE_TOPOLOGY_POINT_LIST,     // Points
-    VK_PRIMITIVE_TOPOLOGY_LINE_LIST,      // Lines
-    VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,  // Triangles
-    VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP, // TriangleStrips
-  }};
+  static constexpr std::array<std::pair<VkPrimitiveTopology, u32>, static_cast<u32>(GPUPipeline::Primitive::MaxCount)>
+    primitives = {{ // {Vulkan topology, vertices per primitive}, indexed by GPUPipeline::Primitive
+      {VK_PRIMITIVE_TOPOLOGY_POINT_LIST, 1},     // Points: one vertex per primitive
+      {VK_PRIMITIVE_TOPOLOGY_LINE_LIST, 2},      // Lines: two vertices per primitive
+      {VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST, 3},  // Triangles: three vertices per primitive
+      {VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP, 3}, // TriangleStrips; NOTE(review): strips share vertices - confirm 3
+    }};
 
   static constexpr u32 MAX_COMPONENTS = 4;
   static constexpr const VkFormat format_mapping[static_cast<u8>(
@@ -171,7 +174,8 @@ std::unique_ptr<GPUPipeline> VulkanDevice::CreatePipeline(const GPUPipeline::Gra
     }
   }
 
-  gpb.SetPrimitiveTopology(primitives[static_cast<u8>(config.primitive)]);
+  const auto [vk_topology, vertices_per_primitive] = primitives[static_cast<u8>(config.primitive)];
+  gpb.SetPrimitiveTopology(vk_topology);
 
   // Line width?
 
@@ -206,7 +210,8 @@ std::unique_ptr<GPUPipeline> VulkanDevice::CreatePipeline(const GPUPipeline::Gra
 
   gpb.SetPipelineLayout(m_pipeline_layouts[static_cast<u8>(config.layout)]);
 
-  if (m_optional_extensions.vk_khr_dynamic_rendering)
+  if (m_optional_extensions.vk_khr_dynamic_rendering && (m_optional_extensions.vk_khr_dynamic_rendering_local_read ||
+                                                         !(config.render_pass_flags & GPUPipeline::ColorFeedbackLoop)))
   {
     gpb.SetDynamicRendering();
 
@@ -224,6 +229,13 @@ std::unique_ptr<GPUPipeline> VulkanDevice::CreatePipeline(const GPUPipeline::Gra
       gpb.SetDynamicRenderingDepthAttachment(VulkanDevice::TEXTURE_FORMAT_MAPPING[static_cast<u8>(config.depth_format)],
                                              VK_FORMAT_UNDEFINED);
     }
+
+    if (config.render_pass_flags & GPUPipeline::ColorFeedbackLoop)
+    {
+      DebugAssert(m_optional_extensions.vk_khr_dynamic_rendering_local_read &&
+                  config.color_formats[0] != GPUTexture::Format::Unknown);
+      gpb.AddDynamicRenderingInputAttachment(0);
+    }
   }
   else
   {
@@ -236,5 +248,6 @@ std::unique_ptr<GPUPipeline> VulkanDevice::CreatePipeline(const GPUPipeline::Gra
   if (!pipeline)
     return {};
 
-  return std::unique_ptr<GPUPipeline>(new VulkanPipeline(pipeline, config.layout));
+  return std::unique_ptr<GPUPipeline>(
+    new VulkanPipeline(pipeline, config.layout, static_cast<u8>(vertices_per_primitive), config.render_pass_flags));
 }
diff --git a/src/util/vulkan_pipeline.h b/src/util/vulkan_pipeline.h
index 384cf288f..db355a714 100644
--- a/src/util/vulkan_pipeline.h
+++ b/src/util/vulkan_pipeline.h
@@ -32,12 +32,15 @@ public:
 
   ALWAYS_INLINE VkPipeline GetPipeline() const { return m_pipeline; }
   ALWAYS_INLINE Layout GetLayout() const { return m_layout; }
+  ALWAYS_INLINE u8 GetVerticesPerPrimitive() const { return m_vertices_per_primitive; }
 
   void SetDebugName(const std::string_view& name) override;
 
 private:
-  VulkanPipeline(VkPipeline pipeline, Layout layout);
+  VulkanPipeline(VkPipeline pipeline, Layout layout, u8 vertices_per_primitive, RenderPassFlag render_pass_flags);
 
   VkPipeline m_pipeline;
   Layout m_layout;
+  u8 m_vertices_per_primitive; // vertex count per primitive for the pipeline's topology
+  RenderPassFlag m_render_pass_flags; // NOTE(review): stored, but no accessor appears in this hunk - confirm use
 };
diff --git a/src/util/vulkan_texture.cpp b/src/util/vulkan_texture.cpp
index 5626c9e06..621774514 100644
--- a/src/util/vulkan_texture.cpp
+++ b/src/util/vulkan_texture.cpp
@@ -18,6 +18,7 @@ static constexpr const VkComponentMapping s_identity_swizzle{
 
 static VkImageLayout GetVkImageLayout(VulkanTexture::Layout layout)
 {
+  // TODO: Wrong for depth textures in feedback loop
   static constexpr std::array<VkImageLayout, static_cast<u32>(VulkanTexture::Layout::Count)> s_vk_layout_mapping = {{
     VK_IMAGE_LAYOUT_UNDEFINED,                        // Undefined
     VK_IMAGE_LAYOUT_PREINITIALIZED,                   // Preinitialized
@@ -34,17 +35,12 @@ static VkImageLayout GetVkImageLayout(VulkanTexture::Layout layout)
     VK_IMAGE_LAYOUT_GENERAL,                          // ComputeReadWriteImage
     VK_IMAGE_LAYOUT_GENERAL,                          // General
   }};
-  return (layout == VulkanTexture::Layout::FeedbackLoop && VulkanDevice::GetInstance().UseFeedbackLoopLayout()) ?
-           VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT :
+  return (layout == VulkanTexture::Layout::FeedbackLoop &&
+          VulkanDevice::GetInstance().GetOptionalExtensions().vk_khr_dynamic_rendering_local_read) ?
+           VK_IMAGE_LAYOUT_RENDERING_LOCAL_READ_KHR :
            s_vk_layout_mapping[static_cast<u32>(layout)];
 }
 
-static VkAccessFlagBits GetFeedbackLoopInputAccessBits()
-{
-  return VulkanDevice::GetInstance().UseFeedbackLoopLayout() ? VK_ACCESS_SHADER_READ_BIT :
-                                                               VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
-}
-
 VulkanTexture::VulkanTexture(u32 width, u32 height, u32 layers, u32 levels, u32 samples, Type type, Format format,
                              VkImage image, VmaAllocation allocation, VkImageView view, VkFormat vk_format)
   : GPUTexture(static_cast<u16>(width), static_cast<u16>(height), static_cast<u8>(layers), static_cast<u8>(levels),
@@ -111,8 +107,7 @@ std::unique_ptr<VulkanTexture> VulkanTexture::Create(u32 width, u32 height, u32
       DebugAssert(levels == 1);
       ici.usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT |
                   VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_SAMPLED_BIT |
-                  (dev.UseFeedbackLoopLayout() ? VK_IMAGE_USAGE_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT :
-                                                 VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT);
+                  VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT;
     }
     break;
 
@@ -120,8 +115,7 @@ std::unique_ptr<VulkanTexture> VulkanTexture::Create(u32 width, u32 height, u32
     {
       DebugAssert(levels == 1);
       ici.usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT |
-                  VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_SAMPLED_BIT |
-                  (dev.UseFeedbackLoopLayout() ? VK_IMAGE_USAGE_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT : 0);
+                  VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT | VK_IMAGE_USAGE_SAMPLED_BIT;
       vci.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
     }
     break;
@@ -588,7 +582,7 @@ void VulkanTexture::TransitionSubresourcesToLayout(VkCommandBuffer command_buffe
     case Layout::FeedbackLoop:
       barrier.srcAccessMask = (aspect == VK_IMAGE_ASPECT_COLOR_BIT) ?
                                 (VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
-                                 GetFeedbackLoopInputAccessBits()) :
+                                 VK_ACCESS_INPUT_ATTACHMENT_READ_BIT) :
                                 (VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
                                  VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | VK_ACCESS_SHADER_READ_BIT);
       srcStageMask = (aspect == VK_IMAGE_ASPECT_COLOR_BIT) ?
@@ -664,7 +658,7 @@ void VulkanTexture::TransitionSubresourcesToLayout(VkCommandBuffer command_buffe
     case Layout::FeedbackLoop:
       barrier.dstAccessMask = (aspect == VK_IMAGE_ASPECT_COLOR_BIT) ?
                                 (VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
-                                 GetFeedbackLoopInputAccessBits()) :
+                                 VK_ACCESS_INPUT_ATTACHMENT_READ_BIT) :
                                 (VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
                                  VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | VK_ACCESS_SHADER_READ_BIT);
       dstStageMask = (aspect == VK_IMAGE_ASPECT_COLOR_BIT) ?