From f936a36c85ca0c81a7c551f830d24a5978d9047a Mon Sep 17 00:00:00 2001
From: Stenzek <stenzek@gmail.com>
Date: Wed, 28 Feb 2024 16:13:50 +1000
Subject: [PATCH] GPUDevice: Add GPUDownloadTexture

Download textures can also be backed by imported host/client memory.
Use them for screenshots and VRAM downloads.
---
 src/common/intrin.h         |  11 ++
 src/core/gpu.cpp            |  72 ++++++--
 src/core/gpu.h              |   2 +-
 src/core/gpu_hw.cpp         |  54 +++++-
 src/core/gpu_hw.h           |   1 +
 src/core/system.cpp         |   6 +-
 src/util/d3d11_device.cpp   |   2 +-
 src/util/d3d11_device.h     |  17 +-
 src/util/d3d11_texture.cpp  | 190 +++++++++++++-------
 src/util/d3d11_texture.h    |  25 ++-
 src/util/d3d12_device.cpp   |   2 +-
 src/util/d3d12_device.h     |  16 +-
 src/util/d3d12_texture.cpp  | 289 +++++++++++++++++++------------
 src/util/d3d12_texture.h    |  33 +++-
 src/util/gpu_device.h       |  10 +-
 src/util/gpu_texture.cpp    | 164 ++++++++++++++++--
 src/util/gpu_texture.h      |  85 ++++++++-
 src/util/metal_device.h     |  41 ++++-
 src/util/metal_device.mm    | 245 +++++++++++++++++---------
 src/util/opengl_device.cpp  |  76 +++-----
 src/util/opengl_device.h    |  19 +-
 src/util/opengl_texture.cpp | 207 +++++++++++++++++++++-
 src/util/opengl_texture.h   |  33 +++-
 src/util/vulkan_device.cpp  |  31 ++--
 src/util/vulkan_device.h    |  24 ++-
 src/util/vulkan_texture.cpp | 337 +++++++++++++++++++++++-------------
 src/util/vulkan_texture.h   |  36 +++-
 27 files changed, 1501 insertions(+), 527 deletions(-)

diff --git a/src/common/intrin.h b/src/common/intrin.h
index 7d5f18968..795a7f950 100644
--- a/src/common/intrin.h
+++ b/src/common/intrin.h
@@ -5,6 +5,7 @@
 
 #pragma once
 
+#include "align.h"
 #include "types.h"
 
 #include <type_traits>
@@ -27,6 +28,16 @@
 #include <malloc.h> // alloca
 #endif
 
+/// Alignment, in bytes, for vector-friendly buffers; 128-bit vectors are currently the widest in use.
+static constexpr u32 VECTOR_ALIGNMENT = 16;
+
+/// Aligns an allocation/pitch size up to VECTOR_ALIGNMENT, the preferred host vector size.
+template<typename T>
+ALWAYS_INLINE static T VectorAlign(T value)
+{
+  return Common::AlignUpPow2(value, VECTOR_ALIGNMENT);
+}
+
 template<typename T>
 ALWAYS_INLINE_RELEASE static void MemsetPtrs(T* ptr, T value, u32 count)
 {
diff --git a/src/core/gpu.cpp b/src/core/gpu.cpp
index f41447d0b..64cff9f10 100644
--- a/src/core/gpu.cpp
+++ b/src/core/gpu.cpp
@@ -1907,7 +1907,7 @@ Common::Rectangle<s32> GPU::CalculateDrawRect(s32 window_width, s32 window_heigh
 
 static bool CompressAndWriteTextureToFile(u32 width, u32 height, std::string filename, FileSystem::ManagedCFilePtr fp,
                                           bool clear_alpha, bool flip_y, u32 resize_width, u32 resize_height,
-                                          std::vector<u32> texture_data, u32 texture_data_stride,
+                                          std::vector<u8> texture_data, u32 texture_data_stride,
                                           GPUTexture::Format texture_format)
 {
 
@@ -1923,8 +1923,18 @@ static bool CompressAndWriteTextureToFile(u32 width, u32 height, std::string fil
 
   if (clear_alpha)
   {
-    for (u32& pixel : texture_data)
-      pixel |= 0xFF000000;
+    for (u32 y = 0; y < height; y++)
+    {
+      u8* pixels = &texture_data[y * texture_data_stride];
+      for (u32 x = 0; x < width; x++)
+      {
+        u32 pixel;
+        std::memcpy(&pixel, pixels, sizeof(pixel));
+        pixel |= 0xFF000000u;
+        std::memcpy(pixels, &pixel, sizeof(pixel));
+        pixels += sizeof(pixel);
+      }
+    }
   }
 
   if (flip_y)
@@ -1932,11 +1942,10 @@ static bool CompressAndWriteTextureToFile(u32 width, u32 height, std::string fil
 
   if (resize_width > 0 && resize_height > 0 && (resize_width != width || resize_height != height))
   {
-    std::vector<u32> resized_texture_data(resize_width * resize_height);
+    std::vector<u8> resized_texture_data(resize_width * resize_height * sizeof(u32));
     u32 resized_texture_stride = sizeof(u32) * resize_width;
-    if (!stbir_resize_uint8(reinterpret_cast<u8*>(texture_data.data()), width, height, texture_data_stride,
-                            reinterpret_cast<u8*>(resized_texture_data.data()), resize_width, resize_height,
-                            resized_texture_stride, 4))
+    if (!stbir_resize_uint8(texture_data.data(), width, height, texture_data_stride, resized_texture_data.data(),
+                            resize_width, resize_height, resized_texture_stride, 4))
     {
       Log_ErrorPrintf("Failed to resize texture data from %ux%u to %ux%u", width, height, resize_width, resize_height);
       return false;
@@ -2022,13 +2031,29 @@ bool GPU::WriteDisplayTextureToFile(std::string filename, bool full_resolution /
   const u32 read_width = static_cast<u32>(m_display_texture_view_width);
   const u32 read_height = static_cast<u32>(m_display_texture_view_height);
 
-  std::vector<u32> texture_data(read_width * read_height);
   const u32 texture_data_stride =
     Common::AlignUpPow2(GPUTexture::GetPixelSize(m_display_texture->GetFormat()) * read_width, 4);
-  if (!g_gpu_device->DownloadTexture(m_display_texture, read_x, read_y, read_width, read_height, texture_data.data(),
-                                     texture_data_stride))
+  std::vector<u8> texture_data(texture_data_stride * read_height);
+
+  std::unique_ptr<GPUDownloadTexture> dltex;
+  if (g_gpu_device->GetFeatures().memory_import)
+  {
+    dltex = g_gpu_device->CreateDownloadTexture(read_width, read_height, m_display_texture->GetFormat(),
+                                                texture_data.data(), texture_data.size(), texture_data_stride);
+  }
+  if (!dltex)
+  {
+    if (!(dltex = g_gpu_device->CreateDownloadTexture(read_width, read_height, m_display_texture->GetFormat())))
+    {
+      Log_ErrorFmt("Failed to create {}x{} {} download texture", read_width, read_height,
+                   GPUTexture::GetFormatName(m_display_texture->GetFormat()));
+      return false;
+    }
+  }
+
+  dltex->CopyFromTexture(0, 0, m_display_texture, read_x, read_y, read_width, read_height, 0, 0, !dltex->IsImported());
+  if (!dltex->ReadTexels(0, 0, read_width, read_height, texture_data.data(), texture_data_stride))
   {
-    Log_ErrorPrintf("Texture download failed");
     RestoreDeviceContext();
     return false;
   }
@@ -2060,7 +2085,7 @@ bool GPU::WriteDisplayTextureToFile(std::string filename, bool full_resolution /
 }
 
 bool GPU::RenderScreenshotToBuffer(u32 width, u32 height, const Common::Rectangle<s32>& draw_rect, bool postfx,
-                                   std::vector<u32>* out_pixels, u32* out_stride, GPUTexture::Format* out_format)
+                                   std::vector<u8>* out_pixels, u32* out_stride, GPUTexture::Format* out_format)
 {
   const GPUTexture::Format hdformat =
     g_gpu_device->HasSurface() ? g_gpu_device->GetWindowFormat() : GPUTexture::Format::RGBA8;
@@ -2076,8 +2101,25 @@ bool GPU::RenderScreenshotToBuffer(u32 width, u32 height, const Common::Rectangl
   RenderDisplay(render_texture.get(), draw_rect, postfx);
 
   const u32 stride = GPUTexture::GetPixelSize(hdformat) * width;
-  out_pixels->resize(width * height);
-  if (!g_gpu_device->DownloadTexture(render_texture.get(), 0, 0, width, height, out_pixels->data(), stride))
+  out_pixels->resize(height * stride);
+
+  std::unique_ptr<GPUDownloadTexture> dltex;
+  if (g_gpu_device->GetFeatures().memory_import)
+  {
+    dltex =
+      g_gpu_device->CreateDownloadTexture(width, height, hdformat, out_pixels->data(), out_pixels->size(), stride);
+  }
+  if (!dltex)
+  {
+    if (!(dltex = g_gpu_device->CreateDownloadTexture(width, height, hdformat)))
+    {
+      Log_ErrorFmt("Failed to create {}x{} download texture", width, height);
+      return false;
+    }
+  }
+
+  dltex->CopyFromTexture(0, 0, render_texture.get(), 0, 0, width, height, 0, 0, false);
+  if (!dltex->ReadTexels(0, 0, width, height, out_pixels->data(), stride))
   {
     RestoreDeviceContext();
     return false;
@@ -2142,7 +2184,7 @@ bool GPU::RenderScreenshotToFile(std::string filename, bool internal_resolution
   if (width == 0 || height == 0)
     return false;
 
-  std::vector<u32> pixels;
+  std::vector<u8> pixels;
   u32 pixels_stride;
   GPUTexture::Format pixels_format;
   if (!RenderScreenshotToBuffer(width, height, draw_rect, !internal_resolution, &pixels, &pixels_stride,
diff --git a/src/core/gpu.h b/src/core/gpu.h
index 702711e53..9e5ad40b9 100644
--- a/src/core/gpu.h
+++ b/src/core/gpu.h
@@ -206,7 +206,7 @@ public:
 
   /// Renders the display, optionally with postprocessing to the specified image.
   bool RenderScreenshotToBuffer(u32 width, u32 height, const Common::Rectangle<s32>& draw_rect, bool postfx,
-                                std::vector<u32>* out_pixels, u32* out_stride, GPUTexture::Format* out_format);
+                                std::vector<u8>* out_pixels, u32* out_stride, GPUTexture::Format* out_format);
 
   /// Helper function to save screenshot to PNG.
   bool RenderScreenshotToFile(std::string filename, bool internal_resolution = false, bool compress_on_thread = false);
diff --git a/src/core/gpu_hw.cpp b/src/core/gpu_hw.cpp
index bbb1e4a72..aa2e50fa9 100644
--- a/src/core/gpu_hw.cpp
+++ b/src/core/gpu_hw.cpp
@@ -663,6 +663,26 @@ bool GPU_HW::CreateBuffers()
   GL_OBJECT_NAME(m_vram_read_texture, "VRAM Read Texture");
   GL_OBJECT_NAME(m_vram_readback_texture, "VRAM Readback Texture");
 
+  if (g_gpu_device->GetFeatures().memory_import)
+  {
+    Log_DevPrint("Trying to import guest VRAM buffer for downloads...");
+    m_vram_readback_download_texture = g_gpu_device->CreateDownloadTexture(
+      m_vram_readback_texture->GetWidth(), m_vram_readback_texture->GetHeight(), m_vram_readback_texture->GetFormat(),
+      g_vram, sizeof(g_vram), VRAM_WIDTH * sizeof(u16));
+    if (!m_vram_readback_download_texture)
+      Log_ErrorPrint("Failed to create imported readback buffer");
+  }
+  if (!m_vram_readback_download_texture)
+  {
+    m_vram_readback_download_texture = g_gpu_device->CreateDownloadTexture(
+      m_vram_readback_texture->GetWidth(), m_vram_readback_texture->GetHeight(), m_vram_readback_texture->GetFormat());
+    if (!m_vram_readback_download_texture)
+    {
+      Log_ErrorPrint("Failed to create readback download texture");
+      return false;
+    }
+  }
+
   if (g_gpu_device->GetFeatures().supports_texture_buffers)
   {
     if (!(m_vram_upload_buffer =
@@ -703,6 +723,7 @@ void GPU_HW::DestroyBuffers()
   ClearDisplayTexture();
 
   m_vram_upload_buffer.reset();
+  m_vram_readback_download_texture.reset();
   g_gpu_device->RecycleTexture(std::move(m_downsample_texture));
   g_gpu_device->RecycleTexture(std::move(m_vram_read_texture));
   g_gpu_device->RecycleTexture(std::move(m_vram_depth_texture));
@@ -2405,8 +2426,18 @@ void GPU_HW::ReadVRAM(u32 x, u32 y, u32 width, u32 height)
   }
 
   // Get bounds with wrap-around handled.
-  const Common::Rectangle<u32> copy_rect = GetVRAMTransferBounds(x, y, width, height);
-  const u32 encoded_width = (copy_rect.GetWidth() + 1) / 2;
+  Common::Rectangle<u32> copy_rect = GetVRAMTransferBounds(x, y, width, height);
+
+  // Has to be aligned to an even pixel for the download, due to 32-bit packing.
+  if (copy_rect.left & 1)
+    copy_rect.left--;
+  if (copy_rect.right & 1)
+    copy_rect.right++;
+
+  DebugAssert((copy_rect.left % 2) == 0 && (copy_rect.GetWidth() % 2) == 0);
+  const u32 encoded_left = copy_rect.left / 2;
+  const u32 encoded_top = copy_rect.top;
+  const u32 encoded_width = copy_rect.GetWidth() / 2;
   const u32 encoded_height = copy_rect.GetHeight();
 
   // Encode the 24-bit texture as 16-bit.
@@ -2421,9 +2452,22 @@ void GPU_HW::ReadVRAM(u32 x, u32 y, u32 width, u32 height)
   GL_POP();
 
   // Stage the readback and copy it into our shadow buffer.
-  g_gpu_device->DownloadTexture(m_vram_readback_texture.get(), 0, 0, encoded_width, encoded_height,
-                                reinterpret_cast<u32*>(&g_vram[copy_rect.top * VRAM_WIDTH + copy_rect.left]),
-                                VRAM_WIDTH * sizeof(u16));
+  if (m_vram_readback_download_texture->IsImported())
+  {
+    // Fast path, read directly.
+    m_vram_readback_download_texture->CopyFromTexture(encoded_left, encoded_top, m_vram_readback_texture.get(), 0, 0,
+                                                      encoded_width, encoded_height, 0, 0, false);
+    m_vram_readback_download_texture->Flush();
+  }
+  else
+  {
+    // Copy to staging buffer, then to VRAM.
+    m_vram_readback_download_texture->CopyFromTexture(0, 0, m_vram_readback_texture.get(), 0, 0, encoded_width,
+                                                      encoded_height, 0, 0, true);
+    m_vram_readback_download_texture->ReadTexels(0, 0, encoded_width, encoded_height,
+                                                 &g_vram[copy_rect.top * VRAM_WIDTH + copy_rect.left],
+                                                 VRAM_WIDTH * sizeof(u16));
+  }
 
   RestoreDeviceContext();
 }
diff --git a/src/core/gpu_hw.h b/src/core/gpu_hw.h
index 53d960b2e..659b7fcff 100644
--- a/src/core/gpu_hw.h
+++ b/src/core/gpu_hw.h
@@ -216,6 +216,7 @@ private:
   std::unique_ptr<GPUTexture> m_vram_depth_texture;
   std::unique_ptr<GPUTexture> m_vram_read_texture;
   std::unique_ptr<GPUTexture> m_vram_readback_texture;
+  std::unique_ptr<GPUDownloadTexture> m_vram_readback_download_texture;
   std::unique_ptr<GPUTexture> m_vram_replacement_texture;
   std::unique_ptr<GPUTexture> m_display_private_texture; // TODO: Move to base.
 
diff --git a/src/core/system.cpp b/src/core/system.cpp
index bed6219e5..a5ba0a227 100644
--- a/src/core/system.cpp
+++ b/src/core/system.cpp
@@ -265,7 +265,7 @@ bool System::Internal::ProcessStartup()
     InitializeDiscordPresence();
 #endif
 
-return true;
+  return true;
 }
 
 void System::Internal::ProcessShutdown()
@@ -2430,7 +2430,7 @@ bool System::SaveStateToStream(ByteStream* state, u32 screenshot_size /* = 256 *
                                     ((display_aspect_ratio > 0.0f) ? display_aspect_ratio : 1.0f)));
     Log_VerbosePrintf("Saving %ux%u screenshot for state", screenshot_width, screenshot_height);
 
-    std::vector<u32> screenshot_buffer;
+    std::vector<u8> screenshot_buffer;
     u32 screenshot_stride;
     GPUTexture::Format screenshot_format;
     if (g_gpu->RenderScreenshotToBuffer(screenshot_width, screenshot_height,
@@ -2454,7 +2454,7 @@ bool System::SaveStateToStream(ByteStream* state, u32 screenshot_size /* = 256 *
         header.offset_to_screenshot = static_cast<u32>(state->GetPosition());
         header.screenshot_width = screenshot_width;
         header.screenshot_height = screenshot_height;
-        header.screenshot_size = static_cast<u32>(screenshot_buffer.size() * sizeof(u32));
+        header.screenshot_size = static_cast<u32>(screenshot_buffer.size());
         if (!state->Write2(screenshot_buffer.data(), header.screenshot_size))
           return false;
       }
diff --git a/src/util/d3d11_device.cpp b/src/util/d3d11_device.cpp
index 1a4f8fd6b..20c5efeab 100644
--- a/src/util/d3d11_device.cpp
+++ b/src/util/d3d11_device.cpp
@@ -155,7 +155,6 @@ void D3D11Device::DestroyDevice()
 {
   std::unique_lock lock(s_instance_mutex);
 
-  DestroyStagingBuffer();
   DestroyBuffers();
   m_context.Reset();
   m_device.Reset();
@@ -187,6 +186,7 @@ void D3D11Device::SetFeatures(FeatureMask disabled_features)
   m_features.texture_buffers_emulated_with_ssbo = false;
   m_features.geometry_shaders = !(disabled_features & FEATURE_MASK_GEOMETRY_SHADERS);
   m_features.partial_msaa_resolve = false;
+  m_features.memory_import = false;
   m_features.gpu_timing = true;
   m_features.shader_cache = true;
   m_features.pipeline_cache = false;
diff --git a/src/util/d3d11_device.h b/src/util/d3d11_device.h
index e2bd15dad..39d5ff459 100644
--- a/src/util/d3d11_device.h
+++ b/src/util/d3d11_device.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
 
 #pragma once
@@ -53,8 +53,11 @@ public:
   std::unique_ptr<GPUSampler> CreateSampler(const GPUSampler::Config& config) override;
   std::unique_ptr<GPUTextureBuffer> CreateTextureBuffer(GPUTextureBuffer::Format format, u32 size_in_elements) override;
 
-  bool DownloadTexture(GPUTexture* texture, u32 x, u32 y, u32 width, u32 height, void* out_data,
-                       u32 out_data_stride) override;
+  std::unique_ptr<GPUDownloadTexture> CreateDownloadTexture(u32 width, u32 height, GPUTexture::Format format) override;
+  std::unique_ptr<GPUDownloadTexture> CreateDownloadTexture(u32 width, u32 height, GPUTexture::Format format,
+                                                            void* memory, size_t memory_size,
+                                                            u32 memory_stride) override;
+
   bool SupportsTextureFormat(GPUTexture::Format format) const override;
   void CopyTextureRegion(GPUTexture* dst, u32 dst_x, u32 dst_y, u32 dst_layer, u32 dst_level, GPUTexture* src,
                          u32 src_x, u32 src_y, u32 src_layer, u32 src_level, u32 width, u32 height) override;
@@ -128,9 +131,6 @@ private:
 
   void SetFeatures(FeatureMask disabled_features);
 
-  bool CheckStagingBufferSize(u32 width, u32 height, DXGI_FORMAT format);
-  void DestroyStagingBuffer();
-
   bool CreateSwapChain();
   bool CreateSwapChainRTV();
   void DestroySwapChain();
@@ -163,11 +163,6 @@ private:
   BlendStateMap m_blend_states;
   InputLayoutMap m_input_layouts;
 
-  ComPtr<ID3D11Texture2D> m_readback_staging_texture;
-  DXGI_FORMAT m_readback_staging_texture_format = DXGI_FORMAT_UNKNOWN;
-  u32 m_readback_staging_texture_width = 0;
-  u32 m_readback_staging_texture_height = 0;
-
   bool m_allow_tearing_supported = false;
   bool m_using_flip_model_swap_chain = true;
   bool m_using_allow_tearing = false;
diff --git a/src/util/d3d11_texture.cpp b/src/util/d3d11_texture.cpp
index 91a43990d..44cd2ce4a 100644
--- a/src/util/d3d11_texture.cpp
+++ b/src/util/d3d11_texture.cpp
@@ -1,16 +1,11 @@
-// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
 
 #include "d3d11_texture.h"
 #include "d3d11_device.h"
 #include "d3d_common.h"
 
-// #include "common/align.h"
-// #include "common/assert.h"
-// #include "common/file_system.h"
 #include "common/log.h"
-// #include "common/path.h"
-// #include "common/rectangle.h"
 #include "common/string_util.h"
 
 #include "fmt/format.h"
@@ -26,60 +21,6 @@ std::unique_ptr<GPUTexture> D3D11Device::CreateTexture(u32 width, u32 height, u3
   return D3D11Texture::Create(m_device.Get(), width, height, layers, levels, samples, type, format, data, data_stride);
 }
 
-bool D3D11Device::DownloadTexture(GPUTexture* texture, u32 x, u32 y, u32 width, u32 height, void* out_data,
-                                  u32 out_data_stride)
-{
-  const D3D11Texture* tex = static_cast<const D3D11Texture*>(texture);
-  if (!CheckStagingBufferSize(width, height, tex->GetDXGIFormat()))
-    return false;
-
-  const CD3D11_BOX box(static_cast<LONG>(x), static_cast<LONG>(y), 0, static_cast<LONG>(x + width),
-                       static_cast<LONG>(y + height), 1);
-  m_context->CopySubresourceRegion(m_readback_staging_texture.Get(), 0, 0, 0, 0, tex->GetD3DTexture(), 0, &box);
-
-  D3D11_MAPPED_SUBRESOURCE sr;
-  HRESULT hr = m_context->Map(m_readback_staging_texture.Get(), 0, D3D11_MAP_READ, 0, &sr);
-  if (FAILED(hr))
-  {
-    Log_ErrorPrintf("Map() failed with HRESULT %08X", hr);
-    return false;
-  }
-
-  s_stats.num_downloads++;
-
-  const u32 copy_size = tex->GetPixelSize() * width;
-  StringUtil::StrideMemCpy(out_data, out_data_stride, sr.pData, sr.RowPitch, copy_size, height);
-  m_context->Unmap(m_readback_staging_texture.Get(), 0);
-  return true;
-}
-
-bool D3D11Device::CheckStagingBufferSize(u32 width, u32 height, DXGI_FORMAT format)
-{
-  if (m_readback_staging_texture_width >= width && m_readback_staging_texture_width >= height &&
-      m_readback_staging_texture_format == format)
-    return true;
-
-  DestroyStagingBuffer();
-
-  CD3D11_TEXTURE2D_DESC desc(format, width, height, 1, 1, 0, D3D11_USAGE_STAGING, D3D11_CPU_ACCESS_READ);
-  HRESULT hr = m_device->CreateTexture2D(&desc, nullptr, m_readback_staging_texture.ReleaseAndGetAddressOf());
-  if (FAILED(hr))
-  {
-    Log_ErrorPrintf("CreateTexture2D() failed with HRESULT %08X", hr);
-    return false;
-  }
-
-  return true;
-}
-
-void D3D11Device::DestroyStagingBuffer()
-{
-  m_readback_staging_texture.Reset();
-  m_readback_staging_texture_width = 0;
-  m_readback_staging_texture_height = 0;
-  m_readback_staging_texture_format = DXGI_FORMAT_UNKNOWN;
-}
-
 bool D3D11Device::SupportsTextureFormat(GPUTexture::Format format) const
 {
   const DXGI_FORMAT dfmt = D3DCommon::GetFormatMapping(format).resource_format;
@@ -447,3 +388,132 @@ std::unique_ptr<GPUTextureBuffer> D3D11Device::CreateTextureBuffer(GPUTextureBuf
 
   return tb;
 }
+
+D3D11DownloadTexture::D3D11DownloadTexture(Microsoft::WRL::ComPtr<ID3D11Texture2D> tex, u32 width, u32 height,
+                                           GPUTexture::Format format)
+  : GPUDownloadTexture(width, height, format, false), m_texture(std::move(tex))
+{ // Adopts the texture made by Create(); `false` is presumably the base's "imported" flag (TODO confirm).
+}
+
+D3D11DownloadTexture::~D3D11DownloadTexture()
+{
+  // Unmap() returns immediately when nothing is mapped, so no separate IsMapped() check is needed.
+  D3D11DownloadTexture::Unmap();
+}
+
+std::unique_ptr<D3D11DownloadTexture> D3D11DownloadTexture::Create(u32 width, u32 height, GPUTexture::Format format)
+{
+  // Describe a CPU-readable staging texture. The explicit arguments mirror a zero-initialised
+  // D3D11_TEXTURE2D_DESC with only these fields set: one array slice, one mip level, no bind flags,
+  // staging usage and CPU read access. The helper's defaults supply a sample count of 1, a sample
+  // quality of 0 and no misc flags.
+  const DXGI_FORMAT dxgi_format = D3DCommon::GetFormatMapping(format).srv_format;
+  const CD3D11_TEXTURE2D_DESC staging_desc(dxgi_format, width, height, 1, 1, 0, D3D11_USAGE_STAGING,
+                                           D3D11_CPU_ACCESS_READ);
+
+  Microsoft::WRL::ComPtr<ID3D11Texture2D> staging_tex;
+  const HRESULT hr =
+    D3D11Device::GetD3DDevice()->CreateTexture2D(&staging_desc, nullptr, staging_tex.GetAddressOf());
+  if (SUCCEEDED(hr))
+  {
+    // The constructor is private, so std::make_unique cannot be used here.
+    return std::unique_ptr<D3D11DownloadTexture>(
+      new D3D11DownloadTexture(std::move(staging_tex), width, height, format));
+  }
+
+  Log_ErrorFmt("CreateTexture2D() failed: {:08X}", hr);
+  return {};
+}
+
+void D3D11DownloadTexture::CopyFromTexture(u32 dst_x, u32 dst_y, GPUTexture* src, u32 src_x, u32 src_y, u32 width,
+                                           u32 height, u32 src_layer, u32 src_level, bool use_transfer_pitch)
+{
+  D3D11Texture* src11 = static_cast<D3D11Texture*>(src);
+
+  DebugAssert(src11->GetFormat() == m_format);
+  DebugAssert(src_level < src11->GetLevels());
+  DebugAssert((src_x + width) <= src11->GetMipWidth(src_level) && (src_y + height) <= src11->GetMipHeight(src_level));
+  DebugAssert((dst_x + width) <= m_width && (dst_y + height) <= m_height);
+  DebugAssert((dst_x == 0 && dst_y == 0) || !use_transfer_pitch);
+
+  ID3D11DeviceContext1* const ctx = D3D11Device::GetD3DContext();
+  src11->CommitClear(ctx);
+
+  D3D11Device::GetStatistics().num_downloads++;
+  // The staging texture is unmapped before it is used as a copy destination.
+  if (IsMapped())
+    Unmap();
+
+  // D3D11 requires depth textures to be copied whole, so the rectangle is ignored for them.
+  const u32 subresource = D3D11CalcSubresource(src_level, src_layer, src11->GetLevels());
+  if (GPUTexture::IsDepthFormat(src11->GetFormat()))
+  {
+    ctx->CopySubresourceRegion(m_texture.Get(), 0, 0, 0, 0, src11->GetD3DTexture(), subresource, nullptr);
+  }
+  else
+  {
+    const CD3D11_BOX sbox(src_x, src_y, 0, src_x + width, src_y + height, 1);
+    ctx->CopySubresourceRegion(m_texture.Get(), 0, dst_x, dst_y, 0, src11->GetD3DTexture(), subresource, &sbox);
+  }
+
+  m_needs_flush = true;
+}
+
+bool D3D11DownloadTexture::Map(u32 x, u32 y, u32 width, u32 height)
+{
+  // The entire staging texture is mapped at once, so the requested rectangle is not consulted.
+  if (!IsMapped())
+  {
+    D3D11_MAPPED_SUBRESOURCE mapped;
+    const HRESULT hr = D3D11Device::GetD3DContext()->Map(m_texture.Get(), 0, D3D11_MAP_READ, 0, &mapped);
+    if (FAILED(hr))
+    {
+      Log_ErrorFmt("Map() failed: {:08X}", hr);
+      return false;
+    }
+    m_map_pointer = static_cast<u8*>(mapped.pData);
+    m_current_pitch = mapped.RowPitch;
+  }
+  return true;
+}
+
+void D3D11DownloadTexture::Unmap()
+{
+  if (IsMapped())
+  {
+    D3D11Device::GetD3DContext()->Unmap(m_texture.Get(), 0);
+    m_map_pointer = nullptr; // the mapped pointer is no longer valid once unmapped
+  }
+}
+
+void D3D11DownloadTexture::Flush()
+{
+  if (!m_needs_flush)
+    return;
+
+  // No GPU submission is needed: Map() handles the wait. Clear the flag so later calls return early.
+  m_needs_flush = false;
+  if (IsMapped())
+    Unmap();
+}
+
+void D3D11DownloadTexture::SetDebugName(std::string_view name)
+{
+  if (!name.empty()) // an empty name is ignored rather than applied
+  {
+    SetD3DDebugObjectName(m_texture.Get(), name);
+  }
+}
+
+std::unique_ptr<GPUDownloadTexture> D3D11Device::CreateDownloadTexture(u32 width, u32 height, GPUTexture::Format format)
+{
+  return D3D11DownloadTexture::Create(width, height, format); // staging-backed; null if creation fails
+}
+
+std::unique_ptr<GPUDownloadTexture> D3D11Device::CreateDownloadTexture(u32 width, u32 height, GPUTexture::Format format,
+                                                                       void* memory, size_t memory_size,
+                                                                       u32 memory_stride)
+{
+  Log_ErrorPrint("D3D11 cannot import memory for download textures");
+  return {}; // importing is unsupported here; all memory arguments are ignored
+}
diff --git a/src/util/d3d11_texture.h b/src/util/d3d11_texture.h
index 5175af611..e7ae14118 100644
--- a/src/util/d3d11_texture.h
+++ b/src/util/d3d11_texture.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
 
 #pragma once
@@ -119,3 +119,26 @@ private:
   D3D11StreamBuffer m_buffer;
   Microsoft::WRL::ComPtr<ID3D11ShaderResourceView> m_srv;
 };
+
+class D3D11DownloadTexture final : public GPUDownloadTexture
+{
+public:
+  ~D3D11DownloadTexture() override; // unmaps the staging texture if it is still mapped
+  /// Creates a CPU-readable staging texture; returns null if the D3D11 allocation fails.
+  static std::unique_ptr<D3D11DownloadTexture> Create(u32 width, u32 height, GPUTexture::Format format);
+  /// Copies a region of src into this texture; depth formats always copy the whole subresource.
+  void CopyFromTexture(u32 dst_x, u32 dst_y, GPUTexture* src, u32 src_x, u32 src_y, u32 width, u32 height,
+                       u32 src_layer, u32 src_level, bool use_transfer_pitch) override;
+
+  bool Map(u32 x, u32 y, u32 width, u32 height) override; // maps the whole texture; the rectangle is ignored
+  void Unmap() override;
+
+  void Flush() override; // drops any mapping when a copy is pending; no GPU work is submitted here
+
+  void SetDebugName(std::string_view name) override; // empty names are ignored
+
+private:
+  D3D11DownloadTexture(Microsoft::WRL::ComPtr<ID3D11Texture2D> tex, u32 width, u32 height, GPUTexture::Format format);
+
+  Microsoft::WRL::ComPtr<ID3D11Texture2D> m_texture; // staging texture created with CPU read access
+};
diff --git a/src/util/d3d12_device.cpp b/src/util/d3d12_device.cpp
index b594cbd1f..0b75a8117 100644
--- a/src/util/d3d12_device.cpp
+++ b/src/util/d3d12_device.cpp
@@ -261,7 +261,6 @@ void D3D12Device::DestroyDevice()
   WaitForGPUIdle();
 
   DestroyDeferredObjects(m_current_fence_value);
-  DestroyDownloadBuffer();
   DestroySamplers();
   DestroyTimestampQuery();
   DestroyBuffers();
@@ -1195,6 +1194,7 @@ void D3D12Device::SetFeatures(FeatureMask disabled_features)
   m_features.texture_buffers_emulated_with_ssbo = false;
   m_features.geometry_shaders = !(disabled_features & FEATURE_MASK_GEOMETRY_SHADERS);
   m_features.partial_msaa_resolve = true;
+  m_features.memory_import = false;
   m_features.gpu_timing = true;
   m_features.shader_cache = true;
   m_features.pipeline_cache = true;
diff --git a/src/util/d3d12_device.h b/src/util/d3d12_device.h
index c338b13bb..92aa052c7 100644
--- a/src/util/d3d12_device.h
+++ b/src/util/d3d12_device.h
@@ -30,6 +30,7 @@ class D3D12Pipeline;
 class D3D12SwapChain;
 class D3D12Texture;
 class D3D12TextureBuffer;
+class D3D12DownloadTexture;
 
 namespace D3D12MA {
 class Allocator;
@@ -39,6 +40,7 @@ class D3D12Device final : public GPUDevice
 {
 public:
   friend D3D12Texture;
+  friend D3D12DownloadTexture;
 
   template<typename T>
   using ComPtr = Microsoft::WRL::ComPtr<T>;
@@ -74,8 +76,11 @@ public:
   std::unique_ptr<GPUSampler> CreateSampler(const GPUSampler::Config& config) override;
   std::unique_ptr<GPUTextureBuffer> CreateTextureBuffer(GPUTextureBuffer::Format format, u32 size_in_elements) override;
 
-  bool DownloadTexture(GPUTexture* texture, u32 x, u32 y, u32 width, u32 height, void* out_data,
-                       u32 out_data_stride) override;
+  std::unique_ptr<GPUDownloadTexture> CreateDownloadTexture(u32 width, u32 height, GPUTexture::Format format) override;
+  std::unique_ptr<GPUDownloadTexture> CreateDownloadTexture(u32 width, u32 height, GPUTexture::Format format,
+                                                            void* memory, size_t memory_size,
+                                                            u32 memory_stride) override;
+
   bool SupportsTextureFormat(GPUTexture::Format format) const override;
   void CopyTextureRegion(GPUTexture* dst, u32 dst_x, u32 dst_y, u32 dst_layer, u32 dst_level, GPUTexture* src,
                          u32 src_x, u32 src_y, u32 src_layer, u32 src_level, u32 width, u32 height) override;
@@ -244,9 +249,6 @@ private:
 
   bool IsRenderTargetBound(const GPUTexture* tex) const;
 
-  bool CheckDownloadBufferSize(u32 required_size);
-  void DestroyDownloadBuffer();
-
   /// Set dirty flags on everything to force re-bind at next draw time.
   void InvalidateCachedState();
   void SetVertexBuffer(ID3D12GraphicsCommandList4* cmdlist);
@@ -321,10 +323,6 @@ private:
   SamplerMap m_sampler_map;
   ComPtr<ID3D12PipelineLibrary> m_pipeline_library;
 
-  ComPtr<D3D12MA::Allocation> m_download_buffer_allocation;
-  ComPtr<ID3D12Resource> m_download_buffer;
-  u32 m_download_buffer_size = 0;
-
   // Which bindings/state has to be updated before the next draw.
   u32 m_dirty_flags = ALL_DIRTY_STATE;
 
diff --git a/src/util/d3d12_texture.cpp b/src/util/d3d12_texture.cpp
index 0ccb59286..f85b153a4 100644
--- a/src/util/d3d12_texture.cpp
+++ b/src/util/d3d12_texture.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
 
 #include "d3d12_texture.h"
@@ -664,112 +664,6 @@ void D3D12Texture::MakeReadyForSampling()
   TransitionToState(D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE);
 }
 
-bool D3D12Device::DownloadTexture(GPUTexture* texture, u32 x, u32 y, u32 width, u32 height, void* out_data,
-                                  u32 out_data_stride)
-{
-  D3D12Texture* T = static_cast<D3D12Texture*>(texture);
-  T->CommitClear();
-
-  const u32 pitch = Common::AlignUp(width * T->GetPixelSize(), D3D12_TEXTURE_DATA_PITCH_ALIGNMENT);
-  const u32 size = pitch * height;
-  const u32 subresource = 0;
-  if (!CheckDownloadBufferSize(size))
-  {
-    Log_ErrorPrintf("Can't read back %ux%u", width, height);
-    return false;
-  }
-
-  if (InRenderPass())
-    EndRenderPass();
-
-  ID3D12GraphicsCommandList4* cmdlist = GetCommandList();
-
-  D3D12_TEXTURE_COPY_LOCATION srcloc;
-  srcloc.pResource = T->GetResource();
-  srcloc.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
-  srcloc.SubresourceIndex = subresource;
-
-  D3D12_TEXTURE_COPY_LOCATION dstloc;
-  dstloc.pResource = m_download_buffer.Get();
-  dstloc.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
-  dstloc.PlacedFootprint.Offset = 0;
-  dstloc.PlacedFootprint.Footprint.Format = T->GetDXGIFormat();
-  dstloc.PlacedFootprint.Footprint.Width = width;
-  dstloc.PlacedFootprint.Footprint.Height = height;
-  dstloc.PlacedFootprint.Footprint.Depth = 1;
-  dstloc.PlacedFootprint.Footprint.RowPitch = pitch;
-
-  const D3D12_RESOURCE_STATES old_layout = T->GetResourceState();
-  if (old_layout != D3D12_RESOURCE_STATE_COPY_SOURCE)
-    T->TransitionSubresourceToState(cmdlist, subresource, old_layout, D3D12_RESOURCE_STATE_COPY_SOURCE);
-
-  // TODO: Rules for depth buffers here?
-  const D3D12_BOX srcbox{static_cast<UINT>(x),         static_cast<UINT>(y),          0u,
-                         static_cast<UINT>(x + width), static_cast<UINT>(y + height), 1u};
-  cmdlist->CopyTextureRegion(&dstloc, 0, 0, 0, &srcloc, &srcbox);
-
-  if (old_layout != D3D12_RESOURCE_STATE_COPY_SOURCE)
-    T->TransitionSubresourceToState(cmdlist, subresource, D3D12_RESOURCE_STATE_COPY_SOURCE, old_layout);
-
-  SubmitCommandList(true);
-
-  u8* map_pointer;
-  const D3D12_RANGE read_range{0u, size};
-  const HRESULT hr = m_download_buffer->Map(0, &read_range, reinterpret_cast<void**>(const_cast<u8**>(&map_pointer)));
-  if (FAILED(hr))
-  {
-    Log_ErrorPrintf("Map() failed with HRESULT %08X", hr);
-    return false;
-  }
-
-  StringUtil::StrideMemCpy(out_data, out_data_stride, map_pointer, pitch, width * T->GetPixelSize(), height);
-  m_download_buffer->Unmap(0, nullptr);
-  return true;
-}
-
-bool D3D12Device::CheckDownloadBufferSize(u32 required_size)
-{
-  if (m_download_buffer_size >= required_size)
-    return true;
-
-  DestroyDownloadBuffer();
-
-  D3D12MA::ALLOCATION_DESC allocation_desc = {};
-  allocation_desc.HeapType = D3D12_HEAP_TYPE_READBACK;
-
-  const D3D12_RESOURCE_DESC resource_desc = {D3D12_RESOURCE_DIMENSION_BUFFER,
-                                             0,
-                                             required_size,
-                                             1,
-                                             1,
-                                             1,
-                                             DXGI_FORMAT_UNKNOWN,
-                                             {1, 0},
-                                             D3D12_TEXTURE_LAYOUT_ROW_MAJOR,
-                                             D3D12_RESOURCE_FLAG_NONE};
-
-  HRESULT hr = m_allocator->CreateResource(&allocation_desc, &resource_desc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr,
-                                           m_download_buffer_allocation.ReleaseAndGetAddressOf(),
-                                           IID_PPV_ARGS(m_download_buffer.ReleaseAndGetAddressOf()));
-  if (FAILED(hr))
-  {
-    Log_ErrorPrintf("CreateResource() failed with HRESULT %08X", hr);
-    return false;
-  }
-
-  return true;
-}
-
-void D3D12Device::DestroyDownloadBuffer()
-{
-  if (!m_download_buffer)
-    return;
-
-  m_download_buffer.Reset();
-  m_download_buffer_allocation.Reset();
-  m_download_buffer_size = 0;
-}
-
 D3D12Sampler::D3D12Sampler(D3D12DescriptorHandle descriptor) : m_descriptor(descriptor)
 {
 }
@@ -934,3 +828,184 @@ std::unique_ptr<GPUTextureBuffer> D3D12Device::CreateTextureBuffer(GPUTextureBuf
 
   return tb;
 }
+
+D3D12DownloadTexture::D3D12DownloadTexture(u32 width, u32 height, GPUTexture::Format format,
+                                           ComPtr<D3D12MA::Allocation> allocation, ComPtr<ID3D12Resource> buffer,
+                                           size_t buffer_size)
+  : GPUDownloadTexture(width, height, format, false), m_allocation(std::move(allocation)), m_buffer(std::move(buffer)),
+    m_buffer_size(buffer_size)
+{ // The 'false' base argument is is_imported: this texture is never backed by caller-supplied memory.
+}
+
+D3D12DownloadTexture::~D3D12DownloadTexture()
+{ // Drops any live CPU mapping, then passes the allocation and buffer to the device's DeferResourceDestruction().
+  if (IsMapped())
+    D3D12DownloadTexture::Unmap(); // explicitly qualified, so the call is resolved statically
+
+  if (m_buffer) // NOTE(review): deferral presumably keeps the buffer alive until queued GPU copies finish - confirm
+    D3D12Device::GetInstance().DeferResourceDestruction(m_allocation.Get(), m_buffer.Get());
+}
+
+std::unique_ptr<D3D12DownloadTexture> D3D12DownloadTexture::Create(u32 width, u32 height, GPUTexture::Format format)
+{ // Allocates a readback-heap buffer sized for width x height texels; returns null if allocation fails.
+  const u32 buffer_size = GetBufferSize(width, height, format, D3D12_TEXTURE_DATA_PITCH_ALIGNMENT);
+
+  D3D12MA::ALLOCATION_DESC allocation_desc = {};
+  allocation_desc.HeapType = D3D12_HEAP_TYPE_READBACK;
+
+  const D3D12_RESOURCE_DESC resource_desc = {D3D12_RESOURCE_DIMENSION_BUFFER, // Dimension
+                                             0, // Alignment
+                                             buffer_size, // Width (the byte size, for a buffer)
+                                             1, // Height
+                                             1, // DepthOrArraySize
+                                             1, // MipLevels
+                                             DXGI_FORMAT_UNKNOWN, // Format
+                                             {1, 0}, // SampleDesc {Count, Quality}
+                                             D3D12_TEXTURE_LAYOUT_ROW_MAJOR, // Layout
+                                             D3D12_RESOURCE_FLAG_NONE}; // Flags
+
+  ComPtr<D3D12MA::Allocation> allocation;
+  ComPtr<ID3D12Resource> buffer;
+
+  HRESULT hr = D3D12Device::GetInstance().GetAllocator()->CreateResource(
+    &allocation_desc, &resource_desc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr, allocation.GetAddressOf(),
+    IID_PPV_ARGS(buffer.GetAddressOf()));
+  if (FAILED(hr))
+  {
+    Log_ErrorFmt("CreateResource() failed with HRESULT {:08X}", hr);
+    return {}; // empty unique_ptr tells the caller creation failed
+  }
+
+  return std::unique_ptr<D3D12DownloadTexture>(
+    new D3D12DownloadTexture(width, height, format, std::move(allocation), std::move(buffer), buffer_size));
+}
+
+void D3D12DownloadTexture::CopyFromTexture(u32 dst_x, u32 dst_y, GPUTexture* src, u32 src_x, u32 src_y, u32 width,
+                                           u32 height, u32 src_layer, u32 src_level, bool use_transfer_pitch)
+{ // Queues a GPU copy of a source rectangle into the readback buffer; Flush() must run before the data is read.
+  D3D12Texture* const src12 = static_cast<D3D12Texture*>(src);
+  D3D12Device& dev = D3D12Device::GetInstance();
+
+  DebugAssert(src12->GetFormat() == m_format);
+  DebugAssert(src_level < src12->GetLevels());
+  DebugAssert((src_x + width) <= src12->GetMipWidth(src_level) && (src_y + height) <= src12->GetMipHeight(src_level));
+  DebugAssert((dst_x + width) <= m_width && (dst_y + height) <= m_height);
+  DebugAssert((dst_x == 0 && dst_y == 0) || !use_transfer_pitch);
+
+  u32 copy_offset, copy_size, copy_rows;
+  m_current_pitch = GetTransferPitch(use_transfer_pitch ? width : m_width, D3D12_TEXTURE_DATA_PITCH_ALIGNMENT);
+  GetTransferSize(dst_x, dst_y, width, height, m_current_pitch, &copy_offset, &copy_size, &copy_rows);
+
+  dev.GetStatistics().num_downloads++;
+  if (dev.InRenderPass()) // the copy is recorded outside of any render pass
+    dev.EndRenderPass();
+  src12->CommitClear();
+
+  if (IsMapped()) // drop the CPU mapping before the buffer contents are overwritten
+    Unmap();
+
+  ID3D12GraphicsCommandList* cmdlist = dev.GetCommandList();
+  GL_INS_FMT("ReadbackTexture: {{{},{}}} {}x{} => {{{},{}}}", src_x, src_y, width, height, dst_x, dst_y);
+
+  D3D12_TEXTURE_COPY_LOCATION srcloc;
+  srcloc.pResource = src12->GetResource();
+  srcloc.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
+  srcloc.SubresourceIndex = src12->CalculateSubresource(src_layer, src_level);
+
+  D3D12_TEXTURE_COPY_LOCATION dstloc;
+  dstloc.pResource = m_buffer.Get();
+  dstloc.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
+  dstloc.PlacedFootprint.Offset = copy_offset;
+  dstloc.PlacedFootprint.Footprint.Format = src12->GetDXGIFormat();
+  dstloc.PlacedFootprint.Footprint.Width = width;
+  dstloc.PlacedFootprint.Footprint.Height = height;
+  dstloc.PlacedFootprint.Footprint.Depth = 1;
+  dstloc.PlacedFootprint.Footprint.RowPitch = m_current_pitch;
+  // Transition exactly the subresource being read (layer and level, not just the mip index); restored after the copy.
+  const D3D12_RESOURCE_STATES old_layout = src12->GetResourceState();
+  if (old_layout != D3D12_RESOURCE_STATE_COPY_SOURCE)
+    src12->TransitionSubresourceToState(cmdlist, srcloc.SubresourceIndex, old_layout, D3D12_RESOURCE_STATE_COPY_SOURCE);
+
+  // TODO: Rules for depth buffers here?
+  const D3D12_BOX srcbox{static_cast<UINT>(src_x),         static_cast<UINT>(src_y),          0u,
+                         static_cast<UINT>(src_x + width), static_cast<UINT>(src_y + height), 1u};
+  cmdlist->CopyTextureRegion(&dstloc, 0, 0, 0, &srcloc, &srcbox);
+
+  if (old_layout != D3D12_RESOURCE_STATE_COPY_SOURCE)
+    src12->TransitionSubresourceToState(cmdlist, srcloc.SubresourceIndex, D3D12_RESOURCE_STATE_COPY_SOURCE, old_layout);
+  // Remember which fence value covers this copy so Flush() knows what to wait for.
+  m_copy_fence_value = dev.GetCurrentFenceValue();
+  m_needs_flush = true;
+}
+
+bool D3D12DownloadTexture::Map(u32 x, u32 y, u32 width, u32 height)
+{ // Maps the buffer for CPU reads of the given rectangle; returns true if it is (or already was) mapped.
+  if (IsMapped())
+    return true;
+
+  // m_current_pitch is only set by CopyFromTexture(); zero means nothing was ever copied into the buffer.
+  if (!m_current_pitch)
+    return false;
+
+  u32 copy_offset, copy_size, copy_rows;
+  GetTransferSize(x, y, width, height, m_current_pitch, &copy_offset, &copy_size, &copy_rows);
+  // NOTE(review): with x > 0 this End lies x * bytes_per_pixel past the last requested row - confirm that is intended.
+  const D3D12_RANGE read_range{copy_offset, copy_offset + m_current_pitch * copy_rows};
+  const HRESULT hr = m_buffer->Map(0, &read_range, reinterpret_cast<void**>(const_cast<u8**>(&m_map_pointer)));
+  if (FAILED(hr))
+  {
+    Log_ErrorFmt("Map() failed with HRESULT {:08X}", hr);
+    return false;
+  }
+
+  return true;
+}
+
+void D3D12DownloadTexture::Unmap()
+{ // Releases the CPU mapping created by Map(); does nothing when the buffer is not mapped.
+  if (!IsMapped())
+    return;
+
+  const D3D12_RANGE write_range = {}; // empty written range (Begin == End == 0) is passed to Unmap()
+  m_buffer->Unmap(0, &write_range);
+  m_map_pointer = nullptr; // IsMapped() tests this pointer
+}
+
+void D3D12DownloadTexture::Flush()
+{
+  // Only do work if CopyFromTexture() queued a copy since the previous flush.
+  if (m_needs_flush)
+  {
+    m_needs_flush = false;
+    D3D12Device& device = D3D12Device::GetInstance();
+    // Fence not reached: wait on it if the copy was already submitted, else submit the open command list now.
+    if (device.GetCompletedFenceValue() < m_copy_fence_value)
+    {
+      if (device.GetCurrentFenceValue() != m_copy_fence_value)
+        device.WaitForFence(m_copy_fence_value);
+      else
+        device.SubmitCommandList(true);
+    }
+  }
+}
+
+void D3D12DownloadTexture::SetDebugName(std::string_view name)
+{
+  // An empty name is ignored, leaving any existing label on the buffer untouched;
+  // otherwise the name is attached to the underlying buffer resource.
+  if (!name.empty())
+    D3D12::SetObjectName(m_buffer.Get(), name);
+}
+
+std::unique_ptr<GPUDownloadTexture> D3D12Device::CreateDownloadTexture(u32 width, u32 height, GPUTexture::Format format)
+{ // Device-allocated variant: forwards to D3D12DownloadTexture::Create(), which returns null on failure.
+  return D3D12DownloadTexture::Create(width, height, format);
+}
+
+std::unique_ptr<GPUDownloadTexture> D3D12Device::CreateDownloadTexture(u32 width, u32 height, GPUTexture::Format format,
+                                                                       void* memory, size_t memory_size,
+                                                                       u32 memory_stride)
+{ // Imported-memory variant: not supported by this backend, so every argument is ignored.
+  Log_ErrorPrint("D3D12 cannot import memory for download textures");
+  return {}; // always fails with a null pointer
+}
diff --git a/src/util/d3d12_texture.h b/src/util/d3d12_texture.h
index 277e22c73..4186f4ad3 100644
--- a/src/util/d3d12_texture.h
+++ b/src/util/d3d12_texture.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
 
 #pragma once
@@ -146,3 +146,34 @@ private:
   D3D12StreamBuffer m_buffer;
   D3D12DescriptorHandle m_descriptor;
 };
+
+class D3D12DownloadTexture final : public GPUDownloadTexture
+{ // D3D12 implementation of GPUDownloadTexture, backed by a buffer in a readback heap.
+public:
+  template<typename T>
+  using ComPtr = Microsoft::WRL::ComPtr<T>;
+
+  ~D3D12DownloadTexture() override;
+  /// Allocates the buffer for a width x height image of the given format; returns null on failure.
+  static std::unique_ptr<D3D12DownloadTexture> Create(u32 width, u32 height, GPUTexture::Format format);
+
+  void CopyFromTexture(u32 dst_x, u32 dst_y, GPUTexture* src, u32 src_x, u32 src_y, u32 width, u32 height,
+                       u32 src_layer, u32 src_level, bool use_transfer_pitch) override;
+
+  bool Map(u32 x, u32 y, u32 width, u32 height) override;
+  void Unmap() override;
+
+  void Flush() override;
+
+  void SetDebugName(std::string_view name) override;
+
+private:
+  D3D12DownloadTexture(u32 width, u32 height, GPUTexture::Format format, ComPtr<D3D12MA::Allocation> allocation,
+                       ComPtr<ID3D12Resource> buffer, size_t buffer_size);
+
+  ComPtr<D3D12MA::Allocation> m_allocation; // allocation that backs m_buffer
+  ComPtr<ID3D12Resource> m_buffer; // destination of the copies recorded by CopyFromTexture()
+
+  u64 m_copy_fence_value = 0; // device fence value captured by the last CopyFromTexture(); Flush() waits on it
+  size_t m_buffer_size = 0; // size in bytes of m_buffer, as passed to the constructor
+};
diff --git a/src/util/gpu_device.h b/src/util/gpu_device.h
index 33899aa6e..4c74e5746 100644
--- a/src/util/gpu_device.h
+++ b/src/util/gpu_device.h
@@ -439,6 +439,7 @@ public:
     FEATURE_MASK_TEXTURE_BUFFERS = (1 << 2),
     FEATURE_MASK_GEOMETRY_SHADERS = (1 << 3),
     FEATURE_MASK_TEXTURE_COPY_TO_SELF = (1 << 4),
+    FEATURE_MASK_MEMORY_IMPORT = (1 << 5),
   };
 
   struct Features
@@ -452,6 +453,7 @@ public:
     bool texture_buffers_emulated_with_ssbo : 1;
     bool geometry_shaders : 1;
     bool partial_msaa_resolve : 1;
+    bool memory_import : 1;
     bool gpu_timing : 1;
     bool shader_cache : 1;
     bool pipeline_cache : 1;
@@ -583,8 +585,12 @@ public:
   void RecycleTexture(std::unique_ptr<GPUTexture> texture);
   void PurgeTexturePool();
 
-  virtual bool DownloadTexture(GPUTexture* texture, u32 x, u32 y, u32 width, u32 height, void* out_data,
-                               u32 out_data_stride) = 0;
+  virtual std::unique_ptr<GPUDownloadTexture> CreateDownloadTexture(u32 width, u32 height,
+                                                                    GPUTexture::Format format) = 0;
+  virtual std::unique_ptr<GPUDownloadTexture> CreateDownloadTexture(u32 width, u32 height, GPUTexture::Format format,
+                                                                    void* memory, size_t memory_size,
+                                                                    u32 memory_stride) = 0;
+
   virtual void CopyTextureRegion(GPUTexture* dst, u32 dst_x, u32 dst_y, u32 dst_layer, u32 dst_level, GPUTexture* src,
                                  u32 src_x, u32 src_y, u32 src_layer, u32 src_level, u32 width, u32 height) = 0;
   virtual void ResolveTextureRegion(GPUTexture* dst, u32 dst_x, u32 dst_y, u32 dst_layer, u32 dst_level,
diff --git a/src/util/gpu_texture.cpp b/src/util/gpu_texture.cpp
index afb582548..e6786ecd1 100644
--- a/src/util/gpu_texture.cpp
+++ b/src/util/gpu_texture.cpp
@@ -1,9 +1,10 @@
-// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
 
 #include "gpu_texture.h"
 #include "gpu_device.h"
 
+#include "common/align.h"
 #include "common/bitutils.h"
 #include "common/log.h"
 #include "common/string_util.h"
@@ -52,6 +53,68 @@ const char* GPUTexture::GetFormatName(Format format)
   return format_names[static_cast<u8>(format)];
 }
 
+u32 GPUTexture::GetCompressedBytesPerBlock() const
+{ // Instance convenience wrapper: forwards this texture's own format to the static overload.
+  return GetCompressedBytesPerBlock(m_format);
+}
+
+u32 GPUTexture::GetCompressedBytesPerBlock(Format format)
+{
+  // TODO: Implement me. For now this simply returns the per-pixel size of the format.
+  return GetPixelSize(format);
+}
+
+u32 GPUTexture::GetCompressedBlockSize() const
+{ // Instance convenience wrapper: forwards this texture's own format to the static overload.
+  return GetCompressedBlockSize(m_format);
+}
+
+u32 GPUTexture::GetCompressedBlockSize(Format format)
+{
+  // TODO: Implement me. The BC1..BC7 branch below is disabled, so every format reports a block size of 1.
+  /*if (format >= Format::BC1 && format <= Format::BC7)
+    return 4;
+  else*/
+  return 1;
+}
+
+u32 GPUTexture::CalcUploadPitch(Format format, u32 width)
+{ // Bytes per row for an upload `width` texels wide; the block-compressed adjustment below is still disabled.
+  /*
+  if (format >= Format::BC1 && format <= Format::BC7)
+    width = Common::AlignUpPow2(width, 4) / 4;
+  */
+  return width * GetCompressedBytesPerBlock(format);
+}
+
+u32 GPUTexture::CalcUploadPitch(u32 width) const
+{ // Instance convenience wrapper: forwards this texture's own format to the static overload.
+  return CalcUploadPitch(m_format, width);
+}
+
+u32 GPUTexture::CalcUploadRowLengthFromPitch(u32 pitch) const
+{ // Instance convenience wrapper: forwards this texture's own format to the static overload.
+  return CalcUploadRowLengthFromPitch(m_format, pitch);
+}
+
+u32 GPUTexture::CalcUploadRowLengthFromPitch(Format format, u32 pitch)
+{ // Rounds the pitch up to whole blocks, then converts the block count back into texels.
+  const u32 block_bytes = GetCompressedBytesPerBlock(format);
+  const u32 blocks_per_row = (pitch + block_bytes - 1) / block_bytes;
+  return blocks_per_row * GetCompressedBlockSize(format);
+}
+
+u32 GPUTexture::CalcUploadSize(u32 height, u32 pitch) const
+{ // Instance convenience wrapper: forwards this texture's own format to the static overload.
+  return CalcUploadSize(m_format, height, pitch);
+}
+
+u32 GPUTexture::CalcUploadSize(Format format, u32 height, u32 pitch)
+{ // Total bytes: one pitch per row of blocks, with the row count rounded up to whole blocks.
+  const u32 rows_per_block = GetCompressedBlockSize(format);
+  return ((height + rows_per_block - 1) / rows_per_block) * pitch;
+}
+
 std::array<float, 4> GPUTexture::GetUNormClearColor() const
 {
   return GPUDevice::RGBA8ToFloat(m_clear_value.color);
@@ -117,6 +180,12 @@ bool GPUTexture::IsDepthStencilFormat(Format format)
   return false;
 }
 
+bool GPUTexture::IsCompressedFormat(Format format)
+{
+  // TODO: Implement me. Until then no format is ever reported as block-compressed.
+  return false;
+}
+
 bool GPUTexture::ValidateConfig(u32 width, u32 height, u32 layers, u32 levels, u32 samples, Type type, Format format)
 {
   if (width > MAX_WIDTH || height > MAX_HEIGHT || layers > MAX_LAYERS || levels > MAX_LEVELS || samples > MAX_SAMPLES)
@@ -161,7 +230,7 @@ bool GPUTexture::ValidateConfig(u32 width, u32 height, u32 layers, u32 levels, u
   return true;
 }
 
-bool GPUTexture::ConvertTextureDataToRGBA8(u32 width, u32 height, std::vector<u32>& texture_data,
+bool GPUTexture::ConvertTextureDataToRGBA8(u32 width, u32 height, std::vector<u8>& texture_data,
                                            u32& texture_data_stride, GPUTexture::Format format)
 {
   switch (format)
@@ -170,9 +239,15 @@ bool GPUTexture::ConvertTextureDataToRGBA8(u32 width, u32 height, std::vector<u3
     {
       for (u32 y = 0; y < height; y++)
       {
-        u32* pixels = reinterpret_cast<u32*>(reinterpret_cast<u8*>(texture_data.data()) + (y * texture_data_stride));
+        u8* pixels = texture_data.data() + (y * texture_data_stride);
         for (u32 x = 0; x < width; x++)
-          pixels[x] = (pixels[x] & 0xFF00FF00) | ((pixels[x] & 0xFF) << 16) | ((pixels[x] >> 16) & 0xFF);
+        {
+          u32 pixel;
+          std::memcpy(&pixel, pixels, sizeof(pixel));
+          pixel = (pixel & 0xFF00FF00) | ((pixel & 0xFF) << 16) | ((pixel >> 16) & 0xFF);
+          std::memcpy(pixels, &pixel, sizeof(pixel));
+          pixels += sizeof(pixel);
+        }
       }
 
       return true;
@@ -183,12 +258,12 @@ bool GPUTexture::ConvertTextureDataToRGBA8(u32 width, u32 height, std::vector<u3
 
     case Format::RGB565:
     {
-      std::vector<u32> temp(width * height);
+      std::vector<u8> temp(width * height * sizeof(u32));
 
       for (u32 y = 0; y < height; y++)
       {
-        const u8* pixels_in = reinterpret_cast<u8*>(texture_data.data()) + (y * texture_data_stride);
-        u32* pixels_out = &temp[y * width];
+        const u8* pixels_in = texture_data.data() + (y * texture_data_stride);
+        u8* pixels_out = &temp[y * width * sizeof(u32)];
 
         for (u32 x = 0; x < width; x++)
         {
@@ -199,8 +274,10 @@ bool GPUTexture::ConvertTextureDataToRGBA8(u32 width, u32 height, std::vector<u3
           const u8 r5 = Truncate8(pixel_in >> 11);
           const u8 g6 = Truncate8((pixel_in >> 5) & 0x3F);
           const u8 b5 = Truncate8(pixel_in & 0x1F);
-          *(pixels_out++) = ZeroExtend32((r5 << 3) | (r5 & 7)) | (ZeroExtend32((g6 << 2) | (g6 & 3)) << 8) |
+          const u32 rgba8 = ZeroExtend32((r5 << 3) | (r5 & 7)) | (ZeroExtend32((g6 << 2) | (g6 & 3)) << 8) |
                             (ZeroExtend32((b5 << 3) | (b5 & 7)) << 16) | (0xFF000000u);
+          std::memcpy(pixels_out, &rgba8, sizeof(u32));
+          pixels_out += sizeof(u32);
         }
       }
 
@@ -211,12 +288,12 @@ bool GPUTexture::ConvertTextureDataToRGBA8(u32 width, u32 height, std::vector<u3
 
     case Format::RGBA5551:
     {
-      std::vector<u32> temp(width * height);
+      std::vector<u8> temp(width * height * sizeof(u32));
 
       for (u32 y = 0; y < height; y++)
       {
-        const u8* pixels_in = reinterpret_cast<u8*>(texture_data.data()) + (y * texture_data_stride);
-        u32* pixels_out = &temp[y * width];
+        const u8* pixels_in = texture_data.data() + (y * texture_data_stride);
+        u8* pixels_out = &temp[y * width * sizeof(u32)]; // temp is a byte vector: each output row is width * 4 bytes
 
         for (u32 x = 0; x < width; x++)
         {
@@ -228,8 +305,10 @@ bool GPUTexture::ConvertTextureDataToRGBA8(u32 width, u32 height, std::vector<u3
           const u8 r5 = Truncate8((pixel_in >> 10) & 0x1F);
           const u8 g6 = Truncate8((pixel_in >> 5) & 0x1F);
           const u8 b5 = Truncate8(pixel_in & 0x1F);
-          *(pixels_out++) = ZeroExtend32((r5 << 3) | (r5 & 7)) | (ZeroExtend32((g6 << 3) | (g6 & 7)) << 8) |
+          const u32 rgba8 = ZeroExtend32((r5 << 3) | (r5 & 7)) | (ZeroExtend32((g6 << 3) | (g6 & 7)) << 8) |
                             (ZeroExtend32((b5 << 3) | (b5 & 7)) << 16) | (a1 ? 0xFF000000u : 0u);
+          std::memcpy(pixels_out, &rgba8, sizeof(u32));
+          pixels_out += sizeof(u32);
         }
       }
 
@@ -244,13 +323,13 @@ bool GPUTexture::ConvertTextureDataToRGBA8(u32 width, u32 height, std::vector<u3
   }
 }
 
-void GPUTexture::FlipTextureDataRGBA8(u32 width, u32 height, std::vector<u32>& texture_data, u32 texture_data_stride)
+void GPUTexture::FlipTextureDataRGBA8(u32 width, u32 height, std::vector<u8>& texture_data, u32 texture_data_stride)
 {
-  std::vector<u32> temp(width);
+  std::vector<u8> temp(texture_data_stride); // a full row of stride bytes is memcpy'd through this scratch buffer
   for (u32 flip_row = 0; flip_row < (height / 2); flip_row++)
   {
-    u32* top_ptr = &texture_data[flip_row * width];
-    u32* bottom_ptr = &texture_data[((height - 1) - flip_row) * width];
+    u8* top_ptr = &texture_data[flip_row * texture_data_stride];
+    u8* bottom_ptr = &texture_data[((height - 1) - flip_row) * texture_data_stride];
     std::memcpy(temp.data(), top_ptr, texture_data_stride);
     std::memcpy(top_ptr, bottom_ptr, texture_data_stride);
     std::memcpy(bottom_ptr, temp.data(), texture_data_stride);
@@ -260,3 +339,56 @@ void GPUTexture::FlipTextureDataRGBA8(u32 width, u32 height, std::vector<u32>& t
 void GPUTexture::MakeReadyForSampling()
 {
 }
+
+GPUDownloadTexture::GPUDownloadTexture(u32 width, u32 height, GPUTexture::Format format, bool is_imported)
+  : m_width(width), m_height(height), m_format(format), m_is_imported(is_imported)
+{ // Only records the description; map pointer, pitch and flush state keep their in-class defaults.
+}
+
+GPUDownloadTexture::~GPUDownloadTexture() = default;
+
+u32 GPUDownloadTexture::GetBufferSize(u32 width, u32 height, GPUTexture::Format format, u32 pitch_align /* = 1 */)
+{
+  // Bytes needed for `height` rows whose pitch is the packed row size rounded up to pitch_align.
+  DebugAssert(std::has_single_bit(pitch_align));
+  const u32 packed_row_bytes = width * GPUTexture::GetPixelSize(format);
+  return Common::AlignUpPow2(packed_row_bytes, pitch_align) * height;
+}
+
+u32 GPUDownloadTexture::GetTransferPitch(u32 width, u32 pitch_align) const
+{
+  // Packed row size for this texture's format, rounded up to the (power-of-two) alignment.
+  DebugAssert(std::has_single_bit(pitch_align));
+  return Common::AlignUpPow2(width * GPUTexture::GetPixelSize(m_format), pitch_align);
+}
+
+void GPUDownloadTexture::GetTransferSize(u32 x, u32 y, u32 width, u32 height, u32 pitch, u32* copy_offset,
+                                         u32* copy_size, u32* copy_rows) const
+{ // Outputs the byte offset of texel (x, y), the bytes to copy per row, and the number of rows.
+  const u32 texel_bytes = GPUTexture::GetPixelSize(m_format);
+  *copy_offset = (pitch * y) + (texel_bytes * x);
+  *copy_size = texel_bytes * width;
+  *copy_rows = height;
+}
+
+bool GPUDownloadTexture::ReadTexels(u32 x, u32 y, u32 width, u32 height, void* out_ptr, u32 out_stride)
+{ // Copies a rectangle of previously downloaded texels to out_ptr; returns false if the buffer cannot be mapped.
+  if (m_needs_flush) // a queued copy has not been waited on yet
+    Flush();
+
+  // If the destination is the very memory we are mapped to (imported buffer), the data is already in place.
+  if (m_map_pointer == out_ptr)
+  {
+    // Nothing is copied in that case, so the request must describe the buffer as-is (origin 0,0 and same pitch).
+    DebugAssert(x == 0 && y == 0 && width <= m_width && height <= m_height && out_stride == m_current_pitch);
+    return true;
+  }
+
+  if (!Map(x, y, width, height))
+    return false;
+  // Note: Unmap() is not invoked here, so the texture stays mapped after this call.
+  u32 copy_offset, copy_size, copy_rows;
+  GetTransferSize(x, y, width, height, m_current_pitch, &copy_offset, &copy_size, &copy_rows);
+  StringUtil::StrideMemCpy(out_ptr, out_stride, m_map_pointer + copy_offset, m_current_pitch, copy_size, copy_rows);
+  return true;
+}
diff --git a/src/util/gpu_texture.h b/src/util/gpu_texture.h
index 9dec6e654..d0369b946 100644
--- a/src/util/gpu_texture.h
+++ b/src/util/gpu_texture.h
@@ -80,11 +80,18 @@ public:
   static u32 GetPixelSize(GPUTexture::Format format);
   static bool IsDepthFormat(GPUTexture::Format format);
   static bool IsDepthStencilFormat(GPUTexture::Format format);
+  static bool IsCompressedFormat(Format format);
+  static u32 GetCompressedBytesPerBlock(Format format);
+  static u32 GetCompressedBlockSize(Format format);
+  static u32 CalcUploadPitch(Format format, u32 width);
+  static u32 CalcUploadRowLengthFromPitch(Format format, u32 pitch);
+  static u32 CalcUploadSize(Format format, u32 height, u32 pitch);
+
   static bool ValidateConfig(u32 width, u32 height, u32 layers, u32 levels, u32 samples, Type type, Format format);
 
-  static bool ConvertTextureDataToRGBA8(u32 width, u32 height, std::vector<u32>& texture_data, u32& texture_data_stride,
+  static bool ConvertTextureDataToRGBA8(u32 width, u32 height, std::vector<u8>& texture_data, u32& texture_data_stride,
                                         GPUTexture::Format format);
-  static void FlipTextureDataRGBA8(u32 width, u32 height, std::vector<u32>& texture_data, u32 texture_data_stride);
+  static void FlipTextureDataRGBA8(u32 width, u32 height, std::vector<u8>& texture_data, u32 texture_data_stride);
 
   ALWAYS_INLINE u32 GetWidth() const { return m_width; }
   ALWAYS_INLINE u32 GetHeight() const { return m_height; }
@@ -133,6 +140,12 @@ public:
 
   size_t GetVRAMUsage() const;
 
+  u32 GetCompressedBytesPerBlock() const;
+  u32 GetCompressedBlockSize() const;
+  u32 CalcUploadPitch(u32 width) const;
+  u32 CalcUploadRowLengthFromPitch(u32 pitch) const;
+  u32 CalcUploadSize(u32 height, u32 pitch) const;
+
   GPUTexture& operator=(const GPUTexture&) = delete;
 
   virtual bool Update(u32 x, u32 y, u32 width, u32 height, const void* data, u32 pitch, u32 layer = 0,
@@ -160,3 +173,71 @@ protected:
 
   ClearValue m_clear_value = {};
 };
+
+class GPUDownloadTexture
+{ // CPU-readable destination for texture downloads; each backend implements the copy/map/flush operations.
+public:
+  GPUDownloadTexture(u32 width, u32 height, GPUTexture::Format format, bool is_imported);
+  virtual ~GPUDownloadTexture();
+
+  /// Basically, this has dimensions only because of DX11.
+  ALWAYS_INLINE u32 GetWidth() const { return m_width; }
+  ALWAYS_INLINE u32 GetHeight() const { return m_height; }
+  ALWAYS_INLINE GPUTexture::Format GetFormat() const { return m_format; }
+  ALWAYS_INLINE bool NeedsFlush() const { return m_needs_flush; }
+  ALWAYS_INLINE bool IsMapped() const { return (m_map_pointer != nullptr); }
+  ALWAYS_INLINE bool IsImported() const { return m_is_imported; }
+  ALWAYS_INLINE const u8* GetMapPointer() const { return m_map_pointer; }
+  ALWAYS_INLINE u32 GetMapPitch() const { return m_current_pitch; }
+
+  /// Calculates the pitch of a transfer.
+  u32 GetTransferPitch(u32 width, u32 pitch_align) const;
+
+  /// Calculates the size of the data you should transfer.
+  void GetTransferSize(u32 x, u32 y, u32 width, u32 height, u32 pitch, u32* copy_offset, u32* copy_size,
+                       u32* copy_rows) const;
+
+  /// Queues a copy from the specified texture to this buffer.
+  /// Does not complete immediately, you should flush before accessing the buffer.
+  /// use_transfer_pitch should be true if there's only a single texture being copied to this buffer before
+  /// it will be used. This allows the image to be packed tighter together, and buffer reuse.
+  virtual void CopyFromTexture(u32 dst_x, u32 dst_y, GPUTexture* src, u32 src_x, u32 src_y, u32 width, u32 height,
+                               u32 src_layer, u32 src_level, bool use_transfer_pitch = true) = 0;
+
+  /// Maps the texture into the CPU address space, enabling it to read the contents.
+  /// The Map call may not perform synchronization. If the contents of the staging texture
+  /// have been updated by a CopyFromTexture() call, you must call Flush() first.
+  /// If persistent mapping is supported in the backend, this may be a no-op.
+  virtual bool Map(u32 x, u32 y, u32 width, u32 height) = 0;
+
+  /// Unmaps the CPU-readable copy of the texture. May be a no-op on backends which
+  /// support persistent-mapped buffers.
+  virtual void Unmap() = 0;
+
+  /// Flushes pending writes from the CPU to the GPU, and reads from the GPU to the CPU.
+  /// This may cause a command buffer submit depending on if one has occurred between the last
+  /// call to CopyFromTexture() and the Flush() call.
+  virtual void Flush() = 0;
+
+  /// Sets object name that will be displayed in graphics debuggers.
+  virtual void SetDebugName(std::string_view name) = 0;
+
+  /// Reads the specified rectangle from the staging texture to out_ptr, with the specified stride
+  /// (length in bytes of each row). CopyFromTexture() must be called first. The contents of any
+  /// texels outside of the rectangle used for CopyFromTexture is undefined.
+  bool ReadTexels(u32 x, u32 y, u32 width, u32 height, void* out_ptr, u32 out_stride);
+
+  /// Returns what the size of the specified texture would be, in bytes.
+  static u32 GetBufferSize(u32 width, u32 height, GPUTexture::Format format, u32 pitch_align = 1);
+
+protected:
+  u32 m_width;
+  u32 m_height;
+  GPUTexture::Format m_format;
+
+  const u8* m_map_pointer = nullptr; // CPU address of the mapped data; null while unmapped (see IsMapped())
+  u32 m_current_pitch = 0; // row pitch in bytes reported by GetMapPitch(); stays 0 until a backend sets it
+
+  bool m_is_imported = false; // constructor argument, reported by IsImported()
+  bool m_needs_flush = false; // reported by NeedsFlush(); ReadTexels() calls Flush() while this is set
+};
diff --git a/src/util/metal_device.h b/src/util/metal_device.h
index 5df1242a2..d4f35b182 100644
--- a/src/util/metal_device.h
+++ b/src/util/metal_device.h
@@ -137,6 +137,34 @@ private:
   u8 m_map_level = 0;
 };
 
+class MetalDownloadTexture final : public GPUDownloadTexture
+{ // Metal implementation of GPUDownloadTexture.
+public:
+  ~MetalDownloadTexture() override;
+  /// NOTE(review): memory is presumably null when the buffer should be device-allocated rather than imported - confirm.
+  static std::unique_ptr<MetalDownloadTexture> Create(u32 width, u32 height, GPUTexture::Format format, void* memory,
+                                                      size_t memory_size, u32 memory_stride);
+
+  void CopyFromTexture(u32 dst_x, u32 dst_y, GPUTexture* src, u32 src_x, u32 src_y, u32 width, u32 height,
+                       u32 src_layer, u32 src_level, bool use_transfer_pitch) override;
+
+  bool Map(u32 x, u32 y, u32 width, u32 height) override;
+  void Unmap() override;
+
+  void Flush() override;
+
+  void SetDebugName(std::string_view name) override;
+
+private:
+  MetalDownloadTexture(u32 width, u32 height, GPUTexture::Format format, u8* import_buffer, size_t buffer_offset,
+                       id<MTLBuffer> buffer, const u8* map_ptr, u32 map_pitch);
+
+  size_t m_buffer_offset = 0; // NOTE(review): presumably the byte offset of the image data within m_buffer - confirm
+  id<MTLBuffer> m_buffer = nil;
+
+  u64 m_copy_fence_counter = 0; // NOTE(review): presumably identifies the last queued copy for Flush() - confirm
+};
+
 class MetalTextureBuffer final : public GPUTextureBuffer
 {
 public:
@@ -160,6 +188,7 @@ private:
 class MetalDevice final : public GPUDevice
 {
   friend MetalTexture;
+  friend MetalDownloadTexture;
 
 public:
   ALWAYS_INLINE static MetalDevice& GetInstance() { return *static_cast<MetalDevice*>(g_gpu_device.get()); }
@@ -188,8 +217,11 @@ public:
   std::unique_ptr<GPUSampler> CreateSampler(const GPUSampler::Config& config) override;
   std::unique_ptr<GPUTextureBuffer> CreateTextureBuffer(GPUTextureBuffer::Format format, u32 size_in_elements) override;
 
-  bool DownloadTexture(GPUTexture* texture, u32 x, u32 y, u32 width, u32 height, void* out_data,
-                       u32 out_data_stride) override;
+  std::unique_ptr<GPUDownloadTexture> CreateDownloadTexture(u32 width, u32 height, GPUTexture::Format format) override;
+  std::unique_ptr<GPUDownloadTexture> CreateDownloadTexture(u32 width, u32 height, GPUTexture::Format format,
+                                                            void* memory, size_t memory_size,
+                                                            u32 memory_stride) override;
+
   bool SupportsTextureFormat(GPUTexture::Format format) const override;
   void CopyTextureRegion(GPUTexture* dst, u32 dst_x, u32 dst_y, u32 dst_layer, u32 dst_level, GPUTexture* src,
                          u32 src_x, u32 src_y, u32 src_layer, u32 src_level, u32 width, u32 height) override;
@@ -302,8 +334,6 @@ private:
   void SetViewportInRenderEncoder();
   void SetScissorInRenderEncoder();
 
-  bool CheckDownloadBufferSize(u32 required_size);
-
   bool CreateLayer();
   void DestroyLayer();
   void RenderBlankFrame();
@@ -327,9 +357,6 @@ private:
 
   DepthStateMap m_depth_states;
 
-  id<MTLBuffer> m_download_buffer = nil;
-  u32 m_download_buffer_size = 0;
-
   MetalStreamBuffer m_vertex_buffer;
   MetalStreamBuffer m_index_buffer;
   MetalStreamBuffer m_uniform_buffer;
diff --git a/src/util/metal_device.mm b/src/util/metal_device.mm
index 3757377b7..e3bd58b13 100644
--- a/src/util/metal_device.mm
+++ b/src/util/metal_device.mm
@@ -234,6 +234,7 @@ void MetalDevice::SetFeatures(FeatureMask disabled_features)
   m_features.texture_buffers_emulated_with_ssbo = true;
   m_features.geometry_shaders = false;
   m_features.partial_msaa_resolve = false;
+  m_features.memory_import = true;
   m_features.shader_cache = true;
   m_features.pipeline_cache = false;
   m_features.prefer_unused_textures = true;
@@ -499,13 +500,6 @@ bool MetalDevice::CreateBuffers()
 
 void MetalDevice::DestroyBuffers()
 {
-  if (m_download_buffer != nil)
-  {
-    [m_download_buffer release];
-    m_download_buffer = nil;
-    m_download_buffer_size = 0;
-  }
-
   m_texture_upload_buffer.Destroy();
   m_uniform_buffer.Destroy();
   m_vertex_buffer.Destroy();
@@ -759,17 +753,17 @@ std::unique_ptr<GPUPipeline> MetalDevice::CreatePipeline(const GPUPipeline::Grap
     static constexpr u32 MAX_COMPONENTS = 4;
     static constexpr const MTLVertexFormat
       format_mapping[static_cast<u8>(GPUPipeline::VertexAttribute::Type::MaxCount)][MAX_COMPONENTS] = {
-        {MTLVertexFormatFloat, MTLVertexFormatFloat2, MTLVertexFormatFloat3, MTLVertexFormatFloat4},     // Float
-        {MTLVertexFormatUChar, MTLVertexFormatUChar2, MTLVertexFormatUChar3, MTLVertexFormatUChar4},     // UInt8
-        {MTLVertexFormatChar, MTLVertexFormatChar2, MTLVertexFormatChar3, MTLVertexFormatChar4},         // SInt8
+        {MTLVertexFormatFloat, MTLVertexFormatFloat2, MTLVertexFormatFloat3, MTLVertexFormatFloat4}, // Float
+        {MTLVertexFormatUChar, MTLVertexFormatUChar2, MTLVertexFormatUChar3, MTLVertexFormatUChar4}, // UInt8
+        {MTLVertexFormatChar, MTLVertexFormatChar2, MTLVertexFormatChar3, MTLVertexFormatChar4},     // SInt8
         {MTLVertexFormatUCharNormalized, MTLVertexFormatUChar2Normalized, MTLVertexFormatUChar3Normalized,
          MTLVertexFormatUChar4Normalized},                                                               // UNorm8
         {MTLVertexFormatUShort, MTLVertexFormatUShort2, MTLVertexFormatUShort3, MTLVertexFormatUShort4}, // UInt16
         {MTLVertexFormatShort, MTLVertexFormatShort2, MTLVertexFormatShort3, MTLVertexFormatShort4},     // SInt16
         {MTLVertexFormatUShortNormalized, MTLVertexFormatUShort2Normalized, MTLVertexFormatUShort3Normalized,
-         MTLVertexFormatUShort4Normalized},                                                              // UNorm16
-        {MTLVertexFormatUInt, MTLVertexFormatUInt2, MTLVertexFormatUInt3, MTLVertexFormatUInt4},         // UInt32
-        {MTLVertexFormatInt, MTLVertexFormatInt2, MTLVertexFormatInt3, MTLVertexFormatInt4},             // SInt32
+         MTLVertexFormatUShort4Normalized},                                                      // UNorm16
+        {MTLVertexFormatUInt, MTLVertexFormatUInt2, MTLVertexFormatUInt3, MTLVertexFormatUInt4}, // UInt32
+        {MTLVertexFormatInt, MTLVertexFormatInt2, MTLVertexFormatInt3, MTLVertexFormatInt4},     // SInt32
       };
 
     static constexpr std::array<MTLCullMode, static_cast<u32>(GPUPipeline::CullMode::MaxCount)> cull_mapping = {{
@@ -1132,6 +1126,194 @@ std::unique_ptr<GPUTexture> MetalDevice::CreateTexture(u32 width, u32 height, u3
   }
 }
 
+/// Wraps an MTLBuffer as a download target, taking over the caller's reference to `buffer`.
+/// `import_buffer` is only compared against null to flag the texture as imported; `buffer_offset` is the byte
+/// offset inside `buffer` that GPU copies start from; `map_ptr`/`map_pitch` seed the CPU-side pointer and pitch.
+MetalDownloadTexture::MetalDownloadTexture(u32 width, u32 height, GPUTexture::Format format, u8* import_buffer,
+                                           size_t buffer_offset, id<MTLBuffer> buffer, const u8* map_ptr, u32 map_pitch)
+  : GPUDownloadTexture(width, height, format, (import_buffer != nullptr)), m_buffer_offset(buffer_offset),
+    m_buffer(buffer)
+{
+  m_map_pointer = map_ptr;
+  m_current_pitch = map_pitch;
+}
+
+/// Drops the buffer reference that was handed to the constructor.
+MetalDownloadTexture::~MetalDownloadTexture()
+{
+  [m_buffer release];
+}
+
+/// Creates a download texture backed by a shared-storage MTLBuffer.
+/// With a null `memory`, a new buffer of height * aligned-pitch bytes is allocated. Otherwise the host pages
+/// spanning [memory, memory + memory_size) are wrapped without copying, `memory_stride` becomes the pitch, and
+/// the mapping pointer is `memory` itself. Returns an empty pointer if Metal fails to create the buffer.
+std::unique_ptr<MetalDownloadTexture> MetalDownloadTexture::Create(u32 width, u32 height, GPUTexture::Format format,
+                                                                   void* memory, size_t memory_size, u32 memory_stride)
+{
+  @autoreleasepool
+  {
+    MetalDevice& dev = MetalDevice::GetInstance();
+    id<MTLBuffer> buffer = nil;
+    size_t memory_offset = 0;
+    const u8* map_ptr = nullptr;
+    u32 map_pitch = 0;
+    u32 buffer_size = 0;
+
+    constexpr MTLResourceOptions options = MTLResourceStorageModeShared | MTLResourceCPUCacheModeDefaultCache;
+
+    // not importing memory?
+    if (!memory)
+    {
+      map_pitch = Common::AlignUpPow2(GPUTexture::CalcUploadPitch(format, width), TEXTURE_UPLOAD_PITCH_ALIGNMENT);
+      buffer_size = height * map_pitch;
+
+      // new* methods hand back a reference we already own, and the destructor releases exactly one, so
+      // retaining again here would leave the buffer permanently alive.
+      buffer = [dev.m_device newBufferWithLength:buffer_size options:options];
+      if (buffer == nil)
+      {
+        Log_ErrorFmt("Failed to create {} byte buffer", buffer_size);
+        return {};
+      }
+
+      map_ptr = static_cast<u8*>([buffer contents]);
+    }
+    else
+    {
+      map_pitch = memory_stride;
+      buffer_size = height * map_pitch;
+      Assert(buffer_size <= memory_size);
+
+      // Importing memory, we need to page align the buffer.
+      void* page_aligned_memory =
+        reinterpret_cast<void*>(Common::AlignDownPow2(reinterpret_cast<uintptr_t>(memory), HOST_PAGE_SIZE));
+      const size_t page_offset = static_cast<size_t>(static_cast<u8*>(memory) - static_cast<u8*>(page_aligned_memory));
+      const size_t page_aligned_size = Common::AlignUpPow2(page_offset + memory_size, HOST_PAGE_SIZE);
+      Log_DevFmt("Trying to import {} bytes of memory at {} for download texture", page_aligned_size,
+                 page_aligned_memory);
+
+      // No deallocator: the caller keeps ownership of the pages, we only own the MTLBuffer wrapper (see above).
+      buffer = [dev.m_device newBufferWithBytesNoCopy:page_aligned_memory
+                                               length:page_aligned_size
+                                              options:options
+                                          deallocator:nil];
+      if (buffer == nil)
+      {
+        Log_ErrorFmt("Failed to import {} byte buffer", page_aligned_size);
+        return {};
+      }
+
+      // The buffer begins at the page boundary, not at `memory`, so GPU writes have to be shifted by the
+      // in-page offset to land where map_ptr reads them.
+      memory_offset = page_offset;
+      map_ptr = static_cast<u8*>(memory);
+    }
+
+    return std::unique_ptr<MetalDownloadTexture>(new MetalDownloadTexture(
+      width, height, format, static_cast<u8*>(memory), memory_offset, buffer, map_ptr, map_pitch));
+  }
+}
+
+/// Records a GPU blit of a width x height region of `src` (slice `src_layer`, mip `src_level`) into the buffer
+/// at texel position (dst_x, dst_y). For non-imported buffers the row pitch is recomputed on every call, from the
+/// copy width when `use_transfer_pitch` is set and from the texture width otherwise. Marks the texture as
+/// needing a Flush() before the data is read.
+void MetalDownloadTexture::CopyFromTexture(u32 dst_x, u32 dst_y, GPUTexture* src, u32 src_x, u32 src_y, u32 width,
+                                           u32 height, u32 src_layer, u32 src_level, bool use_transfer_pitch)
+{
+  MetalTexture* const mtlTex = static_cast<MetalTexture*>(src);
+  MetalDevice& dev = MetalDevice::GetInstance();
+
+  DebugAssert(mtlTex->GetFormat() == m_format);
+  DebugAssert(src_level < mtlTex->GetLevels());
+  DebugAssert((src_x + width) <= mtlTex->GetMipWidth(src_level) && (src_y + height) <= mtlTex->GetMipHeight(src_level));
+  DebugAssert((dst_x + width) <= m_width && (dst_y + height) <= m_height);
+  DebugAssert((dst_x == 0 && dst_y == 0) || !use_transfer_pitch);
+  DebugAssert(!m_is_imported || !use_transfer_pitch);
+
+  u32 copy_offset, copy_size, copy_rows;
+  if (!m_is_imported)
+    m_current_pitch = GetTransferPitch(use_transfer_pitch ? width : m_width, TEXTURE_UPLOAD_PITCH_ALIGNMENT);
+  GetTransferSize(dst_x, dst_y, width, height, m_current_pitch, &copy_offset, &copy_size, &copy_rows);
+
+  dev.GetStatistics().num_downloads++;
+
+  dev.CommitClear(mtlTex);
+
+  id<MTLBlitCommandEncoder> encoder = dev.GetBlitEncoder(true);
+  [encoder copyFromTexture:mtlTex->GetMTLTexture()
+                 sourceSlice:src_layer
+                 sourceLevel:src_level
+                sourceOrigin:MTLOriginMake(src_x, src_y, 0)
+                  sourceSize:MTLSizeMake(width, height, 1)
+                    toBuffer:m_buffer
+           destinationOffset:m_buffer_offset + copy_offset
+      destinationBytesPerRow:m_current_pitch
+    destinationBytesPerImage:0];
+
+  // Remember which fence this copy belongs to so Flush() knows what to wait for.
+  m_copy_fence_counter = dev.m_current_fence_counter;
+  m_needs_flush = true;
+}
+
+/// Nothing to do: the mapping pointer is set once at construction and stays valid.
+bool MetalDownloadTexture::Map(u32 x, u32 y, u32 width, u32 height)
+{
+  // Always mapped.
+  return true;
+}
+
+/// Nothing to do, for the same reason as Map().
+void MetalDownloadTexture::Unmap()
+{
+  // Always mapped.
+}
+
+/// Makes sure the last recorded copy has finished. If that copy sits in the command buffer currently being
+/// built, the buffer is submitted; otherwise the fence counter recorded by CopyFromTexture() is waited on.
+void MetalDownloadTexture::Flush()
+{
+  if (!m_needs_flush)
+    return;
+
+  m_needs_flush = false;
+
+  MetalDevice& dev = MetalDevice::GetInstance();
+  // Already completed on the GPU, nothing to wait for.
+  if (dev.m_completed_fence_counter >= m_copy_fence_counter)
+    return;
+
+  // Need to execute command buffer.
+  if (dev.GetCurrentFenceCounter() == m_copy_fence_counter)
+    dev.SubmitCommandBuffer(true);
+  else
+    dev.WaitForFenceCounter(m_copy_fence_counter);
+}
+
+/// Applies `name` as the label of the backing MTLBuffer.
+void MetalDownloadTexture::SetDebugName(std::string_view name)
+{
+  @autoreleasepool
+  {
+    [m_buffer setLabel:StringViewToNSString(name)];
+  }
+}
+
+/// Creates a download texture whose storage is allocated by Metal.
+std::unique_ptr<GPUDownloadTexture> MetalDevice::CreateDownloadTexture(u32 width, u32 height, GPUTexture::Format format)
+{
+  return MetalDownloadTexture::Create(width, height, format, nullptr, 0, 0);
+}
+
+/// Creates a download texture that writes directly into caller-provided memory.
+std::unique_ptr<GPUDownloadTexture> MetalDevice::CreateDownloadTexture(u32 width, u32 height, GPUTexture::Format format,
+                                                                       void* memory, size_t memory_size,
+                                                                       u32 memory_stride)
+{
+  return MetalDownloadTexture::Create(width, height, format, memory, memory_size, memory_stride);
+}
+
 MetalSampler::MetalSampler(id<MTLSamplerState> ss) : m_ss(ss)
 {
 }
@@ -1218,71 +1372,6 @@ std::unique_ptr<GPUSampler> MetalDevice::CreateSampler(const GPUSampler::Config&
   }
 }
 
-bool MetalDevice::DownloadTexture(GPUTexture* texture, u32 x, u32 y, u32 width, u32 height, void* out_data,
-                                  u32 out_data_stride)
-{
-  constexpr u32 src_layer = 0;
-  constexpr u32 src_level = 0;
-
-  const u32 copy_size = width * texture->GetPixelSize();
-  const u32 pitch = Common::AlignUpPow2(copy_size, TEXTURE_UPLOAD_PITCH_ALIGNMENT);
-  const u32 required_size = pitch * height;
-  if (!CheckDownloadBufferSize(required_size))
-    return false;
-
-  MetalTexture* T = static_cast<MetalTexture*>(texture);
-  CommitClear(T);
-
-  s_stats.num_downloads++;
-
-  @autoreleasepool
-  {
-    id<MTLBlitCommandEncoder> encoder = GetBlitEncoder(true);
-
-    [encoder copyFromTexture:T->GetMTLTexture()
-                   sourceSlice:src_layer
-                   sourceLevel:src_level
-                  sourceOrigin:MTLOriginMake(x, y, 0)
-                    sourceSize:MTLSizeMake(width, height, 1)
-                      toBuffer:m_download_buffer
-             destinationOffset:0
-        destinationBytesPerRow:pitch
-      destinationBytesPerImage:0];
-
-    SubmitCommandBuffer(true);
-
-    StringUtil::StrideMemCpy(out_data, out_data_stride, [m_download_buffer contents], pitch, copy_size, height);
-  }
-
-  return true;
-}
-
-bool MetalDevice::CheckDownloadBufferSize(u32 required_size)
-{
-  if (m_download_buffer_size >= required_size)
-    return true;
-
-  @autoreleasepool
-  {
-    // We don't need to defer releasing this one, it's not going to be used.
-    if (m_download_buffer != nil)
-      [m_download_buffer release];
-
-    constexpr MTLResourceOptions options = MTLResourceStorageModeShared | MTLResourceCPUCacheModeDefaultCache;
-    m_download_buffer = [[m_device newBufferWithLength:required_size options:options] retain];
-    if (m_download_buffer == nil)
-    {
-      Log_ErrorPrintf("Failed to create %u byte download buffer", required_size);
-      m_download_buffer_size = 0;
-      return false;
-    }
-
-    m_download_buffer_size = required_size;
-  }
-
-  return true;
-}
-
 bool MetalDevice::SupportsTextureFormat(GPUTexture::Format format) const
 {
   if (format == GPUTexture::Format::RGB565 || format == GPUTexture::Format::RGBA5551)
diff --git a/src/util/opengl_device.cpp b/src/util/opengl_device.cpp
index 61ffddb85..cef210663 100644
--- a/src/util/opengl_device.cpp
+++ b/src/util/opengl_device.cpp
@@ -43,6 +43,14 @@ void OpenGLDevice::BindUpdateTextureUnit()
   GetInstance().SetActiveTexture(UPDATE_TEXTURE_UNIT - GL_TEXTURE0);
 }
 
+/// Reports whether texture downloads should go through a pixel pack buffer.
+/// False as soon as either PBO usage or asynchronous downloads have been disabled on the device.
+bool OpenGLDevice::ShouldUsePBOsForDownloads()
+{
+  const OpenGLDevice& dev = GetInstance();
+  return !(dev.m_disable_pbo || dev.m_disable_async_download);
+}
+
 RenderAPI OpenGLDevice::GetRenderAPI() const
 {
   return m_gl_context->IsGLES() ? RenderAPI::OpenGLES : RenderAPI::OpenGL;
@@ -55,53 +60,6 @@ std::unique_ptr<GPUTexture> OpenGLDevice::CreateTexture(u32 width, u32 height, u
   return OpenGLTexture::Create(width, height, layers, levels, samples, type, format, data, data_stride);
 }
 
-bool OpenGLDevice::DownloadTexture(GPUTexture* texture, u32 x, u32 y, u32 width, u32 height, void* out_data,
-                                   u32 out_data_stride)
-{
-  OpenGLTexture* T = static_cast<OpenGLTexture*>(texture);
-
-  GLint alignment;
-  if (out_data_stride & 1)
-    alignment = 1;
-  else if (out_data_stride & 2)
-    alignment = 2;
-  else
-    alignment = 4;
-
-  glPixelStorei(GL_PACK_ALIGNMENT, alignment);
-  glPixelStorei(GL_PACK_ROW_LENGTH, out_data_stride / T->GetPixelSize());
-
-  const auto [gl_internal_format, gl_format, gl_type] =
-    OpenGLTexture::GetPixelFormatMapping(T->GetFormat(), m_gl_context->IsGLES());
-  const u32 layer = 0;
-  const u32 level = 0;
-
-  s_stats.num_downloads++;
-
-  if (GLAD_GL_VERSION_4_5 || GLAD_GL_ARB_get_texture_sub_image)
-  {
-    glGetTextureSubImage(T->GetGLId(), level, x, y, layer, width, height, 1, gl_format, gl_type,
-                         height * out_data_stride, out_data);
-  }
-  else
-  {
-    glBindFramebuffer(GL_READ_FRAMEBUFFER, m_read_fbo);
-
-    if (T->GetLayers() > 1)
-      glFramebufferTextureLayer(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, T->GetGLId(), level, layer);
-    else
-      glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, T->GetGLTarget(), T->GetGLId(), level);
-
-    DebugAssert(glCheckFramebufferStatus(GL_READ_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE);
-    glReadPixels(x, y, width, height, gl_format, gl_type, out_data);
-
-    glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
-    glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
-  }
-
-  return true;
-}
-
 bool OpenGLDevice::SupportsTextureFormat(GPUTexture::Format format) const
 {
   const auto [gl_internal_format, gl_format, gl_type] =
@@ -362,11 +320,10 @@ bool OpenGLDevice::CreateDevice(const std::string_view& adapter, bool threaded_p
     glObjectLabel = nullptr;
   }
 
-  bool buggy_pbo;
-  if (!CheckFeatures(&buggy_pbo, disabled_features))
+  if (!CheckFeatures(disabled_features))
     return false;
 
-  if (!CreateBuffers(buggy_pbo))
+  if (!CreateBuffers())
     return false;
 
   // Scissor test should always be enabled.
@@ -375,7 +332,7 @@ bool OpenGLDevice::CreateDevice(const std::string_view& adapter, bool threaded_p
   return true;
 }
 
-bool OpenGLDevice::CheckFeatures(bool* buggy_pbo, FeatureMask disabled_features)
+bool OpenGLDevice::CheckFeatures(FeatureMask disabled_features)
 {
   const bool is_gles = m_gl_context->IsGLES();
 
@@ -424,10 +381,9 @@ bool OpenGLDevice::CheckFeatures(bool* buggy_pbo, FeatureMask disabled_features)
   // using the normal texture update routines and letting the driver take care of it. PBOs are also completely
   // broken on mobile drivers.
   const bool is_shitty_mobile_driver = (vendor_id_powervr || vendor_id_qualcomm || vendor_id_arm);
-  const bool is_buggy_pbo =
+  m_disable_pbo =
     (!GLAD_GL_VERSION_4_4 && !GLAD_GL_ARB_buffer_storage && !GLAD_GL_EXT_buffer_storage) || is_shitty_mobile_driver;
-  *buggy_pbo = is_buggy_pbo;
-  if (is_buggy_pbo && !is_shitty_mobile_driver)
+  if (m_disable_pbo && !is_shitty_mobile_driver)
     Log_WarningPrint("Not using PBOs for texture uploads because buffer_storage is unavailable.");
 
   GLint max_texture_size = 1024;
@@ -517,6 +473,7 @@ bool OpenGLDevice::CheckFeatures(bool* buggy_pbo, FeatureMask disabled_features)
   m_features.gpu_timing = !(m_gl_context->IsGLES() &&
                             (!GLAD_GL_EXT_disjoint_timer_query || !glGetQueryObjectivEXT || !glGetQueryObjectui64vEXT));
   m_features.partial_msaa_resolve = true;
+  m_features.memory_import = true;
 
   m_features.shader_cache = false;
 
@@ -539,6 +496,13 @@ bool OpenGLDevice::CheckFeatures(bool* buggy_pbo, FeatureMask disabled_features)
   // Mobile drivers prefer textures to not be updated mid-frame.
   m_features.prefer_unused_textures = is_gles || vendor_id_arm || vendor_id_powervr || vendor_id_qualcomm;
 
+  if (vendor_id_intel)
+  {
+    // Intel drivers corrupt image on readback when syncs are used for downloads.
+    Log_WarningPrint("Disabling async downloads with PBOs due to it being broken on Intel drivers.");
+    m_disable_async_download = true;
+  }
+
   return true;
 }
 
@@ -711,7 +675,7 @@ void OpenGLDevice::DestroySurface()
     Log_ErrorPrintf("Failed to switch to surfaceless");
 }
 
-bool OpenGLDevice::CreateBuffers(bool buggy_pbo)
+bool OpenGLDevice::CreateBuffers()
 {
   if (!(m_vertex_buffer = OpenGLStreamBuffer::Create(GL_ARRAY_BUFFER, VERTEX_BUFFER_SIZE)) ||
       !(m_index_buffer = OpenGLStreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, INDEX_BUFFER_SIZE)) ||
@@ -727,7 +691,7 @@ bool OpenGLDevice::CreateBuffers(bool buggy_pbo)
 
   glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, reinterpret_cast<GLint*>(&m_uniform_buffer_alignment));
 
-  if (!buggy_pbo)
+  if (!m_disable_pbo)
   {
     if (!(m_texture_stream_buffer = OpenGLStreamBuffer::Create(GL_PIXEL_UNPACK_BUFFER, TEXTURE_STREAM_BUFFER_SIZE)))
     {
diff --git a/src/util/opengl_device.h b/src/util/opengl_device.h
index 8e22b8284..6f4f102ba 100644
--- a/src/util/opengl_device.h
+++ b/src/util/opengl_device.h
@@ -20,9 +20,13 @@
 class OpenGLPipeline;
 class OpenGLStreamBuffer;
 class OpenGLTexture;
+class OpenGLDownloadTexture;
 
 class OpenGLDevice final : public GPUDevice
 {
+  friend OpenGLTexture;
+  friend OpenGLDownloadTexture;
+
 public:
   OpenGLDevice();
   ~OpenGLDevice();
@@ -34,6 +38,7 @@ public:
   }
   ALWAYS_INLINE static bool IsGLES() { return GetInstance().m_gl_context->IsGLES(); }
   static void BindUpdateTextureUnit();
+  static bool ShouldUsePBOsForDownloads();
 
   RenderAPI GetRenderAPI() const override;
 
@@ -53,8 +58,11 @@ public:
   std::unique_ptr<GPUSampler> CreateSampler(const GPUSampler::Config& config) override;
   std::unique_ptr<GPUTextureBuffer> CreateTextureBuffer(GPUTextureBuffer::Format format, u32 size_in_elements) override;
 
-  bool DownloadTexture(GPUTexture* texture, u32 x, u32 y, u32 width, u32 height, void* out_data,
-                       u32 out_data_stride) override;
+  std::unique_ptr<GPUDownloadTexture> CreateDownloadTexture(u32 width, u32 height, GPUTexture::Format format) override;
+  std::unique_ptr<GPUDownloadTexture> CreateDownloadTexture(u32 width, u32 height, GPUTexture::Format format,
+                                                            void* memory, size_t memory_size,
+                                                            u32 memory_stride) override;
+
   bool SupportsTextureFormat(GPUTexture::Format format) const override;
   void CopyTextureRegion(GPUTexture* dst, u32 dst_x, u32 dst_y, u32 dst_layer, u32 dst_level, GPUTexture* src,
                          u32 src_x, u32 src_y, u32 src_layer, u32 src_level, u32 width, u32 height) override;
@@ -137,8 +145,8 @@ private:
   static constexpr u32 UNIFORM_BUFFER_SIZE = 2 * 1024 * 1024;
   static constexpr u32 TEXTURE_STREAM_BUFFER_SIZE = 16 * 1024 * 1024;
 
-  bool CheckFeatures(bool* buggy_pbo, FeatureMask disabled_features);
-  bool CreateBuffers(bool buggy_pbo);
+  bool CheckFeatures(FeatureMask disabled_features);
+  bool CreateBuffers();
   void DestroyBuffers();
 
   void SetSwapInterval();
@@ -215,4 +223,7 @@ private:
   std::string m_pipeline_disk_cache_filename;
   u32 m_pipeline_disk_cache_data_end = 0;
   bool m_pipeline_disk_cache_changed = false;
+
+  bool m_disable_pbo = false;
+  bool m_disable_async_download = false;
 };
diff --git a/src/util/opengl_texture.cpp b/src/util/opengl_texture.cpp
index 980dc4d1d..96795a38a 100644
--- a/src/util/opengl_texture.cpp
+++ b/src/util/opengl_texture.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
 
 #include "opengl_texture.h"
@@ -7,6 +7,7 @@
 
 #include "common/align.h"
 #include "common/assert.h"
+#include "common/intrin.h"
 #include "common/log.h"
 #include "common/string_util.h"
 
@@ -696,3 +697,238 @@ std::unique_ptr<GPUTextureBuffer> OpenGLDevice::CreateTextureBuffer(GPUTextureBu
   return std::unique_ptr<GPUTextureBuffer>(
     new OpenGLTextureBuffer(format, size_in_elements, std::move(buffer), texture_id));
 }
+
+/// Stores the backing storage chosen by Create(): either a persistently mapped PBO (`buffer_id` non-zero,
+/// `cpu_buffer` null) or a host buffer (`buffer_id` zero). `map_ptr`/`map_pitch` seed the CPU-side mapping.
+OpenGLDownloadTexture::OpenGLDownloadTexture(u32 width, u32 height, GPUTexture::Format format, bool imported,
+                                             GLuint buffer_id, u8* cpu_buffer, u32 buffer_size, const u8* map_ptr,
+                                             u32 map_pitch)
+  : GPUDownloadTexture(width, height, format, imported), m_buffer_id(buffer_id), m_buffer_size(buffer_size),
+    m_cpu_buffer(cpu_buffer)
+{
+  m_map_pointer = map_ptr;
+  m_current_pitch = map_pitch;
+}
+
+/// Releases the PBO (pending sync object, mapping, then the buffer itself), or frees the host buffer when this
+/// object allocated it. Imported memory is left alone.
+OpenGLDownloadTexture::~OpenGLDownloadTexture()
+{
+  if (m_buffer_id != 0)
+  {
+    if (m_sync)
+      glDeleteSync(m_sync);
+
+    if (m_map_pointer)
+    {
+      glBindBuffer(GL_PIXEL_PACK_BUFFER, m_buffer_id);
+      glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
+      glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+    }
+
+    glDeleteBuffers(1, &m_buffer_id);
+  }
+  else if (m_cpu_buffer && !m_is_imported)
+  {
+    Common::AlignedFree(m_cpu_buffer);
+  }
+}
+
+/// Creates a download texture. A persistently mapped PBO is used when buffer storage is available, no `memory`
+/// was supplied and the device allows PBO downloads; otherwise readbacks go to a host buffer, which is `memory`
+/// itself (with `memory_pitch` as the row pitch) when one was supplied. Returns an empty pointer if the PBO
+/// cannot be mapped or the host allocation fails.
+std::unique_ptr<OpenGLDownloadTexture> OpenGLDownloadTexture::Create(u32 width, u32 height, GPUTexture::Format format,
+                                                                     void* memory, size_t memory_size, u32 memory_pitch)
+{
+  const u32 buffer_pitch =
+    memory ? memory_pitch :
+             Common::AlignUpPow2(GPUTexture::CalcUploadPitch(format, width), TEXTURE_UPLOAD_PITCH_ALIGNMENT);
+  const u32 buffer_size = memory ? static_cast<u32>(memory_size) : (height * buffer_pitch);
+
+  const bool use_buffer_storage = (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage || GLAD_GL_EXT_buffer_storage) &&
+                                  !memory && OpenGLDevice::ShouldUsePBOsForDownloads();
+  if (use_buffer_storage)
+  {
+    GLuint buffer_id;
+    glGenBuffers(1, &buffer_id);
+    glBindBuffer(GL_PIXEL_PACK_BUFFER, buffer_id);
+
+    const u32 flags = GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT;
+    const u32 map_flags = GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT;
+
+    if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage)
+      glBufferStorage(GL_PIXEL_PACK_BUFFER, buffer_size, nullptr, flags);
+    else if (GLAD_GL_EXT_buffer_storage)
+      glBufferStorageEXT(GL_PIXEL_PACK_BUFFER, buffer_size, nullptr, flags);
+
+    u8* buffer_map = static_cast<u8*>(glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, buffer_size, map_flags));
+
+    glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+
+    if (!buffer_map)
+    {
+      Log_ErrorPrint("Failed to map persistent download buffer");
+      glDeleteBuffers(1, &buffer_id);
+      return {};
+    }
+
+    return std::unique_ptr<OpenGLDownloadTexture>(new OpenGLDownloadTexture(
+      width, height, format, false, buffer_id, nullptr, buffer_size, buffer_map, buffer_pitch));
+  }
+
+  // Fallback to glReadPixels() + CPU buffer.
+  const bool imported = (memory != nullptr);
+  u8* cpu_buffer =
+    imported ? static_cast<u8*>(memory) : static_cast<u8*>(Common::AlignedMalloc(buffer_size, VECTOR_ALIGNMENT));
+  if (!cpu_buffer)
+    return {};
+
+  return std::unique_ptr<OpenGLDownloadTexture>(
+    new OpenGLDownloadTexture(width, height, format, imported, 0, cpu_buffer, buffer_size, cpu_buffer, buffer_pitch));
+}
+
+/// Reads a width x height region of `src` (layer `src_layer`, mip `src_level`) into this texture at texel
+/// position (dst_x, dst_y). For non-imported buffers the row pitch is recomputed on every call, from the copy
+/// width when `use_transfer_pitch` is set and from the texture width otherwise. PBO-backed reads leave a fence
+/// behind for Flush(); host-buffer reads never need a flush.
+void OpenGLDownloadTexture::CopyFromTexture(u32 dst_x, u32 dst_y, GPUTexture* src, u32 src_x, u32 src_y, u32 width,
+                                            u32 height, u32 src_layer, u32 src_level, bool use_transfer_pitch)
+{
+  OpenGLTexture* const srcgl = static_cast<OpenGLTexture*>(src);
+  OpenGLDevice& dev = OpenGLDevice::GetInstance();
+
+  DebugAssert(srcgl->GetFormat() == m_format);
+  DebugAssert(src_level < srcgl->GetLevels());
+  DebugAssert((src_x + width) <= srcgl->GetMipWidth(src_level) && (src_y + height) <= srcgl->GetMipHeight(src_level));
+  DebugAssert((dst_x + width) <= m_width && (dst_y + height) <= m_height);
+  DebugAssert((dst_x == 0 && dst_y == 0) || !use_transfer_pitch);
+  DebugAssert(!m_is_imported || !use_transfer_pitch);
+
+  dev.CommitClear(srcgl);
+
+  u32 copy_offset, copy_size, copy_rows;
+  if (!m_is_imported)
+    m_current_pitch = GetTransferPitch(use_transfer_pitch ? width : m_width, TEXTURE_UPLOAD_PITCH_ALIGNMENT);
+  GetTransferSize(dst_x, dst_y, width, height, m_current_pitch, &copy_offset, &copy_size, &copy_rows);
+  dev.GetStatistics().num_downloads++;
+
+  GLint alignment;
+  if (m_current_pitch & 1)
+    alignment = 1;
+  else if (m_current_pitch & 2)
+    alignment = 2;
+  else
+    alignment = 4;
+
+  glPixelStorei(GL_PACK_ALIGNMENT, alignment);
+  glPixelStorei(GL_PACK_ROW_LENGTH, GPUTexture::CalcUploadRowLengthFromPitch(m_format, m_current_pitch));
+
+  if (!m_cpu_buffer)
+  {
+    // Read to PBO.
+    glBindBuffer(GL_PIXEL_PACK_BUFFER, m_buffer_id);
+  }
+
+  // With a PBO bound, GL treats the destination "pointer" as a byte offset into the buffer. Build that value
+  // from the integer instead of offsetting a null m_cpu_buffer.
+  u8* const dst_ptr =
+    m_cpu_buffer ? (m_cpu_buffer + copy_offset) : reinterpret_cast<u8*>(static_cast<uintptr_t>(copy_offset));
+
+  const auto [gl_internal_format, gl_format, gl_type] =
+    OpenGLTexture::GetPixelFormatMapping(srcgl->GetFormat(), dev.IsGLES());
+  if (GLAD_GL_VERSION_4_5 || GLAD_GL_ARB_get_texture_sub_image)
+  {
+    // zoffset selects the array layer; it is 0 for plain 2D textures.
+    glGetTextureSubImage(srcgl->GetGLId(), src_level, src_x, src_y, src_layer, width, height, 1, gl_format, gl_type,
+                         m_current_pitch * height, dst_ptr);
+  }
+  else
+  {
+    glBindFramebuffer(GL_READ_FRAMEBUFFER, dev.m_read_fbo);
+
+    // Attach the requested layer/level, using the texture's own target rather than assuming GL_TEXTURE_2D.
+    if (srcgl->GetLayers() > 1)
+      glFramebufferTextureLayer(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, srcgl->GetGLId(), src_level, src_layer);
+    else
+      glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, srcgl->GetGLTarget(), srcgl->GetGLId(),
+                             src_level);
+
+    glReadPixels(src_x, src_y, width, height, gl_format, gl_type, dst_ptr);
+
+    glBindFramebuffer(GL_READ_FRAMEBUFFER, 0);
+  }
+
+  if (m_cpu_buffer)
+  {
+    // If using CPU buffers, we never need to flush.
+    m_needs_flush = false;
+  }
+  else
+  {
+    glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+
+    // Create a sync object so we know when the GPU is done copying.
+    if (m_sync)
+      glDeleteSync(m_sync);
+
+    m_sync = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+    m_needs_flush = true;
+  }
+
+  glPixelStorei(GL_PACK_ROW_LENGTH, 0);
+}
+
+/// Nothing to do: the mapping pointer is fixed at construction for both backing types.
+bool OpenGLDownloadTexture::Map(u32 x, u32 y, u32 width, u32 height)
+{
+  // Either always mapped, or CPU buffer.
+  return true;
+}
+
+/// Nothing to do, for the same reason as Map().
+void OpenGLDownloadTexture::Unmap()
+{
+  // Either always mapped, or CPU buffer.
+}
+
+/// Waits on the fence left by the last PBO copy and deletes it. Does nothing when no flush is pending.
+void OpenGLDownloadTexture::Flush()
+{
+  // If we're using CPU buffers, we did the readback synchronously...
+  if (!m_needs_flush || !m_sync)
+    return;
+
+  m_needs_flush = false;
+
+  glClientWaitSync(m_sync, GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
+  glDeleteSync(m_sync);
+  m_sync = {};
+}
+
+/// Labels the PBO with `name`. Ignored for empty names and for CPU-backed textures.
+void OpenGLDownloadTexture::SetDebugName(std::string_view name)
+{
+  // CPU-backed textures have no GL buffer object, and 0 is not a valid name to label.
+  if (name.empty() || m_buffer_id == 0)
+    return;
+
+  if (glObjectLabel)
+    glObjectLabel(GL_BUFFER, m_buffer_id, static_cast<GLsizei>(name.length()), name.data());
+}
+
+/// Creates a download texture whose storage is chosen by OpenGLDownloadTexture::Create().
+std::unique_ptr<GPUDownloadTexture> OpenGLDevice::CreateDownloadTexture(u32 width, u32 height,
+                                                                        GPUTexture::Format format)
+{
+  return OpenGLDownloadTexture::Create(width, height, format, nullptr, 0, 0);
+}
+
+/// Creates a download texture that reads back straight into caller-provided memory.
+std::unique_ptr<GPUDownloadTexture> OpenGLDevice::CreateDownloadTexture(u32 width, u32 height,
+                                                                        GPUTexture::Format format, void* memory,
+                                                                        size_t memory_size, u32 memory_stride)
+{
+  // not _really_ memory importing, but PBOs are broken on Intel....
+  return OpenGLDownloadTexture::Create(width, height, format, memory, memory_size, memory_stride);
+}
diff --git a/src/util/opengl_texture.h b/src/util/opengl_texture.h
index b8fc3e382..f0bd6c11c 100644
--- a/src/util/opengl_texture.h
+++ b/src/util/opengl_texture.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
 
 #pragma once
@@ -99,3 +99,46 @@ private:
 
   GLuint m_id;
 };
+
+/// Texture readback target for the OpenGL backend.
+/// Backed either by a persistently mapped pixel pack buffer, or by a host buffer that is filled synchronously
+/// (allocated by this class, or supplied by the caller when importing memory).
+class OpenGLDownloadTexture final : public GPUDownloadTexture
+{
+public:
+  ~OpenGLDownloadTexture() override;
+
+  /// Creates a download texture. Pass a null `memory` to let the class pick and allocate its own storage;
+  /// otherwise `memory`/`memory_size`/`memory_pitch` describe the caller-owned buffer to read back into.
+  static std::unique_ptr<OpenGLDownloadTexture> Create(u32 width, u32 height, GPUTexture::Format format, void* memory,
+                                                       size_t memory_size, u32 memory_pitch);
+
+  /// Reads a region of `src` into this texture at (dst_x, dst_y).
+  void CopyFromTexture(u32 dst_x, u32 dst_y, GPUTexture* src, u32 src_x, u32 src_y, u32 width, u32 height,
+                       u32 src_layer, u32 src_level, bool use_transfer_pitch) override;
+
+  /// Map() always succeeds and Unmap() does nothing; the mapping pointer is fixed at construction.
+  bool Map(u32 x, u32 y, u32 width, u32 height) override;
+  void Unmap() override;
+
+  /// Waits for the last pixel-pack-buffer copy to finish, if one is pending.
+  void Flush() override;
+
+  /// Labels the pixel pack buffer, when the GL object-label entry point is available.
+  void SetDebugName(std::string_view name) override;
+
+private:
+  OpenGLDownloadTexture(u32 width, u32 height, GPUTexture::Format format, bool imported, GLuint buffer_id,
+                        u8* cpu_buffer, u32 buffer_size, const u8* map_ptr, u32 map_pitch);
+
+  // GL name of the pixel pack buffer; 0 when a host buffer is used instead.
+  GLuint m_buffer_id = 0;
+  // Size in bytes of the backing storage, as passed to the constructor.
+  u32 m_buffer_size = 0;
+
+  // Fence inserted after the most recent pixel-pack-buffer copy; empty when nothing is pending.
+  GLsync m_sync = {};
+
+  // used when buffer storage is not available
+  u8* m_cpu_buffer = nullptr;
+};
diff --git a/src/util/vulkan_device.cpp b/src/util/vulkan_device.cpp
index 41c43d841..2a23b2951 100644
--- a/src/util/vulkan_device.cpp
+++ b/src/util/vulkan_device.cpp
@@ -1515,6 +1515,18 @@ void VulkanDevice::DeferBufferDestruction(VkBuffer object, VmaAllocation allocat
                                  [this, object, allocation]() { vmaDestroyBuffer(m_allocator, object, allocation); });
 }
 
+/// Queues a buffer together with the raw device memory behind it for cleanup, tagged with the current fence
+/// counter. Unlike the VmaAllocation overload, the memory is released with vkFreeMemory().
+void VulkanDevice::DeferBufferDestruction(VkBuffer object, VkDeviceMemory memory)
+{
+  const auto destroy_buffer_and_memory = [this, object, memory]() {
+    // Buffer first, then the memory that was passed alongside it.
+    vkDestroyBuffer(m_device, object, nullptr);
+    vkFreeMemory(m_device, memory, nullptr);
+  };
+  m_cleanup_objects.emplace_back(GetCurrentFenceCounter(), destroy_buffer_and_memory);
+}
+
 void VulkanDevice::DeferFramebufferDestruction(VkFramebuffer object)
 {
   m_cleanup_objects.emplace_back(GetCurrentFenceCounter(),
@@ -2067,7 +2075,6 @@ void VulkanDevice::DestroyDevice()
   for (auto& it : m_cleanup_objects)
     it.second();
   m_cleanup_objects.clear();
-  DestroyDownloadBuffer();
   DestroyPersistentDescriptorSets();
   DestroyBuffers();
   DestroySamplers();
@@ -2528,6 +2535,7 @@ bool VulkanDevice::CheckFeatures(FeatureMask disabled_features)
     !(disabled_features & FEATURE_MASK_GEOMETRY_SHADERS) && m_device_features.geometryShader;
 
   m_features.partial_msaa_resolve = true;
+  m_features.memory_import = m_optional_extensions.vk_ext_external_memory_host;
   m_features.shader_cache = true;
   m_features.pipeline_cache = true;
   m_features.prefer_unused_textures = true;
@@ -2981,21 +2989,21 @@ void VulkanDevice::RenderBlankFrame()
   InvalidateCachedState();
 }
 
-bool VulkanDevice::TryImportHostMemory(const void* data, u32 data_size, VkBufferUsageFlags buffer_usage,
-                                       VkDeviceMemory* out_memory, VkBuffer* out_buffer, u32* out_offset)
+bool VulkanDevice::TryImportHostMemory(void* data, size_t data_size, VkBufferUsageFlags buffer_usage,
+                                       VkDeviceMemory* out_memory, VkBuffer* out_buffer, VkDeviceSize* out_offset)
 {
   if (!m_optional_extensions.vk_ext_external_memory_host)
     return false;
 
   // Align to the nearest page
-  const void* data_aligned =
-    reinterpret_cast<const void*>(Common::AlignDownPow2(reinterpret_cast<uintptr_t>(data), HOST_PAGE_SIZE));
+  void* data_aligned =
+    reinterpret_cast<void*>(Common::AlignDownPow2(reinterpret_cast<uintptr_t>(data), HOST_PAGE_SIZE));
 
   // Offset to the start of the data within the page
-  const u32 data_offset = reinterpret_cast<uintptr_t>(data) & (HOST_PAGE_SIZE - 1);
+  const size_t data_offset = reinterpret_cast<uintptr_t>(data) & static_cast<uintptr_t>(HOST_PAGE_MASK);
 
   // Full amount of data that must be imported, including the pages
-  const u32 data_size_aligned = Common::AlignUpPow2(data_offset + data_size, HOST_PAGE_SIZE);
+  const size_t data_size_aligned = Common::AlignUpPow2(data_offset + data_size, HOST_PAGE_SIZE);
 
   VkMemoryHostPointerPropertiesEXT pointer_properties = {VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT, nullptr,
                                                          0};
@@ -3003,6 +3011,7 @@ bool VulkanDevice::TryImportHostMemory(const void* data, u32 data_size, VkBuffer
                                                      data_aligned, &pointer_properties);
   if (res != VK_SUCCESS || pointer_properties.memoryTypeBits == 0)
   {
+    LOG_VULKAN_ERROR(res, "vkGetMemoryHostPointerPropertiesEXT() failed: ");
     return false;
   }
 
@@ -3015,6 +3024,7 @@ bool VulkanDevice::TryImportHostMemory(const void* data, u32 data_size, VkBuffer
   res = vmaFindMemoryTypeIndex(m_allocator, pointer_properties.memoryTypeBits, &vma_alloc_info, &memory_index);
   if (res != VK_SUCCESS)
   {
+    LOG_VULKAN_ERROR(res, "vmaFindMemoryTypeIndex() failed: ");
     return false;
   }
 
@@ -3030,6 +3040,7 @@ bool VulkanDevice::TryImportHostMemory(const void* data, u32 data_size, VkBuffer
   res = vkAllocateMemory(m_device, &alloc_info, nullptr, &imported_memory);
   if (res != VK_SUCCESS)
   {
+    LOG_VULKAN_ERROR(res, "vkAllocateMemory() failed: ");
     return false;
   }
 
@@ -3049,10 +3060,10 @@ bool VulkanDevice::TryImportHostMemory(const void* data, u32 data_size, VkBuffer
   res = vkCreateBuffer(m_device, &buffer_info, nullptr, &imported_buffer);
   if (res != VK_SUCCESS)
   {
+    LOG_VULKAN_ERROR(res, "vkCreateBuffer() failed: ");
     if (imported_memory != VK_NULL_HANDLE)
-    {
       vkFreeMemory(m_device, imported_memory, nullptr);
-    }
+
     return false;
   }
 
@@ -3061,7 +3072,7 @@ bool VulkanDevice::TryImportHostMemory(const void* data, u32 data_size, VkBuffer
   *out_memory = imported_memory;
   *out_buffer = imported_buffer;
   *out_offset = data_offset;
-
+  Log_DevFmt("Imported {} byte buffer covering {} bytes at {}", data_size, data_size_aligned, data);
   return true;
 }
 
diff --git a/src/util/vulkan_device.h b/src/util/vulkan_device.h
index d06fb8179..ef3e00f94 100644
--- a/src/util/vulkan_device.h
+++ b/src/util/vulkan_device.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
 
 #pragma once
@@ -25,6 +25,7 @@ class VulkanPipeline;
 class VulkanSwapChain;
 class VulkanTexture;
 class VulkanTextureBuffer;
+class VulkanDownloadTexture;
 
 struct VK_PIPELINE_CACHE_HEADER;
 
@@ -32,6 +33,7 @@ class VulkanDevice final : public GPUDevice
 {
 public:
   friend VulkanTexture;
+  friend VulkanDownloadTexture;
 
   enum : u32
   {
@@ -81,8 +83,11 @@ public:
   std::unique_ptr<GPUSampler> CreateSampler(const GPUSampler::Config& config) override;
   std::unique_ptr<GPUTextureBuffer> CreateTextureBuffer(GPUTextureBuffer::Format format, u32 size_in_elements) override;
 
-  bool DownloadTexture(GPUTexture* texture, u32 x, u32 y, u32 width, u32 height, void* out_data,
-                       u32 out_data_stride) override;
+  std::unique_ptr<GPUDownloadTexture> CreateDownloadTexture(u32 width, u32 height, GPUTexture::Format format) override;
+  std::unique_ptr<GPUDownloadTexture> CreateDownloadTexture(u32 width, u32 height, GPUTexture::Format format,
+                                                            void* memory, size_t memory_size,
+                                                            u32 memory_stride) override;
+
   bool SupportsTextureFormat(GPUTexture::Format format) const override;
   void CopyTextureRegion(GPUTexture* dst, u32 dst_x, u32 dst_y, u32 dst_layer, u32 dst_level, GPUTexture* src,
                          u32 src_x, u32 src_y, u32 src_layer, u32 src_level, u32 width, u32 height) override;
@@ -196,6 +201,7 @@ public:
   // Schedule a vulkan resource for destruction later on. This will occur when the command buffer
   // is next re-used, and the GPU has finished working with the specified resource.
   void DeferBufferDestruction(VkBuffer object, VmaAllocation allocation);
+  void DeferBufferDestruction(VkBuffer object, VkDeviceMemory memory);
   void DeferFramebufferDestruction(VkFramebuffer object);
   void DeferImageDestruction(VkImage object, VmaAllocation allocation);
   void DeferImageViewDestruction(VkImageView object);
@@ -341,11 +347,8 @@ private:
 
   void RenderBlankFrame();
 
-  bool TryImportHostMemory(const void* data, u32 data_size, VkBufferUsageFlags buffer_usage, VkDeviceMemory* out_memory,
-                           VkBuffer* out_buffer, u32* out_offset);
-
-  bool CheckDownloadBufferSize(u32 required_size);
-  void DestroyDownloadBuffer();
+  bool TryImportHostMemory(void* data, size_t data_size, VkBufferUsageFlags buffer_usage, VkDeviceMemory* out_memory,
+                           VkBuffer* out_buffer, VkDeviceSize* out_offset);
 
   /// Set dirty flags on everything to force re-bind at next draw time.
   void InvalidateCachedState();
@@ -454,11 +457,6 @@ private:
 
   SamplerMap m_sampler_map;
 
-  VmaAllocation m_download_buffer_allocation = VK_NULL_HANDLE;
-  VkBuffer m_download_buffer = VK_NULL_HANDLE;
-  u8* m_download_buffer_map = nullptr;
-  u32 m_download_buffer_size = 0;
-
   // Which bindings/state has to be updated before the next draw.
   u32 m_dirty_flags = ALL_DIRTY_STATE;
 
diff --git a/src/util/vulkan_texture.cpp b/src/util/vulkan_texture.cpp
index f82f810b8..5626c9e06 100644
--- a/src/util/vulkan_texture.cpp
+++ b/src/util/vulkan_texture.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
 
 #include "vulkan_texture.h"
@@ -736,126 +736,6 @@ std::unique_ptr<GPUTexture> VulkanDevice::CreateTexture(u32 width, u32 height, u
   return tex;
 }
 
-bool VulkanDevice::DownloadTexture(GPUTexture* texture, u32 x, u32 y, u32 width, u32 height, void* out_data,
-                                   u32 out_data_stride)
-{
-  VulkanTexture* T = static_cast<VulkanTexture*>(texture);
-  T->CommitClear();
-
-  const u32 pitch = Common::AlignUp(width * T->GetPixelSize(), GetBufferCopyRowPitchAlignment());
-  const u32 size = pitch * height;
-  const u32 level = 0;
-  if (!CheckDownloadBufferSize(size))
-  {
-    Log_ErrorPrintf("Can't read back %ux%u", width, height);
-    return false;
-  }
-
-  s_stats.num_downloads++;
-
-  if (InRenderPass())
-    EndRenderPass();
-
-  const VkCommandBuffer cmdbuf = GetCurrentCommandBuffer();
-
-  VulkanTexture::Layout old_layout = T->GetLayout();
-  if (old_layout != VulkanTexture::Layout::TransferSrc)
-    T->TransitionSubresourcesToLayout(cmdbuf, 0, 1, 0, 1, old_layout, VulkanTexture::Layout::TransferSrc);
-
-  VkBufferImageCopy image_copy = {};
-  const VkImageAspectFlags aspect = T->IsDepthStencil() ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_COLOR_BIT;
-  image_copy.bufferOffset = 0;
-  image_copy.bufferRowLength = pitch / T->GetPixelSize();
-  image_copy.bufferImageHeight = 0;
-  image_copy.imageSubresource = {aspect, level, 0u, 1u};
-  image_copy.imageOffset = {static_cast<s32>(x), static_cast<s32>(y), 0};
-  image_copy.imageExtent = {width, height, 1u};
-
-  // do the copy
-  vkCmdCopyImageToBuffer(cmdbuf, T->GetImage(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, m_download_buffer, 1,
-                         &image_copy);
-
-  // flush gpu cache
-  const VkBufferMemoryBarrier buffer_info = {
-    VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, // VkStructureType    sType
-    nullptr,                                 // const void*        pNext
-    VK_ACCESS_TRANSFER_WRITE_BIT,            // VkAccessFlags      srcAccessMask
-    VK_ACCESS_HOST_READ_BIT,                 // VkAccessFlags      dstAccessMask
-    VK_QUEUE_FAMILY_IGNORED,                 // uint32_t           srcQueueFamilyIndex
-    VK_QUEUE_FAMILY_IGNORED,                 // uint32_t           dstQueueFamilyIndex
-    m_download_buffer,                       // VkBuffer           buffer
-    0,                                       // VkDeviceSize       offset
-    size                                     // VkDeviceSize       size
-  };
-  vkCmdPipelineBarrier(cmdbuf, VK_ACCESS_TRANSFER_WRITE_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0, 0, nullptr, 1, &buffer_info,
-                       0, nullptr);
-
-  if (old_layout != VulkanTexture::Layout::TransferSrc)
-    T->TransitionSubresourcesToLayout(cmdbuf, 0, 1, 0, 1, VulkanTexture::Layout::TransferSrc, old_layout);
-
-  SubmitCommandBuffer(true);
-
-  // invalidate cpu cache before reading
-  VkResult res = vmaInvalidateAllocation(m_allocator, m_download_buffer_allocation, 0, size);
-  if (res != VK_SUCCESS)
-    LOG_VULKAN_ERROR(res, "vmaInvalidateAllocation() failed, readback may be incorrect: ");
-
-  StringUtil::StrideMemCpy(out_data, out_data_stride, m_download_buffer_map, pitch, width * T->GetPixelSize(), height);
-  return true;
-}
-
-bool VulkanDevice::CheckDownloadBufferSize(u32 required_size)
-{
-  if (m_download_buffer_size >= required_size)
-    return true;
-
-  DestroyDownloadBuffer();
-
-  // Adreno has slow coherent cached reads.
-  const bool is_adreno = (m_device_properties.vendorID == 0x5143 ||
-                          m_device_driver_properties.driverID == VK_DRIVER_ID_QUALCOMM_PROPRIETARY);
-
-  const VkBufferCreateInfo bci = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
-                                  nullptr,
-                                  0u,
-                                  required_size,
-                                  VK_BUFFER_USAGE_TRANSFER_DST_BIT,
-                                  VK_SHARING_MODE_EXCLUSIVE,
-                                  0u,
-                                  nullptr};
-
-  VmaAllocationCreateInfo aci = {};
-  aci.usage = VMA_MEMORY_USAGE_GPU_TO_CPU;
-  aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT;
-  aci.preferredFlags = is_adreno ? (VK_MEMORY_PROPERTY_HOST_CACHED_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) :
-                                   VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
-
-  VmaAllocationInfo ai = {};
-  VkResult res = vmaCreateBuffer(m_allocator, &bci, &aci, &m_download_buffer, &m_download_buffer_allocation, &ai);
-  if (res != VK_SUCCESS)
-  {
-    LOG_VULKAN_ERROR(res, "vmaCreateBuffer() failed: ");
-    return false;
-  }
-
-  m_download_buffer_map = static_cast<u8*>(ai.pMappedData);
-  return true;
-}
-
-void VulkanDevice::DestroyDownloadBuffer()
-{
-  if (m_download_buffer == VK_NULL_HANDLE)
-    return;
-
-  vmaDestroyBuffer(m_allocator, m_download_buffer, m_download_buffer_allocation);
-
-  // unmapped as part of the buffer destroy
-  m_download_buffer = VK_NULL_HANDLE;
-  m_download_buffer_allocation = VK_NULL_HANDLE;
-  m_download_buffer_map = nullptr;
-  m_download_buffer_size = 0;
-}
-
 VulkanSampler::VulkanSampler(VkSampler sampler) : m_sampler(sampler)
 {
 }
@@ -1081,3 +961,218 @@ std::unique_ptr<GPUTextureBuffer> VulkanDevice::CreateTextureBuffer(GPUTextureBu
 
   return tb;
 }
+
+VulkanDownloadTexture::VulkanDownloadTexture(u32 width, u32 height, GPUTexture::Format format, VmaAllocation allocation,
+                                             VkDeviceMemory memory, VkBuffer buffer, VkDeviceSize memory_offset,
+                                             VkDeviceSize buffer_size, const u8* map_ptr, u32 map_pitch)
+  : GPUDownloadTexture(width, height, format, (memory != VK_NULL_HANDLE)), m_allocation(allocation), m_memory(memory),
+    m_buffer(buffer), m_memory_offset(memory_offset), m_buffer_size(buffer_size) // imported <=> memory != null
+{
+  m_map_pointer = map_ptr;     // CPU-visible pointer chosen by Create()
+  m_current_pitch = map_pitch; // initial row pitch; CopyFromTexture() may change it for non-imported buffers
+}
+
+VulkanDownloadTexture::~VulkanDownloadTexture()
+{
+  if (m_allocation != VK_NULL_HANDLE)
+  {
+    // VMA-owned buffer: it was created mapped (VMA_ALLOCATION_CREATE_MAPPED_BIT), so no manual unmap is needed.
+    VulkanDevice::GetInstance().DeferBufferDestruction(m_buffer, m_allocation);
+  }
+  else
+  {
+    // Imported host memory: there is no VMA allocation, so hand over the raw VkBuffer/VkDeviceMemory pair instead.
+    DebugAssert(m_is_imported && m_memory != VK_NULL_HANDLE);
+    VulkanDevice::GetInstance().DeferBufferDestruction(m_buffer, m_memory);
+  }
+}
+
+std::unique_ptr<VulkanDownloadTexture> VulkanDownloadTexture::Create(u32 width, u32 height, GPUTexture::Format format,
+                                                                     void* memory, size_t memory_size,
+                                                                     u32 memory_stride)
+{
+  VulkanDevice& dev = VulkanDevice::GetInstance();
+  VmaAllocation allocation = VK_NULL_HANDLE; // stays null on the import path
+  VkDeviceMemory dev_memory = VK_NULL_HANDLE; // stays null on the VMA path
+  VkBuffer buffer = VK_NULL_HANDLE;
+  VkDeviceSize memory_offset = 0;
+  const u8* map_ptr = nullptr;
+  u32 map_pitch = 0;
+  u32 buffer_size = 0;
+
+  // No host memory supplied: allocate a mapped readback buffer through VMA.
+  if (!memory)
+  {
+    map_pitch = Common::AlignUpPow2(GPUTexture::CalcUploadPitch(format, width), dev.GetBufferCopyRowPitchAlignment());
+    buffer_size = height * map_pitch; // NOTE(review): 32-bit multiply; assumes height * pitch fits in u32
+
+    const VkBufferCreateInfo bci = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+                                    nullptr,
+                                    0u,
+                                    buffer_size,
+                                    VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+                                    VK_SHARING_MODE_EXCLUSIVE,
+                                    0u,
+                                    nullptr};
+
+    VmaAllocationCreateInfo aci = {};
+    aci.usage = VMA_MEMORY_USAGE_GPU_TO_CPU;
+    aci.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; // ai.pMappedData below is the mapping
+    aci.preferredFlags = VK_MEMORY_PROPERTY_HOST_CACHED_BIT; // preferred, not required; Map() invalidates before reads
+
+    VmaAllocationInfo ai = {};
+    VkResult res = vmaCreateBuffer(VulkanDevice::GetInstance().GetAllocator(), &bci, &aci, &buffer, &allocation, &ai);
+    if (res != VK_SUCCESS)
+    {
+      LOG_VULKAN_ERROR(res, "vmaCreateBuffer() failed: ");
+      return {};
+    }
+
+    DebugAssert(ai.pMappedData);
+    map_ptr = static_cast<u8*>(ai.pMappedData);
+  }
+  else
+  {
+    map_pitch = memory_stride; // the caller's stride is used as-is
+    buffer_size = height * map_pitch;
+    Assert(buffer_size <= memory_size); // caller's block must hold `height` rows of `memory_stride` bytes
+
+    if (!dev.TryImportHostMemory(memory, memory_size, VK_BUFFER_USAGE_TRANSFER_DST_BIT, &dev_memory, &buffer,
+                                 &memory_offset))
+    {
+      return {}; // import unsupported or failed; no fallback is attempted here
+    }
+
+    map_ptr = static_cast<u8*>(memory); // readers see the caller's own memory
+  }
+
+  return std::unique_ptr<VulkanDownloadTexture>(new VulkanDownloadTexture(
+    width, height, format, allocation, dev_memory, buffer, memory_offset, buffer_size, map_ptr, map_pitch));
+}
+
+void VulkanDownloadTexture::CopyFromTexture(u32 dst_x, u32 dst_y, GPUTexture* src, u32 src_x, u32 src_y, u32 width,
+                                            u32 height, u32 src_layer, u32 src_level, bool use_transfer_pitch)
+{
+  VulkanTexture* const vkTex = static_cast<VulkanTexture*>(src);
+  VulkanDevice& dev = VulkanDevice::GetInstance();
+
+  DebugAssert(vkTex->GetFormat() == m_format);
+  DebugAssert(src_level < vkTex->GetLevels());
+  DebugAssert((src_x + width) <= src->GetMipWidth(src_level) && (src_y + height) <= src->GetMipHeight(src_level));
+  DebugAssert((dst_x + width) <= m_width && (dst_y + height) <= m_height);
+  DebugAssert((dst_x == 0 && dst_y == 0) || !use_transfer_pitch); // transfer pitch only for copies to the origin
+  DebugAssert(!m_is_imported || !use_transfer_pitch);             // imported buffers keep the caller's pitch
+
+  u32 copy_offset, copy_size, copy_rows;
+  if (!m_is_imported) // the pitch of an imported buffer was fixed at creation
+    m_current_pitch = GetTransferPitch(use_transfer_pitch ? width : m_width, dev.GetBufferCopyRowPitchAlignment());
+  GetTransferSize(dst_x, dst_y, width, height, m_current_pitch, &copy_offset, &copy_size, &copy_rows);
+
+  dev.GetStatistics().num_downloads++;
+  if (dev.InRenderPass()) // transfer commands must be recorded outside a render pass
+    dev.EndRenderPass();
+  vkTex->CommitClear();
+
+  const VkCommandBuffer cmdbuf = dev.GetCurrentCommandBuffer();
+  GL_INS_FMT("VulkanDownloadTexture::CopyFromTexture: {{{},{}}} {}x{} => {{{},{}}}", src_x, src_y, width, height, dst_x,
+             dst_y);
+
+  VulkanTexture::Layout old_layout = vkTex->GetLayout();
+  if (old_layout == VulkanTexture::Layout::Undefined) // nothing to restore afterwards in this case
+    vkTex->TransitionToLayout(cmdbuf, VulkanTexture::Layout::TransferSrc);
+  else if (old_layout != VulkanTexture::Layout::TransferSrc)
+    vkTex->TransitionSubresourcesToLayout(cmdbuf, 0, 1, src_level, 1, old_layout, VulkanTexture::Layout::TransferSrc);
+
+  VkBufferImageCopy image_copy = {};
+  const VkImageAspectFlags aspect = vkTex->IsDepthStencil() ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_COLOR_BIT;
+  image_copy.bufferOffset = m_memory_offset + copy_offset; // m_memory_offset is the offset reported by the import
+  image_copy.bufferRowLength = GPUTexture::CalcUploadRowLengthFromPitch(m_format, m_current_pitch);
+  image_copy.bufferImageHeight = 0;
+  image_copy.imageSubresource = {aspect, src_level, src_layer, 1u};
+  image_copy.imageOffset = {static_cast<s32>(src_x), static_cast<s32>(src_y), 0};
+  image_copy.imageExtent = {width, height, 1u};
+
+  // record the image -> buffer copy; it only executes once the command buffer is submitted (see Flush())
+  vkCmdCopyImageToBuffer(cmdbuf, vkTex->GetImage(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, m_buffer, 1, &image_copy);
+
+  // make the transfer write visible to host reads, from the first byte written to the end of the buffer
+  const VkBufferMemoryBarrier buffer_info = {
+    VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, // VkStructureType    sType
+    nullptr,                                 // const void*        pNext
+    VK_ACCESS_TRANSFER_WRITE_BIT,            // VkAccessFlags      srcAccessMask
+    VK_ACCESS_HOST_READ_BIT,                 // VkAccessFlags      dstAccessMask
+    VK_QUEUE_FAMILY_IGNORED,                 // uint32_t           srcQueueFamilyIndex
+    VK_QUEUE_FAMILY_IGNORED,                 // uint32_t           dstQueueFamilyIndex
+    m_buffer,                                // VkBuffer           buffer
+    m_memory_offset + copy_offset,           // VkDeviceSize       offset
+    VK_WHOLE_SIZE                            // VkDeviceSize       size
+  };
+  vkCmdPipelineBarrier(cmdbuf, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0, 0, nullptr, 1,
+                       &buffer_info, 0, nullptr);
+
+  if (old_layout != VulkanTexture::Layout::TransferSrc && old_layout != VulkanTexture::Layout::Undefined)
+    vkTex->TransitionSubresourcesToLayout(cmdbuf, 0, 1, src_level, 1, VulkanTexture::Layout::TransferSrc, old_layout);
+
+  m_copy_fence_counter = dev.GetCurrentFenceCounter(); // lets Flush() tell whether this copy has completed
+  m_needs_cache_invalidate = true;
+  m_needs_flush = true;
+}
+
+bool VulkanDownloadTexture::Map(u32 x, u32 y, u32 width, u32 height)
+{
+  // Always mapped. Only VMA-owned buffers get a cache invalidate; imported memory has no VmaAllocation to pass.
+  if (m_needs_cache_invalidate && m_allocation != VK_NULL_HANDLE)
+  {
+    u32 copy_offset, copy_size, copy_rows;
+    GetTransferSize(x, y, width, height, m_current_pitch, &copy_offset, &copy_size, &copy_rows);
+    vmaInvalidateAllocation(VulkanDevice::GetInstance().GetAllocator(), m_allocation, copy_offset,
+                            m_current_pitch * copy_rows);
+    m_needs_cache_invalidate = false; // NOTE: only the region requested by this call was invalidated
+  }
+
+  return true;
+}
+
+void VulkanDownloadTexture::Unmap()
+{
+  // Intentionally empty: Map() performs no mapping of its own, so there is nothing to undo here.
+}
+
+void VulkanDownloadTexture::Flush()
+{
+  if (!m_needs_flush) // no copy recorded since the last flush
+    return;
+
+  m_needs_flush = false;
+
+  VulkanDevice& dev = VulkanDevice::GetInstance();
+  if (dev.GetCompletedFenceCounter() >= m_copy_fence_counter) // the copy's fence counter was already reached
+    return;
+
+  // The copy is either still in the command buffer being recorded (submit it) or in an earlier one (wait for it).
+  if (dev.GetCurrentFenceCounter() == m_copy_fence_counter)
+    dev.SubmitCommandBuffer(true); // presumably `true` waits for completion - TODO confirm
+  else
+    dev.WaitForFenceCounter(m_copy_fence_counter);
+}
+
+void VulkanDownloadTexture::SetDebugName(std::string_view name)
+{
+  if (name.empty()) // an empty name leaves the buffer unlabelled
+    return;
+
+  Vulkan::SetObjectName(VulkanDevice::GetInstance().GetVulkanDevice(), m_buffer, name); // only m_buffer is named
+}
+
+std::unique_ptr<GPUDownloadTexture> VulkanDevice::CreateDownloadTexture(u32 width, u32 height,
+                                                                        GPUTexture::Format format)
+{
+  return VulkanDownloadTexture::Create(width, height, format, nullptr, 0, 0); // null memory => VMA-allocated
+}
+
+std::unique_ptr<GPUDownloadTexture> VulkanDevice::CreateDownloadTexture(u32 width, u32 height,
+                                                                        GPUTexture::Format format, void* memory,
+                                                                        size_t memory_size, u32 memory_stride)
+{
+  return VulkanDownloadTexture::Create(width, height, format, memory, memory_size, memory_stride); // imports memory
+}
diff --git a/src/util/vulkan_texture.h b/src/util/vulkan_texture.h
index f33b8c472..7e8a0b684 100644
--- a/src/util/vulkan_texture.h
+++ b/src/util/vulkan_texture.h
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
 // SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
 
 #pragma once
@@ -150,3 +150,37 @@ private:
   VkBufferView m_buffer_view = VK_NULL_HANDLE;
   VkDescriptorSet m_descriptor_set = VK_NULL_HANDLE;
 };
+
+class VulkanDownloadTexture final : public GPUDownloadTexture // VkBuffer-backed readback target
+{
+public:
+  ~VulkanDownloadTexture() override; // hands the buffer to the device's deferred-destruction queue
+
+  static std::unique_ptr<VulkanDownloadTexture> Create(u32 width, u32 height, GPUTexture::Format format, void* memory,
+                                                       size_t memory_size, u32 memory_stride); // null memory => VMA
+
+  void CopyFromTexture(u32 dst_x, u32 dst_y, GPUTexture* src, u32 src_x, u32 src_y, u32 width, u32 height,
+                       u32 src_layer, u32 src_level, bool use_transfer_pitch) override;
+
+  bool Map(u32 x, u32 y, u32 width, u32 height) override; // always returns true; may invalidate the CPU cache
+  void Unmap() override;                                  // no-op
+
+  void Flush() override; // submits or waits until the last recorded copy has completed
+
+  void SetDebugName(std::string_view name) override;
+
+private:
+  VulkanDownloadTexture(u32 width, u32 height, GPUTexture::Format format, VmaAllocation allocation,
+                        VkDeviceMemory memory, VkBuffer buffer, VkDeviceSize memory_offset, VkDeviceSize buffer_size,
+                        const u8* map_ptr, u32 map_pitch);
+
+  VmaAllocation m_allocation = VK_NULL_HANDLE; // set when the buffer came from VMA; null when imported
+  VkDeviceMemory m_memory = VK_NULL_HANDLE;    // set only when host memory was imported
+  VkBuffer m_buffer = VK_NULL_HANDLE;          // destination buffer of CopyFromTexture()
+
+  u64 m_copy_fence_counter = 0;     // device fence counter that was current when the last copy was recorded
+  VkDeviceSize m_memory_offset = 0; // added to every copy's buffer offset; filled in by the import path
+  VkDeviceSize m_buffer_size = 0;   // size computed by Create() (height * pitch)
+
+  bool m_needs_cache_invalidate = false; // set by CopyFromTexture(), cleared by Map()
+};