From 7566c45f6440402f077be8962354a388617f959c Mon Sep 17 00:00:00 2001
From: Connor McLaughlin <stenzek@gmail.com>
Date: Sun, 18 Oct 2020 14:43:55 +1000
Subject: [PATCH] CPU/Recompiler: Implement fastmem

---
 .../src/main/res/xml/advanced_preferences.xml |   6 +
 src/common/page_fault_handler.cpp             |   1 +
 src/common/page_fault_handler.h               |   3 +-
 src/core/bus.cpp                              | 210 ++++++++-
 src/core/bus.h                                |  52 ++-
 src/core/cpu_code_cache.cpp                   | 229 ++++++++--
 src/core/cpu_code_cache.h                     |  16 +-
 src/core/cpu_core.cpp                         |   6 +
 src/core/cpu_core.h                           |   2 +
 src/core/cpu_recompiler_code_generator.cpp    |  23 +-
 src/core/cpu_recompiler_code_generator.h      |  11 +-
 .../cpu_recompiler_code_generator_aarch64.cpp | 283 ++++++++++---
 .../cpu_recompiler_code_generator_x64.cpp     | 399 +++++++++++++++---
 src/core/cpu_recompiler_thunks.h              |   1 +
 src/core/cpu_recompiler_types.h               |  10 +
 src/core/host_interface.cpp                   |   7 +-
 src/core/settings.cpp                         |   2 +
 src/core/settings.h                           |   6 +
 src/core/system.cpp                           |  12 +-
 src/core/types.h                              |   2 +-
 src/duckstation-qt/advancedsettingswidget.cpp |  15 +-
 src/duckstation-sdl/sdl_host_interface.cpp    |   1 +
 22 files changed, 1104 insertions(+), 193 deletions(-)

diff --git a/android/app/src/main/res/xml/advanced_preferences.xml b/android/app/src/main/res/xml/advanced_preferences.xml
index 75e718d14..c1ce2a1ac 100644
--- a/android/app/src/main/res/xml/advanced_preferences.xml
+++ b/android/app/src/main/res/xml/advanced_preferences.xml
@@ -40,6 +40,12 @@
             app:defaultValue="false"
             app:summary="Determines whether the CPU's instruction cache is simulated in the recompiler. Improves accuracy at a small cost to performance. If games are running too fast, try enabling this option."
             app:iconSpaceReserved="false" />
+        <SwitchPreferenceCompat
+            app:key="CPU/Fastmem"
+            app:title="CPU Recompiler Fast Memory Access"
+            app:defaultValue="true"
+            app:summary="Makes guest memory access more efficient by using page faults and backpatching. Disable if it is unstable on your device."
+            app:iconSpaceReserved="false" />
         <SwitchPreferenceCompat
             app:key="BIOS/PatchTTYEnable"
             app:title="@string/settings_console_tty_output"
diff --git a/src/common/page_fault_handler.cpp b/src/common/page_fault_handler.cpp
index f9fd1686c..e0125759c 100644
--- a/src/common/page_fault_handler.cpp
+++ b/src/common/page_fault_handler.cpp
@@ -3,6 +3,7 @@
 #include <algorithm>
 #include <cstring>
 #include <mutex>
+#include <vector>
 Log_SetChannel(Common::PageFaultHandler);
 
 #if defined(WIN32)
diff --git a/src/common/page_fault_handler.h b/src/common/page_fault_handler.h
index b2c4f9040..67ef38cbd 100644
--- a/src/common/page_fault_handler.h
+++ b/src/common/page_fault_handler.h
@@ -1,6 +1,5 @@
 #pragma once
 #include "types.h"
-#include <functional>
 
 namespace Common::PageFaultHandler {
 enum class HandlerResult
@@ -9,7 +8,7 @@ enum class HandlerResult
   ExecuteNextHandler,
 };
 
-using Callback = std::function<HandlerResult(void* exception_pc, void* fault_address, bool is_write)>;
+using Callback = HandlerResult(*)(void* exception_pc, void* fault_address, bool is_write);
 using Handle = void*;
 
 bool InstallHandler(void* owner, Callback callback);
diff --git a/src/core/bus.cpp b/src/core/bus.cpp
index f238e6b17..52fde0a77 100644
--- a/src/core/bus.cpp
+++ b/src/core/bus.cpp
@@ -10,6 +10,7 @@
 #include "cpu_disasm.h"
 #include "dma.h"
 #include "gpu.h"
+#include "host_interface.h"
 #include "interrupt_controller.h"
 #include "mdec.h"
 #include "pad.h"
@@ -22,11 +23,6 @@ Log_SetChannel(Bus);
 
 namespace Bus {
 
-enum : TickCount
-{
-  RAM_READ_TICKS = 4
-};
-
 union MEMDELAY
 {
   u32 bits;
@@ -74,7 +70,7 @@ union MEMCTRL
 };
 
 std::bitset<CPU_CODE_CACHE_PAGE_COUNT> m_ram_code_bits{};
-u8 g_ram[RAM_SIZE]{};   // 2MB RAM
+u8* g_ram = nullptr;    // 2MB RAM
 u8 g_bios[BIOS_SIZE]{}; // 512K BIOS ROM
 
 static std::array<TickCount, 3> m_exp1_access_time = {};
@@ -90,9 +86,17 @@ static u32 m_ram_size_reg = 0;
 
 static std::string m_tty_line_buffer;
 
+static Common::MemoryArena m_memory_arena;
+static u8* m_fastmem_base = nullptr;
+static std::vector<Common::MemoryArena::View> m_fastmem_ram_views;
+
 static std::tuple<TickCount, TickCount, TickCount> CalculateMemoryTiming(MEMDELAY mem_delay, COMDELAY common_delay);
 static void RecalculateMemoryTimings();
 
+static void SetCodePageFastmemProtection(u32 page_index, bool writable);
+static bool AllocateMemory();
+static void UnmapFastmemViews();
+
 #define FIXUP_WORD_READ_OFFSET(offset) ((offset) & ~u32(3))
 #define FIXUP_WORD_READ_VALUE(offset, value) ((value) >> (((offset)&u32(3)) * 8u))
 #define FIXUP_HALFWORD_READ_OFFSET(offset) ((offset) & ~u32(1))
@@ -108,19 +112,30 @@ ALWAYS_INLINE static void FixupUnalignedWordAccessW32(u32& offset, u32& value)
   value <<= byte_offset * 8;
 }
 
-void Initialize()
+bool Initialize()
 {
+  if (!AllocateMemory())
+  {
+    g_host_interface->ReportError("Failed to allocate memory");
+    return false;
+  }
+
   Reset();
+  return true;
 }
 
 void Shutdown()
 {
-  //
+  UnmapFastmemViews();
+  if (g_ram)
+    m_memory_arena.ReleaseViewPtr(g_ram, RAM_SIZE);
+
+  CPU::g_state.fastmem_base = nullptr;
 }
 
 void Reset()
 {
-  std::memset(g_ram, 0, sizeof(g_ram));
+  std::memset(g_ram, 0, RAM_SIZE);
   m_MEMCTRL.exp1_base = 0x1F000000;
   m_MEMCTRL.exp2_base = 0x1F802000;
   m_MEMCTRL.exp1_delay_size.bits = 0x0013243F;
@@ -142,8 +157,8 @@ bool DoState(StateWrapper& sw)
   sw.Do(&m_bios_access_time);
   sw.Do(&m_cdrom_access_time);
   sw.Do(&m_spu_access_time);
-  sw.DoBytes(g_ram, sizeof(g_ram));
-  sw.DoBytes(g_bios, sizeof(g_bios));
+  sw.DoBytes(g_ram, RAM_SIZE);
+  sw.DoBytes(g_bios, BIOS_SIZE);
   sw.DoArray(m_MEMCTRL.regs, countof(m_MEMCTRL.regs));
   sw.Do(&m_ram_size_reg);
   sw.Do(&m_tty_line_buffer);
@@ -222,6 +237,179 @@ void RecalculateMemoryTimings()
                   m_spu_access_time[2] + 1);
 }
 
+bool AllocateMemory()
+{
+  if (!m_memory_arena.Create(MEMORY_ARENA_SIZE, true, false))
+  {
+    Log_ErrorPrint("Failed to create memory arena");
+    return false;
+  }
+
+  // Create the base views.
+  g_ram = static_cast<u8*>(m_memory_arena.CreateViewPtr(MEMORY_ARENA_RAM_OFFSET, RAM_SIZE, true, false));
+  if (!g_ram)
+  {
+    Log_ErrorPrint("Failed to create base views of memory");
+    return false;
+  }
+
+  return true;
+}
+
+void UnmapFastmemViews()
+{
+  m_fastmem_ram_views.clear();
+}
+
+void UpdateFastmemViews(bool enabled, bool isolate_cache)
+{
+  UnmapFastmemViews();
+  if (!enabled)
+  {
+    m_fastmem_base = nullptr;
+    return;
+  }
+
+  Log_DevPrintf("Remapping fastmem area, isolate cache = %s", isolate_cache ? "true" : "false");
+  if (!m_fastmem_base)
+  {
+    m_fastmem_base = static_cast<u8*>(m_memory_arena.FindBaseAddressForMapping(FASTMEM_REGION_SIZE));
+    if (!m_fastmem_base)
+    {
+      Log_ErrorPrint("Failed to find base address for fastmem");
+      return;
+    }
+
+    Log_InfoPrintf("Fastmem base: %p", m_fastmem_base);
+    CPU::g_state.fastmem_base = m_fastmem_base;
+  }
+
+  auto MapRAM = [](u32 base_address) {
+    u8* map_address = m_fastmem_base + base_address;
+    auto view = m_memory_arena.CreateView(MEMORY_ARENA_RAM_OFFSET, RAM_SIZE, true, false, map_address);
+    if (!view)
+    {
+      Log_ErrorPrintf("Failed to map RAM at fastmem area %p (size 0x%08X)", map_address, RAM_SIZE);
+      return;
+    }
+
+    // mark all pages with code as non-writable
+    for (u32 i = 0; i < CPU_CODE_CACHE_PAGE_COUNT; i++)
+    {
+      if (m_ram_code_bits[i])
+      {
+        u8* page_address = map_address + (i * CPU_CODE_CACHE_PAGE_SIZE);
+        if (!m_memory_arena.SetPageProtection(page_address, CPU_CODE_CACHE_PAGE_SIZE, true, false, false))
+        {
+          Log_ErrorPrintf("Failed to write-protect code page at %p", page_address);
+          return;
+        }
+      }
+    }
+
+    m_fastmem_ram_views.push_back(std::move(view.value()));
+  };
+
+  if (!isolate_cache)
+  {
+    // KUSEG - cached
+    MapRAM(0x00000000);
+    //MapRAM(0x00200000);
+    //MapRAM(0x00400000);
+    //MapRAM(0x00600000);
+
+    // KSEG0 - cached
+    MapRAM(0x80000000);
+    //MapRAM(0x80200000);
+    //MapRAM(0x80400000);
+    //MapRAM(0x80600000);
+  }
+
+  // KSEG1 - uncached
+  MapRAM(0xA0000000);
+  //MapRAM(0xA0200000);
+  //MapRAM(0xA0400000);
+  //MapRAM(0xA0600000);
+}
+
+bool IsRAMCodePage(u32 index)
+{
+  return m_ram_code_bits[index];
+}
+
+void SetRAMCodePage(u32 index)
+{
+  if (m_ram_code_bits[index])
+    return;
+
+  // protect fastmem pages
+  m_ram_code_bits[index] = true;
+  SetCodePageFastmemProtection(index, false);
+}
+
+void ClearRAMCodePage(u32 index)
+{
+  if (!m_ram_code_bits[index])
+    return;
+
+  // unprotect fastmem pages
+  m_ram_code_bits[index] = false;
+  SetCodePageFastmemProtection(index, true);
+}
+
+void SetCodePageFastmemProtection(u32 page_index, bool writable)
+{
+  // unprotect fastmem pages
+  for (const auto& view : m_fastmem_ram_views)
+  {
+    u8* page_address = static_cast<u8*>(view.GetBasePointer()) + (page_index * CPU_CODE_CACHE_PAGE_SIZE);
+    if (!m_memory_arena.SetPageProtection(page_address, CPU_CODE_CACHE_PAGE_SIZE, true, writable, false))
+    {
+      Log_ErrorPrintf("Failed to %s code page %u (0x%08X) @ %p", writable ? "unprotect" : "protect", page_index,
+                      page_index * CPU_CODE_CACHE_PAGE_SIZE, page_address);
+    }
+  }
+}
+
+void ClearRAMCodePageFlags()
+{
+  m_ram_code_bits.reset();
+
+  // unprotect fastmem pages
+  for (const auto& view : m_fastmem_ram_views)
+  {
+    if (!m_memory_arena.SetPageProtection(view.GetBasePointer(), view.GetMappingSize(), true, true, false))
+    {
+      Log_ErrorPrintf("Failed to unprotect code pages for fastmem view @ %p", view.GetBasePointer());
+    }
+  }
+}
+
+bool IsCodePageAddress(PhysicalMemoryAddress address)
+{
+  return IsRAMAddress(address) ? m_ram_code_bits[(address & RAM_MASK) / CPU_CODE_CACHE_PAGE_SIZE] : false;
+}
+
+bool HasCodePagesInRange(PhysicalMemoryAddress start_address, u32 size)
+{
+  if (!IsRAMAddress(start_address))
+    return false;
+
+  start_address = (start_address & RAM_MASK);
+
+  const u32 end_address = start_address + size;
+  while (start_address < end_address)
+  {
+    const u32 code_page_index = start_address / CPU_CODE_CACHE_PAGE_SIZE;
+    if (m_ram_code_bits[code_page_index])
+      return true;
+
+    start_address += CPU_CODE_CACHE_PAGE_SIZE;
+  }
+
+  return false;
+}
+
 static TickCount DoInvalidAccess(MemoryAccessType type, MemoryAccessSize size, PhysicalMemoryAddress address,
                                  u32& value)
 {
diff --git a/src/core/bus.h b/src/core/bus.h
index d2f187ba6..9b8056f9a 100644
--- a/src/core/bus.h
+++ b/src/core/bus.h
@@ -1,5 +1,6 @@
 #pragma once
 #include "common/bitfield.h"
+#include "common/memory_arena.h"
 #include "types.h"
 #include <array>
 #include <bitset>
@@ -65,26 +66,67 @@ enum : u32
   MEMCTRL_REG_COUNT = 9
 };
 
-void Initialize();
+enum : TickCount
+{
+  RAM_READ_TICKS = 4
+};
+
+enum : size_t
+{
+  // Our memory arena contains storage for RAM.
+  MEMORY_ARENA_SIZE = RAM_SIZE,
+
+  // Offsets within the memory arena.
+  MEMORY_ARENA_RAM_OFFSET = 0,
+
+  // Fastmem region size is 4GB to cover the entire 32-bit address space.
+  FASTMEM_REGION_SIZE = UINT64_C(0x100000000)
+};
+
+bool Initialize();
 void Shutdown();
 void Reset();
 bool DoState(StateWrapper& sw);
 
+u8* GetFastmemBase();
+void UpdateFastmemViews(bool enabled, bool isolate_cache);
+
 void SetExpansionROM(std::vector<u8> data);
 void SetBIOS(const std::vector<u8>& image);
 
 extern std::bitset<CPU_CODE_CACHE_PAGE_COUNT> m_ram_code_bits;
-extern u8 g_ram[RAM_SIZE];   // 2MB RAM
+extern u8* g_ram;            // 2MB RAM
 extern u8 g_bios[BIOS_SIZE]; // 512K BIOS ROM
 
+/// Returns true if the address specified is writable (RAM).
+ALWAYS_INLINE static bool IsRAMAddress(PhysicalMemoryAddress address)
+{
+  return address < RAM_MIRROR_END;
+}
+
+/// Returns the code page index for a RAM address.
+ALWAYS_INLINE static u32 GetRAMCodePageIndex(PhysicalMemoryAddress address)
+{
+  return (address & RAM_MASK) / CPU_CODE_CACHE_PAGE_SIZE;
+}
+
+/// Returns true if the specified page contains code.
+bool IsRAMCodePage(u32 index);
+
 /// Flags a RAM region as code, so we know when to invalidate blocks.
-ALWAYS_INLINE void SetRAMCodePage(u32 index) { m_ram_code_bits[index] = true; }
+void SetRAMCodePage(u32 index);
 
 /// Unflags a RAM region as code, the code cache will no longer be notified when writes occur.
-ALWAYS_INLINE void ClearRAMCodePage(u32 index) { m_ram_code_bits[index] = false; }
+void ClearRAMCodePage(u32 index);
 
 /// Clears all code bits for RAM regions.
-ALWAYS_INLINE void ClearRAMCodePageFlags() { m_ram_code_bits.reset(); }
+void ClearRAMCodePageFlags();
+
+/// Returns true if the specified address is in a code page.
+bool IsCodePageAddress(PhysicalMemoryAddress address);
+
+/// Returns true if the range specified overlaps with a code page.
+bool HasCodePagesInRange(PhysicalMemoryAddress start_address, u32 size);
 
 /// Returns the number of cycles stolen by DMA RAM access.
 ALWAYS_INLINE TickCount GetDMARAMTickCount(u32 word_count)
diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp
index 7e8db76f5..d8242d8e2 100644
--- a/src/core/cpu_code_cache.cpp
+++ b/src/core/cpu_code_cache.cpp
@@ -5,6 +5,7 @@
 #include "cpu_core.h"
 #include "cpu_core_private.h"
 #include "cpu_disasm.h"
+#include "settings.h"
 #include "system.h"
 #include "timing_event.h"
 Log_SetChannel(CPU::CodeCache);
@@ -62,6 +63,7 @@ static void SetFastMap(u32 pc, CodeBlock::HostCodePointer function)
 #endif
 
 using BlockMap = std::unordered_map<u32, CodeBlock*>;
+using HostCodeMap = std::map<CodeBlock::HostCodePointer, CodeBlock*>;
 
 void LogCurrentState();
 
@@ -86,36 +88,68 @@ static void LinkBlock(CodeBlock* from, CodeBlock* to);
 /// Unlink all blocks which point to this block, and any that this block links to.
 static void UnlinkBlock(CodeBlock* block);
 
-static bool s_use_recompiler = false;
+static void ClearState();
+
 static BlockMap s_blocks;
 static std::array<std::vector<CodeBlock*>, CPU_CODE_CACHE_PAGE_COUNT> m_ram_block_map;
 
-void Initialize(bool use_recompiler)
+#ifdef WITH_RECOMPILER
+static HostCodeMap s_host_code_map;
+
+static void AddBlockToHostCodeMap(CodeBlock* block);
+static void RemoveBlockFromHostCodeMap(CodeBlock* block);
+static bool InitializeFastmem();
+static void ShutdownFastmem();
+static Common::PageFaultHandler::HandlerResult PageFaultHandler(void* exception_pc, void* fault_address, bool is_write);
+#endif
+
+void Initialize()
 {
   Assert(s_blocks.empty());
 
 #ifdef WITH_RECOMPILER
-  s_use_recompiler = use_recompiler;
-#ifdef USE_STATIC_CODE_BUFFER
-  if (!s_code_buffer.Initialize(s_code_storage, sizeof(s_code_storage), RECOMPILER_FAR_CODE_CACHE_SIZE,
-                                RECOMPILER_GUARD_SIZE))
-#else
-  if (!s_code_buffer.Allocate(RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE))
-#endif
+  if (g_settings.IsUsingRecompiler())
   {
-    Panic("Failed to initialize code space");
-  }
-
-  ResetFastMap();
-  CompileDispatcher();
+#ifdef USE_STATIC_CODE_BUFFER
+    if (!s_code_buffer.Initialize(s_code_storage, sizeof(s_code_storage), RECOMPILER_FAR_CODE_CACHE_SIZE,
+                                  RECOMPILER_GUARD_SIZE))
 #else
-  s_use_recompiler = false;
+    if (!s_code_buffer.Allocate(RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE))
+#endif
+    {
+      Panic("Failed to initialize code space");
+    }
+
+    if (g_settings.IsUsingFastmem() && !InitializeFastmem())
+      Panic("Failed to initialize fastmem");
+
+    ResetFastMap();
+    CompileDispatcher();
+  }
+#endif
+}
+
+void ClearState()
+{
+  Bus::ClearRAMCodePageFlags();
+  for (auto& it : m_ram_block_map)
+    it.clear();
+
+  for (const auto& it : s_blocks)
+    delete it.second;
+
+  s_blocks.clear();
+#ifdef WITH_RECOMPILER
+  s_host_code_map.clear();
+  s_code_buffer.Reset();
+  ResetFastMap();
 #endif
 }
 
 void Shutdown()
 {
-  Flush();
+  ClearState();
+  ShutdownFastmem();
 #ifdef WITH_RECOMPILER
   s_code_buffer.Destroy();
 #endif
@@ -286,31 +320,42 @@ void ExecuteRecompiler()
 
 #endif
 
-void SetUseRecompiler(bool enable)
+void Reinitialize()
 {
-#ifdef WITH_RECOMPILER
-  if (s_use_recompiler == enable)
-    return;
+  ClearState();
 
-  s_use_recompiler = enable;
-  Flush();
+#ifdef WITH_RECOMPILER
+
+  ShutdownFastmem();
+  s_code_buffer.Destroy();
+
+  if (g_settings.IsUsingRecompiler())
+  {
+
+#ifdef USE_STATIC_CODE_BUFFER
+    if (!s_code_buffer.Initialize(s_code_storage, sizeof(s_code_storage), RECOMPILER_FAR_CODE_CACHE_SIZE,
+                                  RECOMPILER_GUARD_SIZE))
+#else
+    if (!s_code_buffer.Allocate(RECOMPILER_CODE_CACHE_SIZE, RECOMPILER_FAR_CODE_CACHE_SIZE))
+#endif
+    {
+      Panic("Failed to initialize code space");
+    }
+
+    if (g_settings.IsUsingFastmem() && !InitializeFastmem())
+      Panic("Failed to initialize fastmem");
+
+    ResetFastMap();
+    CompileDispatcher();
+  }
 #endif
 }
 
 void Flush()
 {
-  Bus::ClearRAMCodePageFlags();
-  for (auto& it : m_ram_block_map)
-    it.clear();
-
-  for (const auto& it : s_blocks)
-    delete it.second;
-  s_blocks.clear();
-#ifdef WITH_RECOMPILER
-  s_code_buffer.Reset();
-  ResetFastMap();
-  CompileDispatcher();
-#endif
+  ClearState();
+  if (g_settings.IsUsingRecompiler())
+    CompileDispatcher();
 }
 
 void LogCurrentState()
@@ -365,6 +410,8 @@ CodeBlock* LookupBlock(CodeBlockKey key)
   }
 
   iter = s_blocks.emplace(key.bits, block).first;
+  AddBlockToHostCodeMap(block);
+
   return block;
 }
 
@@ -391,6 +438,8 @@ bool RevalidateBlock(CodeBlock* block)
   return true;
 
 recompile:
+  RemoveBlockFromHostCodeMap(block);
+
   block->instructions.clear();
   if (!CompileBlock(block))
   {
@@ -400,6 +449,7 @@ recompile:
   }
 
   // re-add to page map again
+  AddBlockToHostCodeMap(block);
   if (block->IsInRAM())
     AddBlockToPageMap(block);
 
@@ -446,6 +496,9 @@ bool CompileBlock(CodeBlock* block)
       block->uncached_fetch_ticks += GetInstructionReadTicks(pc);
     }
 
+    block->contains_loadstore_instructions |= cbi.is_load_instruction;
+    block->contains_loadstore_instructions |= cbi.is_store_instruction;
+
     // instruction is decoded now
     block->instructions.push_back(cbi);
     pc += sizeof(cbi.instruction.bits);
@@ -488,7 +541,7 @@ bool CompileBlock(CodeBlock* block)
   }
 
 #ifdef WITH_RECOMPILER
-  if (s_use_recompiler)
+  if (g_settings.IsUsingRecompiler())
   {
     // Ensure we're not going to run out of space while compiling this block.
     if (s_code_buffer.GetFreeCodeSpace() <
@@ -559,6 +612,9 @@ void FlushBlock(CodeBlock* block)
     RemoveBlockFromPageMap(block);
 
   UnlinkBlock(block);
+#ifdef WITH_RECOMPILER
+  RemoveBlockFromHostCodeMap(block);
+#endif
 
   s_blocks.erase(iter);
   delete block;
@@ -620,4 +676,107 @@ void UnlinkBlock(CodeBlock* block)
   block->link_successors.clear();
 }
 
+#ifdef WITH_RECOMPILER
+
+void AddBlockToHostCodeMap(CodeBlock* block)
+{
+  if (!g_settings.IsUsingRecompiler())
+    return;
+
+  auto ir = s_host_code_map.emplace(block->host_code, block);
+  Assert(ir.second);
+}
+
+void RemoveBlockFromHostCodeMap(CodeBlock* block)
+{
+  if (!g_settings.IsUsingRecompiler())
+    return;
+
+  HostCodeMap::iterator hc_iter = s_host_code_map.find(block->host_code);
+  Assert(hc_iter != s_host_code_map.end());
+  s_host_code_map.erase(hc_iter);
+}
+
+bool InitializeFastmem()
+{
+  if (!Common::PageFaultHandler::InstallHandler(&s_host_code_map, PageFaultHandler))
+  {
+    Log_ErrorPrintf("Failed to install page fault handler");
+    return false;
+  }
+
+  Bus::UpdateFastmemViews(true, g_state.cop0_regs.sr.Isc);
+  return true;
+}
+
+void ShutdownFastmem()
+{
+  Common::PageFaultHandler::RemoveHandler(&s_host_code_map);
+  Bus::UpdateFastmemViews(false, false);
+}
+
+Common::PageFaultHandler::HandlerResult PageFaultHandler(void* exception_pc, void* fault_address, bool is_write)
+{
+  if (static_cast<u8*>(fault_address) < g_state.fastmem_base ||
+      (static_cast<u8*>(fault_address) - g_state.fastmem_base) >= Bus::FASTMEM_REGION_SIZE)
+  {
+    return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler;
+  }
+
+  const PhysicalMemoryAddress fastmem_address =
+    static_cast<PhysicalMemoryAddress>(static_cast<ptrdiff_t>(static_cast<u8*>(fault_address) - g_state.fastmem_base));
+
+  Log_DevPrintf("Page fault handler invoked at PC=%p Address=%p %s, fastmem offset 0x%08X", exception_pc, fault_address,
+                is_write ? "(write)" : "(read)", fastmem_address);
+
+  if (is_write && !g_state.cop0_regs.sr.Isc && Bus::IsRAMAddress(fastmem_address))
+  {
+    // this is probably a code page, since we aren't going to fault due to requiring fastmem on RAM.
+    const u32 code_page_index = Bus::GetRAMCodePageIndex(fastmem_address);
+    if (Bus::IsRAMCodePage(code_page_index))
+    {
+      InvalidateBlocksWithPageIndex(code_page_index);
+      return Common::PageFaultHandler::HandlerResult::ContinueExecution;
+    }
+  }
+
+  // use upper_bound to find the next block after the pc
+  HostCodeMap::iterator upper_iter =
+    s_host_code_map.upper_bound(reinterpret_cast<CodeBlock::HostCodePointer>(exception_pc));
+  if (upper_iter == s_host_code_map.begin())
+    return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler;
+
+  // then decrement it by one to (hopefully) get the block we want
+  upper_iter--;
+
+  // find the loadstore info in the code block
+  CodeBlock* block = upper_iter->second;
+  for (auto bpi_iter = block->loadstore_backpatch_info.begin(); bpi_iter != block->loadstore_backpatch_info.end();
+       ++bpi_iter)
+  {
+    const Recompiler::LoadStoreBackpatchInfo& lbi = *bpi_iter;
+    if (lbi.host_pc == exception_pc)
+    {
+      // found it, do fixup
+      if (Recompiler::CodeGenerator::BackpatchLoadStore(lbi))
+      {
+        // remove the backpatch entry since we won't be coming back to this one
+        block->loadstore_backpatch_info.erase(bpi_iter);
+        return Common::PageFaultHandler::HandlerResult::ContinueExecution;
+      }
+      else
+      {
+        Log_ErrorPrintf("Failed to backpatch %p in block 0x%08X", exception_pc, block->GetPC());
+        return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler;
+      }
+    }
+  }
+
+  // we didn't find the pc in our list..
+  Log_ErrorPrintf("Loadstore PC not found for %p in block 0x%08X", exception_pc, block->GetPC());
+  return Common::PageFaultHandler::HandlerResult::ExecuteNextHandler;
+}
+
+#endif
+
 } // namespace CPU::CodeCache
diff --git a/src/core/cpu_code_cache.h b/src/core/cpu_code_cache.h
index 6d09a8c0d..ee03aea6b 100644
--- a/src/core/cpu_code_cache.h
+++ b/src/core/cpu_code_cache.h
@@ -2,12 +2,18 @@
 #include "bus.h"
 #include "common/bitfield.h"
 #include "common/jit_code_buffer.h"
+#include "common/page_fault_handler.h"
 #include "cpu_types.h"
 #include <array>
+#include <map>
 #include <memory>
 #include <unordered_map>
 #include <vector>
 
+#ifdef WITH_RECOMPILER
+#include "cpu_recompiler_types.h"
+#endif
+
 namespace CPU {
 
 enum : u32
@@ -71,6 +77,12 @@ struct CodeBlock
 
   TickCount uncached_fetch_ticks = 0;
   u32 icache_line_count = 0;
+
+#ifdef WITH_RECOMPILER
+  std::vector<Recompiler::LoadStoreBackpatchInfo> loadstore_backpatch_info;
+#endif
+
+  bool contains_loadstore_instructions = false;
   bool invalidated = false;
 
   const u32 GetPC() const { return key.GetPC(); }
@@ -89,7 +101,7 @@ struct CodeBlock
 
 namespace CodeCache {
 
-void Initialize(bool use_recompiler);
+void Initialize();
 void Shutdown();
 void Execute();
 
@@ -105,7 +117,7 @@ void ExecuteRecompiler();
 void Flush();
 
 /// Changes whether the recompiler is enabled.
-void SetUseRecompiler(bool enable);
+void Reinitialize();
 
 /// Invalidates all blocks which are in the range of the specified code page.
 void InvalidateBlocksWithPageIndex(u32 page_index);
diff --git a/src/core/cpu_core.cpp b/src/core/cpu_core.cpp
index b433ff3ff..a8e21e2f7 100644
--- a/src/core/cpu_core.cpp
+++ b/src/core/cpu_core.cpp
@@ -1,4 +1,5 @@
 #include "cpu_core.h"
+#include "bus.h"
 #include "common/align.h"
 #include "common/file_system.h"
 #include "common/log.h"
@@ -1563,6 +1564,11 @@ bool InterpretInstructionPGXP()
   return g_state.exception_raised;
 }
 
+void UpdateFastmemMapping()
+{
+  Bus::UpdateFastmemViews(true, g_state.cop0_regs.sr.Isc);
+}
+
 } // namespace Recompiler::Thunks
 
 } // namespace CPU
\ No newline at end of file
diff --git a/src/core/cpu_core.h b/src/core/cpu_core.h
index 2fb7be2ab..5a59c1d3d 100644
--- a/src/core/cpu_core.h
+++ b/src/core/cpu_core.h
@@ -79,6 +79,8 @@ struct State
   // GTE registers are stored here so we can access them on ARM with a single instruction
   GTE::Regs gte_regs = {};
 
+  u8* fastmem_base = nullptr;
+
   // data cache (used as scratchpad)
   std::array<u8, DCACHE_SIZE> dcache = {};
   std::array<u32, ICACHE_LINES> icache_tags = {};
diff --git a/src/core/cpu_recompiler_code_generator.cpp b/src/core/cpu_recompiler_code_generator.cpp
index a4999d08b..e0f84f416 100644
--- a/src/core/cpu_recompiler_code_generator.cpp
+++ b/src/core/cpu_recompiler_code_generator.cpp
@@ -19,8 +19,7 @@ u32 CodeGenerator::CalculateRegisterOffset(Reg reg)
   return u32(offsetof(State, regs.r[0]) + (static_cast<u32>(reg) * sizeof(u32)));
 }
 
-bool CodeGenerator::CompileBlock(const CodeBlock* block, CodeBlock::HostCodePointer* out_host_code,
-                                 u32* out_host_code_size)
+bool CodeGenerator::CompileBlock(CodeBlock* block, CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size)
 {
   // TODO: Align code buffer.
 
@@ -40,8 +39,10 @@ bool CodeGenerator::CompileBlock(const CodeBlock* block, CodeBlock::HostCodePoin
     Log_DebugPrintf("Compiling instruction '%s'", disasm.GetCharArray());
 #endif
 
+    m_current_instruction = cbi;
     if (!CompileInstruction(*cbi))
     {
+      m_current_instruction = nullptr;
       m_block_end = nullptr;
       m_block_start = nullptr;
       m_block = nullptr;
@@ -60,6 +61,7 @@ bool CodeGenerator::CompileBlock(const CodeBlock* block, CodeBlock::HostCodePoin
 
   DebugAssert(m_register_cache.GetUsedHostRegisters() == 0);
 
+  m_current_instruction = nullptr;
   m_block_end = nullptr;
   m_block_start = nullptr;
   m_block = nullptr;
@@ -1912,7 +1914,22 @@ bool CodeGenerator::Compile_cop0(const CodeBlockInstruction& cbi)
               value = AndValues(value, Value::FromConstantU32(write_mask));
             }
 
-            EmitStoreCPUStructField(offset, value);
+            // changing SR[Isc] needs to update fastmem views
+            if (reg == Cop0Reg::SR && g_settings.cpu_fastmem)
+            {
+              LabelType skip_fastmem_update;
+              Value old_value = m_register_cache.AllocateScratch(RegSize_32);
+              EmitLoadCPUStructField(old_value.host_reg, RegSize_32, offset);
+              EmitStoreCPUStructField(offset, value);
+              EmitXor(old_value.host_reg, old_value.host_reg, value);
+              EmitBranchIfBitClear(old_value.host_reg, RegSize_32, 16, &skip_fastmem_update);
+              EmitFunctionCall(nullptr, &Thunks::UpdateFastmemMapping, m_register_cache.GetCPUPtr());
+              EmitBindLabel(&skip_fastmem_update);
+            }
+            else
+            {
+              EmitStoreCPUStructField(offset, value);
+            }
           }
         }
 
diff --git a/src/core/cpu_recompiler_code_generator.h b/src/core/cpu_recompiler_code_generator.h
index be07f25e8..26fc77b3d 100644
--- a/src/core/cpu_recompiler_code_generator.h
+++ b/src/core/cpu_recompiler_code_generator.h
@@ -23,7 +23,9 @@ public:
   static const char* GetHostRegName(HostReg reg, RegSize size = HostPointerSize);
   static void AlignCodeBuffer(JitCodeBuffer* code_buffer);
 
-  bool CompileBlock(const CodeBlock* block, CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size);
+  static bool BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi);
+
+  bool CompileBlock(CodeBlock* block, CodeBlock::HostCodePointer* out_host_code, u32* out_host_code_size);
 
   CodeCache::DispatcherFunction CompileDispatcher();
   CodeCache::SingleBlockDispatcherFunction CompileSingleBlockDispatcher();
@@ -74,7 +76,11 @@ public:
 
   // Automatically generates an exception handler.
   Value EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const Value& address, RegSize size);
+  void EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, Value& result);
+  void EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size, Value& result, bool in_far_code);
   void EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const Value& address, const Value& value);
+  void EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, const Value& value);
+  void EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, const Value& value, bool in_far_code);
 
   // Unconditional branch to pointer. May allocate a scratch register.
   void EmitBranch(const void* address, bool allow_scratch = true);
@@ -208,9 +214,10 @@ private:
   bool Compile_cop2(const CodeBlockInstruction& cbi);
 
   JitCodeBuffer* m_code_buffer;
-  const CodeBlock* m_block = nullptr;
+  CodeBlock* m_block = nullptr;
   const CodeBlockInstruction* m_block_start = nullptr;
   const CodeBlockInstruction* m_block_end = nullptr;
+  const CodeBlockInstruction* m_current_instruction = nullptr;
   RegisterCache m_register_cache;
   CodeEmitter m_near_emitter;
   CodeEmitter m_far_emitter;
diff --git a/src/core/cpu_recompiler_code_generator_aarch64.cpp b/src/core/cpu_recompiler_code_generator_aarch64.cpp
index 5e0d12f23..1674b7fb5 100644
--- a/src/core/cpu_recompiler_code_generator_aarch64.cpp
+++ b/src/core/cpu_recompiler_code_generator_aarch64.cpp
@@ -14,6 +14,7 @@ namespace a64 = vixl::aarch64;
 namespace CPU::Recompiler {
 
 constexpr HostReg RCPUPTR = 19;
+constexpr HostReg RMEMBASEPTR = 20;
 constexpr HostReg RRETURN = 0;
 constexpr HostReg RARG1 = 0;
 constexpr HostReg RARG2 = 1;
@@ -86,6 +87,11 @@ static const a64::XRegister GetCPUPtrReg()
   return GetHostReg64(RCPUPTR);
 }
 
+static const a64::XRegister GetFastmemBasePtrReg()
+{
+  return GetHostReg64(RMEMBASEPTR);
+}
+
 CodeGenerator::CodeGenerator(JitCodeBuffer* code_buffer)
   : m_code_buffer(code_buffer), m_register_cache(*this),
     m_near_emitter(static_cast<vixl::byte*>(code_buffer->GetFreeCodePointer()), code_buffer->GetFreeCodeSpace(),
@@ -188,10 +194,21 @@ void CodeGenerator::EmitBeginBlock()
   // Store the CPU struct pointer. TODO: make this better.
   const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR);
   DebugAssert(cpu_reg_allocated);
+
+  // If there's loadstore instructions, preload the fastmem base.
+  if (m_block->contains_loadstore_instructions)
+  {
+    const bool fastmem_reg_allocated = m_register_cache.AllocateHostReg(RMEMBASEPTR);
+    Assert(fastmem_reg_allocated);
+    m_emit->Ldr(GetFastmemBasePtrReg(), a64::MemOperand(GetCPUPtrReg(), offsetof(State, fastmem_base)));
+  }
 }
 
 void CodeGenerator::EmitEndBlock()
 {
+  if (m_block->contains_loadstore_instructions)
+    m_register_cache.FreeHostReg(RMEMBASEPTR);
+
   m_register_cache.FreeHostReg(RCPUPTR);
   m_register_cache.PopCalleeSavedRegisters(true);
 
@@ -1285,12 +1302,105 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const
 
   AddPendingCycles(true);
 
+  Value result = m_register_cache.AllocateScratch(RegSize_64);
+  if (g_settings.IsUsingFastmem())
+  {
+    EmitLoadGuestMemoryFastmem(cbi, address, size, result);
+  }
+  else
+  {
+    m_register_cache.FlushCallerSavedGuestRegisters(true, true);
+    EmitLoadGuestMemorySlowmem(cbi, address, size, result, false);
+  }
+
+  // Downcast to ignore upper 56/48/32 bits. This should be a noop.
+  switch (size)
+  {
+    case RegSize_8:
+      ConvertValueSizeInPlace(&result, RegSize_8, false);
+      break;
+
+    case RegSize_16:
+      ConvertValueSizeInPlace(&result, RegSize_16, false);
+      break;
+
+    case RegSize_32:
+      ConvertValueSizeInPlace(&result, RegSize_32, false);
+      break;
+
+    default:
+      UnreachableCode();
+      break;
+  }
+
+  return result;
+}
+
+void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size,
+                                               Value& result)
+{
+  // fastmem
+  LoadStoreBackpatchInfo bpi;
+  bpi.host_pc = GetCurrentNearCodePointer();
+  bpi.address_host_reg = HostReg_Invalid;
+  bpi.value_host_reg = result.host_reg;
+  bpi.guest_pc = m_current_instruction->pc;
+
+  a64::MemOperand actual_address;
+  if (address.IsConstant())
+  {
+    m_emit->Mov(GetHostReg32(result.host_reg), address.constant_value);
+    actual_address = a64::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(result.host_reg));
+    bpi.host_pc = GetCurrentNearCodePointer();
+  }
+  else
+  {
+    actual_address = a64::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(address));
+  }
+
+  // TODO: movsx/zx inline here
+  switch (size)
+  {
+    case RegSize_8:
+      m_emit->Ldrb(GetHostReg32(result.host_reg), actual_address);
+      break;
+
+    case RegSize_16:
+      m_emit->Ldrh(GetHostReg32(result.host_reg), actual_address);
+      break;
+
+    case RegSize_32:
+      m_emit->Ldr(GetHostReg32(result.host_reg), actual_address);
+      break;
+
+    default:
+      UnreachableCode();
+      break;
+  }
+
+  EmitAddCPUStructField(offsetof(State, pending_ticks), Value::FromConstantU32(Bus::RAM_READ_TICKS));
+
+  bpi.host_code_size = static_cast<u32>(
+    static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc)));
+
+  // generate slowmem fallback
+  bpi.host_slowmem_pc = GetCurrentFarCodePointer();
+  SwitchToFarCode();
+  EmitLoadGuestMemorySlowmem(cbi, address, size, result, true);
+
+  // return to the block code
+  EmitBranch(GetCurrentNearCodePointer(), false);
+
+  SwitchToNearCode();
+
+  m_block->loadstore_backpatch_info.push_back(bpi);
+}
+
+void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size,
+                                               Value& result, bool in_far_code)
+{
   if (g_settings.cpu_recompiler_memory_exceptions)
   {
-    // We need to use the full 64 bits here since we test the sign bit result.
-    Value result = m_register_cache.AllocateScratch(RegSize_64);
-    m_register_cache.FlushCallerSavedGuestRegisters(true, true);
-
     // NOTE: This can leave junk in the upper bits
     switch (size)
     {
@@ -1319,7 +1429,8 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const
     m_emit->Bind(&load_okay);
 
     // load exception path
-    SwitchToFarCode();
+    if (!in_far_code)
+      SwitchToFarCode();
 
     // cause_bits = (-result << 2) | BD | cop_n
     m_emit->neg(GetHostReg32(result.host_reg), GetHostReg32(result.host_reg));
@@ -1330,37 +1441,14 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const
     EmitFunctionCall(nullptr, static_cast<void (*)(u32, u32)>(&CPU::RaiseException), result, GetCurrentInstructionPC());
 
     EmitExceptionExit();
-    SwitchToNearCode();
+
+    if (!in_far_code)
+      SwitchToNearCode();
 
     m_register_cache.PopState();
-
-    // Downcast to ignore upper 56/48/32 bits. This should be a noop.
-    switch (size)
-    {
-      case RegSize_8:
-        ConvertValueSizeInPlace(&result, RegSize_8, false);
-        break;
-
-      case RegSize_16:
-        ConvertValueSizeInPlace(&result, RegSize_16, false);
-        break;
-
-      case RegSize_32:
-        ConvertValueSizeInPlace(&result, RegSize_32, false);
-        break;
-
-      default:
-        UnreachableCode();
-        break;
-    }
-
-    return result;
   }
   else
   {
-    Value result = m_register_cache.AllocateScratch(RegSize_32);
-    m_register_cache.FlushCallerSavedGuestRegisters(true, true);
-
     switch (size)
     {
       case RegSize_8:
@@ -1379,27 +1467,6 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const
         UnreachableCode();
         break;
     }
-
-    // Downcast to ignore upper 56/48/32 bits. This should be a noop.
-    switch (size)
-    {
-      case RegSize_8:
-        ConvertValueSizeInPlace(&result, RegSize_8, false);
-        break;
-
-      case RegSize_16:
-        ConvertValueSizeInPlace(&result, RegSize_16, false);
-        break;
-
-      case RegSize_32:
-        break;
-
-      default:
-        UnreachableCode();
-        break;
-    }
-
-    return result;
   }
 }
 
@@ -1420,11 +1487,87 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const
 
   AddPendingCycles(true);
 
+  if (g_settings.IsUsingFastmem())
+  {
+    // we need the value in a host register to store it
+    Value value_in_hr = GetValueInHostRegister(value);
+    EmitStoreGuestMemoryFastmem(cbi, address, value_in_hr);
+  }
+  else
+  {
+    m_register_cache.FlushCallerSavedGuestRegisters(true, true);
+    EmitStoreGuestMemorySlowmem(cbi, address, value, false);
+  }
+}
+
+void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address,
+                                                const Value& value)
+{
+  // fastmem
+  LoadStoreBackpatchInfo bpi;
+  bpi.host_pc = GetCurrentNearCodePointer();
+  bpi.address_host_reg = HostReg_Invalid;
+  bpi.value_host_reg = value.host_reg;
+  bpi.guest_pc = m_current_instruction->pc;
+
+  a64::MemOperand actual_address;
+  if (address.IsConstant())
+  {
+    m_emit->Mov(GetHostReg32(RSCRATCH), address.constant_value);
+    actual_address = a64::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(RSCRATCH));
+    bpi.host_pc = GetCurrentNearCodePointer();
+  }
+  else
+  {
+    actual_address = a64::MemOperand(GetFastmemBasePtrReg(), GetHostReg32(address));
+  }
+
+  switch (value.size)
+  {
+    case RegSize_8:
+      m_emit->Strb(GetHostReg8(value), actual_address);
+      break;
+
+    case RegSize_16:
+      m_emit->Strh(GetHostReg16(value), actual_address);
+      break;
+
+    case RegSize_32:
+      m_emit->Str(GetHostReg32(value), actual_address);
+      break;
+
+    default:
+      UnreachableCode();
+      break;
+  }
+
+  bpi.host_code_size = static_cast<u32>(
+    static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc)));
+
+  // generate slowmem fallback
+  bpi.host_slowmem_pc = GetCurrentFarCodePointer();
+  SwitchToFarCode();
+
+  EmitStoreGuestMemorySlowmem(cbi, address, value, true);
+
+  // return to the block code
+  EmitBranch(GetCurrentNearCodePointer(), false);
+
+  SwitchToNearCode();
+
+  m_block->loadstore_backpatch_info.push_back(bpi);
+}
+
+void CodeGenerator::EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address,
+                                                const Value& value, bool in_far_code)
+{
+  AddPendingCycles(true);
+
   if (g_settings.cpu_recompiler_memory_exceptions)
   {
-    Value result = m_register_cache.AllocateScratch(RegSize_32);
-    m_register_cache.FlushCallerSavedGuestRegisters(true, true);
+    Assert(!in_far_code);
 
+    Value result = m_register_cache.AllocateScratch(RegSize_32);
     switch (value.size)
     {
       case RegSize_8:
@@ -1452,7 +1595,8 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const
     m_emit->Bind(&store_okay);
 
     // store exception path
-    SwitchToFarCode();
+    if (!in_far_code)
+      SwitchToFarCode();
 
     // cause_bits = (result << 2) | BD | cop_n
     m_emit->lsl(GetHostReg32(result.host_reg), GetHostReg32(result.host_reg), 2);
@@ -1461,15 +1605,14 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const
              static_cast<Exception>(0), cbi.is_branch_delay_slot, false, cbi.instruction.cop.cop_n)));
     EmitFunctionCall(nullptr, static_cast<void (*)(u32, u32)>(&CPU::RaiseException), result, GetCurrentInstructionPC());
 
-    EmitExceptionExit();
+    if (!in_far_code)
+      EmitExceptionExit();
     SwitchToNearCode();
 
     m_register_cache.PopState();
   }
   else
   {
-    m_register_cache.FlushCallerSavedGuestRegisters(true, true);
-
     switch (value.size)
     {
       case RegSize_8:
@@ -1491,6 +1634,30 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const
   }
 }
 
+bool CodeGenerator::BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi)
+{
+  Log_DevPrintf("Backpatching %p (guest PC 0x%08X) to slowmem at %p", lbi.host_pc, lbi.guest_pc, lbi.host_slowmem_pc);
+
+  // check jump distance
+  const s64 jump_distance =
+    static_cast<s64>(reinterpret_cast<intptr_t>(lbi.host_slowmem_pc) - reinterpret_cast<intptr_t>(lbi.host_pc));
+  Assert(Common::IsAligned(jump_distance, 4));
+  Assert(a64::Instruction::IsValidImmPCOffset(a64::UncondBranchType, jump_distance >> 2));
+
+  // turn it into a jump to the slowmem handler
+  vixl::aarch64::MacroAssembler emit(static_cast<vixl::byte*>(lbi.host_pc), lbi.host_code_size,
+                                     a64::PositionDependentCode);
+  emit.b(jump_distance >> 2);
+
+  const s32 nops = (static_cast<s32>(lbi.host_code_size) - static_cast<s32>(emit.GetCursorOffset())) / 4;
+  Assert(nops >= 0);
+  for (s32 i = 0; i < nops; i++)
+    emit.nop();
+
+  JitCodeBuffer::FlushInstructionCache(lbi.host_pc, lbi.host_code_size);
+  return true;
+}
+
 void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr)
 {
   EmitLoadGlobalAddress(RSCRATCH, ptr);
diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp
index 5ad82d345..e56a49716 100644
--- a/src/core/cpu_recompiler_code_generator_x64.cpp
+++ b/src/core/cpu_recompiler_code_generator_x64.cpp
@@ -1,4 +1,5 @@
 #include "common/align.h"
+#include "common/assert.h"
 #include "common/log.h"
 #include "cpu_core.h"
 #include "cpu_core_private.h"
@@ -12,6 +13,7 @@ namespace CPU::Recompiler {
 
 #if defined(ABI_WIN64)
 constexpr HostReg RCPUPTR = Xbyak::Operand::RBP;
+constexpr HostReg RMEMBASEPTR = Xbyak::Operand::RBX;
 constexpr HostReg RRETURN = Xbyak::Operand::RAX;
 constexpr HostReg RARG1 = Xbyak::Operand::RCX;
 constexpr HostReg RARG2 = Xbyak::Operand::RDX;
@@ -21,6 +23,7 @@ constexpr u32 FUNCTION_CALL_SHADOW_SPACE = 32;
 constexpr u64 FUNCTION_CALL_STACK_ALIGNMENT = 16;
 #elif defined(ABI_SYSV)
 constexpr HostReg RCPUPTR = Xbyak::Operand::RBP;
+constexpr HostReg RMEMBASEPTR = Xbyak::Operand::RBX;
 constexpr HostReg RRETURN = Xbyak::Operand::RAX;
 constexpr HostReg RARG1 = Xbyak::Operand::RDI;
 constexpr HostReg RARG2 = Xbyak::Operand::RSI;
@@ -79,6 +82,11 @@ static const Xbyak::Reg64 GetCPUPtrReg()
   return GetHostReg64(RCPUPTR);
 }
 
+static const Xbyak::Reg64 GetFastmemBasePtrReg()
+{
+  return GetHostReg64(RMEMBASEPTR);
+}
+
 CodeGenerator::CodeGenerator(JitCodeBuffer* code_buffer)
   : m_code_buffer(code_buffer), m_register_cache(*this),
     m_near_emitter(code_buffer->GetFreeCodeSpace(), code_buffer->GetFreeCodePointer()),
@@ -140,7 +148,6 @@ void CodeGenerator::InitHostRegs()
   m_register_cache.SetCalleeSavedHostRegs({Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::RDI,
                                            Xbyak::Operand::RSI, Xbyak::Operand::RSP, Xbyak::Operand::R12,
                                            Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15});
-  m_register_cache.SetCPUPtrHostReg(RCPUPTR);
 #elif defined(ABI_SYSV)
   m_register_cache.SetHostRegAllocationOrder(
     {Xbyak::Operand::RBX, /*Xbyak::Operand::RSP, */ Xbyak::Operand::RBP, Xbyak::Operand::R12, Xbyak::Operand::R13,
@@ -154,8 +161,9 @@ void CodeGenerator::InitHostRegs()
   m_register_cache.SetCalleeSavedHostRegs({Xbyak::Operand::RBX, Xbyak::Operand::RSP, Xbyak::Operand::RBP,
                                            Xbyak::Operand::R12, Xbyak::Operand::R13, Xbyak::Operand::R14,
                                            Xbyak::Operand::R15});
-  m_register_cache.SetCPUPtrHostReg(RCPUPTR);
 #endif
+
+  m_register_cache.SetCPUPtrHostReg(RCPUPTR);
 }
 
 void CodeGenerator::SwitchToFarCode()
@@ -196,11 +204,22 @@ void CodeGenerator::EmitBeginBlock()
   const bool cpu_reg_allocated = m_register_cache.AllocateHostReg(RCPUPTR);
   DebugAssert(cpu_reg_allocated);
   // m_emit->mov(GetCPUPtrReg(), reinterpret_cast<size_t>(&g_state));
+
+  // If there are load/store instructions, preload the fastmem base pointer.
+  if (m_block->contains_loadstore_instructions)
+  {
+    const bool fastmem_reg_allocated = m_register_cache.AllocateHostReg(RMEMBASEPTR);
+    Assert(fastmem_reg_allocated);
+    m_emit->mov(GetFastmemBasePtrReg(), m_emit->qword[GetCPUPtrReg() + offsetof(CPU::State, fastmem_base)]);
+  }
 }
 
 void CodeGenerator::EmitEndBlock()
 {
   m_register_cache.FreeHostReg(RCPUPTR);
+  if (m_block->contains_loadstore_instructions)
+    m_register_cache.FreeHostReg(RMEMBASEPTR);
+
   m_register_cache.PopCalleeSavedRegisters(true);
 
   m_emit->ret();
@@ -1747,12 +1766,140 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const
 
   AddPendingCycles(true);
 
+  Value result = m_register_cache.AllocateScratch(RegSize_64);
+  if (g_settings.IsUsingFastmem())
+  {
+    EmitLoadGuestMemoryFastmem(cbi, address, size, result);
+  }
+  else
+  {
+    m_register_cache.FlushCallerSavedGuestRegisters(true, true);
+    EmitLoadGuestMemorySlowmem(cbi, address, size, result, false);
+  }
+
+  // Downcast to ignore upper 56/48/32 bits. This should be a noop.
+  switch (size)
+  {
+    case RegSize_8:
+      ConvertValueSizeInPlace(&result, RegSize_8, false);
+      break;
+
+    case RegSize_16:
+      ConvertValueSizeInPlace(&result, RegSize_16, false);
+      break;
+
+    case RegSize_32:
+      ConvertValueSizeInPlace(&result, RegSize_32, false);
+      break;
+
+    default:
+      UnreachableCode();
+      break;
+  }
+
+  return result;
+}
+
+void CodeGenerator::EmitLoadGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size,
+                                               Value& result)
+{
+  // fastmem
+  LoadStoreBackpatchInfo bpi;
+  bpi.host_pc = GetCurrentNearCodePointer();
+  bpi.address_host_reg = HostReg_Invalid;
+  bpi.value_host_reg = result.host_reg;
+  bpi.guest_pc = m_current_instruction->pc;
+
+  // can't encode displacements >= 0x80000000 in-line (they don't fit in a signed 32-bit disp)
+  const Value* actual_address = &address;
+  if (address.IsConstant() && address.constant_value >= 0x80000000)
+  {
+    actual_address = &result;
+    m_emit->mov(GetHostReg32(result.host_reg), address.constant_value);
+    bpi.host_pc = GetCurrentNearCodePointer();
+  }
+
+  // TODO: movsx/zx inline here
+  switch (size)
+  {
+    case RegSize_8:
+    {
+      if (actual_address->IsConstant())
+      {
+        m_emit->mov(GetHostReg8(result.host_reg),
+                    m_emit->byte[GetFastmemBasePtrReg() + actual_address->constant_value]);
+      }
+      else
+      {
+        m_emit->mov(GetHostReg8(result.host_reg),
+                    m_emit->byte[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)]);
+      }
+    }
+    break;
+
+    case RegSize_16:
+    {
+      if (actual_address->IsConstant())
+      {
+        m_emit->mov(GetHostReg16(result.host_reg),
+                    m_emit->word[GetFastmemBasePtrReg() + actual_address->constant_value]);
+      }
+      else
+      {
+        m_emit->mov(GetHostReg16(result.host_reg),
+                    m_emit->word[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)]);
+      }
+    }
+    break;
+
+    case RegSize_32:
+    {
+      if (actual_address->IsConstant())
+      {
+        m_emit->mov(GetHostReg32(result.host_reg),
+                    m_emit->dword[GetFastmemBasePtrReg() + actual_address->constant_value]);
+      }
+      else
+      {
+        m_emit->mov(GetHostReg32(result.host_reg),
+                    m_emit->dword[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)]);
+      }
+    }
+    break;
+  }
+
+  // TODO: BIOS reads...
+  EmitAddCPUStructField(offsetof(CPU::State, pending_ticks), Value::FromConstantU32(Bus::RAM_READ_TICKS));
+
+  // insert nops, we need at least 5 bytes for a relative jump
+  const u32 fastmem_size =
+    static_cast<u32>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc));
+  const u32 nops = (fastmem_size < 5 ? 5 - fastmem_size : 0);
+  for (u32 i = 0; i < nops; i++)
+    m_emit->nop();
+
+  bpi.host_code_size = static_cast<u32>(
+    static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc)));
+
+  // generate slowmem fallback
+  m_far_emitter.align(16);
+  bpi.host_slowmem_pc = GetCurrentFarCodePointer();
+  SwitchToFarCode();
+  EmitLoadGuestMemorySlowmem(cbi, address, size, result, true);
+
+  // return to the block code
+  m_emit->jmp(GetCurrentNearCodePointer());
+
+  SwitchToNearCode();
+
+  m_block->loadstore_backpatch_info.push_back(bpi);
+}
+
+void CodeGenerator::EmitLoadGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address, RegSize size,
+                                               Value& result, bool in_far_code)
+{
   if (g_settings.cpu_recompiler_memory_exceptions)
   {
-    // We need to use the full 64 bits here since we test the sign bit result.
-    Value result = m_register_cache.AllocateScratch(RegSize_64);
-    m_register_cache.FlushCallerSavedGuestRegisters(true, true);
-
     // NOTE: This can leave junk in the upper bits
     switch (size)
     {
@@ -1779,7 +1926,8 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const
     m_register_cache.PushState();
 
     // load exception path
-    SwitchToFarCode();
+    if (!in_far_code)
+      SwitchToFarCode();
 
     // cause_bits = (-result << 2) | BD | cop_n
     m_emit->neg(GetHostReg32(result.host_reg));
@@ -1790,37 +1938,14 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const
     EmitFunctionCall(nullptr, static_cast<void (*)(u32, u32)>(&CPU::RaiseException), result, GetCurrentInstructionPC());
 
     EmitExceptionExit();
-    SwitchToNearCode();
+
+    if (!in_far_code)
+      SwitchToNearCode();
 
     m_register_cache.PopState();
-
-    // Downcast to ignore upper 56/48/32 bits. This should be a noop.
-    switch (size)
-    {
-      case RegSize_8:
-        ConvertValueSizeInPlace(&result, RegSize_8, false);
-        break;
-
-      case RegSize_16:
-        ConvertValueSizeInPlace(&result, RegSize_16, false);
-        break;
-
-      case RegSize_32:
-        ConvertValueSizeInPlace(&result, RegSize_32, false);
-        break;
-
-      default:
-        UnreachableCode();
-        break;
-    }
-
-    return result;
   }
   else
   {
-    Value result = m_register_cache.AllocateScratch(RegSize_32);
-    m_register_cache.FlushCallerSavedGuestRegisters(true, true);
-
     switch (size)
     {
       case RegSize_8:
@@ -1839,27 +1964,6 @@ Value CodeGenerator::EmitLoadGuestMemory(const CodeBlockInstruction& cbi, const
         UnreachableCode();
         break;
     }
-
-    // Downcast to ignore upper 56/48/32 bits. This should be a noop.
-    switch (size)
-    {
-      case RegSize_8:
-        ConvertValueSizeInPlace(&result, RegSize_8, false);
-        break;
-
-      case RegSize_16:
-        ConvertValueSizeInPlace(&result, RegSize_16, false);
-        break;
-
-      case RegSize_32:
-        break;
-
-      default:
-        UnreachableCode();
-        break;
-    }
-
-    return result;
   }
 }
 
@@ -1880,11 +1984,164 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const
 
   AddPendingCycles(true);
 
+  if (g_settings.IsUsingFastmem())
+  {
+    EmitStoreGuestMemoryFastmem(cbi, address, value);
+  }
+  else
+  {
+    m_register_cache.FlushCallerSavedGuestRegisters(true, true);
+    EmitStoreGuestMemorySlowmem(cbi, address, value, false);
+  }
+}
+
+void CodeGenerator::EmitStoreGuestMemoryFastmem(const CodeBlockInstruction& cbi, const Value& address,
+                                                const Value& value)
+{
+  // fastmem
+  LoadStoreBackpatchInfo bpi;
+  bpi.host_pc = GetCurrentNearCodePointer();
+  bpi.address_host_reg = HostReg_Invalid;
+  bpi.value_host_reg = value.host_reg;
+  bpi.guest_pc = m_current_instruction->pc;
+
+  // can't encode displacements >= 0x80000000 in-line (they don't fit in a signed 32-bit disp)
+  const Value* actual_address = &address;
+  Value temp_address;
+  if (address.IsConstant() && address.constant_value >= 0x80000000)
+  {
+    temp_address.SetHostReg(&m_register_cache, RRETURN, RegSize_32);
+    actual_address = &temp_address;
+    m_emit->mov(GetHostReg32(temp_address), address.constant_value);
+    bpi.host_pc = GetCurrentNearCodePointer();
+  }
+
+  switch (value.size)
+  {
+    case RegSize_8:
+    {
+      if (actual_address->IsConstant())
+      {
+        if (value.IsConstant())
+        {
+          m_emit->mov(m_emit->byte[GetFastmemBasePtrReg() + actual_address->constant_value], value.constant_value);
+        }
+        else
+        {
+          m_emit->mov(m_emit->byte[GetFastmemBasePtrReg() + actual_address->constant_value],
+                      GetHostReg8(value.host_reg));
+        }
+      }
+      else
+      {
+        if (value.IsConstant())
+        {
+          m_emit->mov(m_emit->byte[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)],
+                      value.constant_value);
+        }
+        else
+        {
+          m_emit->mov(m_emit->byte[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)],
+                      GetHostReg8(value.host_reg));
+        }
+      }
+    }
+    break;
+
+    case RegSize_16:
+    {
+      if (actual_address->IsConstant())
+      {
+        if (value.IsConstant())
+        {
+          m_emit->mov(m_emit->word[GetFastmemBasePtrReg() + actual_address->constant_value], value.constant_value);
+        }
+        else
+        {
+          m_emit->mov(m_emit->word[GetFastmemBasePtrReg() + actual_address->constant_value],
+                      GetHostReg16(value.host_reg));
+        }
+      }
+      else
+      {
+        if (value.IsConstant())
+        {
+          m_emit->mov(m_emit->word[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)],
+                      value.constant_value);
+        }
+        else
+        {
+          m_emit->mov(m_emit->word[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)],
+                      GetHostReg16(value.host_reg));
+        }
+      }
+    }
+    break;
+
+    case RegSize_32:
+    {
+      if (actual_address->IsConstant())
+      {
+        if (value.IsConstant())
+        {
+          m_emit->mov(m_emit->dword[GetFastmemBasePtrReg() + actual_address->constant_value], value.constant_value);
+        }
+        else
+        {
+          m_emit->mov(m_emit->dword[GetFastmemBasePtrReg() + actual_address->constant_value],
+                      GetHostReg32(value.host_reg));
+        }
+      }
+      else
+      {
+        if (value.IsConstant())
+        {
+          m_emit->mov(m_emit->dword[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)],
+                      value.constant_value);
+        }
+        else
+        {
+          m_emit->mov(m_emit->dword[GetFastmemBasePtrReg() + GetHostReg64(actual_address->host_reg)],
+                      GetHostReg32(value.host_reg));
+        }
+      }
+    }
+    break;
+  }
+
+  // insert nops, we need at least 5 bytes for a relative jump
+  const u32 fastmem_size =
+    static_cast<u32>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc));
+  const u32 nops = (fastmem_size < 5 ? 5 - fastmem_size : 0);
+  for (u32 i = 0; i < nops; i++)
+    m_emit->nop();
+
+  bpi.host_code_size = static_cast<u32>(
+    static_cast<ptrdiff_t>(static_cast<u8*>(GetCurrentNearCodePointer()) - static_cast<u8*>(bpi.host_pc)));
+
+  // generate slowmem fallback
+  m_far_emitter.align();
+  bpi.host_slowmem_pc = GetCurrentFarCodePointer();
+  SwitchToFarCode();
+
+  EmitStoreGuestMemorySlowmem(cbi, address, value, true);
+
+  // return to the block code
+  m_emit->jmp(GetCurrentNearCodePointer());
+
+  SwitchToNearCode();
+
+  m_block->loadstore_backpatch_info.push_back(bpi);
+}
+
+void CodeGenerator::EmitStoreGuestMemorySlowmem(const CodeBlockInstruction& cbi, const Value& address,
+                                                const Value& value, bool in_far_code)
+{
   if (g_settings.cpu_recompiler_memory_exceptions)
   {
-    Value result = m_register_cache.AllocateScratch(RegSize_32);
-    m_register_cache.FlushCallerSavedGuestRegisters(true, true);
+    Assert(!in_far_code);
 
+    Value result = m_register_cache.AllocateScratch(RegSize_32);
     switch (value.size)
     {
       case RegSize_8:
@@ -1910,24 +2167,24 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const
     m_emit->jnz(GetCurrentFarCodePointer());
 
     // store exception path
-    SwitchToFarCode();
+    if (!in_far_code)
+      SwitchToFarCode();
 
     // cause_bits = (result << 2) | BD | cop_n
-    m_emit->shl(GetHostReg32(result.host_reg), 2);
-    m_emit->or_(GetHostReg32(result.host_reg),
+    m_emit->shl(GetHostReg32(result), 2);
+    m_emit->or_(GetHostReg32(result),
                 Cop0Registers::CAUSE::MakeValueForException(static_cast<Exception>(0), cbi.is_branch_delay_slot, false,
                                                             cbi.instruction.cop.cop_n));
     EmitFunctionCall(nullptr, static_cast<void (*)(u32, u32)>(&CPU::RaiseException), result, GetCurrentInstructionPC());
 
     EmitExceptionExit();
-    SwitchToNearCode();
+    if (!in_far_code)
+      SwitchToNearCode();
 
     m_register_cache.PopState();
   }
   else
   {
-    m_register_cache.FlushCallerSavedGuestRegisters(true, true);
-
     switch (value.size)
     {
       case RegSize_8:
@@ -1949,6 +2206,24 @@ void CodeGenerator::EmitStoreGuestMemory(const CodeBlockInstruction& cbi, const
   }
 }
 
+bool CodeGenerator::BackpatchLoadStore(const LoadStoreBackpatchInfo& lbi)
+{
+  Log_DevPrintf("Backpatching %p (guest PC 0x%08X) to slowmem", lbi.host_pc, lbi.guest_pc);
+
+  // turn it into a jump to the slowmem handler
+  Xbyak::CodeGenerator cg(lbi.host_code_size, lbi.host_pc);
+  cg.jmp(lbi.host_slowmem_pc);
+
+  const s32 nops = static_cast<s32>(lbi.host_code_size) -
+                   static_cast<s32>(static_cast<ptrdiff_t>(cg.getCurr() - static_cast<u8*>(lbi.host_pc)));
+  Assert(nops >= 0);
+  for (s32 i = 0; i < nops; i++)
+    cg.nop();
+
+  JitCodeBuffer::FlushInstructionCache(lbi.host_pc, lbi.host_code_size);
+  return true;
+}
+
 void CodeGenerator::EmitLoadGlobal(HostReg host_reg, RegSize size, const void* ptr)
 {
   const s64 displacement =
diff --git a/src/core/cpu_recompiler_thunks.h b/src/core/cpu_recompiler_thunks.h
index f698a859d..b9f5ced77 100644
--- a/src/core/cpu_recompiler_thunks.h
+++ b/src/core/cpu_recompiler_thunks.h
@@ -32,6 +32,7 @@ void UncheckedWriteMemoryByte(u32 address, u8 value);
 void UncheckedWriteMemoryHalfWord(u32 address, u16 value);
 void UncheckedWriteMemoryWord(u32 address, u32 value);
 
+void UpdateFastmemMapping();
 
 } // namespace Recompiler::Thunks
 
diff --git a/src/core/cpu_recompiler_types.h b/src/core/cpu_recompiler_types.h
index 9bb224223..3a8f1bc3c 100644
--- a/src/core/cpu_recompiler_types.h
+++ b/src/core/cpu_recompiler_types.h
@@ -127,6 +127,16 @@ constexpr bool SHIFTS_ARE_IMPLICITLY_MASKED = false;
 
 #endif
 
+struct LoadStoreBackpatchInfo
+{
+  void* host_pc;            // pointer to instruction which will fault
+  void* host_slowmem_pc;    // pointer to slowmem callback code
+  u32 host_code_size;       // size of the fastmem load as well as the add for cycles
+  HostReg address_host_reg; // register containing the guest address to load/store
+  HostReg value_host_reg;   // register containing the source/destination
+  PhysicalMemoryAddress guest_pc;
+};
+
 } // namespace Recompiler
 
 } // namespace CPU
diff --git a/src/core/host_interface.cpp b/src/core/host_interface.cpp
index 774622036..6baf40416 100644
--- a/src/core/host_interface.cpp
+++ b/src/core/host_interface.cpp
@@ -425,6 +425,7 @@ void HostInterface::SetDefaultSettings(SettingsInterface& si)
   si.SetStringValue("CPU", "ExecutionMode", Settings::GetCPUExecutionModeName(Settings::DEFAULT_CPU_EXECUTION_MODE));
   si.SetBoolValue("CPU", "RecompilerMemoryExceptions", false);
   si.SetBoolValue("CPU", "ICache", false);
+  si.SetBoolValue("CPU", "Fastmem", true);
 
   si.SetStringValue("GPU", "Renderer", Settings::GetRendererName(Settings::DEFAULT_GPU_RENDERER));
   si.SetIntValue("GPU", "ResolutionScale", 1);
@@ -586,14 +587,14 @@ void HostInterface::CheckForSettingsChanges(const Settings& old_settings)
     if (g_settings.emulation_speed != old_settings.emulation_speed)
       System::UpdateThrottlePeriod();
 
-    if (g_settings.cpu_execution_mode != old_settings.cpu_execution_mode)
+    if (g_settings.cpu_execution_mode != old_settings.cpu_execution_mode ||
+        g_settings.cpu_fastmem != old_settings.cpu_fastmem)
     {
       AddFormattedOSDMessage(
         5.0f, TranslateString("OSDMessage", "Switching to %s CPU execution mode."),
         TranslateString("OSDMessage", Settings::GetCPUExecutionModeDisplayName(g_settings.cpu_execution_mode))
           .GetCharArray());
-      CPU::CodeCache::SetUseRecompiler(g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler);
-      CPU::CodeCache::Flush();
+      CPU::CodeCache::Reinitialize();
       CPU::ClearICache();
     }
 
diff --git a/src/core/settings.cpp b/src/core/settings.cpp
index af5a1c949..3fd49cbe3 100644
--- a/src/core/settings.cpp
+++ b/src/core/settings.cpp
@@ -129,6 +129,7 @@ void Settings::Load(SettingsInterface& si)
   UpdateOverclockActive();
   cpu_recompiler_memory_exceptions = si.GetBoolValue("CPU", "RecompilerMemoryExceptions", false);
   cpu_recompiler_icache = si.GetBoolValue("CPU", "RecompilerICache", false);
+  cpu_fastmem = si.GetBoolValue("CPU", "Fastmem", true);
 
   gpu_renderer = ParseRendererName(si.GetStringValue("GPU", "Renderer", GetRendererName(DEFAULT_GPU_RENDERER)).c_str())
                    .value_or(DEFAULT_GPU_RENDERER);
@@ -258,6 +259,7 @@ void Settings::Save(SettingsInterface& si) const
   si.SetIntValue("CPU", "OverclockDenominator", cpu_overclock_denominator);
   si.SetBoolValue("CPU", "RecompilerMemoryExceptions", cpu_recompiler_memory_exceptions);
   si.SetBoolValue("CPU", "RecompilerICache", cpu_recompiler_icache);
+  si.SetBoolValue("CPU", "Fastmem", cpu_fastmem);
 
   si.SetStringValue("GPU", "Renderer", GetRendererName(gpu_renderer));
   si.SetStringValue("GPU", "Adapter", gpu_adapter.c_str());
diff --git a/src/core/settings.h b/src/core/settings.h
index d1f3786c2..b919bdcde 100644
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -76,6 +76,7 @@ struct Settings
   bool cpu_overclock_active = false;
   bool cpu_recompiler_memory_exceptions = false;
   bool cpu_recompiler_icache = false;
+  bool cpu_fastmem = true;
 
   float emulation_speed = 1.0f;
   bool speed_limiter_enabled = true;
@@ -179,6 +180,11 @@ struct Settings
     return gpu_pgxp_enable ? (gpu_pgxp_cpu ? PGXPMode::CPU : PGXPMode::Memory) : PGXPMode::Disabled;
   }
 
+  ALWAYS_INLINE bool IsUsingFastmem() const
+  {
+    return (cpu_fastmem && cpu_execution_mode == CPUExecutionMode::Recompiler && !cpu_recompiler_memory_exceptions);
+  }
+
   bool HasAnyPerGameMemoryCards() const;
 
   static void CPUOverclockPercentToFraction(u32 percent, u32* numerator, u32* denominator);
diff --git a/src/core/system.cpp b/src/core/system.cpp
index 0038902ad..a24469f39 100644
--- a/src/core/system.cpp
+++ b/src/core/system.cpp
@@ -506,6 +506,9 @@ bool RecreateGPU(GPURenderer renderer)
     return false;
   }
 
+  // reinitialize the code cache because the address space could change
+  CPU::CodeCache::Reinitialize();
+
   if (state_valid)
   {
     state_stream->SeekAbsolute(0);
@@ -728,14 +731,17 @@ bool Initialize(bool force_software_renderer)
   TimingEvents::Initialize();
 
   CPU::Initialize();
-  CPU::CodeCache::Initialize(g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler);
-  Bus::Initialize();
+
+  if (!Bus::Initialize())
+    return false;
 
   if (!CreateGPU(force_software_renderer ? GPURenderer::Software : g_settings.gpu_renderer))
     return false;
 
-  g_dma.Initialize();
+  // CPU code cache initialization must happen after the GPU is created, because the GPU might steal our address space.
+  CPU::CodeCache::Initialize();
 
+  g_dma.Initialize();
   g_interrupt_controller.Initialize();
 
   g_cdrom.Initialize();
diff --git a/src/core/types.h b/src/core/types.h
index a8834eef6..700df5c4b 100644
--- a/src/core/types.h
+++ b/src/core/types.h
@@ -132,6 +132,6 @@ enum : u32
 
 enum : u32
 {
-  CPU_CODE_CACHE_PAGE_SIZE = 1024,
+  CPU_CODE_CACHE_PAGE_SIZE = 4096,
   CPU_CODE_CACHE_PAGE_COUNT = 0x200000 / CPU_CODE_CACHE_PAGE_SIZE
 };
diff --git a/src/duckstation-qt/advancedsettingswidget.cpp b/src/duckstation-qt/advancedsettingswidget.cpp
index 135c17d92..b1eb72ff3 100644
--- a/src/duckstation-qt/advancedsettingswidget.cpp
+++ b/src/duckstation-qt/advancedsettingswidget.cpp
@@ -90,6 +90,8 @@ AdvancedSettingsWidget::AdvancedSettingsWidget(QtHostInterface* host_interface,
 
   addBooleanTweakOption(m_host_interface, m_ui.tweakOptionTable, tr("Enable Recompiler Memory Exceptions"), "CPU",
                         "RecompilerMemoryExceptions", false);
+  addBooleanTweakOption(m_host_interface, m_ui.tweakOptionTable, tr("Enable Recompiler Fast Memory Access"), "CPU",
+                        "Fastmem", true);
   addBooleanTweakOption(m_host_interface, m_ui.tweakOptionTable, tr("Enable Recompiler ICache"), "CPU",
                         "RecompilerICache", false);
 
@@ -113,10 +115,11 @@ void AdvancedSettingsWidget::onResetToDefaultClicked()
   setBooleanTweakOption(m_ui.tweakOptionTable, 1, false);
   setBooleanTweakOption(m_ui.tweakOptionTable, 2, false);
   setBooleanTweakOption(m_ui.tweakOptionTable, 3, false);
-  setBooleanTweakOption(m_ui.tweakOptionTable, 4, false);
-  setIntRangeTweakOption(m_ui.tweakOptionTable, 5, static_cast<int>(Settings::DEFAULT_DMA_MAX_SLICE_TICKS));
-  setIntRangeTweakOption(m_ui.tweakOptionTable, 6, static_cast<int>(Settings::DEFAULT_DMA_HALT_TICKS));
-  setIntRangeTweakOption(m_ui.tweakOptionTable, 7, static_cast<int>(Settings::DEFAULT_GPU_FIFO_SIZE));
-  setIntRangeTweakOption(m_ui.tweakOptionTable, 8, static_cast<int>(Settings::DEFAULT_GPU_MAX_RUN_AHEAD));
-  setBooleanTweakOption(m_ui.tweakOptionTable, 9, false);
+  setBooleanTweakOption(m_ui.tweakOptionTable, 4, true);
+  setBooleanTweakOption(m_ui.tweakOptionTable, 5, false);
+  setIntRangeTweakOption(m_ui.tweakOptionTable, 6, static_cast<int>(Settings::DEFAULT_DMA_MAX_SLICE_TICKS));
+  setIntRangeTweakOption(m_ui.tweakOptionTable, 7, static_cast<int>(Settings::DEFAULT_DMA_HALT_TICKS));
+  setIntRangeTweakOption(m_ui.tweakOptionTable, 8, static_cast<int>(Settings::DEFAULT_GPU_FIFO_SIZE));
+  setIntRangeTweakOption(m_ui.tweakOptionTable, 9, static_cast<int>(Settings::DEFAULT_GPU_MAX_RUN_AHEAD));
+  setBooleanTweakOption(m_ui.tweakOptionTable, 10, false);
 }
diff --git a/src/duckstation-sdl/sdl_host_interface.cpp b/src/duckstation-sdl/sdl_host_interface.cpp
index 8a95cf639..5b0b0a637 100644
--- a/src/duckstation-sdl/sdl_host_interface.cpp
+++ b/src/duckstation-sdl/sdl_host_interface.cpp
@@ -884,6 +884,7 @@ void SDLHostInterface::DrawQuickSettingsMenu()
 
   settings_changed |=
     ImGui::MenuItem("Recompiler Memory Exceptions", nullptr, &m_settings_copy.cpu_recompiler_memory_exceptions);
+  settings_changed |= ImGui::MenuItem("Recompiler Fastmem", nullptr, &m_settings_copy.cpu_fastmem);
   settings_changed |= ImGui::MenuItem("Recompiler ICache", nullptr, &m_settings_copy.cpu_recompiler_icache);
 
   ImGui::Separator();