From 9501439d6b4647bfc840b7173774435f6132f8ef Mon Sep 17 00:00:00 2001 From: Stenzek Date: Wed, 4 Oct 2023 00:39:18 +1000 Subject: [PATCH] CPU: Add new experimental recompiler --- CMakeLists.txt | 4 + src/core/CMakeLists.txt | 33 + src/core/core.props | 1 + src/core/core.vcxproj | 14 + src/core/core.vcxproj.filters | 6 + src/core/cpu_code_cache.cpp | 19 +- src/core/cpu_code_cache_private.h | 2 +- src/core/cpu_core.cpp | 1 + src/core/cpu_newrec_compiler.cpp | 2277 +++++++++++++++ src/core/cpu_newrec_compiler.h | 465 ++++ src/core/cpu_newrec_compiler_aarch64.cpp | 2235 +++++++++++++++ src/core/cpu_newrec_compiler_aarch64.h | 164 ++ src/core/cpu_newrec_compiler_riscv64.cpp | 2453 +++++++++++++++++ src/core/cpu_newrec_compiler_riscv64.h | 168 ++ src/core/cpu_newrec_compiler_x64.cpp | 2196 +++++++++++++++ src/core/cpu_newrec_compiler_x64.h | 140 + .../cpu_recompiler_code_generator_x64.cpp | 3 +- src/core/cpu_recompiler_types.h | 37 + src/core/imgui_overlays.cpp | 5 + src/core/settings.cpp | 6 +- src/core/settings.h | 5 +- src/core/system.cpp | 2 +- src/core/types.h | 1 + 23 files changed, 10228 insertions(+), 9 deletions(-) create mode 100644 src/core/cpu_newrec_compiler.cpp create mode 100644 src/core/cpu_newrec_compiler.h create mode 100644 src/core/cpu_newrec_compiler_aarch64.cpp create mode 100644 src/core/cpu_newrec_compiler_aarch64.h create mode 100644 src/core/cpu_newrec_compiler_riscv64.cpp create mode 100644 src/core/cpu_newrec_compiler_riscv64.h create mode 100644 src/core/cpu_newrec_compiler_x64.cpp create mode 100644 src/core/cpu_newrec_compiler_x64.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 485be291d..d4d62cb80 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,7 @@ endif() # Renderer options. option(ENABLE_OPENGL "Build with OpenGL renderer" ON) option(ENABLE_VULKAN "Build with Vulkan renderer" ON) +option(ENABLE_NEWREC "Build with experimental new dynarec (needed for RISC-V)" ON) # Global options. if(NOT ANDROID) @@ -171,6 +172,9 @@ elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm" OR "${CMAKE_SYSTEM_PROCESSOR}" endif() elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "riscv64") set(CPU_ARCH "riscv64") + + # Not done for us. Or we should inline atomics? 
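+  # Some riscv64 toolchains lower sub-word std::atomic operations to libatomic
+  # calls instead of inlining them, so explicitly link libatomic for now.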
+ link_libraries("-latomic") else() message(FATAL_ERROR "Unknown system processor: ${CMAKE_SYSTEM_PROCESSOR}") endif() diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index d2e070ddb..ee3cdf607 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -121,6 +121,11 @@ set(RECOMPILER_SRCS cpu_recompiler_types.h ) +set(NEWREC_SOURCES + cpu_newrec_compiler.cpp + cpu_newrec_compiler.h +) + target_precompile_headers(core PRIVATE "pch.h") target_include_directories(core PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/..") target_include_directories(core PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/..") @@ -134,6 +139,15 @@ if(${CPU_ARCH} STREQUAL "x64") cpu_recompiler_code_generator_x64.cpp ) message("Building x64 recompiler") + + if(ENABLE_NEWREC) + target_compile_definitions(core PUBLIC "ENABLE_NEWREC=1") + target_sources(core PRIVATE ${NEWREC_SOURCES} + cpu_newrec_compiler_x64.cpp + cpu_newrec_compiler_x64.h + ) + message("Building x64 newrec") + endif() elseif(${CPU_ARCH} STREQUAL "aarch32") target_compile_definitions(core PUBLIC "ENABLE_RECOMPILER=1") target_sources(core PRIVATE ${RECOMPILER_SRCS} @@ -148,6 +162,25 @@ elseif(${CPU_ARCH} STREQUAL "aarch64") ) target_link_libraries(core PUBLIC vixl) message("Building AArch64 recompiler") + if(ENABLE_NEWREC) + target_compile_definitions(core PUBLIC "ENABLE_NEWREC=1") + target_sources(core PRIVATE ${NEWREC_SOURCES} + cpu_newrec_compiler_aarch64.cpp + cpu_newrec_compiler_aarch64.h + ) + message("Building AArch64 newrec") + endif() +elseif(${CPU_ARCH} STREQUAL "riscv64") + target_compile_definitions(core PUBLIC "ENABLE_MMAP_FASTMEM=1") + if(ENABLE_NEWREC) + target_compile_definitions(core PUBLIC "ENABLE_NEWREC=1") + target_sources(core PRIVATE ${NEWREC_SOURCES} + cpu_newrec_compiler_riscv64.cpp + cpu_newrec_compiler_riscv64.h + ) + target_link_libraries(core PUBLIC biscuit::biscuit riscv-disas) + message("Building RISC-V 64-bit newrec") + endif() else() message("Not building recompiler") endif() diff --git a/src/core/core.props b/src/core/core.props index a4ceaf16b..9bd5357f4 100644 --- a/src/core/core.props +++ b/src/core/core.props @@ -8,6 +8,7 @@ ENABLE_RAINTEGRATION=1;%(PreprocessorDefinitions) ENABLE_RECOMPILER=1;%(PreprocessorDefinitions) ENABLE_MMAP_FASTMEM=1;%(PreprocessorDefinitions) + ENABLE_NEWREC=1;%(PreprocessorDefinitions) %(AdditionalIncludeDirectories);$(SolutionDir)dep\xxhash\include;$(SolutionDir)dep\zlib\include;$(SolutionDir)dep\rcheevos\include;$(SolutionDir)dep\rapidjson\include;$(SolutionDir)dep\discord-rpc\include %(AdditionalIncludeDirectories);$(SolutionDir)dep\rainterface diff --git a/src/core/core.vcxproj b/src/core/core.vcxproj index 6366658d7..9846a201c 100644 --- a/src/core/core.vcxproj +++ b/src/core/core.vcxproj @@ -13,6 +13,13 @@ + + + true + + + true + true @@ -90,6 +97,13 @@ + + + true + + + true + true diff --git a/src/core/core.vcxproj.filters b/src/core/core.vcxproj.filters index f0bd545d4..e6dea09a4 100644 --- a/src/core/core.vcxproj.filters +++ b/src/core/core.vcxproj.filters @@ -60,6 +60,9 @@ + + + @@ -125,5 +128,8 @@ + + + \ No newline at end of file diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp index 3edef3860..8bf28e196 100644 --- a/src/core/cpu_code_cache.cpp +++ b/src/core/cpu_code_cache.cpp @@ -21,6 +21,10 @@ Log_SetChannel(CPU::CodeCache); #include "cpu_recompiler_code_generator.h" #endif +#ifdef ENABLE_NEWREC +#include "cpu_newrec_compiler.h" +#endif + #include #include @@ -144,7 +148,8 @@ static u32 s_total_host_instructions_emitted = 0; bool 
CPU::CodeCache::IsUsingAnyRecompiler() { #ifdef ENABLE_RECOMPILER_SUPPORT - return g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler; + return (g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler || + g_settings.cpu_execution_mode == CPUExecutionMode::NewRec); #else return false; #endif @@ -498,8 +503,8 @@ CPU::CodeCache::Block* CPU::CodeCache::CreateBlock(u32 pc, const BlockInstructio return block; } - // TODO: Only used by NewRec for now, don't waste time filling it. - if constexpr (false) + // Old rec doesn't use backprop info, don't waste time filling it. + if (g_settings.cpu_execution_mode == CPUExecutionMode::NewRec) FillBlockRegInfo(block); // add it to the tracking list for its page @@ -1419,6 +1424,10 @@ bool CPU::CodeCache::CompileBlock(Block* block) host_code = codegen.CompileBlock(block, &host_code_size, &host_far_code_size); } #endif +#ifdef ENABLE_NEWREC + if (g_settings.cpu_execution_mode == CPUExecutionMode::NewRec) + host_code = NewRec::g_compiler->CompileBlock(block, &host_code_size, &host_far_code_size); +#endif s_code_buffer.WriteProtect(true); @@ -1570,6 +1579,10 @@ void CPU::CodeCache::BackpatchLoadStore(void* host_pc, const LoadstoreBackpatchI if (g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler) Recompiler::CodeGenerator::BackpatchLoadStore(host_pc, info); #endif +#ifdef ENABLE_NEWREC + if (g_settings.cpu_execution_mode == CPUExecutionMode::NewRec) + NewRec::BackpatchLoadStore(host_pc, info); +#endif s_code_buffer.WriteProtect(true); } diff --git a/src/core/cpu_code_cache_private.h b/src/core/cpu_code_cache_private.h index f1392e0d8..341fde10f 100644 --- a/src/core/cpu_code_cache_private.h +++ b/src/core/cpu_code_cache_private.h @@ -227,7 +227,7 @@ void InterpretUncachedBlock(); void LogCurrentState(); -#if defined(ENABLE_RECOMPILER) +#if defined(ENABLE_RECOMPILER) || defined(ENABLE_NEWREC) #define ENABLE_RECOMPILER_SUPPORT 1 #if defined(_DEBUG) || false diff --git a/src/core/cpu_core.cpp b/src/core/cpu_core.cpp index 531453bde..cedabfde0 100644 --- a/src/core/cpu_core.cpp +++ b/src/core/cpu_core.cpp @@ -2231,6 +2231,7 @@ void CPU::Execute() { case CPUExecutionMode::Recompiler: case CPUExecutionMode::CachedInterpreter: + case CPUExecutionMode::NewRec: CodeCache::Execute(); break; diff --git a/src/core/cpu_newrec_compiler.cpp b/src/core/cpu_newrec_compiler.cpp new file mode 100644 index 000000000..5a3fb9b42 --- /dev/null +++ b/src/core/cpu_newrec_compiler.cpp @@ -0,0 +1,2277 @@ +// SPDX-FileCopyrightText: 2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#include "cpu_newrec_compiler.h" +#include "common/assert.h" +#include "common/log.h" +#include "common/small_string.h" +#include "cpu_code_cache.h" +#include "cpu_core_private.h" +#include "cpu_disasm.h" +#include "pgxp.h" +#include "settings.h" +#include +#include +Log_SetChannel(NewRec::Compiler); + +// TODO: direct link skip delay slot check +// TODO: speculative constants +// TODO: std::bitset in msvc has bounds checks even in release... 
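+
+// This file implements the architecture-independent part of the new recompiler:
+// register cache, constant propagation, load-delay tracking and per-instruction
+// dispatch. Host code emission lives in the per-architecture
+// cpu_newrec_compiler_* backends.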
+ +const std::array, 3> CPU::NewRec::Compiler::s_pgxp_mem_load_functions = { + {{{reinterpret_cast(&PGXP::CPU_LBx), reinterpret_cast(&PGXP::CPU_LBx)}}, + {{reinterpret_cast(&PGXP::CPU_LHU), reinterpret_cast(&PGXP::CPU_LH)}}, + {{reinterpret_cast(&PGXP::CPU_LW)}}}}; +const std::array CPU::NewRec::Compiler::s_pgxp_mem_store_functions = { + {reinterpret_cast(&PGXP::CPU_SB), reinterpret_cast(&PGXP::CPU_SH), + reinterpret_cast(&PGXP::CPU_SW)}}; + +CPU::NewRec::Compiler::Compiler() = default; + +CPU::NewRec::Compiler::~Compiler() = default; + +void CPU::NewRec::Compiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer, + u32 far_code_space) +{ + m_block = block; + m_compiler_pc = block->pc; + m_cycles = 0; + m_gte_done_cycle = 0; + inst = nullptr; + iinfo = nullptr; + m_current_instruction_pc = 0; + m_current_instruction_branch_delay_slot = false; + m_dirty_pc = false; + m_dirty_instruction_bits = false; + m_dirty_gte_done_cycle = true; + m_block_ended = false; + m_constant_reg_values.fill(0); + m_constant_regs_valid.reset(); + m_constant_regs_dirty.reset(); + + for (u32 i = 0; i < NUM_HOST_REGS; i++) + ClearHostReg(i); + m_register_alloc_counter = 0; + + m_constant_reg_values[static_cast(Reg::zero)] = 0; + m_constant_regs_valid.set(static_cast(Reg::zero)); + + m_load_delay_dirty = EMULATE_LOAD_DELAYS; + m_load_delay_register = Reg::count; + m_load_delay_value_register = NUM_HOST_REGS; +} + +void CPU::NewRec::Compiler::BeginBlock() +{ +#if 0 + GenerateCall(reinterpret_cast(&CPU::CodeCache::LogCurrentState)); +#endif + + if (m_block->protection == CodeCache::PageProtectionMode::ManualCheck) + { + Log_DebugPrintf("Generate manual protection for PC %08X", m_block->pc); + const u8* ram_ptr = Bus::g_ram + VirtualAddressToPhysical(m_block->pc); + const u8* shadow_ptr = reinterpret_cast(m_block->Instructions()); + GenerateBlockProtectCheck(ram_ptr, shadow_ptr, m_block->size * sizeof(Instruction)); + } + + if (m_block->uncached_fetch_ticks > 0 || m_block->icache_line_count > 0) + GenerateICacheCheckAndUpdate(); + + if (g_settings.bios_tty_logging) + { + if (m_block->pc == 0xa0) + GenerateCall(reinterpret_cast(&CPU::HandleA0Syscall)); + else if (m_block->pc == 0xb0) + GenerateCall(reinterpret_cast(&CPU::HandleB0Syscall)); + } + + inst = m_block->Instructions(); + iinfo = m_block->InstructionsInfo(); + m_current_instruction_pc = m_block->pc; + m_current_instruction_branch_delay_slot = false; + m_compiler_pc += sizeof(Instruction); + m_dirty_pc = true; + m_dirty_instruction_bits = true; +} + +const void* CPU::NewRec::Compiler::CompileBlock(CodeCache::Block* block, u32* host_code_size, u32* host_far_code_size) +{ + JitCodeBuffer& buffer = CodeCache::GetCodeBuffer(); + Reset(block, buffer.GetFreeCodePointer(), buffer.GetFreeCodeSpace(), buffer.GetFreeFarCodePointer(), + buffer.GetFreeFarCodeSpace()); + + Log_DebugPrintf("Block range: %08X -> %08X", block->pc, block->pc + block->size * 4); + + BeginBlock(); + + for (;;) + { + CompileInstruction(); + + if (iinfo->is_last_instruction || m_block_ended) + { + if (!m_block_ended) + { + // Block was truncated. Link it. 
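+        // The block ended without a terminating branch (e.g. it hit the block
+        // size limit), so finish it at the next sequential PC and let block
+        // linking continue from there.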
+ EndBlock(m_compiler_pc, false); + } + + break; + } + + inst++; + iinfo++; + m_current_instruction_pc += sizeof(Instruction); + m_compiler_pc += sizeof(Instruction); + m_dirty_pc = true; + m_dirty_instruction_bits = true; + } + + // Nothing should be valid anymore + for (u32 i = 0; i < NUM_HOST_REGS; i++) + DebugAssert(!IsHostRegAllocated(i)); + for (u32 i = 1; i < static_cast(Reg::count); i++) + DebugAssert(!m_constant_regs_dirty.test(i) && !m_constant_regs_valid.test(i)); + + u32 code_size, far_code_size; + const void* code = EndCompile(&code_size, &far_code_size); + *host_code_size = code_size; + *host_far_code_size = far_code_size; + buffer.CommitCode(code_size); + buffer.CommitFarCode(far_code_size); + + return code; +} + +void CPU::NewRec::Compiler::SetConstantReg(Reg r, u32 v) +{ + DebugAssert(r < Reg::count && r != Reg::zero); + + // There might still be an incoming load delay which we need to cancel. + CancelLoadDelaysToReg(r); + + if (m_constant_regs_valid.test(static_cast(r)) && m_constant_reg_values[static_cast(r)] == v) + { + // Shouldn't be any host regs though. + DebugAssert(!CheckHostReg(0, HR_TYPE_CPU_REG, r).has_value()); + return; + } + + m_constant_reg_values[static_cast(r)] = v; + m_constant_regs_valid.set(static_cast(r)); + m_constant_regs_dirty.set(static_cast(r)); + + if (const std::optional hostreg = CheckHostReg(0, HR_TYPE_CPU_REG, r); hostreg.has_value()) + { + Log_DebugPrintf("Discarding guest register %s in host register %s due to constant set", GetRegName(r), + GetHostRegName(hostreg.value())); + FreeHostReg(hostreg.value()); + } +} + +void CPU::NewRec::Compiler::CancelLoadDelaysToReg(Reg reg) +{ + if (m_load_delay_register != reg) + return; + + Log_DebugPrintf("Cancelling load delay to %s", GetRegName(reg)); + m_load_delay_register = Reg::count; + if (m_load_delay_value_register != NUM_HOST_REGS) + ClearHostReg(m_load_delay_value_register); +} + +void CPU::NewRec::Compiler::UpdateLoadDelay() +{ + if (m_load_delay_dirty) + { + // we shouldn't have a static load delay. + DebugAssert(!HasLoadDelay()); + + // have to invalidate registers, we might have one of them cached + // TODO: double check the order here, will we trash a new value? we shouldn't... + // thankfully since this only happens on the first instruction, we can get away with just killing anything which + // isn't in write mode, because nothing could've been written before it, and the new value overwrites any + // load-delayed value + Log_DebugPrintf("Invalidating non-dirty registers, and flushing load delay from state"); + + constexpr u32 req_flags = (HR_ALLOCATED | HR_MODE_WRITE); + + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + if (ra.type != HR_TYPE_CPU_REG || !IsHostRegAllocated(i) || ((ra.flags & req_flags) == req_flags)) + continue; + + Log_DebugPrintf("Freeing non-dirty cached register %s in %s", GetRegName(ra.reg), GetHostRegName(i)); + DebugAssert(!(ra.flags & HR_MODE_WRITE)); + ClearHostReg(i); + } + + // remove any non-dirty constants too + for (u32 i = 1; i < static_cast(Reg::count); i++) + { + if (!HasConstantReg(static_cast(i)) || HasDirtyConstantReg(static_cast(i))) + continue; + + Log_DebugPrintf("Clearing non-dirty constant %s", GetRegName(static_cast(i))); + ClearConstantReg(static_cast(i)); + } + + Flush(FLUSH_LOAD_DELAY_FROM_STATE); + } + + // commit the delayed register load + FinishLoadDelay(); + + // move next load delay forward + if (m_next_load_delay_register != Reg::count) + { + // if it somehow got flushed, read it back in. 
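+    // (the allocator can spill the next-load-delay value register under pressure,
+    // which resets m_next_load_delay_value_register to NUM_HOST_REGS.)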
+ if (m_next_load_delay_value_register == NUM_HOST_REGS) + { + AllocateHostReg(HR_MODE_READ, HR_TYPE_NEXT_LOAD_DELAY_VALUE, m_next_load_delay_register); + DebugAssert(m_next_load_delay_value_register != NUM_HOST_REGS); + } + + HostRegAlloc& ra = m_host_regs[m_next_load_delay_value_register]; + ra.flags |= HR_MODE_WRITE; + ra.type = HR_TYPE_LOAD_DELAY_VALUE; + + m_load_delay_register = m_next_load_delay_register; + m_load_delay_value_register = m_next_load_delay_value_register; + m_next_load_delay_register = Reg::count; + m_next_load_delay_value_register = NUM_HOST_REGS; + } +} + +void CPU::NewRec::Compiler::FinishLoadDelay() +{ + DebugAssert(!m_load_delay_dirty); + if (!HasLoadDelay()) + return; + + // we may need to reload the value.. + if (m_load_delay_value_register == NUM_HOST_REGS) + { + AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, m_load_delay_register); + DebugAssert(m_load_delay_value_register != NUM_HOST_REGS); + } + + // kill any (old) cached value for this register + DeleteMIPSReg(m_load_delay_register, false); + + Log_DebugPrintf("Finished delayed load to %s in host register %s", GetRegName(m_load_delay_register), + GetHostRegName(m_load_delay_value_register)); + + // and swap the mode over so it gets written back later + HostRegAlloc& ra = m_host_regs[m_load_delay_value_register]; + DebugAssert(ra.reg == m_load_delay_register); + ra.flags = (ra.flags & IMMUTABLE_HR_FLAGS) | HR_ALLOCATED | HR_MODE_READ | HR_MODE_WRITE; + ra.counter = m_register_alloc_counter++; + ra.type = HR_TYPE_CPU_REG; + + // constants are gone + Log_DebugPrintf("Clearing constant in %s due to load delay", GetRegName(m_load_delay_register)); + ClearConstantReg(m_load_delay_register); + + m_load_delay_register = Reg::count; + m_load_delay_value_register = NUM_HOST_REGS; +} + +void CPU::NewRec::Compiler::FinishLoadDelayToReg(Reg reg) +{ + if (m_load_delay_dirty) + { + // inter-block :( + UpdateLoadDelay(); + return; + } + + if (m_load_delay_register != reg) + return; + + FinishLoadDelay(); +} + +u32 CPU::NewRec::Compiler::GetFlagsForNewLoadDelayedReg() const +{ + return g_settings.gpu_pgxp_enable ? (HR_MODE_WRITE | HR_CALLEE_SAVED) : (HR_MODE_WRITE); +} + +void CPU::NewRec::Compiler::ClearConstantReg(Reg r) +{ + DebugAssert(r < Reg::count && r != Reg::zero); + m_constant_reg_values[static_cast(r)] = 0; + m_constant_regs_valid.reset(static_cast(r)); + m_constant_regs_dirty.reset(static_cast(r)); +} + +void CPU::NewRec::Compiler::FlushConstantRegs(bool invalidate) +{ + for (u32 i = 1; i < static_cast(Reg::count); i++) + { + if (m_constant_regs_dirty.test(static_cast(i))) + FlushConstantReg(static_cast(i)); + if (invalidate) + ClearConstantReg(static_cast(i)); + } +} + +CPU::Reg CPU::NewRec::Compiler::MipsD() const +{ + return inst->r.rd; +} + +u32 CPU::NewRec::Compiler::GetConditionalBranchTarget(CompileFlags cf) const +{ + // compiler pc has already been advanced when swapping branch delay slots + const u32 current_pc = m_compiler_pc - (cf.delay_slot_swapped ? sizeof(Instruction) : 0); + return current_pc + (inst->i.imm_sext32() << 2); +} + +u32 CPU::NewRec::Compiler::GetBranchReturnAddress(CompileFlags cf) const +{ + // compiler pc has already been advanced when swapping branch delay slots + return m_compiler_pc + (cf.delay_slot_swapped ? 
0 : sizeof(Instruction)); +} + +bool CPU::NewRec::Compiler::TrySwapDelaySlot(Reg rs, Reg rt, Reg rd) +{ + if constexpr (!SWAP_BRANCH_DELAY_SLOTS) + return false; + + const Instruction* next_instruction = inst + 1; + DebugAssert(next_instruction < (m_block->Instructions() + m_block->size)); + + const Reg opcode_rs = next_instruction->r.rs; + const Reg opcode_rt = next_instruction->r.rt; + const Reg opcode_rd = next_instruction->r.rd; + +#ifdef _DEBUG + TinyString disasm; + DisassembleInstruction(&disasm, m_current_instruction_pc + 4, next_instruction->bits); +#endif + + // Just in case we read it in the instruction.. but the block should end after this. + const Instruction* const backup_instruction = inst; + const u32 backup_instruction_pc = m_current_instruction_pc; + const bool backup_instruction_delay_slot = m_current_instruction_branch_delay_slot; + + if (next_instruction->bits == 0) + { + // nop + goto is_safe; + } + + // can't swap when the branch is the first instruction because of bloody load delays + if ((EMULATE_LOAD_DELAYS && m_block->pc == m_current_instruction_pc) || m_load_delay_dirty || + (HasLoadDelay() && (m_load_delay_register == rs || m_load_delay_register == rt || m_load_delay_register == rd))) + { + goto is_unsafe; + } + + switch (next_instruction->op) + { + case InstructionOp::addi: + case InstructionOp::addiu: + case InstructionOp::slti: + case InstructionOp::sltiu: + case InstructionOp::andi: + case InstructionOp::ori: + case InstructionOp::xori: + case InstructionOp::lui: + case InstructionOp::lb: + case InstructionOp::lh: + case InstructionOp::lwl: + case InstructionOp::lw: + case InstructionOp::lbu: + case InstructionOp::lhu: + case InstructionOp::lwr: + case InstructionOp::sb: + case InstructionOp::sh: + case InstructionOp::swl: + case InstructionOp::sw: + case InstructionOp::swr: + { + if ((rs != Reg::zero && rs == opcode_rt) || (rt != Reg::zero && rt == opcode_rt) || + (rd != Reg::zero && (rd == opcode_rs || rd == opcode_rt)) || + (HasLoadDelay() && (m_load_delay_register == opcode_rs || m_load_delay_register == opcode_rt))) + { + goto is_unsafe; + } + } + break; + + case InstructionOp::lwc2: // LWC2 + case InstructionOp::swc2: // SWC2 + break; + + case InstructionOp::funct: // SPECIAL + { + switch (next_instruction->r.funct) + { + case InstructionFunct::sll: + case InstructionFunct::srl: + case InstructionFunct::sra: + case InstructionFunct::sllv: + case InstructionFunct::srlv: + case InstructionFunct::srav: + case InstructionFunct::add: + case InstructionFunct::addu: + case InstructionFunct::sub: + case InstructionFunct::subu: + case InstructionFunct::and_: + case InstructionFunct::or_: + case InstructionFunct::xor_: + case InstructionFunct::nor: + case InstructionFunct::slt: + case InstructionFunct::sltu: + { + if ((rs != Reg::zero && rs == opcode_rd) || (rt != Reg::zero && rt == opcode_rd) || + (rd != Reg::zero && (rd == opcode_rs || rd == opcode_rt)) || + (HasLoadDelay() && (m_load_delay_register == opcode_rs || m_load_delay_register == opcode_rt || + m_load_delay_register == opcode_rd))) + { + goto is_unsafe; + } + } + break; + + case InstructionFunct::mult: + case InstructionFunct::multu: + case InstructionFunct::div: + case InstructionFunct::divu: + { + if (HasLoadDelay() && (m_load_delay_register == opcode_rs || m_load_delay_register == opcode_rt)) + goto is_unsafe; + } + break; + + default: + goto is_unsafe; + } + } + break; + + case InstructionOp::cop0: // COP0 + case InstructionOp::cop1: // COP1 + case InstructionOp::cop2: // COP2 + case 
InstructionOp::cop3: // COP3 + { + if (next_instruction->cop.IsCommonInstruction()) + { + switch (next_instruction->cop.CommonOp()) + { + case CopCommonInstruction::mfcn: // MFC0 + case CopCommonInstruction::cfcn: // CFC0 + { + if ((rs != Reg::zero && rs == opcode_rt) || (rt != Reg::zero && rt == opcode_rt) || + (rd != Reg::zero && rd == opcode_rt) || (HasLoadDelay() && m_load_delay_register == opcode_rt)) + { + goto is_unsafe; + } + } + break; + + case CopCommonInstruction::mtcn: // MTC0 + case CopCommonInstruction::ctcn: // CTC0 + break; + } + } + else + { + // swap when it's GTE + if (next_instruction->op != InstructionOp::cop2) + goto is_unsafe; + } + } + break; + + default: + goto is_unsafe; + } + +is_safe: +#ifdef _DEBUG + Log_DevFmt("Swapping delay slot {:08X} {}", m_current_instruction_pc + 4, disasm); +#endif + + CompileBranchDelaySlot(); + + inst = backup_instruction; + m_current_instruction_pc = backup_instruction_pc; + m_current_instruction_branch_delay_slot = backup_instruction_delay_slot; + return true; + +is_unsafe: +#ifdef _DEBUG + Log_DevFmt("NOT swapping delay slot {:08X} {}", m_current_instruction_pc + 4, disasm); +#endif + + return false; +} + +void CPU::NewRec::Compiler::SetCompilerPC(u32 newpc) +{ + m_compiler_pc = newpc; + m_dirty_pc = true; +} + +u32 CPU::NewRec::Compiler::GetFreeHostReg(u32 flags) +{ + const u32 req_flags = HR_USABLE | (flags & HR_CALLEE_SAVED); + + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + if ((m_host_regs[i].flags & (req_flags | HR_NEEDED | HR_ALLOCATED)) == req_flags) + return i; + } + + // find register with lowest counter + u32 lowest = NUM_HOST_REGS; + u16 lowest_count = std::numeric_limits::max(); + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + const HostRegAlloc& ra = m_host_regs[i]; + if ((ra.flags & (req_flags | HR_NEEDED)) != req_flags) + continue; + + DebugAssert(ra.flags & HR_ALLOCATED); + if (ra.type == HR_TYPE_TEMP) + { + // can't punt temps + continue; + } + + if (ra.counter < lowest_count) + { + lowest = i; + lowest_count = ra.counter; + } + } + + // + + AssertMsg(lowest != NUM_HOST_REGS, "Register allocation failed."); + + const HostRegAlloc& ra = m_host_regs[lowest]; + switch (ra.type) + { + case HR_TYPE_CPU_REG: + { + // If the register is needed later, and we're allocating a callee-saved register, try moving it to a caller-saved + // register. 
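+        // i.e. keep the still-needed guest value cached in another host register
+        // instead of spilling it back to CPU state.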
+ if (iinfo->UsedTest(ra.reg) && flags & HR_CALLEE_SAVED) + { + u32 caller_saved_lowest = NUM_HOST_REGS; + u16 caller_saved_lowest_count = std::numeric_limits::max(); + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + constexpr u32 caller_req_flags = HR_USABLE; + constexpr u32 caller_req_mask = HR_USABLE | HR_NEEDED | HR_CALLEE_SAVED; + const HostRegAlloc& caller_ra = m_host_regs[i]; + if ((caller_ra.flags & caller_req_mask) != caller_req_flags) + continue; + + if (!(caller_ra.flags & HR_ALLOCATED)) + { + caller_saved_lowest = i; + caller_saved_lowest_count = 0; + break; + } + + if (caller_ra.type == HR_TYPE_TEMP) + continue; + + if (caller_ra.counter < caller_saved_lowest_count) + { + caller_saved_lowest = i; + caller_saved_lowest_count = caller_ra.counter; + } + } + + if (caller_saved_lowest_count < lowest_count) + { + Log_DebugPrintf("Moving caller-saved host register %s with MIPS register %s to %s for allocation", + GetHostRegName(lowest), GetRegName(ra.reg), GetHostRegName(caller_saved_lowest)); + if (IsHostRegAllocated(caller_saved_lowest)) + FreeHostReg(caller_saved_lowest); + CopyHostReg(caller_saved_lowest, lowest); + SwapHostRegAlloc(caller_saved_lowest, lowest); + DebugAssert(!IsHostRegAllocated(lowest)); + return lowest; + } + } + + Log_DebugPrintf("Freeing register %s in host register %s for allocation", GetHostRegName(lowest), + GetRegName(ra.reg)); + } + break; + case HR_TYPE_LOAD_DELAY_VALUE: + { + Log_DebugPrintf("Freeing load delay register %s in host register %s for allocation", GetHostRegName(lowest), + GetRegName(ra.reg)); + } + break; + case HR_TYPE_NEXT_LOAD_DELAY_VALUE: + { + Log_DebugPrintf("Freeing next load delay register %s in host register %s due for allocation", + GetHostRegName(lowest), GetRegName(ra.reg)); + } + break; + default: + { + Panic("Unknown type freed"); + } + break; + } + + FreeHostReg(lowest); + return lowest; +} + +const char* CPU::NewRec::Compiler::GetReadWriteModeString(u32 flags) +{ + if ((flags & (HR_MODE_READ | HR_MODE_WRITE)) == (HR_MODE_READ | HR_MODE_WRITE)) + return "read-write"; + else if (flags & HR_MODE_READ) + return "read-only"; + else if (flags & HR_MODE_WRITE) + return "write-only"; + else + return "UNKNOWN"; +} + +u32 CPU::NewRec::Compiler::AllocateHostReg(u32 flags, HostRegAllocType type /* = HR_TYPE_TEMP */, + Reg reg /* = Reg::count */) +{ + // Cancel any load delays before booting anything out + if (flags & HR_MODE_WRITE && (type == HR_TYPE_CPU_REG || type == HR_TYPE_NEXT_LOAD_DELAY_VALUE)) + CancelLoadDelaysToReg(reg); + + // Already have a matching type? + if (type != HR_TYPE_TEMP) + { + const std::optional check_reg = CheckHostReg(flags, type, reg); + + // shouldn't be allocating >1 load delay in a single instruction.. 
+ // TODO: prefer callee saved registers for load delay + DebugAssert((type != HR_TYPE_LOAD_DELAY_VALUE && type != HR_TYPE_NEXT_LOAD_DELAY_VALUE) || !check_reg.has_value()); + if (check_reg.has_value()) + return check_reg.value(); + } + + const u32 hreg = GetFreeHostReg(flags); + HostRegAlloc& ra = m_host_regs[hreg]; + ra.flags = (ra.flags & IMMUTABLE_HR_FLAGS) | (flags & ALLOWED_HR_FLAGS) | HR_ALLOCATED | HR_NEEDED; + ra.type = type; + ra.reg = reg; + ra.counter = m_register_alloc_counter++; + + switch (type) + { + case HR_TYPE_CPU_REG: + { + DebugAssert(reg != Reg::zero); + + Log_DebugPrintf("Allocate host reg %s to guest reg %s in %s mode", GetHostRegName(hreg), GetRegName(reg), + GetReadWriteModeString(flags)); + + if (flags & HR_MODE_READ) + { + DebugAssert(ra.reg > Reg::zero && ra.reg < Reg::count); + + if (HasConstantReg(reg)) + { + // may as well flush it now + Log_DebugPrintf("Flush constant register in guest reg %s to host reg %s", GetRegName(reg), + GetHostRegName(hreg)); + LoadHostRegWithConstant(hreg, GetConstantRegU32(reg)); + m_constant_regs_dirty.reset(static_cast(reg)); + ra.flags |= HR_MODE_WRITE; + } + else + { + LoadHostRegFromCPUPointer(hreg, &g_state.regs.r[static_cast(reg)]); + } + } + + if (flags & HR_MODE_WRITE && HasConstantReg(reg)) + { + DebugAssert(reg != Reg::zero); + Log_DebugPrintf("Clearing constant register in guest reg %s due to write mode in %s", GetRegName(reg), + GetHostRegName(hreg)); + + ClearConstantReg(reg); + } + } + break; + + case HR_TYPE_LOAD_DELAY_VALUE: + { + DebugAssert(!m_load_delay_dirty && (!HasLoadDelay() || !(flags & HR_MODE_WRITE))); + Log_DebugPrintf("Allocating load delayed guest register %s in host reg %s in %s mode", GetRegName(reg), + GetHostRegName(hreg), GetReadWriteModeString(flags)); + m_load_delay_register = reg; + m_load_delay_value_register = hreg; + if (flags & HR_MODE_READ) + LoadHostRegFromCPUPointer(hreg, &g_state.load_delay_value); + } + break; + + case HR_TYPE_NEXT_LOAD_DELAY_VALUE: + { + Log_DebugPrintf("Allocating next load delayed guest register %s in host reg %s in %s mode", GetRegName(reg), + GetHostRegName(hreg), GetReadWriteModeString(flags)); + m_next_load_delay_register = reg; + m_next_load_delay_value_register = hreg; + if (flags & HR_MODE_READ) + LoadHostRegFromCPUPointer(hreg, &g_state.next_load_delay_value); + } + break; + + case HR_TYPE_TEMP: + { + DebugAssert(!(flags & (HR_MODE_READ | HR_MODE_WRITE))); + Log_DebugPrintf("Allocate host reg %s as temporary", GetHostRegName(hreg)); + } + break; + + default: + Panic("Unknown type"); + break; + } + + return hreg; +} + +std::optional CPU::NewRec::Compiler::CheckHostReg(u32 flags, HostRegAllocType type /* = HR_TYPE_TEMP */, + Reg reg /* = Reg::count */) +{ + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + if (!(ra.flags & HR_ALLOCATED) || ra.type != type || ra.reg != reg) + continue; + + DebugAssert(ra.flags & HR_MODE_READ); + if (flags & HR_MODE_WRITE) + { + DebugAssert(type == HR_TYPE_CPU_REG); + if (!(ra.flags & HR_MODE_WRITE)) + { + Log_DebugPrintf("Switch guest reg %s from read to read-write in host reg %s", GetRegName(reg), + GetHostRegName(i)); + } + + if (HasConstantReg(reg)) + { + DebugAssert(reg != Reg::zero); + Log_DebugPrintf("Clearing constant register in guest reg %s due to write mode in %s", GetRegName(reg), + GetHostRegName(i)); + + ClearConstantReg(reg); + } + } + + ra.flags |= (flags & ALLOWED_HR_FLAGS) | HR_NEEDED; + ra.counter = m_register_alloc_counter++; + + // Need a callee saved reg? 
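+    // The caller wants this value to survive an upcoming C call; if it currently
+    // lives in a caller-saved register, migrate it to a callee-saved one.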
+ if (flags & HR_CALLEE_SAVED && !(ra.flags & HR_CALLEE_SAVED)) + { + // Need to move it to one which is + const u32 new_reg = GetFreeHostReg(HR_CALLEE_SAVED); + Log_DebugPrintf("Rename host reg %s to %s for callee saved", GetHostRegName(i), GetHostRegName(new_reg)); + + CopyHostReg(new_reg, i); + SwapHostRegAlloc(i, new_reg); + DebugAssert(!IsHostRegAllocated(i)); + return new_reg; + } + + return i; + } + + return std::nullopt; +} + +u32 CPU::NewRec::Compiler::AllocateTempHostReg(u32 flags) +{ + return AllocateHostReg(flags, HR_TYPE_TEMP); +} + +void CPU::NewRec::Compiler::SwapHostRegAlloc(u32 lhs, u32 rhs) +{ + HostRegAlloc& lra = m_host_regs[lhs]; + HostRegAlloc& rra = m_host_regs[rhs]; + + const u8 lra_flags = lra.flags; + lra.flags = (lra.flags & IMMUTABLE_HR_FLAGS) | (rra.flags & ~IMMUTABLE_HR_FLAGS); + rra.flags = (rra.flags & IMMUTABLE_HR_FLAGS) | (lra_flags & ~IMMUTABLE_HR_FLAGS); + std::swap(lra.type, rra.type); + std::swap(lra.reg, rra.reg); + std::swap(lra.counter, rra.counter); +} + +void CPU::NewRec::Compiler::FlushHostReg(u32 reg) +{ + HostRegAlloc& ra = m_host_regs[reg]; + if (ra.flags & HR_MODE_WRITE) + { + switch (ra.type) + { + case HR_TYPE_CPU_REG: + { + DebugAssert(ra.reg > Reg::zero && ra.reg < Reg::count); + Log_DebugPrintf("Flushing register %s in host register %s to state", GetRegName(ra.reg), GetHostRegName(reg)); + StoreHostRegToCPUPointer(reg, &g_state.regs.r[static_cast(ra.reg)]); + } + break; + + case HR_TYPE_LOAD_DELAY_VALUE: + { + DebugAssert(m_load_delay_value_register == reg); + Log_DebugPrintf("Flushing load delayed register %s in host register %s to state", GetRegName(ra.reg), + GetHostRegName(reg)); + + StoreHostRegToCPUPointer(reg, &g_state.load_delay_value); + m_load_delay_value_register = NUM_HOST_REGS; + } + break; + + case HR_TYPE_NEXT_LOAD_DELAY_VALUE: + { + DebugAssert(m_next_load_delay_value_register == reg); + Log_WarningPrintf("Flushing NEXT load delayed register %s in host register %s to state", GetRegName(ra.reg), + GetHostRegName(reg)); + + StoreHostRegToCPUPointer(reg, &g_state.next_load_delay_value); + m_next_load_delay_value_register = NUM_HOST_REGS; + } + break; + + default: + break; + } + + ra.flags = (ra.flags & ~HR_MODE_WRITE) | HR_MODE_READ; + } +} + +void CPU::NewRec::Compiler::FreeHostReg(u32 reg) +{ + DebugAssert(IsHostRegAllocated(reg)); + FlushHostReg(reg); + ClearHostReg(reg); +} + +void CPU::NewRec::Compiler::ClearHostReg(u32 reg) +{ + HostRegAlloc& ra = m_host_regs[reg]; + ra.flags &= IMMUTABLE_HR_FLAGS; + ra.type = HR_TYPE_TEMP; + ra.counter = 0; + ra.reg = Reg::count; +} + +void CPU::NewRec::Compiler::MarkRegsNeeded(HostRegAllocType type, Reg reg) +{ + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + if (ra.flags & HR_ALLOCATED && ra.type == type && ra.reg == reg) + ra.flags |= HR_NEEDED; + } +} + +void CPU::NewRec::Compiler::RenameHostReg(u32 reg, u32 new_flags, HostRegAllocType new_type, Reg new_reg) +{ + // only supported for cpu regs for now + DebugAssert(new_type == HR_TYPE_TEMP || new_type == HR_TYPE_CPU_REG || new_type == HR_TYPE_NEXT_LOAD_DELAY_VALUE); + + const std::optional old_reg = CheckHostReg(0, new_type, new_reg); + if (old_reg.has_value()) + { + // don't writeback + ClearHostReg(old_reg.value()); + } + + // kill any load delay to this reg + if (new_type == HR_TYPE_CPU_REG || new_type == HR_TYPE_NEXT_LOAD_DELAY_VALUE) + CancelLoadDelaysToReg(new_reg); + + if (new_type == HR_TYPE_CPU_REG) + { + Log_DebugPrintf("Renaming host reg %s to guest reg %s", GetHostRegName(reg), 
GetRegName(new_reg)); + } + else if (new_type == HR_TYPE_NEXT_LOAD_DELAY_VALUE) + { + Log_DebugPrintf("Renaming host reg %s to load delayed guest reg %s", GetHostRegName(reg), GetRegName(new_reg)); + DebugAssert(m_next_load_delay_register == Reg::count && m_next_load_delay_value_register == NUM_HOST_REGS); + m_next_load_delay_register = new_reg; + m_next_load_delay_value_register = reg; + } + else + { + Log_DebugPrintf("Renaming host reg %s to temp", GetHostRegName(reg)); + } + + HostRegAlloc& ra = m_host_regs[reg]; + ra.flags = (ra.flags & IMMUTABLE_HR_FLAGS) | HR_NEEDED | HR_ALLOCATED | (new_flags & ALLOWED_HR_FLAGS); + ra.counter = m_register_alloc_counter++; + ra.type = new_type; + ra.reg = new_reg; +} + +void CPU::NewRec::Compiler::ClearHostRegNeeded(u32 reg) +{ + DebugAssert(reg < NUM_HOST_REGS && IsHostRegAllocated(reg)); + HostRegAlloc& ra = m_host_regs[reg]; + if (ra.flags & HR_MODE_WRITE) + ra.flags |= HR_MODE_READ; + + ra.flags &= ~HR_NEEDED; +} + +void CPU::NewRec::Compiler::ClearHostRegsNeeded() +{ + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + if (!(ra.flags & HR_ALLOCATED)) + continue; + + // shouldn't have any temps left + DebugAssert(ra.type != HR_TYPE_TEMP); + + if (ra.flags & HR_MODE_WRITE) + ra.flags |= HR_MODE_READ; + + ra.flags &= ~HR_NEEDED; + } +} + +void CPU::NewRec::Compiler::DeleteMIPSReg(Reg reg, bool flush) +{ + DebugAssert(reg != Reg::zero); + + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + if (ra.flags & HR_ALLOCATED && ra.type == HR_TYPE_CPU_REG && ra.reg == reg) + { + if (flush) + FlushHostReg(i); + ClearHostReg(i); + ClearConstantReg(reg); + return; + } + } + + if (flush) + FlushConstantReg(reg); + ClearConstantReg(reg); +} + +bool CPU::NewRec::Compiler::TryRenameMIPSReg(Reg to, Reg from, u32 fromhost, Reg other) +{ + // can't rename when in form Rd = Rs op Rt and Rd == Rs or Rd == Rt + if (to == from || to == other || !iinfo->RenameTest(from)) + return false; + + Log_DebugPrintf("Renaming MIPS register %s to %s", GetRegName(from), GetRegName(to)); + + if (iinfo->LiveTest(from)) + FlushHostReg(fromhost); + + // remove all references to renamed-to register + DeleteMIPSReg(to, false); + + // and do the actual rename, new register has been modified. + m_host_regs[fromhost].reg = to; + m_host_regs[fromhost].flags |= HR_MODE_READ | HR_MODE_WRITE; + return true; +} + +void CPU::NewRec::Compiler::UpdateHostRegCounters() +{ + const CodeCache::InstructionInfo* const info_end = m_block->InstructionsInfo() + m_block->size; + + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + if ((ra.flags & (HR_ALLOCATED | HR_NEEDED)) != HR_ALLOCATED) + continue; + + // Try not to punt out load delays. 
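+    // Load-delay values get the maximum counter so the allocator evicts them last.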
+ if (ra.type != HR_TYPE_CPU_REG) + { + ra.counter = std::numeric_limits::max(); + continue; + } + + DebugAssert(IsHostRegAllocated(i)); + const CodeCache::InstructionInfo* cur = iinfo; + const Reg reg = ra.reg; + if (!(cur->reg_flags[static_cast(reg)] & CodeCache::RI_USED)) + { + ra.counter = 0; + continue; + } + + // order based on the number of instructions until this register is used + u16 counter_val = std::numeric_limits::max(); + for (; cur != info_end; cur++, counter_val--) + { + if (cur->ReadsReg(reg)) + break; + } + + ra.counter = counter_val; + } +} + +void CPU::NewRec::Compiler::Flush(u32 flags) +{ + // TODO: Flush unneeded caller-saved regs (backup/replace calle-saved needed with caller-saved) + if (flags & + (FLUSH_FREE_UNNEEDED_CALLER_SAVED_REGISTERS | FLUSH_FREE_CALLER_SAVED_REGISTERS | FLUSH_FREE_ALL_REGISTERS)) + { + const u32 req_mask = (flags & FLUSH_FREE_ALL_REGISTERS) ? + HR_ALLOCATED : + ((flags & FLUSH_FREE_CALLER_SAVED_REGISTERS) ? (HR_ALLOCATED | HR_CALLEE_SAVED) : + (HR_ALLOCATED | HR_CALLEE_SAVED | HR_NEEDED)); + constexpr u32 req_flags = HR_ALLOCATED; + + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + if ((ra.flags & req_mask) == req_flags) + FreeHostReg(i); + } + } + + if (flags & FLUSH_INVALIDATE_MIPS_REGISTERS) + { + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + if (ra.flags & HR_ALLOCATED && ra.type == HR_TYPE_CPU_REG) + FreeHostReg(i); + } + + FlushConstantRegs(true); + } + else + { + if (flags & FLUSH_FLUSH_MIPS_REGISTERS) + { + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + if ((ra.flags & (HR_ALLOCATED | HR_MODE_WRITE)) == (HR_ALLOCATED | HR_MODE_WRITE) && ra.type == HR_TYPE_CPU_REG) + FlushHostReg(i); + } + + // flush any constant registers which are dirty too + FlushConstantRegs(false); + } + } +} + +void CPU::NewRec::Compiler::FlushConstantReg(Reg r) +{ + DebugAssert(m_constant_regs_valid.test(static_cast(r))); + Log_DebugPrintf("Writing back register %s with constant value 0x%08X", GetRegName(r), + m_constant_reg_values[static_cast(r)]); + StoreConstantToCPUPointer(m_constant_reg_values[static_cast(r)], &g_state.regs.r[static_cast(r)]); + m_constant_regs_dirty.reset(static_cast(r)); +} + +void CPU::NewRec::Compiler::BackupHostState() +{ + DebugAssert(m_host_state_backup_count < m_host_state_backup.size()); + + // need to back up everything... 
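+  // ...so that a speculatively compiled path (e.g. the taken side of a branch)
+  // can be fully unwound afterwards with RestoreHostState().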
+ HostStateBackup& bu = m_host_state_backup[m_host_state_backup_count]; + bu.cycles = m_cycles; + bu.gte_done_cycle = m_gte_done_cycle; + bu.compiler_pc = m_compiler_pc; + bu.dirty_pc = m_dirty_pc; + bu.dirty_instruction_bits = m_dirty_instruction_bits; + bu.dirty_gte_done_cycle = m_dirty_gte_done_cycle; + bu.block_ended = m_block_ended; + bu.inst = inst; + bu.current_instruction_pc = m_current_instruction_pc; + bu.current_instruction_delay_slot = m_current_instruction_branch_delay_slot; + bu.const_regs_valid = m_constant_regs_valid; + bu.const_regs_dirty = m_constant_regs_dirty; + bu.const_regs_values = m_constant_reg_values; + bu.host_regs = m_host_regs; + bu.register_alloc_counter = m_register_alloc_counter; + bu.load_delay_dirty = m_load_delay_dirty; + bu.load_delay_register = m_load_delay_register; + bu.load_delay_value_register = m_load_delay_value_register; + bu.next_load_delay_register = m_next_load_delay_register; + bu.next_load_delay_value_register = m_next_load_delay_value_register; + m_host_state_backup_count++; +} + +void CPU::NewRec::Compiler::RestoreHostState() +{ + DebugAssert(m_host_state_backup_count > 0); + m_host_state_backup_count--; + + HostStateBackup& bu = m_host_state_backup[m_host_state_backup_count]; + m_host_regs = std::move(bu.host_regs); + m_constant_reg_values = std::move(bu.const_regs_values); + m_constant_regs_dirty = std::move(bu.const_regs_dirty); + m_constant_regs_valid = std::move(bu.const_regs_valid); + m_current_instruction_branch_delay_slot = bu.current_instruction_delay_slot; + m_current_instruction_pc = bu.current_instruction_pc; + inst = bu.inst; + m_block_ended = bu.block_ended; + m_dirty_gte_done_cycle = bu.dirty_gte_done_cycle; + m_dirty_instruction_bits = bu.dirty_instruction_bits; + m_dirty_pc = bu.dirty_pc; + m_compiler_pc = bu.compiler_pc; + m_register_alloc_counter = bu.register_alloc_counter; + m_load_delay_dirty = bu.load_delay_dirty; + m_load_delay_register = bu.load_delay_register; + m_load_delay_value_register = bu.load_delay_value_register; + m_next_load_delay_register = bu.next_load_delay_register; + m_next_load_delay_value_register = bu.next_load_delay_value_register; + m_gte_done_cycle = bu.gte_done_cycle; + m_cycles = bu.cycles; +} + +void CPU::NewRec::Compiler::AddLoadStoreInfo(void* code_address, u32 code_size, u32 address_register, u32 data_register, + MemoryAccessSize size, bool is_signed, bool is_load) +{ + DebugAssert(CodeCache::IsUsingFastmem()); + DebugAssert(address_register < NUM_HOST_REGS); + DebugAssert(data_register < NUM_HOST_REGS); + + u32 gpr_bitmask = 0; + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + if (IsHostRegAllocated(i)) + gpr_bitmask |= (1u << i); + } + + CPU::CodeCache::AddLoadStoreInfo(code_address, code_size, m_current_instruction_pc, m_cycles, gpr_bitmask, + static_cast(address_register), static_cast(data_register), size, is_signed, + is_load); +} + +void CPU::NewRec::Compiler::CompileInstruction() +{ +#ifdef _DEBUG + TinyString str; + DisassembleInstruction(&str, m_current_instruction_pc, inst->bits); + Log_DebugFmt("Compiling{} {:08X}: {}", m_current_instruction_branch_delay_slot ? 
" branch delay slot" : "", + m_current_instruction_pc, str); +#endif + + m_cycles++; + + if (IsNopInstruction(*inst)) + { + UpdateLoadDelay(); + return; + } + + switch (inst->op) + { +#define PGXPFN(x) reinterpret_cast(&PGXP::x) + + // clang-format off + // TODO: PGXP for jalr + + case InstructionOp::funct: + { + switch (inst->r.funct) + { + case InstructionFunct::sll: CompileTemplate(&Compiler::Compile_sll_const, &Compiler::Compile_sll, PGXPFN(CPU_SLL), TF_WRITES_D | TF_READS_T); break; + case InstructionFunct::srl: CompileTemplate(&Compiler::Compile_srl_const, &Compiler::Compile_srl, PGXPFN(CPU_SRL), TF_WRITES_D | TF_READS_T); break; + case InstructionFunct::sra: CompileTemplate(&Compiler::Compile_sra_const, &Compiler::Compile_sra, PGXPFN(CPU_SRA), TF_WRITES_D | TF_READS_T); break; + case InstructionFunct::sllv: CompileTemplate(&Compiler::Compile_sllv_const, &Compiler::Compile_sllv, PGXPFN(CPU_SLLV), TF_WRITES_D | TF_READS_S | TF_READS_T); break; + case InstructionFunct::srlv: CompileTemplate(&Compiler::Compile_srlv_const, &Compiler::Compile_srlv, PGXPFN(CPU_SRLV), TF_WRITES_D | TF_READS_S | TF_READS_T); break; + case InstructionFunct::srav: CompileTemplate(&Compiler::Compile_srav_const, &Compiler::Compile_srav, PGXPFN(CPU_SRAV), TF_WRITES_D | TF_READS_S | TF_READS_T); break; + case InstructionFunct::jr: CompileTemplate(&Compiler::Compile_jr_const, &Compiler::Compile_jr, nullptr, TF_READS_S); break; + case InstructionFunct::jalr: CompileTemplate(&Compiler::Compile_jalr_const, &Compiler::Compile_jalr, nullptr, /*TF_WRITES_D |*/ TF_READS_S | TF_NO_NOP); break; + case InstructionFunct::syscall: Compile_syscall(); break; + case InstructionFunct::break_: Compile_break(); break; + case InstructionFunct::mfhi: CompileMoveRegTemplate(inst->r.rd, Reg::hi, g_settings.gpu_pgxp_cpu); break; + case InstructionFunct::mthi: CompileMoveRegTemplate(Reg::hi, inst->r.rs, g_settings.gpu_pgxp_cpu); break; + case InstructionFunct::mflo: CompileMoveRegTemplate(inst->r.rd, Reg::lo, g_settings.gpu_pgxp_cpu); break; + case InstructionFunct::mtlo: CompileMoveRegTemplate(Reg::lo, inst->r.rs, g_settings.gpu_pgxp_cpu); break; + case InstructionFunct::mult: CompileTemplate(&Compiler::Compile_mult_const, &Compiler::Compile_mult, PGXPFN(CPU_MULT), TF_READS_S | TF_READS_T | TF_WRITES_LO | TF_WRITES_HI | TF_COMMUTATIVE); break; + case InstructionFunct::multu: CompileTemplate(&Compiler::Compile_multu_const, &Compiler::Compile_multu, PGXPFN(CPU_MULTU), TF_READS_S | TF_READS_T | TF_WRITES_LO | TF_WRITES_HI | TF_COMMUTATIVE); break; + case InstructionFunct::div: CompileTemplate(&Compiler::Compile_div_const, &Compiler::Compile_div, PGXPFN(CPU_DIV), TF_READS_S | TF_READS_T | TF_WRITES_LO | TF_WRITES_HI); break; + case InstructionFunct::divu: CompileTemplate(&Compiler::Compile_divu_const, &Compiler::Compile_divu, PGXPFN(CPU_DIVU), TF_READS_S | TF_READS_T | TF_WRITES_LO | TF_WRITES_HI); break; + case InstructionFunct::add: CompileTemplate(&Compiler::Compile_add_const, &Compiler::Compile_add, PGXPFN(CPU_ADD), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_COMMUTATIVE | TF_CAN_OVERFLOW | TF_RENAME_WITH_ZERO_T); break; + case InstructionFunct::addu: CompileTemplate(&Compiler::Compile_addu_const, &Compiler::Compile_addu, PGXPFN(CPU_ADD), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_COMMUTATIVE | TF_RENAME_WITH_ZERO_T); break; + case InstructionFunct::sub: CompileTemplate(&Compiler::Compile_sub_const, &Compiler::Compile_sub, PGXPFN(CPU_SUB), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_CAN_OVERFLOW | TF_RENAME_WITH_ZERO_T); break; + case 
InstructionFunct::subu: CompileTemplate(&Compiler::Compile_subu_const, &Compiler::Compile_subu, PGXPFN(CPU_SUB), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_RENAME_WITH_ZERO_T); break; + case InstructionFunct::and_: CompileTemplate(&Compiler::Compile_and_const, &Compiler::Compile_and, PGXPFN(CPU_AND_), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_COMMUTATIVE); break; + case InstructionFunct::or_: CompileTemplate(&Compiler::Compile_or_const, &Compiler::Compile_or, PGXPFN(CPU_OR_), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_COMMUTATIVE | TF_RENAME_WITH_ZERO_T); break; + case InstructionFunct::xor_: CompileTemplate(&Compiler::Compile_xor_const, &Compiler::Compile_xor, PGXPFN(CPU_XOR_), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_COMMUTATIVE | TF_RENAME_WITH_ZERO_T); break; + case InstructionFunct::nor: CompileTemplate(&Compiler::Compile_nor_const, &Compiler::Compile_nor, PGXPFN(CPU_NOR), TF_WRITES_D | TF_READS_S | TF_READS_T | TF_COMMUTATIVE); break; + case InstructionFunct::slt: CompileTemplate(&Compiler::Compile_slt_const, &Compiler::Compile_slt, PGXPFN(CPU_SLT), TF_WRITES_D | TF_READS_T | TF_READS_S); break; + case InstructionFunct::sltu: CompileTemplate(&Compiler::Compile_sltu_const, &Compiler::Compile_sltu, PGXPFN(CPU_SLTU), TF_WRITES_D | TF_READS_T | TF_READS_S); break; + + default: Panic("fixme funct"); break; + } + } + break; + + case InstructionOp::j: Compile_j(); break; + case InstructionOp::jal: Compile_jal(); break; + + case InstructionOp::b: CompileTemplate(&Compiler::Compile_b_const, &Compiler::Compile_b, nullptr, TF_READS_S | TF_CAN_SWAP_DELAY_SLOT); break; + case InstructionOp::blez: CompileTemplate(&Compiler::Compile_blez_const, &Compiler::Compile_blez, nullptr, TF_READS_S | TF_CAN_SWAP_DELAY_SLOT); break; + case InstructionOp::bgtz: CompileTemplate(&Compiler::Compile_bgtz_const, &Compiler::Compile_bgtz, nullptr, TF_READS_S | TF_CAN_SWAP_DELAY_SLOT); break; + case InstructionOp::beq: CompileTemplate(&Compiler::Compile_beq_const, &Compiler::Compile_beq, nullptr, TF_READS_S | TF_READS_T | TF_COMMUTATIVE | TF_CAN_SWAP_DELAY_SLOT); break; + case InstructionOp::bne: CompileTemplate(&Compiler::Compile_bne_const, &Compiler::Compile_bne, nullptr, TF_READS_S | TF_READS_T | TF_COMMUTATIVE | TF_CAN_SWAP_DELAY_SLOT); break; + + case InstructionOp::addi: CompileTemplate(&Compiler::Compile_addi_const, &Compiler::Compile_addi, PGXPFN(CPU_ADDI), TF_WRITES_T | TF_READS_S | TF_COMMUTATIVE | TF_CAN_OVERFLOW | TF_RENAME_WITH_ZERO_IMM); break; + case InstructionOp::addiu: CompileTemplate(&Compiler::Compile_addiu_const, &Compiler::Compile_addiu, PGXPFN(CPU_ADDI), TF_WRITES_T | TF_READS_S | TF_COMMUTATIVE | TF_RENAME_WITH_ZERO_IMM); break; + case InstructionOp::slti: CompileTemplate(&Compiler::Compile_slti_const, &Compiler::Compile_slti, PGXPFN(CPU_SLTI), TF_WRITES_T | TF_READS_S); break; + case InstructionOp::sltiu: CompileTemplate(&Compiler::Compile_sltiu_const, &Compiler::Compile_sltiu, PGXPFN(CPU_SLTIU), TF_WRITES_T | TF_READS_S); break; + case InstructionOp::andi: CompileTemplate(&Compiler::Compile_andi_const, &Compiler::Compile_andi, PGXPFN(CPU_ANDI), TF_WRITES_T | TF_READS_S | TF_COMMUTATIVE); break; + case InstructionOp::ori: CompileTemplate(&Compiler::Compile_ori_const, &Compiler::Compile_ori, PGXPFN(CPU_ORI), TF_WRITES_T | TF_READS_S | TF_COMMUTATIVE | TF_RENAME_WITH_ZERO_IMM); break; + case InstructionOp::xori: CompileTemplate(&Compiler::Compile_xori_const, &Compiler::Compile_xori, PGXPFN(CPU_XORI), TF_WRITES_T | TF_READS_S | TF_COMMUTATIVE | TF_RENAME_WITH_ZERO_IMM); break; + case 
InstructionOp::lui: Compile_lui(); break; + + case InstructionOp::lb: CompileLoadStoreTemplate(&Compiler::Compile_lxx, MemoryAccessSize::Byte, false, true, TF_READS_S | TF_WRITES_T | TF_LOAD_DELAY); break; + case InstructionOp::lbu: CompileLoadStoreTemplate(&Compiler::Compile_lxx, MemoryAccessSize::Byte, false, false, TF_READS_S | TF_WRITES_T | TF_LOAD_DELAY); break; + case InstructionOp::lh: CompileLoadStoreTemplate(&Compiler::Compile_lxx, MemoryAccessSize::HalfWord, false, true, TF_READS_S | TF_WRITES_T | TF_LOAD_DELAY); break; + case InstructionOp::lhu: CompileLoadStoreTemplate(&Compiler::Compile_lxx, MemoryAccessSize::HalfWord, false, false, TF_READS_S | TF_WRITES_T | TF_LOAD_DELAY); break; + case InstructionOp::lw: CompileLoadStoreTemplate(&Compiler::Compile_lxx, MemoryAccessSize::Word, false, false, TF_READS_S | TF_WRITES_T | TF_LOAD_DELAY); break; + case InstructionOp::lwl: CompileLoadStoreTemplate(&Compiler::Compile_lwx, MemoryAccessSize::Word, false, false, TF_READS_S | /*TF_READS_T | TF_WRITES_T | */TF_LOAD_DELAY); break; + case InstructionOp::lwr: CompileLoadStoreTemplate(&Compiler::Compile_lwx, MemoryAccessSize::Word, false, false, TF_READS_S | /*TF_READS_T | TF_WRITES_T | */TF_LOAD_DELAY); break; + case InstructionOp::sb: CompileLoadStoreTemplate(&Compiler::Compile_sxx, MemoryAccessSize::Byte, true, false, TF_READS_S | TF_READS_T); break; + case InstructionOp::sh: CompileLoadStoreTemplate(&Compiler::Compile_sxx, MemoryAccessSize::HalfWord, true, false, TF_READS_S | TF_READS_T); break; + case InstructionOp::sw: CompileLoadStoreTemplate(&Compiler::Compile_sxx, MemoryAccessSize::Word, true, false, TF_READS_S | TF_READS_T); break; + case InstructionOp::swl: CompileLoadStoreTemplate(&Compiler::Compile_swx, MemoryAccessSize::Word, false, false, TF_READS_S | /*TF_READS_T | TF_WRITES_T | */TF_LOAD_DELAY); break; + case InstructionOp::swr: CompileLoadStoreTemplate(&Compiler::Compile_swx, MemoryAccessSize::Word, false, false, TF_READS_S | /*TF_READS_T | TF_WRITES_T | */TF_LOAD_DELAY); break; + + case InstructionOp::cop0: + { + if (inst->cop.IsCommonInstruction()) + { + switch (inst->cop.CommonOp()) + { + case CopCommonInstruction::mfcn: if (inst->r.rt != Reg::zero) { CompileTemplate(nullptr, &Compiler::Compile_mfc0, nullptr, TF_WRITES_T | TF_LOAD_DELAY); } break; + case CopCommonInstruction::mtcn: CompileTemplate(nullptr, &Compiler::Compile_mtc0, PGXPFN(CPU_MTC0), TF_READS_T); break; + default: Compile_Fallback(); break; + } + } + else + { + switch (inst->cop.Cop0Op()) + { + case Cop0Instruction::rfe: CompileTemplate(nullptr, &Compiler::Compile_rfe, nullptr, 0); break; + default: Compile_Fallback(); break; + } + } + } + break; + + case InstructionOp::cop2: + { + if (inst->cop.IsCommonInstruction()) + { + switch (inst->cop.CommonOp()) + { + case CopCommonInstruction::mfcn: if (inst->r.rt != Reg::zero) { CompileTemplate(nullptr, &Compiler::Compile_mfc2, nullptr, TF_GTE_STALL); } break; + case CopCommonInstruction::cfcn: if (inst->r.rt != Reg::zero) { CompileTemplate(nullptr, &Compiler::Compile_mfc2, nullptr, TF_GTE_STALL); } break; + case CopCommonInstruction::mtcn: CompileTemplate(nullptr, &Compiler::Compile_mtc2, PGXPFN(CPU_MTC2), TF_GTE_STALL | TF_READS_T | TF_PGXP_WITHOUT_CPU); break; + case CopCommonInstruction::ctcn: CompileTemplate(nullptr, &Compiler::Compile_mtc2, PGXPFN(CPU_MTC2), TF_GTE_STALL | TF_READS_T | TF_PGXP_WITHOUT_CPU); break; + default: Compile_Fallback(); break; + } + } + else + { + // GTE ops + CompileTemplate(nullptr, &Compiler::Compile_cop2, nullptr, TF_GTE_STALL); 
+ } + } + break; + + case InstructionOp::lwc2: CompileLoadStoreTemplate(&Compiler::Compile_lwc2, MemoryAccessSize::Word, false, false, TF_GTE_STALL | TF_READS_S | TF_LOAD_DELAY); break; + case InstructionOp::swc2: CompileLoadStoreTemplate(&Compiler::Compile_swc2, MemoryAccessSize::Word, true, false, TF_GTE_STALL | TF_READS_S); break; + + default: Panic("Fixme"); break; + // clang-format on + +#undef PGXPFN + } + + ClearHostRegsNeeded(); + UpdateLoadDelay(); + +#if 0 + const void* end = GetCurrentCodePointer(); + if (start != end && !m_current_instruction_branch_delay_slot) + { + CodeCache::DisassembleAndLogHostCode(start, + static_cast(static_cast(end) - static_cast(start))); + } +#endif +} + +void CPU::NewRec::Compiler::CompileBranchDelaySlot(bool dirty_pc /* = true */) +{ + // Update load delay at the end of the previous instruction. + UpdateLoadDelay(); + + // TODO: Move cycle add before this. + inst++; + iinfo++; + m_current_instruction_pc += sizeof(Instruction); + m_current_instruction_branch_delay_slot = true; + m_compiler_pc += sizeof(Instruction); + m_dirty_pc = dirty_pc; + m_dirty_instruction_bits = true; + + CompileInstruction(); + + m_current_instruction_branch_delay_slot = false; +} + +void CPU::NewRec::Compiler::CompileTemplate(void (Compiler::*const_func)(CompileFlags), + void (Compiler::*func)(CompileFlags), const void* pgxp_cpu_func, u32 tflags) +{ + // TODO: This is where we will do memory operand optimization. Remember to kill constants! + // TODO: Swap S and T if commutative + // TODO: For and, treat as zeroing if imm is zero + // TODO: Optimize slt + bne to cmp + jump + // TODO: Prefer memory operands when load delay is dirty, since we're going to invalidate immediately after the first + // instruction.. + // TODO: andi with zero -> zero const + // TODO: load constant so it can be flushed if it's not overwritten later + // TODO: inline PGXP ops. + // TODO: don't rename on sltu. 
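+  // CompileTemplate() handles most ALU-style instructions: operands are described
+  // by the TF_* flags; the constant-folded path (const_func) is taken when every
+  // read operand has a known constant value, otherwise host registers are
+  // allocated and the emitting path (func) runs.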
+ + bool allow_constant = static_cast(const_func); + Reg rs = inst->r.rs.GetValue(); + Reg rt = inst->r.rt.GetValue(); + Reg rd = inst->r.rd.GetValue(); + + if (tflags & TF_GTE_STALL) + StallUntilGTEComplete(); + + // throw away instructions writing to $zero + if (!(tflags & TF_NO_NOP) && (!g_settings.cpu_recompiler_memory_exceptions || !(tflags & TF_CAN_OVERFLOW)) && + ((tflags & TF_WRITES_T && rt == Reg::zero) || (tflags & TF_WRITES_D && rd == Reg::zero))) + { + Log_DebugPrintf("Skipping instruction because it writes to zero"); + return; + } + + // handle rename operations + if ((tflags & TF_RENAME_WITH_ZERO_T && HasConstantRegValue(rt, 0))) + { + DebugAssert((tflags & (TF_WRITES_D | TF_READS_S | TF_READS_T)) == (TF_WRITES_D | TF_READS_S | TF_READS_T)); + CompileMoveRegTemplate(rd, rs, true); + return; + } + else if ((tflags & (TF_RENAME_WITH_ZERO_T | TF_COMMUTATIVE)) == (TF_RENAME_WITH_ZERO_T | TF_COMMUTATIVE) && + HasConstantRegValue(rs, 0)) + { + DebugAssert((tflags & (TF_WRITES_D | TF_READS_S | TF_READS_T)) == (TF_WRITES_D | TF_READS_S | TF_READS_T)); + CompileMoveRegTemplate(rd, rt, true); + return; + } + else if (tflags & TF_RENAME_WITH_ZERO_IMM && inst->i.imm == 0) + { + CompileMoveRegTemplate(rt, rs, true); + return; + } + + if (pgxp_cpu_func && g_settings.gpu_pgxp_enable && ((tflags & TF_PGXP_WITHOUT_CPU) || g_settings.UsingPGXPCPUMode())) + { + std::array reg_args = {{Reg::count, Reg::count}}; + u32 num_reg_args = 0; + if (tflags & TF_READS_S) + reg_args[num_reg_args++] = rs; + if (tflags & TF_READS_T) + reg_args[num_reg_args++] = rt; + if (tflags & TF_READS_LO) + reg_args[num_reg_args++] = Reg::lo; + if (tflags & TF_READS_HI) + reg_args[num_reg_args++] = Reg::hi; + + DebugAssert(num_reg_args <= 2); + GeneratePGXPCallWithMIPSRegs(pgxp_cpu_func, inst->bits, reg_args[0], reg_args[1]); + } + + // if it's a commutative op, and we have one constant reg but not the other, swap them + // TODO: make it swap when writing to T as well + // TODO: drop the hack for rd == rt + if (tflags & TF_COMMUTATIVE && !(tflags & TF_WRITES_T) && + ((HasConstantReg(rs) && !HasConstantReg(rt)) || (tflags & TF_WRITES_D && rd == rt))) + { + Log_DebugPrintf("Swapping S:%s and T:%s due to commutative op and constants", GetRegName(rs), GetRegName(rt)); + std::swap(rs, rt); + } + + CompileFlags cf = {}; + + if (tflags & TF_READS_S) + { + MarkRegsNeeded(HR_TYPE_CPU_REG, rs); + if (HasConstantReg(rs)) + cf.const_s = true; + else + allow_constant = false; + } + if (tflags & TF_READS_T) + { + MarkRegsNeeded(HR_TYPE_CPU_REG, rt); + if (HasConstantReg(rt)) + cf.const_t = true; + else + allow_constant = false; + } + if (tflags & TF_READS_LO) + { + MarkRegsNeeded(HR_TYPE_CPU_REG, Reg::lo); + if (HasConstantReg(Reg::lo)) + cf.const_lo = true; + else + allow_constant = false; + } + if (tflags & TF_READS_HI) + { + MarkRegsNeeded(HR_TYPE_CPU_REG, Reg::hi); + if (HasConstantReg(Reg::hi)) + cf.const_hi = true; + else + allow_constant = false; + } + + // Needed because of potential swapping + if (tflags & TF_READS_S) + cf.mips_s = static_cast(rs); + if (tflags & (TF_READS_T | TF_WRITES_T)) + cf.mips_t = static_cast(rt); + + if (allow_constant) + { + // woot, constant path + (this->*const_func)(cf); + return; + } + + UpdateHostRegCounters(); + + if (tflags & TF_CAN_SWAP_DELAY_SLOT && TrySwapDelaySlot(cf.MipsS(), cf.MipsT())) + cf.delay_slot_swapped = true; + + if (tflags & TF_READS_S && + (tflags & TF_NEEDS_REG_S || !cf.const_s || (tflags & TF_WRITES_D && rd != Reg::zero && rd == rs))) + { + cf.host_s = 
AllocateHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rs); + cf.const_s = false; + cf.valid_host_s = true; + } + + if (tflags & TF_READS_T && + (tflags & (TF_NEEDS_REG_T | TF_WRITES_T) || !cf.const_t || (tflags & TF_WRITES_D && rd != Reg::zero && rd == rt))) + { + cf.host_t = AllocateHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); + cf.const_t = false; + cf.valid_host_t = true; + } + + if (tflags & (TF_READS_LO | TF_WRITES_LO)) + { + cf.host_lo = + AllocateHostReg(((tflags & TF_READS_LO) ? HR_MODE_READ : 0u) | ((tflags & TF_WRITES_LO) ? HR_MODE_WRITE : 0u), + HR_TYPE_CPU_REG, Reg::lo); + cf.const_lo = false; + cf.valid_host_lo = true; + } + + if (tflags & (TF_READS_HI | TF_WRITES_HI)) + { + cf.host_hi = + AllocateHostReg(((tflags & TF_READS_HI) ? HR_MODE_READ : 0u) | ((tflags & TF_WRITES_HI) ? HR_MODE_WRITE : 0u), + HR_TYPE_CPU_REG, Reg::hi); + cf.const_hi = false; + cf.valid_host_hi = true; + } + + const HostRegAllocType write_type = + (tflags & TF_LOAD_DELAY && EMULATE_LOAD_DELAYS) ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG; + + if (tflags & TF_CAN_OVERFLOW && g_settings.cpu_recompiler_memory_exceptions) + { + // allocate a temp register for the result, then swap it back + const u32 tempreg = AllocateHostReg(0, HR_TYPE_TEMP); + ; + if (tflags & TF_WRITES_D) + { + cf.host_d = tempreg; + cf.valid_host_d = true; + } + else if (tflags & TF_WRITES_T) + { + cf.host_t = tempreg; + cf.valid_host_t = true; + } + + (this->*func)(cf); + + if (tflags & TF_WRITES_D && rd != Reg::zero) + { + DeleteMIPSReg(rd, false); + RenameHostReg(tempreg, HR_MODE_WRITE, write_type, rd); + } + else if (tflags & TF_WRITES_T && rt != Reg::zero) + { + DeleteMIPSReg(rt, false); + RenameHostReg(tempreg, HR_MODE_WRITE, write_type, rt); + } + else + { + FreeHostReg(tempreg); + } + } + else + { + if (tflags & TF_WRITES_D && rd != Reg::zero) + { + if (tflags & TF_READS_S && cf.valid_host_s && TryRenameMIPSReg(rd, rs, cf.host_s, Reg::count)) + cf.host_d = cf.host_s; + else + cf.host_d = AllocateHostReg(HR_MODE_WRITE, write_type, rd); + cf.valid_host_d = true; + } + + if (tflags & TF_WRITES_T && rt != Reg::zero) + { + if (tflags & TF_READS_S && cf.valid_host_s && TryRenameMIPSReg(rt, rs, cf.host_s, Reg::count)) + cf.host_t = cf.host_s; + else + cf.host_t = AllocateHostReg(HR_MODE_WRITE, write_type, rt); + cf.valid_host_t = true; + } + + (this->*func)(cf); + } +} + +void CPU::NewRec::Compiler::CompileLoadStoreTemplate(void (Compiler::*func)(CompileFlags, MemoryAccessSize, bool, + const std::optional&), + MemoryAccessSize size, bool store, bool sign, u32 tflags) +{ + const Reg rs = inst->i.rs; + const Reg rt = inst->i.rt; + + if (tflags & TF_GTE_STALL) + StallUntilGTEComplete(); + + CompileFlags cf = {}; + + if (tflags & TF_READS_S) + { + MarkRegsNeeded(HR_TYPE_CPU_REG, rs); + cf.mips_s = static_cast(rs); + } + if (tflags & (TF_READS_T | TF_WRITES_T)) + { + if (tflags & TF_READS_T) + MarkRegsNeeded(HR_TYPE_CPU_REG, rt); + cf.mips_t = static_cast(rt); + } + + UpdateHostRegCounters(); + + // constant address? + std::optional addr; + if (HasConstantReg(rs)) + { + addr = GetConstantRegU32(rs) + inst->i.imm_sext32(); + cf.const_s = true; + } + else + { + if constexpr (HAS_MEMORY_OPERANDS) + { + // don't bother caching it since we're going to flush anyway + // TODO: make less rubbish, if it's caller saved we don't need to flush... 
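+      // Sketch of the two paths: with HAS_MEMORY_OPERANDS (x64) the backend can read rs
+      // directly out of g_state, so a host register is only reused if rs already has one;
+      // without memory operands (AArch64/RISC-V) rs is always pulled into a host register.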
+ const std::optional hreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rs); + if (hreg.has_value()) + { + cf.valid_host_s = true; + cf.host_s = hreg.value(); + } + } + else + { + // need rs in a register + cf.host_s = AllocateHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rs); + cf.valid_host_s = true; + } + } + + // reads T -> store, writes T -> load + // for now, we defer the allocation to afterwards, because C call + if (tflags & TF_READS_T) + { + if (HasConstantReg(rt)) + { + cf.const_t = true; + } + else + { + if constexpr (HAS_MEMORY_OPERANDS) + { + const std::optional hreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); + if (hreg.has_value()) + { + cf.valid_host_t = true; + cf.host_t = hreg.value(); + } + } + else + { + cf.host_t = AllocateHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); + cf.valid_host_t = true; + } + } + } + + (this->*func)(cf, size, sign, addr); +} + +void CPU::NewRec::Compiler::FlushForLoadStore(const std::optional& address, bool store) +{ + if (CodeCache::IsUsingFastmem() && !g_settings.cpu_recompiler_memory_exceptions) + return; + + // TODO: Stores don't need to flush GTE cycles... + Flush(FLUSH_FOR_C_CALL | FLUSH_FOR_LOADSTORE); +} + +void CPU::NewRec::Compiler::CompileMoveRegTemplate(Reg dst, Reg src, bool pgxp_move) +{ + if (dst == src || dst == Reg::zero) + return; + + if (HasConstantReg(src)) + { + DeleteMIPSReg(dst, false); + SetConstantReg(dst, GetConstantRegU32(src)); + } + else + { + const u32 srcreg = AllocateHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, src); + if (!TryRenameMIPSReg(dst, src, srcreg, Reg::count)) + { + const u32 dstreg = AllocateHostReg(HR_MODE_WRITE, HR_TYPE_CPU_REG, dst); + CopyHostReg(dstreg, srcreg); + ClearHostRegNeeded(dstreg); + } + } + + // TODO: This could be made better if we only did it for registers where there was a previous MFC2. + if (g_settings.gpu_pgxp_enable && pgxp_move) + { + // might've been renamed, so use dst here + GeneratePGXPCallWithMIPSRegs(reinterpret_cast(&PGXP::CPU_MOVE), + (static_cast(dst) << 8) | (static_cast(src)), dst); + } +} + +void CPU::NewRec::Compiler::Compile_j() +{ + const u32 newpc = (m_compiler_pc & UINT32_C(0xF0000000)) | (inst->j.target << 2); + + // TODO: Delay slot swap. + // We could also move the cycle commit back. 
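+  // As with every MIPS branch, the instruction in the delay slot still executes, e.g.
+  //   j     target
+  //   addiu $sp, $sp, -16   <- compiled by CompileBranchDelaySlot() before the block ends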
+ CompileBranchDelaySlot(); + EndBlock(newpc, true); +} + +void CPU::NewRec::Compiler::Compile_jr_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS())); + const u32 newpc = GetConstantRegU32(cf.MipsS()); + if (newpc & 3 && g_settings.cpu_recompiler_memory_exceptions) + { + EndBlockWithException(Exception::AdEL); + return; + } + + CompileBranchDelaySlot(); + EndBlock(newpc, true); +} + +void CPU::NewRec::Compiler::Compile_jal() +{ + const u32 newpc = (m_compiler_pc & UINT32_C(0xF0000000)) | (inst->j.target << 2); + SetConstantReg(Reg::ra, GetBranchReturnAddress({})); + CompileBranchDelaySlot(); + EndBlock(newpc, true); +} + +void CPU::NewRec::Compiler::Compile_jalr_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS())); + const u32 newpc = GetConstantRegU32(cf.MipsS()); + if (MipsD() != Reg::zero) + SetConstantReg(MipsD(), GetBranchReturnAddress({})); + + CompileBranchDelaySlot(); + EndBlock(newpc, true); +} + +void CPU::NewRec::Compiler::Compile_syscall() +{ + EndBlockWithException(Exception::Syscall); +} + +void CPU::NewRec::Compiler::Compile_break() +{ + EndBlockWithException(Exception::BP); +} + +void CPU::NewRec::Compiler::Compile_b_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS())); + + const u8 irt = static_cast(inst->i.rt.GetValue()); + const bool bgez = ConvertToBoolUnchecked(irt & u8(1)); + const bool link = (irt & u8(0x1E)) == u8(0x10); + + const s32 rs = GetConstantRegS32(cf.MipsS()); + const bool taken = bgez ? (rs >= 0) : (rs < 0); + const u32 taken_pc = GetConditionalBranchTarget(cf); + + if (link) + SetConstantReg(Reg::ra, GetBranchReturnAddress(cf)); + + CompileBranchDelaySlot(); + EndBlock(taken ? taken_pc : m_compiler_pc, true); +} + +void CPU::NewRec::Compiler::Compile_b(CompileFlags cf) +{ + const u8 irt = static_cast(inst->i.rt.GetValue()); + const bool bgez = ConvertToBoolUnchecked(irt & u8(1)); + const bool link = (irt & u8(0x1E)) == u8(0x10); + + if (link) + SetConstantReg(Reg::ra, GetBranchReturnAddress(cf)); + + Compile_bxx(cf, bgez ? 
BranchCondition::GreaterEqualZero : BranchCondition::LessThanZero); +} + +void CPU::NewRec::Compiler::Compile_blez(CompileFlags cf) +{ + Compile_bxx(cf, BranchCondition::LessEqualZero); +} + +void CPU::NewRec::Compiler::Compile_blez_const(CompileFlags cf) +{ + Compile_bxx_const(cf, BranchCondition::LessEqualZero); +} + +void CPU::NewRec::Compiler::Compile_bgtz(CompileFlags cf) +{ + Compile_bxx(cf, BranchCondition::GreaterThanZero); +} + +void CPU::NewRec::Compiler::Compile_bgtz_const(CompileFlags cf) +{ + Compile_bxx_const(cf, BranchCondition::GreaterThanZero); +} + +void CPU::NewRec::Compiler::Compile_beq(CompileFlags cf) +{ + Compile_bxx(cf, BranchCondition::Equal); +} + +void CPU::NewRec::Compiler::Compile_beq_const(CompileFlags cf) +{ + Compile_bxx_const(cf, BranchCondition::Equal); +} + +void CPU::NewRec::Compiler::Compile_bne(CompileFlags cf) +{ + Compile_bxx(cf, BranchCondition::NotEqual); +} + +void CPU::NewRec::Compiler::Compile_bne_const(CompileFlags cf) +{ + Compile_bxx_const(cf, BranchCondition::NotEqual); +} + +void CPU::NewRec::Compiler::Compile_bxx_const(CompileFlags cf, BranchCondition cond) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + + bool taken; + switch (cond) + { + case BranchCondition::Equal: + taken = GetConstantRegU32(cf.MipsS()) == GetConstantRegU32(cf.MipsT()); + break; + + case BranchCondition::NotEqual: + taken = GetConstantRegU32(cf.MipsS()) != GetConstantRegU32(cf.MipsT()); + break; + + case BranchCondition::GreaterThanZero: + taken = GetConstantRegS32(cf.MipsS()) > 0; + break; + + case BranchCondition::GreaterEqualZero: + taken = GetConstantRegS32(cf.MipsS()) >= 0; + break; + + case BranchCondition::LessThanZero: + taken = GetConstantRegS32(cf.MipsS()) < 0; + break; + + case BranchCondition::LessEqualZero: + taken = GetConstantRegS32(cf.MipsS()) <= 0; + break; + + default: + Panic("Unhandled condition"); + return; + } + + const u32 taken_pc = GetConditionalBranchTarget(cf); + CompileBranchDelaySlot(); + EndBlock(taken ? 
taken_pc : m_compiler_pc, true); +} + +void CPU::NewRec::Compiler::Compile_sll_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsT()) << inst->r.shamt); +} + +void CPU::NewRec::Compiler::Compile_srl_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsT()) >> inst->r.shamt); +} + +void CPU::NewRec::Compiler::Compile_sra_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), static_cast(GetConstantRegS32(cf.MipsT()) >> inst->r.shamt)); +} + +void CPU::NewRec::Compiler::Compile_sllv_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsT()) << (GetConstantRegU32(cf.MipsS()) & 0x1Fu)); +} + +void CPU::NewRec::Compiler::Compile_srlv_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsT()) >> (GetConstantRegU32(cf.MipsS()) & 0x1Fu)); +} + +void CPU::NewRec::Compiler::Compile_srav_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), static_cast(GetConstantRegS32(cf.MipsT()) >> (GetConstantRegU32(cf.MipsS()) & 0x1Fu))); +} + +void CPU::NewRec::Compiler::Compile_and_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) & GetConstantRegU32(cf.MipsT())); +} + +void CPU::NewRec::Compiler::Compile_or_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) | GetConstantRegU32(cf.MipsT())); +} + +void CPU::NewRec::Compiler::Compile_xor_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) ^ GetConstantRegU32(cf.MipsT())); +} + +void CPU::NewRec::Compiler::Compile_nor_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), ~(GetConstantRegU32(cf.MipsS()) | GetConstantRegU32(cf.MipsT()))); +} + +void CPU::NewRec::Compiler::Compile_slt_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), BoolToUInt32(GetConstantRegS32(cf.MipsS()) < GetConstantRegS32(cf.MipsT()))); +} + +void CPU::NewRec::Compiler::Compile_sltu_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), BoolToUInt32(GetConstantRegU32(cf.MipsS()) < GetConstantRegU32(cf.MipsT()))); +} + +void CPU::NewRec::Compiler::Compile_mult_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + + const u64 res = + static_cast(static_cast(GetConstantRegS32(cf.MipsS())) * static_cast(GetConstantRegS32(cf.MipsT()))); + SetConstantReg(Reg::hi, static_cast(res >> 32)); + SetConstantReg(Reg::lo, static_cast(res)); +} + +void CPU::NewRec::Compiler::Compile_multu_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + + const u64 res = static_cast(GetConstantRegU32(cf.MipsS())) * static_cast(GetConstantRegU32(cf.MipsT())); + SetConstantReg(Reg::hi, static_cast(res >> 32)); + SetConstantReg(Reg::lo, static_cast(res)); +} + +void 
CPU::NewRec::Compiler::Compile_div_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + + const s32 num = GetConstantRegS32(cf.MipsS()); + const s32 denom = GetConstantRegS32(cf.MipsT()); + + s32 lo, hi; + if (denom == 0) + { + // divide by zero + lo = (num >= 0) ? UINT32_C(0xFFFFFFFF) : UINT32_C(1); + hi = static_cast(num); + } + else if (static_cast(num) == UINT32_C(0x80000000) && denom == -1) + { + // unrepresentable + lo = UINT32_C(0x80000000); + hi = 0; + } + else + { + lo = num / denom; + hi = num % denom; + } + + SetConstantReg(Reg::hi, hi); + SetConstantReg(Reg::lo, lo); +} + +void CPU::NewRec::Compiler::Compile_divu_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + + const u32 num = GetConstantRegU32(cf.MipsS()); + const u32 denom = GetConstantRegU32(cf.MipsT()); + + u32 lo, hi; + + if (denom == 0) + { + // divide by zero + lo = UINT32_C(0xFFFFFFFF); + hi = static_cast(num); + } + else + { + lo = num / denom; + hi = num % denom; + } + + SetConstantReg(Reg::hi, hi); + SetConstantReg(Reg::lo, lo); +} + +void CPU::NewRec::Compiler::Compile_add_const(CompileFlags cf) +{ + // TODO: Overflow + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + if (MipsD() != Reg::zero) + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) + GetConstantRegU32(cf.MipsT())); +} + +void CPU::NewRec::Compiler::Compile_addu_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) + GetConstantRegU32(cf.MipsT())); +} + +void CPU::NewRec::Compiler::Compile_sub_const(CompileFlags cf) +{ + // TODO: Overflow + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + if (MipsD() != Reg::zero) + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) - GetConstantRegU32(cf.MipsT())); +} + +void CPU::NewRec::Compiler::Compile_subu_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS()) && HasConstantReg(cf.MipsT())); + SetConstantReg(MipsD(), GetConstantRegU32(cf.MipsS()) - GetConstantRegU32(cf.MipsT())); +} + +void CPU::NewRec::Compiler::Compile_addi_const(CompileFlags cf) +{ + // TODO: Overflow + DebugAssert(HasConstantReg(cf.MipsS())); + if (cf.MipsT() != Reg::zero) + SetConstantReg(cf.MipsT(), GetConstantRegU32(cf.MipsS()) + inst->i.imm_sext32()); +} + +void CPU::NewRec::Compiler::Compile_addiu_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS())); + SetConstantReg(cf.MipsT(), GetConstantRegU32(cf.MipsS()) + inst->i.imm_sext32()); +} + +void CPU::NewRec::Compiler::Compile_slti_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS())); + SetConstantReg(cf.MipsT(), BoolToUInt32(GetConstantRegS32(cf.MipsS()) < static_cast(inst->i.imm_sext32()))); +} + +void CPU::NewRec::Compiler::Compile_sltiu_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS())); + SetConstantReg(cf.MipsT(), GetConstantRegU32(cf.MipsS()) < inst->i.imm_sext32()); +} + +void CPU::NewRec::Compiler::Compile_andi_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS())); + SetConstantReg(cf.MipsT(), GetConstantRegU32(cf.MipsS()) & inst->i.imm_zext32()); +} + +void CPU::NewRec::Compiler::Compile_ori_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS())); + SetConstantReg(cf.MipsT(), GetConstantRegU32(cf.MipsS()) | inst->i.imm_zext32()); +} + +void CPU::NewRec::Compiler::Compile_xori_const(CompileFlags cf) +{ + DebugAssert(HasConstantReg(cf.MipsS())); + 
SetConstantReg(cf.MipsT(), GetConstantRegU32(cf.MipsS()) ^ inst->i.imm_zext32());
+}
+
+void CPU::NewRec::Compiler::Compile_lui()
+{
+  if (inst->i.rt == Reg::zero)
+    return;
+
+  SetConstantReg(inst->i.rt, inst->i.imm_zext32() << 16);
+}
+
+static constexpr const std::array<std::pair<u32*, u32>, 16> s_cop0_table = {
+  {{nullptr, 0x00000000u},
+   {nullptr, 0x00000000u},
+   {nullptr, 0x00000000u},
+   {&CPU::g_state.cop0_regs.BPC, 0xffffffffu},
+   {nullptr, 0},
+   {&CPU::g_state.cop0_regs.BDA, 0xffffffffu},
+   {&CPU::g_state.cop0_regs.TAR, 0x00000000u},
+   {&CPU::g_state.cop0_regs.dcic.bits, CPU::Cop0Registers::DCIC::WRITE_MASK},
+   {&CPU::g_state.cop0_regs.BadVaddr, 0x00000000u},
+   {&CPU::g_state.cop0_regs.BDAM, 0xffffffffu},
+   {nullptr, 0x00000000u},
+   {&CPU::g_state.cop0_regs.BPCM, 0xffffffffu},
+   {&CPU::g_state.cop0_regs.sr.bits, CPU::Cop0Registers::SR::WRITE_MASK},
+   {&CPU::g_state.cop0_regs.cause.bits, CPU::Cop0Registers::CAUSE::WRITE_MASK},
+   {&CPU::g_state.cop0_regs.EPC, 0x00000000u},
+   {&CPU::g_state.cop0_regs.PRID, 0x00000000u}}};
+
+u32* CPU::NewRec::Compiler::GetCop0RegPtr(Cop0Reg reg)
+{
+  return (static_cast<u8>(reg) < s_cop0_table.size()) ? s_cop0_table[static_cast<u8>(reg)].first : nullptr;
+}
+
+u32 CPU::NewRec::Compiler::GetCop0RegWriteMask(Cop0Reg reg)
+{
+  return (static_cast<u8>(reg) < s_cop0_table.size()) ? s_cop0_table[static_cast<u8>(reg)].second : 0;
+}
+
+void CPU::NewRec::Compiler::Compile_mfc0(CompileFlags cf)
+{
+  const Cop0Reg r = static_cast<Cop0Reg>(MipsD());
+  const u32* ptr = GetCop0RegPtr(r);
+  if (!ptr)
+  {
+    Log_ErrorPrintf("Read from unknown cop0 reg %u", static_cast<u32>(r));
+    Compile_Fallback();
+    return;
+  }
+
+  DebugAssert(cf.valid_host_t);
+  LoadHostRegFromCPUPointer(cf.host_t, ptr);
+}
+
+std::pair<u32*, CPU::NewRec::Compiler::GTERegisterAccessAction>
+CPU::NewRec::Compiler::GetGTERegisterPointer(u32 index, bool writing)
+{
+  if (!writing)
+  {
+    // Most GTE registers can be read directly. Handle the special cases here.
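+    // The returned (pointer, action) pair is only a description; the per-arch
+    // mfc2/mtc2/lwc2/swc2 implementations are expected to turn it into a plain 32-bit
+    // access, a 16-bit sign/zero extension, a FIFO push, or a call into the GTE handler.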
+ if (index == 15) // SXY3 + { + // mirror of SXY2 + index = 14; + } + + switch (index) + { + case 28: // IRGB + case 29: // ORGB + { + return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::CallHandler); + } + break; + + default: + { + return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::Direct); + } + break; + } + } + else + { + switch (index) + { + case 1: // V0[z] + case 3: // V1[z] + case 5: // V2[z] + case 8: // IR0 + case 9: // IR1 + case 10: // IR2 + case 11: // IR3 + case 36: // RT33 + case 44: // L33 + case 52: // LR33 + case 58: // H - sign-extended on read but zext on use + case 59: // DQA + case 61: // ZSF3 + case 62: // ZSF4 + { + // sign-extend z component of vector registers + return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::SignExtend16); + } + break; + + case 7: // OTZ + case 16: // SZ0 + case 17: // SZ1 + case 18: // SZ2 + case 19: // SZ3 + { + // zero-extend unsigned values + return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::ZeroExtend16); + } + break; + + case 15: // SXY3 + { + // writing to SXYP pushes to the FIFO + return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::PushFIFO); + } + break; + + case 28: // IRGB + case 30: // LZCS + case 63: // FLAG + { + return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::CallHandler); + } + + case 29: // ORGB + case 31: // LZCR + { + // read-only registers + return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::Ignore); + } + + default: + { + // written as-is, 2x16 or 1x32 bits + return std::make_pair(&g_state.gte_regs.r32[index], GTERegisterAccessAction::Direct); + } + } + } +} + +void CPU::NewRec::Compiler::AddGTETicks(TickCount ticks) +{ + // TODO: check, int has +1 here + m_gte_done_cycle = m_cycles + ticks; + Log_DebugPrintf("Adding %d GTE ticks", ticks); +} + +void CPU::NewRec::Compiler::StallUntilGTEComplete() +{ + // TODO: hack to match old rec.. this may or may not be correct behavior + // it's the difference between stalling before and after the current instruction's cycle + DebugAssert(m_cycles > 0); + m_cycles--; + + if (!m_dirty_gte_done_cycle) + { + // simple case - in block scheduling + if (m_gte_done_cycle > m_cycles) + { + Log_DebugPrintf("Stalling for %d ticks from GTE", m_gte_done_cycle - m_cycles); + m_cycles += (m_gte_done_cycle - m_cycles); + } + } + else + { + // switch to in block scheduling + Log_DebugPrintf("Flushing GTE stall from state"); + Flush(FLUSH_GTE_STALL_FROM_STATE); + } + + m_cycles++; +} + +void CPU::NewRec::BackpatchLoadStore(void* exception_pc, const CodeCache::LoadstoreBackpatchInfo& info) +{ + // remove the cycles we added for the memory read, then take them off again after the backpatch + // the normal rec path will add the ram read ticks later, so we need to take them off at the end + DebugAssert(!info.is_load || info.cycles >= Bus::RAM_READ_TICKS); + const TickCount cycles_to_add = + static_cast(static_cast(info.cycles)) - (info.is_load ? 
Bus::RAM_READ_TICKS : 0); + const TickCount cycles_to_remove = static_cast(static_cast(info.cycles)); + + JitCodeBuffer& buffer = CodeCache::GetCodeBuffer(); + void* thunk_address = buffer.GetFreeFarCodePointer(); + const u32 thunk_size = CompileLoadStoreThunk( + thunk_address, buffer.GetFreeFarCodeSpace(), exception_pc, info.code_size, cycles_to_add, cycles_to_remove, + info.gpr_bitmask, info.address_register, info.data_register, info.AccessSize(), info.is_signed, info.is_load); + +#if 0 + Log_DebugPrintf("**Backpatch Thunk**"); + CodeCache::DisassembleAndLogHostCode(thunk_address, thunk_size); +#endif + + // backpatch to a jump to the slowmem handler + CodeCache::EmitJump(exception_pc, thunk_address, true); + + buffer.CommitFarCode(thunk_size); +} diff --git a/src/core/cpu_newrec_compiler.h b/src/core/cpu_newrec_compiler.h new file mode 100644 index 000000000..7781006cf --- /dev/null +++ b/src/core/cpu_newrec_compiler.h @@ -0,0 +1,465 @@ +// SPDX-FileCopyrightText: 2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#pragma once +#include "cpu_code_cache_private.h" +#include "cpu_recompiler_types.h" +#include "cpu_types.h" +#include +#include +#include +#include +#include + +namespace CPU::NewRec { + +// Global options +static constexpr bool EMULATE_LOAD_DELAYS = true; +static constexpr bool SWAP_BRANCH_DELAY_SLOTS = true; + +// Arch-specific options +#if defined(CPU_ARCH_X64) +static constexpr u32 NUM_HOST_REGS = 16; +static constexpr bool HAS_MEMORY_OPERANDS = true; +#elif defined(CPU_ARCH_ARM64) +static constexpr u32 NUM_HOST_REGS = 32; +static constexpr bool HAS_MEMORY_OPERANDS = false; +#elif defined(CPU_ARCH_RISCV64) +static constexpr u32 NUM_HOST_REGS = 32; +static constexpr bool HAS_MEMORY_OPERANDS = false; +#endif + +// TODO: Get rid of the virtuals... somehow. +class Compiler +{ +public: + Compiler(); + virtual ~Compiler(); + + const void* CompileBlock(CodeCache::Block* block, u32* host_code_size, u32* host_far_code_size); + +protected: + enum FlushFlags : u32 + { + FLUSH_FLUSH_MIPS_REGISTERS = (1 << 0), + FLUSH_INVALIDATE_MIPS_REGISTERS = (1 << 1), + FLUSH_FREE_CALLER_SAVED_REGISTERS = (1 << 2), + FLUSH_FREE_UNNEEDED_CALLER_SAVED_REGISTERS = (1 << 3), + FLUSH_FREE_ALL_REGISTERS = (1 << 4), + FLUSH_PC = (1 << 5), + FLUSH_INSTRUCTION_BITS = (1 << 6), + FLUSH_CYCLES = (1 << 7), + FLUSH_LOAD_DELAY = (1 << 8), + FLUSH_LOAD_DELAY_FROM_STATE = (1 << 9), + FLUSH_GTE_DONE_CYCLE = (1 << 10), + FLUSH_GTE_STALL_FROM_STATE = (1 << 11), + + FLUSH_FOR_C_CALL = (FLUSH_FREE_CALLER_SAVED_REGISTERS), + FLUSH_FOR_LOADSTORE = (FLUSH_FREE_CALLER_SAVED_REGISTERS | FLUSH_CYCLES), + FLUSH_FOR_BRANCH = (FLUSH_FLUSH_MIPS_REGISTERS), + FLUSH_FOR_EXCEPTION = + (FLUSH_CYCLES | FLUSH_GTE_DONE_CYCLE), // GTE cycles needed because it stalls when a GTE instruction is next. 
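+    // Call sites combine these as needed, e.g. FlushForLoadStore() issues
+    // Flush(FLUSH_FOR_C_CALL | FLUSH_FOR_LOADSTORE) when an access might fall back to a
+    // C call.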
+ FLUSH_FOR_INTERPRETER = + (FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_INVALIDATE_MIPS_REGISTERS | FLUSH_FREE_CALLER_SAVED_REGISTERS | FLUSH_PC | + FLUSH_CYCLES | FLUSH_INSTRUCTION_BITS | FLUSH_LOAD_DELAY | FLUSH_GTE_DONE_CYCLE), + FLUSH_END_BLOCK = 0xFFFFFFFFu & ~(FLUSH_PC | FLUSH_CYCLES | FLUSH_GTE_DONE_CYCLE | FLUSH_INSTRUCTION_BITS | + FLUSH_GTE_STALL_FROM_STATE), + }; + + union CompileFlags + { + struct + { + u32 const_s : 1; // S is constant + u32 const_t : 1; // T is constant + u32 const_lo : 1; // LO is constant + u32 const_hi : 1; // HI is constant + + u32 valid_host_d : 1; // D is valid in host register + u32 valid_host_s : 1; // S is valid in host register + u32 valid_host_t : 1; // T is valid in host register + u32 valid_host_lo : 1; // LO is valid in host register + u32 valid_host_hi : 1; // HI is valid in host register + + u32 host_d : 5; // D host register + u32 host_s : 5; // S host register + u32 host_t : 5; // T host register + u32 host_lo : 5; // LO host register + + u32 delay_slot_swapped : 1; + u32 pad1 : 2; // 28..31 + + u32 host_hi : 5; // HI host register + + u32 mips_s : 5; // S guest register + u32 mips_t : 5; // T guest register + + u32 pad2 : 15; // 32 bits + }; + + u64 bits; + + ALWAYS_INLINE Reg MipsS() const { return static_cast(mips_s); } + ALWAYS_INLINE Reg MipsT() const { return static_cast(mips_t); } + }; + static_assert(sizeof(CompileFlags) == sizeof(u64)); + + enum TemplateFlag : u32 + { + TF_READS_S = (1 << 0), + TF_READS_T = (1 << 1), + TF_READS_LO = (1 << 2), + TF_READS_HI = (1 << 3), + TF_WRITES_D = (1 << 4), + TF_WRITES_T = (1 << 5), + TF_WRITES_LO = (1 << 6), + TF_WRITES_HI = (1 << 7), + TF_COMMUTATIVE = (1 << 8), // S op T == T op S + TF_CAN_OVERFLOW = (1 << 9), + + // TF_NORENAME = // TODO + TF_LOAD_DELAY = (1 << 10), + TF_GTE_STALL = (1 << 11), + + TF_NO_NOP = (1 << 12), + TF_NEEDS_REG_S = (1 << 13), + TF_NEEDS_REG_T = (1 << 14), + TF_CAN_SWAP_DELAY_SLOT = (1 << 15), + + TF_RENAME_WITH_ZERO_T = (1 << 16), // add commutative for S as well + TF_RENAME_WITH_ZERO_IMM = (1 << 17), + + TF_PGXP_WITHOUT_CPU = (1 << 18), + }; + + enum HostRegFlags : u8 + { + HR_ALLOCATED = (1 << 0), + HR_NEEDED = (1 << 1), + HR_MODE_READ = (1 << 2), // valid + HR_MODE_WRITE = (1 << 3), // dirty + + HR_USABLE = (1 << 7), + HR_CALLEE_SAVED = (1 << 6), + + ALLOWED_HR_FLAGS = HR_MODE_READ | HR_MODE_WRITE, + IMMUTABLE_HR_FLAGS = HR_USABLE | HR_CALLEE_SAVED, + }; + + enum HostRegAllocType : u8 + { + HR_TYPE_TEMP, + HR_TYPE_CPU_REG, + HR_TYPE_PC_WRITEBACK, + HR_TYPE_LOAD_DELAY_VALUE, + HR_TYPE_NEXT_LOAD_DELAY_VALUE, + }; + + struct HostRegAlloc + { + u8 flags; + HostRegAllocType type; + Reg reg; + u16 counter; + }; + + enum class BranchCondition : u8 + { + Equal, + NotEqual, + GreaterThanZero, + GreaterEqualZero, + LessThanZero, + LessEqualZero, + }; + + ALWAYS_INLINE bool HasConstantReg(Reg r) const { return m_constant_regs_valid.test(static_cast(r)); } + ALWAYS_INLINE bool HasDirtyConstantReg(Reg r) const { return m_constant_regs_dirty.test(static_cast(r)); } + ALWAYS_INLINE bool HasConstantRegValue(Reg r, u32 val) const + { + return m_constant_regs_valid.test(static_cast(r)) && m_constant_reg_values[static_cast(r)] == val; + } + ALWAYS_INLINE u32 GetConstantRegU32(Reg r) const { return m_constant_reg_values[static_cast(r)]; } + ALWAYS_INLINE s32 GetConstantRegS32(Reg r) const + { + return static_cast(m_constant_reg_values[static_cast(r)]); + } + void SetConstantReg(Reg r, u32 v); + void ClearConstantReg(Reg r); + void FlushConstantReg(Reg r); + void FlushConstantRegs(bool 
invalidate); + + Reg MipsD() const; + u32 GetConditionalBranchTarget(CompileFlags cf) const; + u32 GetBranchReturnAddress(CompileFlags cf) const; + bool TrySwapDelaySlot(Reg rs = Reg::zero, Reg rt = Reg::zero, Reg rd = Reg::zero); + void SetCompilerPC(u32 newpc); + + virtual const void* GetCurrentCodePointer() = 0; + + virtual void Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer, + u32 far_code_space); + virtual void BeginBlock(); + virtual void GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size) = 0; + virtual void GenerateICacheCheckAndUpdate() = 0; + virtual void GenerateCall(const void* func, s32 arg1reg = -1, s32 arg2reg = -1, s32 arg3reg = -1) = 0; + virtual void EndBlock(const std::optional& newpc, bool do_event_test) = 0; + virtual void EndBlockWithException(Exception excode) = 0; + virtual const void* EndCompile(u32* code_size, u32* far_code_size) = 0; + + ALWAYS_INLINE bool IsHostRegAllocated(u32 r) const { return (m_host_regs[r].flags & HR_ALLOCATED) != 0; } + static const char* GetReadWriteModeString(u32 flags); + virtual const char* GetHostRegName(u32 reg) const = 0; + u32 GetFreeHostReg(u32 flags); + u32 AllocateHostReg(u32 flags, HostRegAllocType type = HR_TYPE_TEMP, Reg reg = Reg::count); + std::optional CheckHostReg(u32 flags, HostRegAllocType type = HR_TYPE_TEMP, Reg reg = Reg::count); + u32 AllocateTempHostReg(u32 flags = 0); + void SwapHostRegAlloc(u32 lhs, u32 rhs); + void FlushHostReg(u32 reg); + void FreeHostReg(u32 reg); + void ClearHostReg(u32 reg); + void MarkRegsNeeded(HostRegAllocType type, Reg reg); + void RenameHostReg(u32 reg, u32 new_flags, HostRegAllocType new_type, Reg new_reg); + void ClearHostRegNeeded(u32 reg); + void ClearHostRegsNeeded(); + void DeleteMIPSReg(Reg reg, bool flush); + bool TryRenameMIPSReg(Reg to, Reg from, u32 fromhost, Reg other); + void UpdateHostRegCounters(); + + virtual void LoadHostRegWithConstant(u32 reg, u32 val) = 0; + virtual void LoadHostRegFromCPUPointer(u32 reg, const void* ptr) = 0; + virtual void StoreConstantToCPUPointer(u32 val, const void* ptr) = 0; + virtual void StoreHostRegToCPUPointer(u32 reg, const void* ptr) = 0; + virtual void CopyHostReg(u32 dst, u32 src) = 0; + virtual void Flush(u32 flags); + + /// Returns true if there is a load delay which will be stored at the end of the instruction. + bool HasLoadDelay() const { return m_load_delay_register != Reg::count; } + + /// Cancels any pending load delay to the specified register. + void CancelLoadDelaysToReg(Reg reg); + + /// Moves load delay to the next load delay, and writes any previous load delay to the destination register. + void UpdateLoadDelay(); + + /// Flushes the load delay, i.e. writes it to the destination register. + void FinishLoadDelay(); + + /// Flushes the load delay, but only if it matches the specified register. + void FinishLoadDelayToReg(Reg reg); + + /// Uses a caller-saved register for load delays when PGXP is enabled. + u32 GetFlagsForNewLoadDelayedReg() const; + + void BackupHostState(); + void RestoreHostState(); + + /// Registers loadstore for possible backpatching. 
+ void AddLoadStoreInfo(void* code_address, u32 code_size, u32 address_register, u32 data_register, + MemoryAccessSize size, bool is_signed, bool is_load); + + void CompileInstruction(); + void CompileBranchDelaySlot(bool dirty_pc = true); + + void CompileTemplate(void (Compiler::*const_func)(CompileFlags), void (Compiler::*func)(CompileFlags), + const void* pgxp_cpu_func, u32 tflags); + void CompileLoadStoreTemplate(void (Compiler::*func)(CompileFlags, MemoryAccessSize, bool, + const std::optional&), + MemoryAccessSize size, bool store, bool sign, u32 tflags); + void FlushForLoadStore(const std::optional& address, bool store); + void CompileMoveRegTemplate(Reg dst, Reg src, bool pgxp_move); + + virtual void GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg = Reg::count, + Reg arg3reg = Reg::count) = 0; + + virtual void Compile_Fallback() = 0; + + void Compile_j(); + virtual void Compile_jr(CompileFlags cf) = 0; + void Compile_jr_const(CompileFlags cf); + void Compile_jal(); + virtual void Compile_jalr(CompileFlags cf) = 0; + void Compile_jalr_const(CompileFlags cf); + void Compile_syscall(); + void Compile_break(); + + void Compile_b_const(CompileFlags cf); + void Compile_b(CompileFlags cf); + void Compile_blez(CompileFlags cf); + void Compile_blez_const(CompileFlags cf); + void Compile_bgtz(CompileFlags cf); + void Compile_bgtz_const(CompileFlags cf); + void Compile_beq(CompileFlags cf); + void Compile_beq_const(CompileFlags cf); + void Compile_bne(CompileFlags cf); + void Compile_bne_const(CompileFlags cf); + virtual void Compile_bxx(CompileFlags cf, BranchCondition cond) = 0; + void Compile_bxx_const(CompileFlags cf, BranchCondition cond); + + void Compile_sll_const(CompileFlags cf); + virtual void Compile_sll(CompileFlags cf) = 0; + void Compile_srl_const(CompileFlags cf); + virtual void Compile_srl(CompileFlags cf) = 0; + void Compile_sra_const(CompileFlags cf); + virtual void Compile_sra(CompileFlags cf) = 0; + void Compile_sllv_const(CompileFlags cf); + virtual void Compile_sllv(CompileFlags cf) = 0; + void Compile_srlv_const(CompileFlags cf); + virtual void Compile_srlv(CompileFlags cf) = 0; + void Compile_srav_const(CompileFlags cf); + virtual void Compile_srav(CompileFlags cf) = 0; + void Compile_mult_const(CompileFlags cf); + virtual void Compile_mult(CompileFlags cf) = 0; + void Compile_multu_const(CompileFlags cf); + virtual void Compile_multu(CompileFlags cf) = 0; + void Compile_div_const(CompileFlags cf); + virtual void Compile_div(CompileFlags cf) = 0; + void Compile_divu_const(CompileFlags cf); + virtual void Compile_divu(CompileFlags cf) = 0; + void Compile_add_const(CompileFlags cf); + virtual void Compile_add(CompileFlags cf) = 0; + void Compile_addu_const(CompileFlags cf); + virtual void Compile_addu(CompileFlags cf) = 0; + void Compile_sub_const(CompileFlags cf); + virtual void Compile_sub(CompileFlags cf) = 0; + void Compile_subu_const(CompileFlags cf); + virtual void Compile_subu(CompileFlags cf) = 0; + void Compile_and_const(CompileFlags cf); + virtual void Compile_and(CompileFlags cf) = 0; + void Compile_or_const(CompileFlags cf); + virtual void Compile_or(CompileFlags cf) = 0; + void Compile_xor_const(CompileFlags cf); + virtual void Compile_xor(CompileFlags cf) = 0; + void Compile_nor_const(CompileFlags cf); + virtual void Compile_nor(CompileFlags cf) = 0; + void Compile_slt_const(CompileFlags cf); + virtual void Compile_slt(CompileFlags cf) = 0; + void Compile_sltu_const(CompileFlags cf); + virtual void Compile_sltu(CompileFlags cf) = 0; 
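+
+  // Each ALU op comes as a pair: the _const form is evaluated at compile time when all of
+  // its inputs are known constants (the allow_constant path in CompileTemplate()), while
+  // the pure-virtual form is the per-backend code generator for the general case.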
+ + void Compile_addi_const(CompileFlags cf); + virtual void Compile_addi(CompileFlags cf) = 0; + void Compile_addiu_const(CompileFlags cf); + virtual void Compile_addiu(CompileFlags cf) = 0; + void Compile_slti_const(CompileFlags cf); + virtual void Compile_slti(CompileFlags cf) = 0; + void Compile_sltiu_const(CompileFlags cf); + virtual void Compile_sltiu(CompileFlags cf) = 0; + void Compile_andi_const(CompileFlags cf); + virtual void Compile_andi(CompileFlags cf) = 0; + void Compile_ori_const(CompileFlags cf); + virtual void Compile_ori(CompileFlags cf) = 0; + void Compile_xori_const(CompileFlags cf); + virtual void Compile_xori(CompileFlags cf) = 0; + void Compile_lui(); + + virtual void Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) = 0; + virtual void Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) = 0; // lwl/lwr + virtual void Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) = 0; + virtual void Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) = 0; + virtual void Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) = 0; // swl/swr + virtual void Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) = 0; + + static u32* GetCop0RegPtr(Cop0Reg reg); + static u32 GetCop0RegWriteMask(Cop0Reg reg); + + void Compile_mfc0(CompileFlags cf); + virtual void Compile_mtc0(CompileFlags cf) = 0; + virtual void Compile_rfe(CompileFlags cf) = 0; + + void AddGTETicks(TickCount ticks); + void StallUntilGTEComplete(); + virtual void Compile_mfc2(CompileFlags cf) = 0; + virtual void Compile_mtc2(CompileFlags cf) = 0; + virtual void Compile_cop2(CompileFlags cf) = 0; + + enum GTERegisterAccessAction : u8 + { + Ignore, + Direct, + ZeroExtend16, + SignExtend16, + CallHandler, + PushFIFO, + }; + + static std::pair GetGTERegisterPointer(u32 index, bool writing); + + CodeCache::Block* m_block = nullptr; + u32 m_compiler_pc = 0; + TickCount m_cycles = 0; + TickCount m_gte_done_cycle = 0; + + const Instruction* inst = nullptr; + const CodeCache::InstructionInfo* iinfo = nullptr; + u32 m_current_instruction_pc = 0; + bool m_current_instruction_branch_delay_slot = false; + bool m_branch_delay_slot_swapped = false; + + bool m_dirty_pc = false; + bool m_dirty_instruction_bits = false; + bool m_dirty_gte_done_cycle = false; + bool m_block_ended = false; + + std::bitset(Reg::count)> m_constant_regs_valid = {}; + std::bitset(Reg::count)> m_constant_regs_dirty = {}; + std::array(Reg::count)> m_constant_reg_values = {}; + + std::array m_host_regs = {}; + u16 m_register_alloc_counter = 0; + + bool m_load_delay_dirty = true; + Reg m_load_delay_register = Reg::count; + u32 m_load_delay_value_register = 0; + + Reg m_next_load_delay_register = Reg::count; + u32 m_next_load_delay_value_register = 0; + + struct HostStateBackup + { + TickCount cycles; + TickCount gte_done_cycle; + u32 compiler_pc; + bool dirty_pc; + bool dirty_instruction_bits; + bool dirty_gte_done_cycle; + bool block_ended; + const Instruction* inst; + const CodeCache::InstructionInfo* iinfo; + u32 current_instruction_pc; + bool current_instruction_delay_slot; + std::bitset(Reg::count)> const_regs_valid; + std::bitset(Reg::count)> const_regs_dirty; + std::array(Reg::count)> const_regs_values; + std::array host_regs; + u16 register_alloc_counter; + bool load_delay_dirty; + Reg 
load_delay_register; + u32 load_delay_value_register; + Reg next_load_delay_register; + u32 next_load_delay_value_register; + }; + + // we need two of these, one for branch delays, and another if we have an overflow in the delay slot + std::array m_host_state_backup = {}; + u32 m_host_state_backup_count = 0; + + // PGXP memory callbacks + static const std::array, 3> s_pgxp_mem_load_functions; + static const std::array s_pgxp_mem_store_functions; +}; + +void BackpatchLoadStore(void* exception_pc, const CodeCache::LoadstoreBackpatchInfo& info); + +u32 CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size, TickCount cycles_to_add, + TickCount cycles_to_remove, u32 gpr_bitmask, u8 address_register, u8 data_register, + MemoryAccessSize size, bool is_signed, bool is_load); + +extern Compiler* g_compiler; +} // namespace CPU::NewRec diff --git a/src/core/cpu_newrec_compiler_aarch64.cpp b/src/core/cpu_newrec_compiler_aarch64.cpp new file mode 100644 index 000000000..4d05927bd --- /dev/null +++ b/src/core/cpu_newrec_compiler_aarch64.cpp @@ -0,0 +1,2235 @@ +// SPDX-FileCopyrightText: 2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#include "cpu_newrec_compiler_aarch64.h" +#include "common/align.h" +#include "common/assert.h" +#include "common/log.h" +#include "common/string_util.h" +#include "cpu_core_private.h" +#include "cpu_recompiler_thunks.h" +#include "gte.h" +#include "pgxp.h" +#include "settings.h" +#include "timing_event.h" +#include +Log_SetChannel(CPU::NewRec); + +#define DUMP_BLOCKS + +#ifdef DUMP_BLOCKS +#include "vixl/aarch64/disasm-aarch64.h" +#endif + +using namespace vixl::aarch64; + +#define RWRET vixl::aarch64::w0 +#define RXRET vixl::aarch64::x0 +#define RWARG1 vixl::aarch64::w0 +#define RXARG1 vixl::aarch64::x0 +#define RWARG2 vixl::aarch64::w1 +#define RXARG2 vixl::aarch64::x1 +#define RWARG3 vixl::aarch64::w2 +#define RXARG3 vixl::aarch64::x2 +#define RWSCRATCH vixl::aarch64::w16 +#define RXSCRATCH vixl::aarch64::x16 +#define RSTATE vixl::aarch64::x19 +#define RMEMBASE vixl::aarch64::x20 + +#define PTR(x) vixl::aarch64::MemOperand(RSTATE, (u32)(((u8*)(x)) - ((u8*)&g_state))) + +namespace CPU::NewRec { + +using CPU::Recompiler::armEmitCall; +using CPU::Recompiler::armEmitCondBranch; +using CPU::Recompiler::armEmitJmp; +using CPU::Recompiler::armEmitMov; +using CPU::Recompiler::armGetJumpTrampoline; +using CPU::Recompiler::armGetPCDisplacement; +using CPU::Recompiler::armIsCallerSavedRegister; +using CPU::Recompiler::armMoveAddressToReg; + +AArch64Compiler s_instance; +Compiler* g_compiler = &s_instance; + +} // namespace CPU::NewRec + +CPU::NewRec::AArch64Compiler::AArch64Compiler() = default; + +CPU::NewRec::AArch64Compiler::~AArch64Compiler() = default; + +const void* CPU::NewRec::AArch64Compiler::GetCurrentCodePointer() +{ + return armAsm->GetCursorAddress(); +} + +void CPU::NewRec::AArch64Compiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, + u8* far_code_buffer, u32 far_code_space) +{ + Compiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space); + + // TODO: don't recreate this every time.. 
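+  // Two vixl assemblers are kept per block: "near" code for the hot path and "far" code
+  // for cold paths such as the exception exits emitted through SwitchToFarCode() (see
+  // CheckBranchTarget() below).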
+ DebugAssert(!m_emitter && !m_far_emitter && !armAsm); + m_emitter = std::make_unique(code_buffer, code_buffer_space, PositionDependentCode); + m_far_emitter = std::make_unique(far_code_buffer, far_code_space, PositionDependentCode); + armAsm = m_emitter.get(); + +#ifdef VIXL_DEBUG + m_emitter_check = std::make_unique(m_emitter.get(), code_buffer_space, + vixl::CodeBufferCheckScope::kDontReserveBufferSpace); + m_far_emitter_check = std::make_unique( + m_far_emitter.get(), far_code_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace); +#endif + + // Need to wipe it out so it's correct when toggling fastmem. + m_host_regs = {}; + + const u32 membase_idx = CodeCache::IsUsingFastmem() ? RMEMBASE.GetCode() : NUM_HOST_REGS; + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + + if (i == RWARG1.GetCode() || i == RWARG1.GetCode() || i == RWARG2.GetCode() || i == RWARG3.GetCode() || + i == RWSCRATCH.GetCode() || i == RSTATE.GetCode() || i == membase_idx || i == x18.GetCode() || i >= 30) + { + continue; + } + + ra.flags = HR_USABLE | (armIsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED); + } +} + +void CPU::NewRec::AArch64Compiler::SwitchToFarCode(bool emit_jump, vixl::aarch64::Condition cond) +{ + DebugAssert(armAsm == m_emitter.get()); + if (emit_jump) + { + const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter->GetCursorAddress()); + if (cond != Condition::al) + { + if (vixl::IsInt19(disp)) + { + armAsm->b(disp, cond); + } + else + { + Label skip; + armAsm->b(&skip, vixl::aarch64::InvertCondition(cond)); + armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter->GetCursorAddress())); + armAsm->bind(&skip); + } + } + else + { + armAsm->b(disp); + } + } + armAsm = m_far_emitter.get(); +} + +void CPU::NewRec::AArch64Compiler::SwitchToFarCodeIfBitSet(const vixl::aarch64::Register& reg, u32 bit) +{ + const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter->GetCursorAddress()); + if (vixl::IsInt14(disp)) + { + armAsm->tbnz(reg, bit, disp); + } + else + { + Label skip; + armAsm->tbz(reg, bit, &skip); + armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter->GetCursorAddress())); + armAsm->bind(&skip); + } + + armAsm = m_far_emitter.get(); +} + +void CPU::NewRec::AArch64Compiler::SwitchToFarCodeIfRegZeroOrNonZero(const vixl::aarch64::Register& reg, bool nonzero) +{ + const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter->GetCursorAddress()); + if (vixl::IsInt19(disp)) + { + nonzero ? armAsm->cbnz(reg, disp) : armAsm->cbz(reg, disp); + } + else + { + Label skip; + nonzero ? armAsm->cbz(reg, &skip) : armAsm->cbnz(reg, &skip); + armAsm->b(armGetPCDisplacement(GetCurrentCodePointer(), m_far_emitter->GetCursorAddress())); + armAsm->bind(&skip); + } + + armAsm = m_far_emitter.get(); +} + +void CPU::NewRec::AArch64Compiler::SwitchToNearCode(bool emit_jump, vixl::aarch64::Condition cond) +{ + DebugAssert(armAsm == m_far_emitter.get()); + if (emit_jump) + { + const s64 disp = armGetPCDisplacement(GetCurrentCodePointer(), m_emitter->GetCursorAddress()); + (cond != Condition::al) ? 
armAsm->b(disp, cond) : armAsm->b(disp); + } + armAsm = m_emitter.get(); +} + +void CPU::NewRec::AArch64Compiler::EmitMov(const vixl::aarch64::WRegister& dst, u32 val) +{ + armEmitMov(armAsm, dst, val); +} + +void CPU::NewRec::AArch64Compiler::EmitCall(const void* ptr, bool force_inline /*= false*/) +{ + armEmitCall(armAsm, ptr, force_inline); +} + +vixl::aarch64::Operand CPU::NewRec::AArch64Compiler::armCheckAddSubConstant(s32 val) +{ + if (Assembler::IsImmAddSub(val)) + return vixl::aarch64::Operand(static_cast(val)); + + EmitMov(RWSCRATCH, static_cast(val)); + return vixl::aarch64::Operand(RWSCRATCH); +} + +vixl::aarch64::Operand CPU::NewRec::AArch64Compiler::armCheckAddSubConstant(u32 val) +{ + return armCheckAddSubConstant(static_cast(val)); +} + +vixl::aarch64::Operand CPU::NewRec::AArch64Compiler::armCheckCompareConstant(s32 val) +{ + if (Assembler::IsImmConditionalCompare(val)) + return vixl::aarch64::Operand(static_cast(val)); + + EmitMov(RWSCRATCH, static_cast(val)); + return vixl::aarch64::Operand(RWSCRATCH); +} + +vixl::aarch64::Operand CPU::NewRec::AArch64Compiler::armCheckLogicalConstant(u32 val) +{ + if (Assembler::IsImmLogical(val, 32)) + return vixl::aarch64::Operand(static_cast(static_cast(val))); + + EmitMov(RWSCRATCH, val); + return vixl::aarch64::Operand(RWSCRATCH); +} + +void CPU::NewRec::AArch64Compiler::BeginBlock() +{ + Compiler::BeginBlock(); +} + +void CPU::NewRec::AArch64Compiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size) +{ + // store it first to reduce code size, because we can offset + armMoveAddressToReg(armAsm, RXARG1, ram_ptr); + armMoveAddressToReg(armAsm, RXARG2, shadow_ptr); + + bool first = true; + u32 offset = 0; + Label block_changed; + + while (size >= 16) + { + const VRegister vtmp = v2.V4S(); + const VRegister dst = first ? 
v0.V4S() : v1.V4S(); + armAsm->ldr(dst, MemOperand(RXARG1, offset)); + armAsm->ldr(vtmp, MemOperand(RXARG2, offset)); + armAsm->cmeq(dst, dst, vtmp); + if (!first) + armAsm->and_(dst.V16B(), dst.V16B(), vtmp.V16B()); + else + first = false; + + offset += 16; + size -= 16; + } + + if (!first) + { + // TODO: make sure this doesn't choke on ffffffff + armAsm->uminv(s0, v0.V4S()); + armAsm->fcmp(s0, 0.0); + armAsm->b(&block_changed, eq); + } + + while (size >= 8) + { + armAsm->ldr(RXARG3, MemOperand(RXARG1, offset)); + armAsm->ldr(RXSCRATCH, MemOperand(RXARG2, offset)); + armAsm->cmp(RXARG3, RXSCRATCH); + armAsm->b(&block_changed, ne); + offset += 8; + size -= 8; + } + + while (size >= 4) + { + armAsm->ldr(RWARG3, MemOperand(RXARG1, offset)); + armAsm->ldr(RWSCRATCH, MemOperand(RXARG2, offset)); + armAsm->cmp(RWARG3, RWSCRATCH); + armAsm->b(&block_changed, ne); + offset += 4; + size -= 4; + } + + DebugAssert(size == 0); + + Label block_unchanged; + armAsm->b(&block_unchanged); + armAsm->bind(&block_changed); + armEmitJmp(armAsm, CodeCache::g_discard_and_recompile_block, false); + armAsm->bind(&block_unchanged); +} + +void CPU::NewRec::AArch64Compiler::GenerateICacheCheckAndUpdate() +{ + if (GetSegmentForAddress(m_block->pc) >= Segment::KSEG1) + { + armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks)); + armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(static_cast(m_block->uncached_fetch_ticks))); + armAsm->str(RWARG1, PTR(&g_state.pending_ticks)); + } + else + { + const auto& ticks_reg = RWARG1; + const auto& current_tag_reg = RWARG2; + const auto& existing_tag_reg = RWARG3; + + VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK; + armAsm->ldr(ticks_reg, PTR(&g_state.pending_ticks)); + armEmitMov(armAsm, current_tag_reg, current_pc); + + for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE) + { + const TickCount fill_ticks = GetICacheFillTicks(current_pc); + if (fill_ticks <= 0) + continue; + + const u32 line = GetICacheLine(current_pc); + const u32 offset = offsetof(State, icache_tags) + (line * sizeof(u32)); + + Label cache_hit; + armAsm->ldr(existing_tag_reg, MemOperand(RSTATE, offset)); + armAsm->cmp(existing_tag_reg, current_tag_reg); + armAsm->b(&cache_hit, eq); + + armAsm->str(current_tag_reg, MemOperand(RSTATE, offset)); + armAsm->add(ticks_reg, ticks_reg, armCheckAddSubConstant(static_cast(fill_ticks))); + armAsm->bind(&cache_hit); + + if (i != (m_block->icache_line_count - 1)) + armAsm->add(current_tag_reg, current_tag_reg, armCheckAddSubConstant(ICACHE_LINE_SIZE)); + } + + armAsm->str(ticks_reg, PTR(&g_state.pending_ticks)); + } +} + +void CPU::NewRec::AArch64Compiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/, + s32 arg3reg /*= -1*/) +{ + if (arg1reg >= 0 && arg1reg != static_cast(RXARG1.GetCode())) + armAsm->mov(RXARG1, XRegister(arg1reg)); + if (arg1reg >= 0 && arg2reg != static_cast(RXARG2.GetCode())) + armAsm->mov(RXARG2, XRegister(arg2reg)); + if (arg1reg >= 0 && arg3reg != static_cast(RXARG3.GetCode())) + armAsm->mov(RXARG3, XRegister(arg3reg)); + EmitCall(func); +} + +void CPU::NewRec::AArch64Compiler::EndBlock(const std::optional& newpc, bool do_event_test) +{ + if (newpc.has_value()) + { + if (m_dirty_pc || m_compiler_pc != newpc) + { + EmitMov(RWSCRATCH, newpc.value()); + armAsm->str(RWSCRATCH, PTR(&g_state.pc)); + } + } + m_dirty_pc = false; + + // flush regs + Flush(FLUSH_END_BLOCK); + EndAndLinkBlock(newpc, do_event_test); +} + +void 
CPU::NewRec::AArch64Compiler::EndBlockWithException(Exception excode) +{ + // flush regs, but not pc, it's going to get overwritten + // flush cycles because of the GTE instruction stuff... + Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION); + + // TODO: flush load delay + // TODO: break for pcdrv + + EmitMov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false, + inst->cop.cop_n)); + EmitMov(RWARG2, m_current_instruction_pc); + EmitCall(reinterpret_cast(static_cast(&CPU::RaiseException))); + m_dirty_pc = false; + + EndAndLinkBlock(std::nullopt, true); +} + +void CPU::NewRec::AArch64Compiler::EndAndLinkBlock(const std::optional& newpc, bool do_event_test) +{ + // event test + // pc should've been flushed + DebugAssert(!m_dirty_pc); + + // TODO: try extracting this to a function + // TODO: move the cycle flush in here.. + + // save cycles for event test + const TickCount cycles = std::exchange(m_cycles, 0); + + // pending_ticks += cycles + // if (pending_ticks >= downcount) { dispatch_event(); } + if (do_event_test || m_gte_done_cycle > cycles || cycles > 0) + armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks)); + if (do_event_test) + armAsm->ldr(RWARG2, PTR(&g_state.downcount)); + if (cycles > 0) + armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(cycles)); + if (m_gte_done_cycle > cycles) + { + armAsm->add(RWARG2, RWARG1, armCheckAddSubConstant(m_gte_done_cycle - cycles)); + armAsm->str(RWARG2, PTR(&g_state.gte_completion_tick)); + } + if (do_event_test) + armAsm->cmp(RWARG1, RWARG2); + if (cycles > 0) + armAsm->str(RWARG1, PTR(&g_state.pending_ticks)); + if (do_event_test) + armEmitCondBranch(armAsm, ge, CodeCache::g_run_events_and_dispatch); + + // jump to dispatcher or next block + if (!newpc.has_value()) + { + armEmitJmp(armAsm, CodeCache::g_dispatcher, false); + } + else + { + if (newpc.value() == m_block->pc) + { + // Special case: ourselves! No need to backlink then. + Log_DebugPrintf("Linking block at %08X to self", m_block->pc); + armEmitJmp(armAsm, armAsm->GetBuffer()->GetStartAddress(), true); + } + else + { + const void* target = CodeCache::CreateBlockLink(m_block, armAsm->GetCursorAddress(), newpc.value()); + armEmitJmp(armAsm, target, true); + } + } + + m_block_ended = true; +} + +const void* CPU::NewRec::AArch64Compiler::EndCompile(u32* code_size, u32* far_code_size) +{ +#ifdef VIXL_DEBUG + m_emitter_check.reset(); + m_far_emitter_check.reset(); +#endif + + m_emitter->FinalizeCode(); + m_far_emitter->FinalizeCode(); + + u8* const code = m_emitter->GetBuffer()->GetStartAddress(); + *code_size = static_cast(m_emitter->GetCursorOffset()); + *far_code_size = static_cast(m_far_emitter->GetCursorOffset()); + armAsm = nullptr; + m_far_emitter.reset(); + m_emitter.reset(); + return code; +} + +const char* CPU::NewRec::AArch64Compiler::GetHostRegName(u32 reg) const +{ + static constexpr std::array reg64_names = { + {"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "fp", "lr", "sp"}}; + return (reg < reg64_names.size()) ? 
reg64_names[reg] : "UNKNOWN"; +} + +void CPU::NewRec::AArch64Compiler::LoadHostRegWithConstant(u32 reg, u32 val) +{ + EmitMov(WRegister(reg), val); +} + +void CPU::NewRec::AArch64Compiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr) +{ + armAsm->ldr(WRegister(reg), PTR(ptr)); +} + +void CPU::NewRec::AArch64Compiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr) +{ + armAsm->str(WRegister(reg), PTR(ptr)); +} + +void CPU::NewRec::AArch64Compiler::StoreConstantToCPUPointer(u32 val, const void* ptr) +{ + if (val == 0) + { + armAsm->str(wzr, PTR(ptr)); + return; + } + + EmitMov(RWSCRATCH, val); + armAsm->str(RWSCRATCH, PTR(ptr)); +} + +void CPU::NewRec::AArch64Compiler::CopyHostReg(u32 dst, u32 src) +{ + if (src != dst) + armAsm->mov(WRegister(dst), WRegister(src)); +} + +void CPU::NewRec::AArch64Compiler::AssertRegOrConstS(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_s || cf.const_s); +} + +void CPU::NewRec::AArch64Compiler::AssertRegOrConstT(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_t || cf.const_t); +} + +vixl::aarch64::MemOperand CPU::NewRec::AArch64Compiler::MipsPtr(Reg r) const +{ + DebugAssert(r < Reg::count); + return PTR(&g_state.regs.r[static_cast(r)]); +} + +vixl::aarch64::WRegister CPU::NewRec::AArch64Compiler::CFGetRegD(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_d); + return WRegister(cf.host_d); +} + +vixl::aarch64::WRegister CPU::NewRec::AArch64Compiler::CFGetRegS(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_s); + return WRegister(cf.host_s); +} + +vixl::aarch64::WRegister CPU::NewRec::AArch64Compiler::CFGetRegT(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_t); + return WRegister(cf.host_t); +} + +vixl::aarch64::WRegister CPU::NewRec::AArch64Compiler::CFGetRegLO(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_lo); + return WRegister(cf.host_lo); +} + +vixl::aarch64::WRegister CPU::NewRec::AArch64Compiler::CFGetRegHI(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_hi); + return WRegister(cf.host_hi); +} + +void CPU::NewRec::AArch64Compiler::MoveSToReg(const vixl::aarch64::WRegister& dst, CompileFlags cf) +{ + if (cf.valid_host_s) + { + if (cf.host_s != dst.GetCode()) + armAsm->mov(dst, WRegister(cf.host_s)); + } + else if (cf.const_s) + { + const u32 cv = GetConstantRegU32(cf.MipsS()); + if (cv == 0) + armAsm->mov(dst, wzr); + else + EmitMov(dst, cv); + } + else + { + Log_WarningPrintf("Hit memory path in MoveSToReg() for %s", GetRegName(cf.MipsS())); + armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_s])); + } +} + +void CPU::NewRec::AArch64Compiler::MoveTToReg(const vixl::aarch64::WRegister& dst, CompileFlags cf) +{ + if (cf.valid_host_t) + { + if (cf.host_t != dst.GetCode()) + armAsm->mov(dst, WRegister(cf.host_t)); + } + else if (cf.const_t) + { + const u32 cv = GetConstantRegU32(cf.MipsT()); + if (cv == 0) + armAsm->mov(dst, wzr); + else + EmitMov(dst, cv); + } + else + { + Log_WarningPrintf("Hit memory path in MoveTToReg() for %s", GetRegName(cf.MipsT())); + armAsm->ldr(dst, PTR(&g_state.regs.r[cf.mips_t])); + } +} + +void CPU::NewRec::AArch64Compiler::MoveMIPSRegToReg(const vixl::aarch64::WRegister& dst, Reg reg) +{ + DebugAssert(reg < Reg::count); + if (const std::optional hreg = CheckHostReg(0, Compiler::HR_TYPE_CPU_REG, reg)) + armAsm->mov(dst, WRegister(hreg.value())); + else if (HasConstantReg(reg)) + EmitMov(dst, GetConstantRegU32(reg)); + else + armAsm->ldr(dst, MipsPtr(reg)); +} + +void CPU::NewRec::AArch64Compiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, + Reg arg2reg /* = 
Reg::count */, + Reg arg3reg /* = Reg::count */) +{ + DebugAssert(g_settings.gpu_pgxp_enable); + + Flush(FLUSH_FOR_C_CALL); + + if (arg2reg != Reg::count) + MoveMIPSRegToReg(RWARG2, arg2reg); + if (arg3reg != Reg::count) + MoveMIPSRegToReg(RWARG3, arg3reg); + + EmitMov(RWARG1, arg1val); + EmitCall(func); +} + +void CPU::NewRec::AArch64Compiler::Flush(u32 flags) +{ + Compiler::Flush(flags); + + if (flags & FLUSH_PC && m_dirty_pc) + { + StoreConstantToCPUPointer(m_compiler_pc, &g_state.pc); + m_dirty_pc = false; + } + + if (flags & FLUSH_INSTRUCTION_BITS) + { + // This sucks, but it's only used for fallbacks. + Panic("Not implemented"); + } + + if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty) + { + // This sucks :( + // TODO: make it a function? + armAsm->ldrb(RWARG1, PTR(&g_state.load_delay_reg)); + armAsm->ldr(RWARG2, PTR(&g_state.load_delay_value)); + EmitMov(RWSCRATCH, offsetof(CPU::State, regs.r[0])); + armAsm->add(RWARG1, RWSCRATCH, vixl::aarch64::Operand(RWARG1, LSL, 2)); + armAsm->str(RWARG2, MemOperand(RSTATE, RXARG1)); + EmitMov(RWSCRATCH, static_cast(Reg::count)); + armAsm->strb(RWSCRATCH, PTR(&g_state.load_delay_reg)); + m_load_delay_dirty = false; + } + + if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count) + { + if (m_load_delay_value_register != NUM_HOST_REGS) + FreeHostReg(m_load_delay_value_register); + + EmitMov(RWSCRATCH, static_cast(m_load_delay_register)); + armAsm->strb(RWSCRATCH, PTR(&g_state.load_delay_reg)); + m_load_delay_register = Reg::count; + m_load_delay_dirty = true; + } + + if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle) + { + // May as well flush cycles while we're here. + // GTE spanning blocks is very rare, we _could_ disable this for speed. + armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks)); + armAsm->ldr(RWARG2, PTR(&g_state.gte_completion_tick)); + if (m_cycles > 0) + { + armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles)); + m_cycles = 0; + } + armAsm->cmp(RWARG2, RWARG1); + armAsm->csel(RWARG1, RWARG2, RWARG1, hs); + armAsm->str(RWARG1, PTR(&g_state.pending_ticks)); + m_dirty_gte_done_cycle = false; + } + + if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles) + { + armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks)); + + // update cycles at the same time + if (flags & FLUSH_CYCLES && m_cycles > 0) + { + armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles)); + armAsm->str(RWARG1, PTR(&g_state.pending_ticks)); + m_gte_done_cycle -= m_cycles; + m_cycles = 0; + } + + armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_gte_done_cycle)); + armAsm->str(RWARG1, PTR(&g_state.gte_completion_tick)); + m_gte_done_cycle = 0; + m_dirty_gte_done_cycle = true; + } + + if (flags & FLUSH_CYCLES && m_cycles > 0) + { + armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks)); + armAsm->add(RWARG1, RWARG1, armCheckAddSubConstant(m_cycles)); + armAsm->str(RWARG1, PTR(&g_state.pending_ticks)); + m_gte_done_cycle = std::max(m_gte_done_cycle - m_cycles, 0); + m_cycles = 0; + } +} + +void CPU::NewRec::AArch64Compiler::Compile_Fallback() +{ + Flush(FLUSH_FOR_INTERPRETER); + +#if 0 + cg->call(&CPU::Recompiler::Thunks::InterpretInstruction); + + // TODO: make me less garbage + // TODO: this is wrong, it flushes the load delay on the same cycle when we return. + // but nothing should be going through here.. 
+ Label no_load_delay; + cg->movzx(RWARG1, cg->byte[PTR(&g_state.next_load_delay_reg)]); + cg->cmp(RWARG1, static_cast(Reg::count)); + cg->je(no_load_delay, CodeGenerator::T_SHORT); + cg->mov(RWARG2, cg->dword[PTR(&g_state.next_load_delay_value)]); + cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], RWARG1); + cg->mov(cg->dword[PTR(&g_state.load_delay_value)], RWARG2); + cg->mov(cg->byte[PTR(&g_state.next_load_delay_reg)], static_cast(Reg::count)); + cg->L(no_load_delay); + + m_load_delay_dirty = EMULATE_LOAD_DELAYS; +#else + Panic("Fixme"); +#endif +} + +void CPU::NewRec::AArch64Compiler::CheckBranchTarget(const vixl::aarch64::WRegister& pcreg) +{ + if (!g_settings.cpu_recompiler_memory_exceptions) + return; + + armAsm->tst(pcreg, armCheckLogicalConstant(0x3)); + SwitchToFarCode(true, ne); + + BackupHostState(); + EndBlockWithException(Exception::AdEL); + + RestoreHostState(); + SwitchToNearCode(false); +} + +void CPU::NewRec::AArch64Compiler::Compile_jr(CompileFlags cf) +{ + const WRegister pcreg = CFGetRegS(cf); + CheckBranchTarget(pcreg); + + armAsm->str(pcreg, PTR(&g_state.pc)); + + CompileBranchDelaySlot(false); + EndBlock(std::nullopt, true); +} + +void CPU::NewRec::AArch64Compiler::Compile_jalr(CompileFlags cf) +{ + const WRegister pcreg = CFGetRegS(cf); + if (MipsD() != Reg::zero) + SetConstantReg(MipsD(), GetBranchReturnAddress(cf)); + + CheckBranchTarget(pcreg); + armAsm->str(pcreg, PTR(&g_state.pc)); + + CompileBranchDelaySlot(false); + EndBlock(std::nullopt, true); +} + +void CPU::NewRec::AArch64Compiler::Compile_bxx(CompileFlags cf, BranchCondition cond) +{ + AssertRegOrConstS(cf); + + const u32 taken_pc = GetConditionalBranchTarget(cf); + + Flush(FLUSH_FOR_BRANCH); + + DebugAssert(cf.valid_host_s); + + // MipsT() here should equal zero for zero branches. + DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero); + + Label taken; + const WRegister rs = CFGetRegS(cf); + switch (cond) + { + case BranchCondition::Equal: + case BranchCondition::NotEqual: + { + AssertRegOrConstT(cf); + if (cf.const_t && HasConstantRegValue(cf.MipsT(), 0)) + { + (cond == BranchCondition::Equal) ? armAsm->cbz(rs, &taken) : armAsm->cbnz(rs, &taken); + } + else + { + if (cf.valid_host_t) + armAsm->cmp(rs, CFGetRegT(cf)); + else if (cf.const_t) + armAsm->cmp(rs, armCheckCompareConstant(GetConstantRegU32(cf.MipsT()))); + + armAsm->b(&taken, (cond == BranchCondition::Equal) ? 
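    // Both outcomes of the compare are materialized as separate block exits below: the
    // fall-through path compiles the delay slot and ends the block at m_compiler_pc, then the
    // "taken" label is bound and the delay slot is compiled a second time (unless it was swapped
    // ahead of the branch) before ending at taken_pc. BackupHostState()/RestoreHostState()
    // snapshot the register-allocation and constant state so both copies start from the same
    // point. In outline:
    //
    //   b.<cond> taken
    //   [delay slot]; EndBlock(m_compiler_pc)   // not taken
    // taken:
    //   [delay slot]; EndBlock(taken_pc)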
eq : ne); + } + } + break; + + case BranchCondition::GreaterThanZero: + { + armAsm->cmp(rs, 0); + armAsm->b(&taken, gt); + } + break; + + case BranchCondition::GreaterEqualZero: + { + armAsm->cmp(rs, 0); + armAsm->b(&taken, ge); + } + break; + + case BranchCondition::LessThanZero: + { + armAsm->cmp(rs, 0); + armAsm->b(&taken, lt); + } + break; + + case BranchCondition::LessEqualZero: + { + armAsm->cmp(rs, 0); + armAsm->b(&taken, le); + } + break; + } + + BackupHostState(); + if (!cf.delay_slot_swapped) + CompileBranchDelaySlot(); + + EndBlock(m_compiler_pc, true); + + armAsm->bind(&taken); + + RestoreHostState(); + if (!cf.delay_slot_swapped) + CompileBranchDelaySlot(); + + EndBlock(taken_pc, true); +} + +void CPU::NewRec::AArch64Compiler::Compile_addi(CompileFlags cf, bool overflow) +{ + const WRegister rs = CFGetRegS(cf); + const WRegister rt = CFGetRegT(cf); + if (const u32 imm = inst->i.imm_sext32(); imm != 0) + { + if (!overflow) + { + armAsm->add(rt, rs, armCheckAddSubConstant(imm)); + } + else + { + armAsm->adds(rt, rs, armCheckAddSubConstant(imm)); + TestOverflow(rt); + } + } + else if (rt.GetCode() != rs.GetCode()) + { + armAsm->mov(rt, rs); + } +} + +void CPU::NewRec::AArch64Compiler::Compile_addi(CompileFlags cf) +{ + Compile_addi(cf, g_settings.cpu_recompiler_memory_exceptions); +} + +void CPU::NewRec::AArch64Compiler::Compile_addiu(CompileFlags cf) +{ + Compile_addi(cf, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_slti(CompileFlags cf) +{ + Compile_slti(cf, true); +} + +void CPU::NewRec::AArch64Compiler::Compile_sltiu(CompileFlags cf) +{ + Compile_slti(cf, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_slti(CompileFlags cf, bool sign) +{ + armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(static_cast(inst->i.imm_sext32()))); + armAsm->cset(CFGetRegT(cf), sign ? 
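  // SLTI/SLTIU become a compare against the sign-extended immediate followed by a conditional
  // set; AArch64 "lt" yields the signed result and "lo" (unsigned lower) the unsigned one.
  // Equivalent C, as a sketch:
  //
  //   rt = sign ? ((s32)rs < (s32)imm_sext) : (rs < (u32)imm_sext);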
lt : lo); +} + +void CPU::NewRec::AArch64Compiler::Compile_andi(CompileFlags cf) +{ + const WRegister rt = CFGetRegT(cf); + if (const u32 imm = inst->i.imm_zext32(); imm != 0) + armAsm->and_(rt, CFGetRegS(cf), armCheckLogicalConstant(imm)); + else + armAsm->mov(rt, wzr); +} + +void CPU::NewRec::AArch64Compiler::Compile_ori(CompileFlags cf) +{ + const WRegister rt = CFGetRegT(cf); + const WRegister rs = CFGetRegS(cf); + if (const u32 imm = inst->i.imm_zext32(); imm != 0) + armAsm->orr(rt, rs, armCheckLogicalConstant(imm)); + else if (rt.GetCode() != rs.GetCode()) + armAsm->mov(rt, rs); +} + +void CPU::NewRec::AArch64Compiler::Compile_xori(CompileFlags cf) +{ + const WRegister rt = CFGetRegT(cf); + const WRegister rs = CFGetRegS(cf); + if (const u32 imm = inst->i.imm_zext32(); imm != 0) + armAsm->eor(rt, rs, armCheckLogicalConstant(imm)); + else if (rt.GetCode() != rs.GetCode()) + armAsm->mov(rt, rs); +} + +void CPU::NewRec::AArch64Compiler::Compile_shift(CompileFlags cf, + void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&, + const vixl::aarch64::Register&, + unsigned)) +{ + const WRegister rd = CFGetRegD(cf); + const WRegister rt = CFGetRegT(cf); + if (inst->r.shamt > 0) + (armAsm->*op)(rd, rt, inst->r.shamt); + else if (rd.GetCode() != rt.GetCode()) + armAsm->mov(rd, rt); +} + +void CPU::NewRec::AArch64Compiler::Compile_sll(CompileFlags cf) +{ + Compile_shift(cf, &Assembler::lsl); +} + +void CPU::NewRec::AArch64Compiler::Compile_srl(CompileFlags cf) +{ + Compile_shift(cf, &Assembler::lsr); +} + +void CPU::NewRec::AArch64Compiler::Compile_sra(CompileFlags cf) +{ + Compile_shift(cf, &Assembler::asr); +} + +void CPU::NewRec::AArch64Compiler::Compile_variable_shift( + CompileFlags cf, + void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&, const vixl::aarch64::Register&, + const vixl::aarch64::Register&), + void (vixl::aarch64::Assembler::*op_const)(const vixl::aarch64::Register&, const vixl::aarch64::Register&, unsigned)) +{ + const WRegister rd = CFGetRegD(cf); + + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + const WRegister rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2; + if (!cf.valid_host_t) + MoveTToReg(rt, cf); + + if (cf.const_s) + { + if (const u32 shift = GetConstantRegU32(cf.MipsS()); shift != 0) + (armAsm->*op_const)(rd, rt, shift); + else if (rd.GetCode() != rt.GetCode()) + armAsm->mov(rd, rt); + } + else + { + (armAsm->*op)(rd, rt, CFGetRegS(cf)); + } +} + +void CPU::NewRec::AArch64Compiler::Compile_sllv(CompileFlags cf) +{ + Compile_variable_shift(cf, &Assembler::lslv, &Assembler::lsl); +} + +void CPU::NewRec::AArch64Compiler::Compile_srlv(CompileFlags cf) +{ + Compile_variable_shift(cf, &Assembler::lsrv, &Assembler::lsr); +} + +void CPU::NewRec::AArch64Compiler::Compile_srav(CompileFlags cf) +{ + Compile_variable_shift(cf, &Assembler::asrv, &Assembler::asr); +} + +void CPU::NewRec::AArch64Compiler::Compile_mult(CompileFlags cf, bool sign) +{ + const WRegister rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1; + if (!cf.valid_host_s) + MoveSToReg(rs, cf); + + const WRegister rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2; + if (!cf.valid_host_t) + MoveTToReg(rt, cf); + + // TODO: if lo/hi gets killed, we can use a 32-bit multiply + const WRegister lo = CFGetRegLO(cf); + const WRegister hi = CFGetRegHI(cf); + + (sign) ? 
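  // MULT/MULTU are emitted below as a single widening multiply into the 64-bit view of the LO
  // host register, with HI recovered by a 64-bit shift. Roughly:
  //
  //   u64 wide = sign ? (u64)((s64)(s32)rs * (s32)rt) : ((u64)rs * (u64)rt);
  //   LO = (u32)wide;
  //   HI = (u32)(wide >> 32);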
armAsm->smull(lo.X(), rs, rt) : armAsm->umull(lo.X(), rs, rt); + armAsm->lsr(hi.X(), lo.X(), 32); +} + +void CPU::NewRec::AArch64Compiler::Compile_mult(CompileFlags cf) +{ + Compile_mult(cf, true); +} + +void CPU::NewRec::AArch64Compiler::Compile_multu(CompileFlags cf) +{ + Compile_mult(cf, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_div(CompileFlags cf) +{ + const WRegister rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1; + if (!cf.valid_host_s) + MoveSToReg(rs, cf); + + const WRegister rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2; + if (!cf.valid_host_t) + MoveTToReg(rt, cf); + + const WRegister rlo = CFGetRegLO(cf); + const WRegister rhi = CFGetRegHI(cf); + + // TODO: This could be slightly more optimal + Label done; + Label not_divide_by_zero; + armAsm->cbnz(rt, ¬_divide_by_zero); + armAsm->cmp(rs, 0); + armAsm->mov(rhi, rs); // hi = num + EmitMov(rlo, 1); + EmitMov(RWSCRATCH, static_cast(-1)); + armAsm->csel(rlo, RWSCRATCH, rlo, ge); // lo = s >= 0 ? -1 : 1 + armAsm->b(&done); + + armAsm->bind(¬_divide_by_zero); + Label not_unrepresentable; + armAsm->cmp(rs, armCheckCompareConstant(static_cast(0x80000000u))); + armAsm->b(¬_unrepresentable, ne); + armAsm->cmp(rt, armCheckCompareConstant(-1)); + armAsm->b(¬_unrepresentable, ne); + + EmitMov(rlo, 0x80000000u); + EmitMov(rhi, 0); + armAsm->b(&done); + + armAsm->bind(¬_unrepresentable); + + armAsm->sdiv(rlo, rs, rt); + + // TODO: skip when hi is dead + armAsm->msub(rhi, rlo, rt, rs); + + armAsm->bind(&done); +} + +void CPU::NewRec::AArch64Compiler::Compile_divu(CompileFlags cf) +{ + const WRegister rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1; + if (!cf.valid_host_s) + MoveSToReg(rs, cf); + + const WRegister rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG2; + if (!cf.valid_host_t) + MoveTToReg(rt, cf); + + const WRegister rlo = CFGetRegLO(cf); + const WRegister rhi = CFGetRegHI(cf); + + Label done; + Label not_divide_by_zero; + armAsm->cbnz(rt, ¬_divide_by_zero); + EmitMov(rlo, static_cast(-1)); + armAsm->mov(rhi, rs); + armAsm->b(&done); + + armAsm->bind(¬_divide_by_zero); + + armAsm->udiv(rlo, rs, rt); + + // TODO: skip when hi is dead + armAsm->msub(rhi, rlo, rt, rs); + + armAsm->bind(&done); +} + +void CPU::NewRec::AArch64Compiler::TestOverflow(const vixl::aarch64::WRegister& result) +{ + SwitchToFarCode(true, vs); + + BackupHostState(); + + // toss the result + ClearHostReg(result.GetCode()); + + EndBlockWithException(Exception::Ov); + + RestoreHostState(); + + SwitchToNearCode(false); +} + +void CPU::NewRec::AArch64Compiler::Compile_dst_op(CompileFlags cf, + void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&, + const vixl::aarch64::Register&, + const vixl::aarch64::Operand&), + bool commutative, bool logical, bool overflow) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + const WRegister rd = CFGetRegD(cf); + if (cf.valid_host_s && cf.valid_host_t) + { + (armAsm->*op)(rd, CFGetRegS(cf), CFGetRegT(cf)); + } + else if (commutative && (cf.const_s || cf.const_t)) + { + const WRegister src = cf.const_s ? CFGetRegT(cf) : CFGetRegS(cf); + if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0) + { + (armAsm->*op)(rd, src, logical ? 
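    // For reference, the DIV/DIVU sequences above reproduce the MIPS-defined results for the two
    // cases the hardware divider does not trap on (sketch):
    //
    //   if (rt == 0)                                              // divide by zero
    //   {
    //     HI = rs;
    //     LO = sign ? (((s32)rs >= 0) ? 0xFFFFFFFFu : 1u) : 0xFFFFFFFFu;
    //   }
    //   else if (sign && rs == 0x80000000u && rt == 0xFFFFFFFFu)  // INT_MIN / -1
    //   {
    //     LO = 0x80000000u;
    //     HI = 0;
    //   }
    //   else
    //   {
    //     LO = rs / rt;                                           // signed or unsigned per opcode
    //     HI = rs % rt;
    //   }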
armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv)); + } + else + { + if (rd.GetCode() != src.GetCode()) + armAsm->mov(rd, src); + overflow = false; + } + } + else if (cf.const_s) + { + // TODO: Check where we can use wzr here + EmitMov(RWSCRATCH, GetConstantRegU32(cf.MipsS())); + (armAsm->*op)(rd, RWSCRATCH, CFGetRegT(cf)); + } + else if (cf.const_t) + { + const WRegister rs = CFGetRegS(cf); + if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0) + { + (armAsm->*op)(rd, rs, logical ? armCheckLogicalConstant(cv) : armCheckAddSubConstant(cv)); + } + else + { + if (rd.GetCode() != rs.GetCode()) + armAsm->mov(rd, rs); + overflow = false; + } + } + + if (overflow) + TestOverflow(rd); +} + +void CPU::NewRec::AArch64Compiler::Compile_add(CompileFlags cf) +{ + if (g_settings.cpu_recompiler_memory_exceptions) + Compile_dst_op(cf, &Assembler::adds, true, false, true); + else + Compile_dst_op(cf, &Assembler::add, true, false, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_addu(CompileFlags cf) +{ + Compile_dst_op(cf, &Assembler::add, true, false, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_sub(CompileFlags cf) +{ + if (g_settings.cpu_recompiler_memory_exceptions) + Compile_dst_op(cf, &Assembler::subs, false, false, true); + else + Compile_dst_op(cf, &Assembler::sub, false, false, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_subu(CompileFlags cf) +{ + Compile_dst_op(cf, &Assembler::sub, false, false, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_and(CompileFlags cf) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + // special cases - and with self -> self, and with 0 -> 0 + const WRegister regd = CFGetRegD(cf); + if (cf.MipsS() == cf.MipsT()) + { + armAsm->mov(regd, CFGetRegS(cf)); + return; + } + else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0)) + { + armAsm->mov(regd, wzr); + return; + } + + Compile_dst_op(cf, &Assembler::and_, true, true, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_or(CompileFlags cf) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + // or/nor with 0 -> no effect + const WRegister regd = CFGetRegD(cf); + if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT()) + { + cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf); + return; + } + + Compile_dst_op(cf, &Assembler::orr, true, true, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_xor(CompileFlags cf) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + const WRegister regd = CFGetRegD(cf); + if (cf.MipsS() == cf.MipsT()) + { + // xor with self -> zero + armAsm->mov(regd, wzr); + return; + } + else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0)) + { + // xor with zero -> no effect + cf.const_s ? 
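    // When cpu_recompiler_memory_exceptions is enabled, ADD/ADDI/SUB above are emitted with the
    // flag-setting forms (adds/subs) and TestOverflow() branches on the V flag into the far-code
    // area, where the half-written destination is discarded and Exception::Ov is raised; the
    // common no-overflow path keeps only a single not-taken branch in the near code. In outline:
    //
    //   adds rd, rs, rt
    //   b.vs far           // far: ClearHostReg(rd); EndBlockWithException(Exception::Ov)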
MoveTToReg(regd, cf) : MoveSToReg(regd, cf); + return; + } + + Compile_dst_op(cf, &Assembler::eor, true, true, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_nor(CompileFlags cf) +{ + Compile_or(cf); + armAsm->mvn(CFGetRegD(cf), CFGetRegD(cf)); +} + +void CPU::NewRec::AArch64Compiler::Compile_slt(CompileFlags cf) +{ + Compile_slt(cf, true); +} + +void CPU::NewRec::AArch64Compiler::Compile_sltu(CompileFlags cf) +{ + Compile_slt(cf, false); +} + +void CPU::NewRec::AArch64Compiler::Compile_slt(CompileFlags cf, bool sign) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + // TODO: swap and reverse op for constants + if (cf.const_s) + { + EmitMov(RWSCRATCH, GetConstantRegS32(cf.MipsS())); + armAsm->cmp(RWSCRATCH, CFGetRegT(cf)); + } + else if (cf.const_t) + { + armAsm->cmp(CFGetRegS(cf), armCheckCompareConstant(GetConstantRegS32(cf.MipsT()))); + } + else + { + armAsm->cmp(CFGetRegS(cf), CFGetRegT(cf)); + } + + armAsm->cset(CFGetRegD(cf), sign ? lt : lo); +} + +vixl::aarch64::WRegister +CPU::NewRec::AArch64Compiler::ComputeLoadStoreAddressArg(CompileFlags cf, + const std::optional& address, + const std::optional& reg) +{ + const u32 imm = inst->i.imm_sext32(); + if (cf.valid_host_s && imm == 0 && !reg.has_value()) + return CFGetRegS(cf); + + const WRegister dst = reg.has_value() ? reg.value() : RWARG1; + if (address.has_value()) + { + EmitMov(dst, address.value()); + } + else if (imm == 0) + { + if (cf.valid_host_s) + { + if (const WRegister src = CFGetRegS(cf); src.GetCode() != dst.GetCode()) + armAsm->mov(dst, CFGetRegS(cf)); + } + else + { + armAsm->ldr(dst, MipsPtr(cf.MipsS())); + } + } + else + { + if (cf.valid_host_s) + { + armAsm->add(dst, CFGetRegS(cf), armCheckAddSubConstant(static_cast(inst->i.imm_sext32()))); + } + else + { + armAsm->ldr(dst, MipsPtr(cf.MipsS())); + armAsm->add(dst, dst, armCheckAddSubConstant(static_cast(inst->i.imm_sext32()))); + } + } + + return dst; +} + +template +vixl::aarch64::WRegister CPU::NewRec::AArch64Compiler::GenerateLoad(const vixl::aarch64::WRegister& addr_reg, + MemoryAccessSize size, bool sign, + const RegAllocFn& dst_reg_alloc) +{ + const bool checked = g_settings.cpu_recompiler_memory_exceptions; + if (!checked && CodeCache::IsUsingFastmem()) + { + m_cycles += Bus::RAM_READ_TICKS; + + const WRegister dst = dst_reg_alloc(); + + if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) + { + DebugAssert(addr_reg.GetCode() != RWARG3.GetCode()); + armAsm->lsr(RWARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT); + armAsm->ldr(RXARG3, MemOperand(RMEMBASE, RXARG3, LSL, 8)); + } + + const MemOperand mem = + MemOperand((g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE, addr_reg.X()); + u8* start = m_emitter->GetCursorAddress(); + switch (size) + { + case MemoryAccessSize::Byte: + sign ? armAsm->ldrsb(dst, mem) : armAsm->ldrb(dst, mem); + break; + + case MemoryAccessSize::HalfWord: + sign ? armAsm->ldrsh(dst, mem) : armAsm->ldrh(dst, mem); + break; + + case MemoryAccessSize::Word: + armAsm->ldr(dst, mem); + break; + } + + AddLoadStoreInfo(start, kInstructionSize, addr_reg.GetCode(), dst.GetCode(), size, sign, true); + return dst; + } + + if (addr_reg.GetCode() != RWARG1.GetCode()) + armAsm->mov(RWARG1, addr_reg); + + switch (size) + { + case MemoryAccessSize::Byte: + { + EmitCall(checked ? reinterpret_cast(&Recompiler::Thunks::ReadMemoryByte) : + reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryByte)); + } + break; + case MemoryAccessSize::HalfWord: + { + EmitCall(checked ? 
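      // For comparison with this C-call fallback: the fastmem path above resolves the host
      // address without leaving generated code. In LUT mode, roughly:
      //
      //   u8* base = ((u8* const*)membase)[address >> Bus::FASTMEM_LUT_PAGE_SHIFT];
      //   value    = *(T*)(base + address);   // entries are set up so the full 32-bit guest
      //                                       // address can be added directly
      //
      // and the access is recorded via AddLoadStoreInfo() so that a faulting access can later be
      // backpatched into a call to one of these thunks.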
reinterpret_cast(&Recompiler::Thunks::ReadMemoryHalfWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryHalfWord)); + } + break; + case MemoryAccessSize::Word: + { + EmitCall(checked ? reinterpret_cast(&Recompiler::Thunks::ReadMemoryWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryWord)); + } + break; + } + + // TODO: turn this into an asm function instead + if (checked) + { + SwitchToFarCodeIfBitSet(RXRET, 63); + BackupHostState(); + + // Need to stash this in a temp because of the flush. + const WRegister temp = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)); + armAsm->neg(temp.X(), RXRET); + armAsm->lsl(temp, temp, 2); + + Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION); + + // cause_bits = (-result << 2) | BD | cop_n + armAsm->orr(RWARG1, temp, + armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException( + static_cast(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n))); + EmitMov(RWARG2, m_current_instruction_pc); + EmitCall(reinterpret_cast(static_cast(&CPU::RaiseException))); + FreeHostReg(temp.GetCode()); + EndBlock(std::nullopt, true); + + RestoreHostState(); + SwitchToNearCode(false); + } + + const WRegister dst_reg = dst_reg_alloc(); + switch (size) + { + case MemoryAccessSize::Byte: + { + sign ? armAsm->sxtb(dst_reg, RWRET) : armAsm->uxtb(dst_reg, RWRET); + } + break; + case MemoryAccessSize::HalfWord: + { + sign ? armAsm->sxth(dst_reg, RWRET) : armAsm->uxth(dst_reg, RWRET); + } + break; + case MemoryAccessSize::Word: + { + if (dst_reg.GetCode() != RWRET.GetCode()) + armAsm->mov(dst_reg, RWRET); + } + break; + } + + return dst_reg; +} + +void CPU::NewRec::AArch64Compiler::GenerateStore(const vixl::aarch64::WRegister& addr_reg, + const vixl::aarch64::WRegister& value_reg, MemoryAccessSize size) +{ + const bool checked = g_settings.cpu_recompiler_memory_exceptions; + if (!checked && CodeCache::IsUsingFastmem()) + { + if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) + { + DebugAssert(addr_reg.GetCode() != RWARG3.GetCode()); + armAsm->lsr(RWARG3, addr_reg, Bus::FASTMEM_LUT_PAGE_SHIFT); + armAsm->ldr(RXARG3, MemOperand(RMEMBASE, RXARG3, LSL, 8)); + } + + const MemOperand mem = + MemOperand((g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE, addr_reg.X()); + u8* start = m_emitter->GetCursorAddress(); + switch (size) + { + case MemoryAccessSize::Byte: + armAsm->strb(value_reg, mem); + break; + + case MemoryAccessSize::HalfWord: + armAsm->strh(value_reg, mem); + break; + + case MemoryAccessSize::Word: + armAsm->str(value_reg, mem); + break; + } + AddLoadStoreInfo(start, kInstructionSize, addr_reg.GetCode(), value_reg.GetCode(), size, false, false); + return; + } + + if (addr_reg.GetCode() != RWARG1.GetCode()) + armAsm->mov(RWARG1, addr_reg); + if (value_reg.GetCode() != RWARG2.GetCode()) + armAsm->mov(RWARG2, value_reg); + + switch (size) + { + case MemoryAccessSize::Byte: + { + EmitCall(checked ? reinterpret_cast(&Recompiler::Thunks::WriteMemoryByte) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryByte)); + } + break; + case MemoryAccessSize::HalfWord: + { + EmitCall(checked ? reinterpret_cast(&Recompiler::Thunks::WriteMemoryHalfWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord)); + } + break; + case MemoryAccessSize::Word: + { + EmitCall(checked ? 
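      // Return convention of the checked thunks, as used by the tests around these calls: the
      // read thunks return the loaded value in the low 32 bits, or a negative 64-bit value whose
      // negation is the exception code; the write thunks return 0 on success or the exception
      // code on failure. The far-code path then rebuilds CAUSE and raises the exception, roughly:
      //
      //   u32 cause = (excode << 2) | Cop0Registers::CAUSE::MakeValueForException(
      //                                 static_cast<Exception>(0), in_branch_delay_slot, false, cop_n);
      //   CPU::RaiseException(cause, m_current_instruction_pc);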
reinterpret_cast(&Recompiler::Thunks::WriteMemoryWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryWord)); + } + break; + } + + // TODO: turn this into an asm function instead + if (checked) + { + SwitchToFarCodeIfRegZeroOrNonZero(RXRET, true); + BackupHostState(); + + // Need to stash this in a temp because of the flush. + const WRegister temp = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)); + armAsm->lsl(temp, RWRET, 2); + + Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION); + + // cause_bits = (result << 2) | BD | cop_n + armAsm->orr(RWARG1, temp, + armCheckLogicalConstant(Cop0Registers::CAUSE::MakeValueForException( + static_cast(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n))); + EmitMov(RWARG2, m_current_instruction_pc); + EmitCall(reinterpret_cast(static_cast(&CPU::RaiseException))); + FreeHostReg(temp.GetCode()); + EndBlock(std::nullopt, true); + + RestoreHostState(); + SwitchToNearCode(false); + } +} + +void CPU::NewRec::AArch64Compiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + const std::optional addr_reg = + g_settings.gpu_pgxp_enable ? std::optional(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) : + std::optional(); + FlushForLoadStore(address, false); + const WRegister addr = ComputeLoadStoreAddressArg(cf, address, addr_reg); + const WRegister data = GenerateLoad(addr, size, sign, [this, cf]() { + if (cf.MipsT() == Reg::zero) + return RWRET; + + return WRegister(AllocateHostReg(GetFlagsForNewLoadDelayedReg(), + EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, + cf.MipsT())); + }); + + if (g_settings.gpu_pgxp_enable) + { + Flush(FLUSH_FOR_C_CALL); + + EmitMov(RWARG1, inst->bits); + armAsm->mov(RWARG2, addr); + armAsm->mov(RWARG3, data); + EmitCall(s_pgxp_mem_load_functions[static_cast(size)][static_cast(sign)]); + FreeHostReg(addr_reg.value().GetCode()); + } +} + +void CPU::NewRec::AArch64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + DebugAssert(size == MemoryAccessSize::Word && !sign); + FlushForLoadStore(address, false); + + // TODO: if address is constant, this can be simplified.. + + // If we're coming from another block, just flush the load delay and hope for the best.. + if (m_load_delay_dirty) + UpdateLoadDelay(); + + // We'd need to be careful here if we weren't overwriting it.. + const WRegister addr = WRegister(AllocateHostReg(HR_CALLEE_SAVED, HR_TYPE_TEMP)); + ComputeLoadStoreAddressArg(cf, address, addr); + armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u)); + GenerateLoad(RWARG1, MemoryAccessSize::Word, false, []() { return RWRET; }); + + if (inst->r.rt == Reg::zero) + { + FreeHostReg(addr.GetCode()); + return; + } + + // lwl/lwr from a load-delayed value takes the new value, but it itself, is load delayed, so the original value is + // never written back. NOTE: can't trust T in cf because of the flush + const Reg rt = inst->r.rt; + WRegister value; + if (m_load_delay_register == rt) + { + const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ? 
+ AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) : + m_load_delay_value_register; + RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt); + value = WRegister(existing_ld_rt); + } + else + { + if constexpr (EMULATE_LOAD_DELAYS) + { + value = WRegister(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt)); + if (const std::optional rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value()) + armAsm->mov(value, WRegister(rtreg.value())); + else if (HasConstantReg(rt)) + EmitMov(value, GetConstantRegU32(rt)); + else + armAsm->ldr(value, MipsPtr(rt)); + } + else + { + value = WRegister(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt)); + } + } + + DebugAssert(value.GetCode() != RWARG2.GetCode() && value.GetCode() != RWARG3.GetCode()); + armAsm->and_(RWARG2, addr, 3); + armAsm->lsl(RWARG2, RWARG2, 3); // *8 + EmitMov(RWARG3, 24); + armAsm->sub(RWARG3, RWARG3, RWARG2); + + if (inst->op == InstructionOp::lwl) + { + // const u32 mask = UINT32_C(0x00FFFFFF) >> shift; + // new_value = (value & mask) | (RWRET << (24 - shift)); + EmitMov(addr, 0xFFFFFFu); + armAsm->lsrv(addr, addr, RWARG2); + armAsm->and_(value, value, addr); + armAsm->lslv(RWRET, RWRET, RWARG3); + armAsm->orr(value, value, RWRET); + } + else + { + // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift); + // new_value = (value & mask) | (RWRET >> shift); + armAsm->lsrv(RWRET, RWRET, RWARG2); + EmitMov(addr, 0xFFFFFF00u); + armAsm->lslv(addr, addr, RWARG3); + armAsm->and_(value, value, addr); + armAsm->orr(value, value, RWRET); + } + + FreeHostReg(addr.GetCode()); +} + +void CPU::NewRec::AArch64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + const std::optional addr_reg = + g_settings.gpu_pgxp_enable ? 
std::optional(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) : + std::optional(); + FlushForLoadStore(address, false); + const WRegister addr = ComputeLoadStoreAddressArg(cf, address, addr_reg); + GenerateLoad(addr, MemoryAccessSize::Word, false, []() { return RWRET; }); + + const u32 index = static_cast(inst->r.rt.GetValue()); + const auto [ptr, action] = GetGTERegisterPointer(index, true); + switch (action) + { + case GTERegisterAccessAction::Ignore: + { + break; + } + + case GTERegisterAccessAction::Direct: + { + armAsm->str(RWRET, PTR(ptr)); + break; + } + + case GTERegisterAccessAction::SignExtend16: + { + armAsm->sxth(RWRET, RWRET); + armAsm->str(RWRET, PTR(ptr)); + break; + } + + case GTERegisterAccessAction::ZeroExtend16: + { + armAsm->uxth(RWRET, RWRET); + armAsm->str(RWRET, PTR(ptr)); + break; + } + + case GTERegisterAccessAction::CallHandler: + { + Flush(FLUSH_FOR_C_CALL); + armAsm->mov(RWARG2, RWRET); + EmitMov(RWARG1, index); + EmitCall(reinterpret_cast(>E::WriteRegister)); + break; + } + + case GTERegisterAccessAction::PushFIFO: + { + // SXY0 <- SXY1 + // SXY1 <- SXY2 + // SXY2 <- SXYP + DebugAssert(RWRET.GetCode() != RWARG2.GetCode() && RWRET.GetCode() != RWARG3.GetCode()); + armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0])); + armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0])); + armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0])); + armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0])); + armAsm->str(RWRET, PTR(&g_state.gte_regs.SXY2[0])); + break; + } + + default: + { + Panic("Unknown action"); + return; + } + } + + if (g_settings.gpu_pgxp_enable) + { + Flush(FLUSH_FOR_C_CALL); + armAsm->mov(RWARG3, RWRET); + armAsm->mov(RWARG2, addr); + EmitMov(RWARG1, inst->bits); + EmitCall(reinterpret_cast(&PGXP::CPU_LWC2)); + FreeHostReg(addr_reg.value().GetCode()); + } +} + +void CPU::NewRec::AArch64Compiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + const std::optional addr_reg = + g_settings.gpu_pgxp_enable ? std::optional(WRegister(AllocateTempHostReg(HR_CALLEE_SAVED))) : + std::optional(); + FlushForLoadStore(address, true); + const WRegister addr = ComputeLoadStoreAddressArg(cf, address, addr_reg); + const WRegister data = cf.valid_host_t ? CFGetRegT(cf) : RWARG2; + if (!cf.valid_host_t) + MoveTToReg(RWARG2, cf); + + GenerateStore(addr, data, size); + + if (g_settings.gpu_pgxp_enable) + { + Flush(FLUSH_FOR_C_CALL); + MoveMIPSRegToReg(RWARG3, cf.MipsT()); + armAsm->mov(RWARG2, addr); + EmitMov(RWARG1, inst->bits); + EmitCall(s_pgxp_mem_store_functions[static_cast(size)]); + FreeHostReg(addr_reg.value().GetCode()); + } +} + +void CPU::NewRec::AArch64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + DebugAssert(size == MemoryAccessSize::Word && !sign); + FlushForLoadStore(address, true); + + // TODO: if address is constant, this can be simplified.. + // We'd need to be careful here if we weren't overwriting it.. 
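  // SWL/SWR below are handled as a read-modify-write of the containing aligned word: load the
  // word at (addr & ~3u), merge the relevant bytes of rt into it based on shift = (addr & 3) * 8,
  // then store the merged word back. The masks match the little-endian MIPS definitions quoted in
  // the comments further down, e.g. for SWL: new = (mem & (0xFFFFFF00u << shift)) | (rt >> (24 - shift)).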
+ const WRegister addr = WRegister(AllocateHostReg(HR_CALLEE_SAVED, HR_TYPE_TEMP)); + ComputeLoadStoreAddressArg(cf, address, addr); + armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u)); + GenerateLoad(RWARG1, MemoryAccessSize::Word, false, []() { return RWRET; }); + + // TODO: this can take over rt's value if it's no longer needed + // NOTE: can't trust T in cf because of the flush + const Reg rt = inst->r.rt; + const WRegister value = RWARG2; + if (const std::optional rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value()) + armAsm->mov(value, WRegister(rtreg.value())); + else if (HasConstantReg(rt)) + EmitMov(value, GetConstantRegU32(rt)); + else + armAsm->ldr(value, MipsPtr(rt)); + + armAsm->and_(RWSCRATCH, addr, 3); + armAsm->lsl(RWSCRATCH, RWSCRATCH, 3); // *8 + + if (inst->op == InstructionOp::swl) + { + // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift; + // new_value = (RWRET & mem_mask) | (value >> (24 - shift)); + EmitMov(RWARG3, 0xFFFFFF00u); + armAsm->lslv(RWARG3, RWARG3, RWSCRATCH); + armAsm->and_(RWRET, RWRET, RWARG3); + + EmitMov(RWARG3, 24); + armAsm->sub(RWARG3, RWARG3, RWSCRATCH); + armAsm->lsrv(value, value, RWARG3); + armAsm->orr(value, value, RWRET); + } + else + { + // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift); + // new_value = (RWRET & mem_mask) | (value << shift); + armAsm->lslv(value, value, RWSCRATCH); + + EmitMov(RWARG3, 24); + armAsm->sub(RWARG3, RWARG3, RWSCRATCH); + EmitMov(RWSCRATCH, 0x00FFFFFFu); + armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG3); + armAsm->and_(RWRET, RWRET, RWSCRATCH); + armAsm->orr(value, value, RWRET); + } + + FreeHostReg(addr.GetCode()); + + armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u)); + GenerateStore(RWARG1, value, MemoryAccessSize::Word); +} + +void CPU::NewRec::AArch64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + FlushForLoadStore(address, true); + + const u32 index = static_cast(inst->r.rt.GetValue()); + const auto [ptr, action] = GetGTERegisterPointer(index, false); + switch (action) + { + case GTERegisterAccessAction::Direct: + { + armAsm->ldr(RWARG2, PTR(ptr)); + } + break; + + case GTERegisterAccessAction::CallHandler: + { + // should already be flushed.. except in fastmem case + Flush(FLUSH_FOR_C_CALL); + EmitMov(RWARG1, index); + EmitCall(reinterpret_cast(>E::ReadRegister)); + armAsm->mov(RWARG2, RWRET); + } + break; + + default: + { + Panic("Unknown action"); + } + break; + } + + // PGXP makes this a giant pain. + if (!g_settings.gpu_pgxp_enable) + { + const WRegister addr = ComputeLoadStoreAddressArg(cf, address); + GenerateStore(addr, RWARG2, size); + return; + } + + // TODO: This can be simplified because we don't need to validate in PGXP.. + const WRegister addr_reg = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)); + const WRegister data_backup = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)); + FlushForLoadStore(address, true); + ComputeLoadStoreAddressArg(cf, address, addr_reg); + armAsm->mov(data_backup, RWARG2); + GenerateStore(addr_reg, RWARG2, size); + + Flush(FLUSH_FOR_C_CALL); + armAsm->mov(RWARG3, data_backup); + armAsm->mov(RWARG2, addr_reg); + EmitMov(RWARG1, inst->bits); + EmitCall(reinterpret_cast(&PGXP::CPU_SWC2)); + FreeHostReg(addr_reg.GetCode()); + FreeHostReg(data_backup.GetCode()); +} + +void CPU::NewRec::AArch64Compiler::Compile_mtc0(CompileFlags cf) +{ + // TODO: we need better constant setting here.. 
which will need backprop + AssertRegOrConstT(cf); + + const Cop0Reg reg = static_cast(MipsD()); + const u32* ptr = GetCop0RegPtr(reg); + const u32 mask = GetCop0RegWriteMask(reg); + if (!ptr) + { + Compile_Fallback(); + return; + } + + if (mask == 0) + { + // if it's a read-only register, ignore + Log_DebugPrintf("Ignoring write to read-only cop0 reg %u", static_cast(reg)); + return; + } + + // for some registers, we need to test certain bits + const bool needs_bit_test = (reg == Cop0Reg::SR); + const WRegister new_value = RWARG1; + const WRegister old_value = RWARG2; + const WRegister changed_bits = RWARG3; + const WRegister mask_reg = RWSCRATCH; + + // Load old value + armAsm->ldr(old_value, PTR(ptr)); + + // No way we fit this in an immediate.. + EmitMov(mask_reg, mask); + + // update value + if (cf.valid_host_t) + armAsm->and_(new_value, CFGetRegT(cf), mask_reg); + else + EmitMov(new_value, GetConstantRegU32(cf.MipsT()) & mask); + + if (needs_bit_test) + armAsm->eor(changed_bits, old_value, new_value); + armAsm->bic(old_value, old_value, mask_reg); + armAsm->orr(new_value, old_value, new_value); + armAsm->str(new_value, PTR(ptr)); + + if (reg == Cop0Reg::SR) + { + // TODO: replace with register backup + // We could just inline the whole thing.. + Flush(FLUSH_FOR_C_CALL); + + SwitchToFarCodeIfBitSet(changed_bits, 16); + armAsm->sub(sp, sp, 16); + armAsm->stp(RWARG1, RWARG2, MemOperand(sp)); + EmitCall(reinterpret_cast(&CPU::UpdateMemoryPointers)); + armAsm->ldp(RWARG1, RWARG2, MemOperand(sp)); + armAsm->add(sp, sp, 16); + armAsm->ldr(RMEMBASE, PTR(&g_state.fastmem_base)); + SwitchToNearCode(true); + } + + if (reg == Cop0Reg::SR || reg == Cop0Reg::CAUSE) + { + const WRegister sr = (reg == Cop0Reg::SR) ? RWARG2 : (armAsm->ldr(RWARG1, PTR(&g_state.cop0_regs.sr.bits)), RWARG1); + TestInterrupts(sr); + } + + if (reg == Cop0Reg::DCIC && g_settings.cpu_recompiler_memory_exceptions) + { + // TODO: DCIC handling for debug breakpoints + Log_WarningPrintf("TODO: DCIC handling for debug breakpoints"); + } +} + +void CPU::NewRec::AArch64Compiler::Compile_rfe(CompileFlags cf) +{ + // shift mode bits right two, preserving upper bits + armAsm->ldr(RWARG1, PTR(&g_state.cop0_regs.sr.bits)); + armAsm->bfxil(RWARG1, RWARG1, 2, 4); + armAsm->str(RWARG1, PTR(&g_state.cop0_regs.sr.bits)); + + TestInterrupts(RWARG1); +} + +void CPU::NewRec::AArch64Compiler::TestInterrupts(const vixl::aarch64::WRegister& sr) +{ + // if Iec == 0 then goto no_interrupt + Label no_interrupt; + armAsm->tbz(sr, 0, &no_interrupt); + + // sr & cause + armAsm->ldr(RWSCRATCH, PTR(&g_state.cop0_regs.cause.bits)); + armAsm->and_(sr, sr, RWSCRATCH); + + // ((sr & cause) & 0xff00) == 0 goto no_interrupt + armAsm->tst(sr, 0xFF00); + + SwitchToFarCode(true, ne); + BackupHostState(); + Flush(FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL); + EmitCall(reinterpret_cast(&DispatchInterrupt)); + EndBlock(std::nullopt, true); + RestoreHostState(); + SwitchToNearCode(false); + + armAsm->bind(&no_interrupt); +} + +void CPU::NewRec::AArch64Compiler::Compile_mfc2(CompileFlags cf) +{ + const u32 index = inst->cop.Cop2Index(); + const Reg rt = inst->r.rt; + + const auto [ptr, action] = GetGTERegisterPointer(index, false); + if (action == GTERegisterAccessAction::Ignore) + return; + + u32 hreg; + if (action == GTERegisterAccessAction::Direct) + { + hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(), + EMULATE_LOAD_DELAYS ? 
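  // RFE above is a single bitfield move: bfxil(sr, sr, 2, 4) computes
  //
  //   sr = (sr & ~0xFu) | ((sr >> 2) & 0xFu);
  //
  // i.e. it pops the two-level KU/IE mode stack while leaving the upper SR bits untouched.
  // TestInterrupts() then re-checks IEc and (SR & CAUSE & 0xFF00) and, if an interrupt became
  // pending, jumps to far code that flushes state and calls DispatchInterrupt().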
HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt); + armAsm->ldr(WRegister(hreg), PTR(ptr)); + } + else if (action == GTERegisterAccessAction::CallHandler) + { + Flush(FLUSH_FOR_C_CALL); + EmitMov(RWARG1, index); + EmitCall(reinterpret_cast(>E::ReadRegister)); + + hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(), + EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt); + armAsm->mov(WRegister(hreg), RWRET); + } + else + { + Panic("Unknown action"); + return; + } + + if (g_settings.gpu_pgxp_enable) + { + Flush(FLUSH_FOR_C_CALL); + EmitMov(RWARG1, inst->bits); + armAsm->mov(RWARG2, WRegister(hreg)); + EmitCall(reinterpret_cast(&PGXP::CPU_MFC2)); + } +} + +void CPU::NewRec::AArch64Compiler::Compile_mtc2(CompileFlags cf) +{ + const u32 index = inst->cop.Cop2Index(); + const auto [ptr, action] = GetGTERegisterPointer(index, true); + if (action == GTERegisterAccessAction::Ignore) + return; + + if (action == GTERegisterAccessAction::Direct) + { + if (cf.const_t) + StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), ptr); + else + armAsm->str(CFGetRegT(cf), PTR(ptr)); + } + else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16) + { + const bool sign = (action == GTERegisterAccessAction::SignExtend16); + if (cf.valid_host_t) + { + sign ? armAsm->sxth(RWARG1, CFGetRegT(cf)) : armAsm->uxth(RWARG1, CFGetRegT(cf)); + armAsm->str(RWARG1, PTR(ptr)); + } + else if (cf.const_t) + { + const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT())); + StoreConstantToCPUPointer(sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv), ptr); + } + else + { + Panic("Unsupported setup"); + } + } + else if (action == GTERegisterAccessAction::CallHandler) + { + Flush(FLUSH_FOR_C_CALL); + EmitMov(RWARG1, index); + MoveTToReg(RWARG2, cf); + EmitCall(reinterpret_cast(>E::WriteRegister)); + } + else if (action == GTERegisterAccessAction::PushFIFO) + { + // SXY0 <- SXY1 + // SXY1 <- SXY2 + // SXY2 <- SXYP + DebugAssert(RWRET.GetCode() != RWARG2.GetCode() && RWRET.GetCode() != RWARG3.GetCode()); + armAsm->ldr(RWARG2, PTR(&g_state.gte_regs.SXY1[0])); + armAsm->ldr(RWARG3, PTR(&g_state.gte_regs.SXY2[0])); + armAsm->str(RWARG2, PTR(&g_state.gte_regs.SXY0[0])); + armAsm->str(RWARG3, PTR(&g_state.gte_regs.SXY1[0])); + if (cf.valid_host_t) + armAsm->str(CFGetRegT(cf), PTR(&g_state.gte_regs.SXY2[0])); + else if (cf.const_t) + StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), &g_state.gte_regs.SXY2[0]); + else + Panic("Unsupported setup"); + } + else + { + Panic("Unknown action"); + } +} + +void CPU::NewRec::AArch64Compiler::Compile_cop2(CompileFlags cf) +{ + TickCount func_ticks; + GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks); + + Flush(FLUSH_FOR_C_CALL); + EmitMov(RWARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK); + EmitCall(reinterpret_cast(func)); + + AddGTETicks(func_ticks); +} + +u32 CPU::NewRec::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size, + TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask, + u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed, + bool is_load) +{ + Assembler arm_asm(static_cast(thunk_code), thunk_space); + Assembler* armAsm = &arm_asm; + +#ifdef VIXL_DEBUG + vixl::CodeBufferCheckScope asm_check(armAsm, thunk_space, vixl::CodeBufferCheckScope::kDontReserveBufferSpace); +#endif + + static constexpr u32 GPR_SIZE = 8; + + // save regs + u32 num_gprs = 0; + + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { 
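    // First pass: only count the caller-saved host registers that are live in the faulting block
    // (per gpr_bitmask) and so must be preserved around the C call; for loads, the destination
    // register is skipped since it is about to be overwritten anyway. The frame below is rounded
    // to an even register count, stack_size = ((num_gprs + 1) & ~1u) * 8, which keeps SP 16-byte
    // aligned as AArch64 requires.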
+ if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i)) + num_gprs++; + } + + const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE); + + // TODO: use stp+ldp, vixl helper? + + if (stack_size > 0) + { + armAsm->sub(sp, sp, stack_size); + + u32 stack_offset = 0; + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i)) + { + armAsm->str(XRegister(i), MemOperand(sp, stack_offset)); + stack_offset += GPR_SIZE; + } + } + } + + if (cycles_to_add != 0) + { + // NOTE: we have to reload here, because memory writes can run DMA, which can screw with cycles + Assert(Assembler::IsImmAddSub(cycles_to_add)); + armAsm->ldr(RWSCRATCH, PTR(&g_state.pending_ticks)); + armAsm->add(RWSCRATCH, RWSCRATCH, cycles_to_add); + armAsm->str(RWSCRATCH, PTR(&g_state.pending_ticks)); + } + + if (address_register != static_cast(RWARG1.GetCode())) + armAsm->mov(RWARG1, WRegister(address_register)); + + if (!is_load) + { + if (data_register != static_cast(RWARG2.GetCode())) + armAsm->mov(RWARG2, WRegister(data_register)); + } + + switch (size) + { + case MemoryAccessSize::Byte: + { + armEmitCall(armAsm, + is_load ? reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryByte) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryByte), + false); + } + break; + case MemoryAccessSize::HalfWord: + { + armEmitCall(armAsm, + is_load ? reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryHalfWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord), + false); + } + break; + case MemoryAccessSize::Word: + { + armEmitCall(armAsm, + is_load ? reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryWord), + false); + } + break; + } + + if (is_load) + { + const WRegister dst = WRegister(data_register); + switch (size) + { + case MemoryAccessSize::Byte: + { + is_signed ? armAsm->sxtb(dst, RWRET) : armAsm->uxtb(dst, RWRET); + } + break; + case MemoryAccessSize::HalfWord: + { + is_signed ? 
armAsm->sxth(dst, RWRET) : armAsm->uxth(dst, RWRET); + } + break; + case MemoryAccessSize::Word: + { + if (dst.GetCode() != RWRET.GetCode()) + armAsm->mov(dst, RWRET); + } + break; + } + } + + if (cycles_to_remove != 0) + { + Assert(Assembler::IsImmAddSub(cycles_to_remove)); + armAsm->ldr(RWSCRATCH, PTR(&g_state.pending_ticks)); + armAsm->sub(RWSCRATCH, RWSCRATCH, cycles_to_remove); + armAsm->str(RWSCRATCH, PTR(&g_state.pending_ticks)); + } + + // restore regs + if (stack_size > 0) + { + u32 stack_offset = 0; + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + if ((gpr_bitmask & (1u << i)) && armIsCallerSavedRegister(i) && (!is_load || data_register != i)) + { + armAsm->ldr(XRegister(i), MemOperand(sp, stack_offset)); + stack_offset += GPR_SIZE; + } + } + + armAsm->add(sp, sp, stack_size); + } + + armEmitJmp(armAsm, static_cast(code_address) + code_size, true); + armAsm->FinalizeCode(); + + return static_cast(armAsm->GetCursorOffset()); +} diff --git a/src/core/cpu_newrec_compiler_aarch64.h b/src/core/cpu_newrec_compiler_aarch64.h new file mode 100644 index 000000000..58c6b0a71 --- /dev/null +++ b/src/core/cpu_newrec_compiler_aarch64.h @@ -0,0 +1,164 @@ +// SPDX-FileCopyrightText: 2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#pragma once +#include "cpu_newrec_compiler.h" +#include + +#include "vixl/aarch64/assembler-aarch64.h" + +namespace CPU::NewRec { + +class AArch64Compiler final : public Compiler +{ +public: + AArch64Compiler(); + ~AArch64Compiler() override; + +protected: + const char* GetHostRegName(u32 reg) const override; + + const void* GetCurrentCodePointer() override; + + void LoadHostRegWithConstant(u32 reg, u32 val) override; + void LoadHostRegFromCPUPointer(u32 reg, const void* ptr) override; + void StoreConstantToCPUPointer(u32 val, const void* ptr) override; + void StoreHostRegToCPUPointer(u32 reg, const void* ptr) override; + void CopyHostReg(u32 dst, u32 src) override; + + void Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer, u32 far_code_space) override; + void BeginBlock() override; + void GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size) override; + void GenerateICacheCheckAndUpdate() override; + void GenerateCall(const void* func, s32 arg1reg = -1, s32 arg2reg = -1, s32 arg3reg = -1) override; + void EndBlock(const std::optional& newpc, bool do_event_test) override; + void EndBlockWithException(Exception excode) override; + void EndAndLinkBlock(const std::optional& newpc, bool do_event_test); + const void* EndCompile(u32* code_size, u32* far_code_size) override; + + void Flush(u32 flags) override; + + void Compile_Fallback() override; + + void CheckBranchTarget(const vixl::aarch64::WRegister& pcreg); + void Compile_jr(CompileFlags cf) override; + void Compile_jalr(CompileFlags cf) override; + void Compile_bxx(CompileFlags cf, BranchCondition cond) override; + + void Compile_addi(CompileFlags cf, bool overflow); + void Compile_addi(CompileFlags cf) override; + void Compile_addiu(CompileFlags cf) override; + void Compile_slti(CompileFlags cf, bool sign); + void Compile_slti(CompileFlags cf) override; + void Compile_sltiu(CompileFlags cf) override; + void Compile_andi(CompileFlags cf) override; + void Compile_ori(CompileFlags cf) override; + void Compile_xori(CompileFlags cf) override; + + void Compile_shift(CompileFlags cf, void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&, + const vixl::aarch64::Register&, unsigned)); + void 
Compile_sll(CompileFlags cf) override; + void Compile_srl(CompileFlags cf) override; + void Compile_sra(CompileFlags cf) override; + void Compile_variable_shift(CompileFlags cf, + void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&, + const vixl::aarch64::Register&, + const vixl::aarch64::Register&), + void (vixl::aarch64::Assembler::*op_const)(const vixl::aarch64::Register&, + const vixl::aarch64::Register&, unsigned)); + void Compile_sllv(CompileFlags cf) override; + void Compile_srlv(CompileFlags cf) override; + void Compile_srav(CompileFlags cf) override; + void Compile_mult(CompileFlags cf, bool sign); + void Compile_mult(CompileFlags cf) override; + void Compile_multu(CompileFlags cf) override; + void Compile_div(CompileFlags cf) override; + void Compile_divu(CompileFlags cf) override; + void TestOverflow(const vixl::aarch64::WRegister& result); + void Compile_dst_op(CompileFlags cf, + void (vixl::aarch64::Assembler::*op)(const vixl::aarch64::Register&, + const vixl::aarch64::Register&, + const vixl::aarch64::Operand&), + bool commutative, bool logical, bool overflow); + void Compile_add(CompileFlags cf) override; + void Compile_addu(CompileFlags cf) override; + void Compile_sub(CompileFlags cf) override; + void Compile_subu(CompileFlags cf) override; + void Compile_and(CompileFlags cf) override; + void Compile_or(CompileFlags cf) override; + void Compile_xor(CompileFlags cf) override; + void Compile_nor(CompileFlags cf) override; + void Compile_slt(CompileFlags cf, bool sign); + void Compile_slt(CompileFlags cf) override; + void Compile_sltu(CompileFlags cf) override; + + vixl::aarch64::WRegister + ComputeLoadStoreAddressArg(CompileFlags cf, const std::optional& address, + const std::optional& reg = std::nullopt); + template + vixl::aarch64::WRegister GenerateLoad(const vixl::aarch64::WRegister& addr_reg, MemoryAccessSize size, bool sign, + const RegAllocFn& dst_reg_alloc); + void GenerateStore(const vixl::aarch64::WRegister& addr_reg, const vixl::aarch64::WRegister& value_reg, + MemoryAccessSize size); + void Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + + void TestInterrupts(const vixl::aarch64::WRegister& sr); + void Compile_mtc0(CompileFlags cf) override; + void Compile_rfe(CompileFlags cf) override; + + void Compile_mfc2(CompileFlags cf) override; + void Compile_mtc2(CompileFlags cf) override; + void Compile_cop2(CompileFlags cf) override; + + void GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg = Reg::count, + Reg arg3reg = Reg::count) override; + +private: + void EmitMov(const vixl::aarch64::WRegister& dst, u32 val); + void EmitCall(const void* ptr, bool force_inline = false); + + vixl::aarch64::Operand armCheckAddSubConstant(s32 val); + vixl::aarch64::Operand armCheckAddSubConstant(u32 val); + vixl::aarch64::Operand armCheckCompareConstant(s32 val); + vixl::aarch64::Operand armCheckLogicalConstant(u32 val); + + void 
SwitchToFarCode(bool emit_jump, vixl::aarch64::Condition cond = vixl::aarch64::Condition::al); + void SwitchToFarCodeIfBitSet(const vixl::aarch64::Register& reg, u32 bit); + void SwitchToFarCodeIfRegZeroOrNonZero(const vixl::aarch64::Register& reg, bool nonzero); + void SwitchToNearCode(bool emit_jump, vixl::aarch64::Condition cond = vixl::aarch64::Condition::al); + + void AssertRegOrConstS(CompileFlags cf) const; + void AssertRegOrConstT(CompileFlags cf) const; + vixl::aarch64::MemOperand MipsPtr(Reg r) const; + vixl::aarch64::WRegister CFGetRegD(CompileFlags cf) const; + vixl::aarch64::WRegister CFGetRegS(CompileFlags cf) const; + vixl::aarch64::WRegister CFGetRegT(CompileFlags cf) const; + vixl::aarch64::WRegister CFGetRegLO(CompileFlags cf) const; + vixl::aarch64::WRegister CFGetRegHI(CompileFlags cf) const; + + void MoveSToReg(const vixl::aarch64::WRegister& dst, CompileFlags cf); + void MoveTToReg(const vixl::aarch64::WRegister& dst, CompileFlags cf); + void MoveMIPSRegToReg(const vixl::aarch64::WRegister& dst, Reg reg); + + std::unique_ptr m_emitter; + std::unique_ptr m_far_emitter; + vixl::aarch64::Assembler* armAsm; + +#ifdef VIXL_DEBUG + std::unique_ptr m_emitter_check; + std::unique_ptr m_far_emitter_check; +#endif +}; + +} // namespace CPU::NewRec diff --git a/src/core/cpu_newrec_compiler_riscv64.cpp b/src/core/cpu_newrec_compiler_riscv64.cpp new file mode 100644 index 000000000..88ad4783e --- /dev/null +++ b/src/core/cpu_newrec_compiler_riscv64.cpp @@ -0,0 +1,2453 @@ +// SPDX-FileCopyrightText: 2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#include "cpu_newrec_compiler_riscv64.h" +#include "common/align.h" +#include "common/assert.h" +#include "common/log.h" +#include "common/string_util.h" +#include "cpu_code_cache_private.h" +#include "cpu_core_private.h" +#include "cpu_recompiler_thunks.h" +#include "gte.h" +#include "settings.h" +#include "timing_event.h" +#include +Log_SetChannel(CPU::NewRec); + +#ifdef ENABLE_HOST_DISASSEMBLY +extern "C" { +#include "riscv-disas.h" +} +#endif + +// For LW/SW/etc. 
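// Note that the macro below expands to *two* arguments, "offset-from-g_state, RSTATE", which slot
// straight into biscuit's load/store signatures of the form LW(rd, imm, rs). For example (sketch):
//
//   rvAsm->LW(RARG1, PTR(&g_state.pending_ticks));
//   // expands to: rvAsm->LW(RARG1, (u32)((u8*)&g_state.pending_ticks - (u8*)&g_state), RSTATE);
//
// This relies on the accessed State fields sitting within the signed 12-bit I-type displacement
// reachable from RSTATE.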
+#define PTR(x) ((u32)(((u8*)(x)) - ((u8*)&g_state))), RSTATE + +static constexpr u32 BLOCK_LINK_SIZE = 8; // auipc+jr + +namespace CPU::NewRec { + +using namespace biscuit; + +using CPU::Recompiler::rvEmitCall; +using CPU::Recompiler::rvEmitDSExtW; +using CPU::Recompiler::rvEmitDUExtW; +using CPU::Recompiler::rvEmitJmp; +using CPU::Recompiler::rvEmitMov; +using CPU::Recompiler::rvEmitMov64; +using CPU::Recompiler::rvEmitSExtB; +using CPU::Recompiler::rvEmitSExtH; +using CPU::Recompiler::rvEmitUExtB; +using CPU::Recompiler::rvEmitUExtH; +using CPU::Recompiler::rvGetAddressImmediates; +using CPU::Recompiler::rvIsCallerSavedRegister; +using CPU::Recompiler::rvIsValidSExtITypeImm; +using CPU::Recompiler::rvMoveAddressToReg; + +RISCV64Compiler s_instance; +Compiler* g_compiler = &s_instance; + +} // namespace CPU::NewRec + +bool CPU::Recompiler::rvIsCallerSavedRegister(u32 id) +{ + return (id == 1 || (id >= 3 && id < 8) || (id >= 10 && id <= 17) || (id >= 28 && id <= 31)); +} + +bool CPU::Recompiler::rvIsValidSExtITypeImm(u32 imm) +{ + return (static_cast((static_cast(imm) << 20) >> 20) == imm); +} + +std::pair CPU::Recompiler::rvGetAddressImmediates(const void* cur, const void* target) +{ + const s64 disp = static_cast(reinterpret_cast(target) - reinterpret_cast(cur)); + Assert(disp >= static_cast(std::numeric_limits::min()) && + disp <= static_cast(std::numeric_limits::max())); + + const s64 hi = disp + 0x800; + const s64 lo = disp - (hi & 0xFFFFF000); + return std::make_pair(static_cast(hi >> 12), static_cast((lo << 52) >> 52)); +} + +void CPU::Recompiler::rvMoveAddressToReg(biscuit::Assembler* rvAsm, const biscuit::GPR& reg, const void* addr) +{ + const auto [hi, lo] = rvGetAddressImmediates(rvAsm->GetCursorPointer(), addr); + rvAsm->AUIPC(reg, hi); + rvAsm->ADDI(reg, reg, lo); +} + +void CPU::Recompiler::rvEmitMov(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, u32 imm) +{ + // Borrowed from biscuit, but doesn't emit an ADDI if the lower 12 bits are zero. + const u32 lower = imm & 0xFFF; + const u32 upper = (imm & 0xFFFFF000) >> 12; + const s32 simm = static_cast(imm); + if (rvIsValidSExtITypeImm(simm)) + { + rvAsm->ADDI(rd, biscuit::zero, static_cast(lower)); + } + else + { + const bool needs_increment = (lower & 0x800) != 0; + const u32 upper_imm = needs_increment ? upper + 1 : upper; + rvAsm->LUI(rd, upper_imm); + rvAsm->ADDI(rd, rd, static_cast(lower)); + } +} + +void CPU::Recompiler::rvEmitMov64(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& scratch, + u64 imm) +{ + // TODO: Make better.. 
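  // The sequence below builds a 64-bit constant from two 32-bit halves; the extra SLLI/SRLI pair
  // is needed because rvEmitMov() sign-extends on RV64, so the low half has to be re-zero-extended
  // before the add. As a sketch:
  //
  //   rd      = (s64)(s32)(u32)(imm >> 32);   // upper half (sign extension is shifted out below)
  //   scratch = (s64)(s32)(u32)imm;           // lower half, sign-extended by ADDI on RV64
  //   rd    <<= 32;
  //   scratch = (u64)(scratch << 32) >> 32;   // re-zero-extend the low half
  //   rd     += scratch;
  //
  // (Related: rvGetAddressImmediates() above splits a PC-relative displacement for AUIPC+ADDI as
  // hi = (disp + 0x800) >> 12, lo = disp - (hi << 12); the +0x800 rounds so that lo always fits
  // the signed 12-bit ADDI immediate.)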
+ rvEmitMov(rvAsm, rd, static_cast(imm >> 32)); + rvEmitMov(rvAsm, scratch, static_cast(imm)); + rvAsm->SLLI64(rd, rd, 32); + rvAsm->SLLI64(scratch, scratch, 32); + rvAsm->SRLI64(scratch, scratch, 32); + rvAsm->ADD(rd, rd, scratch); +} + +u32 CPU::Recompiler::rvEmitJmp(biscuit::Assembler* rvAsm, const void* ptr, const biscuit::GPR& link_reg) +{ + // TODO: use J if displacement is <1MB, needs a bool because backpatch must be 8 bytes + const auto [hi, lo] = rvGetAddressImmediates(rvAsm->GetCursorPointer(), ptr); + rvAsm->AUIPC(RSCRATCH, hi); + rvAsm->JALR(link_reg, lo, RSCRATCH); + return 8; +} + +u32 CPU::Recompiler::rvEmitCall(biscuit::Assembler* rvAsm, const void* ptr) +{ + return rvEmitJmp(rvAsm, ptr, biscuit::ra); +} + +void CPU::Recompiler::rvEmitSExtB(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvAsm->SLLI(rd, rs, 24); + rvAsm->SRAIW(rd, rd, 24); +} + +void CPU::Recompiler::rvEmitUExtB(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvAsm->ANDI(rd, rs, 0xFF); +} + +void CPU::Recompiler::rvEmitSExtH(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvAsm->SLLI(rd, rs, 16); + rvAsm->SRAIW(rd, rd, 16); +} + +void CPU::Recompiler::rvEmitUExtH(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvAsm->SLLI(rd, rs, 16); + rvAsm->SRLI(rd, rd, 16); +} + +void CPU::Recompiler::rvEmitDSExtW(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvAsm->ADDIW(rd, rs, 0); +} + +void CPU::Recompiler::rvEmitDUExtW(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvAsm->SLLI64(rd, rs, 32); + rvAsm->SRLI64(rd, rd, 32); +} + +void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size) +{ +#ifdef ENABLE_HOST_DISASSEMBLY + const u8* cur = static_cast(start); + const u8* end = cur + size; + char buf[256]; + while (cur < end) + { + rv_inst inst; + size_t instlen; + inst_fetch(cur, &inst, &instlen); + disasm_inst(buf, std::size(buf), rv64, static_cast(reinterpret_cast(cur)), inst); + Log_DebugPrintf("\t0x%016" PRIx64 "\t%s", static_cast(reinterpret_cast(cur)), buf); + cur += instlen; + } +#else + Log_ErrorPrint("Not compiled with ENABLE_HOST_DISASSEMBLY."); +#endif +} + +u32 CPU::CodeCache::GetHostInstructionCount(const void* start, u32 size) +{ +#ifdef ENABLE_HOST_DISASSEMBLY + const u8* cur = static_cast(start); + const u8* end = cur + size; + u32 icount = 0; + while (cur < end) + { + rv_inst inst; + size_t instlen; + inst_fetch(cur, &inst, &instlen); + cur += instlen; + icount++; + } + return icount; +#else + Log_ErrorPrint("Not compiled with ENABLE_HOST_DISASSEMBLY."); + return 0; +#endif +} + +u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size) +{ + using namespace CPU::Recompiler; + using namespace biscuit; + + Assembler actual_asm(static_cast(code), code_size); + Assembler* rvAsm = &actual_asm; + + Label dispatch; + + g_enter_recompiler = reinterpret_cast(rvAsm->GetCursorPointer()); + { + // TODO: reserve some space for saving caller-saved registers + + // Need the CPU state for basically everything :-) + rvMoveAddressToReg(rvAsm, RSTATE, &g_state); + + // Fastmem setup + if (IsUsingFastmem()) + rvAsm->LD(RMEMBASE, PTR(&g_state.fastmem_base)); + + // Downcount isn't set on entry, so we need to initialize it + rvMoveAddressToReg(rvAsm, RARG1, TimingEvents::GetHeadEventPtr()); + rvAsm->LD(RARG1, 0, RARG1); + rvAsm->LW(RARG1, offsetof(TimingEvent, m_downcount), RARG1); + rvAsm->SW(RARG1, 
PTR(&g_state.downcount)); + + // Fall through to event dispatcher + } + + // check events then for frame done + g_check_events_and_dispatch = rvAsm->GetCursorPointer(); + { + Label skip_event_check; + rvAsm->LW(RARG1, PTR(&g_state.pending_ticks)); + rvAsm->LW(RARG2, PTR(&g_state.downcount)); + rvAsm->BLTU(RARG1, RARG2, &skip_event_check); + + g_run_events_and_dispatch = rvAsm->GetCursorPointer(); + rvEmitCall(rvAsm, reinterpret_cast(&TimingEvents::RunEvents)); + + rvAsm->Bind(&skip_event_check); + } + + // TODO: align? + g_dispatcher = rvAsm->GetCursorPointer(); + { + rvAsm->Bind(&dispatch); + + // x9 <- s_fast_map[pc >> 16] + rvAsm->LWU(RARG1, PTR(&g_state.pc)); + rvMoveAddressToReg(rvAsm, RARG3, g_code_lut.data()); + rvAsm->SRLI(RARG2, RARG1, 16); + rvAsm->SLLI(RARG1, RARG1, 1); + rvAsm->SLLI(RARG2, RARG2, 3); + rvAsm->ADD(RARG2, RARG2, RARG3); + rvAsm->LD(RARG2, 0, RARG2); + + // blr(x9[pc * 2]) (fast_map[pc >> 2]) + rvAsm->ADD(RARG1, RARG1, RARG2); + rvAsm->LD(RARG1, 0, RARG1); + rvAsm->JR(RARG1); + } + + g_compile_or_revalidate_block = rvAsm->GetCursorPointer(); + { + rvAsm->LW(RARG1, PTR(&g_state.pc)); + rvEmitCall(rvAsm, reinterpret_cast(&CompileOrRevalidateBlock)); + rvAsm->J(&dispatch); + } + + g_discard_and_recompile_block = rvAsm->GetCursorPointer(); + { + rvAsm->LW(RARG1, PTR(&g_state.pc)); + rvEmitCall(rvAsm, reinterpret_cast(&DiscardAndRecompileBlock)); + rvAsm->J(&dispatch); + } + + g_interpret_block = rvAsm->GetCursorPointer(); + { + rvEmitCall(rvAsm, CodeCache::GetInterpretUncachedBlockFunction()); + rvAsm->J(&dispatch); + } + + // TODO: align? + + return static_cast(rvAsm->GetCodeBuffer().GetSizeInBytes()); +} + +u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache) +{ + // TODO: get rid of assembler construction here + { + biscuit::Assembler assembler(static_cast(code), BLOCK_LINK_SIZE); + CPU::Recompiler::rvEmitCall(&assembler, dst); + + DebugAssert(assembler.GetCodeBuffer().GetSizeInBytes() <= BLOCK_LINK_SIZE); + if (assembler.GetCodeBuffer().GetRemainingBytes() > 0) + assembler.NOP(); + } + + if (flush_icache) + JitCodeBuffer::FlushInstructionCache(code, BLOCK_LINK_SIZE); + + return BLOCK_LINK_SIZE; +} + +CPU::NewRec::RISCV64Compiler::RISCV64Compiler() = default; + +CPU::NewRec::RISCV64Compiler::~RISCV64Compiler() = default; + +const void* CPU::NewRec::RISCV64Compiler::GetCurrentCodePointer() +{ + return rvAsm->GetCursorPointer(); +} + +void CPU::NewRec::RISCV64Compiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, + u8* far_code_buffer, u32 far_code_space) +{ + Compiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space); + + // TODO: don't recreate this every time.. + DebugAssert(!m_emitter && !m_far_emitter && !rvAsm); + m_emitter = std::make_unique(code_buffer, code_buffer_space); + m_far_emitter = std::make_unique(far_code_buffer, far_code_space); + rvAsm = m_emitter.get(); + + // Need to wipe it out so it's correct when toggling fastmem. + m_host_regs = {}; + + const u32 membase_idx = CodeCache::IsUsingFastmem() ? RMEMBASE.Index() : NUM_HOST_REGS; + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& hra = m_host_regs[i]; + + if (i == RARG1.Index() || i == RARG2.Index() || i == RARG3.Index() || i == RSCRATCH.Index() || + i == RSTATE.Index() || i == membase_idx || i < 5 /* zero, ra, sp, gp, tp */) + { + continue; + } + + hra.flags = HR_USABLE | (rvIsCallerSavedRegister(i) ? 
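    // For reference, the dispatcher emitted in EmitASMFunctions() above resolves the next block
    // as, in effect,
    //
    //   code = g_code_lut[pc >> 16][(pc & 0xFFFF) >> 2];
    //
    // The "pc << 1" / "(pc >> 16) << 3" arithmetic is the same lookup with the scaling folded in:
    // entries are 8-byte pointers and pc's low two bits are always zero, so (pc & 0xFFFF) * 2 ==
    // ((pc & 0xFFFF) >> 2) * 8; the per-page pointers are evidently biased so that adding the full
    // pc * 2 (rather than the masked offset) lands on the right slot.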
0 : HR_CALLEE_SAVED); + } +} + +void CPU::NewRec::RISCV64Compiler::SwitchToFarCode( + bool emit_jump, + void (biscuit::Assembler::*inverted_cond)(biscuit::GPR, biscuit::GPR, biscuit::Label*) /* = nullptr */, + const biscuit::GPR& rs1 /* = biscuit::zero */, const biscuit::GPR& rs2 /* = biscuit::zero */) +{ + DebugAssert(rvAsm == m_emitter.get()); + if (emit_jump) + { + const void* target = m_far_emitter->GetCursorPointer(); + if (inverted_cond) + { + Label skip; + (rvAsm->*inverted_cond)(rs1, rs2, &skip); + rvEmitJmp(rvAsm, target); + rvAsm->Bind(&skip); + } + else + { + rvEmitCall(rvAsm, target); + } + } + rvAsm = m_far_emitter.get(); +} + +void CPU::NewRec::RISCV64Compiler::SwitchToNearCode(bool emit_jump) +{ + DebugAssert(rvAsm == m_far_emitter.get()); + if (emit_jump) + rvEmitJmp(rvAsm, m_emitter->GetCursorPointer()); + rvAsm = m_emitter.get(); +} + +void CPU::NewRec::RISCV64Compiler::EmitMov(const biscuit::GPR& dst, u32 val) +{ + rvEmitMov(rvAsm, dst, val); +} + +void CPU::NewRec::RISCV64Compiler::EmitCall(const void* ptr) +{ + rvEmitCall(rvAsm, ptr); +} + +void CPU::NewRec::RISCV64Compiler::SafeImmSExtIType(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm, + void (biscuit::Assembler::*iop)(GPR, GPR, u32), + void (biscuit::Assembler::*rop)(GPR, GPR, GPR)) +{ + DebugAssert(rd != RSCRATCH && rs != RSCRATCH); + + if (rvIsValidSExtITypeImm(imm)) + { + (rvAsm->*iop)(rd, rs, imm); + return; + } + + rvEmitMov(rvAsm, RSCRATCH, imm); + (rvAsm->*rop)(rd, rs, RSCRATCH); +} + +void CPU::NewRec::RISCV64Compiler::SafeADDI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm) +{ + SafeImmSExtIType(rd, rs, imm, reinterpret_cast(&Assembler::ADDI), + &Assembler::ADD); +} + +void CPU::NewRec::RISCV64Compiler::SafeADDIW(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm) +{ + SafeImmSExtIType(rd, rs, imm, reinterpret_cast(&Assembler::ADDIW), + &Assembler::ADDW); +} + +void CPU::NewRec::RISCV64Compiler::SafeSUBIW(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm) +{ + const u32 nimm = static_cast(-static_cast(imm)); + SafeImmSExtIType(rd, rs, nimm, reinterpret_cast(&Assembler::ADDIW), + &Assembler::ADDW); +} + +void CPU::NewRec::RISCV64Compiler::SafeANDI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm) +{ + SafeImmSExtIType(rd, rs, imm, &Assembler::ANDI, &Assembler::AND); +} + +void CPU::NewRec::RISCV64Compiler::SafeORI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm) +{ + SafeImmSExtIType(rd, rs, imm, &Assembler::ORI, &Assembler::OR); +} + +void CPU::NewRec::RISCV64Compiler::SafeXORI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm) +{ + SafeImmSExtIType(rd, rs, imm, &Assembler::XORI, &Assembler::XOR); +} + +void CPU::NewRec::RISCV64Compiler::SafeSLTI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm) +{ + SafeImmSExtIType(rd, rs, imm, reinterpret_cast(&Assembler::SLTI), + &Assembler::SLT); +} + +void CPU::NewRec::RISCV64Compiler::SafeSLTIU(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm) +{ + SafeImmSExtIType(rd, rs, imm, reinterpret_cast(&Assembler::SLTIU), + &Assembler::SLTU); +} + +void CPU::NewRec::RISCV64Compiler::EmitSExtB(const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvEmitSExtB(rvAsm, rd, rs); +} + +void CPU::NewRec::RISCV64Compiler::EmitUExtB(const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvEmitUExtB(rvAsm, rd, rs); +} + +void CPU::NewRec::RISCV64Compiler::EmitSExtH(const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvEmitSExtH(rvAsm, rd, rs); +} + +void CPU::NewRec::RISCV64Compiler::EmitUExtH(const biscuit::GPR& rd, const 
biscuit::GPR& rs) +{ + rvEmitUExtH(rvAsm, rd, rs); +} + +void CPU::NewRec::RISCV64Compiler::EmitDSExtW(const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvEmitDSExtW(rvAsm, rd, rs); +} + +void CPU::NewRec::RISCV64Compiler::EmitDUExtW(const biscuit::GPR& rd, const biscuit::GPR& rs) +{ + rvEmitDUExtW(rvAsm, rd, rs); +} + +void CPU::NewRec::RISCV64Compiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size) +{ + // store it first to reduce code size, because we can offset + // TODO: 64-bit displacement is needed :/ + // rvMoveAddressToReg(rvAsm, RARG1, ram_ptr); + // rvMoveAddressToReg(rvAsm, RARG2, shadow_ptr); + rvEmitMov64(rvAsm, RARG1, RSCRATCH, static_cast(reinterpret_cast(ram_ptr))); + rvEmitMov64(rvAsm, RARG2, RSCRATCH, static_cast(reinterpret_cast(shadow_ptr))); + + u32 offset = 0; + Label block_changed; + + while (size >= 8) + { + rvAsm->LD(RARG3, offset, RARG1); + rvAsm->LD(RSCRATCH, offset, RARG2); + rvAsm->BNE(RARG3, RSCRATCH, &block_changed); + offset += 8; + size -= 8; + } + + while (size >= 4) + { + rvAsm->LWU(RARG3, offset, RARG1); + rvAsm->LWU(RSCRATCH, offset, RARG2); + rvAsm->BNE(RARG3, RSCRATCH, &block_changed); + offset += 4; + size -= 4; + } + + DebugAssert(size == 0); + + Label block_unchanged; + rvAsm->J(&block_unchanged); + rvAsm->Bind(&block_changed); + rvEmitJmp(rvAsm, CodeCache::g_discard_and_recompile_block); + rvAsm->Bind(&block_unchanged); +} + +void CPU::NewRec::RISCV64Compiler::GenerateICacheCheckAndUpdate() +{ + if (GetSegmentForAddress(m_block->pc) >= Segment::KSEG1) + { + rvAsm->LW(RARG1, PTR(&g_state.pending_ticks)); + SafeADDIW(RARG1, RARG1, static_cast(m_block->uncached_fetch_ticks)); + rvAsm->SW(RARG1, PTR(&g_state.pending_ticks)); + } + else + { + const auto& ticks_reg = RARG1; + const auto& current_tag_reg = RARG2; + const auto& existing_tag_reg = RARG3; + + VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK; + rvAsm->LW(ticks_reg, PTR(&g_state.pending_ticks)); + rvEmitMov(rvAsm, current_tag_reg, current_pc); + + for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE) + { + const TickCount fill_ticks = GetICacheFillTicks(current_pc); + if (fill_ticks <= 0) + continue; + + const u32 line = GetICacheLine(current_pc); + const u32 offset = offsetof(State, icache_tags) + (line * sizeof(u32)); + + // TODO: Verify sign extension here... 
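+ // Per cache line: if the stored tag differs from the current tag, write the
+ // new tag and charge the fill time. Roughly:
+ //   if (icache_tags[line] != tag) { icache_tags[line] = tag; ticks += fill_ticks; }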
+ Label cache_hit; + rvAsm->LW(existing_tag_reg, offset, RSTATE); + rvAsm->BEQ(existing_tag_reg, current_tag_reg, &cache_hit); + + rvAsm->SW(current_tag_reg, offset, RSTATE); + SafeADDIW(ticks_reg, ticks_reg, static_cast(fill_ticks)); + rvAsm->Bind(&cache_hit); + + if (i != (m_block->icache_line_count - 1)) + SafeADDIW(current_tag_reg, current_tag_reg, ICACHE_LINE_SIZE); + } + + rvAsm->SW(ticks_reg, PTR(&g_state.pending_ticks)); + } +} + +void CPU::NewRec::RISCV64Compiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/, + s32 arg3reg /*= -1*/) +{ + if (arg1reg >= 0 && arg1reg != static_cast(RARG1.Index())) + rvAsm->MV(RARG1, GPR(arg1reg)); + if (arg1reg >= 0 && arg2reg != static_cast(RARG2.Index())) + rvAsm->MV(RARG2, GPR(arg2reg)); + if (arg1reg >= 0 && arg3reg != static_cast(RARG3.Index())) + rvAsm->MV(RARG3, GPR(arg3reg)); + EmitCall(func); +} + +void CPU::NewRec::RISCV64Compiler::EndBlock(const std::optional& newpc, bool do_event_test) +{ + if (newpc.has_value()) + { + if (m_dirty_pc || m_compiler_pc != newpc) + { + EmitMov(RSCRATCH, newpc.value()); + rvAsm->SW(RSCRATCH, PTR(&g_state.pc)); + } + } + m_dirty_pc = false; + + // flush regs + Flush(FLUSH_END_BLOCK); + EndAndLinkBlock(newpc, do_event_test); +} + +void CPU::NewRec::RISCV64Compiler::EndBlockWithException(Exception excode) +{ + // flush regs, but not pc, it's going to get overwritten + // flush cycles because of the GTE instruction stuff... + Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION); + + // TODO: flush load delay + // TODO: break for pcdrv + + EmitMov(RARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false, + inst->cop.cop_n)); + EmitMov(RARG2, m_current_instruction_pc); + EmitCall(reinterpret_cast(static_cast(&CPU::RaiseException))); + m_dirty_pc = false; + + EndAndLinkBlock(std::nullopt, true); +} + +void CPU::NewRec::RISCV64Compiler::EndAndLinkBlock(const std::optional& newpc, bool do_event_test) +{ + // event test + // pc should've been flushed + DebugAssert(!m_dirty_pc); + + // TODO: try extracting this to a function + // TODO: move the cycle flush in here.. + + // save cycles for event test + const TickCount cycles = std::exchange(m_cycles, 0); + + // pending_ticks += cycles + // if (pending_ticks >= downcount) { dispatch_event(); } + if (do_event_test || m_gte_done_cycle > cycles || cycles > 0) + rvAsm->LW(RARG1, PTR(&g_state.pending_ticks)); + if (do_event_test) + rvAsm->LW(RARG2, PTR(&g_state.downcount)); + if (cycles > 0) + { + SafeADDIW(RARG1, RARG1, cycles); + rvAsm->SW(RARG1, PTR(&g_state.pending_ticks)); + } + if (m_gte_done_cycle > cycles) + { + SafeADDIW(RARG2, RARG1, m_gte_done_cycle - cycles); + rvAsm->SW(RARG1, PTR(&g_state.gte_completion_tick)); + } + + if (do_event_test) + { + // TODO: see if we can do a far jump somehow with this.. + Label cont; + rvAsm->BLT(RARG1, RARG2, &cont); + rvEmitJmp(rvAsm, CodeCache::g_run_events_and_dispatch); + rvAsm->Bind(&cont); + } + + // jump to dispatcher or next block + if (!newpc.has_value()) + { + rvEmitJmp(rvAsm, CodeCache::g_dispatcher); + } + else + { + if (newpc.value() == m_block->pc) + { + // Special case: ourselves! No need to backlink then. 
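+ // GetBufferPointer(0) is the start of this block's near-code buffer, so the
+ // emitted jump simply loops back to the top of the block being compiled.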
+ Log_DebugPrintf("Linking block at %08X to self", m_block->pc); + rvEmitJmp(rvAsm, rvAsm->GetBufferPointer(0)); + } + else + { + const void* target = CreateBlockLink(m_block, rvAsm->GetCursorPointer(), newpc.value()); + rvEmitJmp(rvAsm, target); + } + } + + m_block_ended = true; +} + +const void* CPU::NewRec::RISCV64Compiler::EndCompile(u32* code_size, u32* far_code_size) +{ + u8* const code = m_emitter->GetBufferPointer(0); + *code_size = static_cast(m_emitter->GetCodeBuffer().GetSizeInBytes()); + *far_code_size = static_cast(m_far_emitter->GetCodeBuffer().GetSizeInBytes()); + rvAsm = nullptr; + m_far_emitter.reset(); + m_emitter.reset(); + return code; +} + +const char* CPU::NewRec::RISCV64Compiler::GetHostRegName(u32 reg) const +{ + static constexpr std::array reg64_names = { + {"zero", "ra", "sp", "gp", "tp", "t0", "t1", "t2", "s0", "s1", "a0", "a1", "a2", "a3", "a4", "a5", + "a6", "a7", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "s11", "t3", "t4", "t5", "t6"}}; + return (reg < reg64_names.size()) ? reg64_names[reg] : "UNKNOWN"; +} + +void CPU::NewRec::RISCV64Compiler::LoadHostRegWithConstant(u32 reg, u32 val) +{ + EmitMov(GPR(reg), val); +} + +void CPU::NewRec::RISCV64Compiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr) +{ + rvAsm->LW(GPR(reg), PTR(ptr)); +} + +void CPU::NewRec::RISCV64Compiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr) +{ + rvAsm->SW(GPR(reg), PTR(ptr)); +} + +void CPU::NewRec::RISCV64Compiler::StoreConstantToCPUPointer(u32 val, const void* ptr) +{ + if (val == 0) + { + rvAsm->SW(zero, PTR(ptr)); + return; + } + + EmitMov(RSCRATCH, val); + rvAsm->SW(RSCRATCH, PTR(ptr)); +} + +void CPU::NewRec::RISCV64Compiler::CopyHostReg(u32 dst, u32 src) +{ + if (src != dst) + rvAsm->MV(GPR(dst), GPR(src)); +} + +void CPU::NewRec::RISCV64Compiler::AssertRegOrConstS(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_s || cf.const_s); +} + +void CPU::NewRec::RISCV64Compiler::AssertRegOrConstT(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_t || cf.const_t); +} + +biscuit::GPR CPU::NewRec::RISCV64Compiler::CFGetSafeRegS(CompileFlags cf, const biscuit::GPR& temp_reg) +{ + if (cf.valid_host_s) + { + return GPR(cf.host_s); + } + else if (cf.const_s) + { + if (HasConstantRegValue(cf.MipsS(), 0)) + return zero; + + EmitMov(temp_reg, GetConstantRegU32(cf.MipsS())); + return temp_reg; + } + else + { + Log_WarningPrintf("Hit memory path in CFGetSafeRegS() for %s", GetRegName(cf.MipsS())); + rvAsm->LW(temp_reg, PTR(&g_state.regs.r[cf.mips_s])); + return temp_reg; + } +} + +biscuit::GPR CPU::NewRec::RISCV64Compiler::CFGetSafeRegT(CompileFlags cf, const biscuit::GPR& temp_reg) +{ + if (cf.valid_host_t) + { + return GPR(cf.host_t); + } + else if (cf.const_t) + { + if (HasConstantRegValue(cf.MipsT(), 0)) + return zero; + + EmitMov(temp_reg, GetConstantRegU32(cf.MipsT())); + return temp_reg; + } + else + { + Log_WarningPrintf("Hit memory path in CFGetSafeRegT() for %s", GetRegName(cf.MipsT())); + rvAsm->LW(temp_reg, PTR(&g_state.regs.r[cf.mips_t])); + return temp_reg; + } +} + +biscuit::GPR CPU::NewRec::RISCV64Compiler::CFGetRegD(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_d); + return GPR(cf.host_d); +} + +biscuit::GPR CPU::NewRec::RISCV64Compiler::CFGetRegS(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_s); + return GPR(cf.host_s); +} + +biscuit::GPR CPU::NewRec::RISCV64Compiler::CFGetRegT(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_t); + return GPR(cf.host_t); +} + +biscuit::GPR 
CPU::NewRec::RISCV64Compiler::CFGetRegLO(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_lo); + return GPR(cf.host_lo); +} + +biscuit::GPR CPU::NewRec::RISCV64Compiler::CFGetRegHI(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_hi); + return GPR(cf.host_hi); +} + +void CPU::NewRec::RISCV64Compiler::MoveSToReg(const biscuit::GPR& dst, CompileFlags cf) +{ + if (cf.valid_host_s) + { + if (cf.host_s != dst.Index()) + rvAsm->MV(dst, GPR(cf.host_s)); + } + else if (cf.const_s) + { + EmitMov(dst, GetConstantRegU32(cf.MipsS())); + } + else + { + Log_WarningPrintf("Hit memory path in MoveSToReg() for %s", GetRegName(cf.MipsS())); + rvAsm->LW(dst, PTR(&g_state.regs.r[cf.mips_s])); + } +} + +void CPU::NewRec::RISCV64Compiler::MoveTToReg(const biscuit::GPR& dst, CompileFlags cf) +{ + if (cf.valid_host_t) + { + if (cf.host_t != dst.Index()) + rvAsm->MV(dst, GPR(cf.host_t)); + } + else if (cf.const_t) + { + EmitMov(dst, GetConstantRegU32(cf.MipsT())); + } + else + { + Log_WarningPrintf("Hit memory path in MoveTToReg() for %s", GetRegName(cf.MipsT())); + rvAsm->LW(dst, PTR(&g_state.regs.r[cf.mips_t])); + } +} + +void CPU::NewRec::RISCV64Compiler::MoveMIPSRegToReg(const biscuit::GPR& dst, Reg reg) +{ + DebugAssert(reg < Reg::count); + if (const std::optional hreg = CheckHostReg(0, Compiler::HR_TYPE_CPU_REG, reg)) + rvAsm->MV(dst, GPR(hreg.value())); + else if (HasConstantReg(reg)) + EmitMov(dst, GetConstantRegU32(reg)); + else + rvAsm->LW(dst, PTR(&g_state.regs.r[static_cast(reg)])); +} + +void CPU::NewRec::RISCV64Compiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, + Reg arg2reg /* = Reg::count */, + Reg arg3reg /* = Reg::count */) +{ + DebugAssert(g_settings.gpu_pgxp_enable); + + Flush(FLUSH_FOR_C_CALL); + + if (arg2reg != Reg::count) + MoveMIPSRegToReg(RARG2, arg2reg); + if (arg3reg != Reg::count) + MoveMIPSRegToReg(RARG3, arg3reg); + + EmitMov(RARG1, arg1val); + EmitCall(func); +} + +void CPU::NewRec::RISCV64Compiler::Flush(u32 flags) +{ + Compiler::Flush(flags); + + if (flags & FLUSH_PC && m_dirty_pc) + { + StoreConstantToCPUPointer(m_compiler_pc, &g_state.pc); + m_dirty_pc = false; + } + + if (flags & FLUSH_INSTRUCTION_BITS) + { + // This sucks, but it's only used for fallbacks. + Panic("Not implemented"); + } + + if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty) + { + // This sucks :( + // TODO: make it a function? + rvAsm->LBU(RARG1, PTR(&g_state.load_delay_reg)); + rvAsm->LW(RARG2, PTR(&g_state.load_delay_value)); + rvAsm->SLLI(RARG1, RARG1, 2); // *4 + rvAsm->ADD(RARG1, RARG1, RSTATE); + rvAsm->SW(RARG2, offsetof(CPU::State, regs.r[0]), RARG1); + rvAsm->LI(RSCRATCH, static_cast(Reg::count)); + rvAsm->SB(RSCRATCH, PTR(&g_state.load_delay_reg)); + m_load_delay_dirty = false; + } + + if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count) + { + if (m_load_delay_value_register != NUM_HOST_REGS) + FreeHostReg(m_load_delay_value_register); + + EmitMov(RSCRATCH, static_cast(m_load_delay_register)); + rvAsm->SB(RSCRATCH, PTR(&g_state.load_delay_reg)); + m_load_delay_register = Reg::count; + m_load_delay_dirty = true; + } + + if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle) + { + // May as well flush cycles while we're here. + // GTE spanning blocks is very rare, we _could_ disable this for speed. 
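+ // The sequence below computes
+ //   pending_ticks = max(pending_ticks + m_cycles, gte_completion_tick)
+ // with a compare-and-branch, since base RV64I has no conditional-move instruction.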
+ rvAsm->LW(RARG1, PTR(&g_state.pending_ticks)); + rvAsm->LW(RARG2, PTR(&g_state.gte_completion_tick)); + if (m_cycles > 0) + { + SafeADDIW(RARG1, RARG1, m_cycles); + m_cycles = 0; + } + Label no_stall; + rvAsm->BGE(RARG1, RARG2, &no_stall); + rvAsm->MV(RARG1, RARG2); + rvAsm->Bind(&no_stall); + rvAsm->SW(RARG1, PTR(&g_state.pending_ticks)); + m_dirty_gte_done_cycle = false; + } + + if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles) + { + rvAsm->LW(RARG1, PTR(&g_state.pending_ticks)); + + // update cycles at the same time + if (flags & FLUSH_CYCLES && m_cycles > 0) + { + SafeADDIW(RARG1, RARG1, m_cycles); + rvAsm->SW(RARG1, PTR(&g_state.pending_ticks)); + m_gte_done_cycle -= m_cycles; + m_cycles = 0; + } + + SafeADDIW(RARG1, RARG1, m_gte_done_cycle); + rvAsm->SW(RARG1, PTR(&g_state.gte_completion_tick)); + m_gte_done_cycle = 0; + m_dirty_gte_done_cycle = true; + } + + if (flags & FLUSH_CYCLES && m_cycles > 0) + { + rvAsm->LW(RARG1, PTR(&g_state.pending_ticks)); + SafeADDIW(RARG1, RARG1, m_cycles); + rvAsm->SW(RARG1, PTR(&g_state.pending_ticks)); + m_gte_done_cycle = std::max(m_gte_done_cycle - m_cycles, 0); + m_cycles = 0; + } +} + +void CPU::NewRec::RISCV64Compiler::Compile_Fallback() +{ + Flush(FLUSH_FOR_INTERPRETER); + +#if 0 + cg->call(&CPU::Recompiler::Thunks::InterpretInstruction); + + // TODO: make me less garbage + // TODO: this is wrong, it flushes the load delay on the same cycle when we return. + // but nothing should be going through here.. + Label no_load_delay; + cg->movzx(RWARG1, cg->byte[PTR(&g_state.next_load_delay_reg)]); + cg->cmp(RWARG1, static_cast(Reg::count)); + cg->je(no_load_delay, CodeGenerator::T_SHORT); + cg->mov(RWARG2, cg->dword[PTR(&g_state.next_load_delay_value)]); + cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], RWARG1); + cg->mov(cg->dword[PTR(&g_state.load_delay_value)], RWARG2); + cg->mov(cg->byte[PTR(&g_state.next_load_delay_reg)], static_cast(Reg::count)); + cg->L(no_load_delay); + + m_load_delay_dirty = EMULATE_LOAD_DELAYS; +#else + Panic("Fixme"); +#endif +} + +void CPU::NewRec::RISCV64Compiler::CheckBranchTarget(const biscuit::GPR& pcreg) +{ + if (!g_settings.cpu_recompiler_memory_exceptions) + return; + + DebugAssert(pcreg != RSCRATCH); + rvAsm->ANDI(RSCRATCH, pcreg, 0x3); + SwitchToFarCode(true, &Assembler::BEQ, RSCRATCH, zero); + + BackupHostState(); + EndBlockWithException(Exception::AdEL); + + RestoreHostState(); + SwitchToNearCode(false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_jr(CompileFlags cf) +{ + const GPR pcreg = CFGetRegS(cf); + CheckBranchTarget(pcreg); + + rvAsm->SW(pcreg, PTR(&g_state.pc)); + + CompileBranchDelaySlot(false); + EndBlock(std::nullopt, true); +} + +void CPU::NewRec::RISCV64Compiler::Compile_jalr(CompileFlags cf) +{ + const GPR pcreg = CFGetRegS(cf); + if (MipsD() != Reg::zero) + SetConstantReg(MipsD(), GetBranchReturnAddress(cf)); + + CheckBranchTarget(pcreg); + rvAsm->SW(pcreg, PTR(&g_state.pc)); + + CompileBranchDelaySlot(false); + EndBlock(std::nullopt, true); +} + +void CPU::NewRec::RISCV64Compiler::Compile_bxx(CompileFlags cf, BranchCondition cond) +{ + AssertRegOrConstS(cf); + + const u32 taken_pc = GetConditionalBranchTarget(cf); + + Flush(FLUSH_FOR_BRANCH); + + DebugAssert(cf.valid_host_s); + + // MipsT() here should equal zero for zero branches. 
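+ // Only the eq/ne conditions actually compare against rt; the remaining
+ // conditions test rs against zero, so rt never needs a host register there.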
+ DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero); + + Label taken; + const GPR rs = CFGetRegS(cf); + switch (cond) + { + case BranchCondition::Equal: + case BranchCondition::NotEqual: + { + AssertRegOrConstT(cf); + if (cf.const_t && HasConstantRegValue(cf.MipsT(), 0)) + { + (cond == BranchCondition::Equal) ? rvAsm->BEQZ(rs, &taken) : rvAsm->BNEZ(rs, &taken); + } + else + { + const GPR rt = cf.valid_host_t ? CFGetRegT(cf) : RARG1; + if (!cf.valid_host_t) + MoveTToReg(RARG1, cf); + if (cond == Compiler::BranchCondition::Equal) + rvAsm->BEQ(rs, rt, &taken); + else + rvAsm->BNE(rs, rt, &taken); + } + } + break; + + case BranchCondition::GreaterThanZero: + { + rvAsm->BGTZ(rs, &taken); + } + break; + + case BranchCondition::GreaterEqualZero: + { + rvAsm->BGEZ(rs, &taken); + } + break; + + case BranchCondition::LessThanZero: + { + rvAsm->BLTZ(rs, &taken); + } + break; + + case BranchCondition::LessEqualZero: + { + rvAsm->BLEZ(rs, &taken); + } + break; + } + + BackupHostState(); + if (!cf.delay_slot_swapped) + CompileBranchDelaySlot(); + + EndBlock(m_compiler_pc, true); + + rvAsm->Bind(&taken); + + RestoreHostState(); + if (!cf.delay_slot_swapped) + CompileBranchDelaySlot(); + + EndBlock(taken_pc, true); +} + +void CPU::NewRec::RISCV64Compiler::Compile_addi(CompileFlags cf, bool overflow) +{ + const GPR rs = CFGetRegS(cf); + const GPR rt = CFGetRegT(cf); + if (const u32 imm = inst->i.imm_sext32(); imm != 0) + { + if (!overflow) + { + SafeADDIW(rt, rs, imm); + } + else + { + SafeADDI(RARG1, rs, imm); + SafeADDIW(rt, rs, imm); + TestOverflow(RARG1, rt, rt); + } + } + else if (rt.Index() != rs.Index()) + { + rvAsm->MV(rt, rs); + } +} + +void CPU::NewRec::RISCV64Compiler::Compile_addi(CompileFlags cf) +{ + Compile_addi(cf, g_settings.cpu_recompiler_memory_exceptions); +} + +void CPU::NewRec::RISCV64Compiler::Compile_addiu(CompileFlags cf) +{ + Compile_addi(cf, false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_slti(CompileFlags cf) +{ + Compile_slti(cf, true); +} + +void CPU::NewRec::RISCV64Compiler::Compile_sltiu(CompileFlags cf) +{ + Compile_slti(cf, false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_slti(CompileFlags cf, bool sign) +{ + if (sign) + SafeSLTI(CFGetRegT(cf), CFGetRegS(cf), inst->i.imm_sext32()); + else + SafeSLTIU(CFGetRegT(cf), CFGetRegS(cf), inst->i.imm_sext32()); +} + +void CPU::NewRec::RISCV64Compiler::Compile_andi(CompileFlags cf) +{ + const GPR rt = CFGetRegT(cf); + if (const u32 imm = inst->i.imm_zext32(); imm != 0) + SafeANDI(rt, CFGetRegS(cf), imm); + else + EmitMov(rt, 0); +} + +void CPU::NewRec::RISCV64Compiler::Compile_ori(CompileFlags cf) +{ + const GPR rt = CFGetRegT(cf); + const GPR rs = CFGetRegS(cf); + if (const u32 imm = inst->i.imm_zext32(); imm != 0) + SafeORI(rt, rs, imm); + else if (rt.Index() != rs.Index()) + rvAsm->MV(rt, rs); +} + +void CPU::NewRec::RISCV64Compiler::Compile_xori(CompileFlags cf) +{ + const GPR rt = CFGetRegT(cf); + const GPR rs = CFGetRegS(cf); + if (const u32 imm = inst->i.imm_zext32(); imm != 0) + SafeXORI(rt, rs, imm); + else if (rt.Index() != rs.Index()) + rvAsm->MV(rt, rs); +} + +void CPU::NewRec::RISCV64Compiler::Compile_shift( + CompileFlags cf, void (biscuit::Assembler::*op)(biscuit::GPR, biscuit::GPR, biscuit::GPR), + void (biscuit::Assembler::*op_const)(biscuit::GPR, biscuit::GPR, unsigned)) +{ + const GPR rd = CFGetRegD(cf); + const GPR rt = CFGetRegT(cf); + if (inst->r.shamt > 0) + (rvAsm->*op_const)(rd, rt, inst->r.shamt); + else if (rd.Index() != rt.Index()) 
+ rvAsm->MV(rd, rt); +} + +void CPU::NewRec::RISCV64Compiler::Compile_sll(CompileFlags cf) +{ + Compile_shift(cf, &Assembler::SLLW, &Assembler::SLLIW); +} + +void CPU::NewRec::RISCV64Compiler::Compile_srl(CompileFlags cf) +{ + Compile_shift(cf, &Assembler::SRLW, &Assembler::SRLIW); +} + +void CPU::NewRec::RISCV64Compiler::Compile_sra(CompileFlags cf) +{ + Compile_shift(cf, &Assembler::SRAW, &Assembler::SRAIW); +} + +void CPU::NewRec::RISCV64Compiler::Compile_variable_shift( + CompileFlags cf, void (biscuit::Assembler::*op)(biscuit::GPR, biscuit::GPR, biscuit::GPR), + void (biscuit::Assembler::*op_const)(biscuit::GPR, biscuit::GPR, unsigned)) +{ + const GPR rd = CFGetRegD(cf); + + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + const GPR rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2; + if (!cf.valid_host_t) + MoveTToReg(rt, cf); + + if (cf.const_s) + { + if (const u32 shift = GetConstantRegU32(cf.MipsS()); shift != 0) + (rvAsm->*op_const)(rd, rt, shift & 31u); + else if (rd.Index() != rt.Index()) + rvAsm->MV(rd, rt); + } + else + { + (rvAsm->*op)(rd, rt, CFGetRegS(cf)); + } +} + +void CPU::NewRec::RISCV64Compiler::Compile_sllv(CompileFlags cf) +{ + Compile_variable_shift(cf, &Assembler::SLLW, &Assembler::SLLIW); +} + +void CPU::NewRec::RISCV64Compiler::Compile_srlv(CompileFlags cf) +{ + Compile_variable_shift(cf, &Assembler::SRLW, &Assembler::SRLIW); +} + +void CPU::NewRec::RISCV64Compiler::Compile_srav(CompileFlags cf) +{ + Compile_variable_shift(cf, &Assembler::SRAW, &Assembler::SRAIW); +} + +void CPU::NewRec::RISCV64Compiler::Compile_mult(CompileFlags cf, bool sign) +{ + const GPR rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1; + if (!cf.valid_host_s) + MoveSToReg(rs, cf); + + const GPR rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2; + if (!cf.valid_host_t) + MoveTToReg(rt, cf); + + // TODO: if lo/hi gets killed, we can use a 32-bit multiply + const GPR lo = CFGetRegLO(cf); + const GPR hi = CFGetRegHI(cf); + + if (sign) + { + rvAsm->MUL(lo, rs, rt); + rvAsm->SRAI64(hi, lo, 32); + EmitDSExtW(lo, lo); + } + else + { + // Need to make it unsigned. + EmitDUExtW(RARG1, rs); + EmitDUExtW(RARG2, rt); + rvAsm->MUL(lo, RARG1, RARG2); + rvAsm->SRAI64(hi, lo, 32); + EmitDSExtW(lo, lo); + } +} + +void CPU::NewRec::RISCV64Compiler::Compile_mult(CompileFlags cf) +{ + Compile_mult(cf, true); +} + +void CPU::NewRec::RISCV64Compiler::Compile_multu(CompileFlags cf) +{ + Compile_mult(cf, false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_div(CompileFlags cf) +{ + // 36 Volume I: RISC-V User-Level ISA V2.2 + const GPR rs = cf.valid_host_s ? CFGetRegS(cf) : RARG1; + if (!cf.valid_host_s) + MoveSToReg(rs, cf); + + const GPR rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2; + if (!cf.valid_host_t) + MoveTToReg(rt, cf); + + const GPR rlo = CFGetRegLO(cf); + const GPR rhi = CFGetRegHI(cf); + + Label done; + Label not_divide_by_zero; + rvAsm->BNEZ(rt, ¬_divide_by_zero); + rvAsm->MV(rhi, rs); // hi = num + rvAsm->SRAI64(rlo, rs, 63); + rvAsm->ANDI(rlo, rlo, 2); + rvAsm->ADDI(rlo, rlo, -1); // lo = s >= 0 ? -1 : 1 + rvAsm->J(&done); + + rvAsm->Bind(¬_divide_by_zero); + Label not_unrepresentable; + EmitMov(RSCRATCH, static_cast(-1)); + rvAsm->BNE(rt, RSCRATCH, ¬_unrepresentable); + EmitMov(rlo, 0x80000000u); + rvAsm->BNE(rs, rlo, ¬_unrepresentable); + EmitMov(rhi, 0); + rvAsm->J(&done); + + rvAsm->Bind(¬_unrepresentable); + + rvAsm->DIVW(rlo, rs, rt); + rvAsm->REMW(rhi, rs, rt); + + rvAsm->Bind(&done); +} + +void CPU::NewRec::RISCV64Compiler::Compile_divu(CompileFlags cf) +{ + const GPR rs = cf.valid_host_s ? 
CFGetRegS(cf) : RARG1; + if (!cf.valid_host_s) + MoveSToReg(rs, cf); + + const GPR rt = cf.valid_host_t ? CFGetRegT(cf) : RARG2; + if (!cf.valid_host_t) + MoveTToReg(rt, cf); + + const GPR rlo = CFGetRegLO(cf); + const GPR rhi = CFGetRegHI(cf); + + // Semantics match? :-) + rvAsm->DIVUW(rlo, rs, rt); + rvAsm->REMUW(rhi, rs, rt); +} + +void CPU::NewRec::RISCV64Compiler::TestOverflow(const biscuit::GPR& long_res, const biscuit::GPR& res, + const biscuit::GPR& reg_to_discard) +{ + SwitchToFarCode(true, &Assembler::BEQ, long_res, res); + + BackupHostState(); + + // toss the result + ClearHostReg(reg_to_discard.Index()); + + EndBlockWithException(Exception::Ov); + + RestoreHostState(); + + SwitchToNearCode(false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_dst_op( + CompileFlags cf, void (biscuit::Assembler::*op)(biscuit::GPR, biscuit::GPR, biscuit::GPR), + void (RISCV64Compiler::*op_const)(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm), + void (biscuit::Assembler::*op_long)(biscuit::GPR, biscuit::GPR, biscuit::GPR), bool commutative, bool overflow) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + const GPR rd = CFGetRegD(cf); + + if (overflow) + { + const GPR rs = CFGetSafeRegS(cf, RARG1); + const GPR rt = CFGetSafeRegT(cf, RARG2); + (rvAsm->*op)(RARG3, rs, rt); + (rvAsm->*op_long)(rd, rs, rt); + TestOverflow(RARG3, rd, rd); + return; + } + + if (cf.valid_host_s && cf.valid_host_t) + { + (rvAsm->*op)(rd, CFGetRegS(cf), CFGetRegT(cf)); + } + else if (commutative && (cf.const_s || cf.const_t)) + { + const GPR src = cf.const_s ? CFGetRegT(cf) : CFGetRegS(cf); + if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0) + { + (this->*op_const)(rd, src, cv); + } + else + { + if (rd.Index() != src.Index()) + rvAsm->MV(rd, src); + overflow = false; + } + } + else if (cf.const_s) + { + if (HasConstantRegValue(cf.MipsS(), 0)) + { + (rvAsm->*op)(rd, zero, CFGetRegT(cf)); + } + else + { + EmitMov(RSCRATCH, GetConstantRegU32(cf.MipsS())); + (rvAsm->*op)(rd, RSCRATCH, CFGetRegT(cf)); + } + } + else if (cf.const_t) + { + const GPR rs = CFGetRegS(cf); + if (const u32 cv = GetConstantRegU32(cf.const_s ? 
cf.MipsS() : cf.MipsT()); cv != 0) + { + (this->*op_const)(rd, rs, cv); + } + else + { + if (rd.Index() != rs.Index()) + rvAsm->MV(rd, rs); + overflow = false; + } + } +} + +void CPU::NewRec::RISCV64Compiler::Compile_add(CompileFlags cf) +{ + Compile_dst_op(cf, &Assembler::ADDW, &RISCV64Compiler::SafeADDIW, &Assembler::ADD, true, + g_settings.cpu_recompiler_memory_exceptions); +} + +void CPU::NewRec::RISCV64Compiler::Compile_addu(CompileFlags cf) +{ + Compile_dst_op(cf, &Assembler::ADDW, &RISCV64Compiler::SafeADDIW, &Assembler::ADD, true, false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_sub(CompileFlags cf) +{ + Compile_dst_op(cf, &Assembler::SUBW, &RISCV64Compiler::SafeSUBIW, &Assembler::SUB, false, + g_settings.cpu_recompiler_memory_exceptions); +} + +void CPU::NewRec::RISCV64Compiler::Compile_subu(CompileFlags cf) +{ + Compile_dst_op(cf, &Assembler::SUBW, &RISCV64Compiler::SafeSUBIW, &Assembler::SUB, false, false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_and(CompileFlags cf) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + // special cases - and with self -> self, and with 0 -> 0 + const GPR regd = CFGetRegD(cf); + if (cf.MipsS() == cf.MipsT()) + { + rvAsm->MV(regd, CFGetRegS(cf)); + return; + } + else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0)) + { + EmitMov(regd, 0); + return; + } + + Compile_dst_op(cf, &Assembler::AND, &RISCV64Compiler::SafeANDI, &Assembler::AND, true, false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_or(CompileFlags cf) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + // or/nor with 0 -> no effect + const GPR regd = CFGetRegD(cf); + if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT()) + { + cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf); + return; + } + + Compile_dst_op(cf, &Assembler::OR, &RISCV64Compiler::SafeORI, &Assembler::OR, true, false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_xor(CompileFlags cf) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + const GPR regd = CFGetRegD(cf); + if (cf.MipsS() == cf.MipsT()) + { + // xor with self -> zero + EmitMov(regd, 0); + return; + } + else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0)) + { + // xor with zero -> no effect + cf.const_s ? 
MoveTToReg(regd, cf) : MoveSToReg(regd, cf); + return; + } + + Compile_dst_op(cf, &Assembler::XOR, &RISCV64Compiler::SafeXORI, &Assembler::XOR, true, false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_nor(CompileFlags cf) +{ + Compile_or(cf); + rvAsm->NOT(CFGetRegD(cf), CFGetRegD(cf)); +} + +void CPU::NewRec::RISCV64Compiler::Compile_slt(CompileFlags cf) +{ + Compile_slt(cf, true); +} + +void CPU::NewRec::RISCV64Compiler::Compile_sltu(CompileFlags cf) +{ + Compile_slt(cf, false); +} + +void CPU::NewRec::RISCV64Compiler::Compile_slt(CompileFlags cf, bool sign) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + + const GPR rd = CFGetRegD(cf); + const GPR rs = CFGetSafeRegS(cf, RARG1); + + if (cf.const_t && rvIsValidSExtITypeImm(GetConstantRegU32(cf.MipsT()))) + { + if (sign) + rvAsm->SLTI(rd, rs, GetConstantRegS32(cf.MipsT())); + else + rvAsm->SLTIU(rd, rs, GetConstantRegS32(cf.MipsT())); + } + else + { + const GPR rt = CFGetSafeRegT(cf, RARG2); + if (sign) + rvAsm->SLT(rd, rs, rt); + else + rvAsm->SLTU(rd, rs, rt); + } +} + +biscuit::GPR CPU::NewRec::RISCV64Compiler::ComputeLoadStoreAddressArg( + CompileFlags cf, const std::optional& address, const std::optional& reg) +{ + const u32 imm = inst->i.imm_sext32(); + if (cf.valid_host_s && imm == 0 && !reg.has_value()) + return CFGetRegS(cf); + + const GPR dst = reg.has_value() ? reg.value() : RARG1; + if (address.has_value()) + { + EmitMov(dst, address.value()); + } + else if (imm == 0) + { + if (cf.valid_host_s) + { + if (const GPR src = CFGetRegS(cf); src.Index() != dst.Index()) + rvAsm->MV(dst, CFGetRegS(cf)); + } + else + { + rvAsm->LW(dst, PTR(&g_state.regs.r[cf.mips_s])); + } + } + else + { + if (cf.valid_host_s) + { + SafeADDIW(dst, CFGetRegS(cf), inst->i.imm_sext32()); + } + else + { + rvAsm->LW(dst, PTR(&g_state.regs.r[cf.mips_s])); + SafeADDIW(dst, dst, inst->i.imm_sext32()); + } + } + + return dst; +} + +template +void CPU::NewRec::RISCV64Compiler::GenerateLoad(const biscuit::GPR& addr_reg, MemoryAccessSize size, bool sign, + const RegAllocFn& dst_reg_alloc) +{ + const bool checked = g_settings.cpu_recompiler_memory_exceptions; + if (!checked && CodeCache::IsUsingFastmem()) + { + m_cycles += Bus::RAM_READ_TICKS; + + // TODO: Make this better. If we're loading the address from state, we can use LWU instead, and skip this. + // TODO: LUT fastmem + const GPR dst = dst_reg_alloc(); + rvAsm->SLLI64(RSCRATCH, addr_reg, 32); + rvAsm->SRLI64(RSCRATCH, RSCRATCH, 32); + + if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) + { + DebugAssert(addr_reg.Index() != RARG3.Index()); + rvAsm->SRLI64(RARG3, RSCRATCH, Bus::FASTMEM_LUT_PAGE_SHIFT); + rvAsm->SLLI64(RARG3, RARG3, 8); + rvAsm->ADD(RARG3, RARG3, RMEMBASE); + rvAsm->LD(RARG3, 0, RARG3); + rvAsm->ADD(RSCRATCH, RSCRATCH, RARG3); + } + else + { + rvAsm->ADD(RSCRATCH, RSCRATCH, RMEMBASE); + } + + u8* start = m_emitter->GetCursorPointer(); + switch (size) + { + case MemoryAccessSize::Byte: + sign ? rvAsm->LB(dst, 0, RSCRATCH) : rvAsm->LBU(dst, 0, RSCRATCH); + break; + + case MemoryAccessSize::HalfWord: + sign ? rvAsm->LH(dst, 0, RSCRATCH) : rvAsm->LHU(dst, 0, RSCRATCH); + break; + + case MemoryAccessSize::Word: + rvAsm->LW(dst, 0, RSCRATCH); + break; + } + + // We need a nop, because the slowmem jump might be more than 1MB away. 
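+ // The access above plus this NOP reserves 8 bytes, the same size as the
+ // AUIPC+JALR pair rvEmitJmp() produces, so the site can later be backpatched
+ // with a jump to the slow-memory thunk (see CompileLoadStoreThunk()).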
+ rvAsm->NOP(); + + AddLoadStoreInfo(start, 8, addr_reg.Index(), dst.Index(), size, sign, true); + return; + } + + if (addr_reg.Index() != RARG1.Index()) + rvAsm->MV(RARG1, addr_reg); + + switch (size) + { + case MemoryAccessSize::Byte: + { + EmitCall(checked ? reinterpret_cast(&Recompiler::Thunks::ReadMemoryByte) : + reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryByte)); + } + break; + case MemoryAccessSize::HalfWord: + { + EmitCall(checked ? reinterpret_cast(&Recompiler::Thunks::ReadMemoryHalfWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryHalfWord)); + } + break; + case MemoryAccessSize::Word: + { + EmitCall(checked ? reinterpret_cast(&Recompiler::Thunks::ReadMemoryWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryWord)); + } + break; + } + + // TODO: turn this into an asm function instead + if (checked) + { + rvAsm->SRLI64(RSCRATCH, RRET, 63); + SwitchToFarCode(true, &Assembler::BEQ, RSCRATCH, zero); + BackupHostState(); + + // Need to stash this in a temp because of the flush. + const GPR temp = GPR(AllocateTempHostReg(HR_CALLEE_SAVED)); + rvAsm->NEG(temp, RRET); + rvAsm->SLLIW(temp, temp, 2); + + Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION); + + // cause_bits = (-result << 2) | BD | cop_n + SafeORI(RARG1, temp, + Cop0Registers::CAUSE::MakeValueForException( + static_cast(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)); + EmitMov(RARG2, m_current_instruction_pc); + EmitCall(reinterpret_cast(static_cast(&CPU::RaiseException))); + FreeHostReg(temp.Index()); + EndBlock(std::nullopt, true); + + RestoreHostState(); + SwitchToNearCode(false); + } + + const GPR dst_reg = dst_reg_alloc(); + switch (size) + { + case MemoryAccessSize::Byte: + { + sign ? EmitSExtB(dst_reg, RRET) : EmitUExtB(dst_reg, RRET); + } + break; + case MemoryAccessSize::HalfWord: + { + sign ? EmitSExtH(dst_reg, RRET) : EmitUExtH(dst_reg, RRET); + } + break; + case MemoryAccessSize::Word: + { + // Need to undo the zero-extend. + if (checked) + rvEmitDSExtW(rvAsm, dst_reg, RRET); + else if (dst_reg.Index() != RRET.Index()) + rvAsm->MV(dst_reg, RRET); + } + break; + } +} + +void CPU::NewRec::RISCV64Compiler::GenerateStore(const biscuit::GPR& addr_reg, const biscuit::GPR& value_reg, + MemoryAccessSize size) +{ + const bool checked = g_settings.cpu_recompiler_memory_exceptions; + if (!checked && CodeCache::IsUsingFastmem()) + { + DebugAssert(value_reg != RSCRATCH); + rvAsm->SLLI64(RSCRATCH, addr_reg, 32); + rvAsm->SRLI64(RSCRATCH, RSCRATCH, 32); + + if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) + { + DebugAssert(addr_reg.Index() != RARG3.Index()); + rvAsm->SRLI64(RARG3, RSCRATCH, Bus::FASTMEM_LUT_PAGE_SHIFT); + rvAsm->SLLI64(RARG3, RARG3, 8); + rvAsm->ADD(RARG3, RARG3, RMEMBASE); + rvAsm->LD(RARG3, 0, RARG3); + rvAsm->ADD(RSCRATCH, RSCRATCH, RARG3); + } + else + { + rvAsm->ADD(RSCRATCH, RSCRATCH, RMEMBASE); + } + + u8* start = m_emitter->GetCursorPointer(); + switch (size) + { + case MemoryAccessSize::Byte: + rvAsm->SB(value_reg, 0, RSCRATCH); + break; + + case MemoryAccessSize::HalfWord: + rvAsm->SH(value_reg, 0, RSCRATCH); + break; + + case MemoryAccessSize::Word: + rvAsm->SW(value_reg, 0, RSCRATCH); + break; + } + + // We need a nop, because the slowmem jump might be more than 1MB away. 
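+ // Same 8-byte reservation as the load path; the AddLoadStoreInfo() call below
+ // records the value register rather than a destination, since this is a store.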
+ rvAsm->NOP(); + + AddLoadStoreInfo(start, 8, addr_reg.Index(), value_reg.Index(), size, false, false); + return; + } + + if (addr_reg.Index() != RARG1.Index()) + rvAsm->MV(RARG1, addr_reg); + if (value_reg.Index() != RARG2.Index()) + rvAsm->MV(RARG2, value_reg); + + switch (size) + { + case MemoryAccessSize::Byte: + { + EmitCall(checked ? reinterpret_cast(&Recompiler::Thunks::WriteMemoryByte) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryByte)); + } + break; + case MemoryAccessSize::HalfWord: + { + EmitCall(checked ? reinterpret_cast(&Recompiler::Thunks::WriteMemoryHalfWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord)); + } + break; + case MemoryAccessSize::Word: + { + EmitCall(checked ? reinterpret_cast(&Recompiler::Thunks::WriteMemoryWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryWord)); + } + break; + } + + // TODO: turn this into an asm function instead + if (checked) + { + SwitchToFarCode(true, &Assembler::BEQ, RRET, zero); + BackupHostState(); + + // Need to stash this in a temp because of the flush. + const GPR temp = GPR(AllocateTempHostReg(HR_CALLEE_SAVED)); + rvAsm->SLLIW(temp, RRET, 2); + + Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION); + + // cause_bits = (result << 2) | BD | cop_n + SafeORI(RARG1, temp, + Cop0Registers::CAUSE::MakeValueForException( + static_cast(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)); + EmitMov(RARG2, m_current_instruction_pc); + EmitCall(reinterpret_cast(static_cast(&CPU::RaiseException))); + FreeHostReg(temp.Index()); + EndBlock(std::nullopt, true); + + RestoreHostState(); + SwitchToNearCode(false); + } +} + +void CPU::NewRec::RISCV64Compiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + FlushForLoadStore(address, false); + const GPR addr = ComputeLoadStoreAddressArg(cf, address); + GenerateLoad(addr, size, sign, [this, cf]() { + if (cf.MipsT() == Reg::zero) + return RRET; + + return GPR(AllocateHostReg(HR_MODE_WRITE, EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, + cf.MipsT())); + }); +} + +void CPU::NewRec::RISCV64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + DebugAssert(size == MemoryAccessSize::Word && !sign); + FlushForLoadStore(address, false); + + // TODO: if address is constant, this can be simplified.. + + // If we're coming from another block, just flush the load delay and hope for the best.. + if (m_load_delay_dirty) + UpdateLoadDelay(); + + // We'd need to be careful here if we weren't overwriting it.. + const GPR addr = GPR(AllocateHostReg(HR_CALLEE_SAVED, HR_TYPE_TEMP)); + ComputeLoadStoreAddressArg(cf, address, addr); + rvAsm->ANDI(RARG1, addr, ~0x3u); + GenerateLoad(RARG1, MemoryAccessSize::Word, false, []() { return RRET; }); + + if (inst->r.rt == Reg::zero) + { + FreeHostReg(addr.Index()); + return; + } + + // lwl/lwr from a load-delayed value takes the new value, but it itself, is load delayed, so the original value is + // never written back. NOTE: can't trust T in cf because of the flush + const Reg rt = inst->r.rt; + GPR value; + if (m_load_delay_register == rt) + { + const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ? 
+ AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) : + m_load_delay_value_register; + RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt); + value = GPR(existing_ld_rt); + } + else + { + if constexpr (EMULATE_LOAD_DELAYS) + { + value = GPR(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt)); + if (const std::optional rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value()) + rvAsm->MV(value, GPR(rtreg.value())); + else if (HasConstantReg(rt)) + EmitMov(value, GetConstantRegU32(rt)); + else + rvAsm->LW(value, PTR(&g_state.regs.r[static_cast(rt)])); + } + else + { + value = GPR(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt)); + } + } + + DebugAssert(value.Index() != RARG2.Index() && value.Index() != RARG3.Index()); + rvAsm->ANDI(RARG2, addr, 3); + rvAsm->SLLIW(RARG2, RARG2, 3); // *8 + EmitMov(RARG3, 24); + rvAsm->SUBW(RARG3, RARG3, RARG2); + + if (inst->op == InstructionOp::lwl) + { + // const u32 mask = UINT32_C(0x00FFFFFF) >> shift; + // new_value = (value & mask) | (RWRET << (24 - shift)); + EmitMov(addr, 0xFFFFFFu); + rvAsm->SRLW(addr, addr, RARG2); + rvAsm->AND(value, value, addr); + rvAsm->SLLW(RRET, RRET, RARG3); + rvAsm->OR(value, value, RRET); + } + else + { + // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift); + // new_value = (value & mask) | (RWRET >> shift); + rvAsm->SRLW(RRET, RRET, RARG2); + EmitMov(addr, 0xFFFFFF00u); + rvAsm->SLLW(addr, addr, RARG3); + rvAsm->AND(value, value, addr); + rvAsm->OR(value, value, RRET); + } + + FreeHostReg(addr.Index()); +} + +void CPU::NewRec::RISCV64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + FlushForLoadStore(address, false); + const GPR addr = ComputeLoadStoreAddressArg(cf, address); + GenerateLoad(addr, MemoryAccessSize::Word, false, []() { return RRET; }); + + const u32 index = static_cast(inst->r.rt.GetValue()); + const auto [ptr, action] = GetGTERegisterPointer(index, true); + switch (action) + { + case GTERegisterAccessAction::Ignore: + { + return; + } + + case GTERegisterAccessAction::Direct: + { + rvAsm->SW(RRET, PTR(ptr)); + return; + } + + case GTERegisterAccessAction::SignExtend16: + { + EmitSExtH(RRET, RRET); + rvAsm->SW(RRET, PTR(ptr)); + return; + } + + case GTERegisterAccessAction::ZeroExtend16: + { + EmitUExtH(RRET, RRET); + rvAsm->SW(RRET, PTR(ptr)); + return; + } + + case GTERegisterAccessAction::CallHandler: + { + Flush(FLUSH_FOR_C_CALL); + rvAsm->MV(RARG2, RRET); + EmitMov(RARG1, index); + EmitCall(reinterpret_cast(>E::WriteRegister)); + return; + } + + case GTERegisterAccessAction::PushFIFO: + { + // SXY0 <- SXY1 + // SXY1 <- SXY2 + // SXY2 <- SXYP + DebugAssert(RRET.Index() != RARG2.Index() && RRET.Index() != RARG3.Index()); + rvAsm->LW(RARG2, PTR(&g_state.gte_regs.SXY1[0])); + rvAsm->LW(RARG3, PTR(&g_state.gte_regs.SXY2[0])); + rvAsm->SW(RARG2, PTR(&g_state.gte_regs.SXY0[0])); + rvAsm->SW(RARG3, PTR(&g_state.gte_regs.SXY1[0])); + rvAsm->SW(RRET, PTR(&g_state.gte_regs.SXY2[0])); + return; + } + + default: + { + Panic("Unknown action"); + return; + } + } +} + +void CPU::NewRec::RISCV64Compiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + AssertRegOrConstS(cf); + AssertRegOrConstT(cf); + FlushForLoadStore(address, true); + const GPR addr = ComputeLoadStoreAddressArg(cf, address); + + if (!cf.valid_host_t) + MoveTToReg(RARG2, cf); + + GenerateStore(addr, cf.valid_host_t ? 
CFGetRegT(cf) : RARG2, size); +} + +void CPU::NewRec::RISCV64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + DebugAssert(size == MemoryAccessSize::Word && !sign); + FlushForLoadStore(address, true); + + // TODO: if address is constant, this can be simplified.. + // We'd need to be careful here if we weren't overwriting it.. + const GPR addr = GPR(AllocateHostReg(HR_CALLEE_SAVED, HR_TYPE_TEMP)); + ComputeLoadStoreAddressArg(cf, address, addr); + rvAsm->ANDI(RARG1, addr, ~0x3u); + GenerateLoad(RARG1, MemoryAccessSize::Word, false, []() { return RRET; }); + + // TODO: this can take over rt's value if it's no longer needed + // NOTE: can't trust T in cf because of the flush + const Reg rt = inst->r.rt; + const GPR value = RARG2; + if (const std::optional rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value()) + rvAsm->MV(value, GPR(rtreg.value())); + else if (HasConstantReg(rt)) + EmitMov(value, GetConstantRegU32(rt)); + else + rvAsm->LW(value, PTR(&g_state.regs.r[static_cast(rt)])); + + rvAsm->ANDI(RSCRATCH, addr, 3); + rvAsm->SLLIW(RSCRATCH, RSCRATCH, 3); // *8 + + if (inst->op == InstructionOp::swl) + { + // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift; + // new_value = (RWRET & mem_mask) | (value >> (24 - shift)); + EmitMov(RARG3, 0xFFFFFF00u); + rvAsm->SLLW(RARG3, RARG3, RSCRATCH); + rvAsm->AND(RRET, RRET, RARG3); + + EmitMov(RARG3, 24); + rvAsm->SUBW(RARG3, RARG3, RSCRATCH); + rvAsm->SRLW(value, value, RARG3); + rvAsm->OR(value, value, RRET); + } + else + { + // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift); + // new_value = (RWRET & mem_mask) | (value << shift); + rvAsm->SLLW(value, value, RSCRATCH); + + EmitMov(RARG3, 24); + rvAsm->SUBW(RARG3, RARG3, RSCRATCH); + EmitMov(RSCRATCH, 0x00FFFFFFu); + rvAsm->SRLW(RSCRATCH, RSCRATCH, RARG3); + rvAsm->AND(RRET, RRET, RSCRATCH); + rvAsm->OR(value, value, RRET); + } + + FreeHostReg(addr.Index()); + + rvAsm->ANDI(RARG1, addr, ~0x3u); + GenerateStore(RARG1, value, MemoryAccessSize::Word); +} + +void CPU::NewRec::RISCV64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + FlushForLoadStore(address, true); + + const u32 index = static_cast(inst->r.rt.GetValue()); + const auto [ptr, action] = GetGTERegisterPointer(index, false); + switch (action) + { + case GTERegisterAccessAction::Direct: + { + rvAsm->LW(RARG2, PTR(ptr)); + } + break; + + case GTERegisterAccessAction::CallHandler: + { + // should already be flushed.. except in fastmem case + Flush(FLUSH_FOR_C_CALL); + EmitMov(RARG1, index); + EmitCall(reinterpret_cast(>E::ReadRegister)); + rvAsm->MV(RARG2, RRET); + } + break; + + default: + { + Panic("Unknown action"); + } + break; + } + + const GPR addr = ComputeLoadStoreAddressArg(cf, address); + GenerateStore(addr, RARG2, size); +} + +void CPU::NewRec::RISCV64Compiler::Compile_mtc0(CompileFlags cf) +{ + // TODO: we need better constant setting here.. 
which will need backprop + AssertRegOrConstT(cf); + + const Cop0Reg reg = static_cast(MipsD()); + const u32* ptr = GetCop0RegPtr(reg); + const u32 mask = GetCop0RegWriteMask(reg); + if (!ptr) + { + Compile_Fallback(); + return; + } + + if (mask == 0) + { + // if it's a read-only register, ignore + Log_DebugPrintf("Ignoring write to read-only cop0 reg %u", static_cast(reg)); + return; + } + + // for some registers, we need to test certain bits + const bool needs_bit_test = (reg == Cop0Reg::SR); + const GPR new_value = RARG1; + const GPR old_value = RARG2; + const GPR changed_bits = RARG3; + const GPR mask_reg = RSCRATCH; + + // Load old value + rvAsm->LW(old_value, PTR(ptr)); + + // No way we fit this in an immediate.. + EmitMov(mask_reg, mask); + + // update value + // TODO: This is creating pointless MV instructions.. why? + if (cf.valid_host_t) + rvAsm->AND(new_value, CFGetRegT(cf), mask_reg); + else + EmitMov(new_value, GetConstantRegU32(cf.MipsT()) & mask); + + if (needs_bit_test) + rvAsm->XOR(changed_bits, old_value, new_value); + rvAsm->NOT(mask_reg, mask_reg); + rvAsm->AND(old_value, old_value, mask_reg); + rvAsm->OR(new_value, old_value, new_value); + rvAsm->SW(new_value, PTR(ptr)); + + if (reg == Cop0Reg::SR) + { + // TODO: replace with register backup + // We could just inline the whole thing.. + Flush(FLUSH_FOR_C_CALL); + + rvAsm->SRLIW(RSCRATCH, changed_bits, 16); + rvAsm->ANDI(RSCRATCH, RSCRATCH, 1); + SwitchToFarCode(true, &Assembler::BEQ, RSCRATCH, zero); + rvAsm->ADDI(sp, sp, -16); + rvAsm->SW(RARG1, 0, sp); + rvAsm->SW(RARG2, 8, sp); + EmitCall(reinterpret_cast(&CPU::UpdateMemoryPointers)); + rvAsm->SW(RARG2, 8, sp); + rvAsm->SW(RARG1, 0, sp); + rvAsm->ADDI(sp, sp, 16); + rvAsm->LD(RMEMBASE, PTR(&g_state.fastmem_base)); + SwitchToNearCode(true); + } + + if (reg == Cop0Reg::SR || reg == Cop0Reg::CAUSE) + { + const GPR sr = (reg == Cop0Reg::SR) ? 
RARG2 : (rvAsm->LW(RARG1, PTR(&g_state.cop0_regs.sr.bits)), RARG1); + TestInterrupts(sr); + } + + if (reg == Cop0Reg::DCIC && g_settings.cpu_recompiler_memory_exceptions) + { + // TODO: DCIC handling for debug breakpoints + Log_WarningPrintf("TODO: DCIC handling for debug breakpoints"); + } +} + +void CPU::NewRec::RISCV64Compiler::Compile_rfe(CompileFlags cf) +{ + // shift mode bits right two, preserving upper bits + rvAsm->LW(RARG1, PTR(&g_state.cop0_regs.sr.bits)); + rvAsm->SRLIW(RSCRATCH, RARG1, 2); + rvAsm->ANDI(RSCRATCH, RSCRATCH, 0xf); + rvAsm->ANDI(RARG1, RARG1, ~0xfu); + rvAsm->OR(RARG1, RARG1, RSCRATCH); + rvAsm->SW(RARG1, PTR(&g_state.cop0_regs.sr.bits)); + + TestInterrupts(RARG1); +} + +void CPU::NewRec::RISCV64Compiler::TestInterrupts(const biscuit::GPR& sr) +{ + DebugAssert(sr != RSCRATCH); + + // if Iec == 0 then goto no_interrupt + Label no_interrupt; + rvAsm->ANDI(RSCRATCH, sr, 1); + rvAsm->BEQZ(RSCRATCH, &no_interrupt); + + // sr & cause + rvAsm->LW(RSCRATCH, PTR(&g_state.cop0_regs.cause.bits)); + rvAsm->AND(sr, sr, RSCRATCH); + + // ((sr & cause) & 0xff00) == 0 goto no_interrupt + rvAsm->SRLIW(sr, sr, 8); + rvAsm->ANDI(sr, sr, 0xFF); + SwitchToFarCode(true, &Assembler::BEQ, sr, zero); + BackupHostState(); + Flush(FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL); + EmitCall(reinterpret_cast(&DispatchInterrupt)); + EndBlock(std::nullopt, true); + RestoreHostState(); + SwitchToNearCode(false); + + rvAsm->Bind(&no_interrupt); +} + +void CPU::NewRec::RISCV64Compiler::Compile_mfc2(CompileFlags cf) +{ + const u32 index = inst->cop.Cop2Index(); + const Reg rt = inst->r.rt; + + const auto [ptr, action] = GetGTERegisterPointer(index, false); + if (action == GTERegisterAccessAction::Ignore) + return; + + if (action == GTERegisterAccessAction::Direct) + { + const u32 hreg = + AllocateHostReg(HR_MODE_WRITE, EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt); + rvAsm->LW(GPR(hreg), PTR(ptr)); + } + else if (action == GTERegisterAccessAction::CallHandler) + { + Flush(FLUSH_FOR_C_CALL); + EmitMov(RARG1, index); + EmitCall(reinterpret_cast(>E::ReadRegister)); + + const u32 hreg = + AllocateHostReg(HR_MODE_WRITE, EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt); + rvAsm->MV(GPR(hreg), RRET); + } + else + { + Panic("Unknown action"); + } +} + +void CPU::NewRec::RISCV64Compiler::Compile_mtc2(CompileFlags cf) +{ + const u32 index = inst->cop.Cop2Index(); + const auto [ptr, action] = GetGTERegisterPointer(index, true); + if (action == GTERegisterAccessAction::Ignore) + return; + + if (action == GTERegisterAccessAction::Direct) + { + if (cf.const_t) + StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), ptr); + else + rvAsm->SW(CFGetRegT(cf), PTR(ptr)); + } + else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16) + { + const bool sign = (action == GTERegisterAccessAction::SignExtend16); + if (cf.valid_host_t) + { + sign ? EmitSExtH(RARG1, CFGetRegT(cf)) : EmitUExtH(RARG1, CFGetRegT(cf)); + rvAsm->SW(RARG1, PTR(ptr)); + } + else if (cf.const_t) + { + const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT())); + StoreConstantToCPUPointer(sign ? 
::SignExtend32(cv) : ::ZeroExtend32(cv), ptr); + } + else + { + Panic("Unsupported setup"); + } + } + else if (action == GTERegisterAccessAction::CallHandler) + { + Flush(FLUSH_FOR_C_CALL); + EmitMov(RARG1, index); + MoveTToReg(RARG2, cf); + EmitCall(reinterpret_cast(>E::WriteRegister)); + } + else if (action == GTERegisterAccessAction::PushFIFO) + { + // SXY0 <- SXY1 + // SXY1 <- SXY2 + // SXY2 <- SXYP + DebugAssert(RRET.Index() != RARG2.Index() && RRET.Index() != RARG3.Index()); + rvAsm->LW(RARG2, PTR(&g_state.gte_regs.SXY1[0])); + rvAsm->LW(RARG3, PTR(&g_state.gte_regs.SXY2[0])); + rvAsm->SW(RARG2, PTR(&g_state.gte_regs.SXY0[0])); + rvAsm->SW(RARG3, PTR(&g_state.gte_regs.SXY1[0])); + if (cf.valid_host_t) + rvAsm->SW(CFGetRegT(cf), PTR(&g_state.gte_regs.SXY2[0])); + else if (cf.const_t) + StoreConstantToCPUPointer(GetConstantRegU32(cf.MipsT()), &g_state.gte_regs.SXY2[0]); + else + Panic("Unsupported setup"); + } + else + { + Panic("Unknown action"); + } +} + +void CPU::NewRec::RISCV64Compiler::Compile_cop2(CompileFlags cf) +{ + TickCount func_ticks; + GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks); + + Flush(FLUSH_FOR_C_CALL); + EmitMov(RARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK); + EmitCall(reinterpret_cast(func)); + + AddGTETicks(func_ticks); +} + +u32 CPU::NewRec::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size, + TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask, + u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed, + bool is_load) +{ + Assembler arm_asm(static_cast(thunk_code), thunk_space); + Assembler* rvAsm = &arm_asm; + + static constexpr u32 GPR_SIZE = 8; + + // save regs + u32 num_gprs = 0; + + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + if ((gpr_bitmask & (1u << i)) && rvIsCallerSavedRegister(i) && (!is_load || data_register != i)) + num_gprs++; + } + + const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE); + + if (stack_size > 0) + { + rvAsm->ADDI(sp, sp, -static_cast(stack_size)); + + u32 stack_offset = 0; + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + if ((gpr_bitmask & (1u << i)) && rvIsCallerSavedRegister(i) && (!is_load || data_register != i)) + { + rvAsm->SD(GPR(i), stack_offset, sp); + stack_offset += GPR_SIZE; + } + } + } + + if (cycles_to_add != 0) + { + // NOTE: we have to reload here, because memory writes can run DMA, which can screw with cycles + Assert(rvIsValidSExtITypeImm(cycles_to_add)); + rvAsm->LW(RSCRATCH, PTR(&g_state.pending_ticks)); + rvAsm->ADDIW(RSCRATCH, RSCRATCH, cycles_to_add); + rvAsm->SW(RSCRATCH, PTR(&g_state.pending_ticks)); + } + + if (address_register != RARG1.Index()) + rvAsm->MV(RARG1, GPR(address_register)); + + if (!is_load) + { + if (data_register != RARG2.Index()) + rvAsm->MV(RARG2, GPR(data_register)); + } + + switch (size) + { + case MemoryAccessSize::Byte: + { + rvEmitCall(rvAsm, is_load ? reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryByte) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryByte)); + } + break; + case MemoryAccessSize::HalfWord: + { + rvEmitCall(rvAsm, is_load ? reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryHalfWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord)); + } + break; + case MemoryAccessSize::Word: + { + rvEmitCall(rvAsm, is_load ? 
reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryWord)); + } + break; + } + + if (is_load) + { + const GPR dst = GPR(data_register); + switch (size) + { + case MemoryAccessSize::Byte: + { + is_signed ? rvEmitSExtB(rvAsm, dst, RRET) : rvEmitUExtB(rvAsm, dst, RRET); + } + break; + case MemoryAccessSize::HalfWord: + { + is_signed ? rvEmitSExtH(rvAsm, dst, RRET) : rvEmitUExtH(rvAsm, dst, RRET); + } + break; + case MemoryAccessSize::Word: + { + if (dst.Index() != RRET.Index()) + rvAsm->MV(dst, RRET); + } + break; + } + } + + if (cycles_to_remove != 0) + { + Assert(rvIsValidSExtITypeImm(-cycles_to_remove)); + rvAsm->LW(RSCRATCH, PTR(&g_state.pending_ticks)); + rvAsm->ADDIW(RSCRATCH, RSCRATCH, -cycles_to_remove); + rvAsm->SW(RSCRATCH, PTR(&g_state.pending_ticks)); + } + + // restore regs + if (stack_size > 0) + { + u32 stack_offset = 0; + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + if ((gpr_bitmask & (1u << i)) && rvIsCallerSavedRegister(i) && (!is_load || data_register != i)) + { + rvAsm->LD(GPR(i), stack_offset, sp); + stack_offset += GPR_SIZE; + } + } + + rvAsm->ADDI(sp, sp, stack_size); + } + + rvEmitJmp(rvAsm, static_cast(code_address) + code_size); + + return static_cast(rvAsm->GetCodeBuffer().GetSizeInBytes()); +} diff --git a/src/core/cpu_newrec_compiler_riscv64.h b/src/core/cpu_newrec_compiler_riscv64.h new file mode 100644 index 000000000..96a265e33 --- /dev/null +++ b/src/core/cpu_newrec_compiler_riscv64.h @@ -0,0 +1,168 @@ +// SPDX-FileCopyrightText: 2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#pragma once +#include "cpu_newrec_compiler.h" +#include + +namespace CPU::NewRec { + +class RISCV64Compiler final : public Compiler +{ +public: + RISCV64Compiler(); + ~RISCV64Compiler() override; + +protected: + const char* GetHostRegName(u32 reg) const override; + + const void* GetCurrentCodePointer() override; + + void LoadHostRegWithConstant(u32 reg, u32 val) override; + void LoadHostRegFromCPUPointer(u32 reg, const void* ptr) override; + void StoreConstantToCPUPointer(u32 val, const void* ptr) override; + void StoreHostRegToCPUPointer(u32 reg, const void* ptr) override; + void CopyHostReg(u32 dst, u32 src) override; + + void Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer, + u32 far_code_space) override; + void GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size) override; + void GenerateICacheCheckAndUpdate() override; + void GenerateCall(const void* func, s32 arg1reg = -1, s32 arg2reg = -1, s32 arg3reg = -1) override; + void EndBlock(const std::optional& newpc, bool do_event_test) override; + void EndBlockWithException(Exception excode) override; + void EndAndLinkBlock(const std::optional& newpc, bool do_event_test); + const void* EndCompile(u32* code_size, u32* far_code_size) override; + + void Flush(u32 flags) override; + + void Compile_Fallback() override; + + void CheckBranchTarget(const biscuit::GPR& pcreg); + void Compile_jr(CompileFlags cf) override; + void Compile_jalr(CompileFlags cf) override; + void Compile_bxx(CompileFlags cf, BranchCondition cond) override; + + void Compile_addi(CompileFlags cf, bool overflow); + void Compile_addi(CompileFlags cf) override; + void Compile_addiu(CompileFlags cf) override; + void Compile_slti(CompileFlags cf, bool sign); + void Compile_slti(CompileFlags cf) override; + void Compile_sltiu(CompileFlags cf) override; + void Compile_andi(CompileFlags cf) override; + 
void Compile_ori(CompileFlags cf) override; + void Compile_xori(CompileFlags cf) override; + + void Compile_shift(CompileFlags cf, void (biscuit::Assembler::*op)(biscuit::GPR, biscuit::GPR, biscuit::GPR), + void (biscuit::Assembler::*op_const)(biscuit::GPR, biscuit::GPR, unsigned)); + void Compile_sll(CompileFlags cf) override; + void Compile_srl(CompileFlags cf) override; + void Compile_sra(CompileFlags cf) override; + void Compile_variable_shift(CompileFlags cf, void (biscuit::Assembler::*op)(biscuit::GPR, biscuit::GPR, biscuit::GPR), + void (biscuit::Assembler::*op_const)(biscuit::GPR, biscuit::GPR, unsigned)); + void Compile_sllv(CompileFlags cf) override; + void Compile_srlv(CompileFlags cf) override; + void Compile_srav(CompileFlags cf) override; + void Compile_mult(CompileFlags cf, bool sign); + void Compile_mult(CompileFlags cf) override; + void Compile_multu(CompileFlags cf) override; + void Compile_div(CompileFlags cf) override; + void Compile_divu(CompileFlags cf) override; + void TestOverflow(const biscuit::GPR& long_res, const biscuit::GPR& res, const biscuit::GPR& reg_to_discard); + void Compile_dst_op(CompileFlags cf, void (biscuit::Assembler::*op)(biscuit::GPR, biscuit::GPR, biscuit::GPR), + void (RISCV64Compiler::*op_const)(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm), + void (biscuit::Assembler::*op_long)(biscuit::GPR, biscuit::GPR, biscuit::GPR), bool commutative, + bool overflow); + void Compile_add(CompileFlags cf) override; + void Compile_addu(CompileFlags cf) override; + void Compile_sub(CompileFlags cf) override; + void Compile_subu(CompileFlags cf) override; + void Compile_and(CompileFlags cf) override; + void Compile_or(CompileFlags cf) override; + void Compile_xor(CompileFlags cf) override; + void Compile_nor(CompileFlags cf) override; + void Compile_slt(CompileFlags cf, bool sign); + void Compile_slt(CompileFlags cf) override; + void Compile_sltu(CompileFlags cf) override; + + biscuit::GPR ComputeLoadStoreAddressArg(CompileFlags cf, const std::optional& address, + const std::optional& reg = std::nullopt); + template + void GenerateLoad(const biscuit::GPR& addr_reg, MemoryAccessSize size, bool sign, const RegAllocFn& dst_reg_alloc); + void GenerateStore(const biscuit::GPR& addr_reg, const biscuit::GPR& value_reg, MemoryAccessSize size); + void Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + + void TestInterrupts(const biscuit::GPR& sr); + void Compile_mtc0(CompileFlags cf) override; + void Compile_rfe(CompileFlags cf) override; + + void Compile_mfc2(CompileFlags cf) override; + void Compile_mtc2(CompileFlags cf) override; + void Compile_cop2(CompileFlags cf) override; + + void GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg = Reg::count, + Reg arg3reg = Reg::count) override; + +private: + void EmitMov(const biscuit::GPR& dst, u32 val); + void EmitCall(const void* ptr); + + void SwitchToFarCode(bool emit_jump, + void 
(biscuit::Assembler::*inverted_cond)(biscuit::GPR, biscuit::GPR, biscuit::Label*) = nullptr, + const biscuit::GPR& rs1 = biscuit::zero, const biscuit::GPR& rs2 = biscuit::zero); + void SwitchToNearCode(bool emit_jump); + + void AssertRegOrConstS(CompileFlags cf) const; + void AssertRegOrConstT(CompileFlags cf) const; + // vixl::aarch64::MemOperand MipsPtr(Reg r) const; + + void SafeImmSExtIType(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm, + void (biscuit::Assembler::*iop)(biscuit::GPR, biscuit::GPR, u32), + void (biscuit::Assembler::*rop)(biscuit::GPR, biscuit::GPR, biscuit::GPR)); + + void SafeADDI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm); + void SafeADDIW(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm); + void SafeSUBIW(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm); + void SafeANDI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm); + void SafeORI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm); + void SafeXORI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm); + void SafeSLTI(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm); + void SafeSLTIU(const biscuit::GPR& rd, const biscuit::GPR& rs, u32 imm); + + void EmitSExtB(const biscuit::GPR& rd, const biscuit::GPR& rs); + void EmitUExtB(const biscuit::GPR& rd, const biscuit::GPR& rs); + void EmitSExtH(const biscuit::GPR& rd, const biscuit::GPR& rs); + void EmitUExtH(const biscuit::GPR& rd, const biscuit::GPR& rs); + void EmitDSExtW(const biscuit::GPR& rd, const biscuit::GPR& rs); + void EmitDUExtW(const biscuit::GPR& rd, const biscuit::GPR& rs); + + biscuit::GPR CFGetSafeRegS(CompileFlags cf, const biscuit::GPR& temp_reg); + biscuit::GPR CFGetSafeRegT(CompileFlags cf, const biscuit::GPR& temp_reg); + + biscuit::GPR CFGetRegD(CompileFlags cf) const; + biscuit::GPR CFGetRegS(CompileFlags cf) const; + biscuit::GPR CFGetRegT(CompileFlags cf) const; + biscuit::GPR CFGetRegLO(CompileFlags cf) const; + biscuit::GPR CFGetRegHI(CompileFlags cf) const; + + void MoveSToReg(const biscuit::GPR& dst, CompileFlags cf); + void MoveTToReg(const biscuit::GPR& dst, CompileFlags cf); + void MoveMIPSRegToReg(const biscuit::GPR& dst, Reg reg); + + std::unique_ptr m_emitter; + std::unique_ptr m_far_emitter; + biscuit::Assembler* rvAsm; +}; + +} // namespace CPU::NewRec diff --git a/src/core/cpu_newrec_compiler_x64.cpp b/src/core/cpu_newrec_compiler_x64.cpp new file mode 100644 index 000000000..7f458360b --- /dev/null +++ b/src/core/cpu_newrec_compiler_x64.cpp @@ -0,0 +1,2196 @@ +// SPDX-FileCopyrightText: 2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#include "cpu_newrec_compiler_x64.h" +#include "common/align.h" +#include "common/assert.h" +#include "common/log.h" +#include "common/string_util.h" +#include "cpu_code_cache_private.h" +#include "cpu_core_private.h" +#include "cpu_recompiler_thunks.h" +#include "cpu_recompiler_types.h" +#include "gte.h" +#include "pgxp.h" +#include "settings.h" +#include "timing_event.h" +#include +Log_SetChannel(CPU::NewRec); + +#define RMEMBASE cg->rbx +#define RSTATE cg->rbp + +// #define PTR(x) (cg->rip + (x)) +#define PTR(x) (RSTATE + (u32)(((u8*)(x)) - ((u8*)&g_state))) + +// PGXP TODO: LWL etc, MFC0 +// PGXP TODO: Spyro 1 level gates have issues. + +static constexpr u32 BACKPATCH_JMP_SIZE = 5; + +using namespace Xbyak; + +using CPU::Recompiler::IsCallerSavedRegister; + +// TODO: try using a pointer to state instead of rip-relative.. 
it might end up faster due to smaller code + +namespace CPU::NewRec { +X64Compiler s_instance; +Compiler* g_compiler = &s_instance; +} // namespace CPU::NewRec + +CPU::NewRec::X64Compiler::X64Compiler() = default; + +CPU::NewRec::X64Compiler::~X64Compiler() = default; + +void CPU::NewRec::X64Compiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, + u8* far_code_buffer, u32 far_code_space) +{ + Compiler::Reset(block, code_buffer, code_buffer_space, far_code_buffer, far_code_space); + + // TODO: don't recreate this every time.. + DebugAssert(!m_emitter && !m_far_emitter && !cg); + m_emitter = std::make_unique(code_buffer_space, code_buffer); + m_far_emitter = std::make_unique(far_code_space, far_code_buffer); + cg = m_emitter.get(); + + // Need to wipe it out so it's correct when toggling fastmem. + m_host_regs = {}; + + const u32 membase_idx = CodeCache::IsUsingFastmem() ? static_cast(RMEMBASE.getIdx()) : NUM_HOST_REGS; + const u32 cpu_idx = static_cast(RSTATE.getIdx()); + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + HostRegAlloc& ra = m_host_regs[i]; + + if (i == static_cast(RWRET.getIdx()) || i == static_cast(RWARG1.getIdx()) || + i == static_cast(RWARG2.getIdx()) || i == static_cast(RWARG3.getIdx()) || + i == static_cast(cg->rsp.getIdx()) || i == cpu_idx || i == membase_idx || + i == static_cast(cg->ecx.getIdx()) /* keep ecx free for shifts, maybe use BMI? */) + { + continue; + } + + ra.flags = HR_USABLE | (IsCallerSavedRegister(i) ? 0 : HR_CALLEE_SAVED); + } +} + +void CPU::NewRec::X64Compiler::SwitchToFarCode(bool emit_jump, void (Xbyak::CodeGenerator::*jump_op)(const void*)) +{ + DebugAssert(cg == m_emitter.get()); + if (emit_jump) + { + const void* fcptr = m_far_emitter->getCurr(); + (jump_op) ? (cg->*jump_op)(fcptr) : cg->jmp(fcptr); + } + cg = m_far_emitter.get(); +} + +void CPU::NewRec::X64Compiler::SwitchToNearCode(bool emit_jump, void (Xbyak::CodeGenerator::*jump_op)(const void*)) +{ + DebugAssert(cg == m_far_emitter.get()); + if (emit_jump) + { + const void* fcptr = m_emitter->getCurr(); + (jump_op) ? (cg->*jump_op)(fcptr) : cg->jmp(fcptr); + } + cg = m_emitter.get(); +} + +void CPU::NewRec::X64Compiler::BeginBlock() +{ + Compiler::BeginBlock(); + +#if 0 + if (m_block->pc == 0xBFC06F0C) + { + //__debugbreak(); + cg->db(0xcc); + } +#endif + +#if 0 + cg->nop(); + cg->mov(RWARG1, m_block->pc); + cg->nop(); +#endif +} + +void CPU::NewRec::X64Compiler::GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size) +{ + // store it first to reduce code size, because we can offset + cg->mov(RXARG1, static_cast(reinterpret_cast(ram_ptr))); + cg->mov(RXARG2, static_cast(reinterpret_cast(shadow_ptr))); + + bool first = true; + u32 offset = 0; + while (size >= 16) + { + const Xbyak::Xmm& dst = first ? 
cg->xmm0 : cg->xmm1; + cg->movups(dst, cg->xword[RXARG1 + offset]); + cg->pcmpeqd(dst, cg->xword[RXARG2 + offset]); + if (!first) + cg->pand(cg->xmm0, dst); + else + first = false; + + offset += 16; + size -= 16; + } + + // TODO: better codegen for 16 byte aligned blocks + if (!first) + { + cg->movmskps(cg->eax, cg->xmm0); + cg->cmp(cg->eax, 0xf); + cg->jne(CodeCache::g_discard_and_recompile_block); + } + + while (size >= 8) + { + cg->mov(RXARG3, cg->qword[RXARG1 + offset]); + cg->cmp(RXARG3, cg->qword[RXARG2 + offset]); + cg->jne(CodeCache::g_discard_and_recompile_block); + offset += 8; + size -= 8; + } + + while (size >= 4) + { + cg->mov(RWARG3, cg->dword[RXARG1 + offset]); + cg->cmp(RWARG3, cg->dword[RXARG2 + offset]); + cg->jne(CodeCache::g_discard_and_recompile_block); + offset += 4; + size -= 4; + } + + DebugAssert(size == 0); +} + +void CPU::NewRec::X64Compiler::GenerateICacheCheckAndUpdate() +{ + if (GetSegmentForAddress(m_block->pc) >= Segment::KSEG1) + { + cg->add(cg->dword[PTR(&g_state.pending_ticks)], static_cast(m_block->uncached_fetch_ticks)); + } + else if (m_block->icache_line_count > 0) + { + cg->lea(RXARG1, cg->dword[PTR(&g_state.icache_tags)]); + + // TODO: Vectorize this... + VirtualMemoryAddress current_pc = m_block->pc & ICACHE_TAG_ADDRESS_MASK; + for (u32 i = 0; i < m_block->icache_line_count; i++, current_pc += ICACHE_LINE_SIZE) + { + const VirtualMemoryAddress tag = GetICacheTagForAddress(current_pc); + const TickCount fill_ticks = GetICacheFillTicks(current_pc); + if (fill_ticks <= 0) + continue; + + const u32 line = GetICacheLine(current_pc); + const u32 offset = (line * sizeof(u32)); + Xbyak::Label cache_hit; + + cg->cmp(cg->dword[RXARG1 + offset], tag); + cg->je(cache_hit); + cg->mov(cg->dword[RXARG1 + offset], tag); + cg->add(cg->dword[PTR(&g_state.pending_ticks)], static_cast(fill_ticks)); + cg->L(cache_hit); + } + } +} + +void CPU::NewRec::X64Compiler::GenerateCall(const void* func, s32 arg1reg /*= -1*/, s32 arg2reg /*= -1*/, + s32 arg3reg /*= -1*/) +{ + if (arg1reg >= 0 && arg1reg != static_cast(RXARG1.getIdx())) + cg->mov(RXARG1, Reg64(arg1reg)); + if (arg1reg >= 0 && arg2reg != static_cast(RXARG2.getIdx())) + cg->mov(RXARG2, Reg64(arg2reg)); + if (arg1reg >= 0 && arg3reg != static_cast(RXARG3.getIdx())) + cg->mov(RXARG3, Reg64(arg3reg)); + cg->call(func); +} + +void CPU::NewRec::X64Compiler::EndBlock(const std::optional& newpc, bool do_event_test) +{ + if (newpc.has_value()) + { + if (m_dirty_pc || m_compiler_pc != newpc) + cg->mov(cg->dword[PTR(&g_state.pc)], newpc.value()); + } + m_dirty_pc = false; + + // flush regs + Flush(FLUSH_END_BLOCK); + EndAndLinkBlock(newpc, do_event_test); +} + +void CPU::NewRec::X64Compiler::EndBlockWithException(Exception excode) +{ + // flush regs, but not pc, it's going to get overwritten + // flush cycles because of the GTE instruction stuff... 
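+  // Roughly, the code emitted below performs (sketch only, based on the calls that follow):
+  //   CPU::RaiseException(Cop0Registers::CAUSE::MakeValueForException(excode, in_branch_delay_slot, false, cop_n),
+  //                       current_instruction_pc);
+  // The CAUSE value is a compile-time constant here, so the block just loads two immediates and
+  // calls the helper, which takes care of the CAUSE/EPC bookkeeping.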
+ Flush(FLUSH_END_BLOCK | FLUSH_FOR_EXCEPTION); + + // TODO: flush load delay + // TODO: break for pcdrv + + cg->mov(RWARG1, Cop0Registers::CAUSE::MakeValueForException(excode, m_current_instruction_branch_delay_slot, false, + inst->cop.cop_n)); + cg->mov(RWARG2, m_current_instruction_pc); + cg->call(static_cast(&CPU::RaiseException)); + m_dirty_pc = false; + + EndAndLinkBlock(std::nullopt, true); +} + +void CPU::NewRec::X64Compiler::EndAndLinkBlock(const std::optional& newpc, bool do_event_test) +{ + // event test + // pc should've been flushed + DebugAssert(!m_dirty_pc); + + // TODO: try extracting this to a function + + // save cycles for event test + const TickCount cycles = std::exchange(m_cycles, 0); + + // fast path when not doing an event test + if (!do_event_test && m_gte_done_cycle <= cycles) + { + if (cycles == 1) + cg->inc(cg->dword[PTR(&g_state.pending_ticks)]); + else if (cycles > 0) + cg->add(cg->dword[PTR(&g_state.pending_ticks)], cycles); + } + else + { + // pending_ticks += cycles + // if (pending_ticks >= downcount) { dispatch_event(); } + if (do_event_test || cycles > 0 || m_gte_done_cycle > cycles) + cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]); + if (cycles > 0) + cg->add(RWARG1, cycles); + if (m_gte_done_cycle > cycles) + { + cg->mov(RWARG2, RWARG1); + ((m_gte_done_cycle - cycles) == 1) ? cg->inc(RWARG2) : cg->add(RWARG2, m_gte_done_cycle - cycles); + cg->mov(cg->dword[PTR(&g_state.gte_completion_tick)], RWARG2); + } + if (do_event_test) + cg->cmp(RWARG1, cg->dword[PTR(&g_state.downcount)]); + if (cycles > 0) + cg->mov(cg->dword[PTR(&g_state.pending_ticks)], RWARG1); + if (do_event_test) + cg->jge(CodeCache::g_run_events_and_dispatch); + } + + // jump to dispatcher or next block + if (!newpc.has_value()) + { + cg->jmp(CodeCache::g_dispatcher); + } + else + { + if (newpc.value() == m_block->pc) + { + // Special case: ourselves! No need to backlink then. + Log_DebugPrintf("Linking block at %08X to self", m_block->pc); + cg->jmp(cg->getCode()); + } + else + { + const void* target = CodeCache::CreateBlockLink(m_block, cg->getCurr(), newpc.value()); + cg->jmp(target, CodeGenerator::T_NEAR); + } + } + + m_block_ended = true; +} + +const void* CPU::NewRec::X64Compiler::EndCompile(u32* code_size, u32* far_code_size) +{ + const void* code = m_emitter->getCode(); + *code_size = static_cast(m_emitter->getSize()); + *far_code_size = static_cast(m_far_emitter->getSize()); + cg = nullptr; + m_far_emitter.reset(); + m_emitter.reset(); + return code; +} + +const void* CPU::NewRec::X64Compiler::GetCurrentCodePointer() +{ + return cg->getCurr(); +} + +const char* CPU::NewRec::X64Compiler::GetHostRegName(u32 reg) const +{ + static constexpr std::array reg64_names = { + {"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"}}; + return (reg < reg64_names.size()) ? 
reg64_names[reg] : "UNKNOWN"; +} + +void CPU::NewRec::X64Compiler::LoadHostRegWithConstant(u32 reg, u32 val) +{ + cg->mov(Reg32(reg), val); +} + +void CPU::NewRec::X64Compiler::LoadHostRegFromCPUPointer(u32 reg, const void* ptr) +{ + cg->mov(Reg32(reg), cg->dword[PTR(ptr)]); +} + +void CPU::NewRec::X64Compiler::StoreHostRegToCPUPointer(u32 reg, const void* ptr) +{ + cg->mov(cg->dword[PTR(ptr)], Reg32(reg)); +} + +void CPU::NewRec::X64Compiler::StoreConstantToCPUPointer(u32 val, const void* ptr) +{ + cg->mov(cg->dword[PTR(ptr)], val); +} + +void CPU::NewRec::X64Compiler::CopyHostReg(u32 dst, u32 src) +{ + if (src != dst) + cg->mov(Reg32(dst), Reg32(src)); +} + +Xbyak::Address CPU::NewRec::X64Compiler::MipsPtr(Reg r) const +{ + DebugAssert(r < Reg::count); + return cg->dword[PTR(&g_state.regs.r[static_cast(r)])]; +} + +Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegD(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_d); + return Reg32(cf.host_d); +} + +Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegS(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_s); + return Reg32(cf.host_s); +} + +Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegT(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_t); + return Reg32(cf.host_t); +} + +Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegLO(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_lo); + return Reg32(cf.host_lo); +} + +Xbyak::Reg32 CPU::NewRec::X64Compiler::CFGetRegHI(CompileFlags cf) const +{ + DebugAssert(cf.valid_host_hi); + return Reg32(cf.host_hi); +} + +Xbyak::Reg32 CPU::NewRec::X64Compiler::MoveSToD(CompileFlags cf) +{ + DebugAssert(cf.valid_host_d); + DebugAssert(!cf.valid_host_t || cf.host_t != cf.host_d); + + const Reg32 rd = CFGetRegD(cf); + MoveSToReg(rd, cf); + + return rd; +} + +Xbyak::Reg32 CPU::NewRec::X64Compiler::MoveSToT(CompileFlags cf) +{ + DebugAssert(cf.valid_host_t); + + const Reg32 rt = CFGetRegT(cf); + if (cf.valid_host_s) + { + const Reg32 rs = CFGetRegS(cf); + if (rt != rs) + cg->mov(rt, rs); + } + else if (cf.const_s) + { + if (const u32 cv = GetConstantRegU32(cf.MipsS()); cv != 0) + cg->mov(rt, cv); + else + cg->xor_(rt, rt); + } + else + { + cg->mov(rt, MipsPtr(cf.MipsS())); + } + + return rt; +} + +Xbyak::Reg32 CPU::NewRec::X64Compiler::MoveTToD(CompileFlags cf) +{ + DebugAssert(cf.valid_host_d); + DebugAssert(!cf.valid_host_s || cf.host_s != cf.host_d); + + const Reg32 rd = CFGetRegD(cf); + MoveTToReg(rd, cf); + return rd; +} + +void CPU::NewRec::X64Compiler::MoveSToReg(const Xbyak::Reg32& dst, CompileFlags cf) +{ + if (cf.valid_host_s) + { + if (cf.host_s != static_cast(dst.getIdx())) + cg->mov(dst, Reg32(cf.host_s)); + } + else if (cf.const_s) + { + const u32 cv = GetConstantRegU32(cf.MipsS()); + if (cv == 0) + cg->xor_(dst, dst); + else + cg->mov(dst, cv); + } + else + { + cg->mov(dst, cg->dword[PTR(&g_state.regs.r[cf.mips_s])]); + } +} + +void CPU::NewRec::X64Compiler::MoveTToReg(const Xbyak::Reg32& dst, CompileFlags cf) +{ + if (cf.valid_host_t) + { + if (cf.host_t != static_cast(dst.getIdx())) + cg->mov(dst, Reg32(cf.host_t)); + } + else if (cf.const_t) + { + const u32 cv = GetConstantRegU32(cf.MipsT()); + if (cv == 0) + cg->xor_(dst, dst); + else + cg->mov(dst, cv); + } + else + { + cg->mov(dst, cg->dword[PTR(&g_state.regs.r[cf.mips_t])]); + } +} + +void CPU::NewRec::X64Compiler::MoveMIPSRegToReg(const Xbyak::Reg32& dst, Reg reg) +{ + DebugAssert(reg < Reg::count); + if (const std::optional hreg = CheckHostReg(0, Compiler::HR_TYPE_CPU_REG, reg)) + cg->mov(dst, Reg32(hreg.value())); + else if 
(HasConstantReg(reg)) + cg->mov(dst, GetConstantRegU32(reg)); + else + cg->mov(dst, MipsPtr(reg)); +} + +void CPU::NewRec::X64Compiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, + Reg arg2reg /* = Reg::count */, + Reg arg3reg /* = Reg::count */) +{ + DebugAssert(g_settings.gpu_pgxp_enable); + + Flush(FLUSH_FOR_C_CALL); + + if (arg2reg != Reg::count) + MoveMIPSRegToReg(RWARG2, arg2reg); + if (arg3reg != Reg::count) + MoveMIPSRegToReg(RWARG3, arg3reg); + + cg->mov(RWARG1, arg1val); + cg->call(func); +} + +void CPU::NewRec::X64Compiler::Flush(u32 flags) +{ + Compiler::Flush(flags); + + if (flags & FLUSH_PC && m_dirty_pc) + { + cg->mov(cg->dword[PTR(&g_state.pc)], m_compiler_pc); + m_dirty_pc = false; + } + + if (flags & FLUSH_INSTRUCTION_BITS) + { + cg->mov(cg->dword[PTR(&g_state.current_instruction.bits)], inst->bits); + cg->mov(cg->dword[PTR(&g_state.current_instruction_pc)], m_current_instruction_pc); + cg->mov(cg->byte[PTR(&g_state.current_instruction_in_branch_delay_slot)], m_current_instruction_branch_delay_slot); + } + + if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty) + { + // This sucks :( + // TODO: make it a function? + cg->movzx(RWARG1, cg->byte[PTR(&g_state.load_delay_reg)]); + cg->mov(RWARG2, cg->dword[PTR(&g_state.load_delay_value)]); + cg->mov(cg->dword[PTR(&g_state.regs.r[0]) + RXARG1 * 4], RWARG2); + cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], static_cast(Reg::count)); + m_load_delay_dirty = false; + } + + if (flags & FLUSH_LOAD_DELAY && m_load_delay_register != Reg::count) + { + if (m_load_delay_value_register != NUM_HOST_REGS) + FreeHostReg(m_load_delay_value_register); + + cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], static_cast(m_load_delay_register)); + m_load_delay_register = Reg::count; + m_load_delay_dirty = true; + } + + if (flags & FLUSH_GTE_STALL_FROM_STATE && m_dirty_gte_done_cycle) + { + // May as well flush cycles while we're here. + // GTE spanning blocks is very rare, we _could_ disable this for speed. + cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]); + cg->mov(RWARG2, cg->dword[PTR(&g_state.gte_completion_tick)]); + if (m_cycles > 0) + { + (m_cycles == 1) ? cg->inc(RWARG1) : cg->add(RWARG1, m_cycles); + m_cycles = 0; + } + cg->cmp(RWARG2, RWARG1); + cg->cmova(RWARG1, RWARG2); + cg->mov(cg->dword[PTR(&g_state.pending_ticks)], RWARG1); + m_dirty_gte_done_cycle = false; + } + + if (flags & FLUSH_GTE_DONE_CYCLE && m_gte_done_cycle > m_cycles) + { + cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]); + + // update cycles at the same time + if (flags & FLUSH_CYCLES && m_cycles > 0) + { + (m_cycles == 1) ? cg->inc(RWARG1) : cg->add(RWARG1, m_cycles); + cg->mov(cg->dword[PTR(&g_state.pending_ticks)], RWARG1); + m_gte_done_cycle -= m_cycles; + m_cycles = 0; + } + + (m_gte_done_cycle == 1) ? cg->inc(RWARG1) : cg->add(RWARG1, m_gte_done_cycle); + cg->mov(cg->dword[PTR(&g_state.gte_completion_tick)], RWARG1); + m_gte_done_cycle = 0; + m_dirty_gte_done_cycle = true; + } + + if (flags & FLUSH_CYCLES && m_cycles > 0) + { + (m_cycles == 1) ? cg->inc(cg->dword[PTR(&g_state.pending_ticks)]) : + cg->add(cg->dword[PTR(&g_state.pending_ticks)], m_cycles); + m_gte_done_cycle = std::max(m_gte_done_cycle - m_cycles, 0); + m_cycles = 0; + } +} + +void CPU::NewRec::X64Compiler::Compile_Fallback() +{ + Flush(FLUSH_FOR_INTERPRETER); + + cg->call(&CPU::Recompiler::Thunks::InterpretInstruction); + + // TODO: make me less garbage + // TODO: this is wrong, it flushes the load delay on the same cycle when we return. 
+ // but nothing should be going through here.. + Label no_load_delay; + cg->movzx(RWARG1, cg->byte[PTR(&g_state.next_load_delay_reg)]); + cg->cmp(RWARG1, static_cast(Reg::count)); + cg->je(no_load_delay, CodeGenerator::T_SHORT); + cg->mov(RWARG2, cg->dword[PTR(&g_state.next_load_delay_value)]); + cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], RWARG1); + cg->mov(cg->dword[PTR(&g_state.load_delay_value)], RWARG2); + cg->mov(cg->byte[PTR(&g_state.next_load_delay_reg)], static_cast(Reg::count)); + cg->L(no_load_delay); + + m_load_delay_dirty = EMULATE_LOAD_DELAYS; +} + +void CPU::NewRec::X64Compiler::CheckBranchTarget(const Xbyak::Reg32& pcreg) +{ + if (!g_settings.cpu_recompiler_memory_exceptions) + return; + + cg->test(pcreg, 0x3); + SwitchToFarCode(true, &CodeGenerator::jnz); + + BackupHostState(); + EndBlockWithException(Exception::AdEL); + + RestoreHostState(); + SwitchToNearCode(false); +} + +void CPU::NewRec::X64Compiler::Compile_jr(CompileFlags cf) +{ + if (!cf.valid_host_s) + cg->mov(RWARG1, MipsPtr(cf.MipsS())); + + const Reg32 pcreg = cf.valid_host_s ? CFGetRegS(cf) : RWARG1; + CheckBranchTarget(pcreg); + + cg->mov(cg->dword[PTR(&g_state.pc)], pcreg); + + CompileBranchDelaySlot(false); + EndBlock(std::nullopt, true); +} + +void CPU::NewRec::X64Compiler::Compile_jalr(CompileFlags cf) +{ + if (!cf.valid_host_s) + cg->mov(RWARG1, MipsPtr(cf.MipsS())); + + const Reg32 pcreg = cf.valid_host_s ? CFGetRegS(cf) : RWARG1; + + if (MipsD() != Reg::zero) + SetConstantReg(MipsD(), GetBranchReturnAddress(cf)); + + CheckBranchTarget(pcreg); + cg->mov(cg->dword[PTR(&g_state.pc)], pcreg); + + CompileBranchDelaySlot(false); + EndBlock(std::nullopt, true); +} + +void CPU::NewRec::X64Compiler::Compile_bxx(CompileFlags cf, BranchCondition cond) +{ + const u32 taken_pc = GetConditionalBranchTarget(cf); + + Flush(FLUSH_FOR_BRANCH); + + DebugAssert(cf.valid_host_s); + + // MipsT() here should equal zero for zero branches. + DebugAssert(cond == BranchCondition::Equal || cond == BranchCondition::NotEqual || cf.MipsT() == Reg::zero); + + // TODO: Swap this back to near once instructions don't blow up + constexpr CodeGenerator::LabelType type = CodeGenerator::T_NEAR; + Label taken; + switch (cond) + { + case BranchCondition::Equal: + case BranchCondition::NotEqual: + { + // we should always have S, maybe not T + // TODO: if it's zero, we can just do test rs, rs + if (cf.valid_host_t) + cg->cmp(CFGetRegS(cf), CFGetRegT(cf)); + else if (cf.const_t) + cg->cmp(CFGetRegS(cf), GetConstantRegU32(cf.MipsT())); + else + cg->cmp(CFGetRegS(cf), MipsPtr(cf.MipsT())); + + (cond == BranchCondition::Equal) ? 
cg->je(taken, type) : cg->jne(taken, type); + } + break; + + case BranchCondition::GreaterThanZero: + { + cg->cmp(CFGetRegS(cf), 0); + cg->jg(taken, type); + } + break; + + case BranchCondition::GreaterEqualZero: + { + cg->test(CFGetRegS(cf), CFGetRegS(cf)); + cg->jns(taken, type); + } + break; + + case BranchCondition::LessThanZero: + { + cg->test(CFGetRegS(cf), CFGetRegS(cf)); + cg->js(taken, type); + } + break; + + case BranchCondition::LessEqualZero: + { + cg->cmp(CFGetRegS(cf), 0); + cg->jle(taken, type); + } + break; + } + + BackupHostState(); + if (!cf.delay_slot_swapped) + CompileBranchDelaySlot(); + + EndBlock(m_compiler_pc, true); + + cg->L(taken); + + RestoreHostState(); + if (!cf.delay_slot_swapped) + CompileBranchDelaySlot(); + + EndBlock(taken_pc, true); +} + +void CPU::NewRec::X64Compiler::Compile_addi(CompileFlags cf) +{ + const Reg32 rt = MoveSToT(cf); + if (const u32 imm = inst->i.imm_sext32(); imm != 0) + { + cg->add(rt, imm); + if (g_settings.cpu_recompiler_memory_exceptions) + { + DebugAssert(cf.valid_host_t); + TestOverflow(rt); + } + } +} + +void CPU::NewRec::X64Compiler::Compile_addiu(CompileFlags cf) +{ + const Reg32 rt = MoveSToT(cf); + if (const u32 imm = inst->i.imm_sext32(); imm != 0) + cg->add(rt, imm); +} + +void CPU::NewRec::X64Compiler::Compile_slti(CompileFlags cf) +{ + Compile_slti(cf, true); +} + +void CPU::NewRec::X64Compiler::Compile_sltiu(CompileFlags cf) +{ + Compile_slti(cf, false); +} + +void CPU::NewRec::X64Compiler::Compile_slti(CompileFlags cf, bool sign) +{ + const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : RWARG1; + + // Case where T == S, can't use xor because it changes flags + if (!cf.valid_host_t || !cf.valid_host_s || cf.host_t != cf.host_s) + cg->xor_(rt, rt); + + if (cf.valid_host_s) + cg->cmp(CFGetRegS(cf), inst->i.imm_sext32()); + else + cg->cmp(MipsPtr(cf.MipsS()), inst->i.imm_sext32()); + + if (cf.valid_host_t && cf.valid_host_s && cf.host_t == cf.host_s) + cg->mov(rt, 0); + + sign ? 
cg->setl(rt.cvt8()) : cg->setb(rt.cvt8()); + + if (!cf.valid_host_t) + cg->mov(MipsPtr(cf.MipsT()), rt); +} + +void CPU::NewRec::X64Compiler::Compile_andi(CompileFlags cf) +{ + if (const u32 imm = inst->i.imm_zext32(); imm != 0) + { + const Reg32 rt = MoveSToT(cf); + cg->and_(rt, imm); + } + else + { + const Reg32 rt = CFGetRegT(cf); + cg->xor_(rt, rt); + } +} + +void CPU::NewRec::X64Compiler::Compile_ori(CompileFlags cf) +{ + const Reg32 rt = MoveSToT(cf); + if (const u32 imm = inst->i.imm_zext32(); imm != 0) + cg->or_(rt, imm); +} + +void CPU::NewRec::X64Compiler::Compile_xori(CompileFlags cf) +{ + const Reg32 rt = MoveSToT(cf); + if (const u32 imm = inst->i.imm_zext32(); imm != 0) + cg->xor_(rt, imm); +} + +void CPU::NewRec::X64Compiler::Compile_sll(CompileFlags cf) +{ + const Reg32 rd = MoveTToD(cf); + if (inst->r.shamt > 0) + cg->shl(rd, inst->r.shamt); +} + +void CPU::NewRec::X64Compiler::Compile_srl(CompileFlags cf) +{ + const Reg32 rd = MoveTToD(cf); + if (inst->r.shamt > 0) + cg->shr(rd, inst->r.shamt); +} + +void CPU::NewRec::X64Compiler::Compile_sra(CompileFlags cf) +{ + const Reg32 rd = MoveTToD(cf); + if (inst->r.shamt > 0) + cg->sar(rd, inst->r.shamt); +} + +void CPU::NewRec::X64Compiler::Compile_variable_shift( + CompileFlags cf, void (Xbyak::CodeGenerator::*op)(const Xbyak::Operand&, const Xbyak::Reg8&), + void (Xbyak::CodeGenerator::*op_const)(const Xbyak::Operand&, int)) +{ + const Reg32 rd = CFGetRegD(cf); + if (!cf.const_s) + { + MoveSToReg(cg->ecx, cf); + MoveTToReg(rd, cf); + (cg->*op)(rd, cg->cl); + } + else + { + MoveTToReg(rd, cf); + (cg->*op_const)(rd, GetConstantRegU32(cf.MipsS())); + } +} + +void CPU::NewRec::X64Compiler::Compile_sllv(CompileFlags cf) +{ + Compile_variable_shift(cf, &CodeGenerator::shl, &CodeGenerator::shl); +} + +void CPU::NewRec::X64Compiler::Compile_srlv(CompileFlags cf) +{ + Compile_variable_shift(cf, &CodeGenerator::shr, &CodeGenerator::shr); +} + +void CPU::NewRec::X64Compiler::Compile_srav(CompileFlags cf) +{ + Compile_variable_shift(cf, &CodeGenerator::sar, &CodeGenerator::sar); +} + +void CPU::NewRec::X64Compiler::Compile_mult(CompileFlags cf, bool sign) +{ + // RAX/RDX shouldn't be allocatable.. + DebugAssert(!(m_host_regs[Xbyak::Operand::RAX].flags & HR_USABLE) && + !(m_host_regs[Xbyak::Operand::RDX].flags & HR_USABLE)); + + MoveSToReg(cg->eax, cf); + if (cf.valid_host_t) + { + sign ? cg->imul(CFGetRegT(cf)) : cg->mul(CFGetRegT(cf)); + } + else if (cf.const_t) + { + cg->mov(cg->edx, GetConstantRegU32(cf.MipsT())); + sign ? cg->imul(cg->edx) : cg->mul(cg->edx); + } + else + { + sign ? cg->imul(MipsPtr(cf.MipsT())) : cg->mul(MipsPtr(cf.MipsT())); + } + + // TODO: skip writeback if it's not needed + if (cf.valid_host_lo) + cg->mov(CFGetRegLO(cf), cg->eax); + else + cg->mov(MipsPtr(Reg::lo), cg->eax); + if (cf.valid_host_lo) + cg->mov(CFGetRegHI(cf), cg->edx); + else + cg->mov(MipsPtr(Reg::hi), cg->edx); +} + +void CPU::NewRec::X64Compiler::Compile_mult(CompileFlags cf) +{ + Compile_mult(cf, true); +} + +void CPU::NewRec::X64Compiler::Compile_multu(CompileFlags cf) +{ + Compile_mult(cf, false); +} + +void CPU::NewRec::X64Compiler::Compile_div(CompileFlags cf) +{ + // not supported without registers for now.. + DebugAssert(cf.valid_host_lo && cf.valid_host_hi); + + const Reg32 rt = cf.valid_host_t ? 
CFGetRegT(cf) : cg->ecx; + if (!cf.valid_host_t) + MoveTToReg(rt, cf); + + const Reg32 rlo = CFGetRegLO(cf); + const Reg32 rhi = CFGetRegHI(cf); + + MoveSToReg(cg->eax, cf); + cg->cdq(); + + Label done; + Label not_divide_by_zero; + cg->test(rt, rt); + cg->jnz(not_divide_by_zero, CodeGenerator::T_SHORT); + cg->test(cg->eax, cg->eax); + cg->mov(rhi, cg->eax); // hi = num + cg->mov(rlo, 1); + cg->mov(cg->eax, static_cast(-1)); + cg->cmovns(rlo, cg->eax); // lo = s >= 0 ? -1 : 1 + cg->jmp(done, CodeGenerator::T_SHORT); + + cg->L(not_divide_by_zero); + Label not_unrepresentable; + cg->cmp(cg->eax, 0x80000000u); + cg->jne(not_unrepresentable, CodeGenerator::T_SHORT); + cg->cmp(rt, static_cast(-1)); + cg->jne(not_unrepresentable, CodeGenerator::T_SHORT); + + cg->mov(rlo, 0x80000000u); + cg->xor_(rhi, rhi); + cg->jmp(done, CodeGenerator::T_SHORT); + + cg->L(not_unrepresentable); + + cg->idiv(rt); + cg->mov(rlo, cg->eax); + cg->mov(rhi, cg->edx); + + cg->L(done); +} + +void CPU::NewRec::X64Compiler::Compile_divu(CompileFlags cf) +{ + // not supported without registers for now.. + DebugAssert(cf.valid_host_lo && cf.valid_host_hi); + + const Reg32 rt = cf.valid_host_t ? CFGetRegT(cf) : cg->ecx; + if (!cf.valid_host_t) + MoveTToReg(rt, cf); + + const Reg32 rlo = CFGetRegLO(cf); + const Reg32 rhi = CFGetRegHI(cf); + + MoveSToReg(cg->eax, cf); + cg->xor_(cg->edx, cg->edx); + + Label done; + Label not_divide_by_zero; + cg->test(rt, rt); + cg->jnz(not_divide_by_zero, CodeGenerator::T_SHORT); + cg->mov(rlo, static_cast(-1)); + cg->mov(rhi, cg->eax); + cg->jmp(done, CodeGenerator::T_SHORT); + + cg->L(not_divide_by_zero); + cg->div(rt); + cg->mov(rlo, cg->eax); + cg->mov(rhi, cg->edx); + + cg->L(done); +} + +void CPU::NewRec::X64Compiler::TestOverflow(const Xbyak::Reg32& result) +{ + SwitchToFarCode(true, &Xbyak::CodeGenerator::jo); + + BackupHostState(); + + // toss the result + ClearHostReg(result.getIdx()); + + EndBlockWithException(Exception::Ov); + + RestoreHostState(); + + SwitchToNearCode(false); +} + +void CPU::NewRec::X64Compiler::Compile_dst_op( + CompileFlags cf, void (Xbyak::CodeGenerator::*op)(const Xbyak::Operand&, const Xbyak::Operand&), + void (Xbyak::CodeGenerator::*op_const)(const Xbyak::Operand&, u32), bool commutative, bool overflow) +{ + if (cf.valid_host_s && cf.valid_host_t) + { + if (cf.host_d == cf.host_s) + { + (cg->*op)(CFGetRegD(cf), CFGetRegT(cf)); + } + else if (cf.host_d == cf.host_t) + { + if (commutative) + { + (cg->*op)(CFGetRegD(cf), CFGetRegS(cf)); + } + else + { + cg->mov(RWARG1, CFGetRegT(cf)); + cg->mov(CFGetRegD(cf), CFGetRegS(cf)); + (cg->*op)(CFGetRegD(cf), RWARG1); + } + } + else + { + cg->mov(CFGetRegD(cf), CFGetRegS(cf)); + (cg->*op)(CFGetRegD(cf), CFGetRegT(cf)); + } + } + else if (commutative && (cf.const_s || cf.const_t)) + { + const Reg32 rd = CFGetRegD(cf); + (cf.const_s) ? MoveTToReg(rd, cf) : MoveSToReg(rd, cf); + if (const u32 cv = GetConstantRegU32(cf.const_s ? cf.MipsS() : cf.MipsT()); cv != 0) + (cg->*op_const)(CFGetRegD(cf), cv); + else + overflow = false; + } + else if (cf.const_s) + { + // need to backup T? 
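+    // Yes, when d aliases t: loading the constant s into d would clobber t, so t is copied to
+    // RWARG1 first. This matters for the non-commutative ops that reach this branch with a
+    // constant rs (e.g. sub/sltu), per the Compile_dst_op callers below.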
+ if (cf.valid_host_d && cf.valid_host_t && cf.host_d == cf.host_t) + { + cg->mov(RWARG1, CFGetRegT(cf)); + MoveSToReg(CFGetRegD(cf), cf); + (cg->*op)(CFGetRegD(cf), RWARG1); + } + else + { + MoveSToReg(CFGetRegD(cf), cf); + (cg->*op)(CFGetRegD(cf), CFGetRegT(cf)); + } + } + else if (cf.const_t) + { + MoveSToReg(CFGetRegD(cf), cf); + if (const u32 cv = GetConstantRegU32(cf.MipsT()); cv != 0) + (cg->*op_const)(CFGetRegD(cf), cv); + else + overflow = false; + } + else if (cf.valid_host_s) + { + if (cf.host_d != cf.host_s) + cg->mov(CFGetRegD(cf), CFGetRegS(cf)); + (cg->*op)(CFGetRegD(cf), MipsPtr(cf.MipsT())); + } + else if (cf.valid_host_t) + { + if (cf.host_d != cf.host_t) + cg->mov(CFGetRegD(cf), CFGetRegT(cf)); + (cg->*op)(CFGetRegD(cf), MipsPtr(cf.MipsS())); + } + else + { + cg->mov(CFGetRegD(cf), MipsPtr(cf.MipsS())); + (cg->*op)(CFGetRegD(cf), MipsPtr(cf.MipsT())); + } + + if (overflow) + { + DebugAssert(cf.valid_host_d); + TestOverflow(CFGetRegD(cf)); + } +} + +void CPU::NewRec::X64Compiler::Compile_add(CompileFlags cf) +{ + Compile_dst_op(cf, &CodeGenerator::add, &CodeGenerator::add, true, g_settings.cpu_recompiler_memory_exceptions); +} + +void CPU::NewRec::X64Compiler::Compile_addu(CompileFlags cf) +{ + Compile_dst_op(cf, &CodeGenerator::add, &CodeGenerator::add, true, false); +} + +void CPU::NewRec::X64Compiler::Compile_sub(CompileFlags cf) +{ + Compile_dst_op(cf, &CodeGenerator::sub, &CodeGenerator::sub, false, g_settings.cpu_recompiler_memory_exceptions); +} + +void CPU::NewRec::X64Compiler::Compile_subu(CompileFlags cf) +{ + Compile_dst_op(cf, &CodeGenerator::sub, &CodeGenerator::sub, false, false); +} + +void CPU::NewRec::X64Compiler::Compile_and(CompileFlags cf) +{ + // special cases - and with self -> self, and with 0 -> 0 + const Reg32 regd = CFGetRegD(cf); + if (cf.MipsS() == cf.MipsT()) + { + MoveSToReg(regd, cf); + return; + } + else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0)) + { + cg->xor_(regd, regd); + return; + } + + Compile_dst_op(cf, &CodeGenerator::and_, &CodeGenerator::and_, true, false); +} + +void CPU::NewRec::X64Compiler::Compile_or(CompileFlags cf) +{ + // or/nor with 0 -> no effect + const Reg32 regd = CFGetRegD(cf); + if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0) || cf.MipsS() == cf.MipsT()) + { + cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf); + return; + } + + Compile_dst_op(cf, &CodeGenerator::or_, &CodeGenerator::or_, true, false); +} + +void CPU::NewRec::X64Compiler::Compile_xor(CompileFlags cf) +{ + const Reg32 regd = CFGetRegD(cf); + if (cf.MipsS() == cf.MipsT()) + { + // xor with self -> zero + cg->xor_(regd, regd); + return; + } + else if (HasConstantRegValue(cf.MipsS(), 0) || HasConstantRegValue(cf.MipsT(), 0)) + { + // xor with zero -> no effect + cf.const_s ? MoveTToReg(regd, cf) : MoveSToReg(regd, cf); + return; + } + + Compile_dst_op(cf, &CodeGenerator::xor_, &CodeGenerator::xor_, true, false); +} + +void CPU::NewRec::X64Compiler::Compile_nor(CompileFlags cf) +{ + Compile_or(cf); + cg->not_(CFGetRegD(cf)); +} + +void CPU::NewRec::X64Compiler::Compile_slt(CompileFlags cf) +{ + Compile_slt(cf, true); +} + +void CPU::NewRec::X64Compiler::Compile_sltu(CompileFlags cf) +{ + Compile_slt(cf, false); +} + +void CPU::NewRec::X64Compiler::Compile_slt(CompileFlags cf, bool sign) +{ + const Reg32 rd = CFGetRegD(cf); + const Reg32 rs = cf.valid_host_s ? CFGetRegS(cf) : RWARG1; + const Reg32 rt = cf.valid_host_t ? 
CFGetRegT(cf) : RWARG1; + if (!cf.valid_host_s) + MoveSToReg(rs, cf); + + // Case where D == S, can't use xor because it changes flags + // TODO: swap and reverse op for constants + if (rd != rs && rd != rt) + cg->xor_(rd, rd); + + if (cf.valid_host_t) + cg->cmp(rs, CFGetRegT(cf)); + else if (cf.const_t) + cg->cmp(rs, GetConstantRegU32(cf.MipsT())); + else + cg->cmp(rs, MipsPtr(cf.MipsT())); + + if (rd == rs || rd == rt) + cg->mov(rd, 0); + + sign ? cg->setl(rd.cvt8()) : cg->setb(rd.cvt8()); +} + +Xbyak::Reg32 +CPU::NewRec::X64Compiler::ComputeLoadStoreAddressArg(CompileFlags cf, + const std::optional& address, + const std::optional& reg /* = std::nullopt */) +{ + const u32 imm = inst->i.imm_sext32(); + if (cf.valid_host_s && imm == 0 && !reg.has_value()) + return CFGetRegS(cf); + + const Reg32 dst = reg.has_value() ? reg.value() : RWARG1; + if (address.has_value()) + { + cg->mov(dst, address.value()); + } + else + { + if (cf.valid_host_s) + { + if (const Reg32 src = CFGetRegS(cf); src != dst) + cg->mov(dst, CFGetRegS(cf)); + } + else + { + cg->mov(dst, MipsPtr(cf.MipsS())); + } + + if (imm != 0) + cg->add(dst, inst->i.imm_sext32()); + } + + return dst; +} + +template +Xbyak::Reg32 CPU::NewRec::X64Compiler::GenerateLoad(const Xbyak::Reg32& addr_reg, MemoryAccessSize size, bool sign, + const RegAllocFn& dst_reg_alloc) +{ + const bool checked = g_settings.cpu_recompiler_memory_exceptions; + if (CodeCache::IsUsingFastmem() && !checked) + { + m_cycles += Bus::RAM_READ_TICKS; + + const Reg32 dst = dst_reg_alloc(); + + if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) + { + DebugAssert(addr_reg != RWARG3); + cg->mov(RWARG3, addr_reg.cvt32()); + cg->shr(RWARG3, Bus::FASTMEM_LUT_PAGE_SHIFT); + cg->mov(RXARG3, cg->qword[RMEMBASE + RXARG3 * 8]); + } + + const Reg64 membase = (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE; + u8* start = cg->getCurr(); + switch (size) + { + case MemoryAccessSize::Byte: + { + sign ? cg->movsx(dst, cg->byte[membase + addr_reg.cvt64()]) : + cg->movzx(dst, cg->byte[membase + addr_reg.cvt64()]); + } + break; + + case MemoryAccessSize::HalfWord: + { + sign ? cg->movsx(dst, cg->word[membase + addr_reg.cvt64()]) : + cg->movzx(dst, cg->word[membase + addr_reg.cvt64()]); + } + break; + + case MemoryAccessSize::Word: + { + cg->mov(dst, cg->word[membase + addr_reg.cvt64()]); + } + break; + } + + u8* end = cg->getCurr(); + while ((end - start) < BACKPATCH_JMP_SIZE) + { + cg->nop(); + end = cg->getCurr(); + } + + AddLoadStoreInfo(start, static_cast(end - start), static_cast(addr_reg.getIdx()), + static_cast(dst.getIdx()), size, sign, true); + return dst; + } + + if (addr_reg != RWARG1) + cg->mov(RWARG1, addr_reg); + + switch (size) + { + case MemoryAccessSize::Byte: + { + cg->call(checked ? reinterpret_cast(&Recompiler::Thunks::ReadMemoryByte) : + reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryByte)); + } + break; + case MemoryAccessSize::HalfWord: + { + cg->call(checked ? reinterpret_cast(&Recompiler::Thunks::ReadMemoryHalfWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryHalfWord)); + } + break; + case MemoryAccessSize::Word: + { + cg->call(checked ? 
reinterpret_cast(&Recompiler::Thunks::ReadMemoryWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryWord)); + } + break; + } + + // TODO: turn this into an asm function instead + if (checked) + { + cg->test(RXRET, RXRET); + + BackupHostState(); + SwitchToFarCode(true, &CodeGenerator::js); + + // flush regs, but not pc, it's going to get overwritten + // flush cycles because of the GTE instruction stuff... + Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION); + + // cause_bits = (-result << 2) | BD | cop_n + cg->mov(RWARG1, RWRET); + cg->neg(RWARG1); + cg->shl(RWARG1, 2); + cg->or_(RWARG1, Cop0Registers::CAUSE::MakeValueForException( + static_cast(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)); + cg->mov(RWARG2, m_current_instruction_pc); + cg->call(static_cast(&CPU::RaiseException)); + m_dirty_pc = false; + EndAndLinkBlock(std::nullopt, true); + + SwitchToNearCode(false); + RestoreHostState(); + } + + const Xbyak::Reg32 dst_reg = dst_reg_alloc(); + switch (size) + { + case MemoryAccessSize::Byte: + { + sign ? cg->movsx(dst_reg, RWRET.cvt8()) : cg->movzx(dst_reg, RWRET.cvt8()); + } + break; + case MemoryAccessSize::HalfWord: + { + sign ? cg->movsx(dst_reg, RWRET.cvt16()) : cg->movzx(dst_reg, RWRET.cvt16()); + } + break; + case MemoryAccessSize::Word: + { + if (dst_reg != RWRET) + cg->mov(dst_reg, RWRET); + } + break; + } + + return dst_reg; +} + +void CPU::NewRec::X64Compiler::GenerateStore(const Xbyak::Reg32& addr_reg, const Xbyak::Reg32& value_reg, + MemoryAccessSize size) +{ + const bool checked = g_settings.cpu_recompiler_memory_exceptions; + if (CodeCache::IsUsingFastmem() && !checked) + { + if (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) + { + DebugAssert(addr_reg != RWARG3 && value_reg != RWARG3); + cg->mov(RWARG3, addr_reg.cvt32()); + cg->shr(RWARG3, Bus::FASTMEM_LUT_PAGE_SHIFT); + cg->mov(RXARG3, cg->qword[RMEMBASE + RXARG3 * 8]); + } + + const Reg64 membase = (g_settings.cpu_fastmem_mode == CPUFastmemMode::LUT) ? RXARG3 : RMEMBASE; + u8* start = cg->getCurr(); + switch (size) + { + case MemoryAccessSize::Byte: + cg->mov(cg->byte[membase + addr_reg.cvt64()], value_reg.cvt8()); + break; + + case MemoryAccessSize::HalfWord: + cg->mov(cg->word[membase + addr_reg.cvt64()], value_reg.cvt16()); + break; + + case MemoryAccessSize::Word: + cg->mov(cg->word[membase + addr_reg.cvt64()], value_reg.cvt32()); + break; + } + + u8* end = cg->getCurr(); + while ((end - start) < BACKPATCH_JMP_SIZE) + { + cg->nop(); + end = cg->getCurr(); + } + + AddLoadStoreInfo(start, static_cast(end - start), static_cast(addr_reg.getIdx()), + static_cast(value_reg.getIdx()), size, false, false); + return; + } + + if (addr_reg != RWARG1) + cg->mov(RWARG1, addr_reg); + if (value_reg != RWARG2) + cg->mov(RWARG2, value_reg); + + switch (size) + { + case MemoryAccessSize::Byte: + { + cg->call(checked ? reinterpret_cast(&Recompiler::Thunks::WriteMemoryByte) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryByte)); + } + break; + case MemoryAccessSize::HalfWord: + { + cg->call(checked ? reinterpret_cast(&Recompiler::Thunks::WriteMemoryHalfWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord)); + } + break; + case MemoryAccessSize::Word: + { + cg->call(checked ? 
reinterpret_cast(&Recompiler::Thunks::WriteMemoryWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryWord)); + } + break; + } + + // TODO: turn this into an asm function instead + if (checked) + { + cg->test(RWRET, RWRET); + + BackupHostState(); + SwitchToFarCode(true, &CodeGenerator::jnz); + + // flush regs, but not pc, it's going to get overwritten + // flush cycles because of the GTE instruction stuff... + Flush(FLUSH_FOR_C_CALL | FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION); + + // cause_bits = (result << 2) | BD | cop_n + cg->mov(RWARG1, RWRET); + cg->shl(RWARG1, 2); + cg->or_(RWARG1, Cop0Registers::CAUSE::MakeValueForException( + static_cast(0), m_current_instruction_branch_delay_slot, false, inst->cop.cop_n)); + cg->mov(RWARG2, m_current_instruction_pc); + cg->call(reinterpret_cast(static_cast(&CPU::RaiseException))); + m_dirty_pc = false; + EndAndLinkBlock(std::nullopt, true); + + SwitchToNearCode(false); + RestoreHostState(); + } +} + +void CPU::NewRec::X64Compiler::Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + const std::optional addr_reg = g_settings.gpu_pgxp_enable ? + std::optional(Reg32(AllocateTempHostReg(HR_CALLEE_SAVED))) : + std::optional(); + FlushForLoadStore(address, false); + const Reg32 addr = ComputeLoadStoreAddressArg(cf, address, addr_reg); + + const Reg32 data = GenerateLoad(addr, size, sign, [this, cf]() { + if (cf.MipsT() == Reg::zero) + return RWRET; + + return Reg32(AllocateHostReg(GetFlagsForNewLoadDelayedReg(), + EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, cf.MipsT())); + }); + + if (g_settings.gpu_pgxp_enable) + { + Flush(FLUSH_FOR_C_CALL); + + cg->mov(RWARG1, inst->bits); + cg->mov(RWARG2, addr); + cg->mov(RWARG3, data); + cg->call(s_pgxp_mem_load_functions[static_cast(size)][static_cast(sign)]); + FreeHostReg(addr_reg.value().getIdx()); + } +} + +void CPU::NewRec::X64Compiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + DebugAssert(size == MemoryAccessSize::Word && !sign); + FlushForLoadStore(address, false); + + // TODO: if address is constant, this can be simplified.. + + // If we're coming from another block, just flush the load delay and hope for the best.. + if (m_load_delay_dirty) + UpdateLoadDelay(); + + // We'd need to be careful here if we weren't overwriting it.. + const Reg32 addr = Reg32(AllocateHostReg(HR_CALLEE_SAVED, HR_TYPE_TEMP)); + ComputeLoadStoreAddressArg(cf, address, addr); + cg->mov(RWARG1, addr); + cg->and_(RWARG1, ~0x3u); + GenerateLoad(RWARG1, MemoryAccessSize::Word, false, []() { return RWRET; }); + + if (inst->r.rt == Reg::zero) + { + FreeHostReg(addr.getIdx()); + return; + } + + // lwl/lwr from a load-delayed value takes the new value, but it itself, is load delayed, so the original value is + // never written back. NOTE: can't trust T in cf because of the flush + const Reg rt = inst->r.rt; + Reg32 value; + if (m_load_delay_register == rt) + { + const u32 existing_ld_rt = (m_load_delay_value_register == NUM_HOST_REGS) ? 
+ AllocateHostReg(HR_MODE_READ, HR_TYPE_LOAD_DELAY_VALUE, rt) : + m_load_delay_value_register; + RenameHostReg(existing_ld_rt, HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt); + value = Reg32(existing_ld_rt); + } + else + { + if constexpr (EMULATE_LOAD_DELAYS) + { + value = Reg32(AllocateHostReg(HR_MODE_WRITE, HR_TYPE_NEXT_LOAD_DELAY_VALUE, rt)); + if (HasConstantReg(rt)) + cg->mov(value, GetConstantRegU32(rt)); + else if (const std::optional rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value()) + cg->mov(value, Reg32(rtreg.value())); + else + cg->mov(value, MipsPtr(rt)); + } + else + { + value = Reg32(AllocateHostReg(HR_MODE_READ | HR_MODE_WRITE, HR_TYPE_CPU_REG, rt)); + } + } + + DebugAssert(value != cg->ecx); + cg->mov(cg->ecx, addr); + cg->and_(cg->ecx, 3); + cg->shl(cg->ecx, 3); // *8 + + // TODO for other arch: reverse subtract + DebugAssert(RWARG2 != cg->ecx); + cg->mov(RWARG2, 24); + cg->sub(RWARG2, cg->ecx); + + if (inst->op == InstructionOp::lwl) + { + // const u32 mask = UINT32_C(0x00FFFFFF) >> shift; + // new_value = (value & mask) | (RWRET << (24 - shift)); + cg->mov(addr, 0xFFFFFFu); + cg->shr(addr, cg->cl); + cg->and_(value, addr); + cg->mov(cg->ecx, RWARG2); + cg->shl(RWRET, cg->cl); + cg->or_(value, RWRET); + } + else + { + // const u32 mask = UINT32_C(0xFFFFFF00) << (24 - shift); + // new_value = (value & mask) | (RWRET >> shift); + cg->shr(RWRET, cg->cl); + cg->mov(addr, 0xFFFFFF00u); + cg->mov(cg->ecx, RWARG2); + cg->shl(addr, cg->cl); + cg->and_(value, addr); + cg->or_(value, RWRET); + } + + FreeHostReg(addr.getIdx()); +} + +void CPU::NewRec::X64Compiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + const std::optional addr_reg = g_settings.gpu_pgxp_enable ? 
+ std::optional(Reg32(AllocateTempHostReg(HR_CALLEE_SAVED))) : + std::optional(); + FlushForLoadStore(address, false); + const Reg32 addr = ComputeLoadStoreAddressArg(cf, address, addr_reg); + GenerateLoad(addr, MemoryAccessSize::Word, false, []() { return RWRET; }); + + const u32 index = static_cast(inst->r.rt.GetValue()); + const auto [ptr, action] = GetGTERegisterPointer(index, true); + switch (action) + { + case GTERegisterAccessAction::Ignore: + { + break; + } + + case GTERegisterAccessAction::Direct: + { + cg->mov(cg->dword[PTR(ptr)], RWRET); + break; + } + + case GTERegisterAccessAction::SignExtend16: + { + cg->movsx(RWRET, RWRET.cvt16()); + cg->mov(cg->dword[PTR(ptr)], RWRET); + break; + } + + case GTERegisterAccessAction::ZeroExtend16: + { + cg->movzx(RWRET, RWRET.cvt16()); + cg->mov(cg->dword[PTR(ptr)], RWRET); + break; + } + + case GTERegisterAccessAction::CallHandler: + { + Flush(FLUSH_FOR_C_CALL); + cg->mov(RWARG2, RWRET); + cg->mov(RWARG1, index); + cg->call(>E::WriteRegister); + break; + } + + case GTERegisterAccessAction::PushFIFO: + { + // SXY0 <- SXY1 + // SXY1 <- SXY2 + // SXY2 <- SXYP + DebugAssert(RWRET != RWARG1 && RWRET != RWARG2); + cg->mov(RWARG1, cg->dword[PTR(&g_state.gte_regs.SXY1[0])]); + cg->mov(RWARG2, cg->dword[PTR(&g_state.gte_regs.SXY2[0])]); + cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY0[0])], RWARG1); + cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY1[0])], RWARG2); + cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], RWRET); + break; + } + + default: + { + Panic("Unknown action"); + return; + } + } + + if (g_settings.gpu_pgxp_enable) + { + Flush(FLUSH_FOR_C_CALL); + cg->mov(RWARG3, RWRET); + cg->mov(RWARG2, addr); + cg->mov(RWARG1, inst->bits); + cg->call(reinterpret_cast(&PGXP::CPU_LWC2)); + FreeHostReg(addr_reg.value().getIdx()); + } +} + +void CPU::NewRec::X64Compiler::Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + const std::optional addr_reg = g_settings.gpu_pgxp_enable ? + std::optional(Reg32(AllocateTempHostReg(HR_CALLEE_SAVED))) : + std::optional(); + FlushForLoadStore(address, true); + const Reg32 addr = ComputeLoadStoreAddressArg(cf, address, addr_reg); + const Reg32 data = cf.valid_host_t ? CFGetRegT(cf) : RWARG2; + if (!cf.valid_host_t) + MoveTToReg(RWARG2, cf); + + GenerateStore(addr, data, size); + + if (g_settings.gpu_pgxp_enable) + { + Flush(FLUSH_FOR_C_CALL); + MoveMIPSRegToReg(RWARG3, cf.MipsT()); + cg->mov(RWARG2, addr); + cg->mov(RWARG1, inst->bits); + cg->call(s_pgxp_mem_store_functions[static_cast(size)]); + FreeHostReg(addr_reg.value().getIdx()); + } +} + +void CPU::NewRec::X64Compiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + DebugAssert(size == MemoryAccessSize::Word && !sign); + FlushForLoadStore(address, true); + + // TODO: if address is constant, this can be simplified.. + // We'd need to be careful here if we weren't overwriting it.. 
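+  // swl/swr are emulated as a read-modify-write of the aligned word: load the word at
+  // (addr & ~3u), merge the relevant bytes of rt into it according to (addr & 3), then store the
+  // merged word back to the same aligned address (see the mask comments below).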
+ const Reg32 addr = Reg32(AllocateHostReg(HR_CALLEE_SAVED, HR_TYPE_TEMP)); + ComputeLoadStoreAddressArg(cf, address, addr); + cg->mov(RWARG1, addr); + cg->and_(RWARG1, ~0x3u); + GenerateLoad(RWARG1, MemoryAccessSize::Word, false, []() { return RWRET; }); + + // TODO: this can take over rt's value if it's no longer needed + // NOTE: can't trust T in cf because of the flush + const Reg rt = inst->r.rt; + const Reg32 value = RWARG2; + DebugAssert(value != cg->ecx); + if (HasConstantReg(rt)) + cg->mov(value, GetConstantRegU32(rt)); + else if (const std::optional rtreg = CheckHostReg(HR_MODE_READ, HR_TYPE_CPU_REG, rt); rtreg.has_value()) + cg->mov(value, Reg32(rtreg.value())); + else + cg->mov(value, MipsPtr(rt)); + + cg->mov(cg->ecx, addr); + cg->and_(cg->ecx, 3); + cg->shl(cg->ecx, 3); // *8 + + if (inst->op == InstructionOp::swl) + { + // const u32 mem_mask = UINT32_C(0xFFFFFF00) << shift; + // new_value = (RWRET & mem_mask) | (value >> (24 - shift)); + cg->mov(RWARG3, 0xFFFFFF00u); + cg->shl(RWARG3, cg->cl); + cg->and_(RWRET, RWARG3); + + cg->mov(RWARG3, 24); + cg->sub(RWARG3, cg->ecx); + cg->mov(cg->ecx, RWARG3); + cg->shr(value, cg->cl); + cg->or_(value, RWRET); + } + else + { + // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift); + // new_value = (RWRET & mem_mask) | (value << shift); + cg->shl(value, cg->cl); + + DebugAssert(RWARG3 != cg->ecx); + cg->mov(RWARG3, 24); + cg->sub(RWARG3, cg->ecx); + cg->mov(cg->ecx, RWARG3); + cg->mov(RWARG3, 0x00FFFFFFu); + cg->shr(RWARG3, cg->cl); + cg->and_(RWRET, RWARG3); + cg->or_(value, RWRET); + } + + FreeHostReg(addr.getIdx()); + + cg->mov(RWARG1, addr); + cg->and_(RWARG1, ~0x3u); + GenerateStore(RWARG1, value, MemoryAccessSize::Word); +} + +void CPU::NewRec::X64Compiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) +{ + const u32 index = static_cast(inst->r.rt.GetValue()); + const auto [ptr, action] = GetGTERegisterPointer(index, false); + switch (action) + { + case GTERegisterAccessAction::Direct: + { + cg->mov(RWARG2, cg->dword[PTR(ptr)]); + } + break; + + case GTERegisterAccessAction::CallHandler: + { + // should already be flushed.. except in fastmem case + Flush(FLUSH_FOR_C_CALL); + cg->mov(RWARG1, index); + cg->call(>E::ReadRegister); + cg->mov(RWARG2, RWRET); + } + break; + + default: + { + Panic("Unknown action"); + } + break; + } + + // PGXP makes this a giant pain. + if (!g_settings.gpu_pgxp_enable) + { + FlushForLoadStore(address, true); + const Reg32 addr = ComputeLoadStoreAddressArg(cf, address); + GenerateStore(addr, RWARG2, size); + return; + } + + // TODO: This can be simplified because we don't need to validate in PGXP.. + const Reg32 addr_reg = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED)); + const Reg32 data_backup = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED)); + FlushForLoadStore(address, true); + ComputeLoadStoreAddressArg(cf, address, addr_reg); + cg->mov(data_backup, RWARG2); + GenerateStore(addr_reg, RWARG2, size); + + Flush(FLUSH_FOR_C_CALL); + cg->mov(RWARG3, data_backup); + cg->mov(RWARG2, addr_reg); + cg->mov(RWARG1, inst->bits); + cg->call(reinterpret_cast(&PGXP::CPU_SWC2)); + FreeHostReg(addr_reg.getIdx()); + FreeHostReg(data_backup.getIdx()); +} + +void CPU::NewRec::X64Compiler::Compile_mtc0(CompileFlags cf) +{ + const Cop0Reg reg = static_cast(MipsD()); + const u32* ptr = GetCop0RegPtr(reg); + const u32 mask = GetCop0RegWriteMask(reg); + if (!ptr) + { + Compile_Fallback(); + return; + } + + // TODO: const apply mask + const Reg32 rt = cf.valid_host_t ? 
CFGetRegT(cf) : RWARG1; + const u32 constant_value = cf.const_t ? GetConstantRegU32(cf.MipsT()) : 0; + if (mask == 0) + { + // if it's a read-only register, ignore + Log_DebugPrintf("Ignoring write to read-only cop0 reg %u", static_cast(reg)); + return; + } + + // for some registers, we need to test certain bits + const bool needs_bit_test = (reg == Cop0Reg::SR); + const Reg32 changed_bits = RWARG3; + + // update value + if (cf.valid_host_t) + { + cg->mov(RWARG1, rt); + cg->mov(RWARG2, cg->dword[PTR(ptr)]); + cg->and_(RWARG1, mask); + if (needs_bit_test) + { + cg->mov(changed_bits, RWARG2); + cg->xor_(changed_bits, RWARG1); + } + cg->and_(RWARG2, ~mask); + cg->or_(RWARG2, RWARG1); + cg->mov(cg->dword[PTR(ptr)], RWARG2); + } + else + { + cg->mov(RWARG2, cg->dword[PTR(ptr)]); + if (needs_bit_test) + { + cg->mov(changed_bits, RWARG2); + cg->xor_(changed_bits, constant_value & mask); + } + cg->and_(RWARG2, ~mask); + cg->or_(RWARG2, constant_value & mask); + cg->mov(cg->dword[PTR(ptr)], RWARG2); + } + + if (reg == Cop0Reg::SR) + { + // TODO: replace with register backup + // We could just inline the whole thing.. + Flush(FLUSH_FOR_C_CALL); + + cg->test(changed_bits, 1u << 16); + SwitchToFarCode(true, &CodeGenerator::jnz); + cg->push(RWARG1); + cg->push(RWARG2); + cg->call(&CPU::UpdateMemoryPointers); + cg->pop(RWARG2); + cg->pop(RWARG1); + cg->mov(RMEMBASE, cg->qword[PTR(&g_state.fastmem_base)]); + SwitchToNearCode(true); + } + + if (reg == Cop0Reg::SR || reg == Cop0Reg::CAUSE) + { + const Reg32 sr = + (reg == Cop0Reg::SR) ? RWARG2 : (cg->mov(RWARG1, cg->dword[PTR(&g_state.cop0_regs.sr.bits)]), RWARG1); + TestInterrupts(sr); + } + + if (reg == Cop0Reg::DCIC && g_settings.cpu_recompiler_memory_exceptions) + { + // TODO: DCIC handling for debug breakpoints + Log_WarningPrintf("TODO: DCIC handling for debug breakpoints"); + } +} + +void CPU::NewRec::X64Compiler::Compile_rfe(CompileFlags cf) +{ + // shift mode bits right two, preserving upper bits + static constexpr u32 mode_bits_mask = UINT32_C(0b1111); + cg->mov(RWARG1, cg->dword[PTR(&g_state.cop0_regs.sr.bits)]); + cg->mov(RWARG2, RWARG1); + cg->shr(RWARG2, 2); + cg->and_(RWARG1, ~mode_bits_mask); + cg->and_(RWARG2, mode_bits_mask); + cg->or_(RWARG1, RWARG2); + cg->mov(cg->dword[PTR(&g_state.cop0_regs.sr.bits)], RWARG1); + + TestInterrupts(RWARG1); +} + +void CPU::NewRec::X64Compiler::TestInterrupts(const Xbyak::Reg32& sr) +{ + // if Iec == 0 then goto no_interrupt + Label no_interrupt; + + cg->test(sr, 1); + cg->jz(no_interrupt, CodeGenerator::T_NEAR); + + // sr & cause + cg->and_(sr, cg->dword[PTR(&g_state.cop0_regs.cause.bits)]); + + // ((sr & cause) & 0xff00) == 0 goto no_interrupt + cg->test(sr, 0xFF00); + + SwitchToFarCode(true, &CodeGenerator::jnz); + BackupHostState(); + Flush(FLUSH_FLUSH_MIPS_REGISTERS | FLUSH_FOR_EXCEPTION | FLUSH_FOR_C_CALL); + cg->call(reinterpret_cast(&DispatchInterrupt)); + EndBlock(std::nullopt, true); + RestoreHostState(); + SwitchToNearCode(false); + + cg->L(no_interrupt); +} + +void CPU::NewRec::X64Compiler::Compile_mfc2(CompileFlags cf) +{ + const u32 index = inst->cop.Cop2Index(); + const Reg rt = inst->r.rt; + + const auto [ptr, action] = GetGTERegisterPointer(index, false); + if (action == GTERegisterAccessAction::Ignore) + return; + + u32 hreg; + if (action == GTERegisterAccessAction::Direct) + { + hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(), + EMULATE_LOAD_DELAYS ? 
HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
+    cg->mov(Reg32(hreg), cg->dword[PTR(ptr)]);
+  }
+  else if (action == GTERegisterAccessAction::CallHandler)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    cg->mov(RWARG1, index);
+    cg->call(&GTE::ReadRegister);
+
+    hreg = AllocateHostReg(GetFlagsForNewLoadDelayedReg(),
+                           EMULATE_LOAD_DELAYS ? HR_TYPE_NEXT_LOAD_DELAY_VALUE : HR_TYPE_CPU_REG, rt);
+    cg->mov(Reg32(hreg), RWRET);
+  }
+  else
+  {
+    Panic("Unknown action");
+    return;
+  }
+
+  if (g_settings.gpu_pgxp_enable)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    cg->mov(RWARG1, inst->bits);
+    cg->mov(RWARG2, Reg32(hreg));
+    cg->call(reinterpret_cast<const void*>(&PGXP::CPU_MFC2));
+  }
+}
+
+void CPU::NewRec::X64Compiler::Compile_mtc2(CompileFlags cf)
+{
+  const u32 index = inst->cop.Cop2Index();
+  const auto [ptr, action] = GetGTERegisterPointer(index, true);
+  if (action == GTERegisterAccessAction::Ignore)
+    return;
+
+  if (action == GTERegisterAccessAction::Direct)
+  {
+    if (cf.const_t)
+    {
+      cg->mov(cg->dword[PTR(ptr)], GetConstantRegU32(cf.MipsT()));
+    }
+    else if (cf.valid_host_t)
+    {
+      cg->mov(cg->dword[PTR(ptr)], CFGetRegT(cf));
+    }
+    else
+    {
+      cg->mov(RWARG1, MipsPtr(cf.MipsT()));
+      cg->mov(cg->dword[PTR(ptr)], RWARG1);
+    }
+  }
+  else if (action == GTERegisterAccessAction::SignExtend16 || action == GTERegisterAccessAction::ZeroExtend16)
+  {
+    const bool sign = (action == GTERegisterAccessAction::SignExtend16);
+    if (cf.const_t)
+    {
+      const u16 cv = Truncate16(GetConstantRegU32(cf.MipsT()));
+      cg->mov(cg->dword[PTR(ptr)], sign ? ::SignExtend32(cv) : ::ZeroExtend32(cv));
+    }
+    else if (cf.valid_host_t)
+    {
+      sign ? cg->movsx(RWARG1, Reg16(cf.host_t)) : cg->movzx(RWARG1, Reg16(cf.host_t));
+      cg->mov(cg->dword[PTR(ptr)], RWARG1);
+    }
+    else
+    {
+      sign ? cg->movsx(RWARG1, cg->word[PTR(&g_state.regs.r[cf.mips_t])]) :
+             cg->movzx(RWARG1, cg->word[PTR(&g_state.regs.r[cf.mips_t])]);
+      cg->mov(cg->dword[PTR(ptr)], RWARG1);
+    }
+  }
+  else if (action == GTERegisterAccessAction::CallHandler)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    cg->mov(RWARG1, index);
+    MoveTToReg(RWARG2, cf);
+    cg->call(&GTE::WriteRegister);
+  }
+  else if (action == GTERegisterAccessAction::PushFIFO)
+  {
+    // SXY0 <- SXY1
+    // SXY1 <- SXY2
+    // SXY2 <- SXYP
+    cg->mov(RWARG1, cg->dword[PTR(&g_state.gte_regs.SXY1[0])]);
+    cg->mov(RWARG2, cg->dword[PTR(&g_state.gte_regs.SXY2[0])]);
+    if (!cf.const_t && !cf.valid_host_t)
+      cg->mov(RWARG3, MipsPtr(cf.MipsT()));
+    cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY0[0])], RWARG1);
+    cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY1[0])], RWARG2);
+    if (cf.const_t)
+      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], GetConstantRegU32(cf.MipsT()));
+    else if (cf.valid_host_t)
+      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], CFGetRegT(cf));
+    else
+      cg->mov(cg->dword[PTR(&g_state.gte_regs.SXY2[0])], RWARG3);
+  }
+  else
+  {
+    Panic("Unknown action");
+  }
+}
+
+void CPU::NewRec::X64Compiler::Compile_cop2(CompileFlags cf)
+{
+  TickCount func_ticks;
+  GTE::InstructionImpl func = GTE::GetInstructionImpl(inst->bits, &func_ticks);
+
+  Flush(FLUSH_FOR_C_CALL);
+  cg->mov(RWARG1, inst->bits & GTE::Instruction::REQUIRED_BITS_MASK);
+  cg->call(reinterpret_cast<const void*>(func));
+
+  AddGTETicks(func_ticks);
+}
+
+u32 CPU::NewRec::CompileLoadStoreThunk(void* thunk_code, u32 thunk_space, void* code_address, u32 code_size,
+                                       TickCount cycles_to_add, TickCount cycles_to_remove, u32 gpr_bitmask,
+                                       u8 address_register, u8 data_register, MemoryAccessSize size, bool is_signed,
+                                       bool is_load)
+{
+  CodeGenerator acg(thunk_space, thunk_code);
+  CodeGenerator* cg = &acg;
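// ---------------------------------------------------------------------------
// Standalone sketch (not part of the patch) of the stack-frame sizing the
// slow-path thunk uses below: each caller-saved GPR gets an 8-byte slot, the
// slot count is rounded up to an even number (presumably to keep RSP 16-byte
// aligned across the C call), and Win64 additionally reserves 32 bytes of
// shadow space. The helper name ThunkStackSize is invented here; the constants
// mirror the ones defined a few lines further down.
#include <cstdint>

namespace {
constexpr uint32_t GPR_SIZE = 8;
#ifdef _WIN32
constexpr uint32_t SHADOW_SIZE = 32;
#else
constexpr uint32_t SHADOW_SIZE = 0;
#endif

constexpr uint32_t ThunkStackSize(uint32_t num_saved_gprs)
{
  // (n + 1) & ~1 rounds n up to the next even count of 8-byte slots.
  return (((num_saved_gprs + 1) & ~1u) * GPR_SIZE) + SHADOW_SIZE;
}

// e.g. saving three caller-saved registers still allocates four slots.
static_assert(ThunkStackSize(3) == 4 * GPR_SIZE + SHADOW_SIZE);
} // namespace
// ---------------------------------------------------------------------------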
+ + static constexpr u32 GPR_SIZE = 8; + + // on win32, we need to reserve an additional 32 bytes shadow space when calling out to C +#ifdef _WIN32 + static constexpr u32 SHADOW_SIZE = 32; +#else + static constexpr u32 SHADOW_SIZE = 0; +#endif + + // save regs + u32 num_gprs = 0; + + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + if ((gpr_bitmask & (1u << i)) && IsCallerSavedRegister(i) && (!is_load || data_register != i)) + num_gprs++; + } + + const u32 stack_size = (((num_gprs + 1) & ~1u) * GPR_SIZE) + SHADOW_SIZE; + + if (stack_size > 0) + { + cg->sub(cg->rsp, stack_size); + + u32 stack_offset = SHADOW_SIZE; + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + if ((gpr_bitmask & (1u << i)) && IsCallerSavedRegister(i) && (!is_load || data_register != i)) + { + cg->mov(cg->qword[cg->rsp + stack_offset], Reg64(i)); + stack_offset += GPR_SIZE; + } + } + } + + if (cycles_to_add != 0) + cg->add(cg->dword[PTR(&g_state.pending_ticks)], cycles_to_add); + + if (address_register != static_cast(RWARG1.getIdx())) + cg->mov(RWARG1, Reg32(address_register)); + + if (!is_load) + { + if (data_register != static_cast(RWARG2.getIdx())) + cg->mov(RWARG2, Reg32(data_register)); + } + + switch (size) + { + case MemoryAccessSize::Byte: + { + cg->call(is_load ? reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryByte) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryByte)); + } + break; + case MemoryAccessSize::HalfWord: + { + cg->call(is_load ? reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryHalfWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryHalfWord)); + } + break; + case MemoryAccessSize::Word: + { + cg->call(is_load ? reinterpret_cast(&Recompiler::Thunks::UncheckedReadMemoryWord) : + reinterpret_cast(&Recompiler::Thunks::UncheckedWriteMemoryWord)); + } + break; + } + + if (is_load) + { + const Reg32 dst = Reg32(data_register); + switch (size) + { + case MemoryAccessSize::Byte: + { + is_signed ? cg->movsx(dst, RWRET.cvt8()) : cg->movzx(dst, RWRET.cvt8()); + } + break; + case MemoryAccessSize::HalfWord: + { + is_signed ? 
cg->movsx(dst, RWRET.cvt16()) : cg->movzx(dst, RWRET.cvt16()); + } + break; + case MemoryAccessSize::Word: + { + if (dst != RWRET) + cg->mov(dst, RWRET); + } + break; + } + } + + if (cycles_to_remove != 0) + cg->sub(cg->dword[PTR(&g_state.pending_ticks)], cycles_to_remove); + + // restore regs + if (stack_size > 0) + { + u32 stack_offset = SHADOW_SIZE; + for (u32 i = 0; i < NUM_HOST_REGS; i++) + { + if ((gpr_bitmask & (1u << i)) && IsCallerSavedRegister(i) && (!is_load || data_register != i)) + { + cg->mov(Reg64(i), cg->qword[cg->rsp + stack_offset]); + stack_offset += GPR_SIZE; + } + } + + cg->add(cg->rsp, stack_size); + } + + cg->jmp(static_cast(code_address) + code_size); + + // fill the rest of it with nops, if any + DebugAssert(code_size >= BACKPATCH_JMP_SIZE); + if (code_size > BACKPATCH_JMP_SIZE) + std::memset(static_cast(code_address) + BACKPATCH_JMP_SIZE, 0x90, code_size - BACKPATCH_JMP_SIZE); + + return static_cast(cg->getSize()); +} diff --git a/src/core/cpu_newrec_compiler_x64.h b/src/core/cpu_newrec_compiler_x64.h new file mode 100644 index 000000000..e9af43398 --- /dev/null +++ b/src/core/cpu_newrec_compiler_x64.h @@ -0,0 +1,140 @@ +// SPDX-FileCopyrightText: 2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#pragma once +#include "cpu_newrec_compiler.h" +#include +#include + +namespace CPU::NewRec { + +class X64Compiler final : public Compiler +{ +public: + X64Compiler(); + ~X64Compiler() override; + +protected: + const char* GetHostRegName(u32 reg) const override; + + const void* GetCurrentCodePointer() override; + + void LoadHostRegWithConstant(u32 reg, u32 val) override; + void LoadHostRegFromCPUPointer(u32 reg, const void* ptr) override; + void StoreConstantToCPUPointer(u32 val, const void* ptr) override; + void StoreHostRegToCPUPointer(u32 reg, const void* ptr) override; + void CopyHostReg(u32 dst, u32 src) override; + + void Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space, u8* far_code_buffer, + u32 far_code_space) override; + void BeginBlock() override; + void GenerateBlockProtectCheck(const u8* ram_ptr, const u8* shadow_ptr, u32 size) override; + void GenerateICacheCheckAndUpdate() override; + void GenerateCall(const void* func, s32 arg1reg = -1, s32 arg2reg = -1, s32 arg3reg = -1) override; + void EndBlock(const std::optional& newpc, bool do_event_test) override; + void EndBlockWithException(Exception excode) override; + void EndAndLinkBlock(const std::optional& newpc, bool do_event_test); + const void* EndCompile(u32* code_size, u32* far_code_size) override; + + void Flush(u32 flags) override; + + void Compile_Fallback() override; + + void CheckBranchTarget(const Xbyak::Reg32& pcreg); + void Compile_jr(CompileFlags cf) override; + void Compile_jalr(CompileFlags cf) override; + void Compile_bxx(CompileFlags cf, BranchCondition cond) override; + + void Compile_addi(CompileFlags cf) override; + void Compile_addiu(CompileFlags cf) override; + void Compile_slti(CompileFlags cf, bool sign); + void Compile_slti(CompileFlags cf) override; + void Compile_sltiu(CompileFlags cf) override; + void Compile_andi(CompileFlags cf) override; + void Compile_ori(CompileFlags cf) override; + void Compile_xori(CompileFlags cf) override; + + void Compile_sll(CompileFlags cf) override; + void Compile_srl(CompileFlags cf) override; + void Compile_sra(CompileFlags cf) override; + void Compile_variable_shift(CompileFlags cf, + void (Xbyak::CodeGenerator::*op)(const Xbyak::Operand&, const Xbyak::Reg8&), + void 
(Xbyak::CodeGenerator::*op_const)(const Xbyak::Operand&, int)); + void Compile_sllv(CompileFlags cf) override; + void Compile_srlv(CompileFlags cf) override; + void Compile_srav(CompileFlags cf) override; + void Compile_mult(CompileFlags cf, bool sign); + void Compile_mult(CompileFlags cf) override; + void Compile_multu(CompileFlags cf) override; + void Compile_div(CompileFlags cf) override; + void Compile_divu(CompileFlags cf) override; + void TestOverflow(const Xbyak::Reg32& result); + void Compile_dst_op(CompileFlags cf, void (Xbyak::CodeGenerator::*op)(const Xbyak::Operand&, const Xbyak::Operand&), + void (Xbyak::CodeGenerator::*op_const)(const Xbyak::Operand&, u32), bool commutative, + bool overflow); + void Compile_add(CompileFlags cf) override; + void Compile_addu(CompileFlags cf) override; + void Compile_sub(CompileFlags cf) override; + void Compile_subu(CompileFlags cf) override; + void Compile_and(CompileFlags cf) override; + void Compile_or(CompileFlags cf) override; + void Compile_xor(CompileFlags cf) override; + void Compile_nor(CompileFlags cf) override; + void Compile_slt(CompileFlags cf, bool sign); + void Compile_slt(CompileFlags cf) override; + void Compile_sltu(CompileFlags cf) override; + + Xbyak::Reg32 ComputeLoadStoreAddressArg(CompileFlags cf, const std::optional& address, + const std::optional& reg = std::nullopt); + template + Xbyak::Reg32 GenerateLoad(const Xbyak::Reg32& addr_reg, MemoryAccessSize size, bool sign, + const RegAllocFn& dst_reg_alloc); + void GenerateStore(const Xbyak::Reg32& addr_reg, const Xbyak::Reg32& value_reg, MemoryAccessSize size); + void Compile_lxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_lwx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_sxx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_swx(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + void Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, + const std::optional& address) override; + + void TestInterrupts(const Xbyak::Reg32& sr); + void Compile_mtc0(CompileFlags cf) override; + void Compile_rfe(CompileFlags cf) override; + + void Compile_mfc2(CompileFlags cf) override; + void Compile_mtc2(CompileFlags cf) override; + void Compile_cop2(CompileFlags cf) override; + + void GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg = Reg::count, + Reg arg3reg = Reg::count) override; + +private: + void SwitchToFarCode(bool emit_jump, void (Xbyak::CodeGenerator::*jump_op)(const void*) = nullptr); + void SwitchToNearCode(bool emit_jump, void (Xbyak::CodeGenerator::*jump_op)(const void*) = nullptr); + + Xbyak::Address MipsPtr(Reg r) const; + Xbyak::Reg32 CFGetRegD(CompileFlags cf) const; + Xbyak::Reg32 CFGetRegS(CompileFlags cf) const; + Xbyak::Reg32 CFGetRegT(CompileFlags cf) const; + Xbyak::Reg32 CFGetRegLO(CompileFlags cf) const; + Xbyak::Reg32 CFGetRegHI(CompileFlags cf) const; + + Xbyak::Reg32 MoveSToD(CompileFlags cf); + Xbyak::Reg32 MoveSToT(CompileFlags cf); + Xbyak::Reg32 MoveTToD(CompileFlags cf); + void MoveSToReg(const Xbyak::Reg32& dst, CompileFlags cf); + void MoveTToReg(const Xbyak::Reg32& dst, CompileFlags cf); + void MoveMIPSRegToReg(const Xbyak::Reg32& dst, Reg reg); + + std::unique_ptr m_emitter; + 
std::unique_ptr m_far_emitter; + Xbyak::CodeGenerator* cg; +}; + +} // namespace CPU::NewRec diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp index b5114237a..69212680f 100644 --- a/src/core/cpu_recompiler_code_generator_x64.cpp +++ b/src/core/cpu_recompiler_code_generator_x64.cpp @@ -46,7 +46,8 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size) constexpr u32 stack_size = 8; #endif - DebugAssert(g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler); + DebugAssert(g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler || + g_settings.cpu_execution_mode == CPUExecutionMode::NewRec); CodeGenerator acg(code_size, static_cast(code)); CodeGenerator* cg = &acg; diff --git a/src/core/cpu_recompiler_types.h b/src/core/cpu_recompiler_types.h index 58273691c..1b89317a8 100644 --- a/src/core/cpu_recompiler_types.h +++ b/src/core/cpu_recompiler_types.h @@ -6,6 +6,8 @@ #pragma once #include "cpu_types.h" +#include + #if defined(CPU_ARCH_X64) // We need to include windows.h before xbyak does.. @@ -130,4 +132,39 @@ u8* armGetJumpTrampoline(const void* target); } // namespace CPU::Recompiler +#elif defined(CPU_ARCH_RISCV64) + +#include "biscuit/assembler.hpp" + +namespace CPU::Recompiler { + +// A reasonable "maximum" number of bytes per instruction. +constexpr u32 MAX_NEAR_HOST_BYTES_PER_INSTRUCTION = 64; +constexpr u32 MAX_FAR_HOST_BYTES_PER_INSTRUCTION = 128; + +#define RRET biscuit::a0 +#define RARG1 biscuit::a0 +#define RARG2 biscuit::a1 +#define RARG3 biscuit::a2 +#define RSCRATCH biscuit::t6 +#define RSTATE biscuit::s10 +#define RMEMBASE biscuit::s11 + +bool rvIsCallerSavedRegister(u32 id); +bool rvIsValidSExtITypeImm(u32 imm); +std::pair rvGetAddressImmediates(const void* cur, const void* target); +void rvMoveAddressToReg(biscuit::Assembler* armAsm, const biscuit::GPR& reg, const void* addr); +void rvEmitMov(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, u32 imm); +void rvEmitMov64(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& scratch, u64 imm); +u32 rvEmitJmp(biscuit::Assembler* armAsm, const void* ptr, const biscuit::GPR& link_reg = biscuit::zero); +u32 rvEmitCall(biscuit::Assembler* armAsm, const void* ptr); +void rvEmitSExtB(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs); // -> word +void rvEmitUExtB(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs); // -> word +void rvEmitSExtH(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs); // -> word +void rvEmitUExtH(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs); // -> word +void rvEmitDSExtW(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs); // -> doubleword +void rvEmitDUExtW(biscuit::Assembler* rvAsm, const biscuit::GPR& rd, const biscuit::GPR& rs); // -> doubleword + +} // namespace CPU::Recompiler + #endif diff --git a/src/core/imgui_overlays.cpp b/src/core/imgui_overlays.cpp index a52b9a5c5..050d100af 100644 --- a/src/core/imgui_overlays.cpp +++ b/src/core/imgui_overlays.cpp @@ -370,6 +370,11 @@ void ImGuiManager::DrawPerformanceOverlay() text.append_fmt("{}{}", first ? "" : "/", "CI"); first = false; } + else if (g_settings.cpu_execution_mode == CPUExecutionMode::NewRec) + { + text.append_fmt("{}{}", first ? 
"" : "/", "NR"); + first = false; + } else { if (g_settings.cpu_recompiler_icache) diff --git a/src/core/settings.cpp b/src/core/settings.cpp index 100824d08..99575e6f1 100644 --- a/src/core/settings.cpp +++ b/src/core/settings.cpp @@ -834,11 +834,13 @@ const char* Settings::GetDiscRegionDisplayName(DiscRegion region) return Host::TranslateToCString("DiscRegion", s_disc_region_display_names[static_cast(region)]); } -static constexpr const std::array s_cpu_execution_mode_names = {"Interpreter", "CachedInterpreter", "Recompiler"}; +static constexpr const std::array s_cpu_execution_mode_names = {"Interpreter", "CachedInterpreter", "Recompiler", + "NewRec"}; static constexpr const std::array s_cpu_execution_mode_display_names = { TRANSLATE_NOOP("CPUExecutionMode", "Interpreter (Slowest)"), TRANSLATE_NOOP("CPUExecutionMode", "Cached Interpreter (Faster)"), - TRANSLATE_NOOP("CPUExecutionMode", "Recompiler (Fastest)")}; + TRANSLATE_NOOP("CPUExecutionMode", "Recompiler (Fastest)"), + TRANSLATE_NOOP("CPUExecutionMode", "New Recompiler (Experimental)")}; std::optional Settings::ParseCPUExecutionMode(const char* str) { diff --git a/src/core/settings.h b/src/core/settings.h index 7fc54a3fb..b7d3c78f6 100644 --- a/src/core/settings.h +++ b/src/core/settings.h @@ -414,7 +414,7 @@ struct Settings static constexpr float DEFAULT_GPU_PGXP_DEPTH_THRESHOLD = 300.0f; static constexpr float GPU_PGXP_DEPTH_THRESHOLD_SCALE = 4096.0f; -#ifdef ENABLE_RECOMPILER +#if defined(ENABLE_RECOMPILER) static constexpr CPUExecutionMode DEFAULT_CPU_EXECUTION_MODE = CPUExecutionMode::Recompiler; // LUT still ends up faster on Apple Silicon for now, because of 16K pages. @@ -423,6 +423,9 @@ struct Settings #else static constexpr CPUFastmemMode DEFAULT_CPU_FASTMEM_MODE = CPUFastmemMode::LUT; #endif +#elif defined(ENABLE_NEWREC) + static constexpr CPUExecutionMode DEFAULT_CPU_EXECUTION_MODE = CPUExecutionMode::NewRec; + static constexpr CPUFastmemMode DEFAULT_CPU_FASTMEM_MODE = CPUFastmemMode::MMap; #else static constexpr CPUExecutionMode DEFAULT_CPU_EXECUTION_MODE = CPUExecutionMode::CachedInterpreter; static constexpr CPUFastmemMode DEFAULT_CPU_FASTMEM_MODE = CPUFastmemMode::Disabled; diff --git a/src/core/system.cpp b/src/core/system.cpp index 4fd0736cf..44a5e77a1 100644 --- a/src/core/system.cpp +++ b/src/core/system.cpp @@ -3532,7 +3532,7 @@ void System::CheckForSettingsChanges(const Settings& old_settings) CPU::ClearICache(); } - if (g_settings.cpu_execution_mode == CPUExecutionMode::Recompiler && + if (CPU::CodeCache::IsUsingAnyRecompiler() && (g_settings.cpu_recompiler_memory_exceptions != old_settings.cpu_recompiler_memory_exceptions || g_settings.cpu_recompiler_block_linking != old_settings.cpu_recompiler_block_linking || g_settings.cpu_recompiler_icache != old_settings.cpu_recompiler_icache || diff --git a/src/core/types.h b/src/core/types.h index 1732263b4..506784e11 100644 --- a/src/core/types.h +++ b/src/core/types.h @@ -46,6 +46,7 @@ enum class CPUExecutionMode : u8 Interpreter, CachedInterpreter, Recompiler, + NewRec, Count };