From b106392da0793035e2f35dacecf62a79eddc2663 Mon Sep 17 00:00:00 2001
From: Stenzek
Date: Sun, 21 Sep 2025 16:56:45 +1000
Subject: [PATCH] CPU/PGXP: Implement lwl/lwr/swl/swr

Fixes gaps in sky geometry in Kingsley's Adventure.

---
 src/core/cpu_core.cpp               |  14 +-
 src/core/cpu_pgxp.cpp               | 238 +++++++++++++++++++++++++++-
 src/core/cpu_pgxp.h                 |   2 +
 src/core/cpu_recompiler_arm32.cpp   |  81 +++++-----
 src/core/cpu_recompiler_arm32.h     |   2 +-
 src/core/cpu_recompiler_arm64.cpp   |  81 +++++-----
 src/core/cpu_recompiler_arm64.h     |   2 +-
 src/core/cpu_recompiler_riscv64.cpp |  79 ++++-----
 src/core/cpu_recompiler_riscv64.h   |   2 +-
 src/core/cpu_recompiler_x64.cpp     |  86 +++++-----
 src/core/cpu_recompiler_x64.h       |   2 +-
 11 files changed, 422 insertions(+), 167 deletions(-)

diff --git a/src/core/cpu_core.cpp b/src/core/cpu_core.cpp
index 929dfed16..6347e1b21 100644
--- a/src/core/cpu_core.cpp
+++ b/src/core/cpu_core.cpp
@@ -1508,6 +1508,9 @@ restart_instruction:
       // Bypasses load delay. No need to check the old value since this is the delay slot or it's not relevant.
       const u32 existing_value = (inst.i.rt == g_state.load_delay_reg) ? g_state.load_delay_value : ReadReg(inst.i.rt);
 
+      if constexpr (pgxp_mode >= PGXPMode::Memory)
+        PGXP::CPU_LWx(inst, addr, existing_value);
+
       const u8 shift = (Truncate8(addr) & u8(3)) * u8(8);
       u32 new_value;
       if (inst.op == InstructionOp::lwl)
@@ -1522,9 +1525,6 @@
       }
 
       WriteRegDelayed(inst.i.rt, new_value);
-
-      if constexpr (pgxp_mode >= PGXPMode::Memory)
-        PGXP::CPU_LW(inst, addr, new_value);
     }
     break;
 
@@ -1591,11 +1591,14 @@
       }
 
       const u32 reg_value = ReadReg(inst.i.rt);
-      const u8 shift = (Truncate8(addr) & u8(3)) * u8(8);
       u32 mem_value;
       if (!ReadMemoryWord(aligned_addr, &mem_value))
        return;
 
+      if constexpr (pgxp_mode >= PGXPMode::Memory)
+        PGXP::CPU_SWx(inst, addr, reg_value);
+
+      const u8 shift = (Truncate8(addr) & u8(3)) * u8(8);
       u32 new_value;
       if (inst.op == InstructionOp::swl)
       {
@@ -1609,9 +1612,6 @@
       }
 
       WriteMemoryWord(aligned_addr, new_value);
-
-      if constexpr (pgxp_mode >= PGXPMode::Memory)
-        PGXP::CPU_SW(inst, aligned_addr, new_value);
     }
     break;
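For reference, the merge math the interpreter hunks above implement, written out as a standalone sketch (illustrative only, not part of the patch; on the little-endian PSX, lwl fills the high bytes of rt from memory and lwr fills the low bytes):

  // addr selects the unaligned byte; reg is rt's current value, mem the aligned word.
  u32 MergeLWL(u32 reg, u32 mem, u32 addr)
  {
    const u32 shift = (addr & 3u) * 8u;
    return (reg & (UINT32_C(0x00FFFFFF) >> shift)) | (mem << (24 - shift));
  }
  u32 MergeLWR(u32 reg, u32 mem, u32 addr)
  {
    const u32 shift = (addr & 3u) * 8u;
    return (reg & (UINT32_C(0xFFFFFF00) << (24 - shift))) | (mem >> shift);
  }

Note the PGXP hooks are moved before the merge, so CPU_LWx/CPU_SWx can observe both the incoming rt value and the untouched memory word.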
diff --git a/src/core/cpu_pgxp.cpp b/src/core/cpu_pgxp.cpp
index 208855c74..ed85175d1 100644
--- a/src/core/cpu_pgxp.cpp
+++ b/src/core/cpu_pgxp.cpp
@@ -338,7 +338,8 @@ ALWAYS_INLINE_RELEASE void CPU::PGXP::WriteMem(u32 addr, const PGXPValue& value)
     return;
 
   *pMem = value;
-  pMem->flags |= VALID_LOWZ | VALID_HIGHZ;
+  pMem->flags =
+    (value.flags & ~(VALID_LOWZ | VALID_HIGHZ)) | ((value.flags & VALID_Z) ? (VALID_LOWZ | VALID_HIGHZ) : 0);
 }
 
 ALWAYS_INLINE_RELEASE void CPU::PGXP::WriteMem16(u32 addr, const PGXPValue& value)
@@ -699,6 +700,241 @@ void CPU::PGXP::CPU_SW(Instruction instr, u32 addr, u32 rtVal)
   WriteMem(addr, prtVal);
 }
 
+void CPU::PGXP::CPU_LWx(Instruction instr, u32 addr, u32 rtVal)
+{
+  const u32 aligned_addr = addr & ~3u;
+  PGXPValue* pmemVal = GetPtr(aligned_addr);
+  u32 memVal;
+  if (!pmemVal)
+    return;
+  if (!CPU::SafeReadMemoryWord(aligned_addr, &memVal)) [[unlikely]]
+    return;
+
+  pmemVal->Validate(memVal);
+  LOG_VALUES_LOAD(addr, memVal);
+
+  PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
+
+  const u32 byte_shift = addr & 3u;
+
+  if (instr.op == InstructionOp::lwl)
+  {
+    const u32 bit_shift = (byte_shift * 8);
+    const u32 mixed_value = (rtVal & (UINT32_C(0x00FFFFFF) >> bit_shift)) | (memVal << (24 - bit_shift));
+
+    switch (byte_shift)
+    {
+      case 0:
+      {
+        // only writing the upper half of Y, can't do much about that..
+        prtVal.y = static_cast<float>(static_cast<s16>(mixed_value >> 16));
+        prtVal.value = mixed_value;
+        prtVal.flags = (prtVal.flags & ~VALID_Y);
+      }
+      break;
+
+      case 1:
+      {
+        prtVal.y = pmemVal->x;
+        prtVal.z = (pmemVal->flags & VALID_LOWZ) ? pmemVal->z : prtVal.z;
+        prtVal.value = mixed_value;
+        prtVal.flags =
+          (prtVal.flags & ~VALID_Y) | ((pmemVal->flags & VALID_X) << 1) | ((pmemVal->flags & VALID_LOWZ) ? VALID_Z : 0);
+      }
+      break;
+
+      case 2:
+      {
+        // making a dog's breakfast of both X and Y
+        prtVal.x = static_cast<float>(static_cast<s16>(mixed_value));
+        prtVal.y = static_cast<float>(static_cast<s16>(mixed_value >> 16));
+        prtVal.value = mixed_value;
+        prtVal.flags &= ~(VALID_X | VALID_Y | VALID_Z);
+      }
+      break;
+
+      case 3:
+      {
+        // effectively the same as a normal load.
+        prtVal = *pmemVal;
+        prtVal.value = mixed_value;
+      }
+      break;
+
+        DefaultCaseIsUnreachable();
+    }
+  }
+  else
+  {
+    const u32 bit_shift = (byte_shift * 8);
+    const u32 mixed_value = (rtVal & (UINT32_C(0xFFFFFF00) << (24 - bit_shift))) | (memVal >> bit_shift);
+
+    switch (byte_shift)
+    {
+      case 0:
+      {
+        // effectively the same as a normal load.
+        prtVal = *pmemVal;
+        prtVal.value = mixed_value;
+      }
+      break;
+
+      case 1:
+      {
+        // making a dog's breakfast of both X and Y
+        prtVal.x = static_cast<float>(static_cast<s16>(mixed_value));
+        prtVal.y = static_cast<float>(static_cast<s16>(mixed_value >> 16));
+        prtVal.value = mixed_value;
+        prtVal.flags &= ~(VALID_X | VALID_Y | VALID_Z);
+      }
+      break;
+
+      case 2:
+      {
+        prtVal.x = pmemVal->y;
+        prtVal.z = (pmemVal->flags & VALID_HIGHZ) ? pmemVal->z : prtVal.z;
+        prtVal.value = mixed_value;
+        prtVal.flags = (prtVal.flags & ~VALID_X) | ((pmemVal->flags & VALID_Y) >> 1) |
+                       ((pmemVal->flags & VALID_HIGHZ) ? VALID_Z : 0);
+      }
+      break;
+
+      case 3:
+      {
+        // only writing the lower half of X, can't do much about that..
+        prtVal.x = static_cast<float>(static_cast<s16>(mixed_value));
+        prtVal.value = mixed_value;
+        prtVal.flags = (prtVal.flags & ~VALID_X);
+      }
+      break;
+
+        DefaultCaseIsUnreachable();
+    }
+  }
+}
+
+void CPU::PGXP::CPU_SWx(Instruction instr, u32 addr, u32 rtVal)
+{
+  LOG_VALUES_STORE(instr.r.rt.GetValue(), rtVal, addr);
+
+  const u32 aligned_addr = addr & ~3u;
+  PGXPValue* pmemVal = GetPtr(aligned_addr);
+  u32 memVal;
+  if (!pmemVal)
+    return;
+  if (!CPU::SafeReadMemoryWord(aligned_addr, &memVal)) [[unlikely]]
+    return;
+
+  pmemVal->Validate(memVal);
+
+  PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
+
+  const u32 byte_shift = addr & 3u;
+
+  if (instr.op == InstructionOp::swl)
+  {
+    const u32 bit_shift = (byte_shift * 8);
+    const u32 mixed_value = (memVal & (UINT32_C(0xFFFFFF00) << bit_shift)) | (rtVal >> (24 - bit_shift));
+
+    switch (byte_shift)
+    {
+      case 0:
+      {
+        // only writing the lower half of X, can't do much about that..
+        pmemVal->x = static_cast<float>(static_cast<s16>(mixed_value));
+        pmemVal->value = mixed_value;
+        pmemVal->flags =
+          (pmemVal->flags & ~(VALID_X | VALID_Z | VALID_LOWZ)) | ((pmemVal->flags & VALID_HIGHZ) ? VALID_Z : 0);
+      }
+      break;
+
+      case 1:
+      {
+        pmemVal->x = prtVal.y;
+        pmemVal->z = (prtVal.flags & VALID_Z) ? prtVal.z : pmemVal->z;
+        pmemVal->value = mixed_value;
+        pmemVal->flags = (pmemVal->flags & ~(VALID_X | VALID_Z | VALID_LOWZ)) | ((prtVal.flags & VALID_Y) >> 1) |
+                         ((prtVal.flags & VALID_Z) ? (VALID_Z | VALID_LOWZ) : 0) |
+                         ((pmemVal->flags & VALID_HIGHZ) ? VALID_Z : 0);
+      }
+      break;
+
+      case 2:
+      {
+        // making a dog's breakfast of both X and Y
+        pmemVal->x = static_cast<float>(static_cast<s16>(mixed_value));
+        pmemVal->y = static_cast<float>(static_cast<s16>(mixed_value >> 16));
+        pmemVal->value = mixed_value;
+        pmemVal->flags &= ~(VALID_X | VALID_Y | VALID_Z | VALID_LOWZ | VALID_HIGHZ);
+      }
+      break;
+
+      case 3:
+      {
+        // effectively the same as a normal store.
+        *pmemVal = prtVal;
+        pmemVal->value = mixed_value;
+        pmemVal->flags =
+          (prtVal.flags & ~(VALID_LOWZ | VALID_HIGHZ)) | ((prtVal.flags & VALID_Z) ? (VALID_LOWZ | VALID_HIGHZ) : 0);
+      }
+      break;
+
+        DefaultCaseIsUnreachable();
+    }
+  }
+  else
+  {
+    const u32 bit_shift = (byte_shift * 8);
+    const u32 mixed_value = (memVal & (UINT32_C(0x00FFFFFF) >> (24 - bit_shift))) | (rtVal << bit_shift);
+
+    switch (byte_shift)
+    {
+      case 0:
+      {
+        // effectively the same as a normal store.
+        *pmemVal = prtVal;
+        pmemVal->value = mixed_value;
+        pmemVal->flags =
+          (prtVal.flags & ~(VALID_LOWZ | VALID_HIGHZ)) | ((prtVal.flags & VALID_Z) ? (VALID_LOWZ | VALID_HIGHZ) : 0);
+      }
+      break;
+
+      case 1:
+      {
+        // making a dog's breakfast of both X and Y
+        pmemVal->x = static_cast<float>(static_cast<s16>(mixed_value));
+        pmemVal->y = static_cast<float>(static_cast<s16>(mixed_value >> 16));
+        pmemVal->value = mixed_value;
+        pmemVal->flags &= ~(VALID_X | VALID_Y | VALID_LOWZ | VALID_HIGHZ);
+      }
+      break;
+
+      case 2:
+      {
+        pmemVal->y = prtVal.x;
+        pmemVal->z = (prtVal.flags & VALID_Z) ? prtVal.z : pmemVal->z;
+        pmemVal->value = mixed_value;
+        pmemVal->flags = (pmemVal->flags & ~(VALID_Y | VALID_Z | VALID_HIGHZ)) | ((prtVal.flags & VALID_X) << 1) |
+                         ((prtVal.flags & VALID_Z) ? (VALID_Z | VALID_HIGHZ) : 0) |
+                         ((pmemVal->flags & VALID_LOWZ) ? VALID_Z : 0);
+      }
+      break;
+
+      case 3:
+      {
+        // only writing the upper half of Y, can't do much about that..
+        pmemVal->y = static_cast<float>(static_cast<s16>(mixed_value >> 16));
+        pmemVal->value = mixed_value;
+        pmemVal->flags =
+          (pmemVal->flags & ~(VALID_Y | VALID_Z | VALID_HIGHZ)) | ((pmemVal->flags & VALID_LOWZ) ? VALID_Z : 0);
+      }
+      break;
+
+        DefaultCaseIsUnreachable();
+    }
+  }
+}
+
 void CPU::PGXP::CPU_MOVE_Packed(u32 rd_and_rs, u32 rsVal)
 {
   const u32 Rs = (rd_and_rs & 0xFFu);
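A quick sanity check of the case split above (not part of the patch): for lwl with addr & 3 == 3, bit_shift is 24, so the register mask 0x00FFFFFF >> 24 collapses to 0 and mixed_value == memVal, i.e. the whole word comes from memory, which is why that case simply copies *pmemVal. Symmetrically, lwr with addr & 3 == 0 keeps nothing of rt:

  // lwl, byte_shift == 3: mixed = (rt & 0) | (mem << 0)           == mem -> full load
  // lwr, byte_shift == 0: mixed = (rt & (0xFFFFFF00 << 24)) | mem == mem -> full load

The remaining cases splice halfwords across the register and memory values, which is where the x/y component copies and the VALID_* flag shuffling above come from.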
diff --git a/src/core/cpu_pgxp.h b/src/core/cpu_pgxp.h
index 9852f6b78..15a39ffcc 100644
--- a/src/core/cpu_pgxp.h
+++ b/src/core/cpu_pgxp.h
@@ -31,9 +31,11 @@ void CPU_LW(Instruction instr, u32 addr, u32 rtVal);
 void CPU_LH(Instruction instr, u32 addr, u32 rtVal);
 void CPU_LHU(Instruction instr, u32 addr, u32 rtVal);
 void CPU_LBx(Instruction instr, u32 addr, u32 rtVal);
+void CPU_LWx(Instruction instr, u32 addr, u32 rtVal);
 void CPU_SB(Instruction instr, u32 addr, u32 rtVal);
 void CPU_SH(Instruction instr, u32 addr, u32 rtVal);
 void CPU_SW(Instruction instr, u32 addr, u32 rtVal);
+void CPU_SWx(Instruction instr, u32 addr, u32 rtVal);
 void CPU_MOVE(u32 Rd, u32 Rs, u32 rsVal);
 void CPU_MOVE_Packed(u32 rd_and_rs, u32 rsVal);
 void CPU_ADDI(Instruction instr, u32 rsVal);
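Both new entry points deliberately run before the actual memory access: CPU_SWx reads the pre-store word itself (via SafeReadMemoryWord) so it can merge tracked components against the old contents, and CPU_LWx needs rt's pre-load value. The ordering contract, sketched as comments (illustrative, mirroring the interpreter hunks above):

  // 1. mem = ReadMemoryWord(addr & ~3)        // old word, still intact
  // 2. PGXP::CPU_SWx(inst, addr, rtVal)       // merge PGXP state using the old word
  // 3. WriteMemoryWord(addr & ~3, new_value)  // the actual merged store
  // (loads are analogous: CPU_LWx runs before WriteRegDelayed)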
diff --git a/src/core/cpu_recompiler_arm32.cpp b/src/core/cpu_recompiler_arm32.cpp
index 05f05b8d2..5c64f02c4 100644
--- a/src/core/cpu_recompiler_arm32.cpp
+++ b/src/core/cpu_recompiler_arm32.cpp
@@ -920,15 +920,28 @@ void CPU::ARM32Recompiler::MoveTToReg(const vixl::aarch32::Register& dst, Compil
   }
 }
 
-void CPU::ARM32Recompiler::MoveMIPSRegToReg(const vixl::aarch32::Register& dst, Reg reg)
+void CPU::ARM32Recompiler::MoveMIPSRegToReg(const vixl::aarch32::Register& dst, Reg reg, bool ignore_load_delays)
 {
   DebugAssert(reg < Reg::count);
-  if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
+  if (ignore_load_delays && m_load_delay_register == reg)
+  {
+    if (m_load_delay_value_register == NUM_HOST_REGS)
+      armAsm->ldr(dst, PTR(&g_state.load_delay_value));
+    else
+      armAsm->mov(dst, Register(m_load_delay_value_register));
+  }
+  else if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
+  {
     armAsm->mov(dst, Register(hreg.value()));
+  }
   else if (HasConstantReg(reg))
+  {
     EmitMov(dst, GetConstantRegU32(reg));
+  }
   else
+  {
     armAsm->ldr(dst, MipsPtr(reg));
+  }
 }
 
 void CPU::ARM32Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
@@ -1909,6 +1922,17 @@ void CPU::ARM32Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, b
   // We'd need to be careful here if we weren't overwriting it..
   ComputeLoadStoreAddressArg(cf, address, addr);
 
+  // Do PGXP first, it does its own load.
+  if (g_settings.gpu_pgxp_enable && inst->r.rt != Reg::zero)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    EmitMov(RARG1, inst->bits);
+    armAsm->mov(RARG2, addr);
+    MoveMIPSRegToReg(RARG3, inst->r.rt, true);
+    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWx));
+  }
+
   armAsm->bic(RARG1, addr, 3);
   GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });
@@ -1976,15 +2000,6 @@ void CPU::ARM32Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, b
   }
 
   FreeHostReg(addr.GetCode());
-
-  if (g_settings.gpu_pgxp_enable)
-  {
-    Flush(FLUSH_FOR_C_CALL);
-    armAsm->mov(RARG3, value);
-    armAsm->bic(RARG2, addr, 3);
-    EmitMov(RARG1, inst->bits);
-    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LW));
-  }
 }
 
 void CPU::ARM32Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
@@ -2109,15 +2124,22 @@ void CPU::ARM32Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, b
   // TODO: this can take over rt's value if it's no longer needed
   // NOTE: can't trust T in cf because of the alloc
   const Register addr = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
-  const Register value = g_settings.gpu_pgxp_enable ? Register(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;
-  if (g_settings.gpu_pgxp_enable)
-    MoveMIPSRegToReg(value, inst->r.rt);
 
   FlushForLoadStore(address, true, use_fastmem);
 
   // TODO: if address is constant, this can be simplified..
   // We'd need to be careful here if we weren't overwriting it..
   ComputeLoadStoreAddressArg(cf, address, addr);
 
+  if (g_settings.gpu_pgxp_enable)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    EmitMov(RARG1, inst->bits);
+    armAsm->mov(RARG2, addr);
+    MoveMIPSRegToReg(RARG3, inst->r.rt);
+    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWx));
+  }
+
   armAsm->bic(RARG1, addr, 3);
   GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });
@@ -2125,9 +2147,7 @@ void CPU::ARM32Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, b
   armAsm->lsl(RSCRATCH, RSCRATCH, 3); // *8
   armAsm->bic(addr, addr, 3);
 
-  // Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush.
-  if (!g_settings.gpu_pgxp_enable)
-    MoveMIPSRegToReg(value, inst->r.rt);
+  MoveMIPSRegToReg(RARG2, inst->r.rt);
 
   if (inst->op == InstructionOp::swl)
   {
@@ -2139,40 +2159,25 @@ void CPU::ARM32Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, b
     EmitMov(RARG3, 24);
     armAsm->sub(RARG3, RARG3, RSCRATCH);
-    armAsm->lsr(value, value, RARG3);
-    armAsm->orr(value, value, RRET);
+    armAsm->lsr(RARG2, RARG2, RARG3);
+    armAsm->orr(RARG2, RARG2, RRET);
   }
   else
   {
     // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
     // new_value = (RWRET & mem_mask) | (value << shift);
-    armAsm->lsl(value, value, RSCRATCH);
+    armAsm->lsl(RARG2, RARG2, RSCRATCH);
     EmitMov(RARG3, 24);
     armAsm->sub(RARG3, RARG3, RSCRATCH);
     EmitMov(RSCRATCH, 0x00FFFFFFu);
     armAsm->lsr(RSCRATCH, RSCRATCH, RARG3);
     armAsm->and_(RRET, RRET, RSCRATCH);
-    armAsm->orr(value, value, RRET);
-  }
-
-  if (!g_settings.gpu_pgxp_enable)
-  {
-    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
-    FreeHostReg(addr.GetCode());
+    armAsm->orr(RARG2, RARG2, RRET);
   }
-  else
-  {
-    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
-    Flush(FLUSH_FOR_C_CALL);
-    armAsm->mov(RARG3, value);
-    FreeHostReg(value.GetCode());
-    armAsm->mov(RARG2, addr);
-    FreeHostReg(addr.GetCode());
-    EmitMov(RARG1, inst->bits);
-    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SW));
-  }
+
+  GenerateStore(addr, RARG2, MemoryAccessSize::Word, use_fastmem);
+  FreeHostReg(addr.GetCode());
 }
 
 void CPU::ARM32Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
diff --git a/src/core/cpu_recompiler_arm32.h b/src/core/cpu_recompiler_arm32.h
index 18e0ea675..cb58b69b6 100644
--- a/src/core/cpu_recompiler_arm32.h
+++ b/src/core/cpu_recompiler_arm32.h
@@ -153,7 +153,7 @@ private:
   void MoveSToReg(const vixl::aarch32::Register& dst, CompileFlags cf);
   void MoveTToReg(const vixl::aarch32::Register& dst, CompileFlags cf);
-  void MoveMIPSRegToReg(const vixl::aarch32::Register& dst, Reg reg);
+  void MoveMIPSRegToReg(const vixl::aarch32::Register& dst, Reg reg, bool ignore_load_delays = false);
 
   vixl::aarch32::Assembler m_emitter;
   vixl::aarch32::Assembler m_far_emitter;
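The new ignore_load_delays flag exists because lwl/lwr read rt while a delayed load to the same register may still be in flight (see the interpreter comment above: "Bypasses load delay"). The canonical unaligned-load idiom relies on exactly this behaviour:

  // lwr $t0, 0($a0)
  // lwl $t0, 3($a0)   ; reads $t0 while the lwr result is still in its delay slot

So when the recompiler hands rt's current value to CPU_LWx, it has to forward g_state.load_delay_value (or the host register holding it) rather than the stale guest register; plain stores (CPU_SWx) use the default false.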
diff --git a/src/core/cpu_recompiler_arm64.cpp b/src/core/cpu_recompiler_arm64.cpp
index b58e8f9e0..be85b08b9 100644
--- a/src/core/cpu_recompiler_arm64.cpp
+++ b/src/core/cpu_recompiler_arm64.cpp
@@ -1083,15 +1083,28 @@ void CPU::ARM64Recompiler::MoveTToReg(const vixl::aarch64::Register& dst, Compil
   }
 }
 
-void CPU::ARM64Recompiler::MoveMIPSRegToReg(const vixl::aarch64::Register& dst, Reg reg)
+void CPU::ARM64Recompiler::MoveMIPSRegToReg(const vixl::aarch64::Register& dst, Reg reg, bool ignore_load_delays)
 {
   DebugAssert(reg < Reg::count && dst.IsW());
-  if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
+  if (ignore_load_delays && m_load_delay_register == reg)
+  {
+    if (m_load_delay_value_register == NUM_HOST_REGS)
+      armAsm->ldr(dst, PTR(&g_state.load_delay_value));
+    else
+      armAsm->mov(dst, WRegister(m_load_delay_value_register));
+  }
+  else if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
+  {
     armAsm->mov(dst, WRegister(hreg.value()));
+  }
   else if (HasConstantReg(reg))
+  {
     EmitMov(dst, GetConstantRegU32(reg));
+  }
   else
+  {
     armAsm->ldr(dst, MipsPtr(reg));
+  }
 }
 
 void CPU::ARM64Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
@@ -2085,6 +2098,17 @@ void CPU::ARM64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, b
   // We'd need to be careful here if we weren't overwriting it..
   ComputeLoadStoreAddressArg(cf, address, addr);
 
+  // Do PGXP first, it does its own load.
+  if (g_settings.gpu_pgxp_enable && inst->r.rt != Reg::zero)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    EmitMov(RWARG1, inst->bits);
+    armAsm->mov(RWARG2, addr);
+    MoveMIPSRegToReg(RWARG3, inst->r.rt, true);
+    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWx));
+  }
+
   armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
   GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });
@@ -2152,15 +2176,6 @@ void CPU::ARM64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, b
   }
 
   FreeHostReg(addr.GetCode());
-
-  if (g_settings.gpu_pgxp_enable)
-  {
-    Flush(FLUSH_FOR_C_CALL);
-    armAsm->mov(RWARG3, value);
-    armAsm->and_(RWARG2, addr, armCheckLogicalConstant(~0x3u));
-    EmitMov(RWARG1, inst->bits);
-    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LW));
-  }
 }
 
 void CPU::ARM64Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
@@ -2285,15 +2300,22 @@ void CPU::ARM64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, b
   // TODO: this can take over rt's value if it's no longer needed
   // NOTE: can't trust T in cf because of the alloc
   const Register addr = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
-  const Register value = g_settings.gpu_pgxp_enable ? WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2;
-  if (g_settings.gpu_pgxp_enable)
-    MoveMIPSRegToReg(value, inst->r.rt);
 
   FlushForLoadStore(address, true, use_fastmem);
 
   // TODO: if address is constant, this can be simplified..
   // We'd need to be careful here if we weren't overwriting it..
   ComputeLoadStoreAddressArg(cf, address, addr);
 
+  if (g_settings.gpu_pgxp_enable)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    EmitMov(RWARG1, inst->bits);
+    armAsm->mov(RWARG2, addr);
+    MoveMIPSRegToReg(RWARG3, inst->r.rt);
+    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWx));
+  }
+
   armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
   GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });
@@ -2301,9 +2323,7 @@ void CPU::ARM64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, b
   armAsm->lsl(RWSCRATCH, RWSCRATCH, 3); // *8
   armAsm->and_(addr, addr, armCheckLogicalConstant(~0x3u));
 
-  // Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush.
-  if (!g_settings.gpu_pgxp_enable)
-    MoveMIPSRegToReg(value, inst->r.rt);
+  MoveMIPSRegToReg(RWARG2, inst->r.rt);
 
   if (inst->op == InstructionOp::swl)
   {
@@ -2315,40 +2335,25 @@ void CPU::ARM64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, b
     EmitMov(RWARG3, 24);
     armAsm->sub(RWARG3, RWARG3, RWSCRATCH);
-    armAsm->lsrv(value, value, RWARG3);
-    armAsm->orr(value, value, RWRET);
+    armAsm->lsrv(RWARG2, RWARG2, RWARG3);
+    armAsm->orr(RWARG2, RWARG2, RWRET);
   }
   else
   {
     // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
     // new_value = (RWRET & mem_mask) | (value << shift);
-    armAsm->lslv(value, value, RWSCRATCH);
+    armAsm->lslv(RWARG2, RWARG2, RWSCRATCH);
     EmitMov(RWARG3, 24);
     armAsm->sub(RWARG3, RWARG3, RWSCRATCH);
     EmitMov(RWSCRATCH, 0x00FFFFFFu);
     armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG3);
     armAsm->and_(RWRET, RWRET, RWSCRATCH);
-    armAsm->orr(value, value, RWRET);
-  }
-
-  if (!g_settings.gpu_pgxp_enable)
-  {
-    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
-    FreeHostReg(addr.GetCode());
+    armAsm->orr(RWARG2, RWARG2, RWRET);
   }
-  else
-  {
-    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
-    Flush(FLUSH_FOR_C_CALL);
-    armAsm->mov(RWARG3, value);
-    FreeHostReg(value.GetCode());
-    armAsm->mov(RWARG2, addr);
-    FreeHostReg(addr.GetCode());
-    EmitMov(RWARG1, inst->bits);
-    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SW));
-  }
+
+  GenerateStore(addr, RWARG2, MemoryAccessSize::Word, use_fastmem);
+  FreeHostReg(addr.GetCode());
 }
 
 void CPU::ARM64Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
diff --git a/src/core/cpu_recompiler_arm64.h b/src/core/cpu_recompiler_arm64.h
index 55011f145..41d582d79 100644
--- a/src/core/cpu_recompiler_arm64.h
+++ b/src/core/cpu_recompiler_arm64.h
@@ -154,7 +154,7 @@ private:
   void MoveSToReg(const vixl::aarch64::Register& dst, CompileFlags cf);
   void MoveTToReg(const vixl::aarch64::Register& dst, CompileFlags cf);
-  void MoveMIPSRegToReg(const vixl::aarch64::Register& dst, Reg reg);
+  void MoveMIPSRegToReg(const vixl::aarch64::Register& dst, Reg reg, bool ignore_load_delays = false);
 
   vixl::aarch64::Assembler m_emitter;
   vixl::aarch64::Assembler m_far_emitter;
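One arm64-specific detail worth noting: the AND of the address goes through armCheckLogicalConstant because AArch64 logical instructions only accept immediates encodable as repeating bitmask patterns (my reading of the helper's role, going by its name and use here). ~0x3u == 0xFFFFFFFC is such a pattern, so the mask should encode as a single instruction:

  // and w1, w1, #0xfffffffc   ; RWARG1 = addr & ~3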
diff --git a/src/core/cpu_recompiler_riscv64.cpp b/src/core/cpu_recompiler_riscv64.cpp
index b5424051a..021e158e1 100644
--- a/src/core/cpu_recompiler_riscv64.cpp
+++ b/src/core/cpu_recompiler_riscv64.cpp
@@ -911,15 +911,28 @@ void CPU::RISCV64Recompiler::MoveTToReg(const biscuit::GPR& dst, CompileFlags cf
   }
 }
 
-void CPU::RISCV64Recompiler::MoveMIPSRegToReg(const biscuit::GPR& dst, Reg reg)
+void CPU::RISCV64Recompiler::MoveMIPSRegToReg(const biscuit::GPR& dst, Reg reg, bool ignore_load_delays)
 {
   DebugAssert(reg < Reg::count);
-  if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
+  if (ignore_load_delays && m_load_delay_register == reg)
+  {
+    if (m_load_delay_value_register == NUM_HOST_REGS)
+      rvAsm->LW(dst, PTR(&g_state.load_delay_value));
+    else
+      rvAsm->MV(dst, GPR(m_load_delay_value_register));
+  }
+  else if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
+  {
     rvAsm->MV(dst, GPR(hreg.value()));
+  }
   else if (HasConstantReg(reg))
+  {
     EmitMov(dst, GetConstantRegU32(reg));
+  }
   else
+  {
     rvAsm->LW(dst, PTR(&g_state.regs.r[static_cast<size_t>(reg)]));
+  }
 }
 
 void CPU::RISCV64Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
@@ -1942,6 +1955,17 @@ void CPU::RISCV64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size,
   // We'd need to be careful here if we weren't overwriting it..
   ComputeLoadStoreAddressArg(cf, address, addr);
 
+  // Do PGXP first, it does its own load.
+  if (g_settings.gpu_pgxp_enable && inst->r.rt != Reg::zero)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    EmitMov(RARG1, inst->bits);
+    rvAsm->MV(RARG2, addr);
+    MoveMIPSRegToReg(RARG3, inst->r.rt, true);
+    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWx));
+  }
+
   rvAsm->ANDI(RARG1, addr, ~0x3u);
   GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });
@@ -2009,15 +2033,6 @@ void CPU::RISCV64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size,
   }
 
   FreeHostReg(addr.Index());
-
-  if (g_settings.gpu_pgxp_enable)
-  {
-    Flush(FLUSH_FOR_C_CALL);
-    rvAsm->MV(RARG3, value);
-    rvAsm->ANDI(RARG2, addr, ~0x3u);
-    EmitMov(RARG1, inst->bits);
-    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LW));
-  }
 }
 
 void CPU::RISCV64Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
@@ -2140,15 +2155,22 @@ void CPU::RISCV64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size,
   // TODO: this can take over rt's value if it's no longer needed
   // NOTE: can't trust T in cf because of the alloc
   const GPR addr = GPR(AllocateTempHostReg(HR_CALLEE_SAVED));
-  const GPR value = g_settings.gpu_pgxp_enable ? GPR(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;
-  if (g_settings.gpu_pgxp_enable)
-    MoveMIPSRegToReg(value, inst->r.rt);
 
   FlushForLoadStore(address, true, use_fastmem);
 
   // TODO: if address is constant, this can be simplified..
   // We'd need to be careful here if we weren't overwriting it..
   ComputeLoadStoreAddressArg(cf, address, addr);
 
+  if (g_settings.gpu_pgxp_enable)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    EmitMov(RARG1, inst->bits);
+    rvAsm->MV(RARG2, addr);
+    MoveMIPSRegToReg(RARG3, inst->r.rt);
+    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWx));
+  }
+
   rvAsm->ANDI(RARG1, addr, ~0x3u);
   GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });
@@ -2158,7 +2180,7 @@ void CPU::RISCV64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size,
 
   // Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush.
   if (!g_settings.gpu_pgxp_enable)
-    MoveMIPSRegToReg(value, inst->r.rt);
+    MoveMIPSRegToReg(RARG2, inst->r.rt);
 
   if (inst->op == InstructionOp::swl)
   {
@@ -2170,40 +2192,25 @@ void CPU::RISCV64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size,
     EmitMov(RARG3, 24);
     rvAsm->SUBW(RARG3, RARG3, RSCRATCH);
-    rvAsm->SRLW(value, value, RARG3);
-    rvAsm->OR(value, value, RRET);
+    rvAsm->SRLW(RARG2, RARG2, RARG3);
+    rvAsm->OR(RARG2, RARG2, RRET);
   }
   else
   {
     // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
     // new_value = (RWRET & mem_mask) | (value << shift);
-    rvAsm->SLLW(value, value, RSCRATCH);
+    rvAsm->SLLW(RARG2, RARG2, RSCRATCH);
     EmitMov(RARG3, 24);
     rvAsm->SUBW(RARG3, RARG3, RSCRATCH);
     EmitMov(RSCRATCH, 0x00FFFFFFu);
     rvAsm->SRLW(RSCRATCH, RSCRATCH, RARG3);
     rvAsm->AND(RRET, RRET, RSCRATCH);
-    rvAsm->OR(value, value, RRET);
-  }
-
-  if (!g_settings.gpu_pgxp_enable)
-  {
-    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
-    FreeHostReg(addr.Index());
+    rvAsm->OR(RARG2, RARG2, RRET);
   }
-  else
-  {
-    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
-    Flush(FLUSH_FOR_C_CALL);
-    rvAsm->MV(RARG3, value);
-    FreeHostReg(value.Index());
-    rvAsm->MV(RARG2, addr);
-    FreeHostReg(addr.Index());
-    EmitMov(RARG1, inst->bits);
-    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SW));
-  }
+
+  GenerateStore(addr, RARG2, MemoryAccessSize::Word, use_fastmem);
+  FreeHostReg(addr.Index());
 }
 
 void CPU::RISCV64Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
diff --git a/src/core/cpu_recompiler_riscv64.h b/src/core/cpu_recompiler_riscv64.h
index 69b7a2236..acd76d862 100644
--- a/src/core/cpu_recompiler_riscv64.h
+++ b/src/core/cpu_recompiler_riscv64.h
@@ -166,7 +166,7 @@ private:
   void MoveSToReg(const biscuit::GPR& dst, CompileFlags cf);
   void MoveTToReg(const biscuit::GPR& dst, CompileFlags cf);
-  void MoveMIPSRegToReg(const biscuit::GPR& dst, Reg reg);
+  void MoveMIPSRegToReg(const biscuit::GPR& dst, Reg reg, bool ignore_load_delays = false);
 
   std::unique_ptr<biscuit::Assembler> m_emitter;
   std::unique_ptr<biscuit::Assembler> m_far_emitter;
diff --git a/src/core/cpu_recompiler_x64.cpp b/src/core/cpu_recompiler_x64.cpp
index 48a7c79d6..6cbf1fcde 100644
--- a/src/core/cpu_recompiler_x64.cpp
+++ b/src/core/cpu_recompiler_x64.cpp
@@ -858,15 +858,28 @@ void CPU::X64Recompiler::MoveTToReg(const Xbyak::Reg32& dst, CompileFlags cf)
   }
 }
 
-void CPU::X64Recompiler::MoveMIPSRegToReg(const Xbyak::Reg32& dst, Reg reg)
+void CPU::X64Recompiler::MoveMIPSRegToReg(const Xbyak::Reg32& dst, Reg reg, bool ignore_load_delays)
 {
   DebugAssert(reg < Reg::count);
-  if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
+  if (ignore_load_delays && m_load_delay_register == reg)
+  {
+    if (m_load_delay_value_register == NUM_HOST_REGS)
+      cg->mov(dst, cg->dword[PTR(&g_state.load_delay_value)]);
+    else
+      cg->mov(dst, Reg32(m_load_delay_value_register));
+  }
+  else if (const std::optional<u32> hreg = CheckHostReg(0, HR_TYPE_CPU_REG, reg))
+  {
     cg->mov(dst, Reg32(hreg.value()));
+  }
   else if (HasConstantReg(reg))
+  {
     cg->mov(dst, GetConstantRegU32(reg));
+  }
   else
+  {
     cg->mov(dst, MipsPtr(reg));
+  }
 }
 
 void CPU::X64Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
@@ -1891,6 +1904,17 @@ void CPU::X64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, boo
   // We'd need to be careful here if we weren't overwriting it..
   ComputeLoadStoreAddressArg(cf, address, addr);
 
+  // Do PGXP first, it does its own load.
+  if (g_settings.gpu_pgxp_enable && inst->r.rt != Reg::zero)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    cg->mov(RWARG1, inst->bits);
+    cg->mov(RWARG2, addr);
+    MoveMIPSRegToReg(RWARG3, inst->r.rt, true);
+    cg->call(reinterpret_cast<const void*>(&PGXP::CPU_LWx));
+  }
+
   cg->mov(RWARG1, addr);
   cg->and_(RWARG1, ~0x3u);
   GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });
@@ -1965,18 +1989,6 @@ void CPU::X64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, boo
   }
 
   FreeHostReg(addr.getIdx());
-
-  if (g_settings.gpu_pgxp_enable)
-  {
-    Flush(FLUSH_FOR_C_CALL);
-
-    DebugAssert(value != RWARG3);
-    cg->mov(RWARG3, value);
-    cg->mov(RWARG2, addr);
-    cg->and_(RWARG2, ~0x3u);
-    cg->mov(RWARG1, inst->bits);
-    cg->call(reinterpret_cast<const void*>(&PGXP::CPU_LW));
-  }
 }
 
 void CPU::X64Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
@@ -2098,28 +2110,31 @@ void CPU::X64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, boo
   // TODO: this can take over rt's value if it's no longer needed
   // NOTE: can't trust T in cf because of the alloc
   const Reg32 addr = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED));
-  const Reg32 value = g_settings.gpu_pgxp_enable ? Reg32(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2;
-  if (g_settings.gpu_pgxp_enable)
-    MoveMIPSRegToReg(value, inst->r.rt);
-
   FlushForLoadStore(address, true, use_fastmem);
 
   // TODO: if address is constant, this can be simplified..
   // We'd need to be careful here if we weren't overwriting it..
   ComputeLoadStoreAddressArg(cf, address, addr);
 
+  if (g_settings.gpu_pgxp_enable)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    cg->mov(RWARG1, inst->bits);
+    cg->mov(RWARG2, addr);
+    MoveMIPSRegToReg(RWARG3, inst->r.rt);
+    cg->call(reinterpret_cast<const void*>(&PGXP::CPU_SWx));
+  }
+
   cg->mov(RWARG1, addr);
   cg->and_(RWARG1, ~0x3u);
   GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });
 
-  DebugAssert(value != cg->ecx);
   cg->mov(cg->ecx, addr);
   cg->and_(cg->ecx, 3);
   cg->shl(cg->ecx, 3); // *8
   cg->and_(addr, ~0x3u);
 
-  // Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush.
-  if (!g_settings.gpu_pgxp_enable)
-    MoveMIPSRegToReg(value, inst->r.rt);
+  MoveMIPSRegToReg(RWARG2, inst->r.rt);
 
   if (inst->op == InstructionOp::swl)
   {
@@ -2132,14 +2147,14 @@ void CPU::X64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, boo
     cg->mov(RWARG3, 24);
     cg->sub(RWARG3, cg->ecx);
     cg->mov(cg->ecx, RWARG3);
-    cg->shr(value, cg->cl);
-    cg->or_(value, RWRET);
+    cg->shr(RWARG2, cg->cl);
+    cg->or_(RWARG2, RWRET);
   }
   else
   {
     // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
     // new_value = (RWRET & mem_mask) | (value << shift);
-    cg->shl(value, cg->cl);
+    cg->shl(RWARG2, cg->cl);
 
     DebugAssert(RWARG3 != cg->ecx);
     cg->mov(RWARG3, 24);
@@ -2148,26 +2163,11 @@ void CPU::X64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, boo
     cg->mov(RWARG3, 0x00FFFFFFu);
     cg->shr(RWARG3, cg->cl);
     cg->and_(RWRET, RWARG3);
-    cg->or_(value, RWRET);
-  }
-
-  if (!g_settings.gpu_pgxp_enable)
-  {
-    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
-    FreeHostReg(addr.getIdx());
+    cg->or_(RWARG2, RWRET);
   }
-  else
-  {
-    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
-    Flush(FLUSH_FOR_C_CALL);
-    cg->mov(RWARG3, value);
-    FreeHostReg(value.getIdx());
-    cg->mov(RWARG2, addr);
-    FreeHostReg(addr.getIdx());
-    cg->mov(RWARG1, inst->bits);
-    cg->call(reinterpret_cast<const void*>(&PGXP::CPU_SW));
-  }
+
+  GenerateStore(addr, RWARG2, MemoryAccessSize::Word, use_fastmem);
+  FreeHostReg(addr.getIdx());
 }
 
 void CPU::X64Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
diff --git a/src/core/cpu_recompiler_x64.h b/src/core/cpu_recompiler_x64.h
index c2f1285ed..76192f26e 100644
--- a/src/core/cpu_recompiler_x64.h
+++ b/src/core/cpu_recompiler_x64.h
@@ -142,7 +142,7 @@ private:
   Xbyak::Reg32 MoveTToD(CompileFlags cf);
   void MoveSToReg(const Xbyak::Reg32& dst, CompileFlags cf);
   void MoveTToReg(const Xbyak::Reg32& dst, CompileFlags cf);
-  void MoveMIPSRegToReg(const Xbyak::Reg32& dst, Reg reg);
+  void MoveMIPSRegToReg(const Xbyak::Reg32& dst, Reg reg, bool ignore_load_delays = false);
 
   std::unique_ptr<Xbyak::CodeGenerator> m_emitter;
   std::unique_ptr<Xbyak::CodeGenerator> m_far_emitter;
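Net effect, for the record: games issue an unaligned 32-bit store as an swl/swr pair, e.g.

  // swl $t0, 3($a0)   ; high bytes of $t0 -> upper bytes of the span
  // swr $t0, 0($a0)   ; low bytes of $t0  -> lower bytes of the span

Previously these funnelled through CPU_SW on the aligned word with the already-merged value, which destroyed per-halfword provenance of the tracked vertex components. With CPU_LWx/CPU_SWx, the x/y components (and the split VALID_LOWZ/VALID_HIGHZ Z validity) survive the byte merges, which is presumably what restores the sky geometry in Kingsley's Adventure.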