From b106392da0793035e2f35dacecf62a79eddc2663 Mon Sep 17 00:00:00 2001
From: Stenzek
Date: Sun, 21 Sep 2025 16:56:45 +1000
Subject: [PATCH] CPU/PGXP: Implement lwl/lwr/swl/swr

Fixes gaps in sky geometry in Kingsley's Adventure.

---
 src/core/cpu_core.cpp               |  14 +-
 src/core/cpu_pgxp.cpp               | 238 +++++++++++++++++++++++++++-
 src/core/cpu_pgxp.h                 |   2 +
 src/core/cpu_recompiler_arm32.cpp   |  81 +++++-----
 src/core/cpu_recompiler_arm32.h     |   2 +-
 src/core/cpu_recompiler_arm64.cpp   |  81 +++++-----
 src/core/cpu_recompiler_arm64.h     |   2 +-
 src/core/cpu_recompiler_riscv64.cpp |  79 ++++-----
 src/core/cpu_recompiler_riscv64.h   |   2 +-
 src/core/cpu_recompiler_x64.cpp     |  86 +++++-----
 src/core/cpu_recompiler_x64.h       |   2 +-
 11 files changed, 422 insertions(+), 167 deletions(-)

diff --git a/src/core/cpu_core.cpp b/src/core/cpu_core.cpp
index 929dfed16..6347e1b21 100644
--- a/src/core/cpu_core.cpp
+++ b/src/core/cpu_core.cpp
@@ -1508,6 +1508,9 @@ restart_instruction:
       // Bypasses load delay. No need to check the old value since this is the delay slot or it's not relevant.
       const u32 existing_value = (inst.i.rt == g_state.load_delay_reg) ? g_state.load_delay_value : ReadReg(inst.i.rt);
 
+      if constexpr (pgxp_mode >= PGXPMode::Memory)
+        PGXP::CPU_LWx(inst, addr, existing_value);
+
       const u8 shift = (Truncate8(addr) & u8(3)) * u8(8);
       u32 new_value;
       if (inst.op == InstructionOp::lwl)
@@ -1522,9 +1525,6 @@
       }
 
       WriteRegDelayed(inst.i.rt, new_value);
-
-      if constexpr (pgxp_mode >= PGXPMode::Memory)
-        PGXP::CPU_LW(inst, addr, new_value);
     }
     break;
 
@@ -1591,11 +1591,14 @@
       }
 
       const u32 reg_value = ReadReg(inst.i.rt);
-      const u8 shift = (Truncate8(addr) & u8(3)) * u8(8);
       u32 mem_value;
       if (!ReadMemoryWord(aligned_addr, &mem_value))
        return;
 
+      if constexpr (pgxp_mode >= PGXPMode::Memory)
+        PGXP::CPU_SWx(inst, addr, reg_value);
+
+      const u8 shift = (Truncate8(addr) & u8(3)) * u8(8);
       u32 new_value;
       if (inst.op == InstructionOp::swl)
       {
@@ -1609,9 +1612,6 @@
       }
 
       WriteMemoryWord(aligned_addr, new_value);
-
-      if constexpr (pgxp_mode >= PGXPMode::Memory)
-        PGXP::CPU_SW(inst, aligned_addr, new_value);
     }
     break;
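For reference, the merge math the interpreter hunks above implement, written out as a standalone sketch (illustrative only, not part of the patch; on the little-endian PSX, lwl fills the high bytes of rt from memory and lwr fills the low bytes):

  // addr selects the unaligned byte; reg is rt's current value, mem the aligned word.
  u32 MergeLWL(u32 reg, u32 mem, u32 addr)
  {
    const u32 shift = (addr & 3u) * 8u;
    return (reg & (UINT32_C(0x00FFFFFF) >> shift)) | (mem << (24 - shift));
  }
  u32 MergeLWR(u32 reg, u32 mem, u32 addr)
  {
    const u32 shift = (addr & 3u) * 8u;
    return (reg & (UINT32_C(0xFFFFFF00) << (24 - shift))) | (mem >> shift);
  }

Note the PGXP hooks are moved before the merge, so CPU_LWx/CPU_SWx can observe both the incoming rt value and the untouched memory word.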
diff --git a/src/core/cpu_pgxp.cpp b/src/core/cpu_pgxp.cpp
index 208855c74..ed85175d1 100644
--- a/src/core/cpu_pgxp.cpp
+++ b/src/core/cpu_pgxp.cpp
@@ -338,7 +338,8 @@ ALWAYS_INLINE_RELEASE void CPU::PGXP::WriteMem(u32 addr, const PGXPValue& value)
     return;
 
   *pMem = value;
-  pMem->flags |= VALID_LOWZ | VALID_HIGHZ;
+  pMem->flags =
+    (value.flags & ~(VALID_LOWZ | VALID_HIGHZ)) | ((value.flags & VALID_Z) ? (VALID_LOWZ | VALID_HIGHZ) : 0);
 }
 
 ALWAYS_INLINE_RELEASE void CPU::PGXP::WriteMem16(u32 addr, const PGXPValue& value)
@@ -699,6 +700,241 @@ void CPU::PGXP::CPU_SW(Instruction instr, u32 addr, u32 rtVal)
   WriteMem(addr, prtVal);
 }
 
+void CPU::PGXP::CPU_LWx(Instruction instr, u32 addr, u32 rtVal)
+{
+  const u32 aligned_addr = addr & ~3u;
+  PGXPValue* pmemVal = GetPtr(aligned_addr);
+  u32 memVal;
+  if (!pmemVal)
+    return;
+  if (!CPU::SafeReadMemoryWord(aligned_addr, &memVal)) [[unlikely]]
+    return;
+
+  pmemVal->Validate(memVal);
+  LOG_VALUES_LOAD(addr, memVal);
+
+  PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
+
+  const u32 byte_shift = addr & 3u;
+
+  if (instr.op == InstructionOp::lwl)
+  {
+    const u32 bit_shift = (byte_shift * 8);
+    const u32 mixed_value = (rtVal & (UINT32_C(0x00FFFFFF) >> bit_shift)) | (memVal << (24 - bit_shift));
+
+    switch (byte_shift)
+    {
+      case 0:
+      {
+        // only writing the upper half of Y, can't do much about that..
+        prtVal.y = static_cast<float>(static_cast<s16>(mixed_value >> 16));
+        prtVal.value = mixed_value;
+        prtVal.flags = (prtVal.flags & ~VALID_Y);
+      }
+      break;
+
+      case 1:
+      {
+        prtVal.y = pmemVal->x;
+        prtVal.z = (pmemVal->flags & VALID_LOWZ) ? pmemVal->z : prtVal.z;
+        prtVal.value = mixed_value;
+        prtVal.flags =
+          (prtVal.flags & ~VALID_Y) | ((pmemVal->flags & VALID_X) << 1) | ((pmemVal->flags & VALID_LOWZ) ? VALID_Z : 0);
+      }
+      break;
+
+      case 2:
+      {
+        // making a dog's breakfast of both X and Y
+        prtVal.x = static_cast<float>(static_cast<s16>(mixed_value));
+        prtVal.y = static_cast<float>(static_cast<s16>(mixed_value >> 16));
+        prtVal.value = mixed_value;
+        prtVal.flags &= ~(VALID_X | VALID_Y | VALID_Z);
+      }
+      break;
+
+      case 3:
+      {
+        // effectively the same as a normal load.
+        prtVal = *pmemVal;
+        prtVal.value = mixed_value;
+      }
+      break;
+
+        DefaultCaseIsUnreachable();
+    }
+  }
+  else
+  {
+    const u32 bit_shift = (byte_shift * 8);
+    const u32 mixed_value = (rtVal & (UINT32_C(0xFFFFFF00) << (24 - bit_shift))) | (memVal >> bit_shift);
+
+    switch (byte_shift)
+    {
+      case 0:
+      {
+        // effectively the same as a normal load.
+        prtVal = *pmemVal;
+        prtVal.value = mixed_value;
+      }
+      break;
+
+      case 1:
+      {
+        // making a dog's breakfast of both X and Y
+        prtVal.x = static_cast<float>(static_cast<s16>(mixed_value));
+        prtVal.y = static_cast<float>(static_cast<s16>(mixed_value >> 16));
+        prtVal.value = mixed_value;
+        prtVal.flags &= ~(VALID_X | VALID_Y | VALID_Z);
+      }
+      break;
+
+      case 2:
+      {
+        prtVal.x = pmemVal->y;
+        prtVal.z = (pmemVal->flags & VALID_HIGHZ) ? pmemVal->z : prtVal.z;
+        prtVal.value = mixed_value;
+        prtVal.flags = (prtVal.flags & ~VALID_X) | ((pmemVal->flags & VALID_Y) >> 1) |
+                       ((pmemVal->flags & VALID_HIGHZ) ? VALID_Z : 0);
+      }
+      break;
+
+      case 3:
+      {
+        // only writing the lower half of X, can't do much about that..
+        prtVal.x = static_cast<float>(static_cast<s16>(mixed_value));
+        prtVal.value = mixed_value;
+        prtVal.flags = (prtVal.flags & ~VALID_X);
+      }
+      break;
+
+        DefaultCaseIsUnreachable();
+    }
+  }
+}
+
+void CPU::PGXP::CPU_SWx(Instruction instr, u32 addr, u32 rtVal)
+{
+  LOG_VALUES_STORE(instr.r.rt.GetValue(), rtVal, addr);
+
+  const u32 aligned_addr = addr & ~3u;
+  PGXPValue* pmemVal = GetPtr(aligned_addr);
+  u32 memVal;
+  if (!pmemVal)
+    return;
+  if (!CPU::SafeReadMemoryWord(aligned_addr, &memVal)) [[unlikely]]
+    return;
+
+  pmemVal->Validate(memVal);
+
+  PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
+
+  const u32 byte_shift = addr & 3u;
+
+  if (instr.op == InstructionOp::swl)
+  {
+    const u32 bit_shift = (byte_shift * 8);
+    const u32 mixed_value = (memVal & (UINT32_C(0xFFFFFF00) << bit_shift)) | (rtVal >> (24 - bit_shift));
+
+    switch (byte_shift)
+    {
+      case 0:
+      {
+        // only writing the lower half of X, can't do much about that..
+        pmemVal->x = static_cast<float>(static_cast<s16>(mixed_value));
+        pmemVal->value = mixed_value;
+        pmemVal->flags =
+          (pmemVal->flags & ~(VALID_X | VALID_Z | VALID_LOWZ)) | ((pmemVal->flags & VALID_HIGHZ) ? VALID_Z : 0);
+      }
+      break;
+
+      case 1:
+      {
+        pmemVal->x = prtVal.y;
+        pmemVal->z = (prtVal.flags & VALID_Z) ? prtVal.z : pmemVal->z;
+        pmemVal->value = mixed_value;
+        pmemVal->flags = (pmemVal->flags & ~(VALID_X | VALID_Z | VALID_LOWZ)) | ((prtVal.flags & VALID_Y) >> 1) |
+                         ((prtVal.flags & VALID_Z) ? (VALID_Z | VALID_LOWZ) : 0) |
+                         ((pmemVal->flags & VALID_HIGHZ) ? VALID_Z : 0);
+      }
+      break;
+
+      case 2:
+      {
+        // making a dog's breakfast of both X and Y
+        pmemVal->x = static_cast<float>(static_cast<s16>(mixed_value));
+        pmemVal->y = static_cast<float>(static_cast<s16>(mixed_value >> 16));
+        pmemVal->value = mixed_value;
+        pmemVal->flags &= ~(VALID_X | VALID_Y | VALID_Z | VALID_LOWZ | VALID_HIGHZ);
+      }
+      break;
+
+      case 3:
+      {
+        // effectively the same as a normal store.
+        *pmemVal = prtVal;
+        pmemVal->value = mixed_value;
+        pmemVal->flags =
+          (prtVal.flags & ~(VALID_LOWZ | VALID_HIGHZ)) | ((prtVal.flags & VALID_Z) ? (VALID_LOWZ | VALID_HIGHZ) : 0);
+      }
+      break;
+
+        DefaultCaseIsUnreachable();
+    }
+  }
+  else
+  {
+    const u32 bit_shift = (byte_shift * 8);
+    const u32 mixed_value = (memVal & (UINT32_C(0x00FFFFFF) >> (24 - bit_shift))) | (rtVal << bit_shift);
+
+    switch (byte_shift)
+    {
+      case 0:
+      {
+        // effectively the same as a normal store.
+        *pmemVal = prtVal;
+        pmemVal->value = mixed_value;
+        pmemVal->flags =
+          (prtVal.flags & ~(VALID_LOWZ | VALID_HIGHZ)) | ((prtVal.flags & VALID_Z) ? (VALID_LOWZ | VALID_HIGHZ) : 0);
+      }
+      break;
+
+      case 1:
+      {
+        // making a dog's breakfast of both X and Y
+        pmemVal->x = static_cast<float>(static_cast<s16>(mixed_value));
+        pmemVal->y = static_cast<float>(static_cast<s16>(mixed_value >> 16));
+        pmemVal->value = mixed_value;
+        pmemVal->flags &= ~(VALID_X | VALID_Y | VALID_LOWZ | VALID_HIGHZ);
+      }
+      break;
+
+      case 2:
+      {
+        pmemVal->y = prtVal.x;
+        pmemVal->z = (prtVal.flags & VALID_Z) ? prtVal.z : pmemVal->z;
+        pmemVal->value = mixed_value;
+        pmemVal->flags = (pmemVal->flags & ~(VALID_Y | VALID_Z | VALID_HIGHZ)) | ((prtVal.flags & VALID_X) << 1) |
+                         ((prtVal.flags & VALID_Z) ? (VALID_Z | VALID_HIGHZ) : 0) |
+                         ((pmemVal->flags & VALID_LOWZ) ? VALID_Z : 0);
+      }
+      break;
+
+      case 3:
+      {
+        // only writing the upper half of Y, can't do much about that..
+        pmemVal->y = static_cast<float>(static_cast<s16>(mixed_value >> 16));
+        pmemVal->value = mixed_value;
+        pmemVal->flags =
+          (pmemVal->flags & ~(VALID_Y | VALID_Z | VALID_HIGHZ)) | ((pmemVal->flags & VALID_LOWZ) ? VALID_Z : 0);
+      }
+      break;
+
+        DefaultCaseIsUnreachable();
+    }
+  }
+}
+
 void CPU::PGXP::CPU_MOVE_Packed(u32 rd_and_rs, u32 rsVal)
 {
   const u32 Rs = (rd_and_rs & 0xFFu);
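A quick sanity check of the case split above (not part of the patch): for lwl with addr & 3 == 3, bit_shift is 24, so the register mask 0x00FFFFFF >> 24 collapses to 0 and mixed_value == memVal, i.e. the whole word comes from memory, which is why that case simply copies *pmemVal. Symmetrically, lwr with addr & 3 == 0 keeps nothing of rt:

  // lwl, byte_shift == 3: mixed = (rt & 0) | (mem << 0)           == mem -> full load
  // lwr, byte_shift == 0: mixed = (rt & (0xFFFFFF00 << 24)) | mem == mem -> full load

The remaining cases splice halfwords across the register and memory values, which is where the x/y component copies and the VALID_* flag shuffling above come from.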
diff --git a/src/core/cpu_pgxp.h b/src/core/cpu_pgxp.h
index 9852f6b78..15a39ffcc 100644
--- a/src/core/cpu_pgxp.h
+++ b/src/core/cpu_pgxp.h
@@ -31,9 +31,11 @@ void CPU_LW(Instruction instr, u32 addr, u32 rtVal);
 void CPU_LH(Instruction instr, u32 addr, u32 rtVal);
 void CPU_LHU(Instruction instr, u32 addr, u32 rtVal);
 void CPU_LBx(Instruction instr, u32 addr, u32 rtVal);
+void CPU_LWx(Instruction instr, u32 addr, u32 rtVal);
 void CPU_SB(Instruction instr, u32 addr, u32 rtVal);
 void CPU_SH(Instruction instr, u32 addr, u32 rtVal);
 void CPU_SW(Instruction instr, u32 addr, u32 rtVal);
+void CPU_SWx(Instruction instr, u32 addr, u32 rtVal);
 void CPU_MOVE(u32 Rd, u32 Rs, u32 rsVal);
 void CPU_MOVE_Packed(u32 rd_and_rs, u32 rsVal);
 void CPU_ADDI(Instruction instr, u32 rsVal);
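Both new entry points deliberately run before the actual memory access: CPU_SWx reads the pre-store word itself (via SafeReadMemoryWord) so it can merge tracked components against the old contents, and CPU_LWx needs rt's pre-load value. The ordering contract, sketched as comments (illustrative, mirroring the interpreter hunks above):

  // 1. mem = ReadMemoryWord(addr & ~3)        // old word, still intact
  // 2. PGXP::CPU_SWx(inst, addr, rtVal)       // merge PGXP state using the old word
  // 3. WriteMemoryWord(addr & ~3, new_value)  // the actual merged store
  // (loads are analogous: CPU_LWx runs before WriteRegDelayed)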
diff --git a/src/core/cpu_recompiler_arm32.cpp b/src/core/cpu_recompiler_arm32.cpp
index 05f05b8d2..5c64f02c4 100644
--- a/src/core/cpu_recompiler_arm32.cpp
+++ b/src/core/cpu_recompiler_arm32.cpp
@@ -920,15 +920,28 @@ void CPU::ARM32Recompiler::MoveTToReg(const vixl::aarch32::Register& dst, Compil
   }
 }
 
-void CPU::ARM32Recompiler::MoveMIPSRegToReg(const vixl::aarch32::Register& dst, Reg reg)
+void CPU::ARM32Recompiler::MoveMIPSRegToReg(const vixl::aarch32::Register& dst, Reg reg, bool ignore_load_delays)
 {
   DebugAssert(reg < Reg::count);
-  if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
+  if (ignore_load_delays && m_load_delay_register == reg)
+  {
+    if (m_load_delay_value_register == NUM_HOST_REGS)
+      armAsm->ldr(dst, PTR(&g_state.load_delay_value));
+    else
+      armAsm->mov(dst, Register(m_load_delay_value_register));
+  }
+  else if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
+  {
     armAsm->mov(dst, Register(hreg.value()));
+  }
   else if (HasConstantReg(reg))
+  {
     EmitMov(dst, GetConstantRegU32(reg));
+  }
   else
+  {
     armAsm->ldr(dst, MipsPtr(reg));
+  }
 }
 
 void CPU::ARM32Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
@@ -1909,6 +1922,17 @@ void CPU::ARM32Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, b
   // We'd need to be careful here if we weren't overwriting it..
   ComputeLoadStoreAddressArg(cf, address, addr);
 
+  // Do PGXP first, it does its own load.
+  if (g_settings.gpu_pgxp_enable && inst->r.rt != Reg::zero)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    EmitMov(RARG1, inst->bits);
+    armAsm->mov(RARG2, addr);
+    MoveMIPSRegToReg(RARG3, inst->r.rt, true);
+    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWx));
+  }
+
   armAsm->bic(RARG1, addr, 3);
   GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });
@@ -1976,15 +2000,6 @@ void CPU::ARM32Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, b
   }
 
   FreeHostReg(addr.GetCode());
-
-  if (g_settings.gpu_pgxp_enable)
-  {
-    Flush(FLUSH_FOR_C_CALL);
-    armAsm->mov(RARG3, value);
-    armAsm->bic(RARG2, addr, 3);
-    EmitMov(RARG1, inst->bits);
-    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LW));
-  }
 }
 
 void CPU::ARM32Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
@@ -2109,15 +2124,22 @@ void CPU::ARM32Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, b
   // TODO: this can take over rt's value if it's no longer needed
   // NOTE: can't trust T in cf because of the alloc
   const Register addr = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
-  const Register value = g_settings.gpu_pgxp_enable ? Register(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;
-  if (g_settings.gpu_pgxp_enable)
-    MoveMIPSRegToReg(value, inst->r.rt);
 
   FlushForLoadStore(address, true, use_fastmem);
 
   // TODO: if address is constant, this can be simplified..
   // We'd need to be careful here if we weren't overwriting it..
   ComputeLoadStoreAddressArg(cf, address, addr);
 
+  if (g_settings.gpu_pgxp_enable)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    EmitMov(RARG1, inst->bits);
+    armAsm->mov(RARG2, addr);
+    MoveMIPSRegToReg(RARG3, inst->r.rt);
+    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWx));
+  }
+
   armAsm->bic(RARG1, addr, 3);
   GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });
@@ -2125,9 +2147,7 @@ void CPU::ARM32Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, b
   armAsm->lsl(RSCRATCH, RSCRATCH, 3); // *8
   armAsm->bic(addr, addr, 3);
 
-  // Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush.
-  if (!g_settings.gpu_pgxp_enable)
-    MoveMIPSRegToReg(value, inst->r.rt);
+  MoveMIPSRegToReg(RARG2, inst->r.rt);
 
   if (inst->op == InstructionOp::swl)
   {
@@ -2139,40 +2159,25 @@ void CPU::ARM32Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, b
     EmitMov(RARG3, 24);
     armAsm->sub(RARG3, RARG3, RSCRATCH);
-    armAsm->lsr(value, value, RARG3);
-    armAsm->orr(value, value, RRET);
+    armAsm->lsr(RARG2, RARG2, RARG3);
+    armAsm->orr(RARG2, RARG2, RRET);
   }
   else
   {
     // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
     // new_value = (RWRET & mem_mask) | (value << shift);
-    armAsm->lsl(value, value, RSCRATCH);
+    armAsm->lsl(RARG2, RARG2, RSCRATCH);
     EmitMov(RARG3, 24);
     armAsm->sub(RARG3, RARG3, RSCRATCH);
     EmitMov(RSCRATCH, 0x00FFFFFFu);
     armAsm->lsr(RSCRATCH, RSCRATCH, RARG3);
     armAsm->and_(RRET, RRET, RSCRATCH);
-    armAsm->orr(value, value, RRET);
-  }
-
-  if (!g_settings.gpu_pgxp_enable)
-  {
-    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
-    FreeHostReg(addr.GetCode());
+    armAsm->orr(RARG2, RARG2, RRET);
   }
-  else
-  {
-    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
-    Flush(FLUSH_FOR_C_CALL);
-    armAsm->mov(RARG3, value);
-    FreeHostReg(value.GetCode());
-    armAsm->mov(RARG2, addr);
-    FreeHostReg(addr.GetCode());
-    EmitMov(RARG1, inst->bits);
-    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SW));
-  }
+
+  GenerateStore(addr, RARG2, MemoryAccessSize::Word, use_fastmem);
+  FreeHostReg(addr.GetCode());
 }
 
 void CPU::ARM32Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
diff --git a/src/core/cpu_recompiler_arm32.h b/src/core/cpu_recompiler_arm32.h
index 18e0ea675..cb58b69b6 100644
--- a/src/core/cpu_recompiler_arm32.h
+++ b/src/core/cpu_recompiler_arm32.h
@@ -153,7 +153,7 @@ private:
   void MoveSToReg(const vixl::aarch32::Register& dst, CompileFlags cf);
   void MoveTToReg(const vixl::aarch32::Register& dst, CompileFlags cf);
-  void MoveMIPSRegToReg(const vixl::aarch32::Register& dst, Reg reg);
+  void MoveMIPSRegToReg(const vixl::aarch32::Register& dst, Reg reg, bool ignore_load_delays = false);
 
   vixl::aarch32::Assembler m_emitter;
   vixl::aarch32::Assembler m_far_emitter;
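The new ignore_load_delays flag exists because lwl/lwr read rt while a delayed load to the same register may still be in flight (see the interpreter comment above: "Bypasses load delay"). The canonical unaligned-load idiom relies on exactly this behaviour:

  // lwr $t0, 0($a0)
  // lwl $t0, 3($a0)   ; reads $t0 while the lwr result is still in its delay slot

So when the recompiler hands rt's current value to CPU_LWx, it has to forward g_state.load_delay_value (or the host register holding it) rather than the stale guest register; plain stores (CPU_SWx) use the default false.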
diff --git a/src/core/cpu_recompiler_arm64.cpp b/src/core/cpu_recompiler_arm64.cpp
index b58e8f9e0..be85b08b9 100644
--- a/src/core/cpu_recompiler_arm64.cpp
+++ b/src/core/cpu_recompiler_arm64.cpp
@@ -1083,15 +1083,28 @@ void CPU::ARM64Recompiler::MoveTToReg(const vixl::aarch64::Register& dst, Compil
   }
 }
 
-void CPU::ARM64Recompiler::MoveMIPSRegToReg(const vixl::aarch64::Register& dst, Reg reg)
+void CPU::ARM64Recompiler::MoveMIPSRegToReg(const vixl::aarch64::Register& dst, Reg reg, bool ignore_load_delays)
 {
   DebugAssert(reg < Reg::count && dst.IsW());
-  if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
+  if (ignore_load_delays && m_load_delay_register == reg)
+  {
+    if (m_load_delay_value_register == NUM_HOST_REGS)
+      armAsm->ldr(dst, PTR(&g_state.load_delay_value));
+    else
+      armAsm->mov(dst, WRegister(m_load_delay_value_register));
+  }
+  else if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
+  {
     armAsm->mov(dst, WRegister(hreg.value()));
+  }
   else if (HasConstantReg(reg))
+  {
     EmitMov(dst, GetConstantRegU32(reg));
+  }
   else
+  {
     armAsm->ldr(dst, MipsPtr(reg));
+  }
 }
 
 void CPU::ARM64Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
@@ -2085,6 +2098,17 @@ void CPU::ARM64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, b
   // We'd need to be careful here if we weren't overwriting it..
   ComputeLoadStoreAddressArg(cf, address, addr);
 
+  // Do PGXP first, it does its own load.
+  if (g_settings.gpu_pgxp_enable && inst->r.rt != Reg::zero)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    EmitMov(RWARG1, inst->bits);
+    armAsm->mov(RWARG2, addr);
+    MoveMIPSRegToReg(RWARG3, inst->r.rt, true);
+    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWx));
+  }
+
   armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
   GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });
@@ -2152,15 +2176,6 @@ void CPU::ARM64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, b
   }
 
   FreeHostReg(addr.GetCode());
-
-  if (g_settings.gpu_pgxp_enable)
-  {
-    Flush(FLUSH_FOR_C_CALL);
-    armAsm->mov(RWARG3, value);
-    armAsm->and_(RWARG2, addr, armCheckLogicalConstant(~0x3u));
-    EmitMov(RWARG1, inst->bits);
-    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LW));
-  }
 }
 
 void CPU::ARM64Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
@@ -2285,15 +2300,22 @@ void CPU::ARM64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, b
   // TODO: this can take over rt's value if it's no longer needed
   // NOTE: can't trust T in cf because of the alloc
   const Register addr = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
-  const Register value = g_settings.gpu_pgxp_enable ? WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2;
-  if (g_settings.gpu_pgxp_enable)
-    MoveMIPSRegToReg(value, inst->r.rt);
 
   FlushForLoadStore(address, true, use_fastmem);
 
   // TODO: if address is constant, this can be simplified..
   // We'd need to be careful here if we weren't overwriting it..
   ComputeLoadStoreAddressArg(cf, address, addr);
 
+  if (g_settings.gpu_pgxp_enable)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    EmitMov(RWARG1, inst->bits);
+    armAsm->mov(RWARG2, addr);
+    MoveMIPSRegToReg(RWARG3, inst->r.rt);
+    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWx));
+  }
+
   armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
   GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });
@@ -2301,9 +2323,7 @@ void CPU::ARM64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, b
   armAsm->lsl(RWSCRATCH, RWSCRATCH, 3); // *8
   armAsm->and_(addr, addr, armCheckLogicalConstant(~0x3u));
 
-  // Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush.
-  if (!g_settings.gpu_pgxp_enable)
-    MoveMIPSRegToReg(value, inst->r.rt);
+  MoveMIPSRegToReg(RWARG2, inst->r.rt);
 
   if (inst->op == InstructionOp::swl)
   {
@@ -2315,40 +2335,25 @@ void CPU::ARM64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, b
     EmitMov(RWARG3, 24);
     armAsm->sub(RWARG3, RWARG3, RWSCRATCH);
-    armAsm->lsrv(value, value, RWARG3);
-    armAsm->orr(value, value, RWRET);
+    armAsm->lsrv(RWARG2, RWARG2, RWARG3);
+    armAsm->orr(RWARG2, RWARG2, RWRET);
   }
   else
   {
     // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
     // new_value = (RWRET & mem_mask) | (value << shift);
-    armAsm->lslv(value, value, RWSCRATCH);
+    armAsm->lslv(RWARG2, RWARG2, RWSCRATCH);
     EmitMov(RWARG3, 24);
     armAsm->sub(RWARG3, RWARG3, RWSCRATCH);
     EmitMov(RWSCRATCH, 0x00FFFFFFu);
     armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG3);
     armAsm->and_(RWRET, RWRET, RWSCRATCH);
-    armAsm->orr(value, value, RWRET);
-  }
-
-  if (!g_settings.gpu_pgxp_enable)
-  {
-    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
-    FreeHostReg(addr.GetCode());
+    armAsm->orr(RWARG2, RWARG2, RWRET);
   }
-  else
-  {
-    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
-    Flush(FLUSH_FOR_C_CALL);
-    armAsm->mov(RWARG3, value);
-    FreeHostReg(value.GetCode());
-    armAsm->mov(RWARG2, addr);
-    FreeHostReg(addr.GetCode());
-    EmitMov(RWARG1, inst->bits);
-    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SW));
-  }
+
+  GenerateStore(addr, RWARG2, MemoryAccessSize::Word, use_fastmem);
+  FreeHostReg(addr.GetCode());
 }
 
 void CPU::ARM64Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
diff --git a/src/core/cpu_recompiler_arm64.h b/src/core/cpu_recompiler_arm64.h
index 55011f145..41d582d79 100644
--- a/src/core/cpu_recompiler_arm64.h
+++ b/src/core/cpu_recompiler_arm64.h
@@ -154,7 +154,7 @@ private:
   void MoveSToReg(const vixl::aarch64::Register& dst, CompileFlags cf);
   void MoveTToReg(const vixl::aarch64::Register& dst, CompileFlags cf);
-  void MoveMIPSRegToReg(const vixl::aarch64::Register& dst, Reg reg);
+  void MoveMIPSRegToReg(const vixl::aarch64::Register& dst, Reg reg, bool ignore_load_delays = false);
 
   vixl::aarch64::Assembler m_emitter;
   vixl::aarch64::Assembler m_far_emitter;
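One arm64-specific detail worth noting: the AND of the address goes through armCheckLogicalConstant because AArch64 logical instructions only accept immediates encodable as repeating bitmask patterns (my reading of the helper's role, going by its name and use here). ~0x3u == 0xFFFFFFFC is such a pattern, so the mask should encode as a single instruction:

  // and w1, w1, #0xfffffffc   ; RWARG1 = addr & ~3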
diff --git a/src/core/cpu_recompiler_riscv64.cpp b/src/core/cpu_recompiler_riscv64.cpp
index b5424051a..021e158e1 100644
--- a/src/core/cpu_recompiler_riscv64.cpp
+++ b/src/core/cpu_recompiler_riscv64.cpp
@@ -911,15 +911,28 @@ void CPU::RISCV64Recompiler::MoveTToReg(const biscuit::GPR& dst, CompileFlags cf
   }
 }
 
-void CPU::RISCV64Recompiler::MoveMIPSRegToReg(const biscuit::GPR& dst, Reg reg)
+void CPU::RISCV64Recompiler::MoveMIPSRegToReg(const biscuit::GPR& dst, Reg reg, bool ignore_load_delays)
 {
   DebugAssert(reg < Reg::count);
-  if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
+  if (ignore_load_delays && m_load_delay_register == reg)
+  {
+    if (m_load_delay_value_register == NUM_HOST_REGS)
+      rvAsm->LW(dst, PTR(&g_state.load_delay_value));
+    else
+      rvAsm->MV(dst, GPR(m_load_delay_value_register));
+  }
+  else if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
+  {
     rvAsm->MV(dst, GPR(hreg.value()));
+  }
   else if (HasConstantReg(reg))
+  {
     EmitMov(dst, GetConstantRegU32(reg));
+  }
   else
+  {
     rvAsm->LW(dst, PTR(&g_state.regs.r[static_cast<size_t>(reg)]));
+  }
 }
 
 void CPU::RISCV64Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
@@ -1942,6 +1955,17 @@ void CPU::RISCV64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size,
   // We'd need to be careful here if we weren't overwriting it..
   ComputeLoadStoreAddressArg(cf, address, addr);
 
+  // Do PGXP first, it does its own load.
+  if (g_settings.gpu_pgxp_enable && inst->r.rt != Reg::zero)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    EmitMov(RARG1, inst->bits);
+    rvAsm->MV(RARG2, addr);
+    MoveMIPSRegToReg(RARG3, inst->r.rt, true);
+    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWx));
+  }
+
   rvAsm->ANDI(RARG1, addr, ~0x3u);
   GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });
@@ -2009,15 +2033,6 @@ void CPU::RISCV64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size,
   }
 
   FreeHostReg(addr.Index());
-
-  if (g_settings.gpu_pgxp_enable)
-  {
-    Flush(FLUSH_FOR_C_CALL);
-    rvAsm->MV(RARG3, value);
-    rvAsm->ANDI(RARG2, addr, ~0x3u);
-    EmitMov(RARG1, inst->bits);
-    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LW));
-  }
 }
 
 void CPU::RISCV64Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
@@ -2140,15 +2155,22 @@ void CPU::RISCV64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size,
   // TODO: this can take over rt's value if it's no longer needed
   // NOTE: can't trust T in cf because of the alloc
   const GPR addr = GPR(AllocateTempHostReg(HR_CALLEE_SAVED));
-  const GPR value = g_settings.gpu_pgxp_enable ? GPR(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;
-  if (g_settings.gpu_pgxp_enable)
-    MoveMIPSRegToReg(value, inst->r.rt);
 
   FlushForLoadStore(address, true, use_fastmem);
 
   // TODO: if address is constant, this can be simplified..
   // We'd need to be careful here if we weren't overwriting it..
   ComputeLoadStoreAddressArg(cf, address, addr);
 
+  if (g_settings.gpu_pgxp_enable)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    EmitMov(RARG1, inst->bits);
+    rvAsm->MV(RARG2, addr);
+    MoveMIPSRegToReg(RARG3, inst->r.rt);
+    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWx));
+  }
+
   rvAsm->ANDI(RARG1, addr, ~0x3u);
   GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });
@@ -2158,7 +2180,7 @@ void CPU::RISCV64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size,
 
   // Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush.
   if (!g_settings.gpu_pgxp_enable)
-    MoveMIPSRegToReg(value, inst->r.rt);
+    MoveMIPSRegToReg(RARG2, inst->r.rt);
 
   if (inst->op == InstructionOp::swl)
   {
@@ -2170,40 +2192,25 @@ void CPU::RISCV64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size,
     EmitMov(RARG3, 24);
     rvAsm->SUBW(RARG3, RARG3, RSCRATCH);
-    rvAsm->SRLW(value, value, RARG3);
-    rvAsm->OR(value, value, RRET);
+    rvAsm->SRLW(RARG2, RARG2, RARG3);
+    rvAsm->OR(RARG2, RARG2, RRET);
   }
   else
   {
     // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
     // new_value = (RWRET & mem_mask) | (value << shift);
-    rvAsm->SLLW(value, value, RSCRATCH);
+    rvAsm->SLLW(RARG2, RARG2, RSCRATCH);
     EmitMov(RARG3, 24);
     rvAsm->SUBW(RARG3, RARG3, RSCRATCH);
     EmitMov(RSCRATCH, 0x00FFFFFFu);
     rvAsm->SRLW(RSCRATCH, RSCRATCH, RARG3);
     rvAsm->AND(RRET, RRET, RSCRATCH);
-    rvAsm->OR(value, value, RRET);
-  }
-
-  if (!g_settings.gpu_pgxp_enable)
-  {
-    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
-    FreeHostReg(addr.Index());
+    rvAsm->OR(RARG2, RARG2, RRET);
   }
-  else
-  {
-    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
-    Flush(FLUSH_FOR_C_CALL);
-    rvAsm->MV(RARG3, value);
-    FreeHostReg(value.Index());
-    rvAsm->MV(RARG2, addr);
-    FreeHostReg(addr.Index());
-    EmitMov(RARG1, inst->bits);
-    EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SW));
-  }
+
+  GenerateStore(addr, RARG2, MemoryAccessSize::Word, use_fastmem);
+  FreeHostReg(addr.Index());
 }
 
 void CPU::RISCV64Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
diff --git a/src/core/cpu_recompiler_riscv64.h b/src/core/cpu_recompiler_riscv64.h
index 69b7a2236..acd76d862 100644
--- a/src/core/cpu_recompiler_riscv64.h
+++ b/src/core/cpu_recompiler_riscv64.h
@@ -166,7 +166,7 @@ private:
   void MoveSToReg(const biscuit::GPR& dst, CompileFlags cf);
   void MoveTToReg(const biscuit::GPR& dst, CompileFlags cf);
-  void MoveMIPSRegToReg(const biscuit::GPR& dst, Reg reg);
+  void MoveMIPSRegToReg(const biscuit::GPR& dst, Reg reg, bool ignore_load_delays = false);
 
   std::unique_ptr<biscuit::Assembler> m_emitter;
   std::unique_ptr<biscuit::Assembler> m_far_emitter;
diff --git a/src/core/cpu_recompiler_x64.cpp b/src/core/cpu_recompiler_x64.cpp
index 48a7c79d6..6cbf1fcde 100644
--- a/src/core/cpu_recompiler_x64.cpp
+++ b/src/core/cpu_recompiler_x64.cpp
@@ -858,15 +858,28 @@ void CPU::X64Recompiler::MoveTToReg(const Xbyak::Reg32& dst, CompileFlags cf)
   }
 }
 
-void CPU::X64Recompiler::MoveMIPSRegToReg(const Xbyak::Reg32& dst, Reg reg)
+void CPU::X64Recompiler::MoveMIPSRegToReg(const Xbyak::Reg32& dst, Reg reg, bool ignore_load_delays)
 {
   DebugAssert(reg < Reg::count);
-  if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
+  if (ignore_load_delays && m_load_delay_register == reg)
+  {
+    if (m_load_delay_value_register == NUM_HOST_REGS)
+      cg->mov(dst, cg->dword[PTR(&g_state.load_delay_value)]);
+    else
+      cg->mov(dst, Reg32(m_load_delay_value_register));
+  }
+  else if (const std::optional<u32> hreg = CheckHostReg(0, HR_TYPE_CPU_REG, reg))
+  {
     cg->mov(dst, Reg32(hreg.value()));
+  }
   else if (HasConstantReg(reg))
+  {
     cg->mov(dst, GetConstantRegU32(reg));
+  }
   else
+  {
     cg->mov(dst, MipsPtr(reg));
+  }
 }
 
 void CPU::X64Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
@@ -1891,6 +1904,17 @@ void CPU::X64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, boo
   // We'd need to be careful here if we weren't overwriting it..
   ComputeLoadStoreAddressArg(cf, address, addr);
 
+  // Do PGXP first, it does its own load.
+  if (g_settings.gpu_pgxp_enable && inst->r.rt != Reg::zero)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    cg->mov(RWARG1, inst->bits);
+    cg->mov(RWARG2, addr);
+    MoveMIPSRegToReg(RWARG3, inst->r.rt, true);
+    cg->call(reinterpret_cast<const void*>(&PGXP::CPU_LWx));
+  }
+
   cg->mov(RWARG1, addr);
   cg->and_(RWARG1, ~0x3u);
   GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });
@@ -1965,18 +1989,6 @@ void CPU::X64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, boo
   }
 
   FreeHostReg(addr.getIdx());
-
-  if (g_settings.gpu_pgxp_enable)
-  {
-    Flush(FLUSH_FOR_C_CALL);
-
-    DebugAssert(value != RWARG3);
-    cg->mov(RWARG3, value);
-    cg->mov(RWARG2, addr);
-    cg->and_(RWARG2, ~0x3u);
-    cg->mov(RWARG1, inst->bits);
-    cg->call(reinterpret_cast<const void*>(&PGXP::CPU_LW));
-  }
 }
 
 void CPU::X64Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
@@ -2098,28 +2110,31 @@ void CPU::X64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, boo
   // TODO: this can take over rt's value if it's no longer needed
   // NOTE: can't trust T in cf because of the alloc
   const Reg32 addr = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED));
-  const Reg32 value = g_settings.gpu_pgxp_enable ? Reg32(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2;
-  if (g_settings.gpu_pgxp_enable)
-    MoveMIPSRegToReg(value, inst->r.rt);
-
   FlushForLoadStore(address, true, use_fastmem);
 
   // TODO: if address is constant, this can be simplified..
   // We'd need to be careful here if we weren't overwriting it..
   ComputeLoadStoreAddressArg(cf, address, addr);
 
+  if (g_settings.gpu_pgxp_enable)
+  {
+    Flush(FLUSH_FOR_C_CALL);
+    cg->mov(RWARG1, inst->bits);
+    cg->mov(RWARG2, addr);
+    MoveMIPSRegToReg(RWARG3, inst->r.rt);
+    cg->call(reinterpret_cast<const void*>(&PGXP::CPU_SWx));
+  }
+
   cg->mov(RWARG1, addr);
   cg->and_(RWARG1, ~0x3u);
   GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });
 
-  DebugAssert(value != cg->ecx);
   cg->mov(cg->ecx, addr);
   cg->and_(cg->ecx, 3);
   cg->shl(cg->ecx, 3); // *8
   cg->and_(addr, ~0x3u);
 
-  // Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush.
-  if (!g_settings.gpu_pgxp_enable)
-    MoveMIPSRegToReg(value, inst->r.rt);
+  MoveMIPSRegToReg(RWARG2, inst->r.rt);
 
   if (inst->op == InstructionOp::swl)
   {
@@ -2132,14 +2147,14 @@ void CPU::X64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, boo
     cg->mov(RWARG3, 24);
     cg->sub(RWARG3, cg->ecx);
     cg->mov(cg->ecx, RWARG3);
-    cg->shr(value, cg->cl);
-    cg->or_(value, RWRET);
+    cg->shr(RWARG2, cg->cl);
+    cg->or_(RWARG2, RWRET);
   }
   else
   {
     // const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
     // new_value = (RWRET & mem_mask) | (value << shift);
-    cg->shl(value, cg->cl);
+    cg->shl(RWARG2, cg->cl);
 
     DebugAssert(RWARG3 != cg->ecx);
     cg->mov(RWARG3, 24);
@@ -2148,26 +2163,11 @@ void CPU::X64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, boo
     cg->mov(RWARG3, 0x00FFFFFFu);
     cg->shr(RWARG3, cg->cl);
     cg->and_(RWRET, RWARG3);
-    cg->or_(value, RWRET);
-  }
-
-  if (!g_settings.gpu_pgxp_enable)
-  {
-    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
-    FreeHostReg(addr.getIdx());
+    cg->or_(RWARG2, RWRET);
   }
-  else
-  {
-    GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
-    Flush(FLUSH_FOR_C_CALL);
-    cg->mov(RWARG3, value);
-    FreeHostReg(value.getIdx());
-    cg->mov(RWARG2, addr);
-    FreeHostReg(addr.getIdx());
-    cg->mov(RWARG1, inst->bits);
-    cg->call(reinterpret_cast<const void*>(&PGXP::CPU_SW));
-  }
+
+  GenerateStore(addr, RWARG2, MemoryAccessSize::Word, use_fastmem);
+  FreeHostReg(addr.getIdx());
 }
 
 void CPU::X64Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
diff --git a/src/core/cpu_recompiler_x64.h b/src/core/cpu_recompiler_x64.h
index c2f1285ed..76192f26e 100644
--- a/src/core/cpu_recompiler_x64.h
+++ b/src/core/cpu_recompiler_x64.h
@@ -142,7 +142,7 @@ private:
   Xbyak::Reg32 MoveTToD(CompileFlags cf);
   void MoveSToReg(const Xbyak::Reg32& dst, CompileFlags cf);
   void MoveTToReg(const Xbyak::Reg32& dst, CompileFlags cf);
-  void MoveMIPSRegToReg(const Xbyak::Reg32& dst, Reg reg);
+  void MoveMIPSRegToReg(const Xbyak::Reg32& dst, Reg reg, bool ignore_load_delays = false);
 
   std::unique_ptr<Xbyak::CodeGenerator> m_emitter;
   std::unique_ptr<Xbyak::CodeGenerator> m_far_emitter;
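Net effect, for the record: games issue an unaligned 32-bit store as an swl/swr pair, e.g.

  // swl $t0, 3($a0)   ; high bytes of $t0 -> upper bytes of the span
  // swr $t0, 0($a0)   ; low bytes of $t0  -> lower bytes of the span

Previously these funnelled through CPU_SW on the aligned word with the already-merged value, which destroyed per-halfword provenance of the tracked vertex components. With CPU_LWx/CPU_SWx, the x/y components (and the split VALID_LOWZ/VALID_HIGHZ Z validity) survive the byte merges, which is presumably what restores the sky geometry in Kingsley's Adventure.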