CPU/PGXP: Implement lwl/lwr/swl/swr

Fixes gaps in sky geometry in Kingsley's Adventure.
pull/3576/head
Stenzek 1 month ago
parent e183ec307a
commit b106392da0
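
For reference, the merge rules this change implements, written out as standalone C++ (helper names are illustrative, not part of the diff; the masks and shifts match the interpreter hunks below):

#include <cstdint>

// R3000A little-endian unaligned load/store merges; shift = (addr & 3) * 8 throughout.
uint32_t lwl(uint32_t rt, uint32_t mem, uint32_t addr)
{
  const uint32_t shift = (addr & 3u) * 8;
  return (rt & (UINT32_C(0x00FFFFFF) >> shift)) | (mem << (24 - shift));
}
uint32_t lwr(uint32_t rt, uint32_t mem, uint32_t addr)
{
  const uint32_t shift = (addr & 3u) * 8;
  return (rt & (UINT32_C(0xFFFFFF00) << (24 - shift))) | (mem >> shift);
}
uint32_t swl(uint32_t mem, uint32_t rt, uint32_t addr)
{
  const uint32_t shift = (addr & 3u) * 8;
  return (mem & (UINT32_C(0xFFFFFF00) << shift)) | (rt >> (24 - shift));
}
uint32_t swr(uint32_t mem, uint32_t rt, uint32_t addr)
{
  const uint32_t shift = (addr & 3u) * 8;
  return (mem & (UINT32_C(0x00FFFFFF) >> (24 - shift))) | (rt << shift);
}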

@@ -1508,6 +1508,9 @@ restart_instruction:
// Bypasses load delay. No need to check the old value since this is the delay slot or it's not relevant.
const u32 existing_value = (inst.i.rt == g_state.load_delay_reg) ? g_state.load_delay_value : ReadReg(inst.i.rt);
if constexpr (pgxp_mode >= PGXPMode::Memory)
PGXP::CPU_LWx(inst, addr, existing_value);
const u8 shift = (Truncate8(addr) & u8(3)) * u8(8);
u32 new_value;
if (inst.op == InstructionOp::lwl)
@@ -1522,9 +1525,6 @@
}
WriteRegDelayed(inst.i.rt, new_value);
if constexpr (pgxp_mode >= PGXPMode::Memory)
PGXP::CPU_LW(inst, addr, new_value);
}
break;
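A quick worked check of the lwl/lwr merge above, with hypothetical values:

// rt = 0x11223344, word at aligned addr = 0xAABBCCDD, addr & 3 == 1 (shift = 8):
// lwl: mask      = 0x00FFFFFF >> 8  = 0x0000FFFF
//      new_value = (0x11223344 & 0x0000FFFF) | (0xAABBCCDD << 16) = 0xCCDD3344
// lwr: mask      = 0xFFFFFF00 << 16 = 0xFF000000
//      new_value = (0x11223344 & 0xFF000000) | (0xAABBCCDD >> 8)  = 0x11AABBCC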
@@ -1591,11 +1591,14 @@
}
const u32 reg_value = ReadReg(inst.i.rt);
const u8 shift = (Truncate8(addr) & u8(3)) * u8(8);
u32 mem_value;
if (!ReadMemoryWord(aligned_addr, &mem_value))
return;
if constexpr (pgxp_mode >= PGXPMode::Memory)
PGXP::CPU_SWx(inst, addr, reg_value);
const u8 shift = (Truncate8(addr) & u8(3)) * u8(8);
u32 new_value;
if (inst.op == InstructionOp::swl)
{
@@ -1609,9 +1612,6 @@
}
WriteMemoryWord(aligned_addr, new_value);
if constexpr (pgxp_mode >= PGXPMode::Memory)
PGXP::CPU_SW(inst, aligned_addr, new_value);
}
break;
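And the mirrored store merge, same hypothetical values:

// rt = 0x11223344, word in memory = 0xAABBCCDD, addr & 3 == 1 (shift = 8):
// swl: mem_mask  = 0xFFFFFF00 << 8  = 0xFFFF0000
//      new_value = (0xAABBCCDD & 0xFFFF0000) | (0x11223344 >> 16) = 0xAABB1122
// swr: mem_mask  = 0x00FFFFFF >> 16 = 0x000000FF
//      new_value = (0xAABBCCDD & 0x000000FF) | (0x11223344 << 8)  = 0x223344DD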

@@ -338,7 +338,8 @@ ALWAYS_INLINE_RELEASE void CPU::PGXP::WriteMem(u32 addr, const PGXPValue& value)
return;
*pMem = value;
pMem->flags |= VALID_LOWZ | VALID_HIGHZ;
pMem->flags =
(value.flags & ~(VALID_LOWZ | VALID_HIGHZ)) | ((value.flags & VALID_Z) ? (VALID_LOWZ | VALID_HIGHZ) : 0);
}
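Note on the flag change: the old code claimed Z-validity for both halfwords of the stored word unconditionally; the new expression only does so when the source value actually carries a valid Z. Spelled out as an equivalent if (assuming the VALID_* constants are independent bits):

u32 flags = value.flags & ~(VALID_LOWZ | VALID_HIGHZ); // drop stale halfword-Z bits
if (value.flags & VALID_Z)                             // word carries a trusted Z...
  flags |= VALID_LOWZ | VALID_HIGHZ;                   // ...so both halves inherit it
pMem->flags = flags;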
ALWAYS_INLINE_RELEASE void CPU::PGXP::WriteMem16(u32 addr, const PGXPValue& value)
@@ -699,6 +700,241 @@ void CPU::PGXP::CPU_SW(Instruction instr, u32 addr, u32 rtVal)
WriteMem(addr, prtVal);
}
void CPU::PGXP::CPU_LWx(Instruction instr, u32 addr, u32 rtVal)
{
const u32 aligned_addr = addr & ~3u;
PGXPValue* pmemVal = GetPtr(aligned_addr);
u32 memVal;
if (!pmemVal)
return;
if (!CPU::SafeReadMemoryWord(aligned_addr, &memVal)) [[unlikely]]
return;
pmemVal->Validate(memVal);
LOG_VALUES_LOAD(addr, memVal);
PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
const u32 byte_shift = addr & 3u;
if (instr.op == InstructionOp::lwl)
{
const u32 bit_shift = (byte_shift * 8);
const u32 mixed_value = (rtVal & (UINT32_C(0x00FFFFFF) >> bit_shift)) | (memVal << (24 - bit_shift));
switch (byte_shift)
{
case 0:
{
// only writing the upper half of Y, can't do much about that..
prtVal.y = static_cast<float>(static_cast<s16>(mixed_value >> 16));
prtVal.value = mixed_value;
prtVal.flags = (prtVal.flags & ~VALID_Y);
}
break;
case 1:
{
prtVal.y = pmemVal->x;
prtVal.z = (pmemVal->flags & VALID_LOWZ) ? pmemVal->z : prtVal.z;
prtVal.value = mixed_value;
prtVal.flags =
(prtVal.flags & ~VALID_Y) | ((pmemVal->flags & VALID_X) << 1) | ((pmemVal->flags & VALID_LOWZ) ? VALID_Z : 0);
}
break;
case 2:
{
// making a dog's breakfast of both X and Y
prtVal.x = static_cast<float>(static_cast<s16>(mixed_value));
prtVal.y = static_cast<float>(static_cast<s16>(mixed_value >> 16));
prtVal.value = mixed_value;
prtVal.flags &= ~(VALID_X | VALID_Y | VALID_Z);
}
break;
case 3:
{
// effectively the same as a normal load.
prtVal = *pmemVal;
prtVal.value = mixed_value;
}
break;
DefaultCaseIsUnreachable();
}
}
else
{
const u32 bit_shift = (byte_shift * 8);
const u32 mixed_value = (rtVal & (UINT32_C(0xFFFFFF00) << (24 - bit_shift))) | (memVal >> bit_shift);
switch (byte_shift)
{
case 0:
{
// effectively the same as a normal load.
prtVal = *pmemVal;
prtVal.value = mixed_value;
}
break;
case 1:
{
// making a dog's breakfast of both X and Y
prtVal.x = static_cast<float>(static_cast<s16>(mixed_value));
prtVal.y = static_cast<float>(static_cast<s16>(mixed_value >> 16));
prtVal.value = mixed_value;
prtVal.flags &= ~(VALID_X | VALID_Y | VALID_Z);
}
break;
case 2:
{
prtVal.x = pmemVal->y;
prtVal.z = (pmemVal->flags & VALID_HIGHZ) ? pmemVal->z : prtVal.z;
prtVal.value = mixed_value;
prtVal.flags = (prtVal.flags & ~VALID_X) | ((pmemVal->flags & VALID_Y) >> 1) |
((pmemVal->flags & VALID_HIGHZ) ? VALID_Z : 0);
}
break;
case 3:
{
// only writing the lower half of X, can't do much about that..
prtVal.x = static_cast<float>(static_cast<s16>(mixed_value));
prtVal.value = mixed_value;
prtVal.flags = (prtVal.flags & ~VALID_X);
}
break;
DefaultCaseIsUnreachable();
}
}
}
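Reading the case analysis: a PGXPValue maps the low halfword of a register/word to X and the high halfword to Y, so a partial merge moves vertex components between slots. For example, case 1 of lwl:

// addr & 3 == 1 (bit_shift = 8):
//   mixed_value = (rtVal & 0x0000FFFF) | (memVal << 16)
// rt keeps its low halfword (its X slot) and its high halfword (Y slot) is
// replaced by the memory word's low halfword (the memory X slot), hence
// prtVal.y = pmemVal->x and (flags & VALID_X) << 1 promoting to VALID_Y.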
void CPU::PGXP::CPU_SWx(Instruction instr, u32 addr, u32 rtVal)
{
LOG_VALUES_STORE(instr.r.rt.GetValue(), rtVal, addr);
const u32 aligned_addr = addr & ~3u;
PGXPValue* pmemVal = GetPtr(aligned_addr);
u32 memVal;
if (!pmemVal)
return;
if (!CPU::SafeReadMemoryWord(aligned_addr, &memVal)) [[unlikely]]
return;
pmemVal->Validate(memVal);
PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
const u32 byte_shift = addr & 3u;
if (instr.op == InstructionOp::swl)
{
const u32 bit_shift = (byte_shift * 8);
const u32 mixed_value = (memVal & (UINT32_C(0xFFFFFF00) << bit_shift)) | (rtVal >> (24 - bit_shift));
switch (byte_shift)
{
case 0:
{
// only writing the lower half of X, can't do much about that..
pmemVal->x = static_cast<float>(static_cast<s16>(mixed_value));
pmemVal->value = mixed_value;
pmemVal->flags =
(pmemVal->flags & ~(VALID_X | VALID_Z | VALID_LOWZ)) | ((pmemVal->flags & VALID_HIGHZ) ? VALID_Z : 0);
}
break;
case 1:
{
pmemVal->x = prtVal.y;
pmemVal->z = (prtVal.flags & VALID_Z) ? prtVal.z : pmemVal->z;
pmemVal->value = mixed_value;
pmemVal->flags = (pmemVal->flags & ~(VALID_X | VALID_Z | VALID_LOWZ)) | ((prtVal.flags & VALID_Y) >> 1) |
((prtVal.flags & VALID_Z) ? (VALID_Z | VALID_LOWZ) : 0) |
((pmemVal->flags & VALID_HIGHZ) ? VALID_Z : 0);
}
break;
case 2:
{
// making a dog's breakfast of both X and Y
pmemVal->x = static_cast<float>(static_cast<s16>(mixed_value));
pmemVal->y = static_cast<float>(static_cast<s16>(mixed_value >> 16));
pmemVal->value = mixed_value;
pmemVal->flags &= ~(VALID_X | VALID_Y | VALID_Z | VALID_LOWZ | VALID_HIGHZ);
}
break;
case 3:
{
// effectively the same as a normal store.
*pmemVal = prtVal;
pmemVal->value = mixed_value;
pmemVal->flags =
(prtVal.flags & ~(VALID_LOWZ | VALID_HIGHZ)) | ((prtVal.flags & VALID_Z) ? (VALID_LOWZ | VALID_HIGHZ) : 0);
}
break;
DefaultCaseIsUnreachable();
}
}
else
{
const u32 bit_shift = (byte_shift * 8);
const u32 mixed_value = (memVal & (UINT32_C(0x00FFFFFF) >> (24 - bit_shift))) | (rtVal << bit_shift);
switch (byte_shift)
{
case 0:
{
// effectively the same as a normal store.
*pmemVal = prtVal;
pmemVal->value = mixed_value;
pmemVal->flags =
(prtVal.flags & ~(VALID_LOWZ | VALID_HIGHZ)) | ((prtVal.flags & VALID_Z) ? (VALID_LOWZ | VALID_HIGHZ) : 0);
}
break;
case 1:
{
// making a dog's breakfast of both X and Y
pmemVal->x = static_cast<float>(static_cast<s16>(mixed_value));
pmemVal->y = static_cast<float>(static_cast<s16>(mixed_value >> 16));
pmemVal->value = mixed_value;
pmemVal->flags &= ~(VALID_X | VALID_Y | VALID_LOWZ | VALID_HIGHZ);
}
break;
case 2:
{
pmemVal->y = prtVal.x;
pmemVal->z = (prtVal.flags & VALID_Z) ? prtVal.z : pmemVal->z;
pmemVal->value = mixed_value;
pmemVal->flags = (pmemVal->flags & ~(VALID_X | VALID_Z | VALID_HIGHZ)) | ((prtVal.flags & VALID_X) << 1) |
((prtVal.flags & VALID_Z) ? (VALID_Z | VALID_HIGHZ) : 0) |
((pmemVal->flags & VALID_LOWZ) ? VALID_Z : 0);
}
break;
case 3:
{
// only writing the upper half of Y, can't do much about that..
pmemVal->y = static_cast<float>(static_cast<s16>(mixed_value));
pmemVal->value = mixed_value;
pmemVal->flags =
(pmemVal->flags & ~(VALID_X | VALID_Z | VALID_HIGHZ)) | ((pmemVal->flags & VALID_LOWZ) ? VALID_Z : 0);
}
break;
DefaultCaseIsUnreachable();
}
}
}
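CPU_SWx is the same analysis run in the store direction; for example, case 2 of swr:

// addr & 3 == 2 (bit_shift = 16):
//   mixed_value = (memVal & 0x0000FFFF) | (rtVal << 16)
// Memory keeps its low halfword (X slot) while its high halfword (Y slot) is
// overwritten by the register's low halfword, hence pmemVal->y = prtVal.x and
// (prtVal.flags & VALID_X) << 1 becoming the memory word's VALID_Y.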
void CPU::PGXP::CPU_MOVE_Packed(u32 rd_and_rs, u32 rsVal)
{
const u32 Rs = (rd_and_rs & 0xFFu);

@@ -31,9 +31,11 @@ void CPU_LW(Instruction instr, u32 addr, u32 rtVal);
void CPU_LH(Instruction instr, u32 addr, u32 rtVal);
void CPU_LHU(Instruction instr, u32 addr, u32 rtVal);
void CPU_LBx(Instruction instr, u32 addr, u32 rtVal);
void CPU_LWx(Instruction instr, u32 addr, u32 rtVal);
void CPU_SB(Instruction instr, u32 addr, u32 rtVal);
void CPU_SH(Instruction instr, u32 addr, u32 rtVal);
void CPU_SW(Instruction instr, u32 addr, u32 rtVal);
void CPU_SWx(Instruction instr, u32 addr, u32 rtVal);
void CPU_MOVE(u32 Rd, u32 Rs, u32 rsVal);
void CPU_MOVE_Packed(u32 rd_and_rs, u32 rsVal);
void CPU_ADDI(Instruction instr, u32 rsVal);

@@ -920,15 +920,28 @@ void CPU::ARM32Recompiler::MoveTToReg(const vixl::aarch32::Register& dst, Compil
}
}
void CPU::ARM32Recompiler::MoveMIPSRegToReg(const vixl::aarch32::Register& dst, Reg reg)
void CPU::ARM32Recompiler::MoveMIPSRegToReg(const vixl::aarch32::Register& dst, Reg reg, bool ignore_load_delays)
{
DebugAssert(reg < Reg::count);
if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
if (ignore_load_delays && m_load_delay_register == reg)
{
if (m_load_delay_value_register == NUM_HOST_REGS)
armAsm->ldr(dst, PTR(&g_state.load_delay_value));
else
armAsm->mov(dst, Register(m_load_delay_value_register));
}
else if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
{
armAsm->mov(dst, Register(hreg.value()));
}
else if (HasConstantReg(reg))
{
EmitMov(dst, GetConstantRegU32(reg));
}
else
{
armAsm->ldr(dst, MipsPtr(reg));
}
}
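The new ignore_load_delays path is the recompiler counterpart of the interpreter's load-delay bypass above: when rt has a load in flight, lwl/lwr must merge with the incoming value rather than the stale register contents. Roughly, in interpreter terms (a sketch, not the emitted code):

// What MoveMIPSRegToReg(dst, reg, /*ignore_load_delays=*/true) resolves to:
const u32 value = (reg == g_state.load_delay_reg) ? g_state.load_delay_value : ReadReg(reg);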
void CPU::ARM32Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
@@ -1909,6 +1922,17 @@ void CPU::ARM32Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, b
// We'd need to be careful here if we weren't overwriting it..
ComputeLoadStoreAddressArg(cf, address, addr);
// Do PGXP first, it does its own load.
if (g_settings.gpu_pgxp_enable && inst->r.rt != Reg::zero)
{
Flush(FLUSH_FOR_C_CALL);
EmitMov(RARG1, inst->bits);
armAsm->mov(RARG2, addr);
MoveMIPSRegToReg(RARG3, inst->r.rt, true);
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWx));
}
armAsm->bic(RARG1, addr, 3);
GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });
@@ -1976,15 +2000,6 @@ void CPU::ARM32Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, b
}
FreeHostReg(addr.GetCode());
if (g_settings.gpu_pgxp_enable)
{
Flush(FLUSH_FOR_C_CALL);
armAsm->mov(RARG3, value);
armAsm->bic(RARG2, addr, 3);
EmitMov(RARG1, inst->bits);
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LW));
}
}
void CPU::ARM32Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
@@ -2109,15 +2124,22 @@ void CPU::ARM32Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, b
// TODO: this can take over rt's value if it's no longer needed
// NOTE: can't trust T in cf because of the alloc
const Register addr = Register(AllocateTempHostReg(HR_CALLEE_SAVED));
const Register value = g_settings.gpu_pgxp_enable ? Register(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;
if (g_settings.gpu_pgxp_enable)
MoveMIPSRegToReg(value, inst->r.rt);
FlushForLoadStore(address, true, use_fastmem);
// TODO: if address is constant, this can be simplified..
// We'd need to be careful here if we weren't overwriting it..
ComputeLoadStoreAddressArg(cf, address, addr);
if (g_settings.gpu_pgxp_enable)
{
Flush(FLUSH_FOR_C_CALL);
EmitMov(RARG1, inst->bits);
armAsm->mov(RARG2, addr);
MoveMIPSRegToReg(RARG3, inst->r.rt);
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWx));
}
armAsm->bic(RARG1, addr, 3);
GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });
@@ -2125,9 +2147,7 @@ void CPU::ARM32Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, b
armAsm->lsl(RSCRATCH, RSCRATCH, 3); // *8
armAsm->bic(addr, addr, 3);
// Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush.
if (!g_settings.gpu_pgxp_enable)
MoveMIPSRegToReg(value, inst->r.rt);
MoveMIPSRegToReg(RARG2, inst->r.rt);
if (inst->op == InstructionOp::swl)
{
@@ -2139,40 +2159,25 @@ void CPU::ARM32Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, b
EmitMov(RARG3, 24);
armAsm->sub(RARG3, RARG3, RSCRATCH);
armAsm->lsr(value, value, RARG3);
armAsm->orr(value, value, RRET);
armAsm->lsr(RARG2, RARG2, RARG3);
armAsm->orr(RARG2, RARG2, RRET);
}
else
{
// const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
// new_value = (RWRET & mem_mask) | (value << shift);
armAsm->lsl(value, value, RSCRATCH);
armAsm->lsl(RARG2, RARG2, RSCRATCH);
EmitMov(RARG3, 24);
armAsm->sub(RARG3, RARG3, RSCRATCH);
EmitMov(RSCRATCH, 0x00FFFFFFu);
armAsm->lsr(RSCRATCH, RSCRATCH, RARG3);
armAsm->and_(RRET, RRET, RSCRATCH);
armAsm->orr(value, value, RRET);
}
if (!g_settings.gpu_pgxp_enable)
{
GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
FreeHostReg(addr.GetCode());
armAsm->orr(RARG2, RARG2, RRET);
}
else
{
GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
Flush(FLUSH_FOR_C_CALL);
armAsm->mov(RARG3, value);
FreeHostReg(value.GetCode());
armAsm->mov(RARG2, addr);
FreeHostReg(addr.GetCode());
EmitMov(RARG1, inst->bits);
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SW));
}
GenerateStore(addr, RARG2, MemoryAccessSize::Word, use_fastmem);
FreeHostReg(addr.GetCode());
}
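Net effect of the swx rework: PGXP::CPU_SWx is called before the read-modify-write (it performs its own memory read), so the merged value no longer needs a callee-saved temp and lives in RARG2. As plain C++, the emitted sequence now corresponds to (a sketch; ReadMemoryWord/WriteMemoryWord stand in for GenerateLoad/GenerateStore):

u32 mem;
ReadMemoryWord(addr & ~3u, &mem);        // GenerateLoad -> RRET
const u32 shift = (addr & 3u) * 8;       // RSCRATCH
const u32 merged =
  (inst->op == InstructionOp::swl) ?
    ((mem & (UINT32_C(0xFFFFFF00) << shift)) | (rt >> (24 - shift))) :
    ((mem & (UINT32_C(0x00FFFFFF) >> (24 - shift))) | (rt << shift));
WriteMemoryWord(addr & ~3u, merged);     // GenerateStore from RARG2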
void CPU::ARM32Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,

@@ -153,7 +153,7 @@ private:
void MoveSToReg(const vixl::aarch32::Register& dst, CompileFlags cf);
void MoveTToReg(const vixl::aarch32::Register& dst, CompileFlags cf);
void MoveMIPSRegToReg(const vixl::aarch32::Register& dst, Reg reg);
void MoveMIPSRegToReg(const vixl::aarch32::Register& dst, Reg reg, bool ignore_load_delays = false);
vixl::aarch32::Assembler m_emitter;
vixl::aarch32::Assembler m_far_emitter;

@@ -1083,15 +1083,28 @@ void CPU::ARM64Recompiler::MoveTToReg(const vixl::aarch64::Register& dst, Compil
}
}
void CPU::ARM64Recompiler::MoveMIPSRegToReg(const vixl::aarch64::Register& dst, Reg reg)
void CPU::ARM64Recompiler::MoveMIPSRegToReg(const vixl::aarch64::Register& dst, Reg reg, bool ignore_load_delays)
{
DebugAssert(reg < Reg::count && dst.IsW());
if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
if (ignore_load_delays && m_load_delay_register == reg)
{
if (m_load_delay_value_register == NUM_HOST_REGS)
armAsm->ldr(dst, PTR(&g_state.load_delay_value));
else
armAsm->mov(dst, WRegister(m_load_delay_value_register));
}
else if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
{
armAsm->mov(dst, WRegister(hreg.value()));
}
else if (HasConstantReg(reg))
{
EmitMov(dst, GetConstantRegU32(reg));
}
else
{
armAsm->ldr(dst, MipsPtr(reg));
}
}
void CPU::ARM64Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
@@ -2085,6 +2098,17 @@ void CPU::ARM64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, b
// We'd need to be careful here if we weren't overwriting it..
ComputeLoadStoreAddressArg(cf, address, addr);
// Do PGXP first, it does its own load.
if (g_settings.gpu_pgxp_enable && inst->r.rt != Reg::zero)
{
Flush(FLUSH_FOR_C_CALL);
EmitMov(RWARG1, inst->bits);
armAsm->mov(RWARG2, addr);
MoveMIPSRegToReg(RWARG3, inst->r.rt, true);
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWx));
}
armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });
@@ -2152,15 +2176,6 @@ void CPU::ARM64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, b
}
FreeHostReg(addr.GetCode());
if (g_settings.gpu_pgxp_enable)
{
Flush(FLUSH_FOR_C_CALL);
armAsm->mov(RWARG3, value);
armAsm->and_(RWARG2, addr, armCheckLogicalConstant(~0x3u));
EmitMov(RWARG1, inst->bits);
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LW));
}
}
void CPU::ARM64Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
@@ -2285,15 +2300,22 @@ void CPU::ARM64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, b
// TODO: this can take over rt's value if it's no longer needed
// NOTE: can't trust T in cf because of the alloc
const Register addr = WRegister(AllocateTempHostReg(HR_CALLEE_SAVED));
const Register value = g_settings.gpu_pgxp_enable ? WRegister(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2;
if (g_settings.gpu_pgxp_enable)
MoveMIPSRegToReg(value, inst->r.rt);
FlushForLoadStore(address, true, use_fastmem);
// TODO: if address is constant, this can be simplified..
// We'd need to be careful here if we weren't overwriting it..
ComputeLoadStoreAddressArg(cf, address, addr);
if (g_settings.gpu_pgxp_enable)
{
Flush(FLUSH_FOR_C_CALL);
EmitMov(RWARG1, inst->bits);
armAsm->mov(RWARG2, addr);
MoveMIPSRegToReg(RWARG3, inst->r.rt);
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWx));
}
armAsm->and_(RWARG1, addr, armCheckLogicalConstant(~0x3u));
GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });
@@ -2301,9 +2323,7 @@ void CPU::ARM64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, b
armAsm->lsl(RWSCRATCH, RWSCRATCH, 3); // *8
armAsm->and_(addr, addr, armCheckLogicalConstant(~0x3u));
// Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush.
if (!g_settings.gpu_pgxp_enable)
MoveMIPSRegToReg(value, inst->r.rt);
MoveMIPSRegToReg(RWARG2, inst->r.rt);
if (inst->op == InstructionOp::swl)
{
@@ -2315,40 +2335,25 @@ void CPU::ARM64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, b
EmitMov(RWARG3, 24);
armAsm->sub(RWARG3, RWARG3, RWSCRATCH);
armAsm->lsrv(value, value, RWARG3);
armAsm->orr(value, value, RWRET);
armAsm->lsrv(RWARG2, RWARG2, RWARG3);
armAsm->orr(RWARG2, RWARG2, RWRET);
}
else
{
// const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
// new_value = (RWRET & mem_mask) | (value << shift);
armAsm->lslv(value, value, RWSCRATCH);
armAsm->lslv(RWARG2, RWARG2, RWSCRATCH);
EmitMov(RWARG3, 24);
armAsm->sub(RWARG3, RWARG3, RWSCRATCH);
EmitMov(RWSCRATCH, 0x00FFFFFFu);
armAsm->lsrv(RWSCRATCH, RWSCRATCH, RWARG3);
armAsm->and_(RWRET, RWRET, RWSCRATCH);
armAsm->orr(value, value, RWRET);
}
if (!g_settings.gpu_pgxp_enable)
{
GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
FreeHostReg(addr.GetCode());
armAsm->orr(RWARG2, RWARG2, RWRET);
}
else
{
GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
Flush(FLUSH_FOR_C_CALL);
armAsm->mov(RWARG3, value);
FreeHostReg(value.GetCode());
armAsm->mov(RWARG2, addr);
FreeHostReg(addr.GetCode());
EmitMov(RWARG1, inst->bits);
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SW));
}
GenerateStore(addr, RWARG2, MemoryAccessSize::Word, use_fastmem);
FreeHostReg(addr.GetCode());
}
void CPU::ARM64Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,

@@ -154,7 +154,7 @@ private:
void MoveSToReg(const vixl::aarch64::Register& dst, CompileFlags cf);
void MoveTToReg(const vixl::aarch64::Register& dst, CompileFlags cf);
void MoveMIPSRegToReg(const vixl::aarch64::Register& dst, Reg reg);
void MoveMIPSRegToReg(const vixl::aarch64::Register& dst, Reg reg, bool ignore_load_delays = false);
vixl::aarch64::Assembler m_emitter;
vixl::aarch64::Assembler m_far_emitter;

@@ -911,15 +911,28 @@ void CPU::RISCV64Recompiler::MoveTToReg(const biscuit::GPR& dst, CompileFlags cf
}
}
void CPU::RISCV64Recompiler::MoveMIPSRegToReg(const biscuit::GPR& dst, Reg reg)
void CPU::RISCV64Recompiler::MoveMIPSRegToReg(const biscuit::GPR& dst, Reg reg, bool ignore_load_delays)
{
DebugAssert(reg < Reg::count);
if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
if (ignore_load_delays && m_load_delay_register == reg)
{
if (m_load_delay_value_register == NUM_HOST_REGS)
rvAsm->LW(dst, PTR(&g_state.load_delay_value));
else
rvAsm->MV(dst, GPR(m_load_delay_value_register));
}
else if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
{
rvAsm->MV(dst, GPR(hreg.value()));
}
else if (HasConstantReg(reg))
{
EmitMov(dst, GetConstantRegU32(reg));
}
else
{
rvAsm->LW(dst, PTR(&g_state.regs.r[static_cast<u8>(reg)]));
}
}
void CPU::RISCV64Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
@@ -1942,6 +1955,17 @@ void CPU::RISCV64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size,
// We'd need to be careful here if we weren't overwriting it..
ComputeLoadStoreAddressArg(cf, address, addr);
// Do PGXP first, it does its own load.
if (g_settings.gpu_pgxp_enable && inst->r.rt != Reg::zero)
{
Flush(FLUSH_FOR_C_CALL);
EmitMov(RARG1, inst->bits);
rvAsm->MV(RARG2, addr);
MoveMIPSRegToReg(RARG3, inst->r.rt, true);
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LWx));
}
rvAsm->ANDI(RARG1, addr, ~0x3u);
GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });
@@ -2009,15 +2033,6 @@ void CPU::RISCV64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size,
}
FreeHostReg(addr.Index());
if (g_settings.gpu_pgxp_enable)
{
Flush(FLUSH_FOR_C_CALL);
rvAsm->MV(RARG3, value);
rvAsm->ANDI(RARG2, addr, ~0x3u);
EmitMov(RARG1, inst->bits);
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_LW));
}
}
void CPU::RISCV64Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
@@ -2140,15 +2155,22 @@ void CPU::RISCV64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size,
// TODO: this can take over rt's value if it's no longer needed
// NOTE: can't trust T in cf because of the alloc
const GPR addr = GPR(AllocateTempHostReg(HR_CALLEE_SAVED));
const GPR value = g_settings.gpu_pgxp_enable ? GPR(AllocateTempHostReg(HR_CALLEE_SAVED)) : RARG2;
if (g_settings.gpu_pgxp_enable)
MoveMIPSRegToReg(value, inst->r.rt);
FlushForLoadStore(address, true, use_fastmem);
// TODO: if address is constant, this can be simplified..
// We'd need to be careful here if we weren't overwriting it..
ComputeLoadStoreAddressArg(cf, address, addr);
if (g_settings.gpu_pgxp_enable)
{
Flush(FLUSH_FOR_C_CALL);
EmitMov(RARG1, inst->bits);
rvAsm->MV(RARG2, addr);
MoveMIPSRegToReg(RARG3, inst->r.rt);
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SWx));
}
rvAsm->ANDI(RARG1, addr, ~0x3u);
GenerateLoad(RARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RRET; });
@@ -2158,7 +2180,7 @@ void CPU::RISCV64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size,
// Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush.
if (!g_settings.gpu_pgxp_enable)
MoveMIPSRegToReg(value, inst->r.rt);
MoveMIPSRegToReg(RARG2, inst->r.rt);
if (inst->op == InstructionOp::swl)
{
@@ -2170,40 +2192,25 @@ void CPU::RISCV64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size,
EmitMov(RARG3, 24);
rvAsm->SUBW(RARG3, RARG3, RSCRATCH);
rvAsm->SRLW(value, value, RARG3);
rvAsm->OR(value, value, RRET);
rvAsm->SRLW(RARG2, RARG2, RARG3);
rvAsm->OR(RARG2, RARG2, RRET);
}
else
{
// const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
// new_value = (RWRET & mem_mask) | (value << shift);
rvAsm->SLLW(value, value, RSCRATCH);
rvAsm->SLLW(RARG2, RARG2, RSCRATCH);
EmitMov(RARG3, 24);
rvAsm->SUBW(RARG3, RARG3, RSCRATCH);
EmitMov(RSCRATCH, 0x00FFFFFFu);
rvAsm->SRLW(RSCRATCH, RSCRATCH, RARG3);
rvAsm->AND(RRET, RRET, RSCRATCH);
rvAsm->OR(value, value, RRET);
}
if (!g_settings.gpu_pgxp_enable)
{
GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
FreeHostReg(addr.Index());
rvAsm->OR(RARG2, RARG2, RRET);
}
else
{
GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
Flush(FLUSH_FOR_C_CALL);
rvAsm->MV(RARG3, value);
FreeHostReg(value.Index());
rvAsm->MV(RARG2, addr);
FreeHostReg(addr.Index());
EmitMov(RARG1, inst->bits);
EmitCall(reinterpret_cast<const void*>(&PGXP::CPU_SW));
}
GenerateStore(addr, RARG2, MemoryAccessSize::Word, use_fastmem);
FreeHostReg(addr.Index());
}
void CPU::RISCV64Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,

@@ -166,7 +166,7 @@ private:
void MoveSToReg(const biscuit::GPR& dst, CompileFlags cf);
void MoveTToReg(const biscuit::GPR& dst, CompileFlags cf);
void MoveMIPSRegToReg(const biscuit::GPR& dst, Reg reg);
void MoveMIPSRegToReg(const biscuit::GPR& dst, Reg reg, bool ignore_load_delays = false);
std::unique_ptr<biscuit::Assembler> m_emitter;
std::unique_ptr<biscuit::Assembler> m_far_emitter;

@@ -858,15 +858,28 @@ void CPU::X64Recompiler::MoveTToReg(const Xbyak::Reg32& dst, CompileFlags cf)
}
}
void CPU::X64Recompiler::MoveMIPSRegToReg(const Xbyak::Reg32& dst, Reg reg)
void CPU::X64Recompiler::MoveMIPSRegToReg(const Xbyak::Reg32& dst, Reg reg, bool ignore_load_delays)
{
DebugAssert(reg < Reg::count);
if (const std::optional<u32> hreg = CheckHostReg(0, Recompiler::HR_TYPE_CPU_REG, reg))
if (ignore_load_delays && m_load_delay_register == reg)
{
if (m_load_delay_value_register == NUM_HOST_REGS)
cg->mov(dst, cg->dword[PTR(&g_state.load_delay_value)]);
else
cg->mov(dst, Reg32(m_load_delay_value_register));
}
else if (const std::optional<u32> hreg = CheckHostReg(0, HR_TYPE_CPU_REG, reg))
{
cg->mov(dst, Reg32(hreg.value()));
}
else if (HasConstantReg(reg))
{
cg->mov(dst, GetConstantRegU32(reg));
}
else
{
cg->mov(dst, MipsPtr(reg));
}
}
void CPU::X64Recompiler::GeneratePGXPCallWithMIPSRegs(const void* func, u32 arg1val, Reg arg2reg /* = Reg::count */,
@@ -1891,6 +1904,17 @@ void CPU::X64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, boo
// We'd need to be careful here if we weren't overwriting it..
ComputeLoadStoreAddressArg(cf, address, addr);
// Do PGXP first, it does its own load.
if (g_settings.gpu_pgxp_enable && inst->r.rt != Reg::zero)
{
Flush(FLUSH_FOR_C_CALL);
cg->mov(RWARG1, inst->bits);
cg->mov(RWARG2, addr);
MoveMIPSRegToReg(RWARG3, inst->r.rt, true);
cg->call(reinterpret_cast<const void*>(&PGXP::CPU_LWx));
}
cg->mov(RWARG1, addr);
cg->and_(RWARG1, ~0x3u);
GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });
@@ -1965,18 +1989,6 @@ void CPU::X64Recompiler::Compile_lwx(CompileFlags cf, MemoryAccessSize size, boo
}
FreeHostReg(addr.getIdx());
if (g_settings.gpu_pgxp_enable)
{
Flush(FLUSH_FOR_C_CALL);
DebugAssert(value != RWARG3);
cg->mov(RWARG3, value);
cg->mov(RWARG2, addr);
cg->and_(RWARG2, ~0x3u);
cg->mov(RWARG1, inst->bits);
cg->call(reinterpret_cast<const void*>(&PGXP::CPU_LW));
}
}
void CPU::X64Recompiler::Compile_lwc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,
@@ -2098,28 +2110,31 @@ void CPU::X64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, boo
// TODO: this can take over rt's value if it's no longer needed
// NOTE: can't trust T in cf because of the alloc
const Reg32 addr = Reg32(AllocateTempHostReg(HR_CALLEE_SAVED));
const Reg32 value = g_settings.gpu_pgxp_enable ? Reg32(AllocateTempHostReg(HR_CALLEE_SAVED)) : RWARG2;
if (g_settings.gpu_pgxp_enable)
MoveMIPSRegToReg(value, inst->r.rt);
FlushForLoadStore(address, true, use_fastmem);
// TODO: if address is constant, this can be simplified..
// We'd need to be careful here if we weren't overwriting it..
ComputeLoadStoreAddressArg(cf, address, addr);
if (g_settings.gpu_pgxp_enable)
{
Flush(FLUSH_FOR_C_CALL);
cg->mov(RWARG1, inst->bits);
cg->mov(RWARG2, addr);
MoveMIPSRegToReg(RWARG3, inst->r.rt);
cg->call(reinterpret_cast<const void*>(&PGXP::CPU_SWx));
}
cg->mov(RWARG1, addr);
cg->and_(RWARG1, ~0x3u);
GenerateLoad(RWARG1, MemoryAccessSize::Word, false, use_fastmem, []() { return RWRET; });
DebugAssert(value != cg->ecx);
cg->mov(cg->ecx, addr);
cg->and_(cg->ecx, 3);
cg->shl(cg->ecx, 3); // *8
cg->and_(addr, ~0x3u);
// Need to load down here for PGXP-off, because it's in a volatile reg that can get overwritten by flush.
if (!g_settings.gpu_pgxp_enable)
MoveMIPSRegToReg(value, inst->r.rt);
MoveMIPSRegToReg(RWARG2, inst->r.rt);
if (inst->op == InstructionOp::swl)
{
@@ -2132,14 +2147,14 @@ void CPU::X64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, boo
cg->mov(RWARG3, 24);
cg->sub(RWARG3, cg->ecx);
cg->mov(cg->ecx, RWARG3);
cg->shr(value, cg->cl);
cg->or_(value, RWRET);
cg->shr(RWARG2, cg->cl);
cg->or_(RWARG2, RWRET);
}
else
{
// const u32 mem_mask = UINT32_C(0x00FFFFFF) >> (24 - shift);
// new_value = (RWRET & mem_mask) | (value << shift);
cg->shl(value, cg->cl);
cg->shl(RWARG2, cg->cl);
DebugAssert(RWARG3 != cg->ecx);
cg->mov(RWARG3, 24);
@@ -2148,26 +2163,11 @@ void CPU::X64Recompiler::Compile_swx(CompileFlags cf, MemoryAccessSize size, boo
cg->mov(RWARG3, 0x00FFFFFFu);
cg->shr(RWARG3, cg->cl);
cg->and_(RWRET, RWARG3);
cg->or_(value, RWRET);
}
if (!g_settings.gpu_pgxp_enable)
{
GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
FreeHostReg(addr.getIdx());
cg->or_(RWARG2, RWRET);
}
else
{
GenerateStore(addr, value, MemoryAccessSize::Word, use_fastmem);
Flush(FLUSH_FOR_C_CALL);
cg->mov(RWARG3, value);
FreeHostReg(value.getIdx());
cg->mov(RWARG2, addr);
FreeHostReg(addr.getIdx());
cg->mov(RWARG1, inst->bits);
cg->call(reinterpret_cast<const void*>(&PGXP::CPU_SW));
}
GenerateStore(addr, RWARG2, MemoryAccessSize::Word, use_fastmem);
FreeHostReg(addr.getIdx());
}
void CPU::X64Recompiler::Compile_swc2(CompileFlags cf, MemoryAccessSize size, bool sign, bool use_fastmem,

@@ -142,7 +142,7 @@ private:
Xbyak::Reg32 MoveTToD(CompileFlags cf);
void MoveSToReg(const Xbyak::Reg32& dst, CompileFlags cf);
void MoveTToReg(const Xbyak::Reg32& dst, CompileFlags cf);
void MoveMIPSRegToReg(const Xbyak::Reg32& dst, Reg reg);
void MoveMIPSRegToReg(const Xbyak::Reg32& dst, Reg reg, bool ignore_load_delays = false);
std::unique_ptr<Xbyak::CodeGenerator> m_emitter;
std::unique_ptr<Xbyak::CodeGenerator> m_far_emitter;
