Common: Further optimize alltrue()/allfalse() on ARM

pull/3356/head
Stenzek 1 month ago
parent 0189e1ef81
commit 57f3fee28c
No known key found for this signature in database

@ -6,6 +6,7 @@
#include <algorithm>
#include <cmath>
#include <cstdint>
#define GSVECTOR_HAS_FAST_INT_SHUFFLE8 1
#define GSVECTOR_HAS_SRLV 1
@ -646,25 +647,10 @@ public:
// Returns true only when every bit of the 64-bit vector is set.
// The two 32-bit lanes are viewed as a single 64-bit lane and compared against
// all-ones in one scalar compare, so no per-byte MSB masking or across-lane
// reduction is needed and the same code serves both 32-bit and 64-bit ARM.
// NOTE(review): a lane that only has its MSB set is NOT considered "true" here;
// presumably callers always pass full comparison masks -- confirm.
ALWAYS_INLINE bool alltrue() const
{
  return (vget_lane_u64(vreinterpret_u64_s32(v2s), 0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
}
// Returns true only when every bit of the 64-bit vector is clear.
// The two 32-bit lanes are viewed as a single 64-bit lane and compared against
// zero, which avoids the across-lane max reduction entirely.
// NOTE(review): any set bit (not just a byte's MSB) makes this return false;
// presumably callers always pass full comparison masks -- confirm.
ALWAYS_INLINE bool allfalse() const
{
  return (vget_lane_u64(vreinterpret_u64_s32(v2s), 0) == UINT64_C(0));
}
template<int i>
ALWAYS_INLINE GSVector2i insert8(int a) const
@ -910,9 +896,12 @@ public:
return (vget_lane_u32(masks, 0) | (vget_lane_u32(masks, 1) << 1));
}
// Returns true only when every bit of the 64-bit float vector is set, i.e. the
// raw bit pattern of both 32-bit lanes is all-ones. The lanes are reinterpreted
// as one 64-bit integer so a single scalar compare suffices.
ALWAYS_INLINE bool alltrue() const
{
  return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
}
// Returns true only when every bit of the 64-bit float vector is clear. The
// comparison is on the raw bit pattern (both lanes viewed as one 64-bit
// integer), so a lane holding -0.0f has its sign bit set and yields false.
ALWAYS_INLINE bool allfalse() const
{
  return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == UINT64_C(0));
}
// Blends this vector with `v` using a self-equality mask.
// NOTE(review): presumably the mask is false exactly in NaN lanes, so those
// lanes end up taken from `v` -- confirm against blend32's argument order.
ALWAYS_INLINE GSVector2 replace_nan(const GSVector2& v) const
{
  const auto self_equal = (*this == *this);
  return v.blend32(*this, self_equal);
}
@ -2110,23 +2099,20 @@ public:
// Returns true only when every bit of the 128-bit vector is set.
// NOTE(review): a lane that only has its MSB set is NOT considered "true" here;
// presumably callers always pass full comparison masks -- confirm.
ALWAYS_INLINE bool alltrue() const
{
#ifdef CPU_ARCH_ARM64
  // The smallest unsigned 32-bit lane is all-ones only if all four lanes are.
  return (vminvq_u32(vreinterpretq_u32_s32(v4s)) == UINT32_C(0xFFFFFFFF));
#else
  // No across-lane reduction here: AND the two 64-bit halves together, then
  // test the folded result as a single 64-bit scalar.
  return (vget_lane_u64(vreinterpret_u64_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) ==
          UINT64_C(0xFFFFFFFFFFFFFFFF));
#endif
}
// Returns true only when every bit of the 128-bit vector is clear.
// NOTE(review): any set bit (not just a byte's MSB) makes this return false;
// presumably callers always pass full comparison masks -- confirm.
ALWAYS_INLINE bool allfalse() const
{
#ifdef CPU_ARCH_ARM64
  // The largest unsigned 32-bit lane is zero only if all four lanes are zero.
  return (vmaxvq_u32(vreinterpretq_u32_s32(v4s)) == UINT32_C(0));
#else
  // No across-lane reduction here: OR the two 64-bit halves together, then
  // test the folded result as a single 64-bit scalar.
  return (vget_lane_u64(vreinterpret_u64_s32(vorr_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) ==
          UINT64_C(0));
#endif
}
@ -2727,13 +2713,25 @@ public:
// Returns true only when the raw bit pattern of all four 32-bit float lanes is
// all-ones.
ALWAYS_INLINE bool alltrue() const
{
  // return mask() == 0xf;
#ifdef CPU_ARCH_ARM64
  // The smallest unsigned 32-bit lane is all-ones only if all four lanes are.
  return (vminvq_u32(vreinterpretq_u32_f32(v4s)) == UINT32_C(0xFFFFFFFF));
#else
  // No across-lane reduction here: AND the two 64-bit halves together, then
  // test the folded result as a single 64-bit scalar.
  return (vget_lane_u64(vreinterpret_u64_u32(vand_u32(vget_low_u32(vreinterpretq_u32_f32(v4s)),
                                                      vget_high_u32(vreinterpretq_u32_f32(v4s)))),
                        0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
#endif
}
// Returns true only when the raw bit pattern of all four 32-bit float lanes is
// zero. Because the test is bitwise, a lane holding -0.0f (sign bit set)
// yields false.
ALWAYS_INLINE bool allfalse() const
{
#ifdef CPU_ARCH_ARM64
  // The largest unsigned 32-bit lane is zero only if all four lanes are zero.
  return (vmaxvq_u32(vreinterpretq_u32_f32(v4s)) == UINT32_C(0));
#else
  // No across-lane reduction here: OR the two 64-bit halves together, then
  // test the folded result as a single 64-bit scalar.
  return (vget_lane_u64(vreinterpret_u64_u32(vorr_u32(vget_low_u32(vreinterpretq_u32_f32(v4s)),
                                                      vget_high_u32(vreinterpretq_u32_f32(v4s)))),
                        0) == UINT64_C(0));
#endif
}
// Blends this vector with `v` using a self-equality mask.
// NOTE(review): presumably the mask is false exactly in NaN lanes, so those
// lanes end up taken from `v` -- confirm against blend32's argument order.
ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const
{
  const auto self_equal = (*this == *this);
  return v.blend32(*this, self_equal);
}

Loading…
Cancel
Save