PerformanceCounters: Align to 4 elements

And use minv()/maxv().
3 months ago · 6c9d339855
parent b81dfa205c
commit 6c9d339855
6 changed files with 81 additions and 32 deletions
--- a/src/common/gsvector_neon.h
+++ b/src/common/gsvector_neon.h
@ -279,7 +279,7 @@ public:
  {
    constexpr int bit1 = ((mask & 2) * 3) << 1;
    constexpr int bit0 = (mask & 1) * 3;
-    return blend16<bit1 | bit0>(v);
+    return blend16 < bit1 | bit0 > (v);
  }

  ALWAYS_INLINE GSVector2i blend(const GSVector2i& v, const GSVector2i& mask) const
@ -2610,6 +2610,36 @@ public:
 #endif
  }

+  ALWAYS_INLINE float addv() const // Horizontal add: returns the sum of all four float lanes.
+  {
+#ifdef CPU_ARCH_ARM64
+    return vaddvq_f32(v4s); // single across-vector add
+#else
+    float32x2_t tmp = vadd_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (x+z, y+w)
+    return vget_lane_f32(vadd_f32(tmp, vdup_lane_f32(tmp, 1)), 0);     // lane 0 = (x+z) + (y+w)
+#endif
+  }
+
+  ALWAYS_INLINE float minv() const // Horizontal min: returns the smallest of the four float lanes.
+  {
+#ifdef CPU_ARCH_ARM64
+    return vminvq_f32(v4s); // single across-vector min
+#else
+    float32x2_t tmp = vmin_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (min(x,z), min(y,w))
+    return vget_lane_f32(vmin_f32(tmp, vdup_lane_f32(tmp, 1)), 0);     // lane 0 = min of both pairs
+#endif
+  }
+
+  ALWAYS_INLINE float maxv() const // Horizontal max: returns the largest of the four float lanes.
+  {
+#ifdef CPU_ARCH_ARM64
+    return vmaxvq_f32(v4s); // single across-vector max
+#else
+    float32x2_t tmp = vmax_f32(vget_low_f32(v4s), vget_high_f32(v4s)); // (max(x,z), max(y,w))
+    return vget_lane_f32(vmax_f32(tmp, vdup_lane_f32(tmp, 1)), 0);     // lane 0 = max of both pairs
+#endif
+  }
+
  ALWAYS_INLINE GSVector4 sat(const GSVector4& a, const GSVector4& b) const { return max(a).min(b); }

  ALWAYS_INLINE GSVector4 sat(const GSVector4& a) const
--- a/src/common/gsvector_nosimd.h
+++ b/src/common/gsvector_nosimd.h
@ -1859,6 +1859,10 @@ public:

  ALWAYS_INLINE float dot(const GSVector4& v) const { return (x * v.x) + (y * v.y) + (z * v.z) + (w * v.w); }

+  ALWAYS_INLINE float addv() const { return (x + y + z + w); } // sum of all four components, added left to right
+  ALWAYS_INLINE float minv() const { return std::min(x, std::min(y, std::min(z, w))); } // smallest component
+  ALWAYS_INLINE float maxv() const { return std::max(x, std::max(y, std::max(z, w))); } // largest component
+
  GSVector4 sat(const GSVector4& min, const GSVector4& max) const
  {
    return GSVector4(std::clamp(x, min.x, max.x), std::clamp(y, min.y, max.y), std::clamp(z, min.z, max.z),
--- a/src/common/gsvector_sse.h
+++ b/src/common/gsvector_sse.h
@ -2039,6 +2039,11 @@ public:

 #ifdef CPU_ARCH_SSE41
  ALWAYS_INLINE float dot(const GSVector4& v) const { return _mm_cvtss_f32(_mm_dp_ps(m, v.m, 0xf1)); }
+  ALWAYS_INLINE float addv() const // Horizontal add: returns the sum of all four float lanes.
+  {
+    const __m128 pairs = _mm_hadd_ps(m, m);          // (x+y, z+w, x+y, z+w)
+    return _mm_cvtss_f32(_mm_hadd_ps(pairs, pairs)); // lane 0 = (x+y) + (z+w)
+  }
 #else
  float dot(const GSVector4& v) const
  {
@ -2047,8 +2052,28 @@ public:
    tmp = _mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 2, 1, 1)));
    return _mm_cvtss_f32(tmp);
  }
+  float addv() const // Horizontal add for builds without CPU_ARCH_SSE41: returns the sum of all four float lanes.
+  {
+    __m128 tmp = _mm_add_ps(m, _mm_movehl_ps(m, m)); // (x+z, y+w, ..., ...)
+    tmp = _mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(3, 2, 1, 1))); // lane 0 = (x+z) + (y+w)
+    return _mm_cvtss_f32(tmp);
+  }
 #endif

+  ALWAYS_INLINE float minv() const // Horizontal min: returns the smallest of the four float lanes.
+  {
+    __m128 v = _mm_min_ps(m, _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 3, 2))); // lanes 0,1 = (min(x,z), min(y,w))
+    v = _mm_min_ps(v, _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)));        // lane 0 = min of both pairs
+    return _mm_cvtss_f32(v);
+  }
+
+  ALWAYS_INLINE float maxv() const // Horizontal max: returns the largest of the four float lanes.
+  {
+    __m128 v = _mm_max_ps(m, _mm_shuffle_ps(m, m, _MM_SHUFFLE(3, 2, 3, 2))); // lanes 0,1 = (max(x,z), max(y,w))
+    v = _mm_max_ps(v, _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)));        // lane 0 = max of both pairs
+    return _mm_cvtss_f32(v);
+  }
+
  ALWAYS_INLINE GSVector4 sat(const GSVector4& min, const GSVector4& max) const
  {
    return GSVector4(_mm_min_ps(_mm_max_ps(m, min), max));
--- a/src/core/imgui_overlays.cpp
+++ b/src/core/imgui_overlays.cpp
@ -129,32 +129,6 @@ static InputOverlayState s_input_overlay_state = {};

 } // namespace ImGuiManager

-static std::tuple<float, float> GetMinMax(std::span<const float> values)
-{
-  GSVector4 vmin(GSVector4::load<false>(values.data()));
-  GSVector4 vmax(vmin);
-
-  const u32 count = static_cast<u32>(values.size());
-  const u32 aligned_count = Common::AlignDownPow2(count, 4);
-  u32 i = 4;
-  for (; i < aligned_count; i += 4)
-  {
-    const GSVector4 v(GSVector4::load<false>(&values[i]));
-    vmin = vmin.min(v);
-    vmax = vmax.max(v);
-  }
-
-  float min = std::min(vmin.x, std::min(vmin.y, std::min(vmin.z, vmin.w)));
-  float max = std::max(vmax.x, std::max(vmax.y, std::max(vmax.z, vmax.w)));
-  for (; i < count; i++)
-  {
-    min = std::min(min, values[i]);
-    max = std::max(max, values[i]);
-  }
-
-  return std::tie(min, max);
-}
-
 bool ImGuiManager::AreAnyDebugWindowsEnabled(const SettingsInterface& si)
 {
 #ifndef __ANDROID__
@ -731,7 +705,23 @@ void ImGuiManager::DrawFrameTimeOverlay(float& position_y, float scale, float ma
  {
    ImGui::PushFont(fixed_font, fixed_font_size, fixed_font_weight);

-    auto [min, max] = GetMinMax(PerformanceCounters::GetFrameTimeHistory());
+    // Min/max over the whole frame time history, four samples per step. LLVM likes to unroll this... whatever.
+    float min, max;
+    {
+      const PerformanceCounters::FrameTimeHistory& history = PerformanceCounters::GetFrameTimeHistory();
+      static_assert((PerformanceCounters::NUM_FRAME_TIME_SAMPLES % 4) == 0); // the loop below has no scalar tail
+      GSVector4 vmin = GSVector4::load<false>(history.data()); // seed both accumulators with samples 0..3
+      GSVector4 vmax = vmin;
+      for (size_t i = 4; i < history.size(); i += 4)
+      {
+        const GSVector4 v = GSVector4::load<false>(&history[i]);
+        vmin = vmin.min(v);
+        vmax = vmax.max(v);
+      }
+
+      min = vmin.minv();
+      max = vmax.maxv(); // reduce the max accumulator; vmin.maxv() would only give the largest per-lane minimum
+    }

    // add a little bit of space either side, so we're not constantly resizing
    if ((max - min) < 4.0f)
--- a/src/core/performance_counters.cpp
+++ b/src/core/performance_counters.cpp
@ -22,7 +22,7 @@ namespace PerformanceCounters {

 namespace {

-struct State
+struct ALIGN_TO_CACHE_LINE State
 {
  Timer::Value last_update_time;
  Timer::Value last_frame_time;
@ -55,7 +55,7 @@ struct State
  float accumulated_gpu_time;
  float gpu_usage;

-  FrameTimeHistory frame_time_history;
+  alignas(VECTOR_ALIGNMENT) FrameTimeHistory frame_time_history;
  u32 frame_time_history_pos;
 };

@ -63,7 +63,7 @@ struct State

 static constexpr const float PERFORMANCE_COUNTER_UPDATE_INTERVAL = 1.0f;

-ALIGN_TO_CACHE_LINE State s_state = {};
+State s_state = {};

 } // namespace PerformanceCounters

--- a/src/core/performance_counters.h
+++ b/src/core/performance_counters.h
@ -9,7 +9,7 @@ class GPUBackend;

 namespace PerformanceCounters
 {
-inline constexpr u32 NUM_FRAME_TIME_SAMPLES = 150;
+inline constexpr u32 NUM_FRAME_TIME_SAMPLES = 152; // keep a multiple of 4: the frame time overlay static_asserts this
 using FrameTimeHistory = std::array<float, NUM_FRAME_TIME_SAMPLES>;

 float GetFPS();