GPU: Handle VRAM wrap-around behavior on scanout

5 years ago · 9e024b7a51
parent fc3efebb38
commit 9e024b7a51
6 changed files with 129 additions and 72 deletions
--- a/src/core/gpu.cpp
+++ b/src/core/gpu.cpp
@@ -487,22 +487,19 @@ void GPU::UpdateCRTCDisplayParameters()

  if (horizontal_display_end <= horizontal_visible_end_tick)
  {
-    cs.display_vram_width = std::min<u16>(
+    cs.display_vram_width =
      std::max<u16>((((horizontal_display_end - std::max(horizontal_display_start, horizontal_visible_start_tick)) +
                      (cs.dot_clock_divider - 1)) /
                     cs.dot_clock_divider),
-                    1u),
-      VRAM_WIDTH - cs.display_vram_left);
+                    1u);
  }
  else
  {
-    cs.display_vram_width = std::min<u16>(
-      std::max<u16>(
-        (((horizontal_visible_end_tick - std::max(horizontal_display_start, horizontal_visible_start_tick)) +
-          (cs.dot_clock_divider - 1)) /
-         cs.dot_clock_divider),
-        1u),
-      VRAM_WIDTH - cs.display_vram_left);
+    cs.display_vram_width = std::max<u16>(
+      (((horizontal_visible_end_tick - std::max(horizontal_display_start, horizontal_visible_start_tick)) +
+        (cs.dot_clock_divider - 1)) /
+       cs.dot_clock_divider),
+      1u);
  }

  if (vertical_display_start >= vertical_visible_start_line)
@@ -513,21 +510,19 @@ void GPU::UpdateCRTCDisplayParameters()
  else
  {
    cs.display_origin_top = 0;
-    cs.display_vram_top = std::min<u16>(
-      m_crtc_state.regs.Y + ((vertical_visible_start_line - vertical_display_start) << height_shift), VRAM_HEIGHT - 1);
+    cs.display_vram_top =
+      m_crtc_state.regs.Y + ((vertical_visible_start_line - vertical_display_start) << height_shift);
  }

  if (vertical_display_end <= vertical_visible_end_line)
  {
-    cs.display_vram_height = std::min<u16>(
-      (vertical_display_end - std::max(vertical_display_start, vertical_visible_start_line)) << height_shift,
-      VRAM_HEIGHT - cs.display_vram_top);
+    cs.display_vram_height = (vertical_display_end - std::max(vertical_display_start, vertical_visible_start_line))
+                             << height_shift;
  }
  else
  {
-    cs.display_vram_height = std::min<u16>(
-      (vertical_visible_end_line - std::max(vertical_display_start, vertical_visible_start_line)) << height_shift,
-      VRAM_HEIGHT - cs.display_vram_top);
+    cs.display_vram_height = (vertical_visible_end_line - std::max(vertical_display_start, vertical_visible_start_line))
+                             << height_shift;
  }
 }

--- a/src/core/gpu_hw_d3d11.cpp
+++ b/src/core/gpu_hw_d3d11.cpp
@@ -553,7 +553,9 @@ void GPU_HW_D3D11::UpdateDisplay()
    {
      m_host_display->ClearDisplayTexture();
    }
-    else if (!m_GPUSTAT.display_area_color_depth_24 && !interlaced)
+    else if (!m_GPUSTAT.display_area_color_depth_24 && !interlaced &&
+             (scaled_vram_offset_x + scaled_display_width) <= m_vram_texture.GetWidth() &&
+             (scaled_vram_offset_y + scaled_display_height) <= m_vram_texture.GetHeight())
    {
      m_host_display->SetDisplayTexture(m_vram_texture.GetD3DSRV(), m_vram_texture.GetWidth(),
                                        m_vram_texture.GetHeight(), scaled_vram_offset_x, scaled_vram_offset_y,
@@ -567,15 +569,15 @@ void GPU_HW_D3D11::UpdateDisplay()
      const u32 reinterpret_field_offset = GetInterlacedField();
      const u32 reinterpret_start_x = m_crtc_state.regs.X * m_resolution_scale;
      const u32 reinterpret_width = scaled_display_width + (m_crtc_state.display_vram_left - m_crtc_state.regs.X);
-      const u32 uniforms[4] = {reinterpret_field_offset, reinterpret_start_x};
+      const u32 uniforms[4] = {reinterpret_start_x, scaled_vram_offset_y, reinterpret_field_offset};
      ID3D11PixelShader* display_pixel_shader =
        m_display_pixel_shaders[BoolToUInt8(m_GPUSTAT.display_area_color_depth_24)][BoolToUInt8(interlaced)].Get();

-      SetViewportAndScissor(reinterpret_start_x, scaled_vram_offset_y, reinterpret_width, scaled_display_height);
+      SetViewportAndScissor(0, reinterpret_field_offset, reinterpret_width, scaled_display_height);
      DrawUtilityShader(display_pixel_shader, uniforms, sizeof(uniforms));

      m_host_display->SetDisplayTexture(m_display_texture.GetD3DSRV(), m_display_texture.GetWidth(),
-                                        m_display_texture.GetHeight(), scaled_vram_offset_x, scaled_vram_offset_y,
+                                        m_display_texture.GetHeight(), scaled_vram_offset_x - reinterpret_start_x, 0,
                                        scaled_display_width, scaled_display_height);

      RestoreGraphicsAPIState();
--- a/src/core/gpu_hw_opengl.cpp
+++ b/src/core/gpu_hw_opengl.cpp
@@ -506,7 +506,9 @@ void GPU_HW_OpenGL::UpdateDisplay()
    {
      m_host_display->ClearDisplayTexture();
    }
-    else if (!m_GPUSTAT.display_area_color_depth_24 && !interlaced)
+    else if (!m_GPUSTAT.display_area_color_depth_24 && !interlaced &&
+             (scaled_vram_offset_x + scaled_display_width) <= m_vram_texture.GetWidth() &&
+             (scaled_vram_offset_y + scaled_display_height) <= m_vram_texture.GetHeight())
    {
      m_host_display->SetDisplayTexture(reinterpret_cast<void*>(static_cast<uintptr_t>(m_vram_texture.GetGLId())),
                                        m_vram_texture.GetWidth(), m_vram_texture.GetHeight(), scaled_vram_offset_x,
@@ -525,20 +527,19 @@ void GPU_HW_OpenGL::UpdateDisplay()
      const u32 flipped_vram_offset_y = VRAM_HEIGHT - vram_offset_y - display_height;
      const u32 scaled_flipped_vram_offset_y =
        m_vram_texture.GetHeight() - scaled_vram_offset_y - scaled_display_height;
-
      const u32 reinterpret_field_offset = GetInterlacedField();
      const u32 reinterpret_start_x = m_crtc_state.regs.X * m_resolution_scale;
      const u32 reinterpret_width = scaled_display_width + (m_crtc_state.display_vram_left - m_crtc_state.regs.X);
-      const u32 uniforms[4] = {reinterpret_field_offset, reinterpret_start_x};
+      const u32 uniforms[4] = {reinterpret_start_x, scaled_flipped_vram_offset_y, reinterpret_field_offset};
      UploadUniformBlock(uniforms, sizeof(uniforms));
      m_batch_ubo_dirty = true;

-      glViewport(reinterpret_start_x, scaled_flipped_vram_offset_y, reinterpret_width, scaled_display_height);
+      glViewport(0, reinterpret_field_offset, reinterpret_width, scaled_display_height);
      glDrawArrays(GL_TRIANGLES, 0, 3);

      m_host_display->SetDisplayTexture(reinterpret_cast<void*>(static_cast<uintptr_t>(m_display_texture.GetGLId())),
                                        m_display_texture.GetWidth(), m_display_texture.GetHeight(),
-                                        scaled_vram_offset_x, m_vram_texture.GetHeight() - scaled_vram_offset_y,
+                                        scaled_vram_offset_x - reinterpret_start_x, scaled_display_height,
                                        scaled_display_width, -static_cast<s32>(scaled_display_height));

      // restore state
--- a/src/core/gpu_hw_shadergen.cpp
+++ b/src/core/gpu_hw_shadergen.cpp
@@ -770,37 +770,39 @@ std::string GPU_HW_ShaderGen::GenerateDisplayFragmentShader(bool depth_24bit, bo
  DefineMacro(ss, "INTERLACED", interlaced);

  WriteCommonFunctions(ss);
-  DeclareUniformBuffer(ss, {"int u_field_offset", "int u_vram_start_x"});
+  DeclareUniformBuffer(ss, {"uint2 u_vram_offset", "uint u_field_offset"});
  DeclareTexture(ss, "samp0", 0);

  DeclareFragmentEntryPoint(ss, 0, 1, {}, true, false);
  ss << R"(
 {
-  int2 icoords = int2(v_pos.xy);
+  uint2 icoords = uint2(v_pos.xy) + u_vram_offset;

  #if INTERLACED
-    if (((fixYCoord(icoords.y) / RESOLUTION_SCALE) & 1) != u_field_offset)
+    if (((icoords.y / uint(RESOLUTION_SCALE)) & 1u) != u_field_offset)
      discard;
  #endif

+  //icoords.y = uint(fixYCoord(int(icoords.y)));
+
  #if DEPTH_24BIT
    // relative to start of scanout
-    int relative_x = (icoords.x - u_vram_start_x) / RESOLUTION_SCALE;
-    icoords.x = u_vram_start_x + ((relative_x * 3) / 2) * RESOLUTION_SCALE;
+    uint relative_x = (icoords.x - u_vram_offset.x) / uint(RESOLUTION_SCALE);
+    icoords.x = u_vram_offset.x + ((relative_x * 3u) / 2u) * uint(RESOLUTION_SCALE);

    // load adjacent 16-bit texels
-    uint s0 = RGBA8ToRGBA5551(LOAD_TEXTURE(samp0, icoords, 0));
-    uint s1 = RGBA8ToRGBA5551(LOAD_TEXTURE(samp0, icoords + int2(RESOLUTION_SCALE, 0), 0));
+    uint s0 = RGBA8ToRGBA5551(LOAD_TEXTURE(samp0, int2(icoords % uint2(VRAM_SIZE)), 0));
+    uint s1 = RGBA8ToRGBA5551(LOAD_TEXTURE(samp0, int2((icoords + uint2(uint(RESOLUTION_SCALE), 0)) % uint2(VRAM_SIZE)), 0));
    
    // select which part of the combined 16-bit texels we are currently shading
-    uint s1s0 = ((s1 << 16) | s0) >> ((relative_x & 1) * 8);
+    uint s1s0 = ((s1 << 16) | s0) >> ((relative_x & 1u) * 8u);
    
    // extract components and normalize
    o_col0 = float4(float(s1s0 & 0xFFu) / 255.0, float((s1s0 >> 8u) & 0xFFu) / 255.0,
                    float((s1s0 >> 16u) & 0xFFu) / 255.0, 1.0);
  #else
    // load and return
-    o_col0 = LOAD_TEXTURE(samp0, icoords, 0);
+    o_col0 = LOAD_TEXTURE(samp0, int2(icoords % uint2(VRAM_SIZE)), 0);
  #endif
 }
 )";
--- a/src/core/gpu_sw.cpp
+++ b/src/core/gpu_sw.cpp
@@ -41,38 +41,99 @@ void GPU_SW::Reset()
  m_vram.fill(0);
 }

-void GPU_SW::CopyOut15Bit(const u16* src_ptr, u32 src_stride, u32* dst_ptr, u32 dst_stride, u32 width, u32 height)
+void GPU_SW::CopyOut15Bit(u32 src_x, u32 src_y, u32* dst_ptr, u32 dst_stride, u32 width, u32 height, bool interlaced)
 {
-  for (u32 row = 0; row < height; row++)
+  const u8 interlaced_shift = BoolToUInt8(interlaced);
+
+  // Fast path when not wrapping around.
+  if ((src_x + width) <= VRAM_WIDTH && (src_y + height) <= VRAM_HEIGHT)
  {
-    const u16* src_row_ptr = src_ptr;
-    u32* dst_row_ptr = dst_ptr;
-    for (u32 col = 0; col < width; col++)
-      *(dst_row_ptr++) = RGBA5551ToRGBA8888(*(src_row_ptr++));
+    dst_stride <<= interlaced_shift;
+    height >>= interlaced_shift;
+
+    const u16* src_ptr = &m_vram[src_y * VRAM_WIDTH + src_x];
+    const u32 src_stride = VRAM_WIDTH << interlaced_shift;
+    for (u32 row = 0; row < height; row++)
+    {
+      const u16* src_row_ptr = src_ptr;
+      u32* dst_row_ptr = dst_ptr;
+      for (u32 col = 0; col < width; col++)
+        *(dst_row_ptr++) = RGBA5551ToRGBA8888(*(src_row_ptr++));

-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    }
+  }
+  else
+  {
+    dst_stride <<= interlaced_shift;
+    height >>= interlaced_shift;
+
+    const u32 end_x = src_x + width;
+    for (u32 row = 0; row < height; row++)
+    {
+      const u16* src_row_ptr = &m_vram[(src_y % VRAM_HEIGHT) * VRAM_WIDTH];
+      u32* dst_row_ptr = dst_ptr;
+
+      for (u32 col = src_x; col < end_x; col++)
+        *(dst_row_ptr++) = RGBA5551ToRGBA8888(src_row_ptr[col % VRAM_WIDTH]);
+
+      src_y += (1 << interlaced_shift);
+      dst_ptr += dst_stride;
+    }
  }
 }

-void GPU_SW::CopyOut24Bit(const u16* src_ptr, u32 src_stride, u32* dst_ptr, u32 dst_stride, u32 width, u32 height)
+void GPU_SW::CopyOut24Bit(u32 src_x, u32 src_y, u32* dst_ptr, u32 dst_stride, u32 width, u32 height, bool interlaced)
 {
-  for (u32 row = 0; row < height; row++)
+  const u8 interlaced_shift = BoolToUInt8(interlaced);
+  // Fast path: each 24-bit pixel is 3 bytes, so a row spans ceil(width * 3 / 2) halfwords of VRAM.
+  if ((src_x + ((width * 3) + 1) / 2) <= VRAM_WIDTH && (src_y + height) <= VRAM_HEIGHT)
  {
-    const u8* src_row_ptr = reinterpret_cast<const u8*>(src_ptr);
-    u32* dst_row_ptr = dst_ptr;
+    dst_stride <<= interlaced_shift;
+    height >>= interlaced_shift;

-    // Beware unaligned accesses.
-    for (u32 col = 0; col < width; col++)
+    const u8* src_ptr = reinterpret_cast<const u8*>(&m_vram[src_y * VRAM_WIDTH + src_x]);
+    const u32 src_stride = (VRAM_WIDTH << interlaced_shift) * sizeof(u16);
+    for (u32 row = 0; row < height; row++)
    {
-      // This will fill the alpha channel with junk, but that's okay since we don't use it
-      std::memcpy(dst_row_ptr, src_row_ptr, sizeof(u32));
-      src_row_ptr += 3;
-      dst_row_ptr++;
+      const u8* src_row_ptr = src_ptr;
+      u8* dst_row_ptr = reinterpret_cast<u8*>(dst_ptr);
+      for (u32 col = 0; col < width; col++)
+      {
+        *(dst_row_ptr++) = *(src_row_ptr++);
+        *(dst_row_ptr++) = *(src_row_ptr++);
+        *(dst_row_ptr++) = *(src_row_ptr++);
+        *(dst_row_ptr++) = 0xFF;
+      }
+
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
    }
+  }
+  else
+  {
+    dst_stride <<= interlaced_shift;
+    height >>= interlaced_shift;

-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
+    // Slow path: index VRAM modulo its width/height so the scanout wraps around.
+    for (u32 row = 0; row < height; row++)
+    {
+      const u16* src_row_ptr = &m_vram[(src_y % VRAM_HEIGHT) * VRAM_WIDTH];
+      u32* dst_row_ptr = dst_ptr;
+
+      for (u32 col = 0; col < width; col++)
+      {
+        const u32 offset = (src_x + ((col * 3) / 2));
+        const u16 s0 = src_row_ptr[offset % VRAM_WIDTH];
+        const u16 s1 = src_row_ptr[(offset + 1) % VRAM_WIDTH];
+        const u8 shift = static_cast<u8>(col & 1u) * 8;
+        *(dst_row_ptr++) = (((ZeroExtend32(s1) << 16) | ZeroExtend32(s0)) >> shift) | 0xFF000000u;
+      }
+
+      src_y += (1 << interlaced_shift);
+      dst_ptr += dst_stride;
+    }
  }
 }

@@ -98,34 +159,32 @@ void GPU_SW::UpdateDisplay()
      const u32 field = GetInterlacedField();
      if (m_GPUSTAT.display_area_color_depth_24)
      {
-        CopyOut24Bit(m_vram.data() + (vram_offset_y + field) * VRAM_WIDTH + m_crtc_state.regs.X, VRAM_WIDTH * 2,
-                     m_display_texture_buffer.data() + field * display_width, display_width * 2, display_width,
-                     display_height / 2);
+        CopyOut24Bit(m_crtc_state.regs.X, vram_offset_y + field, m_display_texture_buffer.data() + field * VRAM_WIDTH,
+                     VRAM_WIDTH, display_width, display_height, true);
      }
      else
      {
-        CopyOut15Bit(m_vram.data() + (vram_offset_y + field) * VRAM_WIDTH + m_crtc_state.regs.X, VRAM_WIDTH * 2,
-                     m_display_texture_buffer.data() + field * display_width, display_width * 2, display_width,
-                     display_height / 2);
+        CopyOut15Bit(m_crtc_state.regs.X, vram_offset_y + field, m_display_texture_buffer.data() + field * VRAM_WIDTH,
+                     VRAM_WIDTH, display_width, display_height, true);
      }
    }
    else
    {
      if (m_GPUSTAT.display_area_color_depth_24)
      {
-        CopyOut24Bit(m_vram.data() + vram_offset_y * VRAM_WIDTH + m_crtc_state.regs.X, VRAM_WIDTH,
-                     m_display_texture_buffer.data(), display_width, display_width, display_height);
+        CopyOut24Bit(m_crtc_state.regs.X, vram_offset_y, m_display_texture_buffer.data(), VRAM_WIDTH, display_width,
+                     display_height, false);
      }
      else
      {
-        CopyOut15Bit(m_vram.data() + vram_offset_y * VRAM_WIDTH + m_crtc_state.regs.X, VRAM_WIDTH,
-                     m_display_texture_buffer.data(), display_width, display_width, display_height);
+        CopyOut15Bit(m_crtc_state.regs.X, vram_offset_y, m_display_texture_buffer.data(), VRAM_WIDTH, display_width,
+                     display_height, false);
      }
    }

    const u32 texture_offset_x = m_crtc_state.display_vram_left - m_crtc_state.regs.X;
    m_host_display->UpdateTexture(m_display_texture.get(), texture_offset_x, 0, display_width, display_height,
-                                  m_display_texture_buffer.data(), display_width * sizeof(u32));
+                                  m_display_texture_buffer.data(), VRAM_WIDTH * sizeof(u32));
    m_host_display->SetDisplayTexture(m_display_texture->GetHandle(), VRAM_WIDTH, VRAM_HEIGHT, texture_offset_x, 0,
                                      display_width, display_height);
    m_host_display->SetDisplayParameters(m_crtc_state.display_width, m_crtc_state.display_height,
@@ -135,7 +194,7 @@ void GPU_SW::UpdateDisplay()
  }
  else
  {
-    CopyOut15Bit(m_vram.data(), VRAM_WIDTH, m_display_texture_buffer.data(), VRAM_WIDTH, VRAM_WIDTH, VRAM_HEIGHT);
+    CopyOut15Bit(0, 0, m_display_texture_buffer.data(), VRAM_WIDTH, VRAM_WIDTH, VRAM_HEIGHT, false);
    m_host_display->UpdateTexture(m_display_texture.get(), 0, 0, VRAM_WIDTH, VRAM_HEIGHT,
                                  m_display_texture_buffer.data(), VRAM_WIDTH * sizeof(u32));
    m_host_display->SetDisplayTexture(m_display_texture->GetHandle(), VRAM_WIDTH, VRAM_HEIGHT, 0, 0, VRAM_WIDTH,
--- a/src/core/gpu_sw.h
+++ b/src/core/gpu_sw.h
@@ -43,10 +43,8 @@ protected:
  //////////////////////////////////////////////////////////////////////////
  // Scanout
  //////////////////////////////////////////////////////////////////////////
-  static void CopyOut15Bit(const u16* src_ptr, u32 src_stride, u32* dst_ptr, u32 dst_stride, u32 width, u32 height);
-
-  static void CopyOut24Bit(const u16* src_ptr, u32 src_stride, u32* dst_ptr, u32 dst_stride, u32 width, u32 height);
-
+  void CopyOut15Bit(u32 src_x, u32 src_y, u32* dst_ptr, u32 dst_stride, u32 width, u32 height, bool interlaced);
+  void CopyOut24Bit(u32 src_x, u32 src_y, u32* dst_ptr, u32 dst_stride, u32 width, u32 height, bool interlaced);
  void UpdateDisplay() override;

  //////////////////////////////////////////////////////////////////////////