mirror of https://github.com/yuzu-mirror/yuzu
Merge pull request #11225 from FernandoS27/no-laxatives-in-santas-cookies
Y.F.C: Rework the Query Cache.pull/8/head
commit
854457a392
@ -0,0 +1,173 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
|
||||
// SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
#version 460 core
|
||||
|
||||
#extension GL_KHR_shader_subgroup_basic : require
|
||||
#extension GL_KHR_shader_subgroup_shuffle : require
|
||||
#extension GL_KHR_shader_subgroup_shuffle_relative : require
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
|
||||
// Backend abstraction: the same source is compiled for Vulkan (push constants) and for
// OpenGL (plain uniforms with explicit locations).
#ifdef VULKAN

#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 1

#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv

#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout(location = n) uniform
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 0

#endif

// NOTE(review): HAS_EXTENDED_TYPES, BINDING_INPUT_BUFFER and BINDING_OUTPUT_IMAGE are defined
// above but not referenced in this shader; the buffers below use literal binding indices.

// Dispatch parameters. They are compared against query indices and used as buffer indices by
// the functions below.
BEGIN_PUSH_CONSTANTS
UNIFORM(0) uint min_accumulation_base; // indices below this get the stored accumulation added
UNIFORM(1) uint max_accumulation_base; // entry (max_accumulation_base - 1) is the reset value
UNIFORM(2) uint accumulation_limit;    // index of the result stored back as the accumulation
UNIFORM(3) uint buffer_offset;         // first element of the input/output buffers to process
END_PUSH_CONSTANTS

// Each invocation handles LOCAL_RESULTS consecutive values, so a workgroup of
// QUERIES_PER_INVOC / LOCAL_RESULTS invocations covers QUERIES_PER_INVOC values.
#define LOCAL_RESULTS 8
#define QUERIES_PER_INVOC 2048

layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in;

// 64-bit values are stored as uvec2 (x = low word, y = high word, see AddUint64).
layout(std430, binding = 0) readonly buffer block1 {
    uvec2 input_data[];
};

layout(std430, binding = 1) coherent buffer block2 {
    uvec2 output_data[];
};

// Single 64-bit accumulator that WriteResults reads and then overwrites.
layout(std430, binding = 2) coherent buffer block3 {
    uvec2 accumulated_data;
};

// Scratch space used by main() to exchange one partial sum per subgroup.
shared uvec2 shared_data[128];
|
||||
|
||||
// 64-bit addition emulated with two 32-bit words (x = low word, y = high word), for GPUs
// without native uint64 support. Overflow of the high word wraps around.
uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
    uint low_carry;
    // uaddCarry returns the low 32 bits of the sum and reports the carry-out in low_carry.
    const uint low_word = uaddCarry(value_1.x, value_2.x, low_carry);
    const uint high_word = value_1.y + value_2.y + low_carry;
    return uvec2(low_word, high_word);
}
|
||||
|
||||
// Inclusive prefix sum of an emulated 64-bit value across the subgroup, using the doubling
// scheme of Hillis and Steele: after the pass with stride s every lane holds the sum of up to
// 2 * s consecutive lanes ending at itself.
uvec2 subgroupInclusiveAddUint64(uvec2 value) {
    uvec2 running_sum = value;
    for (uint stride = 1; stride < gl_SubgroupSize; stride <<= 1) {
        // Partial sum currently held by the lane `stride` positions below this one.
        const uvec2 lower_lane_sum = subgroupShuffleUp(running_sum, stride);
        // Lanes with fewer than `stride` predecessors have no valid value to add.
        if (gl_SubgroupInvocationID >= stride) {
            running_sum = AddUint64(running_sum, lower_lane_sum);
        }
    }
    return running_sum;
}
|
||||
|
||||
// Adds the stored accumulation to the local results that lie below min_accumulation_base,
// writes all local results to the output buffer and finally updates the accumulation buffer
// with the result found at index accumulation_limit.
//
// Must be called from uniform control flow: it executes barrier().
void WriteResults(uvec2 results[LOCAL_RESULTS]) {
    const uint current_id = gl_LocalInvocationID.x;
    const uvec2 accum = accumulated_data;
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        uvec2 base_data = current_id * LOCAL_RESULTS + i < min_accumulation_base ? accum : uvec2(0, 0);
        // AddUint64 returns the sum instead of modifying its arguments, so the result has to
        // be stored back; otherwise the accumulation base is silently dropped.
        results[i] = AddUint64(results[i], base_data);
    }
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        output_data[buffer_offset + current_id * LOCAL_RESULTS + i] = results[i];
    }
    // Invocation base_id holds, at position index, the result that becomes the new accumulation.
    uint index = accumulation_limit % LOCAL_RESULTS;
    uint base_id = accumulation_limit / LOCAL_RESULTS;

    // Every invocation must have read accumulated_data and written its outputs before a single
    // invocation overwrites the accumulation or reads another invocation's output below.
    // The conditions guarding the callers and the branch below only depend on dispatch
    // constants, so this barrier is reached uniformly.
    barrier();
    groupMemoryBarrier();

    if (min_accumulation_base >= accumulation_limit + 1) {
        if (current_id == base_id) {
            accumulated_data = results[index];
        }
        return;
    }
    // We have that ugly case in which the accumulation data is reset in the middle somewhere:
    // subtract the value written right before the reset point from the final result.
    if (current_id == base_id) {
        uvec2 reset_value = output_data[max_accumulation_base - 1];
        // Negate manually: two's complement is the bitwise inverse plus one.
        reset_value = AddUint64(uvec2(1, 0), ~reset_value);
        accumulated_data = AddUint64(results[index], reset_value);
    }
}
|
||||
|
||||
// Entry point. Every invocation computes the running sum of LOCAL_RESULTS consecutive input
// values; the partial sums are then combined inside the subgroup and, when the amount of work
// exceeds one subgroup, across subgroups through shared memory.
void main() {
    const uint subgroup_inv_id = gl_SubgroupInvocationID;
    const uint subgroup_id = gl_SubgroupID + gl_WorkGroupID.x * gl_NumSubgroups;
    // Despite its name this is the highest invocation index inside the current subgroup.
    const uint last_subgroup_id = subgroupMax(subgroup_inv_id);
    const uint current_id = gl_LocalInvocationID.x;
    const uint total_work = accumulation_limit;
    const uint last_result_id = LOCAL_RESULTS - 1;
    // Load this invocation's slice of the input.
    uvec2 data[LOCAL_RESULTS];
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        data[i] = input_data[buffer_offset + current_id * LOCAL_RESULTS + i];
    }
    // Sequential inclusive scan over the local slice.
    uvec2 results[LOCAL_RESULTS];
    results[0] = data[0];
    for (uint i = 1; i < LOCAL_RESULTS; i++) {
        results[i] = AddUint64(data[i], results[i - 1]);
    }
    // make sure all input data has been loaded
    subgroupBarrier();
    subgroupMemoryBarrier();

    // on the last local result, do a subgroup inclusive scan sum
    results[last_result_id] = subgroupInclusiveAddUint64(results[last_result_id]);
    // get the scanned last local result of the invocation right behind the current one
    uvec2 result_behind = subgroupShuffleUp(results[last_result_id], 1);
    if (subgroup_inv_id != 0) {
        // The last entry already contains the scanned total, so only the entries before it
        // need the predecessor's total added.
        for (uint i = 1; i < LOCAL_RESULTS; i++) {
            results[i - 1] = AddUint64(results[i - 1], result_behind);
        }
    }

    // if we had less queries than our subgroup, just write down the results.
    if (total_work <= gl_SubgroupSize * LOCAL_RESULTS) { // This condition is constant per dispatch.
        WriteResults(results);
        return;
    }

    // We now have more, so lets write the last result into shared memory.
    // Only the highest invocation of each subgroup does so; it holds the subgroup total.
    // NOTE(review): subgroup_id includes gl_WorkGroupID.x while shared_data is per workgroup
    // and has 128 entries - presumably a single workgroup is dispatched; confirm.
    if (subgroup_inv_id == last_subgroup_id) {
        shared_data[subgroup_id] = results[last_result_id];
    }
    // wait until every subgroup has stored its total
    barrier();
    memoryBarrierShared();

    // only if it's not the first subgroup
    if (subgroup_id != 0) {
        // each lane picks up the total of the subgroup with the same index as the lane
        uvec2 tmp = shared_data[subgroup_inv_id];
        subgroupBarrier();
        subgroupMemoryBarrierShared();
        tmp = subgroupInclusiveAddUint64(tmp);
        // lane (subgroup_id - 1) now holds the sum of all subgroup totals before this subgroup
        // NOTE(review): assumes subgroup_id - 1 is a valid lane index, i.e. there are no more
        // subgroups than lanes in a subgroup - confirm.
        uvec2 shuffled_result = subgroupShuffle(tmp, subgroup_id - 1);
        for (uint i = 0; i < LOCAL_RESULTS; i++) {
            results[i] = AddUint64(results[i], shuffled_result);
        }
    }
    WriteResults(results);
}
|
@ -0,0 +1,138 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and
|
||||
// Nicholas Haemel. Modified to suit needs.
|
||||
|
||||
#version 460 core
|
||||
|
||||
// Backend abstraction: the same source is compiled for Vulkan (push constants) and for
// OpenGL (plain uniforms with explicit locations).
#ifdef VULKAN

#define HAS_EXTENDED_TYPES 1
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 1

#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv

#extension GL_NV_gpu_shader5 : enable
#ifdef GL_NV_gpu_shader5
#define HAS_EXTENDED_TYPES 1
#else
#define HAS_EXTENDED_TYPES 0
#endif
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout(location = n) uniform
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 0

#endif

// NOTE(review): HAS_EXTENDED_TYPES, BINDING_INPUT_BUFFER and BINDING_OUTPUT_IMAGE are defined
// above but not referenced in this shader; the buffers below use literal binding indices.

// Dispatch parameters, used by main() as comparison bounds and buffer indices.
BEGIN_PUSH_CONSTANTS
UNIFORM(0) uint min_accumulation_base; // indices below this get the stored accumulation added
UNIFORM(1) uint max_accumulation_base; // entry (max_accumulation_base - 1) is the reset value
UNIFORM(2) uint accumulation_limit;    // index of the result stored back as the accumulation
UNIFORM(3) uint buffer_offset;         // first element of the input/output buffers to process
END_PUSH_CONSTANTS

// Each invocation loads and stores LOCAL_RESULTS consecutive values.
#define LOCAL_RESULTS 4
#define QUERIES_PER_INVOC 2048

layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in;

// 64-bit values are stored as uvec2 (x = low word, y = high word, see AddUint64).
// NOTE(review): the arrays are declared with a fixed size of one workgroup's worth of
// elements, yet main() indexes them with buffer_offset added - confirm this is intended.
layout(std430, binding = 0) readonly buffer block1 {
    uvec2 input_data[gl_WorkGroupSize.x * LOCAL_RESULTS];
};

layout(std430, binding = 1) writeonly coherent buffer block2 {
    uvec2 output_data[gl_WorkGroupSize.x * LOCAL_RESULTS];
};

// Single 64-bit accumulator that main() reads and then overwrites.
layout(std430, binding = 2) coherent buffer block3 {
    uvec2 accumulated_data;
};

// Working copy of the data being scanned; one entry per input element.
shared uvec2 shared_data[gl_WorkGroupSize.x * LOCAL_RESULTS];
|
||||
|
||||
// 64-bit addition emulated with two 32-bit words (x = low word, y = high word).
// Overflow of the high word wraps around.
uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
    uint low_carry;
    // uaddCarry returns the low 32 bits of the sum and reports the carry-out in low_carry.
    const uint low_word = uaddCarry(value_1.x, value_2.x, low_carry);
    const uint high_word = value_1.y + value_2.y + low_carry;
    return uvec2(low_word, high_word);
}
|
||||
|
||||
// Entry point of the prefix sum variant that does not rely on subgroup operations: the data
// is copied to shared memory, scanned there step by step, offset by the stored accumulation
// and written back.
void main(void) {
    uint id = gl_LocalInvocationID.x;
    // Per-element offset: the stored accumulation for elements below min_accumulation_base,
    // zero for the rest.
    uvec2 base_value[LOCAL_RESULTS];
    // NOTE(review): accum is never used; the loop below reads accumulated_data directly.
    const uvec2 accum = accumulated_data;
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        base_value[i] = (buffer_offset + id * LOCAL_RESULTS + i) < min_accumulation_base
                            ? accumulated_data
                            : uvec2(0);
    }
    uint work_size = gl_WorkGroupSize.x;
    uint rd_id;
    uint wr_id;
    uint mask;
    uvec2 inputs[LOCAL_RESULTS];
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        inputs[i] = input_data[buffer_offset + id * LOCAL_RESULTS + i];
    }
    // The number of steps is the log base 2 of the number of elements
    // (work group size times LOCAL_RESULTS), both of which should be powers of 2
    const uint steps = uint(log2(work_size)) + uint(log2(LOCAL_RESULTS));
    uint step = 0;

    // Each invocation is responsible for initializing
    // LOCAL_RESULTS elements of the shared array
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        shared_data[id * LOCAL_RESULTS + i] = inputs[i];
    }
    // Synchronize to make sure that everyone has initialized
    // their elements of shared_data[] with data loaded from
    // the input arrays
    barrier();
    memoryBarrierShared();
    // For each step...
    // NOTE(review): every invocation updates exactly one element per step, so a step touches
    // at most gl_WorkGroupSize.x entries - verify that this covers the whole
    // gl_WorkGroupSize.x * LOCAL_RESULTS array.
    for (step = 0; step < steps; step++) {
        // Calculate the read and write index in the
        // shared array
        mask = (1 << step) - 1;
        rd_id = ((id >> step) << (step + 1)) + mask;
        wr_id = rd_id + 1 + (id & mask);
        // Accumulate the read data into our element

        shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]);
        // Synchronize again to make sure that everyone
        // has caught up with us
        barrier();
        memoryBarrierShared();
    }
    // Add the accumulation
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        shared_data[id * LOCAL_RESULTS + i] =
            AddUint64(shared_data[id * LOCAL_RESULTS + i], base_value[i]);
    }
    barrier();
    memoryBarrierShared();

    // Finally write our data back to the output buffer
    for (uint i = 0; i < LOCAL_RESULTS; i++) {
        output_data[buffer_offset + id * LOCAL_RESULTS + i] = shared_data[id * LOCAL_RESULTS + i];
    }
    // A single invocation stores the new accumulation value.
    if (id == 0) {
        if (min_accumulation_base >= accumulation_limit + 1) {
            accumulated_data = shared_data[accumulation_limit];
            return;
        }
        // The accumulation was reset somewhere in the middle: subtract the value found right
        // before the reset point from the final value.
        uvec2 reset_value = shared_data[max_accumulation_base - 1];
        uvec2 final_value = shared_data[accumulation_limit];
        // Two's complement negation: bitwise inverse plus one
        reset_value = AddUint64(uvec2(1, 0), ~reset_value);
        accumulated_data = AddUint64(final_value, reset_value);
    }
}
|
@ -0,0 +1,20 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
|
||||
// SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
#version 450
|
||||
|
||||
// A single invocation performs the whole comparison.
layout(local_size_x = 1) in;

// Input: three 64-bit values stored as pairs of 32-bit words. main() compares `initial`
// against `current`; `unknown` is not read by this shader.
layout(std430, binding = 0) buffer Query {
    uvec2 initial;
    uvec2 unknown;
    uvec2 current;
};

// Output: 1 when `initial` and `current` are equal, 0 otherwise.
layout(std430, binding = 1) buffer Result {
    uint result;
};
|
||||
|
||||
// Stores 1 in `result` when both words of `initial` and `current` match, 0 otherwise.
void main() {
    // Vector == yields a single bool that is true only when every component is equal.
    const bool is_unchanged = initial == current;
    result = is_unchanged ? 1u : 0u;
}
|
@ -0,0 +1,104 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
|
||||
// SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <deque>
|
||||
#include <utility>
|
||||
|
||||
#include "common/common_types.h"
|
||||
|
||||
namespace VideoCommon {
|
||||
|
||||
class BankBase {
|
||||
protected:
|
||||
const size_t base_bank_size{};
|
||||
size_t bank_size{};
|
||||
std::atomic<size_t> references{};
|
||||
size_t current_slot{};
|
||||
|
||||
public:
|
||||
explicit BankBase(size_t bank_size_) : base_bank_size{bank_size_}, bank_size(bank_size_) {}
|
||||
|
||||
virtual ~BankBase() = default;
|
||||
|
||||
virtual std::pair<bool, size_t> Reserve() {
|
||||
if (IsClosed()) {
|
||||
return {false, bank_size};
|
||||
}
|
||||
const size_t result = current_slot++;
|
||||
return {true, result};
|
||||
}
|
||||
|
||||
virtual void Reset() {
|
||||
current_slot = 0;
|
||||
references = 0;
|
||||
bank_size = base_bank_size;
|
||||
}
|
||||
|
||||
size_t Size() const {
|
||||
return bank_size;
|
||||
}
|
||||
|
||||
void AddReference(size_t how_many = 1) {
|
||||
references.fetch_add(how_many, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
void CloseReference(size_t how_many = 1) {
|
||||
if (how_many > references.load(std::memory_order_relaxed)) {
|
||||
UNREACHABLE();
|
||||
}
|
||||
references.fetch_sub(how_many, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
void Close() {
|
||||
bank_size = current_slot;
|
||||
}
|
||||
|
||||
bool IsClosed() const {
|
||||
return current_slot >= bank_size;
|
||||
}
|
||||
|
||||
bool IsDead() const {
|
||||
return IsClosed() && references == 0;
|
||||
}
|
||||
};
|
||||
|
||||
/// Pool of banks that recycles dead banks before creating new ones.
///
/// BankType must provide `bool IsDead() const` and `void Reset()`.
template <typename BankType>
class BankPool {
private:
    std::deque<BankType> bank_pool;   ///< Storage for every bank ever created.
    std::deque<size_t> bank_indices;  ///< Bank indices in reservation order, oldest first.

public:
    BankPool() = default;
    ~BankPool() = default;

    /// Reserves a bank from the pool and returns its index.
    ///
    /// When the oldest reserved bank is dead it is reset and reused; otherwise `builder` is
    /// invoked as `builder(bank_pool, new_index)` and is expected to append a bank to the
    /// container it receives.
    template <typename Func>
    size_t ReserveBank(Func&& builder) {
        if (!bank_indices.empty() && bank_pool[bank_indices.front()].IsDead()) {
            const size_t recycled_index = bank_indices.front();
            bank_indices.pop_front();
            bank_pool[recycled_index].Reset();
            // Re-queue the recycled bank as the most recent reservation. Without this the
            // index would be lost and the bank could never be recycled a second time.
            bank_indices.push_back(recycled_index);
            return recycled_index;
        }
        const size_t new_index = bank_pool.size();
        builder(bank_pool, new_index);
        bank_indices.push_back(new_index);
        return new_index;
    }

    /// Get a reference to a bank using its index. The index is not range-checked.
    BankType& GetBank(size_t index) {
        return bank_pool[index];
    }

    /// Get the total number of banks in the pool.
    size_t BankCount() const {
        return bank_pool.size();
    }
};
|
||||
|
||||
} // namespace VideoCommon
|
@ -0,0 +1,70 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
|
||||
// SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common/common_funcs.h"
|
||||
#include "common/common_types.h"
|
||||
|
||||
namespace VideoCommon {
|
||||
|
||||
/// State flags of a query; combined as a bit mask.
enum class QueryFlagBits : u32 {
    HasTimestamp = 1 << 0,       ///< Indicates if this query has a timestamp.
    IsFinalValueSynced = 1 << 1, ///< Indicates the final value of the query is available
                                 ///< (guest queries are created with this flag set).
    IsHostSynced = 1 << 2,       ///< Indicates if the query has been synced in the host
    IsGuestSynced = 1 << 3,      ///< Indicates if the query has been synced with the guest.
    IsHostManaged = 1 << 4,      ///< Indicates if this query points to a host query
    IsRewritten = 1 << 5,        ///< Indicates if this query was rewritten by another query
    IsInvalidated = 1 << 6,      ///< Indicates the value of the query has been nullified.
    IsOrphan = 1 << 7,           ///< Indicates the query has not been set by a guest query.
    IsFence = 1 << 8,            ///< Indicates the query is a fence.
};
|
||||
DECLARE_ENUM_FLAG_OPERATORS(QueryFlagBits)
|
||||
|
||||
class QueryBase {
|
||||
public:
|
||||
VAddr guest_address{};
|
||||
QueryFlagBits flags{};
|
||||
u64 value{};
|
||||
|
||||
protected:
|
||||
// Default constructor
|
||||
QueryBase() = default;
|
||||
|
||||
// Parameterized constructor
|
||||
QueryBase(VAddr address, QueryFlagBits flags_, u64 value_)
|
||||
: guest_address(address), flags(flags_), value{value_} {}
|
||||
};
|
||||
|
||||
class GuestQuery : public QueryBase {
|
||||
public:
|
||||
// Parameterized constructor
|
||||
GuestQuery(bool isLong, VAddr address, u64 queryValue)
|
||||
: QueryBase(address, QueryFlagBits::IsFinalValueSynced, queryValue) {
|
||||
if (isLong) {
|
||||
flags |= QueryFlagBits::HasTimestamp;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class HostQueryBase : public QueryBase {
|
||||
public:
|
||||
// Default constructor
|
||||
HostQueryBase() : QueryBase(0, QueryFlagBits::IsHostManaged | QueryFlagBits::IsOrphan, 0) {}
|
||||
|
||||
// Parameterized constructor
|
||||
HostQueryBase(bool has_timestamp, VAddr address)
|
||||
: QueryBase(address, QueryFlagBits::IsHostManaged, 0), start_bank_id{}, size_banks{},
|
||||
start_slot{}, size_slots{} {
|
||||
if (has_timestamp) {
|
||||
flags |= QueryFlagBits::HasTimestamp;
|
||||
}
|
||||
}
|
||||
|
||||
u32 start_bank_id{};
|
||||
u32 size_banks{};
|
||||
size_t start_slot{};
|
||||
size_t size_slots{};
|
||||
};
|
||||
|
||||
} // namespace VideoCommon
|
@ -0,0 +1,580 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
|
||||
// SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <deque>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
|
||||
#include "common/assert.h"
|
||||
#include "common/common_types.h"
|
||||
#include "common/logging/log.h"
|
||||
#include "common/scope_exit.h"
|
||||
#include "common/settings.h"
|
||||
#include "core/memory.h"
|
||||
#include "video_core/engines/maxwell_3d.h"
|
||||
#include "video_core/gpu.h"
|
||||
#include "video_core/memory_manager.h"
|
||||
#include "video_core/query_cache/bank_base.h"
|
||||
#include "video_core/query_cache/query_base.h"
|
||||
#include "video_core/query_cache/query_cache_base.h"
|
||||
#include "video_core/query_cache/query_stream.h"
|
||||
#include "video_core/query_cache/types.h"
|
||||
|
||||
namespace VideoCommon {
|
||||
|
||||
using Maxwell = Tegra::Engines::Maxwell3D;
|
||||
|
||||
/// One pending write of a query value to guest memory, as handed to the runtime's
/// SyncValues() by GuestStreamer::SyncWrites().
/// The member order matters: the struct is built with designated initializers.
struct SyncValuesStruct {
    VAddr address; ///< Guest address to write to.
    u64 value;     ///< Value to write.
    u64 size;      ///< Number of bytes to write (8 with a timestamp, 4 otherwise).

    // NOTE(review): consumed by the runtime's SyncValues implementation, which is not
    // visible here - confirm the exact meaning there.
    static constexpr bool GeneratesBaseBuffer = true;
};
|
||||
|
||||
template <typename Traits>
|
||||
class GuestStreamer : public SimpleStreamer<GuestQuery> {
|
||||
public:
|
||||
using RuntimeType = typename Traits::RuntimeType;
|
||||
|
||||
GuestStreamer(size_t id_, RuntimeType& runtime_)
|
||||
: SimpleStreamer<GuestQuery>(id_), runtime{runtime_} {}
|
||||
|
||||
virtual ~GuestStreamer() = default;
|
||||
|
||||
size_t WriteCounter(VAddr address, bool has_timestamp, u32 value,
|
||||
std::optional<u32> subreport = std::nullopt) override {
|
||||
auto new_id = BuildQuery(has_timestamp, address, static_cast<u64>(value));
|
||||
pending_sync.push_back(new_id);
|
||||
return new_id;
|
||||
}
|
||||
|
||||
bool HasPendingSync() const override {
|
||||
return !pending_sync.empty();
|
||||
}
|
||||
|
||||
void SyncWrites() override {
|
||||
if (pending_sync.empty()) {
|
||||
return;
|
||||
}
|
||||
std::vector<SyncValuesStruct> sync_values;
|
||||
sync_values.reserve(pending_sync.size());
|
||||
for (size_t pending_id : pending_sync) {
|
||||
auto& query = slot_queries[pending_id];
|
||||
if (True(query.flags & QueryFlagBits::IsRewritten) ||
|
||||
True(query.flags & QueryFlagBits::IsInvalidated)) {
|
||||
continue;
|
||||
}
|
||||
query.flags |= QueryFlagBits::IsHostSynced;
|
||||
sync_values.emplace_back(SyncValuesStruct{
|
||||
.address = query.guest_address,
|
||||
.value = query.value,
|
||||
.size = static_cast<u64>(True(query.flags & QueryFlagBits::HasTimestamp) ? 8 : 4)});
|
||||
}
|
||||
pending_sync.clear();
|
||||
if (sync_values.size() > 0) {
|
||||
runtime.template SyncValues<SyncValuesStruct>(sync_values);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
RuntimeType& runtime;
|
||||
std::deque<size_t> pending_sync;
|
||||
};
|
||||
|
||||
template <typename Traits>
|
||||
class StubStreamer : public GuestStreamer<Traits> {
|
||||
public:
|
||||
using RuntimeType = typename Traits::RuntimeType;
|
||||
|
||||
StubStreamer(size_t id_, RuntimeType& runtime_, u32 stub_value_)
|
||||
: GuestStreamer<Traits>(id_, runtime_), stub_value{stub_value_} {}
|
||||
|
||||
~StubStreamer() override = default;
|
||||
|
||||
size_t WriteCounter(VAddr address, bool has_timestamp, [[maybe_unused]] u32 value,
|
||||
std::optional<u32> subreport = std::nullopt) override {
|
||||
size_t new_id =
|
||||
GuestStreamer<Traits>::WriteCounter(address, has_timestamp, stub_value, subreport);
|
||||
return new_id;
|
||||
}
|
||||
|
||||
private:
|
||||
u32 stub_value;
|
||||
};
|
||||
|
||||
template <typename Traits>
|
||||
struct QueryCacheBase<Traits>::QueryCacheBaseImpl {
|
||||
using RuntimeType = typename Traits::RuntimeType;
|
||||
|
||||
QueryCacheBaseImpl(QueryCacheBase<Traits>* owner_, VideoCore::RasterizerInterface& rasterizer_,
|
||||
Core::Memory::Memory& cpu_memory_, RuntimeType& runtime_, Tegra::GPU& gpu_)
|
||||
: owner{owner_}, rasterizer{rasterizer_},
|
||||
cpu_memory{cpu_memory_}, runtime{runtime_}, gpu{gpu_} {
|
||||
streamer_mask = 0;
|
||||
for (size_t i = 0; i < static_cast<size_t>(QueryType::MaxQueryTypes); i++) {
|
||||
streamers[i] = runtime.GetStreamerInterface(static_cast<QueryType>(i));
|
||||
if (streamers[i]) {
|
||||
streamer_mask |= 1ULL << streamers[i]->GetId();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Func>
|
||||
void ForEachStreamerIn(u64 mask, Func&& func) {
|
||||
static constexpr bool RETURNS_BOOL =
|
||||
std::is_same_v<std::invoke_result<Func, StreamerInterface*>, bool>;
|
||||
while (mask != 0) {
|
||||
size_t position = std::countr_zero(mask);
|
||||
mask &= ~(1ULL << position);
|
||||
if constexpr (RETURNS_BOOL) {
|
||||
if (func(streamers[position])) {
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
func(streamers[position]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Func>
|
||||
void ForEachStreamer(Func&& func) {
|
||||
ForEachStreamerIn(streamer_mask, func);
|
||||
}
|
||||
|
||||
QueryBase* ObtainQuery(QueryCacheBase<Traits>::QueryLocation location) {
|
||||
size_t which_stream = location.stream_id.Value();
|
||||
auto* streamer = streamers[which_stream];
|
||||
if (!streamer) {
|
||||
return nullptr;
|
||||
}
|
||||
return streamer->GetQuery(location.query_id.Value());
|
||||
}
|
||||
|
||||
QueryCacheBase<Traits>* owner;
|
||||
VideoCore::RasterizerInterface& rasterizer;
|
||||
Core::Memory::Memory& cpu_memory;
|
||||
RuntimeType& runtime;
|
||||
Tegra::GPU& gpu;
|
||||
std::array<StreamerInterface*, static_cast<size_t>(QueryType::MaxQueryTypes)> streamers;
|
||||
u64 streamer_mask;
|
||||
std::mutex flush_guard;
|
||||
std::deque<u64> flushes_pending;
|
||||
std::vector<QueryCacheBase<Traits>::QueryLocation> pending_unregister;
|
||||
};
|
||||
|
||||
/// Builds the cache with an empty query map and creates the private implementation, which
/// queries the runtime for the streamer of every query type.
template <typename Traits>
QueryCacheBase<Traits>::QueryCacheBase(Tegra::GPU& gpu_,
                                       VideoCore::RasterizerInterface& rasterizer_,
                                       Core::Memory::Memory& cpu_memory_, RuntimeType& runtime_)
    : cached_queries{} {
    impl = std::make_unique<QueryCacheBase<Traits>::QueryCacheBaseImpl>(
        this, rasterizer_, cpu_memory_, runtime_, gpu_);
}

// Defaulted here rather than in the class so that it is generated where QueryCacheBaseImpl
// is a complete type.
template <typename Traits>
QueryCacheBase<Traits>::~QueryCacheBase() = default;
|
||||
|
||||
template <typename Traits>
|
||||
void QueryCacheBase<Traits>::CounterEnable(QueryType counter_type, bool is_enabled) {
|
||||
size_t index = static_cast<size_t>(counter_type);
|
||||
StreamerInterface* streamer = impl->streamers[index];
|
||||
if (!streamer) [[unlikely]] {
|
||||
UNREACHABLE();
|
||||
return;
|
||||
}
|
||||
if (is_enabled) {
|
||||
streamer->StartCounter();
|
||||
} else {
|
||||
streamer->PauseCounter();
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Traits>
|
||||
void QueryCacheBase<Traits>::CounterClose(QueryType counter_type) {
|
||||
size_t index = static_cast<size_t>(counter_type);
|
||||
StreamerInterface* streamer = impl->streamers[index];
|
||||
if (!streamer) [[unlikely]] {
|
||||
UNREACHABLE();
|
||||
return;
|
||||
}
|
||||
streamer->CloseCounter();
|
||||
}
|
||||
|
||||
template <typename Traits>
|
||||
void QueryCacheBase<Traits>::CounterReset(QueryType counter_type) {
|
||||
size_t index = static_cast<size_t>(counter_type);
|
||||
StreamerInterface* streamer = impl->streamers[index];
|
||||
if (!streamer) [[unlikely]] {
|
||||
UNIMPLEMENTED();
|
||||
return;
|
||||
}
|
||||
streamer->ResetCounter();
|
||||
}
|
||||
|
||||
/// Switches the cache to the channel with the given id and forwards the 3D engine to the
/// runtime.
template <typename Traits>
void QueryCacheBase<Traits>::BindToChannel(s32 id) {
    // The base implementation runs first; maxwell3d is used right after it.
    // NOTE(review): presumably the base call is what updates maxwell3d - confirm.
    VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo>::BindToChannel(id);
    impl->runtime.Bind3DEngine(maxwell3d);
}
|
||||
|
||||
template <typename Traits>
|
||||
void QueryCacheBase<Traits>::CounterReport(GPUVAddr addr, QueryType counter_type,
|
||||
QueryPropertiesFlags flags, u32 payload, u32 subreport) {
|
||||
const bool has_timestamp = True(flags & QueryPropertiesFlags::HasTimeout);
|
||||
const bool is_fence = True(flags & QueryPropertiesFlags::IsAFence);
|
||||
size_t streamer_id = static_cast<size_t>(counter_type);
|
||||
auto* streamer = impl->streamers[streamer_id];
|
||||
if (streamer == nullptr) [[unlikely]] {
|
||||
counter_type = QueryType::Payload;
|
||||
payload = 1U;
|
||||
streamer_id = static_cast<size_t>(counter_type);
|
||||
streamer = impl->streamers[streamer_id];
|
||||
}
|
||||
auto cpu_addr_opt = gpu_memory->GpuToCpuAddress(addr);
|
||||
if (!cpu_addr_opt) [[unlikely]] {
|
||||
return;
|
||||
}
|
||||
VAddr cpu_addr = *cpu_addr_opt;
|
||||
const size_t new_query_id = streamer->WriteCounter(cpu_addr, has_timestamp, payload, subreport);
|
||||
auto* query = streamer->GetQuery(new_query_id);
|
||||
if (is_fence) {
|
||||
query->flags |= QueryFlagBits::IsFence;
|
||||
}
|
||||
QueryLocation query_location{};
|
||||
query_location.stream_id.Assign(static_cast<u32>(streamer_id));
|
||||
query_location.query_id.Assign(static_cast<u32>(new_query_id));
|
||||
const auto gen_caching_indexing = [](VAddr cur_addr) {
|
||||
return std::make_pair<u64, u32>(cur_addr >> Core::Memory::YUZU_PAGEBITS,
|
||||
static_cast<u32>(cur_addr & Core::Memory::YUZU_PAGEMASK));
|
||||
};
|
||||
u8* pointer = impl->cpu_memory.GetPointer(cpu_addr);
|
||||
u8* pointer_timestamp = impl->cpu_memory.GetPointer(cpu_addr + 8);
|
||||
bool is_synced = !Settings::IsGPULevelHigh() && is_fence;
|
||||
|
||||
std::function<void()> operation([this, is_synced, streamer, query_base = query, query_location,
|
||||
pointer, pointer_timestamp] {
|
||||
if (True(query_base->flags & QueryFlagBits::IsInvalidated)) {
|
||||
if (!is_synced) [[likely]] {
|
||||
impl->pending_unregister.push_back(query_location);
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (False(query_base->flags & QueryFlagBits::IsFinalValueSynced)) [[unlikely]] {
|
||||
UNREACHABLE();
|
||||
return;
|
||||
}
|
||||
query_base->value += streamer->GetAmmendValue();
|
||||
streamer->SetAccumulationValue(query_base->value);
|
||||
if (True(query_base->flags & QueryFlagBits::HasTimestamp)) {
|
||||
u64 timestamp = impl->gpu.GetTicks();
|
||||
std::memcpy(pointer_timestamp, ×tamp, sizeof(timestamp));
|
||||
std::memcpy(pointer, &query_base->value, sizeof(query_base->value));
|
||||
} else {
|
||||
u32 value = static_cast<u32>(query_base->value);
|
||||
std::memcpy(pointer, &value, sizeof(value));
|
||||
}
|
||||
if (!is_synced) [[likely]] {
|
||||
impl->pending_unregister.push_back(query_location);
|
||||
}
|
||||
});
|
||||
if (is_fence) {
|
||||
impl->rasterizer.SignalFence(std::move(operation));
|
||||
} else {
|
||||
if (!Settings::IsGPULevelHigh() && counter_type == QueryType::Payload) {
|
||||
if (has_timestamp) {
|
||||
u64 timestamp = impl->gpu.GetTicks();
|
||||
u64 value = static_cast<u64>(payload);
|
||||
std::memcpy(pointer_timestamp, ×tamp, sizeof(timestamp));
|
||||
std::memcpy(pointer, &value, sizeof(value));
|
||||
} else {
|
||||
std::memcpy(pointer, &payload, sizeof(payload));
|
||||
}
|
||||
streamer->Free(new_query_id);
|
||||
return;
|
||||
}
|
||||
impl->rasterizer.SyncOperation(std::move(operation));
|
||||
}
|
||||
if (is_synced) {
|
||||
streamer->Free(new_query_id);
|
||||
return;
|
||||
}
|
||||
auto [cont_addr, base] = gen_caching_indexing(cpu_addr);
|
||||
{
|
||||
std::scoped_lock lock(cache_mutex);
|
||||
auto it1 = cached_queries.try_emplace(cont_addr);
|
||||
auto& sub_container = it1.first->second;
|
||||
auto it_current = sub_container.find(base);
|
||||
if (it_current == sub_container.end()) {
|
||||
sub_container.insert_or_assign(base, query_location);
|
||||
return;
|
||||
}
|
||||
auto* old_query = impl->ObtainQuery(it_current->second);
|
||||
old_query->flags |= QueryFlagBits::IsRewritten;
|
||||
sub_container.insert_or_assign(base, query_location);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Traits>
|
||||
void QueryCacheBase<Traits>::UnregisterPending() {
|
||||
const auto gen_caching_indexing = [](VAddr cur_addr) {
|
||||
return std::make_pair<u64, u32>(cur_addr >> Core::Memory::YUZU_PAGEBITS,
|
||||
static_cast<u32>(cur_addr & Core::Memory::YUZU_PAGEMASK));
|
||||
};
|
||||
std::scoped_lock lock(cache_mutex);
|
||||
for (QueryLocation loc : impl->pending_unregister) {
|
||||
const auto [streamer_id, query_id] = loc.unpack();
|
||||
auto* streamer = impl->streamers[streamer_id];
|
||||
if (!streamer) [[unlikely]] {
|
||||
continue;
|
||||
}
|
||||
auto* query = streamer->GetQuery(query_id);
|
||||
auto [cont_addr, base] = gen_caching_indexing(query->guest_address);
|
||||
auto it1 = cached_queries.find(cont_addr);
|
||||
if (it1 != cached_queries.end()) {
|
||||
auto it2 = it1->second.find(base);
|
||||
if (it2 != it1->second.end()) {
|
||||
if (it2->second.raw == loc.raw) {
|
||||
it1->second.erase(it2);
|
||||
}
|
||||
}
|
||||
}
|
||||
streamer->Free(query_id);
|
||||
}
|
||||
impl->pending_unregister.clear();
|
||||
}
|
||||
|
||||
template <typename Traits>
|
||||
void QueryCacheBase<Traits>::NotifyWFI() {
|
||||
bool should_sync = false;
|
||||
impl->ForEachStreamer(
|
||||
[&should_sync](StreamerInterface* streamer) { should_sync |= streamer->HasPendingSync(); });
|
||||
if (!should_sync) {
|
||||
return;
|
||||
}
|
||||
|
||||
impl->ForEachStreamer([](StreamerInterface* streamer) { streamer->PresyncWrites(); });
|
||||
impl->runtime.Barriers(true);
|
||||
impl->ForEachStreamer([](StreamerInterface* streamer) { streamer->SyncWrites(); });
|
||||
impl->runtime.Barriers(false);
|
||||
}
|
||||
|
||||
template <typename Traits>
|
||||
void QueryCacheBase<Traits>::NotifySegment(bool resume) {
|
||||
if (resume) {
|
||||
impl->runtime.ResumeHostConditionalRendering();
|
||||
} else {
|
||||
CounterClose(VideoCommon::QueryType::ZPassPixelCount64);
|
||||
CounterClose(VideoCommon::QueryType::StreamingByteCount);
|
||||
impl->runtime.PauseHostConditionalRendering();
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Traits>
|
||||
bool QueryCacheBase<Traits>::AccelerateHostConditionalRendering() {
|
||||
bool qc_dirty = false;
|
||||
const auto gen_lookup = [this, &qc_dirty](GPUVAddr address) -> VideoCommon::LookupData {
|
||||
auto cpu_addr_opt = gpu_memory->GpuToCpuAddress(address);
|
||||
if (!cpu_addr_opt) [[unlikely]] {
|
||||
return VideoCommon::LookupData{
|
||||
.address = 0,
|
||||
.found_query = nullptr,
|
||||
};
|
||||
}
|
||||
VAddr cpu_addr = *cpu_addr_opt;
|
||||
std::scoped_lock lock(cache_mutex);
|
||||
auto it1 = cached_queries.find(cpu_addr >> Core::Memory::YUZU_PAGEBITS);
|
||||
if (it1 == cached_queries.end()) {
|
||||
return VideoCommon::LookupData{
|
||||
.address = cpu_addr,
|
||||
.found_query = nullptr,
|
||||
};
|
||||
}
|
||||
auto& sub_container = it1->second;
|
||||
auto it_current = sub_container.find(cpu_addr & Core::Memory::YUZU_PAGEMASK);
|
||||
|
||||
if (it_current == sub_container.end()) {
|
||||
auto it_current_2 = sub_container.find((cpu_addr & Core::Memory::YUZU_PAGEMASK) + 4);
|
||||
if (it_current_2 == sub_container.end()) {
|
||||
return VideoCommon::LookupData{
|
||||
.address = cpu_addr,
|
||||
.found_query = nullptr,
|
||||
};
|
||||
}
|
||||
}
|
||||
auto* query = impl->ObtainQuery(it_current->second);
|
||||
qc_dirty |= True(query->flags & QueryFlagBits::IsHostManaged) &&
|
||||
False(query->flags & QueryFlagBits::IsGuestSynced);
|
||||
return VideoCommon::LookupData{
|
||||
.address = cpu_addr,
|
||||
.found_query = query,
|
||||
};
|
||||
};
|
||||
|
||||
auto& regs = maxwell3d->regs;
|
||||
if (regs.render_enable_override != Maxwell::Regs::RenderEnable::Override::UseRenderEnable) {
|
||||
impl->runtime.EndHostConditionalRendering();
|
||||
return false;
|
||||
}
|
||||
const ComparisonMode mode = static_cast<ComparisonMode>(regs.render_enable.mode);
|
||||
const GPUVAddr address = regs.render_enable.Address();
|
||||
switch (mode) {
|
||||
case ComparisonMode::True:
|
||||
impl->runtime.EndHostConditionalRendering();
|
||||
return false;
|
||||
case ComparisonMode::False:
|
||||
impl->runtime.EndHostConditionalRendering();
|
||||
return false;
|
||||
case ComparisonMode::Conditional: {
|
||||
VideoCommon::LookupData object_1{gen_lookup(address)};
|
||||
return impl->runtime.HostConditionalRenderingCompareValue(object_1, qc_dirty);
|
||||
}
|
||||
case ComparisonMode::IfEqual: {
|
||||
VideoCommon::LookupData object_1{gen_lookup(address)};
|
||||
VideoCommon::LookupData object_2{gen_lookup(address + 16)};
|
||||
return impl->runtime.HostConditionalRenderingCompareValues(object_1, object_2, qc_dirty,
|
||||
true);
|
||||
}
|
||||
case ComparisonMode::IfNotEqual: {
|
||||
VideoCommon::LookupData object_1{gen_lookup(address)};
|
||||
VideoCommon::LookupData object_2{gen_lookup(address + 16)};
|
||||
return impl->runtime.HostConditionalRenderingCompareValues(object_1, object_2, qc_dirty,
|
||||
false);
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Async downloads
|
||||
template <typename Traits>
|
||||
void QueryCacheBase<Traits>::CommitAsyncFlushes() {
|
||||
// Make sure to have the results synced in Host.
|
||||
NotifyWFI();
|
||||
|
||||
u64 mask{};
|
||||
{
|
||||
std::scoped_lock lk(impl->flush_guard);
|
||||
impl->ForEachStreamer([&mask](StreamerInterface* streamer) {
|
||||
bool local_result = streamer->HasUnsyncedQueries();
|
||||
if (local_result) {
|
||||
mask |= 1ULL << streamer->GetId();
|
||||
}
|
||||
});
|
||||
impl->flushes_pending.push_back(mask);
|
||||
}
|
||||
std::function<void()> func([this] { UnregisterPending(); });
|
||||
impl->rasterizer.SyncOperation(std::move(func));
|
||||
if (mask == 0) {
|
||||
return;
|
||||
}
|
||||
u64 ran_mask = ~mask;
|
||||
while (mask) {
|
||||
impl->ForEachStreamerIn(mask, [&mask, &ran_mask](StreamerInterface* streamer) {
|
||||
u64 dep_mask = streamer->GetDependentMask();
|
||||
if ((dep_mask & ~ran_mask) != 0) {
|
||||
return;
|
||||
}
|
||||
u64 index = streamer->GetId();
|
||||
ran_mask |= (1ULL << index);
|
||||
mask &= ~(1ULL << index);
|
||||
streamer->PushUnsyncedQueries();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Traits>
|
||||
bool QueryCacheBase<Traits>::HasUncommittedFlushes() const {
|
||||
bool result = false;
|
||||
impl->ForEachStreamer([&result](StreamerInterface* streamer) {
|
||||
result |= streamer->HasUnsyncedQueries();
|
||||
return result;
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename Traits>
|
||||
bool QueryCacheBase<Traits>::ShouldWaitAsyncFlushes() {
|
||||
std::scoped_lock lk(impl->flush_guard);
|
||||
return !impl->flushes_pending.empty() && impl->flushes_pending.front() != 0ULL;
|
||||
}
|
||||
|
||||
template <typename Traits>
|
||||
void QueryCacheBase<Traits>::PopAsyncFlushes() {
|
||||
u64 mask;
|
||||
{
|
||||
std::scoped_lock lk(impl->flush_guard);
|
||||
mask = impl->flushes_pending.front();
|
||||
impl->flushes_pending.pop_front();
|
||||
}
|
||||
if (mask == 0) {
|
||||
return;
|
||||
}
|
||||
u64 ran_mask = ~mask;
|
||||
while (mask) {
|
||||
impl->ForEachStreamerIn(mask, [&mask, &ran_mask](StreamerInterface* streamer) {
|
||||
u64 dep_mask = streamer->GetDependenceMask();
|
||||
if ((dep_mask & ~ran_mask) != 0) {
|
||||
return;
|
||||
}
|
||||
u64 index = streamer->GetId();
|
||||
ran_mask |= (1ULL << index);
|
||||
mask &= ~(1ULL << index);
|
||||
streamer->PopUnsyncedQueries();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Invalidation
|
||||
|
||||
template <typename Traits>
|
||||
void QueryCacheBase<Traits>::InvalidateQuery(QueryCacheBase<Traits>::QueryLocation location) {
|
||||
auto* query_base = impl->ObtainQuery(location);
|
||||
if (!query_base) {
|
||||
return;
|
||||
}
|
||||
query_base->flags |= QueryFlagBits::IsInvalidated;
|
||||
}
|
||||
|
||||
template <typename Traits>
|
||||
bool QueryCacheBase<Traits>::IsQueryDirty(QueryCacheBase<Traits>::QueryLocation location) {
|
||||
auto* query_base = impl->ObtainQuery(location);
|
||||
if (!query_base) {
|
||||
return false;
|
||||
}
|
||||
return True(query_base->flags & QueryFlagBits::IsHostManaged) &&
|
||||
False(query_base->flags & QueryFlagBits::IsGuestSynced);
|
||||
}
|
||||
|
||||
template <typename Traits>
|
||||
bool QueryCacheBase<Traits>::SemiFlushQueryDirty(QueryCacheBase<Traits>::QueryLocation location) {
|
||||
auto* query_base = impl->ObtainQuery(location);
|
||||
if (!query_base) {
|
||||
return false;
|
||||
}
|
||||
if (True(query_base->flags & QueryFlagBits::IsFinalValueSynced) &&
|
||||
False(query_base->flags & QueryFlagBits::IsGuestSynced)) {
|
||||
auto* ptr = impl->cpu_memory.GetPointer(query_base->guest_address);
|
||||
if (True(query_base->flags & QueryFlagBits::HasTimestamp)) {
|
||||
std::memcpy(ptr, &query_base->value, sizeof(query_base->value));
|
||||
return false;
|
||||
}
|
||||
u32 value_l = static_cast<u32>(query_base->value);
|
||||
std::memcpy(ptr, &value_l, sizeof(value_l));
|
||||
return false;
|
||||
}
|
||||
return True(query_base->flags & QueryFlagBits::IsHostManaged) &&
|
||||
False(query_base->flags & QueryFlagBits::IsGuestSynced);
|
||||
}
|
||||
|
||||
template <typename Traits>
|
||||
void QueryCacheBase<Traits>::RequestGuestHostSync() {
|
||||
impl->rasterizer.ReleaseFences();
|
||||
}
|
||||
|
||||
} // namespace VideoCommon
|
@ -0,0 +1,181 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
|
||||
// SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
#include <memory>
#include <mutex>
#include <optional>
#include <span>
#include <type_traits>
#include <unordered_map>
#include <utility>
|
||||
|
||||
#include "common/assert.h"
|
||||
#include "common/bit_field.h"
|
||||
#include "common/common_types.h"
|
||||
#include "core/memory.h"
|
||||
#include "video_core/control/channel_state_cache.h"
|
||||
#include "video_core/query_cache/query_base.h"
|
||||
#include "video_core/query_cache/types.h"
|
||||
|
||||
namespace Core::Memory {
|
||||
class Memory;
|
||||
}
|
||||
|
||||
namespace VideoCore {
|
||||
class RasterizerInterface;
|
||||
}
|
||||
|
||||
namespace Tegra {
|
||||
class GPU;
|
||||
}
|
||||
|
||||
namespace VideoCommon {
|
||||
|
||||
struct LookupData {
|
||||
VAddr address;
|
||||
QueryBase* found_query;
|
||||
};
|
||||
|
||||
template <typename Traits>
|
||||
class QueryCacheBase : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> {
|
||||
using RuntimeType = typename Traits::RuntimeType;
|
||||
|
||||
public:
|
||||
union QueryLocation {
|
||||
BitField<27, 5, u32> stream_id;
|
||||
BitField<0, 27, u32> query_id;
|
||||
u32 raw;
|
||||
|
||||
std::pair<size_t, size_t> unpack() const {
|
||||
return {static_cast<size_t>(stream_id.Value()), static_cast<size_t>(query_id.Value())};
|
||||
}
|
||||
};
|
||||
|
||||
explicit QueryCacheBase(Tegra::GPU& gpu, VideoCore::RasterizerInterface& rasterizer_,
|
||||
Core::Memory::Memory& cpu_memory_, RuntimeType& runtime_);
|
||||
|
||||
~QueryCacheBase();
|
||||
|
||||
void InvalidateRegion(VAddr addr, std::size_t size) {
|
||||
IterateCache<true>(addr, size,
|
||||
[this](QueryLocation location) { InvalidateQuery(location); });
|
||||
}
|
||||
|
||||
void FlushRegion(VAddr addr, std::size_t size) {
|
||||
bool result = false;
|
||||
IterateCache<false>(addr, size, [this, &result](QueryLocation location) {
|
||||
result |= SemiFlushQueryDirty(location);
|
||||
return result;
|
||||
});
|
||||
if (result) {
|
||||
RequestGuestHostSync();
|
||||
}
|
||||
}
|
||||
|
||||
static u64 BuildMask(std::span<const QueryType> types) {
|
||||
u64 mask = 0;
|
||||
for (auto query_type : types) {
|
||||
mask |= 1ULL << (static_cast<u64>(query_type));
|
||||
}
|
||||
return mask;
|
||||
}
|
||||
|
||||
/// Return true when a CPU region is modified from the GPU
|
||||
[[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size) {
|
||||
bool result = false;
|
||||
IterateCache<false>(addr, size, [this, &result](QueryLocation location) {
|
||||
result |= IsQueryDirty(location);
|
||||
return result;
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
void CounterEnable(QueryType counter_type, bool is_enabled);
|
||||
|
||||
void CounterReset(QueryType counter_type);
|
||||
|
||||
void CounterClose(QueryType counter_type);
|
||||
|
||||
void CounterReport(GPUVAddr addr, QueryType counter_type, QueryPropertiesFlags flags,
|
||||
u32 payload, u32 subreport);
|
||||
|
||||
void NotifyWFI();
|
||||
|
||||
bool AccelerateHostConditionalRendering();
|
||||
|
||||
// Async downloads
|
||||
void CommitAsyncFlushes();
|
||||
|
||||
bool HasUncommittedFlushes() const;
|
||||
|
||||
bool ShouldWaitAsyncFlushes();
|
||||
|
||||
void PopAsyncFlushes();
|
||||
|
||||
void NotifySegment(bool resume);
|
||||
|
||||
void BindToChannel(s32 id) override;
|
||||
|
||||
protected:
|
||||
template <bool remove_from_cache, typename Func>
|
||||
void IterateCache(VAddr addr, std::size_t size, Func&& func) {
|
||||
static constexpr bool RETURNS_BOOL =
|
||||
std::is_same_v<std::invoke_result<Func, QueryLocation>, bool>;
|
||||
const u64 addr_begin = addr;
|
||||
const u64 addr_end = addr_begin + size;
|
||||
|
||||
const u64 page_end = addr_end >> Core::Memory::YUZU_PAGEBITS;
|
||||
std::scoped_lock lock(cache_mutex);
|
||||
for (u64 page = addr_begin >> Core::Memory::YUZU_PAGEBITS; page <= page_end; ++page) {
|
||||
const u64 page_start = page << Core::Memory::YUZU_PAGEBITS;
|
||||
const auto in_range = [page_start, addr_begin, addr_end](const u32 query_location) {
|
||||
const u64 cache_begin = page_start + query_location;
|
||||
const u64 cache_end = cache_begin + sizeof(u32);
|
||||
return cache_begin < addr_end && addr_begin < cache_end;
|
||||
};
|
||||
const auto& it = cached_queries.find(page);
|
||||
if (it == std::end(cached_queries)) {
|
||||
continue;
|
||||
}
|
||||
auto& contents = it->second;
|
||||
for (auto& query : contents) {
|
||||
if (!in_range(query.first)) {
|
||||
continue;
|
||||
}
|
||||
if constexpr (RETURNS_BOOL) {
|
||||
if (func(query.second)) {
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
func(query.second);
|
||||
}
|
||||
}
|
||||
if constexpr (remove_from_cache) {
|
||||
const auto in_range2 = [&](const std::pair<u32, QueryLocation>& pair) {
|
||||
return in_range(pair.first);
|
||||
};
|
||||
std::erase_if(contents, in_range2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
using ContentCache = std::unordered_map<u64, std::unordered_map<u32, QueryLocation>>;
|
||||
|
||||
void InvalidateQuery(QueryLocation location);
|
||||
bool IsQueryDirty(QueryLocation location);
|
||||
bool SemiFlushQueryDirty(QueryLocation location);
|
||||
void RequestGuestHostSync();
|
||||
void UnregisterPending();
|
||||
|
||||
std::unordered_map<u64, std::unordered_map<u32, QueryLocation>> cached_queries;
|
||||
std::mutex cache_mutex;
|
||||
|
||||
struct QueryCacheBaseImpl;
|
||||
friend struct QueryCacheBaseImpl;
|
||||
friend RuntimeType;
|
||||
|
||||
std::unique_ptr<QueryCacheBaseImpl> impl;
|
||||
};
|
||||
|
||||
} // namespace VideoCommon
|
@ -0,0 +1,149 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
|
||||
// SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <deque>
|
||||
#include <optional>
|
||||
#include <vector>
|
||||
|
||||
#include "common/assert.h"
|
||||
#include "common/common_types.h"
|
||||
#include "video_core/query_cache/bank_base.h"
|
||||
#include "video_core/query_cache/query_base.h"
|
||||
|
||||
namespace VideoCommon {
|
||||
|
||||
class StreamerInterface {
|
||||
public:
|
||||
explicit StreamerInterface(size_t id_) : id{id_}, dependence_mask{}, dependent_mask{} {}
|
||||
virtual ~StreamerInterface() = default;
|
||||
|
||||
virtual QueryBase* GetQuery(size_t id) = 0;
|
||||
|
||||
virtual void StartCounter() {
|
||||
/* Do Nothing */
|
||||
}
|
||||
|
||||
virtual void PauseCounter() {
|
||||
/* Do Nothing */
|
||||
}
|
||||
|
||||
virtual void ResetCounter() {
|
||||
/* Do Nothing */
|
||||
}
|
||||
|
||||
virtual void CloseCounter() {
|
||||
/* Do Nothing */
|
||||
}
|
||||
|
||||
virtual bool HasPendingSync() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual void PresyncWrites() {
|
||||
/* Do Nothing */
|
||||
}
|
||||
|
||||
virtual void SyncWrites() {
|
||||
/* Do Nothing */
|
||||
}
|
||||
|
||||
virtual size_t WriteCounter(VAddr address, bool has_timestamp, u32 value,
|
||||
std::optional<u32> subreport = std::nullopt) = 0;
|
||||
|
||||
virtual bool HasUnsyncedQueries() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual void PushUnsyncedQueries() {
|
||||
/* Do Nothing */
|
||||
}
|
||||
|
||||
virtual void PopUnsyncedQueries() {
|
||||
/* Do Nothing */
|
||||
}
|
||||
|
||||
virtual void Free(size_t query_id) = 0;
|
||||
|
||||
size_t GetId() const {
|
||||
return id;
|
||||
}
|
||||
|
||||
u64 GetDependenceMask() const {
|
||||
return dependence_mask;
|
||||
}
|
||||
|
||||
u64 GetDependentMask() const {
|
||||
return dependence_mask;
|
||||
}
|
||||
|
||||
u64 GetAmmendValue() const {
|
||||
return ammend_value;
|
||||
}
|
||||
|
||||
void SetAccumulationValue(u64 new_value) {
|
||||
acumulation_value = new_value;
|
||||
}
|
||||
|
||||
protected:
|
||||
void MakeDependent(StreamerInterface* depend_on) {
|
||||
dependence_mask |= 1ULL << depend_on->id;
|
||||
depend_on->dependent_mask |= 1ULL << id;
|
||||
}
|
||||
|
||||
const size_t id;
|
||||
u64 dependence_mask;
|
||||
u64 dependent_mask;
|
||||
u64 ammend_value{};
|
||||
u64 acumulation_value{};
|
||||
};
|
||||
|
||||
template <typename QueryType>
|
||||
class SimpleStreamer : public StreamerInterface {
|
||||
public:
|
||||
explicit SimpleStreamer(size_t id_) : StreamerInterface{id_} {}
|
||||
virtual ~SimpleStreamer() = default;
|
||||
|
||||
protected:
|
||||
virtual QueryType* GetQuery(size_t query_id) override {
|
||||
if (query_id < slot_queries.size()) {
|
||||
return &slot_queries[query_id];
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
virtual void Free(size_t query_id) override {
|
||||
std::scoped_lock lk(guard);
|
||||
ReleaseQuery(query_id);
|
||||
}
|
||||
|
||||
template <typename... Args, typename = decltype(QueryType(std::declval<Args>()...))>
|
||||
size_t BuildQuery(Args&&... args) {
|
||||
std::scoped_lock lk(guard);
|
||||
if (!old_queries.empty()) {
|
||||
size_t new_id = old_queries.front();
|
||||
old_queries.pop_front();
|
||||
new (&slot_queries[new_id]) QueryType(std::forward<Args>(args)...);
|
||||
return new_id;
|
||||
}
|
||||
size_t new_id = slot_queries.size();
|
||||
slot_queries.emplace_back(std::forward<Args>(args)...);
|
||||
return new_id;
|
||||
}
|
||||
|
||||
void ReleaseQuery(size_t query_id) {
|
||||
|
||||
if (query_id < slot_queries.size()) {
|
||||
old_queries.push_back(query_id);
|
||||
return;
|
||||
}
|
||||
UNREACHABLE();
|
||||
}
|
||||
|
||||
std::mutex guard;
|
||||
std::deque<QueryType> slot_queries;
|
||||
std::deque<size_t> old_queries;
|
||||
};
|
||||
|
||||
} // namespace VideoCommon
|
@ -0,0 +1,74 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
|
||||
// SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common/common_funcs.h"
|
||||
#include "common/common_types.h"
|
||||
|
||||
namespace VideoCommon {
|
||||
|
||||
enum class QueryPropertiesFlags : u32 {
|
||||
HasTimeout = 1 << 0,
|
||||
IsAFence = 1 << 1,
|
||||
};
|
||||
DECLARE_ENUM_FLAG_OPERATORS(QueryPropertiesFlags)
|
||||
|
||||
// This should always be equivalent to maxwell3d Report Semaphore Reports
|
||||
enum class QueryType : u32 {
|
||||
Payload = 0, // "None" in docs, but confirmed via hardware to return the payload
|
||||
VerticesGenerated = 1,
|
||||
ZPassPixelCount = 2,
|
||||
PrimitivesGenerated = 3,
|
||||
AlphaBetaClocks = 4,
|
||||
VertexShaderInvocations = 5,
|
||||
StreamingPrimitivesNeededMinusSucceeded = 6,
|
||||
GeometryShaderInvocations = 7,
|
||||
GeometryShaderPrimitivesGenerated = 9,
|
||||
ZCullStats0 = 10,
|
||||
StreamingPrimitivesSucceeded = 11,
|
||||
ZCullStats1 = 12,
|
||||
StreamingPrimitivesNeeded = 13,
|
||||
ZCullStats2 = 14,
|
||||
ClipperInvocations = 15,
|
||||
ZCullStats3 = 16,
|
||||
ClipperPrimitivesGenerated = 17,
|
||||
VtgPrimitivesOut = 18,
|
||||
PixelShaderInvocations = 19,
|
||||
ZPassPixelCount64 = 21,
|
||||
IEEECleanColorTarget = 24,
|
||||
IEEECleanZetaTarget = 25,
|
||||
StreamingByteCount = 26,
|
||||
TessellationInitInvocations = 27,
|
||||
BoundingRectangle = 28,
|
||||
TessellationShaderInvocations = 29,
|
||||
TotalStreamingPrimitivesNeededMinusSucceeded = 30,
|
||||
TessellationShaderPrimitivesGenerated = 31,
|
||||
// max.
|
||||
MaxQueryTypes,
|
||||
};
|
||||
|
||||
// Comparison modes for Host Conditional Rendering
|
||||
enum class ComparisonMode : u32 {
|
||||
False = 0,
|
||||
True = 1,
|
||||
Conditional = 2,
|
||||
IfEqual = 3,
|
||||
IfNotEqual = 4,
|
||||
MaxComparisonMode,
|
||||
};
|
||||
|
||||
// Reduction ops.
|
||||
enum class ReductionOp : u32 {
|
||||
RedAdd = 0,
|
||||
RedMin = 1,
|
||||
RedMax = 2,
|
||||
RedInc = 3,
|
||||
RedDec = 4,
|
||||
RedAnd = 5,
|
||||
RedOr = 6,
|
||||
RedXor = 7,
|
||||
MaxReductionOp,
|
||||
};
|
||||
|
||||
} // namespace VideoCommon
|
File diff suppressed because it is too large
Load Diff
@ -1,101 +1,75 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
|
||||
// SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "common/common_types.h"
|
||||
#include "video_core/query_cache.h"
|
||||
#include "video_core/renderer_vulkan/vk_resource_pool.h"
|
||||
#include "video_core/vulkan_common/vulkan_wrapper.h"
|
||||
#include "video_core/query_cache/query_cache_base.h"
|
||||
#include "video_core/renderer_vulkan/vk_buffer_cache.h"
|
||||
|
||||
namespace VideoCore {
|
||||
class RasterizerInterface;
|
||||
}
|
||||
|
||||
namespace VideoCommon {
|
||||
class StreamerInterface;
|
||||
}
|
||||
|
||||
namespace Vulkan {
|
||||
|
||||
class CachedQuery;
|
||||
class Device;
|
||||
class HostCounter;
|
||||
class QueryCache;
|
||||
class Scheduler;
|
||||
class StagingBufferPool;
|
||||
|
||||
using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>;
|
||||
struct QueryCacheRuntimeImpl;
|
||||
|
||||
class QueryPool final : public ResourcePool {
|
||||
class QueryCacheRuntime {
|
||||
public:
|
||||
explicit QueryPool(const Device& device, Scheduler& scheduler, VideoCore::QueryType type);
|
||||
~QueryPool() override;
|
||||
explicit QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer,
|
||||
Core::Memory::Memory& cpu_memory_,
|
||||
Vulkan::BufferCache& buffer_cache_, const Device& device_,
|
||||
const MemoryAllocator& memory_allocator_, Scheduler& scheduler_,
|
||||
StagingBufferPool& staging_pool_,
|
||||
ComputePassDescriptorQueue& compute_pass_descriptor_queue,
|
||||
DescriptorPool& descriptor_pool);
|
||||
~QueryCacheRuntime();
|
||||
|
||||
std::pair<VkQueryPool, u32> Commit();
|
||||
template <typename SyncValuesType>
|
||||
void SyncValues(std::span<SyncValuesType> values, VkBuffer base_src_buffer = nullptr);
|
||||
|
||||
void Reserve(std::pair<VkQueryPool, u32> query);
|
||||
void Barriers(bool is_prebarrier);
|
||||
|
||||
protected:
|
||||
void Allocate(std::size_t begin, std::size_t end) override;
|
||||
void EndHostConditionalRendering();
|
||||
|
||||
private:
|
||||
static constexpr std::size_t GROW_STEP = 512;
|
||||
void PauseHostConditionalRendering();
|
||||
|
||||
const Device& device;
|
||||
const VideoCore::QueryType type;
|
||||
void ResumeHostConditionalRendering();
|
||||
|
||||
std::vector<vk::QueryPool> pools;
|
||||
std::vector<bool> usage;
|
||||
};
|
||||
bool HostConditionalRenderingCompareValue(VideoCommon::LookupData object_1, bool qc_dirty);
|
||||
|
||||
class QueryCache final
|
||||
: public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> {
|
||||
public:
|
||||
explicit QueryCache(VideoCore::RasterizerInterface& rasterizer_,
|
||||
Core::Memory::Memory& cpu_memory_, const Device& device_,
|
||||
Scheduler& scheduler_);
|
||||
~QueryCache();
|
||||
|
||||
std::pair<VkQueryPool, u32> AllocateQuery(VideoCore::QueryType type);
|
||||
bool HostConditionalRenderingCompareValues(VideoCommon::LookupData object_1,
|
||||
VideoCommon::LookupData object_2, bool qc_dirty,
|
||||
bool equal_check);
|
||||
|
||||
void Reserve(VideoCore::QueryType type, std::pair<VkQueryPool, u32> query);
|
||||
VideoCommon::StreamerInterface* GetStreamerInterface(VideoCommon::QueryType query_type);
|
||||
|
||||
const Device& GetDevice() const noexcept {
|
||||
return device;
|
||||
}
|
||||
void Bind3DEngine(Tegra::Engines::Maxwell3D* maxwell3d);
|
||||
|
||||
Scheduler& GetScheduler() const noexcept {
|
||||
return scheduler;
|
||||
}
|
||||
template <typename Func>
|
||||
void View3DRegs(Func&& func);
|
||||
|
||||
private:
|
||||
const Device& device;
|
||||
Scheduler& scheduler;
|
||||
std::array<QueryPool, VideoCore::NumQueryTypes> query_pools;
|
||||
void HostConditionalRenderingCompareValueImpl(VideoCommon::LookupData object, bool is_equal);
|
||||
void HostConditionalRenderingCompareBCImpl(VAddr address, bool is_equal);
|
||||
friend struct QueryCacheRuntimeImpl;
|
||||
std::unique_ptr<QueryCacheRuntimeImpl> impl;
|
||||
};
|
||||
|
||||
class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> {
|
||||
public:
|
||||
explicit HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> dependency_,
|
||||
VideoCore::QueryType type_);
|
||||
~HostCounter();
|
||||
|
||||
void EndQuery();
|
||||
|
||||
private:
|
||||
u64 BlockingQuery(bool async = false) const override;
|
||||
|
||||
QueryCache& cache;
|
||||
const VideoCore::QueryType type;
|
||||
const std::pair<VkQueryPool, u32> query;
|
||||
const u64 tick;
|
||||
struct QueryCacheParams {
|
||||
using RuntimeType = typename Vulkan::QueryCacheRuntime;
|
||||
};
|
||||
|
||||
class CachedQuery : public VideoCommon::CachedQueryBase<HostCounter> {
|
||||
public:
|
||||
explicit CachedQuery(QueryCache&, VideoCore::QueryType, VAddr cpu_addr_, u8* host_ptr_)
|
||||
: CachedQueryBase{cpu_addr_, host_ptr_} {}
|
||||
};
|
||||
using QueryCache = VideoCommon::QueryCacheBase<QueryCacheParams>;
|
||||
|
||||
} // namespace Vulkan
|
||||
|
Loading…
Reference in New Issue