Last active
December 12, 2021 06:01
-
-
Save cdwfs/4222ca09cb259f8dd50f7f2cf7d09179 to your computer and use it in GitHub Desktop.
Vulkan function to get a pair of timestamps (one CPU, one GPU) corresponding to (very nearly) the same point in absolute wall time.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
struct CpuGpuTimestampInfo { | |
VkDevice device; | |
VkQueue queue; | |
uint32_t queue_family_index; | |
float timestamp_period; // Copy from VkPhysicalDeviceLimits::timestampPeriod | |
uint32_t timestamp_valid_bits; // Copy from VkQueueFamilyProperties::timestampValidBits | |
}; | |
// Captures one host timestamp and one device timestamp that correspond to
// (very nearly) the same point in wall time.
//
// A command buffer is submitted that waits on a host-signaled event, writes a
// device timestamp, waits on a second host-signaled event and writes another
// device timestamp. The host signals each event and reads
// std::chrono::high_resolution_clock immediately afterwards. The first
// host/device pair is returned; the second pair is only used to print how much
// the elapsed time differs between the two clocks.
//
// Parameters:
//   info         - device, queue and timestamp properties to use. Must not be null.
//   out_cpu_time - receives the host time taken right after the first event was set.
//   out_gpu_time - receives the raw device tick count of the first timestamp,
//                  masked to info->timestamp_valid_bits.
//
// Returns VK_ERROR_FEATURE_NOT_PRESENT when info->timestamp_valid_bits is 0,
// the first failing Vulkan result otherwise, or VK_SUCCESS. The outputs are
// written only on success.
//
// Notes:
//   - Blocks the calling thread for roughly two seconds (two one-second sleeps).
//   - Prints the elapsed host and device times to stdout via printf.
//   - All temporary Vulkan objects are destroyed on every return path.
VkResult GetCpuGpuTimestamp(const CpuGpuTimestampInfo *info,
                            std::chrono::high_resolution_clock::time_point *out_cpu_time, uint64_t *out_gpu_time) {
  if (info->timestamp_valid_bits == 0) {
    return VK_ERROR_FEATURE_NOT_PRESENT;  // timestamps not supported on the specified queue
  }

  // Owns every temporary object so that early returns cannot leak them.
  struct ScopedObjects {
    VkDevice device;
    VkEvent event0 = VK_NULL_HANDLE;
    VkEvent event1 = VK_NULL_HANDLE;
    VkQueryPool qpool = VK_NULL_HANDLE;
    VkCommandPool cpool = VK_NULL_HANDLE;
    VkFence fence = VK_NULL_HANDLE;
    bool gpu_work_pending = false;  // true between a successful submit and a successful fence wait

    explicit ScopedObjects(VkDevice d) : device(d) {}
    ScopedObjects(const ScopedObjects &) = delete;
    ScopedObjects &operator=(const ScopedObjects &) = delete;

    ~ScopedObjects() {
      if (gpu_work_pending) {
        // Error path after submission: unblock the command buffer (best effort)
        // and give it a bounded amount of time to finish.
        const uint64_t kDrainTimeoutNs = 5000000000ULL;
        (void)vkSetEvent(device, event0);
        (void)vkSetEvent(device, event1);
        const VkResult drained = vkWaitForFences(device, 1, &fence, VK_TRUE, kDrainTimeoutNs);
        if (drained != VK_SUCCESS && drained != VK_ERROR_DEVICE_LOST) {
          // The objects may still be in use by the device. Leaking them is the
          // lesser evil compared to destroying in-use objects.
          return;
        }
      }
      // vkDestroy* accept VK_NULL_HANDLE, so objects that were never created are skipped.
      // Destroying the command pool also frees the command buffer allocated from it.
      vkDestroyCommandPool(device, cpool, nullptr);
      vkDestroyFence(device, fence, nullptr);
      vkDestroyQueryPool(device, qpool, nullptr);
      vkDestroyEvent(device, event1, nullptr);
      vkDestroyEvent(device, event0, nullptr);
    }
  };
  ScopedObjects objs(info->device);

  const uint32_t kQueryCount = 2;

  VkEventCreateInfo event_ci = {VK_STRUCTURE_TYPE_EVENT_CREATE_INFO};
  VkResult result = vkCreateEvent(info->device, &event_ci, nullptr, &objs.event0);
  if (result != VK_SUCCESS) {
    return result;
  }
  result = vkCreateEvent(info->device, &event_ci, nullptr, &objs.event1);
  if (result != VK_SUCCESS) {
    return result;
  }

  VkQueryPoolCreateInfo qpool_ci = {VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO};
  qpool_ci.queryType = VK_QUERY_TYPE_TIMESTAMP;
  qpool_ci.queryCount = kQueryCount;
  result = vkCreateQueryPool(info->device, &qpool_ci, nullptr, &objs.qpool);
  if (result != VK_SUCCESS) {
    return result;
  }

  VkCommandPoolCreateInfo cpool_ci = {VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO};
  cpool_ci.queueFamilyIndex = info->queue_family_index;
  cpool_ci.flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT;
  result = vkCreateCommandPool(info->device, &cpool_ci, nullptr, &objs.cpool);
  if (result != VK_SUCCESS) {
    return result;
  }

  VkFenceCreateInfo fence_ci = {VK_STRUCTURE_TYPE_FENCE_CREATE_INFO};
  result = vkCreateFence(info->device, &fence_ci, nullptr, &objs.fence);
  if (result != VK_SUCCESS) {
    return result;
  }

  VkCommandBufferAllocateInfo cb_alloc_info = {VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO};
  cb_alloc_info.commandPool = objs.cpool;
  cb_alloc_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
  cb_alloc_info.commandBufferCount = 1;
  VkCommandBuffer cb = VK_NULL_HANDLE;
  result = vkAllocateCommandBuffers(info->device, &cb_alloc_info, &cb);
  if (result != VK_SUCCESS) {
    return result;
  }

  VkCommandBufferBeginInfo cb_begin_info = {VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO};
  cb_begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
  result = vkBeginCommandBuffer(cb, &cb_begin_info);
  if (result != VK_SUCCESS) {
    return result;
  }
  // Queries of a freshly created pool must be reset before they are written.
  vkCmdResetQueryPool(cb, objs.qpool, 0, kQueryCount);
  vkCmdWaitEvents(cb, 1, &objs.event0, VK_PIPELINE_STAGE_HOST_BIT,
                  VK_PIPELINE_STAGE_HOST_BIT | VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, nullptr, 0, nullptr, 0, nullptr);
  vkCmdWriteTimestamp(cb, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, objs.qpool, 0);
  vkCmdWaitEvents(cb, 1, &objs.event1, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, nullptr, 0,
                  nullptr, 0, nullptr);
  vkCmdWriteTimestamp(cb, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, objs.qpool, 1);
  result = vkEndCommandBuffer(cb);
  if (result != VK_SUCCESS) {
    return result;
  }

  VkSubmitInfo submit_info = {VK_STRUCTURE_TYPE_SUBMIT_INFO};
  submit_info.commandBufferCount = 1;
  submit_info.pCommandBuffers = &cb;
  result = vkQueueSubmit(info->queue, 1, &submit_info, objs.fence);
  if (result != VK_SUCCESS) {
    return result;
  }
  objs.gpu_work_pending = true;

  // Wait until we're reasonably sure the GPU is sitting at the
  // first WaitEvents() call before we signal the event.
  std::this_thread::sleep_for(std::chrono::seconds(1));

  std::chrono::high_resolution_clock::time_point host_times[2];
  result = vkSetEvent(info->device, objs.event0);
  if (result != VK_SUCCESS) {
    return result;
  }
  host_times[0] = std::chrono::high_resolution_clock::now();

  // Same reasoning as above, this time for the second WaitEvents() call.
  std::this_thread::sleep_for(std::chrono::seconds(1));
  result = vkSetEvent(info->device, objs.event1);
  if (result != VK_SUCCESS) {
    return result;
  }
  host_times[1] = std::chrono::high_resolution_clock::now();

  result = vkWaitForFences(info->device, 1, &objs.fence, VK_TRUE, UINT64_MAX);
  if (result != VK_SUCCESS) {
    return result;
  }
  objs.gpu_work_pending = false;

  uint64_t raw_device_timestamps[2] = {0, 0};
  result = vkGetQueryPoolResults(info->device, objs.qpool, 0, kQueryCount, sizeof(raw_device_timestamps),
                                 raw_device_timestamps, sizeof(uint64_t),
                                 VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
  if (result != VK_SUCCESS) {
    return result;
  }

  // Only the low timestamp_valid_bits bits of a timestamp are meaningful.
  // The >= guard avoids an undefined shift by 64 or more.
  const uint64_t timestamp_mask =
      (info->timestamp_valid_bits >= 64) ? UINT64_MAX : ((1ULL << info->timestamp_valid_bits) - 1);

  // raw_device_timestamps[0] and host_times[0] now correspond to the same time point in two different time domains.
  *out_cpu_time = host_times[0];
  *out_gpu_time = raw_device_timestamps[0] & timestamp_mask;

  // Everything beyond here is just approximating the error between the two timestamps, as we know that
  // delta(host_times[1], host_times[0]) should theoretically be identical to
  // delta(raw_device_timestamps[1], raw_device_timestamps[0]).
  // The device delta is taken in integer ticks modulo the valid-bit range, so it stays correct if the
  // counter wrapped once between the two timestamps and does not lose precision on large tick values.
  const uint64_t device_ticks = (raw_device_timestamps[1] - raw_device_timestamps[0]) & timestamp_mask;
  const double seconds_per_tick = static_cast<double>(info->timestamp_period) / 1e9;
  const double device_secs = static_cast<double>(device_ticks) * seconds_per_tick;
  const auto host_secs =
      static_cast<double>(std::chrono::duration_cast<std::chrono::nanoseconds>(host_times[1] - host_times[0]).count()) /
      1e9;
  printf("%.9f seconds elapsed on host\n", host_secs);
  printf("%.9f seconds elapsed on device\n", device_secs);
  return VK_SUCCESS;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment