From 12d11ba5831c011aef34de57f9412f4698d0dfe1 Mon Sep 17 00:00:00 2001
From: Jeff Bolz
Date: Wed, 10 Dec 2025 18:26:52 -0600
Subject: [PATCH] vulkan: Add perf logger mode with concurrency

This implements a variation of the perf logger where rather than timing each
operation individually with effectively a barrier in between, we put the
timing boundaries where we already synchronize and time the groups of work
that normally overlap. This can be useful to help understand whether
individual operations need to be optimized, or if the group is already
running efficiently.

GGML_VK_PERF_LOGGER_CONCURRENT=1 enables the new mode (when
GGML_VK_PERF_LOGGER is also set).

GGML_VK_SYNC_LOGGER=1 replaces the ENABLE_SYNC_LOGGING compile time switch.
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 146 +++++++++++++++++++--------
 1 file changed, 104 insertions(+), 42 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index c6f5809ccd7..305d5f1e648 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1519,6 +1519,8 @@ class vk_memory_logger {
 #endif // GGML_VULKAN_MEMORY_DEBUG
 
 static bool vk_perf_logger_enabled = false;
+static bool vk_perf_logger_concurrent = false;
+static bool vk_enable_sync_logger = false;
 
 // number of calls between perf logger prints
 static uint32_t vk_perf_logger_frequency = 1;
@@ -1569,14 +1571,14 @@ class vk_perf_logger {
         flops.clear();
     }
 
-    void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) {
+    std::string get_node_fusion_name(const ggml_tensor * node, const char *fusion_name, uint64_t *n_flops) {
+        *n_flops = 0;
         std::string fusion_str;
         if (fusion_name) {
             fusion_str = fusion_name + std::string(" ");
         }
         if (node->op == GGML_OP_UNARY) {
-            timings[fusion_str + ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time);
-            return;
+            return fusion_str + ggml_unary_op_name(ggml_get_unary_op(node));
         }
         if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
             const uint64_t m = node->ne[0];
@@ -1598,9 +1600,8 @@ class vk_perf_logger {
                 name += " batch=" + std::to_string(batch);
             }
             name = fusion_str + name;
-            timings[name].push_back(time);
-            flops[name].push_back(m * n * (k + (k - 1)) * batch);
-            return;
+            *n_flops = m * n * (k + (k - 1)) * batch;
+            return name;
         }
         if (node->op == GGML_OP_CONV_2D || node->op == GGML_OP_CONV_TRANSPOSE_2D) {
             std::string name = ggml_op_name(node->op);
@@ -1616,20 +1617,17 @@ class vk_perf_logger {
             uint64_t size_M = Cout;
             uint64_t size_K = Cin * KW * KH;
             uint64_t size_N = N * OW * OH;
-            uint64_t n_flops = size_M * size_N * (size_K + (size_K - 1));
+            *n_flops = size_M * size_N * (size_K + (size_K - 1));
             name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) +
                     ", N=N*OW*OH=" + std::to_string(size_N);
             name = fusion_str + name;
-            flops[name].push_back(n_flops);
-            timings[name].push_back(time);
-            return;
+            return name;
         }
         if (node->op == GGML_OP_RMS_NORM) {
             std::string name = ggml_op_name(node->op);
             name += "(" + std::to_string(node->ne[0]) + "," + std::to_string(node->ne[1]) + "," + std::to_string(node->ne[2]) + "," + std::to_string(node->ne[3]) + ")";
             name = fusion_str + name;
-            timings[name].push_back(time);
-            return;
+            return name;
         }
         if (node->op == GGML_OP_FLASH_ATTN_EXT) {
             const ggml_tensor * dst = node;
@@ -1645,8 +1643,7 @@ class vk_perf_logger {
                 " k(" << k->ne[0] << "," << k->ne[1] << "," << k->ne[2] << "," << k->ne[3] << "), " <<
                 " v(" << v->ne[0] << "," << v->ne[1] << "," << v->ne[2] << "," << v->ne[3] << "), " <<
                 " m(" << (m?m->ne[0]:0) << "," << (m?m->ne[1]:0) << "," << (m?m->ne[2]:0) << "," << (m?m->ne[3]:0) << ")";
-            timings[name.str()].push_back(time);
-            return;
+            return name.str();
         }
         if (node->op == GGML_OP_TOP_K) {
             std::stringstream name;
@@ -1654,11 +1651,38 @@ class vk_perf_logger {
             name << ggml_op_name(node->op) << " K=" << node->ne[0] <<
                 " (" << node->src[0]->ne[0] << "," << node->src[0]->ne[1] << "," <<
                 node->src[0]->ne[2] << "," << node->src[0]->ne[3] << ")";
-            timings[name.str()].push_back(time);
-            return;
+            return name.str();
+        }
+        return fusion_str + ggml_op_name(node->op);
+    }
+
+    void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) {
+        uint64_t n_flops;
+        std::string name = get_node_fusion_name(node, fusion_name, &n_flops);
+        if (n_flops) {
+            flops[name].push_back(n_flops);
+        }
+        timings[name].push_back(time);
+    }
+
+    void log_timing(const std::vector<const ggml_tensor *> &nodes, const std::vector<const char *> &names, uint64_t time) {
+        uint64_t total_flops = 0;
+        std::string name;
+        for (size_t n = 0; n < nodes.size(); ++n) {
+            uint64_t n_flops = 0;
+            name += get_node_fusion_name(nodes[n], names[n], &n_flops);
+            total_flops += n_flops;
+
+            if (n != nodes.size() - 1) {
+                name += ", ";
+            }
         }
-        timings[fusion_str + ggml_op_name(node->op)].push_back(time);
+        if (total_flops) {
+            flops[name].push_back(total_flops);
+        }
+        timings[name].push_back(time);
     }
+
 private:
     std::map<std::string, std::vector<uint64_t>> timings;
     std::map<std::string, std::vector<uint64_t>> flops;
@@ -1721,7 +1745,9 @@ struct ggml_backend_vk_context {
     std::unique_ptr<vk_perf_logger> perf_logger;
     vk::QueryPool query_pool;
     std::vector<const char *> query_fusion_names;
+    std::vector<int> query_fusion_node_count;
    std::vector<ggml_tensor *> query_nodes;
+    std::vector<int> query_node_idx;
     int32_t num_queries {};
     int32_t query_idx {};
 };
@@ -5173,6 +5199,8 @@
     }
 
     vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
+    vk_perf_logger_concurrent = getenv("GGML_VK_PERF_LOGGER_CONCURRENT") != nullptr;
+    vk_enable_sync_logger = getenv("GGML_VK_SYNC_LOGGER") != nullptr;
 
     const char* GGML_VK_PERF_LOGGER_FREQUENCY = getenv("GGML_VK_PERF_LOGGER_FREQUENCY");
     if (GGML_VK_PERF_LOGGER_FREQUENCY != nullptr) {
@@ -11730,15 +11758,18 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgr
         }
     }
 
-#define ENABLE_SYNC_LOGGING 0
-
     if (need_sync) {
-#if ENABLE_SYNC_LOGGING
-        std::cerr << "sync" << std::endl;
-#endif
+        if (vk_enable_sync_logger) {
+            std::cerr << "sync" << std::endl;
+        }
         ctx->unsynced_nodes_written.clear();
         ctx->unsynced_nodes_read.clear();
         ggml_vk_sync_buffers(ctx, compute_ctx);
+
+        if (vk_perf_logger_enabled && vk_perf_logger_concurrent) {
+            ctx->query_node_idx[ctx->query_idx] = node_idx;
+            compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+        }
     }
     // Add all fused nodes to the unsynchronized lists.
     for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
@@ -11755,20 +11786,20 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgr
             }
         }
     }
-#if ENABLE_SYNC_LOGGING
-    for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
-        auto *n = cgraph->nodes[node_idx + i];
-        std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name;
-        if (n->op == GGML_OP_GLU) {
-            std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " ";
"split" : "single") << " "; - } - if (n->op == GGML_OP_ROPE) { - const int mode = ((const int32_t *) n->op_params)[2]; - std::cerr << " rope mode: " << mode; + if (vk_enable_sync_logger) { + for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) { + auto *n = cgraph->nodes[node_idx + i]; + std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name; + if (n->op == GGML_OP_GLU) { + std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " "; + } + if (n->op == GGML_OP_ROPE) { + const int mode = ((const int32_t *) n->op_params)[2]; + std::cerr << " rope mode: " << mode; + } + std::cerr << std::endl; } - std::cerr << std::endl; } -#endif switch (node->op) { case GGML_OP_REPEAT: @@ -13045,12 +13076,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg ctx->query_pool = ctx->device->device.createQueryPool(query_create_info); ctx->num_queries = query_create_info.queryCount; ctx->query_fusion_names.resize(ctx->num_queries); + ctx->query_fusion_node_count.resize(ctx->num_queries); ctx->query_nodes.resize(ctx->num_queries); + ctx->query_node_idx.resize(ctx->num_queries); } ctx->device->device.resetQueryPool(ctx->query_pool, 0, cgraph->n_nodes+1); std::fill(ctx->query_fusion_names.begin(), ctx->query_fusion_names.end(), nullptr); + std::fill(ctx->query_fusion_node_count.begin(), ctx->query_fusion_node_count.end(), 0); std::fill(ctx->query_nodes.begin(), ctx->query_nodes.end(), nullptr); + std::fill(ctx->query_node_idx.begin(), ctx->query_node_idx.end(), 0); GGML_ASSERT(ctx->compute_ctx.expired()); compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); @@ -13179,9 +13214,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg } else { compute_ctx = ctx->compute_ctx.lock(); } - ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i]; - ctx->query_fusion_names[ctx->query_idx] = fusion_string; - compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); + if (!vk_perf_logger_concurrent) { + // track a single node/fusion for the current query + ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i]; + ctx->query_fusion_names[ctx->query_idx] = fusion_string; + compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); + } else { + // track a fusion string and number of fused ops for the current node_idx + ctx->query_fusion_names[i] = fusion_string; + ctx->query_fusion_node_count[i] = ctx->num_additional_fused_ops; + } } if (enqueued) { @@ -13223,12 +13265,32 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg // Get the results and pass them to the logger std::vector timestamps(cgraph->n_nodes + 1); VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results"); - for (int i = 1; i < ctx->query_idx; i++) { - auto node = ctx->query_nodes[i]; - auto name = ctx->query_fusion_names[i]; - ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod)); + if (!vk_perf_logger_concurrent) { + // Log each op separately + for (int i = 1; i < ctx->query_idx; i++) { + auto node = ctx->query_nodes[i]; + auto name = ctx->query_fusion_names[i]; + ctx->perf_logger->log_timing(node, name, 
+            }
+        } else {
+            // Log each group of nodes
+            int prev_node_idx = 0;
+            for (int i = 1; i < ctx->query_idx; i++) {
+                auto cur_node_idx = ctx->query_node_idx[i];
+                std::vector<const ggml_tensor *> nodes;
+                std::vector<const char *> names;
+                for (int node_idx = prev_node_idx; node_idx < cur_node_idx; ++node_idx) {
+                    if (ggml_op_is_empty(cgraph->nodes[node_idx]->op)) {
+                        continue;
+                    }
+                    nodes.push_back(cgraph->nodes[node_idx]);
+                    names.push_back(ctx->query_fusion_names[node_idx]);
+                    node_idx += ctx->query_fusion_node_count[node_idx];
+                }
+                prev_node_idx = cur_node_idx;
+                ctx->perf_logger->log_timing(nodes, names, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
+            }
         }
-
         ctx->perf_logger->print_timings();
     }
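
Usage note (not part of the patch): the new mode is selected purely through environment variables that the backend reads during initialization, so it can also be enabled from the host process before the Vulkan backend is first created. The sketch below is a minimal illustration under that assumption; the helper name and the use of POSIX setenv() are illustrative only, and in practice the variables are normally just set in the shell that launches the binary (e.g. GGML_VK_PERF_LOGGER=1 GGML_VK_PERF_LOGGER_CONCURRENT=1).

    // Minimal sketch, not part of the patch: enable the perf logger and its
    // concurrent mode from inside the host process. GGML_VK_PERF_LOGGER,
    // GGML_VK_PERF_LOGGER_CONCURRENT and GGML_VK_SYNC_LOGGER are the variables
    // the patch reads via getenv(); enable_vk_perf_logging() is a hypothetical name.
    #include <stdlib.h>

    static void enable_vk_perf_logging(bool concurrent, bool log_syncs) {
        setenv("GGML_VK_PERF_LOGGER", "1", 1);                 // base perf logger
        if (concurrent) {
            setenv("GGML_VK_PERF_LOGGER_CONCURRENT", "1", 1);  // time groups of work between syncs
        }
        if (log_syncs) {
            setenv("GGML_VK_SYNC_LOGGER", "1", 1);             // log each sync point to stderr
        }
    }

The variables must be set before the Vulkan backend is initialized, since they are only read once during instance setup.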