146 changes: 104 additions & 42 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1519,6 +1519,8 @@ class vk_memory_logger {
 #endif // GGML_VULKAN_MEMORY_DEBUG
 
 static bool vk_perf_logger_enabled = false;
+static bool vk_perf_logger_concurrent = false;
+static bool vk_enable_sync_logger = false;
 // number of calls between perf logger prints
 static uint32_t vk_perf_logger_frequency = 1;
 
@@ -1569,14 +1571,14 @@ class vk_perf_logger {
         flops.clear();
     }
 
-    void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) {
+    std::string get_node_fusion_name(const ggml_tensor * node, const char *fusion_name, uint64_t *n_flops) {
+        *n_flops = 0;
         std::string fusion_str;
         if (fusion_name) {
             fusion_str = fusion_name + std::string(" ");
         }
         if (node->op == GGML_OP_UNARY) {
-            timings[fusion_str + ggml_unary_op_name(ggml_get_unary_op(node))].push_back(time);
-            return;
+            return fusion_str + ggml_unary_op_name(ggml_get_unary_op(node));
         }
         if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
             const uint64_t m = node->ne[0];
@@ -1598,9 +1600,8 @@
                 name += " batch=" + std::to_string(batch);
             }
             name = fusion_str + name;
-            timings[name].push_back(time);
-            flops[name].push_back(m * n * (k + (k - 1)) * batch);
-            return;
+            *n_flops = m * n * (k + (k - 1)) * batch;
+            return name;
         }
         if (node->op == GGML_OP_CONV_2D || node->op == GGML_OP_CONV_TRANSPOSE_2D) {
             std::string name = ggml_op_name(node->op);
@@ -1616,20 +1617,17 @@
             uint64_t size_M = Cout;
             uint64_t size_K = Cin * KW * KH;
             uint64_t size_N = N * OW * OH;
-            uint64_t n_flops = size_M * size_N * (size_K + (size_K - 1));
+            *n_flops = size_M * size_N * (size_K + (size_K - 1));
             name += " M=Cout=" + std::to_string(size_M) + ", K=Cin*KW*KH=" + std::to_string(size_K) +
                     ", N=N*OW*OH=" + std::to_string(size_N);
             name = fusion_str + name;
-            flops[name].push_back(n_flops);
-            timings[name].push_back(time);
-            return;
+            return name;
         }
         if (node->op == GGML_OP_RMS_NORM) {
             std::string name = ggml_op_name(node->op);
             name += "(" + std::to_string(node->ne[0]) + "," + std::to_string(node->ne[1]) + "," + std::to_string(node->ne[2]) + "," + std::to_string(node->ne[3]) + ")";
             name = fusion_str + name;
-            timings[name].push_back(time);
-            return;
+            return name;
         }
         if (node->op == GGML_OP_FLASH_ATTN_EXT) {
             const ggml_tensor * dst = node;
@@ -1645,20 +1643,46 @@
                 " k(" << k->ne[0] << "," << k->ne[1] << "," << k->ne[2] << "," << k->ne[3] << "), " <<
                 " v(" << v->ne[0] << "," << v->ne[1] << "," << v->ne[2] << "," << v->ne[3] << "), " <<
                 " m(" << (m?m->ne[0]:0) << "," << (m?m->ne[1]:0) << "," << (m?m->ne[2]:0) << "," << (m?m->ne[3]:0) << ")";
-            timings[name.str()].push_back(time);
-            return;
+            return name.str();
         }
         if (node->op == GGML_OP_TOP_K) {
             std::stringstream name;
             name << fusion_str;
             name << ggml_op_name(node->op) <<
                 " K=" << node->ne[0] <<
                 " (" << node->src[0]->ne[0] << "," << node->src[0]->ne[1] << "," << node->src[0]->ne[2] << "," << node->src[0]->ne[3] << ")";
-            timings[name.str()].push_back(time);
-            return;
+            return name.str();
         }
-        timings[fusion_str + ggml_op_name(node->op)].push_back(time);
+        return fusion_str + ggml_op_name(node->op);
     }
 
+    void log_timing(const ggml_tensor * node, const char *fusion_name, uint64_t time) {
+        uint64_t n_flops;
+        std::string name = get_node_fusion_name(node, fusion_name, &n_flops);
+        if (n_flops) {
+            flops[name].push_back(n_flops);
+        }
+        timings[name].push_back(time);
+    }
+
+    void log_timing(const std::vector<ggml_tensor *> &nodes, const std::vector<const char *> &names, uint64_t time) {
+        uint64_t total_flops = 0;
+        std::string name;
+        for (size_t n = 0; n < nodes.size(); ++n) {
+            uint64_t n_flops = 0;
+            name += get_node_fusion_name(nodes[n], names[n], &n_flops);
+            total_flops += n_flops;
+
+            if (n != nodes.size() - 1) {
+                name += ", ";
+            }
+        }
+        if (total_flops) {
+            flops[name].push_back(total_flops);
+        }
+        timings[name].push_back(time);
+    }
+
   private:
     std::map<std::string, std::vector<uint64_t>> timings;
     std::map<std::string, std::vector<uint64_t>> flops;
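Note on the refactor above: get_node_fusion_name now builds the per-op bucket key and reports the flop count through the n_flops out-parameter, so both log_timing overloads can share it. Below is a minimal standalone sketch of the aggregation pattern, with simplified types and hypothetical names (perf_logger_sketch, log, print are not the actual ggml code):

#include <cstdint>
#include <iostream>
#include <map>
#include <numeric>
#include <string>
#include <vector>

struct perf_logger_sketch {
    std::map<std::string, std::vector<uint64_t>> timings; // ns per invocation, keyed by op name
    std::map<std::string, std::vector<uint64_t>> flops;   // flop count per invocation

    void log(const std::string & name, uint64_t ns, uint64_t n_flops) {
        if (n_flops) {
            flops[name].push_back(n_flops); // only ops with a known flop count get a rate
        }
        timings[name].push_back(ns);
    }

    void print() const {
        for (const auto & kv : timings) {
            const auto & ts = kv.second;
            uint64_t total_ns = std::accumulate(ts.begin(), ts.end(), uint64_t{0});
            std::cout << kv.first << ": " << ts.size() << " calls, " << total_ns / ts.size() << " ns avg";
            auto it = flops.find(kv.first);
            if (it != flops.end()) {
                uint64_t total_fl = std::accumulate(it->second.begin(), it->second.end(), uint64_t{0});
                // 1 flop/ns == 1 GFLOP/s, so flops divided by ns gives GFLOP/s directly
                std::cout << ", " << double(total_fl) / double(total_ns) << " GFLOP/s";
            }
            std::cout << "\n";
        }
    }
};

int main() {
    perf_logger_sketch logger;
    // a 4096^3 matmul counted as m*n*(k + (k-1)) flops, matching the formula above
    logger.log("MUL_MAT m=4096 n=4096 k=4096", 14000000, 4096ull * 4096 * (4096 + 4095));
    logger.print();
}

Because timings and flops store per-call entries under the same key, dividing the accumulated flop count by the accumulated nanoseconds yields GFLOP/s directly.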
@@ -1721,7 +1745,9 @@ struct ggml_backend_vk_context {
     std::unique_ptr<vk_perf_logger> perf_logger;
     vk::QueryPool query_pool;
     std::vector<const char *> query_fusion_names;
+    std::vector<int> query_fusion_node_count;
     std::vector<ggml_tensor *> query_nodes;
+    std::vector<int> query_node_idx;
     int32_t num_queries {};
     int32_t query_idx {};
 };
@@ -5173,6 +5199,8 @@ static void ggml_vk_instance_init() {
     }
 
     vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
+    vk_perf_logger_concurrent = getenv("GGML_VK_PERF_LOGGER_CONCURRENT") != nullptr;
+    vk_enable_sync_logger = getenv("GGML_VK_SYNC_LOGGER") != nullptr;
     const char* GGML_VK_PERF_LOGGER_FREQUENCY = getenv("GGML_VK_PERF_LOGGER_FREQUENCY");
 
     if (GGML_VK_PERF_LOGGER_FREQUENCY != nullptr) {
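The logger behavior is driven entirely by environment variables, and the collapsed region below this hunk parses GGML_VK_PERF_LOGGER_FREQUENCY into vk_perf_logger_frequency. A standalone sketch of the same flag-reading pattern follows; the frequency parsing shown here is an assumption, since that code is collapsed in this view:

#include <cstdint>
#include <cstdio>
#include <cstdlib>

int main() {
    bool perf_logger   = getenv("GGML_VK_PERF_LOGGER") != nullptr;
    bool concurrent    = getenv("GGML_VK_PERF_LOGGER_CONCURRENT") != nullptr;
    bool sync_logger   = getenv("GGML_VK_SYNC_LOGGER") != nullptr;
    uint32_t frequency = 1; // prints every N graph computations
    if (const char * s = getenv("GGML_VK_PERF_LOGGER_FREQUENCY")) {
        frequency = (uint32_t) atoi(s); // assumed: the real code may validate differently
    }
    printf("perf=%d concurrent=%d sync=%d freq=%u\n", perf_logger, concurrent, sync_logger, frequency);
}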
@@ -11730,15 +11758,18 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
         }
     }
 
-#define ENABLE_SYNC_LOGGING 0
-
     if (need_sync) {
-#if ENABLE_SYNC_LOGGING
-        std::cerr << "sync" << std::endl;
-#endif
+        if (vk_enable_sync_logger) {
+            std::cerr << "sync" << std::endl;
+        }
         ctx->unsynced_nodes_written.clear();
         ctx->unsynced_nodes_read.clear();
         ggml_vk_sync_buffers(ctx, compute_ctx);
+
+        if (vk_perf_logger_enabled && vk_perf_logger_concurrent) {
+            ctx->query_node_idx[ctx->query_idx] = node_idx;
+            compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+        }
     }
     // Add all fused nodes to the unsynchronized lists.
     for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
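In concurrent mode a timestamp is written only when a synchronization actually occurs, so each timestamp interval spans every node recorded since the previous sync, and query_node_idx remembers at which graph node each sync happened. A standalone sketch of how such intervals map back to node groups, using hypothetical data and simplified types:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // Hypothetical data: query_node_idx[q] records the graph node index at
    // which sync q happened; timestamps[q] is the raw tick value written there.
    std::vector<int>      query_node_idx = { 0, 3, 7 };
    std::vector<uint64_t> timestamps     = { 100, 400, 950 }; // assumed tick values

    int prev_node_idx = 0;
    for (size_t q = 1; q < timestamps.size(); ++q) {
        int cur_node_idx = query_node_idx[q];
        // interval q covers nodes [prev_node_idx, cur_node_idx), which may
        // have executed concurrently on the GPU
        std::printf("nodes [%d, %d) ran in %llu ticks\n",
                    prev_node_idx, cur_node_idx,
                    (unsigned long long) (timestamps[q] - timestamps[q - 1]));
        prev_node_idx = cur_node_idx;
    }
    return 0;
}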
@@ -11755,20 +11786,20 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgr
                 }
             }
         }
-#if ENABLE_SYNC_LOGGING
-    for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
-        auto *n = cgraph->nodes[node_idx + i];
-        std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name;
-        if (n->op == GGML_OP_GLU) {
-            std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " ";
-        }
-        if (n->op == GGML_OP_ROPE) {
-            const int mode = ((const int32_t *) n->op_params)[2];
-            std::cerr << " rope mode: " << mode;
-        }
-        std::cerr << std::endl;
-    }
-#endif
+    if (vk_enable_sync_logger) {
+        for (int i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
+            auto *n = cgraph->nodes[node_idx + i];
+            std::cerr << node_idx + i << " " << ggml_op_name(n->op) << " " << n->name;
+            if (n->op == GGML_OP_GLU) {
+                std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " ";
+            }
+            if (n->op == GGML_OP_ROPE) {
+                const int mode = ((const int32_t *) n->op_params)[2];
+                std::cerr << " rope mode: " << mode;
+            }
+            std::cerr << std::endl;
+        }
+    }
 
     switch (node->op) {
     case GGML_OP_REPEAT:
@@ -13045,12 +13076,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
             ctx->query_pool = ctx->device->device.createQueryPool(query_create_info);
             ctx->num_queries = query_create_info.queryCount;
             ctx->query_fusion_names.resize(ctx->num_queries);
+            ctx->query_fusion_node_count.resize(ctx->num_queries);
             ctx->query_nodes.resize(ctx->num_queries);
+            ctx->query_node_idx.resize(ctx->num_queries);
         }
 
         ctx->device->device.resetQueryPool(ctx->query_pool, 0, cgraph->n_nodes+1);
         std::fill(ctx->query_fusion_names.begin(), ctx->query_fusion_names.end(), nullptr);
+        std::fill(ctx->query_fusion_node_count.begin(), ctx->query_fusion_node_count.end(), 0);
         std::fill(ctx->query_nodes.begin(), ctx->query_nodes.end(), nullptr);
+        std::fill(ctx->query_node_idx.begin(), ctx->query_node_idx.end(), 0);
 
         GGML_ASSERT(ctx->compute_ctx.expired());
         compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
@@ -13179,9 +13214,16 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
             } else {
                 compute_ctx = ctx->compute_ctx.lock();
             }
-            ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
-            ctx->query_fusion_names[ctx->query_idx] = fusion_string;
-            compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+            if (!vk_perf_logger_concurrent) {
+                // track a single node/fusion for the current query
+                ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i];
+                ctx->query_fusion_names[ctx->query_idx] = fusion_string;
+                compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++);
+            } else {
+                // track a fusion string and number of fused ops for the current node_idx
+                ctx->query_fusion_names[i] = fusion_string;
+                ctx->query_fusion_node_count[i] = ctx->num_additional_fused_ops;
+            }
         }
 
         if (enqueued) {
@@ -13223,12 +13265,32 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         // Get the results and pass them to the logger
         std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
         VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->query_pool, 0, ctx->query_idx, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
-        for (int i = 1; i < ctx->query_idx; i++) {
-            auto node = ctx->query_nodes[i];
-            auto name = ctx->query_fusion_names[i];
-            ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
+        if (!vk_perf_logger_concurrent) {
+            // Log each op separately
+            for (int i = 1; i < ctx->query_idx; i++) {
+                auto node = ctx->query_nodes[i];
+                auto name = ctx->query_fusion_names[i];
+                ctx->perf_logger->log_timing(node, name, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
+            }
+        } else {
+            // Log each group of nodes
+            int prev_node_idx = 0;
+            for (int i = 1; i < ctx->query_idx; i++) {
+                auto cur_node_idx = ctx->query_node_idx[i];
+                std::vector<ggml_tensor *> nodes;
+                std::vector<const char *> names;
+                for (int node_idx = prev_node_idx; node_idx < cur_node_idx; ++node_idx) {
+                    if (ggml_op_is_empty(cgraph->nodes[node_idx]->op)) {
+                        continue;
+                    }
+                    nodes.push_back(cgraph->nodes[node_idx]);
+                    names.push_back(ctx->query_fusion_names[node_idx]);
+                    node_idx += ctx->query_fusion_node_count[node_idx];
+                }
+                prev_node_idx = cur_node_idx;
+                ctx->perf_logger->log_timing(nodes, names, uint64_t((timestamps[i] - timestamps[i-1]) * ctx->device->properties.limits.timestampPeriod));
+            }
         }
 
         ctx->perf_logger->print_timings();
     }
 
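Both logging paths above convert raw timestamp ticks to nanoseconds by multiplying with the device's timestampPeriod (VkPhysicalDeviceLimits::timestampPeriod, nanoseconds per tick). A worked standalone example of that conversion, with an assumed timestampPeriod value:

#include <cstdint>
#include <cstdio>

int main() {
    // timestampPeriod comes from VkPhysicalDeviceLimits and is device dependent;
    // 52.08 ns/tick is an assumed value for illustration only.
    float timestampPeriod = 52.08f;
    uint64_t t0 = 1000000, t1 = 1250000; // raw 64-bit query results, in ticks
    uint64_t elapsed_ns = (uint64_t) ((t1 - t0) * timestampPeriod);
    printf("elapsed: %llu ns (%.3f ms)\n", (unsigned long long) elapsed_ns, elapsed_ns / 1e6);
    return 0;
}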