Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,8 @@ inline void execute_graph(handler &CGH,
inline void execute_graph(queue Q, command_graph<graph_state::executable> &G,
const sycl::detail::code_location &CodeLoc =
sycl::detail::code_location::current()) {
submit(std::move(Q), [&](handler &CGH) { execute_graph(CGH, G); }, CodeLoc);
submit_graph_direct_without_event_impl(std::move(Q), G, /*DepEvents*/ {},
CodeLoc);
}

} // namespace ext::oneapi::experimental
Expand Down
33 changes: 20 additions & 13 deletions sycl/include/sycl/queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,20 @@ void __SYCL_EXPORT submit_kernel_direct_without_event_impl(
const detail::KernelPropertyHolderStructTy &Props,
const detail::code_location &CodeLoc, bool IsTopCodeLoc);

event __SYCL_EXPORT submit_graph_direct_with_event_impl(
const queue &Queue,
ext::oneapi::experimental::command_graph<
ext::oneapi::experimental::graph_state::executable> &G,
sycl::span<const event> DepEvents,
const detail::code_location &CodeLoc = detail::code_location::current());

void __SYCL_EXPORT submit_graph_direct_without_event_impl(
const queue &Queue,
ext::oneapi::experimental::command_graph<
ext::oneapi::experimental::graph_state::executable> &G,
sycl::span<const event> DepEvents,
const detail::code_location &CodeLoc = detail::code_location::current());

namespace detail {
class queue_impl;

Expand Down Expand Up @@ -3651,7 +3665,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
ext::oneapi::experimental::graph_state::executable>
Graph,
const detail::code_location &CodeLoc = detail::code_location::current()) {
return submit([&](handler &CGH) { CGH.ext_oneapi_graph(Graph); }, CodeLoc);
return submit_graph_direct_with_event_impl(*this, Graph, /*DepEvents*/ {},
CodeLoc);
}

/// Shortcut for executing a graph of commands with a single dependency.
Expand All @@ -3666,12 +3681,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
Graph,
event DepEvent,
const detail::code_location &CodeLoc = detail::code_location::current()) {
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvent);
CGH.ext_oneapi_graph(Graph);
},
CodeLoc);
return submit_graph_direct_with_event_impl(
*this, Graph, sycl::span<const event>(&DepEvent, 1), CodeLoc);
}

/// Shortcut for executing a graph of commands with multiple dependencies.
Expand All @@ -3686,12 +3697,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
Graph,
const std::vector<event> &DepEvents,
const detail::code_location &CodeLoc = detail::code_location::current()) {
return submit(
[&](handler &CGH) {
CGH.depends_on(DepEvents);
CGH.ext_oneapi_graph(Graph);
},
CodeLoc);
return submit_graph_direct_with_event_impl(*this, Graph, DepEvents,
CodeLoc);
}

/// Provides a hint to the runtime that previously issued commands to this
Expand Down
10 changes: 4 additions & 6 deletions sycl/source/detail/graph/graph_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1208,7 +1208,7 @@ exec_graph_impl::enqueuePartitions(sycl::detail::queue_impl &Queue,
return SignalEvent;
}

EventImplPtr
std::pair<EventImplPtr, bool>
exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
sycl::detail::CG::StorageInitHelper CGData,
bool EventNeeded) {
Expand All @@ -1217,19 +1217,17 @@ exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
cleanupExecutionEvents(MSchedulerDependencies);
CGData.MEvents.insert(CGData.MEvents.end(), MSchedulerDependencies.begin(),
MSchedulerDependencies.end());

bool IsCGDataSafeForSchedulerBypass =
detail::Scheduler::areEventsSafeForSchedulerBypass(
CGData.MEvents, Queue.getContextImpl()) &&
CGData.MRequirements.empty();
bool SkipScheduler = IsCGDataSafeForSchedulerBypass && !MContainsHostTask;

// This variable represents the returned event. It will always be nullptr if
// EventNeeded is false.
EventImplPtr SignalEvent;

if (!MContainsHostTask) {
bool SkipScheduler =
IsCGDataSafeForSchedulerBypass && MPartitions[0]->MRequirements.empty();
SkipScheduler = SkipScheduler && MPartitions[0]->MRequirements.empty();
if (SkipScheduler) {
SignalEvent = enqueuePartitionDirectly(MPartitions[0], Queue,
CGData.MEvents, EventNeeded);
Expand Down Expand Up @@ -1262,7 +1260,7 @@ exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
SignalEvent->setProfilingEnabled(MEnableProfiling);
}

return SignalEvent;
return {SignalEvent, SkipScheduler};
}

void exec_graph_impl::duplicateNodes() {
Expand Down
13 changes: 8 additions & 5 deletions sycl/source/detail/graph/graph_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -640,11 +640,14 @@ class exec_graph_impl {
/// @param CGData Command-group data provided by the sycl::handler
/// @param EventNeeded Whether an event signalling the completion of this
/// operation needs to be returned.
/// @return Returns an event if EventNeeded is true. Returns nullptr
/// otherwise.
EventImplPtr enqueue(sycl::detail::queue_impl &Queue,
sycl::detail::CG::StorageInitHelper CGData,
bool EventNeeded);
/// @return Returns a pair of an event and a boolean indicating whether the
/// scheduler was bypassed. If an event is required, then the first element of
/// the pair is the event representing the execution of the graph. If no event
/// is required, the first element is nullptr. The second element is true if
/// the scheduler was bypassed, false otherwise.
std::pair<EventImplPtr, bool>
enqueue(sycl::detail::queue_impl &Queue,
sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded);

/// Iterates through all the nodes in the graph to build the list of
/// accessor requirements for the whole graph and for each partition.
Expand Down
144 changes: 106 additions & 38 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,16 @@ void queue_impl::addEvent(const detail::EventImplPtr &EventImpl) {
}
}

void queue_impl::addEventUnlocked(const detail::EventImplPtr &EventImpl) {
if (!EventImpl)
return;
Command *Cmd = EventImpl->getCommand();
if (Cmd != nullptr && EventImpl->getHandle() == nullptr) {
std::weak_ptr<event_impl> EventWeakPtr{EventImpl};
MEventsWeak.push_back(std::move(EventWeakPtr));
}
}

detail::EventImplPtr
queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
bool CallerNeedsEvent, const detail::code_location &Loc,
Expand Down Expand Up @@ -532,16 +542,23 @@ EventImplPtr queue_impl::submit_kernel_direct_impl(
KData.validateAndSetKernelLaunchProperties(Props, hasCommandGraph(),
getDeviceImpl());

auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &&CGData,
bool SchedulerBypass) -> EventImplPtr {
auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &&CGData)
-> std::pair<EventImplPtr, bool> {
bool SchedulerBypass =
(CGData.MEvents.size() > 0
? detail::Scheduler::areEventsSafeForSchedulerBypass(
CGData.MEvents, getContextImpl())
: true) &&
!hasCommandGraph();
if (SchedulerBypass) {
// No need to copy/move the kernel function, so we set
// the function pointer to the original function
KData.setKernelFunc(HostKernel.getPtr());

return submit_kernel_scheduler_bypass(KData, CGData.MEvents,
CallerNeedsEvent, nullptr, nullptr,
CodeLoc, IsTopCodeLoc);
return {submit_kernel_scheduler_bypass(KData, CGData.MEvents,
CallerNeedsEvent, nullptr, nullptr,
CodeLoc, IsTopCodeLoc),
/*SchedulerBypass*/ true};
}
std::unique_ptr<detail::CG> CommandGroup;
std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage;
Expand Down Expand Up @@ -569,57 +586,101 @@ EventImplPtr queue_impl::submit_kernel_direct_impl(
CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;

if (auto GraphImpl = getCommandGraph(); GraphImpl) {
return submit_command_to_graph(*GraphImpl, std::move(CommandGroup),
detail::CGType::Kernel);
return {submit_command_to_graph(*GraphImpl, std::move(CommandGroup),
detail::CGType::Kernel),
/*SchedulerBypass*/ false};
}

return detail::Scheduler::getInstance().addCG(std::move(CommandGroup),
*this, true);
return {detail::Scheduler::getInstance().addCG(std::move(CommandGroup),
*this, true),
/*SchedulerBypass*/ false};
};

return submit_direct(CallerNeedsEvent, DepEvents, SubmitKernelFunc);
return submit_direct(CallerNeedsEvent, DepEvents, SubmitKernelFunc,
detail::CGType::Kernel,
/*InsertBarrierForInOrderCommand*/ false);
}

EventImplPtr queue_impl::submit_graph_direct_impl(
std::shared_ptr<ext::oneapi::experimental::detail::exec_graph_impl>
ExecGraph,
bool CallerNeedsEvent, sycl::span<const event> DepEvents,
[[maybe_unused]] const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
bool EventNeeded = CallerNeedsEvent || ExecGraph->containsHostTask() ||
!supportsDiscardingPiEvents();
auto SubmitGraphFunc = [&](detail::CG::StorageInitHelper &&CGData)
-> std::pair<EventImplPtr, bool> {
if (auto ParentGraph = getCommandGraph(); ParentGraph) {
std::unique_ptr<detail::CG> CommandGroup;
{
ext::oneapi::experimental::detail::graph_impl::ReadLock ExecLock(
ExecGraph->MMutex);
CGData.MRequirements = ExecGraph->getRequirements();
}
// Here we are using the CommandGroup without passing a CommandBuffer to
// pass the exec_graph_impl and event dependencies. Since this subgraph
// CG will not be executed this is fine.
CommandGroup.reset(
new sycl::detail::CGExecCommandBuffer(nullptr, ExecGraph, CGData));
CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;
return {submit_command_to_graph(*ParentGraph, std::move(CommandGroup),
detail::CGType::ExecCommandBuffer),
/*SchedulerBypass*/ false};
} else {
return ExecGraph->enqueue(*this, CGData, EventNeeded);
}
};
// If the graph contains a host task, we may need to insert a barrier prior
// to submission to ensure correct ordering with in-order queues.
return submit_direct(CallerNeedsEvent, DepEvents, SubmitGraphFunc,
detail::CGType::ExecCommandBuffer,
ExecGraph->containsHostTask());
}

template <typename SubmitCommandFuncType>
detail::EventImplPtr
queue_impl::submit_direct(bool CallerNeedsEvent,
sycl::span<const event> DepEvents,
SubmitCommandFuncType &SubmitCommandFunc) {
detail::EventImplPtr queue_impl::submit_direct(
bool CallerNeedsEvent, sycl::span<const event> DepEvents,
SubmitCommandFuncType &SubmitCommandFunc, detail::CGType Type,
bool InsertBarrierForInOrderCommand) {
detail::CG::StorageInitHelper CGData;
std::unique_lock<std::mutex> Lock(MMutex);

// Used by queue_empty() and getLastEvent()
MEmpty.store(false, std::memory_order_release);
const bool inOrder = isInOrder();

// Sync with an external event
std::optional<event> ExternalEvent = popExternalEvent();
if (ExternalEvent) {
registerEventDependency</*LockQueue*/ false>(
getSyclObjImpl(*ExternalEvent), CGData.MEvents, this, getContextImpl(),
getDeviceImpl(), hasCommandGraph() ? getCommandGraph().get() : nullptr,
detail::CGType::Kernel);
Type);
}

auto &Deps = hasCommandGraph() ? MExtGraphDeps : MDefaultGraphDeps;

// Sync with the last event for in order queue
EventImplPtr &LastEvent = Deps.LastEventPtr;
if (isInOrder() && LastEvent) {
if (inOrder && LastEvent) {
registerEventDependency</*LockQueue*/ false>(
LastEvent, CGData.MEvents, this, getContextImpl(), getDeviceImpl(),
hasCommandGraph() ? getCommandGraph().get() : nullptr,
detail::CGType::Kernel);
hasCommandGraph() ? getCommandGraph().get() : nullptr, Type);
} else if (inOrder && !MEmpty.load(std::memory_order_acquire) &&
InsertBarrierForInOrderCommand) {
// A barrier is injected to ensure ordering with prior commands
auto ResEvent = insertHelperBarrier();
registerEventDependency</*LockQueue*/ false>(
ResEvent, CGData.MEvents, this, getContextImpl(), getDeviceImpl(),
hasCommandGraph() ? getCommandGraph().get() : nullptr, Type);
}

for (event e : DepEvents) {
registerEventDependency</*LockQueue*/ false>(
getSyclObjImpl(e), CGData.MEvents, this, getContextImpl(),
getDeviceImpl(), hasCommandGraph() ? getCommandGraph().get() : nullptr,
detail::CGType::Kernel);
Type);
}

// Barrier and un-enqueued commands synchronization for out or order queue
if (!isInOrder()) {
if (!inOrder) {
MMissedCleanupRequests.unset(
[&](MissedCleanupRequestsType &MissedCleanupRequests) {
for (auto &UpdatedGraph : MissedCleanupRequests)
Expand All @@ -632,32 +693,30 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
}
}

bool SchedulerBypass =
(CGData.MEvents.size() > 0
? detail::Scheduler::areEventsSafeForSchedulerBypass(
CGData.MEvents, getContextImpl())
: true) &&
!hasCommandGraph();
// Used by queue_empty() and getLastEvent()
MEmpty.store(false, std::memory_order_release);

auto [EventImpl, SchedulerBypass] = SubmitCommandFunc(std::move(CGData));

// Synchronize with the "no last event mode", used by the handler-based
// kernel submit path
MNoLastEventMode.store(isInOrder() && SchedulerBypass,
std::memory_order_relaxed);

EventImplPtr EventImpl =
SubmitCommandFunc(std::move(CGData), SchedulerBypass);
MNoLastEventMode.store(inOrder && SchedulerBypass, std::memory_order_relaxed);

// Sync with the last event for in order queue. For scheduler-bypass flow,
// the ordering is done at the layers below the SYCL runtime,
// but for the scheduler-based flow, it needs to be done here, as the
// scheduler handles host task submissions.
if (isInOrder()) {
if (inOrder) {
LastEvent = SchedulerBypass ? nullptr : EventImpl;
}

// Barrier and un-enqueued commands synchronization for out or order queue
if (!isInOrder() && !EventImpl->isEnqueued()) {
Deps.UnenqueuedCmdEvents.push_back(EventImpl);
// Barrier and un-enqueued commands synchronization for out or order queue.
// The event must also be stored for future wait calls.
if (!inOrder) {
if (!EventImpl->isEnqueued()) {
Deps.UnenqueuedCmdEvents.push_back(EventImpl);
}
addEventUnlocked(EventImpl);
}

return CallerNeedsEvent ? std::move(EventImpl) : nullptr;
Expand Down Expand Up @@ -1108,6 +1167,15 @@ void queue_impl::verifyProps(const property_list &Props) const {
CheckPropertiesWithData);
}

EventImplPtr queue_impl::insertHelperBarrier() {
auto ResEvent = detail::event_impl::create_device_event(*this);
ur_event_handle_t UREvent = nullptr;
getAdapter().call<UrApiKind::urEnqueueEventsWaitWithBarrier>(
getHandleRef(), 0, nullptr, &UREvent);
ResEvent->setHandle(UREvent);
return ResEvent;
}

} // namespace detail
} // namespace _V1
} // namespace sycl
Loading
Loading