1 | /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | // TODO(opensource): Use a more generic sounding preprocessor name than |
17 | // GOOGLE_CUDA |
18 | #if GOOGLE_CUDA |
19 | |
20 | #define EIGEN_USE_GPU |
21 | |
22 | #include "tensorflow/core/common_runtime/gpu/gpu_device.h" |
23 | |
24 | #include <stdlib.h> |
25 | #include <string.h> |
26 | #include <algorithm> |
27 | #include <list> |
28 | #include <map> |
29 | #include <tuple> |
30 | #include <vector> |
31 | |
32 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" |
33 | #include "tensorflow/core/common_runtime/device_factory.h" |
34 | #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" |
35 | #include "tensorflow/core/common_runtime/gpu/gpu_id.h" |
36 | #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h" |
37 | #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h" |
38 | #include "tensorflow/core/common_runtime/gpu/gpu_init.h" |
39 | #include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h" |
40 | #include "tensorflow/core/common_runtime/gpu/gpu_util.h" |
41 | #include "tensorflow/core/common_runtime/gpu/process_state.h" |
42 | #include "tensorflow/core/common_runtime/gpu_device_context.h" |
43 | #include "tensorflow/core/common_runtime/local_device.h" |
44 | #include "tensorflow/core/framework/allocator.h" |
45 | #include "tensorflow/core/framework/device_base.h" |
46 | #include "tensorflow/core/framework/op_kernel.h" |
47 | #include "tensorflow/core/framework/tensor.h" |
48 | #include "tensorflow/core/framework/tensor.pb.h" |
49 | #include "tensorflow/core/framework/types.h" |
50 | #include "tensorflow/core/framework/variant_op_registry.h" |
51 | #include "tensorflow/core/graph/types.h" |
52 | #include "tensorflow/core/lib/core/errors.h" |
53 | #include "tensorflow/core/lib/core/status.h" |
54 | #include "tensorflow/core/lib/gtl/stl_util.h" |
55 | #include "tensorflow/core/lib/strings/numbers.h" |
56 | #include "tensorflow/core/lib/strings/str_util.h" |
57 | #include "tensorflow/core/lib/strings/strcat.h" |
58 | #include "tensorflow/core/platform/cuda.h" |
59 | #include "tensorflow/core/platform/logging.h" |
60 | #include "tensorflow/core/platform/macros.h" |
61 | #include "tensorflow/core/platform/stream_executor.h" |
62 | #include "tensorflow/core/platform/tracing.h" |
63 | #include "tensorflow/core/platform/types.h" |
64 | #include "tensorflow/core/public/session_options.h" |
65 | #include "tensorflow/core/util/device_name_utils.h" |
66 | #include "tensorflow/core/util/env_var.h" |
67 | #include "tensorflow/core/util/stream_executor_util.h" |
68 | |
69 | #if !defined(PLATFORM_GOOGLE) |
70 | #include "cuda/cuda_config.h" |
71 | #endif |
72 | |
73 | namespace tensorflow { |
74 | |
75 | // Eigen Ops directly allocate memory only for temporary buffers used |
76 | // during OpKernel::Compute(). The recommended way of allocating such |
77 | // memory is via OpKernelContext::allocate_temp(). However, Eigen Ops |
78 | // don't have access to OpKernelContext, instead they get access to |
79 | // memory directly through the device allocator. As an Open Source |
80 | // project, Eigen assumes allocator semantics similar to those of the |
81 | // CUDA memory allocator, and may not work correctly due to race |
82 | // conditions if used with some other allocator. For safety, we need |
83 | // to delay deallocation calls out of Eigen until all events on the |
84 | // corresponding stream have completed. The following two classes |
85 | // serve this purpose in two different compilation environments. |
86 | |
87 | class EigenCudaStreamDevice : public ::Eigen::StreamInterface { |
88 | public: |
89 | EigenCudaStreamDevice() |
90 | : scratch_(nullptr), semaphore_(nullptr), context_(nullptr) { |
91 | Eigen::initializeDeviceProp(); |
92 | } |
93 | ~EigenCudaStreamDevice() override {} |
94 | void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream, |
95 | TfGpuId tf_gpu_id, ::tensorflow::Allocator* alloc, |
96 | char* scratch) { |
97 | if (LogMemory::IsEnabled()) { |
98 | operation_ = context->op_kernel().name() + "/EigenAllocator"; |
99 | step_id_ = context->step_id(); |
100 | } |
101 | context_ = context; |
102 | scratch_ = scratch; |
103 | semaphore_ = |
104 | reinterpret_cast<unsigned int*>(scratch + Eigen::kCudaScratchSize); |
105 | stream_ = cuda_stream; |
106 | allocator_ = alloc; |
107 | const int cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id).value(); |
108 | device_prop_ = &Eigen::m_deviceProperties[cuda_gpu_id]; |
109 | } |
110 | |
111 | const cudaStream_t& stream() const override { return *stream_; } |
112 | const cudaDeviceProp& deviceProperties() const override { |
113 | return *device_prop_; |
114 | } |
115 | |
116 | void* allocate(size_t num_bytes) const override { |
117 | void* ret = allocator_->AllocateRaw(32 /* alignment */, num_bytes); |
118 | if (ret == nullptr) { |
119 | if (context_) { |
120 | context_->SetStatus(errors::ResourceExhausted( |
121 | strings::StrCat("Ran out of GPU memory when allocating ", num_bytes, |
122 | " bytes for ", operation_))); |
123 | } else { |
124 | LOG(FATAL) |
125 | << "EigenAllocator for GPU ran out of memory when allocating " |
126 | << num_bytes << ". See error logs for more detailed info."; |
127 | } |
128 | } |
129 | if (LogMemory::IsEnabled() && ret != nullptr) { |
130 | LogMemory::RecordRawAllocation(operation_, step_id_, num_bytes, ret, |
131 | allocator_); |
132 | } |
133 | return ret; |
134 | } |
135 | void deallocate(void* buffer) const override { |
136 | if (LogMemory::IsEnabled() && buffer != nullptr) { |
137 | LogMemory::RecordRawDeallocation(operation_, step_id_, buffer, allocator_, |
138 | true); |
139 | } |
140 | AsyncFreeData* afData = |
141 | new AsyncFreeData(allocator_, buffer, operation_, step_id_); |
142 | cudaError_t err = cudaStreamAddCallback(*stream_, asyncFree, afData, 0); |
143 | CHECK_EQ(err, cudaSuccess); |
144 | } |
145 | |
146 | // Return a pointer to a per stream scratchpad of 1024 bytes residing |
147 | // in global memory. |
148 | void* scratchpad() const override { return scratch_; } |
149 | |
150 | // Return a semaphore. The semaphore is initially initialized to 0, and |
151 | // each kernel using it is responsible for resetting to 0 upon completion |
152 | // to maintain the invariant that the semaphore is always equal to 0 upon |
153 | // each kernel start. |
154 | unsigned int* semaphore() const override { return semaphore_; } |
155 | |
156 | private: |
157 | struct AsyncFreeData { |
158 | AsyncFreeData(::tensorflow::Allocator* a, void* p, const string& o, |
159 | const int64 s) |
160 | : allocator_(a), address_(p), operation_(o), step_id_(s) {} |
161 | ::tensorflow::Allocator* allocator_; |
162 | void* address_; |
163 | const string operation_; |
164 | const int64 step_id_; |
165 | }; |
166 | |
167 | static void CUDART_CB asyncFree(cudaStream_t stream, cudaError_t status, |
168 | void* userData) { |
169 | AsyncFreeData* data = static_cast<AsyncFreeData*>(userData); |
170 | if (LogMemory::IsEnabled()) { |
171 | LogMemory::RecordRawDeallocation(data->operation_, data->step_id_, |
172 | data->address_, data->allocator_, false); |
173 | } |
174 | data->allocator_->DeallocateRaw(data->address_); |
175 | delete data; |
176 | } |
177 | |
178 | string operation_; |
179 | int64 step_id_; |
180 | const cudaStream_t* stream_; // Not owned. |
181 | const cudaDeviceProp* device_prop_; // Not owned. |
182 | ::tensorflow::Allocator* allocator_; // Not owned. |
183 | mutable char* scratch_; |
184 | mutable unsigned int* semaphore_; |
185 | OpKernelContext* context_; |
186 | |
187 | TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice); |
188 | }; |
189 | |
190 | // This factory helps to ensure that different GPU device objects that refer to |
191 | // the same physical device and stream group id use the same stream group |
192 | // object (and therefore the same CUDA streams). This is necessary since there |
193 | // is a single memory allocator per device (see ProcessState::GetGPUAllocator) |
194 | // and allocators must not be shared across streams. |
195 | class BaseGPUDevice::StreamGroupFactory { |
196 | public: |
197 | // Returns the unique stream group for use with the stream defined by |
198 | // {tf_gpu_id, stream_group_within_gpu}, creating it if it does not yet |
199 | // exist. |
200 | // This function is thread safe. |
201 | BaseGPUDevice::StreamGroup* GetOrCreate(TfGpuId tf_gpu_id, |
202 | int stream_group_within_gpu, |
203 | gpu::StreamExecutor* executor) { |
204 | mutex_lock guard(lock_); |
205 | StreamGroup* group = |
206 | &streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)]; |
207 | if (!group->compute) { |
208 | group->compute = new gpu::Stream(executor); |
209 | group->compute->Init(); |
210 | VLOG(2) << "Created stream[" << stream_group_within_gpu |
211 | << "] = " << group->compute; |
212 | |
213 | group->host_to_device = new gpu::Stream(executor); |
214 | group->host_to_device->Init(); |
215 | VLOG(2) << "Created host_to_device_stream[" << stream_group_within_gpu |
216 | << "] = " << group->host_to_device; |
217 | |
218 | group->device_to_host = new gpu::Stream(executor); |
219 | group->device_to_host->Init(); |
220 | VLOG(2) << "Created device_to_host_stream[" << stream_group_within_gpu |
221 | << "] = " << group->device_to_host; |
222 | |
223 | group->device_to_device = new gpu::Stream(executor); |
224 | group->device_to_device->Init(); |
225 | VLOG(2) << "Created device_to_device_stream[" << stream_group_within_gpu |
226 | << "] = " << group->device_to_device; |
227 | } |
228 | return group; |
229 | } |
230 | |
231 | // Returns a reference to the StreamGroupFactory singleton. Note that this is |
232 | // never destroyed, so the objects it owns are never deleted. |
233 | static StreamGroupFactory& Global() { |
234 | static StreamGroupFactory* instance = new StreamGroupFactory(); |
235 | return *instance; |
236 | } |
237 | |
238 | private: |
239 | mutex lock_; |
240 | using key_type = std::tuple<int, int>; |
241 | std::map<key_type, StreamGroup> streams_; |
242 | |
243 | // StreamGroupFactory cannot be created directly; Call |
244 | // StreamGroupFactory::Global() to get the global instance. |
245 | StreamGroupFactory() = default; |
246 | TF_DISALLOW_COPY_AND_ASSIGN(StreamGroupFactory); |
247 | }; |
248 | |
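| // The constructor only records configuration (allocators, limits, stream |
| // count) and flags the process as using a GPU; streams, scratch buffers and |
| // device contexts are created later in Init(). |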
249 | BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name, |
250 | Bytes memory_limit, const DeviceLocality& locality, |
251 | TfGpuId tf_gpu_id, |
252 | const string& physical_device_desc, |
253 | Allocator* gpu_allocator, Allocator* cpu_allocator, |
254 | bool sync_every_op, int32 max_streams) |
255 | : LocalDevice(options, Device::BuildDeviceAttributes(name, DEVICE_GPU, |
256 | memory_limit, locality, |
257 | physical_device_desc)), |
258 | gpu_allocator_(gpu_allocator), |
259 | cpu_allocator_(cpu_allocator), |
260 | scoped_allocator_mgr_(new ScopedAllocatorMgr(name)), |
261 | tf_gpu_id_(tf_gpu_id), |
262 | sync_every_op_(sync_every_op), |
263 | max_streams_(max_streams) { |
264 | ProcessState::singleton()->EnableGPUDevice(); |
265 | } |
266 | |
267 | BaseGPUDevice::~BaseGPUDevice() { |
268 | delete gpu_device_info_; |
269 | for (auto ctx : device_contexts_) ctx->Unref(); |
270 | } |
271 | |
272 | Status BaseGPUDevice::Init(const SessionOptions& options) { |
273 | auto executor_status = GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id_); |
274 | if (!executor_status.status().ok()) { |
275 | return errors::Internal("Failed to get StreamExecutor for device ", |
276 | tf_gpu_id_.value()); |
277 | } |
278 | |
279 | executor_ = executor_status.ValueOrDie(); |
280 | em_.reset(new EventMgr(executor_, options.config.gpu_options())); |
281 | |
282 | if (max_streams_ < 1) { |
283 | return errors::InvalidArgument("Invalid value for max_streams."); |
284 | } |
285 | |
286 | // Create the specified number of GPU streams |
287 | for (int i = 0; i < max_streams_; i++) { |
288 | streams_.push_back( |
289 | StreamGroupFactory::Global().GetOrCreate(tf_gpu_id_, i, executor_)); |
290 | |
291 | size_t scratch_buffer_size = Eigen::kCudaScratchSize + sizeof(unsigned int); |
292 | void* scratch_buffer = gpu_allocator_->AllocateRaw( |
293 | Allocator::kAllocatorAlignment, scratch_buffer_size); |
294 | if (scratch_buffer == nullptr) { |
295 | return errors::FailedPrecondition( |
296 | "Failed to allocate scratch buffer for device ", tf_gpu_id_.value()); |
297 | } |
298 | scratch_.push_back(static_cast<char*>(scratch_buffer)); |
299 | |
300 | perftools::gputools::DeviceMemory<char> mem( |
301 | perftools::gputools::DeviceMemoryBase(scratch_buffer, |
302 | scratch_buffer_size)); |
303 | |
304 | bool ok = executor_->SynchronousMemZero( |
305 | &mem, Eigen::kCudaScratchSize + sizeof(unsigned int)); |
306 | if (!ok) { |
307 | return errors::FailedPrecondition( |
308 | "Failed to memzero scratch buffer for device ", |
309 | tf_gpu_id_.value()); |
310 | } |
311 | |
312 | device_contexts_.push_back(new GPUDeviceContext( |
313 | i, streams_.back()->compute, streams_.back()->host_to_device, |
314 | streams_.back()->device_to_host, streams_.back()->device_to_device)); |
315 | } |
316 | gpu_device_info_ = new GpuDeviceInfo; |
317 | gpu_device_info_->stream = streams_[0]->compute; |
318 | gpu_device_info_->default_context = device_contexts_[0]; |
319 | gpu_device_info_->event_mgr = em_.get(); |
320 | gpu_device_info_->gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id_).value(); |
321 | set_tensorflow_gpu_device_info(gpu_device_info_); |
322 | |
323 | // Whether and how the GPU device uses its own threadpool. |
324 | // This option is experimental. Once we confirm the best setting, we |
325 | // may change the default behavior and completely remove this flag. |
326 | // Default values might change in future releases. |
327 | // Possible values: |
328 | // * global: GPU uses threads shared with CPU in the main compute |
329 | // thread-pool. This is currently the default. |
330 | // * gpu_private: GPU uses threads dedicated to this device. |
331 | // * gpu_shared: All GPUs share a dedicated thread pool. |
332 | string gpu_thread_mode; |
333 | TF_RETURN_IF_ERROR( |
334 | ReadStringFromEnvVar("TF_GPU_THREAD_MODE", "global", &gpu_thread_mode)); |
335 | gpu_thread_mode = str_util::Lowercase(gpu_thread_mode); |
336 | if (gpu_thread_mode != "global") { |
337 | int64 gpu_thread_count = -1; |
338 | // Default to two threads. One for device compute and another for memory |
339 | // copies. |
340 | TF_RETURN_IF_ERROR( |
341 | ReadInt64FromEnvVar("TF_GPU_THREAD_COUNT", 2, &gpu_thread_count)); |
342 | if (gpu_thread_mode == "gpu_private") { |
343 | // TODO(zhengxq): since these threads only serve a single GPU device, |
344 | // we should set the device context once for each thread, and avoid |
345 | // setting them for each kernel. |
346 | // TODO(zhengxq): pin the thread to the same socket of the target GPU. |
347 | thread_pool_.reset(new thread::ThreadPool( |
348 | options.env, strings::StrCat("gpu_private_", tf_gpu_id_.value()), |
349 | static_cast<int32>(gpu_thread_count))); |
350 | set_tensorflow_device_thread_pool(thread_pool_.get()); |
351 | } else if (gpu_thread_mode == "gpu_shared") { |
352 | static thread::ThreadPool* thread_pool = new thread::ThreadPool( |
353 | options.env, "gpu_shared", static_cast<int32>(gpu_thread_count)); |
354 | set_tensorflow_device_thread_pool(thread_pool); |
355 | } else { |
356 | string error_message = |
357 | strings::StrCat("Invalid gpu_thread_mode: ", gpu_thread_mode); |
358 | LOG(WARNING) << error_message; |
359 | return errors::InvalidArgument(error_message); |
360 | } |
361 | } |
362 | |
363 | return Status::OK(); |
364 | } |
365 | |
366 | bool BaseGPUDevice::RequiresRecordingAccessedTensors() const { |
367 | // When there is no more than one stream, we release the tensor reference |
368 | // at the end of the kernel launch, instead of at the end of the kernel |
369 | // execution. |
370 | return streams_.size() > 1; |
371 | } |
372 | |
373 | Status BaseGPUDevice::FillContextMap(const Graph* graph, |
374 | DeviceContextMap* device_context_map) { |
375 | VLOG(2) << "FillContextMap"; |
376 | |
377 | const size_t num_streams = streams_.size(); |
378 | // Special case for single stream. |
379 | if (num_streams == 1) { |
380 | return Status::OK(); |
381 | } |
382 | const int64 before = Env::Default()->NowMicros(); |
383 | gpu_stream_util::AssignStreamsOpts opts; |
384 | opts.max_streams = static_cast<int32>(num_streams); |
385 | std::unordered_map<int, int> node_to_stream_id; |
386 | TF_RETURN_IF_ERROR( |
387 | gpu_stream_util::AssignStreams(graph, opts, &node_to_stream_id)); |
388 | int64 elapsed = Env::Default()->NowMicros() - before; |
389 | VLOG(3) << "AssignStreams took " << elapsed << "us"; |
390 | |
391 | // Fill in the context map. It is OK for this map to contain |
392 | // duplicate DeviceContexts so long as we increment the refcount. |
393 | device_context_map->resize(graph->num_node_ids()); |
394 | for (Node* n : graph->nodes()) { |
395 | auto mapped_stream = node_to_stream_id[n->id()]; |
396 | CHECK_LE(mapped_stream, num_streams); |
397 | auto ctx = device_contexts_[mapped_stream]; |
398 | VLOG(3) << "Assigned stream " << node_to_stream_id[n->id()] |
399 | << " ==> stream[" << ctx->stream_id() << "] for node id " << n->id() |
400 | << " " << n->type_string() << " " << n->name(); |
401 | ctx->Ref(); |
402 | (*device_context_map)[n->id()] = ctx; |
403 | } |
404 | |
405 | return Status::OK(); |
406 | } |
407 | |
408 | void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) { |
409 | // ScopedActivity is cheap when tracing is not active, but we can still |
410 | // skip computing the Hash64 when tracing is off. |
411 | // TODO(pbar) This would no longer be needed if Ops have a unique id. |
412 | const uint64 id = port::Tracing::IsActive() ? Hash64(op_kernel->name()) : 0; |
413 | port::Tracing::ScopedActivity region(port::Tracing::EventCategory::kCompute, |
414 | id); |
415 | |
416 | // NOTE(tucker): We need to discriminate between Eigen GPU |
417 | // operations and all others. If an operation is Eigen |
418 | // implemented (or otherwise tries to launch a cuda kernel |
419 | // directly), we need to establish a stacked-scoped environment |
420 | // that directs it to execute on the proper device. Otherwise we |
421 | // expect the Op to use StreamExecutor directly and correctly. The |
422 | // way we make this discrimination is quite hacky: At the moment |
423 | // the only non-Eigen GPU Op is the recv-op, which is known to be |
424 | // asynchronous. |
425 | if (op_kernel->is_internal() && op_kernel->type_string() == "_Recv" ) { |
426 | context->SetStatus(errors::Internal( |
427 | "Invalid synchronous 'Compute' on GPU for '_Recv' op")); |
428 | } else if (port::Tracing::ScopedAnnotation::Enabled()) { |
429 | port::Tracing::ScopedAnnotation annotation(op_kernel->name(), |
430 | op_kernel->type_string()); |
431 | ComputeHelper(op_kernel, context); |
432 | } else { |
433 | ComputeHelper(op_kernel, context); |
434 | } |
435 | } |
436 | |
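| // Synchronous body shared by Compute(): when multiple streams are in use it |
| // first waits for the streams that produced this op's inputs, then runs the |
| // kernel with the executor context activated, and optionally syncs all GPU |
| // activity afterwards if sync_every_op_ is set. |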
437 | void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel, |
438 | OpKernelContext* context) { |
439 | GPUDeviceContext* gpu_device_context = device_contexts_[0]; |
440 | if (context->op_device_context() != nullptr) { |
441 | gpu_device_context = |
442 | static_cast<GPUDeviceContext*>(context->op_device_context()); |
443 | } |
444 | gpu::Stream* stream = gpu_device_context->stream(); |
445 | const auto stream_id = gpu_device_context->stream_id(); |
446 | |
447 | const bool vlog_1 = VLOG_IS_ON(1); |
448 | const bool vlog_2 = vlog_1 && VLOG_IS_ON(2); |
449 | |
450 | if (vlog_1) { |
451 | VLOG(1) << "GpuDevice::Compute " << op_kernel->name() << " op " |
452 | << op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream[" |
453 | << stream_id << "]"; |
454 | } |
455 | |
456 | const auto num_streams = streams_.size(); |
457 | if (num_streams > 1) { |
458 | // If this op's device context is different from the other contexts, |
459 | // we must wait on the stream. |
460 | for (int i = 0; i < context->num_inputs(); ++i) { |
461 | const GPUDeviceContext* idc = |
462 | static_cast<GPUDeviceContext*>(context->input_device_context(i)); |
463 | OP_REQUIRES(context, idc != nullptr, |
464 | errors::Internal("Input device context ", i, |
465 | " was not set properly.")); |
466 | if (vlog_2) { |
467 | const void* base; |
468 | size_t len; |
469 | if (context->has_input(i)) { |
470 | if (IsRefType(context->input_dtype(i))) { |
471 | Tensor tensor = context->mutable_input(i, false); |
472 | base = DMAHelper::base(&tensor); |
473 | len = tensor.TotalBytes(); |
474 | } else { |
475 | const Tensor& tensor = context->input(i); |
476 | base = DMAHelper::base(&tensor); |
477 | len = tensor.TotalBytes(); |
478 | } |
479 | LOG(INFO) << "Input " << i << " " << base << " " << len; |
480 | LOG(INFO) << " stream[" << stream_id << "].ThenWaitFor(stream[" |
481 | << idc->stream_id() << "])" |
482 | << ((idc->stream() == stream) ? " not needed" : ""); |
483 | } |
484 | } |
485 | if (idc->stream() != stream) stream->ThenWaitFor(idc->stream()); |
486 | } |
487 | } |
488 | gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()}; |
489 | op_kernel->Compute(context); |
490 | if (context->status().ok()) { |
491 | if (sync_every_op_) { |
492 | // Note: GPUUtil::Sync() only syncs the default stream. |
493 | // We need to either sync the stream used by this op, or |
494 | // all streams. Given that this flag is typically used for |
495 | // debugging it makes more sense to sync all GPU activity. |
496 | context->SetStatus(GPUUtil::SyncAll(this)); |
497 | } |
498 | } |
499 | } |
500 | |
501 | void BaseGPUDevice::ConsumeListOfAccessedTensors( |
502 | DeviceContext* device_context, const TensorReferenceVector& tensor_refs) { |
503 | GPUDeviceContext* gpu_device_context = device_contexts_[0]; |
504 | if (device_context != nullptr) { |
505 | gpu_device_context = static_cast<GPUDeviceContext*>(device_context); |
506 | } |
507 | gpu::Stream* stream = gpu_device_context->stream(); |
508 | em_->ThenDeleteTensors(stream, tensor_refs); |
509 | } |
510 | |
511 | // Based on the semantics of Device::Sync this call should wait for |
512 | // all streams not just the current one. |
513 | Status BaseGPUDevice::Sync() { return GPUUtil::SyncAll(this); } |
514 | |
515 | void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel, |
516 | OpKernelContext* context, |
517 | AsyncOpKernel::DoneCallback done) { |
518 | GPUDeviceContext* gpu_device_context = device_contexts_[0]; |
519 | if (context->op_device_context() != nullptr) { |
520 | gpu_device_context = |
521 | static_cast<GPUDeviceContext*>(context->op_device_context()); |
522 | } |
523 | gpu::Stream* stream = gpu_device_context->stream(); |
524 | const auto stream_id = gpu_device_context->stream_id(); |
525 | |
526 | VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op " |
527 | << op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream[" |
528 | << stream_id << "]"; |
529 | |
530 | // When TraceMe profiling is off (which is the default), the |
531 | // following TraceMe constructor reduces to a conditional test of a |
532 | // false value. Measurements show that its overhead is negligible. |
533 | port::Tracing::TraceMe activity(op_kernel->name(), op_kernel->type_string(), |
534 | op_kernel->IsExpensive()); |
535 | gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()}; |
536 | op_kernel->ComputeAsync(context, done); |
537 | } |
538 | |
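| // Copies 'from' into a freshly allocated GPU tensor '*to' unless alloc_attrs |
| // requests host memory, in which case '*to' simply aliases 'from'. 'done' is |
| // invoked with the final status; the returned Status only reflects errors |
| // detected while setting up the copy. |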
539 | Status BaseGPUDevice::MaybeCopyTensorToGPU( |
540 | const AllocatorAttributes& alloc_attrs, const Tensor& from, Tensor* to, |
541 | StatusCallback done) { |
542 | if (alloc_attrs.on_host()) { |
543 | *to = from; |
544 | done(Status::OK()); |
545 | return Status::OK(); |
546 | } else { |
547 | if (!DMAHelper::CanUseDMA(&from)) { |
548 | Status err = errors::Internal("GPU copy from non-DMA ", |
549 | DataTypeString(from.dtype()), " tensor"); |
550 | done(err); |
551 | return err; |
552 | } |
553 | auto* copy = |
554 | new Tensor(GetAllocator(alloc_attrs), from.dtype(), from.shape()); |
555 | |
556 | // If the tensor is not initialized, we likely ran out of memory. |
557 | if (!copy->IsInitialized()) { |
558 | delete copy; |
559 | Status err = errors::ResourceExhausted( |
560 | "OOM when allocating tensor of shape ", from.shape().DebugString(), |
561 | " and type ", DataTypeString(from.dtype())); |
562 | done(err); |
563 | return err; |
564 | } |
565 | |
566 | StatusCallback wrapped_done = std::bind( |
567 | [to, copy](StatusCallback done_, |
568 | // Begin unbound arguments. |
569 | const Status& s) { |
570 | *to = std::move(*copy); |
571 | delete copy; |
572 | done_(s); |
573 | }, |
574 | std::move(done), std::placeholders::_1); |
575 | |
576 | port::Tracing::ScopedAnnotation annotation("MakeTensorFromProto"); |
577 | device_contexts_[0]->CopyCPUTensorToDevice(&from, this, copy, |
578 | std::move(wrapped_done)); |
579 | return Status::OK(); |
580 | } |
581 | } |
582 | |
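| // Parses 'tensor_proto' into a host tensor and then copies it onto this GPU. |
| // DT_VARIANT tensors are copied element by element via VariantDeviceCopy; |
| // all other dtypes are handled by a single MaybeCopyTensorToGPU call. |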
583 | Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto, |
584 | const AllocatorAttributes alloc_attrs, |
585 | Tensor* tensor) { |
586 | AllocatorAttributes attr; |
587 | attr.set_on_host(true); |
588 | attr.set_gpu_compatible(true); |
589 | Allocator* host_alloc = GetAllocator(attr); |
590 | Tensor parsed(tensor_proto.dtype()); |
591 | if (!parsed.FromProto(host_alloc, tensor_proto)) { |
592 | return errors::InvalidArgument("Cannot parse tensor from proto: ", |
593 | tensor_proto.DebugString()); |
594 | } |
595 | |
596 | if (parsed.dtype() == DT_VARIANT) { |
597 | const Variant* from = parsed.flat<Variant>().data(); |
598 | Tensor copy(cpu_allocator(), DT_VARIANT, parsed.shape()); |
599 | Variant* copy_variant = copy.flat<Variant>().data(); |
600 | |
601 | std::list<Notification> notifications; |
602 | Status copy_status; |
603 | auto copier = [this, &alloc_attrs, ¬ifications, ©_status]( |
604 | const Tensor& from, Tensor* to) { |
605 | // Copier isn't run in a multithreaded environment, so we don't |
606 | // have to worry about the notifications list being modified in parallel. |
607 | notifications.emplace_back(); |
608 | Notification& n = *notifications.rbegin(); |
609 | return MaybeCopyTensorToGPU(alloc_attrs, from, to, |
610 | [&n, ©_status](const Status& s) { |
611 | if (copy_status.ok()) { |
612 | copy_status.Update(s); |
613 | } |
614 | n.Notify(); |
615 | }); |
616 | }; |
617 | Status s; |
618 | for (int64 ix = 0; ix < parsed.NumElements(); ++ix) { |
619 | s = VariantDeviceCopy(VariantDeviceCopyDirection::HOST_TO_DEVICE, |
620 | from[ix], ©_variant[ix], copier); |
621 | if (!s.ok()) { |
622 | break; |
623 | } |
624 | } |
625 | for (auto& n : notifications) { |
626 | n.WaitForNotification(); |
627 | } |
628 | if (!s.ok()) { |
629 | return s; |
630 | } |
631 | *tensor = std::move(copy); |
632 | return copy_status; |
633 | } else { |
634 | Notification n; |
635 | Status status; |
636 | TF_RETURN_IF_ERROR(MaybeCopyTensorToGPU(alloc_attrs, parsed, tensor, |
637 | [&n, &status](const Status& s) { |
638 | status = s; |
639 | n.Notify(); |
640 | })); |
641 | n.WaitForNotification(); |
642 | return status; |
643 | } |
644 | } |
645 | |
646 | namespace { |
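| // Adapts an EigenCudaStreamDevice to the PerOpGpuDevice interface so that |
| // per-op Eigen expressions run on the stream, scratch buffer and allocator |
| // chosen by BaseGPUDevice. |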
647 | class ConcretePerOpGpuDevice : public PerOpGpuDevice { |
648 | public: |
649 | ConcretePerOpGpuDevice() : device_(&stream_device_) {} |
650 | |
651 | void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream, |
652 | TfGpuId tf_gpu_id, Allocator* base_allocator, |
653 | char* scratch) { |
654 | stream_device_.Reinitialize(context, cuda_stream, tf_gpu_id, base_allocator, |
655 | scratch); |
656 | } |
657 | |
658 | const Eigen::GpuDevice& device() const override { return device_; } |
659 | |
660 | private: |
661 | EigenCudaStreamDevice stream_device_; |
662 | Eigen::GpuDevice device_; |
663 | }; |
664 | |
665 | // Parse 'visible_device_list' into a list of CUDA GPU ids. |
666 | Status ParseVisibleDeviceList(const string& visible_device_list, |
667 | std::vector<CudaGpuId>* visible_gpu_order) { |
668 | visible_gpu_order->clear(); |
669 | gpu::Platform* gpu_manager = GPUMachineManager(); |
670 | |
671 | // If the user wants to remap the visible-to-virtual GPU mapping, |
672 | // check for that here. |
673 | if (visible_device_list.empty()) { |
674 | visible_gpu_order->resize(gpu_manager->VisibleDeviceCount()); |
675 | // By default, visible to virtual mapping is unchanged. |
676 | int deviceNo = 0; |
677 | std::generate(visible_gpu_order->begin(), visible_gpu_order->end(), |
678 | [&deviceNo] { return deviceNo++; }); |
679 | } else { |
680 | const std::vector<string> order_str = |
681 | str_util::Split(visible_device_list, ','); |
682 | for (const string& cuda_gpu_id_str : order_str) { |
683 | int32 cuda_gpu_id; |
684 | if (!strings::safe_strto32(cuda_gpu_id_str, &cuda_gpu_id)) { |
685 | return errors::InvalidArgument( |
686 | "Could not parse entry in 'visible_device_list': '", |
687 | cuda_gpu_id_str, "'. visible_device_list = ", visible_device_list); |
688 | } |
689 | if (cuda_gpu_id < 0 || cuda_gpu_id >= gpu_manager->VisibleDeviceCount()) { |
690 | return errors::InvalidArgument( |
691 | "'visible_device_list' listed an invalid GPU id '", cuda_gpu_id, |
692 | "' but visible device count is ", |
693 | gpu_manager->VisibleDeviceCount()); |
694 | } |
695 | visible_gpu_order->push_back(CudaGpuId(cuda_gpu_id)); |
696 | } |
697 | } |
698 | |
699 | // Validate no repeats. |
700 | std::set<CudaGpuId> visible_device_set(visible_gpu_order->begin(), |
701 | visible_gpu_order->end()); |
702 | if (visible_device_set.size() != visible_gpu_order->size()) { |
703 | return errors::InvalidArgument( |
704 | "visible_device_list contained a duplicate entry: ", |
705 | visible_device_list); |
706 | } |
707 | return Status::OK(); |
708 | } |
709 | |
710 | Status VerifyVirtualDeviceSettings( |
711 | const size_t num_gpus_to_use, const GPUOptions& gpu_options, |
712 | const std::vector<CudaGpuId>& visible_gpu_order, |
713 | const std::vector<CudaGpuId>& valid_cuda_gpu_ids) { |
714 | const auto& virtual_devices = gpu_options.experimental().virtual_devices(); |
715 | CHECK(!virtual_devices.empty()); |
716 | if (gpu_options.per_process_gpu_memory_fraction() > 0) { |
717 | return errors::InvalidArgument( |
718 | "It's invalid to set per_process_gpu_memory_fraction when " |
719 | "virtual_devices is set."); |
720 | } |
721 | if (num_gpus_to_use < virtual_devices.size()) { |
722 | return errors::Unknown( |
723 | "Not enough GPUs to create virtual devices." |
724 | " num_gpus_to_use: ", |
725 | num_gpus_to_use, " #virtual_devices: ", virtual_devices.size()); |
726 | } |
727 | if (!gpu_options.visible_device_list().empty() && |
728 | visible_gpu_order.size() != virtual_devices.size()) { |
729 | return errors::InvalidArgument( |
730 | "The number of GPUs in visible_device_list doesn't match the number " |
731 | "of elements in the virtual_devices list.", |
732 | " #GPUs in visible_device_list: ", visible_gpu_order.size(), |
733 | " virtual_devices.size(): ", virtual_devices.size()); |
734 | } |
735 | if (valid_cuda_gpu_ids.size() != virtual_devices.size()) { |
736 | return errors::Unknown( |
737 | "The number of valid GPUs doesn't match the number of elements in " |
738 | "the virtual_devices list.", |
739 | " #valid GPUs: ", valid_cuda_gpu_ids.size(), |
740 | " virtual_devices.size(): ", virtual_devices.size()); |
741 | } |
742 | return Status::OK(); |
743 | } |
744 | |
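| // Returns how many bytes of 'available_memory' should be reserved for the |
| // system (driver, runtime, display) rather than handed to the TF allocator. |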
745 | int64 MinSystemMemory(int64 available_memory) { |
746 | // We use the following heuristic for now: |
747 | // |
748 | // If the available_memory is < 2GiB, we allocate 225MiB to system memory. |
749 | // Otherwise, allocate max(300MiB, 0.05 * available_memory) to system memory. |
750 | // |
751 | // In the future we could be more sophisticated by using a table of devices. |
752 | int64 min_system_memory; |
753 | if (available_memory < (1LL << 31)) { |
754 | // 225MiB |
755 | min_system_memory = 225 * 1024 * 1024; |
756 | } else { |
757 | // max(300 MiB, 0.05 * available_memory) |
758 | min_system_memory = |
759 | std::max(314572800LL, static_cast<int64>(available_memory * 0.05)); |
760 | } |
761 | #if defined(__GNUC__) && defined(__OPTIMIZE__) |
762 | // Do nothing |
763 | #elif !defined(__GNUC__) && defined(NDEBUG) |
764 | // Do nothing |
765 | #else |
766 | // Double the reserved system memory in non-opt builds (debug builds |
767 | // on Windows), because in non-opt builds more system memory |
768 | // is necessary. |
769 | min_system_memory *= 2; |
770 | #endif |
771 | |
772 | #if defined(ANDROID_TEGRA) |
773 | // 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM |
774 | // and Video RAM |
775 | min_system_memory = 1 << 30; |
776 | #endif |
777 | return min_system_memory; |
778 | } |
779 | |
780 | // Get the memory limit for the virtual device being created on GPU with |
781 | // 'cuda_gpu_id', when that virtual device is the only virtual device being |
782 | // created on that GPU. |
783 | Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options, |
784 | CudaGpuId cuda_gpu_id, |
785 | int64* memory_limit) { |
786 | int64 total_memory = 0; |
787 | int64 available_memory = 0; |
788 | gpu::StreamExecutor* se = |
789 | GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); |
790 | if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) { |
791 | return errors::Unknown("Failed to query available memory for GPU ", |
792 | cuda_gpu_id.value()); |
793 | } |
794 | |
795 | int64 allocated_memory = 0; |
796 | const double per_process_gpu_memory_fraction = |
797 | gpu_options.per_process_gpu_memory_fraction(); |
798 | if (per_process_gpu_memory_fraction == 0) { |
799 | allocated_memory = available_memory; |
800 | const int64 min_system_memory = MinSystemMemory(available_memory); |
801 | if (min_system_memory < allocated_memory) { |
802 | allocated_memory -= min_system_memory; |
803 | } |
804 | } else { |
805 | allocated_memory = total_memory * per_process_gpu_memory_fraction; |
806 | } |
807 | *memory_limit = allocated_memory; |
808 | return Status::OK(); |
809 | } |
810 | } // namespace |
811 | |
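| // Points 'device' at the compute stream and scratch buffer associated with |
| // 'stream_id', using 'allocator' for temporary buffers requested by Eigen. |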
812 | void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context, |
813 | PerOpGpuDevice* device, int stream_id, |
814 | Allocator* allocator) { |
815 | ConcretePerOpGpuDevice* concrete_device = |
816 | static_cast<ConcretePerOpGpuDevice*>(device); |
817 | DCHECK(concrete_device); |
818 | const cudaStream_t* cuda_stream = reinterpret_cast<const cudaStream_t*>( |
819 | streams_[stream_id]->compute->implementation()->CudaStreamMemberHack()); |
820 | concrete_device->Reinitialize(context, cuda_stream, tf_gpu_id_, allocator, |
821 | scratch_[stream_id]); |
822 | } |
823 | |
824 | PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice() { |
825 | return new ConcretePerOpGpuDevice(); |
826 | } |
827 | |
828 | void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context, |
829 | PerOpGpuDevice* device, |
830 | DeviceContext* dc, |
831 | Allocator* allocator) { |
832 | if (dc) { |
833 | const GPUDeviceContext* gpu_dc = static_cast<GPUDeviceContext*>(dc); |
834 | const int stream_id = gpu_dc->stream_id(); |
835 | VLOG(1) << " eigen_gpu_device(" << dc << ") => stream[" << stream_id |
836 | << "]"; |
837 | CHECK_LT(stream_id, streams_.size()); |
838 | ReinitializeDevice(context, device, stream_id, allocator); |
839 | } else { |
840 | ReinitializeDevice(context, device, 0, allocator); |
841 | } |
842 | } |
843 | |
844 | Allocator* BaseGPUDevice::GetScopedAllocator(AllocatorAttributes attr, |
845 | int64 step_id) { |
846 | if (attr.scope_id > 0) { |
847 | return scoped_allocator_mgr_->GetContainer(step_id)->GetInstance( |
848 | attr.scope_id); |
849 | } |
850 | LOG(FATAL) << "Unexpected call to BaseGPUDevice::GetScopedAllocator " |
851 | << "attr.scope_id = " << attr.scope_id; |
852 | return gpu_allocator_; |
853 | } |
854 | |
855 | const int BaseGPUDeviceFactory::InterconnectMap::kSameDeviceStrength = 1000; |
856 | const int BaseGPUDeviceFactory::InterconnectMap::kStreamExecutorStrength = 1; |
857 | |
858 | Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options, |
859 | const string& name_prefix, |
860 | std::vector<Device*>* devices) { |
861 | TF_RETURN_IF_ERROR(ValidateGPUMachineManager()); |
862 | gpu::Platform* gpu_manager = GPUMachineManager(); |
863 | if (gpu_manager == nullptr) { |
864 | return Status::OK(); |
865 | } |
866 | // If there are no GPUs visible, do nothing. |
867 | if (gpu_manager->VisibleDeviceCount() <= 0) { |
868 | return Status::OK(); |
869 | } |
870 | |
871 | size_t num_gpus_to_use = INT_MAX; |
872 | auto iter = options.config.device_count().find("GPU"); |
873 | if (iter != options.config.device_count().end()) { |
874 | num_gpus_to_use = iter->second; |
875 | } |
876 | const auto& gpu_options = options.config.gpu_options(); |
877 | std::vector<CudaGpuId> visible_gpu_order; |
878 | TF_RETURN_IF_ERROR(ParseVisibleDeviceList(gpu_options.visible_device_list(), |
879 | &visible_gpu_order)); |
880 | |
881 | std::vector<CudaGpuId> valid_cuda_gpu_ids; |
882 | TF_RETURN_IF_ERROR(GetValidDeviceIds(visible_gpu_order, &valid_cuda_gpu_ids)); |
883 | if (num_gpus_to_use > valid_cuda_gpu_ids.size()) { |
884 | num_gpus_to_use = valid_cuda_gpu_ids.size(); |
885 | } |
886 | if (!valid_cuda_gpu_ids.empty()) { |
887 | // Save the original device. |
888 | int original_device = 0; |
889 | cudaError_t err = cudaGetDevice(&original_device); |
890 | if (err != cudaSuccess) { |
891 | return errors::Internal("cudaGetDevice() failed. Status: ", |
892 | cudaGetErrorString(err)); |
893 | } |
894 | // Force the CUDA runtime to initialize implicitly on each valid GPU |
895 | // before CreateGPUDevice(). |
896 | for (CudaGpuId cuda_gpu_id : valid_cuda_gpu_ids) { |
897 | err = cudaSetDevice(cuda_gpu_id.value()); |
898 | if (err != cudaSuccess) { |
899 | return errors::Internal("cudaSetDevice() on GPU:", cuda_gpu_id.value(), |
900 | " failed. Status: ", cudaGetErrorString(err)); |
901 | } |
902 | err = cudaFree(nullptr); |
903 | if (err != cudaSuccess) { |
904 | return errors::Internal( |
905 | "CUDA runtime implicit initialization on GPU:", cuda_gpu_id.value(), |
906 | " failed. Status: ", cudaGetErrorString(err)); |
907 | } |
908 | } |
909 | // Reset to the original device. |
910 | err = cudaSetDevice(original_device); |
911 | if (err != cudaSuccess) { |
912 | return errors::Internal("cudaSetDevice() on GPU:", original_device, |
913 | " failed. Status: ", cudaGetErrorString(err)); |
914 | } |
915 | } |
916 | |
917 | std::vector<InterconnectMap> interconnect_maps; |
918 | TF_RETURN_IF_ERROR( |
919 | GetInterconnectMaps(visible_gpu_order, gpu_manager, &interconnect_maps)); |
920 | |
921 | // Print each interconnect map to the log. |
922 | for (const InterconnectMap& im : interconnect_maps) { |
923 | LOG(INFO) << "Device interconnect " << im.name << " with strength " |
924 | << im.strength << " edge matrix:"; |
925 | string line_buf = "   "; |
926 | for (int i = 0; i < visible_gpu_order.size(); ++i) { |
927 | strings::StrAppend(&line_buf, visible_gpu_order[i].value(), " "); |
928 | } |
929 | LOG(INFO) << line_buf; |
930 | for (int i = 0; i < visible_gpu_order.size(); ++i) { |
931 | line_buf = strings::StrCat(visible_gpu_order[i].value(), ": "); |
932 | CudaGpuId cuda_id_i = visible_gpu_order[i]; |
933 | for (int j = 0; j < visible_gpu_order.size(); ++j) { |
934 | CudaGpuId cuda_id_j = visible_gpu_order[j]; |
935 | if (im.directed_links.find({cuda_id_i, cuda_id_j}) != |
936 | im.directed_links.end()) { |
937 | line_buf.append("Y "); |
938 | } else { |
939 | line_buf.append("N "); |
940 | } |
941 | } |
942 | LOG(INFO) << line_buf; |
943 | } |
944 | } |
945 | |
946 | const auto& virtual_devices = gpu_options.experimental().virtual_devices(); |
947 | if (!virtual_devices.empty()) { |
948 | TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings( |
949 | num_gpus_to_use, gpu_options, visible_gpu_order, valid_cuda_gpu_ids)); |
950 | // We've verified that num_gpus_to_use >= virtual_devices.size(). |
951 | num_gpus_to_use = virtual_devices.size(); |
952 | CHECK(gpu_options.visible_device_list().empty() || |
953 | valid_cuda_gpu_ids == visible_gpu_order); |
954 | } |
955 | int next_tf_gpu_id = 0; |
956 | std::vector<int64> memory_limit_bytes; |
957 | for (int i = 0; i < num_gpus_to_use; ++i) { |
958 | const CudaGpuId cuda_gpu_id = valid_cuda_gpu_ids[i]; |
959 | if (virtual_devices.empty() || |
960 | virtual_devices.Get(i).memory_limit_mb_size() == 0) { |
961 | int64 single_virtual_device_memory_limit = 0; |
962 | TF_RETURN_IF_ERROR(SingleVirtualDeviceMemoryLimit( |
963 | gpu_options, cuda_gpu_id, &single_virtual_device_memory_limit)); |
964 | memory_limit_bytes.push_back(single_virtual_device_memory_limit); |
965 | } else { |
966 | const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb(); |
967 | std::transform(memory_limit_mb.begin(), memory_limit_mb.end(), |
968 | std::back_inserter(memory_limit_bytes), [](float mb) { |
969 | return static_cast<int64>(mb) * (1ll << 20); |
970 | }); |
971 | } |
972 | while (next_tf_gpu_id < memory_limit_bytes.size()) { |
973 | TfGpuId tf_gpu_id(next_tf_gpu_id); |
974 | ++next_tf_gpu_id; |
975 | GpuIdManager::InsertTfCudaGpuIdPair(tf_gpu_id, cuda_gpu_id); |
976 | } |
977 | } |
978 | const int num_tf_gpus = next_tf_gpu_id; |
979 | |
980 | LocalityMap device_localities; |
981 | TF_RETURN_IF_ERROR( |
982 | GetDeviceLocalities(num_tf_gpus, interconnect_maps, &device_localities)); |
983 | |
984 | // Build the GPUDevices |
985 | CHECK_EQ(next_tf_gpu_id, memory_limit_bytes.size()); |
986 | for (int di = 0; di < num_tf_gpus; ++di) { |
987 | TfGpuId tf_gpu_id(di); |
988 | int64 bytes = memory_limit_bytes[di]; |
989 | auto it = device_localities.find(tf_gpu_id); |
990 | if (it == device_localities.end()) { |
991 | return errors::Internal("Failed to find DeviceLocality for GPU device ", |
992 | tf_gpu_id.value()); |
993 | } |
994 | TF_RETURN_IF_ERROR(CreateGPUDevice(options, name_prefix, tf_gpu_id, bytes, |
995 | it->second, devices)); |
996 | } |
997 | return Status::OK(); |
998 | } |
999 | |
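| // Returns a short human-readable description of the physical GPU: device |
| // ordinal, name, PCI bus id and compute capability. |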
1000 | static string GetShortDeviceDescription(CudaGpuId cuda_gpu_id, |
1001 | const gpu::DeviceDescription& desc) { |
1002 | int cc_major; |
1003 | int cc_minor; |
1004 | if (!desc.cuda_compute_capability(&cc_major, &cc_minor)) { |
1005 | cc_major = 0; |
1006 | cc_minor = 0; |
1007 | } |
1008 | // LINT.IfChange |
1009 | return strings::StrCat("device: ", cuda_gpu_id.value(), |
1010 | ", name: ", desc.name(), |
1011 | ", pci bus id: ", desc.pci_bus_id(), |
1012 | ", compute capability: ", cc_major, ".", cc_minor); |
1013 | // LINT.ThenChange(//tensorflow/python/platform/test.py) |
1014 | } |
1015 | |
1016 | Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options, |
1017 | const string& name_prefix, |
1018 | TfGpuId tf_gpu_id, |
1019 | int64 memory_limit, |
1020 | const DeviceLocality& dev_locality, |
1021 | std::vector<Device*>* devices) { |
1022 | CHECK_GE(tf_gpu_id.value(), 0); |
1023 | const string device_name = |
1024 | strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value()); |
1025 | GpuIdUtil::CheckValidTfGpuId(tf_gpu_id); |
1026 | CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id); |
1027 | int numa_node = dev_locality.numa_node(); |
1028 | |
1029 | gpu::StreamExecutor* se = |
1030 | GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); |
1031 | const gpu::DeviceDescription& desc = se->GetDeviceDescription(); |
1032 | ProcessState* process_state = ProcessState::singleton(); |
1033 | Allocator* gpu_allocator = process_state->GetGPUAllocator( |
1034 | options.config.gpu_options(), tf_gpu_id, memory_limit); |
1035 | if (gpu_allocator == nullptr) { |
1036 | return errors::Internal("Failed to get memory allocator for TF GPU ", |
1037 | tf_gpu_id.value(), " with ", memory_limit, |
1038 | " bytes of memory."); |
1039 | } |
1040 | AllocatorStats stats; |
1041 | gpu_allocator->GetStats(&stats); |
1042 | // 'memory_limit' is the requested memory size. However, if an allocator for |
1043 | // the given tf_gpu_id was created earlier, it is reused instead of creating a |
1044 | // new one (a TF GPU device is a shared resource), in which case the actual |
1045 | // limit used by that allocator, reported as 'stats.bytes_limit', may differ; |
1046 | // such a mismatch should be treated as an error. |
1047 | // |
1048 | // TODO(laigd): report error if memory_limit doesn't match stats.bytes_limit. |
1049 | BaseGPUDevice* gpu_device = CreateGPUDevice( |
1050 | options, device_name, static_cast<Bytes>(stats.bytes_limit), dev_locality, |
1051 | tf_gpu_id, GetShortDeviceDescription(cuda_gpu_id, desc), gpu_allocator, |
1052 | process_state->GetCPUAllocator(numa_node)); |
1053 | LOG(INFO) << "Created TensorFlow device (" << device_name << " with " |
1054 | << (stats.bytes_limit >> 20) << " MB memory) -> physical GPU (" |
1055 | << GetShortDeviceDescription(cuda_gpu_id, desc) << ")"; |
1056 | TF_RETURN_IF_ERROR(gpu_device->Init(options)); |
1057 | devices->push_back(gpu_device); |
1058 | |
1059 | return Status::OK(); |
1060 | } |
1061 | |
1062 | namespace { |
1063 | std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>> |
1064 | GetPeerAccessMap(gpu::Platform* platform, |
1065 | const std::vector<CudaGpuId>& visible_gpu_order) { |
1066 | std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>> map( |
1067 | new std::map<std::pair<CudaGpuId, CudaGpuId>, bool>); |
1068 | for (CudaGpuId cuda_gpu_i : visible_gpu_order) { |
1069 | for (CudaGpuId cuda_gpu_j : visible_gpu_order) { |
1070 | gpu::StreamExecutor* from = |
1071 | GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie(); |
1072 | gpu::StreamExecutor* to = |
1073 | GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie(); |
1074 | (*map)[{cuda_gpu_i, cuda_gpu_j}] = from->CanEnablePeerAccessTo(to); |
1075 | } |
1076 | } |
1077 | |
1078 | return map; |
1079 | } |
1080 | |
1081 | } // namespace |
1082 | |
1083 | Status BaseGPUDeviceFactory::GetInterconnectMaps( |
1084 | const std::vector<CudaGpuId>& visible_gpu_order, gpu::Platform* gpu_manager, |
1085 | std::vector<InterconnectMap>* maps) { |
1086 | // The default interconnect map is obtained from the StreamExecutor. |
1087 | auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order); |
1088 | maps->resize(1); |
1089 | InterconnectMap& imap = maps->at(0); |
1090 | imap.name = "StreamExecutor"; |
1091 | imap.strength = InterconnectMap::kStreamExecutorStrength; |
1092 | for (CudaGpuId cuda_id_i : visible_gpu_order) { |
1093 | for (CudaGpuId cuda_id_j : visible_gpu_order) { |
1094 | if (cuda_id_i == cuda_id_j) continue; |
1095 | if ((*access_map)[{cuda_id_i, cuda_id_j}]) { |
1096 | imap.directed_links.insert({cuda_id_i, cuda_id_j}); |
1097 | } |
1098 | } |
1099 | } |
1100 | return Status::OK(); |
1101 | } |
1102 | |
1103 | Status BaseGPUDeviceFactory::GetDeviceLocalities( |
1104 | int num_tf_gpus, const std::vector<InterconnectMap>& interconnects, |
1105 | LocalityMap* localities) { |
1106 | std::vector<TfGpuId> all_tf_gpu_ids; |
1107 | for (int i = 0; i < num_tf_gpus; ++i) { |
1108 | all_tf_gpu_ids.push_back(TfGpuId(i)); |
1109 | } |
1110 | for (TfGpuId tf_gpu_id : all_tf_gpu_ids) { |
1111 | CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id); |
1112 | // Get GPU bus_id from its reported NUMA affinity. Because GPUs are |
1113 | // virtualized in some environments, we can't just use the GPU id. |
1114 | // NUMA locales are indexed from 0, buses are indexed from 1. |
1115 | gpu::StreamExecutor* se = |
1116 | GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); |
1117 | const gpu::DeviceDescription& desc = se->GetDeviceDescription(); |
1118 | int numa_node = desc.numa_node(); |
1119 | if (numa_node < 0) { |
1120 | // For some reason the StreamExecutor couldn't get the NUMA |
1121 | // affinity of the GPU. If this is not a multi-socket mobo with |
1122 | // GPUs local to different buses, it doesn't matter. If it is, we |
1123 | // may run into trouble later with data transfer operations. The |
1124 | // trouble may manifest as slower than expected performance, or |
1125 | // outright failures. |
1126 | LOG(INFO) << "Could not identify NUMA node of CUDA gpu id " << cuda_gpu_id |
1127 | << ", defaulting to 0. Your kernel may not have been built " |
1128 | << "with NUMA support."; |
1129 | numa_node = 0; |
1130 | } |
1131 | DeviceLocality dev_locality; |
1132 | dev_locality.set_numa_node(numa_node); |
1133 | dev_locality.set_bus_id(numa_node + 1); |
1134 | |
1135 | // Set LocalLinks from InterconnectMaps. |
1136 | LocalLinks* links = dev_locality.mutable_links(); |
1137 | for (const InterconnectMap& imap : interconnects) { |
1138 | for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) { |
1139 | CudaGpuId cuda_gpu_dst = GpuIdManager::TfToCudaGpuId(tf_gpu_dst); |
1140 | if (imap.directed_links.find({cuda_gpu_id, cuda_gpu_dst}) != |
1141 | imap.directed_links.end()) { |
1142 | InterconnectLink* ilink = links->add_link(); |
1143 | ilink->set_device_id(tf_gpu_dst.value()); |
1144 | ilink->set_type(imap.name); |
1145 | ilink->set_strength(imap.strength); |
1146 | } |
1147 | } |
1148 | } |
1149 | |
1150 | // If this is one of multiple virtual GPUs on the same physical GPU, |
1151 | // add high-strength links to the others. |
1152 | for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) { |
1153 | if (tf_gpu_id == tf_gpu_dst) continue; |
1154 | CudaGpuId cuda_gpu_dst = GpuIdManager::TfToCudaGpuId(tf_gpu_dst); |
1155 | if (cuda_gpu_id == cuda_gpu_dst) { |
1156 | InterconnectLink* ilink = links->add_link(); |
1157 | ilink->set_device_id(tf_gpu_dst.value()); |
1158 | ilink->set_type("SAME_DEVICE"); |
1159 | ilink->set_strength(InterconnectMap::kSameDeviceStrength); |
1160 | } |
1161 | } |
1162 | |
1163 | (*localities)[tf_gpu_id] = dev_locality; |
1164 | VLOG(1) << "GPUDevice CudaGpuId " << cuda_gpu_id << " TfGpuId " << tf_gpu_id |
1165 | << " on bus " << dev_locality.bus_id() << " numa: " << numa_node |
1166 | << " pci: " << desc.pci_bus_id() |
1167 | << " DeviceLocality: " << dev_locality.DebugString(); |
1168 | } |
1169 | return Status::OK(); |
1170 | } |
1171 | |
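| // Returns the default threshold used to filter out slow GPUs: the smaller of |
| // 8 and the highest multiprocessor count among the visible GPUs. |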
1172 | static int GetDefaultMinGPUMultiprocessorCount( |
1173 | gpu::Platform* gpu_manager, |
1174 | const std::vector<CudaGpuId>& visible_gpu_order) { |
1175 | static const int kDefaultMinGPUMultiprocessorCount = 8; |
1176 | |
1177 | // Find the highest multi-processor count across all visible GPUs. |
1178 | int max_count = -1; |
1179 | for (int i = 0; i < visible_gpu_order.size(); ++i) { |
1180 | auto exec_status = |
1181 | GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_order[i]); |
1182 | if (!exec_status.ok()) { |
1183 | continue; |
1184 | } |
1185 | |
1186 | gpu::StreamExecutor* se = exec_status.ValueOrDie(); |
1187 | const gpu::DeviceDescription& desc = se->GetDeviceDescription(); |
1188 | max_count = std::max(max_count, desc.core_count()); |
1189 | } |
1190 | |
1191 | if (max_count < 0 || kDefaultMinGPUMultiprocessorCount < max_count) { |
1192 | return kDefaultMinGPUMultiprocessorCount; |
1193 | } else { |
1194 | return max_count; |
1195 | } |
1196 | } |
1197 | |
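| // Returns the multiprocessor-count threshold, honoring the |
| // TF_MIN_GPU_MULTIPROCESSOR_COUNT environment variable when it parses to a |
| // non-negative integer and falling back to the default otherwise. |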
1198 | static int GetMinGPUMultiprocessorCount( |
1199 | gpu::Platform* gpu_manager, |
1200 | const std::vector<CudaGpuId>& visible_gpu_order) { |
1201 | const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT"); |
1202 | |
1203 | if (tf_min_gpu_core_count == nullptr || |
1204 | strcmp(tf_min_gpu_core_count, "") == 0) { |
1205 | return GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order); |
1206 | } |
1207 | |
1208 | int min_gpu_core_count = -1; |
1209 | if (strings::safe_strto32(tf_min_gpu_core_count, &min_gpu_core_count)) { |
1210 | if (min_gpu_core_count >= 0) { |
1211 | return min_gpu_core_count; |
1212 | } |
1213 | } |
1214 | |
1215 | int count = |
1216 | GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order); |
1217 | LOG(ERROR) << "Invalid minimum GPU multiprocessor count: [" |
1218 | << tf_min_gpu_core_count << "]. " |
1219 | << "Using the default value: " << count; |
1220 | return count; |
1221 | } |
1222 | |
1223 | namespace { |
1224 | |
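| // A CUDA compute capability expressed as a (major, minor) version pair. |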
1225 | struct CudaVersion { |
1226 | // Initialize from version_name in the form of "3.5" |
1227 | explicit CudaVersion(const std::string& version_name) { |
1228 | size_t dot_pos = version_name.find('.'); |
1229 | CHECK(dot_pos != string::npos) |
1230 | << "Illegal version name: [" << version_name << "]"; |
1231 | string major_str = version_name.substr(0, dot_pos); |
1232 | CHECK(strings::safe_strto32(major_str, &major_part)) |
1233 | << "Illegal version name: [" << version_name << "]"; |
1234 | string minor_str = version_name.substr(dot_pos + 1); |
1235 | CHECK(strings::safe_strto32(minor_str, &minor_part)) |
1236 | << "Illegal version name: [" << version_name << "]"; |
1237 | } |
1238 | CudaVersion() {} |
1239 | bool operator<(const CudaVersion& other) const { |
1240 | if (this->major_part != other.major_part) { |
1241 | return this->major_part < other.major_part; |
1242 | } |
1243 | return this->minor_part < other.minor_part; |
1244 | } |
1245 | friend std::ostream& operator<<(std::ostream& os, |
1246 | const CudaVersion& version) { |
1247 | os << version.major_part << "." << version.minor_part; |
1248 | return os; |
1249 | } |
1250 | int major_part = -1; |
1251 | int minor_part = -1; |
1252 | }; |
1253 | |
1254 | std::vector<CudaVersion> supported_cuda_compute_capabilities = { |
1255 | TF_CUDA_CAPABILITIES,}; |
1256 | |
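| // Returns the compute capabilities this binary was built to support, plus |
| // any extras supplied via the TF_EXTRA_CUDA_CAPABILITIES build macro. |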
1257 | std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() { |
1258 | auto cuda_caps = supported_cuda_compute_capabilities; |
1259 | #ifdef TF_EXTRA_CUDA_CAPABILITIES |
1260 | // TF_EXTRA_CUDA_CAPABILITIES should be defined as a comma-separated sequence, |
1261 | // for example: |
1262 | // TF_EXTRA_CUDA_CAPABILITIES=3.0,4.0,5.0 |
1263 | // Use two-level macro expansion for stringification. |
1264 | #define TF_XSTRING(...) #__VA_ARGS__ |
1265 | #define TF_STRING(s) TF_XSTRING(s) |
1266 | string extra_cuda_caps = TF_STRING(TF_EXTRA_CUDA_CAPABILITIES); |
1267 | #undef TF_STRING |
1268 | #undef TF_XSTRING |
1269 | auto extra_capabilities = str_util::Split(extra_cuda_caps, ','); |
1270 | for (const auto& capability : extra_capabilities) { |
1271 | cuda_caps.push_back(CudaVersion(capability)); |
1272 | } |
1273 | #endif |
1274 | return cuda_caps; |
1275 | } |
1276 | |
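| // Tries to enable peer access between every ordered pair of visible GPUs and |
| // returns an error only if peering was reported possible yet none of the |
| // attempts succeeded. |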
1277 | Status EnablePeerAccess(gpu::Platform* platform, |
1278 | const std::vector<CudaGpuId>& visible_gpu_order) { |
1279 | int possible_peer_count = 0; |
1280 | int enabled_peer_count = 0; |
1281 | for (int i = 0; i < visible_gpu_order.size(); ++i) { |
1282 | const CudaGpuId cuda_gpu_i = visible_gpu_order[i]; |
1283 | for (int j = 0; j < visible_gpu_order.size(); ++j) { |
1284 | const CudaGpuId cuda_gpu_j = visible_gpu_order[j]; |
1285 | // We have already validated that ExecutorForDevice() calls return OK. |
1286 | gpu::StreamExecutor* from = |
1287 | GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie(); |
1288 | gpu::StreamExecutor* to = |
1289 | GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie(); |
1290 | |
1291 | if (from->CanEnablePeerAccessTo(to)) { |
1292 | ++possible_peer_count; |
1293 | auto status = from->EnablePeerAccessTo(to); |
1294 | if (!status.ok()) { |
1295 | LOG(WARNING) |
1296 | << "Unable to enable peer access between device ordinals " |
1297 | << cuda_gpu_i << " and " << cuda_gpu_j << ", status: " << status; |
1298 | } else { |
1299 | ++enabled_peer_count; |
1300 | } |
1301 | } |
1302 | } |
1303 | } |
1304 | |
1305 | // Return an error in the extreme failure case where the driver |
1306 | // reported that peering was possible but not a single peering was |
1307 | // successful. This is to catch possible system misconfigurations |
1308 | // or more fundamental issues. |
1309 | if (possible_peer_count > 0 && enabled_peer_count == 0) { |
1310 | return errors::Internal(possible_peer_count, |
1311 | " potential peer access pairs were reported by the " |
1312 | "driver, but no peering could be enabled."); |
1313 | } |
1314 | return Status::OK(); |
1315 | } |
1316 | |
1317 | } // namespace |
1318 | |
1319 | Status BaseGPUDeviceFactory::GetValidDeviceIds( |
1320 | const std::vector<CudaGpuId>& visible_gpu_order, |
1321 | std::vector<CudaGpuId>* ids) { |
1322 | gpu::Platform* gpu_manager = GPUMachineManager(); |
1323 | bool new_gpu_found = false; |
1324 | for (int i = 0; i < visible_gpu_order.size(); ++i) { |
1325 | const CudaGpuId cuda_gpu_id = visible_gpu_order[i]; |
1326 | |
1327 | // Only perform this once per visible cuda gpu id. |
1328 | if (visible_gpu_initialized_[cuda_gpu_id.value()]) { |
1329 | continue; |
1330 | } |
1331 | |
1332 | visible_gpu_initialized_[cuda_gpu_id.value()] = true; |
1333 | new_gpu_found = true; |
1334 | |
1335 | auto executor = GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, cuda_gpu_id); |
1336 | if (!executor.ok()) { |
1337 | return StreamExecutorUtil::ConvertStatus(executor.status()); |
1338 | } |
1339 | |
1340 | auto stream_exec = executor.ValueOrDie(); |
1341 | int64 free_bytes; |
1342 | int64 total_bytes; |
1343 | if (!stream_exec->DeviceMemoryUsage(&free_bytes, &total_bytes)) { |
1344 | // Logs internally on failure. |
1345 | free_bytes = 0; |
1346 | total_bytes = 0; |
1347 | } |
1348 | const auto& description = stream_exec->GetDeviceDescription(); |
1349 | int cc_major; |
1350 | int cc_minor; |
1351 | if (!description.cuda_compute_capability(&cc_major, &cc_minor)) { |
1352 | // Logs internally on failure. |
1353 | cc_major = 0; |
1354 | cc_minor = 0; |
1355 | } |
1356 | LOG(INFO) << "Found device " << i << " with properties: " |
1357 | << "\nname: " << description.name() << " major: " << cc_major |
1358 | << " minor: " << cc_minor |
1359 | << " memoryClockRate(GHz): " << description.clock_rate_ghz() |
1360 | << "\npciBusID: " << description.pci_bus_id() << "\ntotalMemory: " |
1361 | << strings::HumanReadableNumBytes(total_bytes) |
1362 | << " freeMemory: " << strings::HumanReadableNumBytes(free_bytes); |
1363 | } |
1364 | // Check peering and show the matrix if more than one GPU was found. |
1365 | if (new_gpu_found && visible_gpu_order.size() > 1) { |
1366 | // Enable peer access |
1367 | TF_RETURN_IF_ERROR(EnablePeerAccess(gpu_manager, visible_gpu_order)); |
1368 | } |
1369 | |
1370 | auto cuda_supported_capabilities = GetSupportedCudaComputeCapabilities(); |
1371 | if (cuda_supported_capabilities.empty()) { |
1372 | return errors::FailedPrecondition( |
1373 | "No supported cuda capabilities in binary."); |
1374 | } |
1375 | CudaVersion min_supported_capability = *std::min_element( |
1376 | cuda_supported_capabilities.begin(), cuda_supported_capabilities.end()); |
1377 | |
1378 | int min_gpu_core_count = |
1379 | GetMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order); |
1380 | |
1381 | // Filter out devices that don't have the right capability or power. |
1382 | for (int i = 0; i < visible_gpu_order.size(); ++i) { |
1383 | const CudaGpuId visible_gpu_id = visible_gpu_order[i]; |
1384 | auto exec_status = |
1385 | GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_id); |
1386 | if (!exec_status.ok()) { |
1387 | LOG(INFO) << "Ignoring visible gpu device " << visible_gpu_id |
1388 | << " whose executor is in invalid state: " |
1389 | << exec_status.status().ToString(); |
1390 | continue; |
1391 | } |
1392 | gpu::StreamExecutor* se = exec_status.ValueOrDie(); |
1393 | const gpu::DeviceDescription& desc = se->GetDeviceDescription(); |
1394 | CudaVersion device_capability; |
1395 | if (!desc.cuda_compute_capability(&device_capability.major_part, |
1396 | &device_capability.minor_part)) { |
1397 | LOG(INFO) << "Ignoring visible gpu device " |
1398 | << "(" << GetShortDeviceDescription(visible_gpu_id, desc) |
1399 | << ") " |
1400 | << "whose CUDA compute capability is not available."; |
1401 | continue; |
1402 | } |
1403 | // Only GPUs with at least the minimum supported compute capability are |
1404 | // accepted. |
1405 | if (device_capability < min_supported_capability) { |
1406 | LOG(INFO) << "Ignoring visible gpu device " |
1407 | << "(" << GetShortDeviceDescription(visible_gpu_id, desc) |
1408 | << ") " |
1409 | << "with Cuda compute capability " << device_capability |
1410 | << ". The minimum required Cuda capability is " |
1411 | << min_supported_capability << "."; |
1412 | continue; |
1413 | } |
1414 | |
1415 | // Filter out slow GPUs. By default, GPUs with a lower multiprocessor |
1416 | // count than the fastest GPU are filtered out, unless they have 8 or more |
1417 | // multiprocessors. If the TF_MIN_GPU_MULTIPROCESSOR_COUNT environment |
1418 | // variable is set, its value will be used to filter out GPUs. |
1419 | if (desc.core_count() < min_gpu_core_count) { |
1420 | LOG(INFO) << "Ignoring visible gpu device " |
1421 | << "(" << GetShortDeviceDescription(visible_gpu_id, desc) |
1422 | << ") " |
1423 | << "with Cuda multiprocessor count: " << desc.core_count() |
1424 | << ". The minimum required count is " << min_gpu_core_count |
1425 | << ". You can adjust this requirement with the env var " |
1426 | "TF_MIN_GPU_MULTIPROCESSOR_COUNT."; |
1427 | continue; |
1428 | } |
1429 | ids->push_back(visible_gpu_id); |
1430 | } |
1431 | if (!ids->empty()) { |
1432 | std::vector<int> raw_ids(ids->size()); |
1433 | std::transform(ids->begin(), ids->end(), raw_ids.begin(), |
1434 | [](CudaGpuId id) -> int { return id.value(); }); |
1435 | LOG(INFO) << "Adding visible gpu devices: " |
1436 | << str_util::Join(raw_ids, ", "); |
1437 | } |
1438 | |
1439 | return Status::OK(); |
1440 | } |
1441 | |
1442 | } // namespace tensorflow |
1443 | |
1444 | #endif // GOOGLE_CUDA |
1445 | |