/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// TODO(opensource): Use a more generic sounding preprocessor name than
// GOOGLE_CUDA
#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include "tensorflow/core/common_runtime/gpu/gpu_device.h"

#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <list>
#include <map>
#include <tuple>
#include <vector>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h"
#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
#include "tensorflow/core/common_runtime/gpu/process_state.h"
#include "tensorflow/core/common_runtime/gpu_device_context.h"
#include "tensorflow/core/common_runtime/local_device.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/variant_op_registry.h"
#include "tensorflow/core/graph/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/stl_util.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/cuda.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/tracing.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/public/session_options.h"
#include "tensorflow/core/util/device_name_utils.h"
#include "tensorflow/core/util/env_var.h"
#include "tensorflow/core/util/stream_executor_util.h"

#if !defined(PLATFORM_GOOGLE)
#include "cuda/cuda_config.h"
#endif

namespace tensorflow {

// Eigen Ops directly allocate memory only for temporary buffers used
// during OpKernel::Compute(). The recommended way of allocating such
// memory is via OpKernelContext::allocate_temp(). However, Eigen Ops
// don't have access to OpKernelContext; instead, they get access to
// memory directly through the device allocator. As an open-source
// project, Eigen assumes allocator semantics similar to those of the
// CUDA memory allocator, and may not work correctly due to race
// conditions if used with some other allocator. For safety, we need
// to delay deallocation calls out of Eigen until all events on the
// corresponding stream have completed. The following two classes
// serve this purpose in two different compilation environments.
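// In the CUDA build, EigenCudaStreamDevice::deallocate() below implements this
// by handing the buffer to asyncFree() via cudaStreamAddCallback(), so the
// memory is returned to the allocator only after all work previously enqueued
// on the stream has completed.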

class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
 public:
  EigenCudaStreamDevice()
      : scratch_(nullptr), semaphore_(nullptr), context_(nullptr) {
    Eigen::initializeDeviceProp();
  }
  ~EigenCudaStreamDevice() override {}
  void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
                    TfGpuId tf_gpu_id, ::tensorflow::Allocator* alloc,
                    char* scratch) {
    if (LogMemory::IsEnabled()) {
      operation_ = context->op_kernel().name() + "/EigenAllocator";
      step_id_ = context->step_id();
    }
    context_ = context;
    scratch_ = scratch;
    semaphore_ =
        reinterpret_cast<unsigned int*>(scratch + Eigen::kCudaScratchSize);
    stream_ = cuda_stream;
    allocator_ = alloc;
    const int cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id).value();
    device_prop_ = &Eigen::m_deviceProperties[cuda_gpu_id];
  }

  const cudaStream_t& stream() const override { return *stream_; }
  const cudaDeviceProp& deviceProperties() const override {
    return *device_prop_;
  }

  void* allocate(size_t num_bytes) const override {
    void* ret = allocator_->AllocateRaw(32 /* alignment */, num_bytes);
    if (ret == nullptr) {
      if (context_) {
        context_->SetStatus(errors::ResourceExhausted(
            strings::StrCat("Ran out of GPU memory when allocating ", num_bytes,
                            " bytes for ", operation_)));
      } else {
        LOG(FATAL)
            << "EigenAllocator for GPU ran out of memory when allocating "
            << num_bytes << ". See error logs for more detailed info.";
      }
    }
    if (LogMemory::IsEnabled() && ret != nullptr) {
      LogMemory::RecordRawAllocation(operation_, step_id_, num_bytes, ret,
                                     allocator_);
    }
    return ret;
  }
  void deallocate(void* buffer) const override {
    if (LogMemory::IsEnabled() && buffer != nullptr) {
      LogMemory::RecordRawDeallocation(operation_, step_id_, buffer, allocator_,
                                       true);
    }
    AsyncFreeData* afData =
        new AsyncFreeData(allocator_, buffer, operation_, step_id_);
    cudaError_t err = cudaStreamAddCallback(*stream_, asyncFree, afData, 0);
    CHECK_EQ(err, cudaSuccess);
  }

  // Return a pointer to a per-stream scratchpad of 1024 bytes residing
  // in global memory.
  void* scratchpad() const override { return scratch_; }

  // Return a semaphore. The semaphore is initialized to 0, and each
  // kernel using it is responsible for resetting it to 0 upon completion,
  // to maintain the invariant that the semaphore is always equal to 0 at
  // the start of each kernel launch.
  unsigned int* semaphore() const override { return semaphore_; }

 private:
  struct AsyncFreeData {
    AsyncFreeData(::tensorflow::Allocator* a, void* p, const string& o,
                  const int64 s)
        : allocator_(a), address_(p), operation_(o), step_id_(s) {}
    ::tensorflow::Allocator* allocator_;
    void* address_;
    const string operation_;
    const int64 step_id_;
  };

  static void CUDART_CB asyncFree(cudaStream_t stream, cudaError_t status,
                                  void* userData) {
    AsyncFreeData* data = static_cast<AsyncFreeData*>(userData);
    if (LogMemory::IsEnabled()) {
      LogMemory::RecordRawDeallocation(data->operation_, data->step_id_,
                                       data->address_, data->allocator_, false);
    }
    data->allocator_->DeallocateRaw(data->address_);
    delete data;
  }

  string operation_;
  int64 step_id_;
  const cudaStream_t* stream_;          // Not owned.
  const cudaDeviceProp* device_prop_;   // Not owned.
  ::tensorflow::Allocator* allocator_;  // Not owned.
  mutable char* scratch_;
  mutable unsigned int* semaphore_;
  OpKernelContext* context_;

  TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice);
};

// This factory helps to ensure that different GPU device objects that refer to
// the same physical device and stream group id use the same stream group
// object (and therefore the same CUDA streams). This is necessary since there
// is a single memory allocator per device (see ProcessState::GetGPUAllocator)
// and allocators must not be shared across streams.
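// For example, two virtual GPU devices that are backed by the same physical
// GPU and that use stream group 0 will get the same StreamGroup from
// GetOrCreate(), and therefore share the same compute and copy streams.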
class BaseGPUDevice::StreamGroupFactory {
 public:
  // Returns the unique stream group for use with the stream defined by
  // {tf_gpu_id, stream_group_within_gpu}, creating it if it does not yet
  // exist.
  // This function is thread safe.
  BaseGPUDevice::StreamGroup* GetOrCreate(TfGpuId tf_gpu_id,
                                          int stream_group_within_gpu,
                                          gpu::StreamExecutor* executor) {
    mutex_lock guard(lock_);
    StreamGroup* group =
        &streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)];
    if (!group->compute) {
      group->compute = new gpu::Stream(executor);
      group->compute->Init();
      VLOG(2) << "Created stream[" << stream_group_within_gpu
              << "] = " << group->compute;

      group->host_to_device = new gpu::Stream(executor);
      group->host_to_device->Init();
      VLOG(2) << "Created host_to_device_stream[" << stream_group_within_gpu
              << "] = " << group->host_to_device;

      group->device_to_host = new gpu::Stream(executor);
      group->device_to_host->Init();
      VLOG(2) << "Created device_to_host_stream[" << stream_group_within_gpu
              << "] = " << group->device_to_host;

      group->device_to_device = new gpu::Stream(executor);
      group->device_to_device->Init();
      VLOG(2) << "Created device_to_device_stream[" << stream_group_within_gpu
226 << "] = " << group->device_to_host;
    }
    return group;
  }

  // Returns a reference to the StreamGroupFactory singleton. Note that this is
  // never destroyed, so the objects it owns are never deleted.
  static StreamGroupFactory& Global() {
    static StreamGroupFactory* instance = new StreamGroupFactory();
    return *instance;
  }

 private:
  mutex lock_;
  using key_type = std::tuple<int, int>;
  std::map<key_type, StreamGroup> streams_;

  // StreamGroupFactory cannot be created directly; call
  // StreamGroupFactory::Global() to get the global instance.
  StreamGroupFactory() = default;
  TF_DISALLOW_COPY_AND_ASSIGN(StreamGroupFactory);
};

BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
                             Bytes memory_limit, const DeviceLocality& locality,
                             TfGpuId tf_gpu_id,
                             const string& physical_device_desc,
                             Allocator* gpu_allocator, Allocator* cpu_allocator,
                             bool sync_every_op, int32 max_streams)
    : LocalDevice(options, Device::BuildDeviceAttributes(name, DEVICE_GPU,
                                                         memory_limit, locality,
                                                         physical_device_desc)),
      gpu_allocator_(gpu_allocator),
      cpu_allocator_(cpu_allocator),
      scoped_allocator_mgr_(new ScopedAllocatorMgr(name)),
      tf_gpu_id_(tf_gpu_id),
      sync_every_op_(sync_every_op),
      max_streams_(max_streams) {
  ProcessState::singleton()->EnableGPUDevice();
}

BaseGPUDevice::~BaseGPUDevice() {
  delete gpu_device_info_;
  for (auto ctx : device_contexts_) ctx->Unref();
}

Status BaseGPUDevice::Init(const SessionOptions& options) {
  auto executor_status = GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id_);
  if (!executor_status.status().ok()) {
    return errors::Internal("Failed to get StreamExecutor for device ",
                            tf_gpu_id_.value());
  }

  executor_ = executor_status.ValueOrDie();
  em_.reset(new EventMgr(executor_, options.config.gpu_options()));

  if (max_streams_ < 1) {
    return errors::InvalidArgument("Invalid value for max_streams.");
  }

  // Create the specified number of GPU streams
  for (int i = 0; i < max_streams_; i++) {
    streams_.push_back(
        StreamGroupFactory::Global().GetOrCreate(tf_gpu_id_, i, executor_));

    size_t scratch_buffer_size = Eigen::kCudaScratchSize + sizeof(unsigned int);
    void* scratch_buffer = gpu_allocator_->AllocateRaw(
        Allocator::kAllocatorAlignment, scratch_buffer_size);
    if (scratch_buffer == nullptr) {
      return errors::FailedPrecondition(
          "Failed to allocate scratch buffer for device ", tf_gpu_id_.value());
    }
    scratch_.push_back(static_cast<char*>(scratch_buffer));

    perftools::gputools::DeviceMemory<char> mem(
        perftools::gputools::DeviceMemoryBase(scratch_buffer,
                                              scratch_buffer_size));

    bool ok = executor_->SynchronousMemZero(
        &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
    if (!ok) {
      return errors::FailedPrecondition(
308 "Failed to memcopy into scratch buffer for device ",
          tf_gpu_id_.value());
    }

    device_contexts_.push_back(new GPUDeviceContext(
        i, streams_.back()->compute, streams_.back()->host_to_device,
        streams_.back()->device_to_host, streams_.back()->device_to_device));
  }
  gpu_device_info_ = new GpuDeviceInfo;
  gpu_device_info_->stream = streams_[0]->compute;
  gpu_device_info_->default_context = device_contexts_[0];
  gpu_device_info_->event_mgr = em_.get();
  gpu_device_info_->gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id_).value();
  set_tensorflow_gpu_device_info(gpu_device_info_);

  // Whether and how the GPU device uses its own threadpool.
  // This option is experimental. Once we confirm the best setting, we
  // may change the default behavior and completely remove this flag.
  // Default values might change in future releases.
  // Possible values:
  //   * global: GPU uses threads shared with CPU in the main compute
  //     thread-pool. This is currently the default.
  //   * gpu_private: GPU uses threads dedicated to this device.
  //   * gpu_shared: All GPUs share a dedicated thread pool.
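  // For example (shell syntax), to give this device two dedicated threads:
  //   export TF_GPU_THREAD_MODE=gpu_private
  //   export TF_GPU_THREAD_COUNT=2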
  string gpu_thread_mode;
  TF_RETURN_IF_ERROR(
      ReadStringFromEnvVar("TF_GPU_THREAD_MODE", "global", &gpu_thread_mode));
  gpu_thread_mode = str_util::Lowercase(gpu_thread_mode);
  if (gpu_thread_mode != "global") {
    int64 gpu_thread_count = -1;
    // Default to two threads. One for device compute and another for memory
    // copies.
    TF_RETURN_IF_ERROR(
        ReadInt64FromEnvVar("TF_GPU_THREAD_COUNT", 2, &gpu_thread_count));
    if (gpu_thread_mode == "gpu_private") {
      // TODO(zhengxq): since these threads only serve a single GPU device,
      // we should set the device context once for each thread, and avoid
      // setting them for each kernel.
      // TODO(zhengxq): pin the thread to the same socket of the target GPU.
      thread_pool_.reset(new thread::ThreadPool(
          options.env, strings::StrCat("gpu_private_", tf_gpu_id_.value()),
          static_cast<int32>(gpu_thread_count)));
      set_tensorflow_device_thread_pool(thread_pool_.get());
    } else if (gpu_thread_mode == "gpu_shared") {
      static thread::ThreadPool* thread_pool = new thread::ThreadPool(
          options.env, "gpu_shared", static_cast<int32>(gpu_thread_count));
      set_tensorflow_device_thread_pool(thread_pool);
    } else {
      string error_message =
          strings::StrCat("Invalid gpu_thread_mode: ", gpu_thread_mode);
      LOG(WARNING) << error_message;
      return errors::InvalidArgument(error_message);
    }
  }

  return Status::OK();
}

bool BaseGPUDevice::RequiresRecordingAccessedTensors() const {
  // When there is no more than one stream, we release the tensor reference
  // at the end of the kernel launch, instead of at the end of the kernel
  // execution.
  return streams_.size() > 1;
}

Status BaseGPUDevice::FillContextMap(const Graph* graph,
                                     DeviceContextMap* device_context_map) {
  VLOG(2) << "FillContextMap";

  const size_t num_streams = streams_.size();
  // Special case for single stream.
  if (num_streams == 1) {
    return Status::OK();
  }
  const int64 before = Env::Default()->NowMicros();
  gpu_stream_util::AssignStreamsOpts opts;
  opts.max_streams = static_cast<int32>(num_streams);
  std::unordered_map<int, int> node_to_stream_id;
  TF_RETURN_IF_ERROR(
      gpu_stream_util::AssignStreams(graph, opts, &node_to_stream_id));
  int64 elapsed = Env::Default()->NowMicros() - before;
  VLOG(3) << "AssignStreams took " << elapsed << "us";

  // Fill in the context map. It is OK for this map to contain
  // duplicate DeviceContexts so long as we increment the refcount.
  device_context_map->resize(graph->num_node_ids());
  for (Node* n : graph->nodes()) {
    auto mapped_stream = node_to_stream_id[n->id()];
    CHECK_LE(mapped_stream, num_streams);
    auto ctx = device_contexts_[mapped_stream];
    VLOG(3) << "Assigned stream " << node_to_stream_id[n->id()]
            << " ==> stream[" << ctx->stream_id() << "] for node id " << n->id()
            << " " << n->type_string() << " " << n->name();
    ctx->Ref();
    (*device_context_map)[n->id()] = ctx;
  }

  return Status::OK();
}

void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
  // ScopedActivity is cheap when tracing is not active, but we can still
  // avoid computing the Hash64 when it is not needed.
  // TODO(pbar) This would no longer be needed if Ops have a unique id.
  const uint64 id = port::Tracing::IsActive() ? Hash64(op_kernel->name()) : 0;
  port::Tracing::ScopedActivity region(port::Tracing::EventCategory::kCompute,
                                       id);

  // NOTE(tucker): We need to discriminate between Eigen GPU
  // operations and all others. If an operation is Eigen
  // implemented (or otherwise tries to launch a cuda kernel
  // directly), we need to establish a stacked-scoped environment
  // that directs it to execute on the proper device. Otherwise we
  // expect the Op to use StreamExecutor directly and correctly. The
  // way we make this discrimination is quite hacky: At the moment
  // the only non-Eigen GPU Op is the recv-op, which is known to be
  // asynchronous.
  if (op_kernel->is_internal() && op_kernel->type_string() == "_Recv") {
    context->SetStatus(errors::Internal(
        "Invalid synchronous 'Compute' on GPU for '_Recv' op"));
  } else if (port::Tracing::ScopedAnnotation::Enabled()) {
    port::Tracing::ScopedAnnotation annotation(op_kernel->name(),
                                               op_kernel->type_string());
    ComputeHelper(op_kernel, context);
  } else {
    ComputeHelper(op_kernel, context);
  }
}

void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
                                  OpKernelContext* context) {
  GPUDeviceContext* gpu_device_context = device_contexts_[0];
  if (context->op_device_context() != nullptr) {
    gpu_device_context =
        static_cast<GPUDeviceContext*>(context->op_device_context());
  }
  gpu::Stream* stream = gpu_device_context->stream();
  const auto stream_id = gpu_device_context->stream_id();

  const bool vlog_1 = VLOG_IS_ON(1);
  const bool vlog_2 = vlog_1 && VLOG_IS_ON(2);

  if (vlog_1) {
    VLOG(1) << "GpuDevice::Compute " << op_kernel->name() << " op "
            << op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream["
            << stream_id << "]";
  }

  const auto num_streams = streams_.size();
  if (num_streams > 1) {
    // If this op's device context is different from the other contexts,
    // we must wait on the stream.
    for (int i = 0; i < context->num_inputs(); ++i) {
      const GPUDeviceContext* idc =
          static_cast<GPUDeviceContext*>(context->input_device_context(i));
      OP_REQUIRES(context, idc != nullptr,
                  errors::Internal("Input device context ", i,
                                   " was not set properly."));
      if (vlog_2) {
        const void* base;
        size_t len;
        if (context->has_input(i)) {
          if (IsRefType(context->input_dtype(i))) {
            Tensor tensor = context->mutable_input(i, false);
            base = DMAHelper::base(&tensor);
            len = tensor.TotalBytes();
          } else {
            const Tensor& tensor = context->input(i);
            base = DMAHelper::base(&tensor);
            len = tensor.TotalBytes();
          }
          LOG(INFO) << "Input " << i << " " << base << " " << len;
          LOG(INFO) << " stream[" << stream_id << "].ThenWaitFor(stream["
                    << idc->stream_id() << "])"
                    << ((idc->stream() == stream) ? " not needed" : "");
        }
      }
      if (idc->stream() != stream) stream->ThenWaitFor(idc->stream());
    }
  }
  gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
  op_kernel->Compute(context);
  if (context->status().ok()) {
    if (sync_every_op_) {
      // Note: GPUUtil::Sync() only syncs the default stream.
      // We need to either sync the stream used by this op, or
      // all streams. Given that this flag is typically used for
      // debugging it makes more sense to sync all GPU activity.
      context->SetStatus(GPUUtil::SyncAll(this));
    }
  }
}

void BaseGPUDevice::ConsumeListOfAccessedTensors(
    DeviceContext* device_context, const TensorReferenceVector& tensor_refs) {
  GPUDeviceContext* gpu_device_context = device_contexts_[0];
  if (device_context != nullptr) {
    gpu_device_context = static_cast<GPUDeviceContext*>(device_context);
  }
  gpu::Stream* stream = gpu_device_context->stream();
  em_->ThenDeleteTensors(stream, tensor_refs);
}

// Based on the semantics of Device::Sync this call should wait for
// all streams not just the current one.
Status BaseGPUDevice::Sync() { return GPUUtil::SyncAll(this); }

void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
                                 OpKernelContext* context,
                                 AsyncOpKernel::DoneCallback done) {
  GPUDeviceContext* gpu_device_context = device_contexts_[0];
  if (context->op_device_context() != nullptr) {
    gpu_device_context =
        static_cast<GPUDeviceContext*>(context->op_device_context());
  }
  gpu::Stream* stream = gpu_device_context->stream();
  const auto stream_id = gpu_device_context->stream_id();

  VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op "
          << op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream["
          << stream_id << "]";

  // When TraceMe profiling is off (which is the default), the
  // following TraceMe constructor is simply a conditional test of
  // false value. Measurements show that its overhead is negligible.
  port::Tracing::TraceMe activity(op_kernel->name(), op_kernel->type_string(),
                                  op_kernel->IsExpensive());
  gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
  op_kernel->ComputeAsync(context, done);
}

Status BaseGPUDevice::MaybeCopyTensorToGPU(
    const AllocatorAttributes& alloc_attrs, const Tensor& from, Tensor* to,
    StatusCallback done) {
  if (alloc_attrs.on_host()) {
    *to = from;
    done(Status::OK());
    return Status::OK();
  } else {
    if (!DMAHelper::CanUseDMA(&from)) {
      Status err = errors::Internal("GPU copy from non-DMA ",
                                    DataTypeString(from.dtype()), " tensor");
      done(err);
      return err;
    }
    auto* copy =
        new Tensor(GetAllocator(alloc_attrs), from.dtype(), from.shape());

    // If the tensor is not initialized, we likely ran out of memory.
    if (!copy->IsInitialized()) {
      delete copy;
      Status err = errors::ResourceExhausted(
          "OOM when allocating tensor of shape ", from.shape().DebugString(),
          " and type ", DataTypeString(from.dtype()));
      done(err);
      return err;
    }

    StatusCallback wrapped_done = std::bind(
        [to, copy](StatusCallback done_,
                   // Begin unbound arguments.
                   const Status& s) {
          *to = std::move(*copy);
          delete copy;
          done_(s);
        },
        std::move(done), std::placeholders::_1);

    port::Tracing::ScopedAnnotation annotation("MakeTensorFromProto");
    device_contexts_[0]->CopyCPUTensorToDevice(&from, this, copy,
                                               std::move(wrapped_done));
    return Status::OK();
  }
}

Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
                                          const AllocatorAttributes alloc_attrs,
                                          Tensor* tensor) {
  AllocatorAttributes attr;
  attr.set_on_host(true);
  attr.set_gpu_compatible(true);
  Allocator* host_alloc = GetAllocator(attr);
  Tensor parsed(tensor_proto.dtype());
  if (!parsed.FromProto(host_alloc, tensor_proto)) {
    return errors::InvalidArgument("Cannot parse tensor from proto: ",
                                   tensor_proto.DebugString());
  }

  if (parsed.dtype() == DT_VARIANT) {
    const Variant* from = parsed.flat<Variant>().data();
    Tensor copy(cpu_allocator(), DT_VARIANT, parsed.shape());
    Variant* copy_variant = copy.flat<Variant>().data();

    std::list<Notification> notifications;
    Status copy_status;
    auto copier = [this, &alloc_attrs, &notifications, &copy_status](
                      const Tensor& from, Tensor* to) {
      // Copier isn't run in a multithreaded environment, so we don't
      // have to worry about the notifications list being modified in parallel.
      notifications.emplace_back();
      Notification& n = *notifications.rbegin();
      return MaybeCopyTensorToGPU(alloc_attrs, from, to,
                                  [&n, &copy_status](const Status& s) {
                                    if (copy_status.ok()) {
                                      copy_status.Update(s);
                                    }
                                    n.Notify();
                                  });
    };
    Status s;
    for (int64 ix = 0; ix < parsed.NumElements(); ++ix) {
      s = VariantDeviceCopy(VariantDeviceCopyDirection::HOST_TO_DEVICE,
                            from[ix], &copy_variant[ix], copier);
      if (!s.ok()) {
        break;
      }
    }
    for (auto& n : notifications) {
      n.WaitForNotification();
    }
    if (!s.ok()) {
      return s;
    }
    *tensor = std::move(copy);
    return copy_status;
  } else {
    Notification n;
    Status status;
    TF_RETURN_IF_ERROR(MaybeCopyTensorToGPU(alloc_attrs, parsed, tensor,
                                            [&n, &status](const Status& s) {
                                              status = s;
                                              n.Notify();
                                            }));
    n.WaitForNotification();
    return status;
  }
}

namespace {
class ConcretePerOpGpuDevice : public PerOpGpuDevice {
 public:
  ConcretePerOpGpuDevice() : device_(&stream_device_) {}

  void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
                    TfGpuId tf_gpu_id, Allocator* base_allocator,
                    char* scratch) {
    stream_device_.Reinitialize(context, cuda_stream, tf_gpu_id, base_allocator,
                                scratch);
  }

  const Eigen::GpuDevice& device() const override { return device_; }

 private:
  EigenCudaStreamDevice stream_device_;
  Eigen::GpuDevice device_;
};

// Parse 'visible_device_list' into a list of CUDA GPU ids.
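// For example, visible_device_list = "2,0" makes CUDA device 2 visible as
// /device:GPU:0 and CUDA device 0 visible as /device:GPU:1 (assuming no
// virtual_devices are configured).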
Status ParseVisibleDeviceList(const string& visible_device_list,
                              std::vector<CudaGpuId>* visible_gpu_order) {
  visible_gpu_order->clear();
  gpu::Platform* gpu_manager = GPUMachineManager();

  // If the user wants to remap the visible to virtual GPU mapping,
  // check for that here.
  if (visible_device_list.empty()) {
    visible_gpu_order->resize(gpu_manager->VisibleDeviceCount());
    // By default, visible to virtual mapping is unchanged.
    int deviceNo = 0;
    std::generate(visible_gpu_order->begin(), visible_gpu_order->end(),
                  [&deviceNo] { return deviceNo++; });
  } else {
    const std::vector<string> order_str =
        str_util::Split(visible_device_list, ',');
    for (const string& cuda_gpu_id_str : order_str) {
      int32 cuda_gpu_id;
      if (!strings::safe_strto32(cuda_gpu_id_str, &cuda_gpu_id)) {
        return errors::InvalidArgument(
            "Could not parse entry in 'visible_device_list': '",
            cuda_gpu_id_str, "'. visible_device_list = ", visible_device_list);
      }
      if (cuda_gpu_id < 0 || cuda_gpu_id >= gpu_manager->VisibleDeviceCount()) {
        return errors::InvalidArgument(
            "'visible_device_list' listed an invalid GPU id '", cuda_gpu_id,
            "' but visible device count is ",
            gpu_manager->VisibleDeviceCount());
      }
      visible_gpu_order->push_back(CudaGpuId(cuda_gpu_id));
    }
  }

  // Validate no repeats.
  std::set<CudaGpuId> visible_device_set(visible_gpu_order->begin(),
                                         visible_gpu_order->end());
  if (visible_device_set.size() != visible_gpu_order->size()) {
    return errors::InvalidArgument(
        "visible_device_list contained a duplicate entry: ",
        visible_device_list);
  }
  return Status::OK();
}

Status VerifyVirtualDeviceSettings(
    const size_t num_gpus_to_use, const GPUOptions& gpu_options,
    const std::vector<CudaGpuId>& visible_gpu_order,
    const std::vector<CudaGpuId>& valid_cuda_gpu_ids) {
  const auto& virtual_devices = gpu_options.experimental().virtual_devices();
  CHECK(!virtual_devices.empty());
  if (gpu_options.per_process_gpu_memory_fraction() > 0) {
    return errors::InvalidArgument(
        "It's invalid to set per_process_gpu_memory_fraction when "
        "virtual_devices is set.");
  }
  if (num_gpus_to_use < virtual_devices.size()) {
    return errors::Unknown(
        "Not enough GPUs to create virtual devices."
        " num_gpus_to_use: ",
        num_gpus_to_use, " #virtual_devices: ", virtual_devices.size());
  }
  if (!gpu_options.visible_device_list().empty() &&
      visible_gpu_order.size() != virtual_devices.size()) {
    return errors::InvalidArgument(
        "The number of GPUs in visible_device_list doesn't match the number "
        "of elements in the virtual_devices list.",
        " #GPUs in visible_device_list: ", visible_gpu_order.size(),
        " virtual_devices.size(): ", virtual_devices.size());
  }
  if (valid_cuda_gpu_ids.size() != virtual_devices.size()) {
    return errors::Unknown(
        "The number of valid GPUs doesn't match the number of elements in "
        "the virtual_devices list.",
        " #valid GPUs: ", valid_cuda_gpu_ids.size(),
        " virtual_devices.size(): ", virtual_devices.size());
  }
  return Status::OK();
}

int64 MinSystemMemory(int64 available_memory) {
  // We use the following heuristic for now:
  //
  // If the available_memory is < 2GiB, we allocate 225MiB to system memory.
  // Otherwise, allocate max(300MiB, 0.05 * available_memory) to system memory.
  //
  // In the future we could be more sophisticated by using a table of devices.
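  //
  // For example, a device reporting 12GiB of available memory would reserve
  // max(300MiB, 0.05 * 12GiB) ~= 614MiB for the system.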
  int64 min_system_memory;
  if (available_memory < (1LL << 31)) {
    // 225MiB
    min_system_memory = 225 * 1024 * 1024;
  } else {
    // max(300 MiB, 0.05 * available_memory)
    min_system_memory =
        std::max(314572800LL, static_cast<int64>(available_memory * 0.05));
  }
#if defined(__GNUC__) && defined(__OPTIMIZE__)
// Do nothing
#elif !defined(__GNUC__) && defined(NDEBUG)
// Do nothing
#else
  // Double the minimum system memory in non-opt builds (debug builds on
  // Windows), because non-opt builds need more system memory.
  min_system_memory *= 2;
#endif

#if defined(ANDROID_TEGRA)
  // Reserve 1GiB of system memory for NVIDIA Tegra devices, since they use the
  // same memory for both RAM and video RAM.
  min_system_memory = 1 << 30;
#endif
  return min_system_memory;
}

// Get the memory limit for the virtual device being created on GPU with
// 'cuda_gpu_id', when that virtual device is the only virtual device being
// created on that GPU.
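// For example, with per_process_gpu_memory_fraction = 0.4 and 16GiB of total
// device memory the limit is roughly 6.4GiB; with the fraction left at 0, the
// limit is the currently available memory minus MinSystemMemory().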
Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options,
                                      CudaGpuId cuda_gpu_id,
                                      int64* memory_limit) {
  int64 total_memory = 0;
  int64 available_memory = 0;
  gpu::StreamExecutor* se =
      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
  if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) {
    return errors::Unknown("Failed to query available memory for GPU ",
                           cuda_gpu_id.value());
  }

  int64 allocated_memory = 0;
  const double per_process_gpu_memory_fraction =
      gpu_options.per_process_gpu_memory_fraction();
  if (per_process_gpu_memory_fraction == 0) {
    allocated_memory = available_memory;
    const int64 min_system_memory = MinSystemMemory(available_memory);
    if (min_system_memory < allocated_memory) {
      allocated_memory -= min_system_memory;
    }
  } else {
    allocated_memory = total_memory * per_process_gpu_memory_fraction;
  }
  *memory_limit = allocated_memory;
  return Status::OK();
}
}  // namespace

void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context,
                                       PerOpGpuDevice* device, int stream_id,
                                       Allocator* allocator) {
  ConcretePerOpGpuDevice* concrete_device =
      static_cast<ConcretePerOpGpuDevice*>(device);
  DCHECK(concrete_device);
  const cudaStream_t* cuda_stream = reinterpret_cast<const cudaStream_t*>(
      streams_[stream_id]->compute->implementation()->CudaStreamMemberHack());
  concrete_device->Reinitialize(context, cuda_stream, tf_gpu_id_, allocator,
                                scratch_[stream_id]);
}

PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice() {
  return new ConcretePerOpGpuDevice();
}

void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
                                          PerOpGpuDevice* device,
                                          DeviceContext* dc,
                                          Allocator* allocator) {
  if (dc) {
    const GPUDeviceContext* gpu_dc = static_cast<GPUDeviceContext*>(dc);
    const int stream_id = gpu_dc->stream_id();
    VLOG(1) << " eigen_gpu_device(" << dc << ") => stream[" << stream_id
            << "]";
    CHECK_LT(stream_id, streams_.size());
    ReinitializeDevice(context, device, stream_id, allocator);
  } else {
    ReinitializeDevice(context, device, 0, allocator);
  }
}

Allocator* BaseGPUDevice::GetScopedAllocator(AllocatorAttributes attr,
                                             int64 step_id) {
  if (attr.scope_id > 0) {
    return scoped_allocator_mgr_->GetContainer(step_id)->GetInstance(
        attr.scope_id);
  }
  LOG(FATAL) << "Unexpected call to BaseGPUDevice::GetScopedAllocator "
             << "attr.scope_id = " << attr.scope_id;
  return gpu_allocator_;
}

const int BaseGPUDeviceFactory::InterconnectMap::kSameDeviceStrength = 1000;
const int BaseGPUDeviceFactory::InterconnectMap::kStreamExecutorStrength = 1;

Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
                                           const string& name_prefix,
                                           std::vector<Device*>* devices) {
  TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
  gpu::Platform* gpu_manager = GPUMachineManager();
  if (gpu_manager == nullptr) {
    return Status::OK();
  }
  // If there are no GPUs visible, do nothing.
  if (gpu_manager->VisibleDeviceCount() <= 0) {
    return Status::OK();
  }

  size_t num_gpus_to_use = INT_MAX;
  auto iter = options.config.device_count().find("GPU");
  if (iter != options.config.device_count().end()) {
    num_gpus_to_use = iter->second;
  }
  const auto& gpu_options = options.config.gpu_options();
  std::vector<CudaGpuId> visible_gpu_order;
  TF_RETURN_IF_ERROR(ParseVisibleDeviceList(gpu_options.visible_device_list(),
                                            &visible_gpu_order));

  std::vector<CudaGpuId> valid_cuda_gpu_ids;
  TF_RETURN_IF_ERROR(GetValidDeviceIds(visible_gpu_order, &valid_cuda_gpu_ids));
  if (num_gpus_to_use > valid_cuda_gpu_ids.size()) {
    num_gpus_to_use = valid_cuda_gpu_ids.size();
  }
  if (!valid_cuda_gpu_ids.empty()) {
    // Save the original device.
    int original_device = 0;
    cudaError_t err = cudaGetDevice(&original_device);
    if (err != cudaSuccess) {
      return errors::Internal("cudaGetDevice() failed. Status: ",
                              cudaGetErrorString(err));
    }
    // Force the CUDA runtime to implicitly initialize on each valid GPU before
    // CreateGPUDevice() is called.
    for (CudaGpuId cuda_gpu_id : valid_cuda_gpu_ids) {
      err = cudaSetDevice(cuda_gpu_id.value());
      if (err != cudaSuccess) {
        return errors::Internal("cudaSetDevice() on GPU:", cuda_gpu_id.value(),
                                " failed. Status: ", cudaGetErrorString(err));
      }
      err = cudaFree(nullptr);
      if (err != cudaSuccess) {
        return errors::Internal(
            "CUDA runtime implicit initialization on GPU:", cuda_gpu_id.value(),
            " failed. Status: ", cudaGetErrorString(err));
      }
    }
    // Reset to the original device.
    err = cudaSetDevice(original_device);
    if (err != cudaSuccess) {
      return errors::Internal("cudaSetDevice() on GPU:", original_device,
                              " failed. Status: ", cudaGetErrorString(err));
    }
  }

  std::vector<InterconnectMap> interconnect_maps;
  TF_RETURN_IF_ERROR(
      GetInterconnectMaps(visible_gpu_order, gpu_manager, &interconnect_maps));

  // Print each interconnect map to the log.
  for (const InterconnectMap& im : interconnect_maps) {
    LOG(INFO) << "Device interconnect " << im.name << " with strength "
              << im.strength << " edge matrix:";
    string line_buf = " ";
    for (int i = 0; i < visible_gpu_order.size(); ++i) {
      strings::StrAppend(&line_buf, visible_gpu_order[i].value(), " ");
    }
    LOG(INFO) << line_buf;
    for (int i = 0; i < visible_gpu_order.size(); ++i) {
      line_buf = strings::StrCat(visible_gpu_order[i].value(), ": ");
      CudaGpuId cuda_id_i = visible_gpu_order[i];
      for (int j = 0; j < visible_gpu_order.size(); ++j) {
        CudaGpuId cuda_id_j = visible_gpu_order[j];
        if (im.directed_links.find({cuda_id_i, cuda_id_j}) !=
            im.directed_links.end()) {
          line_buf.append("Y ");
        } else {
          line_buf.append("N ");
        }
      }
      LOG(INFO) << line_buf;
    }
  }

  const auto& virtual_devices = gpu_options.experimental().virtual_devices();
  if (!virtual_devices.empty()) {
    TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(
        num_gpus_to_use, gpu_options, visible_gpu_order, valid_cuda_gpu_ids));
    // We've verified that num_gpus_to_use >= virtual_devices.size().
    num_gpus_to_use = virtual_devices.size();
    CHECK(gpu_options.visible_device_list().empty() ||
          valid_cuda_gpu_ids == visible_gpu_order);
  }
  int next_tf_gpu_id = 0;
  std::vector<int64> memory_limit_bytes;
  for (int i = 0; i < num_gpus_to_use; ++i) {
    const CudaGpuId cuda_gpu_id = valid_cuda_gpu_ids[i];
    if (virtual_devices.empty() ||
        virtual_devices.Get(i).memory_limit_mb_size() == 0) {
      int64 single_virtual_device_memory_limit = 0;
      TF_RETURN_IF_ERROR(SingleVirtualDeviceMemoryLimit(
          gpu_options, cuda_gpu_id, &single_virtual_device_memory_limit));
      memory_limit_bytes.push_back(single_virtual_device_memory_limit);
    } else {
      const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb();
      std::transform(memory_limit_mb.begin(), memory_limit_mb.end(),
                     std::back_inserter(memory_limit_bytes), [](float mb) {
                       return static_cast<int64>(mb) * (1ll << 20);
                     });
    }
    while (next_tf_gpu_id < memory_limit_bytes.size()) {
      TfGpuId tf_gpu_id(next_tf_gpu_id);
      ++next_tf_gpu_id;
      GpuIdManager::InsertTfCudaGpuIdPair(tf_gpu_id, cuda_gpu_id);
    }
  }
  const int num_tf_gpus = next_tf_gpu_id;

  LocalityMap device_localities;
  TF_RETURN_IF_ERROR(
      GetDeviceLocalities(num_tf_gpus, interconnect_maps, &device_localities));

  // Build the GPUDevices
  CHECK_EQ(next_tf_gpu_id, memory_limit_bytes.size());
  for (int di = 0; di < num_tf_gpus; ++di) {
    TfGpuId tf_gpu_id(di);
    int64 bytes = memory_limit_bytes[di];
    auto it = device_localities.find(tf_gpu_id);
    if (it == device_localities.end()) {
      return errors::Internal("Failed to find DeviceLocality for GPU device ",
                              tf_gpu_id.value());
    }
    TF_RETURN_IF_ERROR(CreateGPUDevice(options, name_prefix, tf_gpu_id, bytes,
                                       it->second, devices));
  }
  return Status::OK();
}

static string GetShortDeviceDescription(CudaGpuId cuda_gpu_id,
                                        const gpu::DeviceDescription& desc) {
  int cc_major;
  int cc_minor;
  if (!desc.cuda_compute_capability(&cc_major, &cc_minor)) {
    cc_major = 0;
    cc_minor = 0;
  }
  // LINT.IfChange
  return strings::StrCat("device: ", cuda_gpu_id.value(),
                         ", name: ", desc.name(),
                         ", pci bus id: ", desc.pci_bus_id(),
                         ", compute capability: ", cc_major, ".", cc_minor);
  // LINT.ThenChange(//tensorflow/python/platform/test.py)
}

Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
                                             const string& name_prefix,
                                             TfGpuId tf_gpu_id,
                                             int64 memory_limit,
                                             const DeviceLocality& dev_locality,
                                             std::vector<Device*>* devices) {
  CHECK_GE(tf_gpu_id.value(), 0);
  const string device_name =
      strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value());
  GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
  CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
  int numa_node = dev_locality.numa_node();

  gpu::StreamExecutor* se =
      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
  const gpu::DeviceDescription& desc = se->GetDeviceDescription();
  ProcessState* process_state = ProcessState::singleton();
  Allocator* gpu_allocator = process_state->GetGPUAllocator(
      options.config.gpu_options(), tf_gpu_id, memory_limit);
  if (gpu_allocator == nullptr) {
    return errors::Internal("Failed to get memory allocator for TF GPU ",
                            tf_gpu_id.value(), " with ", memory_limit,
                            " bytes of memory.");
  }
  AllocatorStats stats;
  gpu_allocator->GetStats(&stats);
  // 'memory_limit' is the requested memory size. However, if an allocator for
  // the given tf_gpu_id was created earlier, it is reused instead of creating
  // a new one (the TF GPU device is a shared resource), in which case the
  // actual limit it enforces, 'stats.bytes_limit', may differ (which should be
  // an error).
  //
  // TODO(laigd): report error if memory_limit doesn't match stats.bytes_limit.
  BaseGPUDevice* gpu_device = CreateGPUDevice(
      options, device_name, static_cast<Bytes>(stats.bytes_limit), dev_locality,
      tf_gpu_id, GetShortDeviceDescription(cuda_gpu_id, desc), gpu_allocator,
      process_state->GetCPUAllocator(numa_node));
  LOG(INFO) << "Created TensorFlow device (" << device_name << " with "
            << (stats.bytes_limit >> 20) << " MB memory) -> physical GPU ("
            << GetShortDeviceDescription(cuda_gpu_id, desc) << ")";
  TF_RETURN_IF_ERROR(gpu_device->Init(options));
  devices->push_back(gpu_device);

  return Status::OK();
}

namespace {
std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>>
GetPeerAccessMap(gpu::Platform* platform,
                 const std::vector<CudaGpuId>& visible_gpu_order) {
  std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>> map(
      new std::map<std::pair<CudaGpuId, CudaGpuId>, bool>);
  for (CudaGpuId cuda_gpu_i : visible_gpu_order) {
    for (CudaGpuId cuda_gpu_j : visible_gpu_order) {
      gpu::StreamExecutor* from =
          GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie();
      gpu::StreamExecutor* to =
          GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie();
      (*map)[{cuda_gpu_i, cuda_gpu_j}] = from->CanEnablePeerAccessTo(to);
    }
  }

  return map;
}

}  // namespace

Status BaseGPUDeviceFactory::GetInterconnectMaps(
    const std::vector<CudaGpuId>& visible_gpu_order, gpu::Platform* gpu_manager,
    std::vector<InterconnectMap>* maps) {
  // The default interconnect map is obtained from the StreamExecutor.
  auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
  maps->resize(1);
  InterconnectMap& imap = maps->at(0);
  imap.name = "StreamExecutor";
  imap.strength = InterconnectMap::kStreamExecutorStrength;
  for (CudaGpuId cuda_id_i : visible_gpu_order) {
    for (CudaGpuId cuda_id_j : visible_gpu_order) {
      if (cuda_id_i == cuda_id_j) continue;
      if ((*access_map)[{cuda_id_i, cuda_id_j}]) {
        imap.directed_links.insert({cuda_id_i, cuda_id_j});
      }
    }
  }
  return Status::OK();
}

Status BaseGPUDeviceFactory::GetDeviceLocalities(
    int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
    LocalityMap* localities) {
  std::vector<TfGpuId> all_tf_gpu_ids;
  for (int i = 0; i < num_tf_gpus; ++i) {
    all_tf_gpu_ids.push_back(TfGpuId(i));
  }
  for (TfGpuId tf_gpu_id : all_tf_gpu_ids) {
    CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
    // Get GPU bus_id from its reported NUMA affinity. Because GPUs are
    // virtualized in some environments, we can't just use the GPU id.
    // NUMA locales are indexed from 0, buses are indexed from 1.
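    // For example, a GPU whose StreamExecutor reports NUMA node 0 is assigned
    // bus_id 1 below.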
    gpu::StreamExecutor* se =
        GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
    const gpu::DeviceDescription& desc = se->GetDeviceDescription();
    int numa_node = desc.numa_node();
    if (numa_node < 0) {
      // For some reason the StreamExecutor couldn't get the NUMA
      // affinity of the GPU. If this is not a multi-socket mobo with
      // GPUs local to different buses, it doesn't matter. If it is, we
      // may run into trouble later with data transfer operations. The
      // trouble may manifest as slower than expected performance, or
      // outright failures.
      LOG(INFO) << "Could not identify NUMA node of CUDA gpu id " << cuda_gpu_id
                << ", defaulting to 0. Your kernel may not have been built "
                << "with NUMA support.";
      numa_node = 0;
    }
    DeviceLocality dev_locality;
    dev_locality.set_numa_node(numa_node);
    dev_locality.set_bus_id(numa_node + 1);

    // Set LocalLinks from InterconnectMaps.
    LocalLinks* links = dev_locality.mutable_links();
    for (const InterconnectMap& imap : interconnects) {
      for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
        CudaGpuId cuda_gpu_dst = GpuIdManager::TfToCudaGpuId(tf_gpu_dst);
        if (imap.directed_links.find({cuda_gpu_id, cuda_gpu_dst}) !=
            imap.directed_links.end()) {
          InterconnectLink* ilink = links->add_link();
          ilink->set_device_id(tf_gpu_dst.value());
          ilink->set_type(imap.name);
          ilink->set_strength(imap.strength);
        }
      }
    }

    // If this is one of multiple virtual GPUs on the same physical GPU,
    // add high-strength links to the others.
    for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
      if (tf_gpu_id == tf_gpu_dst) continue;
      CudaGpuId cuda_gpu_dst = GpuIdManager::TfToCudaGpuId(tf_gpu_dst);
      if (cuda_gpu_id == cuda_gpu_dst) {
        InterconnectLink* ilink = links->add_link();
        ilink->set_device_id(tf_gpu_dst.value());
        ilink->set_type("SAME_DEVICE");
        ilink->set_strength(InterconnectMap::kSameDeviceStrength);
      }
    }

    (*localities)[tf_gpu_id] = dev_locality;
    VLOG(1) << "GPUDevice CudaGpuId " << cuda_gpu_id << " TfGpuId " << tf_gpu_id
            << " on bus " << dev_locality.bus_id() << " numa: " << numa_node
            << " pci: " << desc.pci_bus_id()
            << " DeviceLocality: " << dev_locality.DebugString();
  }
  return Status::OK();
}

static int GetDefaultMinGPUMultiprocessorCount(
    gpu::Platform* gpu_manager,
    const std::vector<CudaGpuId>& visible_gpu_order) {
  static const int kDefaultMinGPUMultiprocessorCount = 8;

  // Find the highest multi-processor count across all visible GPUs.
  int max_count = -1;
  for (int i = 0; i < visible_gpu_order.size(); ++i) {
    auto exec_status =
        GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_order[i]);
    if (!exec_status.ok()) {
      continue;
    }

    gpu::StreamExecutor* se = exec_status.ValueOrDie();
    const gpu::DeviceDescription& desc = se->GetDeviceDescription();
    max_count = std::max(max_count, desc.core_count());
  }

  if (max_count < 0 || kDefaultMinGPUMultiprocessorCount < max_count) {
    return kDefaultMinGPUMultiprocessorCount;
  } else {
    return max_count;
  }
}

static int GetMinGPUMultiprocessorCount(
    gpu::Platform* gpu_manager,
    const std::vector<CudaGpuId>& visible_gpu_order) {
  const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT");

  if (tf_min_gpu_core_count == nullptr ||
      strcmp(tf_min_gpu_core_count, "") == 0) {
    return GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
  }

  int min_gpu_core_count = -1;
  if (strings::safe_strto32(tf_min_gpu_core_count, &min_gpu_core_count)) {
    if (min_gpu_core_count >= 0) {
      return min_gpu_core_count;
    }
  }

  int count =
      GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
  LOG(ERROR) << "Invalid minimum GPU multiprocessor count: ["
             << tf_min_gpu_core_count << "]. "
             << "Using the default value: " << count;
  return count;
}

namespace {

struct CudaVersion {
  // Initialize from version_name in the form of "3.5"
  explicit CudaVersion(const std::string& version_name) {
    size_t dot_pos = version_name.find('.');
    CHECK(dot_pos != string::npos)
        << "Illegal version name: [" << version_name << "]";
    string major_str = version_name.substr(0, dot_pos);
    CHECK(strings::safe_strto32(major_str, &major_part))
        << "Illegal version name: [" << version_name << "]";
    string minor_str = version_name.substr(dot_pos + 1);
    CHECK(strings::safe_strto32(minor_str, &minor_part))
        << "Illegal version name: [" << version_name << "]";
  }
  CudaVersion() {}
  bool operator<(const CudaVersion& other) const {
    if (this->major_part != other.major_part) {
      return this->major_part < other.major_part;
    }
    return this->minor_part < other.minor_part;
  }
  friend std::ostream& operator<<(std::ostream& os,
                                  const CudaVersion& version) {
    os << version.major_part << "." << version.minor_part;
    return os;
  }
  int major_part = -1;
  int minor_part = -1;
};

std::vector<CudaVersion> supported_cuda_compute_capabilities = {
    TF_CUDA_CAPABILITIES,};

std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() {
  auto cuda_caps = supported_cuda_compute_capabilities;
#ifdef TF_EXTRA_CUDA_CAPABILITIES
// TF_EXTRA_CUDA_CAPABILITIES should be defined as a comma-separated sequence,
// for example:
//   TF_EXTRA_CUDA_CAPABILITIES=3.0,4.0,5.0
// Use two-level macro expansion for stringification.
#define TF_XSTRING(...) #__VA_ARGS__
#define TF_STRING(s) TF_XSTRING(s)
  string extra_cuda_caps = TF_STRING(TF_EXTRA_CUDA_CAPABILITIES);
#undef TF_STRING
#undef TF_XSTRING
  auto extra_capabilities = str_util::Split(extra_cuda_caps, ',');
  for (const auto& capability : extra_capabilities) {
    cuda_caps.push_back(CudaVersion(capability));
  }
#endif
  return cuda_caps;
}

Status EnablePeerAccess(gpu::Platform* platform,
                        const std::vector<CudaGpuId>& visible_gpu_order) {
  int possible_peer_count = 0;
  int enabled_peer_count = 0;
  for (int i = 0; i < visible_gpu_order.size(); ++i) {
    const CudaGpuId cuda_gpu_i = visible_gpu_order[i];
    for (int j = 0; j < visible_gpu_order.size(); ++j) {
      const CudaGpuId cuda_gpu_j = visible_gpu_order[j];
      // We have already validated that ExecutorForDevice() calls return OK.
      gpu::StreamExecutor* from =
          GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie();
      gpu::StreamExecutor* to =
          GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie();

      if (from->CanEnablePeerAccessTo(to)) {
        ++possible_peer_count;
        auto status = from->EnablePeerAccessTo(to);
        if (!status.ok()) {
          LOG(WARNING)
              << "Unable to enable peer access between device ordinals "
              << cuda_gpu_i << " and " << cuda_gpu_j << ", status: " << status;
        } else {
          ++enabled_peer_count;
        }
      }
    }
  }

  // Return an error in the extreme failure case where the driver
  // reported that peering was possible but not a single peering was
  // successful. This is to catch possible system misconfigurations
  // or more fundamental issues.
  if (possible_peer_count > 0 && enabled_peer_count == 0) {
    return errors::Internal(possible_peer_count,
                            " potential peer access pairs were reported by the "
                            "driver, but no peering could be enabled.");
  }
  return Status::OK();
}

}  // namespace

Status BaseGPUDeviceFactory::GetValidDeviceIds(
    const std::vector<CudaGpuId>& visible_gpu_order,
    std::vector<CudaGpuId>* ids) {
  gpu::Platform* gpu_manager = GPUMachineManager();
  bool new_gpu_found = false;
  for (int i = 0; i < visible_gpu_order.size(); ++i) {
    const CudaGpuId cuda_gpu_id = visible_gpu_order[i];

    // Only perform this once per visible cuda gpu id.
    if (visible_gpu_initialized_[cuda_gpu_id.value()]) {
      continue;
    }

    visible_gpu_initialized_[cuda_gpu_id.value()] = true;
    new_gpu_found = true;

    auto executor = GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, cuda_gpu_id);
    if (!executor.ok()) {
      return StreamExecutorUtil::ConvertStatus(executor.status());
    }

    auto stream_exec = executor.ValueOrDie();
    int64 free_bytes;
    int64 total_bytes;
    if (!stream_exec->DeviceMemoryUsage(&free_bytes, &total_bytes)) {
      // Logs internally on failure.
      free_bytes = 0;
      total_bytes = 0;
    }
    const auto& description = stream_exec->GetDeviceDescription();
    int cc_major;
    int cc_minor;
    if (!description.cuda_compute_capability(&cc_major, &cc_minor)) {
      // Logs internally on failure.
      cc_major = 0;
      cc_minor = 0;
    }
    LOG(INFO) << "Found device " << i << " with properties: "
              << "\nname: " << description.name() << " major: " << cc_major
              << " minor: " << cc_minor
              << " memoryClockRate(GHz): " << description.clock_rate_ghz()
              << "\npciBusID: " << description.pci_bus_id() << "\ntotalMemory: "
              << strings::HumanReadableNumBytes(total_bytes)
              << " freeMemory: " << strings::HumanReadableNumBytes(free_bytes);
  }
  // Check peering, and show the matrix if more than one GPU was found.
  if (new_gpu_found && visible_gpu_order.size() > 1) {
    // Enable peer access
    TF_RETURN_IF_ERROR(EnablePeerAccess(gpu_manager, visible_gpu_order));
  }

  auto cuda_supported_capabilities = GetSupportedCudaComputeCapabilities();
  if (cuda_supported_capabilities.empty()) {
    return errors::FailedPrecondition(
        "No supported cuda capabilities in binary.");
  }
  CudaVersion min_supported_capability = *std::min_element(
      cuda_supported_capabilities.begin(), cuda_supported_capabilities.end());

  int min_gpu_core_count =
      GetMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);

  // Filter out devices that don't have the right capability or power.
  for (int i = 0; i < visible_gpu_order.size(); ++i) {
    const CudaGpuId visible_gpu_id = visible_gpu_order[i];
    auto exec_status =
        GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_id);
    if (!exec_status.ok()) {
      LOG(INFO) << "Ignoring visible gpu device " << visible_gpu_id
                << " whose executor is in invalid state: "
                << exec_status.status().ToString();
      continue;
    }
    gpu::StreamExecutor* se = exec_status.ValueOrDie();
    const gpu::DeviceDescription& desc = se->GetDeviceDescription();
    CudaVersion device_capability;
    if (!desc.cuda_compute_capability(&device_capability.major_part,
                                      &device_capability.minor_part)) {
      LOG(INFO) << "Ignoring visible gpu device "
                << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
                << ") "
                << "whose CUDA compute capability is not available.";
      continue;
    }
    // Only GPUs with at least the minimum supported compute capability are
    // accepted.
    if (device_capability < min_supported_capability) {
      LOG(INFO) << "Ignoring visible gpu device "
                << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
                << ") "
                << "with Cuda compute capability " << device_capability
                << ". The minimum required Cuda capability is "
                << min_supported_capability << ".";
      continue;
    }

    // Filter out slow GPUs. By default, GPUs with a lower multiprocessor
    // count than the fastest GPU are filtered out, unless they have 8 or more
    // multiprocessors. If the TF_MIN_GPU_MULTIPROCESSOR_COUNT environment
    // variable is set, its value will be used to filter out GPUs.
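    // For example, setting TF_MIN_GPU_MULTIPROCESSOR_COUNT=4 admits any
    // visible GPU with at least 4 multiprocessors, regardless of the fastest
    // GPU found.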
    if (desc.core_count() < min_gpu_core_count) {
      LOG(INFO) << "Ignoring visible gpu device "
                << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
                << ") "
                << "with Cuda multiprocessor count: " << desc.core_count()
                << ". The minimum required count is " << min_gpu_core_count
                << ". You can adjust this requirement with the env var "
                   "TF_MIN_GPU_MULTIPROCESSOR_COUNT.";
      continue;
    }
    ids->push_back(visible_gpu_id);
  }
  if (!ids->empty()) {
    std::vector<int> raw_ids(ids->size());
    std::transform(ids->begin(), ids->end(), raw_ids.begin(),
                   [](CudaGpuId id) -> int { return id.value(); });
    LOG(INFO) << "Adding visible gpu devices: "
              << str_util::Join(raw_ids, ", ");
  }

  return Status::OK();
}

}  // namespace tensorflow

#endif  // GOOGLE_CUDA