1 | /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | // TODO(opensource): Use a more generic sounding preprocessor name than |
17 | // GOOGLE_CUDA |
18 | #if GOOGLE_CUDA |
19 | |
20 | #define EIGEN_USE_GPU |
21 | |
22 | #include "tensorflow/core/common_runtime/gpu/gpu_device.h" |
23 | |
24 | #include <stdlib.h> |
25 | #include <string.h> |
26 | #include <algorithm> |
27 | #include <list> |
28 | #include <map> |
29 | #include <tuple> |
30 | #include <vector> |
31 | |
32 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" |
33 | #include "tensorflow/core/common_runtime/device_factory.h" |
34 | #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" |
35 | #include "tensorflow/core/common_runtime/gpu/gpu_id.h" |
36 | #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h" |
37 | #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h" |
38 | #include "tensorflow/core/common_runtime/gpu/gpu_init.h" |
39 | #include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h" |
40 | #include "tensorflow/core/common_runtime/gpu/gpu_util.h" |
41 | #include "tensorflow/core/common_runtime/gpu/process_state.h" |
42 | #include "tensorflow/core/common_runtime/gpu_device_context.h" |
43 | #include "tensorflow/core/common_runtime/local_device.h" |
44 | #include "tensorflow/core/framework/allocator.h" |
45 | #include "tensorflow/core/framework/device_base.h" |
46 | #include "tensorflow/core/framework/op_kernel.h" |
47 | #include "tensorflow/core/framework/tensor.h" |
48 | #include "tensorflow/core/framework/tensor.pb.h" |
49 | #include "tensorflow/core/framework/types.h" |
50 | #include "tensorflow/core/framework/variant_op_registry.h" |
51 | #include "tensorflow/core/graph/types.h" |
52 | #include "tensorflow/core/lib/core/errors.h" |
53 | #include "tensorflow/core/lib/core/status.h" |
54 | #include "tensorflow/core/lib/gtl/stl_util.h" |
55 | #include "tensorflow/core/lib/strings/numbers.h" |
56 | #include "tensorflow/core/lib/strings/str_util.h" |
57 | #include "tensorflow/core/lib/strings/strcat.h" |
58 | #include "tensorflow/core/platform/cuda.h" |
59 | #include "tensorflow/core/platform/logging.h" |
60 | #include "tensorflow/core/platform/macros.h" |
61 | #include "tensorflow/core/platform/stream_executor.h" |
62 | #include "tensorflow/core/platform/tracing.h" |
63 | #include "tensorflow/core/platform/types.h" |
64 | #include "tensorflow/core/public/session_options.h" |
65 | #include "tensorflow/core/util/device_name_utils.h" |
66 | #include "tensorflow/core/util/env_var.h" |
67 | #include "tensorflow/core/util/stream_executor_util.h" |
68 | |
69 | #if !defined(PLATFORM_GOOGLE) |
70 | #include "cuda/cuda_config.h" |
71 | #endif |
72 | |
73 | namespace tensorflow { |
74 | |
75 | // Eigen Ops directly allocate memory only for temporary buffers used |
76 | // during OpKernel::Compute(). The recommended way of allocating such |
77 | // memory is via OpKernelContext::allocate_temp(). However, Eigen Ops |
78 | // don't have access to OpKernelContext, instead they get access to |
79 | // memory directly through the device allocator. As an Open Source |
80 | // project, Eigen assumes allocator semantics similar to those of the |
81 | // CUDA memory allocator, and may not work correctly due to race |
82 | // conditions if used with some other allocator. For safety, we need |
83 | // to delay deallocation calls out of Eigen until all events on the |
84 | // corresponding stream have completed. The following two classes |
85 | // serve this purpose in two different compilation environments. |
86 | |
87 | class EigenCudaStreamDevice : public ::Eigen::StreamInterface { |
88 | public: |
89 | EigenCudaStreamDevice() |
90 | : scratch_(nullptr), semaphore_(nullptr), context_(nullptr) { |
91 | Eigen::initializeDeviceProp(); |
92 | } |
93 | ~EigenCudaStreamDevice() override {} |
94 | void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream, |
95 | TfGpuId tf_gpu_id, ::tensorflow::Allocator* alloc, |
96 | char* scratch) { |
97 | if (LogMemory::IsEnabled()) { |
98 | operation_ = context->op_kernel().name() + "/EigenAllocator"; |
99 | step_id_ = context->step_id(); |
100 | } |
101 | context_ = context; |
102 | scratch_ = scratch; |
103 | semaphore_ = |
104 | reinterpret_cast<unsigned int*>(scratch + Eigen::kCudaScratchSize); |
105 | stream_ = cuda_stream; |
106 | allocator_ = alloc; |
107 | const int cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id).value(); |
108 | device_prop_ = &Eigen::m_deviceProperties[cuda_gpu_id]; |
109 | } |
110 | |
111 | const cudaStream_t& stream() const override { return *stream_; } |
112 | const cudaDeviceProp& deviceProperties() const override { |
113 | return *device_prop_; |
114 | } |
115 | |
116 | void* allocate(size_t num_bytes) const override { |
117 | void* ret = allocator_->AllocateRaw(32 /* alignment */, num_bytes); |
118 | if (ret == nullptr) { |
119 | if (context_) { |
120 | context_->SetStatus(errors::ResourceExhausted( |
121 | strings::StrCat("Ran out of GPU memory when allocating ", num_bytes, |
122 | " bytes for ", operation_))); |
123 | } else { |
124 | LOG(FATAL) |
125 | << "EigenAllocator for GPU ran out of memory when allocating " |
126 | << num_bytes << ". See error logs for more detailed info."; |
127 | } |
128 | } |
129 | if (LogMemory::IsEnabled() && ret != nullptr) { |
130 | LogMemory::RecordRawAllocation(operation_, step_id_, num_bytes, ret, |
131 | allocator_); |
132 | } |
133 | return ret; |
134 | } |
135 | void deallocate(void* buffer) const override { |
136 | if (LogMemory::IsEnabled() && buffer != nullptr) { |
137 | LogMemory::RecordRawDeallocation(operation_, step_id_, buffer, allocator_, |
138 | true); |
139 | } |
140 | AsyncFreeData* afData = |
141 | new AsyncFreeData(allocator_, buffer, operation_, step_id_); |
142 | cudaError_t err = cudaStreamAddCallback(*stream_, asyncFree, afData, 0); |
143 | CHECK_EQ(err, cudaSuccess); |
144 | } |
145 | |
146 | // Return a pointer to a per stream scratchpad of 1024 bytes residing |
147 | // in global memory. |
148 | void* scratchpad() const override { return scratch_; } |
149 | |
150 | // Return a semaphore. The semaphore is initially initialized to 0, and |
151 | // each kernel using it is responsible for resetting to 0 upon completion |
152 | // to maintain the invariant that the semaphore is always equal to 0 upon |
153 | // each kernel start. |
154 | unsigned int* semaphore() const override { return semaphore_; } |
155 | |
156 | private: |
157 | struct AsyncFreeData { |
158 | AsyncFreeData(::tensorflow::Allocator* a, void* p, const string& o, |
159 | const int64 s) |
160 | : allocator_(a), address_(p), operation_(o), step_id_(s) {} |
161 | ::tensorflow::Allocator* allocator_; |
162 | void* address_; |
163 | const string operation_; |
164 | const int64 step_id_; |
165 | }; |
166 | |
167 | static void CUDART_CB asyncFree(cudaStream_t stream, cudaError_t status, |
168 | void* userData) { |
169 | AsyncFreeData* data = static_cast<AsyncFreeData*>(userData); |
170 | if (LogMemory::IsEnabled()) { |
171 | LogMemory::RecordRawDeallocation(data->operation_, data->step_id_, |
172 | data->address_, data->allocator_, false); |
173 | } |
174 | data->allocator_->DeallocateRaw(data->address_); |
175 | delete data; |
176 | } |
177 | |
178 | string operation_; |
179 | int64 step_id_; |
180 | const cudaStream_t* stream_; // Not owned. |
181 | const cudaDeviceProp* device_prop_; // Not owned. |
182 | ::tensorflow::Allocator* allocator_; // Not owned. |
183 | mutable char* scratch_; |
184 | mutable unsigned int* semaphore_; |
185 | OpKernelContext* context_; |
186 | |
187 | TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice); |
188 | }; |
189 | |
190 | // This factory helps to ensure that different GPU device objects that refer to |
191 | // the same physical device and stream group id use the same stream group |
192 | // object (and therefore the same CUDA streams). This is necessary since there |
193 | // is a single memory allocator per device (see ProcessState::GetGPUAllocator) |
194 | // and allocators must not be shared across streams. |
195 | class BaseGPUDevice::StreamGroupFactory { |
196 | public: |
197 | // Returns the unique stream group for use with the stream defined by |
198 | // {tf_gpu_id, stream_group_within_gpu}, creating it if it does not yet |
199 | // exist. |
200 | // This function is thread safe. |
201 | BaseGPUDevice::StreamGroup* GetOrCreate(TfGpuId tf_gpu_id, |
202 | int stream_group_within_gpu, |
203 | gpu::StreamExecutor* executor) { |
204 | mutex_lock guard(lock_); |
205 | StreamGroup* group = |
206 | &streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)]; |
207 | if (!group->compute) { |
208 | group->compute = new gpu::Stream(executor); |
209 | group->compute->Init(); |
210 | VLOG(2) << "Created stream[" << stream_group_within_gpu |
211 | << "] = " << group->compute; |
212 | |
213 | group->host_to_device = new gpu::Stream(executor); |
214 | group->host_to_device->Init(); |
215 | VLOG(2) << "Created host_to_device_stream[" << stream_group_within_gpu |
216 | << "] = " << group->host_to_device; |
217 | |
218 | group->device_to_host = new gpu::Stream(executor); |
219 | group->device_to_host->Init(); |
220 | VLOG(2) << "Created device_to_host_stream[" << stream_group_within_gpu |
221 | << "] = " << group->device_to_host; |
222 | |
223 | group->device_to_device = new gpu::Stream(executor); |
224 | group->device_to_device->Init(); |
225 | VLOG(2) << "Created device_to_device_stream[" << stream_group_within_gpu |
226 | << "] = " << group->device_to_device; |
227 | } |
228 | return group; |
229 | } |
230 | |
231 | // Returns a reference to the StreamGroupFactory singleton. Note that this is |
232 | // never destroyed, so the objects it owns are never deleted. |
233 | static StreamGroupFactory& Global() { |
234 | static StreamGroupFactory* instance = new StreamGroupFactory(); |
235 | return *instance; |
236 | } |
237 | |
238 | private: |
239 | mutex lock_; |
240 | using key_type = std::tuple<int, int>; |
241 | std::map<key_type, StreamGroup> streams_; |
242 | |
243 | // StreamGroupFactory cannot be created directly; Call |
244 | // StreamGroupFactory::Global() to get the global instance. |
245 | StreamGroupFactory() = default; |
246 | TF_DISALLOW_COPY_AND_ASSIGN(StreamGroupFactory); |
247 | }; |
248 | |
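| // The constructor only records configuration (allocators, limits, stream |
| // count) and flags the process as using a GPU; streams, scratch buffers and |
| // device contexts are created later in Init(). |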
249 | BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name, |
250 | Bytes memory_limit, const DeviceLocality& locality, |
251 | TfGpuId tf_gpu_id, |
252 | const string& physical_device_desc, |
253 | Allocator* gpu_allocator, Allocator* cpu_allocator, |
254 | bool sync_every_op, int32 max_streams) |
255 | : LocalDevice(options, Device::BuildDeviceAttributes(name, DEVICE_GPU, |
256 | memory_limit, locality, |
257 | physical_device_desc)), |
258 | gpu_allocator_(gpu_allocator), |
259 | cpu_allocator_(cpu_allocator), |
260 | scoped_allocator_mgr_(new ScopedAllocatorMgr(name)), |
261 | tf_gpu_id_(tf_gpu_id), |
262 | sync_every_op_(sync_every_op), |
263 | max_streams_(max_streams) { |
264 | ProcessState::singleton()->EnableGPUDevice(); |
265 | } |
266 | |
267 | BaseGPUDevice::~BaseGPUDevice() { |
268 | delete gpu_device_info_; |
269 | for (auto ctx : device_contexts_) ctx->Unref(); |
270 | } |
271 | |
272 | Status BaseGPUDevice::Init(const SessionOptions& options) { |
273 | auto executor_status = GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id_); |
274 | if (!executor_status.status().ok()) { |
275 | return errors::Internal("Failed to get StreamExecutor for device ", |
276 | tf_gpu_id_.value()); |
277 | } |
278 | |
279 | executor_ = executor_status.ValueOrDie(); |
280 | em_.reset(new EventMgr(executor_, options.config.gpu_options())); |
281 | |
282 | if (max_streams_ < 1) { |
283 | return errors::InvalidArgument("Invalid value for max_streams."); |
284 | } |
285 | |
286 | // Create the specified number of GPU streams |
287 | for (int i = 0; i < max_streams_; i++) { |
288 | streams_.push_back( |
289 | StreamGroupFactory::Global().GetOrCreate(tf_gpu_id_, i, executor_)); |
290 | |
291 | size_t scratch_buffer_size = Eigen::kCudaScratchSize + sizeof(unsigned int); |
292 | void* scratch_buffer = gpu_allocator_->AllocateRaw( |
293 | Allocator::kAllocatorAlignment, scratch_buffer_size); |
294 | if (scratch_buffer == nullptr) { |
295 | return errors::FailedPrecondition( |
296 | "Failed to allocate scratch buffer for device ", tf_gpu_id_.value()); |
297 | } |
298 | scratch_.push_back(static_cast<char*>(scratch_buffer)); |
299 | |
300 | perftools::gputools::DeviceMemory<char> mem( |
301 | perftools::gputools::DeviceMemoryBase(scratch_buffer, |
302 | scratch_buffer_size)); |
303 | |
304 | bool ok = executor_->SynchronousMemZero( |
305 | &mem, Eigen::kCudaScratchSize + sizeof(unsigned int)); |
306 | if (!ok) { |
307 | return errors::FailedPrecondition( |
308 | "Failed to memzero scratch buffer for device ", |
309 | tf_gpu_id_.value()); |
310 | } |
311 | |
312 | device_contexts_.push_back(new GPUDeviceContext( |
313 | i, streams_.back()->compute, streams_.back()->host_to_device, |
314 | streams_.back()->device_to_host, streams_.back()->device_to_device)); |
315 | } |
316 | gpu_device_info_ = new GpuDeviceInfo; |
317 | gpu_device_info_->stream = streams_[0]->compute; |
318 | gpu_device_info_->default_context = device_contexts_[0]; |
319 | gpu_device_info_->event_mgr = em_.get(); |
320 | gpu_device_info_->gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id_).value(); |
321 | set_tensorflow_gpu_device_info(gpu_device_info_); |
322 | |
323 | // Whether and how the GPU device uses its own threadpool. |
324 | // This option is experimental. Once we confirm the best setting, we |
325 | // may change the default behavior and completely remove this flag. |
326 | // Default values might change in future releases. |
327 | // Possible values: |
328 | // * global: GPU uses threads shared with CPU in the main compute |
329 | // thread-pool. This is currently the default. |
330 | // * gpu_private: GPU uses threads dedicated to this device. |
331 | // * gpu_shared: All GPUs share a dedicated thread pool. |
332 | string gpu_thread_mode; |
333 | TF_RETURN_IF_ERROR( |
334 | ReadStringFromEnvVar("TF_GPU_THREAD_MODE", "global", &gpu_thread_mode)); |
335 | gpu_thread_mode = str_util::Lowercase(gpu_thread_mode); |
336 | if (gpu_thread_mode != "global") { |
337 | int64 gpu_thread_count = -1; |
338 | // Default to two threads. One for device compute and another for memory |
339 | // copies. |
340 | TF_RETURN_IF_ERROR( |
341 | ReadInt64FromEnvVar("TF_GPU_THREAD_COUNT", 2, &gpu_thread_count)); |
342 | if (gpu_thread_mode == "gpu_private") { |
343 | // TODO(zhengxq): since these threads only serve a single GPU device, |
344 | // we should set the device context once for each thread, and avoid |
345 | // setting them for each kernel. |
346 | // TODO(zhengxq): pin the thread to the same socket of the target GPU. |
347 | thread_pool_.reset(new thread::ThreadPool( |
348 | options.env, strings::StrCat("gpu_private_", tf_gpu_id_.value()), |
349 | static_cast<int32>(gpu_thread_count))); |
350 | set_tensorflow_device_thread_pool(thread_pool_.get()); |
351 | } else if (gpu_thread_mode == "gpu_shared") { |
352 | static thread::ThreadPool* thread_pool = new thread::ThreadPool( |
353 | options.env, "gpu_shared", static_cast<int32>(gpu_thread_count)); |
354 | set_tensorflow_device_thread_pool(thread_pool); |
355 | } else { |
356 | string error_message = |
357 | strings::StrCat("Invalid gpu_thread_mode: ", gpu_thread_mode); |
358 | LOG(WARNING) << error_message; |
359 | return errors::InvalidArgument(error_message); |
360 | } |
361 | } |
362 | |
363 | return Status::OK(); |
364 | } |
365 | |
366 | bool BaseGPUDevice::RequiresRecordingAccessedTensors() const { |
367 | // When there is no more than one stream, we release the tensor reference |
368 | // at the end of the kernel launch, instead of at the end of the kernel |
369 | // execution. |
370 | return streams_.size() > 1; |
371 | } |
372 | |
373 | Status BaseGPUDevice::FillContextMap(const Graph* graph, |
374 | DeviceContextMap* device_context_map) { |
375 | VLOG(2) << "FillContextMap"; |
376 | |
377 | const size_t num_streams = streams_.size(); |
378 | // Special case for single stream. |
379 | if (num_streams == 1) { |
380 | return Status::OK(); |
381 | } |
382 | const int64 before = Env::Default()->NowMicros(); |
383 | gpu_stream_util::AssignStreamsOpts opts; |
384 | opts.max_streams = static_cast<int32>(num_streams); |
385 | std::unordered_map<int, int> node_to_stream_id; |
386 | TF_RETURN_IF_ERROR( |
387 | gpu_stream_util::AssignStreams(graph, opts, &node_to_stream_id)); |
388 | int64 elapsed = Env::Default()->NowMicros() - before; |
389 | VLOG(3) << "AssignStreams took " << elapsed << "us"; |
390 | |
391 | // Fill in the context map. It is OK for this map to contain |
392 | // duplicate DeviceContexts so long as we increment the refcount. |
393 | device_context_map->resize(graph->num_node_ids()); |
394 | for (Node* n : graph->nodes()) { |
395 | auto mapped_stream = node_to_stream_id[n->id()]; |
396 | CHECK_LE(mapped_stream, num_streams); |
397 | auto ctx = device_contexts_[mapped_stream]; |
398 | VLOG(3) << "Assigned stream " << node_to_stream_id[n->id()] |
399 | << " ==> stream[" << ctx->stream_id() << "] for node id " << n->id() |
400 | << " " << n->type_string() << " " << n->name(); |
401 | ctx->Ref(); |
402 | (*device_context_map)[n->id()] = ctx; |
403 | } |
404 | |
405 | return Status::OK(); |
406 | } |
407 | |
408 | void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) { |
409 | // ScopedActivity is cheap when tracing is not active, but we can still |
410 | // skip computing the Hash64 when tracing is off. |
411 | // TODO(pbar) This would no longer be needed if Ops have a unique id. |
412 | const uint64 id = port::Tracing::IsActive() ? Hash64(op_kernel->name()) : 0; |
413 | port::Tracing::ScopedActivity region(port::Tracing::EventCategory::kCompute, |
414 | id); |
415 | |
416 | // NOTE(tucker): We need to discriminate between Eigen GPU |
417 | // operations and all others. If an operation is Eigen |
418 | // implemented (or otherwise tries to launch a cuda kernel |
419 | // directly), we need to establish a stacked-scoped environment |
420 | // that directs it to execute on the proper device. Otherwise we |
421 | // expect the Op to use StreamExecutor directly and correctly. The |
422 | // way we make this discrimination is quite hacky: At the moment |
423 | // the only non-Eigen GPU Op is the recv-op, which is known to be |
424 | // asynchronous. |
425 | if (op_kernel->is_internal() && op_kernel->type_string() == "_Recv" ) { |
426 | context->SetStatus(errors::Internal( |
427 | "Invalid synchronous 'Compute' on GPU for '_Recv' op")); |
428 | } else if (port::Tracing::ScopedAnnotation::Enabled()) { |
429 | port::Tracing::ScopedAnnotation annotation(op_kernel->name(), |
430 | op_kernel->type_string()); |
431 | ComputeHelper(op_kernel, context); |
432 | } else { |
433 | ComputeHelper(op_kernel, context); |
434 | } |
435 | } |
436 | |
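| // Synchronous body shared by Compute(): when multiple streams are in use it |
| // first waits for the streams that produced this op's inputs, then runs the |
| // kernel with the executor context activated, and optionally syncs all GPU |
| // activity afterwards if sync_every_op_ is set. |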
437 | void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel, |
438 | OpKernelContext* context) { |
439 | GPUDeviceContext* gpu_device_context = device_contexts_[0]; |
440 | if (context->op_device_context() != nullptr) { |
441 | gpu_device_context = |
442 | static_cast<GPUDeviceContext*>(context->op_device_context()); |
443 | } |
444 | gpu::Stream* stream = gpu_device_context->stream(); |
445 | const auto stream_id = gpu_device_context->stream_id(); |
446 | |
447 | const bool vlog_1 = VLOG_IS_ON(1); |
448 | const bool vlog_2 = vlog_1 && VLOG_IS_ON(2); |
449 | |
450 | if (vlog_1) { |
451 | VLOG(1) << "GpuDevice::Compute " << op_kernel->name() << " op " |
452 | << op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream[" |
453 | << stream_id << "]"; |
454 | } |
455 | |
456 | const auto num_streams = streams_.size(); |
457 | if (num_streams > 1) { |
458 | // If this op's device context is different from the other contexts, |
459 | // we must wait on the stream. |
460 | for (int i = 0; i < context->num_inputs(); ++i) { |
461 | const GPUDeviceContext* idc = |
462 | static_cast<GPUDeviceContext*>(context->input_device_context(i)); |
463 | OP_REQUIRES(context, idc != nullptr, |
464 | errors::Internal("Input device context ", i, |
465 | " was not set properly.")); |
466 | if (vlog_2) { |
467 | const void* base; |
468 | size_t len; |
469 | if (context->has_input(i)) { |
470 | if (IsRefType(context->input_dtype(i))) { |
471 | Tensor tensor = context->mutable_input(i, false); |
472 | base = DMAHelper::base(&tensor); |
473 | len = tensor.TotalBytes(); |
474 | } else { |
475 | const Tensor& tensor = context->input(i); |
476 | base = DMAHelper::base(&tensor); |
477 | len = tensor.TotalBytes(); |
478 | } |
479 | LOG(INFO) << "Input " << i << " " << base << " " << len; |
480 | LOG(INFO) << " stream[" << stream_id << "].ThenWaitFor(stream[" |
481 | << idc->stream_id() << "])" |
482 | << ((idc->stream() == stream) ? " not needed" : ""); |
483 | } |
484 | } |
485 | if (idc->stream() != stream) stream->ThenWaitFor(idc->stream()); |
486 | } |
487 | } |
488 | gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()}; |
489 | op_kernel->Compute(context); |
490 | if (context->status().ok()) { |
491 | if (sync_every_op_) { |
492 | // Note: GPUUtil::Sync() only syncs the default stream. |
493 | // We need to either sync the stream used by this op, or |
494 | // all streams. Given that this flag is typically used for |
495 | // debugging it makes more sense to sync all GPU activity. |
496 | context->SetStatus(GPUUtil::SyncAll(this)); |
497 | } |
498 | } |
499 | } |
500 | |
501 | void BaseGPUDevice::ConsumeListOfAccessedTensors( |
502 | DeviceContext* device_context, const TensorReferenceVector& tensor_refs) { |
503 | GPUDeviceContext* gpu_device_context = device_contexts_[0]; |
504 | if (device_context != nullptr) { |
505 | gpu_device_context = static_cast<GPUDeviceContext*>(device_context); |
506 | } |
507 | gpu::Stream* stream = gpu_device_context->stream(); |
508 | em_->ThenDeleteTensors(stream, tensor_refs); |
509 | } |
510 | |
511 | // Based on the semantics of Device::Sync this call should wait for |
512 | // all streams not just the current one. |
513 | Status BaseGPUDevice::Sync() { return GPUUtil::SyncAll(this); } |
514 | |
515 | void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel, |
516 | OpKernelContext* context, |
517 | AsyncOpKernel::DoneCallback done) { |
518 | GPUDeviceContext* gpu_device_context = device_contexts_[0]; |
519 | if (context->op_device_context() != nullptr) { |
520 | gpu_device_context = |
521 | static_cast<GPUDeviceContext*>(context->op_device_context()); |
522 | } |
523 | gpu::Stream* stream = gpu_device_context->stream(); |
524 | const auto stream_id = gpu_device_context->stream_id(); |
525 | |
526 | VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op " |
527 | << op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream[" |
528 | << stream_id << "]"; |
529 | |
530 | // When TraceMe profiling is off (which is the default), the |
531 | // following TraceMe constructor reduces to a conditional test of a |
532 | // false value. Measurements show that its overhead is negligible. |
533 | port::Tracing::TraceMe activity(op_kernel->name(), op_kernel->type_string(), |
534 | op_kernel->IsExpensive()); |
535 | gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()}; |
536 | op_kernel->ComputeAsync(context, done); |
537 | } |
538 | |
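| // Copies 'from' into a freshly allocated GPU tensor '*to' unless alloc_attrs |
| // requests host memory, in which case '*to' simply aliases 'from'. 'done' is |
| // invoked with the final status; the returned Status only reflects errors |
| // detected while setting up the copy. |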
539 | Status BaseGPUDevice::MaybeCopyTensorToGPU( |
540 | const AllocatorAttributes& alloc_attrs, const Tensor& from, Tensor* to, |
541 | StatusCallback done) { |
542 | if (alloc_attrs.on_host()) { |
543 | *to = from; |
544 | done(Status::OK()); |
545 | return Status::OK(); |
546 | } else { |
547 | if (!DMAHelper::CanUseDMA(&from)) { |
548 | Status err = errors::Internal("GPU copy from non-DMA ", |
549 | DataTypeString(from.dtype()), " tensor"); |
550 | done(err); |
551 | return err; |
552 | } |
553 | auto* copy = |
554 | new Tensor(GetAllocator(alloc_attrs), from.dtype(), from.shape()); |
555 | |
556 | // If the tensor is not initialized, we likely ran out of memory. |
557 | if (!copy->IsInitialized()) { |
558 | delete copy; |
559 | Status err = errors::ResourceExhausted( |
560 | "OOM when allocating tensor of shape ", from.shape().DebugString(), |
561 | " and type ", DataTypeString(from.dtype())); |
562 | done(err); |
563 | return err; |
564 | } |
565 | |
566 | StatusCallback wrapped_done = std::bind( |
567 | [to, copy](StatusCallback done_, |
568 | // Begin unbound arguments. |
569 | const Status& s) { |
570 | *to = std::move(*copy); |
571 | delete copy; |
572 | done_(s); |
573 | }, |
574 | std::move(done), std::placeholders::_1); |
575 | |
576 | port::Tracing::ScopedAnnotation annotation("MakeTensorFromProto"); |
577 | device_contexts_[0]->CopyCPUTensorToDevice(&from, this, copy, |
578 | std::move(wrapped_done)); |
579 | return Status::OK(); |
580 | } |
581 | } |
582 | |
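| // Parses 'tensor_proto' into a host tensor and then copies it onto this GPU. |
| // DT_VARIANT tensors are copied element by element via VariantDeviceCopy; |
| // all other dtypes are handled by a single MaybeCopyTensorToGPU call. |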
583 | Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto, |
584 | const AllocatorAttributes alloc_attrs, |
585 | Tensor* tensor) { |
586 | AllocatorAttributes attr; |
587 | attr.set_on_host(true); |
588 | attr.set_gpu_compatible(true); |
589 | Allocator* host_alloc = GetAllocator(attr); |
590 | Tensor parsed(tensor_proto.dtype()); |
591 | if (!parsed.FromProto(host_alloc, tensor_proto)) { |
592 | return errors::InvalidArgument("Cannot parse tensor from proto: ", |
593 | tensor_proto.DebugString()); |
594 | } |
595 | |
596 | if (parsed.dtype() == DT_VARIANT) { |
597 | const Variant* from = parsed.flat<Variant>().data(); |
598 | Tensor copy(cpu_allocator(), DT_VARIANT, parsed.shape()); |
599 | Variant* copy_variant = copy.flat<Variant>().data(); |
600 | |
601 | std::list<Notification> notifications; |
602 | Status copy_status; |
603 | auto copier = [this, &alloc_attrs, ¬ifications, ©_status]( |
604 | const Tensor& from, Tensor* to) { |
605 | // Copier isn't run in a multithreaded environment, so we don't |
606 | // have to worry about the notifications list being modified in parallel. |
607 | notifications.emplace_back(); |
608 | Notification& n = *notifications.rbegin(); |
609 | return MaybeCopyTensorToGPU(alloc_attrs, from, to, |
610 | [&n, ©_status](const Status& s) { |
611 | if (copy_status.ok()) { |
612 | copy_status.Update(s); |
613 | } |
614 | n.Notify(); |
615 | }); |
616 | }; |
617 | Status s; |
618 | for (int64 ix = 0; ix < parsed.NumElements(); ++ix) { |
619 | s = VariantDeviceCopy(VariantDeviceCopyDirection::HOST_TO_DEVICE, |
620 | from[ix], ©_variant[ix], copier); |
621 | if (!s.ok()) { |
622 | break; |
623 | } |
624 | } |
625 | for (auto& n : notifications) { |
626 | n.WaitForNotification(); |
627 | } |
628 | if (!s.ok()) { |
629 | return s; |
630 | } |
631 | *tensor = std::move(copy); |
632 | return copy_status; |
633 | } else { |
634 | Notification n; |
635 | Status status; |
636 | TF_RETURN_IF_ERROR(MaybeCopyTensorToGPU(alloc_attrs, parsed, tensor, |
637 | [&n, &status](const Status& s) { |
638 | status = s; |
639 | n.Notify(); |
640 | })); |
641 | n.WaitForNotification(); |
642 | return status; |
643 | } |
644 | } |
645 | |
646 | namespace { |
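| // Adapts an EigenCudaStreamDevice to the PerOpGpuDevice interface so that |
| // per-op Eigen expressions run on the stream, scratch buffer and allocator |
| // chosen by BaseGPUDevice. |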
647 | class ConcretePerOpGpuDevice : public PerOpGpuDevice { |
648 | public: |
649 | ConcretePerOpGpuDevice() : device_(&stream_device_) {} |
650 | |
651 | void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream, |
652 | TfGpuId tf_gpu_id, Allocator* base_allocator, |
653 | char* scratch) { |
654 | stream_device_.Reinitialize(context, cuda_stream, tf_gpu_id, base_allocator, |
655 | scratch); |
656 | } |
657 | |
658 | const Eigen::GpuDevice& device() const override { return device_; } |
659 | |
660 | private: |
661 | EigenCudaStreamDevice stream_device_; |
662 | Eigen::GpuDevice device_; |
663 | }; |
664 | |
665 | // Parse 'visible_device_list' into a list of CUDA GPU ids. |
666 | Status ParseVisibleDeviceList(const string& visible_device_list, |
667 | std::vector<CudaGpuId>* visible_gpu_order) { |
668 | visible_gpu_order->clear(); |
669 | gpu::Platform* gpu_manager = GPUMachineManager(); |
670 | |
671 | // If the user wants to remap the visible-to-virtual GPU mapping, |
672 | // check for that here. |
673 | if (visible_device_list.empty()) { |
674 | visible_gpu_order->resize(gpu_manager->VisibleDeviceCount()); |
675 | // By default, visible to virtual mapping is unchanged. |
676 | int deviceNo = 0; |
677 | std::generate(visible_gpu_order->begin(), visible_gpu_order->end(), |
678 | [&deviceNo] { return deviceNo++; }); |
679 | } else { |
680 | const std::vector<string> order_str = |
681 | str_util::Split(visible_device_list, ','); |
682 | for (const string& cuda_gpu_id_str : order_str) { |
683 | int32 cuda_gpu_id; |
684 | if (!strings::safe_strto32(cuda_gpu_id_str, &cuda_gpu_id)) { |
685 | return errors::InvalidArgument( |
686 | "Could not parse entry in 'visible_device_list': '", |
687 | cuda_gpu_id_str, "'. visible_device_list = ", visible_device_list); |
688 | } |
689 | if (cuda_gpu_id < 0 || cuda_gpu_id >= gpu_manager->VisibleDeviceCount()) { |
690 | return errors::InvalidArgument( |
691 | "'visible_device_list' listed an invalid GPU id '", cuda_gpu_id, |
692 | "' but visible device count is ", |
693 | gpu_manager->VisibleDeviceCount()); |
694 | } |
695 | visible_gpu_order->push_back(CudaGpuId(cuda_gpu_id)); |
696 | } |
697 | } |
698 | |
699 | // Validate no repeats. |
700 | std::set<CudaGpuId> visible_device_set(visible_gpu_order->begin(), |
701 | visible_gpu_order->end()); |
702 | if (visible_device_set.size() != visible_gpu_order->size()) { |
703 | return errors::InvalidArgument( |
704 | "visible_device_list contained a duplicate entry: ", |
705 | visible_device_list); |
706 | } |
707 | return Status::OK(); |
708 | } |
709 | |
710 | Status VerifyVirtualDeviceSettings( |
711 | const size_t num_gpus_to_use, const GPUOptions& gpu_options, |
712 | const std::vector<CudaGpuId>& visible_gpu_order, |
713 | const std::vector<CudaGpuId>& valid_cuda_gpu_ids) { |
714 | const auto& virtual_devices = gpu_options.experimental().virtual_devices(); |
715 | CHECK(!virtual_devices.empty()); |
716 | if (gpu_options.per_process_gpu_memory_fraction() > 0) { |
717 | return errors::InvalidArgument( |
718 | "It's invalid to set per_process_gpu_memory_fraction when " |
719 | "virtual_devices is set."); |
720 | } |
721 | if (num_gpus_to_use < virtual_devices.size()) { |
722 | return errors::Unknown( |
723 | "Not enough GPUs to create virtual devices." |
724 | " num_gpus_to_use: ", |
725 | num_gpus_to_use, " #virtual_devices: ", virtual_devices.size()); |
726 | } |
727 | if (!gpu_options.visible_device_list().empty() && |
728 | visible_gpu_order.size() != virtual_devices.size()) { |
729 | return errors::InvalidArgument( |
730 | "The number of GPUs in visible_device_list doesn't match the number " |
731 | "of elements in the virtual_devices list.", |
732 | " #GPUs in visible_device_list: ", visible_gpu_order.size(), |
733 | " virtual_devices.size(): ", virtual_devices.size()); |
734 | } |
735 | if (valid_cuda_gpu_ids.size() != virtual_devices.size()) { |
736 | return errors::Unknown( |
737 | "The number of valid GPUs doesn't match the number of elements in " |
738 | "the virtual_devices list.", |
739 | " #valid GPUs: ", valid_cuda_gpu_ids.size(), |
740 | " virtual_devices.size(): ", virtual_devices.size()); |
741 | } |
742 | return Status::OK(); |
743 | } |
744 | |
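| // Returns how many bytes of 'available_memory' should be reserved for the |
| // system (driver, runtime, display) rather than handed to the TF allocator. |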
745 | int64 MinSystemMemory(int64 available_memory) { |
746 | // We use the following heuristic for now: |
747 | // |
748 | // If the available_memory is < 2GiB, we allocate 225MiB to system memory. |
749 | // Otherwise, allocate max(300MiB, 0.05 * available_memory) to system memory. |
750 | // |
751 | // In the future we could be more sophisticated by using a table of devices. |
752 | int64 min_system_memory; |
753 | if (available_memory < (1LL << 31)) { |
754 | // 225MiB |
755 | min_system_memory = 225 * 1024 * 1024; |
756 | } else { |
757 | // max(300 MiB, 0.05 * available_memory) |
758 | min_system_memory = |
759 | std::max(314572800LL, static_cast<int64>(available_memory * 0.05)); |
760 | } |
761 | #if defined(__GNUC__) && defined(__OPTIMIZE__) |
762 | // Do nothing |
763 | #elif !defined(__GNUC__) && defined(NDEBUG) |
764 | // Do nothing |
765 | #else |
766 | // Double the reserved system memory in non-opt builds (debug builds |
767 | // on Windows), because in non-opt builds more system memory |
768 | // is necessary. |
769 | min_system_memory *= 2; |
770 | #endif |
771 | |
772 | #if defined(ANDROID_TEGRA) |
773 | // 1GB system mem for NVIDIA Tegra devices since they use the same mem for RAM |
774 | // and Video RAM |
775 | min_system_memory = 1 << 30; |
776 | #endif |
777 | return min_system_memory; |
778 | } |
779 | |
780 | // Get the memory limit for the virtual device being created on GPU with |
781 | // 'cuda_gpu_id', when that virtual device is the only virtual device being |
782 | // created on that GPU. |
783 | Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options, |
784 | CudaGpuId cuda_gpu_id, |
785 | int64* memory_limit) { |
786 | int64 total_memory = 0; |
787 | int64 available_memory = 0; |
788 | gpu::StreamExecutor* se = |
789 | GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); |
790 | if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) { |
791 | return errors::Unknown("Failed to query available memory for GPU ", |
792 | cuda_gpu_id.value()); |
793 | } |
794 | |
795 | int64 allocated_memory = 0; |
796 | const double per_process_gpu_memory_fraction = |
797 | gpu_options.per_process_gpu_memory_fraction(); |
798 | if (per_process_gpu_memory_fraction == 0) { |
799 | allocated_memory = available_memory; |
800 | const int64 min_system_memory = MinSystemMemory(available_memory); |
801 | if (min_system_memory < allocated_memory) { |
802 | allocated_memory -= min_system_memory; |
803 | } |
804 | } else { |
805 | allocated_memory = total_memory * per_process_gpu_memory_fraction; |
806 | } |
807 | *memory_limit = allocated_memory; |
808 | return Status::OK(); |
809 | } |
810 | } // namespace |
811 | |
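| // Points 'device' at the compute stream and scratch buffer associated with |
| // 'stream_id', using 'allocator' for temporary buffers requested by Eigen. |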
812 | void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context, |
813 | PerOpGpuDevice* device, int stream_id, |
814 | Allocator* allocator) { |
815 | ConcretePerOpGpuDevice* concrete_device = |
816 | static_cast<ConcretePerOpGpuDevice*>(device); |
817 | DCHECK(concrete_device); |
818 | const cudaStream_t* cuda_stream = reinterpret_cast<const cudaStream_t*>( |
819 | streams_[stream_id]->compute->implementation()->CudaStreamMemberHack()); |
820 | concrete_device->Reinitialize(context, cuda_stream, tf_gpu_id_, allocator, |
821 | scratch_[stream_id]); |
822 | } |
823 | |
824 | PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice() { |
825 | return new ConcretePerOpGpuDevice(); |
826 | } |
827 | |
828 | void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context, |
829 | PerOpGpuDevice* device, |
830 | DeviceContext* dc, |
831 | Allocator* allocator) { |
832 | if (dc) { |
833 | const GPUDeviceContext* gpu_dc = static_cast<GPUDeviceContext*>(dc); |
834 | const int stream_id = gpu_dc->stream_id(); |
835 | VLOG(1) << " eigen_gpu_device(" << dc << ") => stream[" << stream_id |
836 | << "]"; |
837 | CHECK_LT(stream_id, streams_.size()); |
838 | ReinitializeDevice(context, device, stream_id, allocator); |
839 | } else { |
840 | ReinitializeDevice(context, device, 0, allocator); |
841 | } |
842 | } |
843 | |
844 | Allocator* BaseGPUDevice::GetScopedAllocator(AllocatorAttributes attr, |
845 | int64 step_id) { |
846 | if (attr.scope_id > 0) { |
847 | return scoped_allocator_mgr_->GetContainer(step_id)->GetInstance( |
848 | attr.scope_id); |
849 | } |
850 | LOG(FATAL) << "Unexpected call to BaseGPUDevice::GetScopedAllocator " |
851 | << "attr.scope_id = " << attr.scope_id; |
852 | return gpu_allocator_; |
853 | } |
854 | |
855 | const int BaseGPUDeviceFactory::InterconnectMap::kSameDeviceStrength = 1000; |
856 | const int BaseGPUDeviceFactory::InterconnectMap::kStreamExecutorStrength = 1; |
857 | |
858 | Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options, |
859 | const string& name_prefix, |
860 | std::vector<Device*>* devices) { |
861 | TF_RETURN_IF_ERROR(ValidateGPUMachineManager()); |
862 | gpu::Platform* gpu_manager = GPUMachineManager(); |
863 | if (gpu_manager == nullptr) { |
864 | return Status::OK(); |
865 | } |
866 | // If there are no GPUs visible, do nothing. |
867 | if (gpu_manager->VisibleDeviceCount() <= 0) { |
868 | return Status::OK(); |
869 | } |
870 | |
871 | size_t num_gpus_to_use = INT_MAX; |
872 | auto iter = options.config.device_count().find("GPU"); |
873 | if (iter != options.config.device_count().end()) { |
874 | num_gpus_to_use = iter->second; |
875 | } |
876 | const auto& gpu_options = options.config.gpu_options(); |
877 | std::vector<CudaGpuId> visible_gpu_order; |
878 | TF_RETURN_IF_ERROR(ParseVisibleDeviceList(gpu_options.visible_device_list(), |
879 | &visible_gpu_order)); |
880 | |
881 | std::vector<CudaGpuId> valid_cuda_gpu_ids; |
882 | TF_RETURN_IF_ERROR(GetValidDeviceIds(visible_gpu_order, &valid_cuda_gpu_ids)); |
883 | if (num_gpus_to_use > valid_cuda_gpu_ids.size()) { |
884 | num_gpus_to_use = valid_cuda_gpu_ids.size(); |
885 | } |
886 | if (!valid_cuda_gpu_ids.empty()) { |
887 | // Save the original device. |
888 | int original_device = 0; |
889 | cudaError_t err = cudaGetDevice(&original_device); |
890 | if (err != cudaSuccess) { |
891 | return errors::Internal("cudaGetDevice() failed. Status: ", |
892 | cudaGetErrorString(err)); |
893 | } |
894 | // Force the CUDA runtime to initialize implicitly on each valid GPU |
895 | // before CreateGPUDevice(). |
896 | for (CudaGpuId cuda_gpu_id : valid_cuda_gpu_ids) { |
897 | err = cudaSetDevice(cuda_gpu_id.value()); |
898 | if (err != cudaSuccess) { |
899 | return errors::Internal("cudaSetDevice() on GPU:", cuda_gpu_id.value(), |
900 | " failed. Status: ", cudaGetErrorString(err)); |
901 | } |
902 | err = cudaFree(nullptr); |
903 | if (err != cudaSuccess) { |
904 | return errors::Internal( |
905 | "CUDA runtime implicit initialization on GPU:", cuda_gpu_id.value(), |
906 | " failed. Status: ", cudaGetErrorString(err)); |
907 | } |
908 | } |
909 | // Reset to the original device. |
910 | err = cudaSetDevice(original_device); |
911 | if (err != cudaSuccess) { |
912 | return errors::Internal("cudaSetDevice() on GPU:", original_device, |
913 | " failed. Status: ", cudaGetErrorString(err)); |
914 | } |
915 | } |
916 | |
917 | std::vector<InterconnectMap> interconnect_maps; |
918 | TF_RETURN_IF_ERROR( |
919 | GetInterconnectMaps(visible_gpu_order, gpu_manager, &interconnect_maps)); |
920 | |
921 | // Print each interconnect map to the log. |
922 | for (const InterconnectMap& im : interconnect_maps) { |
923 | LOG(INFO) << "Device interconnect " << im.name << " with strength " |
924 | << im.strength << " edge matrix:"; |
925 | string line_buf = "   "; |
926 | for (int i = 0; i < visible_gpu_order.size(); ++i) { |
927 | strings::StrAppend(&line_buf, visible_gpu_order[i].value(), " "); |
928 | } |
929 | LOG(INFO) << line_buf; |
930 | for (int i = 0; i < visible_gpu_order.size(); ++i) { |
931 | line_buf = strings::StrCat(visible_gpu_order[i].value(), ": "); |
932 | CudaGpuId cuda_id_i = visible_gpu_order[i]; |
933 | for (int j = 0; j < visible_gpu_order.size(); ++j) { |
934 | CudaGpuId cuda_id_j = visible_gpu_order[j]; |
935 | if (im.directed_links.find({cuda_id_i, cuda_id_j}) != |
936 | im.directed_links.end()) { |
937 | line_buf.append("Y "); |
938 | } else { |
939 | line_buf.append("N "); |
940 | } |
941 | } |
942 | LOG(INFO) << line_buf; |
943 | } |
944 | } |
945 | |
946 | const auto& virtual_devices = gpu_options.experimental().virtual_devices(); |
947 | if (!virtual_devices.empty()) { |
948 | TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings( |
949 | num_gpus_to_use, gpu_options, visible_gpu_order, valid_cuda_gpu_ids)); |
950 | // We've verified that num_gpus_to_use >= virtual_devices.size(). |
951 | num_gpus_to_use = virtual_devices.size(); |
952 | CHECK(gpu_options.visible_device_list().empty() || |
953 | valid_cuda_gpu_ids == visible_gpu_order); |
954 | } |
955 | int next_tf_gpu_id = 0; |
956 | std::vector<int64> memory_limit_bytes; |
957 | for (int i = 0; i < num_gpus_to_use; ++i) { |
958 | const CudaGpuId cuda_gpu_id = valid_cuda_gpu_ids[i]; |
959 | if (virtual_devices.empty() || |
960 | virtual_devices.Get(i).memory_limit_mb_size() == 0) { |
961 | int64 single_virtual_device_memory_limit = 0; |
962 | TF_RETURN_IF_ERROR(SingleVirtualDeviceMemoryLimit( |
963 | gpu_options, cuda_gpu_id, &single_virtual_device_memory_limit)); |
964 | memory_limit_bytes.push_back(single_virtual_device_memory_limit); |
965 | } else { |
966 | const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb(); |
967 | std::transform(memory_limit_mb.begin(), memory_limit_mb.end(), |
968 | std::back_inserter(memory_limit_bytes), [](float mb) { |
969 | return static_cast<int64>(mb) * (1ll << 20); |
970 | }); |
971 | } |
972 | while (next_tf_gpu_id < memory_limit_bytes.size()) { |
973 | TfGpuId tf_gpu_id(next_tf_gpu_id); |
974 | ++next_tf_gpu_id; |
975 | GpuIdManager::InsertTfCudaGpuIdPair(tf_gpu_id, cuda_gpu_id); |
976 | } |
977 | } |
978 | const int num_tf_gpus = next_tf_gpu_id; |
979 | |
980 | LocalityMap device_localities; |
981 | TF_RETURN_IF_ERROR( |
982 | GetDeviceLocalities(num_tf_gpus, interconnect_maps, &device_localities)); |
983 | |
984 | // Build the GPUDevices |
985 | CHECK_EQ(next_tf_gpu_id, memory_limit_bytes.size()); |
986 | for (int di = 0; di < num_tf_gpus; ++di) { |
987 | TfGpuId tf_gpu_id(di); |
988 | int64 bytes = memory_limit_bytes[di]; |
989 | auto it = device_localities.find(tf_gpu_id); |
990 | if (it == device_localities.end()) { |
991 | return errors::Internal("Failed to find DeviceLocality for GPU device ", |
992 | tf_gpu_id.value()); |
993 | } |
994 | TF_RETURN_IF_ERROR(CreateGPUDevice(options, name_prefix, tf_gpu_id, bytes, |
995 | it->second, devices)); |
996 | } |
997 | return Status::OK(); |
998 | } |
999 | |
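| // Returns a short human-readable description of the physical GPU: device |
| // ordinal, name, PCI bus id and compute capability. |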
1000 | static string GetShortDeviceDescription(CudaGpuId cuda_gpu_id, |
1001 | const gpu::DeviceDescription& desc) { |
1002 | int cc_major; |
1003 | int cc_minor; |
1004 | if (!desc.cuda_compute_capability(&cc_major, &cc_minor)) { |
1005 | cc_major = 0; |
1006 | cc_minor = 0; |
1007 | } |
1008 | // LINT.IfChange |
1009 | return strings::StrCat("device: ", cuda_gpu_id.value(), |
1010 | ", name: ", desc.name(), |
1011 | ", pci bus id: ", desc.pci_bus_id(), |
1012 | ", compute capability: ", cc_major, ".", cc_minor); |
1013 | // LINT.ThenChange(//tensorflow/python/platform/test.py) |
1014 | } |
1015 | |
1016 | Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options, |
1017 | const string& name_prefix, |
1018 | TfGpuId tf_gpu_id, |
1019 | int64 memory_limit, |
1020 | const DeviceLocality& dev_locality, |
1021 | std::vector<Device*>* devices) { |
1022 | CHECK_GE(tf_gpu_id.value(), 0); |
1023 | const string device_name = |
1024 | strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value()); |
1025 | GpuIdUtil::CheckValidTfGpuId(tf_gpu_id); |
1026 | CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id); |
1027 | int numa_node = dev_locality.numa_node(); |
1028 | |
1029 | gpu::StreamExecutor* se = |
1030 | GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); |
1031 | const gpu::DeviceDescription& desc = se->GetDeviceDescription(); |
1032 | ProcessState* process_state = ProcessState::singleton(); |
1033 | Allocator* gpu_allocator = process_state->GetGPUAllocator( |
1034 | options.config.gpu_options(), tf_gpu_id, memory_limit); |
1035 | if (gpu_allocator == nullptr) { |
1036 | return errors::Internal("Failed to get memory allocator for TF GPU ", |
1037 | tf_gpu_id.value(), " with ", memory_limit, |
1038 | " bytes of memory."); |
1039 | } |
1040 | AllocatorStats stats; |
1041 | gpu_allocator->GetStats(&stats); |
1042 | // 'memory_limit' is the requested memory size. However, if an allocator for |
1043 | // the given tf_gpu_id was created earlier, it is reused instead of creating a |
1044 | // new one (a TF GPU device is a shared resource), in which case the actual |
1045 | // limit used by that allocator, reported as 'stats.bytes_limit', may differ; |
1046 | // such a mismatch should be treated as an error. |
1047 | // |
1048 | // TODO(laigd): report error if memory_limit doesn't match stats.bytes_limit. |
1049 | BaseGPUDevice* gpu_device = CreateGPUDevice( |
1050 | options, device_name, static_cast<Bytes>(stats.bytes_limit), dev_locality, |
1051 | tf_gpu_id, GetShortDeviceDescription(cuda_gpu_id, desc), gpu_allocator, |
1052 | process_state->GetCPUAllocator(numa_node)); |
1053 | LOG(INFO) << "Created TensorFlow device (" << device_name << " with " |
1054 | << (stats.bytes_limit >> 20) << " MB memory) -> physical GPU (" |
1055 | << GetShortDeviceDescription(cuda_gpu_id, desc) << ")"; |
1056 | TF_RETURN_IF_ERROR(gpu_device->Init(options)); |
1057 | devices->push_back(gpu_device); |
1058 | |
1059 | return Status::OK(); |
1060 | } |
1061 | |
1062 | namespace { |
1063 | std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>> |
1064 | GetPeerAccessMap(gpu::Platform* platform, |
1065 | const std::vector<CudaGpuId>& visible_gpu_order) { |
1066 | std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>> map( |
1067 | new std::map<std::pair<CudaGpuId, CudaGpuId>, bool>); |
1068 | for (CudaGpuId cuda_gpu_i : visible_gpu_order) { |
1069 | for (CudaGpuId cuda_gpu_j : visible_gpu_order) { |
1070 | gpu::StreamExecutor* from = |
1071 | GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie(); |
1072 | gpu::StreamExecutor* to = |
1073 | GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie(); |
1074 | (*map)[{cuda_gpu_i, cuda_gpu_j}] = from->CanEnablePeerAccessTo(to); |
1075 | } |
1076 | } |
1077 | |
1078 | return map; |
1079 | } |
1080 | |
1081 | } // namespace |
1082 | |
1083 | Status BaseGPUDeviceFactory::GetInterconnectMaps( |
1084 | const std::vector<CudaGpuId>& visible_gpu_order, gpu::Platform* gpu_manager, |
1085 | std::vector<InterconnectMap>* maps) { |
1086 | // The default interconnect map is obtained from the StreamExecutor. |
1087 | auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order); |
1088 | maps->resize(1); |
1089 | InterconnectMap& imap = maps->at(0); |
1090 | imap.name = "StreamExecutor"; |
1091 | imap.strength = InterconnectMap::kStreamExecutorStrength; |
1092 | for (CudaGpuId cuda_id_i : visible_gpu_order) { |
1093 | for (CudaGpuId cuda_id_j : visible_gpu_order) { |
1094 | if (cuda_id_i == cuda_id_j) continue; |
1095 | if ((*access_map)[{cuda_id_i, cuda_id_j}]) { |
1096 | imap.directed_links.insert({cuda_id_i, cuda_id_j}); |
1097 | } |
1098 | } |
1099 | } |
1100 | return Status::OK(); |
1101 | } |
1102 | |
1103 | Status BaseGPUDeviceFactory::GetDeviceLocalities( |
1104 | int num_tf_gpus, const std::vector<InterconnectMap>& interconnects, |
1105 | LocalityMap* localities) { |
1106 | std::vector<TfGpuId> all_tf_gpu_ids; |
1107 | for (int i = 0; i < num_tf_gpus; ++i) { |
1108 | all_tf_gpu_ids.push_back(TfGpuId(i)); |
1109 | } |
1110 | for (TfGpuId tf_gpu_id : all_tf_gpu_ids) { |
1111 | CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id); |
1112 | // Get GPU bus_id from its reported NUMA affinity. Because GPUs are |
1113 | // virtualized in some environments, we can't just use the GPU id. |
1114 | // NUMA locales are indexed from 0, buses are indexed from 1. |
1115 | gpu::StreamExecutor* se = |
1116 | GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); |
1117 | const gpu::DeviceDescription& desc = se->GetDeviceDescription(); |
1118 | int numa_node = desc.numa_node(); |
1119 | if (numa_node < 0) { |
1120 | // For some reason the StreamExecutor couldn't get the NUMA |
1121 | // affinity of the GPU. If this is not a multi-socket mobo with |
1122 | // GPUs local to different buses, it doesn't matter. If it is, we |
1123 | // may run into trouble later with data transfer operations. The |
1124 | // trouble may manifest as slower than expected performance, or |
1125 | // outright failures. |
1126 | LOG(INFO) << "Could not identify NUMA node of CUDA gpu id " << cuda_gpu_id |
1127 | << ", defaulting to 0. Your kernel may not have been built " |
1128 | << "with NUMA support."; |
1129 | numa_node = 0; |
1130 | } |
1131 | DeviceLocality dev_locality; |
1132 | dev_locality.set_numa_node(numa_node); |
1133 | dev_locality.set_bus_id(numa_node + 1); |
1134 | |
1135 | // Set LocalLinks from InterconnectMaps. |
1136 | LocalLinks* links = dev_locality.mutable_links(); |
1137 | for (const InterconnectMap& imap : interconnects) { |
1138 | for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) { |
1139 | CudaGpuId cuda_gpu_dst = GpuIdManager::TfToCudaGpuId(tf_gpu_dst); |
1140 | if (imap.directed_links.find({cuda_gpu_id, cuda_gpu_dst}) != |
1141 | imap.directed_links.end()) { |
1142 | InterconnectLink* ilink = links->add_link(); |
1143 | ilink->set_device_id(tf_gpu_dst.value()); |
1144 | ilink->set_type(imap.name); |
1145 | ilink->set_strength(imap.strength); |
1146 | } |
1147 | } |
1148 | } |
1149 | |
1150 | // If this is one of multiple virtual GPUs on the same physical GPU, |
1151 | // add high-strength links to the others. |
1152 | for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) { |
1153 | if (tf_gpu_id == tf_gpu_dst) continue; |
1154 | CudaGpuId cuda_gpu_dst = GpuIdManager::TfToCudaGpuId(tf_gpu_dst); |
1155 | if (cuda_gpu_id == cuda_gpu_dst) { |
1156 | InterconnectLink* ilink = links->add_link(); |
1157 | ilink->set_device_id(tf_gpu_dst.value()); |
1158 | ilink->set_type("SAME_DEVICE"); |
1159 | ilink->set_strength(InterconnectMap::kSameDeviceStrength); |
1160 | } |
1161 | } |
1162 | |
1163 | (*localities)[tf_gpu_id] = dev_locality; |
1164 | VLOG(1) << "GPUDevice CudaGpuId " << cuda_gpu_id << " TfGpuId " << tf_gpu_id |
1165 | << " on bus " << dev_locality.bus_id() << " numa: " << numa_node |
1166 | << " pci: " << desc.pci_bus_id() |
1167 | << " DeviceLocality: " << dev_locality.DebugString(); |
1168 | } |
1169 | return Status::OK(); |
1170 | } |
1171 | |
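| // Returns the default threshold used to filter out slow GPUs: the smaller of |
| // 8 and the highest multiprocessor count among the visible GPUs. |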
1172 | static int GetDefaultMinGPUMultiprocessorCount( |
1173 | gpu::Platform* gpu_manager, |
1174 | const std::vector<CudaGpuId>& visible_gpu_order) { |
1175 | static const int kDefaultMinGPUMultiprocessorCount = 8; |
1176 | |
1177 | // Find the highest multi-processor count across all visible GPUs. |
1178 | int max_count = -1; |
1179 | for (int i = 0; i < visible_gpu_order.size(); ++i) { |
1180 | auto exec_status = |
1181 | GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_order[i]); |
1182 | if (!exec_status.ok()) { |
1183 | continue; |
1184 | } |
1185 | |
1186 | gpu::StreamExecutor* se = exec_status.ValueOrDie(); |
1187 | const gpu::DeviceDescription& desc = se->GetDeviceDescription(); |
1188 | max_count = std::max(max_count, desc.core_count()); |
1189 | } |
1190 | |
1191 | if (max_count < 0 || kDefaultMinGPUMultiprocessorCount < max_count) { |
1192 | return kDefaultMinGPUMultiprocessorCount; |
1193 | } else { |
1194 | return max_count; |
1195 | } |
1196 | } |
1197 | |
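| // Returns the multiprocessor-count threshold, honoring the |
| // TF_MIN_GPU_MULTIPROCESSOR_COUNT environment variable when it parses to a |
| // non-negative integer and falling back to the default otherwise. |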
1198 | static int GetMinGPUMultiprocessorCount( |
1199 | gpu::Platform* gpu_manager, |
1200 | const std::vector<CudaGpuId>& visible_gpu_order) { |
1201 | const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT"); |
1202 | |
1203 | if (tf_min_gpu_core_count == nullptr || |
1204 | strcmp(tf_min_gpu_core_count, "") == 0) { |
1205 | return GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order); |
1206 | } |
1207 | |
1208 | int min_gpu_core_count = -1; |
1209 | if (strings::safe_strto32(tf_min_gpu_core_count, &min_gpu_core_count)) { |
1210 | if (min_gpu_core_count >= 0) { |
1211 | return min_gpu_core_count; |
1212 | } |
1213 | } |
1214 | |
1215 | int count = |
1216 | GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order); |
1217 | LOG(ERROR) << "Invalid minimum GPU multiprocessor count: [" |
1218 | << tf_min_gpu_core_count << "]. " |
1219 | << "Using the default value: " << count; |
1220 | return count; |
1221 | } |
1222 | |
1223 | namespace { |
1224 | |
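| // A CUDA compute capability expressed as a (major, minor) version pair. |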
1225 | struct CudaVersion { |
1226 | // Initialize from version_name in the form of "3.5" |
1227 | explicit CudaVersion(const std::string& version_name) { |
1228 | size_t dot_pos = version_name.find('.'); |
1229 | CHECK(dot_pos != string::npos) |
1230 | << "Illegal version name: [" << version_name << "]"; |
1231 | string major_str = version_name.substr(0, dot_pos); |
1232 | CHECK(strings::safe_strto32(major_str, &major_part)) |
1233 | << "Illegal version name: [" << version_name << "]"; |
1234 | string minor_str = version_name.substr(dot_pos + 1); |
1235 | CHECK(strings::safe_strto32(minor_str, &minor_part)) |
1236 | << "Illegal version name: [" << version_name << "]"; |
1237 | } |
1238 | CudaVersion() {} |
1239 | bool operator<(const CudaVersion& other) const { |
1240 | if (this->major_part != other.major_part) { |
1241 | return this->major_part < other.major_part; |
1242 | } |
1243 | return this->minor_part < other.minor_part; |
1244 | } |
1245 | friend std::ostream& operator<<(std::ostream& os, |
1246 | const CudaVersion& version) { |
1247 | os << version.major_part << "." << version.minor_part; |
1248 | return os; |
1249 | } |
1250 | int major_part = -1; |
1251 | int minor_part = -1; |
1252 | }; |
1253 | |
1254 | std::vector<CudaVersion> supported_cuda_compute_capabilities = { |
1255 | TF_CUDA_CAPABILITIES,}; |
1256 | |
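| // Returns the compute capabilities this binary was built to support, plus |
| // any extras supplied via the TF_EXTRA_CUDA_CAPABILITIES build macro. |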
1257 | std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() { |
1258 | auto cuda_caps = supported_cuda_compute_capabilities; |
1259 | #ifdef TF_EXTRA_CUDA_CAPABILITIES |
1260 | // TF_EXTRA_CUDA_CAPABILITIES should be defined as a comma-separated sequence, |
1261 | // for example: |
1262 | // TF_EXTRA_CUDA_CAPABILITIES=3.0,4.0,5.0 |
1263 | // Use two-level macro expansion for stringification. |
1264 | #define TF_XSTRING(...) #__VA_ARGS__ |
1265 | #define TF_STRING(s) TF_XSTRING(s) |
1266 | string extra_cuda_caps = TF_STRING(TF_EXTRA_CUDA_CAPABILITIES); |
1267 | #undef TF_STRING |
1268 | #undef TF_XSTRING |
1269 | auto extra_capabilities = str_util::Split(extra_cuda_caps, ','); |
1270 | for (const auto& capability : extra_capabilities) { |
1271 | cuda_caps.push_back(CudaVersion(capability)); |
1272 | } |
1273 | #endif |
1274 | return cuda_caps; |
1275 | } |
1276 | |
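| // Tries to enable peer access between every ordered pair of visible GPUs and |
| // returns an error only if peering was reported possible yet none of the |
| // attempts succeeded. |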
1277 | Status EnablePeerAccess(gpu::Platform* platform, |
1278 | const std::vector<CudaGpuId>& visible_gpu_order) { |
1279 | int possible_peer_count = 0; |
1280 | int enabled_peer_count = 0; |
1281 | for (int i = 0; i < visible_gpu_order.size(); ++i) { |
1282 | const CudaGpuId cuda_gpu_i = visible_gpu_order[i]; |
1283 | for (int j = 0; j < visible_gpu_order.size(); ++j) { |
1284 | const CudaGpuId cuda_gpu_j = visible_gpu_order[j]; |
1285 | // We have already validated that ExecutorForDevice() calls return OK. |
1286 | gpu::StreamExecutor* from = |
1287 | GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie(); |
1288 | gpu::StreamExecutor* to = |
1289 | GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie(); |
1290 | |
1291 | if (from->CanEnablePeerAccessTo(to)) { |
1292 | ++possible_peer_count; |
1293 | auto status = from->EnablePeerAccessTo(to); |
1294 | if (!status.ok()) { |
1295 | LOG(WARNING) |
1296 | << "Unable to enable peer access between device ordinals " |
1297 | << cuda_gpu_i << " and " << cuda_gpu_j << ", status: " << status; |
1298 | } else { |
1299 | ++enabled_peer_count; |
1300 | } |
1301 | } |
1302 | } |
1303 | } |
1304 | |
1305 | // Return an error in the extreme failure case where the driver |
1306 | // reported that peering was possible but not a single peering was |
1307 | // successful. This is to catch possible system misconfigurations |
1308 | // or more fundamental issues. |
1309 | if (possible_peer_count > 0 && enabled_peer_count == 0) { |
1310 | return errors::Internal(possible_peer_count, |
1311 | " potential peer access pairs were reported by the " |
1312 | "driver, but no peering could be enabled."); |
1313 | } |
1314 | return Status::OK(); |
1315 | } |
1316 | |
1317 | } // namespace |
1318 | |
1319 | Status BaseGPUDeviceFactory::GetValidDeviceIds( |
1320 | const std::vector<CudaGpuId>& visible_gpu_order, |
1321 | std::vector<CudaGpuId>* ids) { |
1322 | gpu::Platform* gpu_manager = GPUMachineManager(); |
1323 | bool new_gpu_found = false; |
1324 | for (int i = 0; i < visible_gpu_order.size(); ++i) { |
1325 | const CudaGpuId cuda_gpu_id = visible_gpu_order[i]; |
1326 | |
1327 | // Only perform this once per visible cuda gpu id. |
1328 | if (visible_gpu_initialized_[cuda_gpu_id.value()]) { |
1329 | continue; |
1330 | } |
1331 | |
1332 | visible_gpu_initialized_[cuda_gpu_id.value()] = true; |
1333 | new_gpu_found = true; |
1334 | |
1335 | auto executor = GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, cuda_gpu_id); |
1336 | if (!executor.ok()) { |
1337 | return StreamExecutorUtil::ConvertStatus(executor.status()); |
1338 | } |
1339 | |
1340 | auto stream_exec = executor.ValueOrDie(); |
1341 | int64 free_bytes; |
1342 | int64 total_bytes; |
1343 | if (!stream_exec->DeviceMemoryUsage(&free_bytes, &total_bytes)) { |
1344 | // Logs internally on failure. |
1345 | free_bytes = 0; |
1346 | total_bytes = 0; |
1347 | } |
1348 | const auto& description = stream_exec->GetDeviceDescription(); |
1349 | int cc_major; |
1350 | int cc_minor; |
1351 | if (!description.cuda_compute_capability(&cc_major, &cc_minor)) { |
1352 | // Logs internally on failure. |
1353 | cc_major = 0; |
1354 | cc_minor = 0; |
1355 | } |
1356 | LOG(INFO) << "Found device " << i << " with properties: " |
1357 | << "\nname: " << description.name() << " major: " << cc_major |
1358 | << " minor: " << cc_minor |
1359 | << " memoryClockRate(GHz): " << description.clock_rate_ghz() |
1360 | << "\npciBusID: " << description.pci_bus_id() << "\ntotalMemory: " |
1361 | << strings::HumanReadableNumBytes(total_bytes) |
1362 | << " freeMemory: " << strings::HumanReadableNumBytes(free_bytes); |
1363 | } |
1364 | // Check peering and show the matrix if more than one GPU was found. |
1365 | if (new_gpu_found && visible_gpu_order.size() > 1) { |
1366 | // Enable peer access |
1367 | TF_RETURN_IF_ERROR(EnablePeerAccess(gpu_manager, visible_gpu_order)); |
1368 | } |
1369 | |
1370 | auto cuda_supported_capabilities = GetSupportedCudaComputeCapabilities(); |
1371 | if (cuda_supported_capabilities.empty()) { |
1372 | return errors::FailedPrecondition( |
1373 | "No supported cuda capabilities in binary."); |
1374 | } |
1375 | CudaVersion min_supported_capability = *std::min_element( |
1376 | cuda_supported_capabilities.begin(), cuda_supported_capabilities.end()); |
1377 | |
1378 | int min_gpu_core_count = |
1379 | GetMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order); |
1380 | |
1381 | // Filter out devices that don't have the right capability or power. |
1382 | for (int i = 0; i < visible_gpu_order.size(); ++i) { |
1383 | const CudaGpuId visible_gpu_id = visible_gpu_order[i]; |
1384 | auto exec_status = |
1385 | GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_id); |
1386 | if (!exec_status.ok()) { |
1387 | LOG(INFO) << "Ignoring visible gpu device " << visible_gpu_id |
1388 | << " whose executor is in invalid state: " |
1389 | << exec_status.status().ToString(); |
1390 | continue; |
1391 | } |
1392 | gpu::StreamExecutor* se = exec_status.ValueOrDie(); |
1393 | const gpu::DeviceDescription& desc = se->GetDeviceDescription(); |
1394 | CudaVersion device_capability; |
1395 | if (!desc.cuda_compute_capability(&device_capability.major_part, |
1396 | &device_capability.minor_part)) { |
1397 | LOG(INFO) << "Ignoring visible gpu device " |
1398 | << "(" << GetShortDeviceDescription(visible_gpu_id, desc) |
1399 | << ") " |
1400 | << "whose CUDA compute capability is not available."; |
1401 | continue; |
1402 | } |
1403 | // Only GPUs with at least the minimum supported compute capability are |
1404 | // accepted. |
1405 | if (device_capability < min_supported_capability) { |
1406 | LOG(INFO) << "Ignoring visible gpu device " |
1407 | << "(" << GetShortDeviceDescription(visible_gpu_id, desc) |
1408 | << ") " |
1409 | << "with Cuda compute capability " << device_capability |
1410 | << ". The minimum required Cuda capability is " |
1411 | << min_supported_capability << "."; |
1412 | continue; |
1413 | } |
1414 | |
1415 | // Filter out slow GPUs. By default, GPUs with a lower multiprocessor |
1416 | // count than the fastest GPU are filtered out, unless they have 8 or more |
1417 | // multiprocessors. If the TF_MIN_GPU_MULTIPROCESSOR_COUNT environment |
1418 | // variable is set, its value will be used to filter out GPUs. |
1419 | if (desc.core_count() < min_gpu_core_count) { |
1420 | LOG(INFO) << "Ignoring visible gpu device " |
1421 | << "(" << GetShortDeviceDescription(visible_gpu_id, desc) |
1422 | << ") " |
1423 | << "with Cuda multiprocessor count: " << desc.core_count() |
1424 | << ". The minimum required count is " << min_gpu_core_count |
1425 | << ". You can adjust this requirement with the env var " |
1426 | "TF_MIN_GPU_MULTIPROCESSOR_COUNT."; |
1427 | continue; |
1428 | } |
1429 | ids->push_back(visible_gpu_id); |
1430 | } |
1431 | if (!ids->empty()) { |
1432 | std::vector<int> raw_ids(ids->size()); |
1433 | std::transform(ids->begin(), ids->end(), raw_ids.begin(), |
1434 | [](CudaGpuId id) -> int { return id.value(); }); |
1435 | LOG(INFO) << "Adding visible gpu devices: " |
1436 | << str_util::Join(raw_ids, ", "); |
1437 | } |
1438 | |
1439 | return Status::OK(); |
1440 | } |
1441 | |
1442 | } // namespace tensorflow |
1443 | |
1444 | #endif // GOOGLE_CUDA |
1445 | |