1 | /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #include "tensorflow/core/common_runtime/gpu/gpu_util.h" |
17 | |
18 | #include "tensorflow/core/common_runtime/copy_tensor.h" |
19 | #include "tensorflow/core/common_runtime/device.h" |
20 | #include "tensorflow/core/common_runtime/dma_helper.h" |
21 | #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" |
22 | #include "tensorflow/core/common_runtime/gpu/process_state.h" |
23 | #include "tensorflow/core/common_runtime/gpu_device_context.h" |
24 | #include "tensorflow/core/framework/tensor.h" |
25 | #include "tensorflow/core/framework/tensor.pb.h" |
26 | #include "tensorflow/core/framework/tensor_reference.h" |
27 | #include "tensorflow/core/framework/types.h" |
28 | #include "tensorflow/core/lib/core/errors.h" |
29 | #include "tensorflow/core/lib/core/refcount.h" |
30 | #include "tensorflow/core/lib/gtl/array_slice.h" |
31 | #include "tensorflow/core/lib/gtl/stl_util.h" |
32 | #include "tensorflow/core/lib/hash/hash.h" |
33 | #include "tensorflow/core/lib/strings/strcat.h" |
34 | #include "tensorflow/core/lib/strings/stringprintf.h" |
35 | #include "tensorflow/core/platform/logging.h" |
36 | #include "tensorflow/core/platform/stream_executor.h" |
37 | #include "tensorflow/core/platform/tensor_coding.h" |
38 | #include "tensorflow/core/platform/tracing.h" |
39 | #include "tensorflow/core/util/util.h" |
40 | |
41 | // IMPLEMENTATION NOTE: |
42 | // |
// 1. Within this module, we intentionally LOG(FATAL) if any stream
//    involved in a memcpy becomes !stream->ok(), because the TF
//    process today (1/2016) cannot properly recover from such an
//    error.
//
// 2. When a 0-size tensor is copied, we do not schedule a ThenMemcpy,
//    since there are no bytes to move. However, we must still preserve
//    causal ordering by arranging for the copy-done callback to run
//    only after all activities already scheduled on the given stream
//    have finished.
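//
// A minimal sketch of the ordering pattern the copy routines below use
// to satisfy both points (names are illustrative; see the real call
// sites in this file):
//
//   copy_stream->ThenWaitFor(main_stream);   // inputs are produced
//   if (total_bytes > 0) {
//     copy_stream->ThenMemcpy(&dst_mem, src_mem, total_bytes);
//   }
//   // Runs after everything already enqueued on copy_stream, even when
//   // no memcpy was scheduled, preserving ordering for 0-size tensors.
//   event_mgr->ThenExecute(copy_stream, done_callback);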
52 | |
// If this needs to be runtime configurable, consider adding options to
// ConfigProto.
55 | const tensorflow::int64 FLAGS_brain_gpu_util_debug_string_maxlen = 128; |
56 | extern bool FLAGS_brain_gpu_record_mem_types; |
57 | |
58 | using perftools::gputools::DeviceMemoryBase; |
59 | using perftools::gputools::Stream; |
60 | |
61 | namespace tensorflow { |
62 | |
63 | namespace gpu = ::perftools::gputools; |
64 | |
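// Validates the arguments of a GPU copy and, on success, returns the
// device's GpuDeviceInfo and main compute stream through the out
// parameters. A sketch of the typical call site (mirroring the copy
// routines below, which report errors through their done callback):
//
//   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
//   gpu::Stream* stream = nullptr;
//   Status s = PrepareCopy(device, device_context, src_tensor, &dst_tensor,
//                          &dev_info, &stream);
//   if (!s.ok()) {
//     done(s);
//     return;
//   }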
65 | Status PrepareCopy(Device* device, const DeviceContext* ctx, const Tensor& src, |
66 | const Tensor* dst, |
67 | const DeviceBase::GpuDeviceInfo** dev_info, |
68 | gpu::Stream** stream) { |
69 | if (device == nullptr) { |
70 | return errors::Internal("Unexpected null device." ); |
71 | } |
72 | auto di = device->tensorflow_gpu_device_info(); |
73 | if (di == nullptr) { |
74 | return errors::Internal("Unexpected null device info." ); |
75 | } |
76 | *dev_info = di; |
77 | if (ctx == nullptr) { |
78 | return errors::Internal("Unexpected null device context." ); |
79 | } |
80 | auto gs = static_cast<const GPUDeviceContext*>(ctx)->stream(); |
81 | if (gs == nullptr) { |
82 | return errors::Internal("No gpu stream is available." ); |
83 | } |
84 | *stream = gs; |
85 | if (dst != nullptr) { |
86 | if (src.dtype() != dst->dtype()) { |
87 | return errors::Internal("Can't copy a tensor of " , |
88 | DataTypeString(src.dtype()), " into a tensor of " , |
89 | DataTypeString(dst->dtype())); |
90 | } |
91 | if (src.TotalBytes() != dst->TotalBytes()) { |
92 | return errors::Internal("Can't copy " , src.TotalBytes(), |
93 | " bytes of a tensor into another with " , |
94 | dst->TotalBytes(), " bytes buffer." ); |
95 | } |
96 | if ((src.TotalBytes() > 0) && !src.IsInitialized()) { |
97 | return errors::Internal("Src tensor is not initialized." ); |
98 | } |
99 | if ((dst->TotalBytes() > 0) && !dst->IsInitialized()) { |
100 | return errors::Internal("Dst tensor is not initialized." ); |
101 | } |
102 | } |
103 | if (!DMAHelper::CanUseDMA(&src)) { |
104 | return errors::Internal("GPU copy from non-DMA " , |
105 | DataTypeString(src.dtype()), "tensor" ); |
106 | } |
107 | return Status::OK(); |
108 | } |
109 | |
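// Returns the base address of a tensor's backing buffer, for use as the
// source or destination of a DMA transfer.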
110 | void* GetBase(const Tensor* src) { |
111 | return const_cast<void*>(DMAHelper::base(src)); |
112 | } |
113 | |
114 | void* GetBase(Tensor* dst) { return DMAHelper::base(dst); } |
115 | |
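// Fills *proto with the dtype, shape, and (staged through a pinned CUDA
// host buffer) the contents of tensor. The copy is asynchronous; done runs
// on an EventMgr thread once the device-to-host memcpy has completed.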
116 | /*static*/ |
117 | void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev, |
118 | const DeviceContext* device_context, |
119 | TensorProto* proto, bool is_dead, |
120 | StatusCallback done) { |
121 | VLOG(1) << "SetProtoFromGPU device_context " << device_context; |
122 | const DeviceBase::GpuDeviceInfo* dev_info = nullptr; |
123 | gpu::Stream* send_stream = nullptr; |
124 | Status s = PrepareCopy(dev, device_context, tensor, nullptr, &dev_info, |
125 | &send_stream); |
126 | if (!s.ok()) { |
127 | done(s); |
128 | return; |
129 | } |
130 | |
131 | auto send_device_to_host_stream = |
132 | static_cast<const GPUDeviceContext*>(device_context) |
133 | ->device_to_host_stream(); |
134 | if (send_device_to_host_stream == nullptr) { |
135 | done(errors::Internal("No send gpu copy-out-stream is available." )); |
136 | return; |
137 | } |
138 | // Wait for the sender's main stream to make sure the data are available. |
139 | send_device_to_host_stream->ThenWaitFor(send_stream); |
140 | |
141 | // Tensor values need to be copied from GPU to CPU ram so that |
142 | // we can build the protobuf response for a RecvTensor RPC. |
143 | // "device context" identifies the stream where the _Send op executed. |
144 | proto->set_dtype(tensor.dtype()); |
145 | tensor.shape().AsProto(proto->mutable_tensor_shape()); |
146 | |
147 | // Prepare a proto with the right data buf size, and DMA the data |
148 | // over from the GPU buffer. Note that 0-size tensors do not have a |
149 | // backing buffer. |
150 | Allocator* alloc = nullptr; |
151 | char* buf = nullptr; |
152 | const int64 total_bytes = is_dead ? 0 : tensor.TotalBytes(); |
153 | if (total_bytes > 0) { |
    port::Tracing::ScopedAnnotation annotation("SetProtoFromGPU");
155 | alloc = ProcessState::singleton()->GetCUDAHostAllocator(0); |
156 | buf = alloc->Allocate<char>(total_bytes); |
157 | if (LogMemory::IsEnabled()) { |
      LogMemory::RecordRawAllocation("SetProtoFromGPU",
159 | LogMemory::PROTO_BUFFER_STEP_ID, |
160 | total_bytes, buf, alloc); |
161 | } |
162 | void* src_ptr = GetBase(&tensor); |
163 | DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes); |
164 | send_device_to_host_stream->ThenMemcpy(buf, gpu_src_ptr, total_bytes); |
165 | } |
166 | // Use of tensor may outlive stack scope, so keep a ref. |
167 | TensorReference tensor_ref(tensor); |
168 | dev_info->event_mgr->ThenExecute( |
169 | send_device_to_host_stream, [send_device_to_host_stream, done, proto, buf, |
170 | total_bytes, alloc, tensor_ref]() { |
171 | if (!send_device_to_host_stream->ok()) { |
172 | LOG(FATAL) << "SetProtoFromGPU: GPU Memcpy failed" ; |
173 | } |
174 | tensor_ref.Unref(); |
175 | if (total_bytes > 0) { |
176 | port::CopyFromArray(proto->mutable_tensor_content(), buf, |
177 | total_bytes); |
178 | if (LogMemory::IsEnabled()) { |
            LogMemory::RecordRawDeallocation("SetProtoFromGPU",
180 | LogMemory::PROTO_BUFFER_STEP_ID, |
181 | buf, alloc, false); |
182 | } |
183 | alloc->Deallocate<char>(buf, total_bytes); |
184 | } |
185 | done(Status::OK()); |
186 | }); |
187 | } |
188 | |
189 | // static |
190 | void GPUUtil::DeviceToDeviceCopy(DeviceContext* send_dev_context, |
191 | DeviceContext* recv_dev_context, Device* src, |
192 | Device* dst, |
193 | AllocatorAttributes src_alloc_attr, |
194 | AllocatorAttributes dst_alloc_attr, |
195 | const Tensor* input, Tensor* output, |
196 | StatusCallback done) { |
197 | const DeviceBase::GpuDeviceInfo* dev_info = nullptr; |
198 | gpu::Stream* send_stream = nullptr; |
199 | Status s = PrepareCopy(src, send_dev_context, *input, output, &dev_info, |
200 | &send_stream); |
201 | if (!s.ok()) { |
202 | done(s); |
203 | return; |
204 | } |
205 | auto send_device_to_device_stream = |
206 | static_cast<const GPUDeviceContext*>(send_dev_context) |
207 | ->device_to_device_stream(); |
208 | if (send_device_to_device_stream == nullptr) { |
209 | done(errors::Internal("No send gpu copy-out-stream is available." )); |
210 | return; |
211 | } |
212 | // Wait for the main stream on the sender to make sure the result is |
213 | // available. |
214 | send_device_to_device_stream->ThenWaitFor(send_stream); |
215 | |
216 | const int64 total_bytes = input->TotalBytes(); |
217 | if (total_bytes > 0) { |
218 | void* src_ptr = GetBase(input); |
219 | DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes); |
220 | void* dst_ptr = GetBase(output); |
221 | DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes); |
222 | auto recv_stream = |
223 | static_cast<const GPUDeviceContext*>(recv_dev_context)->stream(); |
224 | if (recv_stream == nullptr) { |
225 | done(errors::Internal("No recv gpu stream is available." )); |
226 | return; |
227 | } |
228 | // Since we want to use the memory from recv_stream in the |
229 | // send_device_to_device_stream, add a dependency to make sure the memory is |
230 | // truly free. |
231 | // TODO(zhengxq): remove this dependency when we switch to a better way |
232 | // to make sure the memory is free. |
233 | send_device_to_device_stream->ThenWaitFor(recv_stream); |
234 | |
235 | VLOG(2) << "src_ptr " << src_ptr << " dst_ptr " << dst_ptr; |
236 | send_device_to_device_stream->ThenMemcpy(&gpu_dst_ptr, gpu_src_ptr, |
237 | total_bytes); |
238 | } |
239 | |
240 | // Use of input may outlive stack scope, so keep a ref. |
241 | TensorReference input_ref(*input); |
242 | dev_info->event_mgr->ThenExecute( |
243 | send_device_to_device_stream, |
244 | [done, send_device_to_device_stream, input_ref]() { |
245 | input_ref.Unref(); |
246 | if (!send_device_to_device_stream->ok()) { |
247 | LOG(FATAL) << "GPU->GPU Memcpy failed" ; |
248 | } |
249 | done(Status::OK()); |
250 | }); |
251 | send_dev_context->MaintainLifetimeOnStream(input, |
252 | send_device_to_device_stream); |
253 | } |
254 | |
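// Registers DeviceToDeviceCopy as the copy function for GPU->GPU transfers
// in the CopyTensor registry.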
255 | static CopyTensor::Registration register_gpu_gpu_copy( |
256 | DEVICE_GPU, DEVICE_GPU, GPUUtil::DeviceToDeviceCopy); |
257 | |
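// Copies gpu_tensor into cpu_tensor asynchronously on the context's
// device-to-host stream; done runs on an EventMgr thread once the copy has
// completed. An illustrative way to block on the result, using the same
// Notification pattern as GPUUtil::Checksum below:
//
//   Notification n;
//   Status copy_status;
//   GPUUtil::CopyGPUTensorToCPU(gpu_device, device_context, &gpu_tensor,
//                               &cpu_tensor, [&copy_status, &n](Status s) {
//                                 copy_status.Update(s);
//                                 n.Notify();
//                               });
//   n.WaitForNotification();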
258 | // static |
259 | void GPUUtil::CopyGPUTensorToCPU(Device* gpu_device, |
260 | const DeviceContext* device_context, |
261 | const Tensor* gpu_tensor, Tensor* cpu_tensor, |
262 | StatusCallback done) { |
263 | VLOG(1) << "CopyGPUTensorToCPU" ; |
264 | const DeviceBase::GpuDeviceInfo* dev_info = nullptr; |
265 | gpu::Stream* send_stream = nullptr; |
266 | Status s = PrepareCopy(gpu_device, device_context, *gpu_tensor, cpu_tensor, |
267 | &dev_info, &send_stream); |
268 | if (!s.ok()) { |
269 | done(s); |
270 | return; |
271 | } |
272 | |
273 | auto send_device_to_host_stream = |
274 | static_cast<const GPUDeviceContext*>(device_context) |
275 | ->device_to_host_stream(); |
276 | if (send_device_to_host_stream == nullptr) { |
277 | done(errors::Internal("No send gpu copy-out-stream is available." )); |
278 | return; |
279 | } |
280 | // Wait for the sender's main stream to make sure the data are available. |
281 | send_device_to_host_stream->ThenWaitFor(send_stream); |
282 | |
283 | const int64 total_bytes = gpu_tensor->TotalBytes(); |
284 | if (total_bytes > 0) { |
285 | void* src_ptr = GetBase(gpu_tensor); |
286 | DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes); |
287 | void* dst_ptr = GetBase(cpu_tensor); |
288 | send_device_to_host_stream->ThenMemcpy(dst_ptr, gpu_src_ptr, total_bytes); |
289 | } |
290 | // Use of the input may outlive stack scope, so keep a ref. |
291 | TensorReference input_ref(*gpu_tensor); |
292 | dev_info->event_mgr->ThenExecute( |
293 | send_device_to_host_stream, |
294 | [send_device_to_host_stream, done, input_ref]() { |
295 | if (!send_device_to_host_stream->ok()) { |
296 | LOG(FATAL) << "GPU->CPU Memcpy failed" ; |
297 | } |
298 | input_ref.Unref(); |
299 | done(Status::OK()); |
300 | }); |
301 | } |
302 | |
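// Copies cpu_tensor into gpu_tensor asynchronously on the context's
// host-to-device stream; done runs on an EventMgr thread once the copy has
// completed.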
303 | /* static */ |
304 | void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor, |
305 | const DeviceContext* device_context, |
306 | Device* gpu_device, Tensor* gpu_tensor, |
307 | StatusCallback done) { |
308 | VLOG(1) << "CopyCPUTensorToGPU" ; |
309 | const DeviceBase::GpuDeviceInfo* dev_info = nullptr; |
310 | gpu::Stream* recv_stream = nullptr; |
311 | Status s = PrepareCopy(gpu_device, device_context, *cpu_tensor, gpu_tensor, |
312 | &dev_info, &recv_stream); |
313 | if (!s.ok()) { |
314 | done(s); |
315 | return; |
316 | } |
317 | |
318 | auto recv_host_to_device_stream = |
319 | static_cast<const GPUDeviceContext*>(device_context) |
320 | ->host_to_device_stream(); |
321 | if (recv_host_to_device_stream == nullptr) { |
322 | done(errors::Internal("No send gpu copy-out-stream is available." )); |
323 | return; |
324 | } |
325 | // Wait for the recv-stream to make sure the buffer is truly available. |
326 | recv_host_to_device_stream->ThenWaitFor(recv_stream); |
327 | |
328 | const int64 total_bytes = cpu_tensor->TotalBytes(); |
329 | // Note that 0-size tensors have no backing buffer. |
330 | if (total_bytes > 0) { |
331 | void* src_ptr = GetBase(cpu_tensor); |
332 | void* dst_ptr = GetBase(gpu_tensor); |
333 | DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes); |
334 | recv_host_to_device_stream->ThenMemcpy(&gpu_dst_ptr, src_ptr, total_bytes); |
335 | } |
336 | // Use of cpu_tensor may outlive stack scope, so keep a ref. |
337 | TensorReference input_ref(*cpu_tensor); |
338 | dev_info->event_mgr->ThenExecute( |
339 | recv_host_to_device_stream, |
340 | [recv_host_to_device_stream, done, input_ref]() { |
341 | input_ref.Unref(); |
342 | if (!recv_host_to_device_stream->ok()) { |
343 | LOG(FATAL) << "CPU->GPU Memcpy failed" ; |
344 | } |
345 | done(Status::OK()); |
346 | }); |
347 | } |
348 | |
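// Blocks the caller until all work previously enqueued on the device's main
// compute stream has completed.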
349 | Status GPUUtil::Sync(Device* gpu_device) { |
350 | VLOG(1) << "GPUUtil::Sync" ; |
351 | auto* dev_info = gpu_device->tensorflow_gpu_device_info(); |
352 | if (!dev_info) { |
353 | return errors::Internal("Failed to find dest device GPUDeviceInfo" ); |
354 | } |
355 | return dev_info->stream->BlockHostUntilDone(); |
356 | } |
357 | |
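// Blocks the caller until all activity on the device (all streams, not just
// the main compute stream) has drained.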
358 | Status GPUUtil::SyncAll(Device* gpu_device) { |
359 | VLOG(1) << "GPUUtil::SyncAll" ; |
360 | auto* dev_info = gpu_device->tensorflow_gpu_device_info(); |
361 | if (!dev_info) { |
362 | return errors::Internal("Failed to find dest device GPUDeviceInfo" ); |
363 | } |
364 | if (!dev_info->stream->parent()->SynchronizeAllActivity() || |
365 | !dev_info->stream->ok()) { |
366 | return errors::Internal("GPU sync failed" ); |
367 | } |
368 | return Status::OK(); |
369 | } |
370 | |
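// Returns a debug string of the form "<base pointer>:<contents>", dumping
// at most FLAGS_brain_gpu_util_debug_string_maxlen bytes of the tensor. If
// the device is a GPU, the bytes are first copied to the host synchronously.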
371 | string GPUUtil::MemoryDebugString(const Device* device, Tensor* tensor) { |
372 | string ret; |
373 | CHECK(tensor); |
374 | const int64 num_bytes = std::min<int64>( |
375 | FLAGS_brain_gpu_util_debug_string_maxlen, tensor->TotalBytes()); |
376 | void* ptr = (num_bytes > 0) ? GetBase(tensor) : nullptr; |
  strings::Appendf(&ret, "%p:", ptr);
378 | if (num_bytes > 0) { |
379 | auto* dev_info = device->tensorflow_gpu_device_info(); |
380 | if (!dev_info) { |
381 | strings::StrAppend( |
382 | &ret, PrintMemory(reinterpret_cast<const char*>(ptr), num_bytes)); |
383 | } else { |
384 | string buf; |
385 | buf.resize(num_bytes); |
386 | DeviceMemoryBase gpu_ptr(ptr, num_bytes); |
387 | auto s = dev_info->stream->parent()->SynchronousMemcpyD2H( |
388 | gpu_ptr, num_bytes, gtl::string_as_array(&buf)); |
389 | strings::StrAppend(&ret, |
390 | PrintMemory(gtl::string_as_array(&buf), num_bytes)); |
391 | } |
392 | } |
393 | return ret; |
394 | } |
395 | |
396 | // TODO(pbar) Checksum is called from places without a valid device context. |
397 | uint64 GPUUtil::Checksum(Device* gpu_device, |
398 | const DeviceContext* device_context, |
399 | const Tensor& tensor) { |
400 | Tensor copy(tensor.dtype(), tensor.shape()); |
401 | Status s; |
402 | Notification n; |
403 | CopyGPUTensorToCPU(gpu_device, device_context, &tensor, ©, |
404 | [&s, &n](Status status) { |
405 | s.Update(status); |
406 | n.Notify(); |
407 | }); |
408 | n.WaitForNotification(); |
409 | CHECK(s.ok()) << s; |
410 | return Checksum(copy); |
411 | } |
412 | |
413 | uint64 GPUUtil::Checksum(const Tensor& tensor) { |
414 | const float* fptr = reinterpret_cast<const float*>(GetBase(&tensor)); |
415 | size_t num_bytes = tensor.TotalBytes(); |
416 | size_t num_floats = num_bytes / sizeof(float); |
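  // Sanity check: interpret the buffer as floats and verify none are NaN.
  // This is only meaningful for float tensors; the hash below covers the
  // raw bytes regardless of dtype.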
417 | for (size_t i = 0; i < num_floats; ++i) { |
418 | CHECK(!std::isnan(fptr[i])) << " i " << i; |
419 | } |
420 | // TODO(tucker): consider using crc32c instead. |
421 | return Hash64(reinterpret_cast<const char*>(GetBase(&tensor)), |
422 | tensor.TotalBytes(), 0); |
423 | } |
424 | |
425 | // static |
426 | void GPUUtil::CopyGPUTensorToSameGPU(Device* gpu_device, |
427 | const DeviceContext* device_context, |
428 | const Tensor* src_gpu_tensor, |
429 | Tensor* dst_gpu_tensor, |
430 | StatusCallback done) { |
431 | VLOG(1) << "CopyGPUTensorToSameGPU" ; |
432 | const DeviceBase::GpuDeviceInfo* dev_info = nullptr; |
433 | gpu::Stream* send_stream = nullptr; |
434 | Status s = PrepareCopy(gpu_device, device_context, *src_gpu_tensor, |
435 | dst_gpu_tensor, &dev_info, &send_stream); |
436 | if (!s.ok()) { |
437 | done(s); |
438 | return; |
439 | } |
440 | |
441 | const int64 total_bytes = src_gpu_tensor->TotalBytes(); |
442 | if (total_bytes > 0) { |
443 | void* src_ptr = GetBase(src_gpu_tensor); |
444 | DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes); |
445 | void* dst_ptr = GetBase(dst_gpu_tensor); |
446 | DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes); |
447 | send_stream->ThenMemcpy(&gpu_dst_ptr, gpu_src_ptr, total_bytes); |
448 | } |
449 | |
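  // Note: done is invoked once the memcpy has been enqueued, not when it
  // completes; the copy remains ordered with respect to later work enqueued
  // on send_stream.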
450 | done(Status::OK()); |
451 | } |
452 | |
453 | } // namespace tensorflow |
454 | |