1 | /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #include "tensorflow/core/common_runtime/gpu/gpu_util.h" |
17 | |
18 | #include "tensorflow/core/common_runtime/copy_tensor.h" |
19 | #include "tensorflow/core/common_runtime/device.h" |
20 | #include "tensorflow/core/common_runtime/dma_helper.h" |
21 | #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" |
22 | #include "tensorflow/core/common_runtime/gpu/process_state.h" |
23 | #include "tensorflow/core/common_runtime/gpu_device_context.h" |
24 | #include "tensorflow/core/framework/tensor.h" |
25 | #include "tensorflow/core/framework/tensor.pb.h" |
26 | #include "tensorflow/core/framework/tensor_reference.h" |
27 | #include "tensorflow/core/framework/types.h" |
28 | #include "tensorflow/core/lib/core/errors.h" |
29 | #include "tensorflow/core/lib/core/refcount.h" |
30 | #include "tensorflow/core/lib/gtl/array_slice.h" |
31 | #include "tensorflow/core/lib/gtl/stl_util.h" |
32 | #include "tensorflow/core/lib/hash/hash.h" |
33 | #include "tensorflow/core/lib/strings/strcat.h" |
34 | #include "tensorflow/core/lib/strings/stringprintf.h" |
35 | #include "tensorflow/core/platform/logging.h" |
36 | #include "tensorflow/core/platform/stream_executor.h" |
37 | #include "tensorflow/core/platform/tensor_coding.h" |
38 | #include "tensorflow/core/platform/tracing.h" |
39 | #include "tensorflow/core/util/util.h" |
40 | |
41 | // IMPLEMENTATION NOTE: |
42 | // |
// 1. Within this module, we intentionally LOG(FATAL) if any stream
//    involved in a memcpy becomes !stream->ok(), because the TF
//    process today (1/2016) cannot properly recover from such an
//    error.
//
// 2. When a 0-size tensor is copied, we do not schedule a ThenMemcpy,
//    since there are no bytes to move. However, we must still preserve
//    causal ordering by arranging for the copy-done callback to run
//    only after all activities already scheduled on the given stream
//    have finished.
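//
// A minimal sketch of the ordering pattern the copy routines below use
// to satisfy both points (names are illustrative; see the real call
// sites in this file):
//
//   copy_stream->ThenWaitFor(main_stream);   // inputs are produced
//   if (total_bytes > 0) {
//     copy_stream->ThenMemcpy(&dst_mem, src_mem, total_bytes);
//   }
//   // Runs after everything already enqueued on copy_stream, even when
//   // no memcpy was scheduled, preserving ordering for 0-size tensors.
//   event_mgr->ThenExecute(copy_stream, done_callback);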
52 | |
// If this needs to be runtime configurable, consider adding options to
// ConfigProto.
55 | const tensorflow::int64 FLAGS_brain_gpu_util_debug_string_maxlen = 128; |
56 | extern bool FLAGS_brain_gpu_record_mem_types; |
57 | |
58 | using perftools::gputools::DeviceMemoryBase; |
59 | using perftools::gputools::Stream; |
60 | |
61 | namespace tensorflow { |
62 | |
63 | namespace gpu = ::perftools::gputools; |
64 | |
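// Validates the arguments of a GPU copy and, on success, returns the
// device's GpuDeviceInfo and main compute stream through the out
// parameters. A sketch of the typical call site (mirroring the copy
// routines below, which report errors through their done callback):
//
//   const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
//   gpu::Stream* stream = nullptr;
//   Status s = PrepareCopy(device, device_context, src_tensor, &dst_tensor,
//                          &dev_info, &stream);
//   if (!s.ok()) {
//     done(s);
//     return;
//   }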
65 | Status PrepareCopy(Device* device, const DeviceContext* ctx, const Tensor& src, |
66 | const Tensor* dst, |
67 | const DeviceBase::GpuDeviceInfo** dev_info, |
68 | gpu::Stream** stream) { |
69 | if (device == nullptr) { |
70 | return errors::Internal("Unexpected null device." ); |
71 | } |
72 | auto di = device->tensorflow_gpu_device_info(); |
73 | if (di == nullptr) { |
74 | return errors::Internal("Unexpected null device info." ); |
75 | } |
76 | *dev_info = di; |
77 | if (ctx == nullptr) { |
78 | return errors::Internal("Unexpected null device context." ); |
79 | } |
80 | auto gs = static_cast<const GPUDeviceContext*>(ctx)->stream(); |
81 | if (gs == nullptr) { |
82 | return errors::Internal("No gpu stream is available." ); |
83 | } |
84 | *stream = gs; |
85 | if (dst != nullptr) { |
86 | if (src.dtype() != dst->dtype()) { |
87 | return errors::Internal("Can't copy a tensor of " , |
88 | DataTypeString(src.dtype()), " into a tensor of " , |
89 | DataTypeString(dst->dtype())); |
90 | } |
91 | if (src.TotalBytes() != dst->TotalBytes()) { |
92 | return errors::Internal("Can't copy " , src.TotalBytes(), |
93 | " bytes of a tensor into another with " , |
94 | dst->TotalBytes(), " bytes buffer." ); |
95 | } |
96 | if ((src.TotalBytes() > 0) && !src.IsInitialized()) { |
97 | return errors::Internal("Src tensor is not initialized." ); |
98 | } |
99 | if ((dst->TotalBytes() > 0) && !dst->IsInitialized()) { |
100 | return errors::Internal("Dst tensor is not initialized." ); |
101 | } |
102 | } |
103 | if (!DMAHelper::CanUseDMA(&src)) { |
104 | return errors::Internal("GPU copy from non-DMA " , |
105 | DataTypeString(src.dtype()), "tensor" ); |
106 | } |
107 | return Status::OK(); |
108 | } |
109 | |
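// Returns the base address of a tensor's backing buffer, for use as the
// source or destination of a DMA transfer.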
110 | void* GetBase(const Tensor* src) { |
111 | return const_cast<void*>(DMAHelper::base(src)); |
112 | } |
113 | |
114 | void* GetBase(Tensor* dst) { return DMAHelper::base(dst); } |
115 | |
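// Fills *proto with the dtype, shape, and (staged through a pinned CUDA
// host buffer) the contents of tensor. The copy is asynchronous; done runs
// on an EventMgr thread once the device-to-host memcpy has completed.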
116 | /*static*/ |
117 | void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev, |
118 | const DeviceContext* device_context, |
119 | TensorProto* proto, bool is_dead, |
120 | StatusCallback done) { |
121 | VLOG(1) << "SetProtoFromGPU device_context " << device_context; |
122 | const DeviceBase::GpuDeviceInfo* dev_info = nullptr; |
123 | gpu::Stream* send_stream = nullptr; |
124 | Status s = PrepareCopy(dev, device_context, tensor, nullptr, &dev_info, |
125 | &send_stream); |
126 | if (!s.ok()) { |
127 | done(s); |
128 | return; |
129 | } |
130 | |
131 | auto send_device_to_host_stream = |
132 | static_cast<const GPUDeviceContext*>(device_context) |
133 | ->device_to_host_stream(); |
134 | if (send_device_to_host_stream == nullptr) { |
135 | done(errors::Internal("No send gpu copy-out-stream is available." )); |
136 | return; |
137 | } |
138 | // Wait for the sender's main stream to make sure the data are available. |
139 | send_device_to_host_stream->ThenWaitFor(send_stream); |
140 | |
141 | // Tensor values need to be copied from GPU to CPU ram so that |
142 | // we can build the protobuf response for a RecvTensor RPC. |
143 | // "device context" identifies the stream where the _Send op executed. |
144 | proto->set_dtype(tensor.dtype()); |
145 | tensor.shape().AsProto(proto->mutable_tensor_shape()); |
146 | |
147 | // Prepare a proto with the right data buf size, and DMA the data |
148 | // over from the GPU buffer. Note that 0-size tensors do not have a |
149 | // backing buffer. |
150 | Allocator* alloc = nullptr; |
151 | char* buf = nullptr; |
152 | const int64 total_bytes = is_dead ? 0 : tensor.TotalBytes(); |
153 | if (total_bytes > 0) { |
    port::Tracing::ScopedAnnotation annotation("SetProtoFromGPU");
155 | alloc = ProcessState::singleton()->GetCUDAHostAllocator(0); |
156 | buf = alloc->Allocate<char>(total_bytes); |
157 | if (LogMemory::IsEnabled()) { |
      LogMemory::RecordRawAllocation("SetProtoFromGPU",
159 | LogMemory::PROTO_BUFFER_STEP_ID, |
160 | total_bytes, buf, alloc); |
161 | } |
162 | void* src_ptr = GetBase(&tensor); |
163 | DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes); |
164 | send_device_to_host_stream->ThenMemcpy(buf, gpu_src_ptr, total_bytes); |
165 | } |
166 | // Use of tensor may outlive stack scope, so keep a ref. |
167 | TensorReference tensor_ref(tensor); |
168 | dev_info->event_mgr->ThenExecute( |
169 | send_device_to_host_stream, [send_device_to_host_stream, done, proto, buf, |
170 | total_bytes, alloc, tensor_ref]() { |
171 | if (!send_device_to_host_stream->ok()) { |
172 | LOG(FATAL) << "SetProtoFromGPU: GPU Memcpy failed" ; |
173 | } |
174 | tensor_ref.Unref(); |
175 | if (total_bytes > 0) { |
176 | port::CopyFromArray(proto->mutable_tensor_content(), buf, |
177 | total_bytes); |
178 | if (LogMemory::IsEnabled()) { |
            LogMemory::RecordRawDeallocation("SetProtoFromGPU",
180 | LogMemory::PROTO_BUFFER_STEP_ID, |
181 | buf, alloc, false); |
182 | } |
183 | alloc->Deallocate<char>(buf, total_bytes); |
184 | } |
185 | done(Status::OK()); |
186 | }); |
187 | } |
188 | |
189 | // static |
190 | void GPUUtil::DeviceToDeviceCopy(DeviceContext* send_dev_context, |
191 | DeviceContext* recv_dev_context, Device* src, |
192 | Device* dst, |
193 | AllocatorAttributes src_alloc_attr, |
194 | AllocatorAttributes dst_alloc_attr, |
195 | const Tensor* input, Tensor* output, |
196 | StatusCallback done) { |
197 | const DeviceBase::GpuDeviceInfo* dev_info = nullptr; |
198 | gpu::Stream* send_stream = nullptr; |
199 | Status s = PrepareCopy(src, send_dev_context, *input, output, &dev_info, |
200 | &send_stream); |
201 | if (!s.ok()) { |
202 | done(s); |
203 | return; |
204 | } |
205 | auto send_device_to_device_stream = |
206 | static_cast<const GPUDeviceContext*>(send_dev_context) |
207 | ->device_to_device_stream(); |
208 | if (send_device_to_device_stream == nullptr) { |
209 | done(errors::Internal("No send gpu copy-out-stream is available." )); |
210 | return; |
211 | } |
212 | // Wait for the main stream on the sender to make sure the result is |
213 | // available. |
214 | send_device_to_device_stream->ThenWaitFor(send_stream); |
215 | |
216 | const int64 total_bytes = input->TotalBytes(); |
217 | if (total_bytes > 0) { |
218 | void* src_ptr = GetBase(input); |
219 | DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes); |
220 | void* dst_ptr = GetBase(output); |
221 | DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes); |
222 | auto recv_stream = |
223 | static_cast<const GPUDeviceContext*>(recv_dev_context)->stream(); |
224 | if (recv_stream == nullptr) { |
225 | done(errors::Internal("No recv gpu stream is available." )); |
226 | return; |
227 | } |
228 | // Since we want to use the memory from recv_stream in the |
229 | // send_device_to_device_stream, add a dependency to make sure the memory is |
230 | // truly free. |
231 | // TODO(zhengxq): remove this dependency when we switch to a better way |
232 | // to make sure the memory is free. |
233 | send_device_to_device_stream->ThenWaitFor(recv_stream); |
234 | |
235 | VLOG(2) << "src_ptr " << src_ptr << " dst_ptr " << dst_ptr; |
236 | send_device_to_device_stream->ThenMemcpy(&gpu_dst_ptr, gpu_src_ptr, |
237 | total_bytes); |
238 | } |
239 | |
240 | // Use of input may outlive stack scope, so keep a ref. |
241 | TensorReference input_ref(*input); |
242 | dev_info->event_mgr->ThenExecute( |
243 | send_device_to_device_stream, |
244 | [done, send_device_to_device_stream, input_ref]() { |
245 | input_ref.Unref(); |
246 | if (!send_device_to_device_stream->ok()) { |
247 | LOG(FATAL) << "GPU->GPU Memcpy failed" ; |
248 | } |
249 | done(Status::OK()); |
250 | }); |
251 | send_dev_context->MaintainLifetimeOnStream(input, |
252 | send_device_to_device_stream); |
253 | } |
254 | |
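// Registers DeviceToDeviceCopy as the copy function for GPU->GPU transfers
// in the CopyTensor registry.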
255 | static CopyTensor::Registration register_gpu_gpu_copy( |
256 | DEVICE_GPU, DEVICE_GPU, GPUUtil::DeviceToDeviceCopy); |
257 | |
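// Copies gpu_tensor into cpu_tensor asynchronously on the context's
// device-to-host stream; done runs on an EventMgr thread once the copy has
// completed. An illustrative way to block on the result, using the same
// Notification pattern as GPUUtil::Checksum below:
//
//   Notification n;
//   Status copy_status;
//   GPUUtil::CopyGPUTensorToCPU(gpu_device, device_context, &gpu_tensor,
//                               &cpu_tensor, [&copy_status, &n](Status s) {
//                                 copy_status.Update(s);
//                                 n.Notify();
//                               });
//   n.WaitForNotification();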
258 | // static |
259 | void GPUUtil::CopyGPUTensorToCPU(Device* gpu_device, |
260 | const DeviceContext* device_context, |
261 | const Tensor* gpu_tensor, Tensor* cpu_tensor, |
262 | StatusCallback done) { |
263 | VLOG(1) << "CopyGPUTensorToCPU" ; |
264 | const DeviceBase::GpuDeviceInfo* dev_info = nullptr; |
265 | gpu::Stream* send_stream = nullptr; |
266 | Status s = PrepareCopy(gpu_device, device_context, *gpu_tensor, cpu_tensor, |
267 | &dev_info, &send_stream); |
268 | if (!s.ok()) { |
269 | done(s); |
270 | return; |
271 | } |
272 | |
273 | auto send_device_to_host_stream = |
274 | static_cast<const GPUDeviceContext*>(device_context) |
275 | ->device_to_host_stream(); |
276 | if (send_device_to_host_stream == nullptr) { |
277 | done(errors::Internal("No send gpu copy-out-stream is available." )); |
278 | return; |
279 | } |
280 | // Wait for the sender's main stream to make sure the data are available. |
281 | send_device_to_host_stream->ThenWaitFor(send_stream); |
282 | |
283 | const int64 total_bytes = gpu_tensor->TotalBytes(); |
284 | if (total_bytes > 0) { |
285 | void* src_ptr = GetBase(gpu_tensor); |
286 | DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes); |
287 | void* dst_ptr = GetBase(cpu_tensor); |
288 | send_device_to_host_stream->ThenMemcpy(dst_ptr, gpu_src_ptr, total_bytes); |
289 | } |
290 | // Use of the input may outlive stack scope, so keep a ref. |
291 | TensorReference input_ref(*gpu_tensor); |
292 | dev_info->event_mgr->ThenExecute( |
293 | send_device_to_host_stream, |
294 | [send_device_to_host_stream, done, input_ref]() { |
295 | if (!send_device_to_host_stream->ok()) { |
296 | LOG(FATAL) << "GPU->CPU Memcpy failed" ; |
297 | } |
298 | input_ref.Unref(); |
299 | done(Status::OK()); |
300 | }); |
301 | } |
302 | |
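// Copies cpu_tensor into gpu_tensor asynchronously on the context's
// host-to-device stream; done runs on an EventMgr thread once the copy has
// completed.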
303 | /* static */ |
304 | void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor, |
305 | const DeviceContext* device_context, |
306 | Device* gpu_device, Tensor* gpu_tensor, |
307 | StatusCallback done) { |
308 | VLOG(1) << "CopyCPUTensorToGPU" ; |
309 | const DeviceBase::GpuDeviceInfo* dev_info = nullptr; |
310 | gpu::Stream* recv_stream = nullptr; |
311 | Status s = PrepareCopy(gpu_device, device_context, *cpu_tensor, gpu_tensor, |
312 | &dev_info, &recv_stream); |
313 | if (!s.ok()) { |
314 | done(s); |
315 | return; |
316 | } |
317 | |
318 | auto recv_host_to_device_stream = |
319 | static_cast<const GPUDeviceContext*>(device_context) |
320 | ->host_to_device_stream(); |
321 | if (recv_host_to_device_stream == nullptr) { |
322 | done(errors::Internal("No send gpu copy-out-stream is available." )); |
323 | return; |
324 | } |
325 | // Wait for the recv-stream to make sure the buffer is truly available. |
326 | recv_host_to_device_stream->ThenWaitFor(recv_stream); |
327 | |
328 | const int64 total_bytes = cpu_tensor->TotalBytes(); |
329 | // Note that 0-size tensors have no backing buffer. |
330 | if (total_bytes > 0) { |
331 | void* src_ptr = GetBase(cpu_tensor); |
332 | void* dst_ptr = GetBase(gpu_tensor); |
333 | DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes); |
334 | recv_host_to_device_stream->ThenMemcpy(&gpu_dst_ptr, src_ptr, total_bytes); |
335 | } |
336 | // Use of cpu_tensor may outlive stack scope, so keep a ref. |
337 | TensorReference input_ref(*cpu_tensor); |
338 | dev_info->event_mgr->ThenExecute( |
339 | recv_host_to_device_stream, |
340 | [recv_host_to_device_stream, done, input_ref]() { |
341 | input_ref.Unref(); |
342 | if (!recv_host_to_device_stream->ok()) { |
343 | LOG(FATAL) << "CPU->GPU Memcpy failed" ; |
344 | } |
345 | done(Status::OK()); |
346 | }); |
347 | } |
348 | |
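// Blocks the caller until all work previously enqueued on the device's main
// compute stream has completed.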
349 | Status GPUUtil::Sync(Device* gpu_device) { |
350 | VLOG(1) << "GPUUtil::Sync" ; |
351 | auto* dev_info = gpu_device->tensorflow_gpu_device_info(); |
352 | if (!dev_info) { |
353 | return errors::Internal("Failed to find dest device GPUDeviceInfo" ); |
354 | } |
355 | return dev_info->stream->BlockHostUntilDone(); |
356 | } |
357 | |
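// Blocks the caller until all activity on the device (all streams, not just
// the main compute stream) has drained.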
358 | Status GPUUtil::SyncAll(Device* gpu_device) { |
359 | VLOG(1) << "GPUUtil::SyncAll" ; |
360 | auto* dev_info = gpu_device->tensorflow_gpu_device_info(); |
361 | if (!dev_info) { |
362 | return errors::Internal("Failed to find dest device GPUDeviceInfo" ); |
363 | } |
364 | if (!dev_info->stream->parent()->SynchronizeAllActivity() || |
365 | !dev_info->stream->ok()) { |
366 | return errors::Internal("GPU sync failed" ); |
367 | } |
368 | return Status::OK(); |
369 | } |
370 | |
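// Returns a debug string of the form "<base pointer>:<contents>", dumping
// at most FLAGS_brain_gpu_util_debug_string_maxlen bytes of the tensor. If
// the device is a GPU, the bytes are first copied to the host synchronously.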
371 | string GPUUtil::MemoryDebugString(const Device* device, Tensor* tensor) { |
372 | string ret; |
373 | CHECK(tensor); |
374 | const int64 num_bytes = std::min<int64>( |
375 | FLAGS_brain_gpu_util_debug_string_maxlen, tensor->TotalBytes()); |
376 | void* ptr = (num_bytes > 0) ? GetBase(tensor) : nullptr; |
  strings::Appendf(&ret, "%p:", ptr);
378 | if (num_bytes > 0) { |
379 | auto* dev_info = device->tensorflow_gpu_device_info(); |
380 | if (!dev_info) { |
381 | strings::StrAppend( |
382 | &ret, PrintMemory(reinterpret_cast<const char*>(ptr), num_bytes)); |
383 | } else { |
384 | string buf; |
385 | buf.resize(num_bytes); |
386 | DeviceMemoryBase gpu_ptr(ptr, num_bytes); |
387 | auto s = dev_info->stream->parent()->SynchronousMemcpyD2H( |
388 | gpu_ptr, num_bytes, gtl::string_as_array(&buf)); |
389 | strings::StrAppend(&ret, |
390 | PrintMemory(gtl::string_as_array(&buf), num_bytes)); |
391 | } |
392 | } |
393 | return ret; |
394 | } |
395 | |
396 | // TODO(pbar) Checksum is called from places without a valid device context. |
397 | uint64 GPUUtil::Checksum(Device* gpu_device, |
398 | const DeviceContext* device_context, |
399 | const Tensor& tensor) { |
400 | Tensor copy(tensor.dtype(), tensor.shape()); |
401 | Status s; |
402 | Notification n; |
403 | CopyGPUTensorToCPU(gpu_device, device_context, &tensor, ©, |
404 | [&s, &n](Status status) { |
405 | s.Update(status); |
406 | n.Notify(); |
407 | }); |
408 | n.WaitForNotification(); |
409 | CHECK(s.ok()) << s; |
410 | return Checksum(copy); |
411 | } |
412 | |
413 | uint64 GPUUtil::Checksum(const Tensor& tensor) { |
414 | const float* fptr = reinterpret_cast<const float*>(GetBase(&tensor)); |
415 | size_t num_bytes = tensor.TotalBytes(); |
416 | size_t num_floats = num_bytes / sizeof(float); |
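  // Sanity check: interpret the buffer as floats and verify none are NaN.
  // This is only meaningful for float tensors; the hash below covers the
  // raw bytes regardless of dtype.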
417 | for (size_t i = 0; i < num_floats; ++i) { |
418 | CHECK(!std::isnan(fptr[i])) << " i " << i; |
419 | } |
420 | // TODO(tucker): consider using crc32c instead. |
421 | return Hash64(reinterpret_cast<const char*>(GetBase(&tensor)), |
422 | tensor.TotalBytes(), 0); |
423 | } |
424 | |
425 | // static |
426 | void GPUUtil::CopyGPUTensorToSameGPU(Device* gpu_device, |
427 | const DeviceContext* device_context, |
428 | const Tensor* src_gpu_tensor, |
429 | Tensor* dst_gpu_tensor, |
430 | StatusCallback done) { |
431 | VLOG(1) << "CopyGPUTensorToSameGPU" ; |
432 | const DeviceBase::GpuDeviceInfo* dev_info = nullptr; |
433 | gpu::Stream* send_stream = nullptr; |
434 | Status s = PrepareCopy(gpu_device, device_context, *src_gpu_tensor, |
435 | dst_gpu_tensor, &dev_info, &send_stream); |
436 | if (!s.ok()) { |
437 | done(s); |
438 | return; |
439 | } |
440 | |
441 | const int64 total_bytes = src_gpu_tensor->TotalBytes(); |
442 | if (total_bytes > 0) { |
443 | void* src_ptr = GetBase(src_gpu_tensor); |
444 | DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes); |
445 | void* dst_ptr = GetBase(dst_gpu_tensor); |
446 | DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes); |
447 | send_stream->ThenMemcpy(&gpu_dst_ptr, gpu_src_ptr, total_bytes); |
448 | } |
449 | |
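  // Note: done is invoked once the memcpy has been enqueued, not when it
  // completes; the copy remains ordered with respect to later work enqueued
  // on send_stream.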
450 | done(Status::OK()); |
451 | } |
452 | |
453 | } // namespace tensorflow |
454 | |