/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_
#define TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_

#include <algorithm>
#include <array>
#include <cstdint>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "tensorflow/core/common_runtime/allocator_retry.h"
#include "tensorflow/core/common_runtime/visitable_allocator.h"
#include "tensorflow/core/lib/gtl/stl_util.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/protobuf/config.pb.h"

namespace tensorflow {

// A memory allocator that implements a 'best-fit with coalescing'
// algorithm.  This is essentially a very simple version of Doug Lea's
// malloc (dlmalloc).
//
// The goal of this allocator is to support defragmentation via
// coalescing.  One assumption we make is that the process using this
// allocator owns pretty much all of the memory, and that nearly
// all requests to allocate memory go through this interface.
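//
// A minimal usage sketch (illustrative only: `MySubAllocator` stands in
// for a concrete SubAllocator implementation, and the sizes here are
// arbitrary):
//
//   BFCAllocator allocator(new MySubAllocator,  // ownership transferred
//                          1 << 30,             // 1 GiB memory limit
//                          /*allow_growth=*/true, "example_bfc");
//   void* p = allocator.AllocateRaw(/*alignment=*/256, /*num_bytes=*/4096);
//   // ... use p ...
//   allocator.DeallocateRaw(p);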
class BFCAllocator : public VisitableAllocator {
 public:
  // Takes ownership of sub_allocator.
  BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
               bool allow_growth, const string& name);
  ~BFCAllocator() override;

  string Name() override { return name_; }
  void* AllocateRaw(size_t alignment, size_t num_bytes) override;
  void* AllocateRaw(size_t alignment, size_t num_bytes,
                    const AllocationAttributes& allocation_attr) override;
  void DeallocateRaw(void* ptr) override;

  void AddAllocVisitor(Visitor visitor) override;

  // Does nothing, because memory is never freed back to the underlying
  // sub-allocator.
  void AddFreeVisitor(Visitor visitor) override {}

  bool TracksAllocationSizes() override;

  size_t RequestedSize(const void* ptr) override;

  size_t AllocatedSize(const void* ptr) override;

  int64 AllocationId(const void* ptr) override;

  void GetStats(AllocatorStats* stats) override;

  void ClearStats() override;

 private:
  struct Bin;

  void* AllocateRawInternal(size_t alignment, size_t num_bytes,
                            bool dump_log_on_failure);
  void DeallocateRawInternal(void* ptr);

  // A ChunkHandle is an index into the chunks_ vector in BFCAllocator;
  // kInvalidChunkHandle means an invalid chunk.
  typedef size_t ChunkHandle;
  static const ChunkHandle kInvalidChunkHandle = SIZE_MAX;
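  // Chunks are referred to by handle (index into chunks_) rather than by
  // raw pointer, so references remain valid when the chunks_ vector grows
  // and reallocates its storage.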

  typedef int BinNum;
  static const int kInvalidBinNum = -1;
  static const int kNumBins = 21;

  // Chunks point to memory.  Their prev/next pointers form a
  // doubly-linked list of addresses sorted by base address that
  // must be contiguous.  Chunks record whether they are in use or
  // free, and which bin (if any) they belong to.
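  //
  // For example (illustrative sizes), a free 1024-byte chunk split to
  // satisfy a 256-byte request becomes two contiguous chunks linked by
  // prev/next:
  //
  //   [A: ptr=base, size=256, in use] <-> [B: ptr=base+256, size=768, free]
  //
  // When A is later freed, it can be coalesced with B back into a single
  // 1024-byte free chunk.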
  struct Chunk {
    size_t size = 0;  // Full size of buffer.

    // We sometimes give chunks that are larger than needed to reduce
    // fragmentation.  requested_size keeps track of what the client
    // actually wanted so we can understand whether our splitting
    // strategy is efficient.
    size_t requested_size = 0;

    // allocation_id is set to -1 when the chunk is not in use. It is
    // assigned a value greater than zero before the chunk is returned from
    // AllocateRaw, and this value is unique among values assigned by
    // the parent allocator.
    int64 allocation_id = -1;
    void* ptr = nullptr;  // pointer to granted subbuffer.

    // If not kInvalidChunkHandle, the memory referred to by 'prev' directly
    // precedes the memory used by this chunk, i.e. it starts at
    // 'ptr - prev->size'.
    ChunkHandle prev = kInvalidChunkHandle;

    // If not kInvalidChunkHandle, the memory referred to by 'next' directly
    // follows the memory used by this chunk, i.e. it starts at
    // 'ptr + size'.
    ChunkHandle next = kInvalidChunkHandle;

    // What bin are we in?
    BinNum bin_num = kInvalidBinNum;

    bool in_use() const { return allocation_id != -1; }

    string DebugString(BFCAllocator* a,
                       bool recurse) NO_THREAD_SAFETY_ANALYSIS {
      string dbg;
      strings::StrAppend(
          &dbg, "  Size: ", strings::HumanReadableNumBytes(size),
          " | Requested Size: ",
          strings::HumanReadableNumBytes(requested_size),
          " | in_use: ", in_use());
      if (recurse && prev != BFCAllocator::kInvalidChunkHandle) {
        Chunk* p = a->ChunkFromHandle(prev);
        strings::StrAppend(&dbg, ", prev: ", p->DebugString(a, false));
      }
      if (recurse && next != BFCAllocator::kInvalidChunkHandle) {
        Chunk* n = a->ChunkFromHandle(next);
        strings::StrAppend(&dbg, ", next: ", n->DebugString(a, false));
      }
      return dbg;
    }
  };

  // A Bin is a collection of similar-sized free chunks.
  struct Bin {
    // All chunks in this bin have >= bin_size memory.
    size_t bin_size = 0;

    struct ChunkComparator {
      explicit ChunkComparator(BFCAllocator* allocator)
          : allocator_(allocator) {}
      // Sort first by size and then use pointer address as a tie breaker.
      bool operator()(const ChunkHandle ha,
                      const ChunkHandle hb) const NO_THREAD_SAFETY_ANALYSIS {
        const Chunk* a = allocator_->ChunkFromHandle(ha);
        const Chunk* b = allocator_->ChunkFromHandle(hb);
        if (a->size != b->size) {
          return a->size < b->size;
        }
        return a->ptr < b->ptr;
      }

     private:
      BFCAllocator* allocator_;  // The parent allocator.
    };

    typedef std::set<ChunkHandle, ChunkComparator> FreeChunkSet;
    // List of free chunks within the bin, sorted by chunk size.  Chunks
    // are referenced by handle; the bin does not own them.
    FreeChunkSet free_chunks;
    Bin(BFCAllocator* allocator, size_t bs)
        : bin_size(bs), free_chunks(ChunkComparator(allocator)) {}
  };

  static const size_t kMinAllocationBits = 8;
  static const size_t kMinAllocationSize = 1 << kMinAllocationBits;
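
  // With kMinAllocationSize == 256, bin i holds free chunks of at least
  // (256 << i) bytes: bin 0 covers sizes in [256, 512), bin 1 covers
  // [512, 1024), and so on, up to the catch-all bin 20 for chunks of
  // 256 MiB and larger (see BinNumForSize below).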

  // AllocationRegion maps pointers to ChunkHandles for a single
  // contiguous memory region.
  //
  // This class is thread-compatible.
  class AllocationRegion {
   public:
    AllocationRegion(void* ptr, size_t memory_size)
        : ptr_(ptr),
          memory_size_(memory_size),
          end_ptr_(
              static_cast<void*>(static_cast<char*>(ptr_) + memory_size_)) {
      DCHECK_EQ(0, memory_size % kMinAllocationSize);
      const size_t n_handles =
          (memory_size + kMinAllocationSize - 1) / kMinAllocationSize;
      handles_ = new ChunkHandle[n_handles];
      for (size_t i = 0; i < n_handles; i++) {
        handles_[i] = kInvalidChunkHandle;
      }
    }

    AllocationRegion() {}

    ~AllocationRegion() { delete[] handles_; }

    AllocationRegion(AllocationRegion&& other) { Swap(other); }

    AllocationRegion& operator=(AllocationRegion&& other) {
      Swap(other);
      return *this;
    }
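
    // AllocationRegion is move-only (see TF_DISALLOW_COPY_AND_ASSIGN
    // below) because it owns handles_ through a raw pointer; a move swaps
    // state with a default-constructed region, which is sufficient for
    // storage in RegionManager's std::vector.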

    void* ptr() const { return ptr_; }
    void* end_ptr() const { return end_ptr_; }
    size_t memory_size() const { return memory_size_; }
    ChunkHandle get_handle(const void* p) const {
      return handles_[IndexFor(p)];
    }
    void set_handle(const void* p, ChunkHandle h) { handles_[IndexFor(p)] = h; }
    void erase(const void* p) { set_handle(p, kInvalidChunkHandle); }

   private:
    void Swap(AllocationRegion& other) {
      std::swap(ptr_, other.ptr_);
      std::swap(memory_size_, other.memory_size_);
      std::swap(end_ptr_, other.end_ptr_);
      std::swap(handles_, other.handles_);
    }

    int IndexFor(const void* p) const {
      std::uintptr_t p_int = reinterpret_cast<std::uintptr_t>(p);
      std::uintptr_t base_int = reinterpret_cast<std::uintptr_t>(ptr_);
      DCHECK_GE(p_int, base_int);
      DCHECK_LT(p_int, base_int + memory_size_);
      return static_cast<int>(((p_int - base_int) >> kMinAllocationBits));
    }
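
    // Worked example: with kMinAllocationSize == 256 (kMinAllocationBits
    // == 8), a pointer 1000 bytes past the region base maps to index
    // 1000 >> 8 == 3, and a 1 MiB region needs (1 << 20) / 256 == 4096
    // handle slots.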

    // Metadata about the allocation region.
    void* ptr_ = nullptr;
    size_t memory_size_ = 0;
    void* end_ptr_ = nullptr;

    // Array of size "memory_size / kMinAllocationSize".  It is indexed by
    // (p - base) / kMinAllocationSize and contains the ChunkHandle for the
    // memory allocation represented by "p".
    ChunkHandle* handles_ = nullptr;

    TF_DISALLOW_COPY_AND_ASSIGN(AllocationRegion);
  };

  // RegionManager aggregates one or more "AllocationRegions" and provides
  // a layer of indirection from pointers to the underlying ChunkHandle,
  // allowing allocation across multiple discontiguous memory regions.
  //
  // This class is thread-compatible.
  class RegionManager {
   public:
    RegionManager() {}
    ~RegionManager() {}

    void AddAllocationRegion(void* ptr, size_t memory_size) {
      // Insert sorted by end_ptr.
      auto entry =
          std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
      regions_.insert(entry, AllocationRegion(ptr, memory_size));
    }

    ChunkHandle get_handle(const void* p) const {
      return RegionFor(p)->get_handle(p);
    }

    void set_handle(const void* p, ChunkHandle h) {
      return MutableRegionFor(p)->set_handle(p, h);
    }
    void erase(const void* p) { return MutableRegionFor(p)->erase(p); }

    const std::vector<AllocationRegion>& regions() const { return regions_; }

   private:
    static bool Comparator(const void* ptr, const AllocationRegion& other) {
      return ptr < other.end_ptr();
    }
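
    // Because regions_ is kept sorted by end_ptr, std::upper_bound with
    // this comparator returns the first region whose end_ptr is greater
    // than 'p', i.e. the region containing 'p' when such a region exists.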

    AllocationRegion* MutableRegionFor(const void* p) {
      return const_cast<AllocationRegion*>(RegionFor(p));
    }

    const AllocationRegion* RegionFor(const void* p) const {
      auto entry =
          std::upper_bound(regions_.begin(), regions_.end(), p, &Comparator);

      if (entry != regions_.end()) {
        return &(*entry);
      }

      LOG(FATAL) << "Could not find Region for " << p;
      return nullptr;
    }

    std::vector<AllocationRegion> regions_;
  };

  // Returns 'bytes' rounded up to the next multiple of kMinAllocationSize.
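  // For example, with kMinAllocationSize == 256, RoundedBytes(300) == 512
  // and RoundedBytes(256) == 256.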
  size_t RoundedBytes(size_t bytes);

  // Try to add a new memory region that can satisfy an allocation of
  // 'rounded_bytes' bytes.  Returns true on success and false on
  // failure.
  bool Extend(size_t rounded_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Returns a pointer to an underlying allocated chunk of size
  // 'rounded_bytes'.
  void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes)
      EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Splits the chunk specified by 'h' into two chunks, one at least
  // of size 'num_bytes'.
  void SplitChunk(ChunkHandle h, size_t num_bytes)
      EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Merges the two chunk handles.  Requires that the chunks are
  // contiguous in their allocation.
  void Merge(ChunkHandle h, ChunkHandle h2) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Frees the memory represented by 'h', coalescing the chunk if
  // possible.
  void FreeAndMaybeCoalesce(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Adds the chunk 'h' to the proper free bin.
  void InsertFreeChunkIntoBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Removes the free chunk pointed to by 'c' from the set free_chunks.
  void RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks,
                                  const Bin::FreeChunkSet::iterator& c)
      EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Removes a free chunk from the bin.
  void RemoveFreeChunkFromBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Removes the chunk metadata represented by 'h'.
  void DeleteChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Debugging helpers: RenderOccupancy() returns a textual rendering of
  // region occupancy, and DumpMemoryLog() logs the memory map when an
  // allocation of 'num_bytes' cannot be satisfied.
  string RenderOccupancy() EXCLUSIVE_LOCKS_REQUIRED(lock_);
  void DumpMemoryLog(size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  ChunkHandle AllocateChunk() EXCLUSIVE_LOCKS_REQUIRED(lock_);
  void DeallocateChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  Chunk* ChunkFromHandle(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Information about a Bin that is useful for debugging.
  struct BinDebugInfo {
    size_t total_bytes_in_use = 0;
    size_t total_bytes_in_bin = 0;
    size_t total_requested_bytes_in_use = 0;
    size_t total_chunks_in_use = 0;
    size_t total_chunks_in_bin = 0;
  };

  // Computes and returns a BinDebugInfo for each Bin.
  std::array<BinDebugInfo, kNumBins> get_bin_debug_info()
      EXCLUSIVE_LOCKS_REQUIRED(lock_);

  AllocatorRetry retry_helper_;

  // Structures immutable after construction.
  size_t memory_limit_ = 0;

  // Portable fallback for Log2FloorNonZero below; requires n > 0.
  inline int Log2FloorNonZeroSlow(uint64 n) {
    int r = 0;
    while (n > 0) {
      r++;
      n >>= 1;
    }
    return r - 1;
  }

  // Returns floor(log2(n)); n must be nonzero.
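  // For example, Log2FloorNonZero(4096) == 12 and
  // Log2FloorNonZero(4095) == 11.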
  inline int Log2FloorNonZero(uint64 n) {
#if defined(__GNUC__)
    return 63 ^ __builtin_clzll(n);
#elif defined(PLATFORM_WINDOWS)
    unsigned long index;
    _BitScanReverse64(&index, n);
    return index;
#else
    return Log2FloorNonZeroSlow(n);
#endif
  }

  // Helpers that map between bin indices, bin sizes, and Bins.
  Bin* BinFromIndex(BinNum index) {
    return reinterpret_cast<Bin*>(&(bins_space_[index * sizeof(Bin)]));
  }
  size_t BinNumToSize(BinNum index) {
    return static_cast<size_t>(256) << index;
  }
  BinNum BinNumForSize(size_t bytes) {
    uint64 v = std::max<size_t>(bytes, 256) >> kMinAllocationBits;
    int b = std::min(kNumBins - 1, Log2FloorNonZero(v));
    return b;
  }
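  // For example, BinNumForSize(1024) computes Log2FloorNonZero(1024 >> 8)
  // == Log2FloorNonZero(4) == 2, so a 1 KiB request starts its search in
  // bin 2 (whose bin size is 1024).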
  Bin* BinForSize(size_t bytes) { return BinFromIndex(BinNumForSize(bytes)); }

  // Raw storage for the kNumBins Bin objects, which are constructed in
  // place; BinFromIndex() above recovers a typed pointer to bin 'index'.
  char bins_space_[sizeof(Bin) * kNumBins];

  // The size of the current region allocation.
  size_t curr_region_allocation_bytes_;

  // The total number of bytes allocated by the allocator.
  size_t total_region_allocated_bytes_ = 0;

  // An indicator that expansion of a region has hit the limits
  // of the available memory.
  bool started_backpedal_ = false;

  std::unique_ptr<SubAllocator> suballocator_;
  string name_;

  // Structures mutable after construction.
  mutable mutex lock_;
  RegionManager region_manager_ GUARDED_BY(lock_);

  std::vector<Chunk> chunks_ GUARDED_BY(lock_);

  // Pointer to head of linked list of free Chunks.
  ChunkHandle free_chunks_list_ GUARDED_BY(lock_);

  // Visitors to be called once on each region, as soon as possible after
  // the region is allocated.
  std::vector<Visitor> region_visitors_ GUARDED_BY(lock_);

  // Counter containing the next unique identifier to assign to a
  // newly-created chunk.
  int64 next_allocation_id_ GUARDED_BY(lock_);

  // Stats.
  AllocatorStats stats_ GUARDED_BY(lock_);

  friend class GPUBFCAllocatorPrivateMethodsTest;
  TF_DISALLOW_COPY_AND_ASSIGN(BFCAllocator);
};

}  // namespace tensorflow

#endif  // TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_