173 c10::cuda::current_device()
175 c10::hip::current_device()
181 std::ostringstream
os;
183#if defined(CUDA_VERSION) || defined(HIP_VERSION)
189 std::array<std::string, 6>
prefixes{
"B",
"KiB",
"MiB",
"GiB",
"TiB",
"PiB"};
190 int64_t n = std::floor(std::max(0.0, std::log2(
static_cast<double>(
bytes) /
191 static_cast<double>(768))) /
192 static_cast<double>(10));
194 return std::to_string((
int64_t)(
bytes / std::pow(1024,
n))) +
" " +
198#if TORCH_VERSION_MAJOR > 2 || \
199 (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR > 4)
200 using namespace c10::CachingDeviceAllocator;
204 using namespace c10::cuda::CUDACachingAllocator;
206 using namespace c10::hip::HIPCachingAllocator;
211 os <<
"|====================================================================="
214 <<
"| LibTorch CUDA memory summary, device ID "
216 <<
"| LibTorch ROCm memory summary, device ID "
218 << std::setw(18) << std::left << static_cast<int>(device) <<
"|\n"
219 <<
"|---------------------------------------------------------------------"
226 << std::setw(13) << std::left <<
deviceStats.num_ooms
228 <<
"| cudaMalloc retries: "
230 <<
"| hipMalloc retries: "
232 << std::setw(10) << std::left <<
deviceStats.num_alloc_retries <<
"|\n"
233 <<
"|====================================================================="
235 <<
"| Metric | Cur Usage | Peak Usage | Tot Alloc | Tot "
237 <<
"|---------------------------------------------------------------------"
239 <<
"| Allocated memory | " << std::setw(10) << std::right
244 <<
" | " << std::setw(10) << std::right
249 <<
" | " << std::setw(10) << std::right
254 <<
" | " << std::setw(10) << std::right
260 <<
"| from large pool | " << std::setw(10) << std::right
265 <<
" | " << std::setw(10) << std::right
270 <<
" | " << std::setw(10) << std::right
275 <<
" | " << std::setw(10) << std::right
281 <<
"| from small pool | " << std::setw(10) << std::right
286 <<
" | " << std::setw(10) << std::right
291 <<
" | " << std::setw(10) << std::right
296 <<
" | " << std::setw(10) << std::right
302 <<
"|---------------------------------------------------------------------"
304 <<
"| Active memory | " << std::setw(10) << std::right
307 .
active_bytes[
static_cast<std::size_t
>(StatType::AGGREGATE)]
309 <<
" | " << std::setw(10) << std::right
312 .
active_bytes[
static_cast<std::size_t
>(StatType::AGGREGATE)]
314 <<
" | " << std::setw(10) << std::right
317 .
active_bytes[
static_cast<std::size_t
>(StatType::AGGREGATE)]
319 <<
" | " << std::setw(10) << std::right
322 .
active_bytes[
static_cast<std::size_t
>(StatType::AGGREGATE)]
325 <<
"| from large pool | " << std::setw(10) << std::right
328 .
active_bytes[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
330 <<
" | " << std::setw(10) << std::right
333 .
active_bytes[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
335 <<
" | " << std::setw(10) << std::right
338 .
active_bytes[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
340 <<
" | " << std::setw(10) << std::right
343 .
active_bytes[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
346 <<
"| from small pool | " << std::setw(10) << std::right
349 .
active_bytes[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
351 <<
" | " << std::setw(10) << std::right
354 .
active_bytes[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
356 <<
" | " << std::setw(10) << std::right
359 .
active_bytes[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
361 <<
" | " << std::setw(10) << std::right
364 .
active_bytes[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
367 <<
"|---------------------------------------------------------------------"
369 <<
"| Requested memory | " << std::setw(10) << std::right
374 <<
" | " << std::setw(10) << std::right
379 <<
" | " << std::setw(10) << std::right
384 <<
" | " << std::setw(10) << std::right
390 <<
"| from large pool | " << std::setw(10) << std::right
395 <<
" | " << std::setw(10) << std::right
400 <<
" | " << std::setw(10) << std::right
405 <<
" | " << std::setw(10) << std::right
411 <<
"| from small pool | " << std::setw(10) << std::right
416 <<
" | " << std::setw(10) << std::right
421 <<
" | " << std::setw(10) << std::right
426 <<
" | " << std::setw(10) << std::right
432 <<
"|---------------------------------------------------------------------"
434 <<
"| GPU reserved memory | " << std::setw(10) << std::right
439 <<
" | " << std::setw(10) << std::right
444 <<
" | " << std::setw(10) << std::right
449 <<
" | " << std::setw(10) << std::right
455 <<
"| from large pool | " << std::setw(10) << std::right
460 <<
" | " << std::setw(10) << std::right
465 <<
" | " << std::setw(10) << std::right
470 <<
" | " << std::setw(10) << std::right
476 <<
"| from small pool | " << std::setw(10) << std::right
481 <<
" | " << std::setw(10) << std::right
486 <<
" | " << std::setw(10) << std::right
491 <<
" | " << std::setw(10) << std::right
497 <<
"|---------------------------------------------------------------------"
499 <<
"| Non-releasable memory | " << std::setw(10) << std::right
502 StatType::AGGREGATE)]
504 <<
" | " << std::setw(10) << std::right
507 StatType::AGGREGATE)]
509 <<
" | " << std::setw(10) << std::right
512 StatType::AGGREGATE)]
514 <<
" | " << std::setw(10) << std::right
517 StatType::AGGREGATE)]
520 <<
"| from large pool | " << std::setw(10) << std::right
523 StatType::LARGE_POOL)]
525 <<
" | " << std::setw(10) << std::right
528 StatType::LARGE_POOL)]
530 <<
" | " << std::setw(10) << std::right
533 StatType::LARGE_POOL)]
535 <<
" | " << std::setw(10) << std::right
538 StatType::LARGE_POOL)]
541 <<
"| from small pool | " << std::setw(10) << std::right
544 StatType::SMALL_POOL)]
546 <<
" | " << std::setw(10) << std::right
549 StatType::SMALL_POOL)]
551 <<
" | " << std::setw(10) << std::right
554 StatType::SMALL_POOL)]
556 <<
" | " << std::setw(10) << std::right
559 StatType::SMALL_POOL)]
562 <<
"|---------------------------------------------------------------------"
564 <<
"| Allocations | " << std::setw(10) << std::right
565 <<
deviceStats.allocation[
static_cast<std::size_t
>(StatType::AGGREGATE)]
567 <<
" | " << std::setw(10) << std::right
568 <<
deviceStats.allocation[
static_cast<std::size_t
>(StatType::AGGREGATE)]
570 <<
" | " << std::setw(10) << std::right
571 <<
deviceStats.allocation[
static_cast<std::size_t
>(StatType::AGGREGATE)]
573 <<
" | " << std::setw(10) << std::right
574 <<
deviceStats.allocation[
static_cast<std::size_t
>(StatType::AGGREGATE)]
577 <<
"|---------------------------------------------------------------------"
579 <<
"| from large pool | " << std::setw(10) << std::right
580 <<
deviceStats.allocation[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
582 <<
" | " << std::setw(10) << std::right
583 <<
deviceStats.allocation[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
585 <<
" | " << std::setw(10) << std::right
586 <<
deviceStats.allocation[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
588 <<
" | " << std::setw(10) << std::right
589 <<
deviceStats.allocation[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
592 <<
"|---------------------------------------------------------------------"
594 <<
"| from small pool | " << std::setw(10) << std::right
595 <<
deviceStats.allocation[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
597 <<
" | " << std::setw(10) << std::right
598 <<
deviceStats.allocation[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
600 <<
" | " << std::setw(10) << std::right
601 <<
deviceStats.allocation[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
603 <<
" | " << std::setw(10) << std::right
604 <<
deviceStats.allocation[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
607 <<
"|---------------------------------------------------------------------"
609 <<
"| Active allocs | " << std::setw(10) << std::right
610 <<
deviceStats.active[
static_cast<std::size_t
>(StatType::AGGREGATE)]
612 <<
" | " << std::setw(10) << std::right
613 <<
deviceStats.active[
static_cast<std::size_t
>(StatType::AGGREGATE)].
peak
614 <<
" | " << std::setw(10) << std::right
615 <<
deviceStats.active[
static_cast<std::size_t
>(StatType::AGGREGATE)]
617 <<
" | " << std::setw(10) << std::right
620 <<
"|---------------------------------------------------------------------"
622 <<
"| from large pool | " << std::setw(10) << std::right
623 <<
deviceStats.active[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
625 <<
" | " << std::setw(10) << std::right
626 <<
deviceStats.active[
static_cast<std::size_t
>(StatType::LARGE_POOL)].
peak
627 <<
" | " << std::setw(10) << std::right
628 <<
deviceStats.active[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
630 <<
" | " << std::setw(10) << std::right
633 <<
"|---------------------------------------------------------------------"
635 <<
"| from small pool | " << std::setw(10) << std::right
636 <<
deviceStats.active[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
638 <<
" | " << std::setw(10) << std::right
639 <<
deviceStats.active[
static_cast<std::size_t
>(StatType::SMALL_POOL)].
peak
640 <<
" | " << std::setw(10) << std::right
641 <<
deviceStats.active[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
643 <<
" | " << std::setw(10) << std::right
646 <<
"|---------------------------------------------------------------------"
648 <<
"| GPU reserved segments | " << std::setw(10) << std::right
649 <<
deviceStats.segment[
static_cast<std::size_t
>(StatType::AGGREGATE)]
651 <<
" | " << std::setw(10) << std::right
652 <<
deviceStats.segment[
static_cast<std::size_t
>(StatType::AGGREGATE)].
peak
653 <<
" | " << std::setw(10) << std::right
654 <<
deviceStats.segment[
static_cast<std::size_t
>(StatType::AGGREGATE)]
656 <<
" | " << std::setw(10) << std::right
659 <<
"|---------------------------------------------------------------------"
661 <<
"| from large pool | " << std::setw(10) << std::right
662 <<
deviceStats.segment[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
664 <<
" | " << std::setw(10) << std::right
665 <<
deviceStats.segment[
static_cast<std::size_t
>(StatType::LARGE_POOL)].
peak
666 <<
" | " << std::setw(10) << std::right
667 <<
deviceStats.segment[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
669 <<
" | " << std::setw(10) << std::right
670 <<
deviceStats.segment[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
673 <<
"|---------------------------------------------------------------------"
675 <<
"| from small pool | " << std::setw(10) << std::right
676 <<
deviceStats.segment[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
678 <<
" | " << std::setw(10) << std::right
679 <<
deviceStats.segment[
static_cast<std::size_t
>(StatType::SMALL_POOL)].
peak
680 <<
" | " << std::setw(10) << std::right
681 <<
deviceStats.segment[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
683 <<
" | " << std::setw(10) << std::right
684 <<
deviceStats.segment[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
687 <<
"|---------------------------------------------------------------------"
689 <<
"| Non-releasable allocs | " << std::setw(10) << std::right
691 .inactive_split[
static_cast<std::size_t
>(StatType::AGGREGATE)]
693 <<
" | " << std::setw(10) << std::right
695 .inactive_split[
static_cast<std::size_t
>(StatType::AGGREGATE)]
697 <<
" | " << std::setw(10) << std::right
699 .inactive_split[
static_cast<std::size_t
>(StatType::AGGREGATE)]
701 <<
" | " << std::setw(10) << std::right
703 .inactive_split[
static_cast<std::size_t
>(StatType::AGGREGATE)]
706 <<
"|---------------------------------------------------------------------"
708 <<
"| from large pool | " << std::setw(10) << std::right
710 .inactive_split[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
712 <<
" | " << std::setw(10) << std::right
714 .inactive_split[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
716 <<
" | " << std::setw(10) << std::right
718 .inactive_split[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
720 <<
" | " << std::setw(10) << std::right
722 .inactive_split[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
725 <<
"|---------------------------------------------------------------------"
727 <<
"| from small pool | " << std::setw(10) << std::right
729 .inactive_split[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
731 <<
" | " << std::setw(10) << std::right
733 .inactive_split[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
735 <<
" | " << std::setw(10) << std::right
737 .inactive_split[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
739 <<
" | " << std::setw(10) << std::right
741 .inactive_split[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
744 <<
"|---------------------------------------------------------------------"
746 <<
"| Oversize allocations | " << std::setw(10) << std::right
747 <<
deviceStats.oversize_allocations.current <<
" | " << std::setw(10)
748 << std::right <<
deviceStats.oversize_allocations.peak <<
" | "
749 << std::setw(10) << std::right
750 <<
deviceStats.oversize_allocations.allocated <<
" | " << std::setw(10)
751 << std::right <<
deviceStats.oversize_allocations.freed <<
" |\n"
752 <<
"|---------------------------------------------------------------------"
754 <<
"| Oversize GPU segments | " << std::setw(10) << std::right
755 <<
deviceStats.oversize_segments.current <<
" | " << std::setw(10)
756 << std::right <<
deviceStats.oversize_segments.peak <<
" | "
757 << std::setw(10) << std::right <<
deviceStats.oversize_segments.allocated
758 <<
" | " << std::setw(10) << std::right
760 <<
"|====================================================================="
763 os <<
"Memory summary is only available for CUDA/HIP devices";