173 c10::cuda::current_device()
175 c10::hip::current_device()
181 std::ostringstream os;
183#if defined(CUDA_VERSION) || defined(HIP_VERSION)
185 auto _format_size = [](int64_t bytes) -> std::string {
189 std::array<std::string, 6> prefixes{
"B",
"KiB",
"MiB",
"GiB",
"TiB",
"PiB"};
190 int64_t n = std::floor(std::max(0.0, std::log2(
static_cast<double>(bytes) /
191 static_cast<double>(768))) /
192 static_cast<double>(10));
194 return std::to_string((int64_t)(bytes / std::pow(1024, n))) +
" " +
198#if TORCH_VERSION_MAJOR > 2 || \
199 (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR > 4)
200 using namespace c10::CachingDeviceAllocator;
204 using namespace c10::cuda::CUDACachingAllocator;
206 using namespace c10::hip::HIPCachingAllocator;
209 DeviceStats deviceStats = getDeviceStats(device);
211 os <<
"|====================================================================="
214 <<
"| LibTorch CUDA memory summary, device ID "
216 <<
"| LibTorch ROCm memory summary, device ID "
218 << std::setw(18) << std::left << static_cast<int>(device) <<
"|\n"
219 <<
"|---------------------------------------------------------------------"
226 << std::setw(13) << std::left << deviceStats.num_ooms
228 <<
"| cudaMalloc retries: "
230 <<
"| hipMalloc retries: "
232 << std::setw(10) << std::left << deviceStats.num_alloc_retries <<
"|\n"
233 <<
"|====================================================================="
235 <<
"| Metric | Cur Usage | Peak Usage | Tot Alloc | Tot "
237 <<
"|---------------------------------------------------------------------"
239 <<
"| Allocated memory | " << std::setw(10) << std::right
242 .allocated_bytes[
static_cast<std::size_t
>(StatType::AGGREGATE)]
244 <<
" | " << std::setw(10) << std::right
247 .allocated_bytes[
static_cast<std::size_t
>(StatType::AGGREGATE)]
249 <<
" | " << std::setw(10) << std::right
252 .allocated_bytes[
static_cast<std::size_t
>(StatType::AGGREGATE)]
254 <<
" | " << std::setw(10) << std::right
257 .allocated_bytes[
static_cast<std::size_t
>(StatType::AGGREGATE)]
260 <<
"| from large pool | " << std::setw(10) << std::right
263 .allocated_bytes[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
265 <<
" | " << std::setw(10) << std::right
268 .allocated_bytes[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
270 <<
" | " << std::setw(10) << std::right
273 .allocated_bytes[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
275 <<
" | " << std::setw(10) << std::right
278 .allocated_bytes[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
281 <<
"| from small pool | " << std::setw(10) << std::right
284 .allocated_bytes[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
286 <<
" | " << std::setw(10) << std::right
289 .allocated_bytes[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
291 <<
" | " << std::setw(10) << std::right
294 .allocated_bytes[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
296 <<
" | " << std::setw(10) << std::right
299 .allocated_bytes[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
302 <<
"|---------------------------------------------------------------------"
304 <<
"| Active memory | " << std::setw(10) << std::right
307 .active_bytes[
static_cast<std::size_t
>(StatType::AGGREGATE)]
309 <<
" | " << std::setw(10) << std::right
312 .active_bytes[
static_cast<std::size_t
>(StatType::AGGREGATE)]
314 <<
" | " << std::setw(10) << std::right
317 .active_bytes[
static_cast<std::size_t
>(StatType::AGGREGATE)]
319 <<
" | " << std::setw(10) << std::right
322 .active_bytes[
static_cast<std::size_t
>(StatType::AGGREGATE)]
325 <<
"| from large pool | " << std::setw(10) << std::right
328 .active_bytes[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
330 <<
" | " << std::setw(10) << std::right
333 .active_bytes[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
335 <<
" | " << std::setw(10) << std::right
338 .active_bytes[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
340 <<
" | " << std::setw(10) << std::right
343 .active_bytes[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
346 <<
"| from small pool | " << std::setw(10) << std::right
349 .active_bytes[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
351 <<
" | " << std::setw(10) << std::right
354 .active_bytes[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
356 <<
" | " << std::setw(10) << std::right
359 .active_bytes[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
361 <<
" | " << std::setw(10) << std::right
364 .active_bytes[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
367 <<
"|---------------------------------------------------------------------"
369 <<
"| Requested memory | " << std::setw(10) << std::right
372 .requested_bytes[
static_cast<std::size_t
>(StatType::AGGREGATE)]
374 <<
" | " << std::setw(10) << std::right
377 .requested_bytes[
static_cast<std::size_t
>(StatType::AGGREGATE)]
379 <<
" | " << std::setw(10) << std::right
382 .requested_bytes[
static_cast<std::size_t
>(StatType::AGGREGATE)]
384 <<
" | " << std::setw(10) << std::right
387 .requested_bytes[
static_cast<std::size_t
>(StatType::AGGREGATE)]
390 <<
"| from large pool | " << std::setw(10) << std::right
393 .requested_bytes[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
395 <<
" | " << std::setw(10) << std::right
398 .requested_bytes[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
400 <<
" | " << std::setw(10) << std::right
403 .requested_bytes[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
405 <<
" | " << std::setw(10) << std::right
408 .requested_bytes[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
411 <<
"| from small pool | " << std::setw(10) << std::right
414 .requested_bytes[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
416 <<
" | " << std::setw(10) << std::right
419 .requested_bytes[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
421 <<
" | " << std::setw(10) << std::right
424 .requested_bytes[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
426 <<
" | " << std::setw(10) << std::right
429 .requested_bytes[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
432 <<
"|---------------------------------------------------------------------"
434 <<
"| GPU reserved memory | " << std::setw(10) << std::right
437 .reserved_bytes[
static_cast<std::size_t
>(StatType::AGGREGATE)]
439 <<
" | " << std::setw(10) << std::right
442 .reserved_bytes[
static_cast<std::size_t
>(StatType::AGGREGATE)]
444 <<
" | " << std::setw(10) << std::right
447 .reserved_bytes[
static_cast<std::size_t
>(StatType::AGGREGATE)]
449 <<
" | " << std::setw(10) << std::right
452 .reserved_bytes[
static_cast<std::size_t
>(StatType::AGGREGATE)]
455 <<
"| from large pool | " << std::setw(10) << std::right
458 .reserved_bytes[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
460 <<
" | " << std::setw(10) << std::right
463 .reserved_bytes[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
465 <<
" | " << std::setw(10) << std::right
468 .reserved_bytes[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
470 <<
" | " << std::setw(10) << std::right
473 .reserved_bytes[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
476 <<
"| from small pool | " << std::setw(10) << std::right
479 .reserved_bytes[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
481 <<
" | " << std::setw(10) << std::right
484 .reserved_bytes[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
486 <<
" | " << std::setw(10) << std::right
489 .reserved_bytes[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
491 <<
" | " << std::setw(10) << std::right
494 .reserved_bytes[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
497 <<
"|---------------------------------------------------------------------"
499 <<
"| Non-releasable memory | " << std::setw(10) << std::right
500 << _format_size(deviceStats
501 .inactive_split_bytes[
static_cast<std::size_t
>(
502 StatType::AGGREGATE)]
504 <<
" | " << std::setw(10) << std::right
505 << _format_size(deviceStats
506 .inactive_split_bytes[
static_cast<std::size_t
>(
507 StatType::AGGREGATE)]
509 <<
" | " << std::setw(10) << std::right
510 << _format_size(deviceStats
511 .inactive_split_bytes[
static_cast<std::size_t
>(
512 StatType::AGGREGATE)]
514 <<
" | " << std::setw(10) << std::right
515 << _format_size(deviceStats
516 .inactive_split_bytes[
static_cast<std::size_t
>(
517 StatType::AGGREGATE)]
520 <<
"| from large pool | " << std::setw(10) << std::right
521 << _format_size(deviceStats
522 .inactive_split_bytes[
static_cast<std::size_t
>(
523 StatType::LARGE_POOL)]
525 <<
" | " << std::setw(10) << std::right
526 << _format_size(deviceStats
527 .inactive_split_bytes[
static_cast<std::size_t
>(
528 StatType::LARGE_POOL)]
530 <<
" | " << std::setw(10) << std::right
531 << _format_size(deviceStats
532 .inactive_split_bytes[
static_cast<std::size_t
>(
533 StatType::LARGE_POOL)]
535 <<
" | " << std::setw(10) << std::right
536 << _format_size(deviceStats
537 .inactive_split_bytes[
static_cast<std::size_t
>(
538 StatType::LARGE_POOL)]
541 <<
"| from small pool | " << std::setw(10) << std::right
542 << _format_size(deviceStats
543 .inactive_split_bytes[
static_cast<std::size_t
>(
544 StatType::SMALL_POOL)]
546 <<
" | " << std::setw(10) << std::right
547 << _format_size(deviceStats
548 .inactive_split_bytes[
static_cast<std::size_t
>(
549 StatType::SMALL_POOL)]
551 <<
" | " << std::setw(10) << std::right
552 << _format_size(deviceStats
553 .inactive_split_bytes[
static_cast<std::size_t
>(
554 StatType::SMALL_POOL)]
556 <<
" | " << std::setw(10) << std::right
557 << _format_size(deviceStats
558 .inactive_split_bytes[
static_cast<std::size_t
>(
559 StatType::SMALL_POOL)]
562 <<
"|---------------------------------------------------------------------"
564 <<
"| Allocations | " << std::setw(10) << std::right
565 << deviceStats.allocation[
static_cast<std::size_t
>(StatType::AGGREGATE)]
567 <<
" | " << std::setw(10) << std::right
568 << deviceStats.allocation[
static_cast<std::size_t
>(StatType::AGGREGATE)]
570 <<
" | " << std::setw(10) << std::right
571 << deviceStats.allocation[
static_cast<std::size_t
>(StatType::AGGREGATE)]
573 <<
" | " << std::setw(10) << std::right
574 << deviceStats.allocation[
static_cast<std::size_t
>(StatType::AGGREGATE)]
577 <<
"|---------------------------------------------------------------------"
579 <<
"| from large pool | " << std::setw(10) << std::right
580 << deviceStats.allocation[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
582 <<
" | " << std::setw(10) << std::right
583 << deviceStats.allocation[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
585 <<
" | " << std::setw(10) << std::right
586 << deviceStats.allocation[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
588 <<
" | " << std::setw(10) << std::right
589 << deviceStats.allocation[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
592 <<
"|---------------------------------------------------------------------"
594 <<
"| from small pool | " << std::setw(10) << std::right
595 << deviceStats.allocation[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
597 <<
" | " << std::setw(10) << std::right
598 << deviceStats.allocation[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
600 <<
" | " << std::setw(10) << std::right
601 << deviceStats.allocation[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
603 <<
" | " << std::setw(10) << std::right
604 << deviceStats.allocation[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
607 <<
"|---------------------------------------------------------------------"
609 <<
"| Active allocs | " << std::setw(10) << std::right
610 << deviceStats.active[
static_cast<std::size_t
>(StatType::AGGREGATE)]
612 <<
" | " << std::setw(10) << std::right
613 << deviceStats.active[
static_cast<std::size_t
>(StatType::AGGREGATE)].peak
614 <<
" | " << std::setw(10) << std::right
615 << deviceStats.active[
static_cast<std::size_t
>(StatType::AGGREGATE)]
617 <<
" | " << std::setw(10) << std::right
618 << deviceStats.active[
static_cast<std::size_t
>(StatType::AGGREGATE)].freed
620 <<
"|---------------------------------------------------------------------"
622 <<
"| from large pool | " << std::setw(10) << std::right
623 << deviceStats.active[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
625 <<
" | " << std::setw(10) << std::right
626 << deviceStats.active[
static_cast<std::size_t
>(StatType::LARGE_POOL)].peak
627 <<
" | " << std::setw(10) << std::right
628 << deviceStats.active[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
630 <<
" | " << std::setw(10) << std::right
631 << deviceStats.active[
static_cast<std::size_t
>(StatType::LARGE_POOL)].freed
633 <<
"|---------------------------------------------------------------------"
635 <<
"| from small pool | " << std::setw(10) << std::right
636 << deviceStats.active[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
638 <<
" | " << std::setw(10) << std::right
639 << deviceStats.active[
static_cast<std::size_t
>(StatType::SMALL_POOL)].peak
640 <<
" | " << std::setw(10) << std::right
641 << deviceStats.active[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
643 <<
" | " << std::setw(10) << std::right
644 << deviceStats.active[
static_cast<std::size_t
>(StatType::SMALL_POOL)].freed
646 <<
"|---------------------------------------------------------------------"
648 <<
"| GPU reserved segments | " << std::setw(10) << std::right
649 << deviceStats.segment[
static_cast<std::size_t
>(StatType::AGGREGATE)]
651 <<
" | " << std::setw(10) << std::right
652 << deviceStats.segment[
static_cast<std::size_t
>(StatType::AGGREGATE)].peak
653 <<
" | " << std::setw(10) << std::right
654 << deviceStats.segment[
static_cast<std::size_t
>(StatType::AGGREGATE)]
656 <<
" | " << std::setw(10) << std::right
657 << deviceStats.segment[
static_cast<std::size_t
>(StatType::AGGREGATE)].freed
659 <<
"|---------------------------------------------------------------------"
661 <<
"| from large pool | " << std::setw(10) << std::right
662 << deviceStats.segment[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
664 <<
" | " << std::setw(10) << std::right
665 << deviceStats.segment[
static_cast<std::size_t
>(StatType::LARGE_POOL)].peak
666 <<
" | " << std::setw(10) << std::right
667 << deviceStats.segment[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
669 <<
" | " << std::setw(10) << std::right
670 << deviceStats.segment[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
673 <<
"|---------------------------------------------------------------------"
675 <<
"| from small pool | " << std::setw(10) << std::right
676 << deviceStats.segment[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
678 <<
" | " << std::setw(10) << std::right
679 << deviceStats.segment[
static_cast<std::size_t
>(StatType::SMALL_POOL)].peak
680 <<
" | " << std::setw(10) << std::right
681 << deviceStats.segment[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
683 <<
" | " << std::setw(10) << std::right
684 << deviceStats.segment[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
687 <<
"|---------------------------------------------------------------------"
689 <<
"| Non-releasable allocs | " << std::setw(10) << std::right
691 .inactive_split[
static_cast<std::size_t
>(StatType::AGGREGATE)]
693 <<
" | " << std::setw(10) << std::right
695 .inactive_split[
static_cast<std::size_t
>(StatType::AGGREGATE)]
697 <<
" | " << std::setw(10) << std::right
699 .inactive_split[
static_cast<std::size_t
>(StatType::AGGREGATE)]
701 <<
" | " << std::setw(10) << std::right
703 .inactive_split[
static_cast<std::size_t
>(StatType::AGGREGATE)]
706 <<
"|---------------------------------------------------------------------"
708 <<
"| from large pool | " << std::setw(10) << std::right
710 .inactive_split[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
712 <<
" | " << std::setw(10) << std::right
714 .inactive_split[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
716 <<
" | " << std::setw(10) << std::right
718 .inactive_split[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
720 <<
" | " << std::setw(10) << std::right
722 .inactive_split[
static_cast<std::size_t
>(StatType::LARGE_POOL)]
725 <<
"|---------------------------------------------------------------------"
727 <<
"| from small pool | " << std::setw(10) << std::right
729 .inactive_split[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
731 <<
" | " << std::setw(10) << std::right
733 .inactive_split[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
735 <<
" | " << std::setw(10) << std::right
737 .inactive_split[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
739 <<
" | " << std::setw(10) << std::right
741 .inactive_split[
static_cast<std::size_t
>(StatType::SMALL_POOL)]
744 <<
"|---------------------------------------------------------------------"
746 <<
"| Oversize allocations | " << std::setw(10) << std::right
747 << deviceStats.oversize_allocations.current <<
" | " << std::setw(10)
748 << std::right << deviceStats.oversize_allocations.peak <<
" | "
749 << std::setw(10) << std::right
750 << deviceStats.oversize_allocations.allocated <<
" | " << std::setw(10)
751 << std::right << deviceStats.oversize_allocations.freed <<
" |\n"
752 <<
"|---------------------------------------------------------------------"
754 <<
"| Oversize GPU segments | " << std::setw(10) << std::right
755 << deviceStats.oversize_segments.current <<
" | " << std::setw(10)
756 << std::right << deviceStats.oversize_segments.peak <<
" | "
757 << std::setw(10) << std::right << deviceStats.oversize_segments.allocated
758 <<
" | " << std::setw(10) << std::right
759 << deviceStats.oversize_segments.freed <<
" |\n"
760 <<
"|====================================================================="
763 os <<
"Memory summary is only available for CUDA/HIP devices";