What are Huge Pages?
Linux Huge Pages use larger memory pages (2MB or 1GB instead of 4KB) to reduce TLB (Translation Lookaside Buffer) misses. This can significantly improve memory access performance for large allocations typical in deep learning.
CPU Request
→
TLB Lookup
→
Page Table
→
Physical Memory
1. Check and Configure Huge Pages
First, check your current huge pages configuration:
# View current huge pages configuration
grep -i huge /proc/meminfo
# Check available huge page sizes
ls /sys/kernel/mm/hugepages/
# Allocate 1024 huge pages of 2MB each (2GB total)
sudo sysctl -w vm.nr_hugepages=1024
# Make it persistent across reboots
# NOTE: tee -a appends; running this twice leaves a duplicate line in
# /etc/sysctl.conf, so check the file first (grep nr_hugepages /etc/sysctl.conf)
echo "vm.nr_hugepages=1024" | sudo tee -a /etc/sysctl.conf
2. Transparent Huge Pages (THP)
The easiest approach — THP handles huge pages automatically:
# Enable THP (if not already enabled).
# "always": the kernel may back any anonymous mapping with huge pages.
# "madvise": only regions that opt in via madvise(MADV_HUGEPAGE) get them.
echo "always" | sudo tee /sys/kernel/mm/transparent_hugepage/enabled
# Check current THP status
cat /sys/kernel/mm/transparent_hugepage/enabled
# Options: [always] madvise never
# (the bracketed entry is the currently active mode)
3. Explicit Huge Pages via mmap
Allocate PyTorch tensors on huge pages using an anonymous mmap mapping:
import torch
import ctypes
import os
def allocate_huge_pages_tensor(shape, dtype=torch.float32):
    """Allocate a PyTorch tensor backed by explicit 2MB huge pages.

    Maps an anonymous MAP_HUGETLB region with mmap(2) and wraps it in a
    tensor via ``torch.frombuffer`` (zero-copy).

    Args:
        shape: Iterable of ints giving the tensor shape.
        dtype: Torch dtype of the tensor elements (default: torch.float32).

    Returns:
        A ``(tensor, ptr, aligned_size)`` tuple. ``tensor`` is a view over
        the mapping, ``ptr`` is the raw mapping address, and
        ``aligned_size`` is the mapped length in bytes. The tensor does NOT
        own the mapping: the caller must keep ``ptr``/``aligned_size`` and
        call ``libc.munmap(ptr, aligned_size)`` when done, after dropping
        all references to ``tensor``.

    Raises:
        OSError: If mmap fails — typically because no huge pages are
            reserved (``vm.nr_hugepages``) or the pool is exhausted.
    """
    # Payload size in bytes: product of dims times the element size.
    numel = 1
    for s in shape:
        numel *= s
    dtype_size = torch.tensor([], dtype=dtype).element_size()
    size_bytes = numel * dtype_size

    # Hugetlb mappings must be a multiple of the huge page size (2MB here).
    HUGE_PAGE_SIZE = 2 * 1024 * 1024
    aligned_size = ((size_bytes + HUGE_PAGE_SIZE - 1) // HUGE_PAGE_SIZE) * HUGE_PAGE_SIZE

    # Linux mmap constants (see <sys/mman.h> / mmap(2) on x86-64).
    MAP_HUGETLB = 0x40000
    MAP_ANONYMOUS = 0x20
    MAP_PRIVATE = 0x02
    PROT_READ = 0x1
    PROT_WRITE = 0x2

    libc = ctypes.CDLL("libc.so.6", use_errno=True)
    libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int,
                          ctypes.c_int, ctypes.c_int, ctypes.c_long]
    libc.mmap.restype = ctypes.c_void_p

    # fd = -1, offset = 0: anonymous mapping, no backing file.
    ptr = libc.mmap(None, aligned_size,
                    PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
                    -1, 0)
    # MAP_FAILED is (void *)-1; ctypes also maps a NULL c_void_p result
    # to None, so both must be treated as failure.
    if ptr is None or ptr == ctypes.c_void_p(-1).value:
        raise OSError(f"mmap failed: {os.strerror(ctypes.get_errno())}")

    # Zero-copy view: the ctypes array exposes the mapping via the buffer
    # protocol, and frombuffer shares (not copies) that memory.
    tensor = torch.frombuffer(
        (ctypes.c_char * size_bytes).from_address(ptr),
        dtype=dtype
    ).reshape(shape)
    return tensor, ptr, aligned_size
# Usage: 256 * 1024 * 1024 float32 elements = 1 GiB, which fits in the
# 2 GiB pool reserved earlier (1024 x 2 MiB pages). A 1024^3 float32
# tensor would need 4 GiB and fail with ENOMEM under that configuration.
tensor, ptr, size = allocate_huge_pages_tensor((256, 1024, 1024), torch.float32)
4. NUMA-Aware Huge Pages
For multi-socket systems, bind memory to specific NUMA nodes:
# Allocate huge pages on specific NUMA node
# (512 x 2MB = 1GB reserved on each of node0 and node1)
echo 512 | sudo tee /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
echo 512 | sudo tee /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages
# Run with numactl
# --cpunodebind=0 pins threads to node 0; --membind=0 restricts
# allocations to node 0's memory, so huge pages come from its pool.
numactl --cpunodebind=0 --membind=0 python train.py