% Conference paper on QPO (Quantized and Pipelined Offloading), ICPADS 2025.
% The citation key looks like an export-generated identifier; it is kept as-is
% so that existing citations of this entry keep resolving.
% NOTE(review): "address" carries the country supplied by the export, translated
% to English. Styles expect the publisher's city here -- confirm and replace.
% NOTE(review): the conference dates are kept in "note" exactly as exported
% (day-month-year order); move them to eventdate if the file targets biblatex.
@inproceedings{2c707081873f4b5ab8a549defd826b5f,
  author    = {Fan, Xiang and Shi, Shaohuai},
  title     = {{QPO}: Accelerating Memory-Efficient {DNN} Training with Quantization and Pipelining},
  booktitle = {Proceedings of 2025 {IEEE} 31st International Conference on Parallel and Distributed Systems, {ICPADS} 2025},
  series    = {Proceedings of the International Conference on Parallel and Distributed Systems - {ICPADS}},
  publisher = {IEEE Computer Society},
  address   = {United States},
  year      = {2025},
  doi       = {10.1109/ICPADS67057.2025.11323016},
  language  = {english},
  keywords  = {activation quantization, memory compression, memory efficiency, memory swapping, offloading, pipelining},
  abstract  = {Deep neural network (DNN) training demands significant computational power and also necessitates costly device memory for the storage of parameters, gradients, optimizer states, and activations. Activations generally take up the largest portion of device memory, and their usage increases linearly with the mini-batch size and sequence length, which are two main hyper-parameters in training language models. Offloading is one of the widely used memory-efficient techniques by transferring temporarily data from the device memory to CPU memory to save device memory. However, the offloading and uploading operations easily cause significant data transfer overheads. In this paper, we propose QPO (Quantized and Pipelined Offloading), which combines 1) compressing the activation data with low-bit quantization to alleviate the transfer overhead, and 2) pipelining communication tasks of offloading/uploading with computation tasks of feed-forward/backpropagation to reduce the iteration time. Experiments on GPT-2 and LLaMA-2 models using A100 and RTX 3090 GPUs show a speedup of up to 15\% while approaching minimal memory requirements over existing offloading approaches.},
  note      = {Publisher Copyright: {\textcopyright} 2025 IEEE. 31st IEEE International Conference on Parallel and Distributed Systems, ICPADS 2025; Conference date: 14-12-2025 through 17-12-2025},
}