% SpInfer conference paper (EuroSys 2025 proceedings, ACM).
% Title keeps case-sensitive tokens brace-protected so sentence-casing styles
% do not lowercase them; authors are stored in "Last, First" form; the abstract
% uses LaTeX escapes only (no raw non-ASCII) so the entry is safe for classic
% 8-bit BibTeX. "day" is a non-standard field kept from the original export.
@inproceedings{94bb860a4f6d434ab7ecfd26ebf8a577,
  author    = {Fan, Ruibo and Yu, Xiangrui and Dong, Peijie and Li, Zeyu and Gong, Gu and Wang, Qiang and Wang, Wei and Chu, Xiaowen},
  title     = {{SpInfer}: Leveraging Low-Level Sparsity for Efficient Large Language Model Inference on {GPUs}},
  booktitle = {EuroSys 2025 - Proceedings of the 2025 20th European Conference on Computer Systems},
  publisher = {Association for Computing Machinery, Inc},
  pages     = {243--260},
  year      = {2025},
  month     = mar,
  day       = {30},
  doi       = {10.1145/3689031.3717481},
  language  = {English},
  keywords  = {GPU, LLM Inference, SpMM, Sparse, Tensor Core, Unstructured Pruning},
  note      = {Publisher Copyright: {\textcopyright} 2025 Copyright held by the owner/author(s).; 20th European Conference on Computer Systems, EuroSys 2025, co-located 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, ASPLOS 2025 ; Conference date: 30-03-2025 Through 03-04-2025},
  abstract  = {Large Language Models (LLMs) have demonstrated remarkable capabilities, but their immense scale poses significant challenges in terms of both memory and computational costs. While unstructured pruning offers promising solutions by introducing sparsity to reduce resource requirements, realizing its benefits in LLM inference remains elusive. This is primarily due to the storage overhead of indexing non-zero elements and the inefficiency of sparse matrix multiplication (SpMM) kernels at low sparsity levels (around 50\%). In this paper, we present SpInfer, a high-performance framework tailored for sparsified LLM inference on GPUs. SpInfer introduces Tensor-Core-Aware Bitmap Encoding (TCA-BME), a novel sparse format that minimizes indexing overhead by leveraging efficient bitmap-based indexing, optimized for GPU Tensor Core architectures. Furthermore, SpInfer integrates an optimized SpMM kernel with Shared Memory Bitmap Decoding (SMBD) and asynchronous pipeline design to enhance computational efficiency. Experimental results show that SpInfer significantly outperforms state-of-the-art SpMM implementations (up to 2.14{$\times$} and 2.27{$\times$} over Flash-LLM and SparTA, respectively) across a range of sparsity levels (30\% to 70\%), with substantial improvements in both memory efficiency and end-to-end inference speed (up to 1.58{$\times$}). SpInfer outperforms highly optimized cuBLAS at sparsity levels as low as 30\%, marking the first effective translation of unstructured pruning{\textquoteright}s theoretical advantages into practical performance gains for LLM inference.},
}