% Workshop paper: "Incremental pre-training from smaller language models"
% (10th SIGHAN Workshop on Chinese Language Processing, SIGHAN 2024).
% Cleaned from a localized export: language/address values were translated to
% ASCII English, and a series field that duplicated booktitle was dropped.
% NOTE(review): address holds a country as exported; BibTeX expects the
% publisher's city here -- confirm the intended value against the publisher.
@inproceedings{a969582367b0400184f81068a3bb601d,
  author    = {Zhang, Han and Wang, Hui and Xu, Ruifeng},
  title     = {Incremental pre-training from smaller language models},
  booktitle = {{SIGHAN} 2024 - 10th {SIGHAN} Workshop on {Chinese} Language Processing, Proceedings of the Workshop},
  editor    = {Wong, Kam-Fai and Zhang, Min and Xu, Ruifeng and Li, Jing and Wei, Zhongyu and Gui, Lin and Liang, Bin and Zhao, Runcong},
  pages     = {36--44},
  year      = {2024},
  publisher = {Association for Computational Linguistics (ACL)},
  address   = {Australia},
  language  = {english},
  note      = {Publisher Copyright: {\textcopyright} 2024 Association for Computational Linguistics; 10th SIGHAN Workshop on Chinese Language Processing, SIGHAN 2024; Conference date: 16-08-2024},
  abstract  = {Large language models have recently become a new learning paradigm and led to state-of-the-art performance across a range of tasks. As explosive open-source pre-trained models are available, it is worth investigating how to better utilize existing models. We propose a simple yet effective method, Incr-Pretrain, for incrementally pre-training language models from smaller well-trained source models. Different layer-wise transfer strategies were introduced for model augmentation including parameter copying, initial value padding, and model distillation. Experiments on multiple zero-shot learning tasks demonstrate satisfying inference performance upon transferring and promising training efficiency during continuing pre-training. Compared to training from scratch, Incr-Pretrain can save up to half the training time to get a similar testing loss.},
}