% Conference paper in the Findings of ACL-IJCNLP 2021 proceedings.
% The citation key is kept exactly as exported so existing \cite commands still resolve.
% Cleaned up from a repository export:
%   - author initials used escaped braces that would print as literal braces;
%   - language and address values were given in Chinese and are translated to English;
%   - the full proceedings title was in "series" while "booktitle" was truncated.
% The export's free-text note (publisher copyright 2021 Association for
% Computational Linguistics; conference date 01-08-2021 through 06-08-2021) is no
% longer a printed field; the dates are carried by month/eventdate instead.
% NOTE(review): address is a literal translation of the exported value and names a
% country rather than a publisher city -- confirm against the publisher's record.
@inproceedings{10462dc22191455da904836ce8db8b4d,
  author    = {Liu, Xuebo and Wang, Longyue and Wong, Derek F. and Ding, Liang and Chao, Lidia S. and Shi, Shuming and Tu, Zhaopeng},
  title     = {On the Copying Behaviors of Pre-Training for Neural Machine Translation},
  editor    = {Zong, Chengqing and Xia, Fei and Li, Wenjie and Navigli, Roberto},
  booktitle = {Findings of the Association for Computational Linguistics: {ACL-IJCNLP} 2021},
  pages     = {4265--4275},
  publisher = {Association for Computational Linguistics (ACL)},
  address   = {Australia},
  year      = {2021},
  month     = aug,
  eventdate = {2021-08-01/2021-08-06},
  doi       = {10.18653/v1/2021.findings-acl.373},
  language  = {English},
  abstract  = {Previous studies have shown that initializing neural machine translation (NMT) models with the pre-trained language models (LM) can speed up the model training and boost the model performance. In this work, we identify a critical side-effect of pre-training for NMT, which is due to the discrepancy between the training objectives of LM-based pre-training and NMT. Since the LM objective learns to reconstruct a few source tokens and copy most of them, the pre-training initialization would affect the copying behaviors of NMT models. We provide a quantitative analysis of copying behaviors by introducing a metric called copying ratio, which empirically shows that pre-training based NMT models have a larger copying ratio than the standard one. In response to this problem, we propose a simple and effective method named copying penalty to control the copying behaviors in decoding. Extensive experiments on both in-domain and out-of-domain benchmarks show that the copying penalty method consistently improves translation performance by controlling copying behaviors for pre-training based NMT models. Source code is freely available at https://github.com/SunbowLiu/CopyingPenalty.},
}