% Conference paper: GiVE, published in the proceedings of the 2025 IEEE
% International Conference on Multimedia and Expo (ICME 2025).
% The citation key is the exporter-generated identifier; it is kept as-is so
% existing citations of this key keep resolving.
% NOTE(review): address holds the publisher country as given by the export;
% BibTeX convention is the publisher city -- replace once confirmed.
@inproceedings{7b0f4a157274432582a6825eeb76ee10,
  author    = {Li, Junjie and Ma, Jianghong and Zhang, Xiaofeng and Li, Yuhang and Shi, Jianyang},
  title     = {{GiVE}: Guiding Visual Encoder to Perceive Overlooked Information},
  booktitle = {2025 {IEEE} International Conference on Multimedia and Expo},
  series    = {Proceedings - {IEEE} International Conference on Multimedia and Expo},
  publisher = {IEEE Computer Society},
  address   = {United States},
  year      = {2025},
  doi       = {10.1109/ICME59968.2025.11210151},
  language  = {English},
  keywords  = {adapter, image encoder, instruction, multimodal learning, visual perception},
  abstract  = {Multimodal Large Language Models have advanced AI in applications like text-to-video generation and visual question answering. These models rely on visual encoders to convert non-text data into vectors, but current encoders either lack semantic alignment or overlook non-salient objects. We propose the Guiding Visual Encoder to Perceive Overlooked Information (GiVE) approach. GiVE enhances visual representation with an Attention-Guided Adapter (AG-Adapter) module and an Object-focused Visual Semantic Learning module. These incorporate three novel loss terms: Object-focused Image-Text Contrast (OITC) loss, Object-focused Image-Image Contrast (OIIC) loss, and Object-focused Image Discrimination (OID) loss, improving object consideration, retrieval accuracy, and comprehensiveness. Our contributions include dynamic visual focus adjustment, novel loss functions to enhance object retrieval, and the Multi-Object Instruction (MOInst) dataset. Experiments show our approach achieves state-of-the-art performance.},
  note      = {Publisher Copyright: {\textcopyright} 2025 IEEE. 2025 IEEE International Conference on Multimedia and Expo, ICME 2025. Conference date: 30-06-2025 through 04-07-2025},
}