% Conference paper from ECCV 2020 (Han et al.), published in the Springer
% Lecture Notes in Computer Science series. The citation key is the exporter's
% opaque identifier and is kept unchanged so existing citations keep resolving.
% The conference dates are recorded in the note field as exported.
% NOTE(review): address holds the country given by the export, not the
% publisher's city; the LNCS volume number is not present in the export.
@inproceedings{722fc19cc0e047009bb032e447ed0a1b,
  author    = {Han, Xinzhe and Wang, Shuhui and Su, Chi and Zhang, Weigang and Huang, Qingming and Tian, Qi},
  title     = {Interpretable Visual Reasoning via Probabilistic Formulation Under Natural Supervision},
  booktitle = {Computer Vision -- {ECCV} 2020 - 16th European Conference, 2020, Proceedings},
  editor    = {Vedaldi, Andrea and Bischof, Horst and Brox, Thomas and Frahm, Jan-Michael},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer Science and Business Media Deutschland GmbH},
  address   = {Germany},
  year      = {2020},
  pages     = {553--570},
  doi       = {10.1007/978-3-030-58545-7_32},
  isbn      = {9783030585440},
  language  = {English},
  keywords  = {Explainable machine learning, Implicit reasoning, Temporal Reasoning Network, Visual Question Answering},
  abstract  = {Visual reasoning is crucial for visual question answering (VQA). However, without labelled programs, implicit reasoning under natural supervision is still quite challenging and previous models are hard to interpret. In this paper, we rethink implicit reasoning process in VQA, and propose a new formulation which maximizes the log-likelihood of joint distribution for the observed question and predicted answer. Accordingly, we derive a Temporal Reasoning Network (TRN) framework which models the implicit reasoning process as sequential planning in latent space. Our model is interpretable on both model design in probabilist and reasoning process via visualization. We experimentally demonstrate that TRN can support implicit reasoning across various datasets. The experimental results of our model are competitive to existing implicit reasoning models and surpass baseline by a large margin on complicated reasoning tasks without extra computation cost in forward stage.},
  note      = {Publisher Copyright: {\textcopyright} 2020, Springer Nature Switzerland AG.; 16th European Conference on Computer Vision, ECCV 2020 ; Conference date: 23-08-2020 Through 28-08-2020},
}