{"id":"https://openalex.org/W4415540763","doi":"https://doi.org/10.1145/3746027.3761997","title":"Hierarchical Vision-Language Reasoning for Multimodal Multiple-Choice Question Answering","display_name":"Hierarchical Vision-Language Reasoning for Multimodal Multiple-Choice Question Answering","publication_year":2025,"publication_date":"2025-10-25","ids":{"openalex":"https://openalex.org/W4415540763","doi":"https://doi.org/10.1145/3746027.3761997"},"language":null,"primary_location":{"id":"doi:10.1145/3746027.3761997","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3761997","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5049818625","display_name":"Ao Zhou","orcid":"https://orcid.org/0009-0004-6056-3515"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ao Zhou","raw_affiliation_strings":["State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"],"raw_orcid":"https://orcid.org/0009-0004-6056-3515","affiliations":[{"raw_affiliation_string":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zebo Gu","orcid":"https://orcid.org/0009-0005-8206-8889"},"institutions":[{"id":"https://openalex.org/I10535382","display_name":"Chongqing University of Posts and Telecommunications","ror":"https://ror.org/03dgaqz26","country_code":"CN","type":"education","lineage":["https://openalex.org/I10535382"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zebo Gu","raw_affiliation_strings":["Chongqing University of Posts and Telecommunications, Chongqing, China"],"raw_orcid":"https://orcid.org/0009-0005-8206-8889","affiliations":[{"raw_affiliation_string":"Chongqing University of Posts and Telecommunications, Chongqing, China","institution_ids":["https://openalex.org/I10535382"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111993343","display_name":"T Sun","orcid":"https://orcid.org/0009-0005-6606-1732"},"institutions":[{"id":"https://openalex.org/I10535382","display_name":"Chongqing University of Posts and Telecommunications","ror":"https://ror.org/03dgaqz26","country_code":"CN","type":"education","lineage":["https://openalex.org/I10535382"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tenghao Sun","raw_affiliation_strings":["Chongqing University of Posts and Telecommunications, Chongqing, China"],"raw_orcid":"https://orcid.org/0009-0005-6606-1732","affiliations":[{"raw_affiliation_string":"Chongqing University of Posts and Telecommunications, Chongqing, China","institution_ids":["https://openalex.org/I10535382"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Jiawen Chen","orcid":"https://orcid.org/0009-0004-0494-2473"},"institutions":[{"id":"https://openalex.org/I10535382","display_name":"Chongqing University of Posts and Telecommunications","ror":"https://ror.org/03dgaqz26","country_code":"CN","type":"education","lineage":["https://openalex.org/I10535382"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiawen Chen","raw_affiliation_strings":["Chongqing University of Posts and Telecommunications, Chongqing, China"],"raw_orcid":"https://orcid.org/0009-0004-0494-2473","affiliations":[{"raw_affiliation_string":"Chongqing University of Posts and Telecommunications, Chongqing, China","institution_ids":["https://openalex.org/I10535382"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111003634","display_name":"M.J. Tu","orcid":null},"institutions":[{"id":"https://openalex.org/I10535382","display_name":"Chongqing University of Posts and Telecommunications","ror":"https://ror.org/03dgaqz26","country_code":"CN","type":"education","lineage":["https://openalex.org/I10535382"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mingsheng Tu","raw_affiliation_strings":["Chongqing University of Posts and Telecommunications, Chongqing, China"],"raw_orcid":"https://orcid.org/0009-0003-1679-0174","affiliations":[{"raw_affiliation_string":"Chongqing University of Posts and Telecommunications, Chongqing, China","institution_ids":["https://openalex.org/I10535382"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033620369","display_name":"Zifeng Cheng","orcid":"https://orcid.org/0000-0002-8486-2614"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zifeng Cheng","raw_affiliation_strings":["State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"],"raw_orcid":"https://orcid.org/0000-0002-8486-2614","affiliations":[{"raw_affiliation_string":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5087888694","display_name":"Yafeng Yin","orcid":"https://orcid.org/0000-0002-9497-6244"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yafeng Yin","raw_affiliation_strings":["State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"],"raw_orcid":"https://orcid.org/0000-0002-9497-6244","affiliations":[{"raw_affiliation_string":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082553860","display_name":"Zhiwei Jiang","orcid":"https://orcid.org/0000-0001-5243-4992"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiwei Jiang","raw_affiliation_strings":["State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"],"raw_orcid":"https://orcid.org/0000-0001-5243-4992","affiliations":[{"raw_affiliation_string":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5061025205","display_name":"Qing Gu","orcid":"https://orcid.org/0000-0002-1112-790X"},"institutions":[{"id":"https://openalex.org/I881766915","display_name":"Nanjing University","ror":"https://ror.org/01rxvg760","country_code":"CN","type":"education","lineage":["https://openalex.org/I881766915"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qing Gu","raw_affiliation_strings":["State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China"],"raw_orcid":"https://orcid.org/0000-0002-1112-790X","affiliations":[{"raw_affiliation_string":"State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China","institution_ids":["https://openalex.org/I881766915"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.7588,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.89190923,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":95,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"13784","last_page":"13790"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9955999851226807,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9932000041007996,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/question-answering","display_name":"Question answering","score":0.7723000049591064},{"id":"https://openalex.org/keywords/parsing","display_name":"Parsing","score":0.6092000007629395},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.5515999794006348},{"id":"https://openalex.org/keywords/language-understanding","display_name":"Language understanding","score":0.4932999908924103},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.41440001130104065},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3206000030040741}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8284000158309937},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.7723000049591064},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6777999997138977},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.6092000007629395},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5777000188827515},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.5515999794006348},{"id":"https://openalex.org/C2983448237","wikidata":"https://www.wikidata.org/wiki/Q1078276","display_name":"Language understanding","level":2,"score":0.4932999908924103},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.41440001130104065},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3206000030040741},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2955999970436096},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.28200000524520874},{"id":"https://openalex.org/C155092808","wikidata":"https://www.wikidata.org/wiki/Q182557","display_name":"Computational linguistics","level":2,"score":0.2662000060081482},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.2624000012874603},{"id":"https://openalex.org/C90312973","wikidata":"https://www.wikidata.org/wiki/Q7449052","display_name":"Semantic data model","level":2,"score":0.25519999861717224},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.2540000081062317}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3746027.3761997","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3746027.3761997","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 33rd ACM International Conference on Multimedia","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G3504507626","display_name":null,"funder_award_id":"No.14380001","funder_id":"https://openalex.org/F4320335787","funder_display_name":"Fundamental Research Funds for the Central Universities"},{"id":"https://openalex.org/G6607584811","display_name":null,"funder_award_id":"62441225, 61972192, 62172208, 61906085","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320335787","display_name":"Fundamental Research Funds for the Central Universities","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":7,"referenced_works":["https://openalex.org/W4252076394","https://openalex.org/W4388778348","https://openalex.org/W4392172801","https://openalex.org/W4392570944","https://openalex.org/W4404356490","https://openalex.org/W4407049645","https://openalex.org/W4413146441"],"related_works":[],"abstract_inverted_index":{"Multimodal":[0],"Large":[1],"Language":[2],"Models":[3],"(MLLMs)":[4],"have":[5],"demonstrated":[6],"remarkable":[7],"multimodal":[8,86],"understanding":[9,82],"capabilities":[10],"in":[11,61,128],"Visual":[12],"Question":[13],"Answering":[14],"(VQA)":[15],"tasks":[16],"by":[17],"integrating":[18],"visual":[19],"and":[20,44,66],"textual":[21],"features.":[22],"However,":[23],"under":[24],"the":[25,114],"challenging":[26],"ten-choice":[27],"question":[28],"evaluation":[29],"paradigm,":[30],"existing":[31],"methods":[32],"still":[33],"exhibit":[34],"significant":[35],"limitations":[36],"when":[37],"processing":[38],"PDF":[39,80],"documents":[40],"with":[41,90],"complex":[42,121],"layouts":[43],"lengthy":[45],"content.":[46],"Notably,":[47],"current":[48],"mainstream":[49],"models":[50],"suffer":[51],"from":[52],"a":[53,77,97],"strong":[54],"bias":[55],"toward":[56],"English":[57],"training":[58],"data,":[59],"resulting":[60],"suboptimal":[62],"performance":[63],"for":[64,120],"Japanese":[65,79],"other":[67],"language":[68],"scenarios.":[69,131],"To":[70],"address":[71],"these":[72],"challenges,":[73],"this":[74],"paper":[75],"proposes":[76],"novel":[78],"document":[81],"framework":[83,109],"that":[84,107],"combines":[85],"hierarchical":[87],"reasoning":[88],"mechanisms":[89],"Colqwen-optimized":[91],"retrieval":[92],"methods,":[93],"while":[94],"innovatively":[95],"introducing":[96],"semantic":[98,117],"verification":[99],"strategy":[100],"through":[101],"sub-question":[102],"decomposition.":[103],"Experimental":[104],"results":[105],"demonstrate":[106],"our":[108],"not":[110],"only":[111],"significantly":[112],"enhances":[113],"model's":[115],"deep":[116],"parsing":[118],"capability":[119],"documents,":[122],"but":[123],"also":[124],"exhibits":[125],"superior":[126],"robustness":[127],"practical":[129],"application":[130]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-25T00:00:00"}
