{"id":"https://openalex.org/W7127304069","doi":"https://doi.org/10.48550/arxiv.2602.00559","title":"Learning to Decode Against Compositional Hallucination in Video Multimodal Large Language Models","display_name":"Learning to Decode Against Compositional Hallucination in Video Multimodal Large Language Models","publication_year":2026,"publication_date":"2026-01-31","ids":{"openalex":"https://openalex.org/W7127304069","doi":"https://doi.org/10.48550/arxiv.2602.00559"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.00559","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124954185","display_name":"Wenbin Xing","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Xing, Wenbin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124897992","display_name":"Quanxing Zha","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zha, Quanxing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014496575","display_name":"Lizheng Zu","orcid":"https://orcid.org/0000-0001-9465-5064"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zu, Lizheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124968897","display_name":"Mengran Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Mengran","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124878148","display_name":"Ming Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Ming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5123563796","display_name":"Junchi Yan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yan, Junchi","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5124954185"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.3587000072002411,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.3587000072002411,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12357","display_name":"Digital Media Forensic Detection","score":0.06920000165700912,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.051500000059604645,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5218999981880188},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.49309998750686646},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.43459999561309814},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.4120999872684479},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.3880000114440918},{"id":"https://openalex.org/keywords/adversarial-system","display_name":"Adversarial system","score":0.33570000529289246}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7815999984741211},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5218999981880188},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5001999735832214},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.49309998750686646},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.43459999561309814},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.4120999872684479},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3880000114440918},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3481000065803528},{"id":"https://openalex.org/C37736160","wikidata":"https://www.wikidata.org/wiki/Q1801315","display_name":"Adversarial system","level":2,"score":0.33570000529289246},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.3068999946117401},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.30660000443458557},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.30329999327659607},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.29499998688697815},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.2721000015735626},{"id":"https://openalex.org/C147764199","wikidata":"https://www.wikidata.org/wiki/Q6865248","display_name":"Minification","level":2,"score":0.2644999921321869},{"id":"https://openalex.org/C103088060","wikidata":"https://www.wikidata.org/wiki/Q1062839","display_name":"Error detection and correction","level":2,"score":0.26089999079704285},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.2572000026702881},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.25099998712539673}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.00559","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.00559","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.00559","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.00559","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[{"display_name":"Peace, Justice and strong institutions","id":"https://metadata.un.org/sdg/16","score":0.8074773550033569}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Current":[0],"research":[1],"on":[2,8],"video":[3,43,52,125],"hallucination":[4,58,151],"mitigation":[5],"primarily":[6],"focuses":[7],"isolated":[9,38],"error":[10],"types,":[11],"leaving":[12],"compositional":[13,40,150],"hallucinations,":[14],"arising":[15],"from":[16],"incorrect":[17],"reasoning":[18],"over":[19,171],"multiple":[20],"interacting":[21],"spatial":[22],"and":[23,39,60,74,96,175],"temporal":[24],"factors":[25],"largely":[26],"underexplored.":[27],"We":[28,102],"introduce":[29],"OmniVCHall,":[30],"a":[31,55,62,105,110,128],"benchmark":[32],"designed":[33],"to":[34,79,122,145],"systematically":[35],"evaluate":[36],"both":[37],"hallucinations":[41],"in":[42],"multimodal":[44],"large":[45],"language":[46],"models":[47,93],"(VLLMs).":[48],"OmniVCHall":[49],"spans":[50],"diverse":[51],"domains,":[53],"introduces":[54],"novel":[56],"camera-based":[57],"type,":[59],"defines":[61],"fine-grained":[63],"taxonomy,":[64],"together":[65],"with":[66,109],"adversarial":[67],"answer":[68],"options":[69],"(e.g.,":[70,94],"\"All":[71],"are":[72,140],"correct\"":[73],"\"None":[75],"of":[76,85,170],"the":[77],"above\")":[78],"prevent":[80],"shortcut":[81],"reasoning.":[82],"The":[83,173],"evaluations":[84],"39":[86],"representative":[87,163],"VLLMs":[88],"reveal":[89],"that":[90,156],"even":[91],"advanced":[92],"Qwen3-VL":[95],"GPT-5)":[97],"exhibit":[98],"substantial":[99],"performance":[100,160],"degradation.":[101],"propose":[103],"TriCD,":[104],"contrastive":[106],"decoding":[107],"framework":[108],"triple-pathway":[111],"calibration":[112],"mechanism.":[113],"An":[114],"adaptive":[115],"perturbation":[116],"controller":[117],"dynamically":[118],"selects":[119],"distracting":[120],"operations":[121],"construct":[123],"negative":[124],"variants,":[126],"while":[127],"saliency-guided":[129],"enhancement":[130],"module":[131],"adaptively":[132],"reinforces":[133],"grounded":[134],"token-wise":[135],"visual":[136],"evidences.":[137],"These":[138],"components":[139],"optimized":[141],"via":[142],"reinforcement":[143],"learning":[144],"encourage":[146],"precise":[147],"decision-making":[148],"under":[149],"settings.":[152],"Experimental":[153],"results":[154],"show":[155],"TriCD":[157],"consistently":[158],"improves":[159],"across":[161],"two":[162],"backbones,":[164],"achieving":[165],"an":[166],"average":[167],"accuracy":[168],"improvement":[169],"10%.":[172],"data":[174],"code":[176],"can":[177],"be":[178],"find":[179],"at":[180],"https://github.com/BMRETURN/OmniVCHall.":[181]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-04T00:00:00"}
