{"id":"https://openalex.org/W4415708422","doi":"https://doi.org/10.1109/icme59968.2025.11209977","title":"Mitigating Hallucination in Large Video-Language Models with Injected Semantics","display_name":"Mitigating Hallucination in Large Video-Language Models with Injected Semantics","publication_year":2025,"publication_date":"2025-06-30","ids":{"openalex":"https://openalex.org/W4415708422","doi":"https://doi.org/10.1109/icme59968.2025.11209977"},"language":null,"primary_location":{"id":"doi:10.1109/icme59968.2025.11209977","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11209977","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101160189","display_name":"Bimei Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I159948400","display_name":"Jinan University","ror":"https://ror.org/02xe5ns62","country_code":"CN","type":"education","lineage":["https://openalex.org/I159948400"]},{"id":"https://openalex.org/I34949971","display_name":"University of Jinan","ror":"https://ror.org/02mjz6f26","country_code":"CN","type":"education","lineage":["https://openalex.org/I34949971"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Bimei Wang","raw_affiliation_strings":["Jinan University,College of Cyber Security,China"],"affiliations":[{"raw_affiliation_string":"Jinan University,College of Cyber Security,China","institution_ids":["https://openalex.org/I34949971","https://openalex.org/I159948400"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101945594","display_name":"Wen Fan","orcid":"https://orcid.org/0000-0001-5224-6895"},"institutions":[{"id":"https://openalex.org/I76214153","display_name":"Lanzhou University","ror":"https://ror.org/01mkqqe32","country_code":"CN","type":"education","lineage":["https://openalex.org/I76214153"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fan Wen","raw_affiliation_strings":["Lanzhou University,School of Information Science &amp; Engineering,China"],"affiliations":[{"raw_affiliation_string":"Lanzhou University,School of Information Science &amp; Engineering,China","institution_ids":["https://openalex.org/I76214153"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109706781","display_name":"Jisheng Dang","orcid":"https://orcid.org/0000-0002-5513-911X"},"institutions":[{"id":"https://openalex.org/I76214153","display_name":"Lanzhou University","ror":"https://ror.org/01mkqqe32","country_code":"CN","type":"education","lineage":["https://openalex.org/I76214153"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jisheng Dang","raw_affiliation_strings":["Lanzhou University,School of Information Science &amp; Engineering,China"],"affiliations":[{"raw_affiliation_string":"Lanzhou University,School of Information Science &amp; Engineering,China","institution_ids":["https://openalex.org/I76214153"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045716717","display_name":"Huiguo He","orcid":"https://orcid.org/0000-0003-1419-059X"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Huiguo He","raw_affiliation_strings":["Sun Yat-sen University,School of Computer Science and Engineering,China"],"affiliations":[{"raw_affiliation_string":"Sun Yat-sen University,School of Computer Science and Engineering,China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100634075","display_name":"Xiwen Wang","orcid":"https://orcid.org/0000-0002-9153-6569"},"institutions":[{"id":"https://openalex.org/I68986083","display_name":"Northwest Normal University","ror":"https://ror.org/00gx3j908","country_code":"CN","type":"education","lineage":["https://openalex.org/I68986083"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiwen Wang","raw_affiliation_strings":["Northwest Normal University,School of Computer Science and Engineering,China"],"affiliations":[{"raw_affiliation_string":"Northwest Normal University,School of Computer Science and Engineering,China","institution_ids":["https://openalex.org/I68986083"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101581501","display_name":"Nannan Zhu","orcid":"https://orcid.org/0000-0003-4038-3053"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Nannan Zhu","raw_affiliation_strings":["Sun Yat-sen University,School of System Science and Engineering,China"],"affiliations":[{"raw_affiliation_string":"Sun Yat-sen University,School of System Science and Engineering,China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5025556550","display_name":"Jiasi Weng","orcid":"https://orcid.org/0000-0002-5876-7875"},"institutions":[{"id":"https://openalex.org/I159948400","display_name":"Jinan University","ror":"https://ror.org/02xe5ns62","country_code":"CN","type":"education","lineage":["https://openalex.org/I159948400"]},{"id":"https://openalex.org/I34949971","display_name":"University of Jinan","ror":"https://ror.org/02mjz6f26","country_code":"CN","type":"education","lineage":["https://openalex.org/I34949971"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiasi Weng","raw_affiliation_strings":["Jinan University,College of Cyber Security,China"],"affiliations":[{"raw_affiliation_string":"Jinan University,College of Cyber Security,China","institution_ids":["https://openalex.org/I34949971","https://openalex.org/I159948400"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5101160189"],"corresponding_institution_ids":["https://openalex.org/I159948400","https://openalex.org/I34949971"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.33839466,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5049999952316284,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.5049999952316284,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.15479999780654907,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.10490000247955322,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.6833000183105469},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.6798999905586243},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.6765999794006348},{"id":"https://openalex.org/keywords/task","display_name":"Task (project management)","score":0.5896999835968018},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.5508999824523926},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.5357999801635742},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.43959999084472656},{"id":"https://openalex.org/keywords/semantic-feature","display_name":"Semantic feature","score":0.42179998755455017}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8363000154495239},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.6833000183105469},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.6798999905586243},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.6765999794006348},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.5896999835968018},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5813000202178955},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5763000249862671},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.5508999824523926},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.5357999801635742},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.43959999084472656},{"id":"https://openalex.org/C2781122975","wikidata":"https://www.wikidata.org/wiki/Q16928266","display_name":"Semantic feature","level":2,"score":0.42179998755455017},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.40130001306533813},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.38519999384880066},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.3531999886035919},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.3504999876022339},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.3018999993801117},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.2973000109195709},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.2851000130176544},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.27950000762939453},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.27459999918937683},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.26030001044273376},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.26010000705718994},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.25589999556541443},{"id":"https://openalex.org/C59404180","wikidata":"https://www.wikidata.org/wiki/Q17013334","display_name":"Feature learning","level":2,"score":0.2508000135421753}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icme59968.2025.11209977","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icme59968.2025.11209977","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":22,"referenced_works":["https://openalex.org/W2981851019","https://openalex.org/W3109729331","https://openalex.org/W3161082133","https://openalex.org/W3175859344","https://openalex.org/W4205277845","https://openalex.org/W4285108184","https://openalex.org/W4312480274","https://openalex.org/W4313038481","https://openalex.org/W4353015365","https://openalex.org/W4377235190","https://openalex.org/W4383899959","https://openalex.org/W4390017901","https://openalex.org/W4391547535","https://openalex.org/W4392796587","https://openalex.org/W4393149524","https://openalex.org/W4400113208","https://openalex.org/W4400579078","https://openalex.org/W4401990337","https://openalex.org/W4402671548","https://openalex.org/W4402780269","https://openalex.org/W4403725941","https://openalex.org/W4412158322"],"related_works":[],"abstract_inverted_index":{"Vision-Language":[0],"Models":[1],"(VLMs)":[2],"have":[3],"demonstrated":[4],"remarkable":[5],"performance":[6,148],"across":[7,149],"various":[8],"tasks":[9],"by":[10,24],"encoding":[11],"visual":[12,66],"frames":[13],"into":[14],"tokens":[15],"analogous":[16],"to":[17,63,116],"textual":[18,69],"tokens,":[19],"which":[20,93],"are":[21,53],"then":[22],"processed":[23],"a":[25,41,84,113],"Large":[26],"Language":[27],"Model":[28],"(LLM)":[29],"for":[30,47],"task":[31],"execution.":[32],"To":[33,78],"manage":[34],"computational":[35],"demands,":[36],"current":[37],"methods":[38,52],"often":[39],"employ":[40],"token":[42],"compressor,":[43],"such":[44],"as":[45],"Q-former,":[46],"efficient":[48],"inference.":[49],"However,":[50],"these":[51],"typically":[54],"trained":[55,167],"on":[56],"video-to-text":[57],"generation":[58],"loss,":[59],"lacking":[60],"sufficient":[61],"supervision":[62],"align":[64],"intermediate":[65,101,122],"representations":[67,98],"with":[68],"semantics,":[70],"resulting":[71],"in":[72,120,127],"hallucinations":[73,126,138],"when":[74],"identifying":[75],"essential":[76],"objects.":[77],"address":[79],"this":[80],"issue,":[81],"we":[82,111],"propose":[83],"novel":[85],"visual-textual":[86],"alignment":[87,119],"framework,":[88],"Semantic":[89],"Supervision":[90],"LLM":[91],"(SS-LLM),":[92],"aligns":[94],"video":[95],"and":[96,155,166],"text":[97],"within":[99],"the":[100,106,121],"feature":[102,123],"space,":[103,124],"thereby":[104],"enhancing":[105],"LLM\u2019s":[107],"decoding":[108],"process.":[109],"Additionally,":[110],"introduce":[112],"CLIP":[114],"Loss":[115],"facilitate":[117],"visual-text":[118],"reducing":[125],"VLMs.":[128],"Extensive":[129],"experiments":[130],"demonstrate":[131],"that":[132],"our":[133,163],"approach":[134],"not":[135],"only":[136],"mitigates":[137],"more":[139,153],"effectively":[140],"than":[141],"existing":[142],"models":[143,168],"but":[144],"also":[145],"achieves":[146],"state-of-the-art":[147],"several":[150],"benchmarks,":[151],"providing":[152],"accurate":[154],"semantically":[156],"consistent":[157],"video-text":[158],"representations.":[159],"We":[160],"will":[161],"make":[162],"source":[164],"code":[165],"publicly":[169],"available.":[170]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-30T00:00:00"}
