{"id":"https://openalex.org/W4389820685","doi":"https://doi.org/10.48550/arxiv.2312.08870","title":"Vista-LLaMA: Reducing Hallucination in Video Language Models via Equal Distance to Visual Tokens","display_name":"Vista-LLaMA: Reducing Hallucination in Video Language Models via Equal Distance to Visual Tokens","publication_year":2023,"publication_date":"2023-12-12","ids":{"openalex":"https://openalex.org/W4389820685","doi":"https://doi.org/10.48550/arxiv.2312.08870"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2312.08870","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.08870","pdf_url":"https://arxiv.org/pdf/2312.08870","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2312.08870","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5017393243","display_name":"Fan Ma","orcid":"https://orcid.org/0000-0002-4131-1222"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ma, Fan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101828825","display_name":"Xiaojie Jin","orcid":"https://orcid.org/0000-0003-2789-0923"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin, Xiaojie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100453993","display_name":"Heng Wang","orcid":"https://orcid.org/0009-0009-5473-5751"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Heng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062938761","display_name":"Yuchen Xian","orcid":"https://orcid.org/0000-0001-5814-1883"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xian, Yuchen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100668696","display_name":"Jiashi Feng","orcid":"https://orcid.org/0000-0001-6843-0064"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Jiashi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100609592","display_name":"Yi Yang","orcid":"https://orcid.org/0009-0000-2822-6130"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Yi","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5017393243"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9873999953269958,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9811999797821045,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7929894924163818},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.5318671464920044},{"id":"https://openalex.org/keywords/shot","display_name":"Shot (pellet)","score":0.5301573872566223},{"id":"https://openalex.org/keywords/encoding","display_name":"Encoding (memory)","score":0.5289967656135559},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.4736160933971405},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.45620062947273254},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4445779621601105},{"id":"https://openalex.org/keywords/position","display_name":"Position (finance)","score":0.4113565683364868},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.38324180245399475},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.32023125886917114}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7929894924163818},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.5318671464920044},{"id":"https://openalex.org/C2778344882","wikidata":"https://www.wikidata.org/wiki/Q278938","display_name":"Shot (pellet)","level":2,"score":0.5301573872566223},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.5289967656135559},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.4736160933971405},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45620062947273254},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4445779621601105},{"id":"https://openalex.org/C198082294","wikidata":"https://www.wikidata.org/wiki/Q3399648","display_name":"Position (finance)","level":2,"score":0.4113565683364868},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.38324180245399475},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.32023125886917114},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C10138342","wikidata":"https://www.wikidata.org/wiki/Q43015","display_name":"Finance","level":1,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C178790620","wikidata":"https://www.wikidata.org/wiki/Q11351","display_name":"Organic chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2312.08870","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.08870","pdf_url":"https://arxiv.org/pdf/2312.08870","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2312.08870","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2312.08870","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2312.08870","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2312.08870","pdf_url":"https://arxiv.org/pdf/2312.08870","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","display_name":"Quality Education","score":0.7300000190734863}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2074502265","https://openalex.org/W4214877189","https://openalex.org/W2773965352","https://openalex.org/W2381179799","https://openalex.org/W2980279061","https://openalex.org/W2334685461","https://openalex.org/W4388335561","https://openalex.org/W2970530566","https://openalex.org/W4288261899","https://openalex.org/W2366718574"],"abstract_inverted_index":{"Recent":[0],"advances":[1],"in":[2,10],"large":[3,23],"video-language":[4],"models":[5,25],"have":[6],"displayed":[7],"promising":[8],"outcomes":[9],"video":[11,17,56,147,160,210],"comprehension.":[12],"Current":[13],"approaches":[14],"straightforwardly":[15],"convert":[16],"into":[18,162],"language":[19,24,79,165],"tokens":[20,76,116,163,190],"and":[21,51,77,98,107,129,224],"employ":[22],"for":[26,105],"multi-modal":[27],"tasks.":[28],"However,":[29],"this":[30,60],"method":[31],"often":[32],"leads":[33],"to":[34,145,191],"the":[35,45,48,52,55,70,83,102,112,122,138,146,158,168,171,179,183,193,221,227],"generation":[36],"of":[37,47,54,82,114,140,164,170,218],"irrelevant":[38,142],"content,":[39],"commonly":[40],"known":[41],"as":[42,44],"\"hallucination\",":[43],"length":[46],"text":[49,85,99,106,108,118,130,143],"increases":[50],"impact":[53],"diminishes.":[57],"To":[58],"address":[59],"problem,":[61],"we":[62,150],"propose":[63],"Vista-LLaMA,":[64],"a":[65,152,231],"novel":[66],"framework":[67],"that":[68,156],"maintains":[69],"consistent":[71],"distance":[72,124],"between":[73,96,127],"all":[74],"visual":[75,97,115,128,154,189],"any":[78],"tokens,":[80,100],"irrespective":[81],"generated":[84],"length.":[86],"Vista-LLaMA":[87],"omits":[88],"relative":[89,123],"position":[90,103],"encoding":[91,104],"when":[92,121],"determining":[93],"attention":[94,134],"weights":[95],"retaining":[101],"tokens.":[109,131],"This":[110,174,235],"amplifies":[111],"effect":[113],"on":[117,206,220,226],"generation,":[119],"especially":[120],"is":[125,237],"longer":[126],"The":[132],"proposed":[133],"mechanism":[135],"significantly":[136,198],"reduces":[137],"chance":[139],"producing":[141],"related":[144],"content.":[148],"Furthermore,":[149],"present":[151],"sequential":[153],"projector":[155],"projects":[157],"current":[159],"frame":[161],"space":[166],"with":[167],"assistance":[169],"previous":[172,201],"frame.":[173],"approach":[175,197],"not":[176],"only":[177],"captures":[178],"temporal":[180],"relationship":[181],"within":[182],"video,":[184],"but":[185],"also":[186],"allows":[187],"less":[188],"encompass":[192],"entire":[194],"video.":[195],"Our":[196],"outperforms":[199],"various":[200],"methods":[202],"(e.g.,":[203],"Video-ChatGPT,":[204],"MovieChat)":[205],"four":[207],"challenging":[208],"open-ended":[209],"question":[211],"answering":[212],"benchmarks.":[213],"We":[214],"reach":[215],"an":[216],"accuracy":[217],"60.7":[219],"zero-shot":[222,228],"NExT-QA":[223],"60.5":[225],"MSRVTT-QA,":[229],"setting":[230],"new":[232],"state-of-the-art":[233],"performance.":[234],"project":[236],"available":[238],"at":[239],"https://jinxxian.github.io/Vista-LLaMA.":[240]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":2},{"year":2022,"cited_by_count":1}],"updated_date":"2026-02-09T09:26:11.010843","created_date":"2023-12-16T00:00:00"}
