{"id":"https://openalex.org/W4393924464","doi":"https://doi.org/10.48550/arxiv.2404.01258","title":"Direct Preference Optimization of Video Large Multimodal Models from Language Model Reward","display_name":"Direct Preference Optimization of Video Large Multimodal Models from Language Model Reward","publication_year":2024,"publication_date":"2024-04-01","ids":{"openalex":"https://openalex.org/W4393924464","doi":"https://doi.org/10.48550/arxiv.2404.01258"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2404.01258","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2404.01258","pdf_url":"https://arxiv.org/pdf/2404.01258","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2404.01258","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5063383309","display_name":"Ruohong Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhang, Ruohong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037727565","display_name":"Liangke Gui","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gui, Liangke","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5020049212","display_name":"Zhiqing Sun","orcid":"https://orcid.org/0000-0003-1933-496X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Zhiqing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111345651","display_name":"Yihao Feng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Feng, Yihao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026735760","display_name":"Keyang Xu","orcid":"https://orcid.org/0000-0002-7049-2528"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Keyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104270575","display_name":"Yuanhan Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yuanhan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103207941","display_name":"Di Fu","orcid":"https://orcid.org/0000-0002-5385-2982"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fu, Di","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107893340","display_name":"Chunyuan Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Chunyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107836252","display_name":"Alexander G. Hauptmann","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hauptmann, Alexander","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041302228","display_name":"Yonatan Bisk","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bisk, Yonatan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5104173743","display_name":"Yiming Yang","orcid":"https://orcid.org/0009-0005-9750-8920"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Yiming","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":["https://openalex.org/A5063383309"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13676","display_name":"Educational and Technological Research","score":0.8148000240325928,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13676","display_name":"Educational and Technological Research","score":0.8148000240325928,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13910","display_name":"Computational and Text Analysis Methods","score":0.7893999814987183,"subfield":{"id":"https://openalex.org/subfields/3300","display_name":"General Social Sciences"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7547000050544739,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/preference","display_name":"Preference","score":0.7652462720870972},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.531848669052124},{"id":"https://openalex.org/keywords/cognitive-psychology","display_name":"Cognitive psychology","score":0.4124263823032379},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.38297224044799805},{"id":"https://openalex.org/keywords/microeconomics","display_name":"Microeconomics","score":0.15699300169944763},{"id":"https://openalex.org/keywords/economics","display_name":"Economics","score":0.13712483644485474}],"concepts":[{"id":"https://openalex.org/C2781249084","wikidata":"https://www.wikidata.org/wiki/Q908656","display_name":"Preference","level":2,"score":0.7652462720870972},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.531848669052124},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.4124263823032379},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.38297224044799805},{"id":"https://openalex.org/C175444787","wikidata":"https://www.wikidata.org/wiki/Q39072","display_name":"Microeconomics","level":1,"score":0.15699300169944763},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.13712483644485474}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2404.01258","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2404.01258","pdf_url":"https://arxiv.org/pdf/2404.01258","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2404.01258","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2404.01258","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2404.01258","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2404.01258","pdf_url":"https://arxiv.org/pdf/2404.01258","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4393924464.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W2382290278","https://openalex.org/W2478288626","https://openalex.org/W4391913857","https://openalex.org/W2350741829","https://openalex.org/W2530322880"],"abstract_inverted_index":{"Preference":[0],"modeling":[1],"techniques,":[2],"such":[3],"as":[4,52,90,103,129],"direct":[5],"preference":[6,57],"optimization":[7],"(DPO),":[8],"has":[9,74],"shown":[10],"effective":[11],"in":[12,23,35],"enhancing":[13],"the":[14,65,143],"generalization":[15],"abilities":[16],"of":[17,67,93,145],"large":[18,47,48],"language":[19,97],"model":[20],"(LLM).":[21],"However,":[22],"tasks":[24],"involving":[25],"video":[26,88,94,108,127,146,149],"instruction-following,":[27],"providing":[28],"informative":[29],"feedback,":[30],"especially":[31],"for":[32,106],"detecting":[33],"hallucinations":[34],"generated":[36,68],"responses,":[37],"remains":[38],"a":[39,82,91],"significant":[40],"challenge.":[41],"Previous":[42],"studies":[43],"have":[44],"explored":[45],"using":[46],"multimodal":[49],"models":[50,54,98],"(LMMs)":[51],"reward":[53,122,138],"to":[55,62,71,99],"guide":[56],"modeling,":[58],"but":[59],"their":[60],"ability":[61],"accurately":[63],"assess":[64],"factuality":[66],"responses":[69],"compared":[70],"corresponding":[72],"videos":[73],"not":[75],"been":[76],"conclusively":[77],"established.":[78],"This":[79],"paper":[80],"introduces":[81],"novel":[83],"framework":[84],"that":[85,134],"utilizes":[86],"detailed":[87],"captions":[89],"proxy":[92],"content,":[95],"enabling":[96],"incorporate":[100],"this":[101,136],"information":[102],"supporting":[104],"evidence":[105],"scoring":[107],"Question":[109],"Answering":[110],"(QA)":[111],"predictions.":[112],"Our":[113],"approach":[114],"demonstrates":[115],"robust":[116],"alignment":[117],"with":[118],"OpenAI":[119],"GPT-4V":[120],"model's":[121],"mechanism,":[123],"which":[124],"directly":[125],"takes":[126],"frames":[128],"input.":[130],"Furthermore,":[131],"we":[132],"show":[133],"applying":[135],"tailored":[137],"through":[139],"DPO":[140],"significantly":[141],"improves":[142],"performance":[144],"LMMs":[147],"on":[148],"QA":[150],"tasks.":[151]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-10T16:38:18.471706","created_date":"2025-10-10T00:00:00"}
