{"id":"https://openalex.org/W4409093780","doi":"https://doi.org/10.1109/iccv51701.2025.01451","title":"VPO: Aligning Text-to-Video Generation Models with Prompt Optimization","display_name":"VPO: Aligning Text-to-Video Generation Models with Prompt Optimization","publication_year":2025,"publication_date":"2025-10-19","ids":{"openalex":"https://openalex.org/W4409093780","doi":"https://doi.org/10.1109/iccv51701.2025.01451"},"language":"en","primary_location":{"id":"doi:10.1109/iccv51701.2025.01451","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.01451","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2503.20491","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5002409683","display_name":"Jiale Cheng","orcid":"https://orcid.org/0000-0002-0032-7455"},"institutions":[{"id":"https://openalex.org/I4210164862","display_name":"Artificial Intelligence in Medicine (Canada)","ror":"https://ror.org/05p590m36","country_code":"CA","type":"company","lineage":["https://openalex.org/I4210164862"]}],"countries":["CA"],"is_corresponding":true,"raw_author_name":"Jiale Cheng","raw_affiliation_strings":["The Conversational Artificial Intelligence (CoAI) Group, Tsinghua University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The Conversational Artificial Intelligence (CoAI) Group, Tsinghua University","institution_ids":["https://openalex.org/I4210164862"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5116879922","display_name":"Ruiliang Lyu","orcid":null},"institutions":[{"id":"https://openalex.org/I4401726915","display_name":"Zhipu AI (China)","ror":"https://ror.org/005dzvw93","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726915"]}],"countries":[],"is_corresponding":false,"raw_author_name":"Ruiliang Lyu","raw_affiliation_strings":["Zhipu AI"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Zhipu AI","institution_ids":["https://openalex.org/I4401726915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102325476","display_name":"Xiaotao Gu","orcid":null},"institutions":[{"id":"https://openalex.org/I4401726915","display_name":"Zhipu AI (China)","ror":"https://ror.org/005dzvw93","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726915"]}],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaotao Gu","raw_affiliation_strings":["Zhipu AI"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Zhipu AI","institution_ids":["https://openalex.org/I4401726915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115596171","display_name":"Xiao Liu","orcid":"https://orcid.org/0000-0002-8029-0217"},"institutions":[{"id":"https://openalex.org/I4401726915","display_name":"Zhipu AI (China)","ror":"https://ror.org/005dzvw93","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726915"]}],"countries":[],"is_corresponding":false,"raw_author_name":"Xiao Liu","raw_affiliation_strings":["Zhipu AI"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Zhipu AI","institution_ids":["https://openalex.org/I4401726915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042981778","display_name":"Jiazheng Xu","orcid":null},"institutions":[{"id":"https://openalex.org/I4401726915","display_name":"Zhipu AI (China)","ror":"https://ror.org/005dzvw93","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726915"]}],"countries":[],"is_corresponding":false,"raw_author_name":"Jiazheng Xu","raw_affiliation_strings":["Zhipu AI"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Zhipu AI","institution_ids":["https://openalex.org/I4401726915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5063866219","display_name":"Yong\u2010Jie Lu","orcid":"https://orcid.org/0000-0001-6174-6621"},"institutions":[{"id":"https://openalex.org/I4210164862","display_name":"Artificial Intelligence in Medicine (Canada)","ror":"https://ror.org/05p590m36","country_code":"CA","type":"company","lineage":["https://openalex.org/I4210164862"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Yida Lu","raw_affiliation_strings":["The Conversational Artificial Intelligence (CoAI) Group, Tsinghua University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The Conversational Artificial Intelligence (CoAI) Group, Tsinghua University","institution_ids":["https://openalex.org/I4210164862"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010956440","display_name":"Jiayan Teng","orcid":null},"institutions":[{"id":"https://openalex.org/I4401726915","display_name":"Zhipu AI (China)","ror":"https://ror.org/005dzvw93","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726915"]}],"countries":[],"is_corresponding":false,"raw_author_name":"Jiayan Teng","raw_affiliation_strings":["Zhipu AI"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Zhipu AI","institution_ids":["https://openalex.org/I4401726915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023011245","display_name":"Zhuoyi Yang","orcid":"https://orcid.org/0009-0008-7909-4930"},"institutions":[{"id":"https://openalex.org/I4401726915","display_name":"Zhipu AI (China)","ror":"https://ror.org/005dzvw93","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726915"]}],"countries":[],"is_corresponding":false,"raw_author_name":"Zhuoyi Yang","raw_affiliation_strings":["Zhipu AI"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Zhipu AI","institution_ids":["https://openalex.org/I4401726915"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052284218","display_name":"Yuxiao Dong","orcid":"https://orcid.org/0000-0002-6092-2002"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuxiao Dong","raw_affiliation_strings":["The Knowledge Engineering Group (KEG), Tsinghua University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The Knowledge Engineering Group (KEG), Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052105494","display_name":"Jie Tang","orcid":"https://orcid.org/0000-0001-5986-3837"},"institutions":[{"id":"https://openalex.org/I99065089","display_name":"Tsinghua University","ror":"https://ror.org/03cve4549","country_code":"CN","type":"education","lineage":["https://openalex.org/I99065089"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jie Tang","raw_affiliation_strings":["The Knowledge Engineering Group (KEG), Tsinghua University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The Knowledge Engineering Group (KEG), Tsinghua University","institution_ids":["https://openalex.org/I99065089"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085094109","display_name":"Hongning Wang","orcid":"https://orcid.org/0000-0002-6524-9195"},"institutions":[{"id":"https://openalex.org/I4210164862","display_name":"Artificial Intelligence in Medicine (Canada)","ror":"https://ror.org/05p590m36","country_code":"CA","type":"company","lineage":["https://openalex.org/I4210164862"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Hongning Wang","raw_affiliation_strings":["The Conversational Artificial Intelligence (CoAI) Group, Tsinghua University"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"The Conversational Artificial Intelligence (CoAI) Group, Tsinghua University","institution_ids":["https://openalex.org/I4210164862"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5058504936","display_name":"Minlie Huang","orcid":null},"institutions":[{"id":"https://openalex.org/I4401726915","display_name":"Zhipu AI (China)","ror":"https://ror.org/005dzvw93","country_code":null,"type":"company","lineage":["https://openalex.org/I4401726915"]}],"countries":[],"is_corresponding":false,"raw_author_name":"Minlie Huang","raw_affiliation_strings":["Zhipu AI"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Zhipu AI","institution_ids":["https://openalex.org/I4401726915"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":12,"corresponding_author_ids":["https://openalex.org/A5002409683"],"corresponding_institution_ids":["https://openalex.org/I4210164862"],"apc_list":null,"apc_paid":null,"fwci":1.0652,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.75406152,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"15636","last_page":"15645"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9915000200271606,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9915000200271606,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.9897000193595886,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12720","display_name":"Multimedia Communication and Technology","score":0.97079998254776,"subfield":{"id":"https://openalex.org/subfields/3312","display_name":"Sociology and Political Science"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.4905399680137634}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4905399680137634}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1109/iccv51701.2025.01451","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iccv51701.2025.01451","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE/CVF International Conference on Computer Vision (ICCV)","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2503.20491","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.20491","pdf_url":"https://arxiv.org/pdf/2503.20491","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2503.20491","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2503.20491","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2503.20491","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2503.20491","pdf_url":"https://arxiv.org/pdf/2503.20491","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Video":[0],"generation":[1,206,223,233],"models":[2,11,55],"have":[3],"achieved":[4],"remarkable":[5],"progress":[6],"in":[7,230],"text-to-video":[8],"tasks.":[9],"These":[10],"are":[12,31,239],"typically":[13],"trained":[14],"on":[15,52,88,113,159,221],"text-video":[16],"pairs":[17],"with":[18,179,218],"highly":[19],"detailed":[20],"and":[21,119,134,151,163,170,192,215,237],"carefully":[22],"crafted":[23],"descriptions,":[24],"while":[25],"real-world":[26],"user":[27,71,126],"inputs":[28],"during":[29],"inference":[30],"often":[32,50],"concise,":[33],"vague,":[34],"or":[35,76],"poorly":[36],"structured.":[37],"This":[38],"gap":[39],"makes":[40],"prompt":[41],"optimization":[42,146],"crucial":[43],"for":[44],"generating":[45],"high-quality":[46],"videos.":[47,138],"Current":[48],"methods":[49,220],"rely":[51],"large":[53],"language":[54],"(LLMs)":[56],"to":[57,96,173,196],"refine":[58,152],"prompts":[59,83,111,123],"through":[60],"in-context":[61],"learning,":[62],"but":[63],"suffer":[64],"from":[65],"several":[66],"limitations:":[67],"they":[68,81],"may":[69],"distort":[70],"intent,":[72],"omit":[73],"critical":[74],"details,":[75],"introduce":[77,104,167],"safety":[78,133,162],"risks.":[79],"Moreover,":[80,199],"optimize":[82,175],"without":[84],"considering":[85],"the":[86,89,132,176,226],"impact":[87],"final":[90],"video":[91,193,205,222,232],"quality,":[92],"which":[93],"can":[94],"lead":[95],"suboptimal":[97],"results.":[98],"To":[99,139],"address":[100],"these":[101],"issues,":[102],"we":[103,149,166,209],"VPO,":[105],"a":[106,144,153],"principled":[107],"framework":[108],"that":[109,186,211],"optimizes":[110],"based":[112,158],"three":[114],"core":[115],"principles:":[116],"harmlessness,":[117],"accuracy,":[118],"helpfulness.":[120],"The":[121],"generated":[122,137],"faithfully":[124],"preserve":[125],"intents":[127],"and,":[128],"more":[129],"importantly,":[130],"enhance":[131],"quality":[135,194],"of":[136,161,228],"achieve":[140],"this,":[141],"VPO":[142,187,200,212,229],"employs":[143],"two-stage":[145],"approach.":[147],"First,":[148],"construct":[150],"supervised":[154],"fine-tuning":[155],"(SFT)":[156],"dataset":[157],"principles":[160],"alignment.":[164],"Second,":[165],"both":[168],"text-level":[169],"video-level":[171],"feedback":[172],"further":[174],"SFT":[177],"model":[178],"preference":[180],"learning.":[181],"Our":[182,235],"extensive":[183],"experiments":[184],"demonstrate":[185,210],"significantly":[188],"improves":[189],"safety,":[190],"alignment,":[191],"compared":[195],"baseline":[197],"methods.":[198],"shows":[201],"strong":[202],"generalization":[203],"across":[204],"models.":[207,234],"Furthermore,":[208],"could":[213],"outperform":[214],"be":[216],"combined":[217],"RLHF":[219],"models,":[224],"underscoring":[225],"effectiveness":[227],"aligning":[231],"code":[236],"data":[238],"publicly":[240],"available":[241],"at":[242],"https://github.com/thu-coai/VPO.":[243]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-05-06T06:03:25.996018","created_date":"2025-10-10T00:00:00"}
