{"id":"https://openalex.org/W4405057408","doi":"https://doi.org/10.48550/arxiv.2412.02617","title":"Improving Dynamic Object Interactions in Text-to-Video Generation with AI Feedback","display_name":"Improving Dynamic Object Interactions in Text-to-Video Generation with AI Feedback","publication_year":2024,"publication_date":"2024-12-03","ids":{"openalex":"https://openalex.org/W4405057408","doi":"https://doi.org/10.48550/arxiv.2412.02617"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2412.02617","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.02617","pdf_url":"https://arxiv.org/pdf/2412.02617","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2412.02617","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5064007269","display_name":"Hiroki Furuta","orcid":"https://orcid.org/0009-0002-7209-810X"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Furuta, Hiroki","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003420204","display_name":"Heiga Zen","orcid":"https://orcid.org/0000-0002-8959-5471"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zen, Heiga","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010575626","display_name":"Dale Schuurmans","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schuurmans, Dale","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002971435","display_name":"Aleksandra Faust","orcid":"https://orcid.org/0000-0002-3268-8685"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Faust, Aleksandra","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5074059447","display_name":"Yutaka Matsuo","orcid":"https://orcid.org/0000-0002-2070-4393"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Matsuo, Yutaka","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025255782","display_name":"Percy Liang","orcid":"https://orcid.org/0000-0002-0458-6139"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liang, Percy","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5047689869","display_name":"Sherry Yang","orcid":"https://orcid.org/0009-0007-9684-0627"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Sherry","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5064007269"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.8791000247001648,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.8791000247001648,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.8705000281333923,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11574","display_name":"Artificial Intelligence in Games","score":0.8205000162124634,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7188824415206909},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.5596860647201538},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5051847100257874},{"id":"https://openalex.org/keywords/video-feedback","display_name":"Video feedback","score":0.4957081079483032},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.471962571144104},{"id":"https://openalex.org/keywords/multimedia","display_name":"Multimedia","score":0.4009479582309723},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3678373694419861},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.3400024473667145},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.3358960747718811},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.05768832564353943}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7188824415206909},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.5596860647201538},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5051847100257874},{"id":"https://openalex.org/C193081819","wikidata":"https://www.wikidata.org/wiki/Q4132092","display_name":"Video feedback","level":2,"score":0.4957081079483032},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.471962571144104},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.4009479582309723},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3678373694419861},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3400024473667145},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3358960747718811},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.05768832564353943},{"id":"https://openalex.org/C120665830","wikidata":"https://www.wikidata.org/wiki/Q14620","display_name":"Optics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2412.02617","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.02617","pdf_url":"https://arxiv.org/pdf/2412.02617","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2412.02617","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2412.02617","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2412.02617","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.02617","pdf_url":"https://arxiv.org/pdf/2412.02617","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1069223013","display_name":null,"funder_award_id":"JSPS KAKENHI","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G3459562248","display_name":null,"funder_award_id":"Grant","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G4636223006","display_name":null,"funder_award_id":"JSPS KAK","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G8430481527","display_name":null,"funder_award_id":"Number","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"}],"funders":[{"id":"https://openalex.org/F4320334764","display_name":"Japan Society for the Promotion of Science","ror":"https://ror.org/00hhkn466"}],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4405057408.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2737719445","https://openalex.org/W4239098401","https://openalex.org/W2898210368","https://openalex.org/W2382480268","https://openalex.org/W4288716084","https://openalex.org/W4387819954","https://openalex.org/W3003671251","https://openalex.org/W2531932609","https://openalex.org/W1976518449","https://openalex.org/W2732837990"],"abstract_inverted_index":{"Large":[0],"text-to-video":[1,81,123],"models":[2,15,40,188],"hold":[3],"immense":[4],"potential":[5],"for":[6,118,227],"a":[7,87,114,145,155,210],"wide":[8,211],"range":[9],"of":[10,31,73,92,122,157,177,213,263],"downstream":[11],"applications.":[12],"However,":[13],"these":[14],"struggle":[16],"to":[17,42,56,75,85,153,172,189,196],"accurately":[18],"depict":[19],"dynamic":[20,228],"object":[21,78,108,197],"interactions,":[22,229],"often":[23,170],"resulting":[24],"in":[25,80,131,199,224,252],"unrealistic":[26],"movements":[27],"and":[28,106,137,235,260],"frequent":[29],"violations":[30],"real-world":[32],"physics.":[33],"One":[34],"solution":[35],"inspired":[36],"by":[37,112,232],"large":[38],"language":[39],"is":[41],"align":[43,173],"generated":[44],"outputs":[45],"with":[46,95,174,215],"desired":[47],"outcomes":[48],"using":[49,244],"external":[50],"feedback.":[51],"This":[52,125],"enables":[53],"the":[54,71,77,220],"model":[55],"refine":[57],"its":[58],"responses":[59],"autonomously,":[60],"eliminating":[61],"extensive":[62],"manual":[63],"data":[64],"collection.":[65],"In":[66],"this":[67,182],"work,":[68],"we":[69,184,239],"investigate":[70],"use":[72,150],"feedback":[74,193,218],"enhance":[76],"dynamics":[79,198],"models.":[82,124],"We":[83,110,148],"aim":[84],"answer":[86],"critical":[88],"question:":[89],"what":[90],"types":[91],"feedback,":[93,250],"paired":[94],"which":[96],"specific":[97,142],"self-improvement":[98],"algorithms,":[99],"can":[100,207],"most":[101,221],"effectively":[102,208],"improve":[103],"text-video":[104,158],"alignment":[105,159],"realistic":[107,261],"interactions?":[109],"begin":[111],"deriving":[113],"unified":[115,146],"probabilistic":[116],"objective":[117],"offline":[119],"RL":[120],"finetuning":[121],"perspective":[126],"highlights":[127],"how":[128],"design":[129],"elements":[130],"existing":[132],"algorithms":[133],"like":[134],"KL":[135],"regularization":[136],"policy":[138],"projection":[139],"emerge":[140],"as":[141,230],"choices":[143],"within":[144],"framework.":[147],"then":[149],"derived":[151,247],"methods":[152],"optimize":[154,209],"set":[156],"metrics":[160],"(e.g.,":[161],"CLIP":[162],"scores,":[163],"optical":[164],"flow),":[165],"but":[166],"notice":[167],"that":[168,204],"they":[169],"fail":[171],"human":[175,236],"perceptions":[176],"generation":[178],"quality.":[179],"To":[180],"address":[181],"limitation,":[183],"propose":[185],"leveraging":[186],"vision-language":[187],"provide":[190],"more":[191],"nuanced":[192],"specifically":[194],"tailored":[195],"videos.":[200],"Our":[201],"experiments":[202],"demonstrate":[203],"our":[205],"method":[206],"variety":[212],"rewards,":[214],"binary":[216],"AI":[217,234,249],"driving":[219],"significant":[222],"improvements":[223],"video":[225],"quality":[226],"confirmed":[231],"both":[233],"evaluations.":[237],"Notably,":[238],"observe":[240],"substantial":[241],"gains":[242],"when":[243],"reward":[245],"signals":[246],"from":[248],"particularly":[251],"scenarios":[253],"involving":[254],"complex":[255],"interactions":[256],"between":[257],"multiple":[258],"objects":[259,264],"depictions":[262],"falling.":[265]},"counts_by_year":[],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
