{"id":"https://openalex.org/W4407091669","doi":"https://doi.org/10.48550/arxiv.2501.19252","title":"Inference-Time Text-to-Video Alignment with Diffusion Latent Beam Search","display_name":"Inference-Time Text-to-Video Alignment with Diffusion Latent Beam Search","publication_year":2025,"publication_date":"2025-01-31","ids":{"openalex":"https://openalex.org/W4407091669","doi":"https://doi.org/10.48550/arxiv.2501.19252"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2501.19252","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2501.19252","pdf_url":"https://arxiv.org/pdf/2501.19252","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2501.19252","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114135699","display_name":"Yuta Oshima","orcid":"https://orcid.org/0009-0006-6016-3866"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Oshima, Yuta","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038802330","display_name":"Masahiro Suzuki","orcid":"https://orcid.org/0000-0001-8519-5617"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Suzuki, Masahiro","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090592819","display_name":"Yutaka Matsuo","orcid":"https://orcid.org/0000-0001-9084-9670"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Matsuo, Yutaka","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5064007269","display_name":"Hiroki Furuta","orcid":"https://orcid.org/0009-0002-7209-810X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Furuta, Hiroki","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5114135699"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9929999709129333,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11439","display_name":"Video Analysis and Summarization","score":0.9929999709129333,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.9869999885559082,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9650999903678894,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.7491332292556763},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5321838855743408},{"id":"https://openalex.org/keywords/diffusion","display_name":"Diffusion","score":0.5151273012161255},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.349650502204895},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.1411570906639099}],"concepts":[{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.7491332292556763},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5321838855743408},{"id":"https://openalex.org/C69357855","wikidata":"https://www.wikidata.org/wiki/Q163214","display_name":"Diffusion","level":2,"score":0.5151273012161255},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.349650502204895},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.1411570906639099},{"id":"https://openalex.org/C97355855","wikidata":"https://www.wikidata.org/wiki/Q11473","display_name":"Thermodynamics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2501.19252","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2501.19252","pdf_url":"https://arxiv.org/pdf/2501.19252","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550/arxiv.2501.19252","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2501.19252","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2501.19252","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2501.19252","pdf_url":"https://arxiv.org/pdf/2501.19252","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G1069223013","display_name":null,"funder_award_id":"JSPS KAKENHI","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G3459562248","display_name":null,"funder_award_id":"Grant","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G4636223006","display_name":null,"funder_award_id":"JSPS KAK","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"},{"id":"https://openalex.org/G8430481527","display_name":null,"funder_award_id":"Number","funder_id":"https://openalex.org/F4320334764","funder_display_name":"Japan Society for the Promotion of Science"}],"funders":[{"id":"https://openalex.org/F4320334764","display_name":"Japan Society for the Promotion of Science","ror":"https://ror.org/00hhkn466"}],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4407091669.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"The":[0,205],"remarkable":[1],"progress":[2],"in":[3,84],"text-to-video":[4],"diffusion":[5,45,93,105],"models":[6,46,146],"enables":[7],"the":[8,14,42,52,67,85,154,163,171,176,189,226,232,237,243],"generation":[9,191],"of":[10,16,44,51,63,156],"photorealistic":[11],"videos,":[12],"although":[13],"content":[15],"these":[17],"generated":[18],"videos":[19],"often":[20],"includes":[21],"unnatural":[22],"movement":[23],"or":[24,143],"deformation,":[25],"reverse":[26],"playback,":[27],"and":[28,78,180,187,196,218,235],"motionless":[29],"scenes.":[30],"Recently,":[31],"an":[32],"alignment":[33,111,128],"problem":[34],"has":[35],"attracted":[36],"huge":[37],"attention,":[38],"where":[39],"we":[40,70,75,80,91,223],"steer":[41],"output":[43],"based":[47],"on":[48,175],"some":[49],"measure":[50],"content's":[53],"goodness.":[54],"Because":[55],"there":[56],"is":[57,139,211],"a":[58,103,109,220],"large":[59],"room":[60],"for":[61],"improvement":[62],"perceptual":[64,122,172],"quality":[65,124,173],"along":[66],"frame":[68],"direction,":[69],"should":[71,76,224],"address":[72],"which":[73,100],"metrics":[74,151],"optimize":[77,82],"how":[79],"can":[81,101],"them":[83],"video":[86,123,157],"generation.":[87],"In":[88],"this":[89],"paper,":[90],"propose":[92],"latent":[94,106],"beam":[95],"search":[96,195,238],"with":[97,125,162],"lookahead":[98,233],"estimator,":[99],"select":[102],"better":[104],"to":[107,127,129,152,193,213],"maximize":[108],"given":[110],"reward":[112,132],"at":[113],"inference":[114],"time.":[115],"We":[116,165],"then":[117],"point":[118],"out":[119],"that":[120,167,208],"improving":[121],"respect":[126],"prompts":[130],"requires":[131],"calibration":[133],"by":[134],"weighting":[135],"existing":[136],"metrics.":[137],"This":[138],"because":[140],"when":[141],"humans":[142],"vision":[144],"language":[145],"evaluate":[147],"outputs,":[148],"many":[149,214],"previous":[150],"quantify":[153],"naturalness":[155],"do":[158],"not":[159],"always":[160],"correlate":[161],"evaluation.":[164],"demonstrate":[166],"our":[168,209],"method":[169,210],"improves":[170],"evaluated":[174],"calibrated":[177],"reward,":[178],"VLMs,":[179],"human":[181],"assessment,":[182],"without":[183],"model":[184],"parameter":[185],"update,":[186],"outputs":[188],"best":[190],"compared":[192],"greedy":[194],"best-of-N":[197],"sampling":[198],"under":[199],"much":[200],"more":[201],"efficient":[202],"computational":[203],"cost.":[204],"experiments":[206],"highlight":[207],"beneficial":[212],"capable":[215],"generative":[216],"models,":[217],"provide":[219],"practical":[221],"guideline:":[222],"prioritize":[225],"inference-time":[227],"compute":[228],"allocation":[229],"into":[230],"enabling":[231],"estimator":[234],"increasing":[236],"budget,":[239],"rather":[240],"than":[241],"expanding":[242],"denoising":[244],"steps.":[245]},"counts_by_year":[],"updated_date":"2026-04-10T15:06:20.359241","created_date":"2025-10-10T00:00:00"}
