{"id":"https://openalex.org/W4405470302","doi":"https://doi.org/10.48550/arxiv.2412.11621","title":"VG-TVP: Multimodal Procedural Planning via Visually Grounded Text-Video Prompting","display_name":"VG-TVP: Multimodal Procedural Planning via Visually Grounded Text-Video Prompting","publication_year":2024,"publication_date":"2024-12-16","ids":{"openalex":"https://openalex.org/W4405470302","doi":"https://doi.org/10.48550/arxiv.2412.11621"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2412.11621","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.11621","pdf_url":"https://arxiv.org/pdf/2412.11621","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2412.11621","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5027680347","display_name":"Muhammet Furkan Ilaslan","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ilaslan, Muhammet Furkan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017653042","display_name":"Ali K\u00f6ksal","orcid":"https://orcid.org/0000-0002-4539-8636"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Koksal, Ali","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115597686","display_name":"Kevin Qinhong Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Kevin Qinhong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115505724","display_name":"Burak Satar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Satar, Burak","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5068937750","display_name":"Mike Zheng Shou","orcid":"https://orcid.org/0000-0002-7681-2166"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shou, Mike Zheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5052143634","display_name":"Qianli Xu","orcid":"https://orcid.org/0000-0003-0105-5903"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Qianli","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5027680347"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.983299970626831,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.983299970626831,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12031","display_name":"Speech and dialogue systems","score":0.9653000235557556,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10906","display_name":"AI-based Problem Solving and Planning","score":0.9559999704360962,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5262635946273804},{"id":"https://openalex.org/keywords/human\u2013computer-interaction","display_name":"Human\u2013computer interaction","score":0.4365791082382202},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.35019350051879883}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5262635946273804},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4365791082382202},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.35019350051879883}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2412.11621","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.11621","pdf_url":"https://arxiv.org/pdf/2412.11621","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2412.11621","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2412.11621","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2412.11621","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2412.11621","pdf_url":"https://arxiv.org/pdf/2412.11621","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4405470302.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Large":[0],"Language":[1],"Model":[2],"(LLM)-based":[3],"agents":[4],"have":[5,153],"shown":[6],"promise":[7],"in":[8,76],"procedural":[9,56,77],"tasks,":[10],"but":[11],"the":[12,33,81,87,92,97,106,133,145,191],"potential":[13],"of":[14,85,91,101,115,135,147],"multimodal":[15],"instructions":[16],"augmented":[17],"by":[18,110],"texts":[19],"and":[20,54,69,74,96,119,124,139,168,176,181],"videos":[21],"to":[22,131,170],"assist":[23],"users":[24],"remains":[25],"under-explored.":[26],"To":[27,143],"address":[28,144],"this":[29],"gap,":[30],"we":[31,152],"propose":[32],"Visually":[34],"Grounded":[35],"Text-Video":[36],"Prompting":[37],"(VG-TVP)":[38],"method":[39,118,186],"which":[40],"is":[41],"a":[42,59,112,155],"novel":[43,113],"LLM-empowered":[44],"Multimodal":[45],"Procedural":[46,161],"Planning":[47],"(MPP)":[48],"framework.":[49],"It":[50],"generates":[51],"cohesive":[52],"text":[53,137],"video":[55,93,141],"plans":[57,138],"given":[58],"specified":[60],"high-level":[61],"objective.":[62],"The":[63],"main":[64],"challenges":[65],"are":[66],"achieving":[67],"textual":[68,175],"visual":[70,177],"informativeness,":[71,178],"temporal":[72,179],"coherence,":[73,180],"accuracy":[75],"plans.":[78,142],"VG-TVP":[79,104,185],"leverages":[80],"zero-shot":[82],"reasoning":[83],"capability":[84],"LLMs,":[86],"video-to-text":[88],"generation":[89,99,134],"ability":[90,100],"captioning":[94],"models,":[95],"text-to-video":[98],"diffusion":[102],"models.":[103],"improves":[105],"interaction":[107],"between":[108],"modalities":[109],"proposing":[111],"Fusion":[114],"Captioning":[116],"(FoC)":[117],"using":[120],"Text-to-Video":[121],"Bridge":[122,126],"(T2V-B)":[123],"Video-to-Text":[125],"(V2T-B).":[127],"They":[128],"allow":[129],"LLMs":[130],"guide":[132],"visually-grounded":[136],"textual-grounded":[140],"scarcity":[146],"datasets":[148],"suitable":[149],"for":[150],"MPP,":[151],"curated":[154],"new":[156],"dataset":[157],"called":[158],"Daily-Life":[159],"Task":[160],"Plans":[162],"(Daily-PP).":[163],"We":[164],"conduct":[165],"comprehensive":[166],"experiments":[167],"benchmarks":[169],"evaluate":[171],"human":[172],"preferences":[173],"(regarding":[174],"plan":[182],"accuracy).":[183],"Our":[184],"outperforms":[187],"unimodal":[188],"baselines":[189],"on":[190],"Daily-PP":[192],"dataset.":[193]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-11T14:59:36.786465","created_date":"2024-12-18T00:00:00"}
