{"id":"https://openalex.org/W7160927230","doi":"https://doi.org/10.48550/arxiv.2605.08735","title":"CollabVR: Collaborative Video Reasoning with Vision-Language and Video Generation Models","display_name":"CollabVR: Collaborative Video Reasoning with Vision-Language and Video Generation Models","publication_year":2026,"publication_date":"2026-05-09","ids":{"openalex":"https://openalex.org/W7160927230","doi":"https://doi.org/10.48550/arxiv.2605.08735"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.08735","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08735","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.08735","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5051925314","display_name":"Joowon Kim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kim, Joowon","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135949339","display_name":"Seungho Shin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shin, Seungho","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058234906","display_name":"Joonhyung Park","orcid":"https://orcid.org/0000-0003-4662-0099"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Park, Joonhyung","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135967816","display_name":"Eunho Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Eunho","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7283999919891357,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7283999919891357,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.08309999853372574,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.07289999723434448,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/commit","display_name":"Commit","score":0.7687000036239624},{"id":"https://openalex.org/keywords/frame","display_name":"Frame (networking)","score":0.5856000185012817},{"id":"https://openalex.org/keywords/action","display_name":"Action (physics)","score":0.5457000136375427},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.30469998717308044},{"id":"https://openalex.org/keywords/visual-reasoning","display_name":"Visual reasoning","score":0.2953999936580658},{"id":"https://openalex.org/keywords/component","display_name":"Component (thermodynamics)","score":0.2921999990940094},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.28760001063346863}],"concepts":[{"id":"https://openalex.org/C153180980","wikidata":"https://www.wikidata.org/wiki/Q19776675","display_name":"Commit","level":2,"score":0.7687000036239624},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7222999930381775},{"id":"https://openalex.org/C126042441","wikidata":"https://www.wikidata.org/wiki/Q1324888","display_name":"Frame (networking)","level":2,"score":0.5856000185012817},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.5457000136375427},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4657000005245209},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3785000145435333},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.349700003862381},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.30469998717308044},{"id":"https://openalex.org/C2777508537","wikidata":"https://www.wikidata.org/wiki/Q7936620","display_name":"Visual reasoning","level":2,"score":0.2953999936580658},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.2921999990940094},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.28760001063346863},{"id":"https://openalex.org/C2988634675","wikidata":"https://www.wikidata.org/wiki/Q34508","display_name":"Video recording","level":2,"score":0.28119999170303345},{"id":"https://openalex.org/C20162079","wikidata":"https://www.wikidata.org/wiki/Q1151406","display_name":"Case-based reasoning","level":2,"score":0.2800999879837036},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.2791999876499176},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2768000066280365},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.27459999918937683},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.25699999928474426},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.25609999895095825}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.08735","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08735","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.08735","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.08735","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Preprint"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Recent":[0],"\"Thinking":[1],"with":[2,106,162,191],"Video\"":[3],"approaches":[4],"use":[5],"Video":[6,96],"Generation":[7],"Models":[8,65],"(VGMs)":[9],"for":[10],"visual":[11,57],"reasoning":[12,19,51],"by":[13,63],"producing":[14],"temporally":[15],"coherent":[16],"Chain-of-Frames":[17],"as":[18],"artifacts.":[20],"Even":[21],"strong":[22],"VGMs,":[23],"however,":[24],"exhibit":[25],"two":[26],"recurring":[27],"failure":[28],"modes":[29],"on":[30,35,166,175],"goal-directed":[31],"tasks:":[32],"long-horizon":[33],"drift":[34],"multi-step":[36],"tasks":[37],"and":[38,83,125,142,148,154,189,198],"mid-clip":[39],"simulation":[40],"errors":[41],"that":[42,102,182],"compound.":[43],"Both":[44],"stem":[45],"from":[46],"the":[47,54,71,104,107,112,115,120,122,127,132,163,167],"absence":[48],"of":[49,177],"explicit":[50],"built":[52],"upon":[53],"VGM's":[55],"short-horizon":[56],"prior,":[58],"a":[59,99,178],"role":[60],"naturally":[61],"filled":[62],"Vision-Language":[64],"(VLMs),":[66],"but":[67],"where":[68],"to":[69,136,188],"place":[70],"VLM":[72,105,113,184],"is":[73,81,186],"non-trivial:":[74],"upfront":[75],"plans":[76,114],"commit":[77],"before":[78],"any":[79],"frame":[80],"generated":[82],"post-hoc":[84],"critiques":[85],"over":[86,151],"whole":[87],"videos":[88],"intervene":[89],"too":[90],"late.":[91],"We":[92,194],"propose":[93],"VLM-VGM":[94],"Collaborative":[95],"Reasoning":[97],"(CollabVR),":[98],"closed-loop":[100],"framework":[101],"couples":[103],"VGM":[108,123],"at":[109,159,202],"step-level":[110,183],"granularity:":[111],"immediate":[116],"next":[117,133],"action,":[118],"inspects":[119],"clip":[121],"generates,":[124],"folds":[126],"verifier's":[128],"diagnosis":[129],"directly":[130],"into":[131],"action":[134],"prompt":[135],"repair":[137],"detected":[138],"failures.":[139],"On":[140],"Gen-ViRe":[141],"VBVR-Bench,":[143],"CollabVR":[144],"improves":[145],"both":[146],"open-source":[147],"closed-source":[149],"VGMs":[150],"single-inference,":[152],"Pass@$k$,":[153],"prior":[155],"test-time":[156],"scaling":[157],"baselines":[158],"matched":[160],"compute,":[161],"largest":[164],"gains":[165],"hardest":[168],"tasks.":[169],"It":[170],"also":[171],"yields":[172],"further":[173],"improvements":[174],"top":[176],"reasoning-fine-tuned":[179],"VGM,":[180],"indicating":[181],"supervision":[185],"orthogonal":[187],"stackable":[190],"reasoning-oriented":[192],"fine-tuning.":[193],"provide":[195],"video":[196],"samples":[197],"additional":[199],"qualitative":[200],"results":[201],"our":[203],"project":[204],"page:":[205],"https://joow0n-kim.github.io/collabvr-project-page.":[206]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2026-05-13T00:00:00"}
