{"id":"https://openalex.org/W7147719459","doi":"https://doi.org/10.48550/arxiv.2603.29664","title":"CutClaw: Agentic Hours-Long Video Editing via Music Synchronization","display_name":"CutClaw: Agentic Hours-Long Video Editing via Music Synchronization","publication_year":2026,"publication_date":"2026-03-31","ids":{"openalex":"https://openalex.org/W7147719459","doi":"https://doi.org/10.48550/arxiv.2603.29664"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.29664","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29664","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.29664","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5132660012","display_name":"Shifang Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Zhao, Shifang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132594149","display_name":"Yihan Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Yihan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132615101","display_name":"Ying Shan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shan, Ying","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5132591676","display_name":"Yunchao Wei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Yunchao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5132593027","display_name":"Xiaodong Cun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cun, Xiaodong","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5132660012"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7369999885559082,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.7369999885559082,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.10890000313520432,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12290","display_name":"Human Motion and Animation","score":0.028999999165534973,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/video-editing","display_name":"Video editing","score":0.5130000114440918},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.492900013923645},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.4672999978065491},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.4537000060081482},{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.4456000030040741},{"id":"https://openalex.org/keywords/narrative","display_name":"Narrative","score":0.44119998812675476},{"id":"https://openalex.org/keywords/storytelling","display_name":"Storytelling","score":0.40950000286102295},{"id":"https://openalex.org/keywords/visualization","display_name":"Visualization","score":0.35350000858306885},{"id":"https://openalex.org/keywords/decomposition","display_name":"Decomposition","score":0.3517000079154968}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8342999815940857},{"id":"https://openalex.org/C2780310081","wikidata":"https://www.wikidata.org/wiki/Q1154312","display_name":"Video editing","level":2,"score":0.5130000114440918},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.492900013923645},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.47360000014305115},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.4672999978065491},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.4537000060081482},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.4456000030040741},{"id":"https://openalex.org/C199033989","wikidata":"https://www.wikidata.org/wiki/Q1318295","display_name":"Narrative","level":2,"score":0.44119998812675476},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.4117000102996826},{"id":"https://openalex.org/C2776538412","wikidata":"https://www.wikidata.org/wiki/Q989963","display_name":"Storytelling","level":3,"score":0.40950000286102295},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.35350000858306885},{"id":"https://openalex.org/C124681953","wikidata":"https://www.wikidata.org/wiki/Q339062","display_name":"Decomposition","level":2,"score":0.3517000079154968},{"id":"https://openalex.org/C2776674983","wikidata":"https://www.wikidata.org/wiki/Q545981","display_name":"Image editing","level":3,"score":0.349700003862381},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3474000096321106},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.3352000117301941},{"id":"https://openalex.org/C2779344036","wikidata":"https://www.wikidata.org/wiki/Q11320476","display_name":"Digital content","level":2,"score":0.3301999866962433},{"id":"https://openalex.org/C2778583943","wikidata":"https://www.wikidata.org/wiki/Q846516","display_name":"Digital storytelling","level":2,"score":0.3199999928474426},{"id":"https://openalex.org/C502989409","wikidata":"https://www.wikidata.org/wiki/Q11425","display_name":"Animation","level":2,"score":0.3179999887943268},{"id":"https://openalex.org/C2776509796","wikidata":"https://www.wikidata.org/wiki/Q5276056","display_name":"Content creation","level":2,"score":0.31049999594688416},{"id":"https://openalex.org/C137402728","wikidata":"https://www.wikidata.org/wiki/Q1330119","display_name":"Non-linear editing system","level":5,"score":0.3027999997138977},{"id":"https://openalex.org/C558565934","wikidata":"https://www.wikidata.org/wiki/Q2743","display_name":"Musical","level":2,"score":0.2937999963760376},{"id":"https://openalex.org/C2779754051","wikidata":"https://www.wikidata.org/wiki/Q2903135","display_name":"Interactive storytelling","level":4,"score":0.29269999265670776},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.2890999913215637},{"id":"https://openalex.org/C87687168","wikidata":"https://www.wikidata.org/wiki/Q173114","display_name":"Digital audio","level":4,"score":0.2791000008583069},{"id":"https://openalex.org/C2780878386","wikidata":"https://www.wikidata.org/wiki/Q1659648","display_name":"Visual language","level":2,"score":0.27730000019073486},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.27720001339912415},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.27649998664855957},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.2720000147819519},{"id":"https://openalex.org/C69369342","wikidata":"https://www.wikidata.org/wiki/Q1401416","display_name":"Computer animation","level":3,"score":0.26989999413490295},{"id":"https://openalex.org/C160372630","wikidata":"https://www.wikidata.org/wiki/Q4819855","display_name":"Audio analyzer","level":5,"score":0.2696000039577484},{"id":"https://openalex.org/C2776566319","wikidata":"https://www.wikidata.org/wiki/Q3495514","display_name":"Interactive video","level":2,"score":0.2551000118255615}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.29664","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29664","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.29664","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.29664","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Editing":[0],"the":[1,17,60,118,124,146],"video":[2,24],"content":[3,35,153],"with":[4,74],"audio":[5,107],"alignment":[6],"forms":[7],"a":[8,29,81,92,114,136],"digital":[9],"human-made":[10],"art":[11],"in":[12,173],"current":[13],"social":[14],"media.":[15],"However,":[16],"time-consuming":[18],"and":[19,33,80,101,106,122,141,158],"repetitive":[20],"nature":[21],"of":[22,62],"manual":[23],"editing":[25],"has":[26],"long":[27],"been":[28],"challenge":[30],"for":[31],"filmmakers":[32],"professional":[34],"creators":[36],"alike.":[37],"In":[38,85],"this":[39],"paper,":[40],"we":[41],"introduce":[42],"CutClaw,":[43],"an":[44,68],"autonomous":[45],"multi-agent":[46],"framework":[47],"designed":[48],"to":[49,110,130,134,165],"edit":[50],"hours-long":[51],"raw":[52],"footage":[53],"into":[54],"meaningful":[55],"short":[56,137],"videos":[57,73],"that":[58,96,167],"leverages":[59],"capabilities":[61],"multiple":[63],"Multimodal":[64],"Language":[65],"Models~(MLLMs)":[66],"as":[67],"agent":[69],"system.":[70],"It":[71],"produces":[72],"synchronized":[75],"music,":[76],"followed":[77],"by":[78,90],"instructions,":[79],"visually":[82],"appealing":[83],"appearance.":[84],"detail,":[86],"our":[87],"approach":[88],"begins":[89],"employing":[91],"hierarchical":[93],"multimodal":[94],"decomposition":[95],"captures":[97],"both":[98],"fine-grained":[99,151],"details":[100],"global":[102],"structures":[103,123],"across":[104],"visual":[105,128,152],"footage.":[108],"Then,":[109],"ensure":[111],"narrative":[112],"consistency,":[113],"Playwriter":[115],"Agent":[116],"orchestrates":[117],"whole":[119],"storytelling":[120],"flow":[121],"long-term":[125],"narrative,":[126],"anchoring":[127],"scenes":[129],"musical":[131],"shifts.":[132],"Finally,":[133],"construct":[135],"edited":[138],"video,":[139],"Editor":[140],"Reviewer":[142],"Agents":[143],"collaboratively":[144],"optimize":[145],"final":[147],"cut":[148],"via":[149],"selecting":[150],"based":[154],"on":[155],"rigorous":[156],"aesthetic":[157],"semantic":[159],"criteria.":[160],"We":[161],"conduct":[162],"detailed":[163],"experiments":[164],"demonstrate":[166],"CutClaw":[168],"significantly":[169],"outperforms":[170],"state-of-the-art":[171],"baselines":[172],"generating":[174],"high-quality,":[175],"rhythm-aligned":[176],"videos.":[177],"The":[178],"code":[179],"is":[180],"available":[181],"at:":[182],"https://github.com/GVCLab/CutClaw.":[183]},"counts_by_year":[],"updated_date":"2026-04-02T13:53:19.096889","created_date":"2026-04-02T00:00:00"}
