{"id":"https://openalex.org/W7137142414","doi":"https://doi.org/10.48550/arxiv.2603.13024","title":"SAW: Toward a Surgical Action World Model via Controllable and Scalable Video Generation","display_name":"SAW: Toward a Surgical Action World Model via Controllable and Scalable Video Generation","publication_year":2026,"publication_date":"2026-03-13","ids":{"openalex":"https://openalex.org/W7137142414","doi":"https://doi.org/10.48550/arxiv.2603.13024"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.13024","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13024","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.13024","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129460913","display_name":"Sampath Rapuri","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Rapuri, Sampath","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129593944","display_name":"Lalithkumar Seenivasan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Seenivasan, Lalithkumar","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033499506","display_name":"Dominik Schneider","orcid":"https://orcid.org/0000-0003-4921-0693"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Schneider, Dominik","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5121800111","display_name":"Roger D. Soberanis-Mukul","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Soberanis-Mukul, Roger","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129566879","display_name":"Yufan He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Yufan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129582393","display_name":"Hao Ding","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Hao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129465207","display_name":"Jiru Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Jiru","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100572340","display_name":"Chenhao Yu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Chenhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129628105","display_name":"Chenyan Jing","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jing, Chenyan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051357970","display_name":"Pengfei Guo","orcid":"https://orcid.org/0009-0007-2561-4091"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Pengfei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129400983","display_name":"Daguang Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Daguang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5129547199","display_name":"Mathias Unberath","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Unberath, Mathias","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":12,"corresponding_author_ids":["https://openalex.org/A5129460913"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10916","display_name":"Surgical Simulation and Training","score":0.9401000142097473,"subfield":{"id":"https://openalex.org/subfields/2746","display_name":"Surgery"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},"topics":[{"id":"https://openalex.org/T10916","display_name":"Surgical Simulation and Training","score":0.9401000142097473,"subfield":{"id":"https://openalex.org/subfields/2746","display_name":"Surgery"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}},{"id":"https://openalex.org/T10868","display_name":"Soft Robotics and Applications","score":0.01269999984651804,"subfield":{"id":"https://openalex.org/subfields/2204","display_name":"Biomedical Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10653","display_name":"Robot Manipulation and Learning","score":0.006099999882280827,"subfield":{"id":"https://openalex.org/subfields/2207","display_name":"Control and Systems Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/rendering","display_name":"Rendering (computer graphics)","score":0.5673999786376953},{"id":"https://openalex.org/keywords/affordance","display_name":"Affordance","score":0.5281999707221985},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.4729999899864197},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.4203999936580658},{"id":"https://openalex.org/keywords/consistency","display_name":"Consistency (knowledge bases)","score":0.3749000132083893},{"id":"https://openalex.org/keywords/video-quality","display_name":"Video quality","score":0.32919999957084656},{"id":"https://openalex.org/keywords/surgical-planning","display_name":"Surgical planning","score":0.32600000500679016}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6930999755859375},{"id":"https://openalex.org/C205711294","wikidata":"https://www.wikidata.org/wiki/Q176953","display_name":"Rendering (computer graphics)","level":2,"score":0.5673999786376953},{"id":"https://openalex.org/C194995250","wikidata":"https://www.wikidata.org/wiki/Q531136","display_name":"Affordance","level":2,"score":0.5281999707221985},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.4729999899864197},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4530999958515167},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.4203999936580658},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4077000021934509},{"id":"https://openalex.org/C2776436953","wikidata":"https://www.wikidata.org/wiki/Q5163215","display_name":"Consistency (knowledge bases)","level":2,"score":0.3749000132083893},{"id":"https://openalex.org/C103910844","wikidata":"https://www.wikidata.org/wiki/Q2631256","display_name":"Video quality","level":3,"score":0.32919999957084656},{"id":"https://openalex.org/C2779370443","wikidata":"https://www.wikidata.org/wiki/Q1776627","display_name":"Surgical planning","level":2,"score":0.32600000500679016},{"id":"https://openalex.org/C117251300","wikidata":"https://www.wikidata.org/wiki/Q1849855","display_name":"Parametric statistics","level":2,"score":0.32409998774528503},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.3154999911785126},{"id":"https://openalex.org/C36464697","wikidata":"https://www.wikidata.org/wiki/Q451553","display_name":"Visualization","level":2,"score":0.31119999289512634},{"id":"https://openalex.org/C2780791683","wikidata":"https://www.wikidata.org/wiki/Q846785","display_name":"Action (physics)","level":2,"score":0.3093000054359436},{"id":"https://openalex.org/C2778181360","wikidata":"https://www.wikidata.org/wiki/Q1074814","display_name":"Surgical instrument","level":2,"score":0.30329999327659607},{"id":"https://openalex.org/C2779662365","wikidata":"https://www.wikidata.org/wiki/Q5416694","display_name":"Event (particle physics)","level":2,"score":0.2865000069141388},{"id":"https://openalex.org/C44154836","wikidata":"https://www.wikidata.org/wiki/Q45045","display_name":"Simulation","level":1,"score":0.28029999136924744},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.26980000734329224},{"id":"https://openalex.org/C128840427","wikidata":"https://www.wikidata.org/wiki/Q1302174","display_name":"Motion compensation","level":2,"score":0.25609999895095825}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.13024","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13024","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.13024","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.13024","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"A":[0],"surgical":[1,8,22,40,52,96,115,137,198,225],"world":[2,53,98],"model":[3,143],"capable":[4],"of":[5,50,150],"generating":[6],"realistic":[7],"action":[9,97,138,208],"videos":[10,206,231],"with":[11,154,204],"precise":[12],"control":[13],"over":[14],"tool-tissue":[15,229],"interactions":[16],"can":[17],"address":[18],"fundamental":[19],"challenges":[20],"in":[21],"AI":[23],"and":[24,30,80,120,182,223],"simulation":[25,240],"--":[26,92],"from":[27,232],"data":[28],"scarcity":[29],"rare":[31,202],"event":[32],"synthesis":[33],"to":[34,164,213,217],"bridging":[35],"the":[36,47],"sim-to-real":[37],"gap":[38],"for":[39,196],"automation.":[41],"However,":[42],"current":[43],"video":[44,101,128],"generation":[45],"methods,":[46],"very":[48],"core":[49],"such":[51],"models,":[54],"require":[55],"expensive":[56],"annotations":[57],"or":[58],"complex":[59,77],"structured":[60],"intermediates":[61],"as":[62],"conditioning":[63,157],"signals":[64],"at":[65,171],"inference,":[66],"limiting":[67],"their":[68],"scalability.":[69],"Other":[70],"approaches":[71],"exhibit":[72],"limited":[73],"temporal":[74,176],"consistency":[75,162,177],"across":[76],"laparoscopic":[78,152],"scenes":[79],"do":[81],"not":[82],"possess":[83],"sufficient":[84],"realism.":[85],"We":[86,124],"propose":[87],"Surgical":[88],"Action":[89],"World":[90],"(SAW)":[91],"a":[93,113,126,147,160,237],"step":[94],"toward":[95,236],"modeling":[99],"through":[100],"diffusion":[102,129,134,142],"conditioned":[103],"on":[104,146,186,219],"four":[105],"lightweight":[106,155],"signals:":[107],"language":[108],"prompts":[109],"encoding":[110],"tool-action":[111],"context,":[112],"reference":[114],"scene,":[116],"tissue":[117],"affordance":[118],"mask,":[119],"2D":[121],"tool-tip":[122],"trajectories.":[123],"design":[125],"conditional":[127],"approach":[130],"that":[131],"reformulates":[132],"video-to-video":[133],"into":[135],"trajectory-conditioned":[136],"synthesis.":[139],"The":[140],"backbone":[141],"is":[144],"fine-tuned":[145],"custom-curated":[148],"dataset":[149],"12,044":[151],"clips":[153],"spatiotemporal":[156],"signals,":[158],"leveraging":[159],"depth":[161,170],"loss":[163],"enforce":[165],"geometric":[166],"plausibility":[167],"without":[168],"requiring":[169],"inference.":[172],"SAW":[173],"achieves":[174],"state-of-the-art":[175],"(CD-FVD:":[178],"199.19":[179],"vs.":[180],"546.82)":[181],"strong":[183],"visual":[184],"quality":[185],"held-out":[187],"test":[188,221],"data.":[189],"Furthermore,":[190],"we":[191],"demonstrate":[192],"its":[193],"downstream":[194],"utility":[195],"(a)":[197],"AI,":[199],"where":[200,227],"augmenting":[201],"actions":[203],"SAW-generated":[205],"improves":[207],"recognition":[209],"(clipping":[210],"F1-score:":[211],"20.93%":[212],"43.14%;":[214],"cutting:":[215],"0.00%":[216],"8.33%)":[218],"real":[220],"data,":[222],"(b)":[224],"simulation,":[226],"rendering":[228],"interaction":[230],"simulator-derived":[233],"trajectory":[234],"points":[235],"visually":[238],"faithful":[239],"engine.":[241]},"counts_by_year":[],"updated_date":"2026-03-17T07:05:13.627479","created_date":"2026-03-17T00:00:00"}
