{"id":"https://openalex.org/W7154687311","doi":"https://doi.org/10.48550/arxiv.2604.14561","title":"CoCoDiff: Optimizing Collective Communications for Distributed Diffusion Transformer Inference Under Ulysses Sequence Parallelism","display_name":"CoCoDiff: Optimizing Collective Communications for Distributed Diffusion Transformer Inference Under Ulysses Sequence Parallelism","publication_year":2026,"publication_date":"2026-04-16","ids":{"openalex":"https://openalex.org/W7154687311","doi":"https://doi.org/10.48550/arxiv.2604.14561"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.14561","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.14561","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.14561","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5133832542","display_name":"Bin Ma","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Ma, Bin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021789231","display_name":"Xingjian Ding","orcid":"https://orcid.org/0000-0001-8866-4941"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ding, Xingjian","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038566298","display_name":"Tekin Bi\u00e7er","orcid":"https://orcid.org/0000-0002-8428-5159"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bicer, Tekin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5071320445","display_name":"Pengfei Su","orcid":"https://orcid.org/0000-0001-7035-1998"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Su, Pengfei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5092067096","display_name":"Dong Li","orcid":"https://orcid.org/0009-0005-3206-7917"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Dong","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5133832542"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.2784000039100647,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.2784000039100647,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.24390000104904175,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12808","display_name":"Ferroelectric and Negative Capacitance Devices","score":0.09570000320672989,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.7098000049591064},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.7069000005722046},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.6638000011444092},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.5375999808311462},{"id":"https://openalex.org/keywords/data-parallelism","display_name":"Data parallelism","score":0.438400000333786},{"id":"https://openalex.org/keywords/message-passing","display_name":"Message passing","score":0.4205000102519989},{"id":"https://openalex.org/keywords/interconnection","display_name":"Interconnection","score":0.3765000104904175},{"id":"https://openalex.org/keywords/normalization","display_name":"Normalization (sociology)","score":0.36000001430511475}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7883999943733215},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.7098000049591064},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.7069000005722046},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.6638000011444092},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5644999742507935},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.5375999808311462},{"id":"https://openalex.org/C61483411","wikidata":"https://www.wikidata.org/wiki/Q3124522","display_name":"Data parallelism","level":3,"score":0.438400000333786},{"id":"https://openalex.org/C854659","wikidata":"https://www.wikidata.org/wiki/Q1859284","display_name":"Message passing","level":2,"score":0.4205000102519989},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3824000060558319},{"id":"https://openalex.org/C123745756","wikidata":"https://www.wikidata.org/wiki/Q1665949","display_name":"Interconnection","level":2,"score":0.3765000104904175},{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.36000001430511475},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.3596999943256378},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.3546000123023987},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.35409998893737793},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.3255999982357025},{"id":"https://openalex.org/C158156997","wikidata":"https://www.wikidata.org/wiki/Q1416645","display_name":"Models of communication","level":2,"score":0.29269999265670776},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.2874000072479248},{"id":"https://openalex.org/C101765175","wikidata":"https://www.wikidata.org/wiki/Q577764","display_name":"Communications system","level":2,"score":0.2784000039100647},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.27799999713897705},{"id":"https://openalex.org/C57493831","wikidata":"https://www.wikidata.org/wiki/Q3134666","display_name":"Projection (relational algebra)","level":2,"score":0.26330000162124634},{"id":"https://openalex.org/C24858836","wikidata":"https://www.wikidata.org/wiki/Q844718","display_name":"Theory of computation","level":2,"score":0.2551000118255615}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.14561","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.14561","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.14561","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.14561","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Diffusion":[0],"Transformers":[1],"(DiTs)":[2],"are":[3],"increasingly":[4],"adopted":[5],"in":[6],"scientific":[7],"computing,":[8],"yet":[9],"growing":[10],"model":[11],"sizes":[12],"and":[13,48,74,116],"resolutions":[14],"make":[15],"distributed":[16,56],"multi-GPU":[17],"inference":[18,25,58],"essential.":[19],"Ulysses":[20],"sequence":[21],"parallelism":[22],"scales":[23],"DiT":[24,57,133],"but":[26],"introduces":[27,96],"frequent":[28],"all-to-all":[29],"collectives":[30,104],"that":[31],"dominate":[32],"latency.":[33],"Overlapping":[34],"these":[35],"with":[36,82,131],"computation":[37],"is":[38],"difficult":[39],"due":[40],"to":[41,78,139],"tight":[42],"data":[43],"dependencies,":[44],"large":[45],"message":[46],"volumes,":[47],"asymmetric":[49],"interconnect":[50],"bandwidths.":[51],"We":[52],"introduce":[53],"CoCoDiff,":[54],"a":[55],"engine":[59],"exploiting":[60],"two":[61],"observations:":[62],"(1)":[63],"V":[64],"requires":[65],"only":[66,121],"linear":[67],"projection":[68],"while":[69],"Q/K":[70,83,114],"need":[71],"additional":[72],"normalization":[73],"RoPE,":[75],"creating":[76],"opportunities":[77],"overlap":[79],"V's":[80,111],"communication":[81,112,119],"computation;":[84,115],"(2)":[85],"adjacent":[86],"denoising":[87],"steps":[88],"produce":[89],"similar":[90],"tensors,":[91],"yielding":[92],"temporal":[93],"redundancy.":[94],"CoCoDiff":[95,144],"three":[97],"mechanisms:":[98],"Tile-Aware":[99],"Parallel":[100],"All-to-all":[101],"(TAPA)":[102],"decomposes":[103],"into":[105],"topology-aligned":[106],"phases;":[107],"V-First":[108],"scheduling":[109],"hides":[110],"behind":[113],"V-Major":[117],"selective":[118],"transmits":[120],"active":[122],"projections":[123],"on":[124],"slow":[125],"interconnects.":[126],"On":[127],"the":[128],"Aurora":[129],"supercomputer":[130],"four":[132],"models":[134],"across":[135],"1-8":[136],"nodes":[137],"(up":[138],"96":[140],"Intel":[141],"GPU":[142],"tiles),":[143],"achieves":[145],"an":[146],"average":[147],"speedup":[148],"of":[149],"3.6x,":[150],"peaking":[151],"at":[152],"8.4x.":[153]},"counts_by_year":[],"updated_date":"2026-04-29T09:16:38.111599","created_date":"2026-04-18T00:00:00"}
