{"id":"https://openalex.org/W7125893984","doi":"https://doi.org/10.1145/3779212.3790188","title":"MSCCL++: Rethinking GPU Communication Abstractions for AI Inference","display_name":"MSCCL++: Rethinking GPU Communication Abstractions for AI Inference","publication_year":2026,"publication_date":"2026-03-10","ids":{"openalex":"https://openalex.org/W7125893984","doi":"https://doi.org/10.1145/3779212.3790188"},"language":null,"primary_location":{"id":"doi:10.1145/3779212.3790188","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3779212.3790188","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3779212.3790188","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5124139994","display_name":"Changho Hwang","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Changho Hwang","raw_affiliation_strings":["Microsoft Research, Vancouver, BC, Canada"],"raw_orcid":"https://orcid.org/0009-0007-8756-4480","affiliations":[{"raw_affiliation_string":"Microsoft Research, Vancouver, BC, Canada","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124075533","display_name":"Peng Cheng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng Cheng","raw_affiliation_strings":["Microsoft Research, Redmond, WA, USA"],"raw_orcid":"https://orcid.org/0000-0003-4014-4757","affiliations":[{"raw_affiliation_string":"Microsoft Research, Redmond, WA, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056166013","display_name":"Roshan Dathathri","orcid":"https://orcid.org/0009-0006-8815-7468"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Roshan Dathathri","raw_affiliation_strings":["Microsoft Research, Redmond, WA, USA"],"raw_orcid":"https://orcid.org/0009-0006-8815-7468","affiliations":[{"raw_affiliation_string":"Microsoft Research, Redmond, WA, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034240415","display_name":"Abhinav Jangda","orcid":"https://orcid.org/0000-0002-4849-6776"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Abhinav Jangda","raw_affiliation_strings":["Microsoft Research, Redmond, WA, USA"],"raw_orcid":"https://orcid.org/0000-0002-4849-6776","affiliations":[{"raw_affiliation_string":"Microsoft Research, Redmond, WA, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124082528","display_name":"Saeed Maleki","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Saeed Maleki","raw_affiliation_strings":["Microsoft Research, Redmond, WA, USA"],"raw_orcid":"https://orcid.org/0000-0002-7998-3681","affiliations":[{"raw_affiliation_string":"Microsoft Research, Redmond, WA, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5011198874","display_name":"Madan Musuvathi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Madan Musuvathi","raw_affiliation_strings":["Microsoft Research, Redmond, WA, USA"],"raw_orcid":"https://orcid.org/0000-0002-2482-7892","affiliations":[{"raw_affiliation_string":"Microsoft Research, Redmond, WA, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001454502","display_name":"Olli Saarikivi","orcid":"https://orcid.org/0000-0001-7596-4734"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Olli Saarikivi","raw_affiliation_strings":["Microsoft Research, Redmond, WA, USA"],"raw_orcid":"https://orcid.org/0000-0001-7596-4734","affiliations":[{"raw_affiliation_string":"Microsoft Research, Redmond, WA, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023546887","display_name":"Aashaka Shah","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Aashaka Shah","raw_affiliation_strings":["Microsoft Research, Redmond, WA, USA"],"raw_orcid":"https://orcid.org/0009-0004-0628-4515","affiliations":[{"raw_affiliation_string":"Microsoft Research, Redmond, WA, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124077550","display_name":"Ziyue Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ziyue Yang","raw_affiliation_strings":["Microsoft Research, Beijing, China"],"raw_orcid":"https://orcid.org/0009-0008-0491-7082","affiliations":[{"raw_affiliation_string":"Microsoft Research, Beijing, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124095990","display_name":"Binyang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Binyang Li","raw_affiliation_strings":["Microsoft Azure, Redmond, WA, USA"],"raw_orcid":"https://orcid.org/0000-0002-9295-6530","affiliations":[{"raw_affiliation_string":"Microsoft Azure, Redmond, WA, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124135178","display_name":"Caio Rocha","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Caio Rocha","raw_affiliation_strings":["Microsoft Azure, Redmond, WA, USA"],"raw_orcid":"https://orcid.org/0009-0008-2036-1379","affiliations":[{"raw_affiliation_string":"Microsoft Azure, Redmond, WA, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5124056367","display_name":"Qinghua Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qinghua Zhou","raw_affiliation_strings":["Microsoft Azure, Redmond, WA, USA"],"raw_orcid":"https://orcid.org/0009-0001-8490-7990","affiliations":[{"raw_affiliation_string":"Microsoft Azure, Redmond, WA, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013964337","display_name":"Mahdieh Ghazimirsaeed","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mahdieh Ghazimirsaeed","raw_affiliation_strings":["Microsoft Azure, Cambridge, MA, USA"],"raw_orcid":"https://orcid.org/0009-0009-8226-5515","affiliations":[{"raw_affiliation_string":"Microsoft Azure, Cambridge, MA, USA","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004026524","display_name":"Sreevatsa Anantharamu","orcid":"https://orcid.org/0009-0006-0414-2200"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sreevatsa Anantharamu","raw_affiliation_strings":["Microsoft Azure, ,"],"raw_orcid":"https://orcid.org/0009-0006-0414-2200","affiliations":[{"raw_affiliation_string":"Microsoft Azure, ,","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5124079481","display_name":"Jithin Jose","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jithin Jose","raw_affiliation_strings":["Microsoft Azure, Austin, TX, USA"],"raw_orcid":"https://orcid.org/0000-0001-9549-7918","affiliations":[{"raw_affiliation_string":"Microsoft Azure, Austin, TX, USA","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":15,"corresponding_author_ids":["https://openalex.org/A5124139994"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.11702373,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1201","last_page":"1215"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.23340000212192535,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.23340000212192535,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.11760000139474869,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.06859999895095825,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.6190000176429749},{"id":"https://openalex.org/keywords/digital-subscriber-line","display_name":"Digital subscriber line","score":0.5519999861717224},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.48190000653266907},{"id":"https://openalex.org/keywords/interface","display_name":"Interface (matter)","score":0.42590001225471497},{"id":"https://openalex.org/keywords/production","display_name":"Production (economics)","score":0.3244999945163727},{"id":"https://openalex.org/keywords/asynchronous-communication","display_name":"Asynchronous communication","score":0.2948000133037567},{"id":"https://openalex.org/keywords/communications-protocol","display_name":"Communications protocol","score":0.2946000099182129}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.869700014591217},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.6190000176429749},{"id":"https://openalex.org/C201374245","wikidata":"https://www.wikidata.org/wiki/Q104534","display_name":"Digital subscriber line","level":2,"score":0.5519999861717224},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.48190000653266907},{"id":"https://openalex.org/C113843644","wikidata":"https://www.wikidata.org/wiki/Q901882","display_name":"Interface (matter)","level":4,"score":0.42590001225471497},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.353300005197525},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3395000100135803},{"id":"https://openalex.org/C2778348673","wikidata":"https://www.wikidata.org/wiki/Q739302","display_name":"Production (economics)","level":2,"score":0.3244999945163727},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.2948000133037567},{"id":"https://openalex.org/C12269588","wikidata":"https://www.wikidata.org/wiki/Q132364","display_name":"Communications protocol","level":2,"score":0.2946000099182129},{"id":"https://openalex.org/C172086080","wikidata":"https://www.wikidata.org/wiki/Q62270","display_name":"Remote procedure call","level":2,"score":0.2930999994277954},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.2892000079154968},{"id":"https://openalex.org/C154690210","wikidata":"https://www.wikidata.org/wiki/Q1668499","display_name":"Rewriting","level":2,"score":0.287200003862381},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.28380000591278076},{"id":"https://openalex.org/C2780395129","wikidata":"https://www.wikidata.org/wiki/Q1128971","display_name":"Rapid prototyping","level":2,"score":0.2777999937534332},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.2678999900817871},{"id":"https://openalex.org/C89505385","wikidata":"https://www.wikidata.org/wiki/Q47146","display_name":"User interface","level":2,"score":0.26600000262260437},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.2651999890804291},{"id":"https://openalex.org/C204495577","wikidata":"https://www.wikidata.org/wiki/Q1205349","display_name":"Callback","level":2,"score":0.26420000195503235},{"id":"https://openalex.org/C135257023","wikidata":"https://www.wikidata.org/wiki/Q691358","display_name":"Domain-specific language","level":2,"score":0.2621999979019165},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.25940001010894775},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.25380000472068787}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/3779212.3790188","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3779212.3790188","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2504.09014","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2504.09014","pdf_url":"https://arxiv.org/pdf/2504.09014","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"pmh:doi:10.48550/arxiv.2504.09014","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"publisher-specific-oa","license_id":"https://openalex.org/licenses/publisher-specific-oa","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"}],"best_oa_location":{"id":"doi:10.1145/3779212.3790188","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3779212.3790188","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"AI":[0,120,129],"applications":[1],"increasingly":[2],"run":[3],"on":[4],"fast-evolving,":[5],"heterogeneous":[6],"hardware":[7,57,175],"to":[8,74,99,109,117],"maximize":[9],"performance,":[10],"but":[11,29],"general-purpose":[12],"libraries":[13],"lag":[14],"in":[15,125],"supporting":[16],"these":[17],"features.":[18],"Performance-minded":[19],"programmers":[20],"often":[21],"build":[22],"custom":[23],"communication":[24,44,77,113,145],"stacks":[25],"that":[26,54,166],"are":[27,169],"fast":[28],"error-prone":[30],"and":[31,65,79,114,135,154],"non-portable.":[32],"This":[33],"paper":[34],"introduces":[35],"MSCCL++,":[36],"a":[37,49,68,81],"design":[38],"methodology":[39],"for":[40,71,111,119,173],"developing":[41],"high-performance,":[42],"portable":[43],"kernels.":[45],"It":[46],"provides":[47],"(1)":[48],"low-level,":[50],"performance-preserving":[51],"primitive":[52],"interface":[53],"exposes":[55],"minimal":[56,96],"abstractions":[58,168],"while":[59],"hiding":[60],"the":[61,87,142],"complexities":[62],"of":[63,83,106,127,161,182],"synchronization":[64],"consistency,":[66],"(2)":[67],"higher-level":[69],"DSL":[70],"application":[72],"developers":[73],"implement":[75],"workload-specific":[76],"algorithms,":[78],"(3)":[80],"library":[82,146],"efficient":[84],"algorithms":[85],"implementing":[86],"standard":[88],"collective":[89,112,144],"API,":[90],"enabling":[91,171],"adoption":[92],"by":[93,132,140,148],"users":[94],"with":[95,163],"expertise.":[97],"Compared":[98],"state-of-the-art":[100],"baselines,":[101],"MSCCL++":[102,123,150,164],"achieves":[103],"geomean":[104],"speedups":[105],"1.7\u00d7":[107],"(up":[108,116],"5.4\u00d7)":[110],"1.2\u00d7":[115],"1.38\u00d7)":[118],"inference":[121],"workloads.":[122],"is":[124,151],"production":[126],"multiple":[128],"services":[130],"provided":[131],"Microsoft":[133],"Azure,":[134],"has":[136],"also":[137],"been":[138],"adopted":[139],"RCCL,":[141],"GPU":[143],"maintained":[147],"AMD.":[149],"open":[152],"source":[153],"available":[155],"at":[156],"https://github.com/microsoft/mscclpp.":[157],"Our":[158],"two":[159],"years":[160],"experience":[162],"suggests":[165],"its":[167],"robust,":[170],"support":[172],"new":[174],"features,":[176],"such":[177],"as":[178],"multimem,":[179],"within":[180],"weeks":[181],"development.":[183]},"counts_by_year":[],"updated_date":"2026-05-05T08:41:31.759640","created_date":"2026-01-29T00:00:00"}
