{"id":"https://openalex.org/W7160642608","doi":"https://doi.org/10.48550/arxiv.2605.05628","title":"Towards Compute-Aware In-Switch Computing for LLMs Tensor-Parallelism on Multi-GPU Systems","display_name":"Towards Compute-Aware In-Switch Computing for LLMs Tensor-Parallelism on Multi-GPU Systems","publication_year":2026,"publication_date":"2026-05-07","ids":{"openalex":"https://openalex.org/W7160642608","doi":"https://doi.org/10.48550/arxiv.2605.05628"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.05628","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.05628","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.05628","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5135647065","display_name":"Chen Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Chen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135684844","display_name":"Qijun Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Qijun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128051850","display_name":"Zhuoshan Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Zhuoshan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080651207","display_name":"Yijia Diao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Diao, Yijia","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135682436","display_name":"Haibo Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Haibo","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135657289","display_name":"Zhe Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Zhe","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5049071763","display_name":"Zhipeng Tu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tu, Zhipeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128122866","display_name":"Zhiyao Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Zhiyao","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135698802","display_name":"Guangyu Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Guangyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135708978","display_name":"Zhuoran Song","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Song, Zhuoran","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135696977","display_name":"Zhigang Ji","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ji, Zhigang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003939279","display_name":"Jingwen Leng","orcid":"https://orcid.org/0000-0002-5660-5493"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Leng, Jingwen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5135688432","display_name":"Minyi Guo","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Minyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":13,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T12292","display_name":"Graph Theory and Algorithms","score":0.36090001463890076,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12292","display_name":"Graph Theory and Algorithms","score":0.36090001463890076,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.3375999927520752,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.04820000007748604,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/dataflow","display_name":"Dataflow","score":0.8029999732971191},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.6222000122070312},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.59579998254776},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.5171999931335449},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.47699999809265137},{"id":"https://openalex.org/keywords/semantics","display_name":"Semantics (computer science)","score":0.4242999851703644},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.3398999869823456}],"concepts":[{"id":"https://openalex.org/C96324660","wikidata":"https://www.wikidata.org/wiki/Q205446","display_name":"Dataflow","level":2,"score":0.8029999732971191},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7807000279426575},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.6222000122070312},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.59579998254776},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.5171999931335449},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.47699999809265137},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.45350000262260437},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.4242999851703644},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.37220001220703125},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.3398999869823456},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.33469998836517334},{"id":"https://openalex.org/C192126672","wikidata":"https://www.wikidata.org/wiki/Q1068715","display_name":"Telecommunications network","level":2,"score":0.3203999996185303},{"id":"https://openalex.org/C156325763","wikidata":"https://www.wikidata.org/wiki/Q1930895","display_name":"Operational semantics","level":3,"score":0.3059000074863434},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.28459998965263367},{"id":"https://openalex.org/C75684735","wikidata":"https://www.wikidata.org/wiki/Q858810","display_name":"Big data","level":2,"score":0.28349998593330383},{"id":"https://openalex.org/C48677424","wikidata":"https://www.wikidata.org/wiki/Q6888088","display_name":"Mode (computer interface)","level":2,"score":0.2685999870300293},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.2578999996185303},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.25699999928474426},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.25099998712539673},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.25}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.05628","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.05628","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.05628","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.05628","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Tensor":[0],"parallelism":[1],"(TP)":[2],"in":[3,63,69,171],"large-scale":[4],"LLM":[5,138],"inference":[6],"and":[7,44,59,66,103,154],"training":[8,147],"introduces":[9,37],"frequent":[10],"collective":[11,26],"operations":[12,27],"that":[13,85,141],"dominate":[14],"inter-GPU":[15],"communication.":[16],"While":[17],"in-switch":[18,109],"computing,":[19],"exemplified":[20],"by":[21,28],"NVLink":[22],"SHARP":[23],"(NVLS),":[24],"accelerates":[25],"reducing":[29],"redundant":[30],"data":[31],"transfer,":[32],"its":[33,41,169],"communication-centric":[34],"design":[35],"philosophy":[36],"the":[38,45,57,74,79,119,150,158],"mismatch":[39,55],"between":[40],"communication":[42,60,87],"mode":[43],"memory":[46,91],"semantic":[47],"requirement":[48],"of":[49,96],"LLM's":[50],"computation":[51],"kernel.":[52],"Such":[53],"a":[54,132],"isolates":[56],"compute":[58],"phases,":[61],"resulting":[62],"underutilized":[64],"resources":[65],"limited":[67],"overlap":[68,161],"multi-GPU":[70,175],"systems.":[71,176],"To":[72],"address":[73],"limitation,":[75],"we":[76],"propose":[77],"CAIS,":[78],"first":[80],"Compute-Aware":[81],"In-Switch":[82],"computing":[83],"framework":[84],"aligns":[86],"modes":[88],"with":[89],"computation's":[90],"semantics":[92],"requirement.":[93],"CAIS":[94,142],"consists":[95],"three":[97],"integral":[98],"techniques:":[99],"(1)":[100],"compute-aware":[101,108],"ISA":[102],"microarchitecture":[104],"extension":[105],"to":[106,117,130],"enable":[107],"computing.":[110],"(2)":[111],"merging-aware":[112],"TB":[113],"(Thread":[114],"Block)":[115],"coordination":[116],"improve":[118],"temporal":[120],"alignment":[121],"for":[122],"efficient":[123],"request":[124],"merging.":[125],"(3)":[126],"graph-level":[127],"dataflow":[128],"optimizer":[129],"achieve":[131],"tight":[133],"cross-kernel":[134],"overlap.":[135],"Evaluations":[136],"on":[137,174],"workloads":[139],"show":[140],"achieves":[143],"1.38$\\times$":[144],"average":[145],"end-to-end":[146],"speedup":[148],"over":[149,156],"SOTA":[151,159],"NVLS-enabled":[152],"solution,":[153],"1.61$\\times$":[155],"T3,":[157],"compute-communicate":[160],"solutions":[162],"but":[163],"do":[164],"not":[165],"leverage":[166],"NVLS,":[167],"demonstrating":[168],"effectiveness":[170],"accelerating":[172],"TP":[173]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-09T00:00:00"}
