{"id":"https://openalex.org/W7160657638","doi":"https://doi.org/10.48550/arxiv.2605.06206","title":"Federation of Experts: Communication Efficient Distributed Inference for Large Language Models","display_name":"Federation of Experts: Communication Efficient Distributed Inference for Large Language Models","publication_year":2026,"publication_date":"2026-05-07","ids":{"openalex":"https://openalex.org/W7160657638","doi":"https://doi.org/10.48550/arxiv.2605.06206"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.06206","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.06206","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.06206","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5011298605","display_name":"Muhammad Shahir Rahman","orcid":"https://orcid.org/0009-0006-5723-9997"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Abdurrahman, Muhammad Shahir","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135716609","display_name":"Chun Deng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Deng, Chun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135639090","display_name":"Azalia Mirhoseini","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mirhoseini, Azalia","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5108245984","display_name":"Philip Levis","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Levis, Philip","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.10379999876022339,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.10379999876022339,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11704","display_name":"Mobile Crowdsensing and Crowdsourcing","score":0.08749999850988388,"subfield":{"id":"https://openalex.org/subfields/1706","display_name":"Computer Science Applications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.05860000103712082,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.602400004863739},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.5756999850273132},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.5601000189781189},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.41780000925064087},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.3458000123500824},{"id":"https://openalex.org/keywords/mixture-model","display_name":"Mixture model","score":0.32179999351501465},{"id":"https://openalex.org/keywords/communication-in-small-groups","display_name":"Communication in small groups","score":0.3183000087738037}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7412999868392944},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.602400004863739},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.5756999850273132},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5601000189781189},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43860000371932983},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.41780000925064087},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.3458000123500824},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.335999995470047},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.32690000534057617},{"id":"https://openalex.org/C61224824","wikidata":"https://www.wikidata.org/wiki/Q2260434","display_name":"Mixture model","level":2,"score":0.32179999351501465},{"id":"https://openalex.org/C44871818","wikidata":"https://www.wikidata.org/wiki/Q5154139","display_name":"Communication in small groups","level":2,"score":0.3183000087738037},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.29980000853538513},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.2953999936580658},{"id":"https://openalex.org/C192126672","wikidata":"https://www.wikidata.org/wiki/Q1068715","display_name":"Telecommunications network","level":2,"score":0.2793999910354614},{"id":"https://openalex.org/C2779227376","wikidata":"https://www.wikidata.org/wiki/Q6505497","display_name":"Layer (electronics)","level":2,"score":0.27149999141693115},{"id":"https://openalex.org/C158156997","wikidata":"https://www.wikidata.org/wiki/Q1416645","display_name":"Models of communication","level":2,"score":0.26750001311302185},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.2660999894142151},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.26570001244544983},{"id":"https://openalex.org/C101765175","wikidata":"https://www.wikidata.org/wiki/Q577764","display_name":"Communications system","level":2,"score":0.2630999982357025}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.06206","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.06206","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.06206","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.06206","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Mixture":[0],"of":[1,35,44,59,129,175,178],"experts":[2,25,101,176],"has":[3],"emerged":[4],"as":[5,99],"the":[6,32,41,60,76,86,108,119,149,179],"primary":[7],"mechanism":[8],"for":[9,56,85],"making":[10],"Large":[11],"Language":[12],"Models":[13],"(LLMs)":[14],"computationally":[15],"efficient.":[16],"However,":[17],"in":[18,142],"distributed":[19],"settings,":[20,113,147],"communicating":[21],"token":[22],"embeddings":[23],"between":[24,68],"is":[26,54,66],"a":[27,45,73,91,103,173],"significant":[28],"bottleneck.":[29],"We":[30],"present":[31],"novel":[33],"Federation":[34],"Experts":[36],"(FoE)":[37],"architecture.":[38],"FoE":[39,94,114,130,135],"restructures":[40],"MoE":[42,50,88],"block":[43],"transformer":[46],"layer":[47],"into":[48],"multiple":[49],"clusters.":[51],"Each":[52],"cluster":[53],"responsible":[55],"only":[57],"one":[58],"KV":[61],"heads":[62],"and":[63,83,140,145,160,182],"expert":[64],"parallelism":[65],"applied":[67],"those":[69],"experts.":[70],"Between":[71],"clusters,":[72],"sum":[74],"synchronizes":[75],"post-attention":[77],"residuals,":[78],"which":[79],"then":[80],"drives":[81],"routing":[82],"dispatch":[84],"next":[87],"block.":[89],"In":[90,111],"single-node":[92,144],"setting,":[93],"completely":[95],"eliminates":[96],"all-to-all":[97,116],"communication":[98,117,125],"all":[100],"within":[102],"group":[104],"are":[105],"contained":[106],"on":[107,133],"same":[109,180],"GPU.":[110],"multi-node":[112,146],"confines":[115],"to":[118,155,172],"intra-node":[120],"fabric,":[121],"thus":[122],"significantly":[123,136],"reducing":[124,148],"overhead.":[126],"An":[127],"implementation":[128],"finds":[131],"that":[132],"LongBench,":[134],"improves":[137],"inference":[138],"throughput":[139],"latency":[141,152],"both":[143],"end-to-end":[150],"forward-pass":[151],"by":[153,158,162],"up":[154],"5.2x,":[156],"TTFT":[157],"3.62x,":[159],"TBT":[161],"1.95x.":[163],"It":[164],"does":[165],"so":[166],"while":[167],"achieving":[168],"comparable":[169],"generation":[170],"quality":[171],"mixture":[174],"model":[177],"size":[181],"training":[183],"configuration.":[184]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-09T00:00:00"}
