{"id":"https://openalex.org/W7138960589","doi":"https://doi.org/10.1109/euc66494.2025.00010","title":"Adaptive Model Partitioning for Distributed LLM Inference Across Heterogeneous Devices","display_name":"Adaptive Model Partitioning for Distributed LLM Inference Across Heterogeneous Devices","publication_year":2025,"publication_date":"2025-11-14","ids":{"openalex":"https://openalex.org/W7138960589","doi":"https://doi.org/10.1109/euc66494.2025.00010"},"language":null,"primary_location":{"id":"doi:10.1109/euc66494.2025.00010","is_oa":false,"landing_page_url":"https://doi.org/10.1109/euc66494.2025.00010","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 23rd International Conference on Embedded and Ubiquitous Computing (EUC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5130041499","display_name":"Junda Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Junda Wang","raw_affiliation_strings":["College of Information Science and Electronic Engineering, Zhejiang University,Hangzhou,China,310007"],"affiliations":[{"raw_affiliation_string":"College of Information Science and Electronic Engineering, Zhejiang University,Hangzhou,China,310007","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130013363","display_name":"Zhaoyang Li","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhaoyang Li","raw_affiliation_strings":["College of Information Science and Electronic Engineering, Zhejiang University,Hangzhou,China,310007"],"affiliations":[{"raw_affiliation_string":"College of Information Science and Electronic Engineering, Zhejiang University,Hangzhou,China,310007","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Qianqian Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qianqian Yang","raw_affiliation_strings":["College of Information Science and Electronic Engineering, Zhejiang University,Hangzhou,China,310007"],"affiliations":[{"raw_affiliation_string":"College of Information Science and Electronic Engineering, Zhejiang University,Hangzhou,China,310007","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129795346","display_name":"Jing Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jing Li","raw_affiliation_strings":["National Key Laboratory of Complex System Control and Intelligent Agent Cooperation,Beijing,China,100074"],"affiliations":[{"raw_affiliation_string":"National Key Laboratory of Complex System Control and Intelligent Agent Cooperation,Beijing,China,100074","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5130021050","display_name":"Xiaoqiang Zhao","orcid":null},"institutions":[{"id":"https://openalex.org/I4210120144","display_name":"Beijing Jingshida Electromechanical Equipment Research Institute","ror":"https://ror.org/02vx4zx98","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210120144"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiaoqiang Zhao","raw_affiliation_strings":["Beijing Electro-mechanical Engineering Institute,Beijing,China,102200"],"affiliations":[{"raw_affiliation_string":"Beijing Electro-mechanical Engineering Institute,Beijing,China,102200","institution_ids":["https://openalex.org/I4210120144"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5130018492","display_name":"Wenbo Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wenbo Zhang","raw_affiliation_strings":["National Key Laboratory of Complex System Control and Intelligent Agent Cooperation,Beijing,China,100074"],"affiliations":[{"raw_affiliation_string":"National Key Laboratory of Complex System Control and Intelligent Agent Cooperation,Beijing,China,100074","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5130041499"],"corresponding_institution_ids":["https://openalex.org/I76130692"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.77886396,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"11","last_page":"16"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.2996000051498413,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.2996000051498413,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.17489999532699585,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.0908999964594841,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.8069999814033508},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.6963000297546387},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.5647000074386597},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.548799991607666},{"id":"https://openalex.org/keywords/computational-complexity-theory","display_name":"Computational complexity theory","score":0.39410001039505005},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.3483999967575073},{"id":"https://openalex.org/keywords/predictive-modelling","display_name":"Predictive modelling","score":0.29440000653266907}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.819100022315979},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.8069999814033508},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.6963000297546387},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5647000074386597},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.548799991607666},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.41370001435279846},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.4081000089645386},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.39410001039505005},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.37130001187324524},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.3483999967575073},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.33059999346733093},{"id":"https://openalex.org/C45804977","wikidata":"https://www.wikidata.org/wiki/Q7239673","display_name":"Predictive modelling","level":2,"score":0.29440000653266907},{"id":"https://openalex.org/C2989134064","wikidata":"https://www.wikidata.org/wiki/Q288510","display_name":"Execution time","level":2,"score":0.27709999680519104},{"id":"https://openalex.org/C66024118","wikidata":"https://www.wikidata.org/wiki/Q1122506","display_name":"Computational model","level":2,"score":0.2669999897480011},{"id":"https://openalex.org/C60908668","wikidata":"https://www.wikidata.org/wiki/Q690207","display_name":"Perceptron","level":3,"score":0.26600000262260437},{"id":"https://openalex.org/C3017489831","wikidata":"https://www.wikidata.org/wiki/Q2393193","display_name":"Running time","level":2,"score":0.26570001244544983},{"id":"https://openalex.org/C83546350","wikidata":"https://www.wikidata.org/wiki/Q1139051","display_name":"Regression","level":2,"score":0.26170000433921814},{"id":"https://openalex.org/C46637626","wikidata":"https://www.wikidata.org/wiki/Q6693015","display_name":"Low latency (capital markets)","level":2,"score":0.26109999418258667},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2606000006198883},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.25600001215934753}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/euc66494.2025.00010","is_oa":false,"landing_page_url":"https://doi.org/10.1109/euc66494.2025.00010","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 23rd International Conference on Embedded and Ubiquitous Computing (EUC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":9,"referenced_works":["https://openalex.org/W2969388332","https://openalex.org/W2991040477","https://openalex.org/W4379033976","https://openalex.org/W4387088914","https://openalex.org/W4391136507","https://openalex.org/W4396505967","https://openalex.org/W4405934565","https://openalex.org/W4410427444","https://openalex.org/W4413640445"],"related_works":[],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2,38],"(LLMs)":[3],"deliver":[4],"strong":[5],"performance":[6],"across":[7],"diverse":[8],"tasks":[9],"but":[10],"their":[11],"growing":[12],"computational":[13,92],"demands":[14],"make":[15],"single-device":[16],"inference":[17,21,70,159],"increasingly":[18],"infeasible.":[19],"Distributed":[20],"on":[22,35,126],"heterogeneous":[23,73],"accelerators":[24],"offers":[25],"a":[26,114],"scalable":[27],"solution,":[28],"yet":[29],"existing":[30],"partitioning":[31,112],"strategies":[32],"often":[33],"depend":[34],"oversimplified":[36],"latency":[37,62,106,160],"that":[39,58,129],"fail":[40],"to":[41,67,101,123,147,164],"capture":[42],"runtime":[43],"variability":[44],"caused":[45],"by":[46],"device":[47,87],"diversity":[48],"and":[49,63,91,140],"fluctuating":[50],"input":[51],"lengths.":[52],"We":[53],"present":[54],"an":[55],"adaptive":[56],"framework":[57],"jointly":[59],"predicts":[60],"submodel":[61,85],"optimally":[64],"partitions":[65],"LLMs":[66],"minimize":[68],"end-to-end":[69],"time":[71],"in":[72],"clusters.":[74],"A":[75],"multilayer":[76],"perceptron":[77],"(MLP)":[78],"regression":[79],"model":[80,111],"integrates":[81],"multidimensional":[82],"features\u2014such":[83],"as":[84,113],"masks,":[86],"characteristics,":[88],"token":[89],"counts,":[90],"demand\u2014and":[93],"employs":[94],"lightweight":[95],"adapter":[96],"layers":[97],"for":[98],"rapid":[99],"fine-tuning":[100,152],"unseen":[102],"environments.":[103],"Using":[104],"these":[105],"predictions,":[107],"we":[108],"formulate":[109],"pipeline-aware":[110],"dynamic":[115],"programming":[116],"problem,":[117],"reducing":[118],"search":[119],"complexity":[120],"from":[121,145,161],"exponential":[122],"polynomial.":[124],"Experiments":[125],"GPT2":[127],"show":[128],"our":[130],"approach":[131],"achieves":[132],"high":[133],"prediction":[134],"accuracy":[135,144],"(91.8%":[136],"within":[137],"20%":[138],"error)":[139],"improves":[141],"adaptability,":[142],"raising":[143],"20.3%":[146],"76.0%":[148],"with":[149],"only":[150],"50":[151],"samples.":[153],"Overall,":[154],"the":[155],"proposed":[156],"method":[157],"reduces":[158],"234.70":[162],"ms":[163],"130.45":[165],"ms,":[166],"significantly":[167],"outperforming":[168],"baseline":[169],"loadbalancing":[170],"methods.":[171]},"counts_by_year":[],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2026-03-20T00:00:00"}
