{"id":"https://openalex.org/W4400277194","doi":"https://doi.org/10.1109/wcnc57260.2024.10571127","title":"Edge Intelligence Optimization for Large Language Model Inference with Batching and Quantization","display_name":"Edge Intelligence Optimization for Large Language Model Inference with Batching and Quantization","publication_year":2024,"publication_date":"2024-04-21","ids":{"openalex":"https://openalex.org/W4400277194","doi":"https://doi.org/10.1109/wcnc57260.2024.10571127"},"language":"en","primary_location":{"id":"doi:10.1109/wcnc57260.2024.10571127","is_oa":false,"landing_page_url":"https://doi.org/10.1109/wcnc57260.2024.10571127","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE Wireless Communications and Networking Conference (WCNC)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100400250","display_name":"Xinyuan Zhang","orcid":"https://orcid.org/0000-0003-2141-431X"},"institutions":[{"id":"https://openalex.org/I4392021250","display_name":"State Key Laboratory of Networking and Switching Technology","ror":"https://ror.org/00qtv5q45","country_code":null,"type":"facility","lineage":["https://openalex.org/I139759216","https://openalex.org/I4392021250"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinyuan Zhang","raw_affiliation_strings":["BUPT,State Key Laboratory of Networking and Switching Technology,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"BUPT,State Key Laboratory of Networking and Switching Technology,China","institution_ids":["https://openalex.org/I4392021250"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100325222","display_name":"Jiang Liu","orcid":"https://orcid.org/0000-0002-0729-1299"},"institutions":[{"id":"https://openalex.org/I4392021250","display_name":"State Key Laboratory of Networking and Switching Technology","ror":"https://ror.org/00qtv5q45","country_code":null,"type":"facility","lineage":["https://openalex.org/I139759216","https://openalex.org/I4392021250"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiang Liu","raw_affiliation_strings":["BUPT,State Key Laboratory of Networking and Switching Technology,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"BUPT,State Key Laboratory of Networking and Switching Technology,China","institution_ids":["https://openalex.org/I4392021250"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5005327587","display_name":"Zehui Xiong","orcid":"https://orcid.org/0000-0002-4440-941X"},"institutions":[{"id":"https://openalex.org/I152815399","display_name":"Singapore University of Technology and Design","ror":"https://ror.org/05j6fvn87","country_code":"SG","type":"education","lineage":["https://openalex.org/I152815399"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Zehui Xiong","raw_affiliation_strings":["Information Systems Technology and Design Pillar, SUTD,Singapore"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Information Systems Technology and Design Pillar, SUTD,Singapore","institution_ids":["https://openalex.org/I152815399"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100768893","display_name":"Yudong Huang","orcid":"https://orcid.org/0000-0002-9998-557X"},"institutions":[{"id":"https://openalex.org/I4392021250","display_name":"State Key Laboratory of Networking and Switching Technology","ror":"https://ror.org/00qtv5q45","country_code":null,"type":"facility","lineage":["https://openalex.org/I139759216","https://openalex.org/I4392021250"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yudong Huang","raw_affiliation_strings":["BUPT,State Key Laboratory of Networking and Switching Technology,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"BUPT,State Key Laboratory of Networking and Switching Technology,China","institution_ids":["https://openalex.org/I4392021250"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110842613","display_name":"Gaochang Xie","orcid":"https://orcid.org/0000-0002-1561-3577"},"institutions":[{"id":"https://openalex.org/I4392021250","display_name":"State Key Laboratory of Networking and Switching Technology","ror":"https://ror.org/00qtv5q45","country_code":null,"type":"facility","lineage":["https://openalex.org/I139759216","https://openalex.org/I4392021250"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Gaochang Xie","raw_affiliation_strings":["BUPT,State Key Laboratory of Networking and Switching Technology,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"BUPT,State Key Laboratory of Networking and Switching Technology,China","institution_ids":["https://openalex.org/I4392021250"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100439282","display_name":"Ran Zhang","orcid":"https://orcid.org/0000-0001-7666-0599"},"institutions":[{"id":"https://openalex.org/I4392021250","display_name":"State Key Laboratory of Networking and Switching Technology","ror":"https://ror.org/00qtv5q45","country_code":null,"type":"facility","lineage":["https://openalex.org/I139759216","https://openalex.org/I4392021250"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ran Zhang","raw_affiliation_strings":["BUPT,State Key Laboratory of Networking and Switching Technology,China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"BUPT,State Key Laboratory of Networking and Switching Technology,China","institution_ids":["https://openalex.org/I4392021250"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":3.0547,"has_fulltext":false,"cited_by_count":10,"citation_normalized_percentile":{"value":0.92322527,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":90,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9873999953269958,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9873999953269958,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9775000214576721,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12702","display_name":"Brain Tumor Detection and Classification","score":0.9603999853134155,"subfield":{"id":"https://openalex.org/subfields/2808","display_name":"Neurology"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7146068215370178},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6807326674461365},{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.5556495189666748},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5246610641479492},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.46817702054977417},{"id":"https://openalex.org/keywords/enhanced-data-rates-for-gsm-evolution","display_name":"Enhanced Data Rates for GSM Evolution","score":0.46164196729660034},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.3693488538265228},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.22228789329528809}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7146068215370178},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6807326674461365},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.5556495189666748},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5246610641479492},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.46817702054977417},{"id":"https://openalex.org/C162307627","wikidata":"https://www.wikidata.org/wiki/Q204833","display_name":"Enhanced Data Rates for GSM Evolution","level":2,"score":0.46164196729660034},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3693488538265228},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.22228789329528809}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/wcnc57260.2024.10571127","is_oa":false,"landing_page_url":"https://doi.org/10.1109/wcnc57260.2024.10571127","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE Wireless Communications and Networking Conference (WCNC)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G4675584026","display_name":null,"funder_award_id":"62171064","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":14,"referenced_works":["https://openalex.org/W2950865323","https://openalex.org/W2980856918","https://openalex.org/W3207223387","https://openalex.org/W3214374352","https://openalex.org/W4210457632","https://openalex.org/W4285257856","https://openalex.org/W4320005538","https://openalex.org/W4362515116","https://openalex.org/W4380319827","https://openalex.org/W4385245566","https://openalex.org/W4390828922","https://openalex.org/W6851775633","https://openalex.org/W6954724923","https://openalex.org/W7000926438"],"related_works":["https://openalex.org/W2055243143","https://openalex.org/W4321636575","https://openalex.org/W1986418932","https://openalex.org/W2357796999","https://openalex.org/W2169518243","https://openalex.org/W2045526782","https://openalex.org/W2741131631","https://openalex.org/W2979160909","https://openalex.org/W2156919374","https://openalex.org/W2114837856"],"abstract_inverted_index":{"Generative":[0],"Artificial":[1],"Intelligence":[2],"(GAI)":[3],"is":[4],"taking":[5],"the":[6,22,28,86,118,121,146,225],"world":[7],"by":[8,58,220],"storm":[9],"with":[10,117,186],"its":[11],"unparalleled":[12],"content":[13],"creation":[14],"ability.":[15],"Large":[16],"Language":[17],"Models":[18],"(LLMs)":[19],"are":[20],"at":[21],"forefront":[23],"of":[24,32,89,120,155,170],"this":[25,103,176],"movement.":[26],"However,":[27],"significant":[29],"resource":[30,164],"demands":[31],"LLMs":[33],"often":[34],"require":[35],"cloud":[36],"hosting,":[37],"which":[38],"raises":[39],"issues":[40],"regarding":[41],"privacy,":[42],"latency,":[43],"and":[44,79,99,124,152,157,166,172,212,215],"usage":[45],"limitations.":[46],"Although":[47],"edge":[48,65,108,129,163],"intelligence":[49,109],"has":[50,73,80],"long":[51],"been":[52],"utilized":[53],"to":[54,68,144,224],"solve":[55],"these":[56],"challenges":[57],"enabling":[59],"real-time":[60],"AI":[61,77],"computation":[62,158],"on":[63,75,127],"ubiquitous":[64],"resources":[66],"close":[67],"data":[69],"sources,":[70],"most":[71],"research":[72],"focused":[74],"traditional":[76],"models":[78],"left":[81],"a":[82,193],"gap":[83],"in":[84,206],"addressing":[85],"unique":[87],"characteristics":[88],"LLM":[90,114],"inference,":[91],"such":[92],"as":[93],"considerable":[94],"model":[95,125,135],"size,":[96],"auto-regressive":[97],"processes,":[98],"self-attention":[100],"mechanisms.":[101],"In":[102],"paper,":[104],"we":[105,131,179],"present":[106],"an":[107,133,181],"optimization":[110],"problem":[111],"tailored":[112],"for":[113,136],"inference.":[115],"Specifically,":[116],"deployment":[119],"batching":[122,204],"technique":[123],"quantization":[126,213],"resource-limited":[128],"devices,":[130],"formulate":[132],"inference":[134,147],"transformer":[137],"decoder-based":[138],"LLMs.":[139],"Furthermore,":[140],"our":[141],"approach":[142],"aims":[143],"maximize":[145],"throughput":[148,207],"via":[149],"batch":[150],"scheduling":[151],"joint":[153],"allocation":[154],"communication":[156],"resources,":[159],"while":[160],"also":[161],"considering":[162],"constraints":[165],"varying":[167],"user":[168,210],"requirements":[169],"latency":[171],"accuracy.":[173],"To":[174],"address":[175],"NP-hard":[177],"problem,":[178],"develop":[180],"optimal":[182],"Depth-First":[183],"Tree-Searching":[184],"algorithm":[185],"online":[187],"tree-Pruning":[188],"(DFTSP)":[189],"that":[190,200],"operates":[191],"within":[192],"feasible":[194],"time":[195,218],"complexity.":[196],"Simulation":[197],"results":[198],"indicate":[199],"DFTSP":[201],"surpasses":[202],"other":[203],"benchmarks":[205],"across":[208],"diverse":[209],"settings":[211],"techniques,":[214],"it":[216],"reduces":[217],"complexity":[219],"over":[221],"45%":[222],"compared":[223],"brute-force":[226],"searching":[227],"method.":[228]},"counts_by_year":[{"year":2026,"cited_by_count":3},{"year":2025,"cited_by_count":6},{"year":2024,"cited_by_count":1}],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
