{"id":"https://openalex.org/W7084082117","doi":"https://doi.org/10.1109/infocom55648.2025.11044522","title":"SPIN: Accelerating Large Language Model Inference with Heterogeneous Speculative Models","display_name":"SPIN: Accelerating Large Language Model Inference with Heterogeneous Speculative Models","publication_year":2025,"publication_date":"2025-05-19","ids":{"openalex":"https://openalex.org/W7084082117","doi":"https://doi.org/10.1109/infocom55648.2025.11044522"},"language":"en","primary_location":{"id":"doi:10.1109/infocom55648.2025.11044522","is_oa":false,"landing_page_url":"https://doi.org/10.1109/infocom55648.2025.11044522","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE INFOCOM 2025 - IEEE Conference on Computer Communications","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Fahao Chen","orcid":null},"institutions":[{"id":"https://openalex.org/I141591182","display_name":"University of Aizu","ror":"https://ror.org/02pg0e883","country_code":"JP","type":"education","lineage":["https://openalex.org/I141591182"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Fahao Chen","raw_affiliation_strings":["School of Computer Science and Engineering, The University of Aizu,Japan"],"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, The University of Aizu,Japan","institution_ids":["https://openalex.org/I141591182"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Peng Li","orcid":null},"institutions":[{"id":"https://openalex.org/I87445476","display_name":"Xi'an Jiaotong University","ror":"https://ror.org/017zhmm22","country_code":"CN","type":"education","lineage":["https://openalex.org/I87445476"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Peng Li","raw_affiliation_strings":["School of Cyber Science and Engineering, Xi&#x0027;an Jiaotong University,China"],"affiliations":[{"raw_affiliation_string":"School of Cyber Science and Engineering, Xi&#x0027;an Jiaotong University,China","institution_ids":["https://openalex.org/I87445476"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Tom H. Luan","orcid":null},"institutions":[{"id":"https://openalex.org/I87445476","display_name":"Xi'an Jiaotong University","ror":"https://ror.org/017zhmm22","country_code":"CN","type":"education","lineage":["https://openalex.org/I87445476"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Tom H. Luan","raw_affiliation_strings":["School of Cyber Science and Engineering, Xi&#x0027;an Jiaotong University,China"],"affiliations":[{"raw_affiliation_string":"School of Cyber Science and Engineering, Xi&#x0027;an Jiaotong University,China","institution_ids":["https://openalex.org/I87445476"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Zhou Su","orcid":null},"institutions":[{"id":"https://openalex.org/I87445476","display_name":"Xi'an Jiaotong University","ror":"https://ror.org/017zhmm22","country_code":"CN","type":"education","lineage":["https://openalex.org/I87445476"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhou Su","raw_affiliation_strings":["School of Cyber Science and Engineering, Xi&#x0027;an Jiaotong University,China"],"affiliations":[{"raw_affiliation_string":"School of Cyber Science and Engineering, Xi&#x0027;an Jiaotong University,China","institution_ids":["https://openalex.org/I87445476"]}]},{"author_position":"last","author":{"id":null,"display_name":"Jing Deng","orcid":null},"institutions":[{"id":"https://openalex.org/I169335092","display_name":"University of North Carolina at Greensboro","ror":"https://ror.org/04fnxsj42","country_code":"US","type":"education","lineage":["https://openalex.org/I169335092"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jing Deng","raw_affiliation_strings":["University of North Carolina at Greensboro,Department of Computer Science,USA"],"affiliations":[{"raw_affiliation_string":"University of North Carolina at Greensboro,Department of Computer Science,USA","institution_ids":["https://openalex.org/I169335092"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I141591182"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.4830887,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"10"},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https://openalex.org/T13511","display_name":"Geodetic Measurements and Engineering Structures","score":0.22130000591278076,"subfield":{"id":"https://openalex.org/subfields/2205","display_name":"Civil and Structural Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13511","display_name":"Geodetic Measurements and Engineering Structures","score":0.22130000591278076,"subfield":{"id":"https://openalex.org/subfields/2205","display_name":"Civil and Structural Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14353","display_name":"Wireless Sensor Networks for Data Analysis","score":0.054499998688697815,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13312","display_name":"Mechanical and Thermal Properties Analysis","score":0.020400000736117363,"subfield":{"id":"https://openalex.org/subfields/2210","display_name":"Mechanical Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speculation","display_name":"Speculation","score":0.8838000297546387},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6804999709129333},{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.6323999762535095},{"id":"https://openalex.org/keywords/overhead","display_name":"Overhead (engineering)","score":0.5539000034332275},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.5314000248908997},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5231000185012817},{"id":"https://openalex.org/keywords/homogeneous","display_name":"Homogeneous","score":0.43810001015663147},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4081000089645386}],"concepts":[{"id":"https://openalex.org/C47941915","wikidata":"https://www.wikidata.org/wiki/Q107885","display_name":"Speculation","level":2,"score":0.8838000297546387},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8356999754905701},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6804999709129333},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.6323999762535095},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.5539000034332275},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.5314000248908997},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5231000185012817},{"id":"https://openalex.org/C66882249","wikidata":"https://www.wikidata.org/wiki/Q169336","display_name":"Homogeneous","level":2,"score":0.43810001015663147},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4081000089645386},{"id":"https://openalex.org/C141331961","wikidata":"https://www.wikidata.org/wiki/Q2164465","display_name":"Speculative execution","level":2,"score":0.38749998807907104},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.3833000063896179},{"id":"https://openalex.org/C124681953","wikidata":"https://www.wikidata.org/wiki/Q339062","display_name":"Decomposition","level":2,"score":0.3684999942779541},{"id":"https://openalex.org/C15296174","wikidata":"https://www.wikidata.org/wiki/Q7575343","display_name":"Speculative multithreading","level":4,"score":0.3580000102519989},{"id":"https://openalex.org/C28719098","wikidata":"https://www.wikidata.org/wiki/Q44946","display_name":"Point (geometry)","level":2,"score":0.3231000006198883},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.3068999946117401},{"id":"https://openalex.org/C44291984","wikidata":"https://www.wikidata.org/wiki/Q1074173","display_name":"Question answering","level":2,"score":0.2948000133037567},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.29109999537467957},{"id":"https://openalex.org/C2777472644","wikidata":"https://www.wikidata.org/wiki/Q16968992","display_name":"Approximate inference","level":3,"score":0.2808000147342682},{"id":"https://openalex.org/C125411270","wikidata":"https://www.wikidata.org/wiki/Q18653","display_name":"Encoding (memory)","level":2,"score":0.27959999442100525},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.27720001339912415},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.271699994802475},{"id":"https://openalex.org/C2776834041","wikidata":"https://www.wikidata.org/wiki/Q25346349","display_name":"Execution model","level":2,"score":0.26159998774528503},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.2572999894618988},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.2529999911785126},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.25270000100135803}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/infocom55648.2025.11044522","is_oa":false,"landing_page_url":"https://doi.org/10.1109/infocom55648.2025.11044522","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE INFOCOM 2025 - IEEE Conference on Computer Communications","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.4319012761116028,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Speculative":[0,20],"decoding":[1,47],"has":[2],"been":[3],"shown":[4],"as":[5],"an":[6,84],"effective":[7],"way":[8],"to":[9,23,95,135,155],"accelerate":[10],"Large":[11],"Language":[12],"Model":[13,21],"(LLM)":[14],"inference":[15,87],"by":[16,36,108,149],"using":[17,58,109],"a":[18,28,40,114,131,169],"Small":[19],"(SSM)":[22],"generate":[24],"candidate":[25],"tokens":[26],"in":[27,39],"so-called":[29],"speculation":[30,74,107,145],"phase,":[31],"which":[32],"are":[33],"subsequently":[34],"verified":[35],"the":[37],"LLM":[38,86,140],"verification":[41,76,147],"phase.":[42],"However,":[43],"current":[44],"state-of-the-art":[45,166],"speculative":[46,92],"approaches":[48],"have":[49],"three":[50,100],"key":[51],"limitations:":[52],"handling":[53],"requests":[54],"with":[55,113],"varying":[56],"difficulty":[57],"homogeneous":[59],"SSMs,":[60,112],"lack":[61],"of":[62,125,172],"robust":[63],"support":[64],"for":[65,72,117],"batch":[66],"processing,":[67],"and":[68,75,146],"insufficient":[69],"holistic":[70],"optimization":[71],"both":[73],"phases.":[77],"In":[78],"this":[79],"paper,":[80],"we":[81],"introduce":[82],"Spin,":[83],"efficient":[85],"serving":[88],"system":[89],"based":[90],"on":[91,153],"decoding,":[93],"designed":[94],"address":[96],"these":[97],"challenges":[98],"through":[99],"main":[101],"innovations.":[102],"First,":[103],"SPIN":[104,129,143,163],"improves":[105],"token":[106],"multiple":[110],"heterogeneous":[111],"learning-based":[115],"algorithm":[116],"SSM":[118],"selection":[119],"that":[120,162],"operates":[121],"without":[122],"prior":[123],"knowledge":[124],"request":[126,132],"difficulty.":[127],"Second,":[128],"employs":[130],"decomposition":[133],"method":[134],"minimize":[136],"batching":[137],"overhead":[138],"during":[139],"verification.":[141],"Finally,":[142],"orchestrates":[144],"phases":[148],"pipelining":[150],"their":[151],"executions":[152],"GPUs":[154],"achieve":[156],"further":[157],"acceleration.":[158],"Experimental":[159],"results":[160],"demonstrate":[161],"significantly":[164],"outperforms":[165],"methods,":[167],"achieving":[168],"performance":[170],"increase":[171],"approximately":[173],"2.28\u00d7.":[174]},"counts_by_year":[],"updated_date":"2025-12-28T23:10:05.387466","created_date":"2025-10-10T00:00:00"}
