{"id":"https://openalex.org/W7135178564","doi":"https://doi.org/10.48550/arxiv.2603.11873","title":"AdaFuse: Accelerating Dynamic Adapter Inference via Token-Level Pre-Gating and Fused Kernel Optimization","display_name":"AdaFuse: Accelerating Dynamic Adapter Inference via Token-Level Pre-Gating and Fused Kernel Optimization","publication_year":2026,"publication_date":"2026-03-12","ids":{"openalex":"https://openalex.org/W7135178564","doi":"https://doi.org/10.48550/arxiv.2603.11873"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.11873","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.11873","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.11873","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5129008092","display_name":"Qiyang Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Qiyang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129041935","display_name":"Rui Kong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kong, Rui","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128968654","display_name":"Yuchen Li","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Yuchen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128994028","display_name":"Hengyi Cai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cai, Hengyi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5128996508","display_name":"Shuaiqiang Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Shuaiqiang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129032385","display_name":"Linghe Kong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kong, Linghe","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129096407","display_name":"Guihai Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Guihai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5128925592","display_name":"Dawei Yin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yin, Dawei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.27869999408721924,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.27869999408721924,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.08799999952316284,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.06880000233650208,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.6873999834060669},{"id":"https://openalex.org/keywords/adapter","display_name":"Adapter (computing)","score":0.6541000008583069},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6067000031471252},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.560699999332428},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.5408999919891357},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.5364999771118164},{"id":"https://openalex.org/keywords/bridging","display_name":"Bridging (networking)","score":0.4821999967098236},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.47999998927116394}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7990999817848206},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.6873999834060669},{"id":"https://openalex.org/C177284502","wikidata":"https://www.wikidata.org/wiki/Q1005390","display_name":"Adapter (computing)","level":2,"score":0.6541000008583069},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6067000031471252},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.560699999332428},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.5408999919891357},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.5364999771118164},{"id":"https://openalex.org/C174348530","wikidata":"https://www.wikidata.org/wiki/Q188635","display_name":"Bridging (networking)","level":2,"score":0.4821999967098236},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.47999998927116394},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.45809999108314514},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.4456000030040741},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.4171000123023987},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.40560001134872437},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.38609999418258667},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.3847000002861023},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.3522000014781952},{"id":"https://openalex.org/C46637626","wikidata":"https://www.wikidata.org/wiki/Q6693015","display_name":"Low latency (capital markets)","level":2,"score":0.3343000113964081},{"id":"https://openalex.org/C115874739","wikidata":"https://www.wikidata.org/wiki/Q825377","display_name":"Critical path method","level":2,"score":0.32829999923706055},{"id":"https://openalex.org/C49020025","wikidata":"https://www.wikidata.org/wiki/Q1059099","display_name":"Chaining","level":2,"score":0.314300000667572},{"id":"https://openalex.org/C24856439","wikidata":"https://www.wikidata.org/wiki/Q352483","display_name":"Adaptive routing","level":5,"score":0.3052000105381012},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.27959999442100525},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.27950000762939453},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.27549999952316284},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.2621000111103058},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.2563000023365021}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.11873","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.11873","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.11873","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.11873","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"The":[0],"integration":[1],"of":[2,178,217],"dynamic,":[3],"sparse":[4],"structures":[5],"like":[6],"Mixture-of-Experts":[7],"(MoE)":[8],"with":[9,205],"parameter-efficient":[10],"adapters":[11,182,208],"(e.g.,":[12],"LoRA)":[13],"is":[14,139],"a":[15,30,54,91,95,121,127,137,165,171,188,215],"powerful":[16],"technique":[17],"for":[18,80,132,150,156],"enhancing":[19],"Large":[20],"Language":[21],"Models":[22],"(LLMs).":[23],"However,":[24],"this":[25,86,162],"architectural":[26],"enhancement":[27],"comes":[28],"at":[29],"steep":[31],"cost:":[32],"despite":[33],"minimal":[34],"increases":[35],"in":[36,64,69,187],"computational":[37],"load,":[38],"the":[39,60,65,70,99,102,147,176,184,222],"inference":[40,228],"latency":[41,213],"often":[42],"skyrockets,":[43],"leading":[44],"to":[45,106],"decoding":[46,212],"speeds":[47],"slowing":[48],"by":[49,163,214],"over":[50,218],"2.5":[51],"times.":[52],"Through":[53],"fine-grained":[55],"performance":[56],"analysis,":[57],"we":[58,88],"pinpoint":[59],"primary":[61],"bottleneck":[62],"not":[63],"computation":[66],"itself,":[67],"but":[68],"severe":[71],"overhead":[72],"from":[73,113],"fragmented,":[74],"sequential":[75],"CUDA":[76,167],"kernel":[77,168],"launches":[78],"required":[79],"conventional":[81,114],"dynamic":[82,109,207],"routing.":[83],"To":[84],"address":[85],"challenge,":[87],"introduce":[89],"AdaFuse,":[90],"framework":[92],"built":[93],"on":[94,161,194,203],"tight":[96],"co-design":[97],"between":[98,224],"algorithm":[100],"and":[101,227],"underlying":[103],"hardware":[104],"system":[105],"enable":[107],"efficient":[108,190],"adapter":[110,134],"execution.":[111],"Departing":[112],"layer-wise":[115],"or":[116],"block-wise":[117],"routing,":[118],"AdaFuse":[119,200],"employs":[120],"token-level":[122],"pre-gating":[123],"strategy,":[124],"which":[125],"makes":[126],"single,":[128,189],"global":[129],"routing":[130],"decision":[131],"all":[133,179],"layers":[135],"before":[136],"token":[138],"processed.":[140],"This":[141],"\"decide-once,":[142],"apply-everywhere\"":[143],"approach":[144],"effectively":[145],"staticizes":[146],"execution":[148],"path":[149],"each":[151],"token,":[152],"creating":[153],"an":[154],"opportunity":[155],"holistic":[157],"optimization.":[158],"We":[159],"capitalize":[160],"developing":[164],"custom":[166],"that":[169,199],"performs":[170],"fused":[172],"switching":[173],"operation,":[174],"merging":[175],"parameters":[177],"selected":[180],"LoRA":[181],"into":[183],"backbone":[185],"model":[186,225],"pass.":[191],"Experimental":[192],"results":[193],"popular":[195],"open-source":[196],"LLMs":[197],"show":[198],"achieves":[201],"accuracy":[202],"par":[204],"state-of-the-art":[206],"while":[209],"drastically":[210],"cutting":[211],"factor":[216],"2.4x,":[219],"thereby":[220],"bridging":[221],"gap":[223],"capability":[226],"efficiency.":[229]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-03-14T00:00:00"}
