{"id":"https://openalex.org/W7138092506","doi":"https://doi.org/10.1609/aaai.v40i38.40502","title":"SlimInfer: Accelerating Long-Context LLM Inference via Dynamic Token Pruning","display_name":"SlimInfer: Accelerating Long-Context LLM Inference via Dynamic Token Pruning","publication_year":2026,"publication_date":"2026-03-14","ids":{"openalex":"https://openalex.org/W7138092506","doi":"https://doi.org/10.1609/aaai.v40i38.40502"},"language":null,"primary_location":{"id":"doi:10.1609/aaai.v40i38.40502","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i38.40502","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/40502/44463","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://ojs.aaai.org/index.php/AAAI/article/download/40502/44463","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101291748","display_name":"Lingkun Long","orcid":null},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Lingkun Long","raw_affiliation_strings":["Beihang University"],"affiliations":[{"raw_affiliation_string":"Beihang University","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129681945","display_name":"Rubing Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Rubing Yang","raw_affiliation_strings":["Beihang University"],"affiliations":[{"raw_affiliation_string":"Beihang University","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129742168","display_name":"Yushi Huang","orcid":null},"institutions":[{"id":"https://openalex.org/I200769079","display_name":"Hong Kong University of Science and Technology","ror":"https://ror.org/00q4vv597","country_code":"HK","type":"education","lineage":["https://openalex.org/I200769079"]}],"countries":["HK"],"is_corresponding":false,"raw_author_name":"Yushi Huang","raw_affiliation_strings":["Hong Kong University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Hong Kong University of Science and Technology","institution_ids":["https://openalex.org/I200769079"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129657701","display_name":"Desheng Hui","orcid":null},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Desheng Hui","raw_affiliation_strings":["Beihang University"],"affiliations":[{"raw_affiliation_string":"Beihang University","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5129648586","display_name":"Ao Zhou","orcid":null},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ao Zhou","raw_affiliation_strings":["Beihang University"],"affiliations":[{"raw_affiliation_string":"Beihang University","institution_ids":["https://openalex.org/I82880672"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5129741596","display_name":"Jianlei Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I82880672","display_name":"Beihang University","ror":"https://ror.org/00wk2mp56","country_code":"CN","type":"education","lineage":["https://openalex.org/I82880672"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianlei Yang","raw_affiliation_strings":["Beihang University"],"affiliations":[{"raw_affiliation_string":"Beihang University","institution_ids":["https://openalex.org/I82880672"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5101291748"],"corresponding_institution_ids":["https://openalex.org/I82880672"],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.45251397,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"40","issue":"38","first_page":"32284","last_page":"32292"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.13269999623298645,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.13269999623298645,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13702","display_name":"Machine Learning in Healthcare","score":0.1273999959230423,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.07050000131130219,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/security-token","display_name":"Security token","score":0.7390000224113464},{"id":"https://openalex.org/keywords/pruning","display_name":"Pruning","score":0.710099995136261},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.692799985408783},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.6643000245094299},{"id":"https://openalex.org/keywords/asynchronous-communication","display_name":"Asynchronous communication","score":0.5694000124931335},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.5321999788284302},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.5101000070571899},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5019999742507935}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8736000061035156},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.7390000224113464},{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.710099995136261},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.692799985408783},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.6643000245094299},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.5694000124931335},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5321999788284302},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5101000070571899},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5019999742507935},{"id":"https://openalex.org/C111335779","wikidata":"https://www.wikidata.org/wiki/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.42320001125335693},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.4196999967098236},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.38429999351501465},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.3587999939918518},{"id":"https://openalex.org/C188198153","wikidata":"https://www.wikidata.org/wiki/Q1613840","display_name":"Limiting","level":2,"score":0.35040000081062317},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.33970001339912415},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.33820000290870667},{"id":"https://openalex.org/C48103436","wikidata":"https://www.wikidata.org/wiki/Q599031","display_name":"State (computer science)","level":2,"score":0.3199999928474426},{"id":"https://openalex.org/C23224414","wikidata":"https://www.wikidata.org/wiki/Q176769","display_name":"Hidden Markov model","level":2,"score":0.31360000371932983},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.30309998989105225},{"id":"https://openalex.org/C46637626","wikidata":"https://www.wikidata.org/wiki/Q6693015","display_name":"Low latency (capital markets)","level":2,"score":0.301800012588501},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.2874999940395355},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.27619999647140503},{"id":"https://openalex.org/C115067241","wikidata":"https://www.wikidata.org/wiki/Q1639854","display_name":"Token passing","level":3,"score":0.26840001344680786}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1609/aaai.v40i38.40502","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i38.40502","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/40502/44463","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1609/aaai.v40i38.40502","is_oa":true,"landing_page_url":"https://doi.org/10.1609/aaai.v40i38.40502","pdf_url":"https://ojs.aaai.org/index.php/AAAI/article/download/40502/44463","source":{"id":"https://openalex.org/S4210191458","display_name":"Proceedings of the AAAI Conference on Artificial Intelligence","issn_l":"2159-5399","issn":["2159-5399","2374-3468"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/P4310320058","host_organization_name":"Association for the Advancement of Artificial Intelligence","host_organization_lineage":["https://openalex.org/P4310320058"],"host_organization_lineage_names":["Association for the Advancement of Artificial Intelligence"],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the AAAI Conference on Artificial Intelligence","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W7138092506.pdf","grobid_xml":"https://content.openalex.org/works/W7138092506.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Long-context":[0],"inference":[1,49],"for":[2,172],"Large":[3],"Language":[4],"Models":[5],"(LLMs)":[6],"is":[7,64],"heavily":[8],"limited":[9],"by":[10,50,109],"high":[11],"computational":[12],"demands.":[13],"While":[14],"several":[15],"existing":[16],"methods":[17],"optimize":[18],"attention":[19],"computation,":[20],"they":[21],"still":[22],"process":[23,86],"the":[24,58,81],"full":[25],"set":[26],"of":[27,123],"hidden":[28,106,124],"states":[29],"at":[30,126],"each":[31],"layer,":[32],"limiting":[33],"overall":[34],"efficiency.":[35],"In":[36],"this":[37],"work,":[38],"we":[39],"propose":[40],"SlimInfer,":[41],"an":[42,65,134],"innovative":[43],"framework":[44],"that":[45,88,118,139,157],"aims":[46],"to":[47,162],"accelerate":[48],"directly":[51],"pruning":[52,116,131],"less":[53],"critical":[54,72,101],"prompt":[55],"tokens":[56,73,122],"during":[57],"forward":[59],"pass.":[60],"Our":[61],"key":[62],"insight":[63],"information":[66,70],"diffusion":[67,85],"phenomenon:":[68],"As":[69],"from":[71],"propagates":[74],"through":[75],"layers,":[76],"it":[77],"becomes":[78],"distributed":[79],"across":[80],"entire":[82],"sequence.":[83],"This":[84,129],"suggests":[87],"LLMs":[89],"can":[90,159],"maintain":[91],"their":[92],"semantic":[93],"integrity":[94],"when":[95],"excessive":[96],"tokens,":[97],"even":[98],"including":[99],"these":[100],"ones,":[102],"are":[103],"pruned":[104],"in":[105],"states.":[107],"Motivated":[108],"this,":[110],"SlimInfer":[111,158],"introduces":[112],"a":[113,175],"dynamic":[114],"fine-grained":[115],"mechanism":[117],"accurately":[119],"removes":[120],"redundant":[121],"state":[125],"intermediate":[127],"layers.":[128],"layer-wise":[130],"naturally":[132],"enables":[133],"asynchronous":[135],"KV":[136],"cache":[137],"manager":[138],"prefetches":[140],"required":[141],"token":[142],"blocks":[143],"without":[144,179],"complex":[145],"predictors,":[146],"reducing":[147],"both":[148],"memory":[149],"usage":[150],"and":[151,167],"I/O":[152],"costs.":[153],"Extensive":[154],"experiments":[155],"show":[156],"achieve":[160],"up":[161],"2.53\u00d7":[163],"time-to-first-token":[164],"(TTFT)":[165],"speedup":[166],"1.88\u00d7":[168],"end-to-end":[169],"latency":[170],"reduction":[171],"LLaMA3.1-8B-Instruct":[173],"on":[174,182],"single":[176],"RTX":[177],"4090,":[178],"sacrificing":[180],"performance":[181],"LongBench.":[183]},"counts_by_year":[],"updated_date":"2026-03-20T20:47:17.329874","created_date":"2026-03-18T00:00:00"}
