{"id":"https://openalex.org/W7131855216","doi":"https://doi.org/10.1109/ton.2026.3669011","title":"Serving Long-Context LLMs at the Mobile Edge: Test-Time Reinforcement Learning-Based Model Caching and Inference Offloading","display_name":"Serving Long-Context LLMs at the Mobile Edge: Test-Time Reinforcement Learning-Based Model Caching and Inference Offloading","publication_year":2026,"publication_date":"2026-01-01","ids":{"openalex":"https://openalex.org/W7131855216","doi":"https://doi.org/10.1109/ton.2026.3669011"},"language":"en","primary_location":{"id":"doi:10.1109/ton.2026.3669011","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ton.2026.3669011","pdf_url":null,"source":{"id":"https://openalex.org/S5407042750","display_name":"IEEE Transactions on Networking","issn_l":"2998-4157","issn":["2998-4157"],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Networking","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5123045587","display_name":"Minrui Xu","orcid":null},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":true,"raw_author_name":"Minrui Xu","raw_affiliation_strings":["College of Computing and Data Science, Nanyang Technological University, Nanyang Avenue, Singapore"],"affiliations":[{"raw_affiliation_string":"College of Computing and Data Science, Nanyang Technological University, Nanyang Avenue, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5127174162","display_name":"Dusit Niyato","orcid":null},"institutions":[{"id":"https://openalex.org/I172675005","display_name":"Nanyang Technological University","ror":"https://ror.org/02e7b5302","country_code":"SG","type":"education","lineage":["https://openalex.org/I172675005"]}],"countries":["SG"],"is_corresponding":false,"raw_author_name":"Dusit Niyato","raw_affiliation_strings":["College of Computing and Data Science, Nanyang Technological University, Nanyang Avenue, Singapore"],"affiliations":[{"raw_affiliation_string":"College of Computing and Data Science, Nanyang Technological University, Nanyang Avenue, Singapore","institution_ids":["https://openalex.org/I172675005"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5127260516","display_name":"Christopher G. Brinton","orcid":null},"institutions":[{"id":"https://openalex.org/I219193219","display_name":"Purdue University West Lafayette","ror":"https://ror.org/02dqehb95","country_code":"US","type":"education","lineage":["https://openalex.org/I219193219"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Christopher G. Brinton","raw_affiliation_strings":["Elmore Family School of Electrical and Computer Engineering, Purdue University, West Lafayette, IN, USA"],"affiliations":[{"raw_affiliation_string":"Elmore Family School of Electrical and Computer Engineering, Purdue University, West Lafayette, IN, USA","institution_ids":["https://openalex.org/I219193219"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5123045587"],"corresponding_institution_ids":["https://openalex.org/I172675005"],"apc_list":null,"apc_paid":null,"fwci":46.6056,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.99606604,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":"34","issue":null,"first_page":"3808","last_page":"3823"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11478","display_name":"Caching and Content Delivery","score":0.3449999988079071,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11478","display_name":"Caching and Content Delivery","score":0.3449999988079071,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11896","display_name":"Opportunistic and Delay-Tolerant Networks","score":0.12290000170469284,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10444","display_name":"Context-Aware Activity Recognition Systems","score":0.07859999686479568,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computation-offloading","display_name":"Computation offloading","score":0.7116000056266785},{"id":"https://openalex.org/keywords/reinforcement-learning","display_name":"Reinforcement learning","score":0.6940000057220459},{"id":"https://openalex.org/keywords/server","display_name":"Server","score":0.5521000027656555},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.4982999861240387},{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.47290000319480896},{"id":"https://openalex.org/keywords/resource-allocation","display_name":"Resource allocation","score":0.45739999413490295},{"id":"https://openalex.org/keywords/mobile-edge-computing","display_name":"Mobile edge computing","score":0.4341999888420105},{"id":"https://openalex.org/keywords/enhanced-data-rates-for-gsm-evolution","display_name":"Enhanced Data Rates for GSM Evolution","score":0.40310001373291016},{"id":"https://openalex.org/keywords/service","display_name":"Service (business)","score":0.3873000144958496}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.847000002861023},{"id":"https://openalex.org/C2781041963","wikidata":"https://www.wikidata.org/wiki/Q18348618","display_name":"Computation offloading","level":4,"score":0.7116000056266785},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.6940000057220459},{"id":"https://openalex.org/C93996380","wikidata":"https://www.wikidata.org/wiki/Q44127","display_name":"Server","level":2,"score":0.5521000027656555},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.4982999861240387},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.48069998621940613},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.47290000319480896},{"id":"https://openalex.org/C29202148","wikidata":"https://www.wikidata.org/wiki/Q287260","display_name":"Resource allocation","level":2,"score":0.45739999413490295},{"id":"https://openalex.org/C2776061582","wikidata":"https://www.wikidata.org/wiki/Q25325231","display_name":"Mobile edge computing","level":3,"score":0.4341999888420105},{"id":"https://openalex.org/C162307627","wikidata":"https://www.wikidata.org/wiki/Q204833","display_name":"Enhanced Data Rates for GSM Evolution","level":2,"score":0.40310001373291016},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4027999937534332},{"id":"https://openalex.org/C2780378061","wikidata":"https://www.wikidata.org/wiki/Q25351891","display_name":"Service (business)","level":2,"score":0.3873000144958496},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.3846000134944916},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.36250001192092896},{"id":"https://openalex.org/C138236772","wikidata":"https://www.wikidata.org/wiki/Q25098575","display_name":"Edge device","level":3,"score":0.35740000009536743},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3467999994754791},{"id":"https://openalex.org/C186967261","wikidata":"https://www.wikidata.org/wiki/Q5082128","display_name":"Mobile device","level":2,"score":0.3407999873161316},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.3382999897003174},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.33059999346733093},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.31630000472068787},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.30480000376701355},{"id":"https://openalex.org/C2778456923","wikidata":"https://www.wikidata.org/wiki/Q5337692","display_name":"Edge computing","level":3,"score":0.30250000953674316},{"id":"https://openalex.org/C144543869","wikidata":"https://www.wikidata.org/wiki/Q2738570","display_name":"Mobile computing","level":2,"score":0.290800005197525},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.28619998693466187},{"id":"https://openalex.org/C183322885","wikidata":"https://www.wikidata.org/wiki/Q17007702","display_name":"Context model","level":3,"score":0.2840999960899353},{"id":"https://openalex.org/C2780609101","wikidata":"https://www.wikidata.org/wiki/Q17156588","display_name":"Resource management (computing)","level":2,"score":0.27799999713897705},{"id":"https://openalex.org/C2781368080","wikidata":"https://www.wikidata.org/wiki/Q501688","display_name":"Context awareness","level":3,"score":0.2732999920845032},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.2687999904155731},{"id":"https://openalex.org/C68649174","wikidata":"https://www.wikidata.org/wiki/Q1379116","display_name":"Base station","level":2,"score":0.25589999556541443},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2533999979496002},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.2531999945640564}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/ton.2026.3669011","is_oa":false,"landing_page_url":"https://doi.org/10.1109/ton.2026.3669011","pdf_url":null,"source":{"id":"https://openalex.org/S5407042750","display_name":"IEEE Transactions on Networking","issn_l":"2998-4157","issn":["2998-4157"],"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Networking","raw_type":"journal-article"},{"id":"pmh:oai:dr.ntu.edu.sg:10356/211860","is_oa":false,"landing_page_url":"https://hdl.handle.net/10356/211860","pdf_url":null,"source":{"id":"https://openalex.org/S4306402609","display_name":"DR-NTU (Nanyang Technological University)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I172675005","host_organization_name":"Nanyang Technological University","host_organization_lineage":["https://openalex.org/I172675005"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":null}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W2061938485","https://openalex.org/W2251935656","https://openalex.org/W2885965959","https://openalex.org/W2924982946","https://openalex.org/W3047040155","https://openalex.org/W3159959439","https://openalex.org/W3187297977","https://openalex.org/W3214250514","https://openalex.org/W4283218271","https://openalex.org/W4283219705","https://openalex.org/W4283219818","https://openalex.org/W4285197124","https://openalex.org/W4386071707","https://openalex.org/W4386322075","https://openalex.org/W4387126943","https://openalex.org/W4399361151","https://openalex.org/W4400447774","https://openalex.org/W4400728225","https://openalex.org/W4401508395","https://openalex.org/W4401508667","https://openalex.org/W4401878796","https://openalex.org/W4402742293","https://openalex.org/W4406894859","https://openalex.org/W4414008559","https://openalex.org/W7133224126"],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"can":[4,125,194],"perform":[5],"zero-shot":[6],"learning":[7,13,88],"on":[8,14],"unseen":[9],"tasks":[10],"and":[11,41,66,79,93,108,132,141,144,149,215],"few-shot":[12],"complex":[15],"reasoning":[16,177,184,216],"tasks.":[17,217],"However,":[18],"resource-limited":[19],"mobile":[20],"edge":[21,46,52],"networks":[22],"struggle":[23],"to":[24,90,136,146,203],"support":[25],"long-context":[26,97],"LLM":[27,30,49,98,210],"serving":[28,50],"for":[29,96],"agents":[31,211],"during":[32,152],"multi-round":[33],"interactions":[34],"with":[35,179],"users.":[36],"Unlike":[37],"stateless":[38],"computation":[39],"offloading":[40,44,81],"static":[42],"service":[43,142],"in":[45,119,127,212],"computing,":[47],"optimizing":[48],"at":[51,199],"servers":[53],"is":[54],"challenging":[55],"because":[56],"LLMs":[57],"continuously":[58],"learn":[59,126],"from":[60],"context":[61,117,147],"which":[62,168],"raises":[63],"accuracy,":[64],"latency,":[65],"resource":[67,157],"consumption":[68],"dynamics.":[69],"In":[70,100],"this":[71,101],"paper,":[72],"we":[73,103,160],"propose":[74,161],"a":[75,162],"joint":[76],"model":[77],"caching":[78],"inference":[80],"framework":[82],"that":[83,190],"utilizes":[84],"test-time":[85],"deep":[86],"reinforcement":[87],"(T2DRL)":[89],"optimize":[91],"deployment":[92],"execution":[94],"strategies":[95],"serving.":[99],"framework,":[102],"analyze":[104],"the":[105,114,122,129,133,171,180,191,207],"performance":[106,208],"convergence":[107],"design":[109],"an":[110,175],"optimization":[111],"problem":[112],"considering":[113],"utilization":[115],"of":[116,174,183,209],"windows":[118],"LLMs.":[120],"Furthermore,":[121],"T2DRL":[123,192],"algorithm":[124,193],"both":[128],"training":[130],"phase":[131,135],"testing":[134],"proactively":[137],"manage":[138],"cached":[139],"models":[140],"requests":[143],"adapt":[145],"changes":[148],"usage":[150],"patterns":[151],"execution.":[153],"To":[154],"further":[155],"enhance":[156],"allocation":[158],"efficiency,":[159],"double":[163],"Dutch":[164],"auction":[165],"(DDA)":[166],"mechanism,":[167],"dynamically":[169],"aligns":[170],"marginal":[172,181],"value":[173],"additional":[176],"path":[178],"cost":[182],"services.":[185],"Finally,":[186],"experimental":[187],"results":[188],"demonstrate":[189],"reduce":[195],"system":[196],"costs":[197],"by":[198],"least":[200],"30%":[201],"compared":[202],"baselines":[204],"while":[205],"guaranteeing":[206],"real-world":[213],"perception":[214]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-04-02T15:55:50.835912","created_date":"2026-02-28T00:00:00"}
