{"id":"https://openalex.org/W7155060099","doi":"https://doi.org/10.48550/arxiv.2604.16400","title":"CoLLM: Continuous Adaptation for SLO-Aware LLM Serving on Shared GPU Clusters","display_name":"CoLLM: Continuous Adaptation for SLO-Aware LLM Serving on Shared GPU Clusters","publication_year":2026,"publication_date":"2026-03-31","ids":{"openalex":"https://openalex.org/W7155060099","doi":"https://doi.org/10.48550/arxiv.2604.16400"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2604.16400","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.16400","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2604.16400","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5000210993","display_name":"Shaoyuan Huang","orcid":"https://orcid.org/0000-0002-4091-6457"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Huang, Shaoyuan","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134108134","display_name":"Xiaokai Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Yunfeng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134175926","display_name":"Na Yan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yan, Na","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134128151","display_name":"Xiaofei Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Tiancheng","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134136901","display_name":"Wenyu Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xiaokai","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5134147779","display_name":"Yansha Deng","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xiaofei","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wang, Wenyu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Wenyu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Deng, Yansha","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Deng, Yansha","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5000210993"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.08860000222921371,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.08860000222921371,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10273","display_name":"IoT and Edge/Fog Computing","score":0.08540000021457672,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.08190000057220459,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.8586000204086304},{"id":"https://openalex.org/keywords/replica","display_name":"Replica","score":0.4742000102996826},{"id":"https://openalex.org/keywords/reuse","display_name":"Reuse","score":0.46459999680519104},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.4577000141143799},{"id":"https://openalex.org/keywords/enhanced-data-rates-for-gsm-evolution","display_name":"Enhanced Data Rates for GSM Evolution","score":0.42899999022483826},{"id":"https://openalex.org/keywords/inference-engine","display_name":"Inference engine","score":0.4117000102996826},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.3621000051498413}],"concepts":[{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.8586000204086304},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7727000117301941},{"id":"https://openalex.org/C2775937380","wikidata":"https://www.wikidata.org/wiki/Q1232589","display_name":"Replica","level":2,"score":0.4742000102996826},{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.46459999680519104},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.4577000141143799},{"id":"https://openalex.org/C162307627","wikidata":"https://www.wikidata.org/wiki/Q204833","display_name":"Enhanced Data Rates for GSM Evolution","level":2,"score":0.42899999022483826},{"id":"https://openalex.org/C46743427","wikidata":"https://www.wikidata.org/wiki/Q1341685","display_name":"Inference engine","level":3,"score":0.4117000102996826},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3698999881744385},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.36340001225471497},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3621000051498413},{"id":"https://openalex.org/C2777472644","wikidata":"https://www.wikidata.org/wiki/Q16968992","display_name":"Approximate inference","level":3,"score":0.35989999771118164},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.33340001106262207},{"id":"https://openalex.org/C158600405","wikidata":"https://www.wikidata.org/wiki/Q5054566","display_name":"Causal inference","level":2,"score":0.3278000056743622},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.32339999079704285},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.31040000915527344},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.28839999437332153},{"id":"https://openalex.org/C177284502","wikidata":"https://www.wikidata.org/wiki/Q1005390","display_name":"Adapter (computing)","level":2,"score":0.2750000059604645},{"id":"https://openalex.org/C42058472","wikidata":"https://www.wikidata.org/wiki/Q810214","display_name":"Base (topology)","level":2,"score":0.2680000066757202},{"id":"https://openalex.org/C2164484","wikidata":"https://www.wikidata.org/wiki/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.26080000400543213},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.25110000371932983}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2604.16400","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.16400","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2604.16400","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2604.16400","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"As":[0],"Large":[1],"Language":[2],"Models":[3],"(LLMs)":[4],"are":[5,57],"increasingly":[6],"adopted":[7],"in":[8,40,69,75,188],"edge":[9,103,194],"intelligence":[10],"to":[11,34,152,181],"power":[12],"domain-specific":[13],"applications":[14],"and":[15,20,28,46,55,67,72,88,99,105,115,134,138,149,159,168],"personalized":[16],"services,":[17],"the":[18,23],"quality":[19,157],"efficiency":[21],"of":[22],"LLM":[24,177,191],"post-training":[25,192],"phase-including":[26],"fine-tuning":[27,43,54,148],"inference,":[29],"have":[30,49],"become":[31],"critical":[32],"due":[33],"constrained":[35],"resources.":[36],"Although":[37],"recent":[38],"advances":[39],"federated":[41],"parameter-efficient":[42],"(FL":[44],"PEFT)":[45],"low-latency":[47],"inference":[48,56,76,100,133,150,161],"improved":[50],"individual":[51],"task":[52],"performance,":[53],"still":[58],"handled":[59],"as":[60],"isolated":[61],"workloads,":[62],"which":[63],"overlooks":[64],"their":[65],"interdependence":[66],"results":[68],"redundant":[70],"deployments":[71],"delayed":[73],"improvement":[74],"quality.":[77],"To":[78],"address":[79],"these":[80],"limitations,":[81],"we":[82],"introduce":[83],"a":[84,93,140],"new":[85],"co-execution":[86],"framework":[87],"instantiate":[89],"it":[90],"with":[91],"CoLLM,":[92],"system":[94],"that":[95,125,145,172],"unifies":[96],"FL":[97],"PEFT":[98],"on":[101],"shared":[102],"replicas":[104],"model":[106,122,128,156],"parameters.":[107],"CoLLM":[108,173],"addresses":[109],"key":[110],"challenges":[111],"at":[112],"both":[113],"replica":[114],"cluster":[116],"levels":[117],"through:":[118],"(1)":[119],"an":[120],"intra-replica":[121],"sharing":[123],"mechanism":[124],"enables":[126],"real-time":[127],"parameter":[129],"reuse":[130],"via":[131],"unmerged":[132],"shadow":[135],"adapter":[136],"strategies;":[137],"(2)":[139],"two-timescale":[141],"inter-replica":[142],"coordination":[143],"algorithm":[144],"adaptively":[146],"balances":[147],"workloads":[151],"jointly":[153],"optimize":[154],"long-term":[155],"gains":[158],"short-term":[160],"efficiency.":[162],"Extensive":[163],"evaluation":[164],"across":[165],"diverse":[166],"LLMs":[167],"real-world":[169],"traces":[170],"show":[171],"consistently":[174],"outperforms":[175],"state-of-the-art":[176],"systems,":[178],"achieving":[179],"up":[180],"3x":[182],"higher":[183],"goodput,":[184],"demonstrating":[185],"its":[186],"effectiveness":[187],"enabling":[189],"seamless":[190],"for":[193],"intelligence.":[195]},"counts_by_year":[],"updated_date":"2026-05-20T06:11:20.791850","created_date":"2026-04-22T00:00:00"}
