{"id":"https://openalex.org/W4416004632","doi":"https://doi.org/10.1145/3731599.3767354","title":"Engine-Agnostic Model Hot-Swapping for Cost-Effective LLM Inference","display_name":"Engine-Agnostic Model Hot-Swapping for Cost-Effective LLM Inference","publication_year":2025,"publication_date":"2025-11-07","ids":{"openalex":"https://openalex.org/W4416004632","doi":"https://doi.org/10.1145/3731599.3767354"},"language":"en","primary_location":{"id":"doi:10.1145/3731599.3767354","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3731599.3767354","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the SC '25 Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3731599.3767354","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5063796147","display_name":"Radostin Stoyanov","orcid":"https://orcid.org/0000-0001-9688-2615"},"institutions":[{"id":"https://openalex.org/I40120149","display_name":"University of Oxford","ror":"https://ror.org/052gg0110","country_code":"GB","type":"education","lineage":["https://openalex.org/I40120149"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Radostin Stoyanov","raw_affiliation_strings":["University of Oxford, Oxford, United Kingdom"],"affiliations":[{"raw_affiliation_string":"University of Oxford, Oxford, United Kingdom","institution_ids":["https://openalex.org/I40120149"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5059329816","display_name":"Vikt\u00f3ria Spi\u0161akov\u00e1","orcid":"https://orcid.org/0000-0001-5204-1478"},"institutions":[{"id":"https://openalex.org/I21449261","display_name":"Masaryk University","ror":"https://ror.org/02j46qs45","country_code":"CZ","type":"education","lineage":["https://openalex.org/I21449261"]}],"countries":["CZ"],"is_corresponding":false,"raw_author_name":"Vikt\u00f3ria Spi\u0161akov\u00e1","raw_affiliation_strings":["Masaryk University, Brno, Czech Republic"],"affiliations":[{"raw_affiliation_string":"Masaryk University, Brno, Czech Republic","institution_ids":["https://openalex.org/I21449261"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5106808379","display_name":"Adrian Reber","orcid":"https://orcid.org/0009-0007-4959-6561"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Adrian Reber","raw_affiliation_strings":["Red Hat, Stuttgart, Germany"],"affiliations":[{"raw_affiliation_string":"Red Hat, Stuttgart, Germany","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5080149070","display_name":"Wesley Armour","orcid":"https://orcid.org/0000-0003-1756-3064"},"institutions":[{"id":"https://openalex.org/I40120149","display_name":"University of Oxford","ror":"https://ror.org/052gg0110","country_code":"GB","type":"education","lineage":["https://openalex.org/I40120149"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Wesley Armour","raw_affiliation_strings":["University of Oxford, Oxford, United Kingdom"],"affiliations":[{"raw_affiliation_string":"University of Oxford, Oxford, United Kingdom","institution_ids":["https://openalex.org/I40120149"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5077355144","display_name":"Marcin Copik","orcid":"https://orcid.org/0000-0002-7606-5519"},"institutions":[{"id":"https://openalex.org/I35440088","display_name":"ETH Zurich","ror":"https://ror.org/05a28rw58","country_code":"CH","type":"education","lineage":["https://openalex.org/I2799323385","https://openalex.org/I35440088"]}],"countries":["CH"],"is_corresponding":false,"raw_author_name":"Marcin Copik","raw_affiliation_strings":["ETH Zurich, Zurich, Switzerland"],"affiliations":[{"raw_affiliation_string":"ETH Zurich, Zurich, Switzerland","institution_ids":["https://openalex.org/I35440088"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5041696536","display_name":"Rodrigo Bruno","orcid":"https://orcid.org/0000-0003-1578-5149"},"institutions":[{"id":"https://openalex.org/I121345201","display_name":"Instituto de Engenharia de Sistemas e Computadores Investiga\u00e7\u00e3o e Desenvolvimento","ror":"https://ror.org/04mqy3p58","country_code":"PT","type":"nonprofit","lineage":["https://openalex.org/I121345201","https://openalex.org/I4210125590"]},{"id":"https://openalex.org/I141596103","display_name":"University of Lisbon","ror":"https://ror.org/01c27hj86","country_code":"PT","type":"education","lineage":["https://openalex.org/I141596103"]}],"countries":["PT"],"is_corresponding":false,"raw_author_name":"Rodrigo Bruno","raw_affiliation_strings":["INESC-ID, Instituto Superior T\u00e9cnico, University of Lisbon, Lisbon, Portugal"],"affiliations":[{"raw_affiliation_string":"INESC-ID, Instituto Superior T\u00e9cnico, University of Lisbon, Lisbon, Portugal","institution_ids":["https://openalex.org/I121345201","https://openalex.org/I141596103"]}]}],"institutions":[],"countries_distinct_count":4,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5063796147"],"corresponding_institution_ids":["https://openalex.org/I40120149"],"apc_list":null,"apc_paid":null,"fwci":1.242,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.85197837,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"114","last_page":"125"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.186599999666214,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.186599999666214,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.18000000715255737,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.08609999716281891,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.8184999823570251},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.6442000269889832},{"id":"https://openalex.org/keywords/preemption","display_name":"Preemption","score":0.6370999813079834},{"id":"https://openalex.org/keywords/reservation","display_name":"Reservation","score":0.6118999719619751},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5449000000953674},{"id":"https://openalex.org/keywords/resource","display_name":"Resource (disambiguation)","score":0.489300012588501},{"id":"https://openalex.org/keywords/inference-engine","display_name":"Inference engine","score":0.4693000018596649}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8327000141143799},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.8184999823570251},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.6442000269889832},{"id":"https://openalex.org/C206952183","wikidata":"https://www.wikidata.org/wiki/Q1193100","display_name":"Preemption","level":2,"score":0.6370999813079834},{"id":"https://openalex.org/C2777632111","wikidata":"https://www.wikidata.org/wiki/Q1937518","display_name":"Reservation","level":2,"score":0.6118999719619751},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5449000000953674},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.489300012588501},{"id":"https://openalex.org/C46743427","wikidata":"https://www.wikidata.org/wiki/Q1341685","display_name":"Inference engine","level":3,"score":0.4693000018596649},{"id":"https://openalex.org/C198370458","wikidata":"https://www.wikidata.org/wiki/Q586459","display_name":"Type inference","level":3,"score":0.3935000002384186},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.38850000500679016},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3799999952316284},{"id":"https://openalex.org/C29202148","wikidata":"https://www.wikidata.org/wiki/Q287260","display_name":"Resource allocation","level":2,"score":0.3776000142097473},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3628000020980835},{"id":"https://openalex.org/C2777472644","wikidata":"https://www.wikidata.org/wiki/Q16968992","display_name":"Approximate inference","level":3,"score":0.35010001063346863},{"id":"https://openalex.org/C74172769","wikidata":"https://www.wikidata.org/wiki/Q1446839","display_name":"Routing (electronic design automation)","level":2,"score":0.34630000591278076},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.29820001125335693},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.2904999852180481},{"id":"https://openalex.org/C158600405","wikidata":"https://www.wikidata.org/wiki/Q5054566","display_name":"Causal inference","level":2,"score":0.2903999984264374},{"id":"https://openalex.org/C2777735758","wikidata":"https://www.wikidata.org/wiki/Q817765","display_name":"Path (computing)","level":2,"score":0.27549999952316284}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.1145/3731599.3767354","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3731599.3767354","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the SC '25 Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},{"id":"pmh:doi:10.3929/ethz-c-000789144","is_oa":true,"landing_page_url":"http://hdl.handle.net/20.500.11850/789144","pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Conference Paper"},{"id":"pmh:oai:ora.ox.ac.uk:uuid:f15d8679-07b3-467d-b27d-01e112c7b4b1","is_oa":false,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4306402636","display_name":"Oxford University Research Archive (ORA) (University of Oxford)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I40120149","host_organization_name":"University of Oxford","host_organization_lineage":["https://openalex.org/I40120149"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Symplectic Elements","raw_type":"http://purl.org/coar/resource_type/c_5794"}],"best_oa_location":{"id":"doi:10.1145/3731599.3767354","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3731599.3767354","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the SC '25 Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":16,"referenced_works":["https://openalex.org/W3012028616","https://openalex.org/W3130689885","https://openalex.org/W3204998121","https://openalex.org/W3205898353","https://openalex.org/W4214690606","https://openalex.org/W4321013654","https://openalex.org/W4387321091","https://openalex.org/W4388041447","https://openalex.org/W4388874804","https://openalex.org/W4394923312","https://openalex.org/W4396823873","https://openalex.org/W4396833177","https://openalex.org/W4401211704","https://openalex.org/W4403337153","https://openalex.org/W4404385775","https://openalex.org/W4409248734"],"related_works":[],"abstract_inverted_index":{"The":[0],"widespread":[1],"adoption":[2],"of":[3,22],"Large":[4],"Language":[5],"Models":[6],"(LLMs)":[7],"has":[8],"led":[9],"to":[10,40,46,130,134,137],"an":[11],"increased":[12],"demand":[13],"for":[14,24,64,77,122],"large-scale":[15],"inference":[16,43,112,124],"services,":[17],"presenting":[18],"a":[19,59,95],"unique":[20],"set":[21],"challenges":[23],"the":[25],"HPC":[26],"community.":[27],"These":[28],"services":[29],"are":[30],"characterized":[31],"by":[32,73,85,126],"moderate-scale":[33],"models":[34],"that":[35,117],"require":[36],"dedicating":[37],"expensive":[38],"GPUs":[39],"handle":[41],"bursty":[42],"requests,":[44,102],"leading":[45],"high":[47],"costs":[48],"and":[49,103,132],"resource":[50,83],"underutilization.":[51],"In":[52],"this":[53],"paper,":[54],"we":[55],"propose":[56],"SwapServeLLM":[57,81,118],"\u2014":[58],"novel":[60],"engine-agnostic":[61],"hot-swapping":[62,69],"method":[63],"cost-effective":[65,140],"inference.":[66,141],"This":[67],"model":[68,120],"approach":[70],"is":[71],"enabled":[72],"recent":[74],"driver":[75],"capabilities":[76],"transparent":[78],"GPU":[79,88],"checkpointing.":[80],"optimizes":[82,119],"utilization":[84],"dynamically":[86],"allocating":[87],"resources":[89],"with":[90,108],"two":[91],"key":[92],"mechanisms:":[93],"(1)":[94],"demand-aware":[96],"preemption":[97],"leveraging":[98],"information":[99],"about":[100],"concurrent":[101],"(2)":[104],"efficient":[105],"request":[106],"routing":[107],"memory":[109],"reservation":[110],"minimizing":[111],"latency.":[113],"Our":[114],"evaluation":[115],"demonstrates":[116],"loading":[121],"state-of-the-art":[123],"engines":[125],"31":[127],"\u00d7":[128],"compared":[129,136],"vLLM":[131],"up":[133],"29%":[135],"Ollama,":[138],"enabling":[139]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-11-07T00:00:00"}
