{"id":"https://openalex.org/W4414332261","doi":"https://doi.org/10.1145/3768622","title":"Real-time, Work-conserving GPU Scheduling for Concurrent DNN Inference","display_name":"Real-time, Work-conserving GPU Scheduling for Concurrent DNN Inference","publication_year":2025,"publication_date":"2025-09-18","ids":{"openalex":"https://openalex.org/W4414332261","doi":"https://doi.org/10.1145/3768622"},"language":"en","primary_location":{"id":"doi:10.1145/3768622","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3768622","pdf_url":null,"source":{"id":"https://openalex.org/S193109227","display_name":"ACM Transactions on Computer Systems","issn_l":"0734-2071","issn":["0734-2071","1557-7333"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Computer Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5030930550","display_name":"Mingcong Han","orcid":"https://orcid.org/0009-0008-1536-7485"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Mingcong Han","raw_affiliation_strings":["Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University","Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University, Shanghai, China"],"affiliations":[{"raw_affiliation_string":"Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University, Shanghai, China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101543146","display_name":"Rong Chen","orcid":"https://orcid.org/0000-0002-6115-8130"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Rong Chen","raw_affiliation_strings":["Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University","Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University, Shanghai China"],"affiliations":[{"raw_affiliation_string":"Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University, Shanghai China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085303726","display_name":"Weihang Shen","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Weihang Shen","raw_affiliation_strings":["Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University","Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University, Shanghai China"],"affiliations":[{"raw_affiliation_string":"Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University, Shanghai China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052531993","display_name":"Hanze Zhang","orcid":"https://orcid.org/0009-0009-0579-5707"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hanze Zhang","raw_affiliation_strings":["Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University","Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University, Shanghai China"],"affiliations":[{"raw_affiliation_string":"Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University, Shanghai China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101837111","display_name":"Jianguang Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jinrong Yang","raw_affiliation_strings":["Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University","Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University, Shanghai China"],"affiliations":[{"raw_affiliation_string":"Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University, Shanghai China","institution_ids":["https://openalex.org/I183067930"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100406215","display_name":"Haibo Chen","orcid":"https://orcid.org/0000-0002-9720-0361"},"institutions":[{"id":"https://openalex.org/I183067930","display_name":"Shanghai Jiao Tong University","ror":"https://ror.org/0220qvk04","country_code":"CN","type":"education","lineage":["https://openalex.org/I183067930"]},{"id":"https://openalex.org/I4210128818","display_name":"Institute of Software","ror":"https://ror.org/033dfsn42","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210128818"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Haibo Chen","raw_affiliation_strings":["China and Key Laboratory of System Software, Chinese Academy of Sciences","Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University","China and Key Laboratory of System Software, Chinese Academy of Sciences, Beijing China","Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University, Shanghai China"],"affiliations":[{"raw_affiliation_string":"China and Key Laboratory of System Software, Chinese Academy of Sciences","institution_ids":["https://openalex.org/I4210128818"]},{"raw_affiliation_string":"Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University","institution_ids":["https://openalex.org/I183067930"]},{"raw_affiliation_string":"China and Key Laboratory of System Software, Chinese Academy of Sciences, Beijing China","institution_ids":["https://openalex.org/I4210128818"]},{"raw_affiliation_string":"Institute of Parallel and Distributed Systems, Shanghai Jiao Tong University, Shanghai China","institution_ids":["https://openalex.org/I183067930"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5030930550"],"corresponding_institution_ids":["https://openalex.org/I183067930"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.24946423,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"44","issue":"1","first_page":"1","last_page":"42"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12702","display_name":"Brain Tumor Detection and Classification","score":0.9803000092506409,"subfield":{"id":"https://openalex.org/subfields/2808","display_name":"Neurology"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/inference","display_name":"Inference","score":0.7132999897003174},{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.63919997215271},{"id":"https://openalex.org/keywords/preemption","display_name":"Preemption","score":0.580299973487854},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.5051000118255615},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.44690001010894775},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.4442000091075897},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.40790000557899475},{"id":"https://openalex.org/keywords/virtual-machine","display_name":"Virtual machine","score":0.3952000141143799}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.9121000170707703},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.7132999897003174},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.63919997215271},{"id":"https://openalex.org/C206952183","wikidata":"https://www.wikidata.org/wiki/Q1193100","display_name":"Preemption","level":2,"score":0.580299973487854},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5771999955177307},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.5051000118255615},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.44690001010894775},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.4442000091075897},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.4205000102519989},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.40790000557899475},{"id":"https://openalex.org/C25344961","wikidata":"https://www.wikidata.org/wiki/Q192726","display_name":"Virtual machine","level":2,"score":0.3952000141143799},{"id":"https://openalex.org/C165435473","wikidata":"https://www.wikidata.org/wiki/Q1509884","display_name":"Padding","level":2,"score":0.33379998803138733},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.3292999863624573},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.3165000081062317},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.31619998812675476},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.3100000023841858},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.3070000112056732},{"id":"https://openalex.org/C2778915421","wikidata":"https://www.wikidata.org/wiki/Q3643177","display_name":"Performance improvement","level":2,"score":0.3005000054836273},{"id":"https://openalex.org/C108094655","wikidata":"https://www.wikidata.org/wiki/Q181593","display_name":"Sorting algorithm","level":3,"score":0.28519999980926514},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.274399995803833},{"id":"https://openalex.org/C2989134064","wikidata":"https://www.wikidata.org/wiki/Q288510","display_name":"Execution time","level":2,"score":0.2732999920845032},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.2687999904155731},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.259799987077713},{"id":"https://openalex.org/C156884757","wikidata":"https://www.wikidata.org/wiki/Q798554","display_name":"Backtracking","level":2,"score":0.2538999915122986},{"id":"https://openalex.org/C46743427","wikidata":"https://www.wikidata.org/wiki/Q1341685","display_name":"Inference engine","level":3,"score":0.25060001015663147}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3768622","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3768622","pdf_url":null,"source":{"id":"https://openalex.org/S193109227","display_name":"ACM Transactions on Computer Systems","issn_l":"0734-2071","issn":["0734-2071","1557-7333"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Computer Systems","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W1985229168","https://openalex.org/W2152517358","https://openalex.org/W2153190325","https://openalex.org/W2153375074","https://openalex.org/W2160815625","https://openalex.org/W2163264184","https://openalex.org/W2323909431","https://openalex.org/W2791964734","https://openalex.org/W2811206245","https://openalex.org/W2913208203","https://openalex.org/W2914959946","https://openalex.org/W3019425739","https://openalex.org/W3117590843","https://openalex.org/W3206333142","https://openalex.org/W3214035171","https://openalex.org/W4214690606","https://openalex.org/W4242746603","https://openalex.org/W4318541551","https://openalex.org/W4372263604","https://openalex.org/W4376571587","https://openalex.org/W4381785762","https://openalex.org/W4387321503","https://openalex.org/W4394944658","https://openalex.org/W4415003708"],"related_works":[],"abstract_inverted_index":{"Many":[0],"intelligent":[1],"applications,":[2],"such":[3],"as":[4],"autonomous":[5],"driving":[6],"and":[7,15,27,40,83,88,99,141,155,195,202],"virtual":[8],"reality,":[9],"require":[10],"running":[11],"both":[12,25,200],"latency-critical":[13],"(real-time)":[14],"best-effort":[16,56,89,143,173],"deep":[17],"neural":[18],"network":[19],"(DNN)":[20],"inference":[21,77,119,150,188],"tasks":[22,52,57,219,232],"to":[23,46,53,58,175,227,230,261],"achieve":[24],"real-time":[26,51,87,133,169,218,267],"work-conserving":[28,84],"on":[29,114,135,199],"the":[30,73,115,136,168,178,222,236],"GPU.":[31],"However,":[32],"commodity":[33],"GPUs":[34,204],"lack":[35],"efficient":[36],"preemptive":[37],"scheduling":[38,231],"support,":[39],"existing":[41],"state-of-the-art":[42],"approaches":[43],"either":[44],"have":[45,152],"monopolize":[47],"GPU":[48,104,137,179],"or":[49,66],"let":[50],"wait":[54],"for":[55,85,217,266],"complete,":[59],"resulting":[60],"in":[61,103,109,214,263],"low":[62],"utilization,":[63],"high":[64],"latency,":[65,157],"both.":[67],"This":[68],"article":[69],"presents":[70],"Reef":[71,91,106,124,158,207,244,256],",":[72],"first":[74],"GPU-accelerated":[75],"DNN":[76,118,149,187],"serving":[78,189,250],"system":[79],"that":[80,117,130,165,206,255],"achieves":[81],"low-latency":[82],"concurrent":[86,101],"tasks.":[90],"accomplishes":[92],"this":[93],"by":[94,138,225,259],"enabling":[95],"microsecond-scale":[96],"kernel":[97,134,162,170],"preemption":[98,128],"controlled":[100],"execution":[102],"scheduling.":[105],"is":[107],"novel":[108],"two":[110],"ways.":[111],"First,":[112],"based":[113],"observation":[116],"kernels":[120,144,151,174],"are":[121],"mostly":[122],"idempotent,":[123],"devises":[125],"a":[126,132,160,185,196,247],"reset-based":[127],"scheme":[129],"launches":[131],"proactively":[139],"killing":[140],"restoring":[142],"at":[145],"microsecond-scale.":[146],"Second,":[147],"since":[148],"varied":[153],"parallelism":[154],"predictable":[156],"proposes":[159],"dynamic":[161],"padding":[163],"mechanism":[164],"dynamically":[166],"pads":[167],"with":[171,180,192,245],"appropriate":[172],"fully":[176],"utilize":[177],"negligible":[181],"overhead.":[182],"Evaluation":[183],"using":[184],"new":[186],"benchmark":[190],"(DISB)":[191],"diverse":[193],"workloads":[194],"real-world":[197],"trace":[198],"NVIDIA":[201],"AMD":[203],"shows":[205,254],"only":[208],"incurs":[209],"less":[210],"than":[211],"5%":[212],"overhead":[213],"end-to-end":[215,264],"latency":[216,265],"but":[220],"increases":[221],"overall":[223],"throughput":[224],"up":[226],"1.53\u00d7,":[228],"compared":[229],"sequentially.":[233],"To":[234],"demonstrate":[235],"practical":[237],"benefits":[238],"of":[239],"our":[240],"approach,":[241],"we":[242],"compare":[243],"Triton,":[246],"widely-adopted":[248],"production-level":[249],"system.":[251],"Our":[252],"evaluation":[253],"outperforms":[257],"Triton":[258],"1.12\u00d7":[260],"5.20\u00d7":[262],"tasks,":[268],"while":[269],"maintaining":[270],"comparable":[271],"throughput.":[272]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
