{"id":"https://openalex.org/W7116437496","doi":"https://doi.org/10.1145/3754598.3754634","title":"CoreTuner: Predicting and Scheduling Framework for Optimizing the Joint Allocation of CPU and GPU in Training Cluster","display_name":"CoreTuner: Predicting and Scheduling Framework for Optimizing the Joint Allocation of CPU and GPU in Training Cluster","publication_year":2025,"publication_date":"2025-09-08","ids":{"openalex":"https://openalex.org/W7116437496","doi":"https://doi.org/10.1145/3754598.3754634"},"language":null,"primary_location":{"id":"doi:10.1145/3754598.3754634","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754634","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3754598.3754634","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5120889847","display_name":"Hao Dong","orcid":null},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Hao Dong","raw_affiliation_strings":["Tongji University, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0004-4092-4096","affiliations":[{"raw_affiliation_string":"Tongji University, Shanghai, China","institution_ids":["https://openalex.org/I116953780"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100528405","display_name":"Yuehao Xu","orcid":"https://orcid.org/0000-0003-4172-8084"},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuehao Xu","raw_affiliation_strings":["Tongji University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0003-4172-8084","affiliations":[{"raw_affiliation_string":"Tongji University, Shanghai, China","institution_ids":["https://openalex.org/I116953780"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120881111","display_name":"Xiaohui Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaohui Wang","raw_affiliation_strings":["UCloud Technology Co., Ltd, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0005-1281-7987","affiliations":[{"raw_affiliation_string":"UCloud Technology Co., Ltd, Shanghai, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":null,"display_name":"Xinhua Ji","orcid":"https://orcid.org/0009-0004-9885-4000"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xinhua Ji","raw_affiliation_strings":["UCloud Technology Co., Ltd, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0004-9885-4000","affiliations":[{"raw_affiliation_string":"UCloud Technology Co., Ltd, Shanghai, China","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5004062085","display_name":"Zhijun Ding","orcid":null},"institutions":[{"id":"https://openalex.org/I116953780","display_name":"Tongji University","ror":"https://ror.org/03rc6as71","country_code":"CN","type":"education","lineage":["https://openalex.org/I116953780"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhijun Ding","raw_affiliation_strings":["Tongji University, Shanghai, China"],"raw_orcid":"https://orcid.org/0000-0003-2178-6201","affiliations":[{"raw_affiliation_string":"Tongji University, Shanghai, China","institution_ids":["https://openalex.org/I116953780"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5120889847"],"corresponding_institution_ids":["https://openalex.org/I116953780"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.72703414,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"43","last_page":"52"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.3587999939918518,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.3587999939918518,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.1639000028371811,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.1543000042438507,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.6759999990463257},{"id":"https://openalex.org/keywords/central-processing-unit","display_name":"Central processing unit","score":0.555400013923645},{"id":"https://openalex.org/keywords/multi-core-processor","display_name":"Multi-core processor","score":0.4562000036239624},{"id":"https://openalex.org/keywords/execution-time","display_name":"Execution time","score":0.4359999895095825},{"id":"https://openalex.org/keywords/cpu-shielding","display_name":"CPU shielding","score":0.4120999872684479},{"id":"https://openalex.org/keywords/job-shop-scheduling","display_name":"Job shop scheduling","score":0.39899998903274536},{"id":"https://openalex.org/keywords/turnaround-time","display_name":"Turnaround time","score":0.39309999346733093},{"id":"https://openalex.org/keywords/processor-scheduling","display_name":"Processor scheduling","score":0.3928999900817871},{"id":"https://openalex.org/keywords/resource-allocation","display_name":"Resource allocation","score":0.38429999351501465}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8396000266075134},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.6759999990463257},{"id":"https://openalex.org/C49154492","wikidata":"https://www.wikidata.org/wiki/Q5300","display_name":"Central processing unit","level":2,"score":0.555400013923645},{"id":"https://openalex.org/C78766204","wikidata":"https://www.wikidata.org/wiki/Q555032","display_name":"Multi-core processor","level":2,"score":0.4562000036239624},{"id":"https://openalex.org/C2989134064","wikidata":"https://www.wikidata.org/wiki/Q288510","display_name":"Execution time","level":2,"score":0.4359999895095825},{"id":"https://openalex.org/C180613757","wikidata":"https://www.wikidata.org/wiki/Q5013757","display_name":"CPU shielding","level":3,"score":0.4120999872684479},{"id":"https://openalex.org/C55416958","wikidata":"https://www.wikidata.org/wiki/Q6206757","display_name":"Job shop scheduling","level":3,"score":0.39899998903274536},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.397599995136261},{"id":"https://openalex.org/C176553487","wikidata":"https://www.wikidata.org/wiki/Q7855819","display_name":"Turnaround time","level":2,"score":0.39309999346733093},{"id":"https://openalex.org/C2984822820","wikidata":"https://www.wikidata.org/wiki/Q1123036","display_name":"Processor scheduling","level":3,"score":0.3928999900817871},{"id":"https://openalex.org/C29202148","wikidata":"https://www.wikidata.org/wiki/Q287260","display_name":"Resource allocation","level":2,"score":0.38429999351501465},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.37540000677108765},{"id":"https://openalex.org/C31689143","wikidata":"https://www.wikidata.org/wiki/Q733809","display_name":"Fair-share scheduling","level":3,"score":0.37130001187324524},{"id":"https://openalex.org/C175154964","wikidata":"https://www.wikidata.org/wiki/Q380077","display_name":"Task analysis","level":3,"score":0.361299991607666},{"id":"https://openalex.org/C2780898871","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Performance metric","level":2,"score":0.3571000099182129},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.3481000065803528},{"id":"https://openalex.org/C18555067","wikidata":"https://www.wikidata.org/wiki/Q8375051","display_name":"Joint (building)","level":2,"score":0.3303999900817871},{"id":"https://openalex.org/C107568181","wikidata":"https://www.wikidata.org/wiki/Q5319000","display_name":"Dynamic priority scheduling","level":3,"score":0.328000009059906},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.3257000148296356},{"id":"https://openalex.org/C2780609101","wikidata":"https://www.wikidata.org/wiki/Q17156588","display_name":"Resource management (computing)","level":2,"score":0.29739999771118164},{"id":"https://openalex.org/C29140674","wikidata":"https://www.wikidata.org/wiki/Q206637","display_name":"Computer cluster","level":2,"score":0.28349998593330383},{"id":"https://openalex.org/C111873713","wikidata":"https://www.wikidata.org/wiki/Q1641413","display_name":"Job scheduler","level":3,"score":0.2833999991416931},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.2800000011920929},{"id":"https://openalex.org/C200130814","wikidata":"https://www.wikidata.org/wiki/Q362858","display_name":"Worst-case execution time","level":3,"score":0.2791999876499176},{"id":"https://openalex.org/C127456818","wikidata":"https://www.wikidata.org/wiki/Q238879","display_name":"Rate-monotonic scheduling","level":4,"score":0.2750000059604645},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.2606000006198883}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3754598.3754634","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754634","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3754598.3754634","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3754598.3754634","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 54th International Conference on Parallel Processing","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":25,"referenced_works":["https://openalex.org/W639708223","https://openalex.org/W2031489346","https://openalex.org/W2064675550","https://openalex.org/W2073291219","https://openalex.org/W2083842231","https://openalex.org/W2105947650","https://openalex.org/W2183341477","https://openalex.org/W2194775991","https://openalex.org/W2219888463","https://openalex.org/W2257408573","https://openalex.org/W2605350416","https://openalex.org/W2618530766","https://openalex.org/W2798515322","https://openalex.org/W2903557836","https://openalex.org/W2910100551","https://openalex.org/W3022548332","https://openalex.org/W3086105743","https://openalex.org/W3135013702","https://openalex.org/W3136172274","https://openalex.org/W3197816522","https://openalex.org/W3204802562","https://openalex.org/W4362647413","https://openalex.org/W4385731844","https://openalex.org/W4401408744","https://openalex.org/W4401408761"],"related_works":[],"abstract_inverted_index":{"Resource":[0],"wastage":[1],"is":[2,21],"common":[3],"in":[4,46,65],"GPU":[5,40,233],"clusters":[6],"running":[7],"deep":[8],"learning":[9],"training":[10],"(DLT)":[11],"tasks.":[12],"Studies":[13],"show":[14,217],"that":[15,134,163,198,218],"improper":[16],"allocation":[17,44,52,85,114,194],"of":[18,23,73,77,115,210],"CPU":[19,42,116,123,129,138],"resources":[20],"one":[22,174],"the":[24,35,59,67,74,82,112,153,196,206,213,223,226,246],"main":[25],"factors":[26],"contributing":[27],"to":[28,54,195,202,238,245],"this":[29,150],"waste.":[30],"Existing":[31],"methods":[32,62,87],"can":[33,144,220],"predict":[34],"task":[36,91,227],"durations":[37],"under":[38],"different":[39],"and":[41,48,70,79,93,108,117,126,152,166,179,232,241],"joint":[43,113],"schemes":[45],"advance":[47],"then":[49],"guide":[50],"resource":[51,56,84,193],"scheduling":[53,86,109,188],"reduce":[55],"wastage.":[57],"However,":[58],"existing":[60,83,247],"prediction":[61,68,161,171,177],"are":[63,199],"challenging":[64],"meeting":[66],"accuracy":[69],"efficiency":[71,178],"requirements":[72],"complex":[75],"combination":[76],"GPUs":[78],"CPUs.":[80],"Meanwhile,":[81],"only":[88,173],"consider":[89],"individual":[90],"optimization":[92],"cannot":[94],"achieve":[95],"global":[96],"optimal":[97,187],"allocation.":[98],"To":[99],"address":[100],"these":[101],"weaknesses,":[102],"we":[103,120,156,182],"propose":[104,157],"CoreTuner,":[105],"a":[106,122,141,158,184],"predicting":[107],"framework":[110],"for":[111],"GPU.":[118],"Firstly,":[119],"define":[121],"influence":[124],"model":[125,151],"an":[127],"Optimal":[128],"Core":[130],"Count":[131],"(OCCC)":[132],"metric":[133],"quantifies":[135],"how":[136],"increasing":[137],"cores":[139],"beyond":[140],"certain":[142],"threshold":[143],"degrade":[145],"performance.":[146],"Secondly,":[147],"based":[148],"on":[149],"OCCC":[154],"metric,":[155],"multi-schemes":[159,170],"duration":[160],"algorithm":[162,191],"combines":[164],"sampling":[165],"extrapolation,":[167],"achieving":[168],"accurate":[169],"with":[172],"sampling,":[175],"balancing":[176],"accuracy.":[180],"Finally,":[181],"designed":[183],"cluster-level":[185],"dynamic":[186],"algorithm.":[189],"This":[190],"prioritizes":[192],"tasks":[197,211],"most":[200],"sensitive":[201],"resources,":[203],"thereby":[204],"reducing":[205],"overall":[207],"execution":[208],"time":[209],"across":[212],"cluster.":[214],"Experimental":[215],"results":[216],"CoreTuner":[219],"significantly":[221],"improve":[222],"cluster\u2019s":[224],"performance:":[225],"makespan,":[228],"average":[229],"turnaround":[230],"time,":[231],"utilization":[234],"increase":[235],"by":[236],"up":[237],"51.3%,":[239],"75.3%,":[240],"56.54%,":[242],"respectively,":[243],"compared":[244],"algorithms.":[248]},"counts_by_year":[],"updated_date":"2025-12-21T02:06:08.432651","created_date":"2025-12-21T00:00:00"}
