{"id":"https://openalex.org/W7124166242","doi":"https://doi.org/10.1109/icpads67057.2025.11323066","title":"Efficiency Optimization Under Spatiotemporal Sharing Fairness for Deep Learning Workloads in Heterogeneous GPU Clusters","display_name":"Efficiency Optimization Under Spatiotemporal Sharing Fairness for Deep Learning Workloads in Heterogeneous GPU Clusters","publication_year":2025,"publication_date":"2025-12-14","ids":{"openalex":"https://openalex.org/W7124166242","doi":"https://doi.org/10.1109/icpads67057.2025.11323066"},"language":null,"primary_location":{"id":"doi:10.1109/icpads67057.2025.11323066","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icpads67057.2025.11323066","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 31th International Conference on Parallel and Distributed Systems (ICPADS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5123011720","display_name":"Chunhong Du","orcid":null},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Chunhong Du","raw_affiliation_strings":["College of Intelligence and Computing, Tianjin University,China"],"affiliations":[{"raw_affiliation_string":"College of Intelligence and Computing, Tianjin University,China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123016019","display_name":"Mengyu Shi","orcid":null},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Mengyu Shi","raw_affiliation_strings":["College of Intelligence and Computing, Tianjin University,China"],"affiliations":[{"raw_affiliation_string":"College of Intelligence and Computing, Tianjin University,China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5010031092","display_name":"Shanjiang Tang","orcid":"https://orcid.org/0000-0001-9533-9899"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shanjiang Tang","raw_affiliation_strings":["College of Intelligence and Computing, Tianjin University,China"],"affiliations":[{"raw_affiliation_string":"College of Intelligence and Computing, Tianjin University,China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123021403","display_name":"Jianhang Tang","orcid":null},"institutions":[{"id":"https://openalex.org/I178232147","display_name":"Guizhou University","ror":"https://ror.org/02wmsc916","country_code":"CN","type":"education","lineage":["https://openalex.org/I178232147"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianhang Tang","raw_affiliation_strings":["Guizhou University,State Key Laboratory of Public Big Data,China"],"affiliations":[{"raw_affiliation_string":"Guizhou University,State Key Laboratory of Public Big Data,China","institution_ids":["https://openalex.org/I178232147"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123017527","display_name":"Ce Yu","orcid":null},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Ce Yu","raw_affiliation_strings":["College of Intelligence and Computing, Tianjin University,China"],"affiliations":[{"raw_affiliation_string":"College of Intelligence and Computing, Tianjin University,China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048451391","display_name":"Jun Xiao","orcid":"https://orcid.org/0000-0003-0065-9975"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jian Xiao","raw_affiliation_strings":["College of Intelligence and Computing, Tianjin University,China"],"affiliations":[{"raw_affiliation_string":"College of Intelligence and Computing, Tianjin University,China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100418660","display_name":"Chao Sun","orcid":"https://orcid.org/0000-0003-4237-0635"},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Chao Sun","raw_affiliation_strings":["College of Intelligence and Computing, Tianjin University,China"],"affiliations":[{"raw_affiliation_string":"College of Intelligence and Computing, Tianjin University,China","institution_ids":["https://openalex.org/I162868743"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5123020562","display_name":"Bin Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I162868743","display_name":"Tianjin University","ror":"https://ror.org/012tb2g32","country_code":"CN","type":"education","lineage":["https://openalex.org/I162868743"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Bin Yang","raw_affiliation_strings":["College of Intelligence and Computing, Tianjin University,China"],"affiliations":[{"raw_affiliation_string":"College of Intelligence and Computing, Tianjin University,China","institution_ids":["https://openalex.org/I162868743"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5123011720"],"corresponding_institution_ids":["https://openalex.org/I162868743"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.80097726,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"10"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.1729000061750412,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.1729000061750412,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.1670999974012375,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.13840000331401825,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scheduling","display_name":"Scheduling (production processes)","score":0.6808000206947327},{"id":"https://openalex.org/keywords/job-shop-scheduling","display_name":"Job shop scheduling","score":0.5626999735832214},{"id":"https://openalex.org/keywords/fairness-measure","display_name":"Fairness measure","score":0.4941999912261963},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.4339999854564667},{"id":"https://openalex.org/keywords/resource-allocation","display_name":"Resource allocation","score":0.39910000562667847},{"id":"https://openalex.org/keywords/constraint","display_name":"Constraint (computer-aided design)","score":0.3977000117301941},{"id":"https://openalex.org/keywords/throughput","display_name":"Throughput","score":0.3970000147819519},{"id":"https://openalex.org/keywords/time-complexity","display_name":"Time complexity","score":0.39640000462532043},{"id":"https://openalex.org/keywords/shared-resource","display_name":"Shared resource","score":0.36559998989105225}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8281999826431274},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.6808000206947327},{"id":"https://openalex.org/C55416958","wikidata":"https://www.wikidata.org/wiki/Q6206757","display_name":"Job shop scheduling","level":3,"score":0.5626999735832214},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5600000023841858},{"id":"https://openalex.org/C11867375","wikidata":"https://www.wikidata.org/wiki/Q5430671","display_name":"Fairness measure","level":4,"score":0.4941999912261963},{"id":"https://openalex.org/C126255220","wikidata":"https://www.wikidata.org/wiki/Q141495","display_name":"Mathematical optimization","level":1,"score":0.44429999589920044},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.4339999854564667},{"id":"https://openalex.org/C29202148","wikidata":"https://www.wikidata.org/wiki/Q287260","display_name":"Resource allocation","level":2,"score":0.39910000562667847},{"id":"https://openalex.org/C2776036281","wikidata":"https://www.wikidata.org/wiki/Q48769818","display_name":"Constraint (computer-aided design)","level":2,"score":0.3977000117301941},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.3970000147819519},{"id":"https://openalex.org/C311688","wikidata":"https://www.wikidata.org/wiki/Q2393193","display_name":"Time complexity","level":2,"score":0.39640000462532043},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.36970001459121704},{"id":"https://openalex.org/C51332947","wikidata":"https://www.wikidata.org/wiki/Q1172305","display_name":"Shared resource","level":2,"score":0.36559998989105225},{"id":"https://openalex.org/C33676613","wikidata":"https://www.wikidata.org/wiki/Q13415176","display_name":"Dimension (graph theory)","level":2,"score":0.36329999566078186},{"id":"https://openalex.org/C2780609101","wikidata":"https://www.wikidata.org/wiki/Q17156588","display_name":"Resource management (computing)","level":2,"score":0.3456000089645386},{"id":"https://openalex.org/C191015642","wikidata":"https://www.wikidata.org/wiki/Q1132459","display_name":"Fragmentation (computing)","level":2,"score":0.3418999910354614},{"id":"https://openalex.org/C158207573","wikidata":"https://www.wikidata.org/wiki/Q5747224","display_name":"Heterogeneous network","level":4,"score":0.32409998774528503},{"id":"https://openalex.org/C2989134064","wikidata":"https://www.wikidata.org/wiki/Q288510","display_name":"Execution time","level":2,"score":0.3206999897956848},{"id":"https://openalex.org/C177972170","wikidata":"https://www.wikidata.org/wiki/Q17097315","display_name":"Max-min fairness","level":3,"score":0.31349998712539673},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.3068000078201294},{"id":"https://openalex.org/C172430144","wikidata":"https://www.wikidata.org/wiki/Q17111997","display_name":"Symmetric multiprocessor system","level":2,"score":0.30059999227523804},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3003999888896942},{"id":"https://openalex.org/C2781335571","wikidata":"https://www.wikidata.org/wiki/Q2633544","display_name":"GPU cluster","level":3,"score":0.29989999532699585},{"id":"https://openalex.org/C164866538","wikidata":"https://www.wikidata.org/wiki/Q367351","display_name":"Cluster (spacecraft)","level":2,"score":0.2994999885559082},{"id":"https://openalex.org/C56086750","wikidata":"https://www.wikidata.org/wiki/Q6042592","display_name":"Integer programming","level":2,"score":0.28929999470710754},{"id":"https://openalex.org/C148764684","wikidata":"https://www.wikidata.org/wiki/Q621751","display_name":"Approximation algorithm","level":2,"score":0.28850001096725464},{"id":"https://openalex.org/C2984822820","wikidata":"https://www.wikidata.org/wiki/Q1123036","display_name":"Processor scheduling","level":3,"score":0.28279998898506165},{"id":"https://openalex.org/C3018263672","wikidata":"https://www.wikidata.org/wiki/Q1296251","display_name":"Efficient algorithm","level":2,"score":0.2770000100135803},{"id":"https://openalex.org/C179799912","wikidata":"https://www.wikidata.org/wiki/Q205084","display_name":"Computational complexity theory","level":2,"score":0.25839999318122864}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icpads67057.2025.11323066","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icpads67057.2025.11323066","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 IEEE 31th International Conference on Parallel and Distributed Systems (ICPADS)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.44934943318367004,"display_name":"Decent work and economic growth","id":"https://metadata.un.org/sdg/8"}],"awards":[{"id":"https://openalex.org/G6145281776","display_name":null,"funder_award_id":"2023YFF1204101","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"},{"id":"https://openalex.org/G7436653633","display_name":null,"funder_award_id":"2025AB027","funder_id":"https://openalex.org/F4320326153","funder_display_name":"Xinjiang Production and Construction Corps"}],"funders":[{"id":"https://openalex.org/F4320326153","display_name":"Xinjiang Production and Construction Corps","ror":"https://ror.org/03hcmxw73"},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":14,"referenced_works":["https://openalex.org/W2144578941","https://openalex.org/W2798515322","https://openalex.org/W2962793481","https://openalex.org/W3016889634","https://openalex.org/W3019166713","https://openalex.org/W3022298203","https://openalex.org/W3022548332","https://openalex.org/W3039718313","https://openalex.org/W4205916020","https://openalex.org/W4214940715","https://openalex.org/W4245843336","https://openalex.org/W4387321109","https://openalex.org/W4404385885","https://openalex.org/W4404787801"],"related_works":[],"abstract_inverted_index":{"Modern":[0],"GPU":[1,15,46,64,84],"clusters":[2,34,149],"increasingly":[3],"comprise":[4],"diverse":[5],"heterogeneous":[6,148],"GPUs,":[7],"driven":[8],"by":[9,162,169,177],"the":[10,50,140],"continuous":[11],"release":[12],"of":[13,53],"new":[14],"models.":[16],"Achieving":[17],"a":[18,76,90,99],"balance":[19],"between":[20],"fairness":[21,44,79,93,114],"and":[22,66,95,146,150,174],"efficiency":[23],"when":[24],"scheduling":[25,117],"multi-tenant":[26],"Deep":[27],"Learning":[28],"(DL)":[29],"training":[30,40],"jobs":[31,97],"on":[32,143],"such":[33],"is":[35,56,119,128],"inherently":[36],"challenging.":[37],"Existing":[38],"DL":[39],"schedulers":[41],"largely":[42],"emphasize":[43],"through":[45],"temporal":[47],"sharing,":[48],"while":[49,111],"spatial":[51],"dimension":[52],"resource":[54],"allocation":[55,101],"often":[57],"underexplored.":[58],"This":[59],"oversight":[60],"can":[61],"lead":[62],"to":[63,130,185],"fragmentation":[65],"suboptimal":[67],"system":[68],"performance.":[69],"In":[70],"this":[71],"paper,":[72],"we":[73],"propose":[74],"STS-Fairness,":[75],"spatiotemporal":[77,92,113],"sharing":[78],"scheduler.":[80],"STS-Fairness":[81,106,141,158],"partitions":[82],"each":[83],"into":[85],"multiple":[86],"isolated":[87],"slots":[88],"under":[89],"novel":[91],"constraint":[94],"allocates":[96],"using":[98],"round-based":[100],"mechanism.":[102],"We":[103,138],"guarantee":[104],"that":[105,127,157],"achieves":[107],"overall":[108],"performance":[109],"optimality":[110,131],"satisfying":[112],"constraints.":[115],"The":[116],"problem":[118],"formulated":[120],"as":[121],"an":[122],"integer":[123],"nonlinear":[124],"program":[125],"(INLP)":[126],"solved":[129],"in":[132],"polynomial":[133],"time":[134],"via":[135],"dynamic":[136],"programming.":[137],"deployed":[139],"framework":[142],"both":[144],"physical":[145],"simulated":[147],"conducted":[151],"large-scale":[152],"experiments.":[153],"These":[154],"results":[155],"demonstrate":[156],"reduces":[159],"average":[160],"JCT":[161],"<tex":[163,170,178],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[164,171,179],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$1.2":[165],"\\times$</tex>,":[166,173],"shortens":[167],"makespan":[168],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$1.24":[172],"increases":[175],"throughput":[176],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">$\\mathbf{1.":[180],"2":[181],"5}":[182],"\\times$</tex>":[183],"compared":[184],"state-of-the-art":[186],"(SoTA)":[187],"schedulers.":[188]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2026-01-15T00:00:00"}
