{"id":"https://openalex.org/W4400907367","doi":"https://doi.org/10.1109/tpds.2024.3432620","title":"Sophisticated Orchestrating Concurrent DLRM Training on CPU/GPU Platform","display_name":"Sophisticated Orchestrating Concurrent DLRM Training on CPU/GPU Platform","publication_year":2024,"publication_date":"2024-07-23","ids":{"openalex":"https://openalex.org/W4400907367","doi":"https://doi.org/10.1109/tpds.2024.3432620"},"language":"en","primary_location":{"id":"doi:10.1109/tpds.2024.3432620","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2024.3432620","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100771415","display_name":"Rui Tian","orcid":"https://orcid.org/0009-0007-6582-0877"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Rui Tian","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China"],"raw_orcid":"https://orcid.org/0009-0007-6582-0877","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5048148542","display_name":"Jiazhi Jiang","orcid":"https://orcid.org/0000-0002-1417-3012"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiazhi Jiang","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0002-1417-3012","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024085713","display_name":"Jiangsu Du","orcid":"https://orcid.org/0000-0003-4707-9492"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiangsu Du","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0003-4707-9492","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041534890","display_name":"Dan Huang","orcid":"https://orcid.org/0000-0001-5582-1031"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Dan Huang","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China"],"raw_orcid":"https://orcid.org/0000-0001-5582-1031","affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China","institution_ids":["https://openalex.org/I157773358"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101633465","display_name":"Yutong Lu","orcid":"https://orcid.org/0000-0001-5315-3375"},"institutions":[{"id":"https://openalex.org/I157773358","display_name":"Sun Yat-sen University","ror":"https://ror.org/0064kty71","country_code":"CN","type":"education","lineage":["https://openalex.org/I157773358"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yutong Lu","raw_affiliation_strings":["School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China","institution_ids":["https://openalex.org/I157773358"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I157773358"],"apc_list":null,"apc_paid":null,"fwci":1.3511,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.85116868,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":95},"biblio":{"volume":"35","issue":"11","first_page":"2177","last_page":"2192"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10203","display_name":"Recommender Systems and Techniques","score":0.9926000237464905,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10203","display_name":"Recommender Systems and Techniques","score":0.9926000237464905,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9847000241279602,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.9574000239372253,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8831826448440552},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.5474733114242554},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.45195263624191284},{"id":"https://openalex.org/keywords/computer-architecture","display_name":"Computer architecture","score":0.41608884930610657}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8831826448440552},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5474733114242554},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.45195263624191284},{"id":"https://openalex.org/C118524514","wikidata":"https://www.wikidata.org/wiki/Q173212","display_name":"Computer architecture","level":1,"score":0.41608884930610657},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tpds.2024.3432620","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2024.3432620","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":30,"referenced_works":["https://openalex.org/W2475334473","https://openalex.org/W2723293840","https://openalex.org/W2753767282","https://openalex.org/W2947737663","https://openalex.org/W2962745591","https://openalex.org/W2964182926","https://openalex.org/W2984100107","https://openalex.org/W2987219697","https://openalex.org/W2997024057","https://openalex.org/W3014367186","https://openalex.org/W3014810041","https://openalex.org/W3016842236","https://openalex.org/W3035078899","https://openalex.org/W3043004181","https://openalex.org/W3043433718","https://openalex.org/W3131607724","https://openalex.org/W3155455841","https://openalex.org/W3158146252","https://openalex.org/W3206393216","https://openalex.org/W4200542591","https://openalex.org/W4224903494","https://openalex.org/W4285815213","https://openalex.org/W4286635279","https://openalex.org/W4321636494","https://openalex.org/W4382203129","https://openalex.org/W4401211718","https://openalex.org/W6763737044","https://openalex.org/W6769725786","https://openalex.org/W6774806506","https://openalex.org/W6776172991"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W230091440","https://openalex.org/W2390279801","https://openalex.org/W2233261550","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2810751659"],"abstract_inverted_index":{"Recommendation":[0,16],"systems":[1,61],"are":[2],"essential":[3],"to":[4,26,87,90,179,224],"the":[5,8,38,56,68,100,143,163],"operation":[6],"of":[7,10,35,40,58,70,102,147,165],"majority":[9,57],"internet":[11],"services,":[12],"with":[13,172,198],"Deep":[14],"Learning":[15],"Models":[17],"(DLRMs)":[18],"serving":[19],"as":[20,55,94],"a":[21,112,168,199,210],"crucial":[22],"component.":[23],"However,":[24],"due":[25,89],"distinct":[27],"computation,":[28],"data":[29],"access,":[30],"and":[31,99,125,153,192,208,226,233],"memory":[32,97,145],"usage":[33],"characteristics":[34],"recommendation":[36,242],"models,":[37],"trainning":[39],"DLRMs":[41],"may":[42],"suffer":[43],"from":[44],"low":[45,72],"resource":[46,73,155,206],"utilization":[47,74,156,207],"on":[48,64,132,162,184,201,230],"prevalent":[49],"heterogeneous":[50,133],"CPU-GPU":[51,104,134,185],"hardware":[52],"platforms.":[53,135],"Furthermore,":[54],"high-performance":[59],"computing":[60,66],"presently":[62],"depend":[63],"multi-GPU":[65],"nodes,":[67],"challenge":[69],"addressing":[71],"becomes":[75],"even":[76],"more":[77],"pronounced.":[78],"Existing":[79],"concurrent":[80],"training":[81,130,138,159,182,238],"solutions":[82],"cannot":[83],"be":[84],"straightforwardly":[85],"applied":[86],"DLRM":[88,129,158,181],"various":[91,241],"factors,":[92],"such":[93],"insufficient":[95],"fine-grained":[96],"management":[98,124],"lack":[101],"collaborative":[103],"scheduling.":[105],"In":[106],"this":[107],"paper,":[108],"we":[109,140,151],"introduce":[110],"RMixer,":[111],"scheduling":[113,126,175],"framework":[114],"that":[115,219],"addresses":[116],"these":[117],"challenges":[118],"by":[119],"providing":[120],"an":[121],"efficient":[122],"job":[123,170],"mechanism":[127],"for":[128,157,189,237],"jobs":[131,183,239],"To":[136],"facilitate":[137],"co-location,":[139],"first":[141],"estimate":[142],"peak":[144],"consumption":[146],"each":[148],"job.":[149],"Additionally,":[150],"track":[152],"collect":[154],"jobs.":[160],"Based":[161],"information":[164],"computational":[166],"patterns,":[167],"batched":[169],"dispatcher":[171],"dynamic":[173],"resource-complementary":[174],"policy":[176],"is":[177],"proposed":[178],"co-locate":[180],"platform.":[186],"Scheduling":[187],"strategies":[188],"both":[190],"intra-GPU":[191],"inter-GPU":[193],"scenarios":[194],"were":[195],"meticulously":[196],"devised,":[197],"focus":[200],"thoroughly":[202],"examining":[203],"individual":[204],"GPU":[205,232,235],"achieving":[209],"balanced":[211],"state":[212],"across":[213],"multiple":[214],"GPUs.":[215],"Experimental":[216],"results":[217],"demonstrate":[218],"our":[220],"implementation":[221],"achieved":[222],"up":[223],"5.3\u00d7":[225],"7.5\u00d7":[227],"higher":[228],"throughput":[229],"single":[231],"4":[234],"respectively":[236],"involving":[240],"models.":[243]},"counts_by_year":[{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1}],"updated_date":"2026-06-26T08:34:08.712188","created_date":"2025-10-10T00:00:00"}
