{"id":"https://openalex.org/W4321192305","doi":"https://doi.org/10.1109/icce56470.2023.10043527","title":"Addressing Straggler Problem Through Dynamic Partial All-Reduce for Distributed Deep Learning in Heterogeneous GPU Clusters","display_name":"Addressing Straggler Problem Through Dynamic Partial All-Reduce for Distributed Deep Learning in Heterogeneous GPU Clusters","publication_year":2023,"publication_date":"2023-01-06","ids":{"openalex":"https://openalex.org/W4321192305","doi":"https://doi.org/10.1109/icce56470.2023.10043527"},"language":"en","primary_location":{"id":"doi:10.1109/icce56470.2023.10043527","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icce56470.2023.10043527","pdf_url":null,"source":{"id":"https://openalex.org/S4363607959","display_name":"2023 IEEE International Conference on Consumer Electronics (ICCE)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE International Conference on Consumer Electronics (ICCE)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100388393","display_name":"Hyungjun Kim","orcid":"https://orcid.org/0009-0003-5078-4405"},"institutions":[{"id":"https://openalex.org/I197347611","display_name":"Korea University","ror":"https://ror.org/047dqcg40","country_code":"KR","type":"education","lineage":["https://openalex.org/I197347611"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"HyungJun Kim","raw_affiliation_strings":["Korea University,Dept. 
of Computer Science and Engineering,Seoul,Republic of Korea","Dept. of Computer Science and Engineering, Korea University, Seoul, Republic of Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Korea University,Dept. of Computer Science and Engineering,Seoul,Republic of Korea","institution_ids":["https://openalex.org/I197347611"]},{"raw_affiliation_string":"Dept. of Computer Science and Engineering, Korea University, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I197347611"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113377726","display_name":"Chunggeon Song","orcid":null},"institutions":[{"id":"https://openalex.org/I197347611","display_name":"Korea University","ror":"https://ror.org/047dqcg40","country_code":"KR","type":"education","lineage":["https://openalex.org/I197347611"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Chunggeon Song","raw_affiliation_strings":["Korea University,Dept. of Computer Science and Engineering,Seoul,Republic of Korea","Dept. of Computer Science and Engineering, Korea University, Seoul, Republic of Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Korea University,Dept. of Computer Science and Engineering,Seoul,Republic of Korea","institution_ids":["https://openalex.org/I197347611"]},{"raw_affiliation_string":"Dept. 
of Computer Science and Engineering, Korea University, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I197347611"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051519943","display_name":"HwaMin Lee","orcid":"https://orcid.org/0000-0002-6482-3511"},"institutions":[{"id":"https://openalex.org/I2799980853","display_name":"Korea University Medical Center","ror":"https://ror.org/02cs2sd33","country_code":"KR","type":"healthcare","lineage":["https://openalex.org/I197347611","https://openalex.org/I2799980853"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"HwaMin Lee","raw_affiliation_strings":["Korea University,Dept. of Medical Informatics,Seoul,Republic of Korea","Dept. of Medical Informatics, Korea University, Seoul, Republic of Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Korea University,Dept. of Medical Informatics,Seoul,Republic of Korea","institution_ids":["https://openalex.org/I2799980853"]},{"raw_affiliation_string":"Dept. of Medical Informatics, Korea University, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I2799980853"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5052703193","display_name":"Heonchang Yu","orcid":"https://orcid.org/0000-0003-2216-595X"},"institutions":[{"id":"https://openalex.org/I197347611","display_name":"Korea University","ror":"https://ror.org/047dqcg40","country_code":"KR","type":"education","lineage":["https://openalex.org/I197347611"]}],"countries":["KR"],"is_corresponding":false,"raw_author_name":"Heonchang Yu","raw_affiliation_strings":["Korea University,Dept. of Computer Science and Engineering,Seoul,Republic of Korea","Dept. of Computer Science and Engineering, Korea University, Seoul, Republic of Korea"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"Korea University,Dept. 
of Computer Science and Engineering,Seoul,Republic of Korea","institution_ids":["https://openalex.org/I197347611"]},{"raw_affiliation_string":"Dept. of Computer Science and Engineering, Korea University, Seoul, Republic of Korea","institution_ids":["https://openalex.org/I197347611"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.4159,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.51184346,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":90,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11612","display_name":"Stochastic Gradient Optimization Techniques","score":0.9994999766349792,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9987000226974487,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical 
Sciences"}},{"id":"https://openalex.org/T12676","display_name":"Machine Learning and ELM","score":0.9983000159263611,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7972052097320557},{"id":"https://openalex.org/keywords/asynchronous-communication","display_name":"Asynchronous communication","score":0.7545809745788574},{"id":"https://openalex.org/keywords/deep-learning","display_name":"Deep learning","score":0.5692113041877747},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.5665539503097534},{"id":"https://openalex.org/keywords/convergence","display_name":"Convergence (economics)","score":0.5359533429145813},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.4921712279319763},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.48023277521133423},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.45954981446266174},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.4544218182563782},{"id":"https://openalex.org/keywords/distributed-algorithm","display_name":"Distributed algorithm","score":0.4108010530471802},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.2989301085472107},{"id":"https://openalex.org/keywords/computer-network","display_name":"Computer network","score":0.16843605041503906}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer 
science","level":0,"score":0.7972052097320557},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.7545809745788574},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.5692113041877747},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.5665539503097534},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.5359533429145813},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.4921712279319763},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.48023277521133423},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45954981446266174},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.4544218182563782},{"id":"https://openalex.org/C130120984","wikidata":"https://www.wikidata.org/wiki/Q2835898","display_name":"Distributed algorithm","level":2,"score":0.4108010530471802},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.2989301085472107},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer 
network","level":1,"score":0.16843605041503906},{"id":"https://openalex.org/C50522688","wikidata":"https://www.wikidata.org/wiki/Q189833","display_name":"Economic growth","level":1,"score":0.0},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icce56470.2023.10043527","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icce56470.2023.10043527","pdf_url":null,"source":{"id":"https://openalex.org/S4363607959","display_name":"2023 IEEE International Conference on Consumer Electronics (ICCE)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 IEEE International Conference on Consumer Electronics (ICCE)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","score":0.5799999833106995,"display_name":"Peace, Justice and strong institutions"},{"id":"https://metadata.un.org/sdg/10","score":0.44999998807907104,"display_name":"Reduced 
inequalities"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":12,"referenced_works":["https://openalex.org/W2194775991","https://openalex.org/W2559655401","https://openalex.org/W2622263826","https://openalex.org/W2982664135","https://openalex.org/W3011751313","https://openalex.org/W3141595720","https://openalex.org/W4212774754","https://openalex.org/W4301239768","https://openalex.org/W6679154944","https://openalex.org/W6745458143","https://openalex.org/W6756976665","https://openalex.org/W6766978945"],"related_works":["https://openalex.org/W2058965144","https://openalex.org/W2164382479","https://openalex.org/W2146343568","https://openalex.org/W4246549241","https://openalex.org/W2168758875","https://openalex.org/W2410733619","https://openalex.org/W2963483475","https://openalex.org/W2120877146","https://openalex.org/W2766289720","https://openalex.org/W4310174516"],"abstract_inverted_index":{"Distributed":[0,42],"deep":[1,10,19,43],"learning":[2,8,11,20,44,133,238,245],"is":[3,45,81,88,96,165,171,210],"an":[4],"inevitable":[5],"choice":[6],"in":[7,33,146,219],"large-scale":[9],"models":[12,21],"today.":[13],"Beyond":[14],"a":[15,82,131,137,162],"certain":[16],"level,":[17],"training":[18],"can":[22,28,109],"take":[23],"days":[24],"or":[25,40],"months,":[26],"which":[27],"lead":[29],"to":[30,54,90,113,118,149,185,200,230,249,256],"catastrophic":[31],"consequences":[32],"applications":[34],"that":[35,84,135],"require":[36],"rapid":[37],"trend":[38],"reflection":[39],"decision-making.":[41],"largely":[46],"divided":[47],"into":[48],"synchronous":[49,138],"and":[50,79,180,221,252,265,267],"asynchronous":[51],"methods":[52],"according":[53],"the":[55,59,67,70,85,91,105,114,119,152,155,167,175,182,187,191,195,201,208,216,224,231,237,240,250,253],"synchronization":[56,148,202],"method":[57,139],"at":[58],"time":[60],"of":[61,73,154,169,226,233,242],"parameter":[62,68],"update.":[63],"The":[64,94],"former":[
65],"updates":[66,100],"with":[69],"average":[71],"value":[72],"gradient":[74,121],"calculated":[75],"by":[76,173,212],"all":[77,268],"workers,":[78],"there":[80],"problem":[83],"processing":[86],"speed":[87,220],"matched":[89],"slowest":[92,106],"worker.":[93],"latter":[95],"faster":[97],"because":[98],"it":[99,198],"parameters":[101],"without":[102],"waiting":[103],"for":[104],"worker,":[107,251],"but":[108,140],"converge":[110],"more":[111],"slowly":[112],"optimal":[115],"state":[116],"due":[117,229,255],"stale":[120],"problem.":[122,157],"In":[123,158,205],"this":[124,159,206,260],"paper,":[125],"we":[126],"propose":[127],"Dynamic":[128],"Partial":[129],"All-Reduce,":[130],"distributed":[132,248],"algorithm":[134,261],"uses":[136],"dynamically":[141],"manages":[142],"whether":[143],"workers":[144,184],"participate":[145],"global":[147,178],"autonomously":[150],"control":[151],"effects":[153],"straggler":[156,170],"algorithm,":[160],"if":[161],"slow":[163,192],"worker":[164,176,193],"detected,":[166],"influence":[168],"limited":[172],"excluding":[174],"from":[177,236],"communication":[179],"allowing":[181],"remaining":[183],"update":[186],"parameters.":[188],"Then,":[189],"when":[190],"recovers":[194],"normal":[196],"speed,":[197],"returns":[199],"group":[203],"again.":[204],"way,":[207],"decision":[209],"made":[211],"comparing":[213],"what":[214],"causes":[215],"greater":[217],"loss":[218,225],"convergence":[222],"between":[223],"computational":[227],"power":[228],"exclusion":[232],"one":[234],"GPU":[235],"process,":[239],"omission":[241],"as":[243,247],"much":[244],"data":[246],"slowdown":[254],"straggler.":[257],"We":[258],"implemented":[259],"based":[262],"on":[263,272],"PyTorch":[264],"Horovod,":[266],"experiments":[269],"were":[270],"conducted":[271],"Tencent":[273],"Cloud.":[274]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1},{"year":2024,"cited_by_count":1},{"year":2023,"cited_by_count":1}],"updated_date":"2026-06-11T
09:08:48.828518","created_date":"2025-10-10T00:00:00"}
