{"id":"https://openalex.org/W4402897214","doi":"https://doi.org/10.1109/iwqos61813.2024.10682910","title":"TaLB: Tensor-aware Load Balancing for Distributed DNN Training Acceleration","display_name":"TaLB: Tensor-aware Load Balancing for Distributed DNN Training Acceleration","publication_year":2024,"publication_date":"2024-06-19","ids":{"openalex":"https://openalex.org/W4402897214","doi":"https://doi.org/10.1109/iwqos61813.2024.10682910"},"language":"en","primary_location":{"id":"doi:10.1109/iwqos61813.2024.10682910","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iwqos61813.2024.10682910","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE/ACM 32nd International Symposium on Quality of Service (IWQoS)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5022806757","display_name":"Jinbin Hu","orcid":"https://orcid.org/0000-0001-8216-9683"},"institutions":[{"id":"https://openalex.org/I56934997","display_name":"Changsha University of Science and Technology","ror":"https://ror.org/03yph8055","country_code":"CN","type":"education","lineage":["https://openalex.org/I56934997"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jinbin Hu","raw_affiliation_strings":["Changsha University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Changsha University of Science and Technology","institution_ids":["https://openalex.org/I56934997"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062236368","display_name":"Yi He","orcid":"https://orcid.org/0009-0005-7430-428X"},"institutions":[{"id":"https://openalex.org/I56934997","display_name":"Changsha University of Science and Technology","ror":"https://ror.org/03yph8055","country_code":"CN","type":"education","lineage":["https://openalex.org/I56934997"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yi He","raw_affiliation_strings":["Changsha University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Changsha University of Science and Technology","institution_ids":["https://openalex.org/I56934997"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051907475","display_name":"Wangqing Luo","orcid":null},"institutions":[{"id":"https://openalex.org/I56934997","display_name":"Changsha University of Science and Technology","ror":"https://ror.org/03yph8055","country_code":"CN","type":"education","lineage":["https://openalex.org/I56934997"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wangqing Luo","raw_affiliation_strings":["Changsha University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Changsha University of Science and Technology","institution_ids":["https://openalex.org/I56934997"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008483730","display_name":"Jiawei Huang","orcid":"https://orcid.org/0000-0002-7578-4490"},"institutions":[{"id":"https://openalex.org/I139660479","display_name":"Central South University","ror":"https://ror.org/00f1zfq44","country_code":"CN","type":"education","lineage":["https://openalex.org/I139660479"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiawei Huang","raw_affiliation_strings":["Central South University"],"affiliations":[{"raw_affiliation_string":"Central South University","institution_ids":["https://openalex.org/I139660479"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100438360","display_name":"Jianxin Wang","orcid":"https://orcid.org/0000-0003-1516-0480"},"institutions":[{"id":"https://openalex.org/I139660479","display_name":"Central South University","ror":"https://ror.org/00f1zfq44","country_code":"CN","type":"education","lineage":["https://openalex.org/I139660479"]},{"id":"https://openalex.org/I56934997","display_name":"Changsha University of Science and Technology","ror":"https://ror.org/03yph8055","country_code":"CN","type":"education","lineage":["https://openalex.org/I56934997"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jianxin Wang","raw_affiliation_strings":["Central South University","Changsha University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Central South University","institution_ids":["https://openalex.org/I139660479"]},{"raw_affiliation_string":"Changsha University of Science and Technology","institution_ids":["https://openalex.org/I56934997"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5113414416","display_name":"Jin Wang","orcid":"https://orcid.org/0000-0002-8298-4378"},"institutions":[{"id":"https://openalex.org/I56934997","display_name":"Changsha University of Science and Technology","ror":"https://ror.org/03yph8055","country_code":"CN","type":"education","lineage":["https://openalex.org/I56934997"]},{"id":"https://openalex.org/I139660479","display_name":"Central South University","ror":"https://ror.org/00f1zfq44","country_code":"CN","type":"education","lineage":["https://openalex.org/I139660479"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jin Wang","raw_affiliation_strings":["Central South University","Changsha University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Central South University","institution_ids":["https://openalex.org/I139660479"]},{"raw_affiliation_string":"Changsha University of Science and Technology","institution_ids":["https://openalex.org/I56934997"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5022806757"],"corresponding_institution_ids":["https://openalex.org/I56934997"],"apc_list":null,"apc_paid":null,"fwci":0.5319,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.64152664,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"10"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9775999784469604,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9775999784469604,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7753673791885376},{"id":"https://openalex.org/keywords/acceleration","display_name":"Acceleration","score":0.7216376662254333},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.5928770303726196},{"id":"https://openalex.org/keywords/load-balancing","display_name":"Load balancing (electrical power)","score":0.5901210308074951},{"id":"https://openalex.org/keywords/tensor","display_name":"Tensor (intrinsic definition)","score":0.5274420380592346},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.41499772667884827},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.096772700548172},{"id":"https://openalex.org/keywords/physics","display_name":"Physics","score":0.05510425567626953}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7753673791885376},{"id":"https://openalex.org/C117896860","wikidata":"https://www.wikidata.org/wiki/Q11376","display_name":"Acceleration","level":2,"score":0.7216376662254333},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.5928770303726196},{"id":"https://openalex.org/C138959212","wikidata":"https://www.wikidata.org/wiki/Q1806783","display_name":"Load balancing (electrical power)","level":3,"score":0.5901210308074951},{"id":"https://openalex.org/C155281189","wikidata":"https://www.wikidata.org/wiki/Q3518150","display_name":"Tensor (intrinsic definition)","level":2,"score":0.5274420380592346},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.41499772667884827},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.096772700548172},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.05510425567626953},{"id":"https://openalex.org/C202444582","wikidata":"https://www.wikidata.org/wiki/Q837863","display_name":"Pure mathematics","level":1,"score":0.0},{"id":"https://openalex.org/C187691185","wikidata":"https://www.wikidata.org/wiki/Q2020720","display_name":"Grid","level":2,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C2524010","wikidata":"https://www.wikidata.org/wiki/Q8087","display_name":"Geometry","level":1,"score":0.0},{"id":"https://openalex.org/C74650414","wikidata":"https://www.wikidata.org/wiki/Q11397","display_name":"Classical mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/iwqos61813.2024.10682910","is_oa":false,"landing_page_url":"https://doi.org/10.1109/iwqos61813.2024.10682910","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE/ACM 32nd International Symposium on Quality of Service (IWQoS)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/7","score":0.7799999713897705,"display_name":"Affordable and clean energy"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"},{"id":"https://openalex.org/F4320322843","display_name":"Natural Science Foundation of\u00a0Hunan Province","ror":null}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":41,"referenced_works":["https://openalex.org/W1994926493","https://openalex.org/W2060393849","https://openalex.org/W2132320636","https://openalex.org/W2142480021","https://openalex.org/W2149939304","https://openalex.org/W2157990152","https://openalex.org/W2169246522","https://openalex.org/W2402144811","https://openalex.org/W2498764059","https://openalex.org/W2744567246","https://openalex.org/W2744698795","https://openalex.org/W2920397365","https://openalex.org/W2968755040","https://openalex.org/W2969388332","https://openalex.org/W2975712713","https://openalex.org/W2991040477","https://openalex.org/W2993199282","https://openalex.org/W3047002910","https://openalex.org/W3047498084","https://openalex.org/W3155611867","https://openalex.org/W3174868702","https://openalex.org/W4237445262","https://openalex.org/W4289401659","https://openalex.org/W4372260356","https://openalex.org/W4381328668","https://openalex.org/W4386709690","https://openalex.org/W4389989040","https://openalex.org/W4389989295","https://openalex.org/W4392024151","https://openalex.org/W4401508579","https://openalex.org/W6636819530","https://openalex.org/W6736026156","https://openalex.org/W6736413256","https://openalex.org/W6739693220","https://openalex.org/W6746200960","https://openalex.org/W6754818105","https://openalex.org/W6762211661","https://openalex.org/W6766978945","https://openalex.org/W6794216301","https://openalex.org/W6989061410","https://openalex.org/W7005924706"],"related_works":["https://openalex.org/W3004117068","https://openalex.org/W2043284556","https://openalex.org/W3175372827","https://openalex.org/W2488220206","https://openalex.org/W2146301767","https://openalex.org/W191730236","https://openalex.org/W4384948837","https://openalex.org/W4312680855","https://openalex.org/W1524016490","https://openalex.org/W3191990038"],"abstract_inverted_index":{"Increasingly":[0],"large-scale":[1,132],"models":[2],"and":[3,26,37,68,85,112,131,159,171],"rich":[4],"data":[5],"sets":[6],"make":[7],"communication":[8],"overhead":[9],"a":[10,95],"key":[11],"bottleneck":[12],"for":[13,70],"distributed":[14],"Deep":[15],"Neural":[16],"Network":[17],"(DNN)":[18],"training,":[19],"constantly":[20],"attracting":[21],"the":[22,58,75,108,118,123,149,161],"attention":[23],"of":[24,65],"academia":[25],"industry.":[27],"Despite":[28],"continuous":[29],"efforts,":[30],"prior":[31],"solutions":[32],"such":[33],"as":[34],"pipelining":[35],"computation/communication":[36],"in-network":[38],"gradient":[39,71],"compression/scheduling":[40],"do":[41],"not":[42],"focus":[43],"on":[44,117,168],"how":[45],"to":[46,101,121,157,166],"accelerate":[47,102],"DNN":[48,76,103,141],"training":[49,77,142,163],"through":[50],"load":[51,60],"balancing":[52,61],"in":[53,80],"datacenter":[54],"networks":[55],"(DCNs).":[56],"However,":[57],"existing":[59],"mechanisms":[62],"are":[63],"unaware":[64],"tensor":[66,82],"integrity":[67],"priority":[69,110],"parameter":[72],"synchronization":[73],"during":[74],"iterations,":[78],"resulting":[79],"severe":[81],"tail":[83,126],"latency":[84],"slow":[86],"model":[87,162],"convergence":[88],"speed.":[89,143],"In":[90],"this":[91],"paper,":[92],"we":[93],"present":[94],"Tensor-aware":[96],"Load":[97],"Balancing":[98],"(TaLB)":[99],"scheme":[100],"training.":[104],"Specifically,":[105],"TaLB":[106,138,146],"identifies":[107],"different":[109],"tensors":[111,125],"makes":[113],"(re)routing":[114],"decisions":[115],"based":[116],"tensor-level":[119],"granularity":[120],"cut":[122],"high-priority":[124],"delay.":[127],"The":[128],"testbed":[129],"implementation":[130],"NS-3":[133],"simulation":[134],"results":[135],"show":[136],"that":[137],"effectively":[139],"accelerates":[140,160],"For":[144],"example,":[145],"significantly":[147],"reduces":[148],"average":[150],"flow":[151],"completion":[152],"time":[153],"(FCT)":[154],"by":[155],"up":[156,165],"55%,":[158],"speed":[164],"2.37\u00d7":[167],"VGG19,":[169],"ResNet50":[170],"AlexNet":[172],"models.":[173]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-12-19T19:40:27.379048","created_date":"2025-10-10T00:00:00"}
