{"id":"https://openalex.org/W4409131506","doi":"https://doi.org/10.1631/fitee.2400602","title":"Memory-efficient tensor parallelism for long-sequence Transformer training","display_name":"Memory-efficient tensor parallelism for long-sequence Transformer training","publication_year":2025,"publication_date":"2025-04-02","ids":{"openalex":"https://openalex.org/W4409131506","doi":"https://doi.org/10.1631/fitee.2400602"},"language":"en","primary_location":{"id":"doi:10.1631/fitee.2400602","is_oa":false,"landing_page_url":"https://doi.org/10.1631/fitee.2400602","pdf_url":null,"source":{"id":"https://openalex.org/S4210189857","display_name":"Frontiers of Information Technology & Electronic Engineering","issn_l":"2095-9184","issn":["2095-9184","2095-9230"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers of Information Technology &amp; Electronic Engineering","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5061876398","display_name":"Peng Liang","orcid":"https://orcid.org/0000-0002-5590-5179"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Peng Liang","raw_affiliation_strings":["National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, 410073, China","National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, China"],"raw_orcid":"https://orcid.org/0000-0002-5590-5179","affiliations":[{"raw_affiliation_string":"National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, 410073, China","institution_ids":["https://openalex.org/I170215575"]},{"raw_affiliation_string":"National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043045076","display_name":"Linbo Qiao","orcid":"https://orcid.org/0000-0002-8285-2738"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Linbo Qiao","raw_affiliation_strings":["National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, 410073, China","National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, 410073, China","institution_ids":["https://openalex.org/I170215575"]},{"raw_affiliation_string":"National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yanqi Shi","orcid":null},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yanqi Shi","raw_affiliation_strings":["National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, 410073, China","National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, 410073, China","institution_ids":["https://openalex.org/I170215575"]},{"raw_affiliation_string":"National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101981248","display_name":"Hao Zheng","orcid":"https://orcid.org/0000-0003-0388-0755"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Hao Zheng","raw_affiliation_strings":["National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, 410073, China","National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, 410073, China","institution_ids":["https://openalex.org/I170215575"]},{"raw_affiliation_string":"National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102970145","display_name":"Yu Tang","orcid":"https://orcid.org/0000-0002-8595-1547"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yu Tang","raw_affiliation_strings":["National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, 410073, China","National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, China"],"raw_orcid":null,"affiliations":[{"raw_affiliation_string":"National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, 410073, China","institution_ids":["https://openalex.org/I170215575"]},{"raw_affiliation_string":"National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100440903","display_name":"Dongsheng Li","orcid":"https://orcid.org/0000-0001-9743-2034"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Dongsheng Li","raw_affiliation_strings":["National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, 410073, China","National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, China"],"raw_orcid":"https://orcid.org/0000-0001-9743-2034","affiliations":[{"raw_affiliation_string":"National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, 410073, China","institution_ids":["https://openalex.org/I170215575"]},{"raw_affiliation_string":"National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology, Changsha, China","institution_ids":["https://openalex.org/I170215575"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5100440903"],"corresponding_institution_ids":["https://openalex.org/I170215575"],"apc_list":null,"apc_paid":null,"fwci":2.1733,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.87786312,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":"26","issue":"5","first_page":"770","last_page":"787"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T13650","display_name":"Computational Physics and Python Applications","score":0.9900000095367432,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T13650","display_name":"Computational Physics and Python Applications","score":0.9900000095367432,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9836000204086304,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10271","display_name":"Seismic Imaging and Inversion Techniques","score":0.9646000266075134,"subfield":{"id":"https://openalex.org/subfields/1908","display_name":"Geophysics"},"field":{"id":"https://openalex.org/fields/19","display_name":"Earth and Planetary Sciences"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.6064991354942322},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5663422346115112},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.5335234999656677},{"id":"https://openalex.org/keywords/parallelism","display_name":"Parallelism (grammar)","score":0.5095141530036926},{"id":"https://openalex.org/keywords/sequence","display_name":"Sequence (biology)","score":0.5025351047515869},{"id":"https://openalex.org/keywords/arithmetic","display_name":"Arithmetic","score":0.32906574010849},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.18336701393127441},{"id":"https://openalex.org/keywords/electrical-engineering","display_name":"Electrical engineering","score":0.13941821455955505},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.09371304512023926},{"id":"https://openalex.org/keywords/voltage","display_name":"Voltage","score":0.07382363080978394},{"id":"https://openalex.org/keywords/chemistry","display_name":"Chemistry","score":0.07029449939727783}],"concepts":[{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6064991354942322},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5663422346115112},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5335234999656677},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.5095141530036926},{"id":"https://openalex.org/C2778112365","wikidata":"https://www.wikidata.org/wiki/Q3511065","display_name":"Sequence (biology)","level":2,"score":0.5025351047515869},{"id":"https://openalex.org/C94375191","wikidata":"https://www.wikidata.org/wiki/Q11205","display_name":"Arithmetic","level":1,"score":0.32906574010849},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.18336701393127441},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.13941821455955505},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.09371304512023926},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.07382363080978394},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.07029449939727783},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1631/fitee.2400602","is_oa":false,"landing_page_url":"https://doi.org/10.1631/fitee.2400602","pdf_url":null,"source":{"id":"https://openalex.org/S4210189857","display_name":"Frontiers of Information Technology & Electronic Engineering","issn_l":"2095-9184","issn":["2095-9184","2095-9230"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers of Information Technology &amp; Electronic Engineering","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":28,"referenced_works":["https://openalex.org/W2926767350","https://openalex.org/W2963341956","https://openalex.org/W3015468748","https://openalex.org/W3081168214","https://openalex.org/W3101708369","https://openalex.org/W3204998121","https://openalex.org/W4322718191","https://openalex.org/W4327694855","https://openalex.org/W4327810158","https://openalex.org/W4382119135","https://openalex.org/W4384648639","https://openalex.org/W4384920109","https://openalex.org/W4385291782","https://openalex.org/W4385571616","https://openalex.org/W4386348101","https://openalex.org/W4386768656","https://openalex.org/W4387356039","https://openalex.org/W4388927764","https://openalex.org/W4392341509","https://openalex.org/W4399452094","https://openalex.org/W6600225990","https://openalex.org/W6600466347","https://openalex.org/W6600708310","https://openalex.org/W6601630192","https://openalex.org/W6604932849","https://openalex.org/W6606960104","https://openalex.org/W6631190155","https://openalex.org/W6743235451"],"related_works":["https://openalex.org/W2045183646","https://openalex.org/W2162409446","https://openalex.org/W2109463584","https://openalex.org/W2504075107","https://openalex.org/W2354058185","https://openalex.org/W3014500535","https://openalex.org/W2319812372","https://openalex.org/W1585021387","https://openalex.org/W1595672120","https://openalex.org/W4230999561"],"abstract_inverted_index":{"Transformer-based":[0],"models":[1,5],"like":[2],"large":[3],"language":[4],"(LLMs)":[6],"have":[7],"attracted":[8],"significant":[9],"attention":[10,202],"in":[11,118],"recent":[12],"years":[13],"due":[14],"to":[15,30,127,137,155,171,200,211,216,235],"their":[16],"superior":[17],"performance.":[18],"A":[19],"long":[20],"sequence":[21,44,229],"of":[22,43,98,124,134,168,177],"input":[23],"tokens":[24],"is":[25,113,183],"essential":[26],"for":[27,48,63,95,140],"industrial":[28],"LLMs":[29],"provide":[31],"better":[32,157],"user":[33],"services.":[34],"However,":[35],"memory":[36,65,194,209],"consumption":[37],"increases":[38],"quadratically":[39],"with":[40],"the":[41,96,114,142,151,165,174,180,228],"increase":[42,227],"length,":[45],"posing":[46],"challenges":[47],"scaling":[49],"up":[50],"long-sequence":[51,175],"training.":[52,120],"Current":[53],"parallelism":[54,69,87,92],"methods":[55,237],"produce":[56],"duplicated":[57,146],"tensors":[58],"during":[59],"execution,":[60],"leaving":[61],"space":[62],"improving":[64],"efficiency.":[66],"Additionally,":[67],"tensor":[68,91],"(TP)":[70],"cannot":[71],"achieve":[72,156],"effective":[73],"overlap":[74,158,170],"between":[75,107,159],"computation":[76,97,116,160],"and":[77,103,130,161,203],"communication.":[78,162],"To":[79],"solve":[80],"these":[81],"weaknesses,":[82],"we":[83,188],"propose":[84],"a":[85,104],"general":[86],"method":[88],"called":[89],"memory-efficient":[90],"(METP),":[93],"designed":[94],"two":[99],"consecutive":[100],"matrix":[101],"multiplications":[102],"possible":[105],"function":[106],"them":[108],"(O":[109],"=":[110],"f(AB)C),":[111],"which":[112],"kernel":[115],"component":[117],"Transformer":[119],"METP":[121,191,225],"distributes":[122],"subtasks":[123],"computing":[125],"O":[126],"multiple":[128],"devices":[129],"uses":[131],"send/recv":[132],"instead":[133],"collective":[135],"communication":[136],"exchange":[138],"submatrices":[139],"finishing":[141],"computation,":[143],"avoiding":[144],"producing":[145],"tensors.":[147],"We":[148,163],"also":[149],"apply":[150],"double":[152],"buffering":[153],"technique":[154],"present":[164],"theoretical":[166,186],"condition":[167],"full":[169],"help":[172],"instruct":[173],"training":[176],"Transformers.":[178],"Suppose":[179],"parallel":[181],"degree":[182],"p;":[184],"through":[185],"analysis,":[187],"prove":[189],"that":[190,224],"provides":[192],"O(1/p3)":[193],"overhead":[195],"when":[196,213,238],"not":[197],"using":[198,214,239],"FlashAttention":[199,215],"compute":[201,217],"could":[204],"save":[205],"at":[206],"least":[207],"41.7%":[208],"compared":[210,234],"TP":[212],"multi-head":[218],"self-attention.":[219],"Our":[220],"experimental":[221],"results":[222],"demonstrate":[223],"can":[226],"length":[230],"by":[231],"2.38\u20132.99":[232],"times":[233],"other":[236],"eight":[240],"A100":[241],"graphics":[242],"processing":[243],"units":[244],"(GPUs).":[245]},"counts_by_year":[{"year":2026,"cited_by_count":1}],"updated_date":"2026-03-05T07:30:30.508283","created_date":"2025-10-10T00:00:00"}
