{"id":"https://openalex.org/W7134042964","doi":"https://doi.org/10.3233/faia260015","title":"TAGC-2: More Efficient Transformer Training in Distributed Environments","display_name":"TAGC-2: More Efficient Transformer Training in Distributed Environments","publication_year":2026,"publication_date":"2026-03-04","ids":{"openalex":"https://openalex.org/W7134042964","doi":"https://doi.org/10.3233/faia260015"},"language":null,"primary_location":{"id":"doi:10.3233/faia260015","is_oa":true,"landing_page_url":"https://doi.org/10.3233/faia260015","pdf_url":null,"source":{"id":"https://openalex.org/S4210201731","display_name":"Frontiers in artificial intelligence and applications","issn_l":"0922-6389","issn":["0922-6389","1879-8314"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence and Applications","raw_type":"book-chapter"},"type":"book-chapter","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://doi.org/10.3233/faia260015","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5017891100","display_name":"Igor N. Polyakov","orcid":null},"institutions":[{"id":"https://openalex.org/I173089394","display_name":"ITMO University","ror":"https://ror.org/04txgxn49","country_code":"RU","type":"education","lineage":["https://openalex.org/I173089394"]}],"countries":["RU"],"is_corresponding":true,"raw_author_name":"Igor Polyakov","raw_affiliation_strings":["ITMO University, Russia"],"raw_orcid":"https://orcid.org/0009-0004-0229-5380","affiliations":[{"raw_affiliation_string":"ITMO University, Russia","institution_ids":["https://openalex.org/I173089394"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5127258637","display_name":"A. V. Dukhanov","orcid":null},"institutions":[{"id":"https://openalex.org/I173089394","display_name":"ITMO University","ror":"https://ror.org/04txgxn49","country_code":"RU","type":"education","lineage":["https://openalex.org/I173089394"]}],"countries":["RU"],"is_corresponding":false,"raw_author_name":"Alexey Dukhanov","raw_affiliation_strings":["ITMO University, Russia"],"raw_orcid":"https://orcid.org/0000-0002-1011-9932","affiliations":[{"raw_affiliation_string":"ITMO University, Russia","institution_ids":["https://openalex.org/I173089394"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5017891100"],"corresponding_institution_ids":["https://openalex.org/I173089394"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.45404178,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.28110000491142273,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.28110000491142273,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T14347","display_name":"Big Data and Digital Economy","score":0.11980000138282776,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.08190000057220459,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/quantization","display_name":"Quantization (signal processing)","score":0.4668000042438507},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.4287000000476837},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.4171999990940094},{"id":"https://openalex.org/keywords/rewriting","display_name":"Rewriting","score":0.4156999886035919},{"id":"https://openalex.org/keywords/distributed-algorithm","display_name":"Distributed algorithm","score":0.36559998989105225},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.35519999265670776},{"id":"https://openalex.org/keywords/synchronism","display_name":"Synchronism","score":0.35359999537467957},{"id":"https://openalex.org/keywords/bandwidth","display_name":"Bandwidth (computing)","score":0.34150001406669617}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7897999882698059},{"id":"https://openalex.org/C28855332","wikidata":"https://www.wikidata.org/wiki/Q198099","display_name":"Quantization (signal processing)","level":2,"score":0.4668000042438507},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.44999998807907104},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.4287000000476837},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.42730000615119934},{"id":"https://openalex.org/C79403827","wikidata":"https://www.wikidata.org/wiki/Q3988","display_name":"Real-time computing","level":1,"score":0.4262000024318695},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.4171999990940094},{"id":"https://openalex.org/C154690210","wikidata":"https://www.wikidata.org/wiki/Q1668499","display_name":"Rewriting","level":2,"score":0.4156999886035919},{"id":"https://openalex.org/C130120984","wikidata":"https://www.wikidata.org/wiki/Q2835898","display_name":"Distributed algorithm","level":2,"score":0.36559998989105225},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3653999865055084},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.35519999265670776},{"id":"https://openalex.org/C2779853468","wikidata":"https://www.wikidata.org/wiki/Q7662139","display_name":"Synchronism","level":3,"score":0.35359999537467957},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.3504999876022339},{"id":"https://openalex.org/C2776257435","wikidata":"https://www.wikidata.org/wiki/Q1576430","display_name":"Bandwidth (computing)","level":2,"score":0.34150001406669617},{"id":"https://openalex.org/C43126263","wikidata":"https://www.wikidata.org/wiki/Q128751","display_name":"Source code","level":2,"score":0.30799999833106995},{"id":"https://openalex.org/C187691185","wikidata":"https://www.wikidata.org/wiki/Q2020720","display_name":"Grid","level":2,"score":0.29760000109672546},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.29589998722076416},{"id":"https://openalex.org/C91481028","wikidata":"https://www.wikidata.org/wiki/Q1054686","display_name":"Distributed memory","level":3,"score":0.29350000619888306},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.2849999964237213},{"id":"https://openalex.org/C78548338","wikidata":"https://www.wikidata.org/wiki/Q2493","display_name":"Data compression","level":2,"score":0.27480000257492065},{"id":"https://openalex.org/C3739613","wikidata":"https://www.wikidata.org/wiki/Q679003","display_name":"Distributed Computing Environment","level":2,"score":0.2732999920845032},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.26969999074935913},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.263700008392334},{"id":"https://openalex.org/C179518139","wikidata":"https://www.wikidata.org/wiki/Q5140297","display_name":"Coding (social sciences)","level":2,"score":0.2603999972343445},{"id":"https://openalex.org/C193415008","wikidata":"https://www.wikidata.org/wiki/Q639681","display_name":"Network architecture","level":2,"score":0.2590000033378601},{"id":"https://openalex.org/C193519340","wikidata":"https://www.wikidata.org/wiki/Q891179","display_name":"Data loss","level":2,"score":0.25200000405311584}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.3233/faia260015","is_oa":true,"landing_page_url":"https://doi.org/10.3233/faia260015","pdf_url":null,"source":{"id":"https://openalex.org/S4210201731","display_name":"Frontiers in artificial intelligence and applications","issn_l":"0922-6389","issn":["0922-6389","1879-8314"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence and Applications","raw_type":"book-chapter"}],"best_oa_location":{"id":"doi:10.3233/faia260015","is_oa":true,"landing_page_url":"https://doi.org/10.3233/faia260015","pdf_url":null,"source":{"id":"https://openalex.org/S4210201731","display_name":"Frontiers in artificial intelligence and applications","issn_l":"0922-6389","issn":["0922-6389","1879-8314"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Frontiers in Artificial Intelligence and Applications","raw_type":"book-chapter"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Rapidly":[0],"developing":[1],"large":[2,16],"language":[3],"models":[4],"(LLMs)":[5],"stimulate":[6],"the":[7,15,30,53,65,69,72,77,82,117,131,148],"search":[8],"for":[9],"efficient":[10],"training":[11,33,98,120],"strategies":[12],"coping":[13],"with":[14,68,136],"computational":[17],"expenses":[18],"caused":[19],"by":[20,122,134],"distributed":[21,32],"training.":[22],"Gradient":[23,46],"synchronization":[24],"is":[25,157],"an":[26,161],"important":[27],"component":[28],"of":[29,52],"overall":[31],"process,":[34],"especially":[35],"in":[36,59],"Sharded":[37,150],"Parallelism":[38],"mode.":[39],"In":[40],"this":[41],"paper,":[42],"we":[43],"present":[44],"Transformer-Aware":[45],"Compression-2":[47],"(TAGC-2),":[48],"a":[49,137],"further":[50,63],"advancement":[51],"gradient":[54],"compression":[55],"algorithm":[56],"that":[57,114],"specializes":[58],"transformer-based":[60],"models.":[61],"TAGC-2":[62,95,115],"develops":[64],"TAGC":[66],"method":[67,78],"following":[70],"improvements:":[71],"communication/computation":[73],"overlap":[74],"was":[75,79],"optimized;":[76],"adapted":[80],"to":[81,147],"network":[83,107,126,143],"conditions":[84,128],"currently":[85],"used":[86],"employing":[87],"bfloat16":[88],"quantization":[89],"and":[90,104,129],"rewriting":[91],"sparsification":[92],"as":[93,100,160],"kernels.":[94],"enables":[96],"long":[97],"runs":[99],"it":[101],"implements":[102],"checkpointing":[103],"restarts":[105],"upon":[106],"or":[108],"memory":[109],"failures.":[110],"The":[111,155],"experiments":[112],"demonstrate":[113],"improves":[116],"wall":[118],"clock":[119],"time":[121,133],"3.1%":[123],"under":[124,141],"low":[125],"bandwidth":[127,144],"shortens":[130],"iteration":[132],"4.6-10%":[135],"minimal":[138],"loss":[139],"degradation":[140],"common":[142],"conditions,":[145],"compared":[146],"Fully":[149],"Data":[151],"Parallel":[152],"(FSDP)":[153],"baseline.":[154],"implementation":[156],"publicly":[158],"available":[159],"open":[162],"source":[163],"code":[164],"at":[165],"https://github.com/ipolyakov/TAGC.":[166]},"counts_by_year":[],"updated_date":"2026-05-21T06:26:12.895304","created_date":"2026-03-07T00:00:00"}
