{"id":"https://openalex.org/W4415125101","doi":"https://doi.org/10.1109/icmlt65785.2025.11193392","title":"AD-KFAC: Asynchronous Decentralized Distributed K-FAC with Dynamic Load Balancing and Fault Tolerance","display_name":"AD-KFAC: Asynchronous Decentralized Distributed K-FAC with Dynamic Load Balancing and Fault Tolerance","publication_year":2025,"publication_date":"2025-05-23","ids":{"openalex":"https://openalex.org/W4415125101","doi":"https://doi.org/10.1109/icmlt65785.2025.11193392"},"language":"en","primary_location":{"id":"doi:10.1109/icmlt65785.2025.11193392","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icmlt65785.2025.11193392","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 10th International Conference on Machine Learning Technologies (ICMLT)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5070945903","display_name":"Mingzhe Yu","orcid":null},"institutions":[{"id":"https://openalex.org/I146399215","display_name":"University of Tsukuba","ror":"https://ror.org/02956yf07","country_code":"JP","type":"education","lineage":["https://openalex.org/I146399215"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Mingzhe Yu","raw_affiliation_strings":["University of Tsukuba,Graduate School of Science and Technology,Tsukuba,Japan"],"affiliations":[{"raw_affiliation_string":"University of Tsukuba,Graduate School of Science and Technology,Tsukuba,Japan","institution_ids":["https://openalex.org/I146399215"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5085773116","display_name":"Osamu Tatebe","orcid":null},"institutions":[{"id":"https://openalex.org/I146399215","display_name":"University of Tsukuba","ror":"https://ror.org/02956yf07","country_code":"JP","type":"education","lineage":["https://openalex.org/I146399215"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Osamu Tatebe","raw_affiliation_strings":["University of Tsukuba,Center for Computational Sciences,Tsukuba,Japan"],"affiliations":[{"raw_affiliation_string":"University of Tsukuba,Center for Computational Sciences,Tsukuba,Japan","institution_ids":["https://openalex.org/I146399215"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5070945903"],"corresponding_institution_ids":["https://openalex.org/I146399215"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.33235643,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"373","last_page":"382"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.9930999875068665,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.9930999875068665,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9797000288963318,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9757999777793884,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/asynchronous-communication","display_name":"Asynchronous communication","score":0.8658999800682068},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7192999720573425},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness (evolution)","score":0.6532999873161316},{"id":"https://openalex.org/keywords/fault-tolerance","display_name":"Fault tolerance","score":0.6259999871253967},{"id":"https://openalex.org/keywords/load-balancing","display_name":"Load balancing (electrical power)","score":0.5681999921798706},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.49970000982284546},{"id":"https://openalex.org/keywords/asynchrony","display_name":"Asynchrony (computer programming)","score":0.44909998774528503},{"id":"https://openalex.org/keywords/synchronizer","display_name":"Synchronizer","score":0.4115999937057495}],"concepts":[{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.8658999800682068},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8258000016212463},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.7513999938964844},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7192999720573425},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.6532999873161316},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.6259999871253967},{"id":"https://openalex.org/C138959212","wikidata":"https://www.wikidata.org/wiki/Q1806783","display_name":"Load balancing (electrical power)","level":3,"score":0.5681999921798706},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.49970000982284546},{"id":"https://openalex.org/C2779019669","wikidata":"https://www.wikidata.org/wiki/Q25203946","display_name":"Asynchrony (computer programming)","level":3,"score":0.44909998774528503},{"id":"https://openalex.org/C66727535","wikidata":"https://www.wikidata.org/wiki/Q7662199","display_name":"Synchronizer","level":2,"score":0.4115999937057495},{"id":"https://openalex.org/C107107730","wikidata":"https://www.wikidata.org/wiki/Q2994424","display_name":"Consensus","level":3,"score":0.38530001044273376},{"id":"https://openalex.org/C130120984","wikidata":"https://www.wikidata.org/wiki/Q2835898","display_name":"Distributed algorithm","level":2,"score":0.3831000030040741},{"id":"https://openalex.org/C62611344","wikidata":"https://www.wikidata.org/wiki/Q1062658","display_name":"Node (physics)","level":2,"score":0.3513000011444092},{"id":"https://openalex.org/C534932454","wikidata":"https://www.wikidata.org/wiki/Q161410","display_name":"Peer-to-peer","level":2,"score":0.3093999922275543},{"id":"https://openalex.org/C137836250","wikidata":"https://www.wikidata.org/wiki/Q984063","display_name":"Optimization problem","level":2,"score":0.3003999888896942},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.2863999903202057},{"id":"https://openalex.org/C194571728","wikidata":"https://www.wikidata.org/wiki/Q206047","display_name":"Asynchronous Transfer Mode","level":2,"score":0.2797999978065491},{"id":"https://openalex.org/C192126672","wikidata":"https://www.wikidata.org/wiki/Q1068715","display_name":"Telecommunications network","level":2,"score":0.27720001339912415},{"id":"https://openalex.org/C65813073","wikidata":"https://www.wikidata.org/wiki/Q1622420","display_name":"High availability","level":2,"score":0.26100000739097595},{"id":"https://openalex.org/C205875254","wikidata":"https://www.wikidata.org/wiki/Q17156857","display_name":"Decentralised system","level":3,"score":0.25949999690055847}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/icmlt65785.2025.11193392","is_oa":false,"landing_page_url":"https://doi.org/10.1109/icmlt65785.2025.11193392","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2025 10th International Conference on Machine Learning Technologies (ICMLT)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[{"id":"https://openalex.org/F4320337504","display_name":"Research and Development","ror":"https://ror.org/027s68j25"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":8,"referenced_works":["https://openalex.org/W2194775991","https://openalex.org/W3081233814","https://openalex.org/W3131087743","https://openalex.org/W3180201702","https://openalex.org/W4295308225","https://openalex.org/W4322766202","https://openalex.org/W4386260566","https://openalex.org/W4394717655"],"related_works":[],"abstract_inverted_index":{"Second-order":[0],"optimization":[1,64],"methods,":[2,121],"such":[3],"as":[4,135],"Kronecker-Factored":[5],"Approximate":[6],"Curvature":[7],"(K-FAC),":[8],"offer":[9],"superior":[10],"convergence":[11],"properties":[12],"compared":[13],"to":[14,39,42],"first-order":[15,118],"methods":[16,28],"but":[17],"have":[18],"predominantly":[19],"been":[20],"explored":[21],"within":[22],"synchronous":[23,27,113],"communication":[24,47],"frameworks.":[25],"However,":[26],"are":[29],"inherently":[30],"limited":[31],"in":[32,146],"heterogeneous":[33],"or":[34],"unstable":[35,148],"distributed":[36,142],"environments":[37],"due":[38],"their":[40],"sensitivity":[41],"node":[43,128],"failures,":[44],"latency,":[45],"and":[46,81,94,116,127,138],"delays.":[48],"In":[49],"this":[50],"paper,":[51],"we":[52],"propose":[53],"an":[54],"Asynchronous":[55],"Decentralized":[56],"Distributed":[57],"K-FAC":[58,73,114],"(AD-KFAC)":[59],"framework,":[60],"effectively":[61],"extending":[62],"second-order":[63],"into":[65],"asynchronous":[66,72,119],"decentralized":[67,120],"settings.":[68],"Our":[69],"approach":[70],"integrates":[71],"computations,":[74],"peer-to-peer":[75],"Remote":[76],"Procedure":[77],"Call":[78],"(RPC)-based":[79],"communication,":[80],"dynamic":[82],"load":[83],"balancing":[84],"mechanisms":[85],"leveraging":[86],"the":[87],"Raft":[88],"consensus":[89],"algorithm,":[90],"significantly":[91],"enhancing":[92],"robustness":[93],"scalability.":[95],"Experimental":[96],"results":[97,131],"conducted":[98],"on":[99,105],"a":[100,136],"16-node":[101],"cluster":[102],"using":[103],"ResNet-34":[104],"CIFAR-10":[106],"demonstrate":[107],"that":[108],"AD-KFAC":[109],"consistently":[110],"outperforms":[111],"both":[112],"baselines":[115],"existing":[117],"particularly":[122],"under":[123],"scenarios":[124],"involving":[125],"latency":[126],"failures.":[129],"These":[130],"highlight":[132],"AD-KFAC\u2019s":[133],"potential":[134],"robust":[137],"scalable":[139],"solution":[140],"for":[141],"deep":[143],"learning":[144],"tasks":[145],"realistic,":[147],"network":[149],"conditions.":[150]},"counts_by_year":[],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-14T00:00:00"}
