{"id":"https://openalex.org/W4408564308","doi":"https://doi.org/10.1109/tpds.2025.3553066","title":"OmniLearn: A Framework for Distributed Deep Learning Over Heterogeneous Clusters","display_name":"OmniLearn: A Framework for Distributed Deep Learning Over Heterogeneous Clusters","publication_year":2025,"publication_date":"2025-03-18","ids":{"openalex":"https://openalex.org/W4408564308","doi":"https://doi.org/10.1109/tpds.2025.3553066"},"language":"en","primary_location":{"id":"doi:10.1109/tpds.2025.3553066","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2025.3553066","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5038735030","display_name":"Sahil Tyagi","orcid":"https://orcid.org/0009-0007-8314-4745"},"institutions":[{"id":"https://openalex.org/I4210119109","display_name":"Indiana University Bloomington","ror":"https://ror.org/02k40bc56","country_code":"US","type":"education","lineage":["https://openalex.org/I4210119109","https://openalex.org/I592451"]}],"countries":["US"],"is_corresponding":true,"raw_author_name":"Sahil Tyagi","raw_affiliation_strings":["Department of Intelligent Systems Engineering School: Luddy School of Informatics, Computing and Engineering University: Indiana University Bloomington, Indiana, USA"],"affiliations":[{"raw_affiliation_string":"Department of Intelligent Systems Engineering School: Luddy School of Informatics, Computing and Engineering University: Indiana University Bloomington, Indiana, USA","institution_ids":["https://openalex.org/I4210119109"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100718226","display_name":"Prateek Sharma","orcid":"https://orcid.org/0000-0003-1789-0145"},"institutions":[{"id":"https://openalex.org/I4210119109","display_name":"Indiana University Bloomington","ror":"https://ror.org/02k40bc56","country_code":"US","type":"education","lineage":["https://openalex.org/I4210119109","https://openalex.org/I592451"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Prateek Sharma","raw_affiliation_strings":["Department of Intelligent Systems Engineering School: Luddy School of Informatics, Computing and Engineering University: Indiana University Bloomington, Indiana, USA"],"affiliations":[{"raw_affiliation_string":"Department of Intelligent Systems Engineering School: Luddy School of Informatics, Computing and Engineering University: Indiana University Bloomington, Indiana, USA","institution_ids":["https://openalex.org/I4210119109"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5038735030"],"corresponding_institution_ids":["https://openalex.org/I4210119109"],"apc_list":null,"apc_paid":null,"fwci":4.7137,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.93898326,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":98},"biblio":{"volume":"36","issue":"6","first_page":"1253","last_page":"1267"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10637","display_name":"Advanced Clustering Algorithms Research","score":0.7479000091552734,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10637","display_name":"Advanced Clustering Algorithms Research","score":0.7479000091552734,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10203","display_name":"Recommender Systems and Techniques","score":0.6832000017166138,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8108043074607849},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.5466211438179016},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.340329110622406}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8108043074607849},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.5466211438179016},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.340329110622406}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/tpds.2025.3553066","is_oa":false,"landing_page_url":"https://doi.org/10.1109/tpds.2025.3553066","pdf_url":null,"source":{"id":"https://openalex.org/S97130795","display_name":"IEEE Transactions on Parallel and Distributed Systems","issn_l":"1045-9219","issn":["1045-9219","1558-2183","2161-9883"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319808","host_organization_name":"Institute of Electrical and Electronics Engineers","host_organization_lineage":["https://openalex.org/P4310319808"],"host_organization_lineage_names":["Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"IEEE Transactions on Parallel and Distributed Systems","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G8767938004","display_name":null,"funder_award_id":"OAC-2112606","funder_id":"https://openalex.org/F4320306076","funder_display_name":"National Science Foundation"}],"funders":[{"id":"https://openalex.org/F4320306076","display_name":"National Science Foundation","ror":"https://ror.org/021nxhr62"}],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":44,"referenced_works":["https://openalex.org/W12634471","https://openalex.org/W114517082","https://openalex.org/W2083842231","https://openalex.org/W2131613942","https://openalex.org/W2140246545","https://openalex.org/W2155904486","https://openalex.org/W2194775991","https://openalex.org/W2606207877","https://openalex.org/W2612026221","https://openalex.org/W2618530766","https://openalex.org/W2732026016","https://openalex.org/W2911382970","https://openalex.org/W3011751313","https://openalex.org/W3086105743","https://openalex.org/W3091156884","https://openalex.org/W3109982791","https://openalex.org/W3129831491","https://openalex.org/W3131944805","https://openalex.org/W3177263144","https://openalex.org/W4318185476","https://openalex.org/W4383749489","https://openalex.org/W4387006253","https://openalex.org/W4388855628","https://openalex.org/W4391092828","https://openalex.org/W6635810480","https://openalex.org/W6637373629","https://openalex.org/W6638803421","https://openalex.org/W6679393576","https://openalex.org/W6680402377","https://openalex.org/W6703420464","https://openalex.org/W6726983090","https://openalex.org/W6728757088","https://openalex.org/W6730398498","https://openalex.org/W6738534199","https://openalex.org/W6739622702","https://openalex.org/W6739693220","https://openalex.org/W6745410505","https://openalex.org/W6745458143","https://openalex.org/W6755543977","https://openalex.org/W6757053730","https://openalex.org/W6757392376","https://openalex.org/W6766745794","https://openalex.org/W6784425352","https://openalex.org/W6791545107"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W4391913857","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052"],"abstract_inverted_index":{"Deep":[0],"learning":[1],"systems":[2],"are":[3],"optimized":[4],"for":[5],"clusters":[6],"with":[7],"homogeneous":[8],"resources.":[9],"However,":[10],"heterogeneity":[11,60],"is":[12,66],"prevalent":[13],"in":[14,61],"computing":[15],"infrastructure":[16],"across":[17,74],"edge,":[18],"cloud":[19],"and":[20,39,77],"HPC.":[21],"When":[22],"training":[23,94],"neural":[24],"networks":[25],"using":[26],"stochastic":[27],"gradient":[28],"descent":[29],"techniques":[30,105],"on":[31],"heterogeneous":[32,75],"resources,":[33],"performance":[34],"degrades":[35],"due":[36],"to":[37,55,71,110],"stragglers":[38],"stale":[40],"updates.":[41],"In":[42],"this":[43],"work,":[44],"we":[45],"develop":[46],"an":[47],"adaptive":[48],"batch-scaling":[49],"framework":[50],"called":[51],"<monospace":[52,90],"xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"":[53,91],"xmlns:xlink=\"http://www.w3.org/1999/xlink\">OmniLearn</monospace>":[54,92],"mitigate":[56],"the":[57],"effects":[58],"of":[59],"distributed":[62],"training.":[63],"Our":[64],"approach":[65],"inspired":[67],"by":[68,96,108],"proportional":[69],"controllers":[70],"balance":[72],"computation":[73],"servers,":[76],"works":[78],"under":[79],"varying":[80],"resource":[81],"availability.":[82],"By":[83],"dynamically":[84],"adjusting":[85],"worker":[86],"mini-batches":[87],"at":[88],"runtime,":[89],"reduces":[93],"time":[95],"14-85%.":[97],"We":[98],"also":[99],"investigate":[100],"asynchronous":[101],"training,":[102],"where":[103],"our":[104],"improve":[106],"accuracy":[107],"up":[109],"6.9%.":[111]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1}],"updated_date":"2026-04-09T08:11:56.329763","created_date":"2025-10-10T00:00:00"}
