{"id":"https://openalex.org/W4408871788","doi":"https://doi.org/10.1145/3689031.3717466","title":"Comprehensive Deadlock Prevention for GPU Collective Communication","display_name":"Comprehensive Deadlock Prevention for GPU Collective Communication","publication_year":2025,"publication_date":"2025-03-26","ids":{"openalex":"https://openalex.org/W4408871788","doi":"https://doi.org/10.1145/3689031.3717466"},"language":"en","primary_location":{"id":"doi:10.1145/3689031.3717466","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3689031.3717466","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Twentieth European Conference on Computer Systems","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5091272448","display_name":"Lichen Pan","orcid":"https://orcid.org/0000-0001-7451-0140"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Lichen Pan","raw_affiliation_strings":["School of Computer Science, Peking University"],"raw_orcid":"https://orcid.org/0000-0001-7451-0140","affiliations":[{"raw_affiliation_string":"School of Computer Science, Peking University","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113067040","display_name":"Juncheng Liu","orcid":"https://orcid.org/0000-0002-8213-7718"},"institutions":[{"id":"https://openalex.org/I4210086639","display_name":"InflowControl (Norway)","ror":"https://ror.org/003ms1470","country_code":"NO","type":"company","lineage":["https://openalex.org/I4210086639"]},{"id":"https://openalex.org/I4210119527","display_name":"Qi2","ror":"https://ror.org/021wvan87","country_code":"US","type":"company","lineage":["https://openalex.org/I4210119527"]}],"countries":["NO","US"],"is_corresponding":false,"raw_author_name":"Juncheng Liu","raw_affiliation_strings":["OneFlow Research"],"raw_orcid":"https://orcid.org/0000-0002-8213-7718","affiliations":[{"raw_affiliation_string":"OneFlow Research","institution_ids":["https://openalex.org/I4210086639","https://openalex.org/I4210119527"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003730869","display_name":"Yongquan Fu","orcid":"https://orcid.org/0000-0002-7564-5239"},"institutions":[{"id":"https://openalex.org/I170215575","display_name":"National University of Defense Technology","ror":"https://ror.org/05d2yfz11","country_code":"CN","type":"education","lineage":["https://openalex.org/I170215575"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yongquan Fu","raw_affiliation_strings":["National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology"],"raw_orcid":"https://orcid.org/0000-0002-7564-5239","affiliations":[{"raw_affiliation_string":"National Key Laboratory of Parallel and Distributed Computing, College of Computer Science and Technology, National University of Defense Technology","institution_ids":["https://openalex.org/I170215575"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101489641","display_name":"Yuan Jinhui","orcid":"https://orcid.org/0000-0002-0700-2645"},"institutions":[{"id":"https://openalex.org/I4210086639","display_name":"InflowControl (Norway)","ror":"https://ror.org/003ms1470","country_code":"NO","type":"company","lineage":["https://openalex.org/I4210086639"]},{"id":"https://openalex.org/I4210119527","display_name":"Qi2","ror":"https://ror.org/021wvan87","country_code":"US","type":"company","lineage":["https://openalex.org/I4210119527"]}],"countries":["NO","US"],"is_corresponding":false,"raw_author_name":"Jinhui Yuan","raw_affiliation_strings":["OneFlow Research"],"raw_orcid":"https://orcid.org/0000-0002-0700-2645","affiliations":[{"raw_affiliation_string":"OneFlow Research","institution_ids":["https://openalex.org/I4210086639","https://openalex.org/I4210119527"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061105350","display_name":"Rongkai Zhang","orcid":"https://orcid.org/0009-0003-1652-7202"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Rongkai Zhang","raw_affiliation_strings":["School of Computer Science, Peking University"],"raw_orcid":"https://orcid.org/0009-0003-1652-7202","affiliations":[{"raw_affiliation_string":"School of Computer Science, Peking University","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5001255720","display_name":"Pengze Li","orcid":"https://orcid.org/0000-0001-7015-0491"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Pengze Li","raw_affiliation_strings":["School of Computer Science, Peking University"],"raw_orcid":"https://orcid.org/0000-0001-7015-0491","affiliations":[{"raw_affiliation_string":"School of Computer Science, Peking University","institution_ids":["https://openalex.org/I20231570"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5102979232","display_name":"Zhen Xiao","orcid":"https://orcid.org/0000-0002-6784-9709"},"institutions":[{"id":"https://openalex.org/I20231570","display_name":"Peking University","ror":"https://ror.org/02v51f717","country_code":"CN","type":"education","lineage":["https://openalex.org/I20231570"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhen Xiao","raw_affiliation_strings":["School of Computer Science, Peking University"],"raw_orcid":"https://orcid.org/0000-0002-6784-9709","affiliations":[{"raw_affiliation_string":"School of Computer Science, Peking University","institution_ids":["https://openalex.org/I20231570"]}]}],"institutions":[],"countries_distinct_count":3,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5091272448"],"corresponding_institution_ids":["https://openalex.org/I20231570"],"apc_list":null,"apc_paid":null,"fwci":0.6531,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.67924684,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"541","last_page":"557"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.9969000220298767,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10502","display_name":"Advanced Memory and Neural Computing","score":0.9969000220298767,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9965000152587891,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10829","display_name":"Interconnection Networks and Systems","score":0.9936000108718872,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8039090633392334},{"id":"https://openalex.org/keywords/deadlock","display_name":"Deadlock","score":0.6975573897361755},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.5907732248306274},{"id":"https://openalex.org/keywords/distributed-computing","display_name":"Distributed computing","score":0.35276228189468384}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8039090633392334},{"id":"https://openalex.org/C159023740","wikidata":"https://www.wikidata.org/wiki/Q623276","display_name":"Deadlock","level":2,"score":0.6975573897361755},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5907732248306274},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.35276228189468384}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3689031.3717466","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3689031.3717466","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Twentieth European Conference on Computer Systems","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":36,"referenced_works":["https://openalex.org/W95608104","https://openalex.org/W1506474685","https://openalex.org/W1589576271","https://openalex.org/W2029804882","https://openalex.org/W2088613263","https://openalex.org/W2125551452","https://openalex.org/W2194775991","https://openalex.org/W2581065617","https://openalex.org/W2604787577","https://openalex.org/W2734941459","https://openalex.org/W2747329762","https://openalex.org/W2887629402","https://openalex.org/W2894576937","https://openalex.org/W2944793600","https://openalex.org/W2969388332","https://openalex.org/W2975712713","https://openalex.org/W2981114289","https://openalex.org/W3019425739","https://openalex.org/W3038098022","https://openalex.org/W3047537431","https://openalex.org/W3072623287","https://openalex.org/W3081168214","https://openalex.org/W3086105743","https://openalex.org/W3124352525","https://openalex.org/W3129488589","https://openalex.org/W3193985311","https://openalex.org/W3204998121","https://openalex.org/W4224308101","https://openalex.org/W4235366964","https://openalex.org/W4239965559","https://openalex.org/W4281790033","https://openalex.org/W4288093768","https://openalex.org/W4288357791","https://openalex.org/W4318541593","https://openalex.org/W6769475105","https://openalex.org/W6810081322"],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W397120980","https://openalex.org/W4391913857","https://openalex.org/W3175828148","https://openalex.org/W125668343","https://openalex.org/W2363501516","https://openalex.org/W1506291714"],"abstract_inverted_index":{"Distributed":[0],"deep":[1,23,49],"neural":[2],"network":[3],"training":[4],"necessitates":[5],"efficient":[6],"GPU":[7,16,34],"collective":[8,17,35,82],"communications,":[9],"which":[10],"are":[11,56],"inherently":[12],"susceptible":[13],"to":[14,41,79],"deadlocks.":[15,85],"deadlocks":[18,36],"arise":[19],"easily":[20],"in":[21,60,70],"distributed":[22,48],"learning":[24],"applications":[25],"when":[26],"multiple":[27],"collectives":[28,69],"circularly":[29],"wait":[30],"for":[31],"each":[32],"other.":[33],"pose":[37],"a":[38,71],"significant":[39],"challenge":[40],"the":[42],"correct":[43],"functioning":[44],"and":[45,51,84],"efficiency":[46],"of":[47],"learning,":[50],"no":[52],"general":[53],"effective":[54],"solutions":[55],"currently":[57],"available.":[58],"Only":[59],"specific":[61],"scenarios,":[62],"ad-hoc":[63],"methods,":[64],"making":[65],"an":[66],"application":[67],"invoke":[68],"consistent":[72],"order":[73],"across":[74],"GPUs,":[75],"can":[76],"be":[77],"used":[78],"prevent":[80],"circular":[81],"dependency":[83]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-12-21T01:58:51.020947","created_date":"2025-10-10T00:00:00"}
