{"id":"https://openalex.org/W4416004305","doi":"https://doi.org/10.1145/3731599.3767508","title":"Redesigning GROMACS Halo Exchange: Improving Strong Scaling with GPU-initiated NVSHMEM","display_name":"Redesigning GROMACS Halo Exchange: Improving Strong Scaling with GPU-initiated NVSHMEM","publication_year":2025,"publication_date":"2025-11-07","ids":{"openalex":"https://openalex.org/W4416004305","doi":"https://doi.org/10.1145/3731599.3767508"},"language":"en","primary_location":{"id":"doi:10.1145/3731599.3767508","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3731599.3767508","pdf_url":null,"source":null,"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the SC '25 Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},"type":"article","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3731599.3767508","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5018490573","display_name":"Mahesh Doijade","orcid":"https://orcid.org/0009-0003-5953-0436"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Mahesh Doijade","raw_affiliation_strings":["NVIDIA, Santa Clara, USA"],"raw_orcid":"https://orcid.org/0009-0003-5953-0436","affiliations":[{"raw_affiliation_string":"NVIDIA, Santa Clara, USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029901108","display_name":"Andrey Alekseenko","orcid":"https://orcid.org/0000-0003-4906-7241"},"institutions":[{"id":"https://openalex.org/I86987016","display_name":"KTH Royal Institute of Technology","ror":"https://ror.org/026vcq606","country_code":"SE","type":"education","lineage":["https://openalex.org/I86987016"]}],"countries":["SE"],"is_corresponding":false,"raw_author_name":"Andrey Alekseenko","raw_affiliation_strings":["KTH Royal Institute of Technology, Stockholm, Sweden"],"raw_orcid":"https://orcid.org/0000-0003-4906-7241","affiliations":[{"raw_affiliation_string":"KTH Royal Institute of Technology, Stockholm, Sweden","institution_ids":["https://openalex.org/I86987016"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069175208","display_name":"A. P. G. Brown","orcid":"https://orcid.org/0009-0000-7116-7535"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Ania Brown","raw_affiliation_strings":["NVIDIA, Santa Clara, USA"],"raw_orcid":"https://orcid.org/0009-0000-7116-7535","affiliations":[{"raw_affiliation_string":"NVIDIA, Santa Clara, USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109854669","display_name":"Alan Gray","orcid":"https://orcid.org/0009-0009-7731-1855"},"institutions":[{"id":"https://openalex.org/I4210127875","display_name":"Nvidia (United States)","ror":"https://ror.org/03jdj4y14","country_code":"US","type":"company","lineage":["https://openalex.org/I4210127875"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Alan Gray","raw_affiliation_strings":["NVIDIA, Santa Clara, USA"],"raw_orcid":"https://orcid.org/0009-0009-7731-1855","affiliations":[{"raw_affiliation_string":"NVIDIA, Santa Clara, USA","institution_ids":["https://openalex.org/I4210127875"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5009705616","display_name":"Szil\u00e1rd P\u00e1ll","orcid":"https://orcid.org/0000-0003-0603-5514"},"institutions":[{"id":"https://openalex.org/I86987016","display_name":"KTH Royal Institute of Technology","ror":"https://ror.org/026vcq606","country_code":"SE","type":"education","lineage":["https://openalex.org/I86987016"]}],"countries":["SE"],"is_corresponding":false,"raw_author_name":"Szil\u00e1rd P\u00e1ll","raw_affiliation_strings":["KTH Royal Institute of Technology, Stockholm, Sweden"],"raw_orcid":"https://orcid.org/0000-0003-0603-5514","affiliations":[{"raw_affiliation_string":"KTH Royal Institute of Technology, Stockholm, Sweden","institution_ids":["https://openalex.org/I86987016"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.32282664,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1314","last_page":"1329"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.8260999917984009,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.8260999917984009,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.0284000001847744,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.02759999968111515,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.7886000275611877},{"id":"https://openalex.org/keywords/asynchronous-communication","display_name":"Asynchronous communication","score":0.704800009727478},{"id":"https://openalex.org/keywords/scaling","display_name":"Scaling","score":0.6344000101089478},{"id":"https://openalex.org/keywords/kernel","display_name":"Kernel (algebra)","score":0.49950000643730164},{"id":"https://openalex.org/keywords/latency","display_name":"Latency (audio)","score":0.4316999912261963},{"id":"https://openalex.org/keywords/remote-direct-memory-access","display_name":"Remote direct memory access","score":0.37869998812675476},{"id":"https://openalex.org/keywords/domain","display_name":"Domain (mathematical analysis)","score":0.37389999628067017},{"id":"https://openalex.org/keywords/pci-express","display_name":"PCI Express","score":0.37139999866485596},{"id":"https://openalex.org/keywords/domain-decomposition-methods","display_name":"Domain decomposition methods","score":0.36970001459121704},{"id":"https://openalex.org/keywords/solver","display_name":"Solver","score":0.36239999532699585}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8009999990463257},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.7886000275611877},{"id":"https://openalex.org/C151319957","wikidata":"https://www.wikidata.org/wiki/Q752739","display_name":"Asynchronous communication","level":2,"score":0.704800009727478},{"id":"https://openalex.org/C99844830","wikidata":"https://www.wikidata.org/wiki/Q102441924","display_name":"Scaling","level":2,"score":0.6344000101089478},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.5587000250816345},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.5537999868392944},{"id":"https://openalex.org/C74193536","wikidata":"https://www.wikidata.org/wiki/Q574844","display_name":"Kernel (algebra)","level":2,"score":0.49950000643730164},{"id":"https://openalex.org/C82876162","wikidata":"https://www.wikidata.org/wiki/Q17096504","display_name":"Latency (audio)","level":2,"score":0.4316999912261963},{"id":"https://openalex.org/C130795937","wikidata":"https://www.wikidata.org/wiki/Q2561570","display_name":"Remote direct memory access","level":2,"score":0.37869998812675476},{"id":"https://openalex.org/C36503486","wikidata":"https://www.wikidata.org/wiki/Q11235244","display_name":"Domain (mathematical analysis)","level":2,"score":0.37389999628067017},{"id":"https://openalex.org/C64270927","wikidata":"https://www.wikidata.org/wiki/Q206924","display_name":"PCI Express","level":3,"score":0.37139999866485596},{"id":"https://openalex.org/C198880260","wikidata":"https://www.wikidata.org/wiki/Q5289813","display_name":"Domain decomposition methods","level":3,"score":0.36970001459121704},{"id":"https://openalex.org/C2778770139","wikidata":"https://www.wikidata.org/wiki/Q1966904","display_name":"Solver","level":2,"score":0.36239999532699585},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.3488999903202057},{"id":"https://openalex.org/C157742956","wikidata":"https://www.wikidata.org/wiki/Q3237776","display_name":"Frequency scaling","level":3,"score":0.33559998869895935},{"id":"https://openalex.org/C141353440","wikidata":"https://www.wikidata.org/wiki/Q182221","display_name":"Fuse (electrical)","level":2,"score":0.33500000834465027},{"id":"https://openalex.org/C204323151","wikidata":"https://www.wikidata.org/wiki/Q905424","display_name":"Range (aeronautics)","level":2,"score":0.33489999175071716},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.32519999146461487},{"id":"https://openalex.org/C50805821","wikidata":"https://www.wikidata.org/wiki/Q1136670","display_name":"Titan (rocket family)","level":2,"score":0.31369999051094055},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.3091000020503998},{"id":"https://openalex.org/C59593255","wikidata":"https://www.wikidata.org/wiki/Q901663","display_name":"Molecular dynamics","level":2,"score":0.3089999854564667},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.2964000105857849},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.29190000891685486},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.29089999198913574},{"id":"https://openalex.org/C62611344","wikidata":"https://www.wikidata.org/wiki/Q1062658","display_name":"Node (physics)","level":2,"score":0.2816999852657318},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.2775999903678894},{"id":"https://openalex.org/C2781335571","wikidata":"https://www.wikidata.org/wiki/Q2633544","display_name":"GPU cluster","level":3,"score":0.2720000147819519},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.2689000070095062},{"id":"https://openalex.org/C50630238","wikidata":"https://www.wikidata.org/wiki/Q971505","display_name":"General-purpose computing on graphics processing units","level":3,"score":0.26809999346733093},{"id":"https://openalex.org/C2777032711","wikidata":"https://www.wikidata.org/wiki/Q5318993","display_name":"Dynamic mode decomposition","level":2,"score":0.2644999921321869},{"id":"https://openalex.org/C45347329","wikidata":"https://www.wikidata.org/wiki/Q5166604","display_name":"Convolution (computer science)","level":3,"score":0.26420000195503235},{"id":"https://openalex.org/C854659","wikidata":"https://www.wikidata.org/wiki/Q1859284","display_name":"Message passing","level":2,"score":0.258899986743927},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.25850000977516174},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.2531999945640564},{"id":"https://openalex.org/C124681953","wikidata":"https://www.wikidata.org/wiki/Q339062","display_name":"Decomposition","level":2,"score":0.25209999084472656},{"id":"https://openalex.org/C75172450","wikidata":"https://www.wikidata.org/wiki/Q623950","display_name":"Fast Fourier transform","level":2,"score":0.25200000405311584},{"id":"https://openalex.org/C162319229","wikidata":"https://www.wikidata.org/wiki/Q175263","display_name":"Data structure","level":2,"score":0.2517000138759613},{"id":"https://openalex.org/C2777655017","wikidata":"https://www.wikidata.org/wiki/Q1501161","display_name":"Toolbox","level":2,"score":0.25130000710487366},{"id":"https://openalex.org/C2775937380","wikidata":"https://www.wikidata.org/wiki/Q1232589","display_name":"Replica","level":2,"score":0.25099998712539673}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3731599.3767508","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3731599.3767508","pdf_url":null,"source":null,"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the SC '25 Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2509.21527","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2509.21527","pdf_url":"https://arxiv.org/pdf/2509.21527","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.1145/3731599.3767508","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3731599.3767508","pdf_url":null,"source":null,"license":"cc-by-sa","license_id":"https://openalex.org/licenses/cc-by-sa","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the SC '25 Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":24,"referenced_works":["https://openalex.org/W1966078827","https://openalex.org/W1973858832","https://openalex.org/W2064388657","https://openalex.org/W2082586732","https://openalex.org/W2095179363","https://openalex.org/W2123768693","https://openalex.org/W2124018752","https://openalex.org/W2160544821","https://openalex.org/W2294929133","https://openalex.org/W2785822536","https://openalex.org/W3035762603","https://openalex.org/W3137530921","https://openalex.org/W3170377433","https://openalex.org/W3174311009","https://openalex.org/W3202517391","https://openalex.org/W4238826819","https://openalex.org/W4318612650","https://openalex.org/W4381327256","https://openalex.org/W4406157368","https://openalex.org/W4406164050","https://openalex.org/W4407542320","https://openalex.org/W4413458368","https://openalex.org/W6912710317","https://openalex.org/W7080119950"],"related_works":[],"abstract_inverted_index":{"Improving":[0],"time-to-solution":[1],"in":[2,22],"molecular":[3],"dynamics":[4],"simulations":[5],"often":[6],"requires":[7],"strong":[8,112],"scaling":[9,113],"due":[10],"to":[11,98,119,127],"fixed-sized":[12],"problems.":[13],"GROMACS":[14,60,111],"is":[15],"highly":[16],"latency-sensitive,":[17],"with":[18],"peak":[19],"iteration":[20],"rates":[21],"the":[23,59,92,134],"sub-millisecond,":[24],"making":[25],"scalability":[26],"on":[27,37],"heterogeneous":[28],"supercomputers":[29],"challenging.":[30],"MPI\u2019s":[31],"CPU-centric":[32],"nature":[33],"introduces":[34],"additional":[35],"latencies":[36],"GPU-resident":[38,104],"applications\u2019":[39],"critical":[40],"path,":[41],"hindering":[42],"GPU":[43,55,67],"utilization":[44],"and":[45,72,90,101,122,125],"scalability.":[46],"To":[47],"address":[48],"these":[49],"limitations,":[50],"we":[51],"present":[52],"an":[53],"NVSHMEM-based":[54],"kernel-initiated":[56],"redesign":[57],"of":[58,137,145],"domain":[61],"decomposition":[62],"halo-exchange":[63],"algorithm.":[64],"Highly":[65],"tuned":[66],"kernels":[68],"fuse":[69],"data":[70,86],"packing":[71],"communication,":[73],"leveraging":[74],"hardware":[75],"latency-hiding":[76],"for":[77,140],"fine-grained":[78],"overlap.":[79],"We":[80],"employ":[81],"kernel":[82],"fusion":[83],"across":[84,115],"overlapped":[85],"forwarding":[87],"communication":[88,139],"phases":[89],"utilize":[91],"asynchronous":[93],"copy":[94],"engine":[95],"over":[96,130],"NVLink":[97,116],"optimize":[99],"latency":[100],"bandwidth.":[102],"Our":[103],"formulation":[105],"greatly":[106],"increases":[107],"communication-computation":[108],"overlap,":[109],"improving":[110],"performance":[114],"by":[117],"up":[118,126],"1.5x":[120],"(intra-node)":[121],"2x":[123],"(multi-node),":[124],"1.3x":[128],"multi-node":[129],"NVLink+InfiniBand.":[131],"This":[132],"demonstrates":[133],"profound":[135],"benefits":[136],"GPU-initiated":[138],"strong-scaling":[141],"a":[142],"broad":[143],"range":[144],"latency-sensitive":[146],"applications.":[147]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
