{"id":"https://openalex.org/W4388581291","doi":"https://doi.org/10.1145/3624062.3624249","title":"Advancing the distributed Multi-GPU ChASE library through algorithm optimization and NCCL library","display_name":"Advancing the distributed Multi-GPU ChASE library through algorithm optimization and NCCL library","publication_year":2023,"publication_date":"2023-11-10","ids":{"openalex":"https://openalex.org/W4388581291","doi":"https://doi.org/10.1145/3624062.3624249"},"language":"en","primary_location":{"id":"doi:10.1145/3624062.3624249","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3624062.3624249","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the SC '23 Workshops of the International Conference on High Performance Computing, Network, Storage, and Analysis","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5070784128","display_name":"Xinzhe Wu","orcid":"https://orcid.org/0000-0001-5716-3116"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Xinzhe Wu","raw_affiliation_strings":["J\u00fclich Supercomputing Centre, Germany"],"raw_orcid":"https://orcid.org/0000-0001-5716-3116","affiliations":[{"raw_affiliation_string":"J\u00fclich Supercomputing Centre, Germany","institution_ids":[]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5055932268","display_name":"Edoardo Di Napoli","orcid":"https://orcid.org/0000-0001-5821-5897"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Edoardo Di Napoli","raw_affiliation_strings":["J\u00fclich Supercomputing Centre, Germany"],"raw_orcid":"https://orcid.org/0000-0001-5821-5897","affiliations":[{"raw_affiliation_string":"J\u00fclich Supercomputing Centre, Germany","institution_ids":[]}]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5070784128"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.4962,"has_fulltext":false,"cited_by_count":5,"citation_normalized_percentile":{"value":0.82185309,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":97},"biblio":{"volume":null,"issue":null,"first_page":"1688","last_page":"1696"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9972000122070312,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10792","display_name":"Matrix Theory and Algorithms","score":0.9966999888420105,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11809","display_name":"Advanced NMR Techniques and Applications","score":0.9952999949455261,"subfield":{"id":"https://openalex.org/subfields/1607","display_name":"Spectroscopy"},"field":{"id":"https://openalex.org/fields/16","display_name":"Chemistry"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8115110397338867},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.7019854784011841},{"id":"https://openalex.org/keywords/subspace-topology","display_name":"Subspace topology","score":0.639472246170044},{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.5476776361465454},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.4934808015823364},{"id":"https://openalex.org/keywords/chebyshev-filter","display_name":"Chebyshev filter","score":0.4861023426055908},{"id":"https://openalex.org/keywords/synchronization","display_name":"Synchronization (alternating current)","score":0.46882718801498413},{"id":"https://openalex.org/keywords/chebyshev-polynomials","display_name":"Chebyshev polynomials","score":0.4524180293083191},{"id":"https://openalex.org/keywords/graphics-processing-unit","display_name":"Graphics processing unit","score":0.4293590486049652},{"id":"https://openalex.org/keywords/gpu-cluster","display_name":"GPU cluster","score":0.42697036266326904},{"id":"https://openalex.org/keywords/scheme","display_name":"Scheme (mathematics)","score":0.41581639647483826},{"id":"https://openalex.org/keywords/computational-science","display_name":"Computational science","score":0.39543235301971436},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.3168995976448059},{"id":"https://openalex.org/keywords/cuda","display_name":"CUDA","score":0.24784037470817566},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.12990623712539673},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.09431338310241699},{"id":"https://openalex.org/keywords/telecommunications","display_name":"Telecommunications","score":0.0719490647315979}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8115110397338867},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7019854784011841},{"id":"https://openalex.org/C32834561","wikidata":"https://www.wikidata.org/wiki/Q660730","display_name":"Subspace topology","level":2,"score":0.639472246170044},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.5476776361465454},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.4934808015823364},{"id":"https://openalex.org/C21424316","wikidata":"https://www.wikidata.org/wiki/Q718621","display_name":"Chebyshev filter","level":2,"score":0.4861023426055908},{"id":"https://openalex.org/C2778562939","wikidata":"https://www.wikidata.org/wiki/Q1298791","display_name":"Synchronization (alternating current)","level":3,"score":0.46882718801498413},{"id":"https://openalex.org/C129785596","wikidata":"https://www.wikidata.org/wiki/Q619511","display_name":"Chebyshev polynomials","level":2,"score":0.4524180293083191},{"id":"https://openalex.org/C2779851693","wikidata":"https://www.wikidata.org/wiki/Q183484","display_name":"Graphics processing unit","level":2,"score":0.4293590486049652},{"id":"https://openalex.org/C2781335571","wikidata":"https://www.wikidata.org/wiki/Q2633544","display_name":"GPU cluster","level":3,"score":0.42697036266326904},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.41581639647483826},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.39543235301971436},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3168995976448059},{"id":"https://openalex.org/C2778119891","wikidata":"https://www.wikidata.org/wiki/Q477690","display_name":"CUDA","level":2,"score":0.24784037470817566},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.12990623712539673},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.09431338310241699},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0719490647315979},{"id":"https://openalex.org/C127162648","wikidata":"https://www.wikidata.org/wiki/Q16858953","display_name":"Channel (broadcasting)","level":2,"score":0.0},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3624062.3624249","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3624062.3624249","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the SC '23 Workshops of the International Conference on High Performance Computing, Network, Storage, and Analysis","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":16,"referenced_works":["https://openalex.org/W182691100","https://openalex.org/W1998689980","https://openalex.org/W2004286430","https://openalex.org/W2044540053","https://openalex.org/W2053277271","https://openalex.org/W2068704041","https://openalex.org/W2083154122","https://openalex.org/W2116808829","https://openalex.org/W2148678374","https://openalex.org/W2208990223","https://openalex.org/W2252007067","https://openalex.org/W2963530710","https://openalex.org/W3007543653","https://openalex.org/W3007862902","https://openalex.org/W3034725130","https://openalex.org/W3199509385"],"related_works":["https://openalex.org/W2382532955","https://openalex.org/W2106962382","https://openalex.org/W4213342932","https://openalex.org/W2056379897","https://openalex.org/W2074316639","https://openalex.org/W2541927208","https://openalex.org/W1582436825","https://openalex.org/W2056717482","https://openalex.org/W2346971659","https://openalex.org/W1482681439"],"abstract_inverted_index":{"As":[0],"supercomputers":[1],"become":[2],"larger":[3],"with":[4,17,122],"powerful":[5],"Graphics":[6],"Processing":[7],"Unit":[8],"(GPU),":[9],"traditional":[10],"direct":[11],"eigensolvers":[12],"struggle":[13],"to":[14,25,61,93,125,131],"keep":[15],"up":[16,124,130],"the":[18,34,63,107,111,132,144,149],"hardware":[19],"evolution":[20],"and":[21,27,44,48,105,127],"scale":[22],"efficiently":[23],"due":[24],"communication":[26,47],"synchronization":[28,49],"demands.":[29],"Conversely,":[30],"subspace":[31,55],"eigensolvers,":[32],"like":[33],"Chebyshev":[35,59],"Accelerated":[36],"Subspace":[37],"Eigensolver":[38],"(ChASE),":[39],"have":[40,79],"a":[41,53,88,94],"simpler":[42],"structure":[43],"can":[45,118],"overcome":[46],"bottlenecks.":[50],"ChASE":[51,81],"is":[52],"modern":[54],"eigensolver":[56],"that":[57],"uses":[58],"polynomials":[60],"accelerate":[62],"computation":[64],"of":[65,68,101],"extremal":[66],"eigenpairs":[67],"dense":[69,120],"Hermitian":[70],"eigenproblems.":[71],"In":[72],"this":[73],"work":[74],"we":[75,78],"show":[76],"how":[77],"modified":[80],"by":[82,110,138],"rethinking":[83],"its":[84,102],"memory":[85],"layout,":[86],"introducing":[87],"novel":[89],"parallelization":[90],"scheme,":[91],"switching":[92],"more":[95],"performing":[96],"communication-avoiding":[97],"algorithm":[98],"for":[99],"one":[100,136],"inner":[103],"modules,":[104],"substituting":[106],"MPI":[108],"library":[109,117],"vendor-optimized":[112],"NCCL":[113],"library.":[114],"The":[115],"resulting":[116],"tackle":[119],"problems":[121],"size":[123],",":[126],"scales":[128],"effortlessly":[129],"full":[133],"900":[134],"nodes\u2014each":[135],"powered":[137],"4":[139],"\u00d7":[140],"A100":[141],"NVIDIA":[142],"GPUs\u2014of":[143],"JUWELS":[145],"Booster":[146],"hosted":[147],"at":[148],"J\u00fclich":[150],"Supercomputing":[151],"Centre.":[152]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
