{"id":"https://openalex.org/W2979588510","doi":"https://doi.org/10.1177/1094342019882246","title":"Accelerating and tuning small matrix multiplications on Sunway TaihuLight: A case study of spectral element CFD Code Nek5000","display_name":"Accelerating and tuning small matrix multiplications on Sunway TaihuLight: A case study of spectral element CFD Code Nek5000","publication_year":2019,"publication_date":"2019-10-09","ids":{"openalex":"https://openalex.org/W2979588510","doi":"https://doi.org/10.1177/1094342019882246","mag":"2979588510"},"language":"en","primary_location":{"id":"doi:10.1177/1094342019882246","is_oa":false,"landing_page_url":"https://doi.org/10.1177/1094342019882246","pdf_url":null,"source":{"id":"https://openalex.org/S60606485","display_name":"The International Journal of High Performance Computing Applications","issn_l":"1094-3420","issn":["1094-3420","1741-2846"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320017","host_organization_name":"SAGE Publishing","host_organization_lineage":["https://openalex.org/P4310320017"],"host_organization_lineage_names":["SAGE Publishing"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The International Journal of High Performance Computing Applications","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103132550","display_name":"Xianmeng Wang","orcid":null},"institutions":[{"id":"https://openalex.org/I92403157","display_name":"University of Science and Technology Beijing","ror":"https://ror.org/02egmk993","country_code":"CN","type":"education","lineage":["https://openalex.org/I92403157"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xianmeng Wang","raw_affiliation_strings":["School of Computer and Communication Engineering, University of Science and Technology Beijing, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Computer and Communication Engineering, University of Science and Technology Beijing, Beijing, China","institution_ids":["https://openalex.org/I92403157"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110207844","display_name":"Zhifeng Zhou","orcid":"https://orcid.org/0000-0003-3160-589X"},"institutions":[{"id":"https://openalex.org/I92403157","display_name":"University of Science and Technology Beijing","ror":"https://ror.org/02egmk993","country_code":"CN","type":"education","lineage":["https://openalex.org/I92403157"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhifeng Zhou","raw_affiliation_strings":["School of Computer and Communication Engineering, University of Science and Technology Beijing, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Computer and Communication Engineering, University of Science and Technology Beijing, Beijing, China","institution_ids":["https://openalex.org/I92403157"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103142109","display_name":"Changjun Hu","orcid":"https://orcid.org/0000-0003-3857-7262"},"institutions":[{"id":"https://openalex.org/I92403157","display_name":"University of Science and Technology Beijing","ror":"https://ror.org/02egmk993","country_code":"CN","type":"education","lineage":["https://openalex.org/I92403157"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Changjun Hu","raw_affiliation_strings":["School of Computer and Communication Engineering, University of Science and Technology Beijing, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Computer and Communication Engineering, University of Science and Technology Beijing, Beijing, China","institution_ids":["https://openalex.org/I92403157"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055352538","display_name":"Wen Yang","orcid":"https://orcid.org/0000-0003-1222-4987"},"institutions":[{"id":"https://openalex.org/I4210096139","display_name":"China Institute of Atomic Energy","ror":"https://ror.org/00v5gqm66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210096139"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wen Yang","raw_affiliation_strings":["China Institute of Atomic Energy, Beijing, China"],"affiliations":[{"raw_affiliation_string":"China Institute of Atomic Energy, Beijing, China","institution_ids":["https://openalex.org/I4210096139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5054066775","display_name":"Minfu Zhao","orcid":null},"institutions":[{"id":"https://openalex.org/I4210096139","display_name":"China Institute of Atomic Energy","ror":"https://ror.org/00v5gqm66","country_code":"CN","type":"facility","lineage":["https://openalex.org/I4210096139"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Minfu Zhao","raw_affiliation_strings":["China Institute of Atomic Energy, Beijing, China"],"affiliations":[{"raw_affiliation_string":"China Institute of Atomic Energy, Beijing, China","institution_ids":["https://openalex.org/I4210096139"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028877799","display_name":"Zhaoshun Wang","orcid":"https://orcid.org/0000-0002-2736-9970"},"institutions":[{"id":"https://openalex.org/I92403157","display_name":"University of Science and Technology Beijing","ror":"https://ror.org/02egmk993","country_code":"CN","type":"education","lineage":["https://openalex.org/I92403157"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhaoshun Wang","raw_affiliation_strings":["School of Computer and Communication Engineering, University of Science and Technology Beijing, Beijing, China"],"affiliations":[{"raw_affiliation_string":"School of Computer and Communication Engineering, University of Science and Technology Beijing, Beijing, China","institution_ids":["https://openalex.org/I92403157"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101483396","display_name":"Peng Shi","orcid":"https://orcid.org/0000-0002-5349-6383"},"institutions":[{"id":"https://openalex.org/I92403157","display_name":"University of Science and Technology Beijing","ror":"https://ror.org/02egmk993","country_code":"CN","type":"education","lineage":["https://openalex.org/I92403157"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Peng Shi","raw_affiliation_strings":["National Center for Materials Service Safety, University of Science and Technology Beijing, Beijing, China"],"affiliations":[{"raw_affiliation_string":"National Center for Materials Service Safety, University of Science and Technology Beijing, Beijing, China","institution_ids":["https://openalex.org/I92403157"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":7,"corresponding_author_ids":["https://openalex.org/A5103142109"],"corresponding_institution_ids":["https://openalex.org/I92403157"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.12705453,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":94,"max":96},"biblio":{"volume":"34","issue":"2","first_page":"178","last_page":"186"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9991999864578247,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9986000061035156,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/supercomputer","display_name":"Supercomputer","score":0.9305132627487183},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6793652772903442},{"id":"https://openalex.org/keywords/computational-science","display_name":"Computational science","score":0.66991126537323},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.6511382460594177},{"id":"https://openalex.org/keywords/vectorization","display_name":"Vectorization (mathematics)","score":0.5931612253189087},{"id":"https://openalex.org/keywords/porting","display_name":"Porting","score":0.5625597834587097},{"id":"https://openalex.org/keywords/computational-fluid-dynamics","display_name":"Computational fluid dynamics","score":0.5546303987503052},{"id":"https://openalex.org/keywords/matrix","display_name":"Matrix (chemical analysis)","score":0.4448885917663574},{"id":"https://openalex.org/keywords/computation","display_name":"Computation","score":0.42212235927581787},{"id":"https://openalex.org/keywords/software","display_name":"Software","score":0.22517144680023193},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.16087067127227783},{"id":"https://openalex.org/keywords/operating-system","display_name":"Operating system","score":0.15656423568725586},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.11099240183830261},{"id":"https://openalex.org/keywords/materials-science","display_name":"Materials science","score":0.1048017144203186}],"concepts":[{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.9305132627487183},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6793652772903442},{"id":"https://openalex.org/C459310","wikidata":"https://www.wikidata.org/wiki/Q117801","display_name":"Computational science","level":1,"score":0.66991126537323},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.6511382460594177},{"id":"https://openalex.org/C41681595","wikidata":"https://www.wikidata.org/wiki/Q7917855","display_name":"Vectorization (mathematics)","level":2,"score":0.5931612253189087},{"id":"https://openalex.org/C106251023","wikidata":"https://www.wikidata.org/wiki/Q851989","display_name":"Porting","level":3,"score":0.5625597834587097},{"id":"https://openalex.org/C1633027","wikidata":"https://www.wikidata.org/wiki/Q815820","display_name":"Computational fluid dynamics","level":2,"score":0.5546303987503052},{"id":"https://openalex.org/C106487976","wikidata":"https://www.wikidata.org/wiki/Q685816","display_name":"Matrix (chemical analysis)","level":2,"score":0.4448885917663574},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.42212235927581787},{"id":"https://openalex.org/C2777904410","wikidata":"https://www.wikidata.org/wiki/Q7397","display_name":"Software","level":2,"score":0.22517144680023193},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.16087067127227783},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.15656423568725586},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.11099240183830261},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.1048017144203186},{"id":"https://openalex.org/C146978453","wikidata":"https://www.wikidata.org/wiki/Q3798668","display_name":"Aerospace engineering","level":1,"score":0.0},{"id":"https://openalex.org/C159985019","wikidata":"https://www.wikidata.org/wiki/Q181790","display_name":"Composite material","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1177/1094342019882246","is_oa":false,"landing_page_url":"https://doi.org/10.1177/1094342019882246","pdf_url":null,"source":{"id":"https://openalex.org/S60606485","display_name":"The International Journal of High Performance Computing Applications","issn_l":"1094-3420","issn":["1094-3420","1741-2846"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320017","host_organization_name":"SAGE Publishing","host_organization_lineage":["https://openalex.org/P4310320017"],"host_organization_lineage_names":["SAGE Publishing"],"type":"journal"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"The International Journal of High Performance Computing Applications","raw_type":"journal-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G5985737152","display_name":null,"funder_award_id":"2017YFB0202303","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"}],"funders":[{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":10,"referenced_works":["https://openalex.org/W2030859425","https://openalex.org/W2109241917","https://openalex.org/W2173732377","https://openalex.org/W2475126267","https://openalex.org/W2502318050","https://openalex.org/W2621991888","https://openalex.org/W2753495321","https://openalex.org/W2775348209","https://openalex.org/W2793489890","https://openalex.org/W2884546353"],"related_works":["https://openalex.org/W2356602486","https://openalex.org/W2351992668","https://openalex.org/W2324828474","https://openalex.org/W2374315191","https://openalex.org/W2391207559","https://openalex.org/W2384715785","https://openalex.org/W32529763","https://openalex.org/W2106561276","https://openalex.org/W1582436825","https://openalex.org/W1996803181"],"abstract_inverted_index":{"The":[0,21,117,166],"matrix\u2013matrix":[1],"products":[2],"for":[3,114],"matrices":[4],"of":[5,18,41,121,140],"small":[6,45,56,127],"size":[7],"have":[8,134],"continued":[9],"to":[10,27,38,52,94,133,137,182],"play":[11],"an":[12,100],"important":[13],"part":[14],"in":[15,31,74],"a":[16,29,50,91],"range":[17],"scientific":[19],"applications.":[20],"heterogeneous":[22],"architecture,":[23],"which":[24,63,122],"is":[25,80,99,126,131],"predicted":[26],"be":[28],"trend":[30],"the":[32,39,68,75,109,123,135,175],"exascale":[33],"supercomputing":[34],"era,":[35],"gives":[36],"rises":[37],"challenges":[40],"porting":[42],"and":[43,54],"optimizing":[44,145],"matrix":[46,57,129],"products.":[47],"We":[48,87,158,179],"present":[49],"method":[51,112],"accelerating":[53],"tune":[55],"multiplications":[58],"on":[59,108,154],"Sunway":[60,78],"TaihuLight":[61,79],"supercomputer,":[62],"has":[64],"been":[65],"titled":[66],"as":[67,90],"most":[69],"powerful":[70],"supercomputer":[71],"four":[72],"times":[73],"Top5000":[76],"list.":[77],"equipped":[81],"with":[82,186],"Shen-Wei":[83],"hybrid":[84],"manycore":[85],"processors.":[86],"use":[88],"Nek5000":[89,98,160,181],"case":[92],"study":[93],"demonstrate":[95],"our":[96],"methods.":[97],"open-source":[101],"computational":[102],"fluid":[103],"dynamics":[104],"(CFD)":[105],"solver":[106],"based":[107],"spectral":[110],"element":[111],"(SEM)":[113],"incompressible":[115],"flow.":[116],"high-order":[118],"SEM":[119],"method,":[120],"computation":[124],"kernel":[125],"dense":[128],"products,":[130],"regarded":[132],"potential":[136],"overcome":[138],"constraints":[139],"standard":[141],"CFD":[142],"software.":[143],"By":[144],"using":[146,161],"vectorization,":[147],"we":[148],"gained":[149],"about":[150,190],"30%":[151,191],"performance":[152,177,192],"improvement":[153],"management":[155],"processing":[156,163],"element.":[157],"accelerated":[159],"computing":[162],"elements":[164],"(CPEs).":[165],"experiments":[167],"results":[168],"suggest":[169],"that":[170],"employing":[171],"32":[172],"CPEs":[173],"delivers":[174],"best":[176],"enhancements.":[178],"scaled":[180],"16,384":[183],"core":[184],"groups":[185],"540,672":[187],"cores,":[188],"reaching":[189],"improvements.":[193]},"counts_by_year":[{"year":2025,"cited_by_count":2},{"year":2023,"cited_by_count":2}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}
