{"id":"https://openalex.org/W4381251538","doi":"https://doi.org/10.1145/3605148","title":"MFFT: A GPU Accelerated Highly Efficient Mixed-Precision Large-Scale FFT Framework","display_name":"MFFT: A GPU Accelerated Highly Efficient Mixed-Precision Large-Scale FFT Framework","publication_year":2023,"publication_date":"2023-06-19","ids":{"openalex":"https://openalex.org/W4381251538","doi":"https://doi.org/10.1145/3605148"},"language":"en","primary_location":{"id":"doi:10.1145/3605148","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3605148","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3605148","source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3605148","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5054319371","display_name":"Yuwen Zhao","orcid":null},"institutions":[{"id":"https://openalex.org/I4210128818","display_name":"Institute of Software","ror":"https://ror.org/033dfsn42","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210128818"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Yuwen Zhao","raw_affiliation_strings":["Institute of Software, Chinese Academy of Sciences, China and University of Chinese Academy of Sciences, China"],"affiliations":[{"raw_affiliation_string":"Institute of Software, Chinese Academy of Sciences, China and University of Chinese Academy of Sciences, China","institution_ids":["https://openalex.org/I4210128818","https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5070636985","display_name":"Fangfang Liu","orcid":"https://orcid.org/0000-0001-7344-7493"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210128818","display_name":"Institute of Software","ror":"https://ror.org/033dfsn42","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210128818"]},{"id":"https://openalex.org/I4391767820","display_name":"State Key Laboratory of Computer Science","ror":"https://ror.org/01hsx4r68","country_code":null,"type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210128818","https://openalex.org/I4391767820"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Fangfang Liu","raw_affiliation_strings":["Institute of Software, Chinese Academy of Sciences, China and State Key Laboratory of Computer Science, China"],"affiliations":[{"raw_affiliation_string":"Institute of Software, Chinese Academy of Sciences, China and State Key Laboratory of Computer Science, China","institution_ids":["https://openalex.org/I4210128818","https://openalex.org/I19820366","https://openalex.org/I4391767820"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100695928","display_name":"Wenjing Ma","orcid":"https://orcid.org/0000-0002-1795-4498"},"institutions":[{"id":"https://openalex.org/I4391767820","display_name":"State Key Laboratory of Computer Science","ror":"https://ror.org/01hsx4r68","country_code":null,"type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210128818","https://openalex.org/I4391767820"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210128818","display_name":"Institute of Software","ror":"https://ror.org/033dfsn42","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210128818"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Wenjing Ma","raw_affiliation_strings":["Institute of Software, Chinese Academy of Sciences, China and State Key Laboratory of Computer Science, China"],"affiliations":[{"raw_affiliation_string":"Institute of Software, Chinese Academy of Sciences, China and State Key Laboratory of Computer Science, China","institution_ids":["https://openalex.org/I4210128818","https://openalex.org/I19820366","https://openalex.org/I4391767820"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100637657","display_name":"Huiyuan Li","orcid":"https://orcid.org/0000-0002-6326-9926"},"institutions":[{"id":"https://openalex.org/I4391767820","display_name":"State Key Laboratory of Computer Science","ror":"https://ror.org/01hsx4r68","country_code":null,"type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210128818","https://openalex.org/I4391767820"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210128818","display_name":"Institute of Software","ror":"https://ror.org/033dfsn42","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210128818"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Huiyuan Li","raw_affiliation_strings":["Institute of Software, Chinese Academy of Sciences, China and State Key Laboratory of Computer Science, China"],"affiliations":[{"raw_affiliation_string":"Institute of Software, Chinese Academy of Sciences, China and State Key Laboratory of Computer Science, China","institution_ids":["https://openalex.org/I4210128818","https://openalex.org/I19820366","https://openalex.org/I4391767820"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108301047","display_name":"Yuanchi Peng","orcid":null},"institutions":[{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"education","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]},{"id":"https://openalex.org/I4210128818","display_name":"Institute of Software","ror":"https://ror.org/033dfsn42","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210128818"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuanchi Peng","raw_affiliation_strings":["Institute of Software, Chinese Academy of Sciences, China and University of Chinese Academy of Sciences, China"],"affiliations":[{"raw_affiliation_string":"Institute of Software, Chinese Academy of Sciences, China and University of Chinese Academy of Sciences, China","institution_ids":["https://openalex.org/I4210128818","https://openalex.org/I4210165038"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101490688","display_name":"Cui Wang","orcid":"https://orcid.org/0009-0004-7841-7946"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210128818","display_name":"Institute of Software","ror":"https://ror.org/033dfsn42","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210128818"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Cui Wang","raw_affiliation_strings":["Institute of Software, Chinese Academy of Sciences, China"],"affiliations":[{"raw_affiliation_string":"Institute of Software, Chinese Academy of Sciences, China","institution_ids":["https://openalex.org/I4210128818","https://openalex.org/I19820366"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5054319371"],"corresponding_institution_ids":["https://openalex.org/I4210128818","https://openalex.org/I4210165038"],"apc_list":null,"apc_paid":null,"fwci":2.4345,"has_fulltext":true,"cited_by_count":8,"citation_normalized_percentile":{"value":0.88230781,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":96,"max":99},"biblio":{"volume":"20","issue":"3","first_page":"1","last_page":"23"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9993000030517578,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11697","display_name":"Numerical Methods and Algorithms","score":0.9947999715805054,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11034","display_name":"Digital Filter Design and Implementation","score":0.9865000247955322,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/fast-fourier-transform","display_name":"Fast Fourier transform","score":0.8536141514778137},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7825376987457275},{"id":"https://openalex.org/keywords/parallel-computing","display_name":"Parallel computing","score":0.4366002380847931},{"id":"https://openalex.org/keywords/bottleneck","display_name":"Bottleneck","score":0.4184555411338806},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.3741757273674011},{"id":"https://openalex.org/keywords/embedded-system","display_name":"Embedded system","score":0.09214231371879578}],"concepts":[{"id":"https://openalex.org/C75172450","wikidata":"https://www.wikidata.org/wiki/Q623950","display_name":"Fast Fourier transform","level":2,"score":0.8536141514778137},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7825376987457275},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.4366002380847931},{"id":"https://openalex.org/C2780513914","wikidata":"https://www.wikidata.org/wiki/Q18210350","display_name":"Bottleneck","level":2,"score":0.4184555411338806},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3741757273674011},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.09214231371879578}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3605148","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3605148","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3605148","source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.1145/3605148","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3605148","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3605148","source":{"id":"https://openalex.org/S26056741","display_name":"ACM Transactions on Architecture and Code Optimization","issn_l":"1544-3566","issn":["1544-3566","1544-3973"],"is_oa":true,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"ACM Transactions on Architecture and Code Optimization","raw_type":"journal-article"},"sustainable_development_goals":[{"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7","score":0.44999998807907104}],"awards":[{"id":"https://openalex.org/G4243928136","display_name":null,"funder_award_id":"2021YFB0300203","funder_id":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China"}],"funders":[{"id":"https://openalex.org/F4320332382","display_name":"Oak Ridge Institute for Science and Education","ror":"https://ror.org/0526p1y61"},{"id":"https://openalex.org/F4320335777","display_name":"National Key Research and Development Program of China","ror":null}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4381251538.pdf","grobid_xml":"https://content.openalex.org/works/W4381251538.grobid-xml"},"referenced_works_count":32,"referenced_works":["https://openalex.org/W182691100","https://openalex.org/W1272383281","https://openalex.org/W1965612527","https://openalex.org/W1973825452","https://openalex.org/W1987433500","https://openalex.org/W1994148398","https://openalex.org/W2002722812","https://openalex.org/W2042478655","https://openalex.org/W2052440657","https://openalex.org/W2071739952","https://openalex.org/W2102182691","https://openalex.org/W2136952590","https://openalex.org/W2158261096","https://openalex.org/W2576489192","https://openalex.org/W2911591717","https://openalex.org/W2992612536","https://openalex.org/W3010781441","https://openalex.org/W3036005033","https://openalex.org/W3037675408","https://openalex.org/W3103952756","https://openalex.org/W3127518410","https://openalex.org/W3157925188","https://openalex.org/W3185689094","https://openalex.org/W3207730444","https://openalex.org/W4232611972","https://openalex.org/W4234373857","https://openalex.org/W4239514724","https://openalex.org/W4244305136","https://openalex.org/W4299856158","https://openalex.org/W4321364834","https://openalex.org/W4404988331","https://openalex.org/W6779924746"],"related_works":["https://openalex.org/W2748952813","https://openalex.org/W1657880117","https://openalex.org/W2595172197","https://openalex.org/W2127970246","https://openalex.org/W2084856301","https://openalex.org/W1001352512","https://openalex.org/W4382618745","https://openalex.org/W2885125400","https://openalex.org/W1989889224","https://openalex.org/W2025911710"],"abstract_inverted_index":{"Fast":[0],"Fourier":[1],"transform":[2],"(FFT)":[3],"is":[4,17,143],"widely":[5],"used":[6],"in":[7,10],"computing":[8],"applications":[9],"large-scale":[11,38],"parallel":[12,28,44,179,198],"programs,":[13],"and":[14,24,125,154,160,172,192],"data":[15,79],"communication":[16],"the":[18,53,76,96,101,112,197],"main":[19],"performance":[20,158],"bottleneck":[21],"of":[22,78,117,149,181],"FFT":[23,39,45],"seriously":[25],"affects":[26],"its":[27],"efficiency.":[29],"To":[30,61],"tackle":[31],"this":[32],"problem,":[33],"we":[34,66,87,105],"propose":[35,67],"a":[36,47,68,89,132],"new":[37,48],"framework,":[40,104],"MFFT,":[41],"which":[42,74],"optimizes":[43],"with":[46,134,190],"mixed-precision":[49,102],"optimization":[50,108],"technique,":[51,73],"adopting":[52],"\u201chigh":[54],"precision":[55,58,64],"computation,":[56],"low":[57],"communication\u201d":[59],"strategy.":[60],"enable":[62],"\u201clow":[63],"communication\u201d,":[65],"shared-exponent":[69,141,193],"floating-point":[70],"number":[71],"compression":[72],"reduces":[75],"volume":[77],"communication,":[80],"while":[81],"maintaining":[82],"higher":[83,164],"accuracy.":[84],"In":[85],"addition,":[86],"apply":[88,106],"two-phase":[90],"normalization":[91],"technique":[92],"to":[93,110,187,200],"further":[94,195],"reduce":[95],"round-off":[97],"error.":[98],"Based":[99],"on":[100,131,152,162],"MFFT":[103,130,142,151,156,183,194],"several":[107],"techniques":[109],"improve":[111],"performance,":[113],"such":[114],"as":[115],"streaming":[116],"GPU":[118],"kernels,":[119],"MPI":[120],"message":[121],"combination,":[122],"kernel":[123],"optimization,":[124],"memory":[126],"optimization.":[127],"We":[128],"evaluate":[129],"system":[133],"4,096":[135],"GPUs.":[136],"The":[137,178],"results":[138],"show":[139],"that":[140,148],"1.23":[144],"\u00d7":[145],"faster":[146],"than":[147,165],"double-precision":[150,155,182],"average,":[153],"achieves":[157],"3.53\u00d7":[159],"9.48\u00d7":[161],"average":[163],"open":[166],"source":[167],"library":[168],"2Decomp&amp;FFT":[169],"(CPU-based":[170],"version)":[171],"heFFTe":[173],"(AMD":[174],"GPU-based":[175],"version),":[176],"respectively.":[177],"efficiency":[180,199],"increased":[184],"from":[185],"53.2%":[186],"78.1%":[188],"compared":[189],"2Decomp&amp;FFT,":[191],"increases":[196],"83.8%.":[201]},"counts_by_year":[{"year":2026,"cited_by_count":2},{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":3}],"updated_date":"2026-03-24T08:02:53.985720","created_date":"2025-10-10T00:00:00"}
